Example no. 2
  def testSkipRecords(self):
    """
    This calls estimateAnomalyLikelihoods with various values of skipRecords
    """

    # Check happy path
    data1 = _generateSampleData(mean=0.1)[0:200]
    data1 = data1 + (_generateSampleData(mean=0.9)[0:200])

    likelihoods, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1, skipRecords=200)
    )

    # Check results are correct, i.e. we are actually skipping the first 200
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], 0.9, epsilon=0.1)

    # Check case where skipRecords > num records
    # In this case a null distribution should be returned which makes all
    # the likelihoods reasonably high
    likelihoods, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1, skipRecords=500)
    )
    self.assertEqual(len(likelihoods), len(data1))
    self.assertTrue(likelihoods.sum() >= 0.3 * len(likelihoods))

    # Check the case where skipRecords == num records
    likelihoods, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1, skipRecords=len(data1))
    )
    self.assertEqual(len(likelihoods), len(data1))
    self.assertTrue(likelihoods.sum() >= 0.3 * len(likelihoods))
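Every test on this page leans on a _generateSampleData helper that the scraped snippets never include. Below is a minimal sketch only, with the signature and the (timestamp, metricValue, anomalyScore) record shape inferred from the calls above; the real helper lives in nupic's anomaly likelihood test module and may differ in its defaults.

import datetime
import numpy

def _generateSampleData(mean=0.2, variance=0.02,
                        metricMean=0.5, metricVariance=0.02,
                        numSamples=1000):
  """Return numSamples (timestamp, metricValue, anomalyScore) tuples, with
  metric values and anomaly scores drawn from normal distributions."""
  data = []
  dttm = datetime.datetime(2017, 1, 1)
  for _ in xrange(numSamples):
    metricValue = numpy.random.normal(metricMean, numpy.sqrt(metricVariance))
    anomalyScore = numpy.random.normal(mean, numpy.sqrt(variance))
    data.append((dttm, metricValue, anomalyScore))
    dttm += datetime.timedelta(minutes=5)
  return data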
Example no. 4
  def testVeryFewScores(self):
    """
    This calls estimateAnomalyLikelihoods and updateAnomalyLikelihoods
    with one or no scores.
    """

    # Generate an estimate using two data points
    data1 = _generateSampleData(mean=42.0, variance=1e-10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:2])
    )

    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the estimated mean matches the (nearly constant) anomaly scores
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], data1[0][2])

    # Can't generate an estimate using no data points
    data1 = numpy.zeros(0)
    with self.assertRaises(ValueError):
      an.estimateAnomalyLikelihoods(data1)

    # Can't update with no scores
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, estimatorParams)
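assertWithinEpsilon is another helper these TestCase fragments assume but never define here. A plausible sketch, mixed into the test class; the default tolerance is a guess, and the signature comes from calls such as assertWithinEpsilon(dParams["mean"], 0.9, epsilon=0.1).

  def assertWithinEpsilon(self, a, b, epsilon=0.001):
    # Assumed default epsilon; the assertion message mirrors unittest style
    self.assertLessEqual(abs(a - b), epsilon,
                         "Values %g and %g differ by more than %g"
                         % (a, b, epsilon))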
Example no. 5
  def testFlatAnomalyScores(self):
    """
    This calls estimateAnomalyLikelihoods with flat distributions and
    ensures things don't crash.
    """

    # Generate an estimate using fake distribution of anomaly scores.
    data1 = _generateSampleData(mean=42.0, variance=1e-10)

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1000])
    )
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], data1[0][2])

    # If you deviate from the mean, you should get probability 0
    # Test this by sending in just slightly different values.
    data2 = _generateSampleData(mean=42.5, variance=1e-10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data2[0:10], estimatorParams)
    )

    # The likelihoods should go to zero very quickly
    self.assertLessEqual(likelihoods2.sum(), 0.01)


    # Test edge case where anomaly scores are very close to 0.
    # In this case we don't let the likelihood get too low: an average
    # anomaly score of 0.1 should yield an essentially zero likelihood,
    # but an average of 0.05 should stay higher.
    data3 = _generateSampleData(mean=0.01, variance=1e-6)

    _, _, estimatorParams3 = (
      an.estimateAnomalyLikelihoods(data3[0:1000])
    )

    data4 = _generateSampleData(mean=0.1, variance=1e-6)
    likelihoods4, _, estimatorParams4 = (
      an.updateAnomalyLikelihoods(data4[0:20], estimatorParams3)
    )

    # Average of 0.1 should go to zero
    self.assertLessEqual(likelihoods4[10:].mean(), 0.002)

    data5 = _generateSampleData(mean=0.05, variance=1e-6)
    likelihoods5, _, _ = (
      an.updateAnomalyLikelihoods(data5[0:20], estimatorParams4)
    )

    # The likelihoods should be low but not near zero
    self.assertLessEqual(likelihoods5[10:].mean(), 0.28)
    self.assertGreater(likelihoods5[10:].mean(), 0.015)
Example no. 7
  def likelihood(self, value, anomalyScore, dttm):
    """
    Given the current metric value, plus the current anomaly score, output the
    anomalyLikelihood for this record.
    """
    dataPoint = (dttm, value, anomalyScore)
    # We ignore the first probationaryPeriod data points
    if len(self._historicalScores) < self._probationaryPeriod:
      likelihood = 0.5
    else:
      # On a rolling basis we re-estimate the distribution every 100 iterations
      if self._distribution is None or (self._iteration % 100 == 0):
        _, _, self._distribution = (
          anomaly_likelihood.estimateAnomalyLikelihoods(
            self._historicalScores,
            skipRecords=self._numentaLearningPeriod)
          )

      likelihoods, _, self._distribution = (
        anomaly_likelihood.updateAnomalyLikelihoods([dataPoint],
          self._distribution)
      )
      likelihood = 1.0 - likelihoods[0]

    # Before we exit, update the historical scores and the iteration count
    self._historicalScores.append(dataPoint)
    self._iteration += 1

    return likelihood
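For orientation, every snippet on this page follows the same two-step pattern: estimateAnomalyLikelihoods() fits a distribution over historical (timestamp, value, anomalyScore) records and returns (likelihoods, avgRecordList, estimatorParams), and updateAnomalyLikelihoods() then scores new records against those params without refitting. A minimal end-to-end sketch, assuming the nupic import path these projects use:

from nupic.algorithms import anomaly_likelihood as an

historical = _generateSampleData(mean=0.2)        # fit on typical scores
newRecords = _generateSampleData(mean=0.6)[0:10]  # unusually high scores

_, _, params = an.estimateAnomalyLikelihoods(historical)
likelihoods, _, params = an.updateAnomalyLikelihoods(newRecords, params)
# likelihoods[i] is small when newRecords[i] is unlikely under the fit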
Example no. 8
  def testCaseIncreasedAnomalyScore(self):
    """
    Test F: small anomaly score every 20 records, but then a large one when you
    would expect a small one. This should be anomalous.
    """

    # Generate initial data
    data = []
    data = self._addSampleData(data, spikePeriod=20,
                               spikeValue=0.4, numSamples=1000)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data)
    )

    # Now feed in spikes with a much larger value at the same period
    data = self._addSampleData(spikePeriod=20, spikeValue=1.0,
                               numSamples=100)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # We should detect highly unusual behavior
    self.assertTrue(likelihoods2.min() < 0.0003)

    # We should detect it pretty often
    self.assertTrue((likelihoods2 < 0.0003).sum() > 40)
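The spike-based cases (tests B through F on this page) build their input with a _addSampleData helper that is likewise missing from the page. A hypothetical sketch, with the signature inferred from the calls above and spikePeriod=0 evidently meaning "no spikes"; it reuses the datetime import from the earlier sketch.

  def _addSampleData(self, data=None, spikePeriod=10, spikeValue=1.0,
                     numSamples=1000):
    """Append numSamples (timestamp, metricValue, anomalyScore) records.
    Every spikePeriod-th record gets anomalyScore=spikeValue, the rest 0.0;
    spikePeriod=0 adds no spikes at all."""
    data = data if data is not None else []
    if data:
      dttm = data[-1][0] + datetime.timedelta(minutes=5)
    else:
      dttm = datetime.datetime(2017, 1, 1)
    for i in xrange(numSamples):
      score = spikeValue if (spikePeriod and i % spikePeriod == 0) else 0.0
      data.append((dttm, 0.5, score))
      dttm += datetime.timedelta(minutes=5)
    return data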
Example no. 9
  def testCaseIncreasedSpikeFrequency(self):
    """
    Test E: bunches of anomalies every 20 records that become even more
    frequent. This should be anomalous.
    """

    # Generate initial data
    data = []
    for _ in range(30):
      data = self._addSampleData(data, spikePeriod=0, numSamples=30)
      data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in a more frequent distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=1, numSamples=10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihoods should reach anomalous levels, but only near the end
    self.assertTrue(likelihoods2[0:30].min() > 0.01)
    self.assertTrue(likelihoods2[-5:].min() < 0.002)
Example no. 10
  def testCaseContinuousBunchesOfSpikes(self):
    """
    Test D: bunches of anomalies every 20 records that continue. This should not
    be anomalous.
    """

    # Generate initial data
    data = []
    for _ in range(30):
      data = self._addSampleData(data, spikePeriod=0, numSamples=30)
      data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in the same distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=3, numSamples=10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihood should be reasonably high everywhere
    self.assertTrue(likelihoods2.min() > 0.01)
Example no. 11
  def testCaseUnusuallyHighSpikeFrequency(self):
    """
    Test B: one anomaly spike every 20 records. Then we suddenly get a bunch
    in a row. The likelihood of those spikes should be low.
    """
    data = self._addSampleData(spikePeriod=20, numSamples=1019)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # If we continue to see the same distribution, we should get reasonable
    # likelihoods
    data = self._addSampleData(numSamples=119, spikePeriod=20)
    likelihoods1, _, estimatorParams1 = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The minimum likelihood should be reasonably high
    self.assertTrue(likelihoods1.min() > 0.1)

    data = self._addSampleData(numSamples=20, spikePeriod=2)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams1)
    )

    # The likelihood once you get past the initial averaging should be very
    # low. 20 samples were fed in, so sum() / 15.0 is the mean of the last 15.
    self.assertTrue((likelihoods2[5:].sum() / 15.0) < 0.001)
Example no. 14
  def testUpdateAnomalyLikelihoods(self):
    """
    A slightly more complex test. This calls estimateAnomalyLikelihoods
    to estimate the distribution on fake data, followed by several calls
    to updateAnomalyLikelihoods.
    """

    # ------------------------------------------
    # Step 1. Generate an initial estimate using a fake distribution of
    # anomaly scores.
    data1 = _generateSampleData(mean=0.2)[0:1000]
    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1, averagingWindow=5)
    )

    # ------------------------------------------
    # Step 2. Generate some new data with a higher average anomaly score.
    # Use the estimator from step 1 to compute likelihoods. Now we should
    # see a lot more anomalies.
    data2 = _generateSampleData(mean=0.6)[0:300]
    likelihoods2, avgRecordList2, estimatorParams2 = (
      an.updateAnomalyLikelihoods(data2, estimatorParams)
    )
    self.assertEqual(len(likelihoods2), len(data2))
    self.assertEqual(len(avgRecordList2), len(data2))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # The new running total should be different
    self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])

    # We should have many more samples where likelihood is < 0.01, but not all
    self.assertGreaterEqual(numpy.sum(likelihoods2 < 0.01), 25)
    self.assertLessEqual(numpy.sum(likelihoods2 < 0.01), 250)

    # ------------------------------------------
    # Step 3. Generate some new data with the expected average anomaly
    # score. We should see fewer anomalies than in Step 2.
    data3 = _generateSampleData(mean=0.2)[0:1000]
    likelihoods3, avgRecordList3, estimatorParams3 = (
      an.updateAnomalyLikelihoods(data3, estimatorParams2)
    )

    self.assertEqual(len(likelihoods3), len(data3))
    self.assertEqual(len(avgRecordList3), len(data3))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

    # The new running total should be different
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams2["movingAverage"]["total"])

    # We should have a small number of samples where likelihood is < 0.01,
    # but at least one
    self.assertGreaterEqual(numpy.sum(likelihoods3 < 0.01), 1)
    self.assertLessEqual(numpy.sum(likelihoods3 < 0.01), 100)

    # ------------------------------------------
    # Step 4. Validate that sending data incrementally is the same as
    # sending it in one batch
    allData = data1
    allData.extend(data2)
    allData.extend(data3)

    # Compute the moving average of all the data and check it's the same
    _, historicalValuesAll, totalAll = (
      an._anomalyScoreMovingAverage(allData, windowSize=5)
    )
    self.assertEqual(sum(historicalValuesAll),
                     sum(estimatorParams3["movingAverage"]["historicalValues"]))
    self.assertEqual(totalAll, estimatorParams3["movingAverage"]["total"])
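Step 4 reaches into the private helper an._anomalyScoreMovingAverage. As an illustration only (the real nupic helper consumes full (timestamp, value, score) records and also returns the averaged records), the windowSize=5 moving average it cross-checks behaves like this:

def anomalyScoreMovingAverage(scores, windowSize=5):
  """Illustrative stand-in operating on raw scores; returns
  (averagedScores, historicalValues, total)."""
  historicalValues = []
  total = 0.0
  averaged = []
  for score in scores:
    historicalValues.append(score)
    total += score
    if len(historicalValues) > windowSize:
      # Slide the window: drop the oldest value from the running total
      total -= historicalValues.pop(0)
    averaged.append(total / len(historicalValues))
  return averaged, historicalValues, total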
Example no. 16
  def testEstimateAnomalyLikelihoodsMalformedRecords(self):
    """
    This calls estimateAnomalyLikelihoods with malformed records, which should
    be quietly skipped.
    """

    # Generate an estimate using fake distribution of anomaly scores.
    data1 = _generateSampleData(mean=0.2)
    # Note: (2) is just the int 2, i.e. deliberately not even a tuple
    data1 = data1[0:1000] + [(2, 2), (2, 2, 2, 2), (), (2)]  # Malformed records

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1004])
    )
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the sum is correct
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for v in avgRecordList:
      total = total + v[2]
    self.assertEqual(avgParams["total"], total)

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"],
                             total / float(len(avgRecordList)))
Example no. 19
  def testEstimateAnomalyLikelihoods(self):
    """
    This calls estimateAnomalyLikelihoods to estimate the distribution on fake
    data and validates the results
    """

    # Generate an estimate using fake distribution of anomaly scores.
    data1 = _generateSampleData(mean=0.2)

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1000])
    )
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the sum is correct
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for v in avgRecordList:
      total = total + v[2]
    self.assertEqual(avgParams["total"], total)

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"],
                             total / float(len(avgRecordList)))

    # The number of points with probability below 2% should be pretty low
    # but not zero. We can't expect exactly 2% here due to random variation.
    self.assertLessEqual(numpy.sum(likelihoods < 0.02), 50)
    self.assertGreaterEqual(numpy.sum(likelihoods < 0.02), 1)
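Why roughly 2%: the reported likelihood is the Gaussian tail probability of the (averaged) anomaly score, and tail probabilities of in-distribution samples are uniformly distributed on [0, 1], so about 2% of points land below 0.02. A quick standalone check of that fact; scipy is an assumption here, not something these snippets import:

import numpy
from scipy.stats import norm

# Draw samples from the same normal we then score them against
samples = numpy.random.normal(loc=0.2, scale=0.05, size=100000)
tailProbs = 1.0 - norm.cdf(samples, loc=0.2, scale=0.05)
print((tailProbs < 0.02).mean())  # prints a value close to 0.02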
Example no. 24
  def testEstimateAnomalyLikelihoodsCategoryValues(self):
    start = datetime.datetime(2017, 1, 1, 0, 0, 0)
    delta = datetime.timedelta(minutes=5)
    dts = [start + (i * delta) for i in xrange(10)]
    values = ["a", "b", "c", "d", "e"] * 2
    rawScores = [0.1 * i for i in xrange(10)]
    data = zip(dts, values, rawScores)

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data)
    )
    self.assertEqual(len(likelihoods), 10)
    self.assertEqual(len(avgRecordList), 10)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))
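Note this snippet is Python 2 (xrange, and zip returning a list). Under Python 3, zip returns a lazy iterator, which would break the len() and slicing that estimateAnomalyLikelihoods performs on its input; a hedged Python 3 equivalent of the data construction, assuming the API is otherwise unchanged:

dts = [start + (i * delta) for i in range(10)]
values = ["a", "b", "c", "d", "e"] * 2
rawScores = [0.1 * i for i in range(10)]
data = list(zip(dts, values, rawScores))  # materialize the iterator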
Example no. 26
  def testCaseSingleSpike(self):
    """
    No anomalies, and then you see a single spike. The likelihood of that
    spike should be 0
    """
    data = self._addSampleData(spikePeriod=0, numSamples=1000)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    data = self._addSampleData(numSamples=1, spikePeriod=1)
    likelihoods1, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    self.assertWithinEpsilon(likelihoods1[0], 0.0)
Example no. 27
  def testFlatMetricScores(self):
    """
    This calls estimateAnomalyLikelihoods with flat metric values. In this case
    we should use the null distribution, which gets reasonably high likelihood
    for everything.
    """
    # Generate samples with very flat metric values
    data1 = _generateSampleData(metricMean=42.0,
                                metricVariance=1e-10)[0:1000]

    likelihoods, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1)
    )

    # Check that we do indeed get reasonable likelihood values
    self.assertEqual(len(likelihoods), len(data1))
    self.assertTrue(likelihoods.sum() >= 0.4 * len(likelihoods))

    # Check that we do indeed get null distribution
    self.assertDictEqual(estimatorParams["distribution"],
                         an.nullDistribution())
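nullDistribution() is the fallback returned when the data cannot support a real fit, as in the flat-metric case above. A sketch of its likely shape, assuming a very wide normal centered at 0.5 so that every score gets a likelihood near 0.5 (consistent with the >= 0.4 assertion); the exact values are an assumption, so check an.nullDistribution() itself:

def nullDistribution():
  # Assumed values: a huge variance flattens every tail probability to ~0.5
  return {
    "name": "normal",
    "mean": 0.5,
    "variance": 1e6,
    "stdev": 1e3,
  }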
Example no. 28
  def testBadParams(self):
    """
    Calls updateAnomalyLikelihoods with bad params.
    """
    # Generate an estimate using one data point
    data1 = _generateSampleData(mean=42.0, variance=1e-10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1])
    )

    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Can't pass in a bad params structure
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, {"haha": "heehee"})

    # Can't pass in something that isn't a dict
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, 42.0)
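isValidEstimatorParams, asserted throughout these tests, is essentially a structural check on the params dict, which is why passing {"haha": "heehee"} or 42.0 above raises ValueError. An illustrative approximation; the real check lives in nupic.algorithms.anomaly_likelihood and may verify more fields:

def isValidEstimatorParams(p):
  # Assumed shape, matching how the tests above read the dict
  if not isinstance(p, dict):
    return False
  return "distribution" in p and "movingAverage" in p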
Example no. 30
  def testCaseMissingSpike(self):
    """
    Test C: one anomaly every 20 records, but then see none. The likelihood
    at the end should be very low.
    """

    # Initial data
    data = self._addSampleData(spikePeriod=20, numSamples=1019)
    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in none
    data = self._addSampleData(numSamples=100, spikePeriod=0)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihood once you get past the initial averaging should be very
    # low. (100 samples were fed in, so average over the 95 past the window.)
    self.assertTrue(likelihoods2[5:].mean() < 0.0001)
Example no. 32
  def testNABAnomalyLikelihood(self):
    """
    Tests the specific calls to nupic/algorithms/anomaly_likelihood as they're
    made in "NAB/detectors/numenta/numenta_detector.py".
    Note "NAB/.../numenta_detector.py" has its own class AnomalyLikelihood,
    different from nupic/algorithms/anomaly_likelihood.AnomalyLikelihood, but
    it calls the functions estimateAnomalyLikelihoods() and
    updateAnomalyLikelihoods() from "nupic/algorithms/anomaly_likelihood.py".
    """
    # AnomalyLikelihood object initial values
    iteration = 0
    probationaryPeriod = 4
    historicalScores = []

    likelihoodList = []
    for dataPoint in self.data:
      # Ignore the first probationaryPeriod data points
      if len(historicalScores) < probationaryPeriod:
        likelihood = 0.5
      else:
        # Re-estimate the distribution every 4 iterations; in between, the
        # previous likelihood is reused, hence the runs of 4 equal values
        # in the truth list below
        if iteration % 4 == 0:
          _, _, distribution = an.estimateAnomalyLikelihoods(
            historicalScores, skipRecords=probationaryPeriod)
          likelihoods, _, distribution = an.updateAnomalyLikelihoods(
            [dataPoint], distribution)
          likelihood = 1.0 - likelihoods[0]
      historicalScores.append(dataPoint)
      iteration += 1
      likelihoodList.append(likelihood)

    truthLikelihoodList = [
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.044565462999999972,
        0.044565462999999972, 0.044565462999999972, 0.044565462999999972,
        0.90319951499999995, 0.90319951499999995, 0.90319951499999995,
        0.90319951499999995, 0.78814460099999994, 0.78814460099999994,
        0.78814460099999994, 0.78814460099999994
    ]
    for i in xrange(len(likelihoodList)):
      self.assertAlmostEqual(likelihoodList[i], truthLikelihoodList[i],
                             msg="unequal values are at index %i" % i)
Example no. 35
  def _generateAnomalyParams(self, metricID, statsSampleCache,
                             defaultAnomalyParams):
    """
    Generate the model's anomaly likelihood parameters from the given sample
    cache.

    :param metricID: the metric ID
    :param statsSampleCache: a sequence of MetricData instances that
      comprise the cache of samples for the current inference result batch with
      valid raw_anomaly_score in the processed order (by rowid/timestamp). At
      least self._statisticsMinSampleSize samples are needed.
    :param defaultAnomalyParams: the default anomaly params value; if new ones
      can't be generated (not enough samples in cache), this value will be
      returned verbatim

    :returns: new anomaly likelihood parameters; defaultAnomalyParams, if there
      are not enough samples in statsSampleCache.
    """
    if len(statsSampleCache) < self._statisticsMinSampleSize:
      # Not enough samples in cache
      # TODO: unit-test this
      self._log.error(
        "Not enough samples in cache to update anomaly params for model=%s: "
        "have=%d, which is less than min=%d; firstRowID=%s; lastRowID=%s.",
        metricID, len(statsSampleCache), self._statisticsMinSampleSize,
        statsSampleCache[0].rowid if statsSampleCache else None,
        statsSampleCache[-1].rowid if statsSampleCache else None)

      return defaultAnomalyParams

    # We have enough samples to generate anomaly params
    lastRowID = statsSampleCache[-1].rowid

    numSamples = min(len(statsSampleCache), self._statisticsSampleSize)

    # Create input sequence for algorithms
    samplesIter = itertools.islice(
      statsSampleCache,
      len(statsSampleCache) - numSamples,
      len(statsSampleCache))

    scores = tuple(
      (row.timestamp, row.metric_value, row.raw_anomaly_score,)
      for row in samplesIter)

    assert len(scores) >= self._statisticsMinSampleSize, (
      "_generateAnomalyParams: samples count=%d is smaller than min=%d; "
      "model=%s; lastRowID=%s") % (len(scores), self._statisticsMinSampleSize,
                                   metricID, lastRowID,)

    assert len(scores) <= self._statisticsSampleSize, (
      "_generateAnomalyParams: samples count=%d is larger than max=%d; "
      "model=%s; lastRowID=%s") % (len(scores), self._statisticsSampleSize,
                                   metricID, lastRowID,)

    # Calculate estimator parameters
    # We ignore statistics from the first day of data (288 records: one day of
    # 5-minute samples is 24 * 12 = 288) since the CLA is still learning. For
    # simplicity, this logic continues to ignore the first day of data even
    # once the window starts sliding.
    _, _, params = algorithms.estimateAnomalyLikelihoods(
      anomalyScores=scores, skipRecords=NUM_SKIP_RECORDS)

    anomalyParams = {}
    anomalyParams["last_rowid_for_stats"] = lastRowID
    anomalyParams["params"] = params

    self._log.debug("Generated anomaly params for model=%s using "
                    "numRows=%d with rows=[%s..%s]",
                    metricID, numSamples,
                    statsSampleCache[-numSamples].rowid,
                    statsSampleCache[-1].rowid)

    return anomalyParams
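For context, the dict built above is typically consumed by scoring code that resumes from last_rowid_for_stats and pushes newer rows through updateAnomalyLikelihoods. A hypothetical consumer, not taken from this codebase:

def scoreNewRows(rows, anomalyParams):
  # rows: MetricData-like objects with rowid, timestamp, metric_value and
  # raw_anomaly_score attributes, as used by _generateAnomalyParams above
  newScores = tuple(
    (row.timestamp, row.metric_value, row.raw_anomaly_score)
    for row in rows
    if row.rowid > anomalyParams["last_rowid_for_stats"])
  likelihoods, _, _ = algorithms.updateAnomalyLikelihoods(
    newScores, anomalyParams["params"])
  return likelihoods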