def testVeryFewScores(self):
    """
    Exercise estimateAnomalyLikelihoods and updateAnomalyLikelihoods with
    very small inputs: a two-record estimate must succeed, while an empty
    input must raise ValueError for both calls.
    """
    # Nearly constant samples (variance 1e-10); only the first two are used.
    samples = _generateSampleData(mean=42.0, variance=1e-10,
                                  seed=self.GLOBAL_TEST_SEED)
    _unusedLikelihoods, _unusedAverages, estimatorParams = (
        an.estimateAnomalyLikelihoods(samples[0:2]))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # The estimated mean should match the third field of the first record.
    self.assertWithinEpsilon(estimatorParams["distribution"]["mean"],
                             samples[0][2])

    noScores = numpy.zeros(0)

    # An estimate cannot be generated from zero data points.
    with self.assertRaises(ValueError):
        an.estimateAnomalyLikelihoods(noScores)

    # Existing parameters cannot be updated with zero scores either.
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(noScores, estimatorParams)
def testFlatAnomalyScores(self):
    """
    Run estimateAnomalyLikelihoods on flat anomaly score distributions and
    make sure nothing crashes and the resulting likelihoods stay sane.
    """
    seed = self.GLOBAL_TEST_SEED

    # Estimate from a fake, essentially constant distribution of scores.
    flatData = _generateSampleData(mean=42.0, variance=1e-10, seed=seed)
    likelihoods, avgRecordList, flatParams = (
        an.estimateAnomalyLikelihoods(flatData[0:1000]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(flatParams))

    # The estimated mean should equal the (constant) score of the records.
    self.assertWithinEpsilon(flatParams["distribution"]["mean"],
                             flatData[0][2])

    # Deviating even slightly from the mean should give probability ~0,
    # so the likelihoods are expected to collapse very quickly.
    shiftedData = _generateSampleData(mean=42.5, variance=1e-10, seed=seed)
    shiftedLikelihoods, _, _ = an.updateAnomalyLikelihoods(
        shiftedData[0:10], flatParams)
    self.assertLessEqual(shiftedLikelihoods.sum(), 0.01)

    # Edge case: anomaly scores very close to 0. Here the likelihood is not
    # allowed to get too low. An average score of 0.1 should be essentially
    # zero likelihood, but an average of 0.05 should be higher.
    nearZeroData = _generateSampleData(mean=0.01, variance=1e-6, seed=seed)
    _, _, nearZeroParams = an.estimateAnomalyLikelihoods(nearZeroData[0:1000])

    tenthData = _generateSampleData(mean=0.1, variance=1e-6, seed=seed)
    tenthLikelihoods, _, tenthParams = an.updateAnomalyLikelihoods(
        tenthData[0:20], nearZeroParams)

    # An average of 0.1 should drive the likelihood to (almost) zero.
    self.assertLessEqual(tenthLikelihoods[10:].mean(), 0.002)

    twentiethData = _generateSampleData(mean=0.05, variance=1e-6, seed=seed)
    twentiethLikelihoods, _, _ = an.updateAnomalyLikelihoods(
        twentiethData[0:20], tenthParams)

    # For an average of 0.05 the likelihoods should be low but not near zero.
    tailMean = twentiethLikelihoods[10:].mean()
    self.assertLessEqual(tailMean, 0.28)
    self.assertGreater(tailMean, 0.015)
def testSkipRecords(self):
    """
    Call estimateAnomalyLikelihoods with various values of skipRecords:
    fewer than, more than, and exactly equal to the number of records.
    """
    # Happy path: 200 records generated with mean 0.1 followed by 200
    # records generated with mean 0.9; the first 200 are skipped.
    data1 = _generateSampleData(mean=0.1, seed=self.GLOBAL_TEST_SEED)[0:200]
    data1 = data1 + _generateSampleData(
        mean=0.9, seed=self.GLOBAL_TEST_SEED)[0:200]
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(
        data1, skipRecords=200)

    # Check results are correct, i.e. we are actually skipping the first 200:
    # the estimated mean should then reflect only the mean=0.9 records.
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], 0.9, epsilon=0.15)

    # Case where skipRecords > num records. A null distribution should be
    # returned, which makes all the likelihoods reasonably high.
    likelihoods, _, _ = an.estimateAnomalyLikelihoods(data1, skipRecords=500)
    self.assertEqual(len(likelihoods), len(data1))
    self.assertGreaterEqual(likelihoods.sum(), 0.3 * len(likelihoods))

    # Case where skipRecords == num records; same expectation as above.
    likelihoods, _, _ = an.estimateAnomalyLikelihoods(
        data1, skipRecords=len(data1))
    self.assertEqual(len(likelihoods), len(data1))
    self.assertGreaterEqual(likelihoods.sum(), 0.3 * len(likelihoods))
def testCaseIncreasedAnomalyScore(self):
    """
    Test F: small anomaly score every 20 records, but then a large one when
    you would expect a small one. This should be anomalous.
    """
    # Generate initial data: spikes of value 0.4 with a spike period of 20.
    data = []
    data = self._addSampleData(data, spikePeriod=20, spikeValue=0.4,
                               numSamples=1000)
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data)

    # Now feed in data with the same spike period but a larger spike value.
    data = self._addSampleData(spikePeriod=20, spikeValue=1.0,
                               numSamples=100)
    likelihoods2, _, _ = an.updateAnomalyLikelihoods(data, estimatorParams)

    # We should detect highly unusual behavior
    self.assertLess(likelihoods2.min(), 0.0003)

    # We should detect it pretty often
    self.assertGreater((likelihoods2 < 0.0003).sum(), 40)
def testCaseIncreasedSpikeFrequency(self):
    """
    Test E: bunches of anomalies every 20 records that become even more
    frequent. This should be anomalous.
    """
    # Generate initial data: 30 repetitions of 30 samples with
    # spikePeriod=0 followed by 10 samples with spikePeriod=3.
    data = []
    for _ in range(30):
        data = self._addSampleData(data, spikePeriod=0, numSamples=30)
        data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data[0:1000])

    # Now feed in a more frequent distribution (spikePeriod=1 in the bunch).
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=1, numSamples=10)
    likelihoods2, _, _ = an.updateAnomalyLikelihoods(data, estimatorParams)

    # The likelihood should become anomalous but only near the end
    self.assertGreater(likelihoods2[0:30].min(), 0.01)
    self.assertLess(likelihoods2[-5:].min(), 0.002)
def testCaseContinuousBunchesOfSpikes(self):
    """
    Test D: bunches of anomalies every 20 records that continue. This should
    not be anomalous.
    """
    # Generate initial data: 30 repetitions of 30 samples with
    # spikePeriod=0 followed by 10 samples with spikePeriod=3.
    data = []
    for _ in range(30):
        data = self._addSampleData(data, spikePeriod=0, numSamples=30)
        data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data[0:1000])

    # Now feed in the same distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=3, numSamples=10)
    likelihoods2, _, _ = an.updateAnomalyLikelihoods(data, estimatorParams)

    # The likelihood should be reasonably high everywhere
    self.assertGreater(likelihoods2.min(), 0.01)
def testCaseUnusuallyHighSpikeFrequency(self):
    """
    Test B: one anomaly spike every 20 records. Then we suddenly get a bunch
    in a row. The likelihood of those spikes should be low.
    """
    data = self._addSampleData(spikePeriod=20, numSamples=1019)
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data[0:1000])

    # If we continue to see the same distribution, we should get reasonable
    # likelihoods
    data = self._addSampleData(numSamples=119, spikePeriod=20)
    likelihoods1, _, estimatorParams1 = an.updateAnomalyLikelihoods(
        data, estimatorParams)

    # The minimum likelihood should be reasonably high
    self.assertGreater(likelihoods1.min(), 0.1)

    data = self._addSampleData(numSamples=20, spikePeriod=2)
    likelihoods2, _, _ = an.updateAnomalyLikelihoods(data, estimatorParams1)

    # The likelihood once you get past the initial averaging should be very
    # low. The mean is taken over everything after the first five records so
    # the divisor always matches the length of the slice.
    self.assertLess(likelihoods2[5:].mean(), 0.001)
def testEstimateAnomalyLikelihoodsMalformedRecords(self):
    """
    Feed estimateAnomalyLikelihoods a data set that ends with malformed
    records and verify that those records are quietly skipped.
    """
    # 1000 generated samples followed by four malformed entries: a record
    # with two fields, one with four fields, an empty record, and a bare
    # int (``(2)`` is just ``2``, not a one-element tuple).
    generated = _generateSampleData(mean=0.2, seed=self.GLOBAL_TEST_SEED)
    malformed = [(2, 2), (2, 2, 2, 2), (), (2)]
    samples = generated[0:1000] + malformed

    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(samples[0:1004]))

    # Only the 1000 generated records should show up in the results.
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Accumulate the third field of every averaged record.
    runningSum = 0
    for record in avgRecordList:
        runningSum += record[2]

    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that the stored total is truthy; it does
    # not compare it with runningSum. Confirm what "total" is meant to hold
    # before turning this into an equality check.
    self.assertTrue(estimatorParams["movingAverage"]["total"], runningSum)

    # The estimated mean should equal the mean of the averaged records.
    self.assertWithinEpsilon(estimatorParams["distribution"]["mean"],
                             runningSum / float(len(avgRecordList)))
def testEstimateAnomalyLikelihoods(self):
    """
    Estimate the distribution from fake data with estimateAnomalyLikelihoods
    and validate the returned likelihoods and estimator parameters.
    """
    # Fake distribution of anomaly scores; the first 1000 records are used.
    samples = _generateSampleData(mean=0.2, seed=self.GLOBAL_TEST_SEED)
    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(samples[0:1000]))

    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Accumulate the third field of every averaged record.
    runningSum = 0
    for record in avgRecordList:
        runningSum += record[2]

    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that the stored total is truthy; it does
    # not compare it with runningSum. Confirm what "total" is meant to hold
    # before turning this into an equality check.
    self.assertTrue(estimatorParams["movingAverage"]["total"], runningSum)

    # The estimated mean should equal the mean of the averaged records.
    self.assertWithinEpsilon(estimatorParams["distribution"]["mean"],
                             runningSum / float(len(avgRecordList)))

    # The number of points with lower than 2% probability should be pretty
    # low but not zero. An exact 2% cannot be used due to random variations.
    lowCount = numpy.sum(likelihoods < 0.02)
    self.assertLessEqual(lowCount, 66)
    self.assertGreaterEqual(lowCount, 1)
def testEstimateAnomalyLikelihoodsCategoryValues(self):
    """
    Call estimateAnomalyLikelihoods with records whose metric value is a
    string category rather than a number, and check the result sizes and
    that the returned estimator parameters are valid.
    """
    numRecords = 10
    firstTimestamp = datetime.datetime(2017, 1, 1, 0, 0, 0)
    step = datetime.timedelta(minutes=5)
    categories = ["a", "b", "c", "d", "e"] * 2

    # Each record is (timestamp, category value, raw anomaly score), with
    # timestamps five minutes apart and scores 0.0, 0.1, ..., 0.9.
    data = [
        (firstTimestamp + (i * step), categories[i], 0.1 * i)
        for i in range(numRecords)
    ]

    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(data))

    self.assertEqual(len(likelihoods), 10)
    self.assertEqual(len(avgRecordList), 10)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))
def testCaseSingleSpike(self):
    """
    No anomalies, and then a single spike is seen. The likelihood of that
    spike should be 0.
    """
    # Baseline: 1000 samples generated with spikePeriod=0.
    baseline = self._addSampleData(spikePeriod=0, numSamples=1000)
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(baseline[0:1000])

    # A single sample generated with spikePeriod=1.
    spike = self._addSampleData(numSamples=1, spikePeriod=1)
    spikeLikelihoods, _, _ = an.updateAnomalyLikelihoods(
        spike, estimatorParams)

    self.assertWithinEpsilon(spikeLikelihoods[0], 0.0)
def testCaseMissingSpike(self):
    """
    Test C: one anomaly every 20 records, but then see none. The likelihood
    at the end should be very low.
    """
    # Initial data
    data = self._addSampleData(spikePeriod=20, numSamples=1019)
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data[0:1000])

    # Now feed in none
    data = self._addSampleData(numSamples=100, spikePeriod=0)
    likelihoods2, _, _ = an.updateAnomalyLikelihoods(data, estimatorParams)

    # The likelihood once you get past the initial averaging should be very
    # low. Use the mean of everything after the first five records so the
    # divisor matches the slice length (100 samples are fed in here, so a
    # hard-coded divisor of 15 would not be an average).
    self.assertLess(likelihoods2[5:].mean(), 0.0001)
def testBadParams(self):
    """
    Call updateAnomalyLikelihoods with invalid estimator parameters and
    check that each attempt raises ValueError.
    """
    # Build a valid estimate from a single data point first.
    samples = _generateSampleData(mean=42.0, variance=1e-10,
                                  seed=self.GLOBAL_TEST_SEED)
    _, _, goodParams = an.estimateAnomalyLikelihoods(samples[0:1])
    self.assertTrue(an.isValidEstimatorParams(goodParams))

    # A dict with the wrong structure must be rejected.
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(samples, {"haha": "heehee"})

    # Something that is not a dict at all must be rejected too.
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(samples, 42.0)
def testFlatMetricScores(self):
    """
    Call estimateAnomalyLikelihoods with flat metric values. In this case
    the null distribution should be used, which gives a reasonably high
    likelihood for everything.
    """
    # Generate samples with very flat metric values
    data1 = _generateSampleData(
        metricMean=42.0, metricVariance=1e-10,
        seed=self.GLOBAL_TEST_SEED)[0:1000]

    likelihoods, _, estimatorParams = an.estimateAnomalyLikelihoods(data1)

    # Check that we do indeed get reasonable likelihood values
    self.assertEqual(len(likelihoods), len(data1))
    self.assertGreaterEqual(likelihoods.sum(), 0.4 * len(likelihoods))

    # Check that we do indeed get the null distribution, built around the
    # same mean that the estimator reported.
    self.assertDictEqual(
        estimatorParams["distribution"],
        an.nullDistribution(estimatorParams["distribution"]["mean"]))
def testUpdateAnomalyLikelihoods(self):
    """
    A slightly more complex test. This calls estimateAnomalyLikelihoods to
    estimate the distribution on fake data, followed by several calls to
    updateAnomalyLikelihoods, and finally checks that the incremental moving
    average state matches a single batch computation.
    """
    # ------------------------------------------
    # Step 1. Generate an initial estimate using fake distribution of anomaly
    # scores.
    data1 = _generateSampleData(mean=0.2, seed=self.GLOBAL_TEST_SEED)[0:1000]
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(
        data1, averagingWindow=5)

    # ------------------------------------------
    # Step 2. Generate some new data with a higher average anomaly
    # score. Using the estimator from step 1, compute likelihoods. Now we
    # should see a lot more anomalies.
    data2 = _generateSampleData(mean=0.6, seed=self.GLOBAL_TEST_SEED)[0:300]
    likelihoods2, avgRecordList2, estimatorParams2 = (
        an.updateAnomalyLikelihoods(data2, estimatorParams))
    self.assertEqual(len(likelihoods2), len(data2))
    self.assertEqual(len(avgRecordList2), len(data2))
    # Validate the parameters returned by this update, not the step 1 ones.
    self.assertTrue(an.isValidEstimatorParams(estimatorParams2))

    # The new running total should be different
    self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])

    # We should have many more samples where likelihood is < 0.01, but not
    # all
    self.assertGreaterEqual(numpy.sum(likelihoods2 < 0.01), 25)
    self.assertLessEqual(numpy.sum(likelihoods2 < 0.01), 250)

    # ------------------------------------------
    # Step 3. Generate some new data with the expected average anomaly score.
    # We should see fewer anomalies than in Step 2.
    # Note: same data properties as in step1 but different seed
    seed2 = (self.GLOBAL_TEST_SEED + 1
             if self.GLOBAL_TEST_SEED is not None else None)
    data3 = _generateSampleData(mean=0.2, seed=seed2)[0:1000]
    likelihoods3, avgRecordList3, estimatorParams3 = (
        an.updateAnomalyLikelihoods(data3, estimatorParams2))

    self.assertEqual(len(likelihoods3), len(data3))
    self.assertEqual(len(avgRecordList3), len(data3))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

    # The new running total should be different
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams2["movingAverage"]["total"])

    # We should have a small number of samples where likelihood is < 0.01,
    # but at least one
    self.assertGreaterEqual(numpy.sum(likelihoods3 < 0.01), 1)
    self.assertLessEqual(numpy.sum(likelihoods3 < 0.01), 100)

    # ------------------------------------------
    # Step 4. Validate that sending data incrementally is the same as sending
    # in one batch. Concatenate into a new list so data1 is not mutated.
    allData = data1 + data2 + data3

    # Compute moving average of all the data and check it's the same
    _, historicalValuesAll, totalAll = an._anomalyScoreMovingAverage(
        allData, windowSize=5)
    self.assertEqual(
        sum(historicalValuesAll),
        sum(estimatorParams3["movingAverage"]["historicalValues"]))
    self.assertEqual(totalAll, estimatorParams3["movingAverage"]["total"])