def testCalcScoreByThresholdReturnsExpectedScores(self):
    """Check calcScoreByThreshold against a hand-computed table of scores.

    A Sweeper with an overridden fnWeight is fed a small, hand-built list of
    AnomalyPoint rows; the list it returns must equal the ThresholdScore
    entries spelled out below, in the same order.
    """
    fnWeight = 5.0
    sweeper = Sweeper()
    sweeper.fnWeight = fnWeight

    # NOTE(review): the positional fields look like (timestamp, anomaly score,
    # sweep score, window name) -- confirm against the AnomalyPoint definition.
    points = [
        # Should never contribute to score (probationary)
        AnomalyPoint(0, 0.5, -1000, 'probationary'),
        # Should never contribute to score (probationary)
        AnomalyPoint(1, 0.5, -1000, 'probationary'),
        # Should never contribute to score (anomaly == 0.0)
        AnomalyPoint(2, 0.0, -3, None),
        # Should be used instead of next row when threshold <= 0.2
        AnomalyPoint(4, 0.2, 20, 'windowA'),
        # Should be used for windowA _until_ threshold <= 0.2
        AnomalyPoint(5, 0.3, 10, 'windowA'),
        # Only score for windowB, but won't be used until threshold <= 0.5
        AnomalyPoint(6, 0.5, 5, 'windowB'),
        AnomalyPoint(7, 0.5, -3, None),
    ]

    expected = [
        # Two windows, both false negatives at this threshold
        ThresholdScore(1.1, -2 * fnWeight, 0, 2, 0, 3, 5),
        # Both 'anomalyScore == 0.5' rows score, windowA is still FN
        ThresholdScore(0.5, 5 - 3 - fnWeight, 1, 1, 1, 2, 5),
        # Both windows now have a TP
        ThresholdScore(0.3, 5 - 3 + 10, 2, 1, 1, 1, 5),
        # windowA gets a new max value due to row 4 becoming active
        ThresholdScore(0.2, 5 - 3 + 20, 3, 1, 1, 0, 5),
        ThresholdScore(0.0, 5 - 3 + 20 - 3, 3, 0, 2, 0, 5),
    ]

    assert sweeper.calcScoreByThreshold(points) == expected
def testCalcScoreByThresholdReturnsExpectedScores(self):
    """calcScoreByThreshold must reproduce the expected per-threshold scores.

    The input rows mix probationary points, points outside any window and
    points inside two named windows; the expected list states the result
    anticipated at each distinct threshold.
    """
    fnWeight = 5.0

    underTest = Sweeper()
    underTest.fnWeight = fnWeight

    inputRows = [
        AnomalyPoint(0, 0.5, -1000, 'probationary'),  # never scores (probationary)
        AnomalyPoint(1, 0.5, -1000, 'probationary'),  # never scores (probationary)
        AnomalyPoint(2, 0.0, -3, None),               # never scores (anomaly == 0.0)
        AnomalyPoint(4, 0.2, 20, 'windowA'),          # replaces next row once threshold <= 0.2
        AnomalyPoint(5, 0.3, 10, 'windowA'),          # used for windowA _until_ threshold <= 0.2
        AnomalyPoint(6, 0.5, 5, 'windowB'),           # only windowB score; needs threshold <= 0.5
        AnomalyPoint(7, 0.5, -3, None),
    ]

    wantedScores = [
        ThresholdScore(1.1, -2 * fnWeight, 0, 2, 0, 3, 5),     # both windows are false negatives
        ThresholdScore(0.5, 5 - 3 - fnWeight, 1, 1, 1, 2, 5),  # 0.5 rows score; windowA still FN
        ThresholdScore(0.3, 5 - 3 + 10, 2, 1, 1, 1, 5),        # both windows now have a TP
        ThresholdScore(0.2, 5 - 3 + 20, 3, 1, 1, 0, 5),        # row 4 gives windowA a new max
        ThresholdScore(0.0, 5 - 3 + 20 - 3, 3, 0, 2, 0, 5),
    ]

    computedScores = underTest.calcScoreByThreshold(inputRows)
    assert computedScores == wantedScores
def optimizeThreshold(args):
    """Optimize the threshold for a given combination of detector and profile.

    @param args (tuple) Contains:

      detectorName        (string)          Name of detector.

      costMatrix          (dict)            Cost matrix to weight the true
                                            positives, false negatives, and
                                            false positives during scoring.

      resultsCorpus       (nab.Corpus)      Corpus object that holds the per
                                            record anomaly scores for a given
                                            detector.

      corpusLabel         (nab.CorpusLabel) Ground truth anomaly labels for
                                            the NAB corpus.

      probationaryPercent (float)           Percent of each data file not to
                                            be considered during scoring.

    @return (dict) Contains:

      "threshold" (float)   Threshold that returns the largest score from the
                            Objective function.

      "score"     (float)   The score from the objective function given the
                            threshold.
    """
    (detectorName,
     costMatrix,
     resultsCorpus,
     corpusLabel,
     probationaryPercent) = args

    sweeper = Sweeper(probationPercent=probationaryPercent,
                      costMatrix=costMatrix)

    # First, get the sweep-scores for each row in each data set.
    allAnomalyRows = []
    # items() instead of iteritems(): iteritems() does not exist on Python 3
    # dicts, while items() iterates the same pairs on both Python 2 and 3.
    for relativePath, dataSet in resultsCorpus.dataFiles.items():
        if "_scores.csv" in relativePath:
            continue

        # dataPath: raw dataset file,
        # e.g. 'artificialNoAnomaly/art_noisy.csv'
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        windows = corpusLabel.windows[dataPath]
        labels = corpusLabel.labels[dataPath]
        timestamps = labels['timestamp']
        anomalyScores = dataSet.data["anomaly_score"]

        curAnomalyRows = sweeper.calcSweepScore(
            timestamps, anomalyScores, windows, dataPath)
        allAnomalyRows.extend(curAnomalyRows)

    # Get scores by threshold for the entire corpus and rank them so the
    # highest score comes first.
    scoresByThreshold = sweeper.calcScoreByThreshold(allAnomalyRows)
    scoresByThreshold = sorted(
        scoresByThreshold, key=lambda x: x.score, reverse=True)
    bestParams = scoresByThreshold[0]

    print(
        "Optimizer found a max score of {} with anomaly threshold {}.".format(
            bestParams.score, bestParams.threshold))

    return {"threshold": bestParams.threshold, "score": bestParams.score}
def optimizeThreshold(args):
    """Optimize the threshold for a given combination of detector and profile.

    @param args (tuple) Contains:

      detectorName        (string)          Name of detector.

      costMatrix          (dict)            Cost matrix to weight the true
                                            positives, false negatives, and
                                            false positives during scoring.

      resultsCorpus       (nab.Corpus)      Corpus object that holds the per
                                            record anomaly scores for a given
                                            detector.

      corpusLabel         (nab.CorpusLabel) Ground truth anomaly labels for
                                            the NAB corpus.

      probationaryPercent (float)           Percent of each data file not to
                                            be considered during scoring.

    @return (dict) Contains:

      "threshold" (float)   Threshold that returns the largest score from the
                            Objective function.

      "score"     (float)   The score from the objective function given the
                            threshold.
    """
    (detectorName,
     costMatrix,
     resultsCorpus,
     corpusLabel,
     probationaryPercent) = args

    sweeper = Sweeper(
        probationPercent=probationaryPercent,
        costMatrix=costMatrix
    )

    # First, get the sweep-scores for each row in each data set.
    allAnomalyRows = []
    # dict.iteritems() was removed in Python 3; items() yields the same
    # (path, dataSet) pairs and is available on Python 2 as well.
    for relativePath, dataSet in resultsCorpus.dataFiles.items():
        if "_scores.csv" in relativePath:
            continue

        # Raw dataset file, e.g. 'artificialNoAnomaly/art_noisy.csv'. Kept in
        # its own name so the loop variable is not overwritten mid-iteration.
        rawDataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        windows = corpusLabel.windows[rawDataPath]
        labels = corpusLabel.labels[rawDataPath]
        timestamps = labels['timestamp']
        anomalyScores = dataSet.data["anomaly_score"]

        curAnomalyRows = sweeper.calcSweepScore(
            timestamps,
            anomalyScores,
            windows,
            rawDataPath
        )
        allAnomalyRows.extend(curAnomalyRows)

    # Get scores by threshold for the entire corpus; after the descending
    # sort the first entry carries the largest score.
    scoresByThreshold = sweeper.calcScoreByThreshold(allAnomalyRows)
    scoresByThreshold = sorted(
        scoresByThreshold, key=lambda x: x.score, reverse=True)
    bestParams = scoresByThreshold[0]

    print("Optimizer found a max score of {} with anomaly threshold {}.".format(
        bestParams.score, bestParams.threshold
    ))

    return {
        "threshold": bestParams.threshold,
        "score": bestParams.score
    }