Example #1
0
File: scorer.py Project: imclab/NAB
def scoreCorpus(threshold, args):
  """Scores the corpus given a detector and a user profile.

  Scores the corpus in parallel, one task per data file.

  @param threshold  (float)   Threshold value to convert an anomaly score value
                              to a detection.

  @param args       (tuple)   Contains:

    pool                (multiprocessing.Pool)  Pool of processes to perform
                                                tasks in parallel.
    detector            (string)                Name of detector.

    username            (string)                Name of scoring profile.

    costMatrix          (dict)                  Cost matrix to weight the
                                                true positives, false
                                                negatives, and false positives
                                                during scoring.
    resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                per record anomaly scores for
                                                a given detector.
    corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels
                                                for the NAB corpus.
    probationaryPercent (float)                 Percent of each data file not
                                                to be considered during
                                                scoring.

  @return (list) Results of scoreDataSet for each data file in the corpus.
  """
  (pool,
   detector,
   username,
   costMatrix,
   resultsCorpus,
   corpusLabel,
   probationaryPercent) = args

  args = []
  for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
    # Skip the detector's aggregate scores file; it is not a data set.
    if relativePath == detector + "_scores.csv":
      continue

    # Map the results path back to the raw dataset path, which is the key
    # into the corpus labels and windows.
    relativePath = convertResultsPathToDataPath(
      os.path.join(detector, relativePath))

    windows = corpusLabel.windows[relativePath]
    labels = corpusLabel.labels[relativePath]

    # Number of leading records excluded from scoring.
    probationaryPeriod = math.floor(
      probationaryPercent * labels.shape[0])

    predicted = convertAnomalyScoresToDetections(
      dataSet.data["anomaly_score"], threshold)

    args.append((
      detector,
      username,
      relativePath,
      threshold,
      predicted,
      windows,
      labels,
      costMatrix,
      probationaryPeriod))

  results = pool.map(scoreDataSet, args)

  return results
Example #2
0
def optimizeThreshold(args):
    """Optimize the detection threshold for one detector/profile combination.

    @param args       (tuple)   Contains:

      detectorName        (string)           Name of detector.

      costMatrix          (dict)             Cost matrix to weight the true
                                             positives, false negatives, and
                                             false positives during scoring.
      resultsCorpus       (nab.Corpus)       Corpus object that holds the per
                                             record anomaly scores for a given
                                             detector.
      corpusLabel         (nab.CorpusLabel)  Ground truth anomaly labels for
                                             the NAB corpus.
      probationaryPercent (float)            Percent of each data file not to
                                             be considered during scoring.

    @return (dict) Contains:
          "threshold" (float)   Threshold that returns the largest score from
                                the objective function.

          "score"     (float)   The score from the objective function given
                                the threshold.
    """
    (detectorName, costMatrix, resultsCorpus, corpusLabel,
     probationaryPercent) = args

    sweeper = Sweeper(probationPercent=probationaryPercent,
                      costMatrix=costMatrix)

    # Collect the per-row sweep scores across every data file in the corpus.
    sweepRows = []
    for resultsFile, dataSet in resultsCorpus.dataFiles.iteritems():
        # Aggregate score files are not raw data sets; skip them.
        if "_scores.csv" in resultsFile:
            continue

        # Raw dataset path, e.g. 'artificialNoAnomaly/art_noisy.csv', used to
        # key the ground-truth labels and windows.
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, resultsFile))

        labels = corpusLabel.labels[dataPath]
        sweepRows.extend(
            sweeper.calcSweepScore(labels['timestamp'],
                                   dataSet.data["anomaly_score"],
                                   corpusLabel.windows[dataPath],
                                   dataPath))

    # Evaluate the objective function at every candidate threshold for the
    # whole corpus and keep the best one (first occurrence wins on ties,
    # matching a stable descending sort).
    candidates = sweeper.calcScoreByThreshold(sweepRows)
    bestParams = max(candidates, key=lambda candidate: candidate.score)

    print(
        "Optimizer found a max score of {} with anomaly threshold {}.".format(
            bestParams.score, bestParams.threshold))

    return {"threshold": bestParams.threshold, "score": bestParams.score}
Example #3
0
def scoreCorpus(threshold, args):
  """Scores the corpus given a detector's results and a user profile.

  The per-file scoring work is distributed over a process pool.

  @param threshold  (float)   Threshold value to convert an anomaly score value
                              to a detection.

  @param args       (tuple)   Contains:

    pool                (multiprocessing.Pool)  Pool of processes to perform
                                                tasks in parallel.
    detectorName        (string)                Name of detector.

    profileName         (string)                Name of scoring profile.

    costMatrix          (dict)                  Cost matrix to weight the
                                                true positives, false
                                                negatives, and false positives
                                                during scoring.
    resultsDetectorDir  (string)                Directory for the results CSVs.

    resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                per record anomaly scores for
                                                a given detector.
    corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels
                                                for the NAB corpus.
    probationaryPercent (float)                 Percent of each data file not
                                                to be considered during
                                                scoring.
    scoreFlag           (bool)                  Flag forwarded to scoreDataSet.

  @return (pandas.DataFrame) One row per data file plus a final "Totals" row
          summing the six scoring metrics.
  """
  (pool, detectorName, profileName, costMatrix, resultsDetectorDir,
   resultsCorpus, corpusLabel, probationaryPercent, scoreFlag) = args

  tasks = []
  for resultsFile, dataSet in resultsCorpus.dataFiles.iteritems():
    # Aggregate score files are not raw data sets; skip them.
    if "_scores.csv" in resultsFile:
      continue

    # dataPath: raw dataset file, e.g. 'artificialNoAnomaly/art_noisy.csv',
    # used as the key into the corpus labels and windows.
    dataPath = convertResultsPathToDataPath(
      os.path.join(detectorName, resultsFile))

    # outputPath: dataset results file, e.g.
    # 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
    subdir, baseName = os.path.split(dataPath)
    outputPath = os.path.join(resultsDetectorDir, subdir,
                              detectorName + "_" + baseName)

    windows = corpusLabel.windows[dataPath]
    labels = corpusLabel.labels[dataPath]

    # Records inside the probationary period are excluded from scoring.
    probationaryPeriod = math.floor(probationaryPercent * labels.shape[0])

    predicted = convertAnomalyScoresToDetections(
      dataSet.data["anomaly_score"], threshold)

    tasks.append((detectorName, profileName, dataPath, outputPath, threshold,
                  predicted, windows, labels, costMatrix, probationaryPeriod,
                  scoreFlag))

  results = pool.map(scoreDataSet, tasks)

  # Sum the six scoring metrics (row fields 4..9) across all data files; the
  # first three total columns (Detector, Profile, File) stay empty.
  totals = [None] * 3 + [0] * 6
  for row in results:
    for offset in xrange(6):
      totals[offset + 3] += row[offset + 4]
  results.append(["Totals"] + totals)

  return pandas.DataFrame(data=results,
                          columns=("Detector", "Profile", "File", "Threshold",
                                   "Score", "TP", "TN", "FP", "FN",
                                   "Total_Count"))
def scoreCorpus(threshold, args):
    """Score every data file in the corpus for one detector and profile.

    Each data file becomes one task handed to the process pool.

    @param threshold  (float)   Threshold value to convert an anomaly score
                                value to a detection.

    @param args       (tuple)   Contains:

      pool                (multiprocessing.Pool)  Pool of processes to perform
                                                  tasks in parallel.
      detectorName        (string)                Name of detector.

      profileName         (string)                Name of scoring profile.

      costMatrix          (dict)                  Cost matrix to weight the
                                                  true positives, false
                                                  negatives, and false
                                                  positives during scoring.
      resultsDetectorDir  (string)                Directory for the results
                                                  CSVs.
      resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                  per record anomaly scores
                                                  for a given detector.
      corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels
                                                  for the NAB corpus.
      probationaryPercent (float)                 Percent of each data file
                                                  not to be considered during
                                                  scoring.
      scoreFlag           (bool)                  Flag forwarded to
                                                  scoreDataSet.

    @return (pandas.DataFrame) Per-file score rows plus a final "Totals" row.
    """
    (pool, detectorName, profileName, costMatrix, resultsDetectorDir,
     resultsCorpus, corpusLabel, probationaryPercent, scoreFlag) = args

    scoringTasks = []
    for resultsFile, dataSet in resultsCorpus.dataFiles.iteritems():
        # Aggregate score files are not raw data sets; skip them.
        if "_scores.csv" in resultsFile:
            continue

        # Raw dataset path, e.g. 'artificialNoAnomaly/art_noisy.csv', which
        # keys the ground-truth labels and windows.
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, resultsFile))

        # Dataset results file, e.g.
        # 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
        directory, baseName = os.path.split(dataPath)
        outputPath = os.path.join(resultsDetectorDir, directory,
                                  "_".join([detectorName, baseName]))

        windows = corpusLabel.windows[dataPath]
        labels = corpusLabel.labels[dataPath]

        probationaryPeriod = getProbationPeriod(probationaryPercent,
                                                labels.shape[0])

        detections = convertAnomalyScoresToDetections(
            dataSet.data["anomaly_score"], threshold)

        scoringTasks.append((detectorName, profileName, dataPath, outputPath,
                             threshold, detections, windows, labels,
                             costMatrix, probationaryPeriod, scoreFlag))

    results = pool.map(scoreDataSet, scoringTasks)

    # Build the "Totals" row: sum the six scoring metrics over all files; the
    # Detector/Profile/File columns carry no total.
    totals = [None, None, None, 0, 0, 0, 0, 0, 0]
    for row in results:
        for col in xrange(3, 9):
            totals[col] += row[col + 1]
    results.append(["Totals"] + totals)

    return pandas.DataFrame(data=results,
                            columns=("Detector", "Profile", "File",
                                     "Threshold", "Score", "TP", "TN", "FP",
                                     "FN", "Total_Count"))
Example #5
0
def optimizeThreshold(args):
  """Find the anomaly threshold maximizing the objective function.

  @param args       (tuple)   Contains:

    detectorName        (string)                Name of detector.

    costMatrix          (dict)                  Cost matrix to weight the
                                                true positives, false
                                                negatives, and false positives
                                                during scoring.
    resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                per record anomaly scores for
                                                a given detector.
    corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels
                                                for the NAB corpus.
    probationaryPercent (float)                 Percent of each data file not
                                                to be considered during
                                                scoring.

  @return (dict) Contains:
        "threshold" (float)   Threshold that returns the largest score from
                              the objective function.

        "score"     (float)   The score from the objective function given the
                              threshold.
  """
  (detectorName, costMatrix, resultsCorpus, corpusLabel,
   probationaryPercent) = args

  sweeper = Sweeper(probationPercent=probationaryPercent,
                    costMatrix=costMatrix)

  # Gather the sweep scores for every row of every (non-score) data file.
  rowScores = []
  for resultsFile, dataSet in resultsCorpus.dataFiles.iteritems():
    if "_scores.csv" in resultsFile:
      continue

    # Raw dataset path, e.g. 'artificialNoAnomaly/art_noisy.csv', keying the
    # ground-truth labels and windows.
    dataPath = convertResultsPathToDataPath(
      os.path.join(detectorName, resultsFile))

    labels = corpusLabel.labels[dataPath]
    rowScores.extend(sweeper.calcSweepScore(labels['timestamp'],
                                            dataSet.data["anomaly_score"],
                                            corpusLabel.windows[dataPath],
                                            dataPath))

  # Rank the candidate thresholds for the whole corpus, best score first.
  ranked = sorted(sweeper.calcScoreByThreshold(rowScores),
                  key=lambda candidate: candidate.score,
                  reverse=True)
  bestParams = ranked[0]

  print("Optimizer found a max score of {} with anomaly threshold {}.".format(
    bestParams.score, bestParams.threshold
  ))

  return {
    "threshold": bestParams.threshold,
    "score": bestParams.score
  }