def testCaseSingleSpike(self): """ No anomalies, and then you see a single spike. That spike should be an anomaly. """ an = AnomalyLikelihood(100) for _ in range(1000): an.compute(0) anom = an.compute(1) self.assertAlmostEqual(anom, 1.0, places=3)
def testCaseUnusuallyHighSpikeFrequency(self): """ Test B: one anomaly spike every 20 records. Then we suddenly get a bunch in a row. The likelihood of those spikes should be high. """ an = AnomalyLikelihood() data = self._addSampleData(spikePeriod=20, numSamples=3000) anom = [an.compute(x) for x in data] # If we continue to see the same distribution, we should get reasonable # likelihoods. max_anom = max(anom[-100:]) self.assertTrue(max_anom < threshold) # Make 20 spikes in a row. anom = [an.compute(1.0) for x in range(20)] # Check for anomaly detected. self.assertTrue(max(anom) > threshold)
def testCaseIncreasedAnomalyScore(self): """ Test F: small anomaly score every 20 records, but then a large one when you would expect a small one. This should be anomalous. """ an = AnomalyLikelihood() data = self._addSampleData(spikePeriod=20, spikeValue=0.4, numSamples=3000) anom = [an.compute(x) for x in data] # Now feed in a larger magnitude distribution. data = self._addSampleData(spikePeriod=20, spikeValue=1.0, numSamples=100) anom = [an.compute(x) for x in data] self.assertTrue(max(anom) > threshold)
def testCaseContinuousBunchesOfSpikes(self): """ Test D: bunches of anomalies every 40 records that continue. This should not be anomalous. """ an = AnomalyLikelihood() # Generate initial data data = [] for _ in range(30): data = self._addSampleData(data, spikePeriod=0, numSamples=30) data = self._addSampleData(data, spikePeriod=3, numSamples=10) anom = [an.compute(x) for x in data[:1000]] # Now feed in the same distribution data = self._addSampleData(spikePeriod=0, numSamples=30) data = self._addSampleData(data, spikePeriod=3, numSamples=10) anom = [an.compute(x) for x in data] self.assertTrue(max(anom) < threshold)
def testCaseIncreasedSpikeFrequency(self): """ Test E: bunches of anomalies every 20 records that become even more frequent. This should be anomalous. """ an = AnomalyLikelihood(500) # Generate initial data data = [] for _ in range(30): data = self._addSampleData(data, spikePeriod=0, numSamples=30) data = self._addSampleData(data, spikePeriod=3, numSamples=10) anom = [an.compute(x) for x in data[:1000]] # Now feed in a more frequent distribution data = self._addSampleData(spikePeriod=0, numSamples=30) data = self._addSampleData(data, spikePeriod=1, numSamples=10) anom = [an.compute(x) for x in data] # The likelihood should become anomalous but only near the end self.assertTrue(max(anom[0:30]) < threshold) self.assertTrue(max(anom[30:]) > threshold)
def main(parameters=default_parameters, argv=None, verbose=True): if verbose: import pprint print("Parameters:") pprint.pprint(parameters, indent=4) print("") # Read the input file. records = [] with open(_INPUT_FILE_PATH, "r") as fin: reader = csv.reader(fin) headers = next(reader) next(reader) next(reader) for record in reader: records.append(record) # Make the Encoders. These will convert input data into binary representations. dateEncoder = DateEncoder(timeOfDay=parameters["enc"]["time"]["timeOfDay"], weekend=parameters["enc"]["time"]["weekend"]) scalarEncoderParams = RDSE_Parameters() scalarEncoderParams.size = parameters["enc"]["value"]["size"] scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"] scalarEncoderParams.resolution = parameters["enc"]["value"]["resolution"] scalarEncoder = RDSE(scalarEncoderParams) encodingWidth = (dateEncoder.size + scalarEncoder.size) enc_info = Metrics([encodingWidth], 999999999) # Make the HTM. SpatialPooler & TemporalMemory & associated tools. spParams = parameters["sp"] sp = SpatialPooler(inputDimensions=(encodingWidth, ), columnDimensions=(spParams["columnCount"], ), potentialPct=spParams["potentialPct"], potentialRadius=encodingWidth, globalInhibition=True, localAreaDensity=spParams["localAreaDensity"], synPermInactiveDec=spParams["synPermInactiveDec"], synPermActiveInc=spParams["synPermActiveInc"], synPermConnected=spParams["synPermConnected"], boostStrength=spParams["boostStrength"], wrapAround=True) sp_info = Metrics(sp.getColumnDimensions(), 999999999) tmParams = parameters["tm"] tm = TemporalMemory( columnDimensions=(spParams["columnCount"], ), cellsPerColumn=tmParams["cellsPerColumn"], activationThreshold=tmParams["activationThreshold"], initialPermanence=tmParams["initialPerm"], connectedPermanence=spParams["synPermConnected"], minThreshold=tmParams["minThreshold"], maxNewSynapseCount=tmParams["newSynapseCount"], permanenceIncrement=tmParams["permanenceInc"], permanenceDecrement=tmParams["permanenceDec"], predictedSegmentDecrement=0.0, maxSegmentsPerCell=tmParams["maxSegmentsPerCell"], maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"]) tm_info = Metrics([tm.numberOfCells()], 999999999) anomaly_history = AnomalyLikelihood(parameters["anomaly"]["period"]) predictor = Predictor(steps=[1, 5], alpha=parameters["predictor"]['sdrc_alpha']) predictor_resolution = 1 # Iterate through every datum in the dataset, record the inputs & outputs. inputs = [] anomaly = [] anomalyProb = [] predictions = {1: [], 5: []} for count, record in enumerate(records): # Convert date string into Python date object. dateString = datetime.datetime.strptime(record[0], "%m/%d/%y %H:%M") # Convert data value string into float. consumption = float(record[1]) inputs.append(consumption) # Call the encoders to create bit representations for each value. These are SDR objects. dateBits = dateEncoder.encode(dateString) consumptionBits = scalarEncoder.encode(consumption) # Concatenate all these encodings into one large encoding for Spatial Pooling. encoding = SDR(encodingWidth).concatenate([consumptionBits, dateBits]) enc_info.addData(encoding) # Create an SDR to represent active columns, This will be populated by the # compute method below. It must have the same dimensions as the Spatial Pooler. activeColumns = SDR(sp.getColumnDimensions()) # Execute Spatial Pooling algorithm over input space. sp.compute(encoding, True, activeColumns) sp_info.addData(activeColumns) # Execute Temporal Memory algorithm over active mini-columns. tm.compute(activeColumns, learn=True) tm_info.addData(tm.getActiveCells().flatten()) # Predict what will happen, and then train the predictor based on what just happened. pdf = predictor.infer(tm.getActiveCells()) for n in (1, 5): if pdf[n]: predictions[n].append(np.argmax(pdf[n]) * predictor_resolution) else: predictions[n].append(float('nan')) anomaly.append(tm.anomaly) anomalyProb.append(anomaly_history.compute(tm.anomaly)) predictor.learn(count, tm.getActiveCells(), int(consumption / predictor_resolution)) # Print information & statistics about the state of the HTM. print("Encoded Input", enc_info) print("") print("Spatial Pooler Mini-Columns", sp_info) print(str(sp)) print("") print("Temporal Memory Cells", tm_info) print(str(tm)) print("") # Shift the predictions so that they are aligned with the input they predict. for n_steps, pred_list in predictions.items(): for x in range(n_steps): pred_list.insert(0, float('nan')) pred_list.pop() # Calculate the predictive accuracy, Root-Mean-Squared accuracy = {1: 0, 5: 0} accuracy_samples = {1: 0, 5: 0} for idx, inp in enumerate(inputs): for n in predictions: # For each [N]umber of time steps ahead which was predicted. val = predictions[n][idx] if not math.isnan(val): accuracy[n] += (inp - val)**2 accuracy_samples[n] += 1 for n in sorted(predictions): accuracy[n] = (accuracy[n] / accuracy_samples[n])**.5 print("Predictive Error (RMS)", n, "steps ahead:", accuracy[n]) # Show info about the anomaly (mean & std) print("Anomaly Mean", np.mean(anomaly)) print("Anomaly Std ", np.std(anomaly)) # Plot the Predictions and Anomalies. if verbose: try: import matplotlib.pyplot as plt except: print( "WARNING: failed to import matplotlib, plots cannot be shown.") return -accuracy[5] plt.subplot(2, 1, 1) plt.title("Predictions") plt.xlabel("Time") plt.ylabel("Power Consumption") plt.plot( np.arange(len(inputs)), inputs, 'red', np.arange(len(inputs)), predictions[1], 'blue', np.arange(len(inputs)), predictions[5], 'green', ) plt.legend(labels=('Input', '1 Step Prediction, Shifted 1 step', '5 Step Prediction, Shifted 5 steps')) plt.subplot(2, 1, 2) plt.title("Anomaly Score") plt.xlabel("Time") plt.ylabel("Power Consumption") inputs = np.array(inputs) / max(inputs) plt.plot( np.arange(len(inputs)), inputs, 'black', np.arange(len(inputs)), anomaly, 'blue', np.arange(len(inputs)), anomalyProb, 'red', ) plt.legend(labels=('Input', 'Instantaneous Anomaly', 'Anomaly Likelihood')) plt.show() return -accuracy[5]
class HtmcoreDetector(AnomalyDetector): """ This detector uses an HTM based anomaly detection technique. """ def __init__(self, *args, **kwargs): super(HtmcoreDetector, self).__init__(*args, **kwargs) ## API for controlling settings of htm.core HTM detector: # Set this to False if you want to get results based on raw scores # without using AnomalyLikelihood. This will give worse results, but # useful for checking the efficacy of AnomalyLikelihood. You will need # to re-optimize the thresholds when running with this setting. self.useLikelihood = True self.useSpatialAnomaly = True self.verbose = True # Add the "HTMCORE_OPTIMIZE" flag to the environment variables to use the optimization. # If present, it reads the parameters from ./params.json # If absent, it uses the global variable "default_parameters". self.use_optimization = 'HTMCORE_OPTIMIZE' in os.environ ## internal members # (listed here for easier understanding) # initialized in `initialize()` self.encTimestamp = None self.encValue = None self.sp = None self.tm = None self.anLike = None # optional debug info self.enc_info = None self.sp_info = None self.tm_info = None # internal helper variables: self.inputs_ = [] self.iteration_ = 0 def getAdditionalHeaders(self): """Returns a list of strings.""" return ["raw_score"] #TODO optional: add "prediction" def handleRecord(self, inputData): """Returns a tuple (anomalyScore, rawScore). @param inputData is a dict {"timestamp" : Timestamp(), "value" : float} @return tuple (anomalyScore, <any other fields specified in `getAdditionalHeaders()`>, ...) """ # Send it to Numenta detector and get back the results return self.modelRun(inputData["timestamp"], inputData["value"]) def initialize(self): # toggle parameters here if self.use_optimization: parameters = read_params('params.json') else: parameters = default_parameters # setup spatial anomaly if self.useSpatialAnomaly: # Keep track of value range for spatial anomaly detection self.minVal = None self.maxVal = None ## setup Enc, SP, TM, Likelihood # Make the Encoders. These will convert input data into binary representations. self.encTimestamp = DateEncoder( timeOfDay=parameters["enc"]["time"]["timeOfDay"], weekend=parameters["enc"]["time"]["weekend"]) scalarEncoderParams = RDSE_Parameters() scalarEncoderParams.size = parameters["enc"]["value"]["size"] scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"] scalarEncoderParams.resolution = parameters["enc"]["value"][ "resolution"] self.encValue = RDSE(scalarEncoderParams) encodingWidth = (self.encTimestamp.size + self.encValue.size) self.enc_info = Metrics([encodingWidth], 999999999) # Make the HTM. SpatialPooler & TemporalMemory & associated tools. # SpatialPooler spParams = parameters["sp"] self.sp = SpatialPooler( inputDimensions=(encodingWidth, ), columnDimensions=(spParams["columnCount"], ), potentialPct=spParams["potentialPct"], potentialRadius=encodingWidth, globalInhibition=True, localAreaDensity=spParams["localAreaDensity"], synPermInactiveDec=spParams["synPermInactiveDec"], synPermActiveInc=spParams["synPermActiveInc"], synPermConnected=spParams["synPermConnected"], boostStrength=spParams["boostStrength"], wrapAround=True, seed=0, ) self.sp_info = Metrics(self.sp.getColumnDimensions(), 999999999) # TemporalMemory tmParams = parameters["tm"] self.tm = TemporalMemory( columnDimensions=(spParams["columnCount"], ), cellsPerColumn=tmParams["cellsPerColumn"], activationThreshold=tmParams["activationThreshold"], initialPermanence=tmParams["initialPerm"], connectedPermanence=spParams["synPermConnected"], minThreshold=tmParams["minThreshold"], maxNewSynapseCount=tmParams["newSynapseCount"], permanenceIncrement=tmParams["permanenceInc"], permanenceDecrement=tmParams["permanenceDec"], predictedSegmentDecrement=0.0, maxSegmentsPerCell=tmParams["maxSegmentsPerCell"], maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"], externalPredictiveInputs=self.encTimestamp.size, seed=0, ) self.tm_info = Metrics([self.tm.numberOfCells()], 999999999) # setup likelihood, these settings are used in NAB if self.useLikelihood: learningPeriod = int(math.floor(self.probationaryPeriod / 2.0)) self.anomalyLikelihood = AnomalyLikelihood(learningPeriod) # Predictor # self.predictor = Predictor( steps=[1, 5], alpha=parameters["predictor"]['sdrc_alpha'] ) # predictor_resolution = 1 # initialize pandaBaker if PANDA_VIS_BAKE_DATA: self.BuildPandaSystem(self.sp, self.tm, parameters["enc"]["value"]["size"], self.encTimestamp.size) def modelRun(self, ts, val): """ Run a single pass through HTM model @params ts - Timestamp @params val - float input value @return rawAnomalyScore computed for the `val` in this step """ ## run data through our model pipeline: enc -> SP -> TM -> Anomaly self.inputs_.append(val) self.iteration_ += 1 # 1. Encoding # Call the encoders to create bit representations for each value. These are SDR objects. dateBits = self.encTimestamp.encode(ts) valueBits = self.encValue.encode(float(val)) # Concatenate all these encodings into one large encoding for Spatial Pooling. encoding = SDR(self.encTimestamp.size + self.encValue.size).concatenate([valueBits, dateBits]) self.enc_info.addData(encoding) # 2. Spatial Pooler # Create an SDR to represent active columns, This will be populated by the # compute method below. It must have the same dimensions as the Spatial Pooler. activeColumns = SDR(self.sp.getColumnDimensions()) # Execute Spatial Pooling algorithm over input space. self.sp.compute(encoding, True, activeColumns) self.sp_info.addData(activeColumns) # 3. Temporal Memory # Execute Temporal Memory algorithm over active mini-columns. # to get predictive cells we need to call activateDendrites & activateCells separately if PANDA_VIS_BAKE_DATA: # activateDendrites calculates active segments self.tm.activateDendrites(learn=True) # predictive cells are calculated directly from active segments predictiveCells = self.tm.getPredictiveCells() # activates cells in columns by TM algorithm (winners, bursting...) self.tm.activateCells(activeColumns, learn=True) else: self.tm.compute(activeColumns, learn=True) self.tm_info.addData(self.tm.getActiveCells().flatten()) # 4.1 (optional) Predictor #TODO optional #TODO optional: also return an error metric on predictions (RMSE, R2,...) # 4.2 Anomaly # handle spatial, contextual (raw, likelihood) anomalies # -Spatial spatialAnomaly = 0.0 #TODO optional: make this computed in SP (and later improve) if self.useSpatialAnomaly: # Update min/max values and check if there is a spatial anomaly if self.minVal != self.maxVal: tolerance = (self.maxVal - self.minVal) * SPATIAL_TOLERANCE maxExpected = self.maxVal + tolerance minExpected = self.minVal - tolerance if val > maxExpected or val < minExpected: spatialAnomaly = 1.0 if self.maxVal is None or val > self.maxVal: self.maxVal = val if self.minVal is None or val < self.minVal: self.minVal = val temporalAnomaly = raw = self.tm.anomaly if self.useLikelihood: temporalAnomaly = self.anomalyLikelihood.compute(temporalAnomaly) anomalyScore = max( spatialAnomaly, temporalAnomaly) # this is the "main" anomaly, compared in NAB # 5. print stats if self.verbose and self.iteration_ % 1000 == 0: # print(self.enc_info) # print(self.sp_info) # print(self.tm_info) pass # 6. panda vis if PANDA_VIS_BAKE_DATA: # ------------------HTMpandaVis---------------------- # see more about this structure at https://github.com/htm-community/HTMpandaVis/blob/master/pandaBaker/README.md # fill up values pandaBaker.inputs["Value"].stringValue = "value: {:.2f}".format( val) pandaBaker.inputs["Value"].bits = valueBits.sparse pandaBaker.inputs["TimeOfDay"].stringValue = str(ts) pandaBaker.inputs["TimeOfDay"].bits = dateBits.sparse pandaBaker.layers["Layer1"].activeColumns = activeColumns.sparse pandaBaker.layers["Layer1"].winnerCells = self.tm.getWinnerCells( ).sparse pandaBaker.layers[ "Layer1"].predictiveCells = predictiveCells.sparse pandaBaker.layers["Layer1"].activeCells = self.tm.getActiveCells( ).sparse # customizable datastreams to be show on the DASH PLOTS pandaBaker.dataStreams["rawAnomaly"].value = temporalAnomaly pandaBaker.dataStreams["value"].value = val pandaBaker.dataStreams["numberOfWinnerCells"].value = len( self.tm.getWinnerCells().sparse) pandaBaker.dataStreams["numberOfPredictiveCells"].value = len( predictiveCells.sparse) pandaBaker.dataStreams[ "valueInput_sparsity"].value = valueBits.getSparsity() pandaBaker.dataStreams[ "dateInput_sparsity"].value = dateBits.getSparsity() pandaBaker.dataStreams[ "Layer1_SP_overlap_metric"].value = self.sp_info.overlap.overlap pandaBaker.dataStreams[ "Layer1_TM_overlap_metric"].value = self.sp_info.overlap.overlap pandaBaker.dataStreams[ "Layer1_SP_activation_frequency"].value = self.sp_info.activationFrequency.mean( ) pandaBaker.dataStreams[ "Layer1_TM_activation_frequency"].value = self.tm_info.activationFrequency.mean( ) pandaBaker.dataStreams[ "Layer1_SP_entropy"].value = self.sp_info.activationFrequency.mean( ) pandaBaker.dataStreams[ "Layer1_TM_entropy"].value = self.tm_info.activationFrequency.mean( ) pandaBaker.StoreIteration(self.iteration_ - 1) print("ITERATION: " + str(self.iteration_ - 1)) # ------------------HTMpandaVis---------------------- return (anomalyScore, raw) # with this method, the structure for visualization is defined def BuildPandaSystem(self, sp, tm, consumptionBits_size, dateBits_size): # we have two inputs connected to proximal synapses of Layer1 pandaBaker.inputs["Value"] = cInput(consumptionBits_size) pandaBaker.inputs["TimeOfDay"] = cInput(dateBits_size) pandaBaker.layers["Layer1"] = cLayer( sp, tm) # Layer1 has Spatial Pooler & Temporal Memory pandaBaker.layers["Layer1"].proximalInputs = [ "Value", "TimeOfDay", ] pandaBaker.layers["Layer1"].distalInputs = ["Layer1"] # data for dash plots streams = [ "rawAnomaly", "value", "numberOfWinnerCells", "numberOfPredictiveCells", "valueInput_sparsity", "dateInput_sparsity", "Layer1_SP_overlap_metric", "Layer1_TM_overlap_metric", "Layer1_SP_activation_frequency", "Layer1_TM_activation_frequency", "Layer1_SP_entropy", "Layer1_TM_entropy" ] pandaBaker.dataStreams = dict( (name, cDataStream()) for name in streams) # create dicts for more comfortable code # could be also written like: pandaBaker.dataStreams["myStreamName"] = cDataStream() pandaBaker.PrepareDatabase()