class UnivHTMDetector(object):
  """
  This detector uses an HTM based anomaly detection technique.
  """

  def __init__(self, name, probationaryPeriod, smoothingKernelSize, htmParams=None, verbose=False):
    self.useSpatialAnomaly = True
    self.verbose = verbose
    self.name = name  # for logging
    self.probationaryPeriod = probationaryPeriod
    self.parameters = parameters_best

    self.minVal = None
    self.maxVal = None
    self.spatial_tolerance = None

    self.encTimestamp = None
    self.encValue = None
    self.sp = None
    self.tm = None
    self.anomalyLikelihood = None

    # optional debug info
    self.enc_info = None
    self.sp_info = None
    self.tm_info = None

    # for initialization
    self.init_data = []
    self.is_initialized = False
    self.iteration_ = 0

    # for smoothing with gaussian
    self.historic_raw_anomaly_scores = deque(maxlen=smoothingKernelSize)
    self.kernel = None
    self.learningPeriod = None

  def initialize(self, input_min=0, input_max=0):
    # setup spatial anomaly
    if self.useSpatialAnomaly:
      self.spatial_tolerance = self.parameters["spatial_tolerance"]

    ## setup Enc, SP, TM
    # Make the Encoders. These will convert input data into binary representations.
    self.encTimestamp = DateEncoder(timeOfDay=self.parameters["enc"]["time"]["timeOfDay"])

    scalarEncoderParams = RDSE_Parameters()
    scalarEncoderParams.size = self.parameters["enc"]["value"]["size"]
    scalarEncoderParams.activeBits = self.parameters["enc"]["value"]["activeBits"]
    scalarEncoderParams.resolution = max(0.001, (input_max - input_min) / 130)
    scalarEncoderParams.seed = self.parameters["enc"]["value"]["seed"]

    self.encValue = RDSE(scalarEncoderParams)
    encodingWidth = (self.encTimestamp.size + self.encValue.size)
    self.enc_info = Metrics([encodingWidth], 999999999)

    # Make the HTM. SpatialPooler & TemporalMemory & associated tools.
    # SpatialPooler
    spParams = self.parameters["sp"]
    self.sp = SpatialPooler(
      inputDimensions=(encodingWidth,),
      columnDimensions=(spParams["columnDimensions"],),
      potentialRadius=encodingWidth,
      potentialPct=spParams["potentialPct"],
      globalInhibition=spParams["globalInhibition"],
      localAreaDensity=spParams["localAreaDensity"],
      numActiveColumnsPerInhArea=spParams["numActiveColumnsPerInhArea"],
      stimulusThreshold=spParams["stimulusThreshold"],
      synPermInactiveDec=spParams["synPermInactiveDec"],
      synPermActiveInc=spParams["synPermActiveInc"],
      synPermConnected=spParams["synPermConnected"],
      boostStrength=spParams["boostStrength"],
      wrapAround=spParams["wrapAround"],
      minPctOverlapDutyCycle=spParams["minPctOverlapDutyCycle"],
      dutyCyclePeriod=spParams["dutyCyclePeriod"],
      seed=spParams["seed"],
    )
    self.sp_info = Metrics(self.sp.getColumnDimensions(), 999999999)

    # TemporalMemory
    tmParams = self.parameters["tm"]
    self.tm = TemporalMemory(
      columnDimensions=(spParams["columnDimensions"],),
      cellsPerColumn=tmParams["cellsPerColumn"],
      activationThreshold=tmParams["activationThreshold"],
      initialPermanence=tmParams["initialPermanence"],
      connectedPermanence=tmParams["connectedPermanence"],
      minThreshold=tmParams["minThreshold"],
      maxNewSynapseCount=tmParams["maxNewSynapseCount"],
      permanenceIncrement=tmParams["permanenceIncrement"],
      permanenceDecrement=tmParams["permanenceDecrement"],
      predictedSegmentDecrement=tmParams["predictedSegmentDecrement"],
      maxSegmentsPerCell=tmParams["maxSegmentsPerCell"],
      maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"],
      seed=tmParams["seed"]
    )
    self.tm_info = Metrics([self.tm.numberOfCells()], 999999999)

    anParams = self.parameters["anomaly"]["likelihood"]
    self.learningPeriod = int(math.floor(self.probationaryPeriod / 2.0))
    self.anomalyLikelihood = AnomalyLikelihood(
      learningPeriod=self.learningPeriod,
      estimationSamples=self.probationaryPeriod - self.learningPeriod,
      reestimationPeriod=anParams["reestimationPeriod"])

    self.kernel = self._gauss_kernel(self.historic_raw_anomaly_scores.maxlen,
                                     self.historic_raw_anomaly_scores.maxlen)

  def modelRun(self, ts, val):
    """
    Run a single pass through the HTM model.
    @param ts  - Timestamp
    @param val - float input value
    @return rawAnomalyScore computed for the `val` in this step
    """
    self.iteration_ += 1

    # 0. During the probationary period, gather the data and return 0.01.
    if self.iteration_ <= self.probationaryPeriod:
      self.init_data.append((ts, val))
      return 0.01

    if self.is_initialized is False:
      if self.verbose:
        print("[{}] Initializing".format(self.name))
      temp_iteration = self.iteration_
      vals = [i[1] for i in self.init_data]
      self.initialize(input_min=min(vals), input_max=max(vals))
      self.is_initialized = True
      for ts, val in self.init_data:
        self.modelRun(ts, val)
      self.iteration_ = temp_iteration
      if self.verbose:
        print("[{}] Initialization done".format(self.name))

    ## run data through our model pipeline: enc -> SP -> TM -> Anomaly
    # 1. Encoding
    # Call the encoders to create bit representations for each value. These are SDR objects.
    dateBits = self.encTimestamp.encode(ts)
    valueBits = self.encValue.encode(float(val))
    # Concatenate all these encodings into one large encoding for Spatial Pooling.
    encoding = SDR(self.encTimestamp.size + self.encValue.size).concatenate([valueBits, dateBits])
    self.enc_info.addData(encoding)

    # 2. Spatial Pooler
    # Create an SDR to represent active columns. This will be populated by the
    # compute method below. It must have the same dimensions as the Spatial Pooler.
    activeColumns = SDR(self.sp.getColumnDimensions())
    # Execute Spatial Pooling algorithm over input space.
    self.sp.compute(encoding, True, activeColumns)
    self.sp_info.addData(activeColumns)

    # 3. Temporal Memory
    # Execute Temporal Memory algorithm over active mini-columns.
    self.tm.compute(activeColumns, learn=True)
    self.tm_info.addData(self.tm.getActiveCells().flatten())

    # 4. Anomaly
    # handle spatial and contextual (raw, likelihood) anomalies
    # -Spatial
    spatialAnomaly = 0.0
    if self.useSpatialAnomaly:
      # Update min/max values and check if there is a spatial anomaly
      if self.minVal != self.maxVal:
        tolerance = (self.maxVal - self.minVal) * self.spatial_tolerance
        maxExpected = self.maxVal + tolerance
        minExpected = self.minVal - tolerance
        if val > maxExpected or val < minExpected:
          spatialAnomaly = 1.0
      if self.maxVal is None or val > self.maxVal:
        self.maxVal = val
      if self.minVal is None or val < self.minVal:
        self.minVal = val

    # -Temporal
    raw = self.tm.anomaly
    like = self.anomalyLikelihood.anomalyProbability(val, raw, ts)
    logScore = self.anomalyLikelihood.computeLogLikelihood(like)
    temporalAnomaly = logScore

    anomalyScore = max(spatialAnomaly, temporalAnomaly)  # this is the "main" anomaly, compared in NAB

    # 5. Apply smoothing
    self.historic_raw_anomaly_scores.append(anomalyScore)
    historic_scores = np.asarray(self.historic_raw_anomaly_scores)
    convolved = np.convolve(historic_scores, self.kernel, 'valid')
    anomalyScore = convolved[-1]

    return anomalyScore

  @staticmethod
  def estimateNormal(sampleData, performLowerBoundCheck=True):
    """
    :param sampleData:
    :type sampleData: Numpy array.
    :param performLowerBoundCheck:
    :type performLowerBoundCheck: bool
    :returns: A tuple ``(mean, variance, stdev)`` describing the normal
      distribution fitted to ``sampleData``.
    """
    mean = np.mean(sampleData)
    variance = np.var(sampleData)
    st_dev = 0

    if performLowerBoundCheck:
      # Handle edge case of almost no deviations and super low anomaly scores. We
      # find that such low anomaly means can happen, but then the slightest blip
      # of anomaly score can cause the likelihood to jump up to red.
      if mean < 0.03:
        mean = 0.03
      # Catch-all for super low variance to handle numerical precision issues.
      if variance < 0.0003:
        variance = 0.0003

    # Compute standard deviation
    if variance > 0:
      st_dev = math.sqrt(variance)

    return mean, variance, st_dev

  @staticmethod
  def _calcSkipRecords(numIngested, windowSize, learningPeriod):
    """Return the value of skipRecords for passing to estimateAnomalyLikelihoods.

    If `windowSize` is very large (bigger than the amount of data) then this
    could just return `learningPeriod`. But when some values have fallen out of
    the historical sliding window of anomaly records, then we have to take those
    into account as well, so we return the `learningPeriod` minus the number
    shifted out.

    :param numIngested - (int) number of data points that have been added to the
      sliding window of historical data points.
    :param windowSize - (int) size of sliding window of historical data points.
    :param learningPeriod - (int) the number of iterations required for the
      algorithm to learn the basic patterns in the dataset and for the anomaly
      score to 'settle down'.
    """
    numShiftedOut = max(0, numIngested - windowSize)
    return min(numIngested, max(0, learningPeriod - numShiftedOut))

  @staticmethod
  def _gauss_kernel(std, size):
    def _norm_pdf(x, mean, sd):
      var = float(sd) ** 2
      denom = (2 * math.pi * var) ** .5
      num = math.exp(-(float(x) - float(mean)) ** 2 / (2 * var))
      return num / denom

    kernel = [2 * _norm_pdf(idx, 0, std) for idx in list(range(-size + 1, 1))]
    kernel = np.array(kernel)
    kernel = np.flip(kernel)
    kernel = kernel / sum(kernel)
    return kernel
#
# Run the encoder and measure some statistics about its output.
#
if args.category:
  n_samples = int(args.maximum - args.minimum + 1)
else:
  n_samples = (args.maximum - args.minimum) / enc.parameters.resolution
  oversample = 2  # Use more samples than needed to avoid aliasing & artifacts.
  n_samples = int(round(oversample * n_samples))
sdrs = []
for i in np.linspace(args.minimum, args.maximum, n_samples):
  sdrs.append(enc.encode(i))

M = Metrics([enc.size], len(sdrs) + 1)
for s in sdrs:
  M.addData(s)
print("Statistics:")
print("Encoded %d inputs." % len(sdrs))
print("Output " + str(M))

#
# Plot the Receptive Field of each bit in the encoder.
#
import matplotlib.pyplot as plt
if 'matplotlib.pyplot' in modules:  # `modules` is presumably sys.modules, imported elsewhere in this script
  rf = np.zeros([enc.size, len(sdrs)], dtype=np.uint8)
  for i in range(len(sdrs)):
    rf[:, i] = sdrs[i].dense
  plt.imshow(rf, interpolation='nearest')
  plt.title("RDSE Receptive Fields")
  plt.ylabel("Cell Number")
class HtmcoreDetector(AnomalyDetector):
  """
  This detector uses an HTM based anomaly detection technique.
  """

  def __init__(self, *args, **kwargs):
    super(HtmcoreDetector, self).__init__(*args, **kwargs)

    ## API for controlling settings of htm.core HTM detector:

    # Set this to False if you want to get results based on raw scores
    # without using AnomalyLikelihood. This will give worse results, but
    # is useful for checking the efficacy of AnomalyLikelihood. You will need
    # to re-optimize the thresholds when running with this setting.
    self.useLikelihood = True
    self.useSpatialAnomaly = True
    self.verbose = True

    # Set this to true if you want to use the optimization.
    # If true, it reads the parameters from ./params.json
    # If false, it reads the parameters from ./best_params.json
    self.use_optimization = False

    ## internal members
    # (listed here for easier understanding)
    # initialized in `initialize()`
    self.encTimestamp = None
    self.encValue = None
    self.sp = None
    self.tm = None
    self.anLike = None
    # optional debug info
    self.enc_info = None
    self.sp_info = None
    self.tm_info = None
    # internal helper variables:
    self.inputs_ = []
    self.iteration_ = 0

  def getAdditionalHeaders(self):
    """Returns a list of strings."""
    return ["raw_score"]  # TODO optional: add "prediction"

  def handleRecord(self, inputData):
    """Returns a tuple (anomalyScore, rawScore).

    @param inputData is a dict {"timestamp" : Timestamp(), "value" : float}

    @return tuple (anomalyScore, <any other fields specified in `getAdditionalHeaders()`>, ...)
    """
    # Send it to Numenta detector and get back the results
    return self.modelRun(inputData["timestamp"], inputData["value"])

  def initialize(self):
    # toggle parameters here
    if self.use_optimization:
      parameters = get_params('params.json')
    else:
      parameters = parameters_numenta_comparable

    # setup spatial anomaly
    if self.useSpatialAnomaly:
      # Keep track of value range for spatial anomaly detection
      self.minVal = None
      self.maxVal = None

    ## setup Enc, SP, TM, Likelihood
    # Make the Encoders. These will convert input data into binary representations.
    self.encTimestamp = DateEncoder(timeOfDay=parameters["enc"]["time"]["timeOfDay"],
                                    weekend=parameters["enc"]["time"]["weekend"])

    scalarEncoderParams = RDSE_Parameters()
    scalarEncoderParams.size = parameters["enc"]["value"]["size"]
    scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"]
    scalarEncoderParams.resolution = parameters["enc"]["value"]["resolution"]

    self.encValue = RDSE(scalarEncoderParams)
    encodingWidth = (self.encTimestamp.size + self.encValue.size)
    self.enc_info = Metrics([encodingWidth], 999999999)

    # Make the HTM. SpatialPooler & TemporalMemory & associated tools.
    # SpatialPooler
    spParams = parameters["sp"]
    self.sp = SpatialPooler(
      inputDimensions=(encodingWidth,),
      columnDimensions=(spParams["columnCount"],),
      potentialPct=spParams["potentialPct"],
      potentialRadius=encodingWidth,
      globalInhibition=True,
      localAreaDensity=spParams["localAreaDensity"],
      synPermInactiveDec=spParams["synPermInactiveDec"],
      synPermActiveInc=spParams["synPermActiveInc"],
      synPermConnected=spParams["synPermConnected"],
      boostStrength=spParams["boostStrength"],
      wrapAround=True
    )
    self.sp_info = Metrics(self.sp.getColumnDimensions(), 999999999)

    # TemporalMemory
    tmParams = parameters["tm"]
    self.tm = TemporalMemory(
      columnDimensions=(spParams["columnCount"],),
      cellsPerColumn=tmParams["cellsPerColumn"],
      activationThreshold=tmParams["activationThreshold"],
      initialPermanence=tmParams["initialPerm"],
      connectedPermanence=spParams["synPermConnected"],
      minThreshold=tmParams["minThreshold"],
      maxNewSynapseCount=tmParams["newSynapseCount"],
      permanenceIncrement=tmParams["permanenceInc"],
      permanenceDecrement=tmParams["permanenceDec"],
      predictedSegmentDecrement=0.0,
      maxSegmentsPerCell=tmParams["maxSegmentsPerCell"],
      maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"]
    )
    self.tm_info = Metrics([self.tm.numberOfCells()], 999999999)

    # setup likelihood, these settings are used in NAB
    if self.useLikelihood:
      anParams = parameters["anomaly"]["likelihood"]
      learningPeriod = int(math.floor(self.probationaryPeriod / 2.0))
      self.anomalyLikelihood = AnomalyLikelihood(
        learningPeriod=learningPeriod,
        estimationSamples=self.probationaryPeriod - learningPeriod,
        reestimationPeriod=anParams["reestimationPeriod"])

    # Predictor
    # self.predictor = Predictor( steps=[1, 5], alpha=parameters["predictor"]['sdrc_alpha'] )
    # predictor_resolution = 1

    # initialize pandaBaker
    if PANDA_VIS_BAKE_DATA:
      self.BuildPandaSystem(self.sp, self.tm, parameters["enc"]["value"]["size"], self.encTimestamp.size)

  def modelRun(self, ts, val):
    """
    Run a single pass through the HTM model.
    @params ts - Timestamp
    @params val - float input value
    @return rawAnomalyScore computed for the `val` in this step
    """
    ## run data through our model pipeline: enc -> SP -> TM -> Anomaly
    self.inputs_.append(val)
    self.iteration_ += 1

    # 1. Encoding
    # Call the encoders to create bit representations for each value. These are SDR objects.
    dateBits = self.encTimestamp.encode(ts)
    valueBits = self.encValue.encode(float(val))
    # Concatenate all these encodings into one large encoding for Spatial Pooling.
    encoding = SDR(self.encTimestamp.size + self.encValue.size).concatenate([valueBits, dateBits])
    self.enc_info.addData(encoding)

    # 2. Spatial Pooler
    # Create an SDR to represent active columns. This will be populated by the
    # compute method below. It must have the same dimensions as the Spatial Pooler.
    activeColumns = SDR(self.sp.getColumnDimensions())
    # Execute Spatial Pooling algorithm over input space.
    self.sp.compute(encoding, True, activeColumns)
    self.sp_info.addData(activeColumns)

    # 3. Temporal Memory
    # Execute Temporal Memory algorithm over active mini-columns.
    # To get predictive cells we need to call activateDendrites & activateCells separately.
    if PANDA_VIS_BAKE_DATA:
      # activateDendrites calculates active segments
      self.tm.activateDendrites(learn=True)
      # predictive cells are calculated directly from active segments
      predictiveCells = self.tm.getPredictiveCells()
      # activates cells in columns by TM algorithm (winners, bursting...)
      self.tm.activateCells(activeColumns, learn=True)
    else:
      self.tm.compute(activeColumns, learn=True)

    self.tm_info.addData(self.tm.getActiveCells().flatten())

    # 4.1 (optional) Predictor  # TODO optional
    # TODO optional: also return an error metric on predictions (RMSE, R2, ...)

    # 4.2 Anomaly
    # handle spatial, contextual (raw, likelihood) anomalies
    # -Spatial
    spatialAnomaly = 0.0  # TODO optional: make this computed in SP (and later improve)
    if self.useSpatialAnomaly:
      # Update min/max values and check if there is a spatial anomaly
      if self.minVal != self.maxVal:
        tolerance = (self.maxVal - self.minVal) * SPATIAL_TOLERANCE
        maxExpected = self.maxVal + tolerance
        minExpected = self.minVal - tolerance
        if val > maxExpected or val < minExpected:
          spatialAnomaly = 1.0
      if self.maxVal is None or val > self.maxVal:
        self.maxVal = val
      if self.minVal is None or val < self.minVal:
        self.minVal = val

    # -temporal (raw)
    raw = self.tm.anomaly
    temporalAnomaly = raw

    if self.useLikelihood:
      # Compute log(anomaly likelihood)
      like = self.anomalyLikelihood.anomalyProbability(val, raw, ts)
      logScore = self.anomalyLikelihood.computeLogLikelihood(like)
      temporalAnomaly = logScore
      # TODO optional: TM to provide anomaly {none, raw, likelihood}, compare correctness with the py anomaly_likelihood

    anomalyScore = max(spatialAnomaly, temporalAnomaly)  # this is the "main" anomaly, compared in NAB

    # 5. print stats
    if self.verbose and self.iteration_ % 1000 == 0:
      # print(self.enc_info)
      # print(self.sp_info)
      # print(self.tm_info)
      pass

    # 6. panda vis
    if PANDA_VIS_BAKE_DATA:
      # ------------------HTMpandaVis----------------------
      # see more about this structure at
      # https://github.com/htm-community/HTMpandaVis/blob/master/pandaBaker/README.md
      # fill up values
      pandaBaker.inputs["Value"].stringValue = "value: {:.2f}".format(val)
      pandaBaker.inputs["Value"].bits = valueBits.sparse

      pandaBaker.inputs["TimeOfDay"].stringValue = str(ts)
      pandaBaker.inputs["TimeOfDay"].bits = dateBits.sparse

      pandaBaker.layers["Layer1"].activeColumns = activeColumns.sparse
      pandaBaker.layers["Layer1"].winnerCells = self.tm.getWinnerCells().sparse
      pandaBaker.layers["Layer1"].predictiveCells = predictiveCells.sparse
      pandaBaker.layers["Layer1"].activeCells = self.tm.getActiveCells().sparse

      # customizable datastreams to be shown on the DASH PLOTS
      pandaBaker.dataStreams["rawAnomaly"].value = temporalAnomaly
      pandaBaker.dataStreams["value"].value = val
      pandaBaker.dataStreams["numberOfWinnerCells"].value = len(self.tm.getWinnerCells().sparse)
      pandaBaker.dataStreams["numberOfPredictiveCells"].value = len(predictiveCells.sparse)
      pandaBaker.dataStreams["valueInput_sparsity"].value = valueBits.getSparsity()
      pandaBaker.dataStreams["dateInput_sparsity"].value = dateBits.getSparsity()

      pandaBaker.dataStreams["Layer1_SP_overlap_metric"].value = self.sp_info.overlap.overlap
      pandaBaker.dataStreams["Layer1_TM_overlap_metric"].value = self.tm_info.overlap.overlap
      pandaBaker.dataStreams["Layer1_SP_activation_frequency"].value = self.sp_info.activationFrequency.mean()
      pandaBaker.dataStreams["Layer1_TM_activation_frequency"].value = self.tm_info.activationFrequency.mean()
      pandaBaker.dataStreams["Layer1_SP_entropy"].value = self.sp_info.activationFrequency.entropy()
      pandaBaker.dataStreams["Layer1_TM_entropy"].value = self.tm_info.activationFrequency.entropy()

      pandaBaker.StoreIteration(self.iteration_ - 1)
      print("ITERATION: " + str(self.iteration_ - 1))
      # ------------------HTMpandaVis----------------------

    return (anomalyScore, raw)

  # with this method, the structure for visualization is defined
  def BuildPandaSystem(self, sp, tm, consumptionBits_size, dateBits_size):
    # we have two inputs connected to proximal synapses of Layer1
    pandaBaker.inputs["Value"] = cInput(consumptionBits_size)
    pandaBaker.inputs["TimeOfDay"] = cInput(dateBits_size)

    pandaBaker.layers["Layer1"] = cLayer(sp, tm)  # Layer1 has Spatial Pooler & Temporal Memory
    pandaBaker.layers["Layer1"].proximalInputs = [
      "Value",
      "TimeOfDay",
    ]
    pandaBaker.layers["Layer1"].distalInputs = ["Layer1"]

    # data for dash plots
    streams = ["rawAnomaly", "value", "numberOfWinnerCells", "numberOfPredictiveCells",
               "valueInput_sparsity", "dateInput_sparsity", "Layer1_SP_overlap_metric",
               "Layer1_TM_overlap_metric", "Layer1_SP_activation_frequency",
               "Layer1_TM_activation_frequency", "Layer1_SP_entropy", "Layer1_TM_entropy"]

    # create dicts for more comfortable code
    # could be also written like: pandaBaker.dataStreams["myStreamName"] = cDataStream()
    pandaBaker.dataStreams = dict((name, cDataStream()) for name in streams)

    pandaBaker.PrepareDatabase()
      similar = {"doc": document, "bits": current}
    else:
      if (distance(current, reference["bits"])
          < distance(similar["bits"], reference["bits"])):
        similar = {"doc": document, "bits": current}

    if not unsimilar:
      unsimilar = {"doc": document, "bits": current}
    else:
      if (distance(current, reference["bits"])
          > distance(unsimilar["bits"], reference["bits"])):
        unsimilar = {"doc": document, "bits": current}

report = Metrics([encoder.size], len(sdrs) + 1)
for sdr in sdrs:
  report.addData(sdr)

print("Statistics:")
print("\tEncoded %d Document inputs." % len(sdrs))
print("\tOutput: " + str(report))

print("Similarity:")
print("\tReference:\n\t\t" + str(reference["doc"]))
print("\tMOST Similar (Distance = "
      + str(distance(similar["bits"], reference["bits"])) + "):")
print("\t\t" + str(similar["doc"]))
print("\tLEAST Similar (Distance = "
      + str(distance(unsimilar["bits"], reference["bits"])) + "):")
print("\t\t" + str(unsimilar["doc"]))

# Plot the Receptive Field of each bit in the encoder.
def main(parameters=default_parameters, argv=None, verbose=True):
  if verbose:
    import pprint
    print("Parameters:")
    pprint.pprint(parameters, indent=4)
    print("")

  # Read the input file.
  records = []
  with open(_INPUT_FILE_PATH, "r") as fin:
    reader = csv.reader(fin)
    headers = next(reader)
    next(reader)
    next(reader)
    for record in reader:
      records.append(record)

  # Make the Encoders. These will convert input data into binary representations.
  dateEncoder = DateEncoder(timeOfDay=parameters["enc"]["time"]["timeOfDay"],
                            weekend=parameters["enc"]["time"]["weekend"])

  scalarEncoderParams = RDSE_Parameters()
  scalarEncoderParams.size = parameters["enc"]["value"]["size"]
  scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"]
  scalarEncoderParams.resolution = parameters["enc"]["value"]["resolution"]
  scalarEncoder = RDSE(scalarEncoderParams)
  encodingWidth = (dateEncoder.size + scalarEncoder.size)
  enc_info = Metrics([encodingWidth], 999999999)

  # Make the HTM. SpatialPooler & TemporalMemory & associated tools.
  spParams = parameters["sp"]
  sp = SpatialPooler(
    inputDimensions=(encodingWidth,),
    columnDimensions=(spParams["columnCount"],),
    potentialPct=spParams["potentialPct"],
    potentialRadius=encodingWidth,
    globalInhibition=True,
    localAreaDensity=spParams["localAreaDensity"],
    synPermInactiveDec=spParams["synPermInactiveDec"],
    synPermActiveInc=spParams["synPermActiveInc"],
    synPermConnected=spParams["synPermConnected"],
    boostStrength=spParams["boostStrength"],
    wrapAround=True
  )
  sp_info = Metrics(sp.getColumnDimensions(), 999999999)

  tmParams = parameters["tm"]
  tm = TemporalMemory(
    columnDimensions=(spParams["columnCount"],),
    cellsPerColumn=tmParams["cellsPerColumn"],
    activationThreshold=tmParams["activationThreshold"],
    initialPermanence=tmParams["initialPerm"],
    connectedPermanence=spParams["synPermConnected"],
    minThreshold=tmParams["minThreshold"],
    maxNewSynapseCount=tmParams["newSynapseCount"],
    permanenceIncrement=tmParams["permanenceInc"],
    permanenceDecrement=tmParams["permanenceDec"],
    predictedSegmentDecrement=0.0,
    maxSegmentsPerCell=tmParams["maxSegmentsPerCell"],
    maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"]
  )
  tm_info = Metrics([tm.numberOfCells()], 999999999)

  # setup likelihood, these settings are used in NAB
  anParams = parameters["anomaly"]["likelihood"]
  probationaryPeriod = int(math.floor(float(anParams["probationaryPct"]) * len(records)))
  learningPeriod = int(math.floor(probationaryPeriod / 2.0))
  anomaly_history = AnomalyLikelihood(learningPeriod=learningPeriod,
                                      estimationSamples=probationaryPeriod - learningPeriod,
                                      reestimationPeriod=anParams["reestimationPeriod"])

  predictor = Predictor(steps=[1, 5], alpha=parameters["predictor"]['sdrc_alpha'])
  predictor_resolution = 1

  # Iterate through every datum in the dataset, record the inputs & outputs.
  inputs = []
  anomaly = []
  anomalyProb = []
  predictions = {1: [], 5: []}
  for count, record in enumerate(records):

    # Convert date string into Python date object.
    dateString = datetime.datetime.strptime(record[0], "%m/%d/%y %H:%M")
    # Convert data value string into float.
    consumption = float(record[1])
    inputs.append(consumption)

    # Call the encoders to create bit representations for each value. These are SDR objects.
    dateBits = dateEncoder.encode(dateString)
    consumptionBits = scalarEncoder.encode(consumption)

    # Concatenate all these encodings into one large encoding for Spatial Pooling.
    encoding = SDR(encodingWidth).concatenate([consumptionBits, dateBits])
    enc_info.addData(encoding)

    # Create an SDR to represent active columns. This will be populated by the
    # compute method below. It must have the same dimensions as the Spatial Pooler.
    activeColumns = SDR(sp.getColumnDimensions())

    # Execute Spatial Pooling algorithm over input space.
    sp.compute(encoding, True, activeColumns)
    sp_info.addData(activeColumns)

    # Execute Temporal Memory algorithm over active mini-columns.
    tm.compute(activeColumns, learn=True)
    tm_info.addData(tm.getActiveCells().flatten())

    # Predict what will happen, and then train the predictor based on what just happened.
    pdf = predictor.infer(count, tm.getActiveCells())
    for n in (1, 5):
      if pdf[n]:
        predictions[n].append(np.argmax(pdf[n]) * predictor_resolution)
      else:
        predictions[n].append(float('nan'))
    predictor.learn(count, tm.getActiveCells(), int(consumption / predictor_resolution))

    anomalyLikelihood = anomaly_history.anomalyProbability(consumption, tm.anomaly)
    anomaly.append(tm.anomaly)
    anomalyProb.append(anomalyLikelihood)

  # Print information & statistics about the state of the HTM.
  print("Encoded Input", enc_info)
  print("")
  print("Spatial Pooler Mini-Columns", sp_info)
  print(str(sp))
  print("")
  print("Temporal Memory Cells", tm_info)
  print(str(tm))
  print("")

  # Shift the predictions so that they are aligned with the input they predict.
  for n_steps, pred_list in predictions.items():
    for x in range(n_steps):
      pred_list.insert(0, float('nan'))
      pred_list.pop()

  # Calculate the predictive accuracy, Root-Mean-Squared.
  accuracy = {1: 0, 5: 0}
  accuracy_samples = {1: 0, 5: 0}
  for idx, inp in enumerate(inputs):
    for n in predictions:  # For each [N]umber of time steps ahead which was predicted.
      val = predictions[n][idx]
      if not math.isnan(val):
        accuracy[n] += (inp - val) ** 2
        accuracy_samples[n] += 1
  for n in sorted(predictions):
    accuracy[n] = (accuracy[n] / accuracy_samples[n]) ** .5
    print("Predictive Error (RMS)", n, "steps ahead:", accuracy[n])

  # Show info about the anomaly (mean & std)
  print("Anomaly Mean", np.mean(anomaly))
  print("Anomaly Std ", np.std(anomaly))

  # Plot the Predictions and Anomalies.
  if verbose:
    try:
      import matplotlib.pyplot as plt
    except:
      print("WARNING: failed to import matplotlib, plots cannot be shown.")
      return -accuracy[5]

    plt.subplot(2, 1, 1)
    plt.title("Predictions")
    plt.xlabel("Time")
    plt.ylabel("Power Consumption")
    plt.plot(np.arange(len(inputs)), inputs, 'red',
             np.arange(len(inputs)), predictions[1], 'blue',
             np.arange(len(inputs)), predictions[5], 'green')
    plt.legend(labels=('Input', '1 Step Prediction, Shifted 1 step',
                       '5 Step Prediction, Shifted 5 steps'))

    plt.subplot(2, 1, 2)
    plt.title("Anomaly Score")
    plt.xlabel("Time")
    plt.ylabel("Power Consumption")
    inputs = np.array(inputs) / max(inputs)
    plt.plot(np.arange(len(inputs)), inputs, 'red',
             np.arange(len(inputs)), anomaly, 'blue')
    plt.legend(labels=('Input', 'Anomaly Score'))
    plt.show()

  return -accuracy[5]
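# Minimal entry point sketch (the original script likely ends with a standard
# guard like this, possibly with argument parsing): run the example above with
# the default parameters defined earlier in the file.
if __name__ == "__main__":
  main()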
def testStatistics(self):
  # 100 random simple English words to run mass encoding stats against
  testCorpus = [
    "find", "any", "new", "work", "part", "take", "get", "place", "made",
    "live", "where", "after", "back", "little", "only", "round", "man",
    "year", "came", "show", "every", "good", "me", "give", "our", "under",
    "name", "very", "through", "just", "form", "sentence", "great", "think",
    "say", "help", "low", "line", "differ", "turn", "cause", "much", "mean",
    "before", "move", "right", "boy", "old", "too", "same", "tell", "does",
    "set", "three", "want", "air", "well", "also", "play", "small", "end",
    "put", "home", "read", "hand", "port", "large", "spell", "add", "even",
    "land", "here", "must", "big", "high", "such", "follow", "act", "why",
    "ask", "men", "change", "went", "light", "kind", "off", "need", "house",
    "picture", "try", "us", "again", "animal", "point", "mother", "world",
    "near", "build", "self", "earth"]
  num_samples = 1000  # number of documents to run
  num_tokens = 10     # tokens per document

  # Case 1 = tokenSimilarity OFF
  params1 = SimHashDocumentEncoderParameters()
  params1.size = 400
  params1.sparsity = 0.33
  params1.tokenSimilarity = False
  encoder1 = SimHashDocumentEncoder(params1)

  # Case 2 = tokenSimilarity ON
  # (params2 aliases params1, but encoder1 was already constructed above)
  params2 = params1
  params2.tokenSimilarity = True
  encoder2 = SimHashDocumentEncoder(params2)

  sdrs1 = []
  sdrs2 = []
  for _ in range(num_samples):
    document = []
    for _ in range(num_tokens - 1):
      token = testCorpus[random.randint(0, len(testCorpus) - 1)]
      document.append(token)
    sdrs1.append(encoder1.encode(document))
    sdrs2.append(encoder2.encode(document))

  report1 = Metrics([encoder1.size], len(sdrs1) + 1)
  report2 = Metrics([encoder2.size], len(sdrs2) + 1)
  for sdr in sdrs1:
    report1.addData(sdr)
  for sdr in sdrs2:
    report2.addData(sdr)

  # Assertions for Case 1 = tokenSimilarity OFF
  assert(report1.activationFrequency.entropy() > 0.87)
  assert(report1.activationFrequency.min() > 0.01)
  assert(report1.activationFrequency.max() < 0.99)
  assert(report1.activationFrequency.mean() > params1.sparsity - 0.005)
  assert(report1.activationFrequency.mean() < params1.sparsity + 0.005)
  assert(report1.overlap.min() > 0.21)
  assert(report1.overlap.max() > 0.53)
  assert(report1.overlap.mean() > 0.38)
  assert(report1.sparsity.min() > params1.sparsity - 0.01)
  assert(report1.sparsity.max() < params1.sparsity + 0.01)
  assert(report1.sparsity.mean() > params1.sparsity - 0.005)
  assert(report1.sparsity.mean() < params1.sparsity + 0.005)

  # Assertions for Case 2 = tokenSimilarity ON
  assert(report2.activationFrequency.entropy() > 0.59)
  assert(report2.activationFrequency.min() >= 0)
  assert(report2.activationFrequency.max() <= 1)
  assert(report2.activationFrequency.mean() > params2.sparsity - 0.005)
  assert(report2.activationFrequency.mean() < params2.sparsity + 0.005)
  assert(report2.overlap.min() > 0.38)
  assert(report2.overlap.max() > 0.78)
  assert(report2.overlap.mean() > 0.61)
  assert(report2.sparsity.min() > params2.sparsity - 0.01)
  assert(report2.sparsity.max() < params2.sparsity + 0.01)
  assert(report2.sparsity.mean() > params2.sparsity - 0.005)
  assert(report2.sparsity.mean() < params2.sparsity + 0.005)
class HTMCoreDetector(object):
  def __init__(self, inputMin, inputMax, probationaryPeriod, *args, **kwargs):
    self.inputMin = inputMin
    self.inputMax = inputMax
    self.probationaryPeriod = probationaryPeriod

    ## API for controlling settings of htm.core HTM detector:

    # Set this to False if you want to get results based on raw scores
    # without using AnomalyLikelihood. This will give worse results, but
    # is useful for checking the efficacy of AnomalyLikelihood. You will need
    # to re-optimize the thresholds when running with this setting.
    self.useLikelihood = True
    self.verbose = False

    ## internal members
    # (listed here for easier understanding)
    # initialized in `initialize()`
    self.encTimestamp = None
    self.encValue = None
    self.sp = None
    self.tm = None
    self.anLike = None
    # optional debug info
    self.enc_info = None
    self.sp_info = None
    self.tm_info = None
    # internal helper variables:
    self.inputs_ = []
    self.iteration_ = 0

  def handleRecord(self, ts, val):
    """Returns a tuple (anomalyScore, rawScore).

    @param ts Timestamp
    @param val float

    @return tuple (anomalyScore, <any other fields specified in `getAdditionalHeaders()`>, ...)
    """
    # Send it to Numenta detector and get back the results
    return self.modelRun(ts, val)

  def initialize(self):
    # toggle parameters here
    # parameters = default_parameters
    parameters = parameters_numenta_comparable

    ## setup Enc, SP, TM, Likelihood
    # Make the Encoders. These will convert input data into binary representations.
    self.encTimestamp = DateEncoder(
      timeOfDay=parameters["enc"]["time"]["timeOfDay"],
      weekend=parameters["enc"]["time"]["weekend"],
      season=parameters["enc"]["time"]["season"],
      dayOfWeek=parameters["enc"]["time"]["dayOfWeek"])

    scalarEncoderParams = EncParameters()
    scalarEncoderParams.size = parameters["enc"]["value"]["size"]
    scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"]
    scalarEncoderParams.resolution = parameters["enc"]["value"]["resolution"]
    self.encValue = Encoder(scalarEncoderParams)

    encodingWidth = (self.encTimestamp.size + self.encValue.size)
    self.enc_info = Metrics([encodingWidth], 999999999)

    # Make the HTM. SpatialPooler & TemporalMemory & associated tools.
    # SpatialPooler
    spParams = parameters["sp"]
    self.sp = SpatialPooler(
      inputDimensions=(encodingWidth,),
      columnDimensions=(spParams["columnCount"],),
      potentialPct=spParams["potentialPct"],
      potentialRadius=spParams["potentialRadius"],
      globalInhibition=True,
      localAreaDensity=spParams["localAreaDensity"],
      stimulusThreshold=spParams["stimulusThreshold"],
      synPermInactiveDec=spParams["synPermInactiveDec"],
      synPermActiveInc=spParams["synPermActiveInc"],
      synPermConnected=spParams["synPermConnected"],
      boostStrength=spParams["boostStrength"],
      wrapAround=True)
    self.sp_info = Metrics(self.sp.getColumnDimensions(), 999999999)

    # TemporalMemory
    tmParams = parameters["tm"]
    self.tm = TemporalMemory(
      columnDimensions=(spParams["columnCount"],),
      cellsPerColumn=tmParams["cellsPerColumn"],
      activationThreshold=tmParams["activationThreshold"],
      initialPermanence=tmParams["initialPerm"],
      connectedPermanence=spParams["synPermConnected"],
      minThreshold=tmParams["minThreshold"],
      maxNewSynapseCount=tmParams["newSynapseCount"],
      permanenceIncrement=tmParams["permanenceInc"],
      permanenceDecrement=tmParams["permanenceDec"],
      predictedSegmentDecrement=0.0,
      maxSegmentsPerCell=tmParams["maxSegmentsPerCell"],
      maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"])
    self.tm_info = Metrics([self.tm.numberOfCells()], 999999999)

    # setup likelihood, these settings are used in NAB
    if self.useLikelihood:
      anParams = parameters["anomaly"]["likelihood"]
      learningPeriod = int(math.floor(self.probationaryPeriod / 2.0))
      self.anomalyLikelihood = AnomalyLikelihood(
        learningPeriod=learningPeriod,
        estimationSamples=self.probationaryPeriod - learningPeriod,
        reestimationPeriod=anParams["reestimationPeriod"])

    # Predictor
    # self.predictor = Predictor( steps=[1, 5], alpha=parameters["predictor"]['sdrc_alpha'] )
    # predictor_resolution = 1

  def modelRun(self, ts, val):
    """
    Run a single pass through the HTM model.
    @params ts - Timestamp
    @params val - float input value
    @return rawAnomalyScore computed for the `val` in this step
    """
    ## run data through our model pipeline: enc -> SP -> TM -> Anomaly
    self.inputs_.append(val)
    self.iteration_ += 1

    # 1. Encoding
    # Call the encoders to create bit representations for each value. These are SDR objects.
    dateBits = self.encTimestamp.encode(ts)
    valueBits = self.encValue.encode(float(val))
    # Concatenate all these encodings into one large encoding for Spatial Pooling.
    encoding = SDR(self.encTimestamp.size + self.encValue.size).concatenate([valueBits, dateBits])
    self.enc_info.addData(encoding)

    # 2. Spatial Pooler
    # Create an SDR to represent active columns. This will be populated by the
    # compute method below. It must have the same dimensions as the Spatial Pooler.
    activeColumns = SDR(self.sp.getColumnDimensions())
    # Execute Spatial Pooling algorithm over input space.
    self.sp.compute(encoding, True, activeColumns)
    self.sp_info.addData(activeColumns)

    # 3. Temporal Memory
    # Execute Temporal Memory algorithm over active mini-columns.
    self.tm.compute(activeColumns, learn=True)
    self.tm_info.addData(self.tm.getActiveCells().flatten())

    # 4.1 (optional) Predictor  # TODO optional
    # TODO optional: also return an error metric on predictions (RMSE, R2, ...)

    # 4.2 Anomaly
    # handle contextual (raw, likelihood) anomalies
    # -temporal (raw)
    raw = self.tm.anomaly
    temporalAnomaly = raw

    if self.useLikelihood:
      # Compute log(anomaly likelihood)
      like = self.anomalyLikelihood.anomalyProbability(val, raw, ts)
      logScore = self.anomalyLikelihood.computeLogLikelihood(like)
      temporalAnomaly = logScore
      # TODO optional: TM to provide anomaly {none, raw, likelihood}, compare correctness with the py anomaly_likelihood

    anomalyScore = temporalAnomaly  # this is the "main" anomaly, compared in NAB

    # 5. print stats
    if self.verbose and self.iteration_ % 1000 == 0:
      print(self.enc_info)
      print(self.sp_info)
      print(self.tm_info)

    return anomalyScore, raw
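# Illustrative usage sketch (not part of the original detector): the caller
# constructs the detector with the value range and probationary period, calls
# initialize() once, and then feeds (timestamp, value) pairs to handleRecord().
# Assumes `parameters_numenta_comparable` and the `Encoder`/`EncParameters`
# aliases are defined by the imports at the top of this file; the synthetic
# daily cycle below is a demo choice.
if __name__ == "__main__":
  import datetime

  detector = HTMCoreDetector(inputMin=0.0, inputMax=100.0, probationaryPeriod=150)
  detector.initialize()

  start = datetime.datetime(2020, 1, 1)
  for i in range(1000):
    ts = start + datetime.timedelta(hours=i)
    val = 50.0 + 25.0 * math.sin(i * 2.0 * math.pi / 24.0)  # synthetic daily cycle
    anomalyScore, rawScore = detector.handleRecord(ts, val)
    if i % 100 == 0:
      print("{}  value={:6.2f}  anomaly={:.3f}  raw={:.3f}".format(ts, val, anomalyScore, rawScore))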