def setAnalysisParametersMcod(clusterer):
    k = util.getParameter('mcod_k')
    radius = util.getParameter('mcod_radius')
    windowSize = util.getParameter('mcod_windowsize')
    result = setAnalysisParameters_Mcod(clusterer, k, radius, windowSize)
    clusterer.prepareForUse()
    util.thisLogger.logInfo(result)

def __init__(self):
    # Setup Parameters
    util.params = None
    self.dnnModelPath = util.getFullFileName(util.getParameter('DnnModelPath'))
    self.numTrainingInstances = util.getParameter('NumActivationTrainingInstances')
    self.timestamp = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
    self.outputName = util.getSetupFileDescription() + '--' + self.timestamp
    self.outputDir = 'output/%s' % (self.outputName)
    util.makeDirectory(self.outputDir)
    util.isLoggingEnabled = util.getParameter('LoggingEnabled')
    util.logPath = self.outputDir + '/%s.log' % (self.outputName)
    util.logLevel = util.getParameter('LogLevel')
    util.thisLogger = util.Logger()
    util.storeSetupParamsInLog()

    # Setup memory environment
    self.processorType = util.getParameter('ProcessorType')
    self.startTime = datetime.datetime.now()
    self.streamList = None
    self.clustererList = None
    self.classMaxValues1 = None  # max value of raw activation data
    self.classMaxValues2 = None  # max value of reduced activation data
    self.flatActivations = None
    self.activationBatches = None
    self.batchFlatActivations = None
    self.reducedFlatActivations = None

def createClusterer(stream):
    # set the clusterer
    clusterer = gateway.entry_point.Moa_Clusterers_Outliers_MCOD_New()
    k = util.getParameter('mcod_k')
    radius = util.getParameter('mcod_radius')
    windowSize = util.getParameter('mcod_windowsize')
    setAnalysisParameters_Mcod(clusterer, k, radius, windowSize)
    clusterer.setModelContext(stream.getHeader())
    clusterer.prepareForUse()
    return clusterer

def getOutOfFilterData():
    x_train, y_train, x_test, y_test = getAllData()
    dataDiscrepancyClass = util.getParameter('DataDiscrepancyClass')
    classes = [dataDiscrepancyClass]
    x_train, y_train, x_test, y_test = filterData(x_train, y_train, x_test,
                                                  y_test, classes)
    return x_train, y_train, x_test, y_test

def getActivations(model, inData):
    actDict = {}

    # Activation Layer Extraction
    layers = util.getParameter("IncludedLayers")
    if layers == 'all':
        layers = np.arange(len(model.layers))
    else:
        layers = np.asarray(
            layers.replace('[', '').replace(']', '').split(',')).astype(int)

    util.thisLogger.logInfo('Applying activation extraction to layers: %s' % (layers))

    for layerNum in layers:
        # all values from one layer
        getLayerOutput = K.function([model.layers[0].input],
                                    [model.layers[layerNum].output])
        layer_output = getLayerOutput([inData])
        actDict["activation" + str(layerNum)] = layer_output[0]

    return actDict

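# Hypothetical usage sketch for getActivations above: extract activations for a
# batch of inputs and log the shape captured for each included layer. `model`
# and `x_batch` are assumed to be a compiled Keras model and an input batch
# matching its expected shape (not names defined in this codebase).
def _example_inspect_activations(model, x_batch):
    actDict = getActivations(model, x_batch)
    for name, values in actDict.items():
        util.thisLogger.logInfo('%s: %s' % (name, values.shape))
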
def getModelFileName():
    datasetName = util.getParameter('DatasetName')
    classes = util.getParameter('DataClasses')
    classes = np.asarray(
        classes.replace('[', '').replace(']', '').split(',')).astype(int)
    classesStr = '%sclasses' % (len(classes))
    for c in classes:
        classesStr += str(c)
    numActivationTrainingInstances = util.getParameter(
        'NumActivationTrainingInstances')
    modelFileName = '%s/models/reduce/%s_%s_single_undercomp_%s.h5' % (
        os.getcwd(), datasetName, classesStr, str(numActivationTrainingInstances))
    util.thisLogger.logInfo('Reduction model filename: %s' % (modelFileName))
    return modelFileName

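# Illustrative example (hypothetical parameter values): with DatasetName 'cifar10',
# DataClasses '[0,1]' and NumActivationTrainingInstances 1000, the filename built
# above would end in 'cifar10_2classes01_single_undercomp_1000.h5'.
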
def getInstanceParameters():
    dataDiscrepancyFrequency = util.getParameter('DataDiscrepancyFrequency')  # i.e. 1in1
    splitData = dataDiscrepancyFrequency.split('in')
    numDiscrepancy = int(splitData[0].strip())     # first number is the number of discrepancies
    numNonDiscrepancy = int(splitData[1].strip())  # second number is the number of non-discrepancies
    return numDiscrepancy, numNonDiscrepancy

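# A minimal sketch (illustrative only, hypothetical parameter value) of how
# getInstanceParameters interprets 'DataDiscrepancyFrequency': a value of '3in10'
# means 3 discrepancy instances for every 10 non-discrepancy instances.
def _example_parse_discrepancy_frequency():
    frequency = '3in10'
    splitData = frequency.split('in')
    return int(splitData[0].strip()), int(splitData[1].strip())  # (3, 10)
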
def startDataInputStream(streamList, clustererList, reductionModel, dnnModel,
                         x_test, maxClassValues1, maxClassValues2, dataDir,
                         filePrefix):
    numUnseenInstances = util.getParameter('NumUnseenInstances')
    util.thisLogger.logInfo("---------- start of data input stream ----------")
    unseenInstances, unseenResults = startDataInputStream_Time(
        streamList, clustererList, reductionModel, dnnModel, x_test,
        numUnseenInstances, maxClassValues1, maxClassValues2, dataDir,
        filePrefix)
    util.thisLogger.logInfo("----------- end of data input stream ----------")
    return unseenInstances, unseenResults

def train_undercompleteAutoencoder(flatActivations, batchFlatActivations):
    # trains an undercomplete autoencoder on the activations from the training data
    useBatches = True
    data = None
    input_size = None

    processorType = util.getParameter('ProcessorType')
    if processorType == "GPU":
        # input_size is needed by design_autoencoder whether or not batches are used
        input_size = len(flatActivations[0])
        if useBatches == False:
            data = np.array(flatActivations)
    else:
        # activations are a list/numpy array
        data = np.array(flatActivations)
        input_size = flatActivations[0].size()[0]

    autoencoder = design_autoencoder(input_size)
    epochs = 50
    autoencoder_trained = None

    if useBatches:
        # training in batches
        autoencoder = trainInBatches(autoencoder, batchFlatActivations, epochs)
    else:
        act_train, act_valid = train_test_split(data, test_size=0.33, shuffle=True)
        autoencoder_trained = autoencoder.fit(act_train, act_train,
                                              batch_size=128,
                                              epochs=epochs,
                                              verbose=1,
                                              validation_data=(act_valid, act_valid))

        # plot the training and validation loss
        loss = autoencoder_trained.history['loss']
        val_loss = autoencoder_trained.history['val_loss']
        epochRange = range(epochs)
        plt.figure()
        plt.plot(epochRange, loss, 'bo', label='Training loss')
        plt.plot(epochRange, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()
        plt.show()

    return autoencoder

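# design_autoencoder is defined elsewhere in this codebase. The sketch below is
# a hypothetical illustration (layer sizes and activations are assumptions, not
# the project's actual architecture) of what a single-bottleneck undercomplete
# autoencoder of this kind typically looks like; its layers[1] output is the
# bottleneck, which is consistent with reduce_autoenc taking
# reductionModel.layers[1].output as the encoder output.
def _example_design_autoencoder(input_size, bottleneck_size=128):
    from tensorflow.keras.layers import Input, Dense
    from tensorflow.keras.models import Model
    inputs = Input(shape=(input_size,))
    encoded = Dense(bottleneck_size, activation='relu')(inputs)  # undercomplete bottleneck
    decoded = Dense(input_size, activation='sigmoid')(encoded)   # reconstruct the input
    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder
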
def getFilteredData():
    x_train, y_train, x_test, y_test = getAllData()

    # Only train and test on the classes specified in the setup file
    classes = util.getParameter('DataClasses')
    classes = np.asarray(
        classes.replace('[', '').replace(']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))

    x_train, y_train, x_test, y_test = filterData(x_train, y_train, x_test,
                                                  y_test, classes)
    return x_train, y_train, x_test, y_test

def setPredictions(dnnModel):
    global unseenDataList
    instances = [x.instance for x in unseenDataList]
    instances = np.reshape(instances, (len(unseenDataList), 32, 32, 3))
    predictions = np.argmax(dnnModel.predict(instances), axis=1)
    instances = []  # reset instances to save memory

    # The DNN is trained to output 0 or 1 only.
    # Get the original classes it was trained on and transform the outputs.
    classes = util.getParameter('DataClasses')
    classes = np.asarray(
        classes.replace('[', '').replace(']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))
    for count, c in enumerate(classes):
        predictions = np.where(predictions == count, c, predictions)

    for index in range(len(predictions)):
        unseenDataList[index].predictedResult = predictions[index]

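# A minimal sketch (illustrative only, hypothetical class values) of the
# index-to-class remapping used above: if the DNN emits class indices 0 and 1
# and the setup file lists DataClasses as [3, 8], the indices are mapped back
# to the original labels.
def _example_remap_predictions():
    predictions = np.array([0, 1, 1, 0])
    for count, c in enumerate([3, 8]):
        predictions = np.where(predictions == count, c, predictions)
    return predictions  # array([3, 8, 8, 3])
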
def reduce_autoenc(flatActivations, batchFlatActivations):
    # reduces the activations for an unseen instance and returns the reduced activations
    global reductionModel
    encodedOutput = None

    # Take only the first and middle layers of the model (the encoder half)
    encoder = Model(inputs=reductionModel.input,
                    outputs=reductionModel.layers[1].output)

    processorType = util.getParameter('ProcessorType')
    if processorType == "GPU":
        encodedOutput = encoder.predict(flatActivations, steps=1)
    else:
        encodedOutput = encoder.predict(np.array(flatActivations))

    # change to python lists
    encodedOutput = encodedOutput.tolist()

    util.thisLogger.logInfo("Activations reduced from %s to %s elements" %
                            (len(flatActivations[0]), len(encodedOutput[0])))
    return encodedOutput

def processDataStream(self):
    # start the thread to process the streams so that new instances get clustered
    thread1 = threading.Thread(target=analyse.processStreamInstances,
                               args=(self.streamList, self.clustererList,
                                     self.numTrainingInstances, self.outputDir,
                                     self.outputName, False, True),
                               daemon=True)
    thread1.start()

    unseenInstancesObjList = datastream.startDataInputStream(
        self.streamList, self.clustererList, reduce.reductionModel,
        self.dnnModel, self.x_test, self.classMaxValues1, self.classMaxValues2,
        self.outputDir, self.outputName)

    # reshape into original array types
    unseenInstances = [x.instance for x in unseenInstancesObjList[0]]
    unseenResults = [x.correctResult for x in unseenInstancesObjList[0]]

    dataDiscrepancyClass = util.getParameter("DataDiscrepancyClass")

    # append the correct results to the unseen instances as an extra column
    unseenInstances = np.append(unseenInstances, unseenResults, axis=1)

    classes = np.unique(self.y_train)
    for dataClass in classes:
        # Filter unseen instances to only include this class (CE) and the data discrepancy class
        filteredInstances = list(
            filter(
                lambda x: (x[len(unseenInstances[0]) - 1] == dataClass or
                           x[len(unseenInstances[0]) - 1] == dataDiscrepancyClass),
                unseenInstances))

        trainingActivations = util.readFromCsv(
            '%s/%s_trainingactivations_%s.csv' %
            (self.outputDir, self.outputName, dataClass))
        labels = np.arange(len(trainingActivations[0]))
        labels = np.append(labels, len(labels))
        classValues = np.full((trainingActivations.shape[0], 1),
                              'Train_' + str(dataClass))
        trainingActivations = np.append(trainingActivations, classValues,
                                        axis=1)  # axis=1 means add columns
        trainingActivations = np.concatenate(
            (trainingActivations, filteredInstances), axis=0)  # axis=0 means add rows
        trainingActivations = np.concatenate(
            ([labels], trainingActivations), axis=0)  # axis=0 means add rows

    analyse.stopProcessing()
    thread1.join()

    # capture any unprocessed instances
    analyse.processStreamInstances(self.streamList, self.clustererList,
                                   self.numTrainingInstances, self.outputDir,
                                   self.outputName, True, True)
    util.thisLogger.logInfo('End of instance processing')

    # get outlier results and store in csv
    if analyse.results is not None:
        util.createResults(unseenInstancesObjList, analyse.results,
                           self.outputDir, self.outputName)

    util.killMoaGateway()

    endTime = datetime.datetime.now()
    util.thisLogger.logInfo('Total run time: ' + str(endTime - self.startTime))
    util.thisLogger.closeLog()

def processStreamInstance(stream, clusterer, i, numInstances, dataDir, prefix,
                          saveOutlierResults=False):
    global results

    if saveOutlierResults == True:
        if results is None:
            results = []
            results.append([
                'ID',
                'Instance = class %s' % (util.getParameter('DataDiscrepancyClass')),
                'Class', 'Outlier'
            ])

    trainFilename = '%s/%s_trainingactivations_%s.csv' % (dataDir, prefix, i)
    numSamples = 1
    try:
        while stream.hasMoreInstances():
            newInstEx = stream.nextIdInstance()
            if newInstEx is not None:
                instId = newInstEx.getIdAsString()
                newInst = newInstEx.getInstanceExample().getData()

                if saveOutlierResults == True:
                    # determine if the instance is an outlier
                    outlierResult = gateway.entry_point.Moa_Clusterers_Outliers_MCOD_addAndAnalyse(
                        clusterer, newInst, trainFilename, numCpus)

                    if numInstances == -1:
                        if outlierResult[0] == 'DATA,OUTLIER,OUTLIER':
                            util.thisLogger.logInfoColour(
                                "[%s] Activation data for stream %s, instance %s: %s"
                                % (instId, i, numSamples, outlierResult), 'red')
                        elif outlierResult[0] == 'DATA,OUTLIER,NOT_OUTLIER':
                            util.thisLogger.logInfoColour(
                                "[%s] Activation data for stream %s, instance %s: %s"
                                % (instId, i, numSamples, outlierResult), 'green')
                        else:
                            util.thisLogger.logInfoColour(
                                "[%s] Activation data for stream %s, instance %s: %s"
                                % (instId, i, numSamples, outlierResult), 'magenta')

                    # at this point we don't know if the original instance was an ND
                    # instance or CE as this is on a different thread.
                    # Mark it as ND for now, and update the relevant entries with CE
                    # when creating the results.
                    result = [
                        instId, 'ND', i,
                        outlierResult[0].replace('DATA,OUTLIER,', '')
                    ]
                    results.append(result)
                else:
                    # add instance as a training instance and do not do any
                    # outlier analysis on it
                    gateway.entry_point.Moa_Clusterers_Outliers_MCOD_processNewInstanceImplTrain(
                        clusterer, newInst)

                numSamples += 1
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        util.thisLogger.logInfo('problem reading stream: %s, %s, %s, %s, %s' %
                                (e, exc_type, exc_obj, exc_tb, exc_tb.tb_lineno))

    if gateway is not None:
        gateway.entry_point.StopCreatingTrainedClusterers()

def getActivations(x_train, numActivationTrainingInstances, model, dnnModel, y_train):
    util.thisLogger.logInfo(
        "------ start of activation data extraction for training data -------")
    startTime = datetime.datetime.now()

    # Only get activations from the instances that are correctly classified
    y_predict = np.argmax(dnnModel.predict(x_train), axis=1)

    # The DNN is trained to output 0 or 1 only.
    # Get the original classes it was trained on and transform the outputs.
    classes = util.getParameter('DataClasses')
    classes = np.asarray(
        classes.replace('[', '').replace(']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))
    for count, c in enumerate(classes):
        y_predict = np.where(y_predict == count, c, y_predict)

    incorrectPredictIndexes = []
    for i in range(len(y_predict)):
        if y_predict[i] != y_train[i]:
            incorrectPredictIndexes.append(i)

    x_train = np.delete(x_train, incorrectPredictIndexes, axis=0)
    y_train = np.delete(y_train, incorrectPredictIndexes, axis=0)
    y_predict = np.delete(y_predict, incorrectPredictIndexes, axis=0)

    # train in batches
    activationTrainingBatchSize = util.getParameter('ActivationTrainingBatchSize')
    if numActivationTrainingInstances == -1:
        numActivationTrainingInstances = len(x_train)

    xData = x_train[:numActivationTrainingInstances, ]
    batchData = list(util.chunks(xData, activationTrainingBatchSize))
    activationData = []
    numBatches = len(batchData)
    batchActivationData = [[] for i in range(numBatches)]

    for batchIndex in range(numBatches):
        batch = batchData[batchIndex]
        util.thisLogger.logInfo("Training batch " + str(batchIndex + 1) + " of " +
                                str(numBatches) + " (" + str(len(batch)) +
                                " instances)")

        # Get activations and set up streams for the training data
        util.thisLogger.logInfo(
            str(len(batch)) + " instances selected from training data")
        activations, numLayers = extract.getActivationData(model, batch)
        batchActivationData[batchIndex].append(activations)
        activationData.append(activations)
        util.thisLogger.logInfo(
            "Filter Layers: DNN has %s activation layers, getting activation data for %s instances."
            % (numLayers, len(batch)))

    endTime = datetime.datetime.now()
    util.thisLogger.logInfo('Total training time: ' + str(endTime - startTime))
    util.thisLogger.logInfo(
        "------- end of activation data extraction for training data --------")
    util.thisLogger.logInfo("")
    return numLayers, batchData, activationData, batchActivationData
