def main(configuration):
    """Run the DenStream anomaly detector over every configured dataset/node pair.

    For each pair: load the node CSV, normalize it, warm the clusterer up on
    the first ``sampleSkip`` rows, then classify the remaining rows one by one.
    With the 'spatialDetection' criterion the per-node results are written to
    disk.  Returns the objects produced by the *last* processed pair.
    """
    exec_times = []

    dataset_cfg = configuration['dataset']
    for dataset in dataset_cfg['availableDataset']:
        print(dataset + '\n')

        # Ground-truth events for this dataset (path spelling kept as in the repo).
        truth = groundTruth('GrounTruth/' + dataset + '.txt', fileType='csv')

        for node in configuration['nodes']:
            print('Dataset {} - Node: {} loading ...'.format(dataset, node))

            csv_path = dataset_cfg['path'] + node + dataset + '.csv'
            frame = pd.read_csv(csv_path, low_memory=False).dropna() \
                .drop('Unnamed: 0', axis=1)
            print('Done.')

            # Timestamps are kept aside and re-attached on the spatial path.
            timestamps = frame['time'].astype('int')
            frame = frame.drop(['time'], axis=1)

            # Constant (zero-std) columns are dropped before normalization;
            # columns that normalize to NaN are discarded as well.
            frame = frame.loc[:, frame.std() != 0]
            normalized = normalize_matrix(frame).dropna(axis=1)

            skip = configuration['sampleSkip']
            warmup = normalized[0:skip]
            stream = normalized[skip:]

            # DenStream configured straight from the configuration file.
            params = configuration['denstreamParameters']
            detector = DenStream(lamb=params['lambda'],
                                 epsilon=params['epsilon'],
                                 beta=params['beta'],
                                 mu=params['mu'],
                                 startingBuffer=warmup,
                                 tp=params['tp'])
            detector.runInitialization()

            print('Running algorithm ...')
            started = time.time()
            # One boolean per streamed row: True when flagged anomalous.
            # NOTE(review): timestamps.iloc[i] starts at position 0 while
            # `stream` starts at `skip` — offset preserved from the original.
            flags = [
                detector.runOnNewSample(Sample(stream.iloc[i].values, timestamps.iloc[i]))
                for i in range(len(stream))
            ]
            elapsed = time.time() - started
            exec_times.append(elapsed)
            print('Done in {}'.format(elapsed))

            # Warm-up rows are labelled non-anomalous by construction.
            frame['result'] = [False] * skip + flags

            print("Number of anomalies in " + str(node) + " is: ", flags.count(True),
                  len(flags))

            if configuration['detectionCriterion'] == 'spatialDetection':
                frame['time'] = timestamps
                frame[['result', 'time']].to_csv('Data/ResultsSpatialDetection/' + configuration[
                    'featureModel'] + '/' + dataset + '_DENSTREAM_' + node + '.csv', sep=',')
    return detector, truth, frame, timestamps, normalized, stream
예제 #2
0

for node in config["nodes"]:
    node = 'spine2'
    print(node)
    for dataset in config['dataset']['availableDataset']:
        dataset = 'portflap_first'
        print(dataset)
        table = PrettyTable()
        f_name = str(Path(__file__).parent
                     ) + '/Data/DatasetByNodes/' + node + dataset + '.csv'
        print(f_name)
        if os.path.isfile(f_name):
            df = pd.read_csv(f_name, low_memory=False).drop('Unnamed: 0',
                                                            axis=1)
            truth = groundTruth('GrounTruth/' + dataset + '.txt',
                                fileType='csv')
        print("Files are loaded")

        metric_lst = config['featureList']

        if len(truth.df.index[truth.df.Node == node].tolist()) >= 1:
            node_idx = truth.df.index[truth.df.Node == node].tolist()
            anomalyTime = pd.DataFrame(dtype=bool)

            if len(node_idx) == 1:
                anomalyTime = (df.time.astype('int64') >=
                               truth.events[node_idx[0]]['startTime'] - 50) & (
                                   df.time.astype('int64') <=
                                   truth.events[node_idx[0]]['endTime'] + 50)
            if len(node_idx) == 2:
                anomalyTime = (
예제 #3
0
def main(configuration):
	"""Offline DenStream runner (legacy Python 2 variant).

	For every dataset/node pair: load the CSV, select the configured feature
	set, normalize, warm the clusterer up on the first ``sampleSkip`` rows,
	then classify the remaining samples one by one.  Results are either
	dumped per node (spatialDetection) or scored against the ground truth
	(timeDetection).
	"""

	# Per-(node+dataset) statistics, filled only on the timeDetection path.
	resultByNode = {}

	# Wall-clock duration of each per-node simulation loop.
	totalExecutionTime = []

	for dataset in configuration['dataset']['list']:

		"""Iterate on all the datasets chosen in the configuration list and read the ground truth file"""
		truth = groundTruth('GrounTruth/'+dataset+'.txt', fileType='csv')

		"""Iterate on all the nodes chosen in the configuration file"""
		for node in configuration['nodes']:

			"""Read node dataset"""
			# Trailing comma: Python 2 "print without newline".
			print 'Dataset {} - Node: {} loading ...'.format(dataset,node),
			df = pd.read_csv(configuration['dataset']['path']+node+dataset+'.csv', low_memory = False)\
							.dropna()\
							.drop('Unnamed: 0', axis=1)
			print 'Done.'

			# Timestamps kept aside; re-attached on the spatialDetection path.
			times = df['time'].astype('int')
			df = df.drop(['time'], axis=1)

			"""Select the chosen features in the configuration file"""
			"""By default the dataset contains all the features"""
			"""If ControlPlane is chosen: only the CP features are extracted from the dataset"""
			"""If DataPlane is chosen: the CP features are discarded, obtaining a dataset with only DataPlane"""
			"""If CompleteFeatures is chosen: pass"""
			if configuration['featureModel'] == 'ControlPlane':
				df = df[configuration['featureList']]
			elif configuration['featureModel'] == 'DataPlane':
				df = df.drop(configuration['featureList'], axis=1)
			elif configuration['featureModel'] == 'CompleteFeatures':
				pass
			else:
				sys.exit('Something wrong in configuration feature model')

			"""Dataset normalization"""	
			# Constant (zero-std) columns are dropped before normalizing;
			# columns that normalize to NaN are discarded as well.
			df = df.loc[:,df.std()!=0]
			dfNormalized = normalize_matrix(df).dropna(axis=1)

			bufferDF = dfNormalized[0:configuration['sampleSkip']]
			testDF = dfNormalized[configuration['sampleSkip']:]

			"""Anomaly DenStream initialization with the parameters in the configuration file"""
			aden = DenStream(lamb = configuration['denstreamParameters']['lambda'],\
							epsilon = configuration['denstreamParameters']['epsilon'],\
							beta = configuration['denstreamParameters']['beta'],\
							mu = configuration['denstreamParameters']['mu'],\
							startingBuffer = bufferDF,
							tp = configuration['denstreamParameters']['tp'])
			aden.runInitialization()

			"""Iterate on all the rows in the dataset and run .runOnNewSample() method of the algorithm"""
			"""The algorithm tries to merge the new sample to the existing clusters"""
			"""If the algorithm merges the sample to a core-mmc: the sample is considered Normal and returns False"""
			"""If the algorithm merges the sample to a outlier-mc or generates a new outlier-mc: the sample is considered Anomalous and returns True""" 
			print 'Running algorithm ...',
			startingSimulation = time.time()
			outputCurrentNode = []
			# NOTE(review): times.iloc[sampleNumber] indexes from position 0
			# while testDF starts at sampleSkip — timestamps may be shifted by
			# sampleSkip rows; confirm this offset is intended.
			for sampleNumber in range(len(testDF)):
				sample = testDF.iloc[sampleNumber]
				result = aden.runOnNewSample(Sample(sample.values, times.iloc[sampleNumber]))
				outputCurrentNode.append(result)
			### END Running ###
			endSimulation = time.time() - startingSimulation
			totalExecutionTime.append(endSimulation)
			print 'Done in {}'.format(endSimulation)

			# Warm-up rows are labelled non-anomalous by construction.
			df['result'] = [False] * configuration['sampleSkip'] + outputCurrentNode

			"""Depending on the detection criterion chosen in the configuration file the script produces:"""
			"""1- Results and statistics compared to grountruth if timedetection chosen"""
			"""2- Results for each node if spatialdetection chosen. To compare the results with the groundtruth there is the need to run spatialPerformance.py"""
			if configuration['detectionCriterion'] == 'spatialDetection':
				df['time'] =  times
				df[['result','time']].to_csv('Data/ResultsSpatialDetection/'+configuration['featureModel']+'/'+dataset+'_DENSTREAM_'+node+'.csv', sep=',')

			elif configuration['detectionCriterion'] == 'timeDetection':
				statistics = Statistics(node, truth)
				resultByNode[node+dataset] = statistics.getNodeResult(df, times, kMAX=5)
			else:
				sys.exit('Error detectionCriterion')

	"""	Print result on file if multicoreAnalysis ON. Used only for grid optimization. Very long task""" 
	if configuration['multicoreAnalysis']['ON'] == 'YES':
		# NOTE(review): this branch reads configuration["algorithmParameters"],
		# while the loop above reads 'denstreamParameters' — confirm which key
		# the configuration file actually defines; one of the two looks stale.
		# NOTE(review): 'statistics' is bound only on the timeDetection path;
		# with spatialDetection this branch raises NameError.
		path = "DataPlane/"
		with open("Results/"+path+str(configuration["algorithmParameters"]["lambda"])+"_"+str(configuration["algorithmParameters"]["beta"])+"_PRF.json", "w") as outputfile:
			json.dump(statistics.getPrecisionRecallFalseRate(resultByNode, kMAX=5, plot=False), outputfile, indent=4, sort_keys=True)

		resultdelay = statistics.getDelay(resultByNode, kMAX=5, plot=False)

		record = {}

		# resultdelay[0]: per-k delay curves; resultdelay[1]: per-hop series
		# keyed 'hop0', 'hop1', ... (shape presumed from usage — verify).
		for row in range(len(resultdelay[0])):
			record['k'+str(row+1)] = list(resultdelay[0][row])

		for row in range(len(resultdelay[1])):
			record['hop'+str(row)] = list(resultdelay[1]['hop'+str(row)])

		with open("Results/"+path+str(configuration["algorithmParameters"]["lambda"])+"_"+str(configuration["algorithmParameters"]["beta"])+"_delay.json", "w") as outputfile:
			json.dump(record, outputfile, indent=4, sort_keys=True)

		with open("Results/"+path+str(configuration["algorithmParameters"]["lambda"])+"_"+str(configuration["algorithmParameters"]["beta"])+"_execTime.json", "w") as outputfile:
			json.dump({'execTime':totalExecutionTime}, outputfile, indent=4, sort_keys=True)

		# NOTE(review): this branch has no return — callers get None here.
	else:
		"""Compute statistics if time detection chosen"""
		"""The script compares the results with the ground truth and computes precision/recall"""
		"""In the end, writes the results on "resultsKT.json" file, in the "Visualiation" folder""" 
		if configuration['detectionCriterion'] == 'timeDetection':
			resStatistics =  statistics.getPrecisionRecallFalseRate(resultByNode, kMAX=5, plot=True)
			resDelay =  statistics.getDelay(resultByNode, kMAX=5, plot=True)

			print resStatistics
			print resDelay

			resStatistics['Delay'] = resDelay[0][:,0].tolist()
			resStatistics['errDelay'] = resDelay[1]['hop0'].tolist()

			with open('Visualization/resultsKT_'+configuration['featureModel']+'.json', 'w') as outfile:
			    json.dump(resStatistics, outfile, indent=2)

			print 'Time: {}'.format(np.sum(totalExecutionTime))

		"""return all the variables"""
		# NOTE(review): these names are loop-carried — an empty dataset list
		# would raise NameError here.
		return aden, truth, df, times, dfNormalized