def test_duplicate_instances():
    instances = ((1, 1, 1, 1), (2, 2, 2, 2), (3, 3, 3, 3), (2, 1, 1, 1),
                 (1, 2, 1, 1), (2, 2, 2, 2), (2, 2, 2, 2), (1, 1, 1, 1),
                 (1, 1, 1, 1), (13, 13, 13, 13))
    lof.outliers(1, instances)
def paramSweep(folderName, paramRange):
    """
    Calculate the Local Outlier Factor (LOF) for all files in the given
    folder, using the mean values of the 12 MFCC features, and sweep through
    different values of the minPts parameter.

    @param folderName: Name of a folder inside the "sound" folder, i.e. if
        you want to process the folder ./sound/car, pass "car" as folderName
    @param paramRange: Values for the minPts parameter, as a list
    """
    fileFeatureDict = extractFeatures(folderName)
    if fileFeatureDict is not None:
        # Build a list of the mean feature values:
        featureList = [tuple(entry) for entry in fileFeatureDict.values()]
        fileNames = list(fileFeatureDict.keys())
        for minPts in paramRange:
            print("------ minPts = " + str(minPts) + " ---------")
            lof = outliers(minPts, featureList)
            for outlier in lof:
                print("File " + str(fileNames[outlier["index"]]) +
                      " has LOF of " + str(outlier["lof"]))
            print("Total number of outliers: " + str(len(lof)))
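# For reference, a minimal call of the sweep might look like this -- a sketch
# only: the folder name and range values are illustrative, and ./sound/car
# must already exist under the sound/ directory.
paramSweep("car", list(range(5, 25, 5)))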
def Anomaly_Detect(request):
    time = request.POST.get('time', None)
    with open('static/day/' + time + '/sum.json', 'r') as rf:
        lines = json.load(rf)
    all_nums = []
    for line in lines:
        all_nums.append(line[0])
    # Group the series into consecutive 4-tuples, always covering the tail
    # with a window over the last four values.
    instances = []
    for i in range(len(all_nums)):
        if i != 0 and i % 4 == 0:
            instances.append((all_nums[i-4], all_nums[i-3],
                              all_nums[i-2], all_nums[i-1]))
    M = -1
    instances.append((all_nums[-4], all_nums[-3], all_nums[-2], all_nums[-1]))
    for tu in instances:
        M = max(M, max(tu))
    exceptions = outliers(5, instances)
    error_index = ''  # indices of anomalous points, separated by spaces
    for outlier in exceptions:
        # each outlier dict carries an LOF value that decides whether a
        # point counts as an outlier
        if outlier['lof'] > 1.23:
            error_index = error_index + ' ' + str(outlier['index'])
    return JsonResponse({
        'count_max': M,
        'index': error_index
    })
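# The 4-wide grouping above can also be written as a single helper. This is a
# sketch with a hypothetical name that covers the tail the same way the view
# does (it assumes at least four values, as the original code does):
def group_by_four(nums):
    # consecutive non-overlapping 4-tuples, plus a final window over the
    # last four values when the length is not a multiple of four
    windows = [tuple(nums[i-4:i]) for i in range(4, len(nums) + 1, 4)]
    if len(nums) % 4:
        windows.append(tuple(nums[-4:]))
    return windows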
def plotPoints(coords):
    LOF = lof.outliers(5, coords)
    '''
    for coord in coords:
        if lof.local_outlier_factor(5, coord) > 1:
            portland.plot(float(coord[0]), float(coord[1]), 'ro',
                          markersize=7, latlon=True)
        else:
            portland.plot(float(coord[0]), float(coord[1]), 'bo',
                          markersize=7, latlon=True)
    '''
    print(LOF)
def test_normalization_problems(k, user, instances, user_review, candidate=None, count=None):
    user_outer = {}  # collect the user's outlier report before saving
    temp_list = []
    outers = lof.outliers(k, instances, candidate)
    user_outer["user"] = user
    for i, outer in enumerate(outers):
        index = outer["index"]
        temp_list.append(dict(outer, **user_review[index]))
    user_outer["outer"] = temp_list
    user_outer["count"] = count
    saveFile(user_outer)
    return outers
def removeOutliers(folderName, minPts=10):
    """
    Move files that were detected as outliers to the outliers/ folder.

    @param folderName: Name of a folder inside the "sound" folder, i.e. if
        you want to process the folder ./sound/car, pass "car" as folderName
    @param minPts: Parameter for the LOF algorithm: number of nearest
        neighbors used in defining the local neighborhood of the object
        (see the Breunig paper for details). Default value is 10.
    """
    fileFeatureDict = extractFeatures(folderName)
    if fileFeatureDict is not None:
        # Build a list of the mean feature values:
        featureList = [tuple(entry) for entry in fileFeatureDict.values()]
        # Calculate local outlier factors for each file:
        lof = outliers(minPts, featureList)
        dir = str(os.getcwd()) + "/sound/" + folderName + "/"
        outlierDir = os.getcwd() + "/outliers/" + str(folderName)
        if not os.path.exists(outlierDir):
            os.makedirs(outlierDir)
        print(str(len(lof)) + " outliers were found for the class " +
              str(folderName) + " and will be moved to the outliers folder")
        successfullyMoved = 0
        fileNames = list(fileFeatureDict.keys())
        for outlier in lof:
            fileDir = dir + str(fileNames[outlier["index"]])
            command = str("mv '" + str(fileDir) + "' '" + str(outlierDir) + "/'")
            p = subprocess.Popen(command, shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()
            if not err:
                successfullyMoved = successfullyMoved + 1
        if len(lof) == successfullyMoved:
            print("All outliers were successfully removed for the class " + str(folderName))
            return True
        else:
            print("Problems occurred when removing outliers for the class " + str(folderName))
            return False
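# Shelling out to mv is fragile with unusual file names and shell quoting. A
# portable sketch of the same move step using shutil (hypothetical helper
# name, same success/failure contract as the loop above):
import shutil

def moveOutlierFile(srcPath, outlierDir):
    # Move a single detected outlier into the outliers folder;
    # returns True on success, False otherwise.
    try:
        shutil.move(srcPath, outlierDir)
        return True
    except (shutil.Error, OSError):
        return False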
def test_outliers():
    lof.outliers(1, instances)
def test_normalization_problems():
    # see issue https://github.com/damjankuznar/pylof/issues/7
    instances = [(1., 2., 3.), (2., 3., 4.), (1., 2., 4.), (1., 2., 1.)]
    l = lof.outliers(1, instances)
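# The test name points at pylof's instance normalization. What follows is NOT
# pylof's actual code, just a guarded sketch of min-max normalization that
# leaves zero-range attributes at 0.0 instead of dividing by zero:
def minmax_normalize(instances):
    mins = [min(col) for col in zip(*instances)]
    maxs = [max(col) for col in zip(*instances)]
    return [tuple((v - lo) / (hi - lo) if hi > lo else 0.0
                  for v, lo, hi in zip(inst, mins, maxs))
            for inst in instances]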
import time

import numpy as np
from matplotlib import pyplot as plt

import lof

arr = np.genfromtxt("tests/dataset.csv", skip_header=1,
                    dtype=np.float64, delimiter=",")
timings = []
for num_instances in [20, 50, 100, 200, 300, 400, 500,
                      600, 700, 800, 900, 1000]:
    start = time.time()
    l = lof.outliers(10, arr[:num_instances])
    timings.append((num_instances, time.time() - start))
timings = np.array(timings)
print(timings.tolist())
plt.plot(timings[:, 0], timings[:, 1])
plt.show()
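# time.time() has coarse resolution on some platforms; for benchmarking,
# time.perf_counter (Python 3) is the safer choice. The loop body above could
# read, for example:
#
#     start = time.perf_counter()
#     l = lof.outliers(10, arr[:num_instances])
#     timings.append((num_instances, time.perf_counter() - start))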
def test_outliers(instances):
    t = lof.outliers(2, instances)
    for outlier in t:
        print(outlier["lof"], outlier["instance"])
    return t
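# If the lof variant in use does not guarantee an ordering of the returned
# list, sorting by LOF makes "top N" slicing explicit, e.g.:
#
#     top5 = sorted(t, key=lambda o: o["lof"], reverse=True)[:5]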
import csv

import numpy as np

import lof

if __name__ == "__main__":
    # Load the data into a tuple of tuples
    data = np.loadtxt("click-stream event.csv", delimiter=',')
    dataTuple = tuple(map(tuple, data))

    # Print the top 5 outliers using Manhattan distance
    print("top 5 outliers which use Manhattan distance")
    print(lof.outliersM(2, dataTuple)[:5])

    # Print the top 5 outliers using Euclidean distance
    print("top 5 outliers which use Euclidean distance")
    print(lof.outliers(3, dataTuple)[:5])
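# outliersM appears to be a fork-local variant of outliers that swaps the
# Euclidean metric for the L1 (Manhattan) metric. For reference, the
# Manhattan distance between two equal-length numeric tuples is simply:
def manhattan(x, y):
    # sum of absolute coordinate differences
    return sum(abs(a - b) for a, b in zip(x, y))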
##  (-5.1400897823762239, -1.3359248994019064),
##  (5.2586932439960243, 0.032431285797532586),
##  (6.3610915734502838, -0.99059648246991894),
##  (-0.31086913190231447, -2.8352818694180644),
##  (1.2288582719783967, -1.1362795178325829),
##  (-0.17986204466346614, -0.32813130288006365),
##  (2.2532002509929216, -0.5142311840491649),
##  (-0.75397166138399296, 2.2465141276038754),
##  (1.9382517648161239, -1.7276112460593251),
##  (1.6809250808549676, -2.3433636210337503),
##  (0.68466572523884783, 1.4374914487477481),
##  (2.0032364431791514, -2.9191062023123635),
##  (-1.7565895138024741, 0.96995712544043267),
##  (3.3809644295064505, 6.7497121359292684),
##  (-4.2764152718650896, 5.6551328734397766),
##  (-3.6347215445083019, -0.85149861984875741),
##  (-5.6249411288060385, -3.9251965527768755),
##  (4.6033708001912093, 1.3375110154658127),
##  (-0.685421751407983, -0.73115552984211407),
##  (-2.3744241805625044, 1.3443896265777866)]

#print(len(instances))
results = outliers(5, instances)
for outlier in results:
    to_write = (str(outlier["lof"]) + "," + str(outlier["instance"]) + "," +
                str(outlier["index"]) + "\n")
    out.write(to_write)
out.close()
print("outlier detection done")
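# The hand-rolled comma concatenation above does not quote the instance
# tuple, so the output is not strictly valid CSV. The write loop could
# instead use the csv module (assuming `out` was opened with newline=""):
#
#     import csv
#     writer = csv.writer(out)
#     for outlier in results:
#         writer.writerow([outlier["lof"], outlier["instance"], outlier["index"]])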
k_list = lof.initialize_k_list(data)
benchmark = 1  # the gap between k and RC's size
l = lof.LOF(data, k_list)
normalized_data_array = np.array(l.instances)
(
    data_position,
    data_position_min,
    data_position_max,
) = mymds.get_position(normalized_data_array, 2)
#print(data_position)
print("start to compute RC")
rare_centers = lof.outliers(data, k_list)
#print(rare_centers)
center = rare_centers[0]
center_index = center["index"]
center_kinf = center["k_inf"]
# center point and its k-neighbours
neighbours = lof.get_neighbours(center_kinf, data[center_index], data)
# put together as RC
category = copy.deepcopy(neighbours)
category.append(data[center_index])
#print(category)
category_mean = ca.category_mean_relative(category, feature_mean)
print(category_mean)
category_index = []
             (-0.75397166138399296, 2.2465141276038754),
             (1.9382517648161239, -1.7276112460593251),
             (1.6809250808549676, -2.3433636210337503),
             (0.68466572523884783, 1.4374914487477481),
             (2.0032364431791514, -2.9191062023123635),
             (-1.7565895138024741, 0.96995712544043267),
             (3.3809644295064505, 6.7497121359292684),
             (-4.2764152718650896, 5.6551328734397766),
             (-3.6347215445083019, -0.85149861984875741),
             (-5.6249411288060385, -3.9251965527768755),
             (4.6033708001912093, 1.3375110154658127),
             (-0.685421751407983, -0.73115552984211407),
             (-2.3744241805625044, 1.3443896265777866)]

from lof import outliers

lof = outliers(7, instances)
for outlier in lof:
    print(outlier["lof"], outlier["instance"])

from matplotlib import pyplot as p

x, y = zip(*instances)
p.scatter(x, y, 20, color="#0000FF")
for outlier in lof:
    value = outlier["lof"]
    instance = outlier["instance"]
    # LOF close to 1 means an inlier; points above 1 are marked in red
    color = "#FF0000" if value > 1 else "#00FF00"
    p.scatter(instance[0], instance[1], color=color)
p.show()
# (114.48442, 36.57594),
# (114.48442, 36.57522),
# (114.48436, 36.57478),
# (114.48419, 36.57419),
# (114.48414, 36.57389),
# (114.48414, 36.57297),
# (114.48408, 36.57278),
# (114.484, 36.57239),
# (114.48394, 36.57183),
# (114.48375, 36.57106)
# ]
instances = ReadFromHive.getInstance()
print(instances)

from lof import outliers

lof = outliers(5, instances)
for outlier in lof:
    print(outlier["lof"], outlier["instance"])

from matplotlib import pyplot as p

x, y = zip(*instances)
p.scatter(x, y, 20, color="#0000FF")
for outlier in lof:
    value = outlier["lof"]
    instance = outlier["instance"]
    # this snippet uses a higher LOF cut-off of 6 for flagging points in red
    color = "#FF0000" if value > 6 else "#00FF00"
    p.scatter(instance[0], instance[1], color=color)
p.show()
def main():
    # actual provenance data of flow files
    flowFileData = []
    # list of features used in the model
    modelFeatures = ["eventType", "componentId", "entitySize", "durationMillis"]
    dedupeFeatures = ['eventId', "eventType", "componentId", "entitySize",
                      "durationMillis", 'componentType', 'updatedAttributes']
    # "eventType", "componentId", "entitySize", "durationMillis"
    # features that need to be cast to a numeric type
    intFeatures = ["entitySize", "durationMillis"]
    fileDirectory = "/Users/wsong/Desktop/nifi/provenance-data/random-50000delay-mod-1000/*"
    saveFigureDirectory = "/Users/wsong/Desktop/Flow Provenance Graphs/Working with CSV/"
    flowName = "Random Time Delay"

    flowFileData = loadProvenanceData(fileDirectory, 500000)
    removeProvenanceReporterContamination(flowFileData)
    print("list size after contamination removed", len(flowFileData))
    cleanFeatures(flowFileData, dedupeFeatures)

    """# populate random times so not all identical points
    for event in flowFileData:
        event["durationMillis"] = random.uniform(0, 1)"""

    # obtain anomaly count and anomaly locations
    groundTruth = findGroundTruth(flowFileData)
    anomalyIndexList = []
    count = 0
    for num in list(enumerate(groundTruth)):
        if num[1][1] == 1:
            anomalyIndexList.append(num[0])
            count += 1
    print("number of anomalies", count)
    print("number of events:", len(flowFileData))
    print("anomaly indices", anomalyIndexList)

    # populate anomalous times
    for index in anomalyIndexList:
        flowFileData[index]["durationMillis"] = random.uniform(50, 100)
        print(flowFileData[index]["durationMillis"])

    # [dict(t) for t in set([tuple(sorted(d.items())) for d in flowFileData])]
    print("removing duplicates")
    # the solution below can't even finish
    # [i for n, i in enumerate(flowFileData) if i not in flowFileData[n + 1:]]
    print("done removing duplicates")

    rawData = copy.deepcopy(flowFileData)
    cleanFeatures(flowFileData, modelFeatures)

    # cast numeric string features to float
    for dataPoint in flowFileData:
        for feature in intFeatures:
            dataPoint[feature] = float(dataPoint[feature])

    # loads features from a dictionary; link for reference:
    # http://scikit-learn.org/stable/modules/feature_extraction.html#dict-feature-extraction
    vec = DictVectorizer()
    data = vec.fit_transform(flowFileData).toarray()
    dataScaled = preprocessing.scale(data)
    # dataScaled = preprocessing.MinMaxScaler().fit_transform(data)
    print("Original data Dimensions:", dataScaled.shape)

    instances = []
    for dataPoint in dataScaled:
        instances.append(tuple(dataPoint))

    print('starting lof')
    lof = outliers(5, instances)
    for outlier in lof:
        value = outlier["lof"]
        index = outlier["index"]
        print(value, index)

    """# run PCA
    # sklearn_pca = sklearnPCA(n_components=.99)
    sklearn_pca = sklearnPCA(n_components=3)
    dataReduced = sklearn_pca.fit_transform(dataScaled)
    print "Variance Accounted for:", sklearn_pca.explained_variance_ratio_
    print "PCA Data Dimensions:", dataReduced.shape"""

    """
    # use_colours = {0: 'green', 1: 'red'}
    use_colours = {'LogAttribute': 'blue', 'GenerateFlowFile': 'green',
                   'ExecuteScript': 'red', 'Input Port': 'black',
                   'PutFile': 'purple'}
    use_sizes = {0: 10, 1: 50}
    use_markers = {0: 'o', 1: 'x'}
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.set_title(flowName)
    ax.set_xlabel('Column a')
    ax.set_ylabel('Column b')
    ax.set_zlabel('Column c')
    ax.view_init(elev=50, azim=60)  # elevation and angle
    ax.dist = 12
    ax.scatter(
        dataReduced[0:len(dataReduced), 0],
        dataReduced[0:len(dataReduced), 1],
        dataReduced[0:len(dataReduced), 2],  # data
        color=[use_colours[x["componentType"]] for x in rawData],  # marker colour
        marker='o',  # marker shape
        s=[use_sizes[x[1]] for x in groundTruth]  # marker size
    )
    classes = ['LogAttribute', 'GenerateFlowFile', 'ExecuteScript',
               'Input Port', 'PutFile']
    class_colours = ['blue', 'green', 'red', 'black', 'purple']
    recs = []
    for i in range(0, len(class_colours)):
        recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i]))
    plt.legend(recs, classes, loc=4, fontsize=10)
    # color=[use_colours[x[1]] for x in groundTruth]
    plt.show()"""

    """for i in xrange(0, 80, 20):
        for j in xrange(0, 100, 45):
            ax.view_init(elev=i, azim=j)
            plt.savefig(saveFigureDirectory + flowName +
                        " elev" + str(i) + " angle" + str(j) + ".png")"""

    print("script complete")
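# Since the DictVectorizer step above is where categorical provenance fields
# become numeric, here is a minimal, self-contained illustration of its
# behavior on two hypothetical events (string fields are one-hot encoded,
# numeric fields pass through):
from sklearn.feature_extraction import DictVectorizer

vec_demo = DictVectorizer()
X = vec_demo.fit_transform([
    {"eventType": "CREATE", "entitySize": 1024.0, "durationMillis": 12.0},
    {"eventType": "SEND", "entitySize": 2048.0, "durationMillis": 7.0},
]).toarray()
print(X.shape)  # (2, 4): durationMillis, entitySize, eventType=CREATE, eventType=SEND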