Example #1
def test_duplicate_instances():
    instances = (
        (1, 1, 1, 1),
        (2, 2, 2, 2),
        (3, 3, 3, 3),
        (2, 1, 1, 1),
        (1, 2, 1, 1),
        (2, 2, 2, 2),
        (2, 2, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 1, 1),
        (13, 13, 13, 13),
    )
    lof.outliers(1, instances)
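test_duplicate_instances only checks that outliers() survives duplicate points. A hedged follow-up test, assuming (as the [:5] slicing in Example #13 suggests) that results come back sorted by descending LOF:

def test_duplicate_instances_ranking():
    instances = ((1, 1, 1, 1), (2, 2, 2, 2), (3, 3, 3, 3), (2, 1, 1, 1),
                 (1, 2, 1, 1), (2, 2, 2, 2), (2, 2, 2, 2), (1, 1, 1, 1),
                 (1, 1, 1, 1), (13, 13, 13, 13))
    result = lof.outliers(1, instances)
    # (13, 13, 13, 13) sits far from the duplicate-heavy cluster, so it should rank first
    if result:
        assert result[0]["instance"] == (13, 13, 13, 13)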
def paramSweep(folderName, paramRange):
    """
    Calculate the Local Outlier Factor (LOF) for all files in the given folder, using the mean values
    of the 12 MFCC features, and sweep through different values of the minPts parameter
    @param folderName: Name of the folder inside the "sound" folder, e.g. to process ./sound/car, pass "car" as folderName
    @param paramRange: Values for the minPts parameter, as a list
    """
    fileFeatureDict = extractFeatures(folderName)

    if fileFeatureDict is not None:

        """ Build list of the mean values: """
        featureList = []
        for entry in fileFeatureDict.values():
            featureList.append(tuple(entry))

        fileNames = list(fileFeatureDict.keys())

        for minPts in paramRange:
            print("------ minPts = " + str(minPts) + " ---------")
            lof = outliers(minPts, featureList)
            for outlier in lof:
                print("File " + str(fileNames[outlier["index"]]) + " has LOF of " + str(outlier["lof"]))

            print("Total number of outliers: " + str(len(lof)))
Example #3
def Anomaly_Detect(request):
    time = request.POST.get('time', None)
    with open('static/day/' + time + '/sum.json', 'r') as rf:
        lines = json.load(rf)

    all_nums = []
    for line in lines:
        all_nums.append(line[0])

    instances = []
    for i in range(len(all_nums)):
        if i != 0 and i % 4 == 0:
            instances.append((all_nums[i - 4], all_nums[i - 3], all_nums[i - 2], all_nums[i - 1]))

    M = -1
    # always include the trailing window (may duplicate the last full chunk)
    instances.append((all_nums[-4], all_nums[-3], all_nums[-2], all_nums[-1]))
    for tu in instances:
        M = max(M, max(tu))

    exceptions = outliers(5, instances)

    error_index = ''  # indices of the anomalous points, separated by spaces
    for outlier in exceptions:  # each outlier dict carries an LOF value that decides whether a point counts as an outlier
        if outlier['lof'] > 1.23:
            error_index = error_index + ' ' + str(outlier['index'])

    return JsonResponse({
        'count_max': M,
        'index': error_index
    })
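A hedged way to exercise the view above with Django's test client; the route and date below are hypothetical and must match a configured URL and an existing static/day/<time>/sum.json file:

from django.test import Client

client = Client()
response = client.post('/anomaly/', {'time': '2019-01-01'})  # hypothetical route and date
print(response.json())  # {'count_max': ..., 'index': '...'}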
Example #4
def plotPoints(coords):
    LOF = lof.outliers(5, coords)
    '''
    for coord in coords:
        if lof.local_outlier_factor(5, coord) > 1:
            portland.plot(float(coord[0]), float(coord[1]), 'ro', markersize=7, latlon=True)
        else:
            portland.plot(float(coord[0]), float(coord[1]), 'bo', markersize=7, latlon=True)
    '''
    print(LOF)
Example #5
def test_normalization_problems(k,
                                user,
                                instances,
                                user_review,
                                candidate=None,
                                count=None):
    temp_list = []
    user_outer = {}  # results dict for this user
    outers = lof.outliers(k, instances, candidate)
    user_outer["user"] = user
    for i, outer in enumerate(outers):
        index = outer["index"]
        temp_list.append(dict(outer, **user_review[index]))
    user_outer["outer"] = temp_list
    user_outer["count"] = count
    saveFile(user_outer)
    return outers
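A hypothetical invocation; the instances, the review dicts, and the project-local saveFile and three-argument outliers variant are all assumptions here:

instances = [(1., 2.), (2., 3.), (1., 2.), (10., 10.)]
user_review = [{"review": "r%d" % i} for i in range(len(instances))]
test_normalization_problems(2, "alice", instances, user_review)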
def removeOutliers(folderName, minPts=10):
    """
    Move files that were detected as outliers to the outliers/ folder
    @param folderName: Name of the folder inside the "sound" folder, e.g. to process ./sound/car, pass "car" as folderName
    @param minPts: Parameter for the LOF algorithm: number of nearest neighbors used in defining the local neighborhood of
    the object (see the Breunig paper for details). Default value is 10.
    """
    fileFeatureDict = extractFeatures(folderName)

    if fileFeatureDict is not None:

        """ Build list of the mean values: """
        featureList = []
        for entry in fileFeatureDict.values():
            featureList.append(tuple(entry))

        """ Calculate local outlier factors for each file: """
        lof = outliers(minPts, featureList)

        sourceDir = str(os.getcwd()) + "/sound/" + folderName + "/"

        outlierDir = os.getcwd() + "/outliers/" + str(folderName)
        if not os.path.exists(outlierDir):
            os.makedirs(outlierDir)

        print(str(len(lof)) + " outliers were found for the class " + str(folderName) + " and will be moved to the outliers folder")

        successfullyMoved = 0
        fileNames = list(fileFeatureDict.keys())

        for outlier in lof:

            fileDir = sourceDir + str(fileNames[outlier["index"]])
            command = "mv '" + fileDir + "' '" + outlierDir + "/'"

            p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = p.communicate()

            if not err:
                successfullyMoved = successfullyMoved + 1

        if len(lof) == successfullyMoved:
            print("All outliers were successfully removed for the class " + str(folderName))
            return True
        else:
            print("Problems occurred when removing outliers for the class " + str(folderName))
            return False
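A minimal usage sketch; the "car" class folder under ./sound/ is an assumption:

if removeOutliers("car", minPts=10):
    print("outliers moved for ./sound/car")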
Example #8
def test_outliers():
    lof.outliers(1, instances)
Example #9
def test_normalization_problems():
    # see issue https://github.com/damjankuznar/pylof/issues/7

    instances = [(1., 2., 3.), (2., 3., 4.), (1., 2., 4.), (1., 2., 1.)]

    l = lof.outliers(1, instances)
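Since issue 7 concerns normalization, a hedged sanity check one could append inside this test is that every reported LOF value comes back finite:

    import math
    assert all(math.isfinite(o["lof"]) for o in l)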
Example #11
import time

import numpy as np
from matplotlib import pyplot as plt
import lof

arr = np.genfromtxt("tests/dataset.csv",
                    skip_header=1,
                    dtype=np.float64,
                    delimiter=",")

timings = []
for num_instances in [
        20, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000
]:
    start = time.time()
    l = lof.outliers(10, arr[:num_instances])
    timings.append((num_instances, time.time() - start))

timings = np.array(timings)
print(timings.tolist())
plt.plot(timings[:, 0], timings[:, 1])
plt.show()
def test_outliers(instances):
    t = lof.outliers(2, instances)
    for outlier in t:
        print(outlier["lof"], outlier["instance"])
    return t
Example #13
import lof
import csv
import numpy as np

if __name__ == "__main__":
    # Load the data into tuple
    data = np.loadtxt("click-stream event.csv", delimiter=',')
    dataTuple = tuple(map(tuple, data))

    # Print the top 5 outliers using Manhattan distance
    print("top 5 outliers using Manhattan distance")
    print(lof.outliersM(2, dataTuple)[:5])

    # Print the top 5 outliers using Euclidean distance
    print("top 5 outliers using Euclidean distance")
    print(lof.outliers(3, dataTuple)[:5])
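outliersM is presumably a project-local variant, not part of upstream pylof. A comparable Manhattan-distance run against the upstream API would look like the sketch below, assuming the distance_function keyword that pylof's LOF constructor accepts and that outliers() forwards its keyword arguments:

def distance_manhattan(instance1, instance2):
    # sum of absolute per-dimension differences
    return sum(abs(a - b) for a, b in zip(instance1, instance2))

print(lof.outliers(2, dataTuple, distance_function=distance_manhattan)[:5])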
Example #14
## (-5.1400897823762239, -1.3359248994019064),
## (5.2586932439960243, 0.032431285797532586),
## (6.3610915734502838, -0.99059648246991894),
## (-0.31086913190231447, -2.8352818694180644),
## (1.2288582719783967, -1.1362795178325829),
## (-0.17986204466346614, -0.32813130288006365),
## (2.2532002509929216, -0.5142311840491649),
## (-0.75397166138399296, 2.2465141276038754),
## (1.9382517648161239, -1.7276112460593251),
## (1.6809250808549676, -2.3433636210337503),
## (0.68466572523884783, 1.4374914487477481),
## (2.0032364431791514, -2.9191062023123635),
## (-1.7565895138024741, 0.96995712544043267),
## (3.3809644295064505, 6.7497121359292684),
## (-4.2764152718650896, 5.6551328734397766),
## (-3.6347215445083019, -0.85149861984875741),
## (-5.6249411288060385, -3.9251965527768755),
## (4.6033708001912093, 1.3375110154658127),
## (-0.685421751407983, -0.73115552984211407),
## (-2.3744241805625044, 1.3443896265777866)]

#print(len(instances))

results = outliers(5, instances)
for outlier in results:
    to_write = (str(outlier["lof"]) + "," + str(outlier["instance"]) + "," +
                str(outlier["index"]) + "\n")
    out.write(to_write)
out.close()
print("outlier detection done")
Example #15
k_list = lof.initialize_k_list(data)
benchmark = 1  # the gap between k and RC's size

l = lof.LOF(data, k_list)

normalized_data_array = np.array(l.instances)
(
    data_position,
    data_position_min,
    data_position_max,
) = mymds.get_position(normalized_data_array, 2)
#print(data_position)

print("start to compute RC")

rare_centers = lof.outliers(data, k_list)
#print(rare_centers)

center = rare_centers[0]
center_index = center["index"]
center_kinf = center["k_inf"]
# center point and its k-neighbours
neighbours = lof.get_neighbours(center_kinf, data[center_index], data)
# put together as RC
category = copy.deepcopy(neighbours)
category.append(data[center_index])
#print(category)

category_mean = ca.category_mean_relative(category, feature_mean)
print(category_mean)
category_index = []
Example #16
 (-0.75397166138399296, 2.2465141276038754),
 (1.9382517648161239, -1.7276112460593251),
 (1.6809250808549676, -2.3433636210337503),
 (0.68466572523884783, 1.4374914487477481),
 (2.0032364431791514, -2.9191062023123635),
 (-1.7565895138024741, 0.96995712544043267),
 (3.3809644295064505, 6.7497121359292684),
 (-4.2764152718650896, 5.6551328734397766),
 (-3.6347215445083019, -0.85149861984875741),
 (-5.6249411288060385, -3.9251965527768755),
 (4.6033708001912093, 1.3375110154658127),
 (-0.685421751407983, -0.73115552984211407),
 (-2.3744241805625044, 1.3443896265777866)]

from lof import outliers
lof = outliers(7, instances)

for outlier in lof:
    print(outlier["lof"], outlier["instance"])



from matplotlib import pyplot as p

x, y = zip(*instances)
p.scatter(x, y, 20, color="#0000FF")

for outlier in lof:
    value = outlier["lof"]
    instance = outlier["instance"]
    color = "#FF0000" if value > 1 else "#00FF00"
Example #17
# (114.48442	,36.57594),
# (114.48442	,36.57522),
# (114.48436	,36.57478),
# (114.48419	,36.57419),
# (114.48414	,36.57389),
# (114.48414	,36.57297),
# (114.48408	,36.57278),
# (114.484	,36.57239),
# (114.48394	,36.57183),
# (114.48375	,36.57106)
# ]

instances = ReadFromHive.getInstance()
print(instances)
from lof import outliers
lof = outliers(5, instances)

for outlier in lof:
    print(outlier["lof"], outlier["instance"])

from matplotlib import pyplot as p

x, y = zip(*instances)
p.scatter(x, y, 20, color="#0000FF")

for outlier in lof:
    value = outlier["lof"]
    instance = outlier["instance"]
    color = "#FF0000" if value > 6 else "#00FF00"
    p.scatter(instance[0],
              instance[1],
Example #19
def main():

    # actual provenance data of flow files
    flowFileData = []

    # list of features used in model
    modelFeatures = [
        "eventType", "componentId", "entitySize", "durationMillis"
    ]
    dedupeFeatures = [
        'eventId', "eventType", "componentId", "entitySize", "durationMillis",
        'componentType', 'updatedAttributes'
    ]
    # "eventType", "componentId", "entitySize", "durationMillis"
    # features that need to be type casted to int
    intFeatures = ["entitySize", "durationMillis"]

    fileDirectory = "/Users/wsong/Desktop/nifi/provenance-data/random-50000delay-mod-1000/*"
    saveFigureDirectory = "/Users/wsong/Desktop/Flow Provenance Graphs/Working with CSV/"
    flowName = "Random Time Delay"
    flowFileData = loadProvenanceData(fileDirectory, 500000)
    removeProvenanceReporterContamination(flowFileData)
    print "list size after contamination removed", len(flowFileData)
    cleanFeatures(flowFileData, dedupeFeatures)
    """# populate random times so not all identical points
    for event in flowFileData:
        event["durationMillis"] = random.uniform(0, 1)"""
    # obtain anomaly count and anomaly locations
    groundTruth = findGroundTruth(flowFileData)
    anomalyIndexList = []
    count = 0
    for i, truth in enumerate(groundTruth):
        if truth[1] == 1:
            anomalyIndexList.append(i)
            count += 1
    print("number of anomalies", count)
    print("number of events:", len(flowFileData))
    print("anomaly indices", anomalyIndexList)
    # populate anomalous times
    for index in anomalyIndexList:
        flowFileData[index]["durationMillis"] = random.uniform(50, 100)
        print flowFileData[index]["durationMillis"]

    # [dict(t) for t in set([tuple(sorted(d.items())) for d in flowFileData])]
    print "removing dupilcates"
    # the below solution cant even finish
    # [i for n, i in enumerate(flowFileData) if i not in flowFileData[n + 1:]]
    print "done removing duplicates"
    rawData = copy.deepcopy(flowFileData)

    cleanFeatures(flowFileData, modelFeatures)

    # cast numeric feature strings to float
    for dataPoint in flowFileData:
        for feature in intFeatures:
            dataPoint[feature] = float(dataPoint[feature])

    # loads features from a dictionary
    # link for reference:
    # http://scikit-learn.org/stable/modules/feature_extraction.html#dict-feature-extraction
    vec = DictVectorizer()
    data = vec.fit_transform(flowFileData).toarray()

    dataScaled = preprocessing.scale(data)
    # dataScaled = preprocessing.MinMaxScaler().fit_transform(data)

    print "Original data Dimensions:", dataScaled.shape
    instances = []
    for dataPoint in dataScaled:
        instances.append(tuple(dataPoint))

    print('starting lof')
    lof = outliers(5, instances)
    for outlier in lof:
        value = outlier["lof"]
        index = outlier["index"]
        print(value, index)
    """# run PCA
    # sklearn_pca = sklearnPCA(n_components=.99)
    sklearn_pca = sklearnPCA(n_components=3)
    dataReduced = sklearn_pca.fit_transform(dataScaled)
    print "Variance Accounted for:", sklearn_pca.explained_variance_ratio_

    print "PCA Data Dimensions:", dataReduced.shape"""
    """
    # use_colours = {0: 'green', 1: 'red'}
    use_colours = {'LogAttribute': 'blue', 'GenerateFlowFile': 'green', 'ExecuteScript': 'red', 'Input Port': 'black', 'PutFile': 'purple'}
    use_sizes = {0: 10, 1: 50}
    use_markers = {0: 'o', 1: 'x'}
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.set_title(flowName)
    ax.set_xlabel('Column a')
    ax.set_ylabel('Column b')
    ax.set_zlabel('Column c')
    ax.view_init(elev=50, azim=60)              # elevation and angle
    ax.dist = 12
    ax.scatter(
           dataReduced[0:len(dataReduced), 0], dataReduced[0:len(dataReduced), 1], dataReduced[0:len(dataReduced), 2],  # data
           color=[use_colours[x["componentType"]] for x in rawData],     # marker colour
           marker='o',  # marker shape
           s=[use_sizes[x[1]] for x in groundTruth]          # marker size
           )
    classes = ['LogAttribute', 'GenerateFlowFile', 'ExecuteScript', 'Input Port', 'PutFile']
    class_colours = ['blue', 'green', 'red', 'black', 'purple']
    recs = []
    for i in range(0,len(class_colours)):
        recs.append(mpatches.Rectangle((0,0),1,1,fc=class_colours[i]))
    plt.legend(recs,classes, loc = 4, fontsize=10)
    # color=[use_colours[x[1]] for x in groundTruth]
    plt.show()"""
    """for i in xrange(0, 80, 20):
        for j in xrange(0, 100, 45):
            ax.view_init(elev=i, azim=j)
            plt.savefig(saveFigureDirectory + flowName + " elev"+str(i)+" angle"+str(j)+".png")"""

    print "script complete"