-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.py
84 lines (65 loc) · 1.99 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import time
from cluster import ClusterTool
from collections import defaultdict
###################################################################################################
# Script configuration: input file, output directory and filter values.
#Test values
#filename = "test.txt"
#out_path = "/home/pmourlanne/stuff/nx_community/test/"
#threshold_IP = 3
#Actual values
# Input file passed to loadAccounts(), which reads it line by line.
filename = "/home/pmourlanne/graph/output_parsed"
# Base output directory; the script appends "<threshold>IPS/" to it before saving or loading.
out_path = "/home/pmourlanne/graph/save/"
# NOTE(review): not referenced anywhere in the visible part of this file - confirm it is still needed.
threshold_IP = 50
# loadAccounts() keeps only the lines whose third whitespace-separated field equals this value.
client_name = '64d7c69f0679a9ed3df7597352afda1018d63558c87fecc8fcc26e09'
###################################################################################################
def loadAccounts(filename, client_filter=None):
    """Build a mapping from username to the set of IPs seen for it.

    Each line of ``filename`` is split on whitespace: the first field is
    taken as the IP, the third as the client identifier and the last as the
    username.  Only lines whose client identifier equals ``client_filter``
    contribute to the result.  Progress and elapsed time are printed.

    Args:
        filename: Path of the text file to read.
        client_filter: Client identifier to keep.  When ``None`` (the
            default) the module-level ``client_name`` is used, which is the
            behaviour callers had before this parameter existed.

    Returns:
        A ``defaultdict(set)`` mapping each username to its set of IPs.
    """
    if client_filter is None:
        # Resolved lazily so the module constant is only needed on the
        # default path.
        client_filter = client_name

    accounts = defaultdict(set)
    start = time.time()
    print("")
    print("Creating the list of accounts")
    with open(filename) as data:
        for line in data:
            words = line.split()
            # Blank or truncated lines have no client field; skip them
            # rather than aborting the whole load with an IndexError.
            if len(words) < 3:
                continue
            if words[2] == client_filter:
                accounts[words[-1]].add(words[0])
    print("List of accounts created in %d seconds" % (time.time() - start))
    print("")
    return accounts
def findSeeds(accounts, threshold):
    """Collect the IPs of every account that has at least ``threshold`` IPs.

    Args:
        accounts: Mapping from username to a collection of IPs.
        threshold: Minimum number of IPs an account needs for its IPs to be
            included.

    Returns:
        A set holding the union of the IPs of all qualifying accounts
        (empty when no account qualifies).
    """
    qualifying = [ips for ips in accounts.values() if len(ips) >= threshold]
    # union() with no arguments simply yields an empty set.
    return set().union(*qualifying)
def clusterFromFile(path):
    """Run the clustering step for the graph stored under ``path``.

    Creates a ``ClusterTool`` with no arguments, calls ``loadGraph(path)`` on
    it, calls ``buildClusters()`` on its ``g`` attribute, and finally calls
    ``saveClusters(path)`` with the same path.

    Args:
        path: Location handed to both ``loadGraph`` and ``saveClusters``.
    """
    tool = ClusterTool()
    tool.loadGraph(path)
    tool.g.buildClusters()
    tool.saveClusters(path)
###################################################################################################
# Reference point for every elapsed-time message printed below.
start = time.time()

# Thresholds to build graphs for. The list is empty as written, so the
# graph-building loop below does not execute.
#thresholds = (20, 50, 100, 250)
thresholds = []
#thresholds.append(100)

for threshold in thresholds:
    accounts = loadAccounts(filename)
    seeds = findSeeds(accounts, threshold)

    ct = ClusterTool(seeds)
    ct.addAccountList(accounts)
    ct.buildGraph(False, 20)

    # One output directory per threshold, e.g. "<out_path>100IPS/".
    path = out_path + str(threshold) + "IPS/"
    ct.saveGraph(path)
    del ct

    print("")
    print("Graph for threshold %d built and saved in %d seconds" % (threshold, time.time() - start))
    print("")

# Run the clustering step on the "100IPS/" directory under out_path.
path = out_path + "100IPS/"
clusterFromFile(path)

print("Script finished in %d seconds" % (time.time() - start))