/
Community_Detection.py
501 lines (395 loc) · 35.4 KB
/
Community_Detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
# -*- coding: utf-8 -*-
"""Project-5 (Prudhviraj Sheela).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1fFLw8nrph7YRQXlUDEnOKfVxpmlNrfoy
###***Project-5 (Community Detection using BigCLAM)***
###***Name : Prudhviraj Sheela***
###***OSU CWID : A20228857***
"""
# Importing all the necessary packages
import networkx as nx
import numpy as np
import pandas as pd
import math
"""***Loading of the Youtube Edge List Data***"""
# Build the graph from the YouTube edge list; node labels are parsed as integers.
G = nx.read_edgelist("/content/YouTube.edgelist", nodetype=int)
# Record |V| and |E| once; nodes_V is reused below as the row count of every factor matrix.
nodes_V = G.number_of_nodes()
edges_E = G.number_of_edges()
print(nodes_V, edges_E)
"""***Loading the Ground Truth Communities Data File***"""
groundtruth_communities_file = "/content/groundtruth_communities.txt"
# Each line of the file is one community: whitespace-separated integer node ids.
with open(groundtruth_communities_file) as fg:
    groundtruth_communities_list = [
        [int(token) for token in row.split()]
        for row in fg
    ]
# K, the number of communities, is the number of lines read from the ground-truth file.
K_Comm = len(groundtruth_communities_list)
K_Comm
"""# ***STEP-I: Factor Matrix Initialization***
***Method-1: Factor Matrix Initialization for 20 Percent Seed Communities***
***I) Loading the 20 Percent Seed Communities Data File***
***II) Computing the Factorization Matrix associated to 20 Percent Seed Communities and associated conductance calculation***
"""
percent20_seeds_communities_file = "/content/20percent_seed_communities.txt"
# One seed community per line, given as whitespace-separated integer node ids.
with open(percent20_seeds_communities_file) as fp:
    percent20_seeds_communities_list = [
        [int(token) for token in row.split()]
        for row in fp
    ]
# Start from an all-zero |V| x K factor matrix and set entry (node, community) to 1
# for every node listed in a seed community.
fact_matrix_seed20 = np.zeros((nodes_V, K_Comm))
for community_index, members in enumerate(percent20_seeds_communities_list):
    for node in members:
        fact_matrix_seed20[node, community_index] = 1
fact_matrix_seed20  # Displays the initial factor matrix for 20 percent seeds data
# Conductance-based expansion of the 20-percent seed factor matrix.
# For every row index `a` (used as a node id -- assumes the graph's nodes are labelled
# 0..|V|-1; TODO confirm), take the closed neighbourhood N(a) + [a], find the seed
# community whose member list extended by that neighbourhood has the lowest conductance,
# and mark every node of the neighbourhood as a member of that community.
for a in range(len(fact_matrix_seed20)):
    # The neighbourhood does not depend on the candidate community, so build it once per node.
    closed_neighborhood = list(G.neighbors(a)) + [a]
    value_minimum = np.inf
    # Reset per node so a winner from a previous node can never leak into this one.
    conduct_minimum = None
    # Iterate the seed list itself: indexing it by matrix column would raise IndexError
    # if the seed file holds fewer communities than the matrix has columns.
    for b, seed_members in enumerate(percent20_seeds_communities_list):
        conduct = nx.conductance(G, seed_members + closed_neighborhood)
        if conduct < value_minimum:
            value_minimum = conduct
            conduct_minimum = b
    if conduct_minimum is None:
        # No candidate produced a comparable conductance; leave this neighbourhood untouched.
        continue
    for c in closed_neighborhood:
        fact_matrix_seed20[c][conduct_minimum] = 1
fact_matrix_seed20  # Displays the final factor matrix for 20 percent seed communities
"""***Method-2: Factor Matrix Initialization for Neighborhood Seed Communities***
***I) Loading the Neighborhood Seed Communities Data File***
***II) Computing the Factorization Matrix associated to Neighborhood Seed Communities and associated conductance calculation***
"""
neighbor_seeds_communities_file = "/content/neighborhood_seeds.txt"
# One neighborhood seed community per line, given as whitespace-separated integer node ids.
with open(neighbor_seeds_communities_file) as fn:
    neighbor_seeds_communities_list = [
        [int(token) for token in row.split()]
        for row in fn
    ]
# Start from an all-zero |V| x K factor matrix and set entry (node, community) to 1
# for every node listed in a seed community.
fact_matrix_neighborseeds = np.zeros((nodes_V, K_Comm))
for community_index, members in enumerate(neighbor_seeds_communities_list):
    for node in members:
        fact_matrix_neighborseeds[node, community_index] = 1
fact_matrix_neighborseeds  # Displays the initial factor matrix for neighborhood seeds data
# Conductance-based expansion of the neighborhood-seed factor matrix.
# For every row index `a` (used as a node id -- assumes the graph's nodes are labelled
# 0..|V|-1; TODO confirm), take the closed neighbourhood N(a) + [a], find the seed
# community whose member list extended by that neighbourhood has the lowest conductance,
# and mark every node of the neighbourhood as a member of that community.
for a in range(len(fact_matrix_neighborseeds)):
    # The neighbourhood does not depend on the candidate community, so build it once per node.
    closed_neighborhood = list(G.neighbors(a)) + [a]
    value_minimum = np.inf
    # Reset per node so a winner from a previous node can never leak into this one.
    conduct_minimum = None
    # Iterate the seed list itself: indexing it by matrix column would raise IndexError
    # if the seed file holds fewer communities than the matrix has columns.
    for b, seed_members in enumerate(neighbor_seeds_communities_list):
        conduct = nx.conductance(G, seed_members + closed_neighborhood)
        if conduct < value_minimum:
            value_minimum = conduct
            conduct_minimum = b
    if conduct_minimum is None:
        # No candidate produced a comparable conductance; leave this neighbourhood untouched.
        continue
    for c in closed_neighborhood:
        fact_matrix_neighborseeds[c][conduct_minimum] = 1
fact_matrix_neighborseeds  # Displays the final factor matrix for neighborhood seed communities
"""***Method-3: Simple Factor Matrix Initialization with random numbers between [0,1]***"""
# Baseline initialisation: a |V| x K matrix of random draws from np.random.rand,
# rounded to 4 decimal places.
fact_simple = np.around(np.random.rand(nodes_V, K_Comm), decimals=4)
fact_simple  # Displays the final simple factor matrix obtained
"""# ***STEP-II: Matrix Factorization Optimization using BigCLAM v2.0***
***Method-1: Matrix Factorization for 20 Percent Seed Communities Data File***
"""
def _bigclam_optimize(factor_matrix, graph, step_size, tolerance, max_sweeps=300):
    """Run projected gradient ascent on a BigCLAM factor matrix, in place.

    One sweep visits every row index ``u`` of ``factor_matrix`` and computes

        gradient = sum_{v in N(u)} F[v] * exp(-F[u].F[v]) / (1 - exp(-F[u].F[v]))
                   - (sum_w F[w] - F[u] - sum_{v in N(u)} F[v])

    then sets ``F[u] = max(F[u] + step_size * gradient, 0)`` element-wise.
    The column totals ``sum_w F[w]`` are taken once at the start of each sweep
    and are not refreshed while rows are updated within that sweep.

    Args:
        factor_matrix: 2-D float array with one row per node and one column per
            community; it is modified in place. Row indices are used as node
            ids (assumes nodes are labelled 0..rows-1 -- TODO confirm).
        graph: Object exposing ``neighbors(node)``; the neighbour ids are used
            as row indices into ``factor_matrix``.
        step_size: Multiplier applied to the gradient in each row update.
        tolerance: A node counts as converged in a sweep when
            ``abs(0.1 * sum(gradient) / sum(F[u]))`` is below this value.
        max_sweeps: Maximum number of full passes over all rows.

    Returns:
        The same ``factor_matrix`` object that was passed in.
    """
    num_nodes = len(factor_matrix)
    for _ in range(max_sweeps):
        column_totals = factor_matrix.sum(axis=0)
        # Count converged nodes per sweep. A counter that accumulates across sweeps
        # makes the "every node converged" comparison below meaningless.
        converged_nodes = 0
        for node in range(num_nodes):
            row = factor_matrix[node]
            neighbor_pull = np.zeros(len(row))
            neighbor_total = np.zeros(len(row))
            for neighbor in graph.neighbors(node):
                neighbor_row = factor_matrix[neighbor]
                # exp(-F[u].F[v]) is evaluated once and reused for numerator and denominator.
                exp_neg = np.exp(-np.matmul(row, neighbor_row))
                denominator = 1 - exp_neg
                # A zero denominator (zero dot product) would divide by zero; the edge
                # then contributes nothing to the first gradient term.
                weight = exp_neg / denominator if denominator else 0
                neighbor_pull = neighbor_pull + neighbor_row * weight
                neighbor_total = neighbor_total + neighbor_row
            # Sum of F over every node that is neither `node` itself nor one of its neighbours.
            non_neighbor_total = column_totals - row - neighbor_total
            gradient = neighbor_pull - non_neighbor_total
            row_total = row.sum()
            # The relative change is undefined for an all-zero row, so such a row is never
            # counted as converged. The magnitude is compared, so a large negative
            # change is not mistaken for convergence.
            if row_total > 0 and abs(0.1 * gradient.sum() / row_total) < tolerance:
                converged_nodes += 1
            updated = row + step_size * gradient
            updated[updated < 0] = 0  # project back onto the non-negative orthant
            factor_matrix[node] = updated
        if converged_nodes == num_nodes:
            break
    return factor_matrix


# Method-1: optimise the factor matrix built from the 20 percent seed communities.
_bigclam_optimize(fact_matrix_seed20, G, step_size=0.001, tolerance=0.001)
fact_matrix_seed20  # Displays the updated 20 percent seed community factor matrix
"""***Method-2: Matrix Factorization for Neighborhood Seed Communities Data File***"""
_bigclam_optimize(fact_matrix_neighborseeds, G, step_size=0.001, tolerance=0.001)
fact_matrix_neighborseeds  # Displays the updated neighborhood community factor matrix
"""***Method-3: Matrix Factorization for Simple Factor Matrix generated randomly***"""
# The randomly initialised matrix uses a 10x smaller step and tolerance than the seeded ones.
_bigclam_optimize(fact_simple, G, step_size=0.0001, tolerance=0.0001)
fact_simple  # Displays the updated simple random factor matrix
"""# ***Step-III: Community Assignment***
***Initializing the value of δ = sqrt(-log(1 - 𝜀)) where 𝜀 = 10^(-8)***
"""
# Membership threshold: delta = sqrt(-log(1 - eps)) with eps = 10^-8.
_eps = math.pow(10, -8)
delta = math.sqrt(-math.log(1 - _eps))
delta
"""***Method-1: Community Assignment for 20 Percent Seed Communities Data***"""
# Map each community index to the list of node indices whose factor value for that
# community is at least delta. Every community index gets an entry, possibly an empty list.
factmat20_comm_membership = {}
for node_index, node_row in enumerate(fact_matrix_seed20):
    for community_index, strength in enumerate(node_row):
        members = factmat20_comm_membership.setdefault(community_index, [])
        if strength >= delta:
            members.append(node_index)
print(factmat20_comm_membership)  # Displays the communities detected for the 20 percent seed data
"""***Output-1: Writing the detected communities for 20 Percent Seed Communities data into an output file***"""
# Write the communities detected for the 20 percent seed data: a header line followed by
# one "<community>---><node list>" line per community. The `with` block closes the file
# even if a write fails part-way through.
with open('/content/20percent_seed_detected.txt', 'w') as file_det1:
    file_det1.write("Community" + "--->" + "Nodes" + "\n")
    for key, value in factmat20_comm_membership.items():
        file_det1.write(str(key) + "--->" + str(value) + "\n")
"""***Method-2: Community Assignment for Neighborhood Communities Data***"""
# Map each community index to the list of node indices whose factor value for that
# community is at least delta. Every community index gets an entry, possibly an empty list.
factmatneighbor_comm_membership = {}
for node_index, node_row in enumerate(fact_matrix_neighborseeds):
    for community_index, strength in enumerate(node_row):
        members = factmatneighbor_comm_membership.setdefault(community_index, [])
        if strength >= delta:
            members.append(node_index)
print(factmatneighbor_comm_membership)  # Displays the communities detected for the neighborhood seed data
"""***Output-2: Writing the detected communities for neighborhood communities data into an output file***"""
# Write the communities detected for the neighborhood seed data: a header line followed by
# one "<community>---><node list>" line per community. The `with` block closes the file
# even if a write fails part-way through.
with open('/content/neigborhood_seed_detected.txt', 'w') as file_det2:
    file_det2.write("Community" + "--->" + "Nodes" + "\n")
    for key, value in factmatneighbor_comm_membership.items():
        file_det2.write(str(key) + "--->" + str(value) + "\n")
"""***Method-3: Community Assignment for simple random generated data***"""
# Map each community index to the list of node indices whose factor value for that
# community is at least delta. Every community index gets an entry, possibly an empty list.
factmatsimple_comm_membership = {}
for node_index, node_row in enumerate(fact_simple):
    for community_index, strength in enumerate(node_row):
        members = factmatsimple_comm_membership.setdefault(community_index, [])
        if strength >= delta:
            members.append(node_index)
print(factmatsimple_comm_membership)  # Displays the communities detected for the randomly initialised data
"""***Output-3: Writing the detected communities for simple random generated data into an output file***"""
# Write the communities detected for the randomly initialised data: a header line followed
# by one "<community>---><node list>" line per community. The `with` block closes the file
# even if a write fails part-way through.
with open('/content/simple_generated_detected.txt', 'w') as file_det3:
    file_det3.write("Community" + "--->" + "Nodes" + "\n")
    for key, value in factmatsimple_comm_membership.items():
        file_det3.write(str(key) + "--->" + str(value) + "\n")
"""# ***Step-IV: Evaluation using Recall***
"""
# Re-read the ground-truth file into a dict keyed by community number (line order,
# starting at 0); each value is that line's list of integer node ids.
with open(groundtruth_communities_file) as gf:
    groundtruth_communities = {
        community_number: [int(token) for token in row.split()]
        for community_number, row in enumerate(gf)
    }
print(groundtruth_communities)  # Displays the ground-truth communities as {community: [nodes]}
"""***Method-1: Evaluating the Recall Score for 20 Percent Seed Communities Data***
"""
def _average_recall(groundtruth, detected, num_communities):
    """Greedily match detected communities to ground-truth ones and average the recall.

    Ground-truth communities are visited in dict order. Each one is paired with
    the not-yet-claimed detected community that maximises
    ``|detected & truth| / |truth|`` (ties keep the first one found), and that
    detected community is then unavailable to later ground-truth communities.

    Args:
        groundtruth: Dict mapping ground-truth community id to a list of node ids.
        detected: Dict mapping detected community id to a list of node ids.
        num_communities: Divisor for the summed recall (the script passes K_Comm).

    Returns:
        A ``(recall, matching)`` tuple. ``matching`` maps every ground-truth id
        to ``(detected_id, recall)``, or ``(-1, 0)`` when nothing overlapped or
        the ground-truth community is empty.
    """
    # Build each node set once instead of once per (truth, detected) pair.
    truth_sets = {cid: set(nodes) for cid, nodes in groundtruth.items()}
    detected_sets = {cid: set(nodes) for cid, nodes in detected.items()}
    matching = {}
    claimed = set()
    for truth_id, truth_nodes in truth_sets.items():
        best = (-1, 0)
        # An empty ground-truth community has no defined recall; it scores 0
        # rather than raising ZeroDivisionError.
        if truth_nodes:
            for detected_id, detected_nodes in detected_sets.items():
                if detected_id in claimed:
                    continue
                # The detected dict is indexed by the detected id and the ground truth
                # by the ground-truth id; swapping them scores the wrong pair.
                score = len(detected_nodes & truth_nodes) / len(truth_nodes)
                if score > best[1]:
                    best = (detected_id, score)
        matching[truth_id] = best
        claimed.add(best[0])
    if not num_communities:
        return 0.0, matching
    total = sum(score for _, score in matching.values())
    return total / num_communities, matching


# Method-1: recall of the communities detected from the 20 percent seed initialisation.
recall_20per_seed, matching = _average_recall(groundtruth_communities, factmat20_comm_membership, K_Comm)
recall_20per_seed  # Displays the recall score for 20 percent seed communities data
"""***Method-2: Evaluating the Recall Score for Neighborhood Seed Communities Data***
"""
recall_neighborhood_seed, matching = _average_recall(groundtruth_communities, factmatneighbor_comm_membership, K_Comm)
recall_neighborhood_seed  # Displays the recall score for neighborhood seed communities data
"""***Method-3: Evaluating the Recall Score for simple random generated Data***
"""
recall_simple_seed, matching = _average_recall(groundtruth_communities, factmatsimple_comm_membership, K_Comm)
recall_simple_seed  # Displays the recall score for simple random initialized data
"""## ***Graph-1:***
***Horizontal Bar Plot between types of factorizations and their recall scores***
"""
import matplotlib.pyplot as plt  # Plotting library used for every chart below

# Graph-1: one horizontal bar per initialisation strategy; bar length is the recall score.
plt.barh(
    ["20 Percent Seed Community", "Neighborhood Community", "Randomly Initialized"],
    [recall_20per_seed, recall_neighborhood_seed, recall_simple_seed],
    align='center',
    label="Recall Score",
)
plt.title('Recall Scores v/s Types of factor matrix initializations')  # Chart title
plt.xlabel('Recall Scores')  # Bars extend along x, so x carries the score
plt.ylabel('Types of factor matrix initializations')  # y carries the categories
plt.legend()  # Legend entry comes from the label passed to barh
plt.show()  # Render the figure
"""## ***Creating Additional Graphs***
## ***Graph-2:***
To construct the graph below, I evaluated the Miss Rate (also called the False Negative Rate) for each type of factor matrix initialization and plotted the results.
Miss Rate = 1 - Hit Rate
(Here, Hit Rate = Recall Score)
***Method-1: Evaluating the Miss Rates for 20 Percent Seed Communities Data***
"""
# Miss rate (false-negative rate) is the complement of the recall (hit rate).
# All three are derived in one pass, in the same order as the recall scores.
missrate_20per_seed, missrate_neighborhood_seed, missrate_simple_seed = (
    1 - hit_rate
    for hit_rate in (recall_20per_seed, recall_neighborhood_seed, recall_simple_seed)
)
missrate_20per_seed  # Bare expression: shown as the cell output when run in a notebook
"""***Method-2: Evaluating the Miss Rates for Neighborhood Seed Communities Data***"""
missrate_neighborhood_seed
"""***Method-3: Evaluating the Miss Rates for simple random generated Data***"""
missrate_simple_seed
"""***Horizontal Bar Plot between types of factorizations and their miss rates***"""
# Graph-2: one horizontal bar per initialisation strategy; bar length is the miss rate.
plt.barh(
    ["20 Percent Seed Community", "Neighborhood Community", "Randomly Initialized"],
    [missrate_20per_seed, missrate_neighborhood_seed, missrate_simple_seed],
    align='center',
    label="Miss Rates",
)
plt.title('Miss Rates v/s Types of factor matrix initializations')  # Chart title
plt.xlabel('Miss Rates')  # Bars extend along x, so x carries the rate
plt.ylabel('Types of factor matrix initializations')  # y carries the categories
plt.legend()  # Legend entry comes from the label passed to barh
plt.show()  # Render the figure
"""## ***Graph-3:***
To construct the graph below, I evaluated the Precision Score for each type of factor matrix initialization and plotted the results.
***Method-1: Evaluating the Precision Score for 20 Percent Seed Communities Data***
"""
def _match_by_precision(truth, predicted):
    """Greedily pair every key with the unclaimed community of highest precision.

    Both arguments are indexed by community key and each value is turned into a
    set of members (assumed to be dict-like mappings of key -> iterable of
    nodes - TODO confirm against how they are built).

    For each key the precision of a candidate pairing is
    ``|predicted[key] & truth[candidate]| / |predicted[key]|``.  A candidate
    replaces the current best only when it scores strictly higher, which is the
    same guard the recall sections of this script apply.  Without that guard
    every unclaimed candidate overwrote the previous one, so each key ended up
    paired with the last unclaimed community rather than the best one.

    NOTE(review): keys are taken from ``truth`` but used to index ``predicted``
    (and candidates are taken from ``predicted`` but index ``truth``), exactly
    as the recall sections do.  This presumes both mappings share the same
    keys - confirm.

    Returns:
        (matching, collected): ``matching`` maps key -> (paired id, precision),
        with ``(-1, 0)`` when nothing could be paired; ``collected`` lists the
        paired ids in the order they were claimed.
    """
    matching = {}
    collected = []
    for key in truth:
        best = (-1, 0)  # No partner yet, precision 0
        members = set(predicted[key])
        if members:  # An empty community would make the precision denominator zero
            for candidate in predicted:
                if candidate in collected:
                    continue  # Already paired with an earlier key
                score = len(members & set(truth[candidate])) / len(members)
                if score > best[1]:  # Keep only the best-scoring candidate
                    best = (candidate, score)
        matching[key] = best
        collected.append(best[0])  # Claim the winner so it cannot be reused
    return matching, collected


# Method-1: precision for the 20 percent seed communities factorisation.
matching, collected = _match_by_precision(groundtruth_communities, factmat20_comm_membership)
precision_20per_seed = sum(score for _, score in matching.values()) / K_Comm  # Average over the number of communities
precision_20per_seed  # Displays the precision score for 20 percent seed communities data
"""***Method-2: Evaluating the Precision Score for Neighborhood Seed Communities Data***"""
matching, collected = _match_by_precision(groundtruth_communities, factmatneighbor_comm_membership)
precision_neighborhood_seed = sum(score for _, score in matching.values()) / K_Comm  # Average over the number of communities
precision_neighborhood_seed  # Displays the precision score for neighborhood seed communities data
"""***Method-3: Evaluating the Precision Score for simple random generated Data***"""
matching, collected = _match_by_precision(groundtruth_communities, factmatsimple_comm_membership)
precision_simple_seed = sum(score for _, score in matching.values()) / K_Comm  # Average over the number of communities
precision_simple_seed  # Displays the precision score for simple random initialized data
"""***Horizontal Bar Plot between types of factorizations and their precision scores***"""
# Graph-3: one horizontal bar per initialisation strategy; bar length is the precision score.
plt.barh(
    ["20 Percent Seed Community", "Neighborhood Community", "Randomly Initialized"],
    [precision_20per_seed, precision_neighborhood_seed, precision_simple_seed],
    align='center',
    label="Precision Scores",
)
plt.title('Precision Scores v/s Types of factor matrix initializations')  # Chart title
plt.xlabel('Precision Scores')  # Bars extend along x, so x carries the score
plt.ylabel('Types of factor matrix initializations')  # y carries the categories
plt.legend()  # Legend entry comes from the label passed to barh
plt.show()  # Render the figure
"""## ***Comparing and Contrasting the results obtained for the 3 types of graphs:***
***A) For Graph-I (Recall Score v/s Types of Factor Matrix Initializations):***
--> Recall Score = (Original Documents ∩ Predicted Communities) / Original Documents
--> From my observation of the plotted graph, it is clear that the simple randomly initialized matrix has a higher recall score than the other matrix initialization methods. This indicates that the randomly initialized matrix has a higher level of correspondence between the detected communities and the ground truth communities (original data) and is returning positive results.
***B) For Graph-II (Miss Rate v/s Types of Factor Matrix Initializations):***
--> Miss Rate = 1 - Recall Score
--> From my observation of the plotted graph, it is clear that the neighborhood seed community matrix has a higher miss rate than the other matrix initialization methods. This indicates that the neighborhood seed communities have a lower level of correspondence between the detected communities and the ground truth communities (original data).
***C) For Graph-III (Precision Score v/s Types of Factor Matrix Initializations):***
--> Precision Score = (Original Documents ∩ Predicted Communities) / Predicted Documents
--> From my observation of the plotted graph, it is clear that the randomly initialized matrix has a higher precision score compared to the other matrix initialization methods. This indicates that the randomly initialized matrix has a higher accuracy between the detected communities and the ground truth communities (original data) and is returning accurate results.
"""