def generate_positive_features():
    """Score every pair in the module-global ``positive_samples``.

    Relies on module globals: ``positive_samples`` (iterable of node
    pairs), ``UG`` (networkx graph; the Soundarajan-Hopcroft metrics
    additionally need a 'community' node attribute on it) and ``nx``.

    Returns:
        list[list]: one row per successfully scored sample — seven
        topological link-prediction scores followed by the constant
        label ``1``. Samples that raise are reported and skipped.
    """
    # The seven per-edge metrics, in the order downstream code expects.
    metrics = (
        nx.resource_allocation_index,
        nx.jaccard_coefficient,
        nx.adamic_adar_index,
        nx.preferential_attachment,
        nx.cn_soundarajan_hopcroft,        # needs 'community' attribute
        nx.ra_index_soundarajan_hopcroft,  # needs 'community' attribute
        nx.within_inter_cluster,           # needs 'community' attribute
    )
    features = []
    print("Generating positive features......")
    for count, sample in enumerate(positive_samples):
        if count % 100 == 0:
            print(count)  # progress heartbeat
        feature = []
        try:
            for metric in metrics:
                # Each metric yields (u, v, score) triples for the ebunch.
                for _u, _v, p in metric(UG, [sample]):
                    feature.append(p)
            feature.append(1)  # label=1
        except Exception:
            # FIX: the original bare ``except`` still appended the
            # half-built row afterwards, producing ragged feature
            # vectors; skip the sample instead.
            print("one error at: " + str(count))
            continue
        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
def community_features(U, node_features, edges, sinks, sources):
    """Derive two per-edge community features for the edges of ``U``.

    Column 0: 1 if source and sink land in the same fluid community,
    else 0. Column 1: the within-inter-cluster score of the edge.
    Also writes each node's community id back onto the graph
    (pre-2.4 networkx ``U.node`` API).

    Returns:
        np.ndarray of shape (len(edges), 2).
    """
    # NOTE(review): within_inter_cluster needs a 'community' node
    # attribute already present on U at this point — confirm caller.
    wic_scores = lisify_links(nx.within_inter_cluster(U, ebunch=edges))

    # Partition the graph into 5 fluid communities and label each node.
    partition = nx.algorithms.community.asyn_fluidc(U, k=5)
    member_lists = [list(group) for group in partition]
    n_nodes = node_features.shape[0]
    labels = np.zeros(n_nodes)
    for cid in range(5):
        labels[member_lists[cid]] = cid

    # 1 when both endpoints share a community, 0 otherwise.
    same_side = (labels[sources] == labels[sinks]).astype(int)

    # Persist the labels on the graph for later community-aware metrics.
    for idx in range(n_nodes):
        U.node[idx]['community'] = int(labels[idx])

    result = np.vstack([same_side, wic_scores]).T
    logger.info(f'community_features generated: {result.shape}')
    return result
def get_features(L, flag):
    """Build a feature matrix for the candidate edges in ``L``.

    Args:
        L: list of (u, v) node pairs to score.
        flag: dict mapping feature names ('g0'..'g12', 'c0'..'c12',
            'h0'..'h18') to booleans; each feature flagged True appends
            one column to every row, in the order of the checks below.

    Returns:
        list[list]: X, one row of feature values per pair in ``L``.

    NOTE(review): relies on module globals — G (social graph), U (user
    attributes), C (check-ins per user), S (spot attributes), B / H / HB
    (heterogeneous graphs), LG (location graph) — plus the files
    'best_part_*.txt' / 'dendo_*.txt' for community assignments; confirm
    against callers. Uses the pre-2.4 networkx ``G.node`` attribute API.
    Several features temporarily remove the (u, v) edge so the score is
    not contaminated by the edge being predicted, then restore it.
    """
    X = [[] for i in range(len(L))]
    #=====================Social features(user-to-user graph)======================
    #g0.adamic adar score
    if flag['g0'] is True:
        print("get feature g0")
        preds = nx.adamic_adar_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g1.jaccard coefficient
    if flag['g1'] is True:
        print("get feature g1")
        preds = nx.jaccard_coefficient(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g2.resource_allocation
    if flag['g2'] is True:
        print("get feature g2")
        preds = nx.resource_allocation_index(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g3.preferentail_attachment
    if flag['g3'] is True:
        print("get feature g3")
        preds = nx.preferential_attachment(G, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #g4.shortest path length
    if flag['g4'] is True:
        print("get feature g4")
        cnt = 0
        for (u, v) in L:
            # Drop the edge itself before measuring the path; 1 encodes
            # "no path" (paths are scaled by 1/50000 so real distances
            # stay well below 1).
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                G.add_edge(u, v)
            else:
                if nx.has_path(G, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(G, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1
    #g5.common neighbors
    if flag['g5'] is True:
        print("get feature g5")
        cnt = 0
        for (u, v) in L:
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(G, u, v)]
                G.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(G, u, v)]
            X[cnt].append(len(T))
            cnt += 1
    #g6.Approximate katz for social graph
    if flag['g6'] is True:
        print("get feature g6")
        cnt = 0
        for (u, v) in L:
            # Counts length-2 and length-3 u->v walks (x==y or x-y edge).
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
                G.add_edge(u, v)
            else:
                for x in G.neighbors(u):
                    for y in G.neighbors(v):
                        if x == y or G.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1
    # g7: community-aware common neighbors (Soundarajan-Hopcroft), using
    # the best partition stored in best_part_G.txt ("node community" rows).
    if flag['g7'] is True:
        print("get feature g7")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                G.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1
    # g8: community-aware resource allocation, same partition file.
    if flag['g8'] is True:
        print("get feature g8")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                # NOTE(review): lines from iteration keep their '\n', so
                # this guard never fires; kept as written.
                if line == "":
                    continue
                v, c = line.split()
                c = int(c)
                G.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1
    # g9: within/inter cluster ratio, same partition file.
    if flag['g9'] is True:
        print("get feature g9")
        cnt = 0
        with open("best_part_G.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                G.node[v]['community'] = c
        iters = nx.within_inter_cluster(G, L, delta=0.5)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1
    # g10: cn_soundarajan_hopcroft accumulated over every level of the
    # Louvain dendrogram in dendo_G.txt ('level k' headers separate the
    # per-level "node community" assignments).
    if flag['g10'] is True:
        print("get feature g10")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        # Score the finished previous level before the
                        # next level's assignments overwrite it.
                        iters = nx.cn_soundarajan_hopcroft(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.node[v]['community'] = c
        # Score the final level and add it to the running sums.
        iters = nx.cn_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    # g11: same dendrogram accumulation with ra_index_soundarajan_hopcroft.
    if flag['g11'] is True:
        print("get feature g11")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.ra_index_soundarajan_hopcroft(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    # g12: same dendrogram accumulation with within_inter_cluster.
    if flag['g12'] is True:
        print("get feature g12")
        cnt = 0
        with open("dendo_G.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.within_inter_cluster(G, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    G.node[v]['community'] = c
        iters = nx.within_inter_cluster(G, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    #=========================checkin features=========================================
    #c0.follower number
    if flag['c0'] is True:
        print("get feature c0")
        cnt = 0
        for (u, v) in L:
            X[cnt].append(U[u]['follow_cnt'] * U[v]['follow_cnt'])  # fu*fv
            cnt += 1
    #c1.same time same location
    if flag['c1'] is True:
        print("get feature c1")
        cnt = 0
        for (u, v) in L:
            p = calculate_CCC(G, u, v)
            X[cnt].append(p)
            cnt += 1
    #c2.same time same distinct spot
    if flag['c2'] is True:
        print("get deature c2")
        cnt = 0
        for (u, v) in L:
            # Count distinct spots where u and v share an identical
            # (time, spot) check-in; k is presumably (time, spot_id).
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot and k in C[v]:
                    dis_same_spot.append(k[1])
                    p += 1
            X[cnt].append(p)
            cnt += 1
    #c3.same distinct spot (not necessarily same time)
    if flag['c3'] is True:
        cnt = 0
        print("get feature c3")
        for (u, v) in L:
            p = 0
            dis_same_spot = []
            for k in C[u]:
                if k[1] not in dis_same_spot:
                    for m in C[v]:
                        if k[1] == m[1]:
                            dis_same_spot.append(k[1])
                            p += 1
                            break
            X[cnt].append(p)
            cnt += 1
    #c4.min Entropy
    if flag['c4'] is True:
        print("get feature c4")
        cnt = 0
        for (u, v) in L:
            # Minimum positive entropy over spots with a shared check-in;
            # 0 when there is none.
            p = 0
            E_list = []
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    if spot in S and S[spot]['entropy'] > 0:
                        E_list.append(S[spot]['entropy'])
            if len(E_list) > 0:
                p = min(E_list)
            X[cnt].append(p)
            cnt += 1
    #c5.distance of mean_LL
    if flag['c5'] is True:
        cnt = 0
        print("get feature c5")
        for (u, v) in L:
            # Euclidean distance between the users' mean lat/long.
            dist = np.sqrt((U[u]['mean_LL'][0] - U[v]['mean_LL'][0])**2 +
                           (U[u]['mean_LL'][1] - U[v]['mean_LL'][1])**2)
            X[cnt].append(dist)
            cnt += 1
    #c6.weighted same location
    if flag['c6'] is True:
        print("get feature c6")
        cnt = 0
        for (u, v) in L:
            p = 0
            for k in C[u]:
                if k in C[v]:
                    spot = k[1]
                    #if spot in S and S[spot]['entropy'] > 0:
                    #p += 1/S[spot]['entropy']
                    if spot in S:
                        # Sum of both users' distances from the shared spot.
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[u]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[u]['mean_LL'][1])**2)
                        p += dist
                        dist = np.sqrt(
                            (S[spot]['LL'][0] - U[v]['mean_LL'][0])**2 +
                            (S[spot]['LL'][1] - U[v]['mean_LL'][1])**2)
                        p += dist
            X[cnt].append(p)
            cnt += 1
    #c7.PP
    if flag['c7'] is True:
        print("get feature c7")
        cnt = 0
        for (u, v) in L:
            # Product of check-in counts.
            p = len(C[u]) * len(C[v])
            X[cnt].append(p)
            cnt += 1
    #c8.Total Common Friend Closeness (TCFC)
    if flag['c8'] is True:
        print("get feature c8")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    T1 = [x for x in nx.common_neighbors(G, u, w)]
                    T2 = [x for x in nx.common_neighbors(G, v, w)]
                    p += len(T1) * len(T2)
            X[cnt].append(p)
            cnt += 1
    #c9.Total Common friend Checkin Count (TCFCC)
    if flag['c9'] is True:
        print("get feature c9")
        cnt = 0
        for (u, v) in L:
            p = 0
            if G.has_edge(u, v):
                G.remove_edge(u, v)
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
                G.add_edge(u, v)
            else:
                for w in nx.common_neighbors(G, u, v):
                    p += calculate_CCC(G, u, w) * calculate_CCC(G, v, w)
            X[cnt].append(p)
            cnt += 1
    #c10.Common Category Checkin Counts Product (CCCP)
    if flag['c10'] is True:
        print("get feature c10")
        cnt = 0
        for (u, v) in L:
            p = 0
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += U[u]['cate'][cat] * U[v]['cate'][cat]
            X[cnt].append(p)
            cnt += 1
    #c11. Common Category Checkin Counts Product Ratio(CCCPR)
    if flag['c11'] is True:
        print("get feature c11")
        cnt = 0
        for (u, v) in L:
            # Cosine-style normalisation by each user's category norms.
            p = 0
            u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate'])
            v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate'])
            for cat in U[u]['cate']:
                if cat in U[v]['cate']:
                    p += (U[u]['cate'][cat] * U[v]['cate'][cat] /
                          np.sqrt(u_cate_total * v_cate_total))
            X[cnt].append(p)
            cnt += 1
    #c12.trip route length all
    if flag['c12'] is True:
        print("get feature c12")
        cnt = 0
        for (u, v) in L:
            # Sum each user's total travelled distance over consecutive
            # known spots; lastSpot == (0, 0) marks "no previous spot yet".
            tripDayLen1 = list()
            tripDayLen2 = list()
            tripDay = "starting"
            tripLen = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[u]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen += np.sqrt((lastSpot[0] - S[k[1]]['LL'][0])**2 +
                                           (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            tripDay = "starting"
            tripLen2 = 0.0
            lastSpot = [0.0, 0.0]
            for k in C[v]:
                if not (lastSpot[0] == 0.0 and lastSpot[1] == 0.0):
                    if k[1] in S:
                        tripLen2 += np.sqrt(
                            (lastSpot[0] - S[k[1]]['LL'][0])**2 +
                            (lastSpot[1] - S[k[1]]['LL'][1])**2)
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
                else:
                    if k[1] in S:
                        lastSpot[0] = S[k[1]]['LL'][0]
                        lastSpot[1] = S[k[1]]['LL'][1]
            X[cnt].append(tripLen + tripLen2)
            cnt += 1
    #=========================Heter Graph features=====================================
    #h0.Approximate katz for bipartite graph
    if flag['h0'] is True:
        print("get feature h0")
        cnt = 0
        for (u, v) in L:
            p = 0
            for x in B.neighbors(u):
                for y in B.neighbors(v):
                    if x == y or B.has_edge(x, y):
                        p += 1
            X[cnt].append(p)
            cnt += 1
    #h1.Approximate katz on HB
    if flag['h1'] is True:
        print("get feature h1")
        cnt = 0
        for (u, v) in L:
            p = 0
            if HB.has_edge(u, v):
                HB.remove_edge(u, v)
                for x in HB.neighbors(u):
                    for y in HB.neighbors(v):
                        if x == y or HB.has_edge(x, y):
                            p += 1
                HB.add_edge(u, v)
            else:
                for x in HB.neighbors(u):
                    for y in HB.neighbors(v):
                        if x == y or HB.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1
    #h2.Approximate katz on H
    if flag['h2'] is True:
        print("get feature h2")
        cnt = 0
        for (u, v) in L:
            p = 0
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
                H.add_edge(u, v)
            else:
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1
    #h3.shortest path length on B
    if flag['h3'] is True:
        print("get feature h3")
        cnt = 0
        for (u, v) in L:
            if nx.has_path(B, u, v):
                X[cnt].append(
                    nx.shortest_path_length(B, source=u, target=v) / 50000)
            else:
                X[cnt].append(1)
            cnt += 1
    #h4.clustering coefiicient on H
    if flag['h4'] is True:
        print("get feature h4")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                p = nx.clustering(H, u) * nx.clustering(H, v)
                H.add_edge(u, v)
            else:
                p = nx.clustering(H, u) * nx.clustering(H, v)
            X[cnt].append(p)
            cnt += 1
    #h5. number of (user's loc friends)'s loc friends
    if flag['h5'] is True:
        print("get feature h5")
        cnt = 0
        for (u, v) in L:
            # Non-numeric node ids are location nodes in H.
            counter1 = 0
            for neighbor in H.neighbors(u):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        # NOTE(review): re-tests `neighbor`, not
                        # `neighbor2` — looks like a typo; kept as-is.
                        if not neighbor.isnumeric():
                            counter1 += 1
            counter2 = 0
            for neighbor in H.neighbors(v):
                if not neighbor.isnumeric():
                    for neighbor2 in H.neighbors(neighbor):
                        if not neighbor.isnumeric():
                            counter2 += 1
            #print(str(counter1)+" "+str(counter2)+"\n")
            X[cnt].append(counter1 * counter2)
            cnt += 1
    #h6.location friends' degree sum
    if flag['h6'] is True:
        print("get feature h6")
        cnt = 0
        for (u, v) in L:
            counter1 = 0
            for locationNeighbor in H.neighbors(u):
                if not locationNeighbor.isnumeric():
                    #print(str(locationNeighbor)+"\n")
                    if locationNeighbor in LG:
                        counter1 += LG.degree(locationNeighbor)
            counter2 = 0
            for locationNeighbor in H.neighbors(v):
                if not locationNeighbor.isnumeric():
                    if locationNeighbor in LG:
                        counter2 += LG.degree(locationNeighbor)
            X[cnt].append(counter1 * counter2)
            cnt += 1
    #h7. Approximate katz for social graph
    if flag['h7'] is True:
        print("get feature h7")
        cnt = 0
        for (u, v) in L:
            # Katz-style count restricted to location (non-numeric) nodes.
            counter = 0
            for node in H.neighbors(u):
                if not node.isnumeric():
                    for node2 in H.neighbors(v):
                        if not node2.isnumeric():
                            if node == node2 or H.has_edge(node, node2):
                                counter += 1
            X[cnt].append(counter)
            cnt += 1
    #h8. adamic adar score on H
    if flag['h8'] is True:
        print("get feature h8")
        preds = nx.adamic_adar_index(H, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #h9. resource_allocation on H
    if flag['h9'] is True:
        print("get feature h9")
        preds = nx.resource_allocation_index(H, L)
        cnt = 0
        for (u, v, p) in preds:
            X[cnt].append(p)
            cnt += 1
    #h10. shortest path length on H
    if flag['h10'] is True:
        print("get feature h10")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                if nx.has_path(H, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(H, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
                H.add_edge(u, v)
            else:
                if nx.has_path(H, u, v):
                    X[cnt].append(
                        nx.shortest_path_length(H, source=u, target=v) / 50000)
                else:
                    X[cnt].append(1)
            cnt += 1
    #h11.common neighbors on H
    if flag['h11'] is True:
        print("get feature h11")
        cnt = 0
        for (u, v) in L:
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                T = [w for w in nx.common_neighbors(H, u, v)]
                H.add_edge(u, v)
            else:
                T = [w for w in nx.common_neighbors(H, u, v)]
            X[cnt].append(len(T))
            cnt += 1
    #h12.Approximate katz for social graph
    if flag['h12'] is True:
        print("get feature h12")
        cnt = 0
        for (u, v) in L:
            p = 0
            if H.has_edge(u, v):
                H.remove_edge(u, v)
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
                H.add_edge(u, v)
            else:
                for x in H.neighbors(u):
                    for y in H.neighbors(v):
                        if x == y or H.has_edge(x, y):
                            p += 1
            X[cnt].append(p)
            cnt += 1
    # h13-h15: community-aware scores on HB using best_part_HB.txt,
    # mirroring g7-g9 on G.
    if flag['h13'] is True:
        print("get feature h13")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                HB.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1
    if flag['h14'] is True:
        print("get feature h14")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                if line == "":
                    continue
                v, c = line.split()
                c = int(c)
                HB.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1
    if flag['h15'] is True:
        print("get feature h15")
        cnt = 0
        with open("best_part_HB.txt", "r") as f:
            for line in f:
                v, c = line.split()
                c = int(c)
                HB.node[v]['community'] = c
        iters = nx.within_inter_cluster(HB, L, delta=0.5)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(preds[(u, v)])
            cnt += 1
    # h16-h18: dendrogram-accumulated scores on HB using dendo_HB.txt,
    # mirroring g10-g12 on G.
    if flag['h16'] is True:
        print("get feature h16")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.cn_soundarajan_hopcroft(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.node[v]['community'] = c
        iters = nx.cn_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    if flag['h17'] is True:
        print("get feature h17")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.node[v]['community'] = c
        iters = nx.ra_index_soundarajan_hopcroft(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    if flag['h18'] is True:
        print("get feature h18")
        cnt = 0
        with open("dendo_HB.txt", "r") as f:
            line = f.readline()
            p_dict = {(u, v): 0.0 for (u, v) in L}
            for line in f:
                if 'level' in line:
                    l = int(line.split()[1])
                    if l != 0:
                        iters = nx.within_inter_cluster(HB, L)
                        for (u, v, p) in iters:
                            p_dict[(u, v)] += p
                else:
                    v, c = line.split()
                    c = int(c)
                    HB.node[v]['community'] = c
        iters = nx.within_inter_cluster(HB, L)
        preds = {(u, v): p for (u, v, p) in iters}
        for (u, v) in L:
            X[cnt].append(p_dict[(u, v)] + preds[(u, v)])
            cnt += 1
        del p_dict
        del preds
    return X
delimiter="\t", nodetype=str) partition = community.best_partition(G) nx.set_node_attributes(G, name='community', values=partition) ap = list(all_pairs(G.nodes())) cn = cn.cnbors(G, ap) rai = nx.resource_allocation_index(G, ap) jc = nx.jaccard_coefficient(G, ap) aai = nx.adamic_adar_index(G, ap) pa = nx.preferential_attachment(G, ap) ccn = nx.cn_soundarajan_hopcroft(G, ap) cra = nx.ra_index_soundarajan_hopcroft(G, ap) wic = nx.within_inter_cluster(G, ap, community='community') u, v, s1, s2, s3, s4, s5, s6, s7, s8, has_edge = ([] for i in range(11)) for m1, m2, m3, m4, m5, m6, m7, m8 in zip(cn, rai, jc, aai, pa, ccn, cra, wic): u.append(m1[0]) v.append(m1[1]) s1.append(m1[2]) s2.append(m2[2]) s3.append(m3[2]) s4.append(m4[2]) s5.append(m5[2]) s6.append(m6[2]) s7.append(m7[2]) s8.append(m8[2]) has_edge.append(int(G.has_edge(m1[0], m2[1])))
positive_predictions_proba_jcc.append( list(nx.jaccard_coefficient(G, [edge]))[0][2]) positive_predictions_proba_ra.append( list(nx.resource_allocation_index(G, [edge]))[0][2]) positive_predictions_proba_aa.append( list(nx.adamic_adar_index(G, [edge]))[0][2]) positive_predictions_proba_pa.append( list(nx.preferential_attachment(G, [edge]))[0][2]) positive_predictions_proba_cnsh.append( list(nx.cn_soundarajan_hopcroft( G, [edge]))[0][2]) # needs community information positive_predictions_proba_rash.append( list(nx.ra_index_soundarajan_hopcroft( G, [edge]))[0][2]) # needs community information positive_predictions_proba_wic.append( list(nx.within_inter_cluster( G, [edge]))[0][2]) # needs community information positive_predictions_proba_slp_DegCent.append( list(SLP_prediction(G, [edge], centrality="DegCent"))[0][2]) positive_predictions_proba_slp_EigenCent.append( list(SLP_prediction(G, [edge], centrality="EigenCent"))[0][2]) positive_predictions_proba_slp_ClosenessCent.append( list(SLP_prediction(G, [edge], centrality="ClosenessCent"))[0][2]) positive_predictions_proba_slp_BetweenCent.append( list(SLP_prediction(G, [edge], centrality="BetweenCent"))[0][2]) positive_predictions_proba_slp_PageRank.append( list(SLP_prediction(G, [edge], centrality="PageRank"))[0][2]) positive_predictions_proba_slpc_DegCent.append( list(SLPC_prediction( G, [edge], centrality="DegCent"))[0][2]) # needs community information positive_predictions_proba_slpc_EigenCent.append(
def set_edge_weight(self, edge_weight_method='weight'):
    """Overwrite the 'weight' attribute of every edge of ``self.G``.

    Args:
        edge_weight_method: 'weight' keeps the existing weights (no-op);
            None forces every weight to 1; otherwise one of the
            centrality or link-prediction method names handled below.

    Raises:
        ValueError: for an unrecognized method name (the original fell
            through and crashed later with a NameError on ``C``).

    Returns:
        1 after rewriting the weights; None for the 'weight' no-op.
    """
    if edge_weight_method == 'weight':
        return
    if edge_weight_method is None:
        # Original behaviour for None: no scores computed, every edge
        # weight forced to 1.
        for u, v, d in self.G.edges(data=True):
            d['weight'] = 1
        return 1
    # C maps each (u, v) edge to its new weight.
    # Centrality based methods
    if edge_weight_method == 'edge_betweenness_centrality':
        print("comptuing edge_betweenness_centrality..")
        C = nx.edge_betweenness_centrality(self.G, weight='weight')
        print("done!")
    elif edge_weight_method == 'edge_betweenness_centrality_subset':
        print("comptuing edge_betweenness_centrality_subset..")
        # NOTE(review): despite the name, this branch has always computed
        # edge_current_flow_betweenness_centrality; kept for compatibility.
        C = nx.edge_current_flow_betweenness_centrality(self.G,
                                                        weight='weight')
        print('done')
    elif edge_weight_method == 'edge_current_flow_betweenness_centrality_subset':
        print("comptuing edge_current_flow_betweenness_centrality_subset..")
        C = nx.edge_current_flow_betweenness_centrality_subset(
            self.G, weight='weight')
        print('done')
    elif edge_weight_method == 'edge_load_centrality':
        print("comptuing edge_load_centrality..")
        C = nx.edge_load_centrality(self.G)
        print('done!')
    # Link Prediction based methods (each scores the existing edges and
    # uses the score as the new weight).
    elif edge_weight_method == 'adamic_adar_index':
        print("comptuing adamic_adar_index ..")
        C = {(u, v): p
             for u, v, p in nx.adamic_adar_index(self.G, self.G.edges())}
    elif edge_weight_method == 'ra_index_soundarajan_hopcroft':
        print("comptuing ra_index_soundarajan_hopcroft ..")
        C = {(u, v): p
             for u, v, p in nx.ra_index_soundarajan_hopcroft(
                 self.G, self.G.edges())}
    elif edge_weight_method == 'preferential_attachment':
        print("comptuing preferential_attachment ..")
        C = {(u, v): p
             for u, v, p in nx.preferential_attachment(self.G, self.G.edges())}
    # 'cn_soundarajan_hopcroft' was disabled upstream and stays unsupported.
    elif edge_weight_method == 'within_inter_cluster':
        print("comptuing within_inter_cluster ..")
        C = {(u, v): p
             for u, v, p in nx.within_inter_cluster(self.G, self.G.edges())}
    elif edge_weight_method == 'resource_allocation_index':
        print("comptuing resource allocation index ..")
        C = {(u, v): p
             for u, v, p in nx.resource_allocation_index(
                 self.G, self.G.edges())}
    elif edge_weight_method == 'jaccard_coefficient':
        print("comptuing jaccard_coefficient..")
        C = {(u, v): p
             for u, v, p in nx.jaccard_coefficient(self.G, self.G.edges())}
        print('done!')
    else:
        # FIX: unknown names previously left C undefined and crashed with
        # a NameError in the assignment loop below; fail fast instead.
        raise ValueError(
            "Unknown edge_weight_method: %r" % (edge_weight_method,))
    for u, v, d in self.G.edges(data=True):
        d['weight'] = C[(u, v)]
    return 1
def within_inter_cluster(uG, ni, nj, rand_node):
    """Return the within-inter-cluster scores of two pairs in ``uG``.

    Scores both the candidate pair (ni, nj) and a reference pair
    (ni, rand_node) in a single networkx call; ``uG`` must carry the
    'community' node attribute the metric requires.

    Returns:
        tuple: (score of (ni, nj), score of (ni, rand_node)).
    """
    pairs = [(ni, nj), (ni, rand_node)]
    # Each yielded item is a (u, v, score) triple; keep only the scores.
    (_, _, score_edge), (_, _, score_rand) = nx.within_inter_cluster(uG, pairs)
    return score_edge, score_rand
def compute_variable(self, variable_name, train: bool, load=True,
                     path_to_file=None, save=True):
    """Compute (or load from disk cache) one feature column.

    Args:
        variable_name: one of ``self.handled_variables``.
        train: compute over the training pairs if True, else the test pairs.
        load: try to load a previously saved ``.npy`` file first.
        path_to_file: optional explicit path to a saved ``.npy`` file.
        save: persist the computed column under ``variables/``.

    Returns:
        np.ndarray of shape (nb_of_samples,), NaNs replaced by 0.
    """
    assert variable_name in self.handled_variables, "Variable %s is not handled. Handled variables are : %s" % (
        variable_name, str(self.handled_variables))
    # ---------------- cache lookup ----------------
    if load and train:
        if path_to_file is None and os.path.isfile(
                "variables/%s.npy" % variable_name):
            print("Loading STANDARD %s file!" % variable_name)
            result = np.load("variables/%s.npy" % variable_name)
            return result[:self.nb_training_samples]
        elif path_to_file is not None and os.path.isfile(path_to_file):
            print("Loading CUSTOM %s file!" % variable_name)
            result = np.load(path_to_file)
            return result[:self.nb_training_samples]
        print("Did not find saved %s in `variables` folder." % variable_name)
    if load and not train:
        if path_to_file is None and os.path.isfile(
                "variables/TEST_%s.npy" % variable_name):
            print("Loading STANDARD TEST_%s file!" % variable_name)
            result = np.load("variables/TEST_%s.npy" % variable_name)
            # NOTE(review): truncates with nb_training_samples even in the
            # test branch — looks like it should be nb_testing_samples;
            # kept unchanged pending confirmation.
            return result[:self.nb_training_samples]
        elif path_to_file is not None and os.path.isfile(path_to_file):
            print("Loading CUSTOM %s file!" % variable_name)
            result = np.load(path_to_file)
            return result[:self.nb_training_samples]
        print("Did not find saved TEST_%s in `variables` folder."
              % variable_name)
    # ---------------- computation ----------------
    print("Starting computation of %s..." % variable_name)
    t1 = time()
    gd = self.graph_structure.graph_dicts  # precomputed per-node metrics
    nb_of_samples = self.nb_training_samples if train \
        else self.nb_testing_samples
    result = np.zeros(shape=nb_of_samples)
    for i in range(nb_of_samples):
        # t is a (node_a, node_b[, label]) sample tuple.
        t = self.train_array[i] if train else self.test_array[i]
        if variable_name == "publication_2":
            result[i] = np.log(
                len(
                    set(self.node_information.loc[t[0], "publication_2"])
                    & set(self.node_information.loc[t[1], "publication_2"]))
                + 1)
        elif variable_name == "adam_coeff":
            # For positive training pairs, temporarily remove the edge so
            # the score is not contaminated by the edge being predicted.
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = \
                        next(nx.algorithms.link_prediction.adamic_adar_index(
                            self.graph_structure.g, [(t[0], t[1])]))[2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = \
                        next(nx.algorithms.link_prediction.adamic_adar_index(
                            self.graph_structure.g, [(t[0], t[1])]))[2]
            else:
                result[i] = \
                    next(nx.algorithms.link_prediction.adamic_adar_index(
                        self.graph_structure.g, [(t[0], t[1])]))[2]
        elif variable_name == "overlapping_words_in_title":
            result[i] = compute_intersection(
                self.node_information.loc[t[0], "title"],
                self.node_information.loc[t[1], "title"], self.stemmer,
                self.stpwds)
        elif variable_name == "number_of_common_authors":
            result[i] = nbr_common_authors(
                self.node_information.loc[t[0], "author"],
                self.node_information.loc[t[1], "author"])
        elif variable_name == "difference_of_years":
            result[i] = abs(self.node_information.loc[t[0], 'year'] -
                            self.node_information.loc[t[1], 'year'])
        elif variable_name == "affinity_between_authors":
            result[i] = compute_affinity_between_authors(
                self.node_information.loc[t[0], 'author'],
                self.node_information.loc[t[1], 'author'], self.authors_dict)
        elif variable_name == "identical_journal":
            # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin int is the documented replacement.
            result[i] = int(self.node_information.loc[t[0], 'journal'] ==
                            self.node_information.loc[t[1], 'journal'])
        elif variable_name == "l2_distance":
            result[i] = np.linalg.norm(
                self.node_information.loc[t[0], 'wv'] -
                self.node_information.loc[t[1], 'wv'])
        elif variable_name == "cosine_distance_tfid":
            v1 = self.node_information.loc[t[0], "wv_tfid"]
            v2 = self.node_information.loc[t[1], "wv_tfid"]
            # np.isnan raises TypeError on non-numeric payloads; treat
            # those as "not NaN" and compute the similarity.
            try:
                b1 = np.isnan(v1)
            except TypeError:
                b1 = False
            try:
                b2 = np.isnan(v2)
            except TypeError:
                b2 = False
            if not b1 and not b2:
                result[i] = cosine_similarity(v1, v2)
            else:
                result[i] = 0
        elif variable_name == "l2_distance_between_titles":
            dst = np.linalg.norm(
                self.node_information.loc[t[0], 'title_wv'] -
                self.node_information.loc[t[1], 'title_wv'])
            result[i] = 0 if np.isnan(dst) else dst
        elif variable_name == "common_neighbors":
            result[i] = len(
                sorted(
                    nx.common_neighbors(self.graph_structure.g, t[0], t[1])))
        elif variable_name == "clustering_coeff":
            result[i] = gd["clustering_coeff"][t[0]] * \
                gd["clustering_coeff"][t[1]]
        elif variable_name == "betweenness":
            result[i] = gd["betweenness"][t[0]] * gd["betweenness"][t[1]]
        elif variable_name == "closeness":
            result[i] = gd["closeness"][t[0]] * gd["closeness"][t[1]]
        elif variable_name == "degree":
            result[i] = gd["degree"][t[0]] * gd["degree"][t[1]]
        elif variable_name == "eigenvector":
            result[i] = gd["eigenvector"][t[0]] * gd["eigenvector"][t[1]]
        elif variable_name == "jaccard_coeff":
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = next(
                        nx.jaccard_coefficient(self.graph_structure.g,
                                               [(t[0], t[1])]))[2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = next(
                        nx.jaccard_coefficient(self.graph_structure.g,
                                               [(t[0], t[1])]))[2]
            else:
                result[i] = next(
                    nx.jaccard_coefficient(self.graph_structure.g,
                                           [(t[0], t[1])]))[2]
        elif variable_name == "shortest_path":
            # Inverse path length; 0 encodes "no path".
            if train:
                if t[2] == 1:
                    assert self.graph_structure.g.has_edge(
                        t[0], t[1]
                    ), "There's a problem with the structure of the graph for id %i and %i" % (
                        t[0], t[1])
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    try:
                        result[i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                            self.graph_structure.g, t[0], t[1])
                    except nx.NetworkXNoPath:
                        result[i] = 0
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    try:
                        result[i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                            self.graph_structure.g, t[0], t[1])
                    except nx.NetworkXNoPath:
                        result[i] = 0
            else:
                try:
                    result[i] = 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                        self.graph_structure.g, t[0], t[1])
                except nx.NetworkXNoPath:
                    result[i] = 0
        elif variable_name == "pagerank":
            result[i] = gd["pagerank"][t[0]] * gd["pagerank"][t[1]]
        elif variable_name == "community":
            if self.graph_structure.partition[
                    t[0]] == self.graph_structure.partition[t[1]]:
                result[i] = 1
            else:
                result[i] = 0
        elif variable_name == "lp_resource_allocation_index":
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = sorted(
                        nx.resource_allocation_index(
                            self.graph_structure.g, [(t[0], t[1])]))[0][2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = sorted(
                        nx.resource_allocation_index(
                            self.graph_structure.g, [(t[0], t[1])]))[0][2]
            else:
                result[i] = sorted(
                    nx.resource_allocation_index(self.graph_structure.g,
                                                 [(t[0], t[1])]))[0][2]
        elif variable_name == "lp_preferential_attachment":
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = sorted(
                        nx.preferential_attachment(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = sorted(
                        nx.preferential_attachment(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
            else:
                result[i] = sorted(
                    nx.preferential_attachment(self.graph_structure.g,
                                               [(t[0], t[1])]))[0][2]
        elif variable_name == "lp_cn_soundarajan":
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = sorted(
                        nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = sorted(
                        nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
            else:
                result[i] = sorted(
                    nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                               [(t[0], t[1])]))[0][2]
        elif variable_name == "lp_ra_index_soundarajan":
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = sorted(
                        nx.ra_index_soundarajan_hopcroft(
                            self.graph_structure.g, [(t[0], t[1])]))[0][2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = sorted(
                        nx.ra_index_soundarajan_hopcroft(
                            self.graph_structure.g, [(t[0], t[1])]))[0][2]
            else:
                result[i] = sorted(
                    nx.ra_index_soundarajan_hopcroft(
                        self.graph_structure.g, [(t[0], t[1])]))[0][2]
        elif variable_name == "lp_within_inter_cluster":
            if train:
                if t[2] == 1:
                    self.graph_structure.g.remove_edge(t[0], t[1])
                    result[i] = sorted(
                        nx.within_inter_cluster(self.graph_structure.g,
                                                [(t[0], t[1])]))[0][2]
                    self.graph_structure.g.add_edge(t[0], t[1])
                else:
                    result[i] = sorted(
                        nx.within_inter_cluster(self.graph_structure.g,
                                                [(t[0], t[1])]))[0][2]
            else:
                result[i] = sorted(
                    nx.within_inter_cluster(self.graph_structure.g,
                                            [(t[0], t[1])]))[0][2]
    print("Did %s column in %5.1fs" % (variable_name, time() - t1))
    # ---------------- persistence ----------------
    if save and train:
        print("Saved variable %s in `variables` directory." % variable_name)
        np.save("variables/" + variable_name, result)
    if save and not train:
        np.save("variables/TEST_" + variable_name, result)
        print("Saved variable TEST_%s in `variables` directory."
              % variable_name)
    # FIX: the original test `np.isnan(result).shape[0] >= 1` is True for
    # any non-empty array, so the warning fired unconditionally; warn only
    # when NaNs are actually present (they are zeroed either way).
    if np.isnan(result).any():
        print("Careful, you have nan values !")
        result[np.isnan(result)] = 0
    return result