def preferential_attachment_scores(g_train, train_test_split):
    """Score node pairs with preferential attachment and evaluate on test edges.

    Args:
        g_train: Training graph. A directed graph is converted to an
            undirected one first, because the networkx predictor is only
            defined for undirected graphs.
        train_test_split: 7-tuple ``(adj_train, train_edges,
            train_edges_false, val_edges, val_edges_false, test_edges,
            test_edges_false)``. Only ``adj_train.shape`` and the two test
            edge sets are used here; the whole tuple is also forwarded to
            ``get_ebunch`` to select the pairs that get scored.

    Returns:
        dict with ``'test_roc'``, ``'test_ap'`` and ``'runtime'`` (seconds
        spent building the score matrix).
    """
    if g_train.is_directed():  # Only defined for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack input

    start_time = time.time()

    # Node labels are used directly as row/column indices of the matrix.
    pa_matrix = np.zeros(adj_train.shape)
    scored_pairs = nx.preferential_attachment(
        g_train, ebunch=get_ebunch(train_test_split))
    for u, v, p in scored_pairs:  # p = preferential attachment score
        pa_matrix[u][v] = p
        pa_matrix[v][u] = p  # make sure it's symmetric

    # Normalize to [0, 1]. Skip when every score is zero: dividing by a zero
    # maximum would turn the whole matrix into NaN.
    max_score = pa_matrix.max()
    if max_score > 0:
        pa_matrix = pa_matrix / max_score

    runtime = time.time() - start_time
    pa_roc, pa_ap = get_roc_score(test_edges, test_edges_false, pa_matrix)

    return {
        'test_roc': pa_roc,
        'test_ap': pa_ap,
        'runtime': runtime,
    }
def feature_extractor(graph, samples, deg_centrality):
    """Create a feature vector for each edge of the graph contained in samples.

    Each row holds six values: source degree centrality, target degree
    centrality, preferential attachment, Adamic-Adar index, Jaccard
    coefficient and resource allocation index. A pair with an endpoint
    missing from ``deg_centrality`` gets an all-zero row and is counted in
    the "Number nodes out" message printed at the end.

    Args:
        graph: Graph handed to the networkx link-prediction functions.
        samples: Iterable of edges; ``edge[0]`` / ``edge[1]`` are the endpoints.
        deg_centrality: Mapping from node to its degree centrality.

    Returns:
        ``np.ndarray`` built from the per-edge feature rows.
    """
    feature_vector = []
    number_nodes_out = 0

    for edge in tqdm(samples):
        source_node, target_node = edge[0], edge[1]

        # Direct mapping membership: the original rebuilt list(keys()) twice
        # per edge, making every lookup linear in the number of nodes.
        if (source_node not in deg_centrality
                or target_node not in deg_centrality):
            feature_vector.append(np.array([0, 0, 0, 0, 0, 0]))
            number_nodes_out += 1
            continue

        # Degree centrality
        source_degree_centrality = deg_centrality[source_node]
        target_degree_centrality = deg_centrality[target_node]

        # Every predictor yields (u, v, score); one pair in, one triple out.
        pair = [(source_node, target_node)]
        pref_attach = list(nx.preferential_attachment(graph, pair))[0][2]
        aai = list(nx.adamic_adar_index(graph, pair))[0][2]
        jacard_coeff = list(nx.jaccard_coefficient(graph, pair))[0][2]
        res_all = list(nx.resource_allocation_index(graph, pair))[0][2]

        # Create edge feature vector with all metrics computed above
        feature_vector.append(
            np.array([
                source_degree_centrality, target_degree_centrality,
                pref_attach, aai, jacard_coeff, res_all
            ]))

    print(f"Number nodes out: {number_nodes_out}")
    return np.array(feature_vector)
def preferential_attachment(G):
    """Score the pairs listed in ``FOF_edges.txt`` and save them as an edge list.

    Reads a comma-delimited edge list from ``FOF_edges.txt``, computes the
    preferential attachment score of each of those pairs in ``G``, stores
    the scores as the ``score`` attribute of a new graph, writes that graph
    to ``graph_preferential_attachment.txt`` (comma-delimited) and prints
    ``len()`` of the new graph.

    Args:
        G: Graph in which the scores are computed.
            NOTE(review): the pairs are read without a node type, so they
            are presumably string labels; G's nodes must match - confirm.
    """
    # Context managers guarantee both files are closed (and the output
    # flushed) even if networkx raises part-way through.
    with open("FOF_edges.txt", "rb") as fof_file:
        fof_graph = nx.read_edgelist(fof_file, delimiter=',')

    graph_preferential_attachment = nx.Graph()
    scored_pairs = nx.preferential_attachment(G, ebunch=fof_graph.edges())
    for u, v, score in scored_pairs:
        graph_preferential_attachment.add_edge(u, v, score=score)

    with open("graph_preferential_attachment.txt", "wb+") as out_file:
        nx.write_edgelist(graph_preferential_attachment, out_file,
                          delimiter=',')

    print(len(graph_preferential_attachment))
def print_sim_node(g, x=3003425278, y=3003475283):
    """Print neighbourhood sizes, degrees and similarity indices of a node pair.

    Args:
        g: Graph containing both nodes.
        x: First node of the pair.
        y: Second node of the pair.
    """
    pair = [(x, y)]

    def first_score(index_fn):
        # networkx predictors yield (u, v, score); take the only triple.
        return list(index_fn(g, pair))[0][2]

    print("vertex pair:", x, "and", y)
    for node in (x, y):
        print("n of neighbors", node, ":", len(list(g.neighbors(node))))
    for node in (x, y):
        print("degree of", node, ":", g.degree(node))

    shared = list(nx.common_neighbors(g, x, y))
    print("common neighbosr:", len(shared))
    print("Jaccard coefficient:", first_score(nx.jaccard_coefficient))
    print("Adamic/Adar:", first_score(nx.adamic_adar_index))
    print("preferential attachment:",
          first_score(nx.preferential_attachment))
def aggregated_dataset(all_dfs, g_undirected):
    """Aggregate edge weights per (id1, id2, type) and attach link features.

    The summed ``weight`` of every ``(id1, id2, type)`` group becomes one
    row indexed by ``(id1, id2)``. Each row then receives the preferential
    attachment score and the number of common neighbours of its pair in
    ``g_undirected``, plus a constant ``label`` of 1. The result is also
    pickled to ``./dummy.pkl``.

    Args:
        all_dfs: DataFrame with at least ``id1``, ``id2``, ``type`` and
            ``weight`` columns.
        g_undirected: Graph used for the link-prediction features.

    Returns:
        The aggregated DataFrame.
    """
    # The original sorted by 'Timestamp' first, but discarded that result
    # before grouping; the aggregation does not depend on row order.
    aggregated_df = all_dfs.groupby(
        ['id1', 'id2', 'type'], as_index=False)['weight'].sum()
    aggregated_df = aggregated_df.set_index(['id1', 'id2'])

    aggregated_df['preferential attachment'] = [
        score for _, _, score in nx.preferential_attachment(
            g_undirected, aggregated_df.index)
    ]
    aggregated_df['Common Neighbors'] = aggregated_df.index.map(
        lambda pair: len(
            list(nx.common_neighbors(g_undirected, pair[0], pair[1]))))
    aggregated_df['label'] = 1

    aggregated_df.to_pickle("./dummy.pkl")
    return aggregated_df
def link_prediction(G):
    """Print the resource allocation index of every existing edge of ``G``.

    One line per edge is printed as ``(u, v) -> score``; the ``%d`` format
    requires numeric node labels. A ``ZeroDivisionError`` raised while
    iterating the scores stops the loop and is reported with a message
    instead of propagating.

    Args:
        G: Graph whose edges are scored.

    Returns:
        None.
    """
    # The Jaccard / Adamic-Adar / preferential-attachment generators that
    # used to be created here were never consumed, so they are dropped.
    predictions = nx.resource_allocation_index(G, G.edges())
    try:
        for u, v, p in predictions:
            print('(%d, %d) -> %.8f' % (u, v, p))
    except ZeroDivisionError:
        print("ZeroDivisionError: float division by zero")
def metric_coefficients(graph, df_train, df_test):
    """Add link-prediction metric columns to the training and testing frames.

    For every line of ``data/training.txt`` and ``data/testing.txt`` (under
    the project path), the first two whitespace-separated fields are taken
    as a node pair. Its Jaccard, Adamic-Adar, preferential attachment and
    resource allocation scores are collected and stored as new columns.

    Arguments:
        graph    -> nx.Graph()
        df_train -> pd.DataFrame(), receives the scores of training.txt
        df_test  -> pd.DataFrame(), receives the scores of testing.txt

    Return:
        df_train -> pd.DataFrame()
        df_test  -> pd.DataFrame()
    """
    data_dir = os.path.join(Setup.path_project(__file__), "data")
    filename_testing = os.path.join(data_dir, "testing.txt")
    filename_training = os.path.join(data_dir, "training.txt")

    # Column name -> predictor, in the order the columns are added.
    metrics = (
        ("Jaccard", nx.jaccard_coefficient),
        ("Adamic-Adar", nx.adamic_adar_index),
        ("Preferential Attachment", nx.preferential_attachment),
        ("Resource Allocation", nx.resource_allocation_index),
    )

    sources = ((filename_training, df_train), (filename_testing, df_test))
    for filename, df in sources:
        scores = {column: [] for column, _ in metrics}
        with open(filename, "r") as f:
            for line in f:
                fields = line.split()
                pair = [(fields[0], fields[1])]
                for column, metric in metrics:
                    scores[column].extend(
                        p for _, _, p in metric(graph, pair))
        for column, _ in metrics:
            df[column] = scores[column]

    return df_train, df_test
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``'Future Connection'`` column that is
    null for the pairs to predict). Four features are computed for every
    non-edge of ``G`` (common neighbours, Jaccard, resource allocation,
    preferential attachment), left-joined onto ``future_connections``,
    min-max scaled and fed to an ``MLPClassifier``.

    Returns:
        ``pd.Series`` named ``'proba_scores'``, indexed by the unlabeled
        pairs, holding the predicted probability of class 1.
    """
    def metric_frame(column, triples):
        # One-column frame indexed by (u, v) built from (u, v, score) triples.
        triples = list(triples)
        frame = pd.DataFrame(index=[(u, v) for u, v, _ in triples])
        frame[column] = [score for _, _, score in triples]
        return frame

    common_neigh = (
        (e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
        for e in nx.non_edges(G)
    )
    connections = (
        metric_frame('common_neigh', common_neigh)
        .join(metric_frame('Jaccard_coef', nx.jaccard_coefficient(G)),
              how='inner')
        .join(metric_frame('Resource_allocation',
                           nx.resource_allocation_index(G)), how='inner')
        .join(metric_frame('preferential_attachment',
                           nx.preferential_attachment(G)), how='inner')
    )

    mixed = future_connections.join(connections, how='left')

    # Take the mask from the frame that is actually being indexed.
    unlabeled = mixed['Future Connection'].isnull()
    labeled_rows = mixed[~unlabeled]
    unlabeled_rows = mixed[unlabeled]

    X_train = labeled_rows.drop(['Future Connection'], axis=1)
    # 1-D target: a one-column DataFrame triggers a DataConversionWarning.
    Y_train = labeled_rows['Future Connection']
    X_test = unlabeled_rows.drop(['Future Connection'], axis=1)

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(hidden_layer_sizes=[100, 10], alpha=0.001,
                        random_state=0, solver='lbfgs', verbose=0)
    clf.fit(X_train_scaled, Y_train)
    y_proba_test = clf.predict_proba(X_test_scaled)[:, 1]

    # Build the result directly instead of assigning a column on a slice.
    return pd.Series(y_proba_test, index=X_test.index, name='proba_scores')
def generate_positive_features():
    """Build a labelled feature row for every pair in ``positive_samples``.

    Uses the module-level graph ``UG`` and iterable ``positive_samples``.
    A row holds, in order, the scores of: resource allocation, Jaccard,
    Adamic-Adar, preferential attachment, Soundarajan-Hopcroft common
    neighbours, Soundarajan-Hopcroft resource allocation and within/inter
    cluster, followed by the label ``1``.

    Progress is printed every 100 samples. If a predictor fails for a
    sample, the failure is reported and the row is kept as far as it got,
    so such a row is shorter and has no label.

    Returns:
        List of feature rows, one per sample.
    """
    predictors = (
        nx.resource_allocation_index,
        nx.jaccard_coefficient,
        nx.adamic_adar_index,
        nx.preferential_attachment,
        nx.cn_soundarajan_hopcroft,
        nx.ra_index_soundarajan_hopcroft,
        nx.within_inter_cluster,
    )

    features = []
    print("Generating positive features......")
    for count, sample in enumerate(positive_samples):
        if count % 100 == 0:
            print(count)

        feature = []
        try:
            for predictor in predictors:
                feature.extend(p for _, _, p in predictor(UG, [sample]))
            feature.append(1)  # label=1
        except Exception:
            # Best effort per sample, but no longer a bare ``except`` that
            # would also swallow KeyboardInterrupt / SystemExit.
            print("one error at: " + str(count + 1))
        features.append(feature)

    print("positive features: " + str(len(features)))
    return features
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``'Future Connection'`` column that is
    null for the pairs to predict). Three feature columns are added to
    ``future_connections`` in place: common neighbours, preferential
    attachment and Adamic-Adar index. An ``MLPClassifier`` is trained on
    min-max-scaled features of the labeled rows.

    Returns:
        ``pd.Series`` indexed by the unlabeled pairs, holding the predicted
        probability of class 1.
    """
    from sklearn import model_selection

    label_col = 'Future Connection'
    feature_cols = [
        'Common Neighbors', 'Preferential Attachment', 'Adamic Adar Index'
    ]
    pairs = future_connections.index

    # Compute every feature before touching the shared frame.
    common_neighbors = [
        len(list(nx.common_neighbors(G, edge[0], edge[1]))) for edge in pairs
    ]
    preferential_attachment = [
        item[2] for item in nx.preferential_attachment(G, ebunch=pairs)
    ]
    adamic = [item[2] for item in nx.adamic_adar_index(G, ebunch=pairs)]

    future_connections['Common Neighbors'] = common_neighbors
    future_connections['Preferential Attachment'] = preferential_attachment
    future_connections['Adamic Adar Index'] = adamic

    # Select columns by name: positional slicing silently picks up any
    # extra column that already exists on the shared frame.
    unlabeled = future_connections[label_col].isnull()
    train_set = future_connections[~unlabeled]
    test_set = future_connections[unlabeled]
    X = train_set[feature_cols]
    y = train_set[label_col]
    X_test = test_set[feature_cols]

    # The split is kept so the model is fitted on the same rows as before;
    # the held-out part is not used.
    X_train, _, Y_train, _ = model_selection.train_test_split(
        X, y, random_state=0)

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = MLPClassifier(hidden_layer_sizes=[10, 5], alpha=5,
                          random_state=0, solver='lbfgs', verbose=0)
    model.fit(X_train_scaled, Y_train)
    test_proba = model.predict_proba(X_test_scaled)[:, 1]

    return pd.Series(test_proba, X_test.index)
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``'Future Connection'`` column that is
    null for the pairs to predict). Each node's ``'Department'`` attribute
    is copied to ``'community'`` (needed by the Soundarajan-Hopcroft
    score). Four features are then computed over the non-edges of ``G`` and
    an ``MLPClassifier`` is trained on the labeled rows.

    Returns:
        ``pd.Series`` named ``'prob'``, indexed by the unlabeled rows of
        ``future_connections``, holding the predicted probability of
        class 1.
    """
    def metric_frame(column, triples):
        # One-column frame indexed by (u, v) built from (u, v, score) triples.
        triples = list(triples)
        frame = pd.DataFrame(index=[(u, v) for u, v, _ in triples])
        frame[column] = [score for _, _, score in triples]
        return frame

    # ``G.nodes[...]`` replaces ``G.node[...]``, removed in networkx 2.4.
    for node in G.nodes():
        G.nodes[node]['community'] = G.nodes[node]['Department']

    df = metric_frame('preferential_attachment',
                      nx.preferential_attachment(G))
    df = df.join(
        metric_frame('cn_soundarajan_hopcroft',
                     nx.cn_soundarajan_hopcroft(G)),
        how='outer')
    df['cn_soundarajan_hopcroft'] = df['cn_soundarajan_hopcroft'].fillna(
        value=0)

    # Join on the pair index instead of assigning raw lists by position, so
    # scores stay attached to their pair even if the outer join above
    # reordered the rows.
    df = df.join(metric_frame('resource_allocation_index',
                              nx.resource_allocation_index(G)))
    df = df.join(metric_frame('jaccard_coefficient',
                              nx.jaccard_coefficient(G)))

    df = future_connections.join(df, how='outer')
    unlabeled = pd.isnull(df['Future Connection'])
    df_train = df[~unlabeled]
    df_test = df[unlabeled]

    features = [
        'cn_soundarajan_hopcroft', 'preferential_attachment',
        'resource_allocation_index', 'jaccard_coefficient'
    ]
    X_train = df_train[features]
    Y_train = df_train['Future Connection']
    X_test = df_test[features]

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(hidden_layer_sizes=[10, 5], alpha=5,
                        random_state=0, solver='lbfgs', verbose=0)
    clf.fit(X_train_scaled, Y_train)
    test_proba = clf.predict_proba(X_test_scaled)[:, 1]
    predictions = pd.Series(test_proba, X_test.index)

    # Report only the pairs requested in future_connections. Build the
    # result directly instead of assigning a column on a slice.
    target_index = future_connections[pd.isnull(
        future_connections['Future Connection'])].index
    return pd.Series([predictions[x] for x in target_index],
                     index=target_index, name='prob')
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``'Future Connection'`` column that is
    null for the pairs to predict). Two feature columns are added to
    ``future_connections`` in place (preferential attachment and common
    neighbours). A logistic regression is fitted on 80% of the labeled
    rows after standard scaling.

    Returns:
        ``pd.Series`` indexed by the unlabeled pairs, holding the predicted
        probability of class 1.
    """
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    feature_cols = ['preferential attachment', 'common_neighbors']

    df = future_connections
    df['preferential attachment'] = [
        i[2] for i in nx.preferential_attachment(G, df.index)
    ]
    df['common_neighbors'] = df.index.map(
        lambda edge: len(list(nx.common_neighbors(G, edge[0], edge[1]))))

    unlabeled = df['Future Connection'].isnull()
    df_test = df[unlabeled]
    df_train = df[~unlabeled]

    X_notnull = df_train[feature_cols]
    # 1-D target: a one-column DataFrame triggers a DataConversionWarning.
    y_notnull = df_train['Future Connection']
    X_pred = df_test[feature_cols]

    # Fixed seed makes the predictions reproducible between calls; the
    # held-out 20% is not used.
    X_train, _, y_train, _ = train_test_split(
        X_notnull, y_notnull, test_size=0.2, random_state=0)

    scaler = preprocessing.StandardScaler().fit(X_train)
    clf = LogisticRegression()
    clf.fit(scaler.transform(X_train), y_train)

    y_pred = clf.predict_proba(scaler.transform(X_pred))[:, 1]
    return pd.Series(y_pred, index=df_test.index)
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``"Future Connection"`` column that is
    null for the pairs to predict). Each node's ``"Department"`` attribute
    is copied to ``"community"`` (needed by the Soundarajan-Hopcroft
    score). Four features are computed over the non-edges of ``G`` and an
    ``MLPClassifier`` is trained on the labeled rows.

    Unlike the previous version, the null labels in ``future_connections``
    are left untouched. They used to be overwritten with ``-1``, which left
    nothing to predict on a second call.

    Returns:
        ``pd.Series`` named ``"probability"``, indexed by the unlabeled
        rows of ``future_connections``, holding the predicted probability
        of class 1.
    """
    def metric_frame(column, triples):
        # One-column frame indexed by (u, v) built from (u, v, score) triples.
        triples = list(triples)
        frame = pd.DataFrame(index=[(u, v) for u, v, _ in triples])
        frame[column] = [score for _, _, score in triples]
        return frame

    # ``G.nodes[...]`` replaces ``G.node[...]``, removed in networkx 2.4.
    for node in G.nodes():
        G.nodes[node]["community"] = G.nodes[node]["Department"]

    df = metric_frame("preferential_attachment",
                      nx.preferential_attachment(G))
    df = df.join(
        metric_frame("cn_soundarajan_hopcroft",
                     nx.cn_soundarajan_hopcroft(G)),
        how="outer")
    df["cn_soundarajan_hopcroft"] = df["cn_soundarajan_hopcroft"].fillna(
        value=0)

    # Join on the pair index instead of assigning raw lists by position, so
    # scores stay attached to their pair even if the outer join above
    # reordered the rows.
    df = df.join(metric_frame("resource_allocation_index",
                              nx.resource_allocation_index(G)))
    df = df.join(metric_frame("jaccard_coefficient",
                              nx.jaccard_coefficient(G)))

    df = future_connections.join(df, how="outer")
    unlabeled = df["Future Connection"].isnull()

    features = ["cn_soundarajan_hopcroft", "preferential_attachment",
                "resource_allocation_index", "jaccard_coefficient"]
    X_train = df[~unlabeled][features]
    y_train = df[~unlabeled]["Future Connection"]
    X_test = df[unlabeled][features]

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(alpha=5, random_state=0,
                        solver="lbfgs").fit(X_train_scaled, y_train)
    predictions = clf.predict_proba(X_test_scaled)[:, 1]
    predictions_formated = pd.Series(predictions, X_test.index)

    # Report only the pairs requested in future_connections.
    result_index = future_connections[
        future_connections["Future Connection"].isnull()].index
    return pd.Series([predictions_formated[x] for x in result_index],
                     index=result_index, name="probability")
def stalker_evolution(M):
    """Add the most likely future edges to ``M`` and return it.

    Non-edges of ``M`` are ranked twice, by preferential attachment and by
    number of common neighbours, highest score first. The top
    ``int(0.1 * number_of_edges)`` pairs of each ranking are added as edges
    carrying ``value='new'``.

    NOTE(review): the returned graph is the same object as ``M`` (no copy),
    so the input is modified in place and the quota for the
    common-neighbour ranking is computed after the preferential-attachment
    edges were added. This matches the previous behaviour - confirm
    whether a copy was intended.

    Args:
        M: Undirected graph to extend.

    Returns:
        ``M`` itself, with the predicted edges added.
    """
    by_score = operator.itemgetter(2)

    # Common neighbours
    CN = sorted(
        ((e[0], e[1], len(list(nx.common_neighbors(M, e[0], e[1]))))
         for e in nx.non_edges(M)),
        key=by_score, reverse=True)

    # Preferential attachment
    PA = sorted(nx.preferential_attachment(M), key=by_score, reverse=True)

    # The Jaccard, resource-allocation and Adamic-Adar rankings previously
    # computed here were never used, so they are no longer built.
    # Community-based scores (cn_soundarajan_hopcroft,
    # ra_index_soundarajan_hopcroft) would need a 'community' node attribute.

    # Prediction of future edge formation
    FM = M
    for u, v, _ in PA[:int(0.1 * len(M.edges()))]:
        FM.add_edge(u, v, value='new')
    for u, v, _ in CN[:int(0.1 * len(M.edges()))]:
        FM.add_edge(u, v, value='new')
    return FM
def link_scores(graph, all_dfs, labels, g_undirected):
    """Attach preferential-attachment and resource-allocation columns.

    Both scores are computed for the existing edges of ``g_undirected`` and
    mapped onto the rows of ``all_dfs`` through ``map_predictions_to_df``.
    If the resource-allocation computation raises ``ZeroDivisionError``, a
    message is printed and that column is not added.

    Args:
        graph: Not used by this function.
        all_dfs: DataFrame that receives the new columns (modified in place).
        labels: Not used by this function.
        g_undirected: Graph whose edges are scored.

    Returns:
        ``all_dfs``.
    """
    edge_pa = {
        (u, v): score
        for u, v, score in nx.preferential_attachment(
            g_undirected, g_undirected.edges())
    }
    all_dfs['Preferential_Attachment'] = all_dfs.apply(
        lambda row: map_predictions_to_df(edge_pa, row), axis=1)

    ra_triples = nx.resource_allocation_index(g_undirected,
                                              g_undirected.edges())
    try:
        edge_ra = {(u, v): score for u, v, score in ra_triples}
        all_dfs['Resource_allocation'] = all_dfs.apply(
            lambda row: map_predictions_to_df(edge_ra, row), axis=1)
    except ZeroDivisionError:
        print("ZeroDivisionError: float division by zero")

    return all_dfs
def extract_edge_feature(G, unG, head, tail, node_feat):
    '''Build the feature list of the candidate edge ``head -> tail``.

    Layout of the returned list:
        1. head node features (``node_feat[head]``, or 131 zeros if missing)
        2. tail node features (``node_feat[tail]``, or 131 zeros if missing)
        3. PMI of ``(head, tail)`` from the module-level ``PMI_dict``
           (0 if the pair is absent)
        4. number of common successors in ``G``
        5. number of common predecessors in ``G``
        6. size of predecessors(head) & successors(tail) in ``G``
        7. number of common neighbours in ``unG``
        8. Jaccard coefficient in ``unG``
        9. resource allocation index in ``unG``
        10. Adamic-Adar index in ``unG``
        11. preferential attachment in ``unG``

    If either node is not in ``G``, items 3-11 are nine zeros.

    Args:
        G: Directed graph (successors/predecessors are queried).
        unG: Undirected graph used for the link-prediction scores.
        head: Source node.
        tail: Target node.
        node_feat: Mapping node -> list of node features.

    Returns:
        list of feature values.
    '''
    head_feat = node_feat[head] if head in node_feat else [0] * 131
    tail_feat = node_feat[tail] if tail in node_feat else [0] * 131
    all_feat = head_feat + tail_feat

    if head not in G or tail not in G:
        return all_feat + [0] * 9

    # The original tested ``(head, tail not in PMI_dict)``, a 2-tuple that
    # is always truthy, so the PMI feature was always 0.
    pair_key = (head, tail)
    all_feat.append(PMI_dict[pair_key] if pair_key in PMI_dict else 0)

    head_succ = set(G.successors(head))
    all_feat.append(len(head_succ & set(G.successors(tail))))
    head_pred = set(G.predecessors(head))
    all_feat.append(len(head_pred & set(G.predecessors(tail))))
    all_feat.append(len(set(G.predecessors(head)) & set(G.successors(tail))))

    all_feat.append(len(set(nx.common_neighbors(unG, head, tail))))

    # Every predictor yields (u, v, score); one pair in, one triple out.
    pair = [(head, tail)]
    all_feat.append(list(nx.jaccard_coefficient(unG, pair))[0][2])
    all_feat.append(list(nx.resource_allocation_index(unG, pair))[0][2])
    all_feat.append(list(nx.adamic_adar_index(unG, pair))[0][2])
    all_feat.append(list(nx.preferential_attachment(unG, pair))[0][2])
    return all_feat
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``'Future Connection'`` column that is
    null for the pairs to predict). Two feature columns are added to
    ``future_connections`` in place, then a gradient-boosting classifier
    is fitted on the labeled rows.

    Returns:
        ``pd.Series`` indexed by the unlabeled pairs, holding the predicted
        probability of class 1.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    feature_cols = ['pref_attachment', 'comm_neighbors']
    pairs = future_connections.index

    # One predictor call per pair; each yields a single (u, v, score).
    pa_scores = []
    for node_pair in pairs:
        _, _, score = next(iter(nx.preferential_attachment(G, [node_pair])))
        pa_scores.append(score)
    future_connections['pref_attachment'] = pa_scores

    future_connections['comm_neighbors'] = [
        len(list(nx.common_neighbors(G, node_pair[0], node_pair[1])))
        for node_pair in pairs
    ]

    unlabeled = future_connections['Future Connection'].isnull()
    train_data = future_connections[~unlabeled]
    test_data = future_connections[unlabeled]

    clf = GradientBoostingClassifier()
    clf.fit(train_data[feature_cols].values,
            train_data['Future Connection'].values)
    preds = clf.predict_proba(test_data[feature_cols].values)[:, 1]

    return pd.Series(preds, index=test_data.index)
def new_connections_predictions():
    """Predict the probability of a future connection for unlabeled pairs.

    Uses the module-level graph ``G`` and DataFrame ``future_connections``
    (indexed by node pair, with a ``'Future Connection'`` column that is
    null for the pairs to predict). Three feature columns are added to
    ``future_connections`` in place:

    * ``'Department'``: 1.0 when both nodes share the same ``'Department'``
      node attribute, otherwise 0.0
    * ``'pa'``: preferential attachment score
    * ``'cn'``: number of common neighbours

    A logistic regression is fitted on the rows without missing values.

    Returns:
        ``pd.Series`` indexed by the unlabeled pairs, holding the predicted
        probability of class 1.
    """
    from sklearn.linear_model import LogisticRegression

    feature_cols = ['Department', 'pa', 'cn']
    df = future_connections
    pairs = future_connections.index

    # ``G.nodes[...]`` replaces ``G.node[...]``, removed in networkx 2.4.
    df['Department'] = [
        1. if G.nodes[connection[0]]['Department'] ==
        G.nodes[connection[1]]['Department'] else 0.
        for connection in pairs
    ]
    df['pa'] = [i[2] for i in nx.preferential_attachment(G, ebunch=pairs)]
    df['cn'] = [
        len(set(nx.common_neighbors(G, connection[0], connection[1])))
        for connection in pairs
    ]

    df_train = df.dropna()
    df_test = df[df['Future Connection'].isnull()]

    lr = LogisticRegression()
    lr.fit(df_train[feature_cols], df_train['Future Connection'])
    lr_probs = lr.predict_proba(df_test[feature_cols])[:, 1]

    return pd.Series(lr_probs, index=df_test.index)
def get_all_proximity_score(G, edges):
    """Return one list of five proximity scores per edge.

    For every edge the row holds, in order: the sum of the square
    clustering of both endpoints, the number of common neighbours, the
    Jaccard coefficient, the preferential attachment score and the
    resource allocation index.

    Args:
        G: Graph in which the scores are computed.
        edges: Sized sequence of node pairs.

    Returns:
        List with ``len(edges)`` rows, each a list of five values.
    """
    clustering_sum = [
        nx.square_clustering(G, u_v[0]) + nx.square_clustering(G, u_v[1])
        for u_v in edges
    ]
    common_count = [
        len(list(nx.common_neighbors(G, u_v[0], u_v[1]))) for u_v in edges
    ]

    jc = nx.jaccard_coefficient(G, edges)
    pa = nx.preferential_attachment(G, edges)
    rai = nx.resource_allocation_index(G, edges)

    # Drain each predictor completely, in turn, keeping only the score.
    jc_scores = [triple[2] for triple in jc]
    pa_scores = [triple[2] for triple in pa]
    rai_scores = [triple[2] for triple in rai]

    rows = [[] for _ in range(len(edges))]
    columns = (clustering_sum, common_count, jc_scores, pa_scores,
               rai_scores)
    for column in columns:
        for position, value in enumerate(column):
            rows[position].append(value)
    return rows
def predict(self,):
    """Rank the candidate edges with the configured link-prediction index.

    ``self.algo_name`` selects the networkx predictor (``'RAI'``,
    ``'jaccard'``, ``'adamic_adar_index'`` or ``'preferential_attachment'``),
    which is applied to ``self.G`` and ``self.edges_to_prediction``. The
    algorithm name and a ``1`` marker are printed along the way.

    Returns:
        List of ``((u, v), score)`` tuples sorted by score, highest first;
        ties are ordered by the pair itself, also descending.

    Raises:
        ValueError: if ``self.algo_name`` is not one of the known names.
    """
    # Algorithm name -> name of the networkx function implementing it.
    predictor_names = {
        'RAI': 'resource_allocation_index',
        'jaccard': 'jaccard_coefficient',
        'adamic_adar_index': 'adamic_adar_index',
        'preferential_attachment': 'preferential_attachment',
    }

    predictor_name = predictor_names.get(self.algo_name)
    if predictor_name is None:
        raise ValueError(
            'Algorithm was not found: %s Or something weird happened in prediction'
            % self.algo_name)

    print(self.algo_name)
    predictor = getattr(nx, predictor_name)
    preds = predictor(self.G, self.edges_to_prediction)

    # Sort on (score, pair) so ties are broken by the pair, then flip each
    # entry to (pair, score).
    ranked = sorted(((p, (u, v)) for (u, v, p) in preds), reverse=True)
    predictions = [(pair, score) for (score, pair) in ranked]
    print(1)
    return predictions
def similarities_matrices_calc(graphs):
    """Compute pairwise similarity measures for all nodes of a graph.

    The graph is converted to undirected first because the link-prediction
    functions require an undirected graph.

    Args:
        graphs: A single networkx graph.

    Returns:
        Tuple ``(CN, GD, jaccard, adamic, preferential)`` where

        * ``CN`` maps every ordered node pair ``(a, b)`` to its number of
          common neighbours,
        * ``GD`` maps ``(a, b)`` to the shortest-path length; pairs with no
          connecting path are omitted,
        * ``jaccard``, ``adamic`` and ``preferential`` are the iterables
          returned by the networkx predictors when called without an
          explicit pair list. They are returned as-is, not materialised.
    """
    nodes = list(graphs.nodes)
    G = graphs.to_undirected()  # predictors require an undirected graph

    CN = {}
    GD = {}
    for first_node in nodes:
        for second_node in nodes:
            # 6.1 number of common neighbours
            CN[first_node, second_node] = len(
                list(nx.common_neighbors(G, first_node, second_node)))

            # 6.2 graph distance; unreachable pairs are left out.
            # Only "no path" is expected here; anything else should surface
            # instead of being hidden by a bare ``except``.
            try:
                GD[first_node, second_node] = nx.shortest_path_length(
                    G, first_node, second_node)
            except nx.NetworkXNoPath:
                continue

    # 6.3 Jaccard coefficient
    jaccard = nx.jaccard_coefficient(G)
    # 6.4 Adamic-Adar index
    adamic = nx.adamic_adar_index(G)
    # 6.5 preferential attachment
    preferential = nx.preferential_attachment(G)

    return CN, GD, jaccard, adamic, preferential
def preferential_attachment_scores(g_train, train_test_split):
    """Score all non-edges with preferential attachment and evaluate them.

    Args:
        g_train: Training graph. A directed graph is converted to an
            undirected one first, because the networkx predictor is only
            defined for undirected graphs.
        train_test_split: 7-tuple ``(adj_train, train_edges,
            train_edges_false, val_edges, val_edges_false, test_edges,
            test_edges_false)``. Only ``adj_train.shape`` and the two test
            edge sets are used here.

    Returns:
        dict with ``'test_roc'``, ``'test_roc_curve'``, ``'test_ap'`` and
        ``'runtime'`` (seconds spent building the score matrix).
    """
    if g_train.is_directed():  # Only defined for undirected graphs
        g_train = g_train.to_undirected()

    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack input

    start_time = time.time()

    # Node labels are used directly as row/column indices of the matrix.
    pa_matrix = np.zeros(adj_train.shape)
    for u, v, p in nx.preferential_attachment(g_train):
        # p = preferential attachment score of the pair (u, v)
        pa_matrix[u][v] = p
        pa_matrix[v][u] = p  # make sure it's symmetric

    # Normalize to [0, 1]. Skip when every score is zero: dividing by a zero
    # maximum would turn the whole matrix into NaN.
    max_score = pa_matrix.max()
    if max_score > 0:
        pa_matrix = pa_matrix / max_score

    runtime = time.time() - start_time
    pa_roc, pa_roc_curve, pa_ap = get_roc_score(test_edges, test_edges_false,
                                                pa_matrix)

    return {
        'test_roc': pa_roc,
        'test_roc_curve': pa_roc_curve,
        'test_ap': pa_ap,
        'runtime': runtime,
    }
def new_friends(self, G):
    """Randomly add edges between strongly "attached" node pairs.

    Preferential attachment scores are computed on an undirected copy of
    ``G``. For every scored pair a random integer in [0, 100] is drawn.
    The pair becomes a new edge of ``G``, with a weight drawn uniformly
    from [-1, 1], only when its score is at least the current number of
    edges in ``G`` and the drawn integer is at least 90.

    :param G: a networkx digraph (modified in place)
    :return: G
    """
    undirected = G.to_undirected()  # predictor needs an undirected graph

    for u, v, score in nx.preferential_attachment(undirected):
        # Drawn for every pair, whether or not an edge ends up being added.
        chance = random.randint(0, 100)
        if score < len(G.edges) or chance < 90:
            continue
        G.add_edge(u, v, weight=random.uniform(-1, 1))

    return G
# Divisor applied to every shortest-path length feature (g4, h3, h10).
_PATH_LENGTH_SCALE = 50000


def _append_column(X, values):
    """Append one value to each row of X, in order."""
    for row, value in zip(X, values):
        row.append(value)


def _set_community(graph, node, community):
    """Store ``community`` in the 'community' attribute of ``node``."""
    try:
        attrs = graph.nodes[node]
    except TypeError:
        # NetworkX 1.x: ``nodes`` is a method and attributes live in ``node``.
        attrs = graph.node[node]
    attrs['community'] = community


def _partition_column(graph, path, L, scorer):
    """Load a "<node> <community>" file into graph, then score every pair."""
    with open(path, "r") as f:
        for line in f:
            if not line.strip():  # tolerate blank lines
                continue
            node, community = line.split()
            _set_community(graph, node, int(community))
    preds = {(u, v): p for (u, v, p) in scorer(graph, L)}
    return [preds[(u, v)] for (u, v) in L]


def _dendrogram_column(graph, path, L, scorer):
    """Sum a community-based score over the levels stored in a dendrogram file.

    The first line of the file is skipped. Each later line containing 'level'
    with a non-zero level number triggers scoring with the communities read so
    far; every other non-blank line is a "<node> <community>" assignment.
    After the file ends, the pairs are scored once more.
    """
    totals = {(u, v): 0.0 for (u, v) in L}
    with open(path, "r") as f:
        f.readline()
        for line in f:
            if 'level' in line:
                if int(line.split()[1]) != 0:
                    for (u, v, p) in scorer(graph, L):
                        totals[(u, v)] += p
            elif line.strip():
                node, community = line.split()
                _set_community(graph, node, int(community))
    final = {(u, v): p for (u, v, p) in scorer(graph, L)}
    return [totals[(u, v)] + final[(u, v)] for (u, v) in L]


def _within_inter_cluster_half(graph, pairs):
    """``nx.within_inter_cluster`` with delta=0.5 (used by g9 and h15)."""
    return nx.within_inter_cluster(graph, pairs, delta=0.5)


def _without_edge(graph, u, v, metric):
    """Return ``metric(graph, u, v)`` computed with edge (u, v) hidden.

    If the edge exists it is removed for the duration of the call and then
    restored together with its attributes, even when ``metric`` raises.
    """
    if not graph.has_edge(u, v):
        return metric(graph, u, v)
    attrs = dict(graph[u][v])
    graph.remove_edge(u, v)
    try:
        return metric(graph, u, v)
    finally:
        graph.add_edge(u, v)
        graph[u][v].update(attrs)


def _scaled_path_length(graph, u, v):
    """Shortest-path length divided by the scale constant; 1 if unreachable."""
    if nx.has_path(graph, u, v):
        return nx.shortest_path_length(graph, source=u,
                                       target=v) / _PATH_LENGTH_SCALE
    return 1


def _common_neighbor_count(graph, u, v):
    """Number of common neighbours of u and v."""
    return len(list(nx.common_neighbors(graph, u, v)))


def _approx_katz(graph, u, v):
    """Count neighbour pairs (x of u, y of v) that are equal or adjacent."""
    return sum(1 for x in graph.neighbors(u) for y in graph.neighbors(v)
               if x == y or graph.has_edge(x, y))


def _clustering_product(graph, u, v):
    """Product of the clustering coefficients of u and v."""
    return nx.clustering(graph, u) * nx.clustering(graph, v)


def _common_friend_closeness(graph, u, v):
    """TCFC: over common neighbours w, sum |CN(u, w)| * |CN(v, w)|."""
    return sum(
        _common_neighbor_count(graph, u, w) *
        _common_neighbor_count(graph, v, w)
        for w in nx.common_neighbors(graph, u, v))


def _common_friend_checkins(graph, u, v):
    """TCFCC: over common neighbours w, sum CCC(u, w) * CCC(v, w)."""
    return sum(
        calculate_CCC(graph, u, w) * calculate_CCC(graph, v, w)
        for w in nx.common_neighbors(graph, u, v))


def _planar_distance(a, b):
    """Euclidean distance between the first two coordinates of a and b."""
    return np.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2)


def _route_length(checkins):
    """Total distance between consecutive check-ins whose spot is in S.

    A position of exactly (0.0, 0.0) means "no previous spot yet", as in the
    original implementation.
    """
    length = 0.0
    last = [0.0, 0.0]
    for k in checkins:
        if k[1] not in S:
            continue
        here = S[k[1]]['LL']
        if not (last[0] == 0.0 and last[1] == 0.0):
            length += _planar_distance(last, here)
        last = [here[0], here[1]]
    return length


def _non_numeric_neighbors(graph, node):
    """Yield neighbours of node whose id is not numeric.

    The h5-h7 features treat these as location nodes.
    """
    return (nb for nb in graph.neighbors(node) if not nb.isnumeric())


def _second_hop_count(graph, node):
    """Sum of the neighbour counts of node's non-numeric neighbours.

    NOTE(review): the original inner test re-checked the first-hop neighbour
    instead of the second-hop one, so every second-hop neighbour is counted.
    That behaviour is preserved here. Confirm whether only non-numeric
    second-hop neighbours were meant to be counted.
    """
    return sum(
        sum(1 for _ in graph.neighbors(loc))
        for loc in _non_numeric_neighbors(graph, node))


def _location_degree_sum(graph, node):
    """Sum of LG degrees of node's non-numeric neighbours present in LG."""
    return sum(
        LG.degree(loc) for loc in _non_numeric_neighbors(graph, node)
        if loc in LG)


def get_features(L, flag):
    """Build one feature row per candidate pair.

    Reads the module-level graphs ``G``, ``B``, ``HB``, ``H``, ``LG``, the
    lookup tables ``U``, ``C``, ``S`` and the helper ``calculate_CCC``.

    Side effects: edges of ``G``, ``H`` and ``HB`` are temporarily removed
    while a pair is scored and restored afterwards. The community features
    read ``best_part_*.txt`` / ``dendo_*.txt`` from the working directory and
    overwrite the ``'community'`` node attribute.

    :param L: sized, re-iterable collection of ``(u, v)`` node pairs.
    :param flag: mapping from feature name (``'g0'``..``'g12'``,
        ``'c0'``..``'c12'``, ``'h0'``..``'h18'``) to a bool. A feature is
        computed only when its value is literally ``True``. Every name must
        be present.
    :return: list ``X`` with ``len(L)`` rows; each row holds the enabled
        features in the fixed order g0..g12, c0..c12, h0..h18.
    """
    X = [[] for _ in range(len(L))]

    def enabled(name):
        # Announce and report whether the named feature is switched on.
        if flag[name] is True:
            print("get feature " + name)
            return True
        return False

    def add_scores(predictions):
        # Column from the score part of (u, v, score) triples.
        _append_column(X, (p for (_, _, p) in predictions))

    def add_per_pair(value_of):
        # Column from a function of the pair.
        _append_column(X, (value_of(u, v) for (u, v) in L))

    def add_hidden_edge(graph, metric):
        # Column from a graph metric computed with the pair's own edge hidden.
        _append_column(X, (_without_edge(graph, u, v, metric)
                           for (u, v) in L))

    def add_community_features(prefix, start, graph, partition_path,
                               dendrogram_path):
        # Six consecutive features: three scored on the best partition, then
        # three accumulated over the dendrogram levels.
        names = [prefix + str(start + offset) for offset in range(6)]
        partition_scorers = (nx.cn_soundarajan_hopcroft,
                             nx.ra_index_soundarajan_hopcroft,
                             _within_inter_cluster_half)
        dendrogram_scorers = (nx.cn_soundarajan_hopcroft,
                              nx.ra_index_soundarajan_hopcroft,
                              nx.within_inter_cluster)
        for name, scorer in zip(names[:3], partition_scorers):
            if enabled(name):
                _append_column(
                    X, _partition_column(graph, partition_path, L, scorer))
        for name, scorer in zip(names[3:], dendrogram_scorers):
            if enabled(name):
                _append_column(
                    X, _dendrogram_column(graph, dendrogram_path, L, scorer))

    #=====================Social features(user-to-user graph)======================
    #g0.adamic adar score
    if enabled('g0'):
        add_scores(nx.adamic_adar_index(G, L))
    #g1.jaccard coefficient
    if enabled('g1'):
        add_scores(nx.jaccard_coefficient(G, L))
    #g2.resource_allocation
    if enabled('g2'):
        add_scores(nx.resource_allocation_index(G, L))
    #g3.preferential_attachment
    if enabled('g3'):
        add_scores(nx.preferential_attachment(G, L))
    #g4.shortest path length
    if enabled('g4'):
        add_hidden_edge(G, _scaled_path_length)
    #g5.common neighbors
    if enabled('g5'):
        add_hidden_edge(G, _common_neighbor_count)
    #g6.Approximate katz for social graph
    if enabled('g6'):
        add_hidden_edge(G, _approx_katz)
    #g7-g12.community-based scores on G
    add_community_features('g', 7, G, "best_part_G.txt", "dendo_G.txt")

    #=========================checkin features=========================================
    #c0.follower number
    if enabled('c0'):
        add_per_pair(lambda u, v: U[u]['follow_cnt'] * U[v]['follow_cnt'])
    #c1.same time same location
    if enabled('c1'):
        add_per_pair(lambda u, v: calculate_CCC(G, u, v))
    #c2.same time same distinct spot
    if enabled('c2'):

        def shared_checkin_spots(u, v):
            seen = set()
            for k in C[u]:
                if k[1] not in seen and k in C[v]:
                    seen.add(k[1])
            return len(seen)

        add_per_pair(shared_checkin_spots)
    #c3.same distinct spot (not necessarily same time)
    if enabled('c3'):
        add_per_pair(lambda u, v: len({k[1] for k in C[u]} &
                                      {m[1] for m in C[v]}))
    #c4.min Entropy
    if enabled('c4'):

        def min_shared_entropy(u, v):
            entropies = [
                S[k[1]]['entropy'] for k in C[u]
                if k in C[v] and k[1] in S and S[k[1]]['entropy'] > 0
            ]
            return min(entropies) if entropies else 0

        add_per_pair(min_shared_entropy)
    #c5.distance of mean_LL
    if enabled('c5'):
        add_per_pair(lambda u, v: _planar_distance(U[u]['mean_LL'],
                                                   U[v]['mean_LL']))
    #c6.weighted same location
    if enabled('c6'):

        def weighted_same_location(u, v):
            p = 0
            for k in C[u]:
                if k in C[v] and k[1] in S:
                    p += _planar_distance(S[k[1]]['LL'], U[u]['mean_LL'])
                    p += _planar_distance(S[k[1]]['LL'], U[v]['mean_LL'])
            return p

        add_per_pair(weighted_same_location)
    #c7.PP
    if enabled('c7'):
        add_per_pair(lambda u, v: len(C[u]) * len(C[v]))
    #c8.Total Common Friend Closeness (TCFC)
    if enabled('c8'):
        add_hidden_edge(G, _common_friend_closeness)
    #c9.Total Common friend Checkin Count (TCFCC)
    if enabled('c9'):
        add_hidden_edge(G, _common_friend_checkins)
    #c10.Common Category Checkin Counts Product (CCCP)
    if enabled('c10'):
        add_per_pair(lambda u, v: sum(
            U[u]['cate'][cat] * U[v]['cate'][cat]
            for cat in U[u]['cate'] if cat in U[v]['cate']))
    #c11.Common Category Checkin Counts Product Ratio(CCCPR)
    if enabled('c11'):

        def category_product_ratio(u, v):
            u_cate_total = sum(U[u]['cate'][cat]**2 for cat in U[u]['cate'])
            v_cate_total = sum(U[v]['cate'][cat]**2 for cat in U[v]['cate'])
            return sum(U[u]['cate'][cat] * U[v]['cate'][cat] /
                       np.sqrt(u_cate_total * v_cate_total)
                       for cat in U[u]['cate'] if cat in U[v]['cate'])

        add_per_pair(category_product_ratio)
    #c12.trip route length all
    if enabled('c12'):
        add_per_pair(lambda u, v: _route_length(C[u]) + _route_length(C[v]))

    #=========================Heter Graph features=====================================
    #h0.Approximate katz for bipartite graph
    if enabled('h0'):
        add_per_pair(lambda u, v: _approx_katz(B, u, v))
    #h1.Approximate katz on HB
    if enabled('h1'):
        add_hidden_edge(HB, _approx_katz)
    #h2.Approximate katz on H
    if enabled('h2'):
        add_hidden_edge(H, _approx_katz)
    #h3.shortest path length on B
    if enabled('h3'):
        add_per_pair(lambda u, v: _scaled_path_length(B, u, v))
    #h4.clustering coefficient on H
    if enabled('h4'):
        add_hidden_edge(H, _clustering_product)
    #h5.number of (user's loc friends)'s loc friends
    if enabled('h5'):
        add_per_pair(lambda u, v: _second_hop_count(H, u) *
                     _second_hop_count(H, v))
    #h6.location friends' degree sum
    if enabled('h6'):
        add_per_pair(lambda u, v: _location_degree_sum(H, u) *
                     _location_degree_sum(H, v))
    #h7.Approximate katz restricted to non-numeric neighbours on H
    if enabled('h7'):
        add_per_pair(lambda u, v: sum(
            1 for x in _non_numeric_neighbors(H, u)
            for y in _non_numeric_neighbors(H, v)
            if x == y or H.has_edge(x, y)))
    #h8.adamic adar score on H
    if enabled('h8'):
        add_scores(nx.adamic_adar_index(H, L))
    #h9.resource_allocation on H
    if enabled('h9'):
        add_scores(nx.resource_allocation_index(H, L))
    #h10.shortest path length on H
    if enabled('h10'):
        add_hidden_edge(H, _scaled_path_length)
    #h11.common neighbors on H
    if enabled('h11'):
        add_hidden_edge(H, _common_neighbor_count)
    #h12.Approximate katz on H (same computation as h2)
    if enabled('h12'):
        add_hidden_edge(H, _approx_katz)
    #h13-h18.community-based scores on HB
    add_community_features('h', 13, HB, "best_part_HB.txt", "dendo_HB.txt")

    return X
# ### Extracting attributes
#
# `nx.get_edge_attributes` returns the edge attributes keyed by edge, so the
# result can be turned into a DataFrame column directly.

# In[ ]:

df['weight'] = pd.Series(nx.get_edge_attributes(G, name='weight'))
df

# ### Creating edge based features
#
# Many of the networkx functions related to edges return nested data
# structures. The relevant value can be pulled out by unpacking each item.

# In[ ]:

df['preferential attachment'] = [
    score for _u, _v, score in nx.preferential_attachment(G, df.index)
]
df

# When the function expects two nodes to be passed in, the index can be mapped
# through a lambda function.

# In[ ]:

df['Common Neighbors'] = df.index.map(
    lambda city: sum(1 for _ in nx.common_neighbors(G, city[0], city[1])))
df
def similarity_matrices(edges, nodes):
    """Score node pairs with five similarity measures and report hit rates.

    Builds a directed graph from ``edges`` plus an undirected copy, then fills
    one ``len(nodes) x len(nodes)`` matrix per measure: graph distance (on the
    directed graph), common neighbours, Jaccard coefficient, Adamic-Adar and
    preferential attachment (on the undirected copy). A cell is -1 when its
    measure cannot be computed for that pair.

    For each entry ``par`` of the module-level ``parameter_list`` the ``par``
    largest cells of one matrix are selected. Entries 0-3 use distance, common
    neighbours, Jaccard and Adamic-Adar respectively; every later entry uses
    preferential attachment. The fraction of selected pairs that appear in
    ``edges`` in either direction is printed.

    NOTE(review): the distance ranking also selects the *largest* values, i.e.
    the most distant pairs. Confirm this is intended.

    :param edges: collection of ``(source, target)`` pairs; also used for the
        membership test, so its items must compare equal to tuples.
    :param nodes: indexable sequence of node labels defining matrix order.
    :return: ``-1`` if ``nodes`` is empty, otherwise ``None``.
    """
    if len(nodes) == 0:
        print('V* is empty, skipping to next t.')
        return -1

    g = nx.DiGraph()
    g.add_edges_from(edges)
    ung = nx.Graph(g)

    size = len(nodes)
    gd = zeros((size, size))
    cn = zeros((size, size))
    jc = zeros((size, size))
    a = zeros((size, size))
    pa = zeros((size, size))

    for i in range(size):
        for j in range(size):
            first, second = nodes[i], nodes[j]
            pair = [(first, second)]
            try:
                gd[i][j] = nx.shortest_path_length(g, first, second)
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                gd[i][j] = -1
            try:
                # list() instead of sorted(): only the count matters, and
                # sorting would fail on labels that cannot be ordered.
                cn[i][j] = len(list(nx.common_neighbors(ung, first, second)))
            except nx.NetworkXError:
                cn[i][j] = -1
            try:
                for _, _, p in nx.jaccard_coefficient(ung, pair):
                    jc[i][j] = p
            except Exception:
                # Best effort: any scoring failure marks the pair as unknown,
                # but KeyboardInterrupt/SystemExit still propagate.
                jc[i][j] = -1
            try:
                for _, _, p in nx.adamic_adar_index(ung, pair):
                    a[i][j] = p
            except (ZeroDivisionError, nx.NetworkXError):
                a[i][j] = -1
            try:
                for _, _, p in nx.preferential_attachment(ung, pair):
                    pa[i][j] = p
            except Exception:
                pa[i][j] = -1

    ####
    rankings = [(gd, 'Pgd'), (cn, 'Pcn'), (jc, 'Pjc'), (a, 'Pa')]
    for k, par in enumerate(parameter_list):
        ref, t = rankings[k] if k < len(rankings) else (pa, 'Ppa')

        # Repeatedly take the largest remaining cell and knock it out.
        ind_list = []
        for _ in range(par):
            row, col = divmod(argmax(ref), size)
            ref[row][col] = -1
            ind_list.append((row, col))

        cnt = 0
        for row, col in ind_list:
            if (nodes[row], nodes[col]) in edges or \
                    (nodes[col], nodes[row]) in edges:
                cnt += 1

        # No selections means no defined hit rate; avoid dividing by zero.
        print(t, cnt / par if par else float('nan'))
# In[9]:

df['weight'] = pd.Series(nx.get_edge_attributes(G, name='weight'))
df

# ### Creating edge based features
#
# Many of the networkx functions related to edges return nested data
# structures. The relevant value can be pulled out by unpacking each item.

# In[10]:

df['preferential attachment'] = [
    score for _u, _v, score in nx.preferential_attachment(G, df.index)
]
df

# When the function expects two nodes to be passed in, the index can be mapped
# through a lambda function.

# In[11]:

df['Common Neighbors'] = df.index.map(
    lambda city: sum(1 for _ in nx.common_neighbors(G, city[0], city[1])))
df

# In[ ]:
print('Reading %s_topological_network.csv...' % (prog_languages[prog_lang_id]))
# Each row contributes one developer pair; both ids are parsed as integers.
t_network = [(int(row[0]), int(row[1])) for row in data]
csvfile.close()

with open('../Files/topological_metrics.csv', 'a') as a:
    metrics_file = csv.writer(a, delimiter=',')
    print('Writing topological metrics for', prog_languages[prog_lang_id])
    for dev_pair in t_network:
        single_pair = [dev_pair]
        # Each predictor is asked about exactly one pair, so the last (and
        # only) triple it yields carries that pair's score.
        NO = list(nx.jaccard_coefficient(G, single_pair))[-1][2]
        AA = list(nx.adamic_adar_index(G, single_pair))[-1][2]
        PA = list(nx.preferential_attachment(G, single_pair))[-1][2]
        metrics_file.writerow(
            [prog_lang_id, dev_pair[0], dev_pair[1], NO, AA, PA])
# The with-block has already closed the file; no explicit close is needed.
common_neighbors = np.zeros(n)

# computing features for training set
for i in tqdm(range(len(id1))):
    src, dst = id1[i], id2[i]
    pair = [(src, dst)]

    # Pairs labelled as links have their edge removed while being scored and
    # put back afterwards (presumably so the label cannot leak into the
    # features).
    is_link = training.at[str(src) + "|" + str(dst), "target"] == 1
    if is_link:
        G.remove_edge(src, dst)

    # Each predictor yields a single (u, v, score) triple for a one-pair query.
    jaccard[i] = next(iter(nx.jaccard_coefficient(G, pair)))[2]
    adar[i] = next(iter(nx.adamic_adar_index(G, pair)))[2]
    preferential_attachment[i] = next(
        iter(nx.preferential_attachment(G, pair)))[2]
    resource_allocation_index[i] = next(
        iter(nx.resource_allocation_index(G, pair)))[2]
    common_neighbors[i] = sum(1 for _ in nx.common_neighbors(G, src, dst))

    if is_link:
        G.add_edge(src, dst)

# add feature to data-frame
n = nx.number_of_nodes(g) e = nx.number_of_edges(g) d = nx.degree_histogram(g) t = nx.transitivity(g) kc = nx.core_number(g) nx.set_node_attributes(g,'k_core',kc) dc=nx.degree_centrality(g) nx.set_node_attributes(g,'dc',dc) # nihs=nx.get_edge_attributes(g,'nih') print "Graph has %d nodes and %d edges and %f transitivity" %(n, e, t) #grab initial pairs pairs = list(nx.preferential_attachment(g)) pairs += list(nx.preferential_attachment(g,g.edges())) yr = int(str(year)[0:4]) for pair in pairs: x,y,p = pair xyr = int(str(g.node[x]['fyr'])[0:4]) if 'fyr' in g.node[x].keys() else None yyr = int(str(g.node[y]['fyr'])[0:4]) if 'fyr' in g.node[y].keys() else None xft = int(str(g.node[x]['firsttie'])[0:4]) if 'firsttie' in g.node[x].keys() else None yft = int(str(g.node[y]['firsttie'])[0:4]) if 'firsttie' in g.node[y].keys() else None c = len(list(nx.common_neighbors(g,x,y))) row = [yr,str(x)+":"+str(y),int(g.has_edge(x,y)), g.node[x]['dc'], g.node[y]['dc'], yr-xft if xft > 0 and xft < yr else 0, (1 if yr==yyr else 0) if yyr != None else None,
def link_prediction(G, query_nodes, target_nodes, n_edges, start_dist, alg = "ra"):
    """Rank candidate links with a standard link-prediction score and add the
    best ones to a copy of the graph, tracking the absorbing RW centrality.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected. It is not modified;
        all edges are added to a copy.
    query_nodes : list
        The set of nodes from which random walker starts.
    target_nodes : list
        The set of nodes from where the random walker ends.
    n_edges : integer
        The number of links to be added. If fewer eligible candidate links
        exist, only those are added.
    start_dist : matrix
        The starting distribution over the query set; must support
        ``start_dist.dot(column_vector)[0, 0]``.
    alg : string
        The link-prediction algorithm to be used: "ra" (resource allocation),
        "pa" (preferential attachment), "jaccard" or "aa" (Adamic-Adar).

    Returns
    -------
    links : list
        All eligible ``(u, v, score)`` candidates, sorted by decreasing score
        (not only the ones that were added).
    ac_scores : list
        The centrality before any addition, followed by the centrality after
        each added link.
    """
    assert alg in ["ra", "pa", "jaccard", "aa"], "alg must be one of [\"ra\", \"pa\", \"jaccard\", \"aa\"]."
    predictors = {
        'ra': nx.resource_allocation_index,
        'jaccard': nx.jaccard_coefficient,
        'aa': nx.adamic_adar_index,
        'pa': nx.preferential_attachment,
    }

    H = G.copy()
    query_set_size = len(query_nodes)
    map_query_to_org = dict(zip(query_nodes, range(query_set_size)))
    P = csc_matrix(nx.google_matrix(H, alpha=1))
    P_abs = P[list(query_nodes),:][:,list(query_nodes)]
    F = compute_fundamental(P_abs)
    # Baseline uses the same formula as the per-link scores below (row sums of
    # F weighted by the start distribution). The previous F.sum() collapsed F
    # to a single total, which only matches for a uniform start distribution.
    row_sums = start_dist.dot(F.sum(axis = 1))[0,0]

    # Only pairs that are not linked yet are candidates.
    eligible = [(u, v) for u, v in product(query_nodes, target_nodes)
                if not H.has_edge(u, v)]
    links_to_add = sorted(predictors[alg](H, eligible),
                          key=lambda x: x[2], reverse = True)

    ac_scores = [row_sums]
    # Never index past the available candidates.
    for i in range(min(n_edges, len(links_to_add))):
        u, v = links_to_add[i][0], links_to_add[i][1]
        # The fundamental matrix is updated before the edge is inserted in H.
        F = update_fundamental_mat(F, H, map_query_to_org, u)
        H.add_edge(u, v)
        ac_scores.append(start_dist.dot(F.sum(axis = 1))[0,0])
    return links_to_add, ac_scores
CN.sort(key=operator.itemgetter(2), reverse=True)

# Each list below holds (u, v, score) triples ordered from highest to lowest
# score.

# Jaccard coef
jaccard = sorted(nx.jaccard_coefficient(M),
                 key=operator.itemgetter(2), reverse=True)

# Resource Allocation index
RA = sorted(nx.resource_allocation_index(M),
            key=operator.itemgetter(2), reverse=True)

# Adamic-Adar index
AA = sorted(nx.adamic_adar_index(M),
            key=operator.itemgetter(2), reverse=True)

# Preferential Attachment
PA = sorted(nx.preferential_attachment(M),
            key=operator.itemgetter(2), reverse=True)

# Community Common Neighbors !!! requires graph to have node attribute: 'community' !!!
#CCN = list(nx.cn_soundarajan_hopcroft(M))
#CCN.sort(key=operator.itemgetter(2), reverse = True)

# Community Resource Allocation !!! requires graph to have node attribute: 'community' !!!
#CRA = list(nx.ra_index_soundarajan_hopcroft(M))
#CRA.sort(key=operator.itemgetter(2), reverse = True)

# ###################### Prediction on Future Edge Linkage ####################
# NOTE(review): FM is another name for M, not a copy, so the predicted edges
# added below also show up in M. Confirm that this is intended.
FM = M
# Add the top-scoring 10% (relative to the current edge count) as 'new' edges.
FM.add_edges_from(
    [(u, v) for u, v, _score in PA[0:int(0.1 * len(M.edges()))]],
    value='new')
#4 Create seed and time tracker
random.seed(0)
t1 = datetime.now()

#5 Create a test set of 95 percent from G
# random.sample() requires a sequence (Python 3.11+ rejects set-like edge
# views), so the edges are materialised into a list first.
edges_to_remove_from_twt = random.sample(list(twt.edges()),
                                         int(0.05 * twt.number_of_edges()))
twt_test = twt.copy()
twt_test.remove_edges_from(edges_to_remove_from_twt)
print("Number of edges deleted : %d" % len(edges_to_remove_from_twt))
print("Number of edges remaining : %d" % (twt_test.number_of_edges()))

#6 Transform twt_test to undirected
twt_test = twt_test.to_undirected()

#7 Calculate JC and PA AUC as features for negative edges
# "jc" = Jaccard coefficient, "pa" = preferential attachment. The two
# predictors were previously assigned to each other's variables.
pred_jc_test_neg = list(nx.jaccard_coefficient(twt_test))
pred_pa_test_neg = list(nx.preferential_attachment(twt_test))

#8 Calculate JC and PA AUC as features for positive edges
pred_jc_test_pos = list(nx.jaccard_coefficient(twt_test, twt_test.edges()))
pred_pa_test_pos = list(nx.preferential_attachment(twt_test, twt_test.edges()))

#9 Combine negative and positive predictions
pred_jc_test_total = (pred_jc_test_neg + pred_jc_test_pos)
pred_pa_test_total = (pred_pa_test_neg + pred_pa_test_pos)
print("Number of negative edges : %d" % len(pred_jc_test_neg))
print("Number of positive edges : %d" % len(pred_jc_test_pos))

#[2] Dataframe================================================================
#1 Create score dataframe df
def sort_edges_by_preferential_attachment(graph, edges):
    """Order candidate edges by decreasing preferential-attachment score.

    Scores are compared with the module's ``compare_with_ties`` comparison
    function, as before. ``sorted(..., cmp=...)`` exists only on Python 2, so
    the comparison is wrapped with ``functools.cmp_to_key``, which behaves the
    same on Python 2.7 and Python 3.

    :param graph: undirected NetworkX graph to score against.
    :param edges: iterable of ``(u, v)`` node pairs to rank.
    :return: tuple ``(pairs, scores)``: the ``(u, v)`` pairs in ranked order
        and the matching list of scores.
    """
    from functools import cmp_to_key

    score_key = cmp_to_key(compare_with_ties)
    edges_sorted = sorted(nx.preferential_attachment(graph, edges),
                          key=lambda row: score_key(row[2]),
                          reverse=True)
    return ([(row[0], row[1]) for row in edges_sorted],
            [row[2] for row in edges_sorted])