def _print_similar_pairs(sim, threshold=0.01):
    """Print every unordered node pair whose SimRank score is >= *threshold*.

    ``sim`` is the dict-of-dicts returned by ``nx.simrank_similarity``.
    Labels and scores are taken from the same sorted node order, so a printed
    name always belongs to the score printed next to it.
    """
    names = sorted(sim)
    for row, u in enumerate(names):
        # Only pairs below the diagonal are reported: each pair shows up once
        # and the self-similarity entries are skipped.
        for v in names[:row]:
            score = sim[u][v]
            if score >= threshold:
                print(u, " | ", v, " | ", score)


def main():
    """Compare the Aqualung and Ian Anderson wiki graphs.

    Prints the SimRank pairs scoring at least 0.01 for each graph, then
    restricts both graphs to their common nodes and prints summaries of their
    difference and intersection with isolated nodes removed.
    """
    print("Processing Aqualung Wiki")
    Aq = Aqualung()
    print("Processing Ian Anderson Wiki")
    IanA = Ian()

    print("Generating Similarity rankings for Aqualung")
    aq_sim = nx.simrank_similarity(Aq)
    print("Generating Similarity rankings for Ian Anderson")
    # The original computed this on ``Aq`` again, so the "Ian Anderson"
    # section silently repeated the Aqualung scores.
    ian_sim = nx.simrank_similarity(IanA)

    print("\nSimilarities for Aqualung")
    _print_similar_pairs(aq_sim)
    print("\nSimilarities for Ian Anderson")
    _print_similar_pairs(ian_sim)

    # removing nodes for networkx.difference: keep only the nodes that are
    # present in both graphs.
    aq_nodes = set(Aq.nodes)
    ian_nodes = set(IanA.nodes)
    Aq.remove_nodes_from(aq_nodes - ian_nodes)
    IanA.remove_nodes_from(ian_nodes - aq_nodes)

    print("\nComputing Difference")
    D = nx.difference(Aq, IanA)
    D.remove_nodes_from(list(nx.isolates(D)))
    # NOTE(review): nx.info is not available in NetworkX >= 3.0 - confirm the
    # pinned version before upgrading.
    print(nx.info(D))

    # Computing Intersection
    print("\nIntersection")
    I = nx.intersection(Aq, IanA)
    I.remove_nodes_from(list(nx.isolates(I)))
    print(nx.info(I))
def test_simrank_between_versions(self):
    """Compare ``simrank_similarity`` with ``_simrank_similarity_python``.

    Each implementation must reproduce its own reference values to 1e-7,
    differ from the other implementation's reference at 1e-4, and agree
    with it at 1e-3.
    """
    cycle = nx.cycle_graph(5)
    # Reference values recorded for the pure-Python code path (tolerance 1e-4).
    python_ref = dict(enumerate([
        1,
        0.394512499239852,
        0.5703550452791322,
        0.5703550452791323,
        0.394512499239852,
    ]))
    # Reference values recorded for the NumPy code path (tolerance 1e-4).
    numpy_ref = dict(enumerate([
        1.0,
        0.3947180735764555,
        0.570482097206368,
        0.570482097206368,
        0.3947180735764555,
    ]))

    def check(result, own_ref, other_ref):
        assert own_ref == pytest.approx(result, abs=1e-7)
        # versions differ at 1e-4 level but equal at 1e-3
        assert other_ref != pytest.approx(result, abs=1e-4)
        assert other_ref == pytest.approx(result, abs=1e-3)

    check(nx.simrank_similarity(cycle, source=0), numpy_ref, python_ref)
    check(
        nx.similarity._simrank_similarity_python(cycle, source=0),
        python_ref,
        numpy_ref,
    )
def _simrank(self, nodes):
    """Return the SimRank score between ``nodes[0]`` and ``nodes[1]``.

    A directed graph is built from every entry of ``nodes`` plus the edges
    returned by ``self.G_ord1(nodes)``; SimRank is computed on that graph.
    """
    edge_list = self.G_ord1(nodes)
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes.tolist())
    graph.add_edges_from(edge_list)
    scores = nx.simrank_similarity(graph)
    first_row = scores[nodes[0]]
    return first_row[nodes[1]]
def simrank_radius(G, u, r):
    """Score the nodes within ``r`` hops of ``u`` by SimRank similarity to ``u``.

    SimRank is computed on the subgraph induced by the nodes reachable from
    ``u`` with shortest-path cutoff ``r``. The ``degree`` column, however, is
    taken from the full graph ``G``. The ``simrank`` column is passed through
    ``normalize_rdd(df, 1, 1000, 'simrank')`` and then ``log10``.

    Returns a DataFrame with columns ``node_name``, ``degree``, ``simrank``.
    """
    reachable = nx.single_source_shortest_path(G, u, r)
    neighbourhood = G.subgraph(list(reachable.keys()))
    scores = nx.simrank_similarity(neighbourhood, u)

    members = list(scores)
    table = pd.DataFrame({
        'node_name': members,
        'degree': [G.degree(member) for member in members],
        'simrank': [scores[member] for member in members],
    })
    table['simrank'] = normalize_rdd(table, 1, 1000, 'simrank')
    table['simrank'] = np.log10(table['simrank'])
    return table
def test_simrank_source_no_target(self):
    """Single-source SimRank returns the recorded score for every node."""
    cycle = nx.cycle_graph(5)
    want = dict(enumerate([
        1,
        0.3951219505902448,
        0.5707317069281646,
        0.5707317069281646,
        0.3951219505902449,
    ]))
    got = nx.simrank_similarity(cycle, source=0)
    assert want == got

    # For a DiGraph test, use the first graph from the paper cited in
    # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
    paper = nx.DiGraph()
    labels = ["Univ", "ProfA", "ProfB", "StudentA", "StudentB"]
    paper.add_nodes_from(
        (index, {"label": text}) for index, text in enumerate(labels)
    )
    paper.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 4), (4, 2), (3, 0)])
    want = dict(enumerate([
        1,
        0.0,
        0.1323363991265798,
        0.0,
        0.03387811817640443,
    ]))
    # Use the importance_factor from the paper to get the same numbers.
    got = nx.algorithms.similarity.simrank_similarity(
        paper, importance_factor=0.8, source=0
    )
    assert want == got
def test_simrank_no_source_no_target(self):
    """All-pairs SimRank matches the recorded matrices for both graphs."""

    def as_nested(rows):
        # Row i / column j of the matrix becomes nested[i][j].
        return {i: dict(enumerate(row)) for i, row in enumerate(rows)}

    cycle = nx.cycle_graph(5)
    lo = 0.3951219505902448
    hi = 0.3951219505902449
    far = 0.5707317069281646
    want = as_nested([
        [1, lo, far, far, hi],
        [lo, 1, hi, far, far],
        [far, hi, 1, hi, far],
        [far, far, hi, 1, hi],
        [hi, far, far, hi, 1],
    ])
    got = nx.simrank_similarity(cycle)
    assert want == got

    # For a DiGraph test, use the first graph from the paper cited in
    # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
    paper = nx.DiGraph()
    labels = ['Univ', 'ProfA', 'ProfB', 'StudentA', 'StudentB']
    paper.add_nodes_from(
        (index, {'label': text}) for index, text in enumerate(labels)
    )
    paper.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 4), (4, 2), (3, 0)])
    want = as_nested([
        [1, 0.0, 0.1323363991265798, 0.0, 0.03387811817640443],
        [0.0, 1, 0.4135512472705618, 0.0, 0.10586911930126384],
        [0.1323363991265798, 0.4135512472705618, 1,
         0.04234764772050554, 0.08822426608438655],
        [0.0, 0.0, 0.04234764772050554, 1, 0.3308409978164495],
        [0.03387811817640443, 0.10586911930126384, 0.08822426608438655,
         0.3308409978164495, 1],
    ])
    # Use the importance_factor from the paper to get the same numbers.
    got = nx.algorithms.similarity.simrank_similarity(
        paper, importance_factor=0.8
    )
    assert want == got
def test_simrank_source_no_target(self):
    """SimRank from node 0 of a 5-cycle yields the recorded per-node scores."""
    ring = nx.cycle_graph(5)
    reference_scores = [
        1,
        0.3951219505902448,
        0.5707317069281646,
        0.5707317069281646,
        0.3951219505902449,
    ]
    want = dict(zip(range(5), reference_scores))
    got = nx.simrank_similarity(ring, source=0)
    assert want == got
def main():
    """Build and display the user/movie rating graph with derived user edges.

    Steps:
      1. Load users, movies and ratings from the three ``*_noColHdrs.csv``
         files into a directed graph and display the ``rating`` edges.
      2. Connect every ordered pair of distinct users with an edge carrying
         their SimRank ``similarity`` and display it.
      3. Mark user pairs that rated at least one common movie with
         ``is_like`` and display it.
    """
    # Start graph
    G = nx.DiGraph()

    # One user id per line. Only the line terminator is stripped, so a final
    # line without a trailing newline keeps its last character; the context
    # manager closes the file, which the original never did.
    with open("users_noColHdrs.csv") as users_file:
        usrlist = [row.rstrip("\n") for row in users_file]
    for user in usrlist:
        G.add_node(user, user='******')

    # newline='' is what the csv module documents for files handed to reader.
    with open("movies_noColHdrs.csv", mode='r', newline='') as movies_file:
        for line in csv.reader(movies_file):
            G.add_node(line[0], title=line[0], year=line[1],
                       netflixRating=line[2])

    with open("userRatedMovie_noColHdrs.csv", mode='r', newline='') as ratings_file:
        for line in csv.reader(ratings_file):
            G.add_edge(line[0], line[1], rating=line[2])

    print("Node info: ", G.nodes.data())
    print("\nEdge info: ", G.edges.data())
    displayGraphWithEdgeLabels(G, 'rating')

    # Compute SimRank once, on the ratings-only graph. The original called
    # simrank_similarity for every pair while adding the similarity edges,
    # so later scores were computed on a graph already altered by earlier
    # iterations (order-dependent) and cost one full SimRank run per pair.
    similarity = nx.simrank_similarity(G)
    for n in usrlist:
        for y in usrlist:
            if n != y:
                G.add_edge(n, y, similarity=similarity[n][y])
    displayGraphWithEdgeLabels(G, 'similarity')

    # Movies each user points at; user-to-user edges are ignored.
    user_set = set(usrlist)
    rated = {
        user: {target for _, target in G.out_edges(user)
               if target not in user_set}
        for user in usrlist
    }
    for n in usrlist:
        for y in usrlist:
            # Two users are alike when they share at least one rated movie.
            if n != y and rated[n] & rated[y]:
                G.add_edge(n, y, is_like="is_like")
    displayGraphWithEdgeLabels(G, 'is_like')
def CoSimRank(G, src=0, ngh=1, importance_factor=0.85, max_iterations=100, tolerance=0.000001):
    """Compute, time and print the SimRank similarity between two nodes.

    :param G: graph handed to ``nx.simrank_similarity``
    :param src: source node
    :param ngh: target node
    :param importance_factor: forwarded to ``nx.simrank_similarity``
    :param max_iterations: forwarded to ``nx.simrank_similarity``
    :param tolerance: forwarded to ``nx.simrank_similarity``
    :return: the computed similarity (the original printed it and returned
        ``None``; existing callers that ignore the result are unaffected)
    """
    print("Start SimRank")
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    similarity = nx.simrank_similarity(
        G,
        source=src,
        target=ngh,
        importance_factor=importance_factor,
        max_iterations=max_iterations,
        tolerance=tolerance,
    )
    elapsed = time.perf_counter() - started
    print("Time: ", elapsed)
    print("Similarity: ", similarity)
    return similarity
def test_simrank_no_source_no_target(self):
    """All-pairs SimRank on a 5-cycle matches the recorded matrix."""
    ring = nx.cycle_graph(5)
    lo = 0.3951219505902448
    hi = 0.3951219505902449
    far = 0.5707317069281646
    matrix = [
        [1, lo, far, far, hi],
        [lo, 1, hi, far, far],
        [far, hi, 1, hi, far],
        [far, far, hi, 1, hi],
        [hi, far, far, hi, 1],
    ]
    # Row i / column j of the matrix becomes want[i][j].
    want = {
        row_index: dict(enumerate(row))
        for row_index, row in enumerate(matrix)
    }
    got = nx.simrank_similarity(ring)
    assert want == got
def test_simrank_source_and_target(self):
    """SimRank for an explicit (source, target) pair returns a single score."""
    G = nx.cycle_graph(5)
    expected = 1
    actual = nx.simrank_similarity(G, source=0, target=0)
    # This check was missing: the values above were overwritten below
    # without ever being compared, so the cycle-graph case tested nothing.
    assert expected == actual

    # For a DiGraph test, use the first graph from the paper cited in
    # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
    G = nx.DiGraph()
    G.add_node(0, label="Univ")
    G.add_node(1, label="ProfA")
    G.add_node(2, label="ProfB")
    G.add_node(3, label="StudentA")
    G.add_node(4, label="StudentB")
    G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 4), (4, 2), (3, 0)])
    expected = 0.1323363991265798
    # Use the importance_factor from the paper to get the same numbers.
    # Use the pair (0,2) because (0,0) and (0,1) have trivial results.
    actual = nx.algorithms.similarity.simrank_similarity(
        G, importance_factor=0.8, source=0, target=2
    )
    assert expected == actual
def simrank(G, u):
    """Tabulate the SimRank similarity of every node of ``G`` to ``u``.

    Returns a DataFrame with one row per node and the columns ``node_name``,
    ``degree`` (degree in ``G``) and ``simrank`` (raw, un-normalised score).
    """
    scores = nx.simrank_similarity(G, u)
    members = list(scores)
    return pd.DataFrame({
        'node_name': members,
        'degree': [G.degree(member) for member in members],
        'simrank': [scores[member] for member in members],
    })
def test_simrank_source_and_target(self):
    """A node's SimRank similarity with itself is 1."""
    ring = nx.cycle_graph(5)
    got = nx.simrank_similarity(ring, source=0, target=0)
    assert_equal(1, got)
def test_simrank_source_no_target(self):
    """SimRank from node 0 of a 5-cycle yields the recorded per-node scores."""
    ring = nx.cycle_graph(5)
    want = {
        0: 1,
        1: 0.3951219505902448,
        2: 0.5707317069281646,
        3: 0.5707317069281646,
        4: 0.3951219505902449,
    }
    got = nx.simrank_similarity(ring, source=0)
    assert_equal(want, got)
# Loading of the dataframe filename = "data/jester-800-10.csv" jokes_columns = [5, 7, 8, 13, 15, 16, 17, 18, 19, 20] jokes_names = [f"joke_{joke_column}" for joke_column in jokes_columns] jokes_df = pd.read_csv(filename) # Construction of the bipartite graph. for i, user in jokes_df.iterrows(): G.add_node(user["id"]) for name in jokes_names: if user[name] == 1: G.add_edge(user["id"], name) # Calculation of the SimRank similarity matrix sim = nx.simrank_similarity(G, max_iterations=1) # Selection of the user to test. user_id = 16210 # Calculation of the k nearest neighbors. n_neighbors = 5 neighbours = sim[user_id] sorted_neighbors = {k: v for k, v in sorted(neighbours.items(), key=lambda item: item[1], reverse=True)} i = 0 k_sorted_neighbors = {} for k in sorted_neighbors: if i <= n_neighbors: if i > 0:
# Build the graph: each input line is a comma-separated record whose fields
# are passed positionally to G.add_edge.
for line in lines:
    t = tuple(line.strip().split(','))
    G.add_edge(*t)
# nx.hits returns (hubs, authorities); both dicts are re-ordered by node key
# so the printed vectors line up node-by-node.
h, a = nx.hits(G, max_iter=100)
h = dict(sorted(h.items(), key=lambda x: x[0]))
a = dict(sorted(a.items(), key=lambda x: x[0]))
# Authorities are printed first, then hubs, both rounded to 3 decimals.
print(np.round(list(a.values()), 3))
print(np.round(list(h.values()), 3))
# PageRank scores, again ordered by node key.
pr = nx.pagerank(G)
pr = dict(sorted(pr.items(), key=lambda x: x[0]))
print(np.round(list(pr.values()), 3))
# All-pairs SimRank as a dense matrix with rows and columns in sorted node order.
sim = nx.simrank_similarity(G)
lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
sim_array = np.round(array(lol), 3)
print(sim_array)
# Draw the graph and write it to graph.png.
nx.draw(G, with_labels=True, node_size=2000, edge_color='#eb4034', width=3, font_size=16, font_weight=500, arrowsize=20, alpha=0.8)
plt.savefig("graph.png")
def get_simrank_matrix(G):
    """Return the all-pairs SimRank scores of ``G`` as a pandas DataFrame.

    The DataFrame is built with ``DataFrame.from_dict`` from the nested dict
    returned by ``nx.simrank_similarity``.
    """
    return pd.DataFrame.from_dict(nx.simrank_similarity(G))
def total_repair_reg(g, metric='sqeuclidean', reg=0.01, eta=1, log=False):
    """
    Repairing of the graph with OT and the regularised (``compute_transport``) version

    :param g: a graph to repair. The protected attribute is the node attribute ``'s'``
        (rows with ``s == 0`` and ``s == 1`` form the two groups)
    :param metric: the distance metric for the cost matrix; one of the ``ot.dist``
        metrics listed in ``otdists`` or ``'simrank'``
    :param reg: entropic regularisation term, forwarded to ``compute_transport``
    :param eta: unused, kept for interface compatibility
    :param log: unused, kept for interface compatibility
    :return: ``(new_x, s, gamma, m)`` - the repaired adjacency matrix, the protected
        attribute vector, the transportation plan and the normalised cost matrix
    :raises ValueError: if ``metric`` is not supported
    """
    x = nx.adjacency_matrix(g)
    s = nx.get_node_attributes(g, 's')
    s = np.fromiter(s.values(), dtype=int)

    otdists = [
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
        'mahalanobis', 'matching', 'seuclidean', 'sqeuclidean',
    ]

    if issparse(x):
        x = x.todense()

    # Separate rows adjacency matrix based on the protected attribute
    idx_p0 = np.where(s == 0)
    x_0 = x[idx_p0]

    idx_p1 = np.where(s == 1)
    x_1 = x[idx_p1]

    n0 = x_0.shape[0]
    n1 = x_1.shape[0]

    # loss matrix
    if metric in otdists:
        m = np.asarray(ot.dist(x_0, x_1, metric=metric))
    elif metric == 'simrank':
        # NOTE(review): this is the full all-pairs matrix in sorted-node
        # order, not an (n0 x n1) group-to-group matrix like the ot.dist
        # branch - confirm which shape callers of the returned ``m`` expect.
        sim = nx.simrank_similarity(g)
        m_sim = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
        m = np.asarray(m_sim)
    else:
        # Previously ``m`` stayed unbound here and the normalisation below
        # failed with an opaque UnboundLocalError.
        raise ValueError(
            "unsupported metric %r; expected 'simrank' or one of %s"
            % (metric, otdists)
        )

    m = np.asarray(m / m.max())

    # Transport
    # kwargs = {'sim': 'gauss', 'alpha': 0.5}
    kwargs = {'sim': 'knn', 'nn': 5, 'alpha': 0.5}
    gamma = compute_transport(x_0, x_1, method='laplace', metric=metric,
                              weights='unif', reg=reg, nbitermax=5000,
                              solver=None, wparam=1, **kwargs)

    # Total data repair: each group is blended with the transported image of
    # the other group, weighted by the group proportions.
    pi_0 = n0 / (n0 + n1)
    pi_1 = 1 - pi_0

    x_0_rep = pi_0 * x_0 + n0 * pi_1 * np.dot(gamma, x_1)
    x_1_rep = pi_1 * x_1 + n1 * pi_0 * np.dot(gamma.T, x_0)

    new_x = np.zeros(x.shape)
    new_x[idx_p0, :] = x_0_rep
    new_x[idx_p1, :] = x_1_rep

    return new_x, s, gamma, m
def test_simrank_no_source_no_target(self):
    """All-pairs SimRank on a 5-cycle matches the recorded matrix."""
    ring = nx.cycle_graph(5)
    lo = 0.3951219505902448
    hi = 0.3951219505902449
    far = 0.5707317069281646
    want = {}
    for row_index, row in enumerate([
        [1, lo, far, far, hi],
        [lo, 1, hi, far, far],
        [far, hi, 1, hi, far],
        [far, far, hi, 1, hi],
        [hi, far, far, hi, 1],
    ]):
        # Row i / column j of the matrix becomes want[i][j].
        want[row_index] = dict(enumerate(row))
    got = nx.simrank_similarity(ring)
    assert_equal(want, got)
def total_repair_emd(g, metric='euclidean', log=False, name='plot_cost_gamma'):
    """
    Repairing of the graph with OT and the emd version

    :param g: a graph to repair. The protected attribute is the node attribute ``'s'``
        (rows with ``s == 0`` and ``s == 1`` form the two groups)
    :param metric: the distance metric for the cost matrix; one of the ``ot.dist``
        metrics listed in ``otdists`` or ``'simrank'``
    :param log: if true plot the cost matrix and the transportation plan
    :param name: name of the file to save the figures
    :return: ``(new_x, s, gamma, m)`` - the repaired adjacency matrix, the protected
        attribute vector, the transportation plan and the normalised cost matrix
    :raises ValueError: if ``metric`` is not supported
    """
    x = nx.adjacency_matrix(g)
    s = nx.get_node_attributes(g, 's')
    s = np.fromiter(s.values(), dtype=int)

    otdists = [
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
        'mahalanobis', 'matching', 'seuclidean', 'sqeuclidean',
    ]

    if issparse(x):
        x = x.todense()

    # Separate rows adjacency matrix based on the protected attribute
    idx_p0 = np.where(s == 0)
    x_0 = x[idx_p0]

    idx_p1 = np.where(s == 1)
    x_1 = x[idx_p1]

    n0 = x_0.shape[0]
    n1 = x_1.shape[0]

    # Uniform distributions on samples
    a = np.ones((n0, )) / n0
    b = np.ones((n1, )) / n1

    # loss matrix
    if metric in otdists:
        m = np.asarray(ot.dist(x_0, x_1, metric=metric))
    elif metric == 'simrank':
        sim = nx.simrank_similarity(g)
        # Index the scores in graph node order (the row order of the
        # adjacency matrix and of ``s``) and keep only group-0 rows against
        # group-1 columns. The original built the full (n x n) matrix in
        # sorted order, whose shape cannot match the (n0, n1) cost matrix
        # that ot.emd(a, b, m) requires.
        # NOTE(review): SimRank is a similarity, yet it is used as a cost
        # here exactly as before - confirm that is intended.
        node_order = list(g.nodes())
        full = np.asarray([[sim[u][v] for v in node_order] for u in node_order])
        m = full[np.ix_(idx_p0[0], idx_p1[0])]
    else:
        # Previously ``m`` stayed unbound here and the normalisation below
        # failed with an opaque UnboundLocalError.
        raise ValueError(
            "unsupported metric %r; expected 'simrank' or one of %s"
            % (metric, otdists)
        )

    m = np.asarray(m / m.max())

    # Exact transport
    gamma = ot.emd(a, b, m)

    # Total data repair: each group is blended with the transported image of
    # the other group, weighted by the group proportions.
    pi_0 = n0 / (n0 + n1)
    pi_1 = 1 - pi_0

    x_0_rep = pi_0 * x_0 + n0 * pi_1 * np.dot(gamma, x_1)
    x_1_rep = pi_1 * x_1 + n1 * pi_0 * np.dot(gamma.T, x_0)

    new_x = np.zeros(x.shape)
    new_x[idx_p0, :] = x_0_rep
    new_x[idx_p1, :] = x_1_rep

    if log:
        # Save before showing: once show() returns the figure may already be
        # closed, and savefig would then write an empty image. Each plot gets
        # its own figure so the second one is not drawn over the first.
        plt.figure()
        plt.imshow(gamma)
        plt.colorbar()
        plt.savefig('gamma_' + name + '.png')
        plt.show()

        plt.figure()
        plt.imshow(m)
        plt.colorbar()
        plt.savefig('costMatrix_' + name + '.png')
        plt.show()

    return new_x, s, gamma, m
# print(len(tup)) # tup.sort(key = lambda x: float(x[0]), reverse = True) # # i=0 # for a,b,c in tup: # i = i+1 # print(a,b,c) # if i == 20: # break # ============================================================================= # ============================================================================= # node = [] # while len(output) > 0: # y = output.pop() # node.append(y) # # print(len(node)) # ============================================================================= s = nx.simrank_similarity(G, output[0], output[1]) print(s) #f = open('resul7.txt', 'w') #f.write(str(s)) #f.close # ============================================================================= # for x in node: # print(x) # print(s[x]) # ============================================================================= print("sim")
def test_simrank_source_and_target(self):
    """A node's SimRank similarity with itself is 1."""
    ring = nx.cycle_graph(5)
    want = 1
    got = nx.simrank_similarity(ring, source=0, target=0)
    assert want == got
def update_output_div_similaritytable(n_clicks,Account_1,Account_2,amount_range,start_date,end_date):
    """Build the SimRank similarity table for the transactions around two accounts.

    The transactions of ``Account_1`` and ``Account_2`` (as returned by
    ``filter_df`` for the date and amount window) are extended with the
    transactions between the other accounts they touch, turned into a
    ``MultiDiGraph``, and scored with the outgoing and incoming SimRank
    helpers. One table row is produced per ordered pair of graph nodes, keyed
    ``"<i>_<j>"``.

    :param n_clicks: not read by this function
    :param Account_1: first account id; ``None`` prevents the update
    :param Account_2: second account id; ``None`` prevents the update
    :param amount_range: sequence whose first two items are the minimum and
        maximum transaction amount
    :param start_date: date string starting with ``YYYY-MM-DD``
    :param end_date: date string starting with ``YYYY-MM-DD``
    :return: a ``dash_table.DataTable`` with columns ``Accounts``,
        ``sim_out`` and ``sim_in``
    :raises PreventUpdate: if either account or either date is ``None``
    """
    if Account_1 is None or Account_2 is None or start_date is None or end_date is None:
        print('Preventing update of time and frequency graphs')
        raise PreventUpdate

    sd = datetime.strptime(start_date.split(' ')[0], '%Y-%m-%d').date()
    ed = datetime.strptime(end_date.split(' ')[0], '%Y-%m-%d').date()
    granular_bank_data = df[(df['datee'] >= sd) & (df['datee'] <= ed)
                            & (df['amount'] >= amount_range[0])
                            & (df['amount'] <= amount_range[1])]

    adj_data = filter_df(Account_1, sd, ed, amount_range[0], amount_range[1])
    if Account_1 != Account_2:
        a_temp = filter_df(Account_2, sd, ed, amount_range[0], amount_range[1])
        # Account_2's transactions, leaving out those with Account_1, which
        # are already part of adj_data.
        a = a_temp[(a_temp['originn'] == Account_2) & (a_temp['dest'] != Account_1)]
        b = a_temp[(a_temp['dest'] == Account_2) & (a_temp['originn'] != Account_1)]
        adj_data = pd.concat([adj_data, a, b])

    # Transactions between the neighbouring accounts themselves. The rows
    # added here only repeat origin/destination values that are already
    # present, so the unique values can be taken once and all pieces
    # concatenated in a single call instead of re-copying the growing frame
    # on every iteration. Row order matches the original nested loop.
    origins = [i for i in adj_data['originn'].unique()
               if i != Account_1 and i != Account_2]
    dests = [j for j in adj_data['dest'].unique()
             if j != Account_1 and j != Account_2]
    extra = [
        granular_bank_data[(granular_bank_data['originn'] == i)
                           & (granular_bank_data['dest'] == j)]
        for i in origins
        for j in dests
    ]
    if extra:
        adj_data = pd.concat([adj_data] + extra)

    two_edges = pd.DataFrame({
        'source': adj_data['originn'],
        'target': adj_data['dest'],
        'weight': adj_data['amount'],
        'color': ['green' if x <= 100 else 'red' for x in adj_data['amount']],
    })
    G_two_edge = nx.from_pandas_edgelist(two_edges, 'source', 'target',
                                         edge_attr=['weight', 'color'],
                                         create_using=nx.MultiDiGraph())

    sim_outgoing = nx.simrank_similarity(G_two_edge)
    sim_incoming = simrank_similarity_Incoming(G_two_edge)

    # Both tables are keyed by the nodes of sim_outgoing, as in the original.
    nodes = list(sim_outgoing.keys())
    new_out = {i + '_' + j: sim_outgoing_two_nodes(sim_outgoing, i, j)
               for i in nodes for j in nodes}
    new_in = {i + '_' + j: sim_incoming_two_nodes(sim_incoming, i, j)
              for i in nodes for j in nodes}

    # Build the table directly from the two dicts, which share the same keys
    # in the same order. Unlike the original column-by-column assembly this
    # also works when there are no pairs.
    result_df1 = pd.DataFrame({
        'Accounts': list(new_out.keys()),
        'sim_out': list(new_out.values()),
        'sim_in': list(new_in.values()),
    })

    columns = [
        {"name": 'Accounts', "id": 'Accounts'},
        {"name": 'sim_out', "id": 'sim_out'},
        {"name": 'sim_in', "id": 'sim_in'},
    ]
    data = result_df1.to_dict('records')
    table = dash_table.DataTable(
        columns=columns,
        data=data,
        editable=False,
        filter_action="native",
        sort_action="native",
        sort_mode="multi",
        style_header={'backgroundColor': 'rgb(30, 30, 30)', 'textAlign': 'left'},
        style_table={
            'width': '100%',
            #'height': '70vh',
            'maxHeight': '70vh',
            #'overflowY': 'hidden'
        },
        fixed_rows={'headers': True, 'data': 0},
        style_cell_conditional=[
            {'if': {'column_id': 'Accounts'}, 'width': '50%'},
            {'if': {'column_id': 'sim_out'}, 'width': '25%'},
            {'if': {'column_id': 'sim_in'}, 'width': '25%'}],
        style_cell={
            'backgroundColor': 'rgb(50, 50, 50)',
            'color': 'white',
            'textAlign': 'left'
        },
    )
    return table