def test_jaccard_multi_column(graph_file):
    """
    Compare Jaccard on a graph with two-column vertex ids against the
    equivalent single-column graph.

    The second id column is the first one shifted by 1000, and the first
    five edges are used as the vertex pairs to score.
    """
    gc.collect()

    edges = utils.read_csv_for_nx(graph_file)

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(edges["0"])
    cu_M["dst_0"] = cudf.Series(edges["1"])
    # Derive the second id column from the first with a constant offset.
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000

    multi_graph = cugraph.Graph()
    multi_graph.from_cudf_edgelist(
        cu_M,
        source=["src_0", "src_1"],
        destination=["dst_0", "dst_1"],
    )

    # Only the first five edges are scored.
    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]][:5]
    df_res = cugraph.jaccard(multi_graph, vertex_pair)

    single_graph = cugraph.Graph()
    single_graph.from_cudf_edgelist(
        cu_M, source="src_0", destination="dst_0"
    )
    single_pairs = vertex_pair[["src_0", "dst_0"]]
    df_exp = cugraph.jaccard(single_graph, single_pairs)

    # Calculating mismatch
    result_coeff = df_res["jaccard_coeff"]
    expected_coeff = df_exp["jaccard_coeff"]
    assert result_coeff.equals(expected_coeff)
def test_jaccard_multi_column(read_csv):
    """
    Compare Jaccard on a graph with two-column vertex ids against the
    equivalent single-column graph.

    ``read_csv`` is unpacked as a two-element value; only the first
    element (the edge list) is used. Both results are sorted by their
    source column before the coefficients are compared.
    """
    edges, _ = read_csv

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(edges["0"])
    cu_M["dst_0"] = cudf.Series(edges["1"])
    # Derive the second id column from the first with a constant offset.
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000

    multi_graph = cugraph.Graph()
    multi_graph.from_cudf_edgelist(
        cu_M,
        source=["src_0", "src_1"],
        destination=["dst_0", "dst_1"],
    )

    # Only the first five edges are scored.
    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]][:5]
    df_res = cugraph.jaccard(multi_graph, vertex_pair)

    single_graph = cugraph.Graph()
    single_graph.from_cudf_edgelist(
        cu_M, source="src_0", destination="dst_0"
    )
    single_pairs = vertex_pair[["src_0", "dst_0"]]
    df_exp = cugraph.jaccard(single_graph, single_pairs)

    # Calculating mismatch
    actual = df_res.sort_values("0_source").reset_index()
    expected = df_exp.sort_values("source").reset_index()
    assert_series_equal(
        actual["jaccard_coeff"], expected["jaccard_coeff"]
    )
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    """
    Compare cugraph and networkx Jaccard coefficients on the two-hop
    neighbor pairs of a weighted graph, under a given RMM configuration.

    Parameters
    ----------
    managed : bool
        Passed to ``rmm.reinitialize`` as ``managed_memory``.
    pool : bool
        Passed to ``rmm.reinitialize`` as ``pool_allocator``.
    graph_file : str
        Path of the edge list file to load.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(
        M, source='0', target='1', edge_attr='weight',
        create_using=nx.Graph()
    )
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    # Sort the pairs and reset the index so that the networkx coefficients
    # (computed in pair order) line up row by row with the sorted cugraph
    # result below.
    pairs = (
        G.get_two_hop_neighbors()
        .sort_values(['first', 'second'])
        .reset_index(drop=True)
    )
    firsts = pairs['first']
    seconds = pairs['second']
    nx_pairs = [
        (firsts.iloc[i], seconds.iloc[i]) for i in range(len(pairs))
    ]
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = [p for _, _, p in preds]

    df = cugraph.jaccard(G, pairs)
    # Reset the index after sorting and read positionally; a label lookup
    # on the sorted frame would return the rows in their pre-sort order.
    df = df.sort_values(by=['source', 'destination']).reset_index(drop=True)

    assert len(nx_coeff) == len(df)
    coeffs = df['jaccard_coeff']
    for i, expected in enumerate(nx_coeff):
        diff = abs(expected - coeffs.iloc[i])
        assert diff < 1.0e-6
def predict(self, first, second):
    """
    Score the given vertex pairs with Jaccard and compare the result
    against the stored threshold.

    ``first`` and ``second`` are cast to int32 before the call. The
    comparison is applied to whatever ``cugraph.jaccard`` returns.
    """
    # NOTE(review): the threshold comparison is applied to the entire
    # jaccard result, not just a coefficient column -- confirm intended.
    first_ids = first.astype("int32")
    second_ids = second.astype("int32")
    scores = cugraph.jaccard(self._G, first_ids, second_ids)
    return scores > self._threshold
def test_jaccard_two_hop_edge_vals(graph_file):
    """
    Compare cugraph and networkx Jaccard coefficients on the sorted
    two-hop neighbor pairs of a graph built with edge values.
    """
    gc.collect()

    nx_edges = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(
        nx_edges,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    # Sorted pairs with a fresh index, so both libraries see the same order.
    two_hop = G.get_two_hop_neighbors()
    pairs = two_hop.sort_values(["first", "second"]).reset_index(drop=True)

    firsts = pairs["first"]
    seconds = pairs["second"]
    nx_pairs = [
        (firsts.iloc[row], seconds.iloc[row]) for row in range(len(pairs))
    ]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)

    assert len(nx_coeff) == len(df)
    cu_coeff = df["jaccard_coeff"]
    for row, expected in enumerate(nx_coeff):
        assert abs(expected - cu_coeff.iloc[row]) < 1.0e-6
def test_jaccard_two_hop(managed, pool, graph_file):
    """
    Compare cugraph and networkx Jaccard coefficients on two-hop neighbor
    pairs of an adjacency-list graph, after re-initializing RMM with the
    requested ``managed`` / ``pool`` settings.
    """
    gc.collect()

    # Restart RMM with the configuration under test.
    rmm.finalize()
    rmm_cfg.use_managed_memory = managed
    rmm_cfg.use_pool_allocator = pool
    rmm.initialize()
    assert rmm.is_initialized()

    csr = read_mtx_file(graph_file).tocsr()

    Gnx = nx.DiGraph(csr).to_undirected()
    G = cugraph.Graph()
    offsets = cudf.Series(csr.indptr)
    indices = cudf.Series(csr.indices)
    G.add_adj_list(offsets, indices, None)

    pairs = G.get_two_hop_neighbors()
    firsts = pairs["first"]
    seconds = pairs["second"]
    nx_pairs = [(firsts[row], seconds[row]) for row in range(len(pairs))]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, firsts, seconds)

    assert len(nx_coeff) == len(df)
    cu_coeff = df["jaccard_coeff"]
    for row in range(len(df)):
        assert abs(nx_coeff[row] - cu_coeff[row]) < 1.0e-6
def test_jaccard_two_hop(graph_file):
    """
    Compare cugraph and networkx Jaccard coefficients on the two-hop
    neighbor pairs of an unweighted graph.

    Parameters
    ----------
    graph_file : str
        Path of the edge list file to load.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    Gnx = nx.from_pandas_edgelist(
        M, source='0', target='1', create_using=nx.Graph()
    )
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')

    # The cugraph result is sorted by (source, destination) below, so the
    # pairs handed to networkx must be in that same order; otherwise the
    # positional comparison pairs up coefficients of different edges.
    pairs = (
        G.get_two_hop_neighbors()
        .sort_values(['first', 'second'])
        .reset_index(drop=True)
    )
    firsts = pairs['first']
    seconds = pairs['second']
    nx_pairs = [
        (firsts.iloc[i], seconds.iloc[i]) for i in range(len(pairs))
    ]
    preds = nx.jaccard_coefficient(Gnx, nx_pairs)
    nx_coeff = [p for _, _, p in preds]

    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=['source', 'destination']).reset_index(drop=True)

    assert len(nx_coeff) == len(df)
    coeffs = df['jaccard_coeff']
    for i, expected in enumerate(nx_coeff):
        diff = abs(expected - coeffs.iloc[i])
        assert diff < 1.0e-6
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    """
    Compare cugraph and networkx Jaccard coefficients on two-hop neighbor
    pairs of an adjacency-list graph that carries edge values, under the
    requested RMM configuration.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )
    assert rmm.is_initialized()

    # NOTE(review): the loaded object must provide ``tocsr()`` -- confirm
    # that ``utils.read_csv_for_nx`` returns a sparse matrix here.
    csr = utils.read_csv_for_nx(graph_file).tocsr()

    Gnx = nx.DiGraph(csr).to_undirected()
    G = cugraph.Graph()
    offsets = cudf.Series(csr.indptr)
    indices = cudf.Series(csr.indices)
    weights = cudf.Series(csr.data)
    G.from_cudf_adjlist(offsets, indices, weights)

    pairs = G.get_two_hop_neighbors()
    firsts = pairs["first"]
    seconds = pairs["second"]
    nx_pairs = [(firsts[row], seconds[row]) for row in range(len(pairs))]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, firsts, seconds)

    assert len(nx_coeff) == len(df)
    cu_coeff = df["jaccard_coeff"]
    for row in range(len(df)):
        assert abs(nx_coeff[row] - cu_coeff[row]) < 1.0e-6
def cugraph_call(cu_M, edgevals=False):
    """
    Build a cugraph Graph from the edge list in ``cu_M``, run Jaccard on
    it and print the elapsed time of the Jaccard call.

    Columns '0' and '1' are used as sources and destinations. Column '2'
    supplies edge values unless ``edgevals`` is exactly ``False``.

    Returns a tuple of arrays: (source, destination, jaccard_coeff).
    """
    # Device data
    src_col = cu_M['0']
    dst_col = cu_M['1']
    edge_values = None if edgevals is False else cu_M['2']

    G = cugraph.Graph()
    G.add_edge_list(src_col, dst_col, edge_values)

    # cugraph Jaccard Call
    start = time.time()
    df = cugraph.jaccard(G)
    elapsed = time.time() - start
    print('Time : ' + str(elapsed))

    return (
        df['source'].to_array(),
        df['destination'].to_array(),
        df['jaccard_coeff'].to_array(),
    )
def cugraph_call(cu_M, edgevals=False):
    """
    Build a cugraph Graph from the edge list in ``cu_M``, run Jaccard on
    it and print the elapsed time of the Jaccard call.

    Parameters
    ----------
    cu_M : cudf.DataFrame
        Edge list with source in column '0' and destination in column '1'.
        Column '2' is used as the edge attribute when ``edgevals`` is True.
    edgevals : bool
        Only the exact value ``True`` enables edge attributes.

    Returns
    -------
    tuple
        Arrays (source, destination, jaccard_coeff) taken from the result.
    """
    G = cugraph.Graph()
    if edgevals is True:
        G.from_cudf_edgelist(cu_M, source='0', destination='1',
                             edge_attr='2')
    else:
        G.from_cudf_edgelist(cu_M, source='0', destination='1')

    # cugraph Jaccard Call
    t1 = time.time()
    df = cugraph.jaccard(G)
    t2 = time.time() - t1
    print('Time : ' + str(t2))
    # The debug dump of the whole result frame was removed: it only
    # flooded the test output.

    return (
        df['source'].to_array(),
        df['destination'].to_array(),
        df['jaccard_coeff'].to_array(),
    )
def train(self, val_edges, val_edges_false):
    """
    Choose the Jaccard threshold that maximizes ROC AUC on a validation
    set and store it in ``self._threshold``.

    ``val_edges`` are labelled 1 and ``val_edges_false`` 0. Candidate
    thresholds run from 0.0 to 0.99 in steps of 0.01. Returns ``self``.
    """
    positives = [tuple(edge) for edge in val_edges]
    negatives = [tuple(edge) for edge in val_edges_false]
    validation_set = np.array(positives + negatives)

    first_ids = cudf.Series(validation_set[:, 0]).astype("int32")
    second_ids = cudf.Series(validation_set[:, 1]).astype("int32")
    surface = cugraph.jaccard(self._G, first=first_ids, second=second_ids)

    # Ground-truth labels in the same order as ``validation_set``.
    actual = np.array([1] * len(val_edges) + [0] * len(val_edges_false))

    def auc_at(threshold):
        # Column index 2 of the result is compared against the threshold.
        predicted = surface.iloc[:, 2] > threshold
        return roc_auc_score(actual, predicted)

    candidates = np.arange(0.0, 1.0, 0.01)
    self._threshold = max(candidates, key=auc_at)
    return self
def compare_jaccard_two_hop(G, Gnx):
    """
    Run Jaccard in both cugraph and networkx on the two-hop neighbor
    pairs of ``G`` and assert that the coefficients agree to within 1e-6.
    """
    # Sorted pairs with a fresh index, so both libraries see the same order.
    two_hop = G.get_two_hop_neighbors()
    pairs = two_hop.sort_values(["first", "second"]).reset_index(drop=True)

    nx_pairs = list(pairs.to_records(index=False))
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, pairs)
    df = df.sort_values(by=["source", "destination"]).reset_index(drop=True)

    assert len(nx_coeff) == len(df)
    cu_coeff = df["jaccard_coeff"]
    for row, expected in enumerate(nx_coeff):
        assert abs(expected - cu_coeff.iloc[row]) < 1.0e-6
def cugraph_call(cu_M, edgevals=False):
    """
    Build a cugraph Graph from ``cu_M``, run Jaccard, print the elapsed
    time of the Jaccard call and return the result sorted by
    (source, destination).

    Column '2' is used as the edge attribute only when ``edgevals`` is
    exactly ``True``.

    Returns a tuple of arrays: (source, destination, jaccard_coeff).
    """
    edge_kwargs = {"source": "0", "destination": "1"}
    if edgevals is True:
        edge_kwargs["edge_attr"] = "2"

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, **edge_kwargs)

    # cugraph Jaccard Call
    start = time.time()
    df = cugraph.jaccard(G)
    elapsed = time.time() - start
    print("Time : " + str(elapsed))

    ordered = df.sort_values(["source", "destination"]).reset_index(
        drop=True
    )
    columns = ("source", "destination", "jaccard_coeff")
    return tuple(ordered[name].to_array() for name in columns)
def test_jaccard_two_hop(graph_file):
    """
    Compare cugraph and networkx Jaccard coefficients on two-hop neighbor
    pairs of a graph loaded from a matrix file as an adjacency list.
    """
    csr = read_mtx_file(graph_file).tocsr()

    Gnx = nx.DiGraph(csr).to_undirected()
    G = cugraph.Graph()
    offsets = cudf.Series(csr.indptr)
    indices = cudf.Series(csr.indices)
    G.add_adj_list(offsets, indices, None)

    pairs = G.get_two_hop_neighbors()
    firsts = pairs["first"]
    seconds = pairs["second"]
    nx_pairs = [(firsts[row], seconds[row]) for row in range(len(pairs))]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, firsts, seconds)

    assert len(nx_coeff) == len(df)
    cu_coeff = df["jaccard_coeff"]
    for row in range(len(df)):
        assert abs(nx_coeff[row] - cu_coeff[row]) < 1.0e-6
def cugraph_call(cu_M, edgevals=False):
    """
    Build a cugraph DiGraph from ``cu_M``, run Jaccard on it and print
    the elapsed time of the Jaccard call.

    Column '2' is used as the edge attribute only when ``edgevals`` is
    exactly ``True``.

    Returns a tuple of arrays: (source, destination, jaccard_coeff).
    """
    edge_kwargs = {'source': '0', 'target': '1'}
    if edgevals is True:
        edge_kwargs['edge_attr'] = '2'

    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, **edge_kwargs)

    # cugraph Jaccard Call
    start = time.time()
    df = cugraph.jaccard(G)
    elapsed = time.time() - start
    print('Time : '+str(elapsed))

    return (
        df['source'].to_array(),
        df['destination'].to_array(),
        df['jaccard_coeff'].to_array(),
    )
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Cluster the rows of ``X`` by community detection on a kNN graph whose
    edges are weighted by a neighborhood similarity coefficient.

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use: 'louvain', 'leiden' or
        'ecg'. Default is 'louvain'.
    metric: string
        Distance metric to use for kNN.
        Currently, only 'euclidean' is supported.
    algorithm: string
        The query algorithm to use.
        Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement.
        'overlap' selects overlap similarity; any other value falls back
        to 'jaccard'. Default is 'jaccard'.
    min_size: int
        Minimum cluster size, forwarded to ``sort_by_size``.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities:
        Community labels, as produced by ``sort_by_size``.
    G: cugraph.Graph
        Symmetrized k-neighbors graph weighted by the similarity
        coefficient.
    Q: float or None
        Modularity score for detected communities. ``None`` when
        community='ecg' is used.

    Raises
    ------
    ValueError
        If ``community`` is not a supported method. Raised before any
        computation starts.
    """
    # Fail fast: without this check an unknown method would run the whole
    # kNN / graph pipeline and then silently return None.
    # To add a community/clustering method, list it here and add a branch
    # in the dispatch further down.
    supported = ("louvain", "leiden", "ecg")
    if community not in supported:
        raise ValueError(
            f"Unsupported community detection method {community!r}; "
            f"expected one of {supported}."
        )

    tic = time.time()
    # Go!

    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)

    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)
    else:
        # Normalize the name: it is reused for the log message and for the
        # edge attribute column below.
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)

    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
    else:
        # community == "ecg": no modularity score is produced here.
        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        Q = None

    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)

    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...", flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if community != "ecg":
        print(f"Modularity: {Q}", flush=True)

    return communities, G, Q
def jaccard_baseline_test(G, sources, destinations, labels, num_positive):
    """
    Score the given vertex pairs with Jaccard, keep the ``num_positive``
    highest-scoring rows, and print the F1 score computed for them by
    ``calculate_f1_score``.

    Rows containing missing values are dropped before ranking.
    """
    scored = cugraph.jaccard(G, first=sources, second=destinations)
    results = scored.to_pandas().dropna()
    top_n = results.nlargest(num_positive, "jaccard_coeff")
    f1 = calculate_f1_score(top_n, sources, destinations, labels)
    print("Jaccard", f1)