def test_overlap_multi_column(graph_file):
    """Overlap on a multi-column-vertex graph must match the single-column result.

    The second vertex column is a fixed offset of the first, so the
    multi-column graph is structurally identical to the single-column one
    and the coefficients must agree pair-for-pair.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000

    G1 = cugraph.Graph()
    G1.from_cudf_edgelist(
        cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"]
    )

    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]]
    vertex_pair = vertex_pair[:5]
    df_res = cugraph.overlap(G1, vertex_pair)

    G2 = cugraph.Graph()
    G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0")
    df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]])

    # Calculating mismatch.
    # Row order of independently computed result frames is not guaranteed,
    # so sort both on their source column before comparing positionally
    # (the multi-column result prefixes its columns, hence "0_source").
    actual = df_res.sort_values("0_source").reset_index(drop=True)
    expected = df_exp.sort_values("source").reset_index(drop=True)
    assert actual["overlap_coeff"].equals(expected["overlap_coeff"])
def test_overlap_multi_column(graph_file):
    """Compare overlap on a multi-column-vertex graph against the equivalent
    single-column graph; the coefficients must match pair-for-pair."""
    M = utils.read_csv_for_nx(graph_file)

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    # Second vertex column is a deterministic offset of the first, so both
    # graphs have identical structure.
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000

    multi_graph = cugraph.Graph()
    multi_graph.from_cudf_edgelist(
        cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"]
    )
    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]][:5]
    df_res = cugraph.overlap(multi_graph, vertex_pair)

    single_graph = cugraph.Graph()
    single_graph.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0")
    df_exp = cugraph.overlap(single_graph, vertex_pair[["src_0", "dst_0"]])

    # Calculating mismatch
    actual = df_res.sort_values("0_source").reset_index()
    expected = df_exp.sort_values("source").reset_index()
    assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"])
def cugraph_call(cu_M, pairs, edgevals=False):
    """Run cugraph.overlap on a DiGraph built from ``cu_M`` and return the
    coefficients, ordered by (source, destination), as a host array.

    ``edgevals=True`` attaches column "2" of ``cu_M`` as edge weights.
    """
    G = cugraph.DiGraph()
    # Device data
    kwargs = {"source": "0", "destination": "1"}
    if edgevals is True:
        kwargs["edge_attr"] = "2"
    G.from_cudf_edgelist(cu_M, **kwargs)

    # cugraph Overlap Call
    start = time.time()
    df = cugraph.overlap(G, pairs)
    elapsed = time.time() - start
    print("Time : " + str(elapsed))

    ordered = df.sort_values(by=["source", "destination"])
    return ordered["overlap_coeff"].to_array()
def cugraph_call(cu_M, pairs, edgevals=False):
    """Build a directed graph from ``cu_M``, compute overlap for ``pairs``,
    and return the sorted coefficients as a host array."""
    graph = cugraph.DiGraph()
    # Device data — include edge weights only when requested.
    if edgevals is True:
        graph.from_cudf_edgelist(cu_M, source='0', destination='1',
                                 edge_attr='2')
    else:
        graph.from_cudf_edgelist(cu_M, source='0', destination='1')

    # cugraph Overlap Call
    t_begin = time.time()
    result = cugraph.overlap(graph, pairs)
    print('Time : ' + str(time.time() - t_begin))

    result = result.sort_values(by=['source', 'destination'])
    return result['overlap_coeff'].to_array()
def cugraph_call(cu_M, first, second, edgevals=False):
    """Compute overlap coefficients for the (first, second) vertex pairs on a
    directed graph built from ``cu_M`` and return them as a host array.

    NOTE(review): uses the ``target=`` keyword rather than ``destination=`` —
    presumably an older cugraph API; confirm against the pinned version.
    """
    G = cugraph.DiGraph()
    # Device data
    if edgevals is True:
        G.from_cudf_edgelist(cu_M, source='0', target='1', edge_attr='2')
    else:
        G.from_cudf_edgelist(cu_M, source='0', target='1')

    # cugraph Overlap Call
    begin = time.time()
    df = cugraph.overlap(G, first, second)
    print('Time : ' + str(time.time() - begin))
    return df['overlap_coeff'].to_array()
def cugraph_call(cu_M, first, second, edgevals=False):
    """Compute overlap for the (first, second) vertex pairs using the legacy
    ``add_edge_list`` graph-construction API; returns a host array."""
    # Device data — weights from column "2" only when edgevals is requested.
    srcs = cu_M['0']
    dsts = cu_M['1']
    weights = None if edgevals is False else cu_M['2']

    G = cugraph.Graph()
    G.add_edge_list(srcs, dsts, weights)

    # cugraph Overlap Call
    t0 = time.time()
    df = cugraph.overlap(G, first, second)
    print('Time : ' + str(time.time() - t0))
    return df['overlap_coeff'].to_array()
def overlap_baseline_test(G, sources, destinations, labels, num_positive):
    """Score overlap-based link prediction: take the ``num_positive`` pairs
    with the largest coefficients and print their F1 score against the
    ground-truth ``labels``."""
    overlap_df = cugraph.overlap(G, first=sources, second=destinations)
    # Drop pairs with undefined coefficients before ranking on the host.
    results = overlap_df.to_pandas().dropna()
    top_n = results.nlargest(num_positive, "overlap_coeff")
    print("Overlap", calculate_f1_score(top_n, sources, destinations, labels))
def _build_similarity_graph(idx, n_neighbors, n_cells, similarity):
    """Build the kNN graph and refine its edges with a similarity coefficient.

    Any ``similarity`` other than 'overlap' falls back to 'jaccard'.
    Returns an undirected cugraph.Graph weighted by the chosen coefficient.
    """
    subtic = time.time()
    G = kneighbors_graph(idx, n_neighbors, n_cells)
    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)
    else:
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)
    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )
    # Symmetrize so the coefficient graph is undirected before reloading it.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g
    return G


def _labels_by_size(parts, min_size):
    # Convert the per-vertex partition assignment to labels ordered by
    # community size; sort_by_size is defined elsewhere in this module.
    return sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)


def _report(communities, tic, Q=None):
    # End-of-run reporting shared by every community-detection branch.
    # Q is omitted for methods (e.g. ECG) that do not report modularity.
    n_parts = cp.unique(communities).shape[0]
    print(f"grapheno completed in {time.time() - tic} seconds...", flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if Q is not None:
        print(f"Modularity: {Q}", flush=True)


def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Clusters

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use. Default is 'louvain'.
    metric: string
        Distance metric to use for kNN. Currently, only 'euclidean' is
        supported.
    algorithm: string
        The query algorithm to use. Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement. Default is
        'jaccard'; anything else falls back to 'jaccard' except 'overlap'.
    min_size: int
        Minimum cluster size.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities: cudf.DataFrame
        Community labels.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float
        Modularity score for detected communities. Q is not returned
        if community='ecg' is used.
    """
    tic = time.time()

    # Go!
    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)
    print(f"Neighbors computed in {time.time() - tic} seconds...")

    G = _build_similarity_graph(idx, n_neighbors, X.shape[0], similarity)

    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
        communities = _labels_by_size(parts, min_size)
        _report(communities, tic, Q)
        return communities, G, Q
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
        communities = _labels_by_size(parts, min_size)
        _report(communities, tic, Q)
        return communities, G, Q
    elif community == "ecg":
        print("Running ECG...", flush=True)
        parts = cugraph.ecg(G)
        communities = _labels_by_size(parts, min_size)
        _report(communities, tic)
        return communities, G, None
    # Insert any community/clustering method...
    elif community == "your favorite method":
        # Placeholder; unrecognized methods return None implicitly,
        # matching the original behavior.
        pass