def test_symmetrize_df(graph_file):
    """Symmetrizing an edge list must keep every original edge intact."""
    gc.collect()
    edges = utils.read_csv_file(graph_file + '.csv')
    symmetrized = cugraph.symmetrize_df(edges, '0', '1')
    # Every (src, dst, weight) triple of the input must appear in the output.
    compare(
        edges['0'], edges['1'], edges['2'],
        symmetrized['0'], symmetrized['1'], symmetrized['2'],
    )
def test_symmetrize_df(graph_file):
    """Check that symmetrize_df preserves all edges from the source frame."""
    gc.collect()
    input_df = utils.read_csv_file(graph_file)
    output_df = cugraph.symmetrize_df(input_df, "0", "1")
    # The symmetrized frame must contain each original edge and its weight.
    compare(
        input_df["0"], input_df["1"], input_df["2"],
        output_df["0"], output_df["1"], output_df["2"],
    )
def test_symmetrize_bad_weights():
    """Symmetrize a tiny edge list where one edge pair has mismatched weights.

    Edge (0, 1) carries weight 1.0 while its reverse (1, 0) carries 2.0,
    so the weights are inconsistent across directions.
    """
    edge_data = {
        'src': [0, 0, 0, 0, 1, 2],
        'dst': [1, 2, 3, 4, 0, 3],
        'val': [1.0, 1.0, 1.0, 1.0, 2.0, 1.0],
    }
    pandas_df = pd.DataFrame(edge_data)
    gdf = cudf.DataFrame.from_pandas(pandas_df[['src', 'dst', 'val']])
    sym_df = cugraph.symmetrize_df(gdf, 'src', 'dst')
    compare(
        gdf['src'], gdf['dst'], gdf['val'],
        sym_df['src'], sym_df['dst'], sym_df['val'],
    )
def test_symmetrize_df(managed, pool, graph_file):
    """Symmetrize test parameterized over RMM memory configurations."""
    gc.collect()
    # Reconfigure the RMM allocator for this (managed, pool) combination.
    rmm.reinitialize(managed_memory=managed, pool_allocator=pool)
    assert rmm.is_initialized()
    source_df = utils.read_csv_file(graph_file + '.csv')
    result_df = cugraph.symmetrize_df(source_df, '0', '1')
    # The output must still contain every edge of the input.
    compare(
        source_df['0'], source_df['1'], source_df['2'],
        result_df['0'], result_df['1'], result_df['2'],
    )
def test_symmetrize_bad_weights():
    """Symmetrize an edge list whose reverse edge (1, 0) has a different
    weight (2.0) than the forward edge (0, 1) (1.0)."""
    columns = ["src", "dst", "val"]
    frame = pd.DataFrame(
        {
            "src": [0, 0, 0, 0, 1, 2],
            "dst": [1, 2, 3, 4, 0, 3],
            "val": [1.0, 1.0, 1.0, 1.0, 2.0, 1.0],
        }
    )
    gdf = cudf.DataFrame.from_pandas(frame[columns])
    sym_df = cugraph.symmetrize_df(gdf, "src", "dst")
    compare(
        gdf["src"],
        gdf["dst"],
        gdf["val"],
        sym_df["src"],
        sym_df["dst"],
        sym_df["val"],
    )
def cluster(
    X,
    n_neighbors=30,
    community="louvain",
    metric="euclidean",
    algorithm="brute",
    similarity="jaccard",
    min_size=10,
    distributed=False,
):
    """
    Clusters

    Parameters
    ----------
    X : cudf.DataFrame
        Input cell-by-feature dataframe.
    n_neighbors : int
        Number of neighbors for kNN.
    community: string
        Community detection algorithm to use. Default is 'louvain'.
        One of 'louvain', 'leiden', or 'ecg'.
    metric: string
        Distance metric to use for kNN. Currently, only 'euclidean' is
        supported.
    algorithm: string
        The query algorithm to use. Currently, only 'brute' is supported.
    similarity: string
        Similarity metric to use for neighbor edge refinement. Default is
        'jaccard'. Any value other than 'overlap' falls back to 'jaccard'.
    min_size: int
        Minimum cluster size.
    distributed: bool
        If True, use a multi-GPU dask cluster for kNN search.

    Returns
    -------
    communities: cudf.DataFrame
        Community labels.
    G: cugraph.Graph
        k-neighbors graph.
    Q: float
        Modularity score for detected communities. Q is None if
        community='ecg' is used (ECG does not report modularity).

    Raises
    ------
    ValueError
        If `community` is not one of the supported methods.
    """
    tic = time.time()

    # Go!
    idx = find_neighbors(X, n_neighbors, metric, algorithm, distributed)
    print(f"Neighbors computed in {time.time() - tic} seconds...")

    subtic = time.time()

    G = kneighbors_graph(idx, n_neighbors, X.shape[0])

    if similarity == "overlap":
        print("Computing overlap similarity...", flush=True)
        G = cugraph.overlap(G)
    else:
        # Any unrecognized similarity silently falls back to jaccard
        # (original behavior preserved).
        similarity = "jaccard"
        print("Computing Jaccard similarity...", flush=True)
        G = cugraph.jaccard(G)
    print(
        f"{similarity} graph constructed in {time.time() - subtic} seconds...",
        flush=True,
    )

    # Symmetrize the similarity edge list and rebuild the graph weighted by
    # the chosen similarity coefficient.
    g = cugraph.symmetrize_df(G, "source", "destination")
    G = cugraph.Graph()
    G.from_cudf_edgelist(g, edge_attr=f"{similarity}_coeff")
    del g

    # Run the requested community-detection method. The post-processing
    # (size-sorting, counting, reporting) is identical for all methods,
    # so only the partitioning step branches here.
    if community == "louvain":
        print("Running Louvain modularity optimization...", flush=True)
        parts, Q = cugraph.louvain(G, max_iter=1000)
    elif community == "leiden":
        print("Running Leiden modularity optimization...", flush=True)
        parts, Q = cugraph.leiden(G, max_iter=1000)
    elif community == "ecg":
        print("Running ECG...", flush=True)
        # ECG does not produce a modularity score.
        parts, Q = cugraph.ecg(G), None
    else:
        # Previously an unknown method silently fell through and returned
        # None; fail loudly instead so misconfiguration is caught early.
        raise ValueError(
            f"Unknown community detection method: {community!r}; "
            "expected 'louvain', 'leiden', or 'ecg'"
        )

    communities = sort_by_size(
        cp.asarray(parts.sort_values(by="vertex").partition), min_size)
    n_parts = cp.unique(communities).shape[0]

    print(f"grapheno completed in {time.time() - tic} seconds...", flush=True)
    print(f"Communities detected: {n_parts}", flush=True)
    if Q is not None:
        print(f"Modularity: {Q}", flush=True)

    return communities, G, Q