def test_dask_mg_degree(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = cugraph.dask.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    merge_df = (
        dg.in_degree()
        .merge(g.in_degree(), on="vertex", suffixes=["_dg", "_g"])
        .compute()
    )

    assert merge_df["degree_dg"].equals(merge_df["degree_g"])

def test_katz_centrality_multi_column(graph_file):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True)
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000

    G1 = cugraph.DiGraph()
    G1.from_cudf_edgelist(
        cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"]
    )

    G2 = cugraph.DiGraph()
    G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0")

    k_df_exp = cugraph.katz_centrality(G2, alpha=None, max_iter=1000)
    k_df_exp = k_df_exp.sort_values("vertex").reset_index(drop=True)

    nstart = cudf.DataFrame()
    nstart["vertex_0"] = k_df_exp["vertex"]
    nstart["vertex_1"] = nstart["vertex_0"] + 1000
    nstart["values"] = k_df_exp["katz_centrality"]

    k_df_res = cugraph.katz_centrality(
        G1, nstart=nstart, alpha=None, max_iter=1000
    )
    k_df_res = k_df_res.sort_values("0_vertex").reset_index(drop=True)
    k_df_res.rename(columns={"0_vertex": "vertex"}, inplace=True)

    top_res = topKVertices(k_df_res, "katz_centrality", 10)
    top_exp = topKVertices(k_df_exp, "katz_centrality", 10)

    assert top_res.equals(top_exp)

def test_cugraph_edge_map_to_cugraph_edge_set():
    """
              +-+
     ------>  |1|
     |        +-+
     |         |
     9         6
     |         |
     |         v
    +-+ <-8-  +-+        +-+
    |0|       |2|  <-5-  |3|
    +-+ -7->  +-+        +-+
    """
    dpr = mg.resolver
    sources = [0, 0, 1, 2, 3]
    destinations = [1, 2, 2, 0, 2]
    weights = [9, 7, 6, 8, 5]
    gdf = cudf.DataFrame(
        {"Source": sources, "Destination": destinations, "Weight": weights}
    )
    g_x = cugraph.DiGraph()
    g_x.from_cudf_edgelist(
        gdf, source="Source", destination="Destination", edge_attr="Weight"
    )
    x = dpr.wrappers.EdgeMap.CuGraphEdgeMap(g_x)

    g_intermediate = cugraph.DiGraph()
    g_intermediate.from_cudf_edgelist(
        gdf, source="Source", destination="Destination"
    )
    intermediate = dpr.wrappers.EdgeSet.CuGraphEdgeSet(g_intermediate)

    y = dpr.translate(x, CuGraphEdgeSet)
    dpr.assert_equal(y, intermediate)
    assert len(dpr.plan.translate(x, CuGraphEdgeSet)) == 1

def test_multi_column_unrenumbering(managed, pool, graph_file):
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )

    assert rmm.is_initialized()

    translate = 100
    cu_M = utils.read_csv_file(graph_file)
    cu_M["00"] = cu_M["0"] + translate
    cu_M["11"] = cu_M["1"] + translate

    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, ["0", "00"], ["1", "11"])

    result_multi = (
        cugraph.pagerank(G).sort_values(by="0").reset_index(drop=True)
    )

    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, "0", "1")
    result_single = cugraph.pagerank(G)

    result_exp = cudf.DataFrame()
    result_exp["0"] = result_single["vertex"]
    result_exp["1"] = result_single["vertex"] + translate
    result_exp["pagerank"] = result_single["pagerank"]

    assert result_multi.equals(result_exp)

def test_modularity_clustering_with_edgevals(graph_file, partitions):
    # Read in the graph and get a cugraph object
    M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=False)
    M = M.tocsr().sorted_indices()
    cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)

    row_offsets = cudf.Series(M.indptr)
    col_indices = cudf.Series(M.indices)
    val = cudf.Series(M.data)

    G_adj = cugraph.DiGraph()
    G_adj.from_cudf_adjlist(row_offsets, col_indices, val)
    G_edge = cugraph.DiGraph()
    G_edge.from_cudf_edgelist(
        cu_M, source="0", destination="1", edge_attr="2"
    )

    # Get the modularity score for partitioning versus random assignment
    cu_vid, cu_score = cugraph_call(G_adj, partitions)
    rand_vid, rand_score = random_call(G_adj, partitions)

    # Assert that the partitioning has better modularity than the random
    # assignment
    assert cu_score < rand_score

    # Get the modularity score for partitioning versus random assignment
    cu_vid, cu_score = cugraph_call(G_edge, partitions)
    rand_vid, rand_score = random_call(G_edge, partitions)

    # Assert that the partitioning has better modularity than the random
    # assignment
    assert cu_score < rand_score

def test_dask_katz_centrality(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree")
    largest_out_degree = largest_out_degree["out_degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    from cugraph.tests import utils

    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)

    import pandas as pd

    pdf = pd.DataFrame(nk.items(), columns=["vertex", "katz_centrality"])
    exp_res = cudf.DataFrame(pdf)

    err = 0
    tol = 1.0e-05
    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )
    for i in range(len(compare_res)):
        diff = abs(
            compare_res["katz_centrality_local"].iloc[i]
            - compare_res["katz_centrality_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def test_dask_bfs(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        temp_df = cudf.DataFrame()
        temp_df["src"] = df["src"] + 1000
        temp_df["dst"] = df["dst"] + 1000
        temp_df["value"] = df["value"]
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (
            compare_dist["distance_local"].iloc[i]
            != compare_dist["distance_dask"].iloc[i]
        ):
            err = err + 1
    assert err == 0

def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data and personalize
    personalization = None
    if personalization_perc != 0:
        dg.compute_local_data(by="dst")
        personalization = personalize(
            dg.number_of_vertices(), personalization_perc
        )

    expected_pr = cugraph.pagerank(
        g, personalization=personalization, tol=1e-6
    )
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )

    for i in range(len(compare_pr)):
        diff = abs(
            compare_pr["pagerank_local"].iloc[i]
            - compare_pr["pagerank_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(
        g, personalization=personalization, tol=1e-6
    )
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )

    for i in range(len(compare_pr)):
        diff = abs(
            compare_pr["pagerank_local"].iloc[i]
            - compare_pr["pagerank_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def test_dask_bfs_multi_column_depthlimit(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    ddf["src_b"] = ddf["src_a"] + 1000
    ddf["dst_b"] = ddf["dst_a"] + 1000

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df["src_b"] = df["src_a"] + 1000
    df["dst_b"] = df["dst_a"] + 1000

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, ["src_a", "src_b"], ["dst_a", "dst_b"])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, ["src_a", "src_b"], ["dst_a", "dst_b"])

    start = cudf.DataFrame()
    start["a"] = [0]
    start["b"] = [1000]

    depth_limit = 18
    expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit)
    result_dist = dcg.bfs(dg, start, depth_limit=depth_limit)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist,
        on=["0_vertex", "1_vertex"],
        suffixes=["_local", "_dask"],
    )

    err = 0
    for i in range(len(compare_dist)):
        if (
            compare_dist["distance_local"].iloc[i] <= depth_limit
            and compare_dist["distance_dask"].iloc[i] <= depth_limit
            and compare_dist["distance_local"].iloc[i]
            != compare_dist["distance_dask"].iloc[i]
        ):
            err = err + 1
    assert err == 0

def test_dask_pagerank(dask_client, personalization_perc):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(
        g, personalization=personalization, tol=1e-6
    )
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )

    for i in range(len(compare_pr)):
        diff = abs(
            compare_pr["pagerank_local"].iloc[i]
            - compare_pr["pagerank_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def test_dask_pagerank(client_connection):
    gc.collect()

    pandas.set_option("display.max_rows", 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data
    # dg.compute_local_data(by='dst')

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg)

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )

    for i in range(len(compare_pr)):
        diff = abs(
            compare_pr["pagerank_local"].iloc[i]
            - compare_pr["pagerank_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0

def construct_graph(dask_dataframe, symmetric=False):
    """
    dask_dataframe contains weighted and undirected edges with self
    loops. Multiple edges will likely be present as well. The returned
    Graph object must be symmetrized and have self loops removed.
    """
    G = cugraph.DiGraph()
    if len(dask_dataframe.columns) > 2:
        if symmetric:
            # Symmetrize the weighted dask dataframe
            dask_dataframe = symmetrize_ddf(
                dask_dataframe, "src", "dst", "weight"
            )
        G.from_dask_cudf_edgelist(
            dask_dataframe, source="src", destination="dst",
            edge_attr="weight"
        )
    else:
        if symmetric:
            # Symmetrize the unweighted dask dataframe
            dask_dataframe = symmetrize_ddf(dask_dataframe, "src", "dst")
        G.from_dask_cudf_edgelist(
            dask_dataframe, source="src", destination="dst"
        )
    return G

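# A minimal usage sketch for construct_graph above (illustrative only: the
# dataset path is hypothetical, and the "src"/"dst"/"weight" column names
# must match what symmetrize_ddf and from_dask_cudf_edgelist expect here).
def example_construct_graph(input_data_path="../datasets/karate.csv"):
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "weight"],
        dtype=["int32", "int32", "float32"],
    )
    # Build a symmetrized, weighted graph from the edge list
    return construct_graph(ddf, symmetric=True)
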
def test_to_undirected(graph_file):
    # Read data and then convert to directed by dropping some edges
    cu_M = utils.read_csv_file(graph_file)
    cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True)

    M = utils.read_csv_for_nx(graph_file)
    M = M[M["0"] <= M["1"]]
    assert len(cu_M) == len(M)

    # cugraph add_edge_list
    DiG = cugraph.DiGraph()
    DiG.from_cudf_edgelist(cu_M, source="0", destination="1")
    DiGnx = nx.from_pandas_edgelist(
        M, source="0", target="1", create_using=nx.DiGraph()
    )

    for index, row in cu_M.to_pandas().iterrows():
        assert DiG.has_edge(row["0"], row["1"])
        assert not DiG.has_edge(row["1"], row["0"])

    G = DiG.to_undirected()
    Gnx = DiGnx.to_undirected()

    assert G.number_of_nodes() == Gnx.number_of_nodes()
    assert G.number_of_edges() == Gnx.number_of_edges()

    for index, row in cu_M.to_pandas().iterrows():
        assert G.has_edge(row["0"], row["1"])
        assert G.has_edge(row["1"], row["0"])

def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file):
    Mnx = utils.read_csv_for_nx(graph_file)
    df = cudf.DataFrame()
    df["src"] = cudf.Series(Mnx["0"])
    df["dst"] = cudf.Series(Mnx["1"])

    N = max(max(Mnx["0"]), max(Mnx["1"])) + 1
    Mcsr = scipy.sparse.csr_matrix(
        (Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)
    )
    offsets = cudf.Series(Mcsr.indptr)
    indices = cudf.Series(Mcsr.indices)

    G = cugraph.DiGraph()

    # If cugraph has at least one graph representation, adding a new graph
    # should fail to prevent a single graph object storing two different
    # graphs.

    # If cugraph has a graph edge list, adding a new graph should fail.
    G.from_cudf_edgelist(df, source="src", destination="dst")
    with pytest.raises(Exception):
        G.from_cudf_edgelist(df, source="src", destination="dst")
    with pytest.raises(Exception):
        G.from_cudf_adjlist(offsets, indices, None)
    G.delete_edge_list()

    # If cugraph has a graph adjacency list, adding a new graph should fail.
    G.from_cudf_adjlist(offsets, indices, None)
    with pytest.raises(Exception):
        G.from_cudf_edgelist(df, source="src", destination="dst")
    with pytest.raises(Exception):
        G.from_cudf_adjlist(offsets, indices, None)
    G.delete_adj_list()

def test_filter_unreachable(graph_file, source):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)

    print("sources size = " + str(len(cu_M)))
    print("destinations size = " + str(len(cu_M)))

    # cugraph SSSP Call
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    print("cugraph Solving... ")
    t1 = time.time()

    df = cugraph.sssp(G, source)

    t2 = time.time() - t1
    print("Time : " + str(t2))

    reachable_df = cugraph.filter_unreachable(df)

    if np.issubdtype(df["distance"].dtype, np.integer):
        inf = np.iinfo(reachable_df["distance"].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0
    elif np.issubdtype(df["distance"].dtype, np.inexact):
        inf = np.finfo(reachable_df["distance"].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0

    assert len(reachable_df) != 0

def test_unweighted_directed_networkx_to_cugraph():
    """
              +-+
     ------>  |1|
     |        +-+
     |         |
     |         |
     |         v
    +-+  <--  +-+        +-+
    |0|       |2|  <--   |3|
    +-+  -->  +-+        +-+
    """
    dpr = mg.resolver
    networkx_graph_data = [
        (0, 1),
        (0, 2),
        (2, 0),
        (1, 2),
        (3, 2),
    ]
    networkx_graph_unwrapped = nx.DiGraph()
    networkx_graph_unwrapped.add_edges_from(networkx_graph_data)
    x = dpr.wrappers.Graph.NetworkXGraph(networkx_graph_unwrapped)

    sources = [0, 0, 1, 2, 3]
    destinations = [1, 2, 2, 0, 2]
    cdf = cudf.DataFrame({"source": sources, "destination": destinations})
    g = cugraph.DiGraph()
    g.from_cudf_edgelist(cdf, source="source", destination="destination")
    intermediate = dpr.wrappers.Graph.CuGraph(g, None)

    y = dpr.translate(x, CuGraph)
    dpr.assert_equal(y, intermediate)
    assert len(dpr.plan.translate(x, CuGraph)) == 1

def test_pandas_edge_set_to_cugraph_edge_set():
    """
              +-+
     ------>  |1|
     |        +-+
     |         |
     |         |
     |         v
    +-+  <--  +-+        +-+
    |0|       |2|  <--   |3|
    +-+  -->  +-+        +-+
    """
    dpr = mg.resolver
    pdf = pd.DataFrame({"src": (0, 0, 2, 1, 3), "dst": (1, 2, 0, 2, 2)})
    x = dpr.wrappers.EdgeSet.PandasEdgeSet(
        pdf, src_label="src", dst_label="dst", is_directed=True
    )

    sources = [0, 0, 1, 2, 3]
    destinations = [1, 2, 2, 0, 2]
    cdf = cudf.DataFrame({"source": sources, "destination": destinations})
    g = cugraph.DiGraph()
    g.from_cudf_edgelist(cdf, source="source", destination="destination")
    intermediate = dpr.wrappers.EdgeSet.CuGraphEdgeSet(g)

    y = dpr.translate(x, CuGraphEdgeSet)
    dpr.assert_equal(y, intermediate)
    assert len(dpr.plan.translate(x, CuGraphEdgeSet)) == 1

def test_scipy_edge_set_to_cugraph_edge_set():
    """
              +-+
     ------>  |1|
     |        +-+
     |         |
     |         |
     |         v
    +-+  <--  +-+        +-+
    |0|       |2|  <--   |3|
    +-+  -->  +-+        +-+
    """
    dpr = mg.resolver
    scipy_sparse_matrix = ss.csr_matrix(
        np.array(
            [
                [0, 1, 1, 0],
                [0, 0, 1, 0],
                [1, 0, 0, 0],
                [0, 0, 1, 0],
            ]
        )
    )
    x = dpr.wrappers.EdgeSet.ScipyEdgeSet(scipy_sparse_matrix)

    sources = [0, 0, 1, 2, 3]
    destinations = [1, 2, 2, 0, 2]
    cdf = cudf.DataFrame({"Source": sources, "Destination": destinations})
    g = cugraph.DiGraph()
    g.from_cudf_edgelist(cdf, source="Source", destination="Destination")
    intermediate = dpr.wrappers.EdgeSet.CuGraphEdgeSet(g)

    y = dpr.translate(x, CuGraphEdgeSet)
    dpr.assert_equal(y, intermediate)
    assert len(dpr.plan.translate(x, CuGraphEdgeSet)) == 1

def cugraph_call(cu_M, source, edgevals=False):
    G = cugraph.DiGraph()
    if edgevals is True:
        G.from_cudf_edgelist(
            cu_M, source="0", destination="1", edge_attr="2"
        )
    else:
        G.from_cudf_edgelist(cu_M, source="0", destination="1")

    print("sources size = " + str(len(cu_M["0"])))
    print("destinations size = " + str(len(cu_M["1"])))

    print("cugraph Solving... ")
    t1 = time.time()

    df = cugraph.sssp(G, source)

    t2 = time.time() - t1
    print("Cugraph Time : " + str(t2))

    if np.issubdtype(df["distance"].dtype, np.integer):
        max_val = np.iinfo(df["distance"].dtype).max
    else:
        max_val = np.finfo(df["distance"].dtype).max

    verts_np = df["vertex"].to_array()
    dist_np = df["distance"].to_array()
    pred_np = df["predecessor"].to_array()
    result = dict(zip(verts_np, zip(dist_np, pred_np)))
    return result, max_val

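# Hedged usage sketch (not part of the original suite): consume the
# (result, max_val) pair returned by cugraph_call above, where result maps
# vertex -> (distance, predecessor) and max_val marks unreachable vertices.
def count_reachable(cu_M, source):
    result, max_val = cugraph_call(cu_M, source, edgevals=True)
    return sum(1 for dist, _pred in result.values() if dist != max_val)
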
def cugraph_call(cu_M, max_iter, tol, alpha, personalization, nstart):
    # cugraph Pagerank Call
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1")

    t1 = time.time()
    df = cugraph.pagerank(
        G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=personalization,
        nstart=nstart,
    )
    t2 = time.time() - t1
    print("Cugraph Time : " + str(t2))

    # Sort Pagerank values
    sorted_pr = []
    df = df.sort_values("vertex").reset_index(drop=True)
    pr_scores = df["pagerank"].to_array()
    for i, rank in enumerate(pr_scores):
        sorted_pr.append((i, rank))

    return sorted_pr

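# Hedged usage sketch (illustrative, not from the original suite): the list
# returned by cugraph_call above pairs each sorted vertex position with its
# PageRank score, so the top-k vertices can be read off by re-sorting on score.
# The solver parameter values below are assumptions, not defaults from cugraph.
def top_k_pagerank(cu_M, k=10):
    sorted_pr = cugraph_call(
        cu_M, max_iter=100, tol=1.0e-05, alpha=0.85,
        personalization=None, nstart=None,
    )
    return sorted(sorted_pr, key=lambda pair: pair[1], reverse=True)[:k]
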
def test_compute_local_data(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    # Compute_local_data
    dg.compute_local_data(by="dst")
    data = dg.local_data["data"]
    by = dg.local_data["by"]

    assert by == "dst"
    assert Comms.is_initialized()

    global_num_edges = data.local_data["edges"].sum()
    assert global_num_edges == dg.number_of_edges()
    global_num_verts = data.local_data["verts"].sum()
    assert global_num_verts == dg.number_of_nodes()

def test_to_undirected(graph_file):
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True)

    M = utils.read_csv_for_nx(graph_file)
    M = M[M["0"] <= M["1"]]
    assert len(cu_M) == len(M)

    # cugraph add_edge_list
    DiG = cugraph.DiGraph()
    DiG.from_cudf_edgelist(cu_M, source="0", destination="1")
    DiGnx = nx.from_pandas_edgelist(
        M, source="0", target="1", create_using=nx.DiGraph()
    )

    G = DiG.to_undirected()
    Gnx = DiGnx.to_undirected()

    assert G.number_of_nodes() == Gnx.number_of_nodes()
    assert G.number_of_edges() == Gnx.number_of_edges()

    edgelist_df = G.edgelist.edgelist_df

    for i in range(len(edgelist_df)):
        assert Gnx.has_edge(
            edgelist_df.iloc[i]["src"], edgelist_df.iloc[i]["dst"]
        )

def test_delete_edge_list_delete_adj_list(graph_file):
    gc.collect()

    Mnx = utils.read_csv_for_nx(graph_file)
    df = cudf.DataFrame()
    df["src"] = cudf.Series(Mnx["0"])
    df["dst"] = cudf.Series(Mnx["1"])

    N = max(max(Mnx["0"]), max(Mnx["1"])) + 1
    Mcsr = scipy.sparse.csr_matrix(
        (Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)
    )
    offsets = cudf.Series(Mcsr.indptr)
    indices = cudf.Series(Mcsr.indices)

    # cugraph delete_adj_list delete_edge_list call
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(df, source="src", destination="dst")
    G.delete_edge_list()
    with pytest.raises(Exception):
        G.view_adj_list()

    G.from_cudf_adjlist(offsets, indices, None)
    G.delete_adj_list()
    with pytest.raises(Exception):
        G.view_edge_list()

def test_degrees_functionality(managed, pool, graph_file):
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )

    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    Gnx = nx.DiGraph(M)

    df = G.degrees()

    nx_in_degree = Gnx.in_degree()
    nx_out_degree = Gnx.out_degree()

    err_in_degree = 0
    err_out_degree = 0

    for i in range(len(df)):
        if df["in_degree"][i] != nx_in_degree[i]:
            err_in_degree = err_in_degree + 1
        if df["out_degree"][i] != nx_out_degree[i]:
            err_out_degree = err_out_degree + 1

    assert err_in_degree == 0
    assert err_out_degree == 0

def test_degrees_functionality(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    Gnx = nx.from_pandas_edgelist(
        M, source="0", target="1", create_using=nx.DiGraph()
    )

    df = G.degrees()

    nx_in_degree = Gnx.in_degree()
    nx_out_degree = Gnx.out_degree()

    err_in_degree = 0
    err_out_degree = 0

    for i in range(len(df)):
        if df["in_degree"][i] != nx_in_degree[df["vertex"][i]]:
            err_in_degree = err_in_degree + 1
        if df["out_degree"][i] != nx_out_degree[df["vertex"][i]]:
            err_out_degree = err_out_degree + 1

    assert err_in_degree == 0
    assert err_out_degree == 0

def test_from_edgelist(client_connection):
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf,
        source="src",
        destination="dst",
        edge_attr="value",
        create_using=cugraph.DiGraph,
    )

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    assert dg1.EdgeList == dg2.EdgeList

def test_Graph_from_MultiGraph(graph_file):
    # FIXME: Migrate to new test fixtures for Graph setup once available
    cuM = utils.read_csv_file(graph_file)
    GM = cugraph.MultiGraph()
    GM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
    GnxM = nx.from_pandas_edgelist(
        nxM,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.MultiGraph(),
    )

    G = cugraph.Graph(GM)
    Gnx = nx.Graph(GnxM)
    assert Gnx.number_of_edges() == G.number_of_edges()

    GdM = cugraph.MultiDiGraph()
    GdM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
    GnxdM = nx.from_pandas_edgelist(
        nxM,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.MultiDiGraph(),
    )
    Gd = cugraph.DiGraph(GdM)
    Gnxd = nx.DiGraph(GnxdM)
    assert Gnxd.number_of_edges() == Gd.number_of_edges()

def test_from_edgelist(dask_client):
    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf,
        source="src",
        destination="dst",
        edge_attr="value",
        create_using=cugraph.DiGraph,
    )

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    assert dg1.EdgeList == dg2.EdgeList