def test_subgraph_extraction_DiGraph(graph_file):
    """Compare the cuGraph and NetworkX subgraphs for vertices 0, 1 and 17.

    Both helpers receive the same edge data and the same int32 vertex
    array; their results must be judged equal by ``compare_edges``.
    """
    gc.collect()

    edge_data = utils.read_csv_for_nx(graph_file)
    # Vertex ids selecting the subgraph, stored as int32.
    verts = np.array([0, 1, 17], dtype=np.int32)

    cu_sg = cugraph_call(edge_data, verts)
    nx_sg = nx_call(edge_data, verts)

    assert compare_edges(cu_sg, nx_sg)
def test_wjaccard(graph_file):
    """Check ``cugraph_call`` coefficients against ``networkx_call``.

    Every coefficient produced for the cuGraph input must be within
    1.0e-6 of the value at the same position in the reference result.
    """
    gc.collect()

    nx_input = utils.read_csv_for_nx(graph_file)
    cu_input = utils.read_csv_file(graph_file)

    cu_coeff = cugraph_call(cu_input)
    nx_coeff = networkx_call(nx_input)

    # Positional comparison: entry i of one result is matched with
    # entry i of the other.
    for idx in range(len(cu_coeff)):
        delta = abs(nx_coeff[idx] - cu_coeff[idx])
        assert delta < 1.0e-6
def calc_k_cores(graph_file):
    """Run ``k_core`` with cuGraph and with NetworkX on the same edge list.

    Both graphs are built as directed graphs from columns "0" and "1".

    Returns
    -------
    tuple
        ``(ck, nk)``: the ``cugraph.k_core`` result followed by the
        ``nx.k_core`` result.
    """
    # cuGraph side
    cu_edges = utils.read_csv_file(graph_file)
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(cu_edges, source="0", destination="1")
    cu_core = cugraph.k_core(cu_graph)

    # NetworkX side
    nx_edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        nx_edges,
        source="0",
        target="1",
        create_using=nx.DiGraph(),
    )
    nx_core = nx.k_core(nx_graph)

    return cu_core, nx_core
def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size):
    """Smoke-test ``cugraph.ecg`` when it is handed a NetworkX graph.

    The return value is discarded, so the test passes as long as the
    call completes without raising.
    """
    gc.collect()

    # Build an undirected, weighted NetworkX graph from the edge list.
    edges = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
    nx_graph = nx.from_pandas_edgelist(
        edges,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )

    # "weight" names the edge attribute passed through to ecg.
    cugraph.ecg(nx_graph, min_weight, ensemble_size, "weight")
def test_from_edgelist(graph_file):
    """
    Build a Graph with cugraph.from_edgelist() once from a cudf DataFrame
    and once from a pandas DataFrame, then verify that both Graphs carry
    equal EdgeList objects.
    """
    cudf_edges = utils.read_csv_file(graph_file)
    pandas_edges = utils.read_csv_for_nx(graph_file)

    column_args = {"source": "0", "destination": "1"}
    graph_from_cudf = cugraph.from_edgelist(cudf_edges, **column_args)
    graph_from_pandas = cugraph.from_edgelist(pandas_edges, **column_args)

    assert graph_from_cudf.EdgeList == graph_from_pandas.EdgeList
def test_nx_convert_undirected(graph_file):
    """Convert an undirected, unweighted NetworkX graph to cuGraph.

    The converted graph must also report itself as undirected and
    unweighted, and must pass ``_compare_graphs`` without weights.
    """
    # Load the edge list and build the NetworkX graph.
    edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        edges, "0", "1", create_using=nx.Graph
    )
    assert nx.is_directed(nx_graph) is False
    assert nx.is_weighted(nx_graph) is False

    cu_graph = cugraph.utilities.convert_from_nx(nx_graph)
    assert cu_graph.is_directed() is False
    assert cu_graph.is_weighted() is False

    _compare_graphs(nx_graph, cu_graph, has_wt=False)
def calc_k_cores(graph_file):
    """Run ``k_core`` with cuGraph and with NetworkX on ``graph_file``.

    The cuGraph graph is built from a cudf edge list; the NetworkX graph
    is built from the CSR form of the matrix returned by
    ``utils.read_csv_for_nx``.

    Returns
    -------
    tuple
        ``(ck, nk)``: the ``cugraph.k_core`` result followed by the
        ``nx.k_core`` result.
    """
    # cuGraph side
    cu_edges = utils.read_csv_file(graph_file)
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(cu_edges, source="0", target="1")
    cu_core = cugraph.k_core(cu_graph)

    # NetworkX side
    csr_matrix = utils.read_csv_for_nx(graph_file).tocsr()
    nx_core = nx.k_core(nx.DiGraph(csr_matrix))

    return cu_core, nx_core
def test_k_core_Graph_nx(graph_file):
    """``cugraph.k_core`` on a NetworkX graph must match ``nx.k_core``.

    The two results are compared with ``nx.is_isomorphic``.
    """
    gc.collect()

    edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        edges,
        source="0",
        target="1",
        create_using=nx.Graph(),
    )

    expected = nx.k_core(nx_graph)
    actual = cugraph.k_core(nx_graph)

    assert nx.is_isomorphic(expected, actual)
def calc_k_cores(graph_file):
    """Run ``k_core`` with cuGraph and with NetworkX on ``graph_file``.

    The cuGraph graph is populated through ``add_edge_list`` from columns
    "0" and "1"; the NetworkX graph is a ``DiGraph`` built from the CSR
    form of the matrix returned by ``utils.read_csv_for_nx``.

    Returns
    -------
    tuple
        ``(ck, nk)``: the ``cugraph.k_core`` result followed by the
        ``nx.k_core`` result.
    """
    # cuGraph side
    cu_edges = utils.read_csv_file(graph_file)
    cu_graph = cugraph.Graph()
    cu_graph.add_edge_list(cu_edges["0"], cu_edges["1"])
    cu_core = cugraph.k_core(cu_graph)

    # NetworkX side
    csr_matrix = utils.read_csv_for_nx(graph_file).tocsr()
    nx_core = nx.k_core(nx.DiGraph(csr_matrix))

    return cu_core, nx_core
def test_core_number_nx(graph_file):
    """``cugraph.core_number`` on a NetworkX graph must equal NetworkX's.

    Both functions are called on the same ``nx.Graph`` and their results
    are compared with ``==``.
    """
    gc.collect()

    edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        edges,
        source="0",
        target="1",
        create_using=nx.Graph(),
    )

    expected = nx.core_number(nx_graph)
    actual = cugraph.core_number(nx_graph)

    assert expected == actual
def test_sssp_data_type_conversion(managed, pool, graph_file, source):
    """Run SSSP with int32 edge weights and validate against NetworkX.

    RMM is re-initialised with the requested ``managed`` / ``pool``
    settings first. The cuGraph distances are then checked against
    ``nx.single_source_dijkstra_path_length`` and, for every reachable
    vertex other than ``source``, against the predecessor's distance plus
    the connecting edge weight.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )
    assert rmm.is_initialized()

    nx_matrix = utils.read_csv_for_nx(graph_file)
    cu_edges = utils.read_csv_file(graph_file)

    # cuGraph call with int32 weights
    cu_edges['2'] = cu_edges['2'].astype(np.int32)
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(
        cu_edges, source='0', target='1', edge_attr='2'
    )
    # The weights stored on the cuGraph edge list must still be int32.
    assert cu_graph.edgelist.edgelist_df['weights'].dtype == np.int32

    result = cugraph.sssp(cu_graph, source)
    # A distance equal to the dtype's max is treated as "unreachable".
    unreachable = np.finfo(result['distance'].dtype).max
    # vertex -> (distance, predecessor)
    cu_paths = dict(zip(
        result['vertex'].to_array(),
        zip(result['distance'].to_array(),
            result['predecessor'].to_array()),
    ))

    # NetworkX call with int32 weights
    nx_matrix = nx_matrix.tocsr()
    nx_matrix.data = nx_matrix.data.astype(np.int32)
    nx_graph = nx.DiGraph(nx_matrix)
    # The weights seen by NetworkX must be int32 as well.
    first_edge_attrs = list(nx_graph.edges(data=True))[0][2]
    assert first_edge_attrs['weight'].dtype == np.int32

    nx_paths = nx.single_source_dijkstra_path_length(nx_graph, source)

    # Count every disagreement, then assert once at the end.
    # NOTE: if the distance type is float64 then the distance is being
    # compared against np.finfo(np.float64).max.
    err = 0
    for vid, (dist, pred) in cu_paths.items():
        if dist == unreachable:
            # cuGraph says unreachable, so NetworkX must not have a path.
            if vid in nx_paths:
                err += 1
            continue

        # Reachable vertex: the distances must agree.
        if dist != nx_paths[vid]:
            err += 1

        # Predecessor distance + edge weight must equal this distance.
        if vid != source:
            edge_weight = nx_graph[pred][vid]['weight']
            if cu_paths[pred][0] + edge_weight != dist:
                err += 1

    assert err == 0
def test_sssp_data_type_conversion(graph_file, source):
    """Run SSSP with int32 edge weights and validate against NetworkX.

    The cuGraph distances are checked against
    ``nx.single_source_dijkstra_path_length`` and, for every reachable
    vertex other than ``source``, against the predecessor's distance plus
    the connecting edge weight.
    """
    gc.collect()

    nx_edges = utils.read_csv_for_nx(graph_file)
    cu_edges = utils.read_csv_file(graph_file)

    # cuGraph call with int32 weights
    cu_edges["2"] = cu_edges["2"].astype(np.int32)
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(
        cu_edges, source="0", destination="1", edge_attr="2"
    )
    # The weights stored on the cuGraph edge list must still be int32.
    assert cu_graph.edgelist.edgelist_df["weights"].dtype == np.int32

    result = cugraph.sssp(cu_graph, source)
    # A distance equal to the dtype's max is treated as "unreachable".
    unreachable = np.finfo(result["distance"].dtype).max
    # vertex -> (distance, predecessor)
    cu_paths = dict(zip(
        result["vertex"].to_array(),
        zip(result["distance"].to_array(),
            result["predecessor"].to_array()),
    ))

    # NetworkX call with int32 weights
    nx_edges["weight"] = nx_edges["weight"].astype(np.int32)
    nx_graph = nx.from_pandas_edgelist(
        nx_edges,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.DiGraph(),
    )
    # The weight stored on the first NetworkX edge must be a plain int.
    first_edge_attrs = list(nx_graph.edges(data=True))[0][2]
    assert type(first_edge_attrs["weight"]) is int

    nx_paths = nx.single_source_dijkstra_path_length(nx_graph, source)

    # Count every disagreement, then assert once at the end.
    # NOTE: if the distance type is float64 then the distance is being
    # compared against np.finfo(np.float64).max.
    err = 0
    for vid, (dist, pred) in cu_paths.items():
        if dist == unreachable:
            # cuGraph says unreachable, so NetworkX must not have a path.
            if vid in nx_paths:
                err += 1
            continue

        # Reachable vertex: the distances must agree.
        if dist != nx_paths[vid]:
            err += 1

        # Predecessor distance + edge weight must equal this distance.
        if vid != source:
            edge_weight = nx_graph[pred][vid]["weight"]
            if cu_paths[pred][0] + edge_weight != dist:
                err += 1

    assert err == 0
def test_dask_katz_centrality(client_connection):
    """Compare multi-GPU Katz centrality with NetworkX on the karate graph.

    ``alpha`` is set to ``1 / (largest out-degree + 1)``. The dask result
    and the NetworkX result are merged on "vertex" and every pair of
    values must differ by no more than ``1.1 * 1.0e-05``.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=dcg.get_chunksize(input_data_path),
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Derive alpha from the single largest out-degree in the graph.
    top_degree_row = (
        dg.out_degree().compute().nlargest(n=1, columns="degree")
    )
    largest_out_degree = top_degree_row["degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6).compute()

    # Reference result from NetworkX (imports kept local to this test).
    import networkx as nx
    from cugraph.tests import utils
    import pandas as pd

    nx_edges = utils.read_csv_for_nx(input_data_path)
    nx_graph = nx.from_pandas_edgelist(
        nx_edges, create_using=nx.DiGraph(), source="0", target="1"
    )
    nx_katz = nx.katz_centrality(nx_graph, alpha=katz_alpha)

    expected_pdf = pd.DataFrame(
        nx_katz.items(), columns=['vertex', 'katz_centrality']
    )
    exp_res = cudf.DataFrame(expected_pdf)

    tol = 1.0e-05
    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )
    local_vals = compare_res["katz_centrality_local"]
    dask_vals = compare_res["katz_centrality_dask"]

    # Number of rows whose two values disagree beyond the tolerance.
    err = sum(
        1
        for row in range(len(compare_res))
        if abs(local_vals.iloc[row] - dask_vals.iloc[row]) > tol * 1.1
    )
    assert err == 0
def test_mg_renumber_common_col_names(graph_file, dask_client):
    """
    Ensure that commonly-used column names in the input ("src", "dst",
    "weights") do not conflict with names used internally by NumberMap.
    """
    def _distribute(frame):
        # One partition per worker currently known to the scheduler.
        n_workers = len(dask_client.scheduler_info()['workers'])
        return dask.dataframe.from_pandas(frame, npartitions=n_workers)

    def _check_renumbered_names(frame, number_map):
        # The generated column names must differ from the user's "src" /
        # "dst" columns and must be present in the renumbered output.
        assert number_map.renumbered_src_col_name != "src"
        assert number_map.renumbered_dst_col_name != "dst"
        assert number_map.renumbered_src_col_name in frame.columns
        assert number_map.renumbered_dst_col_name in frame.columns

    edges = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(edges["0"])
    destinations = cudf.Series(edges["1"])

    numbers = range(len(sources))
    offset_numbers = [value + 1 for value in numbers]
    floats = [float(value) for value in numbers]

    # test multi-column ("legacy" renumbering code path)
    multi_col_gdf = cudf.DataFrame({
        "src": numbers,
        "dst": numbers,
        "weights": floats,
        "col_a": sources,
        "col_b": sources,
        "col_c": destinations,
        "col_d": destinations,
    })
    renumbered_df, renumber_map = NumberMap.renumber(
        _distribute(multi_col_gdf),
        ["col_a", "col_b"],
        ["col_c", "col_d"],
    )
    _check_renumbered_names(renumbered_df, renumber_map)

    # test experimental renumbering code path
    single_col_gdf = cudf.DataFrame({
        "src": numbers,
        "dst": offset_numbers,
        "weights": floats,
        "col_a": sources,
        "col_b": destinations,
    })
    renumbered_df, renumber_map = NumberMap.renumber(
        _distribute(single_col_gdf), "col_a", "col_b"
    )
    _check_renumbered_names(renumbered_df, renumber_map)
def test_mg_renumber(graph_file, dask_client):
    """Round-trip a two-column vertex id through NumberMap (multi-GPU).

    Edges are renumbered on ("src", "src_old") / ("dst", "dst_old"), then
    un-renumbered; after sorting both sides the restored columns must
    equal the original ones.
    """
    edges = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(edges["0"])
    destinations = cudf.Series(edges["1"])

    translate = 1000
    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    n_workers = len(dask_client.scheduler_info()['workers'])
    ddf = dask.dataframe.from_pandas(gdf, npartitions=n_workers)

    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(
        ddf,
        ["src", "src_old"],
        ["dst", "dst_old"],
        preserve_order=False,
    )

    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name

    # Undo the renumbering for the source column, then the destination.
    unrenumbered_df = renumbered_df
    for renumbered_col in (src, dst):
        unrenumbered_df = renumber_map.unrenumber(
            unrenumbered_df, renumbered_col, preserve_order=False
        )

    # sort needed only for comparisons, since preserve_order is False
    original_cols = ["src", "src_old", "dst", "dst_old"]
    restored_cols = [f"0_{src}", f"1_{src}", f"0_{dst}", f"1_{dst}"]

    gdf = gdf.sort_values(by=original_cols).reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    unrenumbered_df = unrenumbered_df.sort_values(by=restored_cols)
    unrenumbered_df = unrenumbered_df.reset_index()

    for original, restored in zip(original_cols, restored_cols):
        assert_series_equal(
            gdf[original], unrenumbered_df[restored], check_names=False
        )
def test_triangles_edge_vals(managed, pool, graph_file):
    """Triangle counts must agree when cuGraph is called with edge values.

    RMM is re-initialised with the requested ``managed`` / ``pool``
    settings before the graph is read.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )
    assert rmm.is_initialized()

    edge_data = utils.read_csv_for_nx(graph_file)

    cu_count = cugraph_call(edge_data, edgevals=True)
    nx_count = networkx_call(edge_data)

    assert cu_count == nx_count
def test_bfs(managed, pool, graph_file):
    """BFS from vertex 0: compare result sizes of the two implementations.

    The number of reference distances different from ``_int_max`` must
    equal the number of distances returned by ``cugraph_call``.
    ``managed`` and ``pool`` are accepted but not used in the body.
    """
    gc.collect()

    nx_input = utils.read_csv_for_nx(graph_file)
    cu_input = utils.read_csv_file(graph_file)

    # Only the distance half of each (vertices, distances) pair is used.
    _, base_dist = base_call(nx_input, 0)
    _, cugraph_dist = cugraph_call(cu_input, np.int32(0))

    reached = np.count_nonzero(base_dist != _int_max)
    assert reached == len(cugraph_dist)
def test_networkx_compatibility(managed, pool, graph_file):
    """Build equivalent NetworkX and cuGraph graphs from one edge list.

    The comparison is done twice through ``compare_graphs``: once with
    the "weight" column attached to the edges and once without it.
    """
    gc.collect()

    # Restart RMM with the requested configuration.
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    # test from_cudf_edgelist()
    matrix = utils.read_csv_for_nx(graph_file)
    df = pd.DataFrame({
        'source': pd.Series(matrix.row),
        'target': pd.Series(matrix.col),
        'weight': pd.Series(matrix.data),
    })
    gdf = cudf.from_pandas(df)

    # cugraph.Graph() is implicitly a directed graph right at this moment, so
    # we should use nx.DiGraph() for comparison.

    # Weighted graphs
    nx_graph = nx.from_pandas_edgelist(
        df,
        source='source',
        target='target',
        edge_attr=['weight'],
        create_using=nx.DiGraph,
    )
    cu_graph = cugraph.from_cudf_edgelist(
        gdf, source='source', target='target', weight='weight'
    )
    assert compare_graphs(nx_graph, cu_graph)
    nx_graph.clear()
    cu_graph.clear()

    # Unweighted graphs
    nx_graph = nx.from_pandas_edgelist(
        df,
        source='source',
        target='target',
        create_using=nx.DiGraph,
    )
    cu_graph = cugraph.from_cudf_edgelist(
        gdf, source='source', target='target'
    )
    assert compare_graphs(nx_graph, cu_graph)
    nx_graph.clear()
    cu_graph.clear()
def test_jaccard_nx(graph_file):
    """Call ``jaccard_coefficient`` from NetworkX and cuGraph on one graph.

    Only the result sizes are compared: the NetworkX result must contain
    more entries than the cuGraph result.
    """
    gc.collect()

    edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        edges,
        source="0",
        target="1",
        create_using=nx.Graph(),
    )

    # Materialise the NetworkX result so that its length can be taken.
    nx_result = sorted(
        nx.jaccard_coefficient(nx_graph), key=len, reverse=True
    )
    cu_result = cugraph.jaccard_coefficient(nx_graph)

    assert len(nx_result) > len(cu_result)
def read_csv(request):
    """
    Read the csv file named by ``request.param`` for both networkx and
    cugraph.

    Returns a ``(M, cu_M)`` pair: ``M`` is a square scipy CSR matrix built
    from columns "0", "1" and "weight", and ``cu_M`` is whatever
    ``utils.read_csv_file`` returns for the same file.
    """
    nx_edges = utils.read_csv_for_nx(request.param)
    src = nx_edges["0"]
    dst = nx_edges["1"]

    # The matrix is sized so that the largest vertex id is a valid index.
    n_vertices = max(max(src), max(dst)) + 1
    M = scipy.sparse.csr_matrix(
        (nx_edges.weight, (src, dst)),
        shape=(n_vertices, n_vertices),
    )

    cu_M = utils.read_csv_file(request.param)
    print("cu_M is \n", cu_M)

    return M, cu_M
def test_number_of_vertices(graph_file):
    """cuGraph's vertex count must equal NetworkX's node count.

    Raises
    ------
    TypeError
        If ``utils.read_csv_for_nx`` returns ``None``.
    """
    cu_edges = utils.read_csv_file(graph_file)
    nx_edges = utils.read_csv_for_nx(graph_file)
    if nx_edges is None:
        raise TypeError("Could not read the input graph")

    # Build the cuGraph graph from the edge list.
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(cu_edges, source="0", destination="1")

    nx_graph = nx.from_pandas_edgelist(
        nx_edges,
        source="0",
        target="1",
        create_using=nx.DiGraph(),
    )

    assert cu_graph.number_of_vertices() == nx_graph.number_of_nodes()
def test_two_hop_neighbors(graph_file):
    """Validate ``get_two_hop_neighbors`` against a scipy CSR matrix.

    The cuGraph result and the CSR matrix built from the same file are
    handed to ``find_two_paths`` and ``check_all_two_hops``.
    """
    cu_edges = utils.read_csv_file(graph_file)

    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(
        cu_edges, source="0", destination="1", edge_attr="2"
    )
    two_hop_df = cu_graph.get_two_hop_neighbors()

    nx_edges = utils.read_csv_for_nx(graph_file)
    src = nx_edges["0"]
    dst = nx_edges["1"]
    # The matrix is sized so that the largest vertex id is a valid index.
    n_vertices = max(max(src), max(dst)) + 1
    csr = scipy.sparse.csr_matrix(
        (nx_edges.weight, (src, dst)),
        shape=(n_vertices, n_vertices),
    )

    find_two_paths(two_hop_df, csr)
    check_all_two_hops(two_hop_df, csr)
def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth):
    """``cugraph.k_truss`` (k=5) on a NetworkX graph must match NetworkX.

    The two subgraphs are compared with ``nx.is_isomorphic``.
    ``nx_ground_truth`` is accepted but not used in the body.
    """
    gc.collect()

    k = 5
    edges = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
    nx_graph = nx.from_pandas_edgelist(
        edges,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )

    cu_truss = cugraph.k_truss(nx_graph, k)
    nx_truss = nx.k_truss(nx_graph, k)

    assert nx.is_isomorphic(cu_truss, nx_truss)
def test_add_edge_or_adj_list_after_add_edge_or_adj_list(
        managed, pool, graph_file):
    """Adding a second representation to a populated Graph must fail.

    Once a graph holds an edge list (first phase) or an adjacency list
    (second phase), both ``add_edge_list`` and ``add_adj_list`` are
    expected to raise ``GDFError`` with code ``GDF_INVALID_API_CALL``.

    Raises
    ------
    TypeError
        If the input graph could not be read or its matrix is not square.
    """
    def _assert_invalid_api_call(add_method, first, second):
        # The call must be rejected with the specific GDF error code.
        with pytest.raises(libcudf.GDFError.GDFError) as excinfo:
            add_method(first, second, None)
        assert excinfo.value.errcode.decode() == 'GDF_INVALID_API_CALL'

    gc.collect()

    # Restart RMM with the requested configuration.
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    # Validate before any attribute of M is touched; previously this guard
    # ran only after M.row / M.col / M.tocsr() had been dereferenced.
    if M is None:
        raise TypeError('Could not read the input graph')

    sources = cudf.Series(M.row)
    destinations = cudf.Series(M.col)

    M = M.tocsr()
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')

    offsets = cudf.Series(M.indptr)
    indices = cudf.Series(M.indices)

    G = cugraph.Graph()

    # If cugraph has at least one graph representation, adding a new graph
    # should fail to prevent a single graph object storing two different
    # graphs.

    # If cugraph has a graph edge list, adding a new graph should fail.
    G.add_edge_list(sources, destinations, None)
    _assert_invalid_api_call(G.add_edge_list, sources, destinations)
    _assert_invalid_api_call(G.add_adj_list, offsets, indices)
    G.delete_edge_list()

    # If cugraph has a graph adjacency list, adding a new graph should fail.
    # The adjacency list is seeded with the CSR offsets/indices; the earlier
    # code passed the COO sources/destinations here by mistake.
    G.add_adj_list(offsets, indices, None)
    _assert_invalid_api_call(G.add_edge_list, sources, destinations)
    _assert_invalid_api_call(G.add_adj_list, offsets, indices)
    G.delete_adj_list()
def test_networkx_compatibility(graph_file):
    """Build equivalent NetworkX and cuGraph DiGraphs from one edge list.

    The comparison is done twice through ``compare_graphs``: once with
    the "weight" column attached to the edges and once without it.
    """
    gc.collect()

    # test from_cudf_edgelist()
    edges = utils.read_csv_for_nx(graph_file)
    df = pd.DataFrame({
        "source": pd.Series(edges["0"]),
        "target": pd.Series(edges["1"]),
        "weight": pd.Series(edges.weight),
    })
    gdf = cudf.from_pandas(df)

    # Weighted graphs
    nx_graph = nx.from_pandas_edgelist(
        df,
        source="source",
        target="target",
        edge_attr="weight",
        create_using=nx.DiGraph,
    )
    cu_graph = cugraph.from_cudf_edgelist(
        gdf,
        source="source",
        destination="target",
        edge_attr="weight",
        create_using=cugraph.DiGraph,
    )

    print('g from gdf = \n', gdf)
    print('nx from df = \n', df)

    assert compare_graphs(nx_graph, cu_graph)
    nx_graph.clear()
    cu_graph.clear()

    # Unweighted graphs
    nx_graph = nx.from_pandas_edgelist(
        df,
        source="source",
        target="target",
        create_using=nx.DiGraph,
    )
    cu_graph = cugraph.from_cudf_edgelist(
        gdf,
        source="source",
        destination="target",
        create_using=cugraph.DiGraph,
    )

    assert compare_graphs(nx_graph, cu_graph)
    nx_graph.clear()
    cu_graph.clear()
def test_nx_convert_multicol(graph_file):
    """Convert a NetworkX graph to cuGraph and compare node/edge counts."""
    # read data and create a Nx Graph
    edges = utils.read_csv_for_nx(graph_file)

    # NOTE(review): this DiGraph with a list-valued "count" attribute is
    # built but never converted; the graph handed to convert_from_nx below
    # comes from from_pandas_edgelist instead. Confirm whether the test
    # was meant to exercise this graph.
    G = nx.DiGraph()
    for _, row in edges.iterrows():
        src = row["0"]
        dst = row["1"]
        G.add_edge(src, dst, count=[src, dst])

    nx_graph = nx.from_pandas_edgelist(edges, "0", "1")
    cu_graph = cugraph.utilities.convert_from_nx(nx_graph)

    assert nx_graph.number_of_nodes() == cu_graph.number_of_nodes()
    assert nx_graph.number_of_edges() == cu_graph.number_of_edges()
def calc_core_number(graph_file):
    """Compute core numbers with cuGraph and NetworkX side by side.

    Returns
    -------
    The object returned by ``cugraph.core_number`` with an added
    "nx_core_number" column and with "core_number" renamed to
    "cu_core_number".
    """
    # cuGraph side
    cu_edges = utils.read_csv_file(graph_file)
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(cu_edges, source='0', destination='1')
    cn = cugraph.core_number(cu_graph)

    # NetworkX side
    nx_edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        nx_edges, source='0', target='1', create_using=nx.Graph()
    )
    nc = nx.core_number(nx_graph)

    # NetworkX values ordered by ascending vertex id.
    # NOTE(review): this assumes the rows of `cn` follow the same
    # ascending-vertex order - confirm against cugraph.core_number.
    cn['nx_core_number'] = [nc[vertex] for vertex in sorted(nc)]

    # Rename via the explicit `columns=` keyword: a positional mapper is
    # applied to the index under pandas-style rename semantics, which
    # would leave the "core_number" column name unchanged.
    cn = cn.rename(columns={'core_number': 'cu_core_number'})
    return cn
def test_triangles(managed, pool, graph_file):
    """Triangle counts from ``cugraph_call`` and ``networkx_call`` must agree.

    RMM is finalised and re-initialised with the requested ``managed`` /
    ``pool`` settings before the graph is read.
    """
    gc.collect()

    # Restart RMM with the requested configuration.
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    edge_data = utils.read_csv_for_nx(graph_file)

    cu_count = cugraph_call(edge_data)
    nx_count = networkx_call(edge_data)

    assert cu_count == nx_count
def calc_core_number(graph_file):
    """Compute core numbers with cuGraph and NetworkX side by side.

    The NetworkX graph is an undirected ``nx.Graph`` built from the CSR
    form of the matrix returned by ``utils.read_csv_for_nx``.

    Returns
    -------
    The object returned by ``cugraph.core_number`` with an added
    "nx_core_number" column and with "core_number" renamed to
    "cu_core_number".
    """
    # cuGraph side
    cu_edges = utils.read_csv_file(graph_file)
    cu_graph = cugraph.DiGraph()
    cu_graph.from_cudf_edgelist(cu_edges, source='0', target='1')
    cn = cugraph.core_number(cu_graph)

    # NetworkX side
    csr_matrix = utils.read_csv_for_nx(graph_file).tocsr()
    nc = nx.core_number(nx.Graph(csr_matrix))

    # One-row frame keyed by vertex, transposed so that vertices become
    # the index and the core numbers live in column 0.
    nx_frame = pd.DataFrame(nc, index=[0]).T
    cn['nx_core_number'] = nx_frame[0]

    # Rename via the explicit `columns=` keyword: a positional mapper is
    # applied to the index under pandas-style rename semantics, which
    # would leave the "core_number" column name unchanged.
    cn = cn.rename(columns={'core_number': 'cu_core_number'})
    return cn
def test_triangles_nx(graph_file):
    """``cugraph.triangles`` on a NetworkX graph must match NetworkX.

    The NetworkX reference is the sum of the per-node values returned by
    ``nx.triangles``.
    """
    gc.collect()

    edges = utils.read_csv_for_nx(graph_file)
    nx_graph = nx.from_pandas_edgelist(
        edges,
        source="0",
        target="1",
        create_using=nx.Graph(),
    )

    cu_count = cugraph.triangles(nx_graph)

    # nx.triangles returns a per-node mapping; add up all of its values.
    per_node = nx.triangles(nx_graph)
    nx_count = sum(per_node.values())

    assert cu_count == nx_count