def test_to_undirected(graph_file):
    """Check Graph.to_undirected() against NetworkX on a filtered edge list.

    A directed cugraph graph and a NetworkX DiGraph are built from the same
    rows, converted to undirected, and compared on node and edge counts.
    Edge membership is checked before and after the conversion.
    """
    # Keep only the rows where source <= destination so the input is a
    # one-directional edge list.
    gpu_edges = utils.read_csv_file(graph_file)
    keep_gpu = gpu_edges["0"] <= gpu_edges["1"]
    gpu_edges = gpu_edges[keep_gpu].reset_index(drop=True)

    host_edges = utils.read_csv_for_nx(graph_file)
    host_edges = host_edges[host_edges["0"] <= host_edges["1"]]
    assert len(gpu_edges) == len(host_edges)

    # Build the directed graphs in cugraph and NetworkX.
    DiG = cugraph.Graph(directed=True)
    DiG.from_cudf_edgelist(gpu_edges, source="0", destination="1")
    DiGnx = nx.from_pandas_edgelist(
        host_edges, source="0", target="1", create_using=nx.DiGraph()
    )

    edge_rows = gpu_edges.to_pandas()

    # Before conversion only the forward direction may be present.
    for _, edge in edge_rows.iterrows():
        assert DiG.has_edge(edge['0'], edge['1'])
        assert not DiG.has_edge(edge['1'], edge['0'])

    G = DiG.to_undirected()
    Gnx = DiGnx.to_undirected()

    assert not G.is_directed()
    assert G.number_of_nodes() == Gnx.number_of_nodes()
    assert G.number_of_edges() == Gnx.number_of_edges()

    # After conversion every edge must be visible in both directions.
    for _, edge in edge_rows.iterrows():
        assert G.has_edge(edge['0'], edge['1'])
        assert G.has_edge(edge['1'], edge['0'])
def test_invalid_has_node():
    """has_node() must return False for ids that are not in the graph."""
    edges = cudf.DataFrame([[1, 2]], columns=["src", "dst"])
    G = cugraph.Graph()
    G.from_cudf_edgelist(edges, source="src", destination="dst")

    # Neither a negative id nor 0 appears in the single edge (1, 2).
    for absent in (-1, 0):
        assert not G.has_node(absent)

    # An id derived from the node count must also be reported as missing.
    beyond_count = G.number_of_nodes() + 1
    assert not G.has_node(beyond_count)
def test_degrees_functionality(graph_file):
    """Compare Graph.degrees() with NetworkX in/out degrees per vertex."""
    host_edges = utils.read_csv_for_nx(graph_file)
    gpu_edges = utils.read_csv_file(graph_file)

    G = cugraph.Graph(directed=True)
    G.from_cudf_edgelist(
        gpu_edges, source="0", destination="1", edge_attr="2"
    )
    Gnx = nx.from_pandas_edgelist(
        host_edges, source="0", target="1", create_using=nx.DiGraph()
    )

    degrees = G.degrees()
    expected_in = Gnx.in_degree()
    expected_out = Gnx.out_degree()

    # Count mismatches instead of failing on the first one, then assert
    # that both counters stayed at zero.
    in_mismatches = 0
    out_mismatches = 0
    for row in range(len(degrees)):
        vertex = degrees["vertex"][row]
        if degrees["in_degree"][row] != expected_in[vertex]:
            in_mismatches += 1
        if degrees["out_degree"][row] != expected_out[vertex]:
            out_mismatches += 1

    assert in_mismatches == 0
    assert out_mismatches == 0
def setup_function():
    """Run garbage collection and (re)create the module-level DiGraph_inst.

    NOTE(review): presumably picked up by pytest as the per-test setup hook
    because of its name; confirm against the test runner configuration.
    """
    global DiGraph_inst

    gc.collect()

    # DiGraph_inst is the shared directed graph object for calls that need a
    # Graph type or instance when a test works on a directed graph.
    DiGraph_inst = cugraph.Graph(directed=True)  # noqa: F841
def generate_mg_batch_cugraph_graph_from_file(graph_file, directed=True):
    """Build a cugraph graph from a dask_cudf edge list read from a file.

    Parameters
    ----------
    graph_file
        Path handed to ``read_dask_cudf_csv_file``.
    directed : bool, default True
        When truthy a ``cugraph.DiGraph`` is created, otherwise a
        ``cugraph.Graph``.

    Returns
    -------
    The graph object after ``from_dask_cudf_edgelist`` has been called.
    """
    client = get_client()
    # Persist the dask dataframe through the client before handing it to
    # the graph.
    persisted_edges = client.persist(read_dask_cudf_csv_file(graph_file))

    if directed:
        G = cugraph.DiGraph()
    else:
        G = cugraph.Graph()
    G.from_dask_cudf_edgelist(persisted_edges)
    return G
def graphs(request):
    """Yield the same weighted graph as cugraph, NetworkX and cupy COO.

    ``request.param`` is written to a temporary CSV file, which is then read
    back by NetworkX and cudf. The cupy COO matrix stores every edge in both
    directions.

    Yields
    ------
    tuple
        ``(cugraph_G, nx_G, cupy_df)``.
    """
    with NamedTemporaryFile(mode="w+", suffix=".csv") as csv_file:
        csv_file.writelines(request.param)
        csv_file.seek(0)
        csv_path = csv_file.name

        nx_G = nx.read_weighted_edgelist(csv_path, delimiter=',')
        edges = cudf.read_csv(
            csv_path,
            names=["src", "dst", "data"],
            delimiter=",",
            dtype=["int32", "int32", "float64"],
        )
        cugraph_G = cugraph.Graph()
        cugraph_G.from_cudf_edgelist(
            edges, source="src", destination="dst", edge_attr="data"
        )

        # Construct the cupy coo_matrix graph; each edge is inserted as
        # (u, v) and (v, u) with the same weight.
        row_ids = []
        col_ids = []
        values = []
        for position in range(edges.shape[0]):
            record = edges.iloc[position]
            u = record["src"]
            v = record["dst"]
            w = record["data"]
            row_ids.extend([u, v])
            col_ids.extend([v, u])
            values.extend([w, w])

        row_ids = cupy.array(row_ids)
        col_ids = cupy.array(col_ids)
        values = cupy.array(values)

        # The matrix is square with one row/column per id up to the largest
        # vertex id seen.
        largest_vertex = max(cupy.amax(row_ids), cupy.amax(col_ids))
        side = largest_vertex + 1
        cupy_df = cupy_coo_matrix(
            (values, (row_ids, col_ids)), shape=(side, side)
        )

        yield cugraph_G, nx_G, cupy_df
def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file):
    """A graph that already holds a representation must reject a second one.

    Once an edge list or an adjacency list has been loaded, adding either
    kind again is expected to raise, so that a single graph object never
    stores two different graphs.
    """
    Mnx = utils.read_csv_for_nx(graph_file)

    edges = cudf.DataFrame()
    edges["src"] = cudf.Series(Mnx["0"])
    edges["dst"] = cudf.Series(Mnx["1"])

    N = max(max(Mnx["0"]), max(Mnx["1"])) + 1
    Mcsr = scipy.sparse.csr_matrix(
        (Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)
    )
    offsets = cudf.Series(Mcsr.indptr)
    indices = cudf.Series(Mcsr.indices)

    G = cugraph.Graph(directed=True)

    def add_edge_list():
        G.from_cudf_edgelist(edges, source="src", destination="dst")

    def add_adj_list():
        G.from_cudf_adjlist(offsets, indices, None)

    def assert_every_add_fails():
        # Edge list first, then adjacency list, each expected to raise.
        for add in (add_edge_list, add_adj_list):
            with pytest.raises(Exception):
                add()

    # With an edge list loaded, adding a new graph should fail.
    add_edge_list()
    assert_every_add_fails()
    G.delete_edge_list()

    # With an adjacency list loaded, adding a new graph should fail.
    add_adj_list()
    assert_every_add_fails()
    G.delete_adj_list()
def test_woverlap(graph_file):
    """Compare cugraph_call() and cpu_call() results on two-hop pairs.

    Values must agree within 1e-6, and a NaN on one side must be matched by
    a NaN on the other.
    """
    gc.collect()

    Mnx = utils.read_csv_for_nx(graph_file)
    N = max(max(Mnx["0"]), max(Mnx["1"])) + 1
    M = scipy.sparse.csr_matrix(
        (Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)
    )

    cu_M = utils.read_csv_file(graph_file)
    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1")

    # Sort the pairs so both implementations see them in the same order.
    pairs = G.get_two_hop_neighbors()
    pairs = pairs.sort_values(["first", "second"]).reset_index(drop=True)

    cu_coeff = cugraph_call(cu_M, pairs)
    cpu_coeff = cpu_call(M, pairs["first"], pairs["second"])
    assert len(cu_coeff) == len(cpu_coeff)

    for idx in range(len(cu_coeff)):
        expected = cpu_coeff[idx]
        actual = cu_coeff[idx]
        if np.isnan(expected):
            assert np.isnan(actual)
        elif np.isnan(actual):
            # Only the GPU value is NaN: this equality cannot hold, so the
            # test fails here.
            assert expected == actual
        else:
            assert abs(expected - actual) < 1.0e-6
def test_force_atlas2_multi_column_pos_list(graph_file, score, max_iter,
                                            barnes_hut_optimize):
    """Run force_atlas2 on a multi-column graph seeded with a pos_list.

    A first layout is computed through ``cugraph_call`` on the plain edge
    list. The vertices are then turned into two-column ids, the first
    layout is passed as ``pos_list``, and the trustworthiness of the
    resulting layout must exceed ``score``.

    NOTE(review): ``barnes_hut_optimize`` is accepted but never forwarded;
    both layout calls pass ``barnes_hut_optimize=False``. Confirm whether
    that is intentional.
    """
    cu_M = utils.read_csv_file(graph_file)
    test_callback = TestCallback()

    # Options shared by the initial layout and the multi-column layout.
    layout_options = dict(
        outbound_attraction_distribution=True,
        lin_log_mode=False,
        prevent_overlapping=False,
        edge_weight_influence=1.0,
        jitter_tolerance=1.0,
        barnes_hut_optimize=False,
        barnes_hut_theta=0.5,
        scaling_ratio=2.0,
        strong_gravity_mode=False,
        gravity=1.0,
        callback=test_callback,
    )

    pos = cugraph_call(
        cu_M, max_iter=max_iter, pos_list=None, **layout_options
    )

    # Build the second id column by offsetting the first by 1000.
    cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True)
    cu_M['src_1'] = cu_M['src_0'] + 1000
    cu_M['dst_1'] = cu_M['dst_0'] + 1000

    G = cugraph.Graph()
    G.from_cudf_edgelist(
        cu_M,
        source=["src_0", "src_1"],
        destination=["dst_0", "dst_1"],
        edge_attr="2",
    )

    # Seed positions, keyed by the same two-column vertex ids.
    pos_list = cudf.DataFrame()
    pos_list['vertex_0'] = pos['vertex']
    pos_list['vertex_1'] = pos_list['vertex_0'] + 1000
    pos_list['x'] = pos['x']
    pos_list['y'] = pos['y']

    cu_pos = cugraph.force_atlas2(
        G, max_iter=max_iter, pos_list=pos_list, **layout_options
    )
    cu_pos = cu_pos.sort_values('0_vertex')

    dense = scipy.io.mmread(graph_file.with_suffix(".mtx")).todense()
    cu_trust = trustworthiness(dense, cu_pos[["x", "y"]].to_pandas())
    print(cu_trust, score)
    assert cu_trust > score
def test_dask_pagerank(dask_client):
    """Compare dask pagerank with single-GPU pagerank on karate.csv."""
    pandas.set_option("display.max_rows", 10000)

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    column_names = ["src", "dst", "value"]
    column_types = ["int32", "int32", "float32"]

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )

    g = cugraph.Graph(directed=True)
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)

    # Join on vertex so each row carries both scores side by side.
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    local_scores = compare_pr["pagerank_local"]
    dask_scores = compare_pr["pagerank_dask"]

    err = sum(
        1
        for row in range(len(compare_pr))
        if abs(local_scores.iloc[row] - dask_scores.iloc[row]) > tol * 1.1
    )
    print("Mismatches:", err)
    assert err == 0
def createGraph(edgelist_gdf, auto_csr):
    """Create a cugraph.Graph from the src/dst/val columns of a dataframe.

    Parameters
    ----------
    edgelist_gdf
        Dataframe providing the ``"src"``, ``"dst"`` and ``"val"`` columns.
    auto_csr
        When equal to 0, the adjacency list and the transposed adjacency
        list views are requested right after the edge list is added.

    Returns
    -------
    The populated graph.
    """
    sources, destinations, values = (
        edgelist_gdf[column] for column in ("src", "dst", "val")
    )

    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, values)

    if auto_csr == 0:
        G.view_adj_list()
        G.view_transposed_adj_list()
    return G
def get_shortest_paths(edges_df, point_of_interest):
    """Run single-source shortest path from ``point_of_interest``.

    Parameters
    ----------
    edges_df
        Edge list with ``'src'``, ``'dst'`` and ``'time'`` columns; ``'time'``
        is used as the edge weight.
    point_of_interest
        Source vertex passed to ``cugraph.traversal.sssp``.

    Returns
    -------
    DataFrame with the columns ``['time', 'vertex']``, where ``'time'`` is
    the ``'distance'`` column of the sssp result.
    """
    G_gpu = cugraph.Graph()
    G_gpu.from_cudf_edgelist(
        edges_df, source='src', destination='dst', edge_attr='time'
    )

    shortest_paths = cugraph.traversal.sssp(G_gpu, point_of_interest)

    # Relabel by column name rather than by position: assigning to
    # ``.columns`` would silently swap the labels if sssp ever returned
    # its columns in a different order.
    shortest_paths = shortest_paths.rename(columns={'distance': 'time'})

    # The explicit selection drops 'predecessor' and fixes the output order.
    return shortest_paths[['time', 'vertex']]
def test_graph_init_with_multigraph():
    """
    Only a valid MultiGraph instance may be used to initialize a Graph:
    a NetworkX MultiGraph must raise TypeError, while cugraph multigraph
    objects must be accepted without any exception.
    """
    networkx_multigraph = nx.MultiGraph()
    with pytest.raises(TypeError):
        cugraph.Graph(m_graph=networkx_multigraph)

    gdf = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 3]})

    def with_edges(multigraph):
        # Load the shared edge list and hand the same object back.
        multigraph.from_cudf_edgelist(gdf, source="src", destination="dst")
        return multigraph

    cugraph.Graph(m_graph=with_edges(cugraph.MultiGraph()))

    # MultiDiGraph is deprecated, but should still work.
    cugraph.Graph(m_graph=with_edges(cugraph.MultiDiGraph()))
def generate_cugraph_graph_from_file(graph_file, directed=True,
                                     edgevals=False):
    """Read an edge list with ``read_csv_file`` and load it into a graph.

    Parameters
    ----------
    graph_file
        Path handed to ``read_csv_file``.
    directed : bool, default True
        Truthy selects ``cugraph.DiGraph``, otherwise ``cugraph.Graph``.
    edgevals : bool, default False
        When truthy, column ``'2'`` is loaded as the edge attribute.

    Returns
    -------
    The populated graph.
    """
    cu_M = read_csv_file(graph_file)

    if directed:
        G = cugraph.DiGraph()
    else:
        G = cugraph.Graph()

    load_options = {"source": '0', "destination": '1'}
    if edgevals:
        load_options["edge_attr"] = '2'
    G.from_cudf_edgelist(cu_M, **load_options)
    return G
def test_ktruss_subgraph_Graph(graph_file, nx_ground_truth):
    """Check the k-truss subgraph (k=5) against the NetworkX ground truth."""
    gc.collect()

    k = 5
    edges = utils.read_csv_file(graph_file)

    G = cugraph.Graph()
    G.from_cudf_edgelist(
        edges, source="0", destination="1", edge_attr="2"
    )

    compare_k_truss(cugraph.ktruss_subgraph(G, k), k, nx_ground_truth)
def test_has_node(graph_file):
    """Every vertex id present in the edge list must be found by has_node."""
    edges = utils.read_csv_file(graph_file)

    # Unique ids across the source and destination columns.
    all_endpoints = cudf.concat([edges["0"], edges["1"]])
    nodes = all_endpoints.unique()

    G = cugraph.Graph()
    G.from_cudf_edgelist(edges, source="0", destination="1")

    for vertex in nodes.values_host:
        assert G.has_node(vertex)
def test_transpose_from_adj_list(graph_file):
    """The transposed adjacency list must match the transposed CSR matrix."""
    csr = read_mtx_file(graph_file+'.mtx').tocsr()

    G = cugraph.Graph()
    G.add_adj_list(cudf.Series(csr.indptr), cudf.Series(csr.indices), None)
    G.add_transposed_adj_list()

    expected = csr.transpose().tocsr()
    t_offsets, t_indices = G.view_transposed_adj_list()

    assert compare_series(t_indices, expected.indices)
    assert compare_offsets(t_offsets, expected.indptr)
def cugraph_call(M, edgevals=False):
    """Return ``cugraph.triangles`` for a graph built from a sparse matrix.

    Parameters
    ----------
    M
        Matrix exposing ``tocoo()``; its COO rows and columns become the
        edge list.
    edgevals : bool, default False
        Unless this is exactly ``False``, the matrix data is passed as the
        edge values.
    """
    coo = M.tocoo()
    sources = cudf.Series(coo.row)
    destinations = cudf.Series(coo.col)
    weights = None if edgevals is False else cudf.Series(coo.data)

    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, weights)
    return cugraph.triangles(G)
def test_has_edge(graph_file):
    """Each loaded edge must be reported by has_edge in both directions."""
    edges = utils.read_csv_file(graph_file)
    # Keep only the rows where source <= destination.
    edges = edges[edges["0"] <= edges["1"]].reset_index(drop=True)

    G = cugraph.Graph()
    G.from_cudf_edgelist(edges, source="0", destination="1")

    for _, edge in edges.to_pandas().iterrows():
        src, dst = edge['0'], edge['1']
        assert G.has_edge(src, dst)
        assert G.has_edge(dst, src)
def createGraph(edgelist_gdf, createDiGraph, renumber, symmetrized):
    """Create a graph from the src/dst/val columns of ``edgelist_gdf``.

    Parameters
    ----------
    edgelist_gdf
        Edge list with ``"src"``, ``"dst"`` and ``"val"`` columns.
    createDiGraph
        Truthy creates a ``cugraph.DiGraph``; otherwise a ``cugraph.Graph``
        constructed with ``symmetrized=symmetrized``.
    renumber
        Forwarded to ``from_cudf_edgelist``.
    symmetrized
        Only used when a ``cugraph.Graph`` is created.

    Returns
    -------
    The populated graph.
    """
    G = (
        cugraph.DiGraph()
        if createDiGraph
        else cugraph.Graph(symmetrized=symmetrized)
    )
    G.from_cudf_edgelist(
        edgelist_gdf,
        source="src",
        destination="dst",
        edge_attr="val",
        renumber=renumber,
    )
    return G
def test_sorensen_two_hop_edge_vals(read_csv):
    """Run compare_sorensen_two_hop on graphs that carry edge values."""
    host_edges, gpu_edges = read_csv

    Gnx = nx.from_pandas_edgelist(
        host_edges,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.Graph(),
    )

    G = cugraph.Graph()
    G.from_cudf_edgelist(
        gpu_edges, source="0", destination="1", edge_attr="2"
    )

    compare_sorensen_two_hop(G, Gnx)
def cugraph_k_truss_subgraph(graph_file, k, directed):
    """Return the k-truss subgraph of the graph stored in ``graph_file``.

    ``directed`` selects between a DiGraph and a Graph so that the returned
    cugraph object can be compared with a NetworkX graph of the same type.
    """
    edges = utils.read_csv_file(graph_file)

    G = cugraph.DiGraph() if directed else cugraph.Graph()
    G.from_cudf_edgelist(
        edges, source='0', destination='1', edge_attr='2'
    )

    return cugraph.ktruss_subgraph(G, k)
def test_modularity_clustering(managed, pool, graph_file, partitions):
    """Compare the clustering score with a random assignment's score.

    The comparison is run twice: once on a graph built from an adjacency
    list and once on a graph built from an edge list. RMM is re-initialized
    first with the requested managed-memory and pool settings.
    """
    gc.collect()

    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    # Read in the graph in both CSR form and as an edge list.
    csr = utils.read_csv_for_nx(graph_file).tocsr()
    cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)

    G_adj = cugraph.Graph()
    G_adj.add_adj_list(cudf.Series(csr.indptr), cudf.Series(csr.indices))

    G_edge = cugraph.Graph()
    G_edge.add_edge_list(cu_M['0'], cu_M['1'])

    for G in (G_adj, G_edge):
        # Score for the partitioning versus the random assignment.
        _cu_vid, cu_score = cugraph_call(G, partitions)
        _rand_vid, rand_score = random_call(G, partitions)

        # The partitioning is expected to beat the random assignment.
        # NOTE(review): "beat" is expressed as a strictly lower score here;
        # confirm this matches the score convention of cugraph_call and
        # random_call.
        assert cu_score < rand_score
def test_view_edge_list_from_adj_list(graph_file):
    """The edge list view of an adjacency-list graph must match the COO."""
    csr = read_mtx_file(graph_file+'.mtx').tocsr()

    G = cugraph.Graph()
    G.add_adj_list(cudf.Series(csr.indptr), cudf.Series(csr.indices), None)
    viewed_src, viewed_dst = G.view_edge_list()

    # The COO form of the same matrix provides the expected rows/columns.
    coo = csr.tocoo()
    assert compare_series(coo.row, viewed_src)
    assert compare_series(coo.col, viewed_dst)
def test_jaccard_two_hop(read_csv):
    """Run compare_jaccard_two_hop on unweighted cugraph/NetworkX graphs."""
    host_edges, gpu_edges = read_csv

    Gnx = nx.from_pandas_edgelist(
        host_edges, source="0", target="1", create_using=nx.Graph()
    )

    G = cugraph.Graph()
    G.from_cudf_edgelist(gpu_edges, source="0", destination="1")

    compare_jaccard_two_hop(G, Gnx)
def calc_k_cores(graph_file):
    """Compute the k-core of ``graph_file`` with cugraph and with NetworkX.

    Returns
    -------
    tuple
        ``(ck, nk)``: the ``cugraph.k_core`` result and the ``nx.k_core``
        result. The NetworkX side is built as a DiGraph from the CSR form
        of ``utils.read_csv_for_nx``.
    """
    gpu_edges = utils.read_csv_file(graph_file)
    G = cugraph.Graph()
    G.add_edge_list(gpu_edges['0'], gpu_edges['1'])
    ck = cugraph.k_core(G)

    host_csr = utils.read_csv_for_nx(graph_file).tocsr()
    nk = nx.k_core(nx.DiGraph(host_csr))

    return ck, nk
def test_has_node(graph_file):
    """Every vertex id present in the edge list must be found by has_node."""
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    # Unique ids across the source and destination columns.
    nodes = cudf.concat([cu_M['0'], cu_M['1']]).unique()

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')

    # Iterate over a host copy of the ids instead of the cudf Series itself:
    # direct iteration over a cudf Series is not supported by current cudf.
    for n in nodes.values_host:
        assert G.has_node(n)
def cugraph_call(M, edgevals=False):
    """Return ``cugraph.triangles`` for a graph built from an edge table.

    Parameters
    ----------
    M
        Table providing the ``'0'`` and ``'1'`` columns, plus ``'weight'``
        when ``edgevals`` is ``True``.
    edgevals : bool, default False
        Only the exact value ``True`` loads the weights as edge attribute.
    """
    G = cugraph.Graph()

    cu_M = cudf.DataFrame()
    cu_M['src'] = cudf.Series(M['0'])
    cu_M['dst'] = cudf.Series(M['1'])

    load_options = {"source": 'src', "destination": 'dst'}
    if edgevals is True:
        cu_M['weights'] = cudf.Series(M['weight'])
        load_options["edge_attr"] = 'weights'

    G.from_cudf_edgelist(cu_M, **load_options)
    return cugraph.triangles(G)
def convert_from_nx(nxG, weight=None, do_renumber=True):
    """
    Convert a NetworkX Graph or DiGraph into a cugraph.Graph.

    nxG: NetworkX graph; a DiGraph produces a directed cugraph.Graph.
    weight: weight column name. Only used if nxG.is_weighted() is True
    do_renumber: forwarded as ``renumber`` to ``from_cudf_edgelist``.

    Raises TypeError when nxG is neither a NetworkX Graph nor a DiGraph.
    """
    # DiGraph is tested before Graph so a directed input is never treated
    # as an undirected one.
    directed_input = isinstance(nxG, nx.classes.digraph.DiGraph)
    if not directed_input and not isinstance(nxG, nx.classes.graph.Graph):
        raise TypeError(
            f"nxG must be either a NetworkX Graph or DiGraph, got {type(nxG)}")

    G = cugraph.Graph(directed=True) if directed_input else cugraph.Graph()

    # Pick the conversion helper and the column names it produces.
    if nx.is_weighted(nxG) is False:
        _gdf = convert_unweighted_to_gdf(nxG)
        src_col, dst_col, attr_col = "src", "dst", None
    elif weight is None:
        _gdf = convert_weighted_unnamed_to_gdf(nxG)
        src_col, dst_col, attr_col = "source", "target", 'weight'
    else:
        _gdf = convert_weighted_named_to_gdf(nxG, weight)
        src_col, dst_col, attr_col = "src", "dst", 'weight'

    G.from_cudf_edgelist(
        _gdf,
        source=src_col,
        destination=dst_col,
        edge_attr=attr_col,
        renumber=do_renumber,
    )
    return G
def test_dask_katz_centrality(dask_client):
    """Compare dask Katz centrality with NetworkX on karate.csv.

    ``alpha`` is set to ``1 / (largest out-degree + 1)`` for both runs, and
    per-vertex results may differ by at most ``1.1e-5``.
    """
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Derive alpha from the largest out-degree in the graph.
    out_degrees = dg.out_degree().compute()
    top_row = out_degrees.nlargest(n=1, columns="degree")
    largest_out_degree = top_row["degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    from cugraph.tests import utils
    import pandas as pd

    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)

    pdf = pd.DataFrame(nk.items(), columns=['vertex', 'katz_centrality'])
    exp_res = cudf.DataFrame(pdf)

    tol = 1.0e-05

    # Join on vertex so each row carries both scores side by side.
    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )
    local_scores = compare_res["katz_centrality_local"]
    dask_scores = compare_res["katz_centrality_dask"]

    err = sum(
        1
        for row in range(len(compare_res))
        if abs(local_scores.iloc[row] - dask_scores.iloc[row]) > tol * 1.1
    )
    assert err == 0