def test_add_edge_list_to_adj_list(managed, pool, graph_file):
    """Check that an edge list is exposed as the expected adjacency list.

    RMM is finalized and re-initialized with the requested ``managed`` and
    ``pool`` settings. The edge list read from ``graph_file`` is added to a
    ``cugraph.Graph`` and the adjacency list returned by ``view_adj_list`` is
    compared with the ``indptr`` / ``indices`` of the CSR matrix built from
    the same file.

    Raises:
        TypeError: if the reference matrix cannot be read or is not square.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    cu_M = utils.read_csv_file(graph_file)
    sources = cu_M['0']
    destinations = cu_M['1']

    M = utils.read_csv_for_nx(graph_file)
    # The None check has to come before .tocsr(); otherwise a failed read
    # surfaces as an AttributeError and this guard is never reached.
    if M is None:
        raise TypeError('Could not read the input graph')
    M = M.tocsr()
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')
    offsets_exp = M.indptr
    indices_exp = M.indices

    # cugraph add_edge_list to_adj_list call
    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, None)
    offsets_cu, indices_cu, values_cu = G.view_adj_list()

    assert compare_offsets(offsets_cu, offsets_exp)
    assert compare_series(indices_cu, indices_exp)
    # No edge values were supplied, so none are expected back.
    assert values_cu is None
def test_add_adj_list_to_edge_list(managed, pool, graph_file):
    """Check that an adjacency list is exposed as the expected edge list.

    RMM is finalized and re-initialized with the requested ``managed`` and
    ``pool`` settings. The CSR ``indptr`` / ``indices`` built from
    ``graph_file`` are added to a ``cugraph.Graph`` and the edge list
    returned by ``view_edge_list`` is compared with the COO rows / columns of
    the same matrix.

    Raises:
        TypeError: if the reference matrix cannot be read or is not square.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    # The None check has to come before .tocsr(); otherwise a failed read
    # surfaces as an AttributeError and this guard is never reached.
    if M is None:
        raise TypeError('Could not read the input graph')
    M = M.tocsr()
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')
    offsets = cudf.Series(M.indptr)
    indices = cudf.Series(M.indices)

    # Expected edge list: the COO form of the same matrix.
    M = M.tocoo()
    sources_exp = cudf.Series(M.row)
    destinations_exp = cudf.Series(M.col)

    # cugraph add_adj_list to_edge_list call
    G = cugraph.Graph()
    G.add_adj_list(offsets, indices, None)
    sources, destinations, values = G.view_edge_list()
    sources_cu = np.array(sources)
    destinations_cu = np.array(destinations)

    assert compare_series(sources_cu, sources_exp)
    assert compare_series(destinations_cu, destinations_exp)
    # No edge values were supplied, so none are expected back.
    assert values is None
def test_modularity_clustering(managed, pool, graph_file, partitions):
    """Compare the clustering score with that of a random assignment.

    RMM is finalized and re-initialized with the requested settings, a
    weighted graph is built from the edge list in ``graph_file``, and the
    score returned by ``cugraph_call`` must exceed the one returned by
    ``random_call`` for the same number of ``partitions``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    # Load the edge list and build the cugraph object from its columns.
    edge_frame = utils.read_csv_file(graph_file, read_weights_in_sp=False)
    src_col = edge_frame['0']
    dst_col = edge_frame['1']
    weight_col = edge_frame['2']
    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col, weight_col)

    # Score the computed partitioning and a random one on the same graph.
    cu_score = cugraph_call(graph, partitions)
    rand_score = random_call(graph, partitions)

    # The computed partitioning must score higher than the random one.
    assert cu_score > rand_score
def test_pagerank(managed, pool, graph_file, max_iter, tol, alpha,
                  personalization_perc, has_guess):
    """Compare the cugraph PageRank result with the NetworkX reference.

    RMM is finalized and re-initialized with the requested settings. The
    reference scores come from ``networkx_call``. When ``has_guess`` is 1 the
    reference scores are passed as the initial guess and ``max_iter`` is
    lowered to 5. Fewer than 1% of the vertices may differ by more than
    ``tol * 1.1``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    networkx_pr, networkx_prsn = networkx_call(M, max_iter, tol, alpha,
                                               personalization_perc)

    if has_guess == 1:
        # Start from the reference answer and allow only a few iterations.
        cu_nstart = cudify(networkx_pr)
        max_iter = 5
    else:
        cu_nstart = None
    cu_prsn = cudify(networkx_prsn)

    cu_M = utils.read_csv_file(graph_file)
    cugraph_pr = cugraph_call(cu_M, max_iter, tol, alpha, cu_prsn, cu_nstart)

    # Order the reference by vertex id so both results line up by position.
    reference = sorted(networkx_pr.items(), key=lambda item: item[0])
    assert len(cugraph_pr) == len(reference)

    # A mismatch counts only when both entries refer to the same vertex id.
    threshold = tol * 1.1
    err = sum(
        1
        for i in range(len(cugraph_pr))
        if abs(cugraph_pr[i][1] - reference[i][1]) > threshold
        and cugraph_pr[i][0] == reference[i][0]
    )
    print("Mismatches:", err)
    assert err < (0.01 * len(cugraph_pr))
def test_jaccard_edgevals(managed, pool, graph_file):
    """Compare Jaccard coefficients from cugraph (edgevals) and NetworkX.

    RMM is finalized and re-initialized with the requested settings. The
    coefficients returned by ``cugraph_call(..., edgevals=True)`` and
    ``networkx_call`` must have the same length, and no pair with identical
    endpoints may differ by more than ``1.1e-06``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)
    cu_src, cu_dst, cu_coeff = cugraph_call(cu_M, edgevals=True)
    nx_src, nx_dst, nx_coeff = networkx_call(M)

    tol = 1.0e-06
    assert len(cu_coeff) == len(nx_coeff)

    # Count entries that describe the same edge but disagree on the value.
    err = sum(
        1
        for i in range(len(cu_coeff))
        if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1
        and cu_src[i] == nx_src[i]
        and cu_dst[i] == nx_dst[i]
    )
    print("Mismatches: %d" % err)
    assert err == 0
def test_jaccard_two_hop_edge_vals(managed, pool, graph_file):
    """Compare Jaccard coefficients on two-hop pairs with NetworkX.

    RMM is finalized and re-initialized with the requested settings. A
    weighted graph is built from the CSR form of ``graph_file``, its two-hop
    neighbor pairs are taken from ``get_two_hop_neighbors``, and
    ``cugraph.jaccard`` on those pairs must match ``nx.jaccard_coefficient``
    to within ``1.0e-6``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file).tocsr()
    Gnx = nx.DiGraph(M).to_undirected()

    G = cugraph.Graph()
    G.add_adj_list(cudf.Series(M.indptr),
                   cudf.Series(M.indices),
                   cudf.Series(M.data))
    pairs = G.get_two_hop_neighbors()

    # Hand the same vertex pairs to NetworkX as (first, second) tuples.
    nx_pairs = [(pairs['first'][i], pairs['second'][i])
                for i in range(len(pairs))]
    nx_coeff = [p for _, _, p in nx.jaccard_coefficient(Gnx, nx_pairs)]

    df = cugraph.jaccard(G, pairs['first'], pairs['second'])

    assert len(nx_coeff) == len(df)
    for i in range(len(df)):
        assert abs(nx_coeff[i] - df['jaccard_coeff'][i]) < 1.0e-6
def test_louvain_with_edgevals(managed, pool, graph_file):
    """Compare the cugraph Louvain result (edgevals) with the reference.

    RMM is finalized and re-initialized with the requested settings. The
    partition returned by ``cugraph_call(..., edgevals=True)`` must cover the
    same vertices as the one from ``networkx_call``. Its reported modularity
    must exceed 82% of the reference modularity and agree with the modularity
    recomputed by ``community.modularity`` to within ``1e-4``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)
    cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True)
    nx_parts = networkx_call(M)

    # Recompute both modularity scores on the same NetworkX graph.
    Gnx = nx.Graph(M)
    vertex_col = cu_parts['vertex']
    partition_col = cu_parts['partition']
    cu_map = {0: 0}
    for row in range(len(cu_parts)):
        cu_map[vertex_col[row]] = partition_col[row]
    assert set(nx_parts.keys()) == set(cu_map.keys())

    cu_mod_nx = community.modularity(cu_map, Gnx)
    nx_mod = community.modularity(nx_parts, Gnx)

    assert len(cu_parts) == len(nx_parts)
    assert cu_mod > (.82 * nx_mod)
    print(cu_mod)
    print(cu_mod_nx)
    print(nx_mod)
    assert abs(cu_mod - cu_mod_nx) < .0001
def test_renumber_files(managed, pool, graph_file):
    """Check that ``cugraph.renumber`` can be inverted via its numbering map.

    RMM is finalized and re-initialized with the requested settings. Every
    vertex id read from ``graph_file`` is shifted by 1000 and renumbered.
    Looking each renumbered id up in ``numbering`` and subtracting the shift
    must give back the original id.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M.row)
    destinations = cudf.Series(M.col)

    # Shift every id so the input to renumber is not already zero-based.
    translate = 1000
    shifted_src = cudf.Series([vid + translate for vid in sources])
    shifted_dst = cudf.Series([vid + translate for vid in destinations])

    src, dst, numbering = cugraph.renumber(shifted_src, shifted_dst)

    for idx in range(len(sources)):
        assert sources[idx] == (numbering[src[idx]] - translate)
        assert destinations[idx] == (numbering[dst[idx]] - translate)
def test_strong_cc(managed, pool, graph_file):
    """Compare strongly connected components from cugraph and NetworkX.

    RMM is finalized and re-initialized with the requested settings. Both
    implementations must report the same number of components and the same
    sorted list of component sizes.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    netx_labels = networkx_strong_call(M)

    cu_M = utils.read_csv_file(graph_file)
    cugraph_labels = cugraph_strong_call(cu_M)

    # NetworkX yields a list of components, each a collection of vertex
    # indices, whereas cugraph yields one component label per vertex.
    assert len(netx_labels) == get_n_uniqs(cugraph_labels)

    # Component sizes, smallest first, must agree between the two results.
    nx_sizes = [len(component)
                for component in sorted(netx_labels, key=len)]
    cg_sizes = sorted(get_uniq_counts(cugraph_labels))
    assert nx_sizes == cg_sizes
def test_symmetrize_unweighted(managed, pool, graph_file):
    """Check ``cugraph.symmetrize`` on an unweighted edge list.

    RMM is finalized and re-initialized with the requested settings. The
    edge list in ``graph_file + '.csv'`` is symmetrized and the original and
    symmetrized columns are handed to ``compare`` with no weights.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm.initialize()
    assert rmm.is_initialized()

    cu_M = utils.read_csv_file(graph_file + '.csv')
    sym_sources, sym_destinations = cugraph.symmetrize(cu_M['0'], cu_M['1'])

    # The check is done with join logic on data frames (DataFrame.merge).
    # The symmetrized output should contain every edge of the input, so
    # joining the input with the output should preserve the input length.
    # NOTE(review): the join is presumably done inside compare(); confirm.
    orig_df = cudf.DataFrame()
    orig_df['src'] = cu_M['0']
    orig_df['dst'] = cu_M['1']

    sym_df = cudf.DataFrame()
    sym_df['src_s'] = sym_sources
    sym_df['dst_s'] = sym_destinations

    compare(orig_df['src'], orig_df['dst'], None,
            sym_df['src_s'], sym_df['dst_s'], None)
def test_sssp(managed, pool, graph_file, source):
    """Compare single-source shortest paths from cugraph and NetworkX.

    RMM is finalized and re-initialized with the requested settings. For each
    vertex in the cugraph result, a distance other than ``max_val`` must
    equal the NetworkX distance and be one more than its predecessor's
    distance (except at ``source``). A vertex whose distance equals
    ``max_val`` must be absent from the NetworkX result.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)
    cu_paths, max_val = cugraph_call(cu_M, source)
    nx_paths, Gnx = networkx_call(M, source)

    err = 0
    for vid in cu_paths:
        dist = cu_paths[vid][0]

        # NOTE : If distance type is float64 then cu_paths[vid][0]
        # should be compared against np.finfo(np.float64).max)
        if dist == max_val:
            # Unreachable for cugraph: NetworkX must not list it either.
            if vid in nx_paths.keys():
                err += 1
            continue

        # Reachable vertex: the distance must match the reference.
        if dist != nx_paths[vid]:
            err += 1

        # check pred dist + 1 = current dist (since unweighted)
        pred = cu_paths[vid][1]
        if vid != source and cu_paths[pred][0] + 1 != dist:
            err += 1

    assert err == 0
def test_overlap_edge_vals(managed, pool, graph_file):
    """Compare overlap coefficients (edgevals) with the CPU reference.

    RMM is finalized and re-initialized with the requested settings. The
    two-hop neighbor pairs of the graph are scored by
    ``cugraph_call(..., edgevals=True)`` and by ``cpu_call``. The results
    must have equal length, agree on NaN entries, and otherwise differ by
    less than ``1.0e-6``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file).tocsr().sorted_indices()
    cu_M = utils.read_csv_file(graph_file)

    # Unweighted graph used only to enumerate the two-hop vertex pairs.
    G = cugraph.Graph()
    G.add_adj_list(cudf.Series(M.indptr), cudf.Series(M.indices), None)
    pairs = G.get_two_hop_neighbors()

    cu_coeff = cugraph_call(cu_M, pairs['first'], pairs['second'],
                            edgevals=True)
    cpu_coeff = cpu_call(M, pairs['first'], pairs['second'])

    assert len(cu_coeff) == len(cpu_coeff)
    for idx in range(len(cu_coeff)):
        expected = cpu_coeff[idx]
        actual = cu_coeff[idx]
        if np.isnan(expected):
            assert np.isnan(actual)
        elif np.isnan(actual):
            # A NaN here never equals a non-NaN reference, so this fails.
            assert expected == actual
        else:
            assert abs(expected - actual) < 1.0e-6
def initialize_rmm():
    """Re-initialize RMM with managed memory switched on.

    RMM is finalized, ``use_managed_memory`` is set to ``True`` on the RMM
    config, RMM is initialized again, and the resulting setting is printed.

    The imports are local to the function.
    NOTE(review): the original text had ``client.run(initialize_rmm)``
    spliced into the middle of the attribute name. It looks like a call meant
    to sit outside this function (presumably to run it through a client
    object), so it is not reproduced here; confirm against the caller.
    """
    import rmm
    from rmm import rmm_config as rmm_cfg
    import cudf  # noqa: F401  kept from the original; unused in this body

    rmm.finalize()
    rmm_cfg.use_managed_memory = True
    rmm.initialize()
    print(rmm_cfg.use_managed_memory)
def test_rmm_modes(dtype, nelem, managed, pool):
    """Run ``array_tester`` under the requested RMM configuration.

    RMM is finalized, configured with ``managed`` and ``pool``, and
    initialized again before ``array_tester(dtype, nelem)`` is called.
    """
    rmm.finalize()

    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool

    rmm.initialize()
    assert rmm.is_initialized()

    array_tester(dtype, nelem)
def test_core_number(managed, pool, graph_file):
    """Check that the cugraph and NetworkX core numbers are equal.

    RMM is finalized and re-initialized with the requested settings, then the
    ``cu_core_number`` and ``nx_core_number`` columns returned by
    ``calc_core_number`` are compared with ``equals``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm.initialize()
    assert rmm.is_initialized()

    core_numbers = calc_core_number(graph_file)
    cu_column = core_numbers['cu_core_number']
    assert cu_column.equals(core_numbers['nx_core_number'])
def test_core_number(managed, pool, graph_file):
    """Check that the cugraph and NetworkX k-core results have equal edges.

    RMM is finalized and re-initialized with the requested settings, then the
    two results returned by ``calc_k_cores`` are passed to ``compare_edges``.

    NOTE(review): despite its name this test exercises ``calc_k_cores``. If
    another ``test_core_number`` is defined in the same module, the later
    definition replaces the earlier one; confirm and consider renaming.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm.initialize()
    assert rmm.is_initialized()

    k_core_pair = calc_k_cores(graph_file)
    cu_kcore, nx_kcore = k_core_pair
    assert compare_edges(cu_kcore, nx_kcore)
def test_networkx_compatibility(managed, pool, graph_file):
    """Check ``cugraph.from_cudf_edgelist`` against NetworkX.

    RMM is finalized and re-initialized with the requested settings. The
    edge list from ``graph_file`` is loaded into a pandas frame and its cudf
    copy. Graphs are then built with both libraries, first with the weight
    column and then without it, and compared with ``compare_graphs``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    # test from_cudf_edgelist()
    M = utils.read_csv_for_nx(graph_file)
    df = pd.DataFrame({
        'source': pd.Series(M.row),
        'target': pd.Series(M.col),
        'weight': pd.Series(M.data),
    })
    gdf = cudf.from_pandas(df)

    # First pass keeps the edge weights, second pass drops them.
    variants = (
        ({'edge_attr': ['weight']}, {'weight': 'weight'}),
        ({}, {}),
    )
    for nx_extra, cu_extra in variants:
        # cugraph.Graph() is implicitly a directed graph right at this
        # moment, so nx.DiGraph() is the matching NetworkX type.
        Gnx = nx.from_pandas_edgelist(df, source='source', target='target',
                                      create_using=nx.DiGraph, **nx_extra)
        G = cugraph.from_cudf_edgelist(gdf, source='source',
                                       target='target', **cu_extra)

        assert compare_graphs(Gnx, G)

        Gnx.clear()
        G.clear()
def test_grmat_gen(managed, pool):
    """Smoke-test ``cugraph.grmat_gen`` under the requested RMM settings.

    RMM is finalized and re-initialized, then the generator is called with a
    fixed argument string. The four returned values are unpacked but not
    checked further, so the test passes if the call completes.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    generator_args = (
        'grmat --rmat_scale=2 --rmat_edgefactor=2 --device=0 --normalized'
        ' --quiet'
    )
    vertices, edges, sources, destinations = cugraph.grmat_gen(
        generator_args)
def test_add_edge_or_adj_list_after_add_edge_or_adj_list(
        managed, pool, graph_file):
    """Check that a second representation cannot be added to a graph.

    RMM is finalized and re-initialized with the requested settings. Once a
    graph holds an edge list, and later once it holds an adjacency list,
    adding either an edge list or an adjacency list must raise a ``GDFError``
    whose error code is ``GDF_INVALID_API_CALL``.

    Raises:
        TypeError: if the reference matrix cannot be read or is not square.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    # Validate before the first attribute access; otherwise a failed read
    # surfaces as an AttributeError and this guard is never reached.
    if M is None:
        raise TypeError('Could not read the input graph')
    sources = cudf.Series(M.row)
    destinations = cudf.Series(M.col)

    M = M.tocsr()
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')
    offsets = cudf.Series(M.indptr)
    indices = cudf.Series(M.indices)

    G = cugraph.Graph()

    # If cugraph has at least one graph representation, adding a new graph
    # should fail to prevent a single graph object storing two different
    # graphs.

    # If cugraph has a graph edge list, adding a new graph should fail.
    G.add_edge_list(sources, destinations, None)
    with pytest.raises(libcudf.GDFError.GDFError) as excinfo:
        G.add_edge_list(sources, destinations, None)
    assert excinfo.value.errcode.decode() == 'GDF_INVALID_API_CALL'
    with pytest.raises(libcudf.GDFError.GDFError) as excinfo:
        G.add_adj_list(offsets, indices, None)
    assert excinfo.value.errcode.decode() == 'GDF_INVALID_API_CALL'
    G.delete_edge_list()

    # If cugraph has a graph adjacency list, adding a new graph should fail.
    # Seed the graph with the CSR offsets/indices; the edge-list columns are
    # not a valid adjacency list.
    G.add_adj_list(offsets, indices, None)
    with pytest.raises(libcudf.GDFError.GDFError) as excinfo:
        G.add_edge_list(sources, destinations, None)
    assert excinfo.value.errcode.decode() == 'GDF_INVALID_API_CALL'
    with pytest.raises(libcudf.GDFError.GDFError) as excinfo:
        G.add_adj_list(offsets, indices, None)
    assert excinfo.value.errcode.decode() == 'GDF_INVALID_API_CALL'
    G.delete_adj_list()
def test_triangles_edge_vals(managed, pool, graph_file):
    """Compare triangle counts from cugraph (edgevals) and NetworkX.

    RMM is finalized and re-initialized with the requested settings, then the
    results of ``cugraph_call(M, edgevals=True)`` and ``networkx_call(M)``
    must be equal.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    matrix = utils.read_csv_for_nx(graph_file)

    cu_count = cugraph_call(matrix, edgevals=True)
    nx_count = networkx_call(matrix)

    assert cu_count == nx_count
def test_symmetrize_df(managed, pool, graph_file):
    """Check ``cugraph.symmetrize_df`` on a weighted edge list.

    RMM is finalized and re-initialized with the requested settings. The
    frame read from ``graph_file + '.csv'`` is symmetrized on columns ``'0'``
    and ``'1'``, and the original and symmetrized columns are handed to
    ``compare``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm.initialize()
    assert rmm.is_initialized()

    cu_M = utils.read_csv_file(graph_file + '.csv')
    sym_df = cugraph.symmetrize_df(cu_M, '0', '1')

    # Source, destination and weight columns: original first, then result.
    compare(cu_M['0'], cu_M['1'], cu_M['2'],
            sym_df['0'], sym_df['1'], sym_df['2'])
def test_katz_centrality(managed, pool, graph_file):
    """Check that cugraph and NetworkX agree on the top-10 Katz vertices.

    RMM is finalized and re-initialized with the requested settings, then the
    ``topKVertices`` selections for the ``nx_katz`` and ``cu_katz`` columns
    of ``calc_katz(graph_file)`` are compared with ``equals``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm.initialize()
    assert rmm.is_initialized()

    katz_scores = calc_katz(graph_file)

    top_k = 10
    topKNX = topKVertices(katz_scores, 'nx_katz', top_k)
    topKCU = topKVertices(katz_scores, 'cu_katz', top_k)

    assert topKNX.equals(topKCU)
def test_subgraph_extraction(managed, pool, graph_file):
    """Compare the subgraph on vertices 0, 1 and 17 between libraries.

    RMM is finalized and re-initialized with the requested settings, then the
    results of ``cugraph_call`` and ``nx_call`` for the same vertex array are
    passed to ``compare_edges``.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)

    # Fixed set of vertices to extract, stored as 32-bit integers.
    verts = np.array([0, 1, 17], dtype=np.int32)

    cu_sg = cugraph_call(M, verts)
    nx_sg = nx_call(M, verts)

    assert compare_edges(cu_sg, nx_sg, verts)
def test_degree_functionality(managed, pool, graph_file):
    """Compare in-, out- and total degrees from cugraph and NetworkX.

    RMM is finalized and re-initialized with the requested settings. A
    weighted cugraph graph and an ``nx.DiGraph`` are built from
    ``graph_file``, and every row of the three cugraph degree frames must
    equal the NetworkX value looked up at the same index.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.Graph()
    G.add_edge_list(cu_M['0'], cu_M['1'], cu_M['2'])
    Gnx = nx.DiGraph(M)

    df_in_degree = G.in_degree()
    df_out_degree = G.out_degree()
    df_degree = G.degree()

    nx_in_degree = Gnx.in_degree()
    nx_out_degree = Gnx.out_degree()
    nx_degree = Gnx.degree()

    # Count the disagreements per degree kind in a single pass.
    err_in_degree = err_out_degree = err_degree = 0
    for row in range(len(df_degree)):
        if df_in_degree['degree'][row] != nx_in_degree[row]:
            err_in_degree += 1
        if df_out_degree['degree'][row] != nx_out_degree[row]:
            err_out_degree += 1
        if df_degree['degree'][row] != nx_degree[row]:
            err_degree += 1

    assert err_in_degree == 0
    assert err_out_degree == 0
    assert err_degree == 0
def initialize_rmm_pool():
    """Configure RMM for pooled, unmanaged memory and initialize it.

    Sets the pool allocator on, the initial pool size to ``2 << 30`` bytes,
    managed memory off and logging on.

    Returns:
        Whatever ``rmm.initialize()`` returns.
    """
    import rmm
    from rmm import rmm_config

    # Allocate from a pool of 2 GiB; the default is 1/2 total GPU memory.
    rmm_config.use_pool_allocator = True
    rmm_config.initial_pool_size = 2 << 30

    # Managed memory already defaults to False; set explicitly for clarity.
    rmm_config.use_managed_memory = False

    rmm_config.enable_logging = True

    status = rmm.initialize()
    return status
def test_wjaccard(managed, pool, graph_file):
    """Compare weighted Jaccard coefficients from cugraph and the reference.

    RMM is finalized and re-initialized with the requested settings, then
    every coefficient returned by ``cugraph_call`` must be within ``1.0e-6``
    of the value at the same position in the ``networkx_call`` result.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    # Both results are used below, so no lint suppression is needed here.
    cu_coeff = cugraph_call(cu_M)
    nx_coeff = networkx_call(M)

    for idx in range(len(cu_coeff)):
        assert abs(nx_coeff[idx] - cu_coeff[idx]) < 1.0e-6
def test_modularity_clustering(managed, pool, graph_file, partitions):
    """Compare the clustering score with that of a random assignment.

    RMM is finalized and re-initialized with the requested settings. The
    same comparison is run on a graph built from an adjacency list and on a
    graph built from an edge list. In both cases the score from
    ``cugraph_call`` must be lower than the score from ``random_call``.

    NOTE(review): the original comments said the partitioning should have
    "better modularity", yet the assertion requires a *lower* score. Confirm
    which metric ``cugraph_call`` returns before changing either. Also, if
    another ``test_modularity_clustering`` is defined in the same module,
    the later definition replaces the earlier one.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    # Read the graph both as a CSR matrix and as an edge-list frame.
    M = utils.read_csv_for_nx(graph_file).tocsr()
    cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)

    G_adj = cugraph.Graph()
    G_adj.add_adj_list(cudf.Series(M.indptr), cudf.Series(M.indices))

    G_edge = cugraph.Graph()
    G_edge.add_edge_list(cu_M['0'], cu_M['1'])

    # Adjacency-list graph: computed partitioning versus random assignment.
    cu_vid, cu_score = cugraph_call(G_adj, partitions)
    rand_vid, rand_score = random_call(G_adj, partitions)
    assert cu_score < rand_score

    # Edge-list graph: the same comparison.
    cu_vid, cu_score = cugraph_call(G_edge, partitions)
    rand_vid, rand_score = random_call(G_edge, partitions)
    assert cu_score < rand_score
def test_filter_unreachable(managed, pool, graph_file, source):
    """Check that ``cugraph.filter_unreachable`` drops max-distance rows.

    RMM is finalized and re-initialized with the requested settings. SSSP is
    run from ``source`` on the graph in ``graph_file``, and the filtered
    frame must be non-empty and contain no row whose distance equals the
    maximum value of the distance dtype.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    cu_M = utils.read_csv_file(graph_file)

    # Device data
    sources = cu_M['0']
    destinations = cu_M['1']
    print('sources size = ' + str(len(sources)))
    print('destinations size = ' + str(len(destinations)))

    # cugraph SSSP call
    G = cugraph.Graph()
    G.add_edge_list(sources, destinations)

    print('cugraph Solving... ')
    started = time.time()
    df = cugraph.sssp(G, source)
    elapsed = time.time() - started
    print('Time : '+str(elapsed))

    reachable_df = cugraph.filter_unreachable(df)

    # ``inf`` is read by name through the "@inf" reference in the query
    # string, so the local must keep exactly this name.
    if np.issubdtype(df['distance'].dtype, np.integer):
        inf = np.iinfo(reachable_df['distance'].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0
    elif np.issubdtype(df['distance'].dtype, np.inexact):
        inf = np.finfo(reachable_df['distance'].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0

    assert len(reachable_df) != 0
def test_transpose_from_adj_list(managed, pool, graph_file):
    """Check the transposed adjacency list against SciPy's transpose.

    RMM is finalized and re-initialized with the requested settings. The CSR
    form of ``graph_file`` is added to a graph, the transposed adjacency
    list is generated, and its offsets / indices must match the CSR form of
    the transposed matrix. No values are expected back.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file).tocsr()

    G = cugraph.Graph()
    G.add_adj_list(cudf.Series(M.indptr), cudf.Series(M.indices), None)
    G.add_transposed_adj_list()

    # Reference: transpose on the host and convert back to CSR.
    expected = M.transpose().tocsr()
    toff, tind, tval = G.view_transposed_adj_list()

    assert compare_series(tind, expected.indices)
    assert compare_offsets(toff, expected.indptr)
    assert tval is None
def test_bfs(managed, pool, graph_file):
    """Compare BFS from vertex 0 between the baseline and cugraph.

    RMM is finalized and re-initialized with the requested settings. The
    vertex ids and distances returned by ``base_call`` and ``cugraph_call``
    must have equal length and match position by position.
    """
    gc.collect()
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    start_vertex = 0
    base_vid, base_dist = base_call(M, start_vertex)
    cugraph_vid, cugraph_dist = cugraph_call(cu_M, start_vertex)

    # Both traversals must report the same vertices at the same distances.
    assert len(base_dist) == len(cugraph_dist)
    for pos in range(len(cugraph_dist)):
        assert base_vid[pos] == cugraph_vid[pos]
        assert base_dist[pos] == cugraph_dist[pos]