def cugraph_Call(M, source):
    """Run cuGraph SSSP over the edges held by ``M`` and list the distances.

    Parameters
    ----------
    M : object exposing ``row`` and ``col``
        Edge container; ``M.row`` supplies the edge sources and ``M.col`` the
        edge destinations (presumably a SciPy COO matrix -- TODO confirm
        against the caller).
    source : int
        Vertex passed to ``cugraph.sssp`` as the starting point.

    Returns
    -------
    list of tuple
        ``(position, distance)`` pairs, one for each entry of the
        ``'distance'`` column returned by ``cugraph.sssp``, in column order.
    """
    # Device data: wrap both endpoint arrays in cudf Series.
    src_col = cudf.Series(M.row)
    dst_col = cudf.Series(M.col)

    print('sources size = ' + str(len(src_col)))
    print('destinations size = ' + str(len(dst_col)))

    # cugraph SSSP call; no edge values are supplied (third argument is None).
    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col, None)

    print('cugraph Solving... ')
    started = time.time()
    sssp_out = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print('Time : ' + str(elapsed))

    return list(enumerate(sssp_out['distance']))
def cugraph_call(cu_M, source, edgevals=False):
    """Solve SSSP with cuGraph on a directed graph built from ``cu_M``.

    Parameters
    ----------
    cu_M : cudf.DataFrame
        Edge list with source column ``'0'``, destination column ``'1'`` and,
        when ``edgevals`` is true, weight column ``'2'``.
    source : int
        Vertex passed to ``cugraph.sssp`` as the starting point.
    edgevals : bool, optional
        Only the literal ``True`` makes column ``'2'`` the edge attribute.

    Returns
    -------
    tuple
        ``(result, max_val)``: ``result`` maps each vertex to a
        ``(distance, predecessor)`` pair, and ``max_val`` is the largest value
        representable by the distance column's dtype (presumably the marker
        callers compare against for unreachable vertices -- TODO confirm).
    """
    edge_kwargs = {'source': '0', 'destination': '1'}
    if edgevals is True:
        edge_kwargs['edge_attr'] = '2'

    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, **edge_kwargs)

    print('sources size = ' + str(len(cu_M['0'])))
    print('destinations size = ' + str(len(cu_M['1'])))

    print('cugraph Solving... ')
    started = time.time()
    df = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print('Cugraph Time : ' + str(elapsed))

    # Integer and floating dtypes expose their limits through different
    # numpy helpers.
    dist_dtype = df['distance'].dtype
    limits = np.iinfo if np.issubdtype(dist_dtype, np.integer) else np.finfo
    max_val = limits(dist_dtype).max

    vertices = df['vertex'].to_array()
    dists = df['distance'].to_array()
    preds = df['predecessor'].to_array()
    result = {v: (d, p) for v, d, p in zip(vertices, dists, preds)}
    return result, max_val
def get_shortest_dists(self, poi: namedtuple) -> None:
    """
    Use `cugraph.sssp` to find shortest paths from one POI to postcodes

    Optionally replaces ``self.graph`` with a sub graph for this POI, runs
    SSSP from ``poi.node_id``, keeps only rows whose vertex is listed in
    ``self.postcode_ids`` and merges them with any results already stored in
    ``self.log_file``. For each vertex only the row with the smallest
    distance is kept. The merged table is written back to ``self.log_file``
    as CSV, which is what lets an interrupted run pick up earlier results.

    Parameters
    ----------
    poi : namedtuple
        Single POI created from `df.itertuples()`; must expose ``node_id``
    """
    if self.buffer:
        self.graph = self.create_sub_graph(poi=poi)

    sssp_result = cugraph.sssp(self.graph, source=poi.node_id)
    shortest_paths: cudf.DataFrame = cugraph.filter_unreachable(sssp_result)

    is_postcode = shortest_paths.vertex.isin(self.postcode_ids)
    pc_dist = shortest_paths[is_postcode]

    # Tag every row of this run with a running POI counter.
    self.idx += 1
    pc_dist["idx"] = self.idx

    keep_cols = ["vertex", "distance", "idx"]
    self.distances = (
        cudf.read_csv(self.log_file).append(pc_dist)
        if self.log_file.exists()
        else pc_dist[keep_cols]
    )

    # Sorting by distance first means drop_duplicates retains the shortest
    # entry for each vertex.
    by_distance = self.distances.sort_values("distance")
    nearest = by_distance.drop_duplicates("vertex").reset_index()
    self.distances = nearest[keep_cols]
    self.distances.to_csv(self.log_file, index=False)
def cugraph_call(cu_M, source, edgevals=False):
    """Run cuGraph SSSP on the edge list ``cu_M`` and list the distances.

    Parameters
    ----------
    cu_M : cudf.DataFrame
        Edge list with source column ``'0'``, destination column ``'1'`` and,
        when edge values are requested, value column ``'2'``.
    source : int
        Vertex passed to ``cugraph.sssp`` as the starting point.
    edgevals : bool, optional
        Only the literal ``False`` disables edge values; anything else makes
        column ``'2'`` the edge values.

    Returns
    -------
    list of tuple
        ``(position, distance)`` pairs, one per entry of the ``'distance'``
        column returned by ``cugraph.sssp``, in column order.
    """
    # Device data
    src_col = cu_M['0']
    dst_col = cu_M['1']
    weights = None if edgevals is False else cu_M['2']

    print('sources size = ' + str(len(src_col)))
    print('destinations size = ' + str(len(dst_col)))

    # cugraph SSSP call
    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col, weights)

    print('cugraph Solving... ')
    started = time.time()
    sssp_out = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print('Time : ' + str(elapsed))

    # Pull the distances out as an array before pairing them with positions.
    return list(enumerate(sssp_out['distance'].to_array()))
def cugraph_call(cu_M, source, edgevals=False):
    """Run cuGraph SSSP on ``cu_M`` and map vertices to their results.

    Parameters
    ----------
    cu_M : cudf.DataFrame
        Edge list with source column ``'0'``, destination column ``'1'`` and,
        when edge values are requested, value column ``'2'``.
    source : int
        Vertex passed to ``cugraph.sssp`` as the starting point.
    edgevals : bool, optional
        Only the literal ``False`` disables edge values; anything else makes
        column ``'2'`` the edge values.

    Returns
    -------
    dict
        Maps each vertex to a ``(distance, predecessor)`` pair taken from the
        frame returned by ``cugraph.sssp``.
    """
    # Device data
    src_col = cu_M['0']
    dst_col = cu_M['1']
    weights = None if edgevals is False else cu_M['2']

    print('sources size = ' + str(len(src_col)))
    print('destinations size = ' + str(len(dst_col)))

    # cugraph SSSP call
    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col, weights)

    print('cugraph Solving... ')
    started = time.time()
    df = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print('Time : ' + str(elapsed))

    vertices = df['vertex'].to_array()
    dists = df['distance'].to_array()
    preds = df['predecessor'].to_array()
    return {v: (d, p) for v, d, p in zip(vertices, dists, preds)}
def test_filter_unreachable(graph_file, source):
    """Check the frame returned by ``cugraph.filter_unreachable``.

    Builds a weighted directed graph from ``graph_file``, runs SSSP from
    ``source`` and asserts that the filtered result is non-empty and, for
    integer or inexact distance dtypes, has no row whose distance equals the
    dtype's maximum value.
    """
    gc.collect()

    cu_M = utils.read_csv_file(graph_file)
    edge_count = len(cu_M)
    print("sources size = " + str(edge_count))
    print("destinations size = " + str(edge_count))

    # cugraph SSSP call
    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    print("cugraph Solving... ")
    started = time.time()
    df = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print("Time : " + str(elapsed))

    reachable_df = cugraph.filter_unreachable(df)

    # Choose the numpy limits helper that matches the distance dtype; other
    # dtypes skip the max-value check entirely.
    dist_dtype = df["distance"].dtype
    if np.issubdtype(dist_dtype, np.integer):
        limits = np.iinfo
    elif np.issubdtype(dist_dtype, np.inexact):
        limits = np.finfo
    else:
        limits = None

    if limits is not None:
        # `inf` looks unused to linters; the query string refers to it
        # through "@inf", so the name must stay as is.
        inf = limits(reachable_df["distance"].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0

    assert len(reachable_df) != 0
def cugraph_call(cu_M, source, edgevals=False):
    """Solve SSSP with cuGraph on a directed graph built from ``cu_M``.

    Parameters
    ----------
    cu_M : cudf.DataFrame
        Edge list with source column ``"0"``, destination column ``"1"`` and,
        when ``edgevals`` is true, weight column ``"2"``.
    source : int
        Vertex passed to ``cugraph.sssp`` as the starting point.
    edgevals : bool, optional
        Only the literal ``True`` makes column ``"2"`` the edge attribute.

    Returns
    -------
    tuple
        ``(result, max_val)``: ``result`` maps each vertex to a
        ``(distance, predecessor)`` pair, and ``max_val`` is the largest value
        representable by the distance column's dtype.
    """
    edge_kwargs = {"source": "0", "destination": "1"}
    if edgevals is True:
        edge_kwargs["edge_attr"] = "2"

    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, **edge_kwargs)

    print("sources size = " + str(len(cu_M["0"])))
    print("destinations size = " + str(len(cu_M["1"])))

    print("cugraph Solving... ")
    started = time.time()
    df = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print("Cugraph Time : " + str(elapsed))

    # Integer dtypes report their limits via np.iinfo, everything else is
    # treated as floating point and goes through np.finfo.
    dist_dtype = df["distance"].dtype
    if np.issubdtype(dist_dtype, np.integer):
        max_val = np.iinfo(dist_dtype).max
    else:
        max_val = np.finfo(dist_dtype).max

    vertices = df["vertex"].to_array()
    dists = df["distance"].to_array()
    preds = df["predecessor"].to_array()
    result = {v: (d, p) for v, d, p in zip(vertices, dists, preds)}
    return result, max_val
def test_sssp_data_type_conversion(managed, pool, graph_file, source):
    """Compare cuGraph SSSP with NetworkX when the edge weights are int32.

    Reinitialises RMM with the requested allocator settings, casts the weight
    column to ``int32`` on both sides, runs SSSP from ``source`` with cuGraph
    and Dijkstra with NetworkX, then counts disagreements; the test passes
    only when there are none.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )
    assert rmm.is_initialized()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    # cugraph call with int32 weights
    cu_M['2'] = cu_M['2'].astype(np.int32)
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    # the weights stored on the graph must still be int32
    assert G.edgelist.edgelist_df['weights'].dtype == np.int32

    df = cugraph.sssp(G, source)
    max_val = np.finfo(df['distance'].dtype).max

    vertices = df['vertex'].to_array()
    dists = df['distance'].to_array()
    preds = df['predecessor'].to_array()
    cu_paths = {v: (d, p) for v, d, p in zip(vertices, dists, preds)}

    # networkx call with int32 weights
    M['weight'] = M['weight'].astype(np.int32)
    Gnx = nx.from_pandas_edgelist(
        M,
        source='0',
        target='1',
        edge_attr='weight',
        create_using=nx.DiGraph(),
    )

    # the weights stored on the networkx graph must be plain Python ints
    first_edge = list(Gnx.edges(data=True))[0]
    assert type(first_edge[2]['weight']) is int

    nx_paths = nx.single_source_dijkstra_path_length(Gnx, source)

    # Count every disagreement between the two results.
    err = 0
    for vid, (dist, pred) in cu_paths.items():
        # NOTE : If distance type is float64 then the distance should be
        # compared against np.finfo(np.float64).max
        if dist == max_val:
            # cugraph reports the maximum value here, so networkx must not
            # have found a path to this vertex either
            if vid in nx_paths:
                err += 1
            continue

        # Validate vertices that are reachable
        if dist != nx_paths[vid]:
            err += 1

        # check pred dist + edge_weight = current dist
        if vid != source:
            edge_weight = Gnx[pred][vid]['weight']
            if cu_paths[pred][0] + edge_weight != dist:
                err += 1

    assert err == 0
def test_sssp_data_type_conversion(graph_file, source):
    """Compare cuGraph SSSP with NetworkX when the edge weights are int32.

    Casts the weight column to ``int32`` on both sides, runs SSSP from
    ``source`` with cuGraph and Dijkstra with NetworkX, then counts
    disagreements; the test passes only when there are none.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    cu_M = utils.read_csv_file(graph_file)

    # cugraph call with int32 weights
    cu_M["2"] = cu_M["2"].astype(np.int32)
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")

    # the weights stored on the graph must still be int32
    assert G.edgelist.edgelist_df["weights"].dtype == np.int32

    df = cugraph.sssp(G, source)
    max_val = np.finfo(df["distance"].dtype).max

    vertices = df["vertex"].to_array()
    dists = df["distance"].to_array()
    preds = df["predecessor"].to_array()
    cu_paths = {v: (d, p) for v, d, p in zip(vertices, dists, preds)}

    # networkx call with int32 weights
    M["weight"] = M["weight"].astype(np.int32)
    Gnx = nx.from_pandas_edgelist(
        M,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.DiGraph(),
    )

    # the weights stored on the networkx graph must be plain Python ints
    first_edge = list(Gnx.edges(data=True))[0]
    assert type(first_edge[2]["weight"]) is int

    nx_paths = nx.single_source_dijkstra_path_length(Gnx, source)

    # Count every disagreement between the two results.
    err = 0
    for vid, (dist, pred) in cu_paths.items():
        # NOTE : If distance type is float64 then the distance should be
        # compared against np.finfo(np.float64).max
        if dist == max_val:
            # cugraph reports the maximum value here, so networkx must not
            # have found a path to this vertex either
            if vid in nx_paths:
                err += 1
            continue

        # Validate vertices that are reachable
        if dist != nx_paths[vid]:
            err += 1

        # check pred dist + edge_weight = current dist
        if vid != source:
            edge_weight = Gnx[pred][vid]["weight"]
            if cu_paths[pred][0] + edge_weight != dist:
                err += 1

    assert err == 0
def test_dask_sssp(dask_client):
    """Check that dask-based SSSP matches single-GPU SSSP on netscience.

    Loads the same CSV once through ``cudf`` and once through ``dask_cudf``,
    runs SSSP from vertex 0 with ``cugraph.sssp`` and ``dcg.sssp``, merges the
    two results on ``vertex`` and asserts that every distance pair is equal.
    """
    gc.collect()

    input_data_path = (
        RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv"
    ).as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    # Both readers get the same parsing options.
    csv_options = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    ddf = dask_cudf.read_csv(
        input_data_path, chunksize=chunksize, **csv_options
    )
    df = cudf.read_csv(input_data_path, **csv_options)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", "value", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")

    expected_dist = cugraph.sssp(g, 0)
    print(expected_dist)
    result_dist = dcg.sssp(dg, 0).compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    # Row-by-row comparison of the two distance columns.
    local_col = compare_dist["distance_local"]
    dask_col = compare_dist["distance_dask"]
    err = sum(
        1
        for row in range(len(compare_dist))
        if local_col.iloc[row] != dask_col.iloc[row]
    )
    assert err == 0
def test_dask_sssp(client_connection):
    """Check that dask-based SSSP matches single-GPU SSSP on netscience.

    Loads the same CSV once through ``cudf`` and once through ``dask_cudf``,
    runs SSSP from vertex 0 with ``cugraph.sssp`` and ``dcg.sssp``, merges the
    two results on ``vertex`` and asserts that every distance pair is equal.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/netscience.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    # Both readers get the same parsing options.
    csv_options = {
        "delimiter": " ",
        "names": ["src", "dst", "value"],
        "dtype": ["int32", "int32", "float32"],
    }
    ddf = dask_cudf.read_csv(
        input_data_path, chunksize=chunksize, **csv_options
    )
    df = cudf.read_csv(input_data_path, **csv_options)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", "value", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")

    expected_dist = cugraph.sssp(g, 0)
    print(expected_dist)
    result_dist = dcg.sssp(dg, 0).compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    # Row-by-row comparison of the two distance columns.
    local_col = compare_dist["distance_local"]
    dask_col = compare_dist["distance_dask"]
    err = sum(
        1
        for row in range(len(compare_dist))
        if local_col.iloc[row] != dask_col.iloc[row]
    )
    assert err == 0
def test_get_traversed_cost(graph_file):
    """Check ``get_traversed_cost`` against SSSP distances.

    Adds random integer noise to the weight column, stores the sum as an
    ``info`` column and uses that column as the edge attribute of the graph.
    SSSP is run from vertex id 16, and the ``info`` values returned by
    ``get_traversed_cost`` must be close to the SSSP distances once both
    frames are ordered by vertex.
    """
    cu_M = utils.read_csv_file(graph_file)

    # Unseeded random noise in [0, 10), one value per edge.
    noise = cudf.Series(np.random.randint(10, size=cu_M.shape[0]))
    cu_M['info'] = cu_M['2'] + noise

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='info')

    # run SSSP starting at vertex id 16
    start_vertex = 16
    df = cugraph.sssp(G, start_vertex)

    answer = cugraph.utilities.path_retrieval.get_traversed_cost(
        df, start_vertex, cu_M['0'], cu_M['1'], cu_M['info']
    )

    # Align both frames on vertex before comparing them positionally.
    df = df.sort_values(by='vertex').reset_index()
    answer = answer.sort_values(by='vertex').reset_index()

    assert df.shape[0] == answer.shape[0]
    assert np.allclose(df['distance'], answer['info'])
def test_filter_unreachable(managed, pool, graph_file, source):
    """Check the frame returned by ``cugraph.filter_unreachable``.

    Restarts RMM with the requested allocator settings, builds a graph from
    the source and destination columns of ``graph_file``, runs SSSP from
    ``source`` and asserts that the filtered result is non-empty and, for
    integer or inexact distance dtypes, has no row whose distance equals the
    dtype's maximum value.
    """
    gc.collect()

    # Restart RMM so the configuration below takes effect.
    rmm.finalize()
    rmm_config.use_managed_memory = managed
    rmm_config.use_pool_allocator = pool
    rmm_config.initial_pool_size = 2 << 27
    rmm.initialize()
    assert rmm.is_initialized()

    cu_M = utils.read_csv_file(graph_file)

    # Device data
    src_col = cu_M['0']
    dst_col = cu_M['1']
    print('sources size = ' + str(len(src_col)))
    print('destinations size = ' + str(len(dst_col)))

    # cugraph SSSP call
    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col)

    print('cugraph Solving... ')
    started = time.time()
    df = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print('Time : ' + str(elapsed))

    reachable_df = cugraph.filter_unreachable(df)

    # Choose the numpy limits helper that matches the distance dtype; other
    # dtypes skip the max-value check entirely.
    dist_dtype = df['distance'].dtype
    if np.issubdtype(dist_dtype, np.integer):
        limits = np.iinfo
    elif np.issubdtype(dist_dtype, np.inexact):
        limits = np.finfo
    else:
        limits = None

    if limits is not None:
        # `inf` looks unused to linters; the query string refers to it
        # through "@inf", so the name must stay as is.
        inf = limits(reachable_df['distance'].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0

    assert len(reachable_df) != 0
def test_multigraph_sssp(graph_file):
    """Compare SSSP distances on a cuGraph and a NetworkX multi-digraph.

    Both graphs are built from ``graph_file`` and searched from vertex 0.
    Rows whose cuGraph distance equals the float maximum of the distance
    dtype are dropped before the two distance lists, each ordered by vertex,
    are compared element-wise.
    """
    # FIXME: Migrate to new test fixtures for Graph setup once available
    cuM = utils.read_csv_file(graph_file)
    G = cugraph.MultiDiGraph()
    G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")

    cu_paths = cugraph.sssp(G, 0)
    max_val = np.finfo(cu_paths["distance"].dtype).max
    has_path = cu_paths["distance"] != max_val
    cu_paths = cu_paths[has_path]

    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
    Gnx = nx.from_pandas_edgelist(
        nxM,
        source="0",
        target="1",
        edge_attr="weight",
        create_using=nx.MultiDiGraph(),
    )
    nx_paths = nx.single_source_dijkstra_path_length(Gnx, 0)

    # Order both sides by vertex so the comparison is positional.
    cu_sorted = cu_paths.sort_values(by='vertex')
    cu_dist = cu_sorted['distance'].to_numpy()
    nx_dist = [pair[1] for pair in sorted(nx_paths.items())]

    assert (cu_dist == nx_dist).all()
def test_filter_unreachable(managed, pool, graph_file, source):
    """Check the frame returned by ``cugraph.filter_unreachable``.

    Reinitialises RMM with the requested allocator settings, builds a
    weighted directed graph from ``graph_file``, runs SSSP from ``source``
    and asserts that the filtered result is non-empty and, for integer or
    inexact distance dtypes, has no row whose distance equals the dtype's
    maximum value.
    """
    gc.collect()

    rmm.reinitialize(
        managed_memory=managed,
        pool_allocator=pool,
        initial_pool_size=2 << 27,
    )
    assert rmm.is_initialized()

    cu_M = utils.read_csv_file(graph_file)
    edge_count = len(cu_M)
    print('sources size = ' + str(edge_count))
    print('destinations size = ' + str(edge_count))

    # cugraph SSSP call
    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    print('cugraph Solving... ')
    started = time.time()
    df = cugraph.sssp(graph, source)
    elapsed = time.time() - started
    print('Time : ' + str(elapsed))

    reachable_df = cugraph.filter_unreachable(df)

    # Choose the numpy limits helper that matches the distance dtype; other
    # dtypes skip the max-value check entirely.
    dist_dtype = df['distance'].dtype
    if np.issubdtype(dist_dtype, np.integer):
        limits = np.iinfo
    elif np.issubdtype(dist_dtype, np.inexact):
        limits = np.finfo
    else:
        limits = None

    if limits is not None:
        # `inf` looks unused to linters; the query string refers to it
        # through "@inf", so the name must stay as is.
        inf = limits(reachable_df['distance'].dtype).max  # noqa: F841
        assert len(reachable_df.query("distance == @inf")) == 0

    assert len(reachable_df) != 0
def getPDF(knnRelation, numBins, numSamples, numberOfNodes):
    """Estimate a histogram-based PDF of shortest-path distances.

    Builds a symmetrized, weighted cuGraph graph from ``knnRelation``, runs
    SSSP from vertices ``0 .. min(numSamples, numberOfNodes) - 1`` and
    accumulates a histogram of the distances to vertices with a larger id
    than the current source.

    Parameters
    ----------
    knnRelation : iterable of three sequences
        Edge sources, edge destinations and edge weights, in that order.
    numBins : int
        Number of histogram bins.
    numSamples : int
        Upper bound on how many source vertices are sampled.
    numberOfNodes : int
        Number of vertices; also caps the number of sampled sources.

    Returns
    -------
    tuple
        ``(pdfMaxDist, pdf)`` where ``pdfMaxDist`` is the upper edge of the
        histogram range (1.2 times the largest distance seen from vertex 0)
        and ``pdf`` is the accumulated histogram divided by its sum.

    Raises
    ------
    ValueError
        If ``min(numSamples, numberOfNodes)`` is smaller than 1. Without at
        least one source the histogram range is never established, which
        previously surfaced as an ``UnboundLocalError`` on return.
    """
    sampleCount = min(numSamples, numberOfNodes)
    if sampleCount < 1:
        raise ValueError(
            "getPDF needs at least one sample: numSamples and numberOfNodes "
            "must both be >= 1 (got numSamples={!r}, numberOfNodes={!r})"
            .format(numSamples, numberOfNodes)
        )

    us, vs, ds = map(cudf.Series, knnRelation)
    us, vs, ds = cugraph.structure.symmetrize(us, vs, ds)
    df = cudf.DataFrame({'source': us, 'destination': vs, 'weight': ds})
    G = cugraph.Graph()
    G.from_cudf_edgelist(df, edge_attr='weight')

    pdf = np.zeros(numBins)
    pdfMaxDist = None
    for i in range(sampleCount):
        ssspResult: cudf.DataFrame = cugraph.sssp(G, i)
        distances: cudf.Series = ssspResult['distance']
        vertexIds: cudf.Series = ssspResult['vertex']

        # Keep only vertices with a larger id than the current source, so a
        # pair of sampled sources is not counted from both ends.
        distances = distances[vertexIds > i]

        if i == 0:
            # The first source fixes the histogram range for every later
            # source; 1.2 leaves some headroom above its largest distance.
            pdfMaxDist = 1.2 * distances.max()

        hist, _ = np.histogram(
            distances.tolist(), bins=numBins, range=(0, pdfMaxDist)
        )
        pdf += hist

    return pdfMaxDist, pdf / pdf.sum()
def sssp(G, start):
    """Run ``cugraph.sssp`` on ``G`` using ``start`` as the source vertex.

    Returns whatever ``cugraph.sssp`` returns, unchanged.
    """
    result = cugraph.sssp(G, source=start)
    return result
import json

import cudf
import cugraph

# Read the edge list: three unnamed columns (source, destination, value).
M = cudf.read_csv(
    'simple_test_sssp.csv',
    names=["src", "dst", 'value'],
    dtype=['int32', 'int32', 'float32'],
    header=None,
)

# Build the graph from the endpoint columns only.
# NOTE(review): the 'value' column is read but not passed as edge_attr, so
# the graph carries no edge weights -- confirm this is intended.
G = cugraph.Graph()
G.from_cudf_edgelist(M, source='src', destination='dst')

# Single-source shortest paths from vertex 0.
distances = cugraph.sssp(G, 0)
print(distances)
print(type(distances))

# Round-trip the result through JSON to obtain plain Python dictionaries.
distances_json = distances.to_json()
json_object = json.loads(distances_json)
print(json_object)

vertex_distance_dic = json_object['distance']
vertex_dic = json_object['vertex']
print(vertex_distance_dic, type(vertex_distance_dic))
print(vertex_dic)