def test_dask_pagerank(client_connection, personalization_perc):
    """Compare multi-GPU (dask) pagerank against single-GPU pagerank on the
    karate dataset, optionally with a personalization vector, and assert the
    per-vertex scores agree within tolerance.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the MG graph
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the SG reference graph
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        # personalize() returns a second value that this test does not use
        personalization, _ = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization, tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    # Compare and verify pagerank results
    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs = (compare_pr["pagerank_local"] - compare_pr["pagerank_dask"]).abs()
    err = int((diffs > tol * 1.1).sum())
    assert err == 0
def test_dask_pagerank(client_connection, personalization_perc):
    """Compare multi-GPU (dask) pagerank against single-GPU pagerank on the
    karate dataset; when personalization is requested, local data is
    precomputed on the distributed graph before personalizing.
    """
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the MG graph
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the SG reference graph
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data and personalize
    personalization = None
    if personalization_perc != 0:
        dg.compute_local_data(by="dst")
        personalization = personalize(
            dg.number_of_vertices(), personalization_perc
        )

    expected_pr = cugraph.pagerank(g, personalization=personalization, tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    # Compare and verify pagerank results
    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs = (compare_pr["pagerank_local"] - compare_pr["pagerank_dask"]).abs()
    err = int((diffs > tol * 1.1).sum())
    assert err == 0
def test_dask_pagerank(dask_client, personalization_perc):
    """Compare multi-GPU (dask) pagerank against single-GPU pagerank on the
    karate dataset (resolved from RAPIDS_DATASET_ROOT_DIR_PATH), optionally
    with a personalization vector.
    """
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the MG graph
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the SG reference graph
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        # personalize() returns a second value that this test does not use
        personalization, _ = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization, tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    # Compare and verify pagerank results
    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs = (compare_pr["pagerank_local"] - compare_pr["pagerank_dask"]).abs()
    err = int((diffs > tol * 1.1).sum())
    assert err == 0
def test_dask_pagerank(client_connection):
    """Compare multi-GPU (dask) pagerank against single-GPU pagerank on the
    karate dataset (no personalization) and assert the per-vertex scores
    agree within tolerance.
    """
    gc.collect()

    pandas.set_option("display.max_rows", 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the MG graph
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the SG reference graph
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data
    # dg.compute_local_data(by='dst')

    expected_pr = cugraph.pagerank(g)
    # NOTE(review): no .compute() here — presumably this dcg.pagerank version
    # returned a materialized (cudf) result; confirm against the dcg API in use
    result_pr = dcg.pagerank(dg)

    # Compare and verify pagerank results
    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs = (compare_pr["pagerank_local"] - compare_pr["pagerank_dask"]).abs()
    err = int((diffs > tol * 1.1).sum())
    print("Mismatches:", err)
    assert err == 0
def test_dask_pagerank(dask_client):
    """Compare multi-GPU (dask) pagerank against single-GPU pagerank on the
    karate dataset (resolved from RAPIDS_DATASET_ROOT_DIR_PATH, no
    personalization) and assert the per-vertex scores agree within tolerance.
    """
    pandas.set_option("display.max_rows", 10000)

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the MG graph
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the SG reference graph
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.Graph(directed=True)
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    # Compare and verify pagerank results
    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs = (compare_pr["pagerank_local"] - compare_pr["pagerank_dask"]).abs()
    err = int((diffs > tol * 1.1).sum())
    print("Mismatches:", err)
    assert err == 0
def test_dask_pagerank(client_connection):
    """Run multi-GPU pagerank on two distributed graphs (karate and dolphins)
    sharing the same communicator, then verify each against its single-GPU
    pagerank result.
    """
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator
    input_data_path1 = r"../datasets/karate.csv"
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(
        input_data_path1,
        chunksize=chunksize1,
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )
    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, 'src', 'dst')
    # NOTE(review): no .compute() here — presumably this dcg.pagerank version
    # returned a materialized (cudf) result; confirm against the dcg API in use
    result_pr1 = dcg.pagerank(dg1)

    ddf2 = dask_cudf.read_csv(
        input_data_path2,
        chunksize=chunksize2,
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )
    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, 'src', 'dst')
    result_pr2 = dcg.pagerank(dg2)

    # Calculate single GPU pagerank for verification of results
    df1 = cudf.read_csv(
        input_data_path1,
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )
    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, 'src', 'dst')
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(
        input_data_path2,
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )
    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, 'src', 'dst')
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results
    tol = 1.0e-05

    assert len(expected_pr1) == len(result_pr1)
    compare_pr1 = expected_pr1.merge(
        result_pr1, on="vertex", suffixes=['_local', '_dask']
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs1 = (compare_pr1['pagerank_local'] - compare_pr1['pagerank_dask']).abs()
    err1 = int((diffs1 > tol * 1.1).sum())
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)
    compare_pr2 = expected_pr2.merge(
        result_pr2, on="vertex", suffixes=['_local', '_dask']
    )
    diffs2 = (compare_pr2['pagerank_local'] - compare_pr2['pagerank_dask']).abs()
    err2 = int((diffs2 > tol * 1.1).sum())
    print("Mismatches in ", input_data_path2, ": ", err2)

    assert err1 == err2 == 0
def test_dask_pagerank(client_connection):
    """Run multi-GPU pagerank on two distributed graphs (karate and dolphins)
    sharing the same communicator, then verify each against its single-GPU
    pagerank result.
    """
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path1 = r"../datasets/karate.csv"
    print(f"dataset1={input_data_path1}")
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    print(f"dataset2={input_data_path2}")
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(
        input_data_path1,
        chunksize=chunksize1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, "src", "dst")
    result_pr1 = dcg.pagerank(dg1).compute()

    ddf2 = dask_cudf.read_csv(
        input_data_path2,
        chunksize=chunksize2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, "src", "dst")
    result_pr2 = dcg.pagerank(dg2).compute()

    # Calculate single GPU pagerank for verification of results
    df1 = cudf.read_csv(
        input_data_path1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, "src", "dst")
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(
        input_data_path2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, "src", "dst")
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results
    tol = 1.0e-05

    assert len(expected_pr1) == len(result_pr1)
    compare_pr1 = expected_pr1.merge(
        result_pr1, on="vertex", suffixes=["_local", "_dask"]
    )
    # Vectorized column comparison (same mismatch count as a per-row loop,
    # without the O(n) .iloc lookups)
    diffs1 = (compare_pr1["pagerank_local"] - compare_pr1["pagerank_dask"]).abs()
    err1 = int((diffs1 > tol * 1.1).sum())
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)
    compare_pr2 = expected_pr2.merge(
        result_pr2, on="vertex", suffixes=["_local", "_dask"]
    )
    diffs2 = (compare_pr2["pagerank_local"] - compare_pr2["pagerank_dask"]).abs()
    err2 = int((diffs2 > tol * 1.1).sum())
    print("Mismatches in ", input_data_path2, ": ", err2)

    assert err1 == err2 == 0