Example #1
def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    # Read the edge list as a distributed dask_cudf DataFrame for the multi-GPU graph
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Read the same edge list into a single-GPU cudf DataFrame for reference
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g,
                                   personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    # Count vertices whose single-GPU and multi-GPU scores differ by more
    # than the tolerance (with a 10% slack)
    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
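
These listings show only the test bodies. A minimal sketch of the module-level setup they assume follows; the import style, fixture names, and percentage values are assumptions rather than part of the original tests. The `client_connection` / `dask_client` fixtures come from the test suite's conftest, which starts a Dask CUDA cluster and initializes cuGraph communications; `RAPIDS_DATASET_ROOT_DIR_PATH` (Examples #3 and #5) comes from the cuGraph testing utilities, whose import path varies across versions.

import gc

import numpy as np
import pandas
import pytest

import cudf
import dask_cudf
import cugraph
import cugraph.dask as dcg

# Percentage of vertices to personalize; 0 disables personalization.
# The exact values are an illustrative assumption.
PERSONALIZATION_PERC = [0, 10, 50]


@pytest.fixture(params=PERSONALIZATION_PERC)
def personalization_perc(request):
    return request.param
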
Example #2
def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre-compute local data and personalize
    personalization = None
    if personalization_perc != 0:
        dg.compute_local_data(by="dst")
        personalization = personalize(dg.number_of_vertices(),
                                      personalization_perc)

    expected_pr = cugraph.pagerank(g,
                                   personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
Example #3
def test_dask_pagerank(dask_client, personalization_perc):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g,
                                   personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
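
The `personalize` helper used in Examples #1 through #3 is not included in the listings. Examples #1 and #3 pass the single-GPU graph's vertex Series and unpack a `(personalization, p)` pair, while Example #2 passes a vertex count and keeps only the DataFrame. A minimal sketch of the Series-based form, assuming it samples a percentage of the vertices and gives them normalized random weights (the real helper in the cuGraph test suite may differ):

def personalize(vertices, personalization_perc):
    # Sketch only: sample `personalization_perc` percent of the vertices and
    # assign them random weights that sum to 1, using the column layout that
    # cugraph.pagerank expects ("vertex" and "values").
    vertices = (vertices.to_numpy()
                if hasattr(vertices, "to_numpy") else np.asarray(vertices))
    n_pers = max(1, int(len(vertices) * personalization_perc / 100.0))

    chosen = np.random.choice(vertices, n_pers, replace=False)
    weights = np.random.random(n_pers)
    weights = weights / weights.sum()  # personalization values must sum to 1

    personalization = cudf.DataFrame(
        {"vertex": chosen.astype("int32"), "values": weights.astype("float32")}
    )
    return personalization, weights

For the count-based call in Example #2, the same sketch applies with `vertices = np.arange(num_vertices, dtype="int32")` and only the DataFrame returned.
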
Example #4
def test_dask_pagerank(client_connection):
    gc.collect()

    pandas.set_option("display.max_rows", 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre-compute local data
    # dg.compute_local_data(by='dst')

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg)

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
Example #5
def test_dask_pagerank(dask_client):
    pandas.set_option("display.max_rows", 10000)

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.Graph(directed=True)
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
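
Note that Example #5 builds its graphs with `cugraph.Graph(directed=True)` instead of the `cugraph.DiGraph()` used in the other examples; both produce a directed graph here, `DiGraph` being the older spelling that was later deprecated:

# Equivalent directed-graph construction in older and newer cuGraph releases
g_old = cugraph.DiGraph()             # older spelling, later deprecated
g_new = cugraph.Graph(directed=True)  # current spelling
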
Example #6
def test_dask_pagerank(client_connection):
    gc.collect()

    # Initialize and run PageRank on two distributed graphs
    # that share the same communicator

    input_data_path1 = r"../datasets/karate.csv"
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(input_data_path1,
                              chunksize=chunksize1,
                              delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])

    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, 'src', 'dst')

    result_pr1 = dcg.pagerank(dg1)

    ddf2 = dask_cudf.read_csv(input_data_path2,
                              chunksize=chunksize2,
                              delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, 'src', 'dst')

    result_pr2 = dcg.pagerank(dg2)

    # Calculate single-GPU PageRank for verification of results
    df1 = cudf.read_csv(input_data_path1,
                        delimiter=' ',
                        names=['src', 'dst', 'value'],
                        dtype=['int32', 'int32', 'float32'])

    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, 'src', 'dst')
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(input_data_path2,
                        delimiter=' ',
                        names=['src', 'dst', 'value'],
                        dtype=['int32', 'int32', 'float32'])

    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, 'src', 'dst')
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results

    err1 = 0
    err2 = 0
    tol = 1.0e-05

    compare_pr1 = expected_pr1.merge(result_pr1,
                                     on="vertex",
                                     suffixes=['_local', '_dask'])

    assert len(expected_pr1) == len(result_pr1)

    for i in range(len(compare_pr1)):
        diff = abs(compare_pr1['pagerank_local'].iloc[i] -
                   compare_pr1['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err1 = err1 + 1
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)

    compare_pr2 = expected_pr2.merge(result_pr2,
                                     on="vertex",
                                     suffixes=['_local', '_dask'])

    for i in range(len(compare_pr2)):
        diff = abs(compare_pr2['pagerank_local'].iloc[i] -
                   compare_pr2['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err2 = err2 + 1
    print("Mismatches in ", input_data_path2, ": ", err2)
    assert err1 == err2 == 0
Example #7
def test_dask_pagerank(client_connection):
    gc.collect()

    # Initialize and run PageRank on two distributed graphs
    # that share the same communicator

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path1 = r"../datasets/karate.csv"
    print(f"dataset1={input_data_path1}")
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    print(f"dataset2={input_data_path2}")
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(
        input_data_path1,
        chunksize=chunksize1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, "src", "dst")

    result_pr1 = dcg.pagerank(dg1).compute()

    ddf2 = dask_cudf.read_csv(
        input_data_path2,
        chunksize=chunksize2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, "src", "dst")

    result_pr2 = dcg.pagerank(dg2).compute()

    # Calculate single-GPU PageRank for verification of results
    df1 = cudf.read_csv(
        input_data_path1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, "src", "dst")
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(
        input_data_path2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, "src", "dst")
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results

    err1 = 0
    err2 = 0
    tol = 1.0e-05

    compare_pr1 = expected_pr1.merge(result_pr1,
                                     on="vertex",
                                     suffixes=["_local", "_dask"])

    assert len(expected_pr1) == len(result_pr1)

    for i in range(len(compare_pr1)):
        diff = abs(compare_pr1["pagerank_local"].iloc[i] -
                   compare_pr1["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err1 = err1 + 1
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)

    compare_pr2 = expected_pr2.merge(result_pr2,
                                     on="vertex",
                                     suffixes=["_local", "_dask"])

    for i in range(len(compare_pr2)):
        diff = abs(compare_pr2["pagerank_local"].iloc[i] -
                   compare_pr2["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err2 = err2 + 1
    print("Mismatches in ", input_data_path2, ": ", err2)
    assert err1 == err2 == 0