# Example #1
def test_parquet_concat_within_workers(client_connection):
    """Benchmark-style test: read parquet edge lists, de-duplicate, then
    consolidate partitions with ``concat_within_workers`` and assert each
    stage is cheaper than the initial read.
    """
    data_dir = "test_files_parquet"
    if not os.path.exists(data_dir):
        print("Generate data... ")
        os.mkdir(data_dir)
    # Materialize ten small random edge lists on disk, skipping any that
    # already exist from a previous run.
    for idx in range(10):
        out_path = data_dir + "/df" + str(idx)
        if os.path.exists(out_path):
            continue
        edgelist = utils.random_edgelist(
            e=100,
            ef=16,
            dtypes={"src": np.int32, "dst": np.int32},
            seed=idx,
        )
        edgelist.to_parquet(out_path, index=False)

    n_gpu = get_n_workers()

    print("Read_parquet... ")
    read_start = time.time()
    ddf = dask_cudf.read_parquet(data_dir + "/*",
                                 dtype=["int32", "int32"])
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    read_time = time.time() - read_start
    print("*** Read Time: ", read_time, "s")
    print(ddf)

    # The read should have produced more partitions than workers,
    # otherwise the concat step below has nothing to consolidate.
    assert ddf.npartitions > n_gpu

    print("Drop_duplicates... ")
    dedup_start = time.time()
    ddf.drop_duplicates(inplace=True)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    dedup_time = time.time() - dedup_start
    print("*** Drop duplicate time: ", dedup_time, "s")
    assert dedup_time < read_time

    print("Repartition... ")
    concat_start = time.time()
    # Ideally we would use ddf.repartition(npartitions=n_gpu), but that
    # is slower than the read and needs more memory, so the custom
    # per-worker concat is used instead.
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    concat_time = time.time() - concat_start
    print("*** repartition Time: ", concat_time, "s")
    print(ddf)

    assert concat_time < read_time
def test_random_minimum_spanning_tree_nx(graph_size):
    """Time cuGraph's minimum spanning tree against NetworkX on the same
    random weighted edge list and print the observed speedup.
    """
    gc.collect()
    rmm.reinitialize(managed_memory=True)
    edges = utils.random_edgelist(
        e=graph_size,
        ef=16,
        dtypes={"src": np.int32, "dst": np.int32, "weight": float},
        drop_duplicates=True,
        seed=123456,
    )
    gdf = cudf.from_pandas(edges)

    # cugraph
    G = cugraph.Graph()
    G.from_cudf_edgelist(
        gdf, source="src", destination="dst", edge_attr="weight"
    )
    # Build the adjacency list up front so the timed region covers only
    # the MST computation itself.
    G.view_adj_list()
    cu_start = time.time()
    cugraph.minimum_spanning_tree(G)
    cu_time = time.time() - cu_start
    print("CuGraph time : " + str(cu_time))

    # Nx
    Gnx = nx.from_pandas_edgelist(
        edges,
        create_using=nx.Graph(),
        source="src",
        target="dst",
        edge_attr="weight",
    )
    nx_start = time.time()
    nx.minimum_spanning_tree(Gnx)
    nx_time = time.time() - nx_start
    print("Nx Time : " + str(nx_time))
    print("Speedup: " + str(nx_time / cu_time))