def test_parquet_concat_within_workers(client_connection):
    """Time a parquet read, a de-duplication and a per-worker concat.

    Up to ten random edge lists are written under ``test_files_parquet``
    (only the files not already on disk), then read back with
    ``dask_cudf.read_parquet`` and persisted.

    The test asserts that:

    * the persisted frame has more partitions than ``get_n_workers()``
      reports;
    * the ``drop_duplicates`` step takes less wall-clock time than the read;
    * the ``concat_within_workers`` step takes less wall-clock time than
      the read.
    """

    def _persist_and_wait(frame):
        # Persist the collection, then call futures_of and wait on it so the
        # caller only stops its timer after wait() has returned.
        persisted = frame.persist()
        futures_of(persisted)
        wait(persisted)
        return persisted

    # --- Generate the input files (skipping any that already exist) -------
    if not os.path.exists("test_files_parquet"):
        print("Generate data... ")
        os.mkdir("test_files_parquet")

    for file_index in range(10):
        parquet_path = "test_files_parquet/df" + str(file_index)
        if os.path.exists(parquet_path):
            continue
        edges = utils.random_edgelist(
            e=100,
            ef=16,
            dtypes={"src": np.int32, "dst": np.int32},
            seed=file_index,
        )
        edges.to_parquet(parquet_path, index=False)

    worker_count = get_n_workers()

    # --- Read ---------------------------------------------------------------
    print("Read_parquet... ")
    read_start = time.time()
    ddf = dask_cudf.read_parquet(
        "test_files_parquet/*", dtype=["int32", "int32"]
    )
    ddf = _persist_and_wait(ddf)
    read_seconds = time.time() - read_start
    print("*** Read Time: ", read_seconds, "s")
    print(ddf)

    assert ddf.npartitions > worker_count

    # --- Drop duplicates ----------------------------------------------------
    print("Drop_duplicates... ")
    dedup_start = time.time()
    ddf.drop_duplicates(inplace=True)
    ddf = _persist_and_wait(ddf)
    dedup_seconds = time.time() - dedup_start
    print("*** Drop duplicate time: ", dedup_seconds, "s")
    assert dedup_seconds < read_seconds

    # --- Repartition --------------------------------------------------------
    print("Repartition... ")
    concat_start = time.time()
    # Ideally this would simply be:
    #     ddf = ddf.repartition(npartitions=n_gpu)
    # but that is slower than the read itself and needs more memory, so the
    # custom concat_within_workers helper is used instead.
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = _persist_and_wait(ddf)
    concat_seconds = time.time() - concat_start
    print("*** repartition Time: ", concat_seconds, "s")
    print(ddf)

    assert concat_seconds < read_seconds
def test_random_minimum_spanning_tree_nx(graph_size):
    """Print cuGraph and NetworkX minimum-spanning-tree timings.

    A random weighted edge list sized by ``graph_size`` is generated with a
    fixed seed, loaded into both a ``cugraph.Graph`` and an ``nx.Graph``,
    and ``minimum_spanning_tree`` is timed on each. The two wall-clock times
    and their ratio (NetworkX time / cuGraph time) are printed; the results
    of the two algorithms are not compared and nothing is asserted.
    """
    gc.collect()
    rmm.reinitialize(managed_memory=True)

    column_dtypes = {"src": np.int32, "dst": np.int32, "weight": float}
    pandas_edges = utils.random_edgelist(
        e=graph_size,
        ef=16,
        dtypes=column_dtypes,
        drop_duplicates=True,
        seed=123456,
    )
    gpu_edges = cudf.from_pandas(pandas_edges)

    # cuGraph side
    gpu_graph = cugraph.Graph()
    gpu_graph.from_cudf_edgelist(
        gpu_edges, source="src", destination="dst", edge_attr="weight"
    )
    # Called before the timer starts, just for getting relevant timing.
    gpu_graph.view_adj_list()

    started = time.time()
    cugraph.minimum_spanning_tree(gpu_graph)
    cugraph_seconds = time.time() - started
    print(f"CuGraph time : {cugraph_seconds}")

    # NetworkX side
    nx_graph = nx.from_pandas_edgelist(
        pandas_edges,
        create_using=nx.Graph(),
        source="src",
        target="dst",
        edge_attr="weight",
    )

    started = time.time()
    nx.minimum_spanning_tree(nx_graph)
    nx_seconds = time.time() - started
    print(f"Nx Time : {nx_seconds}")

    print(f"Speedup: {nx_seconds / cugraph_seconds}")