import os
import time

import numpy as np
import dask_cudf
from dask.distributed import default_client, futures_of, wait

# NOTE: the import paths below are assumptions based on common cuGraph
# layouts; adjust them to match the installed cuGraph/RAFT versions.
from cugraph.dask.common.part_utils import concat_within_workers
from cugraph.dask.common import read_utils
from cugraph.testing import utils
from raft_dask.common.comms import get_raft_comm_state


def test_parquet_concat_within_workers(client_connection):
    # Generate ten small random edge lists on disk, once.
    if not os.path.exists("test_files_parquet"):
        print("Generate data... ")
        os.mkdir("test_files_parquet")
    for x in range(10):
        if not os.path.exists("test_files_parquet/df" + str(x)):
            df = utils.random_edgelist(
                e=100,
                ef=16,
                dtypes={"src": np.int32, "dst": np.int32},
                seed=x,
            )
            df.to_parquet("test_files_parquet/df" + str(x), index=False)

    n_gpu = get_n_workers()

    print("Read_parquet... ")
    t1 = time.time()
    ddf = dask_cudf.read_parquet(
        "test_files_parquet/*", dtype=["int32", "int32"]
    )
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t1 = time.time() - t1
    print("*** Read Time: ", t1, "s")
    print(ddf)

    # Each file yields at least one partition, so there must be more
    # partitions than workers for the concat step to be meaningful.
    assert ddf.npartitions > n_gpu

    print("Drop_duplicates... ")
    t2 = time.time()
    # Dask collections are immutable (no inplace=True); rebind the result.
    ddf = ddf.drop_duplicates()
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t2 = time.time() - t2
    print("*** Drop duplicate time: ", t2, "s")

    assert t2 < t1

    print("Repartition... ")
    t3 = time.time()
    # Notice that ideally we would use:
    #     ddf = ddf.repartition(npartitions=n_gpu)
    # However this is slower than reading and requires more memory.
    # Using custom concat instead.
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t3 = time.time() - t3
    print("*** repartition Time: ", t3, "s")
    print(ddf)

    assert t3 < t1
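
# For context, here is a minimal illustrative sketch of a per-worker concat.
# It is an assumption about the technique, not cuGraph's actual
# concat_within_workers: persisted partitions are grouped by the worker that
# holds them, and one cudf.concat per worker is submitted and pinned to that
# worker, so partitions are merged without crossing the network the way a
# repartition shuffle can.
def _concat_within_workers_sketch(client, ddf):
    import cudf
    import dask.dataframe as dd

    parts = futures_of(ddf)
    wait(parts)
    locations = client.who_has(parts)
    by_worker = {}
    for fut in parts:
        # who_has may stringify tuple partition keys, so try both forms.
        addrs = locations.get(fut.key) or locations.get(str(fut.key))
        by_worker.setdefault(addrs[0], []).append(fut)
    # One concat task per worker, pinned to the worker holding the inputs.
    merged = [
        client.submit(cudf.concat, futs, workers=[worker], pure=False)
        for worker, futs in by_worker.items()
    ]
    return dd.from_delayed(merged, meta=ddf._meta)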
def get_n_workers(sID=None):
    """Return the Dask worker count, either from the default client or,
    when a session ID is given, from that session's raft comms state."""
    if sID is None:
        return read_utils.get_n_workers()
    else:
        sessionstate = get_raft_comm_state(sID)
        return sessionstate["nworkers"]
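
# Hypothetical standalone driver: in the real test suite a client_connection
# fixture supplies the connected client, but the test can also be exercised
# against a local GPU cluster like this.
if __name__ == "__main__":
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    with LocalCUDACluster() as cluster, Client(cluster) as client:
        test_parquet_concat_within_workers(client)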