def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    """Run Categorify followed by JoinGroupby and check the "sum" columns exist.

    The workflow is built with or without a distributed client depending on
    ``use_client``; both operators write their intermediate data to ``tmpdir``.
    """
    engine = "parquet"
    data_root = str(datasets[engine])
    file_suffix = engine.split("-")[0]
    paths = glob.glob(data_root + "/*." + file_suffix)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    # Only hand the client to the workflow when the test asks for it.
    workflow_client = client if use_client else None
    processor = Workflow(
        client=workflow_client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )

    categorify = ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    processor.add_preprocess(categorify)

    join_groupby = ops.JoinGroupby(
        cont_names=cont_names,
        stats=["count", "sum"],
        out_path=str(tmpdir),
    )
    processor.add_cat_feature(join_groupby)
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    # One "sum" column per categorical column is expected in the output.
    for expected_column in ("name-cat_x_sum", "name-string_x_sum"):
        assert expected_column in result.columns
def test_dask_normalize(client, tmpdir, datasets, engine):
    """Check the statistics gathered for ``ops.Normalize`` and the centered output.

    The first two files of the dataset are loaded directly with cudf to
    compute reference means, standard deviations and counts.  These are
    compared with the values stored in ``processor.stats`` after the workflow
    has been applied, and the means of the transformed continuous columns are
    required to be close to zero.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference data, read without going through the workflow.
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero.  Compare the
    # magnitude: a one-sided "< 1e-3" check would also accept an arbitrarily
    # large negative mean.
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_dask_median_dummyop(client, tmpdir, datasets, engine):
    """Check the ``ops.Median`` statistics requested by a pass-through operator.

    A local ``DummyOp`` lists ``ops.Median()`` in ``req_stats`` and delegates
    its transform to ``_dummy_op_logic``.  After applying the workflow, the
    recorded medians are compared with the 0.5 quantile of the computed
    result.
    """
    file_suffix = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + file_suffix)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in = "continuous"
        default_out = "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    processor.apply(Dataset(paths, engine))
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "tdigest" with crick could help,
    # but current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    recorded = processor.stats["medians"]
    # Per-column tolerance passed straight to math.isclose.
    tolerances = (
        ("x", {"abs_tol": 1e-1}),
        ("y", {"abs_tol": 1e-1}),
        ("id", {"rel_tol": 1e-2}),
    )
    for column, tolerance in tolerances:
        assert math.isclose(medians[column], recorded[column], **tolerance)
def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):
    """Check the ``ops.MinMax`` statistics requested by a pass-through operator.

    A local ``DummyOp`` lists ``ops.MinMax()`` in ``req_stats`` and delegates
    its transform to ``_dummy_op_logic``.  The recorded "mins" and "maxs" are
    compared with the minimum and maximum of each continuous column in the
    computed result.
    """
    file_suffix = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + file_suffix)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in = "continuous"
        default_out = "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    processor.apply(Dataset(paths, engine))
    result = processor.get_ddf().compute()

    # All minimums first, then all maximums, each over x, y, id.
    for stats_key, reducer_name in (("mins", "min"), ("maxs", "max")):
        for column in cont_names:
            observed = getattr(result[column], reducer_name)()
            recorded = processor.stats[stats_key][column]
            assert math.isclose(observed, recorded, rel_tol=1e-3)
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    """Check the columns and values produced by ``ops.GroupBy``.

    The workflow requests "count", "sum" and "std" statistics.  The result must
    keep the row count of the input, contain "std" (and no "var") columns, and
    agree with a plain cudf groupby for the "count" and "std" statistics.
    """
    engine = "parquet"
    file_suffix = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + file_suffix)

    # Reference data, read without going through the workflow.
    first = cudf.read_parquet(paths[0])[mycols_pq]
    second = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([first, second], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    groupby_op = ops.GroupBy(
        cont_names=cont_names,
        stats=["count", "sum", "std"],
        out_path=str(tmpdir),
    )
    processor.add_preprocess(groupby_op)
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    column_checks = (
        ("name-cat_x_std", "name-cat_x_var"),
        ("name-string_x_std", "name-string_x_var"),
    )
    for present, absent in column_checks:
        assert present in result.columns
        assert absent not in result.columns

    # Check "count"
    count_pairs = result[["name-cat", "name-cat_count"]].drop_duplicates()
    count_result = count_pairs.sort_values("name-cat")["name-cat_count"]
    count_expect = df0.groupby("name-cat").agg({"x": "count"})["x"]
    assert_eq(
        count_result,
        count_expect,
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    std_pairs = result[["name-string", "name-string_x_std"]].drop_duplicates()
    std_result = std_pairs.sort_values("name-string")["name-string_x_std"]
    std_expect = df0.groupby("name-string").agg({"x": "std"})["x"]
    assert_eq(
        std_result,
        std_expect,
        check_index=False,
        check_names=False,
    )
part_mem_fraction=float(args.gpu_mem_frac), **dataset_args) valids_ds = Dataset(valid_set, engine=args.in_file_type, part_mem_fraction=float(args.gpu_mem_frac), **dataset_args) print("Running apply") out_train = os.path.join(args.out_dir, "train") out_valid = os.path.join(args.out_dir, "valid") start = time() proc.apply( trains_ds, apply_offline=True, record_stats=True, shuffle=shuffle_arg, output_path=out_train, out_files_per_proc=2, ) print(f"train preprocess time: {time() - start}") start = time() proc.apply( valids_ds, apply_offline=True, record_stats=False, shuffle=shuffle_arg, output_path=out_valid, out_files_per_proc=2, ) print(f"valid preprocess time: {time() - start}")
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook's DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`

    Notes
    -----
    The "workdir", "output" and "stats" sub-directories of `--out-path` are
    deleted and re-created on every run.  The Dask client and cluster are
    closed when the function exits, whether or not the run succeeded.
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a UCX_TLS value that is already set in the environment.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    # Cleanup output directory
    base_dir = args.out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    # makedirs also creates missing parents, so nested output paths work.
    os.makedirs(base_dir, exist_ok=True)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

    # Setup LocalCUDACluster.  Every protocol other than "tcp" additionally
    # enables NVLink.
    cluster_kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devices,
        device_memory_limit=device_limit,
        local_directory=dask_workdir,
        dashboard_address=":" + dashboard_port,
    )
    if args.protocol != "tcp":
        cluster_kwargs["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_kwargs)

    client = None
    try:
        client = Client(cluster)

        # Setup RMM pool
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        processor = Workflow(
            cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client
        )
        if args.normalize:
            processor.add_feature([ops.FillMissing(), ops.Normalize()])
        else:
            processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
        processor.add_preprocess(
            ops.Categorify(
                out_path=stats_path,
                tree_width=tree_width,
                cat_cache=cat_cache,
                freq_threshold=freq_limit,
                search_sorted=not freq_limit,
                on_host=not args.cats_on_device,
            )
        )
        processor.finalize()

        dataset = Dataset(data_path, "parquet", part_size=part_size)

        # Execute the dask graph.  The same arguments are used with and
        # without profiling.
        apply_kwargs = dict(
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
            num_io_threads=args.num_io_threads,
        )
        start = time.time()
        if args.profile is not None:
            with performance_report(filename=args.profile):
                processor.apply(dataset, **apply_kwargs)
        else:
            processor.apply(dataset, **apply_kwargs)
        runtime = time.time() - start

        print("\nDask-NVTabular DLRM/Criteo benchmark")
        print("--------------------------------------")
        print(f"partition size | {part_size}")
        print(f"protocol | {args.protocol}")
        print(f"device(s) | {args.devices}")
        print(f"rmm-pool-frac | {(args.device_pool_frac)}")
        print(f"out-files-per-proc | {args.out_files_per_proc}")
        print(f"num_io_threads | {args.num_io_threads}")
        print(f"shuffle | {args.shuffle}")
        print(f"cats-on-device | {args.cats_on_device}")
        print("======================================")
        print(f"Runtime[s] | {runtime}")
        print("======================================\n")
    finally:
        # Release the client and the workers even when a step above raised.
        if client is not None:
            client.close()
        cluster.close()
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
):
    """Run a FillMissing/Clip/LogOp + Categorify workflow and validate its output.

    The workflow output is written to ``<tmpdir>/processed``.  Without shuffling
    the in-memory result is checked (row count, clipped minimums, no nulls,
    category counts) and compared column by column with the data read back
    from disk.  With shuffling only the row count of the data on disk is
    checked.
    """
    file_suffix = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + file_suffix)

    def _load_reference(path):
        # Read one input file with cudf, the way the chosen engine stores it.
        if engine == "parquet":
            return cudf.read_parquet(path)[mycols_pq]
        if engine == "csv":
            return cudf.read_csv(path, header=0)[mycols_csv]
        return cudf.read_csv(path, names=allcols_csv)[mycols_csv]

    first = _load_reference(paths[0])
    second = _load_reference(paths[1])
    df0 = cudf.concat([first, second], axis=0)

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    categorify = ops.Categorify(
        freq_threshold=freq_threshold,
        out_path=str(tmpdir),
        cat_cache=cat_cache,
        on_host=on_host,
    )
    processor.add_preprocess(categorify)
    processor.finalize()

    # Any engine other than "parquet"/"csv" needs explicit column names.
    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")
    processor.apply(dataset, output_path=output_path, shuffle=shuffle)

    if shuffle:
        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)
        return

    # Can still access the final ddf if we didn't shuffle
    result = processor.get_ddf().compute()
    assert len(df0) == len(result)
    for column in ("x", "y"):
        assert result[column].min() == 0.0
        assert result[column].isna().sum() == 0

    # Check category counts
    count_spec = {"name-string": "count"}
    cat_expect = df0.groupby("name-string").agg(count_spec).reset_index(drop=True)
    cat_result = result.groupby("name-string").agg(count_spec).reset_index(drop=True)
    if freq_threshold:
        cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
        # Note that we may need to skip the 0th element in result (null mapping)
        if len(cat_result) > len(cat_expect):
            comparable = cat_result.iloc[1:]
        else:
            comparable = cat_result
        assert_eq(cat_expect, comparable, check_index=False)
    else:
        assert_eq(cat_expect, cat_result)

    # Read back from disk
    df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
    for column in df_disk:
        assert_eq(result[column], df_disk[column])
def main(args):
    """Run the Dask-NVTabular DLRM/Criteo preprocessing benchmark.

    Builds a ``LocalCUDACluster`` from the command-line options in ``args``,
    applies a ZeroFill/LogOp + Categorify workflow to the parquet dataset at
    ``args.data_path``, writes the result to ``args.out_path`` and prints a
    short timing summary.  The Dask client and cluster are closed when the
    function exits, whether or not the run succeeded.

    Raises
    ------
    ValueError
        If ``args.cat_cache`` lists more than one option but not exactly one
        option per categorical column.
    """

    # Input
    data_path = args.data_path
    out_path = args.out_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.splits
    if args.protocol == "ucx":
        os.environ["UCX_TLS"] = "tcp,cuda_copy,cuda_ipc,sockcm"

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    if args.cat_splits:
        tree_width = {name: int(s) for name, s in zip(cat_names, args.cat_splits.split(","))}
    else:
        tree_width = {col: 1 for col in cat_names}
        if args.cat_names is None:
            # Using Criteo... Use more hash partitions for
            # known high-cardinality columns
            tree_width["C20"] = 8
            tree_width["C1"] = 8
            tree_width["C22"] = 4
            tree_width["C10"] = 4
            tree_width["C21"] = 2
            tree_width["C11"] = 2
            tree_width["C23"] = 2
            tree_width["C12"] = 2

    # Specify categorical caching location
    cat_cache = None
    if args.cat_cache:
        cat_cache = args.cat_cache.split(",")
        if len(cat_cache) == 1:
            cat_cache = cat_cache[0]
        elif len(cat_names) != len(cat_cache):
            # If user is specifying a list of options,
            # they must specify an option for every cat column.
            # Raised explicitly (instead of `assert`) so the check also runs
            # under `python -O`; otherwise zip() below would silently drop
            # the unmatched entries.
            raise ValueError(
                f"cat_cache lists {len(cat_cache)} options, but there are "
                f"{len(cat_names)} categorical columns"
            )
    if isinstance(cat_cache, str):
        cat_cache = {col: cat_cache for col in cat_names}
    elif isinstance(cat_cache, list):
        cat_cache = dict(zip(cat_names, cat_cache))
    else:
        # Criteo/DLRM Defaults
        cat_cache = {col: "device" for col in cat_names}
        if args.cat_names is None:
            cat_cache["C20"] = "host"
            cat_cache["C1"] = "host"
            # Only need to cache the largest two on a dgx-2
            if args.n_workers < 16:
                cat_cache["C22"] = "host"
                cat_cache["C10"] = "host"

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Setup LocalCUDACluster.  Every protocol other than "tcp" additionally
    # enables NVLink.
    cluster_kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devs,
        device_memory_limit=device_limit,
        local_directory=args.dask_workspace,
        dashboard_address=":3787",
    )
    if args.protocol != "tcp":
        cluster_kwargs["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_kwargs)

    client = None
    try:
        client = Client(cluster)

        # Setup RMM pool
        if not args.no_rmm_pool:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        processor = Workflow(
            cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client
        )
        processor.add_feature([ops.ZeroFill(), ops.LogOp()])
        processor.add_preprocess(
            ops.Categorify(
                out_path=out_path,
                tree_width=tree_width,
                cat_cache=cat_cache,
                freq_threshold=freq_limit,
                on_host=args.cat_on_host,
            )
        )
        processor.finalize()

        dataset = Dataset(data_path, "parquet", part_size=part_size)

        # Execute the dask graph.  The same arguments are used with and
        # without profiling.
        apply_kwargs = dict(
            shuffle="full" if args.worker_shuffle else "partial",
            out_files_per_proc=out_files_per_proc,
            output_path=out_path,
        )
        start = time.time()
        if args.profile is not None:
            with performance_report(filename=args.profile):
                processor.apply(dataset, **apply_kwargs)
        else:
            processor.apply(dataset, **apply_kwargs)
        runtime = time.time() - start

        print("\nDask-NVTabular DLRM/Criteo benchmark")
        print("--------------------------------------")
        print(f"partition size | {part_size}")
        print(f"protocol | {args.protocol}")
        print(f"device(s) | {args.devs}")
        print(f"rmm-pool | {(not args.no_rmm_pool)}")
        print(f"out_files_per_proc | {args.splits}")
        print(f"worker-shuffle | {args.worker_shuffle}")
        print("======================================")
        print(f"Runtime[s] | {runtime}")
        print("======================================\n")
    finally:
        # Release the client and the workers even when a step above raised.
        if client is not None:
            client.close()
        cluster.close()
valid_set, names=cols, engine=args.in_file_type, gpu_memory_frac=float(args.gpu_mem_frac), sep="\t", ) print("Running apply") out_train = os.path.join(args.out_dir, "train") out_valid = os.path.join(args.out_dir, "valid") start = time() proc.apply( trains_itrs, apply_offline=True, record_stats=True, shuffle=shuffle_arg, output_path=out_train, num_out_files=35, ) print(f"train preprocess time: {time() - start}") start = time() proc.apply( valids_itrs, apply_offline=True, record_stats=False, shuffle=shuffle_arg, output_path=out_valid, num_out_files=35, ) print(f"valid preprocess time: {time() - start}")
def preprocess_criteo_parquet(
    input_path: str,
    output_path: str,
    client,
    frequency_threshold: int,
):
    """Preprocess the Criteo parquet files into train/validation/test outputs.

    Statistics are gathered once over all files; the training, validation and
    test datasets are then transformed separately (without re-recording
    statistics) and written to the "train", "validation" and "test"
    sub-directories of ``output_path``.  Finally ``save_model_size_config`` is
    called with the workflow and ``output_path``.

    Args:
        input_path: Directory containing the ``day*`` parquet files, including
            ``day_23.part1.parquet`` (test) and ``day_23.part2.parquet``
            (validation).
        output_path: Directory receiving the Categorify output and the
            processed datasets.
        client: Accepted for interface compatibility; not referenced by this
            function.
        frequency_threshold: Passed to ``Categorify`` as ``freq_threshold``.
    """
    train_days = [str(x) for x in CRITEO_TRAIN_DAYS]
    # A training file is named "day_<N>..." with <N> in CRITEO_TRAIN_DAYS.
    train_files = [
        os.path.join(input_path, x)
        for x in os.listdir(input_path)
        if x.startswith("day") and x.split(".")[0].split("_")[-1] in train_days
    ]
    valid_file = os.path.join(input_path, "day_23.part2.parquet")
    test_file = os.path.join(input_path, "day_23.part1.parquet")

    all_set = train_files + [valid_file] + [test_file]

    print(all_set, train_files, valid_file, test_file)
    print("Creating Workflow Object")

    workflow = Workflow(
        cat_names=CRITEO_CATEGORICAL_COLUMNS,
        cont_names=CRITEO_CONTINUOUS_COLUMNS,
        label_name=CRITEO_CLICK_COLUMNS,
    )

    # We want to assign 0 to all missing values, and calculate log(x+3) for present values
    # so if we set missing values to -2, then the result of log(1+2+(-2)) would be 0
    workflow.add_cont_feature(
        [
            FillMissing(fill_val=-2.0),
            LambdaOp(op_name='Add3ButMinusOneCauseLogAddsOne', f=lambda col, _: col.add(2.0)),
            LogOp(),  # Log(1+x)
        ]
    )

    workflow.add_cat_preprocess(
        Categorify(freq_threshold=frequency_threshold, out_path=output_path)
    )

    workflow.finalize()

    print("Creating Dataset Iterator")
    all_ds = Dataset(all_set, engine="parquet", part_mem_fraction=ALL_DS_MEM_FRAC)
    trains_ds = Dataset(train_files, engine="parquet", part_mem_fraction=TRAIN_DS_MEM_FRAC)
    # Each split uses its own memory-fraction constant (the validation and
    # test constants were previously swapped).
    valid_ds = Dataset(valid_file, engine="parquet", part_mem_fraction=VALID_DS_MEM_FRAC)
    test_ds = Dataset(test_file, engine="parquet", part_mem_fraction=TEST_DS_MEM_FRAC)

    print("Running apply")
    out_train = os.path.join(output_path, "train")
    out_valid = os.path.join(output_path, "validation")
    out_test = os.path.join(output_path, "test")

    # Statistics come from all files; the apply calls below reuse them.
    start = time()
    workflow.update_stats(all_ds)
    print(f"Gathering statistics time: {time() - start}")

    start = time()
    workflow.apply(trains_ds, record_stats=False, output_path=out_train)
    print(f"train preprocess time: {time() - start}")

    start = time()
    workflow.apply(valid_ds, record_stats=False, output_path=out_valid)
    print(f"valid preprocess time: {time() - start}")

    start = time()
    workflow.apply(test_ds, record_stats=False, output_path=out_test)
    print(f"test preprocess time: {time() - start}")

    save_model_size_config(workflow, output_path)