def test_spec_set(tmpdir, client):
    """Apply a workflow containing ``TargetEncoding`` and check that stats were recorded."""
    categorical = ["ad_id", "source_id", "platform"]
    frame = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    workflow = nvt.Workflow(
        cat_names=list(categorical),
        cont_names=["cont"],
        label_name=["clicked"],
        client=client,
    )
    workflow.add_feature(ops.FillMissing())
    workflow.add_feature(ops.Normalize())
    workflow.add_feature(ops.Categorify())

    encoder = ops.TargetEncoding(
        cat_groups=list(categorical),
        cont_target="clicked",
        kfold=5,
        fold_seed=42,
        p_smooth=20,
    )
    workflow.add_feature(encoder)

    workflow.apply(nvt.Dataset(frame), record_stats=True)

    # The stats container must be truthy once apply() has recorded statistics.
    assert workflow.stats
def test_dask_normalize(client, tmpdir, datasets, engine):
    """Fit ``FillMissing >> Normalize`` with a Dask client and validate the statistics.

    The fitted means/stds are compared against values computed directly from
    the first two input files, and the transformed continuous columns are
    checked to be centred on zero.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only.
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero.  Compare the
    # absolute value: a one-sided `< 1e-3` check would also accept an
    # arbitrarily large negative mean.
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_s3_dataset(s3, paths, engine, df):
    """Copy local files into a mocked S3 bucket and run a workflow over them.

    Checks that the iteration API of an S3-backed ``nvt.Dataset`` returns the
    expected data, then gathers workflow statistics from that dataset.
    """
    # create a mocked out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        # Open the local source with a context manager as well, so its file
        # handle is closed deterministically instead of being leaked.
        with fsspec.open(s3_path, "wb") as dst, open(path, "rb") as src:
            dst.write(src.read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)
def test_concatenate_dataframe(tmpdir, output_model):
    """Verify on the Triton server a workflow that mixes a lambda op with continuous ops.

    Regression test for dropped columns seen in the rossmann workflow:
    https://github.com/NVIDIA/NVTabular/issues/961
    """
    df = _make_df(
        {
            "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
            "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
        }
    )

    # this bug only happened with a dataframe representation: force this by using a lambda
    hashed_cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    processed_conts = ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()

    source = Dataset(df)
    workflow = nvt.Workflow(hashed_cats + processed_conts).fit_schema(source.infer_schema())

    # Only the pytorch output model gets explicit output metadata.
    model_info = None
    if output_model == "pytorch":
        model_info = {
            "cat": {"columns": ["cat"], "dtype": "int32"},
            "cont": {"columns": ["cont"], "dtype": "float32"},
        }

    _verify_workflow_on_tritonserver(
        tmpdir,
        workflow,
        df,
        "test_concatenate_dataframe",
        output_model,
        model_info,
    )
def test_error_handling(tmpdir):
    """An exception raised inside a workflow op must surface through the Triton client."""
    df = _make_df({"x": np.arange(10), "y": np.arange(10)})

    def custom_transform(col):
        # Fail only for the two-row request sent below; pass data through otherwise.
        if len(col) == 2:
            raise ValueError("Lets cause some problems")
        return col

    graph = ["x", "y"] >> ops.FillMissing() >> ops.Normalize() >> custom_transform
    workflow = nvt.Workflow(graph)
    workflow.fit(nvt.Dataset(df))

    model_name = "test_error_handling"
    model_dir = tmpdir + f"/{model_name}"
    triton.generate_nvtabular_model(workflow, model_name, model_dir, backend=BACKEND)

    with run_triton_server(tmpdir) as client:
        request = triton.convert_df_to_triton_input(["x", "y"], df[:2])
        expected_error = tritonclient.utils.InferenceServerException
        with pytest.raises(expected_error) as exception_info:
            client.infer(model_name, request)

        # Checked outside the `raises` block so the assertion actually runs.
        message = str(exception_info.value)
        assert "ValueError: Lets cause some problems" in message
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    """Write a transformed dataset to parquet and check the schema written next to it."""
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = (
        cont_names
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> norms
    )

    workflow = Workflow(cat_features + cont_features + label_name)
    workflow.fit(dataset)
    transformed = workflow.transform(dataset)
    transformed.to_parquet(
        tmpdir,
        out_files_per_proc=10,
    )

    # Read back the schema file from the output directory.
    schema_file = Path(tmpdir) / "schema.pbtxt"
    proto_schema = PbTxt_SchemaWriter._read(schema_file)

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    new_dataset = Dataset(written_files)

    assert """name: "name-cat"\n min: 0\n max: 27\n""" in str(proto_schema)
    assert new_dataset.schema == workflow.output_schema
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    """Fit a GPU workflow (optionally via save/load) and validate stats, categories and output."""
    parquet = engine == "parquet"
    cat_names = ["name-cat", "name-string"] if parquet else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cont_graph = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cat_graph = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(cont_graph + cat_graph + label_name)
    workflow.fit(dataset)

    if dump:
        # Round-trip the fitted workflow through disk.
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None
        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        # Fill nulls with 0 and zero out negative values before taking statistics.
        filled = tar.fillna(0)
        return filled * (filled >= 0).astype("int")

    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).mean(), norms.means[col], rel_tol=1e-4)
    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).std(), norms.stds[col], rel_tol=1e-3)

    # Check that categories match ("name-cat" is only present for parquet).
    for col in cat_names:
        expected_cats = df[col].unique().values_host
        fitted_cats = get_cats(workflow, col)
        # adding the None entry as a string because of move from gpu
        assert fitted_cats.tolist() == [None] + expected_cats.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    for col in cat_names:
        assert is_integer_dtype(df_pp[col].dtype)

    metadata_path = str(tmpdir) + "/_metadata"
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(metadata_path)
    assert num_rows == len(df_pp)
def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    """Transform a small parquet file and check that forced output dtypes are written."""
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    frame = pd.DataFrame(
        {
            "cont1": np.arange(size, dtype=np.float64),
            "cont2": np.arange(size, dtype=np.float64),
            "cat1": np.arange(size, dtype=np.int32),
            "cat2": np.arange(size, dtype=np.int32),
            "label": np.arange(size, dtype=np.float64),
        }
    )
    frame.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp

    active_client = client if use_client else None
    workflow = Workflow(cat_features + cont_features + label_name, client=active_client)
    workflow.fit(dataset)

    # Force dtypes
    dict_dtypes = {
        **{col: np.float32 for col in cont_names},
        **{col: np.float32 for col in cat_names},
        **{col: np.int64 for col in label_name},
    }

    workflow.transform(dataset).to_parquet(
        # apply_offline=apply_offline, Not any more?
        # record_stats=apply_offline, Not any more?
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Check dtypes
    pattern = os.path.join(out_path, "*.parquet")
    for filename in glob.glob(pattern):
        written = cudf.io.read_parquet(filename)
        assert dict(written.dtypes) == dict_dtypes
def test_fit_schema_works_when_subtracting_column_names():
    """``fit_schema`` must reflect a column removed from the graph with ``-``."""
    input_schema = Schema(["x", "y", "id"])

    selector = ColumnSelector(["x", "y"])
    renamed = (
        selector
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    # Drop one of the two renamed outputs before building the workflow.
    workflow1 = Workflow(renamed - "y_renamed")
    workflow1.fit_schema(input_schema)

    expected_columns = ["x_renamed"]
    assert workflow1.output_schema.column_names == expected_columns
def test_fit_schema():
    """``fit_schema`` must produce the renamed output columns for every input column."""
    input_schema = Schema(["x", "y", "id"])

    selector = ColumnSelector(input_schema.column_names)
    renamed = (
        selector
        >> ops.FillMissing()
        >> ops.Clip(min_value=0)
        >> ops.LogOp
        >> ops.Normalize()
        >> ops.Rename(postfix="_renamed")
    )

    workflow = Workflow(renamed)
    workflow.fit_schema(input_schema)

    expected_columns = ["x_renamed", "y_renamed", "id_renamed"]
    assert workflow.output_schema.column_names == expected_columns
def test_generate_triton_model(tmpdir, engine, output_model, df):
    """Export a fitted workflow as a Triton model and check that it round-trips.

    The workflow is written with ``triton.generate_nvtabular_model``, reloaded
    from the generated model repository, and its transform output is compared
    with the output of the original workflow.
    """
    # NOTE: the `tmpdir` fixture is used as-is.  It used to be overridden with
    # "./tmp", which wrote the model into the current working directory and
    # left it behind between test runs.
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    if output_model == "pytorch":
        model_info = {
            "name-cat": {"columns": ["name-cat"], "dtype": "int64"},
            "name-string": {"columns": ["name-string"], "dtype": "int64"},
            "id": {"columns": ["id"], "dtype": "float32"},
            "x": {"columns": ["x"], "dtype": "float32"},
            "y": {"columns": ["y"], "dtype": "float32"},
        }
    else:
        model_info = None

    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(
        workflow=workflow,
        name="model",
        output_path=repo,
        version=1,
        output_model=output_model,
        output_info=model_info,
    )
    # Drop the in-memory workflow so the comparison below uses the reloaded one.
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed):
    """Run ``TargetEncoding`` in a workflow and validate the output column and fold mapping."""
    source = cudf.DataFrame(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    source = dask_cudf.from_cudf(source, npartitions=3)

    processor = nvt.Workflow(
        cat_names=["Author", "Engaging-User"],
        cont_names=["Cost"],
        label_name=["Post"],
    )
    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])

    encoder = ops.TargetEncoding(
        cat_groups,
        "Cost",  # cont_target
        out_path=str(tmpdir),
        kfold=kfold,
        out_col="test_name",
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )
    processor.add_preprocess(encoder)
    processor.finalize()
    processor.apply(nvt.Dataset(source), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "test_name" in df_out.columns
    assert df_out["test_name"].dtype == "float32"

    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        single_group = cat_groups == "Author"
        name = "__fold___Author" if single_group else "__fold___Author_Engaging-User"
        cols = ["__fold__", "Author"] if single_group else ["__fold__", "Author", "Engaging-User"]

        stored = cudf.io.read_parquet(processor.stats["te_stats"][name])
        stored = stored[cols].sort_values(cols).reset_index(drop=True)
        produced = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(stored, produced)
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df):
    """Serve local files from a mock S3 bucket and fit a workflow on the resulting dataset."""
    # Copy files to mock s3 bucket
    files = {}
    for path in paths:
        with open(path, "rb") as handle:
            payload = handle.read()
        key = path.split(os.path.sep)[-1]
        buffer = BytesIO()
        files[key] = buffer
        buffer.write(payload)
        buffer.seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        key = "_metadata"
        files[key] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[key])
        files[key].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files):
        # Create nvt.Dataset from mock s3 paths
        if engine == "parquet":
            url = f"s3://{engine}"
        else:
            url = f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        chunks = list(dataset.to_iter())
        gdf = cudf.concat(chunks)[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        cont_graph = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        cat_graph = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(cont_graph + cat_graph + label_name)
        processor.fit(dataset)
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    """Fit ``TargetEncoding`` on CPU or GPU input and validate the stored fold mapping."""
    raw = dispatch._make_df(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    if cpu:
        as_pandas = raw if isinstance(raw, pd.DataFrame) else raw.to_pandas()
        ddf = dd.from_pandas(as_pandas, npartitions=3)
    else:
        ddf = dask_cudf.from_cudf(raw, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()

    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    transformed = workflow.fit_transform(nvt.Dataset(ddf))
    df_out = transformed.to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        single_group = cat_groups == "Author"
        name = "__fold___Author" if single_group else "__fold___Author_Engaging-User"
        cols = ["__fold__", "Author"] if single_group else ["__fold__", "Author", "Engaging-User"]

        stored = df_lib.read_parquet(te_features.op.stats[name])
        stored = stored[cols].sort_values(cols).reset_index(drop=True)
        produced = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(stored, produced, check_dtype=False)
def test_chaining_1():
    """Chain ``FillMissing`` into ``NormalizeMinMax`` and check both outputs are at most 1."""
    frame = cudf.DataFrame(
        {
            "cont01": np.random.randint(1, 100, 100),
            "cont02": np.random.random(100) * 100,
            "cat01": np.random.randint(0, 10, 100),
            "label": np.random.randint(0, 3, 100),
        }
    )
    # Null out the first ten values so FillMissing has something to fill.
    frame["cont01"][:10] = None

    filled = "cont01" >> ops.FillMissing()
    scaled = (filled + "cont02") >> ops.NormalizeMinMax()
    workflow = Workflow(scaled + "cat01" + "label")

    result = workflow.fit_transform(Dataset(frame)).to_ddf().compute()

    assert result["cont01"].max() <= 1.0
    assert result["cont02"].max() <= 1.0
def test_grab_additional_input_columns(dataset, engine):
    """A column added with ``+`` must appear in the node's inputs, outputs and the result."""
    schema = Schema(["x", "y"])

    clipped = ["x"] >> ops.FillMissing() >> ops.Clip(min_value=0)
    combined = clipped + ["y"]

    workflow = Workflow(combined).fit_schema(schema)
    output_df = workflow.transform(dataset).to_ddf().compute()

    expected = ["x", "y"]

    assert len(workflow.output_node.input_columns.names) == 2
    assert workflow.output_node.input_columns.names == expected

    assert len(workflow.output_node.output_columns.names) == 2
    assert workflow.output_node.output_columns.names == expected

    assert len(output_df.columns) == 2
    assert output_df.columns.tolist() == expected
def test_generate_triton_model(tmpdir, engine, df):
    """Export a fitted workflow as a Triton model and check that it round-trips.

    The workflow is written with ``triton.generate_nvtabular_model``, reloaded
    from the generated model repository, and its transform output is compared
    with the output of the original workflow.
    """
    # NOTE: the `tmpdir` fixture is used as-is.  It used to be overridden with
    # "./tmp", which wrote the model into the current working directory and
    # left it behind between test runs.
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    # Drop the in-memory workflow so the comparison below uses the reloaded one.
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_dask_normalize(client, tmpdir, datasets, engine):
    """Apply ``FillMissing`` + ``Normalize`` with a Dask client and validate the statistics.

    The recorded means/stds/counts are compared against values computed
    directly from the first two input files, and the transformed continuous
    columns are checked to be centred on zero.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Reference frame built from the first two files only.
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_preprocess([ops.FillMissing(), ops.Normalize()])
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero.  Compare the
    # absolute value: a one-sided `< 1e-3` check would also accept an
    # arbitrarily large negative mean.
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    """Normalize continuous columns and check untouched columns survive the parquet write."""
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    def load_reference(path):
        # Read one input file the way the matching engine stores it.
        if engine == "parquet":
            return cudf.read_parquet(path)[mycols_pq]
        if engine == "csv":
            return cudf.read_csv(path, header=0)[mycols_csv]
        return cudf.read_csv(path, names=allcols_csv)[mycols_csv]

    df1 = load_reference(paths[0])
    df2 = load_reference(paths[1])
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        # Header-less CSV needs the column names supplied explicitly.
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    normalized = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(normalized + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    sort_keys = ["id", "x"]
    passthrough = ["name-string", "label"]
    assert_eq(
        df0.sort_values(sort_keys)[passthrough],
        df_disk.sort_values(sort_keys)[passthrough],
        check_index=False,
    )
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    """Fit a workflow in replace / non-replace layouts and validate stats, categories, output."""
    parquet = engine == "parquet"
    cat_names = ["name-cat", "name-string"] if parquet else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        # Keep the raw columns and add renamed FillMissing/LogOp copies next to them.
        fillmissing_logop = (
            cont_names
            >> ops.FillMissing()
            >> ops.LogOp
            >> ops.Rename(postfix="_FillMissing_1_LogOp_1")
        )
        cont_features = (cont_names + fillmissing_logop) >> norms

    workflow = Workflow(cat_features + cont_features + label_name, client=client)
    workflow.fit(dataset)

    if dump:
        # Round-trip the fitted workflow through disk.
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None
        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        # Fill nulls with the median, then apply log(x + 1).
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        filled = tar.fillna(ser_median)
        return np.log(filled + 1)

    # Check mean and std - No good right now we have to add all other changes; Clip, Log
    concat_ops = "" if replace else "_FillMissing_1_LogOp_1"
    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).mean(), norms.means[col + concat_ops], rel_tol=1e-1)
    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).std(), norms.stds[col + concat_ops], rel_tol=1e-1)

    # Check that categories match ("name-cat" is only present for parquet).
    for col in cat_names:
        expected_cats = df[col].unique().values_host
        fitted_cats = get_cats(workflow, col)
        # adding the None entry as a string because of move from gpu
        assert fitted_cats.tolist() == [None] + expected_cats.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    for col in cat_names:
        assert is_integer_dtype(df_pp[col].dtype)

    metadata_path = str(tmpdir) + "/_metadata"
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(metadata_path)
    assert num_rows == len(df_pp)
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook's DLRM model.  The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag)

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py
                        --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance.  To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`)

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`

    Parameters
    ----------
    args
        Parsed command-line options; the attributes read here are the ones
        accessed as ``args.<name>`` in the body below.
    """
    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        # Keep a user-provided UCX_TLS; otherwise install the default transports.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    # makedirs (not mkdir) so a nested --out-path whose parents do not exist
    # yet is created instead of raising FileNotFoundError.
    os.makedirs(BASE_DIR, exist_ok=True)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

    # Setup LocalCUDACluster.  Both protocols share the same options; only the
    # non-tcp path additionally enables NVLink.
    cluster_kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devices,
        device_memory_limit=device_limit,
        local_directory=dask_workdir,
        dashboard_address=":" + dashboard_port,
    )
    if args.protocol != "tcp":
        cluster_kwargs["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_kwargs)
    client = Client(cluster)

    # Everything below runs under try/finally so the client is closed even
    # when preprocessing fails part-way through.
    try:
        # Setup RMM pool
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        processor = Workflow(
            cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client
        )
        if args.normalize:
            processor.add_feature([ops.FillMissing(), ops.Normalize()])
        else:
            processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
        processor.add_preprocess(
            ops.Categorify(
                out_path=stats_path,
                tree_width=tree_width,
                cat_cache=cat_cache,
                freq_threshold=freq_limit,
                search_sorted=not freq_limit,
                on_host=not args.cats_on_device,
            )
        )
        processor.finalize()

        dataset = Dataset(data_path, "parquet", part_size=part_size)

        # Execute the dask graph; the same options apply with or without profiling.
        apply_kwargs = dict(
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
            num_io_threads=args.num_io_threads,
        )
        runtime = time.time()
        if args.profile is not None:
            with performance_report(filename=args.profile):
                processor.apply(dataset, **apply_kwargs)
        else:
            processor.apply(dataset, **apply_kwargs)
        runtime = time.time() - runtime

        print("\nDask-NVTabular DLRM/Criteo benchmark")
        print("--------------------------------------")
        print(f"partition size | {part_size}")
        print(f"protocol | {args.protocol}")
        print(f"device(s) | {args.devices}")
        print(f"rmm-pool-frac | {(args.device_pool_frac)}")
        print(f"out-files-per-proc | {args.out_files_per_proc}")
        print(f"num_io_threads | {args.num_io_threads}")
        print(f"shuffle | {args.shuffle}")
        print(f"cats-on-device | {args.cats_on_device}")
        print("======================================")
        print(f"Runtime[s] | {runtime}")
        print("======================================\n")
    finally:
        client.close()
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
    cpu,
):
    """Run a DLRM-style workflow through Dask and compare in-memory and on-disk results."""
    paths = sorted(glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0]))

    def load_reference(path):
        # Read one input file the way the matching engine stores it.
        if engine == "parquet":
            return cudf.read_parquet(path)[mycols_pq]
        if engine == "csv":
            return cudf.read_csv(path, header=0)[mycols_csv]
        return cudf.read_csv(path, names=allcols_csv)[mycols_csv]

    df1 = load_reference(paths[0])
    df2 = load_reference(paths[1])
    df0 = cudf.concat([df1, df2], axis=0)
    if cpu:
        df0 = df0.to_pandas()

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    categorify = ops.Categorify(
        freq_threshold=freq_threshold,
        out_path=str(tmpdir),
        cat_cache=cat_cache,
        on_host=on_host,
    )
    cats = cat_names >> categorify
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, cpu=cpu, part_mem_fraction=part_mem_fraction)
    else:
        # Header-less CSV needs the column names supplied explicitly.
        dataset = Dataset(paths, cpu=cpu, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")
    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=1)
    result = transformed.to_ddf().compute()

    assert len(df0) == len(result)
    for col in ("x", "y"):
        assert result[col].min() == 0.0
        assert result[col].isna().sum() == 0

    # Check categories.  Need to sort first to make sure we are comparing
    # "apples to apples"
    order = ["label", "x", "y", "id"]
    expect = df0.sort_values(order).reset_index(drop=True).reset_index()
    got = result.sort_values(order).reset_index(drop=True).reset_index()
    merged = expect.merge(got, on="index", how="inner")
    dfm = merged[["name-string_x", "name-string_y"]]
    dfm_gb = dfm.groupby(["name-string_x", "name-string_y"]).agg(
        {"name-string_x": "count", "name-string_y": "count"}
    )
    if freq_threshold:
        dfm_gb = dfm_gb[dfm_gb["name-string_x"] >= freq_threshold]
    assert_eq(dfm_gb["name-string_x"], dfm_gb["name-string_y"], check_names=False)

    # Read back from disk
    reader = dd_read_parquet if cpu else dask_cudf.read_parquet
    df_disk = reader(output_path).compute()

    # we don't have a deterministic ordering here, especially when using
    # a dask client with multiple workers - so we need to sort the values here
    columns = ["label", "x", "y", "id"] + cat_names
    got = result.sort_values(columns).reset_index(drop=True)
    expect = df_disk.sort_values(columns).reset_index(drop=True)
    assert_eq(got, expect)
def test_cpu_workflow(tmpdir, df, dataset, cpu, engine, dump):
    """Fit a workflow on CPU data (optionally via save/load) and validate stats and output."""
    # Make sure we are in cpu formats
    if cudf and isinstance(df, cudf.DataFrame):
        df = df.to_pandas()

    if cpu:
        dataset.to_cpu()

    parquet = engine == "parquet"
    cat_names = ["name-cat", "name-string"] if parquet else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cont_graph = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cat_graph = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(cont_graph + cat_graph + label_name)
    workflow.fit(dataset)

    if dump:
        # Round-trip the fitted workflow through disk.
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None
        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: pd.Series):
        # Fill nulls with 0 and zero out negative values before taking statistics.
        filled = tar.fillna(0)
        return filled * (filled >= 0).astype("int")

    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).mean(), norms.means[col], rel_tol=1e-4)
    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).std(), norms.stds[col], rel_tol=1e-3)

    # Check that categories match ("name-cat" is only present for parquet).
    for col in cat_names:
        expected_cats = df[col].unique()
        fitted_cats = get_cats(workflow, col, cpu=True)
        # adding the None entry as a string because of move from gpu
        assert fitted_cats.tolist() == [None] + sorted(expected_cats.tolist())

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written, cpu=cpu)
    df_pp = pd.concat(list(dataset_2.to_iter()), axis=0)

    for col in cat_names:
        assert is_integer_dtype(df_pp[col].dtype)

    metadata = pq.read_metadata(str(tmpdir) + "/_metadata")
    assert metadata.num_rows == len(df_pp)
def test_tf_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    """Iterate a ``KerasSequenceDataset`` for one epoch and check batches and epoch restart."""
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")
    columns = cont_names + cat_names

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )
    processor.add_feature([ops.FillMissing()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = KerasSequenceDataset(
        paths,
        columns=columns,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_name=label_name[0],
        engine=engine,
        shuffle=False,
    )
    processor.update_stats(data_itr.nvt_dataset, record_stats=True)
    data_itr.map(processor)

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        assert num_samples <= batch_size

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        remaining = columns.copy()
        for column, feature in X.items():
            try:
                remaining.remove(column)
            except ValueError:
                raise AssertionError
            assert feature.shape[0] == num_samples
        assert len(remaining) == 0

        rows += num_samples

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()
    for column, feature in X.items():
        first_epoch_feature = X0.pop(column)
        assert (feature.numpy() == first_epoch_feature.numpy()).all()
    assert len(X0) == 0

    # accounts for incomplete batches at the end of chunks
    # that dont necesssarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)
def test_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    """Process a dataset offline and read one output file back in batches.

    The workflow output is written under ``tmpdir/train/``. The first
    parquet file is then iterated with ``TorchTensorBatchDatasetItr`` and the
    number of rows seen is compared with the row count in that file's
    parquet metadata. The output directory is removed even when an assertion
    fails.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )
    processor.add_feature([ops.FillMissing()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    try:
        processor.apply(
            data_itr,
            apply_offline=True,
            record_stats=True,
            shuffle=True,
            output_path=output_train,
            num_out_files=2,
        )

        tar_paths = [
            os.path.join(output_train, x)
            for x in os.listdir(output_train)
            if x.endswith("parquet")
        ]

        data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
            tar_paths[0],
            engine="parquet",
            sub_batch_size=batch_size,
            gpu_memory_frac=gpu_memory_frac,
            cats=cat_names,
            conts=cont_names,
            labels=label_name,
            names=mycols_csv,
            sep="\t",
        )
        # Only the row count is needed from the metadata tuple
        num_rows, _, _ = cudf.io.read_parquet_metadata(tar_paths[0])

        rows = 0
        num_batches = 0
        for num_batches, chunk in enumerate(data_itr, start=1):
            rows += len(chunk[0])
            # Release the batch before the next one is produced
            del chunk

        # accounts for incomplete batches at the end of chunks
        # that don't necessarily have the full batch_size
        assert num_batches * batch_size >= rows
        assert rows == num_rows
    finally:
        # Clean up the output even if an assertion above failed
        if os.path.exists(output_train):
            shutil.rmtree(output_train)
def test_gpu_preproc(tmpdir, datasets, dump, gpu_memory_frac, engine, preprocessing):
    """Exercise statistics gathering, export and tensor loading on GPU.

    Collects workflow statistics (optionally round-tripping them through a
    YAML file), compares means/stds/medians and categories with values
    computed from the raw files, writes the processed dataset, and checks
    that the torch data loader, ``ds_to_tensors`` and ``TensorItrDataset``
    all report the same number of rows.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=True,
    )

    processor.add_feature(
        [ops.FillMissing(), ops.LogOp(preprocessing=preprocessing)]
    )
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        # Round-trip the collected statistics through a file
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference transform: fill nulls with the median, then log(x + 1)
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    # The stats key is the plain column name when `preprocessing` is set and
    # carries a "_LogOp" suffix otherwise.
    for stat_key, reducer in (("means", "mean"), ("stds", "std")):
        for col in ("x", "y"):
            stat_col = col if preprocessing else f"{col}_LogOp"
            expected = getattr(get_norms(df[col]), reducer)()
            assert math.isclose(
                expected,
                processor.stats[stat_key][stat_col],
                rel_tol=1e-2,
            )

    # Check median (TODO: Improve the accuracy)
    # NOTE: rel_tol=1e1 is a very loose tolerance; kept as in the original.
    for col in cont_names:
        expected_median = df[col].dropna().quantile(0.5, interpolation="linear")
        assert math.isclose(
            expected_median, processor.stats["medians"][col], rel_tol=1e1
        )

    # Check that categories match ("name-cat" is only present for parquet,
    # "name-string" always)
    for col in cat_names:
        cats_expected = df[col].unique().values_to_string()
        cats_found = processor.stats["encoders"][col].get_cats().values_to_string()
        assert cats_found == ["None"] + cats_expected

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir, data_itr, nfiles=10, shuffle=True, apply_ops=True
    )

    processor.create_final_cols()

    # if preprocessing
    if not preprocessing:
        final_conts = processor.columns_ctx["final"]["cols"]["continuous"]
        for col in cont_names:
            assert f"{col}_LogOp" in final_conts

    dlc = nvtabular.torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    written_files = glob.glob(str(tmpdir) + "/ds_part.*.parquet")
    data_files = [
        nvtabular.torch_dataloader.FileItrDataset(
            x,
            use_row_groups=True,
            gpu_memory_frac=gpu_memory_frac,
            names=allcols_csv,
        )
        for x in written_files
    ]

    chained = torch.utils.data.ChainDataset(data_files)
    dl = nvtabular.torch_dataloader.DLDataLoader(
        chained, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0
    )

    # Count the rows delivered by the data loader
    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    out_itr = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    x = processor.ds_to_tensors(out_itr, apply_ops=False)

    # NOTE(review): the metadata values are read but never asserted on; the
    # call is kept because it still requires a readable `_metadata` file.
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata"
    )
    assert len(x[0]) == len_df_pp

    itr_ds = nvtabular.torch_dataloader.TensorItrDataset(
        [x[0], x[1], x[2]], batch_size=512000
    )
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0][0].shape[1] > 0
        assert data_gd[0][1].shape[1] > 0

    assert len_df_pp == count_tens_itr
    if os.path.exists(processor.ds_exports):
        shutil.rmtree(processor.ds_exports)
def test_hugectr(
    tmpdir,
    client,
    df,
    dataset,
    output_format,
    engine,
    op_columns,
    num_io_threads,
    use_client,
):
    """Write a processed dataset and validate its bookkeeping files.

    Applies a workflow with the requested ``output_format`` and checks that
    ``_file_list.txt`` and ``_metadata.json`` exist, that the metadata lists
    the expected sections and number of files, and (for parquet output) that
    the column indices recorded in the metadata match the written columns.
    """
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(
        client=client,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_names,
    )
    processor.add_feature(
        [
            ops.FillMissing(columns=op_columns),
            ops.Clip(min_value=0, columns=op_columns),
            ops.LogOp(),
        ]
    )
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # apply the workflow and write out the dataset
    processor.apply(
        dataset,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=None,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    with open(outdir + "/_metadata.json", "r") as fil:
        data = dict(json.load(fil))
    for section in ("cats", "conts", "labels", "file_stats"):
        assert section in data

    # Compute the expected count first. Written inline, the conditional
    # expression binds looser than `==`, so with a client the assertion
    # would only test a non-zero integer and always pass.
    if client:
        expected_files = nfiles * len(client.cluster.workers)
    else:
        expected_files = nfiles
    assert len(data["file_stats"]) == expected_files

    col_summary = {
        cdata["index"]: cdata["col_name"]
        for cdata in data["cats"] + data["conts"] + data["labels"]
    }

    # Check that data files exist
    ext = {"parquet": "parquet", "hugectr": "data"}.get(output_format, "")
    data_files = [
        os.path.join(outdir, filename)
        for filename in os.listdir(outdir)
        if filename.endswith(ext)
    ]
    assert data_files, f"no output files ending in {ext!r} found in {outdir}"

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        # Entries in data_files are already joined with outdir
        df_check = cudf.read_parquet(data_files[0])
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
def processing(
    self,
    X_pd,
    y_names=None,
    encode_categor_type=None,  # 'categorify', 'onehotencoding'
    outliers_detection_technique=None,  # 'iqr_proximity_rule', 'gaussian_approximation', 'quantiles'
    fill_with_value=None,  # 'extreme_values', 'zeros', 'mean-median'
    targetencoding=False,
    file_path=None,
):
    """Build and apply an NVTabular workflow to a pandas DataFrame.

    Args:
        X_pd: input pandas DataFrame; it is split into ``self.n_gpus`` dask
            partitions.
        y_names: label column names passed to the workflow. Defaults to an
            empty list.
        encode_categor_type: ``'categorify'`` or ``'onehotencoding'``;
            ``None`` skips categorical encoding.
        outliers_detection_technique: forwarded to ``self.outldetect`` to
            obtain per-column clipping bounds; ``None`` skips clipping.
        fill_with_value: ``'zeros'``, ``'extreme_values'`` or
            ``'mean-median'``; ``None`` skips missing-value filling.
        targetencoding: when true, add a ``TargetEncoding`` step.
        file_path: if given, the result is also written there as CSV.

    Returns:
        The processed data as a cudf DataFrame, read back from the
        temporary parquet output of the workflow.

    Raises:
        IndexError: if the workflow produced no parquet files.
    """
    # Avoid sharing one default list object across calls
    if y_names is None:
        y_names = []

    X = dd.from_pandas(X_pd, npartitions=self.n_gpus)
    X = X.replace(np.nan, None)

    # Column types are initialised only once per instance
    if not hasattr(self, 'time_columns'):
        try:
            # NOTE(review): the trailing-underscore names are not defined in
            # this method; they are presumably optional globals, hence the
            # NameError fallback. Confirm against the surrounding module.
            self.initialize_types(
                X,
                n_unique_val_th=n_unique_val_th_,
                categor_columns_keep=categor_columns_keep_,
                numer_columns_keep=numer_columns_keep_)
        except NameError:
            self.initialize_types(X)

    workflow = nvt.Workflow(cat_names=self.categor_columns,
                            cont_names=self.numer_columns,
                            label_name=y_names,
                            client=self.client)

    # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html

    # Categorify https://nvidia.github.io/NVTabular/main/api/ops/categorify.html
    if encode_categor_type == 'categorify':
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(
                ops.Categorify(columns=self.categor_columns, out_path='./'))

    if encode_categor_type == 'onehotencoding':
        # OneHotEncoder().get_feature_names(input_features=<list of features encoded>) does not work
        # lengths=True - chunk sizes can be computed
        for column in self.categor_columns:
            # X[column] = X[column].astype(str)
            X_cat_encoded = OneHotEncoder().fit_transform(
                X[column].to_dask_array(lengths=True).reshape(-1, 1))
            uvs = X[column].unique().compute().values
            X = X.drop([column], axis=1)
            X_cat_encoded = dd.from_array(
                X_cat_encoded.compute().todense())
            X_cat_encoded.columns = [
                column + '_{}'.format(uv) for uv in uvs
            ]
            X = dd.concat([X, X_cat_encoded], axis=1)
            X = X.repartition(npartitions=2)
        # Drop the indicator columns generated for missing values
        for column in X.columns:
            if any(str(column)[-4:] == t
                   for t in ['_nan', 'None']):  # What else?
                X = X.drop([column], axis=1)

        # NOTE(review): initialize_types runs twice here (once bare, once
        # inside print); kept as-is because its side effects are not
        # visible from this method.
        self.initialize_types(X)
        print('Retyping:', self.initialize_types(X))

        # Reinitialize workflow
        workflow = nvt.Workflow(cat_names=self.categor_columns,
                                cont_names=self.numer_columns,
                                label_name=y_names,
                                client=self.client)

    # OutlDetect https://nvidia.github.io/NVTabular/main/api/ops/clip.html
    if (len(self.numer_columns) != 0
            and outliers_detection_technique is not None):
        lower, upper = self.outldetect(outliers_detection_technique,
                                       X[self.numer_columns])
        for i, column in enumerate(self.numer_columns):
            bounds_msg = (
                f'column: {column}, lower: {lower[i]}, upper: {upper[i]}')
            logging.info(bounds_msg)
            print(bounds_msg)
            workflow.add_preprocess(
                ops.Clip(min_value=lower[i],
                         max_value=upper[i],
                         columns=[column]))

    # FillMissing https://nvidia.github.io/NVTabular/main/api/ops/fillmissing.html
    if fill_with_value == 'zeros':
        workflow.add_preprocess(
            ops.FillMissing(fill_val=0,
                            columns=self.categor_columns +
                            self.numer_columns))

    if fill_with_value == 'extreme_values':
        extrim_values = {}
        if len(self.numer_columns) != 0:
            extrim_values.update(
                self.extrvalsdetect(X[self.numer_columns], 'numer_columns'))
        if len(self.categor_columns) != 0:
            extrim_values.update(
                self.extrvalsdetect(X[self.categor_columns],
                                    'categor_columns'))
        logging.info(f'extrim_values: {extrim_values}')

        # The context manager closes the file even if pickling fails
        with open('extrim_values', 'wb') as output:
            pickle.dump(extrim_values, output)

        for column, fill_val in extrim_values.items():
            workflow.add_preprocess(
                ops.FillMissing(fill_val=fill_val, columns=[column]))

    if fill_with_value == 'mean-median':
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(
                ops.FillMedian(columns=self.categor_columns,
                               preprocessing=True,
                               replace=True))
        if len(self.numer_columns) != 0:
            # X is already a dask collection, so the means are computed on it
            # directly rather than passing it through dd.from_pandas again.
            means = list(X[self.numer_columns].mean().compute().values)
            for fill_val, column in zip(means, self.numer_columns):
                workflow.add_preprocess(
                    ops.FillMissing(fill_val=fill_val, columns=[column]))

    if targetencoding:
        # https://nvidia.github.io/NVTabular/main/api/ops/targetencoding.html
        # NOTE(review): this branch reads self.y_names / self.cat_groups, not
        # the `y_names` argument - confirm that this is intended.
        if len(self.y_names) != 0:
            if len(self.cat_groups) == 0:
                print(
                    '\n Target encoding will be applied to all categorical columns'
                )
                workflow.add_preprocess(
                    ops.TargetEncoding(cat_groups=self.categor_columns,
                                       cont_target=self.y_names))
            else:
                workflow.add_preprocess(
                    ops.TargetEncoding(cat_groups=self.cat_groups,
                                       cont_target=self.y_names))
    #-----------------------------------------------------------------------------------------
    workflow.finalize()

    dataset = nvt.Dataset(X)

    tmp_output_path = "./parquet_data_tmp"
    try:
        workflow.apply(
            dataset,
            output_format="parquet",
            output_path=tmp_output_path,
            shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
            out_files_per_proc=1,  # Number of output files per worker
        )

        files = glob.glob(tmp_output_path + "/*.parquet")
        frames = [cudf.read_parquet(path) for path in files]
        # One concat instead of repeated append; an empty list still raises
        # IndexError, as indexing files[0] did before.
        X_final = cudf.concat(frames) if len(frames) > 1 else frames[0]
    finally:
        # Delete temporary files, including when the workflow fails
        shutil.rmtree(tmp_output_path, ignore_errors=True)

    # if len(self.rest_col_names) != 0:
    #     print(1)
    #     X_final = pd.concat([X_final.to_pandas(), X_pd[self.rest_col_names]], axis=1)

    if file_path is not None:
        X_final.to_csv(file_path, index=False)

    return X_final
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
):
    """Run a DLRM-style workflow through dask and verify the written output.

    Without shuffling, the in-memory result is compared with the raw data
    (row count, clipped/filled continuous columns, category counts) and with
    the parquet files read back from disk. With shuffling, only the row
    count of the data read back from disk is checked.
    """
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Pick the reader that matches the engine's file layout
    if engine == "parquet":

        def load(path):
            return cudf.read_parquet(path)[mycols_pq]

    elif engine == "csv":

        def load(path):
            return cudf.read_csv(path, header=0)[mycols_csv]

    else:

        def load(path):
            return cudf.read_csv(path, names=allcols_csv)[mycols_csv]

    df0 = cudf.concat([load(paths[0]), load(paths[1])], axis=0)

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    categorify = ops.Categorify(
        freq_threshold=freq_threshold,
        out_path=str(tmpdir),
        cat_cache=cat_cache,
        on_host=on_host,
    )
    cats = cat_names >> categorify

    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    # Column names are passed explicitly only for the remaining engines
    dataset_kwargs = {"part_mem_fraction": part_mem_fraction}
    if engine not in ("parquet", "csv"):
        dataset_kwargs["names"] = allcols_csv
    dataset = Dataset(paths, **dataset_kwargs)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path, shuffle=shuffle)

    if shuffle:
        # After shuffling only the row count on disk is compared
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)
        return

    # Can still access the final ddf if we didn't shuffle
    result = transformed.to_ddf().compute()
    assert len(df0) == len(result)
    for col in ("x", "y"):
        assert result[col].min() == 0.0
        assert result[col].isna().sum() == 0

    def count_by_name(frame):
        grouped = frame.groupby("name-string").agg({"name-string": "count"})
        return grouped.reset_index(drop=True)

    # Check category counts
    cat_expect = count_by_name(df0)
    cat_result = count_by_name(result)
    if freq_threshold:
        cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
        # Note that we may need to skip the 0th element in result (null mapping)
        if len(cat_result) > len(cat_expect):
            comparable = cat_result.iloc[1:]
        else:
            comparable = cat_result
        assert_eq(cat_expect, comparable, check_index=False)
    else:
        assert_eq(cat_expect, cat_result)

    # Read back from disk
    df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
    for col in df_disk:
        assert_eq(result[col], df_disk[col])
def test_gpu_workflow_config(
    tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace
):
    """Configure a Workflow through a config dict and validate its output.

    Operators are registered through ``nvt.workflow.get_new_config()``.
    The collected statistics (optionally round-tripped through a YAML file)
    and categories are compared with values computed from ``df``. The
    processed dataset is then written and its dtypes and row count checked.
    """
    is_parquet = engine == "parquet"
    cat_names = ["name-cat", "name-string"] if is_parquet else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    feature_ops = [ops.FillMissing(replace=replace), ops.LogOp(replace=replace)]
    config["FE"]["continuous"] = [feature_ops]
    preprocess_ops = [ops.LogOp(replace=replace), ops.Normalize()]
    config["PP"]["continuous"] = [preprocess_ops]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)

    if dump:
        # Save, clear and reload the statistics to exercise the round trip
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference transform: median-fill the nulls, then log(x + 1)
        median = tar.dropna().quantile(0.5, interpolation="linear")
        return np.log(tar.fillna(median) + 1)

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    # Stats keys carry the operator-name suffix unless ops replace in place
    concat_ops = "" if replace else "_FillMissing_LogOp"
    for stat_key, reducer in (("means", "mean"), ("stds", "std")):
        for col in ("x", "y"):
            expected = getattr(get_norms(df[col]), reducer)()
            recorded = processor.stats[stat_key][col + concat_ops]
            assert math.isclose(expected, recorded, rel_tol=1e-1)

    # Check that categories match; a leading None entry is expected in the
    # fitted categories ("name-cat" only exists for parquet input)
    for col in cat_names:
        expected_cats = df[col].unique().values_host
        fitted_cats = get_cats(processor, col)
        assert fitted_cats.tolist() == [None] + expected_cats.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir,
        dataset,
        out_files_per_proc=10,
        shuffle="partial",
        apply_ops=True,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    # Every categorical column must come back integer-encoded
    for col in cat_names:
        assert is_integer_dtype(df_pp[col].dtype)

    metadata_path = str(tmpdir) + "/_metadata"
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(metadata_path)
    assert num_rows == len(df_pp)