def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_concatenate_dataframe(tmpdir, output_model):
    # we were seeing an issue in the rossmann workflow where we dropped certain columns,
    # https://github.com/NVIDIA/NVTabular/issues/961
    df = _make_df(
        {
            "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
            "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
        }
    )
    # this bug only happened with a dataframe representation: force this by using a lambda
    cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    conts = ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()

    dataset = Dataset(df)
    workflow = nvt.Workflow(cats + conts).fit_schema(dataset.infer_schema())

    if output_model == "pytorch":
        model_info = {
            "cat": {"columns": ["cat"], "dtype": "int32"},
            "cont": {"columns": ["cont"], "dtype": "float32"},
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_concatenate_dataframe", output_model, model_info
    )
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir, out_files_per_proc=10, shuffle=nvt.io.Shuffle.PER_PARTITION
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_dask_normalize(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
def test_dask_normalize(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
def test_dask_median_dummyop(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "t-digest" with crick could help,
    #       but the current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1)
    assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1)
    assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2)
def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):
        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3)
    assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3)
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnSelector(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    # We have a global dask client defined in this context, so NVTabular
    # should warn us if we initialize a `Workflow` with `client=None`
    workflow = run_in_context(
        Workflow,
        cat_features + groupby_features,
        context=None if use_client else pytest.warns(UserWarning),
        client=client if use_client else None,
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client if use_client else None,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )
    processor.add_preprocess(ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True))
    processor.add_cat_feature(
        ops.JoinGroupby(cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_workflow_node_select():
    df = dispatch._make_df({"a": [1, 4, 9, 16, 25], "b": [0, 1, 2, 3, 4], "c": [25, 16, 9, 4, 1]})
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_schema_write_read_dataset(tmpdir, dataset, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp >> norms

    workflow = Workflow(cat_features + cont_features + label_name)

    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
    )

    schema_path = Path(tmpdir)
    proto_schema = PbTxt_SchemaWriter._read(schema_path / "schema.pbtxt")
    new_dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"))
    assert """name: "name-cat"\n min: 0\n max: 27\n""" in str(proto_schema)
    assert new_dataset.schema == workflow.output_schema
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def test_nested_workflow_node():
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (
        geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "user"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
def test_filtered_partition(tmpdir, cpu):
    # Toy DataFrame example
    df = pd.DataFrame({"col": range(100)})
    ddf = dd_from_pandas(df, npartitions=5)
    dataset = Dataset(ddf, cpu=cpu)

    # Workflow
    filtered = ["col"] >> ops.Filter(lambda df: df["col"] < 75)
    workflow = Workflow(filtered)

    # Write result to disk
    workflow.transform(dataset).to_parquet(str(tmpdir))
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def _create_nvt_dataset(df):
    from nvtabular import Dataset

    if not isinstance(df, Dataset):
        # turn arrow format into readable for dispatch
        df_ext_format = _detect_format(df)
        if df_ext_format == ExtData.ARROW:
            df = df.to_pandas() if not cudf else cudf.DataFrame.from_arrow(df)
            # run through make df to safely cast to df
        elif df_ext_format in [ExtData.DASK_CUDF, ExtData.DASK_PANDAS]:
            df = df.compute()
        return Dataset(df)
    return df
def test_dask_preproc_cpu(client, tmpdir, datasets, engine, shuffle, cpu):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_size="1MB", cpu=cpu)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_size="1MB", cpu=cpu)

    # Simple transform (normalize)
    cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    conts = cont_names >> ops.FillMissing() >> ops.Normalize()
    workflow = Workflow(conts + cat_names + label_name, client=client)
    transformed = workflow.fit_transform(dataset)

    # Write out dataset
    output_path = os.path.join(tmpdir, "processed")
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=4)

    # Check the final result
    df_disk = dd_read_parquet(output_path, engine="pyarrow").compute()
    assert_eq(
        df0.sort_values(["id", "x"])[["name-string", "label"]],
        df_disk.sort_values(["id", "x"])[["name-string", "label"]],
        check_index=False,
    )
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate({"name-cat": "count", "x": ["sum", "min", "std"]})
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e, left_on="name-cat", right_index=True, how="left")
    assert_eq(df_check["name-cat_count"], df_check["count"].astype("int64"), check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
def test_workflow_input_output_dtypes():
    df = cudf.DataFrame({"genre": ["drama", "comedy"], "user": ["a", "b"], "unneeded": [1, 2]})
    features = [["genre", "user"], "genre"] >> ops.Categorify(encode_type="combo")
    workflow = Workflow(features)
    workflow.fit(Dataset(df))

    assert "unneeded" not in workflow.input_dtypes
    assert set(workflow.input_dtypes.keys()) == {"genre", "user"}
    assert set(workflow.output_dtypes.keys()) == {"genre_user", "genre"}
def test_fit_simple():
    data = cudf.DataFrame({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]})
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]})
    assert_eq(expected, transformed)
def test_chaining_1():
    df = cudf.DataFrame(
        {
            "cont01": np.random.randint(1, 100, 100),
            "cont02": np.random.random(100) * 100,
            "cat01": np.random.randint(0, 10, 100),
            "label": np.random.randint(0, 3, 100),
        }
    )
    df["cont01"][:10] = None

    cont1 = "cont01" >> ops.FillMissing()
    conts = cont1 + "cont02" >> ops.NormalizeMinMax()
    workflow = Workflow(conts + "cat01" + "label")

    result = workflow.fit_transform(Dataset(df)).to_ddf().compute()

    assert result["cont01"].max() <= 1.0
    assert result["cont02"].max() <= 1.0
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
def test_fit_simple():
    data = nvt.dispatch._make_df({"x": [0, 1, 2, None, 0, 1, 2], "y": [None, 3, 4, 5, 3, 4, 5]})
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({"x": [0, 1, 4, 1, 0, 1, 4], "y": [16, 9, 16, 25, 9, 16, 25]})
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)
    assert_eq(expected, transformed)
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
def test_column_group_select():
    df = cudf.DataFrame({"a": [1, 4, 9, 16, 25], "b": [0, 1, 2, 3, 4], "c": [25, 16, 9, 4, 1]})

    input_features = ColumnGroup(["a", "b", "c"])
    sqrt_features = input_features[["a", "c"]] >> cudf.sqrt
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    expected = cudf.DataFrame()
    expected["a"] = cudf.sqrt(df["a"])
    expected["c"] = cudf.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo_location": raw})

    geo_location = ColumnGroup(["geo_location"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # for this workflow we don't have any statoperators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = cudf.DataFrame()
    expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
def test_nested_column_group(tmpdir):
    df = cudf.DataFrame(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )

    country = (
        ColumnGroup(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = [country + "user"] + country + "user" >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        fillmissing_logop = (
            cont_names
            >> ops.FillMissing()
            >> ops.LogOp
            >> ops.Rename(postfix="_FillMissing_1_LogOp_1")
        )
        cont_features = cont_names + fillmissing_logop >> norms

    workflow = Workflow(cat_features + cont_features + label_name, client=client)

    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - the reference values must reproduce the preceding
    # FillMissing and LogOp transforms applied before Normalize
    concat_ops = "_FillMissing_1_LogOp_1"
    if replace:
        concat_ops = ""

    assert math.isclose(get_norms(df.x).mean(), norms.means["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x" + concat_ops], rel_tol=1e-1)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y" + concat_ops], rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook’s DLRM model. The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag).

    Example Usage
    -------------

    python dask-nvtabular-criteo-benchmark.py \
        --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------

    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance. To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`).

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (
        args.cont_names.split(",") if args.cont_names else ["I" + str(x) for x in range(1, 14)]
    )
    cat_names = (
        args.cat_names.split(",") if args.cat_names else ["C" + str(x) for x in range(1, 27)]
    )
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client
    )
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        )
    )
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size | {part_size}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devices}")
    print(f"rmm-pool-frac | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads | {args.num_io_threads}")
    print(f"shuffle | {args.shuffle}")
    print(f"cats-on-device | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s] | {runtime}")
    print("======================================\n")

    client.close()
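# --- Hedged sketch (not part of the benchmark script above) ---
# The benchmark docstring recommends converting a CSV copy of the dataset to
# Parquet before running. A minimal way to do that with the same NVTabular
# Dataset API used in these tests is sketched below; the helper name, paths,
# and part_size are illustrative assumptions, not part of the original code.
import glob

import nvtabular as nvt


def convert_csv_to_parquet(csv_dir, parquet_dir, part_size="256MB"):
    # Build a lazy Dataset over the CSV files and stream them back out as Parquet
    files = glob.glob(csv_dir + "/*.csv")
    dataset = nvt.Dataset(files, engine="csv", part_size=part_size)
    dataset.to_parquet(output_path=parquet_dir)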
def test_dask_workflow_api_dlrm(
    client, tmpdir, datasets, freq_threshold, part_mem_fraction, engine, cat_cache, on_host, shuffle
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(
        freq_threshold=freq_threshold, out_path=str(tmpdir), cat_cache=cat_cache, on_host=on_host
    )

    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path, shuffle=shuffle)

    # Can still access the final ddf if we didn't shuffle
    if not shuffle:
        result = transformed.to_ddf().compute()

        assert len(df0) == len(result)
        assert result["x"].min() == 0.0
        assert result["x"].isna().sum() == 0
        assert result["y"].min() == 0.0
        assert result["y"].isna().sum() == 0

        # Check category counts
        cat_expect = (
            df0.groupby("name-string").agg({"name-string": "count"}).reset_index(drop=True)
        )
        cat_result = (
            result.groupby("name-string").agg({"name-string": "count"}).reset_index(drop=True)
        )
        if freq_threshold:
            cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
            # Note that we may need to skip the 0th element in result (null mapping)
            assert_eq(
                cat_expect,
                cat_result.iloc[1:] if len(cat_result) > len(cat_expect) else cat_result,
                check_index=False,
            )
        else:
            assert_eq(cat_expect, cat_result)

        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        for col in df_disk:
            assert_eq(result[col], df_disk[col])
    else:
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)