# Shared imports for the tests below. Module paths follow classic NVTabular;
# in newer releases some of these (e.g. ColumnSelector) live under merlin.*,
# so the exact paths are an assumption.
import glob
import os
import shutil

import cudf
import cupy as cp
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.dataframe.utils import assert_eq
from pandas.api.types import is_integer_dtype

import nvtabular
import nvtabular as nvt
from nvtabular import ColumnSelector, Dataset, Workflow, dispatch, ops
from nvtabular.dispatch import _hash_series, _make_df

# Test-suite helpers assumed to come from this suite's conftest / utilities:
# assert_eq_dd, mycols_csv, mycols_pq, _verify_workflow_on_tritonserver


def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo": raw})

    geo_location = ColumnSelector(["geo"])
    state = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 5))
        >> ops.Rename(postfix="_state")
    )
    country = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that transforming the same input after loading gives the same results
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
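# The fit/transform split used throughout these tests, sketched for reference:
# `fit` runs the statistics-gathering pass over the data (e.g. building
# Categorify's category mappings), while `transform` applies the fitted graph
# lazily; results are materialized with .to_ddf().compute() or .to_parquet().
#
#   workflow = Workflow(graph)
#   workflow.fit(Dataset(train_df))        # gather stats
#   out = workflow.transform(Dataset(df))  # lazy Dataset
#   result = out.to_ddf().compute()        # materialize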
def test_lambdaop_misalign(cpu):
    size = 12
    df0 = pd.DataFrame(
        {
            "a": np.arange(size),
            "b": np.random.choice(["apple", "banana", "orange"], size),
            "c": np.random.choice([0, 1], size),
        }
    )
    ddf0 = dd.from_pandas(df0, npartitions=4)

    cont_names = ColumnSelector(["a"])
    cat_names = ColumnSelector(["b"])
    label = ColumnSelector(["c"])
    if cpu:
        label_feature = label >> ops.LambdaOp(lambda col: np.where(col == 4, 0, 1))
    else:
        label_feature = label >> ops.LambdaOp(lambda col: cp.where(col == 4, 0, 1))
    workflow = nvt.Workflow(cat_names + cont_names + label_feature)

    dataset = nvt.Dataset(ddf0, cpu=cpu)
    transformed = workflow.transform(dataset)
    assert_eq_dd(
        df0[["a", "b"]],
        transformed.to_ddf().compute()[["a", "b"]],
        check_index=False,
    )
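# A device-agnostic alternative to the cpu/gpu branch above, as a minimal
# sketch: dispatch on the incoming column's type instead of on a flag.
# `_where_label` is our own helper name, not part of the NVTabular API, and it
# assumes cuDF/CuPy are only in play when the column is not a pandas Series.
def _where_label(col):
    # pandas Series -> numpy; anything else (a cuDF Series) -> cupy
    xp = np if isinstance(col, pd.Series) else cp
    return xp.where(col == 4, 0, 1)

# Usage sketch: label >> ops.LambdaOp(_where_label)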
def test_chaining_3():
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    joined_lambda = (
        joined
        >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"])
        >> ops.Rename(postfix="_ctr")
    )

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")
    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()
    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    )
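# Reference sketch of the CTR feature exercised above, in plain pandas: the
# JoinGroupby/LambdaOp/Rename chain amounts to clicked-sum over count per key.
# `_naive_ctr` is a hypothetical helper, not an NVTabular API.
def _naive_ctr(pdf: pd.DataFrame) -> pd.Series:
    grouped = pdf.groupby("ad_id")["clicked"].agg(["sum", "count"])
    return (grouped["sum"] / grouped["count"]).rename("ad_id_clicked_sum_ctr")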
def test_chaining_2():
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )
    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (
        cat_names + cont_names
        >> ops.LambdaOp(f=lambda col: col.isnull())
        >> ops.Rename(postfix="_isnull")
    )
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(dataset)
    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])
    assert (x in result["C"].unique() for x in set(gdf["C"].dropna().to_arrow()))
def test_concatenate_dataframe(tmpdir, output_model):
    # we were seeing an issue in the rossmann workflow where we dropped certain columns,
    # https://github.com/NVIDIA/NVTabular/issues/961
    df = _make_df(
        {
            "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
            "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
        }
    )
    # this bug only happened with a dataframe representation: force this by using a lambda
    cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    conts = ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()

    dataset = Dataset(df)
    workflow = nvt.Workflow(cats + conts).fit_schema(dataset.infer_schema())

    if output_model == "pytorch":
        model_info = {
            "cat": {"columns": ["cat"], "dtype": "int32"},
            "cont": {"columns": ["cont"], "dtype": "float32"},
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_concatenate_dataframe", output_model, model_info
    )
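# CPU-only sketch of the hashed-categorical lambda above: pandas'
# hash_pandas_object plays the role that _hash_series / cuDF's
# Series.hash_values play in the test. The equivalence is in spirit only --
# the hash functions differ, so the resulting bucket ids won't match across
# libraries. `_hash_to_buckets` is a hypothetical helper name.
def _hash_to_buckets(series: pd.Series, num_buckets: int = 1000) -> pd.Series:
    return pd.util.hash_pandas_object(series, index=False) % num_buckets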
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = nvt.dispatch._make_df({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # define a simple workflow that strips the country code from the first two digits
    # of the geo_location code and sticks it in a new 'geo_location_country' field
    country = (
        ["geo_location"]
        >> ops.LambdaOp(f=lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
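# Note on the graph expression above: Python's `+` binds tighter than `>>`, so
# `["geo_location"] + country >> ops.Categorify()` parses as
# `(["geo_location"] + country) >> ops.Categorify()`, i.e. Categorify is
# applied to the raw column and the derived country column alike.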
def test_workflow_generate_columns_legacy_api(tmpdir, use_parquet):
    # legacy (pre-0.6) Workflow API variant of test_workflow_generate_columns above
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped-down dataset with geo_location codes like in the Outbrain dataset
    df = cudf.DataFrame({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # define a simple workflow that strips the country code from the first two digits
    # of the geo_location code and sticks it in a new 'geo_location_country' field
    cat_names = ["geo_location", "geo_location_country"]
    workflow = nvt.Workflow(cat_names=cat_names, cont_names=[], label_name=[])
    workflow.add_feature(
        [
            ops.LambdaOp(
                op_name="country",
                f=lambda col, gdf: col.str.slice(0, 2),
                columns=["geo_location"],
                replace=False,
            ),
            ops.Categorify(replace=False),
        ]
    )
    workflow.finalize()

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.apply(dataset, output_path=out_path)
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo_location": raw})

    geo_location = ColumnSelector(["geo_location"])
    state = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 5))
        >> ops.Rename(postfix="_state")
    )
    country = (
        geo_location
        >> ops.LambdaOp(lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # this workflow has no stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = nvt.dispatch._make_df()
    expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
def test_target_encode_group():
    df = dispatch._make_df(
        {
            "Cost": range(15),
            "Post": [1, 2, 3, 4, 5] * 3,
            "Author": ["A"] * 5 + ["B"] * 5 + ["C"] * 2 + ["D"] * 3,
            "Engaging_User": ["A"] * 5 + ["B"] * 3 + ["E"] * 2 + ["D"] * 3 + ["G"] * 2,
        }
    )

    cat_groups = ["Author", "Engaging_User"]
    labels = ColumnSelector(["Post"]) >> ops.LambdaOp(lambda col: (col > 3).astype("int8"))
    te_features = cat_groups >> ops.TargetEncoding(
        labels,
        out_path="./",
        kfold=1,
        out_dtype="float32",
        drop_folds=False,  # keep the fold column so it can be validated
    )

    workflow = nvt.Workflow(te_features + ["Author", "Engaging_User"])
    workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")
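# Sketch of what TargetEncoding computes, ignoring its kfold and smoothing
# machinery: per-group means of the label column. `_naive_target_encode` is a
# hypothetical pandas reference, not the NVTabular implementation.
def _naive_target_encode(pdf: pd.DataFrame, cat_cols, label_col: str) -> pd.DataFrame:
    te_name = f"TE_{'_'.join(cat_cols)}_{label_col}"  # illustrative column name
    means = pdf.groupby(cat_cols)[label_col].mean().rename(te_name)
    return pdf.merge(means.reset_index(), on=cat_cols, how="left")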
def test_lambdaop_legacy_api(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    # legacy (pre-0.6) Workflow API variant of test_lambdaop below
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]
    columns = mycols_pq if engine == "parquet" else mycols_csv
    df_copy = df.copy()

    config = nvt.workflow.get_new_config()
    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    columns_ctx = {}
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    # Substring
    # Replacement
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=True,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.slice(1, 3))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.slice(1, 3))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="slice",
        f=lambda col, gdf: col.str.slice(1, 3),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_slice"].equals(df_copy["name-cat"].str.slice(1, 3))
    assert new_gdf["name-string_slice"].equals(df_copy["name-string"].str.slice(1, 3))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # Replace
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=True,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat"].equals(df_copy["name-cat"].str.replace("e", "XX"))
    assert new_gdf["name-string"].equals(df_copy["name-string"].str.replace("e", "XX"))

    # No Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="replace",
        f=lambda col, gdf: col.str.replace("e", "XX"),
        columns=["name-cat", "name-string"],
        replace=False,
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["name-cat_replace"].equals(df_copy["name-cat"].str.replace("e", "XX"))
    assert new_gdf["name-string_replace"].equals(df_copy["name-string"].str.replace("e", "XX"))
    assert new_gdf["name-cat"].equals(df_copy["name-cat"])
    assert new_gdf["name-string"].equals(df_copy["name-string"])

    # astype
    # Replacement
    df = df_copy.copy()
    op = ops.LambdaOp(
        op_name="astype", f=lambda col, gdf: col.astype(float), columns=["id"], replace=True
    )
    new_gdf = op.apply_op(df, columns_ctx, "all", stats_context=None)
    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.LambdaOp(
                op_name="slice",
                f=lambda col, gdf: col.astype(str).str.slice(0, 1),
                columns=["name-cat"],
                replace=True,
            ),
            ops.Categorify(),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out1")
    processor.write_to_dataset(
        outdir,
        dataset,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        apply_ops=True,
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.Categorify(),
            ops.LambdaOp(op_name="add100", f=lambda col, gdf: col + 100, replace=True),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out2")
    processor.write_to_dataset(
        outdir,
        dataset,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        apply_ops=True,
    )

    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)
    assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert np.sum(df_pp["name-cat"] < 100) == 0

    # Workflow
    # No Replacement
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.LambdaOp(
                op_name="slice",
                f=lambda col, gdf: col.astype(str).str.slice(0, 1),
                columns=["name-cat"],
                replace=False,
            ),
            ops.Categorify(),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out3")
    processor.write_to_dataset(
        outdir,
        dataset,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        apply_ops=True,
    )
    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    assert df_pp["name-cat"].dtype == "O"
    assert is_integer_dtype(df_pp["name-cat_slice"].dtype)
    assert np.sum(df_pp["name-cat_slice"] == 0) == 0

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.Categorify(),
            ops.LambdaOp(op_name="add100", f=lambda col, gdf: col + 100, replace=False),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out4")
    processor.write_to_dataset(
        outdir,
        dataset,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        apply_ops=True,
    )
    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    assert is_integer_dtype(df_pp["name-cat_add100"].dtype)
    assert np.sum(df_pp["name-cat_add100"] < 100) == 0

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        [
            ops.LambdaOp(op_name="mul0", f=lambda col, gdf: col * 0, columns=["x"], replace=False),
            ops.LambdaOp(op_name="add100", f=lambda col, gdf: col + 100, replace=False),
        ]
    )
    processor.finalize()
    processor.update_stats(dataset)
    outdir = tmpdir.mkdir("out5")
    processor.write_to_dataset(
        outdir,
        dataset,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        apply_ops=True,
    )
    dataset_2 = nvtabular.io.Dataset(
        glob.glob(str(outdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac
    )
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    assert np.sum(df_pp["x_mul0_add100"] < 100) == 0
def test_lambdaop(tmpdir, df, paths, gpu_memory_frac, engine, cpu):
    dataset = nvt.Dataset(paths, cpu=cpu)
    df_copy = df.copy()

    # Substring
    # Replacement
    substring = ColumnSelector(["name-cat", "name-string"]) >> ops.LambdaOp(
        lambda col: col.str.slice(1, 3)
    )
    processor = nvtabular.Workflow(substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"].str.slice(1, 3), check_index=False)
    assert_eq_dd(
        new_gdf["name-string"], df_copy["name-string"].str.slice(1, 3), check_index=False
    )

    # No Replacement from old API (skipped for other examples)
    substring = (
        ColumnSelector(["name-cat", "name-string"])
        >> ops.LambdaOp(lambda col: col.str.slice(1, 3))
        >> ops.Rename(postfix="_slice")
    )
    processor = nvtabular.Workflow(substring + ["name-cat", "name-string"])
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat_slice"],
        df_copy["name-cat"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(
        new_gdf["name-string_slice"],
        df_copy["name-string"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"], check_index=False)
    assert_eq_dd(new_gdf["name-string"], df_copy["name-string"], check_index=False)

    # Replace
    # Replacement
    oplambda = ColumnSelector(["name-cat", "name-string"]) >> ops.LambdaOp(
        lambda col: col.str.replace("e", "XX")
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat"], df_copy["name-cat"].str.replace("e", "XX"), check_index=False
    )
    assert_eq_dd(
        new_gdf["name-string"], df_copy["name-string"].str.replace("e", "XX"), check_index=False
    )

    # astype
    # Replacement
    oplambda = ColumnSelector(["id"]) >> ops.LambdaOp(lambda col: col.astype(float))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    oplambda = (
        ColumnSelector(["name-cat"])
        >> ops.LambdaOp(lambda col: col.astype(str).str.slice(0, 1))
        >> ops.Categorify()
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert is_integer_dtype(new_gdf["name-cat"].dtype)

    oplambda = (
        ColumnSelector(["name-cat", "name-string"])
        >> ops.Categorify()
        >> (lambda col: col + 100)
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert is_integer_dtype(new_gdf["name-cat"].dtype)
    assert np.sum(new_gdf["name-cat"] < 100) == 0