def test_lambdaop_misalign(cpu):
    """Check that a lambda op on one column leaves the other columns intact.

    A 12-row frame with columns ``a``, ``b`` and ``c`` is split into four
    dask partitions and run through a workflow in which only ``c`` passes
    through a lambda. The ``a`` and ``b`` columns of the transformed output
    must equal the input columns (index ignored).

    Args:
        cpu: Truthy to build the dataset with ``cpu=True`` and use NumPy
            inside the lambda; falsy to use CuPy inside the lambda instead.
    """
    n_rows = 12
    frame = pd.DataFrame(
        {
            "a": np.arange(n_rows),
            "b": np.random.choice(["apple", "banana", "orange"], n_rows),
            "c": np.random.choice([0, 1], n_rows),
        }
    )
    partitioned = dd.from_pandas(frame, npartitions=4)

    continuous = ColumnGroup(["a"])
    categorical = ColumnGroup(["b"])
    target = ColumnGroup(["c"])

    # The lambda uses the array library that matches the execution mode.
    if not cpu:
        relabelled = target >> (lambda col: cp.where(col == 4, 0, 1))
    else:
        relabelled = target >> (lambda col: np.where(col == 4, 0, 1))

    graph = categorical + continuous + relabelled
    pipeline = nvt.Workflow(graph)

    # transform() is invoked directly; no fit() call precedes it in this test.
    result = pipeline.transform(nvt.Dataset(partitioned, cpu=cpu))

    untouched = ["a", "b"]
    assert_eq_dd(
        frame[untouched],
        result.to_ddf().compute()[untouched],
        check_index=False,
    )
def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine):
    """Exercise lambda operators inside workflows on string and id columns.

    Covers, in order: in-place substring slicing, slicing combined with a
    ``_slice`` rename while keeping the source columns, in-place string
    replacement, casting ``id`` to float, and chaining a lambda before and
    after ``ops.Categorify``.

    Args:
        df: Frame whose copy provides the expected values.
        dataset: Dataset every workflow is fitted on and applied to.
        tmpdir, gpu_memory_frac, engine: Not referenced in the body;
            presumably pytest fixtures/parameters -- confirm against the
            surrounding test configuration.
    """
    expected = df.copy()
    text_cols = ("name-cat", "name-string")

    def fit_and_transform(graph):
        # Build a workflow from the graph, fit it on the dataset and return
        # the computed result of transforming that same dataset.
        flow = nvtabular.Workflow(graph)
        flow.fit(dataset)
        return flow.transform(dataset).to_ddf().compute()

    # --- Substring, overwriting the source columns ---
    sliced = ColumnGroup(list(text_cols)) >> (lambda col: col.str.slice(1, 3))
    out = fit_and_transform(sliced)
    for name in text_cols:
        assert_eq_dd(out[name], expected[name].str.slice(1, 3), check_index=False)

    # --- Substring written to renamed columns, originals selected as well ---
    sliced_renamed = (
        ColumnGroup(list(text_cols))
        >> (lambda col: col.str.slice(1, 3))
        >> ops.Rename(postfix="_slice")
    )
    out = fit_and_transform(list(text_cols) + sliced_renamed)
    renamed_to_source = {
        "name-cat_slice": "name-cat",
        "name-string_slice": "name-string",
    }
    for renamed, source in renamed_to_source.items():
        assert_eq_dd(
            out[renamed],
            expected[source].str.slice(1, 3),
            check_index=False,
            check_names=False,
        )
    for name in text_cols:
        assert_eq_dd(out[name], expected[name], check_index=False)

    # --- String replace, overwriting the source columns ---
    replaced = ColumnGroup(list(text_cols)) >> (lambda col: col.str.replace("e", "XX"))
    out = fit_and_transform(replaced)
    for name in text_cols:
        assert_eq_dd(out[name], expected[name].str.replace("e", "XX"), check_index=False)

    # --- astype, overwriting the source column ---
    as_float = ColumnGroup(["id"]) >> (lambda col: col.astype(float))
    out = fit_and_transform(as_float)
    assert out["id"].dtype == "float64"

    # --- Lambda feeding Categorify ---
    first_char_encoded = (
        ColumnGroup(["name-cat"])
        >> (lambda col: col.astype(str).str.slice(0, 1))
        >> ops.Categorify()
    )
    out = fit_and_transform(first_char_encoded)
    assert is_integer_dtype(out["name-cat"].dtype)

    # --- Categorify feeding a lambda that adds 100 to every value ---
    shifted = ColumnGroup(list(text_cols)) >> ops.Categorify() >> (lambda col: col + 100)
    out = fit_and_transform(shifted)
    encoded = out["name-cat"]
    assert is_integer_dtype(encoded.dtype)
    # After the +100 shift no value may remain below 100.
    assert np.sum(encoded < 100) == 0