def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine):
    """Exercise ops.Filter through the Workflow API.

    Covers: a numeric predicate filter, filtering to null rows (filter
    returning a DataFrame and returning a boolean Series), and that a
    filter returning an invalid type raises ValueError immediately.
    """
    cont_names = ["x", "y"]
    filtered = cont_names >> ops.Filter(f=lambda df: df[df["y"] > 0.5])
    processor = nvtabular.Workflow(filtered)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()
    filter_df = df[df["y"] > 0.5].reset_index()
    for col in cont_names:
        # values should match the reference filter within a small tolerance
        assert np.all((new_gdf[col] - filter_df[col]).abs().values <= 1e-2)

    # return isnull() rows: inject nulls into ~20% of each continuous column
    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    dataset = nvt.Dataset(df)
    filtered = cont_names >> ops.Filter(f=lambda df: df[df.x.isnull()])
    processor = nvtabular.Workflow(filtered)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # again testing filtering by returning a series rather than a df
    filtered = cont_names >> ops.Filter(f=lambda df: df.x.isnull())
    processor = nvtabular.Workflow(filtered)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # if the filter returns an invalid type we should get an exception immediately
    # (rather than causing problems downstream in the workflow)
    filtered = cont_names >> ops.Filter(f=lambda df: "some invalid value")
    processor = nvtabular.Workflow(filtered)
    with pytest.raises(ValueError):
        # result is never usable here (the statement must raise), so do not
        # bind it to a name — the original's `new_gdf =` was dead
        processor.transform(dataset).to_ddf().compute()
def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    """Exercise ops.Filter via apply_op: predicate filtering and null-row filtering.

    Verifies the filtered frame keeps the original column set and that
    filtering to null rows actually drops rows once nulls are injected.
    """
    cont_names = ["x", "y"]
    columns = mycols_pq if engine == "parquet" else mycols_csv
    columns_ctx = {}
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    filter_op = ops.Filter(f=lambda df: df[df["y"] > 0.5])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    # NOTE: compare the full column lists element-wise. The previous
    # `new_gdf.columns.all() == df.columns.all()` reduced each side to a
    # truthiness bool (always True for non-empty names) and could never fail.
    assert list(new_gdf.columns) == list(df.columns)

    # return isnull() rows: inject nulls into ~20% of each continuous column
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    filter_op = ops.Filter(f=lambda df: df[df.x.isnull()])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    assert list(new_gdf.columns) == list(df.columns)
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"
def test_filtered_partition(tmpdir, cpu):
    """A workflow whose Filter drops rows must still write every partition to parquet."""
    # Small synthetic frame split across several dask partitions
    frame = pd.DataFrame({"col": range(100)})
    lazy_frame = dd_from_pandas(frame, npartitions=5)
    ds = Dataset(lazy_frame, cpu=cpu)

    # Keep only rows below the threshold
    keep_small = ["col"] >> ops.Filter(lambda df: df["col"] < 75)
    wf = Workflow(keep_small)

    # Writing the filtered result to disk should succeed without error
    wf.transform(ds).to_parquet(str(tmpdir))
def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    """Exercise ops.Filter via apply_op.

    Covers: a numeric predicate filter, null-row filtering (filter returning
    a DataFrame and returning a boolean Series), and that a filter returning
    an invalid type raises ValueError immediately.
    """
    cont_names = ["x", "y"]
    columns = mycols_pq if engine == "parquet" else mycols_csv
    columns_ctx = {}
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    filter_op = ops.Filter(f=lambda df: df[df["y"] > 0.5])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    # NOTE: compare the full column lists element-wise. The previous
    # `new_gdf.columns.all() == df.columns.all()` reduced each side to a
    # truthiness bool (always True for non-empty names) and could never fail.
    assert list(new_gdf.columns) == list(df.columns)

    # return isnull() rows: inject nulls into ~20% of each continuous column
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names
    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    filter_op = ops.Filter(f=lambda df: df[df.x.isnull()])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    assert list(new_gdf.columns) == list(df.columns)
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # again testing filtering by returning a series rather than a df
    filter_op = ops.Filter(f=lambda df: df.x.isnull())
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    assert list(new_gdf.columns) == list(df.columns)
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # if the filter returns an invalid type we should get an exception immediately
    # (rather than causing problems downstream in the workflow)
    filter_op = ops.Filter(f=lambda df: "some invalid value")
    with pytest.raises(ValueError):
        filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)