Example #1
0
def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine):
    """Exercise ops.Filter through the Workflow API.

    Covers three predicate forms: a lambda returning a filtered DataFrame,
    an isnull()-based DataFrame predicate, and a boolean Series predicate.
    A predicate returning any other type must raise ValueError immediately.
    """
    cont_names = ["x", "y"]
    filtered = cont_names >> ops.Filter(f=lambda df: df[df["y"] > 0.5])
    processor = nvtabular.Workflow(filtered)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()
    filter_df = df[df["y"] > 0.5].reset_index()
    for col in cont_names:
        assert np.all((new_gdf[col] - filter_df[col]).abs().values <= 1e-2)

    # Inject nulls into ~20% of rows, then filter down to the isnull() rows.
    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        # Use frame-level .loc instead of chained indexing
        # (df[col].iloc[idx] = ...) so the assignment is guaranteed to
        # mutate `df` rather than a temporary copy.
        df.loc[df.index[idx], col] = None

    dataset = nvt.Dataset(df)
    filtered = cont_names >> ops.Filter(f=lambda df: df[df.x.isnull()])
    processor = nvtabular.Workflow(filtered)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # again testing filtering by returning a series rather than a df
    filtered = cont_names >> ops.Filter(f=lambda df: df.x.isnull())
    processor = nvtabular.Workflow(filtered)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # if the filter returns an invalid type we should get an exception immediately
    # (rather than causing problems downstream in the workflow)
    filtered = cont_names >> ops.Filter(f=lambda df: "some invalid value")
    processor = nvtabular.Workflow(filtered)
    with pytest.raises(ValueError):
        # Result intentionally discarded: only the raise matters here.
        processor.transform(dataset).to_ddf().compute()
Example #2
0
def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    """Exercise ops.Filter through the legacy apply_op API.

    Checks that filtering preserves the column set and that an
    isnull()-based predicate actually drops rows once nulls are injected.
    """
    cont_names = ["x", "y"]

    columns = mycols_pq if engine == "parquet" else mycols_csv
    columns_ctx = {}
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    filter_op = ops.Filter(f=lambda df: df[df["y"] > 0.5])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    # BUG FIX: the old `new_gdf.columns.all() == df.columns.all()` reduced
    # each Index to a single boolean (truthiness of all names), so the
    # assertion was vacuous. Compare the actual column lists instead.
    assert list(new_gdf.columns) == list(df.columns)

    # return isnull() rows
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names

    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        # Frame-level .loc instead of chained indexing so the nulls are
        # guaranteed to land in `df`, not in a temporary copy.
        df.loc[df.index[idx], col] = None

    filter_op = ops.Filter(f=lambda df: df[df.x.isnull()])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    assert list(new_gdf.columns) == list(df.columns)
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"
def test_filtered_partition(tmpdir, cpu):
    """A filter that can empty whole partitions must still write parquet."""
    # Small toy frame split over several partitions.
    toy = pd.DataFrame({"col": range(100)})
    dataset = Dataset(dd_from_pandas(toy, npartitions=5), cpu=cpu)

    # Build a workflow that drops the tail of the range.
    workflow = Workflow(["col"] >> ops.Filter(lambda df: df["col"] < 75))

    # Writing the filtered result to disk should not raise.
    workflow.transform(dataset).to_parquet(str(tmpdir))
Example #4
0
def test_filter(tmpdir, df, dataset, gpu_memory_frac, engine, client):
    """Exercise ops.Filter through the legacy apply_op API.

    Covers a DataFrame predicate, an isnull()-based predicate, a boolean
    Series predicate, and the requirement that a predicate returning an
    invalid type raises ValueError immediately.
    """
    cont_names = ["x", "y"]

    columns = mycols_pq if engine == "parquet" else mycols_csv
    columns_ctx = {}
    columns_ctx["all"] = {}
    columns_ctx["all"]["base"] = columns

    filter_op = ops.Filter(f=lambda df: df[df["y"] > 0.5])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    # BUG FIX: `new_gdf.columns.all() == df.columns.all()` reduced each
    # Index to one boolean, so the assertion could never fail. Compare the
    # actual column lists instead.
    assert list(new_gdf.columns) == list(df.columns)

    # return isnull() rows
    columns_ctx["continuous"] = {}
    columns_ctx["continuous"]["base"] = cont_names

    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        # Frame-level .loc instead of chained indexing so the nulls are
        # guaranteed to land in `df`, not in a temporary copy.
        df.loc[df.index[idx], col] = None

    filter_op = ops.Filter(f=lambda df: df[df.x.isnull()])
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    assert list(new_gdf.columns) == list(df.columns)
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # again testing filtering by returning a series rather than a df
    filter_op = ops.Filter(f=lambda df: df.x.isnull())
    new_gdf = filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)
    assert list(new_gdf.columns) == list(df.columns)
    assert new_gdf.shape[0] < df.shape[0], "null values do not exist"

    # if the filter returns an invalid type we should get an exception immediately
    # (rather than causing problems downstream in the workflow)
    filter_op = ops.Filter(f=lambda df: "some invalid value")
    with pytest.raises(ValueError):
        filter_op.apply_op(df, columns_ctx, "all", target_cols=columns)