示例#1
0
def test_chaining_3():
    """Combine three operator branches in one workflow and check its output columns.

    The branches are: ``Dropna`` on ``platform``; ``JoinGroupby`` on ``ad_id``
    with ``sum``/``count`` stats over ``clicked``; and that groupby output
    divided by the ``ad_id_count`` column, then renamed with a ``_ctr`` postfix.
    """
    source_frame = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    # Branch 1: the only column containing a NaN goes through Dropna.
    without_nulls = ["platform"] >> ops.Dropna()

    # Branch 2: per-ad_id aggregates of the "clicked" column.
    per_ad_stats = ["ad_id"] >> ops.JoinGroupby(
        cont_cols=["clicked"], stats=["sum", "count"]
    )

    # Branch 3: divide every branch-2 column by the per-ad count, then tag
    # the resulting columns with the "_ctr" postfix.
    ratio = per_ad_stats >> ops.LambdaOp(
        f=lambda col, gdf: col / gdf["ad_id_count"]
    )
    ratio_renamed = ratio >> ops.Rename(postfix="_ctr")

    workflow = Workflow(without_nulls + per_ad_stats + ratio_renamed)
    dataset = nvt.Dataset(source_frame, engine="parquet")
    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    expected_columns = ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
    for column_name in expected_columns:
        assert column_name in result.columns
示例#2
0
def test_dropna(tmpdir, df, dataset, engine):
    """Check that a ``Dropna`` workflow removes nulls and keeps the column set.

    ``tmpdir``, ``df``, ``dataset`` and ``engine`` are supplied by the test
    harness (presumably pytest fixtures defined outside this file -- confirm).
    ``df`` is treated as the reference frame whose columns the transformed
    output is expected to match.
    """
    columns = mycols_pq if engine == "parquet" else mycols_csv
    dropna_features = columns >> ops.Dropna()

    processor = nvt.Workflow(dropna_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # Compare the actual column names. The previous
    # `columns.all() == columns.all()` reduced each index to a single value
    # first, so it could pass even when the column sets differed.
    assert set(new_gdf.columns) == set(df.columns)

    # `.any()` flags a column as soon as it holds a single null; the previous
    # `.all()` only flagged columns that were null in *every* row, so partial
    # nulls slipped through unnoticed.
    assert new_gdf.isnull().any().sum() < 1, "null values exist"
示例#3
0
def test_dropna(tmpdir, df, dataset, engine):
    """Check ``Dropna.apply_op`` chunk by chunk: columns kept, nulls removed.

    ``tmpdir``, ``df``, ``dataset`` and ``engine`` are supplied by the test
    harness (presumably pytest fixtures defined outside this file -- confirm).
    """
    dropna = ops.Dropna()
    columns = mycols_pq if engine == "parquet" else mycols_csv

    # Register the columns under the "all" / "base" keys that are passed to
    # apply_op below.
    columns_ctx = {"all": {"base": columns}}

    for gdf in dataset.to_iter():
        new_gdf = dropna.apply_op(gdf, columns_ctx, "all")

        # Compare the actual column names. The previous
        # `columns.all() == columns.all()` reduced each index to a single
        # value first, so it could not detect a changed column set.
        assert set(new_gdf.columns) == set(gdf.columns)

        # `.any()` flags a column as soon as it holds a single null; the
        # previous `.all()` only flagged columns that were entirely null.
        # The check is limited to the registered columns, since the chunk may
        # carry other columns the op was not asked to clean
        # (NOTE(review): confirm against Dropna.apply_op).
        assert new_gdf[columns].isnull().any().sum() < 1, "null values exist"