Exemplo n.º 1
0
def test_target_encode_multi(tmpdir, npartitions):
    """Target-encode two targets over single- and multi-column category groups.

    Builds a 12-row frame in which ``cat`` holds a single value and ``cat2``
    holds three values, runs ``TargetEncoding`` for the groups ``cat``,
    ``cat2`` and ``(cat, cat2)`` against both ``num`` and ``num_2``, and then
    checks the generated ``TE_*`` columns.
    """
    base_values = [1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]
    num_1 = np.asarray(base_values)
    num_2 = np.asarray(base_values) * 2
    frame = cudf.DataFrame(
        {
            "cat": np.asarray(["baaaa"] * 12),
            "cat2": np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3),
            "num": num_1,
            "num_2": num_2,
        }
    )
    ddf = dask_cudf.from_cudf(frame, npartitions=npartitions)

    groups = ["cat", "cat2", ["cat", "cat2"]]
    encoder = ops.TargetEncoding(
        ["num", "num_2"], out_path=str(tmpdir), kfold=1, p_smooth=5, out_dtype="float32"
    )
    workflow = nvt.Workflow(groups >> encoder)

    transformed = workflow.fit_transform(nvt.Dataset(ddf))
    df_out = transformed.to_ddf().compute(scheduler="synchronous")

    # One encoded column is expected per (category group, target) pair.
    for target in ("num", "num_2"):
        for prefix in ("TE_cat_cat2_", "TE_cat_", "TE_cat2_"):
            assert prefix + target in df_out.columns

    # "cat" is constant, so grouping by (cat, cat2) must match grouping by cat2.
    for target in ("num", "num_2"):
        assert_eq(df_out["TE_cat2_" + target].values, df_out["TE_cat_cat2_" + target].values)

    # The first row's encoding differs between the "cat" and "cat2" groups.
    for target in ("num", "num_2"):
        assert df_out["TE_cat_" + target].iloc[0] != df_out["TE_cat2_" + target].iloc[0]

    # The constant "cat" group is compared with the plain mean of each target.
    for target, values, tolerance in (("num", num_1, 1e-4), ("num_2", num_2, 1e-3)):
        assert math.isclose(df_out["TE_cat_" + target].iloc[0], values.mean(), abs_tol=tolerance)
Exemplo n.º 2
0
def test_spec_set(tmpdir, client):
    """Run a legacy-API workflow with target encoding and check stats exist.

    The workflow fills missing values, normalizes, categorifies and
    target-encodes ``clicked`` over three categorical columns, then is
    applied with ``record_stats=True``; the test passes when ``p.stats`` is
    truthy afterwards.
    """
    columns = {
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    }
    gdf_test = cudf.DataFrame(columns)

    p = nvt.Workflow(
        cat_names=["ad_id", "source_id", "platform"],
        cont_names=["cont"],
        label_name=["clicked"],
        client=client,
    )

    # Each operator is instantiated right before it is registered.
    for op_cls in (ops.FillMissing, ops.Normalize, ops.Categorify):
        p.add_feature(op_cls())

    target_encoder = ops.TargetEncoding(
        cat_groups=["ad_id", "source_id", "platform"],
        cont_target="clicked",
        kfold=5,
        fold_seed=42,
        p_smooth=20,
    )
    p.add_feature(target_encoder)

    dataset = nvt.Dataset(gdf_test)
    p.apply(dataset, record_stats=True)
    assert p.stats
def test_fit_schema_works_with_raw_column_dependencies():
    """Fit a schema when the TargetEncoding target is given as a raw column name.

    The target ``"cost"`` is passed as a plain string; after ``fit_schema``
    the output schema must list one ``TE_<col>_cost`` column per selected
    input column.
    """
    selector = ColumnSelector(["x", "y"])
    cat_features = selector >> ops.TargetEncoding("cost")

    workflow = Workflow(cat_features)
    workflow.fit_schema(Schema(["x", "y", "cost"]))

    expected_names = ["TE_x_cost", "TE_y_cost"]
    assert workflow.output_schema.column_names == expected_names
Exemplo n.º 4
0
def test_target_encode_multi(tmpdir, npartitions):
    """Target-encode two targets with the legacy ``add_preprocess`` API.

    Builds a 12-row frame in which ``cat`` holds a single value and ``cat2``
    holds three values, registers ``TargetEncoding`` for the groups ``cat``,
    ``cat2`` and ``(cat, cat2)`` against ``num`` and ``num_2``, applies the
    workflow without writing output, and checks the generated ``TE_*``
    columns.
    """
    base_values = [1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]
    num_1 = np.asarray(base_values)
    num_2 = np.asarray(base_values) * 2
    frame = cudf.DataFrame({
        "cat": np.asarray(["baaaa"] * 12),
        "cat2": np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3),
        "num": num_1,
        "num_2": num_2,
    })
    ddf = dask_cudf.from_cudf(frame, npartitions=npartitions)

    processor = nvt.Workflow(cat_names=["cat", "cat2"],
                             cont_names=["num", "num_2"],
                             label_name=[])

    groups = ["cat", "cat2", ["cat", "cat2"]]
    encoder = ops.TargetEncoding(
        groups,
        ["num", "num_2"],  # cont_target
        out_path=str(tmpdir),
        kfold=1,
        p_smooth=5,
        out_dtype="float32",
    )
    processor.add_preprocess(encoder)
    processor.finalize()
    processor.apply(nvt.Dataset(ddf), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # One encoded column is expected per (category group, target) pair.
    for target in ("num", "num_2"):
        for prefix in ("TE_cat_cat2_", "TE_cat_", "TE_cat2_"):
            assert prefix + target in df_out.columns

    # "cat" is constant, so grouping by (cat, cat2) must match grouping by cat2.
    for target in ("num", "num_2"):
        assert_eq(df_out["TE_cat2_" + target].values,
                  df_out["TE_cat_cat2_" + target].values)

    # The first row's encoding differs between the "cat" and "cat2" groups.
    for target in ("num", "num_2"):
        first_cat = df_out["TE_cat_" + target].iloc[0]
        first_cat2 = df_out["TE_cat2_" + target].iloc[0]
        assert first_cat != first_cat2

    # The constant "cat" group is compared with the plain mean of each target.
    for target, values, tolerance in (("num", num_1, 1e-4),
                                      ("num_2", num_2, 1e-3)):
        assert math.isclose(df_out["TE_cat_" + target].iloc[0],
                            values.mean(),
                            abs_tol=tolerance)
def test_fit_schema_works_with_grouped_node_inputs():
    """Fit a schema when the selector mixes single columns and a column group.

    The selector contains ``x``, ``y`` and the tuple ``("x", "y")``; the
    fitted output schema must contain one target-encoded column for each of
    the three entries (compared order-independently).
    """
    selector = ColumnSelector(["x", "y", ("x", "y")])
    cat_features = selector >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(Schema(["x", "y", "cost"]))

    expected_names = ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"]
    produced_names = workflow1.output_schema.column_names
    assert sorted(produced_names) == sorted(expected_names)
def test_fit_schema_works_with_node_dependencies():
    """Fit a schema when the TargetEncoding target is itself a graph node.

    The target is the output of ``Rename(postfix="_renamed")`` applied to
    ``cost``; the fitted output column names must carry the renamed target.
    """
    renamed_target = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    encoder = ops.TargetEncoding(renamed_target)
    cat_features = ColumnSelector(["x", "y"]) >> encoder

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(Schema(["x", "y", "cost"]))

    expected_names = ["TE_x_cost_renamed", "TE_y_cost_renamed"]
    assert workflow1.output_schema.column_names == expected_names
Exemplo n.º 7
0
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed):
    """Target-encode ``Cost`` with the legacy API and validate the fold mapping.

    The encoded column is written under the custom name ``test_name`` as
    float32. When more than one fold is used, the ``__fold__`` assignment
    stored in the operator's parquet statistics is compared with the
    ``__fold__`` column kept in the output (``drop_folds=False``).
    """
    frame = cudf.DataFrame({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    ddf = dask_cudf.from_cudf(frame, npartitions=3)

    processor = nvt.Workflow(cat_names=["Author", "Engaging-User"],
                             cont_names=["Cost"],
                             label_name=["Post"])
    continuous_ops = [ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()]
    processor.add_feature(continuous_ops)

    encoder = ops.TargetEncoding(
        cat_groups,
        "Cost",  # cont_target
        out_path=str(tmpdir),
        kfold=kfold,
        out_col="test_name",
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )
    processor.add_preprocess(encoder)
    processor.finalize()
    processor.apply(nvt.Dataset(ddf), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "test_name" in df_out.columns
    assert df_out["test_name"].dtype == "float32"

    # The fold mapping is only validated when k-fold is in use.
    if kfold <= 1:
        return

    # Cat columns are unique, so the stored fold mapping can be compared
    # row-for-row with the output once both are sorted the same way.
    if cat_groups == "Author":
        name, cols = "__fold___Author", ["__fold__", "Author"]
    else:
        name, cols = (
            "__fold___Author_Engaging-User",
            ["__fold__", "Author", "Engaging-User"],
        )

    stored = cudf.io.read_parquet(processor.stats["te_stats"][name])
    stored = stored[cols].sort_values(cols).reset_index(drop=True)
    produced = df_out[cols].sort_values(cols).reset_index(drop=True)
    assert_eq(stored, produced)
Exemplo n.º 8
0
def test_spec_set(tmpdir, client):
    """Smoke-test a ColumnGroup workflow combining three feature branches.

    Categorify, FillMissing >> Normalize and TargetEncoding branches are
    added together into one workflow, which is fitted and transformed on a
    small frame containing NaNs. The test has no assertions: it passes when
    the pipeline computes without raising.
    """
    columns = {
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    }
    gdf_test = cudf.DataFrame(columns)

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    target_encoder = ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20)
    te_features = cats >> target_encoder

    all_features = cat_features + cont_features + te_features
    p = Workflow(all_features, client=client)

    transformed = p.fit_transform(nvt.Dataset(gdf_test))
    transformed.to_ddf().compute()
Exemplo n.º 9
0
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    """Target-encode ``Cost`` on a CPU or GPU dask frame and validate folds.

    When more than one fold is used, the ``__fold__`` assignment stored in
    the operator's parquet statistics is compared (ignoring dtypes) with the
    ``__fold__`` column kept in the output (``drop_folds=False``).
    """
    frame = dispatch._make_df({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        # dd.from_pandas is given a pandas frame; convert unless it already is one.
        if isinstance(frame, pd.DataFrame):
            pandas_frame = frame
        else:
            pandas_frame = frame.to_pandas()
        ddf = dd.from_pandas(pandas_frame, npartitions=3)
    else:
        ddf = dask_cudf.from_cudf(frame, npartitions=3)

    cont_names = ["Cost"]
    encoder = ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )
    te_features = cat_groups >> encoder

    cont_features = (
        cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    )
    graph = te_features + cont_features + ["Author", "Engaging-User"]
    workflow = nvt.Workflow(graph)
    transformed = workflow.fit_transform(nvt.Dataset(ddf))
    df_out = transformed.to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()

    # The fold mapping is only validated when k-fold is in use.
    if kfold <= 1:
        return

    # Cat columns are unique, so the stored fold mapping can be compared
    # row-for-row with the output once both are sorted the same way.
    if cat_groups == "Author":
        name, cols = "__fold___Author", ["__fold__", "Author"]
    else:
        name, cols = (
            "__fold___Author_Engaging-User",
            ["__fold__", "Author", "Engaging-User"],
        )

    stored = df_lib.read_parquet(te_features.op.stats[name])
    stored = stored[cols].sort_values(cols).reset_index(drop=True)
    produced = df_out[cols].sort_values(cols).reset_index(drop=True)
    assert_eq(stored, produced, check_dtype=False)
Exemplo n.º 10
0
def test_target_encode_group():
    """Smoke-test TargetEncoding whose target is produced by a LambdaOp node.

    The label is ``Post > 3`` cast to int8. The test has no assertions: it
    passes when fitting and transforming complete without raising. Operator
    statistics are written under the current directory (``out_path="./"``).
    """
    raw_columns = {
        "Cost": range(15),
        "Post": [1, 2, 3, 4, 5] * 3,
        "Author": ["A"] * 5 + ["B"] * 5 + ["C"] * 2 + ["D"] * 3,
        "Engaging_User": ["A"] * 5 + ["B"] * 3 + ["E"] * 2 + ["D"] * 3 + ["G"] * 2,
    }
    df = dispatch._make_df(raw_columns)

    cat_groups = ["Author", "Engaging_User"]
    binarize = ops.LambdaOp(lambda col: (col > 3).astype("int8"))
    labels = ColumnSelector(["Post"]) >> binarize
    encoder = ops.TargetEncoding(
        labels,
        out_path="./",
        kfold=1,
        out_dtype="float32",
        drop_folds=False,  # Keep folds to validate
    )
    te_features = cat_groups >> encoder

    workflow = nvt.Workflow(te_features + ["Author", "Engaging_User"])
    transformed = workflow.fit_transform(nvt.Dataset(df))
    transformed.to_ddf().compute(scheduler="synchronous")
Exemplo n.º 11
0
    def processing(
        self,
        X_pd,
        y_names=None,
        encode_categor_type=None,
        outliers_detection_technique=None,
        fill_with_value=None,
        targetencoding=False,
        file_path=None,
    ):
        """Build and apply an NVTabular preprocessing workflow to ``X_pd``.

        Args:
            X_pd: Input frame handed to ``dd.from_pandas``.
            y_names: Label column names passed to ``nvt.Workflow`` as
                ``label_name``. ``None`` (the default) means no labels.
            encode_categor_type: ``'categorify'`` registers ``ops.Categorify``
                on the categorical columns; ``'onehotencoding'`` one-hot
                encodes every categorical column up front and re-derives the
                column types. Any other value skips categorical encoding.
            outliers_detection_technique: When not ``None`` and numeric
                columns exist, it is forwarded to ``self.outldetect`` and each
                numeric column is clipped to the returned bounds. The original
                inline comment listed ``'iqr_proximity_rule'``,
                ``'gaussian_approximation'`` and ``'quantiles'``.
            fill_with_value: ``'zeros'``, ``'extreme_values'`` or
                ``'mean-median'``; selects how missing values are filled. Any
                other value skips filling.
            targetencoding: When true (and ``self.y_names`` is non-empty),
                registers ``ops.TargetEncoding``.
            file_path: When not ``None``, the result is also written there as
                CSV without the index.

        Returns:
            The processed frame, read back with ``cudf.read_parquet`` from
            the workflow's temporary parquet output.

        Side effects:
            * ``'categorify'`` passes ``out_path='./'`` to ``ops.Categorify``.
            * ``'extreme_values'`` pickles the fill values to a file named
              ``extrim_values`` in the current directory.
            * ``./parquet_data_tmp`` is written by the workflow and removed
              afterwards.
        """
        # A fresh list per call avoids sharing one mutable default object
        # between calls (it is handed to nvt.Workflow below).
        if y_names is None:
            y_names = []

        X = dd.from_pandas(X_pd, npartitions=self.n_gpus)
        X = X.replace(np.nan, None)

        # Column types are only initialized when no earlier call has set
        # `self.time_columns`.
        try:
            self.time_columns
        except AttributeError:
            # The trailing-underscore names are not defined in this method;
            # when they are not in scope the NameError falls back to defaults.
            try:
                self.initialize_types(
                    X,
                    n_unique_val_th=n_unique_val_th_,
                    categor_columns_keep=categor_columns_keep_,
                    numer_columns_keep=numer_columns_keep_)
            except NameError:
                self.initialize_types(X)

        workflow = nvt.Workflow(cat_names=self.categor_columns,
                                cont_names=self.numer_columns,
                                label_name=y_names,
                                client=self.client)
        # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html
        # Categorify https://nvidia.github.io/NVTabular/main/api/ops/categorify.html
        if encode_categor_type == 'categorify':
            if len(self.categor_columns) != 0:
                workflow.add_preprocess(
                    ops.Categorify(columns=self.categor_columns,
                                   out_path='./'))

        if encode_categor_type == 'onehotencoding':
            # OneHotEncoder().get_feature_names(input_features=...) does not
            # work here, so the output column names are rebuilt from the
            # column's unique values.
            for column in self.categor_columns:
                # lengths=True lets the chunk sizes be computed.
                X_cat_encoded = OneHotEncoder().fit_transform(
                    X[column].to_dask_array(lengths=True).reshape(-1, 1))
                uvs = X[column].unique().compute().values
                X = X.drop([column], axis=1)
                X_cat_encoded = dd.from_array(
                    X_cat_encoded.compute().todense())
                X_cat_encoded.columns = [
                    column + '_{}'.format(uv) for uv in uvs
                ]
                X = dd.concat([X, X_cat_encoded], axis=1)
                X = X.repartition(npartitions=2)

            # Drop the indicator columns generated for missing values.
            # `X.columns` is evaluated once, before X is rebound in the loop.
            for column in X.columns:
                if str(column)[-4:] in ('_nan', 'None'):  # What else?
                    X = X.drop([column], axis=1)

            # NOTE(review): initialize_types runs twice here (once bare, once
            # inside the print); both calls are kept because it is not
            # visible from this method whether the call is idempotent.
            self.initialize_types(X)
            print('Retyping:', self.initialize_types(X))
            # Reinitialize the workflow with the re-derived column types.
            workflow = nvt.Workflow(cat_names=self.categor_columns,
                                    cont_names=self.numer_columns,
                                    label_name=y_names,
                                    client=self.client)

        # OutlDetect https://nvidia.github.io/NVTabular/main/api/ops/clip.html
        if (len(self.numer_columns) != 0
                and outliers_detection_technique is not None):
            lower, upper = self.outldetect(outliers_detection_technique,
                                           X[self.numer_columns])
            # The bounds are indexed positionally, matching numer_columns.
            for i, column in enumerate(self.numer_columns):
                message = (
                    f'column: {column}, lower: {lower[i]}, upper: {upper[i]}'
                )
                logging.info(message)
                print(message)
                workflow.add_preprocess(
                    ops.Clip(min_value=lower[i],
                             max_value=upper[i],
                             columns=[column]))

        # FillMissing https://nvidia.github.io/NVTabular/main/api/ops/fillmissing.html
        if fill_with_value == 'zeros':
            workflow.add_preprocess(
                ops.FillMissing(fill_val=0,
                                columns=self.categor_columns +
                                self.numer_columns))

        if fill_with_value == 'extreme_values':
            extrim_values = {}
            if len(self.numer_columns) != 0:
                extrim_values.update(
                    self.extrvalsdetect(X[self.numer_columns],
                                        'numer_columns'))

            if len(self.categor_columns) != 0:
                extrim_values.update(
                    self.extrvalsdetect(X[self.categor_columns],
                                        'categor_columns'))
            logging.info(f'extrim_values: {extrim_values}')

            # The context manager closes the file even if pickling fails.
            with open('extrim_values', 'wb') as output:
                pickle.dump(extrim_values, output)

            for column, fill_val in extrim_values.items():
                workflow.add_preprocess(
                    ops.FillMissing(fill_val=fill_val, columns=[column]))

        if fill_with_value == 'mean-median':
            if len(self.categor_columns) != 0:
                workflow.add_preprocess(
                    ops.FillMedian(columns=self.categor_columns,
                                   preprocessing=True,
                                   replace=True))
            if len(self.numer_columns) != 0:
                # NOTE(review): X was already produced by dd.from_pandas at
                # the top of this method; confirm that wrapping the selection
                # in dd.from_pandas again is intended.
                means = list(
                    dd.from_pandas(
                        X[self.numer_columns],
                        npartitions=self.n_gpus).mean().compute().values)
                for fill_val, column in zip(means, self.numer_columns):
                    workflow.add_preprocess(
                        ops.FillMissing(fill_val=fill_val, columns=[column]))

        # https://nvidia.github.io/NVTabular/main/api/ops/targetencoding.html
        # NOTE(review): this reads self.y_names and self.cat_groups rather
        # than the y_names argument; confirm that is intended.
        if targetencoding and len(self.y_names) != 0:
            if len(self.cat_groups) == 0:
                print(
                    '\n Target encoding will be applied to all categorical columns'
                )
                cat_groups = self.categor_columns
            else:
                cat_groups = self.cat_groups
            workflow.add_preprocess(
                ops.TargetEncoding(cat_groups=cat_groups,
                                   cont_target=self.y_names))

        workflow.finalize()
        dataset = nvt.Dataset(X)

        tmp_output_path = "./parquet_data_tmp"
        workflow.apply(
            dataset,
            output_format="parquet",
            output_path=tmp_output_path,
            shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
            out_files_per_proc=1,  # Number of output files per worker
        )
        try:
            files = glob.glob(tmp_output_path + "/*.parquet")
            X_final = cudf.read_parquet(files[0])
            for path in files[1:]:
                X_final = X_final.append(cudf.read_parquet(path))
        finally:
            # Remove the temporary files even when reading them back fails.
            shutil.rmtree(tmp_output_path, ignore_errors=True)

        if file_path is not None:
            X_final.to_csv(file_path, index=False)
        return X_final
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    """Check that a tag-based selector feeds the tagged columns to ``op``.

    ``col1`` and ``col2`` carry tag ``"c"``; ``col3`` does not. The number of
    output columns after ``fit_schema`` must equal the number of columns
    ``op`` reports for an explicit ``["col1", "col2"]`` selector.
    """
    column_schemas = [
        ColumnSchema("col1", tags=["b", "c", "d"]),
        ColumnSchema("col2", tags=["c", "d"]),
        ColumnSchema("col3", tags=["d"]),
    ]
    schema = Schema(column_schemas)

    tagged_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(tagged_features)
    workflow.fit_schema(schema)

    expected = op.output_column_names(ColumnSelector(["col1", "col2"]))
    produced_names = workflow.output_schema.column_names
    assert len(produced_names) == len(expected.names)
Exemplo n.º 13
0
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],
)
@pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]])
def test_schema_out(tags, properties, selection, op):
    # Create columnSchemas
    column_schemas = []
    all_cols = []
    for x in range(5):
        all_cols.append(str(x))
        column_schemas.append(ColumnSchema(str(x), tags=tags, properties=properties))

    # Turn to Schema
    schema = Schema(column_schemas)