def test_target_encode_multi(tmpdir, npartitions):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(
        ["num", "num_2"], out_path=str(tmpdir), kfold=1, p_smooth=5, out_dtype="float32"
    )

    workflow = nvt.Workflow(te_features)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    p = nvt.Workflow(
        cat_names=["ad_id", "source_id", "platform"],
        cont_names=["cont"],
        label_name=["clicked"],
        client=client,
    )
    p.add_feature(ops.FillMissing())
    p.add_feature(ops.Normalize())
    p.add_feature(ops.Categorify())
    p.add_feature(
        ops.TargetEncoding(
            cat_groups=["ad_id", "source_id", "platform"],
            cont_target="clicked",
            kfold=5,
            fold_seed=42,
            p_smooth=20,
        )
    )

    p.apply(nvt.Dataset(gdf_test), record_stats=True)
    assert p.stats
def test_fit_schema_works_with_raw_column_dependencies():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")

    workflow = Workflow(cat_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["TE_x_cost", "TE_y_cost"]
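# A minimal sketch of the TE_<column>_<target> naming scheme asserted above.
# The data here is hypothetical, and it assumes an NVTabular build with CPU
# (pandas) support so nvt.Dataset can wrap a pandas DataFrame directly;
# out_path="./" mirrors the tests elsewhere in this file.
import pandas as pd
import nvtabular as nvt
from nvtabular import ops

df = pd.DataFrame({"x": ["a", "b", "a"], "y": ["u", "u", "v"], "cost": [1.0, 2.0, 3.0]})
te = ["x", "y"] >> ops.TargetEncoding("cost", out_path="./", kfold=1)
out = nvt.Workflow(te).fit_transform(nvt.Dataset(df)).to_ddf().compute()
print(sorted(out.columns))  # expected: ['TE_x_cost', 'TE_y_cost']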
def test_target_encode_multi(tmpdir, npartitions):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_names = ["cat", "cat2"]
    cont_names = ["num", "num_2"]
    label_name = []
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            ["num", "num_2"],  # cont_target
            out_path=str(tmpdir),
            kfold=1,
            p_smooth=5,
            out_dtype="float32",
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
def test_fit_schema_works_with_grouped_node_inputs():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y", ("x", "y")]) >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert sorted(workflow1.output_schema.column_names) == sorted(
        ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"]
    )
def test_fit_schema_works_with_node_dependencies():
    schema = Schema(["x", "y", "cost"])

    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding(cont_features)

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["TE_x_cost_renamed", "TE_y_cost_renamed"]
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed):
    df = cudf.DataFrame(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    df = dask_cudf.from_cudf(df, npartitions=3)

    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            "Cost",  # cont_target
            out_path=str(tmpdir),
            kfold=kfold,
            out_col="test_name",
            out_dtype="float32",
            fold_seed=fold_seed,
            drop_folds=False,  # Keep folds to validate
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "test_name" in df_out.columns
    assert df_out["test_name"].dtype == "float32"

    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]
        check = cudf.io.read_parquet(processor.stats["te_stats"][name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check)
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    if cpu:
        df = dd.from_pandas(df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]
        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
def test_target_encode_group():
    df = dispatch._make_df(
        {
            "Cost": range(15),
            "Post": [1, 2, 3, 4, 5] * 3,
            "Author": ["A"] * 5 + ["B"] * 5 + ["C"] * 2 + ["D"] * 3,
            "Engaging_User": ["A"] * 5 + ["B"] * 3 + ["E"] * 2 + ["D"] * 3 + ["G"] * 2,
        }
    )

    cat_groups = ["Author", "Engaging_User"]
    labels = ColumnSelector(["Post"]) >> ops.LambdaOp(lambda col: (col > 3).astype("int8"))
    te_features = cat_groups >> ops.TargetEncoding(
        labels,
        out_path="./",
        kfold=1,
        out_dtype="float32",
        drop_folds=False,  # Keep folds to validate
    )

    workflow = nvt.Workflow(te_features + ["Author", "Engaging_User"])
    workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")
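# For reference, the shrinkage that p_smooth controls in the tests above is
# the smoothed-mean formula described in the TargetEncoding documentation.
# A hand-computed sketch on hypothetical values (kfold=1, so no out-of-fold
# splitting is involved):
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])           # target column
mask = np.array([True, True, False, False])  # rows belonging to category "A"
p_smooth = 5
count, cat_mean, global_mean = mask.sum(), y[mask].mean(), y.mean()
# TE = (count * cat_mean + p_smooth * global_mean) / (count + p_smooth)
te_A = (count * cat_mean + p_smooth * global_mean) / (count + p_smooth)
print(te_A)  # the per-category mean 1.5 is shrunk toward the global mean 2.5 -> ~2.21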
def processing(
    self,
    X_pd,
    y_names=[],
    encode_categor_type=None,  # 'categorify', 'onehotencoding'
    outliers_detection_technique=None,  # 'iqr_proximity_rule', 'gaussian_approximation', 'quantiles'
    fill_with_value=None,  # 'extreme_values', 'zeros', 'mean-median'
    targetencoding=False,
    file_path=None,
):
    X = dd.from_pandas(X_pd, npartitions=self.n_gpus)
    X = X.replace(np.nan, None)

    # Initialize column types on first use; fall back to defaults when the
    # optional module-level overrides (n_unique_val_th_, etc.) are not defined.
    try:
        self.time_columns
    except AttributeError:
        try:
            self.initialize_types(
                X,
                n_unique_val_th=n_unique_val_th_,
                categor_columns_keep=categor_columns_keep_,
                numer_columns_keep=numer_columns_keep_,
            )
        except NameError:
            self.initialize_types(X)

    workflow = nvt.Workflow(
        cat_names=self.categor_columns,
        cont_names=self.numer_columns,
        label_name=y_names,
        client=self.client,
    )

    # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html

    # Categorify https://nvidia.github.io/NVTabular/main/api/ops/categorify.html
    if encode_categor_type == 'categorify':
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(ops.Categorify(columns=self.categor_columns, out_path='./'))

    if encode_categor_type == 'onehotencoding':
        # OneHotEncoder().get_feature_names(input_features=<list of features encoded>) does not work
        # lengths=True - chunk sizes can be computed
        for column in self.categor_columns:
            # X[column] = X[column].astype(str)
            X_cat_encoded = OneHotEncoder().fit_transform(
                X[column].to_dask_array(lengths=True).reshape(-1, 1)
            )
            uvs = X[column].unique().compute().values
            X = X.drop([column], axis=1)
            X_cat_encoded = dd.from_array(X_cat_encoded.compute().todense())
            X_cat_encoded.columns = [column + '_{}'.format(uv) for uv in uvs]
            X = dd.concat([X, X_cat_encoded], axis=1)
            X = X.repartition(npartitions=2)

        # Drop the dummy columns produced for missing values
        for column in X.columns:
            if any(str(column)[-4:] == t for t in ['_nan', 'None']):  # What else?
                X = X.drop([column], axis=1)

        self.initialize_types(X)
        print('Retyping:', self.initialize_types(X))

        # Reinitialize workflow with the new column types
        workflow = nvt.Workflow(
            cat_names=self.categor_columns,
            cont_names=self.numer_columns,
            label_name=y_names,
            client=self.client,
        )

    # OutlDetect https://nvidia.github.io/NVTabular/main/api/ops/clip.html
    if (len(self.numer_columns) != 0) and (outliers_detection_technique is not None):
        lower, upper = self.outldetect(outliers_detection_technique, X[self.numer_columns])
        for i in range(len(self.numer_columns)):
            logging.info(f'column: {self.numer_columns[i]}, lower: {lower[i]}, upper: {upper[i]}')
            print(f'column: {self.numer_columns[i]}, lower: {lower[i]}, upper: {upper[i]}')
            workflow.add_preprocess(
                ops.Clip(min_value=lower[i], max_value=upper[i], columns=[self.numer_columns[i]])
            )

    # FillMissing https://nvidia.github.io/NVTabular/main/api/ops/fillmissing.html
    if fill_with_value == 'zeros':
        workflow.add_preprocess(
            ops.FillMissing(fill_val=0, columns=self.categor_columns + self.numer_columns)
        )

    if fill_with_value == 'extreme_values':
        extrim_values = {}
        if len(self.numer_columns) != 0:
            extrim_values.update(self.extrvalsdetect(X[self.numer_columns], 'numer_columns'))
        if len(self.categor_columns) != 0:
            extrim_values.update(self.extrvalsdetect(X[self.categor_columns], 'categor_columns'))
        logging.info(f'extrim_values: {extrim_values}')

        with open('extrim_values', 'wb') as output:
            pickle.dump(extrim_values, output)

        for fill_val, column in zip(list(extrim_values.values()), list(extrim_values.keys())):
            workflow.add_preprocess(ops.FillMissing(fill_val=fill_val, columns=[column]))

    if fill_with_value == 'mean-median':
        if len(self.categor_columns) != 0:
            workflow.add_preprocess(
                ops.FillMedian(columns=self.categor_columns, preprocessing=True, replace=True)
            )
        if len(self.numer_columns) != 0:
            means = list(
                dd.from_pandas(X[self.numer_columns], npartitions=self.n_gpus)
                .mean()
                .compute()
                .values
            )
            for fill_val, column in zip(means, self.numer_columns):
                workflow.add_preprocess(ops.FillMissing(fill_val=fill_val, columns=[column]))

    if targetencoding:
        # https://nvidia.github.io/NVTabular/main/api/ops/targetencoding.html
        if len(self.y_names) != 0:
            if len(self.cat_groups) == 0:
                print('\n Target encoding will be applied to all categorical columns')
                workflow.add_preprocess(
                    ops.TargetEncoding(cat_groups=self.categor_columns, cont_target=self.y_names)
                )
            else:
                workflow.add_preprocess(
                    ops.TargetEncoding(cat_groups=self.cat_groups, cont_target=self.y_names)
                )

    # -----------------------------------------------------------------------------------------
    workflow.finalize()
    dataset = nvt.Dataset(X)
    tmp_output_path = "./parquet_data_tmp"
    workflow.apply(
        dataset,
        output_format="parquet",
        output_path=tmp_output_path,
        shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
        out_files_per_proc=1,  # Number of output files per worker
    )

    files = glob.glob(tmp_output_path + "/*.parquet")
    X_final = cudf.read_parquet(files[0])
    for i in range(1, len(files)):
        X_final = X_final.append(cudf.read_parquet(files[i]))

    # Delete temporary files
    shutil.rmtree(tmp_output_path, ignore_errors=True)

    # if len(self.rest_col_names) != 0:
    #     X_final = pd.concat([X_final.to_pandas(), X_pd[self.rest_col_names]], axis=1)

    if file_path is not None:
        X_final.to_csv(file_path, index=False)

    return X_final
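# A hypothetical call of `processing` above. `Preprocessor` stands in for the
# class that owns the method (not shown here), and the column names are
# invented for illustration; `targetencoding=True` assumes the instance also
# defines `y_names` and `cat_groups` attributes, which the method reads.
preprocessor = Preprocessor()  # assumed to set n_gpus, client, etc.
X_clean = preprocessor.processing(
    X_pd,
    y_names=["clicked"],
    encode_categor_type="categorify",  # or 'onehotencoding'
    outliers_detection_technique="iqr_proximity_rule",
    fill_with_value="mean-median",
    targetencoding=True,
    file_path="processed.csv",  # also writes the result to CSV
)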
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(workflow.output_schema.column_names) == len(output_cols.names)
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("1"),
        ops.FillMissing(),
        ops.Groupby(["1"]),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby(["1"]),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding(["1"]),
        ops.AddMetadata(tags=["excellent"], properties={"domain": {"min": 0, "max": 20}}),
        ops.ValueCount(),
    ],
)
@pytest.mark.parametrize("selection", [["1"], ["2", "3"], ["1", "2", "3", "4"]])
def test_schema_out(tags, properties, selection, op):
    # Create columnSchemas
    column_schemas = []
    all_cols = []
    for x in range(5):
        all_cols.append(str(x))
        column_schemas.append(ColumnSchema(str(x), tags=tags, properties=properties))

    # Turn to Schema
    schema = Schema(column_schemas)