def test_lambdaop_misalign(cpu):
    size = 12
    df0 = pd.DataFrame(
        {
            "a": np.arange(size),
            "b": np.random.choice(["apple", "banana", "orange"], size),
            "c": np.random.choice([0, 1], size),
        }
    )
    ddf0 = dd.from_pandas(df0, npartitions=4)

    cont_names = ColumnGroup(["a"])
    cat_names = ColumnGroup(["b"])
    label = ColumnGroup(["c"])
    if cpu:
        label_feature = label >> (lambda col: np.where(col == 4, 0, 1))
    else:
        label_feature = label >> (lambda col: cp.where(col == 4, 0, 1))
    workflow = nvt.Workflow(cat_names + cont_names + label_feature)

    dataset = nvt.Dataset(ddf0, cpu=cpu)
    transformed = workflow.transform(dataset)
    assert_eq_dd(
        df0[["a", "b"]],
        transformed.to_ddf().compute()[["a", "b"]],
        check_index=False,
    )

def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)

def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()

def test_groupby_op(keys, cpu):
    # Initial timeseries dataset
    size = 60
    df1 = pd.DataFrame(
        {
            "name": np.random.choice(["Dave", "Zelda"], size=size),
            "id": np.random.choice([0, 1], size=size),
            "ts": np.linspace(0.0, 10.0, num=size),
            "x": np.arange(size),
            "y": np.linspace(0.0, 10.0, num=size),
            "shuffle": np.random.uniform(low=0.0, high=10.0, size=size),
        }
    )
    df1 = df1.sort_values("shuffle").drop(columns="shuffle").reset_index(drop=True)

    # Create a ddf, and be sure to shuffle by the groupby keys
    ddf1 = dd.from_pandas(df1, npartitions=3).shuffle(keys)
    dataset = nvt.Dataset(ddf1, cpu=cpu)

    # Define Groupby Workflow
    groupby_features = ColumnGroup(["name", "id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=keys,
        sort_cols=["ts"],
        aggs={
            "x": ["list", "sum"],
            "y": ["first", "last"],
            "ts": ["min"],
        },
        name_sep="-",
    )
    processor = nvtabular.Workflow(groupby_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # Check list-aggregation ordering
    x = new_gdf["x-list"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    sums = []
    for el in x.values:
        _el = pd.Series(el)
        sums.append(_el.sum())
        assert _el.is_monotonic_increasing

    # Check that list sums match sum aggregation
    x = new_gdf["x-sum"]
    x = x.to_pandas() if hasattr(x, "to_pandas") else x
    assert list(x) == sums

    # Check basic behavior of "y" column
    assert (new_gdf["y-first"] < new_gdf["y-last"]).all()

def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns

def test_column_group_select():
    df = cudf.DataFrame(
        {
            "a": [1, 4, 9, 16, 25],
            "b": [0, 1, 2, 3, 4],
            "c": [25, 16, 9, 4, 1],
        }
    )

    input_features = ColumnGroup(["a", "b", "c"])
    sqrt_features = input_features[["a", "c"]] >> cudf.sqrt
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    expected = cudf.DataFrame()
    expected["a"] = cudf.sqrt(df["a"])
    expected["c"] = cudf.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)

def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo_location": raw})

    geo_location = ColumnGroup(["geo_location"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(postfix="_country")
    geo_features = state + country + geo_location >> ops.HashBucket(num_buckets=100)

    # for this workflow we don't have any stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = cudf.DataFrame()
    expected["geo_location_state"] = data["geo_location"].str.slice(0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)

def test_nested_column_group():
    df = cudf.DataFrame(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )

    country = (
        ColumnGroup(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = [country + "user"] + country + "user" >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")

def test_lambdaop(tmpdir, df, dataset, gpu_memory_frac, engine):
    df_copy = df.copy()

    # Substring
    # Replacement
    substring = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.slice(1, 3))
    processor = nvtabular.Workflow(substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"].str.slice(1, 3), check_index=False)
    assert_eq_dd(new_gdf["name-string"], df_copy["name-string"].str.slice(1, 3), check_index=False)

    # No Replacement from old API (skipped for other examples)
    substring = (
        ColumnGroup(["name-cat", "name-string"])
        >> (lambda col: col.str.slice(1, 3))
        >> ops.Rename(postfix="_slice")
    )
    processor = nvtabular.Workflow(["name-cat", "name-string"] + substring)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat_slice"],
        df_copy["name-cat"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(
        new_gdf["name-string_slice"],
        df_copy["name-string"].str.slice(1, 3),
        check_index=False,
        check_names=False,
    )
    assert_eq_dd(new_gdf["name-cat"], df_copy["name-cat"], check_index=False)
    assert_eq_dd(new_gdf["name-string"], df_copy["name-string"], check_index=False)

    # Replace
    # Replacement
    oplambda = ColumnGroup(["name-cat", "name-string"]) >> (lambda col: col.str.replace("e", "XX"))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert_eq_dd(
        new_gdf["name-cat"], df_copy["name-cat"].str.replace("e", "XX"), check_index=False
    )
    assert_eq_dd(
        new_gdf["name-string"], df_copy["name-string"].str.replace("e", "XX"), check_index=False
    )

    # astype
    # Replacement
    oplambda = ColumnGroup(["id"]) >> (lambda col: col.astype(float))
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf["id"].dtype == "float64"

    # Workflow
    # Replacement
    oplambda = (
        ColumnGroup(["name-cat"])
        >> (lambda col: col.astype(str).str.slice(0, 1))
        >> ops.Categorify()
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert is_integer_dtype(new_gdf["name-cat"].dtype)

    oplambda = (
        ColumnGroup(["name-cat", "name-string"]) >> ops.Categorify() >> (lambda col: col + 100)
    )
    processor = nvtabular.Workflow(oplambda)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    assert is_integer_dtype(new_gdf["name-cat"].dtype)
    assert np.sum(new_gdf["name-cat"] < 100) == 0

def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
    rmm.reinitialize(managed_memory=False)

    # Load the document metadata tables and convert them to sparse (COO) matrices
    # for the ColumnSimilarity features below
    documents_categories_path = os.path.join(data_bucket_folder, "documents_categories.csv")
    documents_topics_path = os.path.join(data_bucket_folder, "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder, "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes
    )

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    # minimum view counts required before a CTR estimate is trusted (otherwise set to 0)
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    geo_location = ColumnGroup(["geo_location"])
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    # per-id click sums and counts, then CTR = clicked_sum / count,
    # zeroed out for ids below their ctr_thresh count
    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    ctr_cols = (
        stat_cols
        - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (
                (col) / (gdf[col.name.replace("_clicked_sum", "_count")])
            ).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            dependency=stat_cols
            - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]

    # tf-idf similarity between the viewed document and the promoted document
    # over categories, topics, and entities
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices, local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow