def test_avro_basic(tmpdir, part_size, size, nfiles):
    # Require uavro and fastavro library.
    # Note that fastavro is only required to write
    # avro files for testing, while uavro is actually
    # used by AvroDatasetEngine.
    fa = pytest.importorskip("fastavro")
    pytest.importorskip("uavro")

    # Define avro schema
    schema = fa.parse_schema(
        {
            "name": "avro.example.User",
            "type": "record",
            "fields": [
                {"name": "name", "type": "string"},
                {"name": "age", "type": "int"},
            ],
        }
    )

    # Write avro dataset with two files.
    # Collect block and record (row) count while writing.
    nblocks = 0
    nrecords = 0
    paths = [os.path.join(str(tmpdir), f"test.{i}.avro") for i in range(nfiles)]
    records = []
    for path in paths:
        names = np.random.choice(name_list, size)
        ages = np.random.randint(18, 100, size)
        data = [{"name": names[i], "age": ages[i]} for i in range(size)]
        with open(path, "wb") as f:
            fa.writer(f, schema, data)
        with open(path, "rb") as fo:
            avro_reader = fa.block_reader(fo)
            for block in avro_reader:
                nrecords += block.num_records
                nblocks += 1
                records += list(block)
    if nfiles == 1:
        paths = paths[0]

    # Read back with dask.dataframe
    df = nvt.Dataset(paths, part_size=part_size, engine="avro").to_ddf()

    # Check basic length and partition count
    if part_size == "1KB":
        assert df.npartitions == nblocks
    assert len(df) == nrecords

    # Full comparison
    expect = pd.DataFrame.from_records(records)
    expect["age"] = expect["age"].astype("int32")
    assert_eq(df.compute().reset_index(drop=True), expect)
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk

    assert idx == len_dl
    assert first_chunk == first_chunk_2
def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
    out_files_per_proc = 2
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    size = 25
    row_group_size = 5

    cont_names = ["cont1", "cont2"]
    cat_names = ["cat1", "cat2"]
    label_name = ["label"]

    df = pd.DataFrame(
        {
            "cont1": np.arange(size, dtype=np.float64),
            "cont2": np.arange(size, dtype=np.float64),
            "cat1": np.arange(size, dtype=np.int32),
            "cat2": np.arange(size, dtype=np.int32),
            "label": np.arange(size, dtype=np.float64),
        }
    )
    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")

    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)

    cat_features = cat_names >> ops.Categorify()
    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp
    workflow = Workflow(
        cat_features + cont_features + label_name, client=client if use_client else None
    )

    workflow.fit(dataset)

    # Force dtypes
    dict_dtypes = {}
    for col in cont_names:
        dict_dtypes[col] = np.float32
    for col in cat_names:
        dict_dtypes[col] = np.float32
    for col in label_name:
        dict_dtypes[col] = np.int64

    workflow.transform(dataset).to_parquet(
        # apply_offline=apply_offline, Not any more?
        # record_stats=apply_offline, Not any more?
        output_path=out_path,
        shuffle=shuffle,
        out_files_per_proc=out_files_per_proc,
        dtypes=dict_dtypes,
    )

    # Check dtypes
    for filename in glob.glob(os.path.join(out_path, "*.parquet")):
        gdf = cudf.io.read_parquet(filename)
        assert dict(gdf.dtypes) == dict_dtypes
def test_logop_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [[np.exp(0) - 1, np.exp(1) - 1], [np.exp(2) - 1], []]

    features = ["vals"] >> nvt.ops.LogOp()
    workflow = nvt.Workflow(features)
    new_df = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = dispatch._make_df(device="cpu" if cpu else "gpu")
    expected["vals"] = [[0.0, 1.0], [2.0], []]

    assert_eq(expected, new_df)
def _verify_workflow_on_tritonserver(
    tmpdir,
    workflow,
    df,
    model_name,
    output_model="tensorflow",
    model_info=None,
    sparse_max=None,
):
    """tests that the nvtabular workflow produces the same results when run locally in the
    process, and when run in tritonserver"""
    # fit the workflow and test on the input
    dataset = nvt.Dataset(df)
    workflow.fit(dataset)

    local_df = workflow.transform(dataset).to_ddf().compute(scheduler="synchronous")
    for col in workflow.output_node.output_columns.names:
        if sparse_max and col in sparse_max.keys():
            workflow.output_dtypes[col] = workflow.output_dtypes.get(col).element_type

    triton.generate_nvtabular_model(
        workflow=workflow,
        name=model_name,
        output_path=tmpdir + f"/{model_name}",
        version=1,
        output_model=output_model,
        output_info=model_info,
        sparse_max=sparse_max,
        backend=BACKEND,
    )

    inputs = triton.convert_df_to_triton_input(df.columns, df)
    outputs = [grpcclient.InferRequestedOutput(col) for col in workflow.output_dtypes.keys()]
    with run_triton_server(tmpdir) as client:
        response = client.infer(model_name, inputs, outputs=outputs)

        for col in workflow.output_dtypes.keys():
            features = response.as_numpy(col)
            if sparse_max and col in sparse_max:
                features = features.tolist()
                triton_df = _make_df()
                triton_df[col] = features
            else:
                triton_df = _make_df({col: features.reshape(features.shape[0])})
            assert_eq(triton_df, local_df[[col]])
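# A minimal usage sketch for the helper above (hypothetical, not a test from the source):
# it builds a tiny Categorify workflow and checks that local and Triton outputs agree.
# The column names and model name are illustrative assumptions; pass a real tmpdir path.
def _example_verify_workflow_usage(tmpdir):
    df = _make_df({"user": ["a", "b", "a"], "item": ["x", "y", "y"]})
    workflow = nvt.Workflow(["user", "item"] >> nvt.ops.Categorify())
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "example_model")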
def test_categorify_freq_limit(tmpdir, freq_limit):
    df = pd.DataFrame(
        {
            "Author": [
                "User_A", "User_E", "User_B", "User_C", "User_A",
                "User_E", "User_B", "User_C", "User_B", "User_C",
            ],
            "Engaging User": [
                "User_B", "User_B", "User_A", "User_D", "User_B",
                "User_c", "User_A", "User_D", "User_D", "User_D",
            ],
        }
    )

    cat_names = ["Author", "Engaging User"]
    cont_names = []
    label_name = []

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(
        ops.Categorify(columns=cat_names, freq_threshold=freq_limit, out_path=str(tmpdir))
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # Column combinations are encoded
    if isinstance(freq_limit, dict):
        assert df_out["Author"].max() == 2
        assert df_out["Engaging User"].max() == 1
    else:
        assert len(df["Author"].unique()) == df_out["Author"].max()
        assert len(df["Engaging User"].unique()) == df_out["Engaging User"].max()
def test_mh_support(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_preprocess(ops.HashBucket(num_buckets=10))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats, conts, labels = batch
        cats, mh = cats
        # mh is a tuple of dictionaries {Column name: (values, offsets)}
        assert len(mh) == len(cat_names)
        assert not cats

    assert idx > 0
def dataset(request, paths, engine):
    try:
        gpu_memory_frac = request.getfixturevalue("gpu_memory_frac")
    except Exception:
        gpu_memory_frac = 0.01

    kwargs = {}
    if engine == "csv-no-header":
        kwargs["names"] = allcols_csv

    return nvtabular.Dataset(paths, part_mem_fraction=gpu_memory_frac, **kwargs)
def test_column_similarity(on_device, metric):
    categories = cupy.sparse.coo_matrix(
        (
            cupy.ones(14),
            (
                cupy.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5]),
                cupy.array([0, 1, 2, 1, 2, 3, 3, 4, 5, 1, 1, 2, 0, 1]),
            ),
        )
    )
    input_df = cudf.DataFrame({"left": [0, 0, 0, 0, 4], "right": [0, 1, 2, 3, 5]})

    op = ColumnSimilarity("output", "left", categories, "right", metric=metric, on_device=on_device)
    workflow = nvtabular.Workflow(cat_names=["left", "right"], cont_names=[], label_name=[])
    workflow.add_feature(op)
    workflow.apply(nvtabular.Dataset(input_df), output_path=None)
    df = workflow.get_ddf().compute()

    output = df.output.values
    if metric in ("tfidf", "cosine"):
        # distance from document 0 to itself should be 1, since these metrics are fully normalized
        assert float(output[0]) == pytest.approx(1)

    # distance from document 0 to document 2 should be 0 since they have no features in common
    assert output[2] == 0

    # distance from document 4 to 5 should be non-zero (have category 1 in common)
    assert output[4] != 0

    # make sure that we can operate multiple times on the same matrix correctly
    op = ColumnSimilarity(
        "output", "left", categories, "right", metric="inner", on_device=on_device
    )
    workflow = nvtabular.Workflow(cat_names=["left", "right"], cont_names=[], label_name=[])
    workflow.add_feature(op)
    workflow.apply(nvtabular.Dataset(df), output_path=None)
    df = workflow.get_ddf().compute()
    assert float(df.output.values[0]) == pytest.approx(3)
def test_column_similarity(on_device, metric):
    categories = coo_matrix(
        (
            cupy.ones(14),
            (
                cupy.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5]),
                cupy.array([0, 1, 2, 1, 2, 3, 3, 4, 5, 1, 1, 2, 0, 1]),
            ),
        )
    )
    input_df = cudf.DataFrame({"left": [0, 0, 0, 0, 4], "right": [0, 1, 2, 3, 5]})

    sim_features = [["left", "right"]] >> ColumnSimilarity(
        categories, metric=metric, on_device=on_device
    )
    workflow = nvtabular.Workflow(sim_features)
    df = workflow.transform(nvtabular.Dataset(input_df)).to_ddf().compute()

    output = df["left_right_sim"].values
    if metric in ("tfidf", "cosine"):
        # distance from document 0 to itself should be 1, since these metrics are fully normalized
        assert float(output[0]) == pytest.approx(1)

    # distance from document 0 to document 2 should be 0 since they have no features in common
    assert output[2] == 0

    # distance from document 4 to 5 should be non-zero (have category 1 in common)
    assert output[4] != 0

    # make sure that we can operate multiple times on the same matrix correctly
    sim_features = [["left", "right"]] >> ColumnSimilarity(
        categories, metric="inner", on_device=on_device
    )
    workflow = nvtabular.Workflow(sim_features)
    df = workflow.transform(nvtabular.Dataset(input_df)).to_ddf().compute()
    assert float(df["left_right_sim"].values[0]) == pytest.approx(3)
def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, drop_duplicates):
    # Define "external" table
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()
    if kind_ext == "pandas":
        df_ext = df_ext.to_pandas()
    elif kind_ext == "arrow":
        df_ext = df_ext.to_arrow()
    elif kind_ext == "parquet":
        path = tmpdir.join("external.parquet")
        df_ext.to_parquet(path)
        df_ext = path
    elif kind_ext == "csv":
        path = tmpdir.join("external.csv")
        df_ext.to_csv(path)
        df_ext = path

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed):
    df = cudf.DataFrame(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    df = dask_cudf.from_cudf(df, npartitions=3)

    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            "Cost",  # cont_target
            out_path=str(tmpdir),
            kfold=kfold,
            out_col="test_name",
            out_dtype="float32",
            fold_seed=fold_seed,
            drop_folds=False,  # Keep folds to validate
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "test_name" in df_out.columns
    assert df_out["test_name"].dtype == "float32"

    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]
        check = cudf.io.read_parquet(processor.stats["te_stats"][name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check)
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df):
    # Copy files to mock s3 bucket
    files = {}
    for i, path in enumerate(paths):
        with open(path, "rb") as f:
            fbytes = f.read()
        fn = path.split(os.path.sep)[-1]
        files[fn] = BytesIO()
        files[fn].write(fbytes)
        files[fn].seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        fn = "_metadata"
        files[fn] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[fn])
        files[fn].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files):
        # Create nvt.Dataset from mock s3 paths
        url = f"s3://{engine}" if engine == "parquet" else f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        gdf = cudf.concat(list(dataset.to_iter()))[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        cats = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(conts + cats + label_name)
        processor.fit(dataset)
def save_stats(
    data_bucket_folder,
    output_train_folder,
    train_path,
    output_valid_folder,
    valid_path,
    stats_file,
    hash_spec,
    local_directory,
    dask,
):
    devices = get_devices()
    shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True
    workflow = create_workflow(
        data_bucket_folder=data_bucket_folder,
        hash_spec=hash_spec,
        devices=devices,
        local_directory=local_directory,
        dask=dask,
    )

    train_dataset = nvt.Dataset(train_path, part_size="1GB")
    valid_dataset = nvt.Dataset(valid_path, part_size="150MB")

    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_parquet(
        output_path=output_train_folder, shuffle=shuffle, out_files_per_proc=8
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=output_valid_folder, shuffle=None, output_files=8
    )

    workflow.save(stats_file)

    return workflow
def test_categorify_max_size(max_emb_size):
    df = cudf.DataFrame(
        {
            "Author": [
                "User_A", "User_E", "User_B", "User_C", "User_A", "User_E",
                "User_B", "User_C", "User_D", "User_F", "User_F",
            ],
            "Engaging_User": [
                "User_B", "User_B", "User_A", "User_D", "User_B", "User_M",
                "User_A", "User_D", "User_N", "User_F", "User_E",
            ],
        }
    )

    cat_names = ["Author", "Engaging_User"]
    buckets = 3
    dataset = nvt.Dataset(df)
    cat_features = cat_names >> ops.Categorify(max_size=max_emb_size, num_buckets=buckets)
    processor = nvt.Workflow(cat_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    if isinstance(max_emb_size, int):
        max_emb_size = {name: max_emb_size for name in cat_names}

    # check encoded values after frequency hashing with a fixed embedding size
    assert new_gdf["Author"].max() <= max_emb_size["Author"]
    assert new_gdf["Engaging_User"].max() <= max_emb_size["Engaging_User"]

    # check that the embedding size does not exceed max_size after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Author"][0] <= max_emb_size["Author"]
    assert (
        nvt.ops.get_embedding_sizes(processor)["Engaging_User"][0] <= max_emb_size["Engaging_User"]
    )
def test_generate_triton_multihot(tmpdir):
    df = _make_df(
        {
            "userId": ["a", "a", "b"],
            "movieId": ["1", "2", "2"],
            "genres": [["action", "adventure"], ["action", "comedy"], ["comedy"]],
        }
    )

    cats = ["userId", "movieId", "genres"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
def test_remove_columns():
    # _remove_columns was failing to export the criteo example, because
    # the label column was getting inserted into the subgroups of the output node
    # https://github.com/NVIDIA-Merlin/NVTabular/issues/1198
    label_columns = ["label"]
    cats = ["a"] >> ops.Categorify()
    conts = ["b"] >> ops.Normalize()
    workflow = nvt.Workflow(cats + conts + label_columns)

    df = pd.DataFrame({"a": ["a", "b"], "b": [1.0, 2.0], "label": [0, 1]})
    workflow.fit(nvt.Dataset(df))

    removed = ensemble._remove_columns(workflow, label_columns)
    assert set(removed.output_dtypes.keys()) == {"a", "b"}
def test_validater(tmpdir, batch_size):
    n_samples = 9
    rand = np.random.RandomState(0)

    gdf = cudf.DataFrame({"a": rand.randn(n_samples), "label": rand.randint(2, size=n_samples)})

    dataloader = tf_dataloader.KerasSequenceLoader(
        nvt.Dataset(gdf),
        batch_size=batch_size,
        cat_names=[],
        cont_names=["a"],
        label_names=["label"],
        shuffle=False,
    )

    input = tf.keras.Input(name="a", dtype=tf.float32, shape=(1,))
    x = tf.keras.layers.Dense(128, "relu")(input)
    x = tf.keras.layers.Dense(1, activation="softmax")(x)

    model = tf.keras.Model(inputs=input, outputs=x)
    model.compile("sgd", "binary_crossentropy", metrics=["accuracy", tf.keras.metrics.AUC()])

    validater = tf_dataloader.KerasSequenceValidater(dataloader)
    model.fit(dataloader, epochs=2, verbose=0, callbacks=[validater])

    predictions, labels = [], []
    for X, y_true in dataloader:
        y_pred = model(X)
        labels.extend(y_true.numpy()[:, 0])
        predictions.extend(y_pred.numpy()[:, 0])
    predictions = np.array(predictions)
    labels = np.array(labels)

    logs = {}
    validater.on_epoch_end(0, logs)
    auc_key = [i for i in logs.keys() if i.startswith("val_auc")][0]

    true_accuracy = (labels == (predictions > 0.5)).mean()
    estimated_accuracy = logs["val_accuracy"]
    assert np.isclose(true_accuracy, estimated_accuracy, rtol=1e-6)

    true_auc = roc_auc_score(labels, predictions)
    estimated_auc = logs[auc_key]
    assert np.isclose(true_auc, estimated_auc, rtol=1e-6)
def test_workflow_generate_columns(tmpdir, use_parquet):
    out_path = str(tmpdir.mkdir("processed"))
    path = str(tmpdir.join("simple.parquet"))

    # Stripped down dataset with geo_location codes like in Outbrain
    df = cudf.DataFrame({"geo_location": ["US>CA", "CA>BC", "US>TN>659"]})

    # defining a simple workflow that strips out the country code from the first two characters
    # of the geo_location code and sticks it in a new 'geo_location_country' field
    country = (
        ["geo_location"]
        >> ops.LambdaOp(f=lambda col: col.str.slice(0, 2))
        >> ops.Rename(postfix="_country")
    )
    cat_features = ["geo_location"] + country >> ops.Categorify()

    workflow = Workflow(cat_features)

    if use_parquet:
        df.to_parquet(path)
        dataset = nvt.Dataset(path)
    else:
        dataset = nvt.Dataset(df)

    # just make sure this works without errors
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(out_path)
def test_target_encode_multi(tmpdir, npartitions, cpu):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = dispatch._make_df({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=npartitions
        )
    else:
        df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(
        ["num", "num_2"], out_path=str(tmpdir), kfold=1, p_smooth=5, out_dtype="float32"
    )

    workflow = nvt.Workflow(te_features)

    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
def _convert_file(path, name, out_dir, gpu_mem_frac, fs, cols, dtypes):
    fn = f"{name}.parquet"
    out_path = fs.sep.join([out_dir, fn])
    writer = ParquetWriter(out_path, compression=None)
    for gdf in nvt.Dataset(
        path,
        engine="csv",
        names=cols,
        part_mem_fraction=gpu_mem_frac,
        sep="\t",
        dtypes=dtypes,
    ).to_iter():
        writer.write_table(gdf)
        del gdf
    md = writer.close(metadata_file_path=fn)
    return md
def test_fill_missing(tmpdir, df, dataset, engine):
    cont_names = ["x", "y"]
    cont_features = cont_names >> nvt.ops.FillMissing(fill_val=42)

    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    df = df.reset_index()
    dataset = nvt.Dataset(df)
    processor = nvt.Workflow(cont_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    for col in cont_names:
        assert np.all((df[col].fillna(42) - new_gdf[col]).abs().values <= 1e-2)
        assert new_gdf[col].isna().sum() == 0
def test_joingroupby_dependency(tmpdir):
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
            "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
        }
    )

    normalized_cost = ["Cost"] >> nvt.ops.NormalizeMinMax() >> nvt.ops.Rename(postfix="_normalized")
    groupby_features = ["Author"] >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=normalized_cost
    )
    workflow = nvt.Workflow(groupby_features)

    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    assert df_out["Author_Cost_normalized_sum"].to_arrow().to_pylist() == [1.0, 1.0, 1.0, 2.0, 2.0]
def test_torch_drp_reset(tmpdir, batch_size, drop_last, num_rows):
    df = nvt.dispatch._make_df(
        {
            "cat1": [1] * num_rows,
            "cat2": [2] * num_rows,
            "cat3": [3] * num_rows,
            "label": [0] * num_rows,
            "cont3": [3.0] * num_rows,
            "cont2": [2.0] * num_rows,
            "cont1": [1.0] * num_rows,
        }
    )
    path = os.path.join(tmpdir, "dataset.parquet")
    df.to_parquet(path)
    cat_names = ["cat3", "cat2", "cat1"]
    cont_names = ["cont3", "cont2", "cont1"]
    label_name = ["label"]

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset([path]),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=batch_size,
        drop_last=drop_last,
        device="cpu",
    )

    all_len = len(data_itr) if drop_last else len(data_itr) - 1
    all_rows = 0
    df_cols = df.columns.to_list()
    for idx, chunk in enumerate(data_itr):
        all_rows += len(chunk[0]["cat1"])
        if idx < all_len:
            for col in df_cols:
                if col in chunk[0].keys():
                    if nvt.dispatch.HAS_GPU:
                        assert (list(chunk[0][col].cpu().numpy()) == df[col].values_host).all()
                    else:
                        assert (list(chunk[0][col].cpu().numpy()) == df[col].values).all()

    if drop_last and num_rows % batch_size > 0:
        assert num_rows > all_rows
    else:
        assert num_rows == all_rows
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame(
        {
            "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
            "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
            "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
            "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
        }
    )

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding("clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df(
        {
            "Author": list(string.ascii_uppercase),
            "Engaging-User": list(string.ascii_lowercase),
            "Cost": range(26),
            "Post": [0, 1] * 13,
        }
    )
    if cpu:
        df = dd.from_pandas(df if isinstance(df, pd.DataFrame) else df.to_pandas(), npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features + ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
def run_perf_analyzer(model_path, input_data_path, num_rows=10, model_version=1):
    """Runs perf_analyzer and returns a dataframe with statistics from it

    Parameters
    ----------
    model_path : str
        The fullpath to the model to analyze.
    input_data_path : str
        Path to datafiles containing example data to query the model with. Can be anything
        we can pass to a nvt.Dataset object (csv file/parquet etc)
    num_rows : int
        How many rows to query for
    model_version : int
        Which model version to use
    """
    # load the workflow and get the base perf analyzer commandline
    model_name = os.path.basename(model_path)
    workflow_path = os.path.join(model_path, str(model_version), "workflow")
    workflow = nvt.Workflow.load(workflow_path)
    cmdline = _get_perf_analyzer_commandline(workflow, model_name, batch_size=num_rows)

    # read in the input data and write out as a JSON file
    df = nvt.Dataset(input_data_path).to_ddf().head(num_rows)
    json_data = _convert_df_to_triton_json(df, workflow.input_dtypes)

    with tempfile.NamedTemporaryFile("w", suffix=".json") as json_file:
        json.dump(json_data, json_file, indent=2)
        cmdline.extend(["--input-data", json_file.name])
        json_file.flush()

        with tempfile.NamedTemporaryFile("w", suffix=".csv") as csv_report:
            csv_report.close()
            cmdline.extend(["-f", csv_report.name])
            result = subprocess.run(cmdline, stdout=subprocess.PIPE, check=True, encoding="utf8")
            print(result.stdout)
            return pd.read_csv(csv_report.name)
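# A hedged usage sketch for run_perf_analyzer (illustrative only, not from the source):
# the model repository path and input data path below are placeholder assumptions and must
# point to an exported NVTabular model directory and data readable by nvt.Dataset.
def _example_run_perf_analyzer_usage():
    stats = run_perf_analyzer(
        model_path="/models/nvt_model",  # hypothetical Triton model directory
        input_data_path="/data/valid.parquet",  # hypothetical input data file
        num_rows=10,
        model_version=1,
    )
    print(stats)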
def test_hashed_cross(tmpdir, df, dataset, gpu_memory_frac, engine):
    # TODO: add tests for > 2 features, multiple crosses, etc.
    cat_names = [["name-string", "id"]]
    num_buckets = 10

    hashed_cross = cat_names >> ops.HashedCross(num_buckets)
    dataset = nvt.Dataset(df)
    processor = nvtabular.Workflow(hashed_cross)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check sums for determinism
    new_column_name = "_X_".join(cat_names[0])
    assert np.all(new_gdf[new_column_name].values >= 0)
    assert np.all(new_gdf[new_column_name].values <= 9)
    checksum = new_gdf[new_column_name].sum()

    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert new_gdf[new_column_name].sum() == checksum
def test_join_external_workflow(tmpdir, df, dataset, engine):
    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    # Define Workflow
    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    # Validate
    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns
def test_categorify_size(tmpdir, cpu, include_nulls):
    num_rows = 50
    num_distinct = 10

    possible_session_ids = list(range(num_distinct))
    if include_nulls:
        possible_session_ids.append(None)

    df = dispatch._make_df(
        {"session_id": [random.choice(possible_session_ids) for _ in range(num_rows)]},
        device="cpu" if cpu else None,
    )

    cat_features = ["session_id"] >> nvt.ops.Categorify(out_path=str(tmpdir))
    workflow = nvt.Workflow(cat_features)
    workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

    vals = df["session_id"].value_counts()
    vocab = dispatch._read_dispatch(cpu=cpu)(
        os.path.join(tmpdir, "categories", "unique.session_id.parquet")
    )

    if cpu:
        expected = dict(zip(vals.index, vals))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"], vocab["session_id_size"])
            if size
        }
    else:
        expected = dict(zip(vals.index.values_host, vals.values_host))
        computed = {
            session: size
            for session, size in zip(
                vocab["session_id"].values_host, vocab["session_id_size"].values_host
            )
            if size
        }
    first_key = list(computed.keys())[0]
    if pd.isna(first_key):
        computed.pop(first_key)
    assert computed == expected