def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(sum(features, nvt.ColumnGroup(label_name)))

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)

def test_multifile_parquet(tmpdir, dataset, df, engine, num_io_threads, nfiles, shuffle):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]
    columns = cat_names + cont_names + label_names

    workflow = nvt.Workflow(nvt.ColumnGroup(columns))

    outdir = str(tmpdir.mkdir("out"))
    transformed = workflow.transform(nvt.Dataset(df))
    transformed.to_parquet(
        output_path=outdir, num_threads=num_io_threads, shuffle=shuffle, out_files_per_proc=nfiles
    )

    # Check that our output data is exactly the same
    out_paths = glob.glob(os.path.join(outdir, "*.parquet"))
    df_check = cudf.read_parquet(out_paths)
    assert_eq(
        df_check[columns].sort_values(["x", "y"]),
        df[columns].sort_values(["x", "y"]),
        check_index=False,
    )

def __init__(
    self,
    cont_cols=None,
    stats=("count",),
    tree_width=None,
    cat_cache="host",
    out_path=None,
    on_host=True,
    name_sep="_",
):
    super().__init__()
    self.storage_name = {}
    self.name_sep = name_sep
    self.cont_cols = (
        cont_cols if isinstance(cont_cols, nvt.ColumnGroup) else nvt.ColumnGroup(cont_cols)
    )
    self.cont_names = self.cont_cols.columns
    self.stats = stats
    self.tree_width = tree_width
    self.out_path = out_path or "./"
    self.on_host = on_host
    self.cat_cache = cat_cache
    self.categories = {}

    supported_ops = ["count", "sum", "mean", "std", "var", "min", "max"]
    for op in self.stats:
        if op not in supported_ops:
            raise ValueError(op + " operation is not supported.")

def test_numeric_dtypes(tmpdir):
    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype,
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = cudf.DataFrame(
        {dtype: np.array([limits.max, 0, limits.min], dtype=dtype) for dtype, limits in dtypes}
    )
    features = nvt.ColumnGroup(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "test_numeric_dtypes")

def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, drop_duplicates):
    # Define "external" table
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()
    if kind_ext == "pandas":
        df_ext = df_ext.to_pandas()
    elif kind_ext == "arrow":
        df_ext = df_ext.to_arrow()
    elif kind_ext == "parquet":
        path = tmpdir.join("external.parquet")
        df_ext.to_parquet(path)
        df_ext = path
    elif kind_ext == "csv":
        path = tmpdir.join("external.csv")
        df_ext.to_csv(path)
        df_ext = path

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns

def test_join_external_workflow(tmpdir, df, dataset, engine):
    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    # Define Workflow
    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    # Validate
    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns

def test_horovod_multigpu(tmpdir):
    json_sample = {
        "conts": {},
        "cats": {
            "genres": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "movieId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
            "userId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)

    # process them
    cat_features = nvt.ColumnGroup(["userId", "movieId", "genres"]) >> nvt.ops.Categorify()
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings
    proc = nvt.Workflow(output)

    train_iter = nvt.Dataset(df_files, part_size="10MB")
    proc.fit(train_iter)

    target_path_train = os.path.join(tmpdir, "train/")
    os.mkdir(target_path_train)
    proc.transform(train_iter).to_parquet(output_path=target_path_train, out_files_per_proc=5)

    # add new location
    target_path = os.path.join(tmpdir, "workflow/")
    os.mkdir(target_path)
    proc.save(target_path)

    curr_path = os.path.abspath(__file__)
    repo_root = os.path.relpath(os.path.normpath(os.path.join(curr_path, "../../..")))
    hvd_wrap_path = os.path.join(repo_root, "examples/multi-gpu-movielens/hvd_wrapper.sh")
    hvd_exam_path = os.path.join(repo_root, "examples/multi-gpu-movielens/tf_trainer.py")

    process = subprocess.Popen(
        [
            "horovodrun",
            "-np",
            "2",
            "-H",
            "localhost:2",
            "sh",
            hvd_wrap_path,
            "python",
            hvd_exam_path,
            "--dir_in",
            f"{tmpdir}",
            "--batch_size",
            "1024",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    process.wait()
    stdout, stderr = process.communicate()
    print(stdout, stderr)
    assert "Loss:" in str(stdout)

def make_feature_column_workflow(feature_columns, label_name, category_dir=None):
    """
    Maps a list of TensorFlow `feature_column`s to an NVTabular `Workflow` which
    imitates their preprocessing functionality. Returns both the finalized
    `Workflow` as well as a list of `feature_column`s that can be used to
    instantiate a `layers.ScalarDenseFeatures` layer to map from `Workflow`
    outputs to dense network inputs. Useful for replacing feature column online
    preprocessing with NVTabular GPU-accelerated online preprocessing for faster
    training.

    Parameters
    ----------
    feature_columns: list(tf.feature_column)
        List of TensorFlow feature columns to emulate preprocessing functions of.
        Doesn't support sequence columns.
    label_name: str
        Name of label column in dataset
    category_dir: str or None
        Directory in which to save categories from vocabulary list and vocabulary
        file columns. If left as None, will create directory `/tmp/categories`
        and save there

    Returns
    -------
    workflow: nvtabular.Workflow
        An NVTabular `Workflow` which performs the preprocessing steps defined in
        `feature_columns`
    new_feature_columns: list(feature_columns)
        List of TensorFlow feature columns that correspond to the output from
        `workflow`. Only contains numeric and identity categorical columns.
    """
    # TODO: should we support a dict input for feature columns
    # for multi-tower support?

    def _get_parents(column):
        """
        quick utility function for getting all the input tensors
        that will feed into a column
        """
        # column has no parents, so we've reached a terminal node
        if isinstance(column, str) or isinstance(column.parents[0], str):
            return [column]

        # else climb family tree
        parents = []
        for parent in column.parents:
            parents.extend([i for i in _get_parents(parent) if i not in parents])
        return parents

    # could be more efficient with sets but this is deterministic which
    # might be helpful? Still not sure about this so being safe
    base_columns = []
    for column in feature_columns:
        parents = _get_parents(column)
        base_columns.extend([col for col in parents if col not in base_columns])

    cat_names, cont_names = [], []
    for column in base_columns:
        if isinstance(column, str):
            # cross column input
            # TODO: this means we only accept categorical inputs to
            # cross? How do we generalize this? Probably speaks to
            # the inefficiencies of feature columns as a schema
            # representation
            cat_names.extend(column)
        elif isinstance(column, fc.CategoricalColumn):
            cat_names.extend(column.key)
        else:
            cont_names.extend(column.key)

    _CATEGORIFY_COLUMNS = (fc.VocabularyListCategoricalColumn, fc.VocabularyFileCategoricalColumn)
    categorifies, hashes, crosses, buckets, replaced_buckets = {}, {}, {}, {}, {}

    numeric_columns = []
    new_feature_columns = []
    for column in feature_columns:
        # TODO: check for shared embedding or weighted embedding columns?
        # Do they just inherit from EmbeddingColumn?
        if not isinstance(column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            if isinstance(column, (fc.BucketizedColumn)):
                # bucketized column being fed directly to model means it's
                # implicitly wrapped into an indicator column
                cat_column = column
                embedding_dim = None
            else:
                # can this be anything else? I don't think so
                assert isinstance(column, fc.NumericColumn)

                # check to see if we've seen a bucketized column
                # that gets fed by this feature. If we have, note
                # that it shouldn't be replaced
                if column.key in replaced_buckets:
                    buckets[column.key] = replaced_buckets.pop(column.key)
                numeric_columns.append(column)
                continue
        else:
            cat_column = column.categorical_column

            # use this to keep track of what should be embedding
            # and what should be indicator, makes the bucketized
            # checking easier
            if isinstance(column, fc.EmbeddingColumn):
                embedding_dim = column.dimension
            else:
                embedding_dim = None

        if isinstance(cat_column, fc.BucketizedColumn):
            key = cat_column.source_column.key

            # check if the source numeric column is being fed
            # directly to the model. Keep track of both the
            # boundaries and embedding dim so that we can wrap
            # with either indicator or embedding later
            if key in [col.key for col in numeric_columns]:
                buckets[key] = (column.boundaries, embedding_dim)
            else:
                replaced_buckets[key] = (column.boundaries, embedding_dim)

            # put off dealing with these until the end so that
            # we know whether we need to replace numeric
            # columns or create a separate feature column
            # for them
            continue

        elif isinstance(cat_column, _CATEGORIFY_COLUMNS):
            if cat_column.num_oov_buckets > 1:
                warnings.warn("More than 1 oov bucket not supported for Categorify")

            if isinstance(cat_column, _CATEGORIFY_COLUMNS[1]):
                # TODO: how do we handle the case where it's too big to load?
                with open(cat_column.vocab_file, "r") as f:
                    vocab = f.read().split("\n")
            else:
                vocab = cat_column.vocabulary_list
            categorifies[cat_column.key] = list(vocab)
            key = cat_column.key

        elif isinstance(cat_column, fc.HashedCategoricalColumn):
            hashes[cat_column.key] = cat_column.hash_bucket_size
            key = cat_column.key

        elif isinstance(cat_column, fc.CrossedColumn):
            keys = []
            for key in cat_column.keys:
                if isinstance(key, fc.BucketizedColumn):
                    keys.append(key.source_column.key + "_bucketized")
                elif isinstance(key, str):
                    keys.append(key)
                else:
                    keys.append(key.key)
            crosses[tuple(keys)] = (cat_column.hash_bucket_size, embedding_dim)

            # put off making the new columns here too so that we
            # make sure we have the key right after we check
            # for buckets later
            continue

        elif isinstance(cat_column, fc.IdentityCategoricalColumn):
            new_feature_columns.append(column)
            continue

        else:
            raise ValueError("Unknown column {}".format(cat_column))

        new_feature_columns.append(
            _make_categorical_embedding(key, cat_column.num_buckets, embedding_dim)
        )

    features = nvt.ColumnGroup(label_name)

    if len(buckets) > 0:
        new_buckets = {}
        for key, (boundaries, embedding_dim) in buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key + "_bucketized", len(boundaries) + 1, embedding_dim)
            )
            new_buckets[key] = boundaries

        features_buckets = (
            new_buckets.keys() >> Bucketize(new_buckets) >> Rename(postfix="_bucketized")
        )
        features += features_buckets

    if len(replaced_buckets) > 0:
        new_replaced_buckets = {}
        for key, (boundaries, embedding_dim) in replaced_buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key, len(boundaries) + 1, embedding_dim)
            )
            new_replaced_buckets[key] = boundaries
        features_replaced_buckets = new_replaced_buckets.keys() >> Bucketize(new_replaced_buckets)
        features += features_replaced_buckets

    if len(categorifies) > 0:
        features += categorifies.keys() >> Categorify()

    if len(hashes) > 0:
        features += hashes.keys() >> HashBucket(hashes)

    if len(crosses) > 0:
        # need to check if any bucketized columns are coming from
        # the bucketized version or the raw version
        new_crosses = {}
        for keys, (hash_bucket_size, embedding_dim) in crosses.items():
            # if we're bucketizing the input we have to do more work here -
            if any(key.endswith("_bucketized") for key in keys):
                cross_columns = []
                for key in keys:
                    if key.endswith("_bucketized"):
                        bucketized_cols = []
                        bucketized_cols.append(key)
                        key = key.replace("_bucketized", "")
                        if key in buckets:
                            # find if there are different columns
                            diff_col = list(set(features_buckets.columns) ^ set(bucketized_cols))
                            if diff_col:
                                features_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_buckets)
                        elif key in replaced_buckets:
                            diff_col = list(
                                set(features_replaced_buckets.columns) ^ set(bucketized_cols)
                            )
                            if diff_col:
                                features_replaced_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_replaced_buckets)
                        else:
                            raise RuntimeError("Unknown bucket column %s", key)
                    else:
                        cross_columns.append(nvt.ColumnGroup(key))

                features += sum(cross_columns[1:], cross_columns[0]) >> HashedCross(
                    hash_bucket_size
                )
            else:
                new_crosses[tuple(keys)] = hash_bucket_size
            key = "_X_".join(keys)
            new_feature_columns.append(
                _make_categorical_embedding(key, hash_bucket_size, embedding_dim)
            )

        if new_crosses:
            features += new_crosses.keys() >> HashedCross(new_crosses)

    if numeric_columns:
        features += [col.key for col in numeric_columns]

    workflow = nvt.Workflow(features)

    # create stats for Categorify op if we need it
    if len(categorifies) > 0:
        if category_dir is None:
            category_dir = "/tmp/categories"
        if not os.path.exists(category_dir):
            os.makedirs(category_dir)

        stats = {"categories": {}}
        for feature_name, categories in categorifies.items():
            categories.insert(0, None)
            df = cudf.DataFrame({feature_name: categories})

            save_path = os.path.join(category_dir, f"unique.{feature_name}.parquet")
            df.to_parquet(save_path)
            stats["categories"][feature_name] = save_path

        workflow.stats = stats

    return workflow, numeric_columns + new_feature_columns

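# A minimal usage sketch for make_feature_column_workflow (assumption: not part of the original
# source). The TensorFlow feature columns and column names below are illustrative placeholders;
# the helper itself is the function defined above.
import tensorflow as tf

age = tf.feature_column.numeric_column("age")
occupation = tf.feature_column.categorical_column_with_vocabulary_list(
    "occupation", ["student", "engineer", "teacher"]
)
feature_columns = [age, tf.feature_column.embedding_column(occupation, dimension=8)]

# Returns an NVTabular Workflow mirroring the feature columns' preprocessing, plus the
# numeric/identity-categorical columns that can feed a layers.ScalarDenseFeatures layer.
workflow, output_columns = make_feature_column_workflow(feature_columns, "label")
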
def __rrshift__(self, other) -> ColumnGroup:
    import nvtabular

    return nvtabular.ColumnGroup(other) >> self

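# A minimal usage sketch (not part of the original source): __rrshift__ is what lets a plain
# Python list of column names sit on the left of ">>". The list is wrapped in a ColumnGroup
# before the operator chain is applied, so the two expressions below build equivalent graphs.
# The column names "x" and "y" are placeholders for illustration only.
import nvtabular as nvt
from nvtabular import ops

via_list = ["x", "y"] >> ops.Normalize()  # list promoted to a ColumnGroup via __rrshift__
via_group = nvt.ColumnGroup(["x", "y"]) >> ops.Normalize()
assert via_list.columns == via_group.columns
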
def test_hugectr(
    tmpdir, client, df, dataset, output_format, engine, op_columns, num_io_threads, use_client
):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)
    outdir = str(outdir)

    conts = nvt.ColumnGroup(cont_names) >> ops.Normalize
    cats = nvt.ColumnGroup(cat_names) >> ops.Categorify

    workflow = nvt.Workflow(conts + cats + label_names)
    transformed = workflow.fit_transform(dataset)

    if output_format == "hugectr":
        transformed.to_hugectr(
            cats=cat_names,
            conts=cont_names,
            labels=label_names,
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )
    else:
        transformed.to_parquet(
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles if not client else nfiles * len(
        client.cluster.workers
    )
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"

    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir) if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, data_files[0]))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name

def test_nvt_hugectr_training():
    download_file(
        "http://files.grouplens.org/datasets/movielens/ml-25m.zip",
        os.path.join(DATA_DIR, "ml-25m.zip"),
    )

    ratings = cudf.read_csv(os.path.join(DATA_DIR, "ml-25m", "ratings.csv"))
    ratings["new_cat1"] = ratings["userId"] / ratings["movieId"]
    ratings["new_cat1"] = ratings["new_cat1"].astype("int64")
    ratings.head()

    ratings = ratings.drop("timestamp", axis=1)
    train, valid = train_test_split(ratings, test_size=0.2, random_state=42)

    train.to_parquet(DATA_DIR + "train.parquet")
    valid.to_parquet(DATA_DIR + "valid.parquet")

    del train
    del valid
    gc.collect()

    cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify(cat_cache="device")
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings

    workflow = nvt.Workflow(output)

    train_dataset = nvt.Dataset(DATA_DIR + "train.parquet", part_size="100MB")
    valid_dataset = nvt.Dataset(DATA_DIR + "valid.parquet", part_size="100MB")

    workflow.fit(train_dataset)

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    if path.exists(DATA_DIR + "train"):
        shutil.rmtree(os.path.join(DATA_DIR, "train"))
    if path.exists(DATA_DIR + "valid"):
        shutil.rmtree(os.path.join(DATA_DIR, "valid"))

    workflow.transform(train_dataset).to_parquet(
        output_path=DATA_DIR + "train/",
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=DATA_DIR + "valid/",
        shuffle=False,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )

    embeddings = get_embedding_sizes(workflow)
    total_cardinality = 0
    slot_sizes = []
    for column in CATEGORICAL_COLUMNS:
        slot_sizes.append(embeddings[column][0])
        total_cardinality += embeddings[column][0]

    test_data_path = DATA_DIR + "test/"
    if path.exists(test_data_path):
        shutil.rmtree(test_data_path)
    os.mkdir(test_data_path)

    sample_data = cudf.read_parquet(DATA_DIR + "valid.parquet", num_rows=TEST_N_ROWS)
    sample_data.to_csv(test_data_path + "data.csv")
    sample_data_trans = nvt.workflow._transform_partition(sample_data, [workflow.column_group])
    dense_features, embedding_columns, row_ptrs = _convert(sample_data_trans, slot_sizes)

    _run_model(slot_sizes, total_cardinality)

    if path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.mkdir(TEMP_DIR)
    file_names = glob.iglob(os.path.join(os.getcwd(), "*.model"))
    for files in file_names:
        shutil.move(files, TEMP_DIR)
    _write_model_json(slot_sizes, total_cardinality)

    if path.exists(MODEL_DIR):
        shutil.rmtree(MODEL_DIR)
    os.mkdir(MODEL_DIR)
    model_name = "test_model"
    hugectr_params = dict()
    hugectr_params["config"] = MODEL_DIR + "test_model/1/model.json"
    hugectr_params["slots"] = len(slot_sizes)
    hugectr_params["max_nnz"] = len(slot_sizes)
    hugectr_params["embedding_vector_size"] = 16
    hugectr_params["n_outputs"] = 1

    export_hugectr_ensemble(
        workflow=workflow,
        hugectr_model_path=TEMP_DIR,
        hugectr_params=hugectr_params,
        name=model_name,
        output_path=MODEL_DIR,
        label_columns=["rating"],
        cats=CATEGORICAL_COLUMNS,
        max_batch_size=64,
    )
    shutil.rmtree(TEMP_DIR)

    _predict(dense_features, embedding_columns, row_ptrs, hugectr_params["config"], model_name)