def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(sum(features, nvt.ColumnGroup(label_name)))

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)

def test_multifile_parquet(tmpdir, dataset, df, engine, num_io_threads, nfiles, shuffle):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]
    columns = cat_names + cont_names + label_names

    workflow = nvt.Workflow(nvt.ColumnGroup(columns))

    outdir = str(tmpdir.mkdir("out"))
    transformed = workflow.transform(nvt.Dataset(df))
    transformed.to_parquet(
        output_path=outdir, num_threads=num_io_threads, shuffle=shuffle, out_files_per_proc=nfiles
    )

    # Check that our output data is exactly the same
    out_paths = glob.glob(os.path.join(outdir, "*.parquet"))
    df_check = cudf.read_parquet(out_paths)
    assert_eq(
        df_check[columns].sort_values(["x", "y"]),
        df[columns].sort_values(["x", "y"]),
        check_index=False,
    )

def __init__(
    self,
    cont_cols=None,
    stats=("count",),
    tree_width=None,
    cat_cache="host",
    out_path=None,
    on_host=True,
    name_sep="_",
):
    super().__init__()
    self.storage_name = {}
    self.name_sep = name_sep
    self.cont_cols = (
        cont_cols if isinstance(cont_cols, nvt.ColumnGroup) else nvt.ColumnGroup(cont_cols)
    )
    self.cont_names = self.cont_cols.columns
    self.stats = stats
    self.tree_width = tree_width
    self.out_path = out_path or "./"
    self.on_host = on_host
    self.cat_cache = cat_cache
    self.categories = {}

    supported_ops = ["count", "sum", "mean", "std", "var", "min", "max"]
    for op in self.stats:
        if op not in supported_ops:
            raise ValueError(op + " operation is not supported.")

def test_numeric_dtypes(tmpdir):
    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype,
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = cudf.DataFrame(
        {dtype: np.array([limits.max, 0, limits.min], dtype=dtype) for dtype, limits in dtypes}
    )
    features = nvt.ColumnGroup(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "test_numeric_dtypes")

def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, drop_duplicates):
    # Define "external" table
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()
    if kind_ext == "pandas":
        df_ext = df_ext.to_pandas()
    elif kind_ext == "arrow":
        df_ext = df_ext.to_arrow()
    elif kind_ext == "parquet":
        path = tmpdir.join("external.parquet")
        df_ext.to_parquet(path)
        df_ext = path
    elif kind_ext == "csv":
        path = tmpdir.join("external.csv")
        df_ext.to_csv(path)
        df_ext = path

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns

def test_join_external_workflow(tmpdir, df, dataset, engine):
    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    # Define Workflow
    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    # Validate
    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns

def test_horovod_multigpu(tmpdir):
    json_sample = {
        "conts": {},
        "cats": {
            "genres": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "movieId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
            "userId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)

    # process them
    cat_features = nvt.ColumnGroup(["userId", "movieId", "genres"]) >> nvt.ops.Categorify()
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings
    proc = nvt.Workflow(output)

    train_iter = nvt.Dataset(df_files, part_size="10MB")
    proc.fit(train_iter)

    target_path_train = os.path.join(tmpdir, "train/")
    os.mkdir(target_path_train)
    proc.transform(train_iter).to_parquet(output_path=target_path_train, out_files_per_proc=5)

    # add new location
    target_path = os.path.join(tmpdir, "workflow/")
    os.mkdir(target_path)
    proc.save(target_path)

    curr_path = os.path.abspath(__file__)
    repo_root = os.path.relpath(os.path.normpath(os.path.join(curr_path, "../../..")))
    hvd_wrap_path = os.path.join(repo_root, "examples/multi-gpu-movielens/hvd_wrapper.sh")
    hvd_exam_path = os.path.join(repo_root, "examples/multi-gpu-movielens/tf_trainer.py")

    process = subprocess.Popen(
        [
            "horovodrun",
            "-np",
            "2",
            "-H",
            "localhost:2",
            "sh",
            hvd_wrap_path,
            "python",
            hvd_exam_path,
            "--dir_in",
            f"{tmpdir}",
            "--batch_size",
            "1024",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    process.wait()
    stdout, stderr = process.communicate()
    print(stdout, stderr)
    assert "Loss:" in str(stdout)

def make_feature_column_workflow(feature_columns, label_name, category_dir=None):
    """
    Maps a list of TensorFlow `feature_column`s to an NVTabular `Workflow` which
    imitates their preprocessing functionality. Returns both the finalized
    `Workflow` as well as a list of `feature_column`s that can be used to
    instantiate a `layers.ScalarDenseFeatures` layer to map from `Workflow`
    outputs to dense network inputs. Useful for replacing feature column online
    preprocessing with NVTabular GPU-accelerated online preprocessing for faster
    training.

    Parameters
    ----------
    feature_columns: list(tf.feature_column)
        List of TensorFlow feature columns to emulate preprocessing functions of.
        Doesn't support sequence columns.
    label_name: str
        Name of label column in dataset
    category_dir: str or None
        Directory in which to save categories from vocabulary list and vocabulary
        file columns. If left as None, will create directory `/tmp/categories`
        and save there

    Returns
    -------
    workflow: nvtabular.Workflow
        An NVTabular `Workflow` which performs the preprocessing steps defined in
        `feature_columns`
    new_feature_columns: list(feature_columns)
        List of TensorFlow feature columns that correspond to the output from
        `workflow`. Only contains numeric and identity categorical columns.
    """
    # TODO: should we support a dict input for feature columns
    # for multi-tower support?

    def _get_parents(column):
        """
        quick utility function for getting all the input tensors
        that will feed into a column
        """
        # column has no parents, so we've reached a terminal node
        if isinstance(column, str) or isinstance(column.parents[0], str):
            return [column]

        # else climb family tree
        parents = []
        for parent in column.parents:
            parents.extend([i for i in _get_parents(parent) if i not in parents])
        return parents

    # could be more efficient with sets but this is deterministic which
    # might be helpful? Still not sure about this so being safe
    base_columns = []
    for column in feature_columns:
        parents = _get_parents(column)
        base_columns.extend([col for col in parents if col not in base_columns])

    cat_names, cont_names = [], []
    for column in base_columns:
        if isinstance(column, str):
            # cross column input
            # TODO: this means we only accept categorical inputs to
            # cross? How do we generalize this? Probably speaks to
            # the inefficiencies of feature columns as a schema
            # representation
            cat_names.extend(column)
        elif isinstance(column, fc.CategoricalColumn):
            cat_names.extend(column.key)
        else:
            cont_names.extend(column.key)

    _CATEGORIFY_COLUMNS = (fc.VocabularyListCategoricalColumn, fc.VocabularyFileCategoricalColumn)
    categorifies, hashes, crosses, buckets, replaced_buckets = {}, {}, {}, {}, {}

    numeric_columns = []
    new_feature_columns = []
    for column in feature_columns:
        # TODO: check for shared embedding or weighted embedding columns?
        # Do they just inherit from EmbeddingColumn?
        if not isinstance(column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            if isinstance(column, (fc.BucketizedColumn)):
                # bucketized column being fed directly to model means it's
                # implicitly wrapped into an indicator column
                cat_column = column
                embedding_dim = None
            else:
                # can this be anything else? I don't think so
                assert isinstance(column, fc.NumericColumn)

                # check to see if we've seen a bucketized column
                # that gets fed by this feature. If we have, note
                # that it shouldn't be replaced
                if column.key in replaced_buckets:
                    buckets[column.key] = replaced_buckets.pop(column.key)
                numeric_columns.append(column)
                continue
        else:
            cat_column = column.categorical_column

            # use this to keep track of what should be embedding
            # and what should be indicator, makes the bucketized
            # checking easier
            if isinstance(column, fc.EmbeddingColumn):
                embedding_dim = column.dimension
            else:
                embedding_dim = None

        if isinstance(cat_column, fc.BucketizedColumn):
            key = cat_column.source_column.key

            # check if the source numeric column is being fed
            # directly to the model. Keep track of both the
            # boundaries and embedding dim so that we can wrap
            # with either indicator or embedding later
            if key in [col.key for col in numeric_columns]:
                buckets[key] = (column.boundaries, embedding_dim)
            else:
                replaced_buckets[key] = (column.boundaries, embedding_dim)

            # put off dealing with these until the end so that
            # we know whether we need to replace numeric
            # columns or create a separate feature column
            # for them
            continue

        elif isinstance(cat_column, _CATEGORIFY_COLUMNS):
            if cat_column.num_oov_buckets > 1:
                warnings.warn("More than 1 oov bucket not supported for Categorify")

            if isinstance(cat_column, _CATEGORIFY_COLUMNS[1]):
                # TODO: how do we handle the case where it's too big to load?
                with open(cat_column.vocab_file, "r") as f:
                    vocab = f.read().split("\n")
            else:
                vocab = cat_column.vocabulary_list
            categorifies[cat_column.key] = list(vocab)
            key = cat_column.key

        elif isinstance(cat_column, fc.HashedCategoricalColumn):
            hashes[cat_column.key] = cat_column.hash_bucket_size
            key = cat_column.key

        elif isinstance(cat_column, fc.CrossedColumn):
            keys = []
            for key in cat_column.keys:
                if isinstance(key, fc.BucketizedColumn):
                    keys.append(key.source_column.key + "_bucketized")
                elif isinstance(key, str):
                    keys.append(key)
                else:
                    keys.append(key.key)
            crosses[tuple(keys)] = (cat_column.hash_bucket_size, embedding_dim)

            # put off making the new columns here too so that we
            # make sure we have the key right after we check
            # for buckets later
            continue

        elif isinstance(cat_column, fc.IdentityCategoricalColumn):
            new_feature_columns.append(column)
            continue

        else:
            raise ValueError("Unknown column {}".format(cat_column))

        new_feature_columns.append(
            _make_categorical_embedding(key, cat_column.num_buckets, embedding_dim)
        )

    features = nvt.ColumnGroup(label_name)

    if len(buckets) > 0:
        new_buckets = {}
        for key, (boundaries, embedding_dim) in buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key + "_bucketized", len(boundaries) + 1, embedding_dim)
            )
            new_buckets[key] = boundaries

        features_buckets = (
            new_buckets.keys() >> Bucketize(new_buckets) >> Rename(postfix="_bucketized")
        )
        features += features_buckets

    if len(replaced_buckets) > 0:
        new_replaced_buckets = {}
        for key, (boundaries, embedding_dim) in replaced_buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key, len(boundaries) + 1, embedding_dim)
            )
            new_replaced_buckets[key] = boundaries
        features_replaced_buckets = new_replaced_buckets.keys() >> Bucketize(new_replaced_buckets)
        features += features_replaced_buckets

    if len(categorifies) > 0:
        features += categorifies.keys() >> Categorify()

    if len(hashes) > 0:
        features += hashes.keys() >> HashBucket(hashes)

    if len(crosses) > 0:
        # need to check if any bucketized columns are coming from
        # the bucketized version or the raw version
        new_crosses = {}
        for keys, (hash_bucket_size, embedding_dim) in crosses.items():
            # if we're bucketizing the input we have to do more work here -
            if any(key.endswith("_bucketized") for key in keys):
                cross_columns = []
                for key in keys:
                    if key.endswith("_bucketized"):
                        bucketized_cols = []
                        bucketized_cols.append(key)
                        key = key.replace("_bucketized", "")
                        if key in buckets:
                            # find if there are different columns
                            diff_col = list(set(features_buckets.columns) ^ set(bucketized_cols))
                            if diff_col:
                                features_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_buckets)
                        elif key in replaced_buckets:
                            diff_col = list(
                                set(features_replaced_buckets.columns) ^ set(bucketized_cols)
                            )
                            if diff_col:
                                features_replaced_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_replaced_buckets)
                        else:
                            raise RuntimeError("Unknown bucket column %s", key)
                    else:
                        cross_columns.append(nvt.ColumnGroup(key))

                features += sum(cross_columns[1:], cross_columns[0]) >> HashedCross(
                    hash_bucket_size
                )
            else:
                new_crosses[tuple(keys)] = hash_bucket_size
            key = "_X_".join(keys)
            new_feature_columns.append(
                _make_categorical_embedding(key, hash_bucket_size, embedding_dim)
            )

        if new_crosses:
            features += new_crosses.keys() >> HashedCross(new_crosses)

    if numeric_columns:
        features += [col.key for col in numeric_columns]

    workflow = nvt.Workflow(features)

    # create stats for Categorify op if we need it
    if len(categorifies) > 0:
        if category_dir is None:
            category_dir = "/tmp/categories"
        if not os.path.exists(category_dir):
            os.makedirs(category_dir)

        stats = {"categories": {}}
        for feature_name, categories in categorifies.items():
            categories.insert(0, None)
            df = cudf.DataFrame({feature_name: categories})

            save_path = os.path.join(category_dir, f"unique.{feature_name}.parquet")
            df.to_parquet(save_path)
            stats["categories"][feature_name] = save_path

        workflow.stats = stats

    return workflow, numeric_columns + new_feature_columns

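# A minimal usage sketch for make_feature_column_workflow (assumption: not part of the original
# source). The TensorFlow feature columns and column names below are illustrative placeholders;
# the helper itself is the function defined above.
import tensorflow as tf

age = tf.feature_column.numeric_column("age")
occupation = tf.feature_column.categorical_column_with_vocabulary_list(
    "occupation", ["student", "engineer", "teacher"]
)
feature_columns = [age, tf.feature_column.embedding_column(occupation, dimension=8)]

# Returns an NVTabular Workflow mirroring the feature columns' preprocessing, plus the
# numeric/identity-categorical columns that can feed a layers.ScalarDenseFeatures layer.
workflow, output_columns = make_feature_column_workflow(feature_columns, "label")
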
def __rrshift__(self, other) -> ColumnGroup:
    import nvtabular

    return nvtabular.ColumnGroup(other) >> self

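# A minimal usage sketch (not part of the original source): __rrshift__ is what lets a plain
# Python list of column names sit on the left of ">>". The list is wrapped in a ColumnGroup
# before the operator chain is applied, so the two expressions below build equivalent graphs.
# The column names "x" and "y" are placeholders for illustration only.
import nvtabular as nvt
from nvtabular import ops

via_list = ["x", "y"] >> ops.Normalize()  # list promoted to a ColumnGroup via __rrshift__
via_group = nvt.ColumnGroup(["x", "y"]) >> ops.Normalize()
assert via_list.columns == via_group.columns
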
def test_hugectr(
    tmpdir, client, df, dataset, output_format, engine, op_columns, num_io_threads, use_client
):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)
    outdir = str(outdir)

    conts = nvt.ColumnGroup(cont_names) >> ops.Normalize
    cats = nvt.ColumnGroup(cat_names) >> ops.Categorify

    workflow = nvt.Workflow(conts + cats + label_names)
    transformed = workflow.fit_transform(dataset)

    if output_format == "hugectr":
        transformed.to_hugectr(
            cats=cat_names,
            conts=cont_names,
            labels=label_names,
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )
    else:
        transformed.to_parquet(
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles if not client else nfiles * len(
        client.cluster.workers
    )
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"

    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir) if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, data_files[0]))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name

def test_nvt_hugectr_training():
    download_file(
        "http://files.grouplens.org/datasets/movielens/ml-25m.zip",
        os.path.join(DATA_DIR, "ml-25m.zip"),
    )

    ratings = cudf.read_csv(os.path.join(DATA_DIR, "ml-25m", "ratings.csv"))
    ratings["new_cat1"] = ratings["userId"] / ratings["movieId"]
    ratings["new_cat1"] = ratings["new_cat1"].astype("int64")
    ratings.head()

    ratings = ratings.drop("timestamp", axis=1)
    train, valid = train_test_split(ratings, test_size=0.2, random_state=42)

    train.to_parquet(DATA_DIR + "train.parquet")
    valid.to_parquet(DATA_DIR + "valid.parquet")

    del train
    del valid
    gc.collect()

    cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify(cat_cache="device")
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings

    workflow = nvt.Workflow(output)

    train_dataset = nvt.Dataset(DATA_DIR + "train.parquet", part_size="100MB")
    valid_dataset = nvt.Dataset(DATA_DIR + "valid.parquet", part_size="100MB")

    workflow.fit(train_dataset)

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    if path.exists(DATA_DIR + "train"):
        shutil.rmtree(os.path.join(DATA_DIR, "train"))
    if path.exists(DATA_DIR + "valid"):
        shutil.rmtree(os.path.join(DATA_DIR, "valid"))

    workflow.transform(train_dataset).to_parquet(
        output_path=DATA_DIR + "train/",
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=DATA_DIR + "valid/",
        shuffle=False,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )

    embeddings = get_embedding_sizes(workflow)
    total_cardinality = 0
    slot_sizes = []
    for column in CATEGORICAL_COLUMNS:
        slot_sizes.append(embeddings[column][0])
        total_cardinality += embeddings[column][0]

    test_data_path = DATA_DIR + "test/"
    if path.exists(test_data_path):
        shutil.rmtree(test_data_path)
    os.mkdir(test_data_path)

    sample_data = cudf.read_parquet(DATA_DIR + "valid.parquet", num_rows=TEST_N_ROWS)
    sample_data.to_csv(test_data_path + "data.csv")
    sample_data_trans = nvt.workflow._transform_partition(sample_data, [workflow.column_group])
    dense_features, embedding_columns, row_ptrs = _convert(sample_data_trans, slot_sizes)

    _run_model(slot_sizes, total_cardinality)

    if path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.mkdir(TEMP_DIR)
    file_names = glob.iglob(os.path.join(os.getcwd(), "*.model"))
    for files in file_names:
        shutil.move(files, TEMP_DIR)
    _write_model_json(slot_sizes, total_cardinality)

    if path.exists(MODEL_DIR):
        shutil.rmtree(MODEL_DIR)
    os.mkdir(MODEL_DIR)
    model_name = "test_model"
    hugectr_params = dict()
    hugectr_params["config"] = MODEL_DIR + "test_model/1/model.json"
    hugectr_params["slots"] = len(slot_sizes)
    hugectr_params["max_nnz"] = len(slot_sizes)
    hugectr_params["embedding_vector_size"] = 16
    hugectr_params["n_outputs"] = 1

    export_hugectr_ensemble(
        workflow=workflow,
        hugectr_model_path=TEMP_DIR,
        hugectr_params=hugectr_params,
        name=model_name,
        output_path=MODEL_DIR,
        label_columns=["rating"],
        cats=CATEGORICAL_COLUMNS,
        max_batch_size=64,
    )
    shutil.rmtree(TEMP_DIR)

    _predict(dense_features, embedding_columns, row_ptrs, hugectr_params["config"], model_name)