Example #1
def test_nested_workflow_node():
    df = dispatch._make_df({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
               Rename(postfix="_country"))
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "******"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
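
The "combo" encoding exercised above is easier to see on concrete values. Below is a minimal plain-pandas sketch (not NVTabular itself) of what Categorify(encode_type="combo") conceptually does with this data: the [country + user] group becomes a single cross feature, while the country and user columns are also encoded on their own. The actual NVTabular codes differ (e.g. reserved null/out-of-vocabulary indices), but the equalities asserted in the test hold either way.

import pandas as pd

# Illustration only: mimic the combo categorify with plain pandas.
df = pd.DataFrame({
    "geo_country": ["US", "US", "CA", "CA"],
    "user": ["User_A", "User_A", "User_A", "User_B"],
})
# The cross feature gets one code per distinct (country, user) pair.
df["geo_country_user"] = (
    (df["geo_country"] + "_" + df["user"]).astype("category").cat.codes
)
# The single columns are encoded independently.
df["geo_country"] = df["geo_country"].astype("category").cat.codes
df["user"] = df["user"].astype("category").cat.codes
print(df)
# Rows 0 and 1 share a geo_country_user code (US / User_A), while row 2 differs
# from row 0 because the country changes even though the user is the same.
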
Example #2
def test_compute_schemas():
    root_schema = Schema(["a", "b", "c", "d", "e"])

    node1 = ["a", "b"] >> Rename(postfix="_renamed")
    node1.parents[0].compute_schemas(root_schema)
    node1.compute_schemas(root_schema)

    assert node1.input_columns.names == ["a", "b"]
    assert node1.output_columns.names == ["a_renamed", "b_renamed"]

    node2 = node1 + "c"
    node2.dependencies[0].compute_schemas(root_schema)
    node2.compute_schemas(root_schema)

    assert node2.input_columns.names == ["a_renamed", "b_renamed", "c"]
    assert node2.output_columns.names == ["a_renamed", "b_renamed", "c"]

    node3 = node2["a_renamed"]
    node3.compute_schemas(root_schema)

    assert node3.input_columns.names == ["a_renamed"]
    assert node3.output_columns.names == ["a_renamed"]
Example #3
def test_input_output_column_names():
    schema = Schema(["a", "b", "c", "d", "e"])

    input_node = ["a", "b", "c"] >> FillMissing()
    workflow = Workflow(input_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    chained_node = input_node >> Categorify()
    workflow = Workflow(chained_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    selection_node = input_node[["b", "c"]]
    workflow = Workflow(selection_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["b", "c"]
    assert workflow.output_node.output_columns.names == ["b", "c"]

    addition_node = input_node + ["d"]
    workflow = Workflow(addition_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c", "d"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    rename_node = input_node >> Rename(postfix="_renamed")
    workflow = Workflow(rename_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == [
        "a_renamed", "b_renamed", "c_renamed"
    ]

    dependency_node = input_node >> TargetEncoding("d")
    workflow = Workflow(dependency_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == [
        "TE_a_d", "TE_b_d", "TE_c_d"
    ]
Example #4
def test_nested_column_group(tmpdir):
    df = cudf.DataFrame(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )

    country = (
        ColumnGroup(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = [country + "user"] + country + "user" >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
Example #5
def make_feature_column_workflow(feature_columns,
                                 label_name,
                                 category_dir=None):
    """
    Maps a list of TensorFlow `feature_column`s to an NVTabular `Workflow` which
    imitates their preprocessing functionality. Returns both the finalized
    `Workflow` and a list of `feature_column`s that can be used to
    instantiate a `layers.ScalarDenseFeatures` layer mapping from `Workflow`
    outputs to dense network inputs. Useful for replacing feature-column
    online preprocessing with NVTabular's GPU-accelerated online
    preprocessing for faster training.

    Parameters
    ----------
    feature_columns: list(tf.feature_column)
        List of TensorFlow feature columns whose preprocessing should be
        emulated. Sequence columns are not supported.
    label_name: str
        Name of label column in dataset
    category_dir: str or None
        Directory in which to save categories from vocabulary list and
        vocabulary file columns. If left as None, a `/tmp/categories`
        directory will be created and used.

    Returns
    -------
    workflow: nvtabular.Workflow
        An NVTabular `Workflow` which performs the preprocessing steps
        defined in `feature_columns`
    new_feature_columns: list(feature_columns)
        List of TensorFlow feature columns that correspond to the output
        from `workflow`. Only contains numeric and identity categorical columns.
    """

    # TODO: should we support a dict input for feature columns
    # for multi-tower support?

    def _get_parents(column):
        """
        quick utility function for getting all the input tensors
        that will feed into a column
        """
        # column has no parents, so we've reached a terminal node
        if isinstance(column, str) or isinstance(column.parents[0], str):
            return [column]

        # else climb family tree
        parents = []
        for parent in column.parents:
            parents.extend(
                [i for i in _get_parents(parent) if i not in parents])
        return parents

    # could be more efficient with sets, but this is deterministic, which
    # might be helpful? Still not sure about this, so being safe
    base_columns = []
    for column in feature_columns:
        parents = _get_parents(column)
        base_columns.extend(
            [col for col in parents if col not in base_columns])

    cat_names, cont_names = [], []
    for column in base_columns:
        if isinstance(column, str):
            # cross column input
            # TODO: this means we only accept categorical inputs to
            # cross? How do we generalize this? Probably speaks to
            # the inefficiencies of feature columns as a schema
            # representation
            cat_names.append(column)
        elif isinstance(column, fc.CategoricalColumn):
            cat_names.append(column.key)
        else:
            cont_names.append(column.key)

    _CATEGORIFY_COLUMNS = (fc.VocabularyListCategoricalColumn,
                           fc.VocabularyFileCategoricalColumn)
    categorifies, hashes, crosses, buckets, replaced_buckets = {}, {}, {}, {}, {}

    numeric_columns = []
    new_feature_columns = []
    for column in feature_columns:
        # TODO: check for shared embedding or weighted embedding columns?
        # Do they just inherit from EmbeddingColumn?
        if not isinstance(column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            if isinstance(column, (fc.BucketizedColumn)):
                # bucketized column being fed directly to model means it's
                # implicitly wrapped into an indicator column
                cat_column = column
                embedding_dim = None
            else:
                # can this be anything else? I don't think so
                assert isinstance(column, fc.NumericColumn)

                # check to see if we've seen a bucketized column
                # that gets fed by this feature. If we have, note
                # that it shouldn't be replaced
                if column.key in replaced_buckets:
                    buckets[column.key] = replaced_buckets.pop(column.key)

                numeric_columns.append(column)
                continue
        else:
            cat_column = column.categorical_column

            # use this to keep track of what should be embedding
            # and what should be indicator, makes the bucketized
            # checking easier
            if isinstance(column, fc.EmbeddingColumn):
                embedding_dim = column.dimension
            else:
                embedding_dim = None

        if isinstance(cat_column, fc.BucketizedColumn):
            key = cat_column.source_column.key

            # check if the source numeric column is being fed
            # directly to the model. Keep track of both the
            # boundaries and embedding dim so that we can wrap
            # with either indicator or embedding later
            if key in [col.key for col in numeric_columns]:
                buckets[key] = (column.boundaries, embedding_dim)
            else:
                replaced_buckets[key] = (column.boundaries, embedding_dim)

            # put off dealing with these until the end so that
            # we know whether we need to replace numeric
            # columns or create a separate feature column
            # for them
            continue

        elif isinstance(cat_column, _CATEGORIFY_COLUMNS):
            if cat_column.num_oov_buckets > 1:
                warnings.warn(
                    "More than 1 oov bucket not supported for Categorify")

            if isinstance(cat_column, _CATEGORIFY_COLUMNS[1]):
                # TODO: how do we handle the case where it's too big to load?
                with open(cat_column.vocab_file, "r") as f:
                    vocab = f.read().split("\n")
            else:
                vocab = cat_column.vocabulary_list
            categorifies[cat_column.key] = list(vocab)
            key = cat_column.key

        elif isinstance(cat_column, fc.HashedCategoricalColumn):
            hashes[cat_column.key] = cat_column.hash_bucket_size
            key = cat_column.key

        elif isinstance(cat_column, fc.CrossedColumn):
            keys = []
            for key in cat_column.keys:
                if isinstance(key, fc.BucketizedColumn):
                    keys.append(key.source_column.key + "_bucketized")
                elif isinstance(key, str):
                    keys.append(key)
                else:
                    keys.append(key.key)
            crosses[tuple(keys)] = (cat_column.hash_bucket_size, embedding_dim)

            # put off making the new columns here too so that we
            # make sure we have the key right after we check
            # for buckets later
            continue

        elif isinstance(cat_column, fc.IdentityCategoricalColumn):
            new_feature_columns.append(column)
            continue

        else:
            raise ValueError("Unknown column {}".format(cat_column))

        new_feature_columns.append(
            _make_categorical_embedding(key, cat_column.num_buckets,
                                        embedding_dim))

    features = ColumnSelector(label_name)

    if len(buckets) > 0:
        new_buckets = {}
        for key, (boundaries, embedding_dim) in buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key + "_bucketized",
                                            len(boundaries) + 1,
                                            embedding_dim))
            new_buckets[key] = boundaries

        features_buckets = (new_buckets.keys() >> Bucketize(new_buckets) >>
                            Rename(postfix="_bucketized"))
        features += features_buckets

    if len(replaced_buckets) > 0:
        new_replaced_buckets = {}
        for key, (boundaries, embedding_dim) in replaced_buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key,
                                            len(boundaries) + 1,
                                            embedding_dim))
            new_replaced_buckets[key] = boundaries
        features_replaced_buckets = new_replaced_buckets.keys() >> Bucketize(
            new_replaced_buckets)
        features += features_replaced_buckets

    if len(categorifies) > 0:
        vocabs = {
            column: pd.Series(vocab)
            for column, vocab in categorifies.items()
        }
        features += ColumnSelector(list(
            categorifies.keys())) >> Categorify(vocabs=vocabs)

    if len(hashes) > 0:
        features += ColumnSelector(list(hashes.keys())) >> HashBucket(hashes)

    if len(crosses) > 0:
        # need to check if any bucketized columns are coming from
        # the bucketized version or the raw version
        new_crosses = {}
        for keys, (hash_bucket_size, embedding_dim) in crosses.items():
            # if we're bucketizing the input we have to do more work here -
            if any(key.endswith("_bucketized") for key in keys):
                cross_columns = []
                for key in keys:
                    if key.endswith("_bucketized"):
                        bucketized_cols = []
                        bucketized_cols.append(key)
                        key = key.replace("_bucketized", "")
                        if key in buckets:
                            # find if there are different columns
                            diff_col = list(
                                set(features_buckets.columns)
                                ^ set(bucketized_cols))
                            if diff_col:
                                features_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_buckets)
                        elif key in replaced_buckets:
                            diff_col = list(
                                set(features_replaced_buckets.columns)
                                ^ set(bucketized_cols))
                            if diff_col:
                                features_replaced_buckets.columns.remove(
                                    diff_col[0])
                            cross_columns.append(features_replaced_buckets)
                        else:
                            raise RuntimeError(f"Unknown bucket column {key}")
                    else:
                        cross_columns.append(nvt.WorkflowNode(key))

                features += sum(
                    cross_columns[1:],
                    cross_columns[0]) >> HashedCross(hash_bucket_size)

            else:
                new_crosses[tuple(keys)] = hash_bucket_size
            key = "_X_".join(keys)
            new_feature_columns.append(
                _make_categorical_embedding(key, hash_bucket_size,
                                            embedding_dim))

        if new_crosses:
            features += new_crosses.keys() >> HashedCross(new_crosses)

    if numeric_columns:
        features += [col.key for col in numeric_columns]

    workflow = nvt.Workflow(features)

    return workflow, numeric_columns + new_feature_columns
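
A minimal usage sketch for the function above, assuming hypothetical column names ("price", "color"), a hypothetical "train.parquet" file, and a "label" column; the exact import path of `ScalarDenseFeatures` depends on the NVTabular version, so it is only mentioned in a comment.

import tensorflow as tf
import nvtabular as nvt

# Hypothetical TensorFlow feature columns describing the raw data.
price = tf.feature_column.numeric_column("price")
color = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        "color", ["red", "green", "blue"]),
    dimension=8)

# Build a Workflow that reproduces the feature-column preprocessing,
# plus feature columns matching the Workflow outputs.
workflow, new_columns = make_feature_column_workflow(
    [price, color], label_name="label")

workflow.fit(nvt.Dataset("train.parquet"))  # hypothetical training data

# new_columns now contains only numeric and identity-categorical columns and
# can be passed to a layers.ScalarDenseFeatures(new_columns) layer as
# described in the docstring above.
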
Example #6
def process_NVT(args):

    if args.feature_cross_list:
        feature_pairs = [
            pair.split("_") for pair in args.feature_cross_list.split(",")
        ]
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0] + '_' + pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(
        args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(
        args.out_path, 'val/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp = [PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val]
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion
    for one_path in PREPROCESS_DIR_temp:
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.mkdir(one_path)

    ## Get Dask Client

    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster = None
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS
        cluster = LocalCUDACluster(protocol=args.protocol,
                                   CUDA_VISIBLE_DEVICES=args.devices,
                                   n_workers=len(args.devices.split(",")),
                                   enable_nvlink=True,
                                   device_memory_limit=int(
                                       device_size * args.device_limit_frac),
                                   dashboard_address=":" + args.dashboard_port)
    else:
        cluster = LocalCUDACluster(protocol=args.protocol,
                                   n_workers=len(args.devices.split(",")),
                                   CUDA_VISIBLE_DEVICES=args.devices,
                                   device_memory_limit=int(
                                       device_size * args.device_limit_frac),
                                   dashboard_address=":" + args.dashboard_port)

    # Create the distributed client
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    # Calculate the total processing time
    runtime = time.time()

    # The test dataset has no label feature
    if args.dataset_type == 'test':
        global LABEL_COLUMNS
        LABEL_COLUMNS = []

    ##-----------------------------------##
    # Convert the raw txt files to parquet with dask-cudf
    # (ddf = dask-cudf DataFrame)

    ## train/valid txt to parquet
    train_valid_paths = [(train_input, PREPROCESS_DIR_temp_train),
                         (val_input, PREPROCESS_DIR_temp_val)]

    for input, temp_output in train_valid_paths:

        ddf = dask_cudf.read_csv(input,
                                 sep='\t',
                                 names=LABEL_COLUMNS + CONTINUOUS_COLUMNS +
                                 CATEGORICAL_COLUMNS)

        ## Convert label col to FP32
        if args.parquet_format and args.dataset_type == 'train':
            ddf["label"] = ddf['label'].astype('float32')

        # Save it as parquet format for better memory usage
        ddf.to_parquet(temp_output, header=True)
        ##-----------------------------------##

    COLUMNS = LABEL_COLUMNS + CONTINUOUS_COLUMNS + CROSS_COLUMNS + CATEGORICAL_COLUMNS
    train_paths = glob.glob(
        os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
    valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

    categorify_op = Categorify(freq_threshold=args.freq_limit)
    cat_features = CATEGORICAL_COLUMNS >> categorify_op
    cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(
        min_value=0) >> Normalize()
    cross_cat_op = Categorify(freq_threshold=args.freq_limit)

    features = LABEL_COLUMNS

    if args.criteo_mode == 0:
        features += cont_features
        if args.feature_cross_list:
            feature_pairs = [
                pair.split("_") for pair in args.feature_cross_list.split(",")
            ]
            for pair in feature_pairs:
                col0 = pair[0]
                col1 = pair[1]
                features += col0 >> FeatureCross(col1) >> Rename(
                    postfix="_" + col1) >> cross_cat_op

    features += cat_features

    workflow = nvt.Workflow(features, client=client)

    logging.info("Preprocessing")

    output_format = 'hugectr'
    if args.parquet_format:
        output_format = 'parquet'

    # just for /samples/criteo model
    train_ds_iterator = nvt.Dataset(train_paths,
                                    engine='parquet',
                                    part_size=int(args.part_mem_frac *
                                                  device_size))
    valid_ds_iterator = nvt.Dataset(valid_paths,
                                    engine='parquet',
                                    part_size=int(args.part_mem_frac *
                                                  device_size))

    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt.io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt.io.Shuffle.PER_PARTITION

    logging.info('Train Datasets Preprocessing.....')

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    if not args.criteo_mode:
        for col in CONTINUOUS_COLUMNS:
            dict_dtypes[col] = np.float32
    for col in CROSS_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    conts = CONTINUOUS_COLUMNS if not args.criteo_mode else []

    workflow.fit(train_ds_iterator)

    if output_format == 'hugectr':
        workflow.transform(train_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=train_output,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)
    else:
        workflow.transform(train_ds_iterator).to_parquet(
            output_path=train_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)

    ## Getting slot sizes ##
    ##--------------------##
    embeddings_dict_cat = categorify_op.get_embedding_sizes(
        CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS
                  ] + [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS]

    print(embeddings)
    ##--------------------##

    logging.info('Valid Datasets Preprocessing.....')

    if output_format == 'hugectr':
        workflow.transform(valid_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=val_output,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)
    else:
        workflow.transform(valid_ds_iterator).to_parquet(
            output_path=val_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)

    embeddings_dict_cat = categorify_op.get_embedding_sizes(
        CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS
                  ] + [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS]

    print(embeddings)
    ##--------------------##

    ## Shutdown clusters
    client.close()
    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(
        f"partition size     | {'%.2f GB'%bytesto(int(args.part_mem_frac * device_size),'g')}"
    )
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")
Example #7
def create_workflow(data_bucket_folder, hash_spec, devices, local_directory,
                    dask):
    rmm.reinitialize(managed_memory=False)
    documents_categories_path = os.path.join(data_bucket_folder,
                                             "documents_categories.csv")
    documents_topics_path = os.path.join(data_bucket_folder,
                                         "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder,
                                           "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes)

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    geo_location = ColumnGroup(["geo_location"])
    country = (geo_location >>
               (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country"))
    state = (geo_location >>
             (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state"))
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
    ctr_cols = (
        stat_cols - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (
                col / gdf[col.name.replace("_clicked_sum", "_count")]
            ).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            dependency=stat_cols
            - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = (date_features + ctr_cols + stat_cols + cat_cols +
                ["clicked", "display_id"])
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices,
                           local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features,
                            client=client)

    return workflow
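
The CTR LambdaOp in create_workflow is the densest part of the graph. The following plain-pandas sketch (illustration only, with a single hypothetical ctr input "ad_id") shows the per-column computation it performs, assuming the "<col>_clicked_sum" / "<col>_count" naming produced by JoinGroupby.

import pandas as pd

# Plain-pandas illustration of the CTR computation in the LambdaOp above.
# JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"]) yields
# "ad_id_clicked_sum" and "ad_id_count" columns for the "ad_id" input.
gdf = pd.DataFrame({
    "ad_id_clicked_sum": [2, 0, 7],
    "ad_id_count": [10, 3, 50],
})
thresh = 5  # corresponds to ctr_thresh["ad_id"]
gdf["ad_id_ctr"] = (gdf["ad_id_clicked_sum"] / gdf["ad_id_count"]).where(
    gdf["ad_id_count"] >= thresh, 0)  # Rename maps "_clicked_sum" -> "_ctr"
print(gdf)
# Row 1 has only 3 impressions (< threshold 5), so its CTR is forced to 0.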