def test_nested_workflow_node():
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "user"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
def test_compute_schemas():
    root_schema = Schema(["a", "b", "c", "d", "e"])

    node1 = ["a", "b"] >> Rename(postfix="_renamed")
    node1.parents[0].compute_schemas(root_schema)
    node1.compute_schemas(root_schema)

    assert node1.input_columns.names == ["a", "b"]
    assert node1.output_columns.names == ["a_renamed", "b_renamed"]

    node2 = node1 + "c"
    node2.dependencies[0].compute_schemas(root_schema)
    node2.compute_schemas(root_schema)

    assert node2.input_columns.names == ["a_renamed", "b_renamed", "c"]
    assert node2.output_columns.names == ["a_renamed", "b_renamed", "c"]

    node3 = node2["a_renamed"]
    node3.compute_schemas(root_schema)

    assert node3.input_columns.names == ["a_renamed"]
    assert node3.output_columns.names == ["a_renamed"]
def test_input_output_column_names():
    schema = Schema(["a", "b", "c", "d", "e"])

    input_node = ["a", "b", "c"] >> FillMissing()
    workflow = Workflow(input_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    chained_node = input_node >> Categorify()
    workflow = Workflow(chained_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c"]

    selection_node = input_node[["b", "c"]]
    workflow = Workflow(selection_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["b", "c"]
    assert workflow.output_node.output_columns.names == ["b", "c"]

    addition_node = input_node + ["d"]
    workflow = Workflow(addition_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c", "d"]
    assert workflow.output_node.output_columns.names == ["a", "b", "c", "d"]

    rename_node = input_node >> Rename(postfix="_renamed")
    workflow = Workflow(rename_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["a_renamed", "b_renamed", "c_renamed"]

    dependency_node = input_node >> TargetEncoding("d")
    workflow = Workflow(dependency_node).fit_schema(schema)
    assert workflow.output_node.input_columns.names == ["a", "b", "c"]
    assert workflow.output_node.output_columns.names == ["TE_a_d", "TE_b_d", "TE_c_d"]
def test_nested_column_group(tmpdir):
    df = cudf.DataFrame(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )

    country = (
        ColumnGroup(["geo"]) >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = [country + "user"] + country + "user" >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    df_out = workflow.fit_transform(Dataset(df)).to_ddf().compute(scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(encode_type="combo")
def make_feature_column_workflow(feature_columns, label_name, category_dir=None):
    """
    Maps a list of TensorFlow `feature_column`s to an NVTabular `Workflow` which
    imitates their preprocessing functionality. Returns both the finalized
    `Workflow` as well as a list of `feature_column`s that can be used to
    instantiate a `layers.ScalarDenseFeatures` layer to map from `Workflow` outputs
    to dense network inputs. Useful for replacing feature column online
    preprocessing with NVTabular GPU-accelerated online preprocessing for faster
    training.

    Parameters
    ----------
    feature_columns: list(tf.feature_column)
        List of TensorFlow feature columns to emulate preprocessing functions of.
        Doesn't support sequence columns.
    label_name: str
        Name of label column in dataset
    category_dir: str or None
        Directory in which to save categories from vocabulary list and vocabulary
        file columns. If left as None, will create directory `/tmp/categories`
        and save there

    Returns
    -------
    workflow: nvtabular.Workflow
        An NVTabular `Workflow` which performs the preprocessing steps defined in
        `feature_columns`
    new_feature_columns: list(feature_columns)
        List of TensorFlow feature columns that correspond to the output from
        `workflow`. Only contains numeric and identity categorical columns.
    """
    # TODO: should we support a dict input for feature columns
    # for multi-tower support?

    def _get_parents(column):
        """
        quick utility function for getting all the input tensors
        that will feed into a column
        """
        # column has no parents, so we've reached a terminal node
        if isinstance(column, str) or isinstance(column.parents[0], str):
            return [column]

        # else climb family tree
        parents = []
        for parent in column.parents:
            parents.extend([i for i in _get_parents(parent) if i not in parents])
        return parents

    # could be more efficient with sets but this is deterministic which
    # might be helpful? Still not sure about this so being safe
    base_columns = []
    for column in feature_columns:
        parents = _get_parents(column)
        base_columns.extend([col for col in parents if col not in base_columns])

    cat_names, cont_names = [], []
    for column in base_columns:
        if isinstance(column, str):
            # cross column input
            # TODO: this means we only accept categorical inputs to
            # cross? How do we generalize this? Probably speaks to
            # the inefficiencies of feature columns as a schema
            # representation
            cat_names.append(column)
        elif isinstance(column, fc.CategoricalColumn):
            cat_names.append(column.key)
        else:
            cont_names.append(column.key)

    _CATEGORIFY_COLUMNS = (fc.VocabularyListCategoricalColumn, fc.VocabularyFileCategoricalColumn)
    categorifies, hashes, crosses, buckets, replaced_buckets = {}, {}, {}, {}, {}

    numeric_columns = []
    new_feature_columns = []
    for column in feature_columns:
        # TODO: check for shared embedding or weighted embedding columns?
        # Do they just inherit from EmbeddingColumn?
        if not isinstance(column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            if isinstance(column, (fc.BucketizedColumn)):
                # bucketized column being fed directly to model means it's
                # implicitly wrapped into an indicator column
                cat_column = column
                embedding_dim = None
            else:
                # can this be anything else? I don't think so
                assert isinstance(column, fc.NumericColumn)

                # check to see if we've seen a bucketized column
                # that gets fed by this feature. If we have, note
                # that it shouldn't be replaced
                if column.key in replaced_buckets:
                    buckets[column.key] = replaced_buckets.pop(column.key)

                numeric_columns.append(column)
                continue
        else:
            cat_column = column.categorical_column

            # use this to keep track of what should be embedding
            # and what should be indicator, makes the bucketized
            # checking easier
            if isinstance(column, fc.EmbeddingColumn):
                embedding_dim = column.dimension
            else:
                embedding_dim = None

        if isinstance(cat_column, fc.BucketizedColumn):
            key = cat_column.source_column.key

            # check if the source numeric column is being fed
            # directly to the model. Keep track of both the
            # boundaries and embedding dim so that we can wrap
            # with either indicator or embedding later
            if key in [col.key for col in numeric_columns]:
                buckets[key] = (column.boundaries, embedding_dim)
            else:
                replaced_buckets[key] = (column.boundaries, embedding_dim)

            # put off dealing with these until the end so that
            # we know whether we need to replace numeric
            # columns or create a separate feature column
            # for them
            continue

        elif isinstance(cat_column, _CATEGORIFY_COLUMNS):
            if cat_column.num_oov_buckets > 1:
                warnings.warn("More than 1 oov bucket not supported for Categorify")

            if isinstance(cat_column, _CATEGORIFY_COLUMNS[1]):
                # TODO: how do we handle the case where it's too big to load?
                with open(cat_column.vocab_file, "r") as f:
                    vocab = f.read().split("\n")
            else:
                vocab = cat_column.vocabulary_list
            categorifies[cat_column.key] = list(vocab)
            key = cat_column.key

        elif isinstance(cat_column, fc.HashedCategoricalColumn):
            hashes[cat_column.key] = cat_column.hash_bucket_size
            key = cat_column.key

        elif isinstance(cat_column, fc.CrossedColumn):
            keys = []
            for key in cat_column.keys:
                if isinstance(key, fc.BucketizedColumn):
                    keys.append(key.source_column.key + "_bucketized")
                elif isinstance(key, str):
                    keys.append(key)
                else:
                    keys.append(key.key)
            crosses[tuple(keys)] = (cat_column.hash_bucket_size, embedding_dim)

            # put off making the new columns here too so that we
            # make sure we have the key right after we check
            # for buckets later
            continue

        elif isinstance(cat_column, fc.IdentityCategoricalColumn):
            new_feature_columns.append(column)
            continue

        else:
            raise ValueError("Unknown column {}".format(cat_column))

        new_feature_columns.append(
            _make_categorical_embedding(key, cat_column.num_buckets, embedding_dim)
        )

    features = ColumnSelector(label_name)

    if len(buckets) > 0:
        new_buckets = {}
        for key, (boundaries, embedding_dim) in buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key + "_bucketized", len(boundaries) + 1, embedding_dim)
            )
            new_buckets[key] = boundaries

        features_buckets = (
            new_buckets.keys() >> Bucketize(new_buckets) >> Rename(postfix="_bucketized")
        )
        features += features_buckets

    if len(replaced_buckets) > 0:
        new_replaced_buckets = {}
        for key, (boundaries, embedding_dim) in replaced_buckets.items():
            new_feature_columns.append(
                _make_categorical_embedding(key, len(boundaries) + 1, embedding_dim)
            )
            new_replaced_buckets[key] = boundaries
        features_replaced_buckets = new_replaced_buckets.keys() >> Bucketize(new_replaced_buckets)
        features += features_replaced_buckets

    if len(categorifies) > 0:
        vocabs = {column: pd.Series(vocab) for column, vocab in categorifies.items()}
        features += ColumnSelector(list(categorifies.keys())) >> Categorify(vocabs=vocabs)

    if len(hashes) > 0:
        features += ColumnSelector(list(hashes.keys())) >> HashBucket(hashes)

    if len(crosses) > 0:
        # need to check if any bucketized columns are coming from
        # the bucketized version or the raw version
        new_crosses = {}
        for keys, (hash_bucket_size, embedding_dim) in crosses.items():
            # if we're bucketizing the input we have to do more work here -
            if any(key.endswith("_bucketized") for key in keys):
                cross_columns = []
                for key in keys:
                    if key.endswith("_bucketized"):
                        bucketized_cols = []
                        bucketized_cols.append(key)
                        key = key.replace("_bucketized", "")
                        if key in buckets:
                            # find if there are different columns
                            diff_col = list(set(features_buckets.columns) ^ set(bucketized_cols))
                            if diff_col:
                                features_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_buckets)
                        elif key in replaced_buckets:
                            diff_col = list(
                                set(features_replaced_buckets.columns) ^ set(bucketized_cols)
                            )
                            if diff_col:
                                features_replaced_buckets.columns.remove(diff_col[0])
                            cross_columns.append(features_replaced_buckets)
                        else:
                            raise RuntimeError(f"Unknown bucket column {key}")
                    else:
                        cross_columns.append(nvt.WorkflowNode(key))

                features += sum(cross_columns[1:], cross_columns[0]) >> HashedCross(hash_bucket_size)

            else:
                new_crosses[tuple(keys)] = hash_bucket_size
            key = "_X_".join(keys)
            new_feature_columns.append(
                _make_categorical_embedding(key, hash_bucket_size, embedding_dim)
            )

        if new_crosses:
            features += new_crosses.keys() >> HashedCross(new_crosses)

    if numeric_columns:
        features += [col.key for col in numeric_columns]

    workflow = nvt.Workflow(features)

    return workflow, numeric_columns + new_feature_columns
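def _example_feature_column_conversion():
    # Hedged usage sketch, not part of the original module: shows how a couple of
    # TensorFlow feature columns might be converted with make_feature_column_workflow.
    # The column names ("age", "occupation", "label"), vocabulary, dimension, and the
    # "train/*.parquet" path are illustrative placeholders, not values from the source.
    import tensorflow as tf
    import nvtabular as nvt

    age = tf.feature_column.numeric_column("age")
    occupation = tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            "occupation", ["student", "engineer", "teacher"]
        ),
        dimension=8,
    )

    # Returns the emulating Workflow plus the feature columns describing its outputs,
    # which (per the docstring above) can feed a layers.ScalarDenseFeatures layer.
    workflow, output_columns = make_feature_column_workflow([age, occupation], "label")
    workflow.fit(nvt.Dataset("train/*.parquet"))
    return workflow, output_columns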
def process_NVT(args):

    if args.feature_cross_list:
        feature_pairs = [pair.split("_") for pair in args.feature_cross_list.split(",")]
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0] + '_' + pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(args.out_path, 'val/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp = [PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val]
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion
    for one_path in PREPROCESS_DIR_temp:
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.mkdir(one_path)

    ## Get Dask Client
    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster = None
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            CUDA_VISIBLE_DEVICES=args.devices,
            n_workers=len(args.devices.split(",")),
            enable_nvlink=True,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=len(args.devices.split(",")),
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port,
        )

    # Create the distributed client
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    # calculate the total processing time
    runtime = time.time()

    # test dataset without the label feature
    if args.dataset_type == 'test':
        global LABEL_COLUMNS
        LABEL_COLUMNS = []

    ##-----------------------------------##
    # Dask rapids converts txt to parquet
    # Dask cudf dataframe = ddf

    ## train/valid txt to parquet
    train_valid_paths = [
        (train_input, PREPROCESS_DIR_temp_train),
        (val_input, PREPROCESS_DIR_temp_val),
    ]

    for input, temp_output in train_valid_paths:

        ddf = dask_cudf.read_csv(
            input, sep='\t', names=LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS
        )

        ## Convert label col to FP32
        if args.parquet_format and args.dataset_type == 'train':
            ddf["label"] = ddf['label'].astype('float32')

        # Save it as parquet format for better memory usage
        ddf.to_parquet(temp_output, header=True)
    ##-----------------------------------##

    COLUMNS = LABEL_COLUMNS + CONTINUOUS_COLUMNS + CROSS_COLUMNS + CATEGORICAL_COLUMNS

    train_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
    valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

    categorify_op = Categorify(freq_threshold=args.freq_limit)
    cat_features = CATEGORICAL_COLUMNS >> categorify_op
    cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
    cross_cat_op = Categorify(freq_threshold=args.freq_limit)

    features = LABEL_COLUMNS

    if args.criteo_mode == 0:
        features += cont_features
        if args.feature_cross_list:
            feature_pairs = [pair.split("_") for pair in args.feature_cross_list.split(",")]
            for pair in feature_pairs:
                col0 = pair[0]
                col1 = pair[1]
                features += col0 >> FeatureCross(col1) >> Rename(postfix="_" + col1) >> cross_cat_op

    features += cat_features

    workflow = nvt.Workflow(features, client=client)

    logging.info("Preprocessing")

    output_format = 'hugectr'
    if args.parquet_format:
        output_format = 'parquet'

    # just for /samples/criteo model
    train_ds_iterator = nvt.Dataset(
        train_paths, engine='parquet', part_size=int(args.part_mem_frac * device_size)
    )
    valid_ds_iterator = nvt.Dataset(
        valid_paths, engine='parquet', part_size=int(args.part_mem_frac * device_size)
    )

    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt.io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt.io.Shuffle.PER_PARTITION

    logging.info('Train Datasets Preprocessing.....')

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    if not args.criteo_mode:
        for col in CONTINUOUS_COLUMNS:
            dict_dtypes[col] = np.float32
    for col in CROSS_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    conts = CONTINUOUS_COLUMNS if not args.criteo_mode else []

    workflow.fit(train_ds_iterator)

    if output_format == 'hugectr':
        workflow.transform(train_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=train_output,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads,
        )
    else:
        workflow.transform(train_ds_iterator).to_parquet(
            output_path=train_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads,
        )

    ### Getting slot size ###
    ##--------------------##
    embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS] + [
        embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS
    ]

    print(embeddings)
    ##--------------------##

    logging.info('Valid Datasets Preprocessing.....')

    if output_format == 'hugectr':
        workflow.transform(valid_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=val_output,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads,
        )
    else:
        workflow.transform(valid_ds_iterator).to_parquet(
            output_path=val_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads,
        )

    embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS] + [
        embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS
    ]

    print(embeddings)
    ##--------------------##

    ## Shutdown clusters
    client.close()

    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path | {args.data_path}")
    print(f"output_path | {args.out_path}")
    print(f"partition size | {'%.2f GB' % bytesto(int(args.part_mem_frac * device_size), 'g')}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devices}")
    print(f"rmm-pool-frac | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads | {args.num_io_threads}")
    print(f"shuffle | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s] | {runtime}")
    print("======================================\n")
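def _example_process_nvt_args():
    # Hedged usage sketch, not part of the original script: process_NVT expects an
    # argparse-style namespace. The attribute names below are taken from the code
    # above; every value is an illustrative placeholder, not a recommended setting.
    from argparse import Namespace

    return Namespace(
        data_path="/data/criteo",
        out_path="/data/criteo_out",
        devices="0,1",
        protocol="tcp",
        device_limit_frac=0.8,
        device_pool_frac=0.9,
        part_mem_frac=0.12,
        out_files_per_proc=8,
        freq_limit=6,
        shuffle="PER_PARTITION",
        parquet_format=True,
        criteo_mode=0,
        dataset_type="train",
        feature_cross_list=None,
        num_io_threads=2,
        dashboard_port="8787",
    )


# Example invocation: process_NVT(_example_process_nvt_args())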
def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
    rmm.reinitialize(managed_memory=False)
    documents_categories_path = os.path.join(data_bucket_folder, "documents_categories.csv")
    documents_topics_path = os.path.join(data_bucket_folder, "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder, "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes
    )

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    geo_location = ColumnGroup(["geo_location"])
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    ctr_cols = (
        stat_cols
        - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (
                (col) / (gdf[col.name.replace("_clicked_sum", "_count")])
            ).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            dependency=stat_cols
            - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]

    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices, local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow
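def _example_create_workflow():
    # Hedged usage sketch, not part of the original module: hash_spec maps hashed
    # categorical columns to bucket counts for HashBucket. The folder path, column
    # names, bucket sizes, and device list below are illustrative placeholders only.
    hash_spec = {"ad_id": 250_000, "document_id": 100_000, "geo_location": 50_000}
    return create_workflow(
        data_bucket_folder="/data/outbrain",
        hash_spec=hash_spec,
        devices=[0, 1],
        local_directory="/tmp/dask-space",
        dask=True,
    )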