def _pool(frac=0.8):
    # RMM requires an integer pool size that is a multiple of 256 bytes.
    initial_pool_size = int(frac * device_mem_size())
    if initial_pool_size % 256 != 0:
        new_initial_pool_size = initial_pool_size // 256 * 256
        print(
            f"Initial pool size for rmm has to be a multiple of 256. "
            f"Got {initial_pool_size}, reducing to {new_initial_pool_size}"
        )
        initial_pool_size = new_initial_pool_size
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=initial_pool_size,
    )
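# NOTE: several snippets below call setup_rmm_pool(client, pool_size) without
# defining it. A minimal sketch of such a helper is given here for reference;
# the exact definition in the original sources may differ. The _pool helper above
# plays a similar per-worker role when only a memory fraction is known.
def setup_rmm_pool(client, pool_size):
    # Run rmm.reinitialize on every Dask worker so each GPU gets its own pool.
    client.run(rmm.reinitialize, pool_allocator=True, initial_pool_size=pool_size)
    return None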
def create_client(devices, local_directory):
    client = None
    if len(devices) > 1:
        device_size = device_mem_size(kind="total")
        device_limit = int(0.8 * device_size)
        device_pool_size = int(0.8 * device_size)
        cluster = LocalCUDACluster(
            n_workers=len(devices),
            CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
            device_memory_limit=device_limit,
            local_directory=local_directory,
        )
        client = Client(cluster)
        setup_rmm_pool(client, device_pool_size)
    return client
def main(args):
    # Get device configuration
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Get dataset columns
    with fsspec.open(args.config_file) as f:
        config = json.load(f)

    # Create Dataset
    dataset = Dataset(args.data_path, engine=args.format, part_size=part_size)

    # Call Inspector
    with managed_client(args.devices, device_limit, args.protocol) as client:
        setup_rmm_pool(client, device_pool_size)
        inspector = datains.DatasetInspector(client)
        inspector.inspect(dataset, config, args.output_file)
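# NOTE: managed_client(...) is used above (and again in nvt_etl below) but is not
# defined in these snippets. A minimal sketch, assuming it is a context manager
# that builds a LocalCUDACluster and tears it down on exit. The argument order
# differs between the two call sites, so treat this signature as illustrative only.
import contextlib


@contextlib.contextmanager
def managed_client(devices, device_limit, protocol):
    # devices is assumed to be a comma-separated string such as "0,1".
    cluster = LocalCUDACluster(
        protocol=protocol,
        n_workers=len(devices.split(",")),
        CUDA_VISIBLE_DEVICES=devices,
        device_memory_limit=device_limit,
    )
    client = Client(cluster)
    try:
        yield client
    finally:
        client.close()
        cluster.close()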
def set_cluster_client(n_gpus=-1, device_spill_frac=0.8):
    # TODO: If the user calls this function a second time, recreating the cluster
    # fails. A new cluster can only be created after a kernel restart.
    '''
    device_spill_frac: Spill GPU-Worker memory to host at this limit.
                       Reduce if spilling fails to prevent device memory errors.
    '''
    if os.path.isdir("dask-worker-space"):
        shutil.rmtree('dask-worker-space', ignore_errors=True)

    # Deploy a Single-Machine Multi-GPU Cluster
    if n_gpus == -1:
        nvidia_smi.nvmlInit()
        n_gpus_avail = nvidia_smi.nvmlDeviceGetCount()
        print('\n n_gpus_avail: {}'.format(n_gpus_avail))
        n_gpus = n_gpus_avail

    # Select devices to place workers
    visible_devices = ",".join(str(i) for i in range(n_gpus))

    # TODO: how to reinitialize the cluster
    cluster = LocalCUDACluster(
        protocol="tcp",  # "tcp" or "ucx"
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=device_spill_frac * device_mem_size(kind="total"),
    )

    try:
        # Create the distributed client
        client = Client(cluster)
        display(client)
        print('\n Dashboard avail: http://localhost:8888/proxy/8787/status')

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                pool_allocator=True,
                initial_pool_size=None,  # Use default size
            )

        client.run(_rmm_pool)
        return client
    except MemoryError:
        print('\n The client is already initialized')
def process(args):
    train_path = os.path.abspath("../din_data/train")
    test_path = os.path.abspath("../din_data/valid")
    if os.path.exists(train_path):
        shutil.rmtree(train_path)
    if os.path.exists(test_path):
        shutil.rmtree(test_path)
    os.mkdir(train_path)
    os.mkdir(test_path)

    # Paths to save temp parquet
    train_temp = "../din_data/train_temp.parquet"
    valid_temp = "../din_data/test_temp.parquet"
    # Paths to save final parquet
    train_output = train_path
    valid_output = test_path

    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster = None
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            CUDA_VISIBLE_DEVICES=args.devices,
            n_workers=len(args.devices.split(",")),
            enable_nvlink=True,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=len(args.devices.split(",")),
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port,
        )

    # Create the distributed client
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    runtime = time.time()

    ## Real work starts here
    features = LABEL + ColumnGroup(CAT_COLUMNS)
    workflow = nvt.Workflow(features, client=client)

    train_ds_iterator = nvt.Dataset(
        train_temp, engine='parquet', part_size=int(args.part_mem_frac * device_size))
    valid_ds_iterator = nvt.Dataset(
        valid_temp, engine='parquet', part_size=int(args.part_mem_frac * device_size))

    ## Shuffle
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt.io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt.io.Shuffle.PER_PARTITION

    dict_dtypes = {}
    for col in CAT_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL:
        dict_dtypes[col] = np.float32

    workflow.fit(train_ds_iterator)
    workflow.transform(train_ds_iterator).to_parquet(
        output_path=train_output,
        dtypes=dict_dtypes,
        cats=CAT_COLUMNS,
        labels=LABEL,
        shuffle=shuffle,
        out_files_per_proc=args.out_files_per_proc,
        num_threads=args.num_io_threads)
    workflow.transform(valid_ds_iterator).to_parquet(
        output_path=valid_output,
        dtypes=dict_dtypes,
        cats=CAT_COLUMNS,
        labels=LABEL,
        shuffle=shuffle,
        out_files_per_proc=args.out_files_per_proc,
        num_threads=args.num_io_threads)

    client.close()
    print("Time:", time.time() - runtime)
def main(args):
    """Multi-GPU Criteo/DLRM Preprocessing Benchmark

    This benchmark is designed to measure the time required to preprocess
    the Criteo (1TB) dataset for Facebook's DLRM model. The user must specify
    the path of the raw dataset (using the `--data-path` flag), as well as the
    output directory for all temporary/final data (using the `--out-path` flag).

    Example Usage
    -------------
    python dask-nvtabular-criteo-benchmark.py \
        --data-path /path/to/criteo_parquet --out-path /out/dir/

    Dataset Requirements (Parquet)
    ------------------------------
    This benchmark is designed with a parquet-formatted dataset in mind.
    While a CSV-formatted dataset can be processed by NVTabular, converting
    to parquet will yield significantly better performance. To convert your
    dataset, try using the `optimize_criteo.ipynb` notebook (also located
    in `NVTabular/examples/`).

    For a detailed parameter overview see `NVTabular/examples/MultiGPUBench.md`
    """

    # Input
    data_path = args.data_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.out_files_per_proc
    high_card_columns = args.high_cards.split(",")
    dashboard_port = args.dashboard_port
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS

    # Cleanup output directory
    BASE_DIR = args.out_path
    dask_workdir = os.path.join(BASE_DIR, "workdir")
    output_path = os.path.join(BASE_DIR, "output")
    stats_path = os.path.join(BASE_DIR, "stats")
    if not os.path.isdir(BASE_DIR):
        os.mkdir(BASE_DIR)
    for dir_path in (dask_workdir, output_path, stats_path):
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.mkdir(dir_path)

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",") if args.cont_names
                  else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",") if args.cat_names
                 else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    # Specify Categorify/GroupbyStatistics options
    tree_width = {}
    cat_cache = {}
    for col in cat_names:
        if col in high_card_columns:
            tree_width[col] = args.tree_width
            cat_cache[col] = args.cat_cache_high
        else:
            tree_width[col] = 1
            cat_cache[col] = args.cat_cache_low

    # Use total device size to calculate args.device_limit_frac
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Parse shuffle option
    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt_io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt_io.Shuffle.PER_PARTITION

    # Check if any device memory is already occupied
    for dev in args.devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devices,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=dask_workdir,
            dashboard_address=":" + dashboard_port,
        )
    client = Client(cluster)

    # Setup RMM pool
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, client=client)
    if args.normalize:
        processor.add_feature([ops.FillMissing(), ops.Normalize()])
    else:
        processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=stats_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            search_sorted=not freq_limit,
            on_host=not args.cats_on_device,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                output_path=output_path,
                num_io_threads=args.num_io_threads,
            )
    else:
        processor.apply(
            dataset,
            num_io_threads=args.num_io_threads,
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            output_path=output_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print(f"cats-on-device     | {args.cats_on_device}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
def run_preprocessing(input_train_path, workflow_path, output_path,
                      dask_workdir, num_gpus):
    fname = '{}.parquet'
    train_files = [
        i for i in os.listdir(input_train_path)
        if re.match(fname.format('.*'), i) is not None
    ]
    train_paths = [
        os.path.join(input_train_path, filename) for filename in train_files
    ]

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    visible_devices = ",".join([str(n) for n in num_gpus])  # Select devices to place workers
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05

    # Use total device size to calculate the partition size, device limit and pool size
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    # Deploy Dask Distributed cluster only if asked for multiple GPUs
    if len(num_gpus) > 1:
        logging.info("Deploy Dask Distributed cluster...")

        device_limit = int(device_limit_frac * device_size)
        device_pool_size = int(device_pool_frac * device_size)

        logging.info("Checking if any device memory is already occupied...")
        # Check if any device memory is already occupied
        for dev in visible_devices.split(","):
            fmem = _pynvml_mem_size(kind="free", index=int(dev))
            used = (device_size - fmem) / 1e9
            if used > 1.0:
                warnings.warn(
                    f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                )

        cluster = None  # (Optional) Specify existing scheduler port
        if cluster is None:
            cluster = LocalCUDACluster(
                protocol=protocol,
                n_workers=len(visible_devices.split(",")),
                CUDA_VISIBLE_DEVICES=visible_devices,
                device_memory_limit=device_limit,
                local_directory=dask_workdir)

        logging.info("Create the distributed client...")
        # Create the distributed client
        client = Client(cluster)

        logging.info("Initialize memory pools...")

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                # RMM may require the pool size to be a multiple of 256.
                pool_allocator=True,
                initial_pool_size=(device_pool_size // 256) * 256,
            )

        client.run(_rmm_pool)

    # Import the test .parquet
    logging.info("Importing Data...")
    test_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)

    logging.info("Loading workflow object...")
    workflow = nvt.Workflow.load(workflow_path)

    # Specify the column IDs: these must exactly match the columns used when
    # preprocessing the train/valid datasets.
    CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
    CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
    LABEL_COLUMNS = ['label']

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in CONTINUOUS_COLUMNS:
        dict_dtypes[col] = np.float32
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    # Create output directory for test data
    output_test_dir = os.path.join(output_path, 'train/')
    if not os.path.exists(output_test_dir):
        logging.info(f"Creating train/ directory at: {output_test_dir}")
        os.makedirs(output_test_dir)

    logging.info("Preprocessing Data...")
    workflow.transform(test_dataset).to_parquet(
        output_path=output_test_dir,
        dtypes=dict_dtypes,
        cats=CATEGORICAL_COLUMNS,
        conts=CONTINUOUS_COLUMNS,
        labels=LABEL_COLUMNS)

    logging.info("Done!")
def process_NVT(args):
    if args.feature_cross_list:
        feature_pairs = [
            pair.split("_") for pair in args.feature_cross_list.split(",")
        ]
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0] + '_' + pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(
        args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(
        args.out_path, 'val/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp = [PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val]
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion
    for one_path in PREPROCESS_DIR_temp:
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.mkdir(one_path)

    ## Get Dask Client
    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster = None
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            CUDA_VISIBLE_DEVICES=args.devices,
            n_workers=len(args.devices.split(",")),
            enable_nvlink=True,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port)
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=len(args.devices.split(",")),
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port)

    # Create the distributed client
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    # Calculate the total processing time
    runtime = time.time()

    # Test dataset has no label feature
    if args.dataset_type == 'test':
        global LABEL_COLUMNS
        LABEL_COLUMNS = []

    ##-----------------------------------##
    # Dask-cuDF converts txt to parquet
    # Dask cuDF dataframe = ddf

    ## train/valid txt to parquet
    train_valid_paths = [(train_input, PREPROCESS_DIR_temp_train),
                         (val_input, PREPROCESS_DIR_temp_val)]

    for input, temp_output in train_valid_paths:
        ddf = dask_cudf.read_csv(
            input, sep='\t',
            names=LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS)

        ## Convert label col to FP32
        if args.parquet_format and args.dataset_type == 'train':
            ddf["label"] = ddf['label'].astype('float32')

        # Save it as parquet format for better memory usage
        ddf.to_parquet(temp_output, header=True)
    ##-----------------------------------##

    COLUMNS = LABEL_COLUMNS + CONTINUOUS_COLUMNS + CROSS_COLUMNS + CATEGORICAL_COLUMNS
    train_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
    valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

    if args.criteo_mode == 0:
        proc = nvt.Workflow(cat_names=CROSS_COLUMNS + CATEGORICAL_COLUMNS,
                            cont_names=CONTINUOUS_COLUMNS,
                            label_name=LABEL_COLUMNS,
                            client=client)
        logging.info('FillMissing processing')
        proc.add_cont_feature(nvt.ops.FillMissing())

        # For feature cross
        if args.feature_cross_list:
            logging.info('Feature Crossing')
            feature_pairs = [
                pair.split("_") for pair in args.feature_cross_list.split(",")
            ]
            for pair in feature_pairs:
                col0 = pair[0]
                col1 = pair[1]
                ## LambdaOp will automatically add a new column named
                ## col_name + "_" + op_name for differentiation.
                # Bind col1 as a default argument so each lambda keeps its own column
                # instead of all lambdas capturing the loop's final value.
                proc.add_cat_feature(
                    nvt.ops.LambdaOp(op_name=col1,
                                     f=lambda col, gdf, col1=col1: col + gdf[col1],
                                     columns=[col0],
                                     replace=False))

        logging.info('Normalization processing')
        proc.add_cont_preprocess(nvt.ops.Normalize())
    else:
        proc = nvt.Workflow(cat_names=CROSS_COLUMNS + CATEGORICAL_COLUMNS,
                            cont_names=[],
                            label_name=LABEL_COLUMNS,
                            client=client)

    logging.info('Categorify processing')
    proc.add_cat_preprocess(
        nvt.ops.Categorify(freq_threshold=args.freq_limit,
                           columns=CROSS_COLUMNS + CATEGORICAL_COLUMNS))
    proc.finalize()  # prepare to load the config

    ## Define the output format ##
    output_format = 'hugectr'
    if args.parquet_format:
        output_format = 'parquet'
    ##--------------------##

    # just for /samples/criteo model
    train_ds_iterator = nvt.Dataset(
        train_paths, engine='parquet',
        part_size=int(args.part_mem_frac * device_size))
    valid_ds_iterator = nvt.Dataset(
        valid_paths, engine='parquet',
        part_size=int(args.part_mem_frac * device_size))

    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt.io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt.io.Shuffle.PER_PARTITION

    logging.info('Train Datasets Preprocessing.....')
    proc.apply(
        train_ds_iterator,
        output_path=train_output,
        out_files_per_proc=args.out_files_per_proc,
        output_format=output_format,
        shuffle=shuffle,
        num_io_threads=args.num_io_threads,
    )

    ##--------------------##
    embeddings = nvt.ops.get_embedding_sizes(proc)
    print(embeddings)
    # Output slot_size for each categorical feature
    slot_size = []
    for item in CROSS_COLUMNS + CATEGORICAL_COLUMNS:
        slot_size.append(embeddings[item][0])
    print(slot_size)
    ##--------------------##

    logging.info('Valid Datasets Preprocessing.....')
    proc.apply(
        valid_ds_iterator,
        record_stats=False,
        output_path=val_output,
        out_files_per_proc=args.out_files_per_proc,
        output_format=output_format,
        shuffle=shuffle,
        num_io_threads=args.num_io_threads,
    )

    embeddings = nvt.ops.get_embedding_sizes(proc)
    print(embeddings)
    # Output slot_size for each categorical feature
    slot_size = []
    for item in CROSS_COLUMNS + CATEGORICAL_COLUMNS:
        slot_size.append(embeddings[item][0])
    print(slot_size)
    ##--------------------##

    ## Shutdown clusters
    client.close()
    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(f"partition size     | {'%.2f GB' % bytesto(int(args.part_mem_frac * device_size), 'g')}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")
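# NOTE: the summary printouts above (and in the second process_NVT variant below)
# call bytesto(..., 'g') to report the partition size in GB, but the helper is not
# defined in these snippets. A minimal sketch; the name and exact behaviour are
# assumptions, not taken from the original source.
def bytesto(byte_count, to, bsize=1024):
    # Convert a raw byte count to KB/MB/GB/TB using binary (1024-based) units.
    units = {"k": 1, "m": 2, "g": 3, "t": 4}
    return byte_count / (bsize ** units[to])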
def run_preprocessing(input_path, base_dir, num_train_days, num_val_days, num_gpus):
    # Define paths to save artifacts
    dask_workdir = os.path.join(base_dir, "test_dask/workdir")
    output_path = os.path.join(base_dir, "test_dask/output")
    stats_path = os.path.join(base_dir, "test_dask/stats")
    logging.info(f"Dask Workdir: {dask_workdir}")
    logging.info(f"Output Path: {output_path}")

    # Make sure we have a clean worker space for Dask
    if os.path.isdir(dask_workdir):
        shutil.rmtree(dask_workdir)
    os.makedirs(dask_workdir)

    # Make sure we have a clean stats space for Dask
    if os.path.isdir(stats_path):
        shutil.rmtree(stats_path)
    os.mkdir(stats_path)

    # Make sure we have a clean output path
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    os.mkdir(output_path)
    logging.info("Created output directories..")

    # This requires the data to be in this specific format, e.g. day_0.parquet, day_2.parquet etc.
    fname = 'day_{}.parquet'
    num_days = len([
        i for i in os.listdir(input_path)
        if re.match(fname.format('[0-9]{1,2}'), i) is not None
    ])
    train_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days)
    ]
    valid_paths = [
        os.path.join(input_path, fname.format(day))
        for day in range(num_train_days, num_train_days + num_val_days)
    ]
    logging.info(f"Training data: {train_paths}")
    logging.info(f"Validation data: {valid_paths}")

    # Deploy a Dask Distributed Cluster
    # Single-Machine Multi-GPU Cluster
    protocol = "tcp"  # "tcp" or "ucx"
    visible_devices = ",".join([str(n) for n in num_gpus])  # Select devices to place workers
    device_limit_frac = 0.4  # Spill GPU-Worker memory to host at this limit.
    device_pool_frac = 0.5
    part_mem_frac = 0.05  # Desired maximum size of each partition as a fraction of total GPU memory.

    # Use total device size to calculate the partition size, device limit and pool size
    device_size = device_mem_size(kind="total")
    part_size = int(part_mem_frac * device_size)
    logging.info(f"Partition size: {part_size}")

    # Deploy Dask Distributed cluster only if asked for multiple GPUs
    if len(num_gpus) > 1:
        device_limit = int(device_limit_frac * device_size)
        device_pool_size = int(device_pool_frac * device_size)

        logging.info("Checking if any device memory is already occupied..")
        # Check if any device memory is already occupied
        for dev in visible_devices.split(","):
            fmem = _pynvml_mem_size(kind="free", index=int(dev))
            used = (device_size - fmem) / 1e9
            if used > 1.0:
                warnings.warn(
                    f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
                )

        cluster = None  # (Optional) Specify existing scheduler port
        if cluster is None:
            cluster = LocalCUDACluster(
                protocol=protocol,
                n_workers=len(visible_devices.split(",")),
                CUDA_VISIBLE_DEVICES=visible_devices,
                device_memory_limit=device_limit,
                local_directory=dask_workdir)

        logging.info("Create the distributed client..")
        # Create the distributed client
        client = Client(cluster)

        logging.info("Initialize memory pools..")

        # Initialize RMM pool on ALL workers
        def _rmm_pool():
            rmm.reinitialize(
                # RMM may require the pool size to be a multiple of 256.
                pool_allocator=True,
                initial_pool_size=(device_pool_size // 256) * 256,
            )

        client.run(_rmm_pool)

    # Preprocessing
    CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
    CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
    LABEL_COLUMNS = ['label']
    COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS

    cat_features = CATEGORICAL_COLUMNS >> Categorify(out_path=stats_path)
    cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
    features = cat_features + cont_features + LABEL_COLUMNS

    logging.info("Defining a workflow object..")
    if len(num_gpus) > 1:
        workflow = nvt.Workflow(features, client=client)
    else:
        workflow = nvt.Workflow(features)

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in CONTINUOUS_COLUMNS:
        dict_dtypes[col] = np.float32
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)
    valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size)

    output_train_dir = os.path.join(output_path, 'train/')
    logging.info(f"Creating train/ directory at: {output_train_dir}")
    if not os.path.exists(output_train_dir):
        os.makedirs(output_train_dir)

    output_valid_dir = os.path.join(output_path, 'valid/')
    logging.info(f"Creating valid/ directory at: {output_valid_dir}")
    if not os.path.exists(output_valid_dir):
        os.makedirs(output_valid_dir)

    logging.info("Workflow Fit..")
    workflow.fit(train_dataset)

    logging.info("Transform Training data..")
    workflow.transform(train_dataset).to_parquet(
        output_path=output_train_dir,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        dtypes=dict_dtypes,
        cats=CATEGORICAL_COLUMNS,
        conts=CONTINUOUS_COLUMNS,
        labels=LABEL_COLUMNS)

    logging.info("Transform Validation data..")
    workflow.transform(valid_dataset).to_parquet(
        output_path=output_valid_dir,
        dtypes=dict_dtypes,
        cats=CATEGORICAL_COLUMNS,
        conts=CONTINUOUS_COLUMNS,
        labels=LABEL_COLUMNS)

    # Use this printed list of cardinalities for the "slot_size_array" in the
    # HugeCTR training config "dcn_parquet.json".
    cardinalities = []
    for col in CATEGORICAL_COLUMNS:
        cardinalities.append(nvt.ops.get_embedding_sizes(workflow)[col][0])
    logging.info(f"Cardinalities for configuring slot_size_array: {cardinalities}")

    logging.info(f"Saving workflow object at: {output_path + '/workflow'}")
    workflow.save(output_path + '/workflow')
    logging.info("Done!")
def process_NVT(args):
    if args.feature_cross_list:
        feature_pairs = [
            pair.split("_") for pair in args.feature_cross_list.split(",")
        ]
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0] + '_' + pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(
        args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(
        args.out_path, 'val/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp = [PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val]
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion
    for one_path in PREPROCESS_DIR_temp:
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.mkdir(one_path)

    ## Get Dask Client
    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster = None
    if args.protocol == "ucx":
        UCX_TLS = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        os.environ["UCX_TLS"] = UCX_TLS
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            CUDA_VISIBLE_DEVICES=args.devices,
            n_workers=len(args.devices.split(",")),
            enable_nvlink=True,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port)
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=len(args.devices.split(",")),
            CUDA_VISIBLE_DEVICES=args.devices,
            device_memory_limit=int(device_size * args.device_limit_frac),
            dashboard_address=":" + args.dashboard_port)

    # Create the distributed client
    client = Client(cluster)
    if args.device_pool_frac > 0.01:
        setup_rmm_pool(client, int(args.device_pool_frac * device_size))

    # calculate the total processing time
    runtime = time.time()

    # test dataset without the label feature
    if args.dataset_type == 'test':
        global LABEL_COLUMNS
        LABEL_COLUMNS = []

    ##-----------------------------------##
    # Dask rapids converts txt to parquet
    # Dask cudf dataframe = ddf

    ## train/valid txt to parquet
    train_valid_paths = [(train_input, PREPROCESS_DIR_temp_train),
                         (val_input, PREPROCESS_DIR_temp_val)]

    for input, temp_output in train_valid_paths:
        ddf = dask_cudf.read_csv(
            input, sep='\t',
            names=LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS)

        ## Convert label col to FP32
        if args.parquet_format and args.dataset_type == 'train':
            ddf["label"] = ddf['label'].astype('float32')

        # Save it as parquet format for better memory usage
        ddf.to_parquet(temp_output, header=True)
    ##-----------------------------------##

    COLUMNS = LABEL_COLUMNS + CONTINUOUS_COLUMNS + CROSS_COLUMNS + CATEGORICAL_COLUMNS
    train_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
    valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

    categorify_op = Categorify(freq_threshold=args.freq_limit)
    cat_features = CATEGORICAL_COLUMNS >> categorify_op
    cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
    cross_cat_op = Categorify(freq_threshold=args.freq_limit)

    features = LABEL_COLUMNS
    if args.criteo_mode == 0:
        features += cont_features
        if args.feature_cross_list:
            feature_pairs = [
                pair.split("_") for pair in args.feature_cross_list.split(",")
            ]
            for pair in feature_pairs:
                col0 = pair[0]
                col1 = pair[1]
                features += col0 >> FeatureCross(col1) >> Rename(postfix="_" + col1) >> cross_cat_op
    features += cat_features

    workflow = nvt.Workflow(features, client=client)

    logging.info("Preprocessing")

    output_format = 'hugectr'
    if args.parquet_format:
        output_format = 'parquet'

    # just for /samples/criteo model
    train_ds_iterator = nvt.Dataset(
        train_paths, engine='parquet',
        part_size=int(args.part_mem_frac * device_size))
    valid_ds_iterator = nvt.Dataset(
        valid_paths, engine='parquet',
        part_size=int(args.part_mem_frac * device_size))

    shuffle = None
    if args.shuffle == "PER_WORKER":
        shuffle = nvt.io.Shuffle.PER_WORKER
    elif args.shuffle == "PER_PARTITION":
        shuffle = nvt.io.Shuffle.PER_PARTITION

    logging.info('Train Datasets Preprocessing.....')

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    if not args.criteo_mode:
        for col in CONTINUOUS_COLUMNS:
            dict_dtypes[col] = np.float32
    for col in CROSS_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    conts = CONTINUOUS_COLUMNS if not args.criteo_mode else []

    workflow.fit(train_ds_iterator)

    if output_format == 'hugectr':
        workflow.transform(train_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=train_output,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)
    else:
        workflow.transform(train_ds_iterator).to_parquet(
            output_path=train_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)

    ### Getting slot size ###
    ##--------------------##
    embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS] + \
                 [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS]

    print(embeddings)
    ##--------------------##

    logging.info('Valid Datasets Preprocessing.....')

    if output_format == 'hugectr':
        workflow.transform(valid_ds_iterator).to_hugectr(
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            output_path=val_output,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)
    else:
        workflow.transform(valid_ds_iterator).to_parquet(
            output_path=val_output,
            dtypes=dict_dtypes,
            cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
            conts=conts,
            labels=LABEL_COLUMNS,
            shuffle=shuffle,
            out_files_per_proc=args.out_files_per_proc,
            num_threads=args.num_io_threads)

    embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
    embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
    embeddings = [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS] + \
                 [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS]

    print(embeddings)
    ##--------------------##

    ## Shutdown clusters
    client.close()
    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(f"partition size     | {'%.2f GB' % bytesto(int(args.part_mem_frac * device_size), 'g')}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")
def nvt_etl(
    data_path,
    out_path,
    devices,
    protocol,
    device_limit_frac,
    device_pool_frac,
    part_mem_frac,
    cats,
    conts,
    labels,
    out_files_per_proc,
):
    # Set up data paths
    input_path = data_path[:-1] if data_path[-1] == "/" else data_path
    base_dir = out_path[:-1] if out_path[-1] == "/" else out_path
    dask_workdir = os.path.join(base_dir, "workdir")
    output_path = os.path.join(base_dir, "output")
    stats_path = os.path.join(base_dir, "stats")
    output_train_dir = os.path.join(output_path, "train/")
    output_valid_dir = os.path.join(output_path, "valid/")

    # Make sure we have a clean worker space for Dask
    if os.path.isdir(dask_workdir):
        shutil.rmtree(dask_workdir)
    os.makedirs(dask_workdir)

    # Make sure we have a clean stats space for Dask
    if os.path.isdir(stats_path):
        shutil.rmtree(stats_path)
    os.mkdir(stats_path)

    # Make sure we have a clean output path
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    os.mkdir(output_path)
    os.mkdir(output_train_dir)
    os.mkdir(output_valid_dir)

    # Get train/valid files
    train_paths = [
        os.path.join(input_path, f)
        for f in os.listdir(input_path)
        if os.path.isfile(os.path.join(input_path, f))
    ]
    n_files = int(len(train_paths) * 0.9)
    valid_paths = train_paths[n_files:]
    train_paths = train_paths[:n_files]

    # Force dtypes for HugeCTR usage
    dict_dtypes = {}
    for col in cats:
        dict_dtypes[col] = np.int64
    for col in conts:
        dict_dtypes[col] = np.float32
    for col in labels:
        dict_dtypes[col] = np.float32

    # Use total device size to calculate the device limit and pool size
    device_size = device_mem_size(kind="total")
    device_limit = int(device_limit_frac * device_size)
    device_pool_size = int(device_pool_frac * device_size)
    part_size = int(part_mem_frac * device_size)

    # Check if any device memory is already occupied
    for dev in devices.split(","):
        fmem = _pynvml_mem_size(kind="free", index=int(dev))
        used = (device_size - fmem) / 1e9
        if used > 1.0:
            warnings.warn(
                f"BEWARE - {used} GB is already occupied on device {int(dev)}!"
            )

    # Setup dask cluster and perform ETL
    with managed_client(dask_workdir, devices, device_limit, protocol) as client:
        # Setup RMM pool
        if device_pool_frac > 0.01:
            setup_rmm_pool(client, device_pool_size)

        # Define Dask NVTabular "Workflow"
        cont_features = conts >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()
        cat_features = cats >> ops.Categorify(out_path=stats_path, max_size=10000000)
        workflow = Workflow(cat_features + cont_features + labels, client=client)

        train_dataset = Dataset(train_paths, engine="parquet", part_size=part_size)
        valid_dataset = Dataset(valid_paths, engine="parquet", part_size=part_size)

        workflow.fit(train_dataset)

        workflow.transform(train_dataset).to_parquet(
            output_path=output_train_dir,
            shuffle=nvt_io.Shuffle.PER_WORKER,
            dtypes=dict_dtypes,
            cats=cats,
            conts=conts,
            labels=labels,
            out_files_per_proc=out_files_per_proc,
        )
        workflow.transform(valid_dataset).to_parquet(
            output_path=output_valid_dir,
            shuffle=nvt_io.Shuffle.PER_WORKER,
            dtypes=dict_dtypes,
            cats=cats,
            conts=conts,
            labels=labels,
            out_files_per_proc=out_files_per_proc,
        )

        workflow.save(os.path.join(output_path, "workflow"))

        return workflow
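# NOTE: an example invocation of nvt_etl with Criteo-style column names. The paths
# and fractions below are illustrative assumptions, not values from the original
# source; adjust them to the local dataset layout and GPU memory budget.
if __name__ == "__main__":
    workflow = nvt_etl(
        data_path="/raid/criteo/parquet",       # hypothetical input directory
        out_path="/raid/criteo/processed",      # hypothetical output directory
        devices="0,1",
        protocol="tcp",
        device_limit_frac=0.8,
        device_pool_frac=0.9,
        part_mem_frac=0.125,
        cats=["C" + str(x) for x in range(1, 27)],
        conts=["I" + str(x) for x in range(1, 14)],
        labels=["label"],
        out_files_per_proc=8,
    )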