def test_join_external(tmpdir, df, dataset, engine, kind_ext, cache, how, cpu, drop_duplicates):
    # Define "external" table
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    if kind_ext == "pandas":
        df_ext = df_ext.to_pandas()
    elif kind_ext == "arrow":
        df_ext = df_ext.to_arrow()
    elif kind_ext == "parquet":
        path = tmpdir.join("external.parquet")
        df_ext.to_parquet(path)
        df_ext = path
    elif kind_ext == "parquet-multi":
        path = tmpdir.join("external-multi.parquet")
        dask_cudf.from_cudf(df_ext, npartitions=3).to_parquet(path)
        df_ext = path
    elif kind_ext == "csv":
        path = tmpdir.join("external.csv")
        df_ext.to_csv(path)
        df_ext = path
    elif kind_ext == "dask-dataframe":
        df_ext = dd.from_pandas(df_ext.to_pandas(), npartitions=2)
    elif kind_ext == "dask-cudf":
        df_ext = dask_cudf.from_cudf(df_ext, npartitions=2)
    elif kind_ext == "dataset":
        df_ext = nvt.Dataset(df_ext)

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnSelector(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )

    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf, cpu=cpu)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert (new_gdf["id"] + shift).all() == new_gdf["new_col"].all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns
def test_workflow_fit_op_rename(tmpdir, dataset, engine):
    # Tag every column as a USER feature so it can be selected by tag below
    schema = dataset.schema
    for name in schema.column_names:
        dataset.schema.column_schemas[name] = dataset.schema.column_schemas[name].with_tags(
            [nvt.graph.tags.Tags.USER]
        )
    selector = nvt.ColumnSelector(tags=[nvt.graph.tags.Tags.USER])

    workflow_ops_1 = selector >> nvt.ops.Rename(postfix="_1")
    workflow_1 = nvt.Workflow(workflow_ops_1)
    workflow_1.fit(dataset)
    workflow_1.save(str(tmpdir / "one"))

    new_dataset = workflow_1.transform(dataset).to_ddf().compute()
    assert len(new_dataset.columns) > 0
    assert all("_1" in col for col in new_dataset.columns)
def test_numeric_dtypes(tmpdir, output_model):
    if output_model == "pytorch":
        model_info = dict()
    else:
        model_info = None

    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype,
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = _make_df(
        {dtype: np.array([limits.max, 0, limits.min], dtype=dtype) for dtype, limits in dtypes}
    )
    features = nvt.ColumnSelector(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(
        tmpdir, workflow, df, "test_numeric_dtypes", output_model, model_info
    )
def fit(self, dataset: Dataset) -> "Workflow":
    """Calculates statistics for this workflow on the input dataset

    Parameters
    ----------
    dataset: Dataset
        The input dataset to calculate statistics for. If there is a train/test split,
        this data should be the training dataset only.
    """
    self._clear_worker_cache()

    if not self.output_schema:
        self.fit_schema(dataset.schema)

    ddf = dataset.to_ddf(columns=self._input_columns())

    # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
    # StatOperators (having StatOperators that depend on the output of other StatOperators
    # means that we will have multiple phases in the fit cycle here)
    stat_ops = {
        op: _get_stat_ops(op.parents_with_dependencies)
        for op in _get_stat_ops([self.output_node])
    }

    while stat_ops:
        # get all the StatOperators that we can currently call fit on (no outstanding
        # dependencies)
        current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
        if not current_phase:
            # this shouldn't happen, but let's not infinite loop just in case
            raise RuntimeError("failed to find dependency-free StatOperator to fit")

        stats, ops = [], []
        for workflow_node in current_phase:
            # Check for additional input columns that aren't generated by parents
            addl_input_cols = set()
            if workflow_node.parents:
                upstream_output_cols = sum(
                    [
                        upstream.output_columns
                        for upstream in workflow_node.parents_with_dependencies
                    ],
                    nvtabular.ColumnSelector(),
                )
                addl_input_cols = set(workflow_node.input_columns.names) - set(
                    upstream_output_cols.names
                )

            # apply transforms necessary for the inputs to the current column group,
            # ignoring the transforms from the statop itself
            transformed_ddf = _ensure_optimize_dataframe_graph(
                ddf=_transform_ddf(
                    ddf,
                    workflow_node.parents_with_dependencies,
                    additional_columns=addl_input_cols,
                )
            )

            op = workflow_node.op
            try:
                stats.append(op.fit(workflow_node.input_columns, transformed_ddf))
                ops.append(op)
            except Exception:
                LOG.exception("Failed to fit operator %s", workflow_node.op)
                raise

        if self.client:
            results = [r.result() for r in self.client.compute(stats)]
        else:
            results = dask.compute(stats, scheduler="synchronous")[0]

        for computed_stats, op in zip(results, ops):
            op.fit_finalize(computed_stats)

        # Remove all the operators we processed in this phase, and remove
        # them from the dependencies of other ops too
        for stat_op in current_phase:
            stat_ops.pop(stat_op)
        for dependencies in stat_ops.values():
            dependencies.difference_update(current_phase)

    # hack: store input/output dtypes here. We should have complete dtype
    # information for each operator (like we do for column names), but as
    # an interim solution this gets us what we need.
    input_dtypes = dataset.to_ddf()[self._input_columns()].dtypes
    self.input_dtypes = dict(zip(input_dtypes.index, input_dtypes))
    output_dtypes = self.transform(dataset).sample_dtypes()
    self.output_dtypes = dict(zip(output_dtypes.index, output_dtypes))

    self._zero_output_schemas()
    self.fit_schema(dataset.schema)
    return self
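# For context: a minimal, hypothetical usage sketch of Workflow.fit. The column names
# and parquet path below are illustrative assumptions, not taken from the code above;
# only the API calls (nvt.Workflow, nvt.Dataset, fit, transform) come from this section.

import nvtabular as nvt

# Build a simple pipeline, fit it on a training dataset, then apply it lazily.
cat_features = nvt.ColumnSelector(["userId", "movieId"]) >> nvt.ops.Categorify()
workflow = nvt.Workflow(cat_features)

train_dataset = nvt.Dataset("train.parquet")  # assumed local file, for illustration only
workflow.fit(train_dataset)                   # computes statistics for each StatOperator
transformed_ddf = workflow.transform(train_dataset).to_ddf()  # applies the fitted transforms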
def test_training():
    # Download & Convert data
    download_file(
        "http://files.grouplens.org/datasets/movielens/ml-25m.zip",
        os.path.join(DATA_DIR, "ml-25m.zip"),
    )

    ratings = cudf.read_csv(os.path.join(DATA_DIR, "ml-25m", "ratings.csv"))
    ratings["new_cat1"] = ratings["userId"] / ratings["movieId"]
    ratings["new_cat1"] = ratings["new_cat1"].astype("int64")
    ratings.head()

    ratings = ratings.drop("timestamp", axis=1)
    train, valid = train_test_split(ratings, test_size=0.2, random_state=42)
    train.to_parquet(DATA_DIR + "train.parquet")
    valid.to_parquet(DATA_DIR + "valid.parquet")

    del train
    del valid
    gc.collect()

    # Perform ETL with NVTabular
    cat_features = CATEGORICAL_COLUMNS >> nvt.ops.Categorify(cat_cache="device")
    ratings = nvt.ColumnSelector(["rating"]) >> nvt.ops.LambdaOp(
        lambda col: (col > 3).astype("int8")
    )
    output = cat_features + ratings
    workflow = nvt.Workflow(output)

    train_dataset = nvt.Dataset(DATA_DIR + "train.parquet", part_size="100MB")
    valid_dataset = nvt.Dataset(DATA_DIR + "valid.parquet", part_size="100MB")
    workflow.fit(train_dataset)

    dict_dtypes = {}
    for col in CATEGORICAL_COLUMNS:
        dict_dtypes[col] = np.int64
    for col in LABEL_COLUMNS:
        dict_dtypes[col] = np.float32

    if path.exists(DATA_DIR + "train"):
        shutil.rmtree(os.path.join(DATA_DIR, "train"))
    if path.exists(DATA_DIR + "valid"):
        shutil.rmtree(os.path.join(DATA_DIR, "valid"))

    workflow.transform(train_dataset).to_parquet(
        output_path=DATA_DIR + "train/",
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=DATA_DIR + "valid/",
        shuffle=False,
        cats=CATEGORICAL_COLUMNS,
        labels=LABEL_COLUMNS,
        dtypes=dict_dtypes,
    )

    # Train with HugeCTR
    embeddings = get_embedding_sizes(workflow)
    total_cardinality = 0
    slot_sizes = []
    for column in CATEGORICAL_COLUMNS:
        slot_sizes.append(embeddings[column][0])
        total_cardinality += embeddings[column][0]

    test_data_path = DATA_DIR + "test/"
    if path.exists(test_data_path):
        shutil.rmtree(test_data_path)
    os.mkdir(test_data_path)

    if path.exists(MODEL_DIR):
        shutil.rmtree(MODEL_DIR)
    os.makedirs(TRAIN_DIR)

    sample_data = cudf.read_parquet(DATA_DIR + "valid.parquet", num_rows=TEST_N_ROWS)
    sample_data.to_csv(test_data_path + "data.csv")
    sample_data_trans = nvt.workflow._transform_partition(sample_data, [workflow.output_node])
    dense_features, embedding_columns, row_ptrs = _convert(sample_data_trans, slot_sizes)

    _run_model(slot_sizes, total_cardinality)

    if path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.mkdir(TEMP_DIR)

    file_names = glob.iglob(os.path.join(os.getcwd(), "*.model"))
    for files in file_names:
        shutil.move(files, TEMP_DIR)

    hugectr_params = dict()
    hugectr_params["config"] = NETWORK_FILE
    hugectr_params["slots"] = len(slot_sizes)
    hugectr_params["max_nnz"] = len(slot_sizes)
    hugectr_params["embedding_vector_size"] = 16
    hugectr_params["n_outputs"] = 1

    export_hugectr_ensemble(
        workflow=workflow,
        hugectr_model_path=TEMP_DIR,
        hugectr_params=hugectr_params,
        name=MODEL_NAME,
        output_path=MODEL_DIR,
        label_columns=["rating"],
        cats=CATEGORICAL_COLUMNS,
        max_batch_size=64,
    )
    shutil.rmtree(TEMP_DIR)

    _predict(dense_features, embedding_columns, row_ptrs, hugectr_params["config"], MODEL_NAME)
def __rrshift__(self, other):
    return nvt.ColumnSelector(other) >> self
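# The reflected-shift overload above is what lets a plain Python list (or other column
# specification) appear on the left-hand side of ">>": the left operand is wrapped in a
# ColumnSelector before being piped into the operator. A minimal sketch follows; the
# column names are illustrative assumptions, not taken from the code above.

import nvtabular as nvt

# These two pipelines are equivalent; the first relies on __rrshift__ to wrap the list.
pipeline_a = ["userId", "movieId"] >> nvt.ops.Categorify()
pipeline_b = nvt.ColumnSelector(["userId", "movieId"]) >> nvt.ops.Categorify()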
def test_horovod_multigpu(tmpdir):
    json_sample = {
        "conts": {},
        "cats": {
            "genres": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "movieId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
            "userId": {"dtype": None, "cardinality": 500, "min_entry_size": 1, "max_entry_size": 5},
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)

    # process them
    cat_features = nvt.ColumnSelector(["userId", "movieId", "genres"]) >> nvt.ops.Categorify()
    ratings = nvt.ColumnSelector(["rating"]) >> nvt.ops.LambdaOp(
        lambda col: (col > 3).astype("int8")
    )
    output = cat_features + ratings
    proc = nvt.Workflow(output)

    target_path_train = os.path.join(tmpdir, "train/")
    os.mkdir(target_path_train)
    proc.fit_transform(nvt.Dataset(df_files)).to_parquet(
        output_path=target_path_train, out_files_per_proc=5
    )

    # save the workflow to a new location
    target_path = os.path.join(tmpdir, "workflow/")
    os.mkdir(target_path)
    proc.save(target_path)

    curr_path = os.path.abspath(__file__)
    repo_root = os.path.relpath(os.path.normpath(os.path.join(curr_path, "../../../..")))
    hvd_wrap_path = os.path.join(repo_root, "examples/multi-gpu-movielens/hvd_wrapper.sh")
    hvd_exam_path = os.path.join(repo_root, "examples/multi-gpu-movielens/tf_trainer.py")

    with subprocess.Popen(
        [
            "horovodrun",
            "-np",
            "2",
            "-H",
            "localhost:2",
            "sh",
            hvd_wrap_path,
            "python",
            hvd_exam_path,
            "--dir_in",
            f"{tmpdir}",
            "--batch_size",
            "1024",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    ) as process:
        process.wait()
        stdout, stderr = process.communicate()
        print(stdout, stderr)
        assert "Loss:" in str(stdout)