def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    """Fit a FillMissing/Clip/Normalize + Categorify workflow and validate it.

    The fitted Normalize statistics are compared against values recomputed
    from ``df``, the fitted category lists are compared against the unique
    values of each categorical column, and the parquet output written by
    ``transform`` is checked for integer-encoded categoricals and a matching
    row count. When ``dump`` is truthy the workflow is saved and reloaded
    before the checks run.
    """
    # "name-cat" only takes part in the workflow for the parquet engine.
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)
    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        # Release the fitted workflow before loading it back from disk.
        workflow = None
        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: nulls become 0, negative values are zeroed.
        filled = tar.fillna(0)
        return filled * (filled >= 0).astype("int")

    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).mean(), norms.means[col], rel_tol=1e-4)
    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).std(), norms.stds[col], rel_tol=1e-3)

    def check_categories(col):
        # Fitted categories are expected to be a leading None entry followed
        # by the unique values of the raw column.
        expected = df[col].unique().values_host
        fitted = get_cats(workflow, col)
        assert fitted.tolist() == [None] + expected.tolist()

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    transformed = workflow.transform(dataset)
    transformed.to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, _, _ = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    """Collect CategoryStatistics via a config-built workflow and check categories.

    "name-string" is always verified; "name-cat" is verified only for the
    parquet engine when ``op_columns`` is falsy.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    encoder = ops.CategoryStatistics(columns=op_columns)
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )
    processor.update_stats(dataset)

    def check_categories(col):
        # Collected categories: a leading None entry plus the raw unique values.
        expected = df[col].unique().values_host
        collected = get_cats(processor, col)
        assert collected.tolist() == [None] + expected.tolist()

    if engine == "parquet" and not op_columns:
        check_categories("name-cat")
    check_categories("name-string")
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    """Fit a FillMissing/LogOp/Normalize + Categorify workflow with a client.

    With ``replace`` truthy the continuous columns are transformed in place;
    otherwise the transformed columns are renamed with the
    ``_FillMissing_1_LogOp_1`` postfix and normalized together with the raw
    columns. Normalize statistics, fitted categories and the written parquet
    output are then checked. When ``dump`` is truthy the workflow is saved and
    reloaded first.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify()
    if replace:
        cont_features = cont_names >> ops.FillMissing() >> ops.LogOp >> norms
    else:
        renamed = (
            cont_names
            >> ops.FillMissing()
            >> ops.LogOp
            >> ops.Rename(postfix="_FillMissing_1_LogOp_1")
        )
        # "+" binds tighter than ">>": the raw and renamed columns are
        # combined first, and the combined group is passed to Normalize.
        cont_features = (cont_names + renamed) >> norms

    workflow = Workflow(cat_features + cont_features + label_name, client=client)
    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        # Release the fitted workflow before loading it back from disk.
        workflow = None
        workflow = Workflow.load(workflow_dir, client=client)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: fill nulls with the median, then log(x + 1).
        median = tar.dropna().quantile(0.5, interpolation="linear")
        return np.log(tar.fillna(median) + 1)

    # Check mean and std - loose tolerance; the reference only approximates
    # the operator chain.
    concat_ops = "" if replace else "_FillMissing_1_LogOp_1"
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).mean(), norms.means[col + concat_ops], rel_tol=1e-1
        )
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).std(), norms.stds[col + concat_ops], rel_tol=1e-1
        )

    def check_categories(col):
        # Fitted categories: a leading None entry plus the raw unique values.
        expected = df[col].unique().values_host
        fitted = get_cats(workflow, col)
        assert fitted.tolist() == [None] + expected.tolist()

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    transformed = workflow.transform(dataset)
    transformed.to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, _, _ = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_cpu_workflow(tmpdir, df, dataset, cpu, engine, dump):
    """Run the FillMissing/Clip/Normalize + Categorify workflow on pandas data.

    ``df`` is converted to pandas when it is a cudf DataFrame, and
    ``dataset.to_cpu()`` is called when ``cpu`` is truthy. The fitted Normalize
    statistics, the fitted categories and the written parquet output are then
    checked. When ``dump`` is truthy the workflow is saved and reloaded first.
    """
    # Make sure we are in cpu formats
    if cudf and isinstance(df, cudf.DataFrame):
        df = df.to_pandas()
    if cpu:
        dataset.to_cpu()

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)
    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        # Release the fitted workflow before loading it back from disk.
        workflow = None
        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: pd.Series):
        # Reference preprocessing: nulls become 0, negative values are zeroed.
        filled = tar.fillna(0)
        return filled * (filled >= 0).astype("int")

    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).mean(), norms.means[col], rel_tol=1e-4)
    for col in ("x", "y"):
        assert math.isclose(get_norms(df[col]).std(), norms.stds[col], rel_tol=1e-3)

    def check_categories(col):
        # Fitted categories: a leading None entry plus the sorted unique values.
        expected = df[col].unique()
        fitted = get_cats(workflow, col, cpu=True)
        assert fitted.tolist() == [None] + sorted(expected.tolist())

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    transformed = workflow.transform(dataset)
    transformed.to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, cpu=cpu)
    df_pp = pd.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    metadata = pq.read_metadata(str(tmpdir) + "/_metadata")
    assert metadata.num_rows == len(df_pp)
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine, preprocessing):
    """Exercise the processor API end to end, including the torch data loaders.

    Builds a workflow with FillMedian + LogOp features and Normalize +
    Categorify preprocessing, collects statistics (optionally round-tripping
    them through a YAML file when ``dump`` is truthy), checks means, stds,
    medians and categories, writes the processed dataset, and finally checks
    that the row counts seen through the file-based and tensor-based torch
    iterators agree.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)
    processor.add_feature([ops.FillMedian(), ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()
    processor.update_stats(dataset)

    if dump:
        # Round-trip the collected statistics through a file.
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: fill nulls with the median, then log(x + 1).
        median = tar.dropna().quantile(0.5, interpolation="linear")
        return np.log(tar.fillna(median) + 1)

    # Check mean and std - loose tolerance; the reference only approximates
    # the operator chain. Without preprocessing the stats are keyed by the
    # "_LogOp"-suffixed column names.
    stat_suffix = "" if preprocessing else "_LogOp"
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).mean(),
            processor.stats["means"][col + stat_suffix],
            rel_tol=1e-2,
        )
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).std(),
            processor.stats["stds"][col + stat_suffix],
            rel_tol=1e-2,
        )

    # Check median (TODO: Improve the accuracy)
    raw_medians = {
        col: df[col].dropna().quantile(0.5, interpolation="linear")
        for col in ("x", "y", "id")
    }
    for col, median in raw_medians.items():
        assert math.isclose(median, processor.stats["medians"][col], rel_tol=1e1)

    def check_categories(col):
        # Collected categories: a leading None entry plus the raw unique values.
        expected = df[col].unique().values_host
        collected = get_cats(processor, col)
        assert collected.tolist() == [None] + expected.tolist()

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
    processor.create_final_cols()

    # Without preprocessing, each continuous column should have a "_LogOp"
    # counterpart among the final continuous columns.
    if not preprocessing:
        final_conts = processor.columns_ctx["final"]["cols"]["continuous"]
        for col in cont_names:
            assert f"{col}_LogOp" in final_conts

    # Count rows by iterating the written files through the torch loader.
    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(
            path,
            use_row_groups=True,
            gpu_memory_frac=gpu_memory_frac,
            names=allcols_csv,
        )
        for path in glob.glob(str(tmpdir) + "/*.parquet")
    ]
    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(
        data_itr,
        collate_fn=dlc.gdf_col,
        pin_memory=False,
        num_workers=0,
    )
    len_df_pp = sum(len(chunk[0][0]) for chunk in dl)

    # Load the same files as tensors and compare the row counts.
    written = Dataset(glob.glob(str(tmpdir) + "/*.parquet"), part_mem_fraction=gpu_memory_frac)
    tensors = processor.ds_to_tensors(written.to_iter(), apply_ops=False)

    # The metadata values are not inspected; the read is kept as in the
    # original test.
    _, _, _ = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert len(tensors[0]) == len_df_pp

    itr_ds = torch_dataloader.TensorItrDataset(
        [tensors[0], tensors[1], tensors[2]], batch_size=512000
    )
    count_tens_itr = 0
    for batch in itr_ds:
        count_tens_itr += len(batch[1])
        assert batch[0].shape[1] > 0
        assert batch[1].shape[1] > 0

    assert len_df_pp == count_tens_itr
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump,
                          op_columns, use_client):
    """Build a ZeroFill/LogOp/Normalize/Categorify workflow via the processor API.

    ``client`` is only handed to the workflow when ``use_client`` is truthy.
    The "x" statistics are always checked; the "y" statistics only when
    ``op_columns`` is falsy. Categories and the written parquet output are
    checked as well. When ``dump`` is truthy the collected statistics are
    round-tripped through a YAML file first.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        client=client if use_client else None,
    )
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()
    processor.update_stats(dataset)

    if dump:
        # Round-trip the collected statistics through a file.
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: nulls -> 0, negatives zeroed, then log(x + 1).
        filled = tar.fillna(0)
        clipped = filled * (filled >= 0).astype("int")
        return np.log(clipped + 1)

    def check_stats(col):
        reference = get_norms(df[col])
        assert math.isclose(reference.mean(), processor.stats["means"][col], rel_tol=1e-1)
        reference = get_norms(df[col])
        assert math.isclose(reference.std(), processor.stats["stds"][col], rel_tol=1e-1)

    # Check mean and std - loose tolerance; the reference only approximates
    # the operator chain.
    if not op_columns:
        check_stats("y")
    check_stats("x")

    def check_categories(col):
        # Collected categories: a leading None entry plus the raw unique values.
        expected = df[col].unique().values_host
        collected = get_cats(processor, col)
        assert collected.tolist() == [None] + expected.tolist()

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir,
        dataset,
        out_files_per_proc=10,
        shuffle="partial",
        apply_ops=True,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, _, _ = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_gpu_workflow_config(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, replace):
    """Build a workflow from a config dict with dependent operators and validate it.

    The config chains FillMissing -> LogOp as feature engineering and
    LogOp -> Normalize as preprocessing, all created with ``replace``. With
    ``replace`` falsy the statistics are looked up under the
    ``_FillMissing_LogOp``-suffixed column names. Categories and the written
    parquet output are checked as well. When ``dump`` is truthy the collected
    statistics are round-tripped through a YAML file first.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    feature_chain = [ops.FillMissing(replace=replace), ops.LogOp(replace=replace)]
    config["FE"]["continuous"] = [feature_chain]
    preproc_chain = [ops.LogOp(replace=replace), ops.Normalize()]
    config["PP"]["continuous"] = [preproc_chain]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )
    processor.update_stats(dataset)

    if dump:
        # Round-trip the collected statistics through a file.
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: fill nulls with the median, then log(x + 1).
        median = tar.dropna().quantile(0.5, interpolation="linear")
        return np.log(tar.fillna(median) + 1)

    # Check mean and std - loose tolerance; the reference only approximates
    # the operator chain.
    concat_ops = "" if replace else "_FillMissing_LogOp"
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).mean(),
            processor.stats["means"][col + concat_ops],
            rel_tol=1e-1,
        )
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).std(),
            processor.stats["stds"][col + concat_ops],
            rel_tol=1e-1,
        )

    def check_categories(col):
        # Collected categories: a leading None entry plus the raw unique values.
        expected = df[col].unique().values_host
        collected = get_cats(processor, col)
        assert collected.tolist() == [None] + expected.tolist()

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir,
        dataset,
        out_files_per_proc=10,
        shuffle="partial",
        apply_ops=True,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, _, _ = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump):
    """Build a ZeroFill/Normalize/Categorify workflow from a config dict and validate it.

    Checks the collected means and stds for "x" and "y", the collected
    categories, and the parquet output written by ``write_to_dataset``. When
    ``dump`` is truthy the collected statistics are round-tripped through a
    YAML file first.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )
    processor.update_stats(dataset)

    if dump:
        # Round-trip the collected statistics through a file.
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        # Reference preprocessing: nulls become 0, negative values are zeroed.
        filled = tar.fillna(0)
        return filled * (filled >= 0).astype("int")

    # NOTE: only "x" and "y" are verified; the "id" statistics are not checked.
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).mean(), processor.stats["means"][col], rel_tol=1e-4
        )
    for col in ("x", "y"):
        assert math.isclose(
            get_norms(df[col]).std(), processor.stats["stds"][col], rel_tol=1e-3
        )

    def check_categories(col):
        # Collected categories: a leading None entry plus the raw unique values.
        expected = df[col].unique().values_host
        collected = get_cats(processor, col)
        assert collected.tolist() == [None] + expected.tolist()

    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(
        tmpdir,
        dataset,
        out_files_per_proc=10,
        shuffle="partial",
        apply_ops=True,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, _, _ = cudf.io.read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine, dump, use_client):
    """Fit a Categorify + FillMissing/Clip/LogOp/Normalize workflow and validate it.

    ``client`` is only handed to the workflow (and to ``Workflow.load``) when
    ``use_client`` is truthy. The fitted Normalize statistics are compared
    against values recomputed from ``df``, the fitted categories are compared
    (order-insensitively) against the unique values of each categorical
    column, and the parquet output written by ``transform`` is checked for
    integer-encoded categoricals and a matching row count. When ``dump`` is
    truthy the workflow is saved and reloaded before the checks run.
    """
    # "name-cat" only takes part in the workflow for the parquet engine.
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]
    workflow_client = client if use_client else None

    norms = ops.Normalize()
    cat_features = cat_names >> ops.Categorify(cat_cache="host")
    cont_features = (
        cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp >> norms
    )

    workflow = Workflow(cat_features + cont_features + label_name, client=workflow_client)
    workflow.fit(dataset)

    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        # Release the fitted workflow before loading it back from disk.
        workflow = None
        workflow = Workflow.load(workflow_dir, client=workflow_client)

    def get_norms(tar):
        # Reference preprocessing: nulls -> 0, negatives zeroed, then log(x + 1).
        filled = tar.fillna(0)
        clipped = filled * (filled >= 0).astype("int")
        return np.log(clipped + 1)

    # Check mean and std - loose tolerance; the reference only approximates
    # the operator chain.
    for col in ("y", "x"):
        assert math.isclose(get_norms(df[col]).mean(), norms.means[col], rel_tol=1e-1)
        assert math.isclose(get_norms(df[col]).std(), norms.stds[col], rel_tol=1e-1)

    def check_categories(col):
        # With a GPU the unique values are copied to host first; otherwise the
        # result of unique() is used directly.
        unique_vals = df[col].unique()
        expected = (unique_vals.values_host if HAS_GPU else unique_vals).tolist()
        fitted = get_cats(workflow, col).tolist()
        # The fitted categories must be drawn from None plus the raw unique
        # values and have exactly one more entry than the unique values. The
        # allowed list is built once rather than once per element.
        allowed = [None] + sorted(expected)
        assert all(cat in allowed for cat in fitted)
        assert len(fitted) == len(expected) + 1

    # Check that categories match
    if engine == "parquet":
        check_categories("name-cat")
    check_categories("name-string")

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
    )

    written_files = glob.glob(str(tmpdir) + "/*.parquet")
    dataset_2 = Dataset(written_files, part_mem_fraction=gpu_memory_frac)
    df_pp = nvt.dispatch._concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    # Only the row count is used from the metadata.
    num_rows, _, _ = nvt.dispatch._read_parquet_metadata(str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)