def test_inspect_datagen(tmpdir, datasets, engine, dist):
    # Dataset
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Dataset columns type config
    columns_dict = {}
    columns_dict["cats"] = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    columns_dict["conts"] = ["x", "y"]
    columns_dict["labels"] = ["label"]

    # Create inspector and inspect
    output_inspect1 = tmpdir + "/dataset_info1.json"
    dataset = Dataset(paths, engine=engine)
    a = datains.DatasetInspector()
    a.inspect(dataset, columns_dict, output_inspect1)
    assert os.path.isfile(output_inspect1)

    # Generate dataset using data_gen tool
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(output_inspect1) as f:
        output1 = json.load(f)
    cols = datagen._get_cols_from_schema(output1)
    if dist == "uniform":
        df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    else:
        df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1), gpu_frac=0.00001)

    output_datagen_files = df_gen.full_df_create(
        output1["num_rows"], cols, entries=True, output=output_datagen
    )

    # Inspect the generated dataset and check that the outputs are the same
    output_inspect2 = tmpdir + "/dataset_info2.json"
    dataset = Dataset(output_datagen_files, engine=engine)
    a.inspect(dataset, columns_dict, output_inspect2)
    assert os.path.isfile(output_inspect2)

    # Compare json outputs
    with fsspec.open(output_inspect2) as f:
        output2 = json.load(f)
    for k1 in output1.keys():
        if k1 == "num_rows":
            assert output1[k1] == output2[k1]
        else:
            for k2 in output1[k1].keys():
                for k3 in output1[k1][k2].keys():
                    if k3 == "dtype":
                        # Object (string) columns may be regenerated as int64 entries
                        if output1[k1][k2][k3] == "object":
                            assert (
                                output1[k1][k2][k3] == output2[k1][k2][k3]
                                or output2[k1][k2][k3] == "int64"
                            )
                        else:
                            assert output1[k1][k2][k3] == output2[k1][k2][k3]
                    else:
                        assert output1[k1][k2][k3] == pytest.approx(
                            output2[k1][k2][k3], rel=1e-0, abs=1e-0
                        )
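
# The inspect -> regenerate round trip above can be packaged as a small standalone helper.
# This is a sketch only: it assumes the same module-level imports the tests use
# (os, json, fsspec, Dataset, datains, datagen) and reuses only the APIs exercised above
# (DatasetInspector.inspect, _get_cols_from_schema, DatasetGen.full_df_create). The helper
# name and its arguments are hypothetical, not part of NVTabular.
def _regenerate_from_inspection(input_paths, columns_dict, workdir, engine="parquet"):
    # Inspect the source dataset and write its schema/statistics to JSON
    schema_path = os.path.join(str(workdir), "schema.json")
    datains.DatasetInspector().inspect(Dataset(input_paths, engine=engine), columns_dict, schema_path)

    # Load the schema back and turn it into datagen column specs
    with fsspec.open(schema_path) as f:
        schema = json.load(f)
    cols = datagen._get_cols_from_schema(schema)

    # Generate a synthetic dataset with the same row count and column statistics
    out_dir = os.path.join(str(workdir), "generated")
    os.mkdir(out_dir)
    gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    return gen.full_df_create(schema["num_rows"], cols, entries=True, output=out_dir)
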
def test_powerlaw(num_rows, distro):
    cats = list(json_sample["cats"].keys())[1:]

    cols = datagen._get_cols_from_schema(json_sample, distros=distro)
    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    df_pw = cudf.DataFrame()
    for x in range(10):
        df_pw_1 = df_gen.create_df(num_rows, cols)
        df_pw = cudf.concat([df_pw, df_pw_1], axis=0)
    sts, ps = df_gen.verify_df(df_pw[cats])
    assert all(s > 0.9 for s in sts)
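
# test_powerlaw concatenates ten generated batches before calling verify_df, so the fit
# statistics are computed over a larger sample. A minimal sketch of that generate-and-verify
# pattern, assuming the module-level `json_sample` schema and the datagen/cudf names used
# above; the helper name, default arguments, and threshold are illustrative only.
def _powerlaw_fit_ok(num_rows=10000, alpha=0.1, batches=10, threshold=0.9):
    cats = list(json_sample["cats"].keys())[1:]
    cols = datagen._get_cols_from_schema(json_sample)
    gen = datagen.DatasetGen(datagen.PowerLawDistro(alpha))

    # Accumulate several batches so verify_df sees enough samples per category
    df = cudf.concat([gen.create_df(num_rows, cols) for _ in range(batches)], axis=0)

    # verify_df returns per-column fit statistics and p-values (as unpacked in the test)
    sts, _ps = gen.verify_df(df[cats])
    return all(s > threshold for s in sts)
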
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names, label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())

    # Test https://github.com/NVIDIA/NVTabular/issues/149, making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value
    )

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var**2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0
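
# The workflow built in test_empty_cols follows an "optional feature groups" pattern: each
# column group contributes an op chain only when it is non-empty, and the chains are summed
# onto a WorkflowNode that carries the labels, so the graph stays valid even when cats or
# conts are empty. A minimal sketch of just that pattern, assuming the nvt/ops names used
# above; the helper name is hypothetical.
def _build_optional_workflow(cat_names, cont_names, label_name):
    features = []
    if cont_names:
        # Continuous columns: impute with the median, then normalize
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        # Categorical columns: encode to contiguous integer ids
        features.append(cat_names >> ops.Categorify())

    # sum() folds the (possibly empty) list of feature chains onto the label node
    return nvt.Workflow(sum(features, nvt.WorkflowNode(label_name)))
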