def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    # first with no continuous columns
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_feature(ops.Normalize())
    processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
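
The example above uses the older NVTabular Workflow API (cat_names/cont_names/label_name plus add_feature). As a point of comparison, here is a minimal sketch of the same FillMedian → Normalize → Categorify preprocessing written against the graph-style API that several of the later examples use; the column names are illustrative stand-ins only.

import nvtabular as nvt
from nvtabular import ops

# illustrative column names (stand-ins for whatever the dataset provides)
cat_names = ["name-string"]
cont_names = ["x", "y"]
label_name = ["label"]

# continuous columns: fill nulls with the column median, then standardize
conts = cont_names >> ops.FillMedian() >> ops.Normalize()
# categorical columns: encode values as contiguous integer ids
cats = cat_names >> ops.Categorify()

# combine the feature groups and the label column into one workflow graph
workflow = nvt.Workflow(conts + cats + label_name)
# workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute() would then yield
# the processed frame, as in the graph-style examples below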
Example #2
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names, label_name):

    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names:
        features.append(cat_names >> ops.Categorify())

    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.ColumnGroup(label_name))
    if not graph.columns:
        # if we don't have conts/cats/labels we're done
        return

    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name, batch_size=1
    )

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
def test_gpu_dl_break(tmpdir, df, dataset, batch_size, part_mem_fraction,
                      engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        device=device,
    )
    len_dl = len(data_itr) - 1

    first_chunk = 0
    idx = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk = len(chunk[0])
        last_chk = len(chunk[0])
        print(last_chk)
        if idx == 1:
            break
        del chunk

    assert idx < len_dl

    first_chunk_2 = 0
    for idx, chunk in enumerate(data_itr):
        if idx == 0:
            first_chunk_2 = len(chunk[0])
        del chunk
    assert idx == len_dl

    assert first_chunk == first_chunk_2
Example #4
def test_kill_dl(tmpdir, df, dataset, part_mem_fraction, engine):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data, cats=cat_names, conts=cont_names, labels=["label"]
    )

    results = {}

    for batch_size in [2 ** i for i in range(9, 25, 1)]:
        print("Checking batch size: ", batch_size)
        num_iter = max(10 * 1000 * 1000 // batch_size, 100)  # load ~1e7 samples
        data_itr.batch_size = batch_size
        start = time.time()
        for i, data in enumerate(data_itr):
            if i >= num_iter:
                break
            del data

        stop = time.time()

        throughput = i * batch_size / (stop - start)
        results[batch_size] = throughput
        print(
            "batch size: ",
            batch_size,
            ", throughput: ",
            throughput,
            "items",
            i * batch_size,
            "time",
            stop - start,
        )
Example #5
def test_gpu_dl(tmpdir, df, dataset, batch_size, gpu_memory_frac, engine):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name,)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk)
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example #6
def test_fit_simple():
    data = cudf.DataFrame({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    assert_eq(expected, transformed)
def test_fit_simple():
    data = nvt.dispatch._make_df({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)
    assert_eq(expected, transformed)
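
The two test_fit_simple variants above expect the same output frame because FillMedian imputes nulls with the per-column median of the non-null values (1 for "x", 4 for "y") before the lambda squares every element. A small pandas-only sketch of that arithmetic, independent of NVTabular:

import pandas as pd

data = pd.DataFrame({
    "x": [0, 1, 2, None, 0, 1, 2],
    "y": [None, 3, 4, 5, 3, 4, 5],
})

# FillMedian: replace nulls with the per-column median of the non-null values
filled = data.fillna(data.median())
# the lambda op in the workflow then squares every value
squared = filled * filled

print(squared["x"].tolist())  # [0.0, 1.0, 4.0, 1.0, 0.0, 1.0, 4.0]
print(squared["y"].tolist())  # [16.0, 9.0, 16.0, 25.0, 9.0, 16.0, 25.0]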
def test_dataloader_schema(tmpdir, df, dataset, batch_size, engine, device):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths, engine="parquet")

    data_loader = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        shuffle=False,
        labels=label_name,
    )

    batch = next(iter(data_loader))
    assert all(name in batch[0] for name in cat_names)
    assert all(name in batch[0] for name in cont_names)

    num_label_cols = batch[1].shape[1] if len(batch[1].shape) > 1 else 1
    assert num_label_cols == len(label_name)
Example #9
def test_tf_gpu_dl(tmpdir, paths, use_paths, dataset, batch_size,
                   gpu_memory_frac, engine):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    workflow = nvt.Workflow(conts + cats + label_name)
    workflow.fit(dataset)
    workflow.transform(dataset).to_parquet(tmpdir + "/processed")

    data_itr = tf_dataloader.KerasSequenceLoader(
        str(tmpdir + "/processed"),  # workflow.transform(dataset),
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_names=label_name,
        engine=engine,
        shuffle=False,
    )
    _ = tf.random.uniform((1, ))

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        if num_samples != batch_size:
            try:
                next(data_itr)
            except StopIteration:
                rows += num_samples
                continue
            else:
                raise ValueError("Batch size too small at idx {}".format(idx))

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0
        rows += num_samples

    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)

    # if num_samples equals batch_size, the iterator was never exhausted,
    # so its cleanup never ran. Exhaust it now.
    if num_samples == batch_size:
        try:
            next(data_itr)
        except StopIteration:
            pass
        else:
            raise ValueError
    assert not data_itr._working
    assert data_itr._batch_itr is None

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()

    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    data_itr.stop()
    assert not data_itr._working
    assert data_itr._batch_itr is None
Example #10
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine,
                devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    conts = cont_names >> ops.FillMedian() >> ops.Normalize()
    cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(conts + cats + label_name)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.fit_transform(dataset).to_parquet(
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0],
                           engine="parquet",
                           part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader

    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk
    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(data_itr,
                                         collate_fn=gen_col,
                                         pin_memory=False,
                                         num_workers=0)
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example #11
def test_tf_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string"]
    label_name = ["label"]
    if engine == "parquet":
        cat_names.append("name-cat")

    columns = cont_names + cat_names

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, to_cpu=True,
    )
    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = tf_dataloader.KerasSequenceDataset(
        paths,
        columns=columns,
        batch_size=batch_size,
        buffer_size=gpu_memory_frac,
        label_name=label_name[0],
        engine=engine,
        shuffle=False,
    )
    processor.update_stats(data_itr.nvt_dataset, record_stats=True)
    data_itr.map(processor)

    rows = 0
    for idx in range(len(data_itr)):
        X, y = next(data_itr)

        # first elements to check epoch-to-epoch consistency
        if idx == 0:
            X0, y0 = X, y

        # check that we have at most batch_size elements
        num_samples = y.shape[0]
        assert num_samples <= batch_size

        # check that all the features in X have the
        # appropriate length and that the set of
        # their names is exactly the set of names in
        # `columns`
        these_cols = columns.copy()
        for column, x in X.items():
            try:
                these_cols.remove(column)
            except ValueError:
                raise AssertionError
            assert x.shape[0] == num_samples
        assert len(these_cols) == 0

        rows += num_samples

    # check start of next epoch to ensure consistency
    X, y = next(data_itr)
    assert (y.numpy() == y0.numpy()).all()
    for column, x in X.items():
        x0 = X0.pop(column)
        assert (x.numpy() == x0.numpy()).all()
    assert len(X0) == 0

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == (60 * 24 * 3 + 1)
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names,
                    label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(
            ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var**2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = None
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0
Example #13
def test_gpu_preproc(tmpdir, df, dataset, dump, gpu_memory_frac, engine,
                     preprocessing):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_feature(
        [ops.FillMedian(),
         ops.LogOp(preprocessing=preprocessing)])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    x_col = "x" if preprocessing else "x_LogOp"
    y_col = "y" if preprocessing else "y_LogOp"
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"][x_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"][y_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"][x_col],
                        rel_tol=1e-2)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"][y_col],
                        rel_tol=1e-2)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    y_median = df.y.dropna().quantile(0.5, interpolation="linear")
    id_median = df.id.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
    assert math.isclose(id_median,
                        processor.stats["medians"]["id"],
                        rel_tol=1e1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    processor.create_final_cols()

    # if preprocessing
    if not preprocessing:
        for col in cont_names:
            assert f"{col}_LogOp" in processor.columns_ctx["final"]["cols"][
                "continuous"]

    dlc = torch_dataloader.DLCollator(preproc=processor, apply_ops=False)
    data_files = [
        torch_dataloader.FileItrDataset(x,
                                        use_row_groups=True,
                                        gpu_memory_frac=gpu_memory_frac,
                                        names=allcols_csv)
        for x in glob.glob(str(tmpdir) + "/*.parquet")
    ]

    data_itr = torch.utils.data.ChainDataset(data_files)
    dl = torch_dataloader.DLDataLoader(data_itr,
                                       collate_fn=dlc.gdf_col,
                                       pin_memory=False,
                                       num_workers=0)

    len_df_pp = 0
    for chunk in dl:
        len_df_pp += len(chunk[0][0])

    dataset = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                      part_mem_fraction=gpu_memory_frac)
    x = processor.ds_to_tensors(dataset.to_iter(), apply_ops=False)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert len(x[0]) == len_df_pp

    itr_ds = torch_dataloader.TensorItrDataset([x[0], x[1], x[2]],
                                               batch_size=512000)
    count_tens_itr = 0
    for data_gd in itr_ds:
        count_tens_itr += len(data_gd[1])
        assert data_gd[0].shape[1] > 0
        assert data_gd[1].shape[1] > 0

    assert len_df_pp == count_tens_itr
Example #14
def test_gpu_dl(tmpdir, df, dataset, batch_size, gpu_memory_frac, engine):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train)
        if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    columns = mycols_pq if engine == "parquet" else mycols_csv
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader
    for idx, chunk in enumerate(data_itr):
        assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk
    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = nvt.torch_dataloader.DLDataLoader(data_itr,
                                             collate_fn=gen_col,
                                             pin_memory=False,
                                             num_workers=0)
    rows = 0
    for idx, chunk in enumerate(t_dl):
        assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
    if os.path.exists(output_train):
        shutil.rmtree(output_train)
Example #15
    def processing(
        self,
        X_pd,
        y_names=[],
        encode_categor_type=None,
        #'categorify', 'onehotencoding',
        outliers_detection_technique=None,
        #'iqr_proximity_rule', 'gaussian_approximation','quantiles'
        fill_with_value=None,
        #'extreme_values', 'zeros','mean-median'
        targetencoding=False,
        file_path=None,
    ):
        X = dd.from_pandas(X_pd, npartitions=self.n_gpus)
        X = X.replace(np.nan, None)
        try:
            self.time_columns
        except AttributeError:
            try:
                self.initialize_types(
                    X,
                    n_unique_val_th=n_unique_val_th_,
                    categor_columns_keep=categor_columns_keep_,
                    numer_columns_keep=numer_columns_keep_)
            except NameError:
                self.initialize_types(X)

        workflow = nvt.Workflow(cat_names=self.categor_columns,
                                cont_names=self.numer_columns,
                                label_name=y_names,
                                client=self.client)
        # Operators: https://nvidia.github.io/NVTabular/main/api/ops/index.html
        # Categorify https://nvidia.github.io/NVTabular/main/api/ops/categorify.html
        if encode_categor_type == 'categorify':
            if len(self.categor_columns) != 0:
                workflow.add_preprocess(
                    ops.Categorify(columns=self.categor_columns,
                                   out_path='./'))

        if encode_categor_type == 'onehotencoding':
            #OneHotEncoder().get_feature_names(input_features=<list of features encoded>) does not work
            #lengths=True - chunk sizes can be computed
            for column in self.categor_columns:
                #X[column] = X[column].astype(str)
                X_cat_encoded = OneHotEncoder().fit_transform(
                    X[column].to_dask_array(lengths=True).reshape(-1, 1))
                uvs = X[column].unique().compute().values
                X = X.drop([column], axis=1)
                X_cat_encoded = dd.from_array(
                    X_cat_encoded.compute().todense())
                X_cat_encoded.columns = [
                    column + '_{}'.format(uv) for uv in uvs
                ]
                X = dd.concat([X, X_cat_encoded], axis=1)
                X = X.repartition(npartitions=2)
            for column in X.columns:
                if any(str(column)[-4:] == t
                       for t in ['_nan', 'None']):  # What else?
                    X = X.drop([column], axis=1)

            self.initialize_types(X)
            print('Retyping:', self.initialize_types(X))
            # Reinitialize workflow
            workflow = nvt.Workflow(cat_names=self.categor_columns,
                                    cont_names=self.numer_columns,
                                    label_name=y_names,
                                    client=self.client)

        # OutlDetect https://nvidia.github.io/NVTabular/main/api/ops/clip.html
        if (len(self.numer_columns) != 0) and (outliers_detection_technique is not None):
            lower, upper = self.outldetect(outliers_detection_technique,
                                           X[self.numer_columns])
            for i in range(len(self.numer_columns)):
                logging.info(
                    f'column: {self.numer_columns[i]}, lower: {lower[i]}, upper: {upper[i]}'
                )
                print(
                    f'column: {self.numer_columns[i]}, lower: {lower[i]}, upper: {upper[i]}'
                )
                workflow.add_preprocess(
                    ops.Clip(min_value=lower[i],
                             max_value=upper[i],
                             columns=[self.numer_columns[i]]))

        # FillMissing https://nvidia.github.io/NVTabular/main/api/ops/fillmissing.html
        if fill_with_value == 'zeros':
            workflow.add_preprocess(
                ops.FillMissing(fill_val=0,
                                columns=self.categor_columns +
                                self.numer_columns))

        if fill_with_value == 'extreme_values':
            extrim_values = {}
            if len(self.numer_columns) != 0:
                extrim_values.update(
                    self.extrvalsdetect(X[self.numer_columns],
                                        'numer_columns'))

            if len(self.categor_columns) != 0:
                extrim_values.update(
                    self.extrvalsdetect(X[self.categor_columns],
                                        'categor_columns'))
            logging.info(f'extrim_values: {extrim_values}')

            with open('extrim_values', 'wb') as output:
                pickle.dump(extrim_values, output)

            for fill_val, column in zip(list(extrim_values.values()),
                                        list(extrim_values.keys())):
                workflow.add_preprocess(
                    ops.FillMissing(fill_val=fill_val, columns=[column]))

        if fill_with_value == 'mean-median':
            if len(self.categor_columns) != 0:
                workflow.add_preprocess(
                    ops.FillMedian(columns=self.categor_columns,
                                   preprocessing=True,
                                   replace=True))
            if len(self.numer_columns) != 0:
                means = list(
                    dd.from_pandas(
                        X[self.numer_columns],
                        npartitions=self.n_gpus).mean().compute().values)
                for fill_val, column in zip(means, self.numer_columns):
                    workflow.add_preprocess(
                        ops.FillMissing(fill_val=fill_val, columns=[column]))

        if targetencoding:
            #https://nvidia.github.io/NVTabular/main/api/ops/targetencoding.html
            if len(self.y_names) != 0:
                if len(self.cat_groups) == 0:
                    print(
                        '\n Target encoding will be applied to all categorical columns'
                    )
                    workflow.add_preprocess(
                        ops.TargetEncoding(cat_groups=self.categor_columns,
                                           cont_target=self.y_names))
                else:
                    workflow.add_preprocess(
                        ops.TargetEncoding(cat_groups=self.cat_groups,
                                           cont_target=self.y_names))
        #-----------------------------------------------------------------------------------------
        workflow.finalize()
        dataset = nvt.Dataset(X)

        tmp_output_path = "./parquet_data_tmp"
        workflow.apply(
            dataset,
            output_format="parquet",
            output_path=tmp_output_path,
            shuffle=Shuffle.PER_WORKER,  # Shuffle algorithm
            out_files_per_proc=1,  # Number of output files per worker
        )
        files = glob.glob(tmp_output_path + "/*.parquet")
        X_final = cudf.read_parquet(files[0])
        for i in range(1, len(files)):
            X_final = X_final.append(cudf.read_parquet(files[i]))
        # Delete temporary files
        shutil.rmtree(tmp_output_path, ignore_errors=True)
        #         if len(self.rest_col_names) != 0:
        #             print(1)
        #             X_final = pd.concat([X_final.to_pandas(), X_pd[self.rest_col_names]], axis=1)
        if file_path is not None:
            X_final.to_csv(file_path, index=False)
        return X_final
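
For context, a hypothetical call to the processing method above; `prep` (an instance of the surrounding preprocessing class with its Dask client, n_gpus, and column types already initialized), the input frame, and the option values are illustrative assumptions, not part of the original example.

import pandas as pd

# illustrative input frame; `prep` is assumed to be an already-constructed
# instance of the class that defines processing()
X_pd = pd.DataFrame({
    "city": ["a", "b", None, "a"],
    "price": [10.0, None, 30.0, 40.0],
    "sales": [1.0, 2.0, 3.0, None],
})

X_final = prep.processing(
    X_pd,
    y_names=["sales"],
    encode_categor_type="categorify",   # Categorify the categorical columns
    outliers_detection_technique=None,  # skip the Clip step
    fill_with_value="mean-median",      # FillMedian on cats, mean FillMissing on conts (as implemented above)
    targetencoding=False,
    file_path=None,                     # don't write the result to CSV
)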
Example #16
def test_gpu_dl(tmpdir, datasets, batch_size, gpu_memory_frac, engine):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, to_cpu=True,
    )

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        data_itr,
        apply_offline=True,
        record_stats=True,
        shuffle=True,
        output_path=output_train,
        num_out_files=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    data_itr = nvt.torch_dataloader.TorchTensorBatchDatasetItr(
        tar_paths[0],
        engine="parquet",
        sub_batch_size=batch_size,
        gpu_memory_frac=gpu_memory_frac,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        names=mycols_csv,
        sep="\t",
    )

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    for idx, chunk in enumerate(data_itr):
        rows += len(chunk)
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert (idx + 1) * batch_size >= rows
    assert rows == num_rows
    if os.path.exists(output_train):
        shutil.rmtree(output_train)