Example #1
def _get_random_movielens_data(tmpdir, rows, dataset="movie", valid=None):
    if dataset == "movie":
        json_sample_movie = {
            "conts": {},
            "cats": {
                "genres": {
                    "dtype": None,
                    "cardinality": 50,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                    "multi_min": 2,
                    "multi_max": 4,
                    "multi_avg": 3,
                },
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
        }
        cols = datagen._get_cols_from_schema(json_sample_movie)
    elif dataset == "ratings":
        json_sample_ratings = {
            "conts": {},
            "cats": {
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
                "userId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
            "labels": {"rating": {"dtype": None, "cardinality": 5}},
        }
        cols = datagen._get_cols_from_schema(json_sample_ratings)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.1)
    target_path = tmpdir
    df_gen.full_df_create(rows, cols, output=target_path)

    if dataset == "movie":
        movies_converted = cudf.read_parquet(os.path.join(tmpdir, "dataset_0.parquet"))
        movies_converted = movies_converted.drop_duplicates(["movieId"], keep="first")
        movies_converted.to_parquet(os.path.join(tmpdir, "movies_converted.parquet"))

    elif dataset == "ratings" and not valid:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"), os.path.join(tmpdir, "train.parquet"))
    else:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"), os.path.join(tmpdir, "valid.parquet"))
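These excerpts come from a pytest-style test module, so the module-level imports are not shown. The sketch below lists the imports `_get_random_movielens_data` relies on and a hypothetical call sequence; it assumes the synthetic-data generator ships as `nvtabular.tools.data_gen` (imported here as `datagen`, as the later excerpts also assume) and that `cudf` is available. The row counts and temporary directory are illustrative only.

# Sketch only: imports assumed by the helper above (not shown in the excerpt).
import os
import tempfile

import cudf
from nvtabular.tools import data_gen as datagen

tmpdir = tempfile.mkdtemp()

# Hypothetical call sequence: build a de-duplicated movies table plus train/valid ratings splits.
_get_random_movielens_data(tmpdir, rows=1000, dataset="movie")
_get_random_movielens_data(tmpdir, rows=1000, dataset="ratings")             # -> train.parquet
_get_random_movielens_data(tmpdir, rows=200, dataset="ratings", valid=True)  # -> valid.parquet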
Example #2
def test_full_df(num_rows, tmpdir, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows,
                                     cols,
                                     entries=True,
                                     output=tmpdir)
    test_size = 0
    full_df = cudf.DataFrame()
    for fi in df_files:
        df = cudf.read_parquet(fi)
        test_size = test_size + df.shape[0]
        full_df = cudf.concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if not is_string_dtype(full_df[cat]._column):
            sts, ps = dist.verify(full_df[cat].to_pandas())
            assert all(s > 0.9 for s in sts)
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    check_ser = cudf.Series(full_df[cats[0]]._column.elements.values_host)
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
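Several of these tests reference a module-level `json_sample` fixture that is not part of the excerpt. The sketch below is a hypothetical stand-in that only illustrates the schema shape the generator consumes, mirroring the schemas shown in the other examples: "conts" with value bounds, "cats" with cardinality and entry-size bounds (the first cat carries multi_* keys and is generated as a multi-hot list column, which is why the tests treat cats[0] separately), and "labels" with a cardinality. The column names cont_1, cat_mh, and cat_1 are invented; "lab_1" matches the label name referenced in Example #12.

import numpy as np

# Hypothetical stand-in for the module-level `json_sample` fixture (the real fixture is not shown).
json_sample = {
    "conts": {
        "cont_1": {"dtype": np.float32, "min_val": 0, "max_val": 1},
    },
    "cats": {
        # cats[0]: multi-hot list column, hence the multi_* keys.
        "cat_mh": {
            "dtype": None,
            "cardinality": 50,
            "min_entry_size": 1,
            "max_entry_size": 5,
            "multi_min": 2,
            "multi_max": 4,
            "multi_avg": 3,
        },
        # Remaining cats: single-hot string columns.
        "cat_1": {"dtype": None, "cardinality": 50, "min_entry_size": 1, "max_entry_size": 5},
    },
    "labels": {"lab_1": {"dtype": None, "cardinality": 2}},
    "num_rows": 1000,
}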
Example #3
def test_inspect_datagen(tmpdir, datasets, engine, dist):
    # Dataset
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    # Dataset columns type config
    columns_dict = {}
    columns_dict["cats"] = ["name-cat", "name-string"
                            ] if engine == "parquet" else ["name-string"]
    columns_dict["conts"] = ["x", "y"]
    columns_dict["labels"] = ["label"]

    # Create inspector and inspect
    output_inspect1 = tmpdir + "/dataset_info1.json"
    dataset = Dataset(paths, engine=engine)
    a = datains.DatasetInspector()
    a.inspect(dataset, columns_dict, output_inspect1)
    assert os.path.isfile(output_inspect1)

    # Generate dataset using data_gen tool
    output_datagen = tmpdir + "/datagen"
    os.mkdir(output_datagen)
    with fsspec.open(output_inspect1) as f:
        output1 = json.load(f)
    cols = datagen._get_cols_from_schema(output1)
    if dist == "uniform":
        df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    else:
        df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1),
                                    gpu_frac=0.00001)

    output_datagen_files = df_gen.full_df_create(output1["num_rows"],
                                                 cols,
                                                 entries=True,
                                                 output=output_datagen)

    # Inspect again and check output are the same
    output_inspect2 = tmpdir + "/dataset_info2.json"
    dataset = Dataset(output_datagen_files, engine=engine)
    a.inspect(dataset, columns_dict, output_inspect2)
    assert os.path.isfile(output_inspect2)

    # Compare json outputs
    with fsspec.open(output_inspect2) as f:
        output2 = json.load(f)
    for k1 in output1.keys():
        if k1 == "num_rows":
            assert output1[k1] == output2[k1]
        else:
            for k2 in output1[k1].keys():
                for k3 in output1[k1][k2].keys():
                    if k3 == "dtype":
                        if output1[k1][k2][k3] == "object":
                            assert (output1[k1][k2][k3] == output2[k1][k2][k3]
                                    or output2[k1][k2][k3] == "int64")
                        else:
                            assert output1[k1][k2][k3] == output2[k1][k2][k3]
                    else:
                        assert output1[k1][k2][k3] == pytest.approx(
                            output2[k1][k2][k3], rel=1e-0, abs=1e-0)
Example #4
def test_uniform(num_rows, distro):
    cats = list(json_sample["cats"].keys())[1:]
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols)
    sts, ps = df_gen.verify_df(df_uni[cats])
    assert all(s > 0.9 for s in sts)
Example #5
def test_sparse_tensors(tmpdir, sparse_dense):
    # create small dataset, add values to sparse_list
    json_sample = {
        "conts": {},
        "cats": {
            "spar1": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "spar2": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 3,
                "multi_max": 5,
                "multi_avg": 4,
            },
            # "": {"dtype": None, "cardinality": 500, "min_entry_size": 1, "max_entry_size": 5},
        },
        "labels": {"rating": {"dtype": None, "cardinality": 2}},
    }
    cols = datagen._get_cols_from_schema(json_sample)
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)
    spa_lst = ["spar1", "spar2"]
    spa_mx = {"spar1": 5, "spar2": 6}
    batch_size = 10
    data_itr = tf_dataloader.KerasSequenceLoader(
        df_files,
        cat_names=spa_lst,
        cont_names=[],
        label_names=["rating"],
        batch_size=batch_size,
        buffer_size=0.1,
        sparse_names=spa_lst,
        sparse_max=spa_mx,
        sparse_as_dense=sparse_dense,
    )
    for batch in data_itr:
        feats, labs = batch
        for col in spa_lst:
            feature_tensor = feats[f"{col}"]
            if not sparse_dense:
                assert list(feature_tensor.shape) == [batch_size, spa_mx[col]]
                assert isinstance(feature_tensor, tf.sparse.SparseTensor)
            else:
                assert feature_tensor.shape[1] == spa_mx[col]
                assert not isinstance(feature_tensor, tf.sparse.SparseTensor)
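If you need to look at the padded values rather than just the shapes, a sparse batch can be densified with TensorFlow's standard conversion. A minimal follow-up sketch, assuming TensorFlow is imported as `tf` and reusing `data_itr`, `batch_size`, `spa_mx`, and `sparse_dense` from the test above:

import tensorflow as tf

# Sketch: densify one sparse feature batch (zero-padded to [batch_size, sparse_max]) for inspection.
for feats, labs in data_itr:
    if not sparse_dense:
        spar1_dense = tf.sparse.to_dense(feats["spar1"])
        assert list(spar1_dense.shape) == [batch_size, spa_mx["spar1"]]
    break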
Example #6
def test_powerlaw(num_rows, distro):
    cats = list(json_sample["cats"].keys())[1:]

    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    df_pw = cudf.DataFrame()
    for x in range(10):
        df_pw_1 = df_gen.create_df(num_rows, cols)
        df_pw = cudf.concat([df_pw, df_pw_1], axis=0)
    sts, ps = df_gen.verify_df(df_pw[cats])
    assert all(s > 0.9 for s in sts)
Example #7
def test_width(num_rows, distro):
    json_sample_1 = {
        "conts": {
            "cont_1": {"dtype": np.float32, "min_val": 0, "max_val": 1, "width": 20},
        }
    }
    json_sample_1["num_rows"] = num_rows
    cols = datagen._get_cols_from_schema(json_sample_1, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols)
    assert df_uni.shape[1] == 20
Example #8
def test_cat_rep(num_rows, distro):
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    check_ser = cudf.Series(df_uni[cats[0]]._column.elements.values_host)
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #9
def test_full_df(num_rows, tmpdir, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if HAS_GPU:
            if not _is_string_dtype(full_df[cat]._column):
                sts, ps = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        else:
            if not _is_string_dtype(full_df[cat]):
                sts, ps = dist.verify(full_df[cat])
                assert all(s > 0.9 for s in sts)
        # these are not mh series
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    # check the mh list here cat 0 only
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #10
def test_cat_rep(num_rows, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    if HAS_GPU:
        check_ser = _make_df(list(df_uni[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = df_uni[cats[0]]
    if isinstance(check_ser[0], (list, np.ndarray)):
        check_ser = _pull_apart_list(check_ser)[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Example #11
def test_horovod_multigpu(tmpdir):
    json_sample = {
        "conts": {},
        "cats": {
            "genres": {
                "dtype": None,
                "cardinality": 50,
                "min_entry_size": 1,
                "max_entry_size": 5,
                "multi_min": 2,
                "multi_max": 4,
                "multi_avg": 3,
            },
            "movieId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5,
            },
            "userId": {
                "dtype": None,
                "cardinality": 500,
                "min_entry_size": 1,
                "max_entry_size": 5
            },
        },
        "labels": {
            "rating": {
                "dtype": None,
                "cardinality": 2
            }
        },
    }
    cols = datagen._get_cols_from_schema(json_sample)
    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.0001)
    target_path = os.path.join(tmpdir, "input/")
    os.mkdir(target_path)
    df_files = df_gen.full_df_create(10000, cols, output=target_path)
    # process them
    cat_features = nvt.ColumnGroup(["userId", "movieId", "genres"]) >> nvt.ops.Categorify()
    ratings = nvt.ColumnGroup(["rating"]) >> (lambda col: (col > 3).astype("int8"))
    output = cat_features + ratings
    proc = nvt.Workflow(output)
    train_iter = nvt.Dataset(df_files, part_size="10MB")
    proc.fit(train_iter)
    target_path_train = os.path.join(tmpdir, "train/")
    os.mkdir(target_path_train)
    proc.transform(train_iter).to_parquet(output_path=target_path_train,
                                          out_files_per_proc=5)
    # add new location
    target_path = os.path.join(tmpdir, "workflow/")
    os.mkdir(target_path)
    proc.save(target_path)
    curr_path = os.path.abspath(__file__)
    repo_root = os.path.relpath(
        os.path.normpath(os.path.join(curr_path, "../../..")))
    hvd_wrap_path = os.path.join(
        repo_root, "examples/multi-gpu-movielens/hvd_wrapper.sh")
    hvd_exam_path = os.path.join(repo_root,
                                 "examples/multi-gpu-movielens/tf_trainer.py")
    process = subprocess.Popen(
        [
            "horovodrun",
            "-np",
            "2",
            "-H",
            "localhost:2",
            "sh",
            hvd_wrap_path,
            "python",
            hvd_exam_path,
            "--dir_in",
            f"{tmpdir}",
            "--batch_size",
            "1024",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    process.wait()
    stdout, stderr = process.communicate()
    print(stdout, stderr)
    assert "Loss:" in str(stdout)
Example #12
def test_empty_cols(tmpdir, engine, cat_names, mh_names, cont_names, label_name, num_rows):
    json_sample["num_rows"] = num_rows

    cols = datagen._get_cols_from_schema(json_sample)

    df_gen = datagen.DatasetGen(datagen.PowerLawDistro(0.1))
    dataset = df_gen.create_df(num_rows, cols)
    dataset = nvt.Dataset(dataset)
    features = []
    if cont_names:
        features.append(cont_names >> ops.FillMedian() >> ops.Normalize())
    if cat_names or mh_names:
        features.append(cat_names + mh_names >> ops.Categorify())
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    graph = sum(features, nvt.WorkflowNode(label_name))
    processor = nvt.Workflow(graph)

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    df_out = processor.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    if processor.output_node.output_schema.apply_inverse(
            ColumnSelector("lab_1")):
        # if we don't have conts/cats/labels we're done
        return

    data_itr = None

    with pytest.raises(ValueError) as exc_info:
        data_itr = torch_dataloader.TorchAsyncItr(
            nvt.Dataset(df_out),
            cats=cat_names + mh_names,
            conts=cont_names,
            labels=label_name,
            batch_size=2,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)

    if data_itr:
        for nvt_batch in data_itr:
            cats_conts, labels = nvt_batch
            if cat_names:
                assert set(cat_names).issubset(set(list(cats_conts.keys())))
            if cont_names:
                assert set(cont_names).issubset(set(list(cats_conts.keys())))

        if cat_names or cont_names or mh_names:
            emb_sizes = nvt.ops.get_embedding_sizes(processor)

            EMBEDDING_DROPOUT_RATE = 0.04
            DROPOUT_RATES = [0.001, 0.01]
            HIDDEN_DIMS = [1000, 500]
            LEARNING_RATE = 0.001
            model = Model(
                embedding_table_shapes=emb_sizes,
                num_continuous=len(cont_names),
                emb_dropout=EMBEDDING_DROPOUT_RATE,
                layer_hidden_dims=HIDDEN_DIMS,
                layer_dropout_rates=DROPOUT_RATES,
            ).cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

            def rmspe_func(y_pred, y):
                "Return y_pred and y to non-log space and compute RMSPE"
                y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
                pct_var = (y_pred - y) / y
                return (pct_var**2).mean().pow(0.5)

            train_loss, y_pred, y = process_epoch(
                data_itr,
                model,
                train=True,
                optimizer=optimizer,
                amp=False,
            )
            train_rmspe = None
            train_rmspe = rmspe_func(y_pred, y)
            assert train_rmspe is not None
            assert len(y_pred) > 0
            assert len(y) > 0
Example #13
def test_json_convert():
    cols = datagen._get_cols_from_schema(json_sample)
    assert len(cols["conts"]) == len(json_sample["conts"].keys())
    assert len(cols["cats"]) == len(json_sample["cats"].keys())
    assert len(cols["labels"]) == len(json_sample["labels"].keys())