Example #1
def test_hash_bucket_lists(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)
    processor.add_preprocess(ops.HashBucket(num_buckets=10))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    # make sure we get the embedding sizes
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10
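get_embedding_sizes returns a mapping of column name to a (cardinality, embedding width) tuple for a fitted workflow; with HashBucket the cardinality is simply num_buckets. A hedged sketch of putting that tuple to use, assuming the fitted processor from the example above (the PyTorch part is purely illustrative):

import torch

cardinality, dim = nvt.ops.get_embedding_sizes(processor)["Authors"]  # (10, suggested width)
# "Authors" is a list column, so a "bag" embedding that pools a variable
# number of ids per row is a natural fit for the hashed values.
table = torch.nn.EmbeddingBag(cardinality, dim, mode="sum")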
Example #2
def test_mh_support(tmpdir):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]

    cats = cat_names >> ops.HashBucket(num_buckets=10)

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    authors = df_out["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out), cats=cat_names, conts=cont_names, labels=label_name
    )
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats, conts, labels = batch
        cats, mh = cats
        # mh is a dictionary of multihot columns: {column name: (values, offsets)}
        assert len(mh) == len(cat_names)
        assert not cats
    assert idx > 0
Example #3
def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name)
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multihot
        assert isinstance(cats_conts["Reviewers"], tuple)
        # each multihot column maps to a (values, offsets) tuple
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert idx > 0
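The tuple checked above pairs a flat values vector with per-row offsets. A standalone sketch of how such a (values, offsets) pair can be pooled with torch.nn.EmbeddingBag, which consumes exactly that layout (the ids below are made up and sized for num_buckets=10):

import torch

# Made-up multihot batch: 4 samples with list lengths [1, 2, 2, 1].
values = torch.tensor([3, 3, 7, 1, 4, 1])   # hashed bucket ids, all rows concatenated
offsets = torch.tensor([0, 1, 3, 5])        # start index of each sample's slice

emb = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=8, mode="sum")
pooled = emb(values, offsets)               # shape (4, 8): one pooled vector per sample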
Example #4
def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-string"]

    if op_columns is None:
        num_buckets = 10
    else:
        num_buckets = {column: 10 for column in op_columns}

    hash_features = cat_names >> ops.HashBucket(num_buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check sums for determinism
    assert np.all(new_gdf[cat_names].values >= 0)
    assert np.all(new_gdf[cat_names].values <= 9)
    checksum = new_gdf[cat_names].sum().values
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert np.all(new_gdf[cat_names].sum().values == checksum)
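num_buckets can be either a single integer applied to every selected column or, as the op_columns branch above exercises, a dict keyed by column name. A minimal sketch of the explicit dict form ("name-cat" is a hypothetical second column used only for illustration):

hash_features = ["name-string", "name-cat"] >> ops.HashBucket(
    {"name-string": 10, "name-cat": 30}
)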
Example #5
def test_hash_bucket_lists(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors"]  # , "Engaging User"]

    dataset = nvt.Dataset(df)
    hash_features = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check to make sure that the same strings are hashed the same
    authors = new_gdf["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10
Example #6
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo_location": raw})

    geo_location = ColumnGroup(["geo_location"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(
        postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(
        postfix="_country")
    geo_features = state + country + geo_location >> ops.HashBucket(
        num_buckets=100)

    # this workflow doesn't have any stat operators, so we can get away without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = cudf.DataFrame()
    expected["geo_location_state"] = data["geo_location"].str.slice(
        0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(
        0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
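One detail worth calling out in the geo_features line: Python's + binds tighter than >>, so the expression groups as a sum of the three column groups first, and HashBucket is applied to all of them. That is why every column in expected is hashed. The same line with the grouping made explicit:

geo_features = (state + country + geo_location) >> ops.HashBucket(num_buckets=100)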
Example #7
def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-string"]

    if op_columns is None:
        num_buckets = 10
    else:
        num_buckets = {column: 10 for column in op_columns}
    hash_bucket_op = ops.HashBucket(num_buckets)

    columns_ctx = {}
    columns_ctx["categorical"] = {}
    columns_ctx["categorical"]["base"] = cat_names

    # check sums for determinism
    checksums = []
    for gdf in dataset.to_iter():
        new_gdf = hash_bucket_op.apply_op(gdf, columns_ctx, "categorical")
        assert np.all(new_gdf[cat_names].values >= 0)
        assert np.all(new_gdf[cat_names].values <= 9)
        checksums.append(new_gdf[cat_names].sum().values)

    for checksum, gdf in zip(checksums, dataset.to_iter()):
        new_gdf = hash_bucket_op.apply_op(gdf, columns_ctx, "categorical")
        assert np.all(new_gdf[cat_names].sum().values == checksum)
Example #8
def test_mh_support(tmpdir, batch_size):
    data = {
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [
            ["User_A"],
            ["User_A", "User_E"],
            ["User_B", "User_C"],
            ["User_C"],
        ],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Embedding": [
            [0.1, 0.2, 0.3],
            [0.3, 0.4, 0.5],
            [0.6, 0.7, 0.8],
            [0.8, 0.4, 0.2],
        ],
        "Post": [1, 2, 3, 4],
    }
    df = cudf.DataFrame(data)
    cat_names = ["Authors", "Reviewers", "Engaging User"]
    cont_names = ["Embedding"]
    label_name = ["Post"]

    cats = cat_names >> ops.HashBucket(num_buckets=10)
    workflow = nvt.Workflow(cats + cont_names + label_name)

    data_itr = tf_dataloader.KerasSequenceLoader(
        workflow.transform(nvt.Dataset(df)),
        cat_names=cat_names,
        cont_names=cont_names,
        label_names=label_name,
        batch_size=batch_size,
        shuffle=False,
    )

    idx = 0
    for X, y in data_itr:
        assert len(X) == 7
        n_samples = y.shape[0]

        for mh_name in ["Authors", "Reviewers", "Embedding"]:
            for postfix in ["__nnzs", "__values"]:
                assert (mh_name + postfix) in X
                array = X[mh_name + postfix].numpy()[:, 0]

                if postfix == "__nnzs":
                    if mh_name == "Embedding":
                        assert (array == 3).all()
                    else:
                        lens = [
                            len(x)
                            for x in data[mh_name][idx * batch_size:idx *
                                                   batch_size + n_samples]
                        ]
                        assert (array == np.array(lens)).all()
                else:
                    if mh_name == "Embedding":
                        assert len(array) == (n_samples * 3)
                    else:
                        assert len(array) == sum(lens)
        idx += 1
    assert idx == (3 // batch_size + 1)
Example #9
# initial column selector works with tags
# filter within the workflow by tags
# test tags correct at output
@pytest.mark.parametrize(
    "op",
    [
        ops.Bucketize([1]),
        ops.Rename(postfix="_trim"),
        ops.Categorify(),
        ops.Categorify(encode_type="combo"),
        ops.Clip(0),
        ops.DifferenceLag("col1"),
        ops.FillMissing(),
        ops.Groupby("col1"),
        ops.HashBucket(1),
        ops.HashedCross(1),
        ops.JoinGroupby("col1"),
        ops.ListSlice(0),
        ops.LogOp(),
        ops.Normalize(),
        ops.TargetEncoding("col1"),
    ],
)
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
Example #10
def test_mh_support(tmpdir, batch_size):
    data = {
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
        "Reviewers": [
            ["User_A"],
            ["User_A", "User_E"],
            ["User_B", "User_C"],
            ["User_C"],
        ],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Embedding": [
            [0.1, 0.2, 0.3],
            [0.3, 0.4, 0.5],
            [0.6, 0.7, 0.8],
            [0.8, 0.4, 0.2],
        ],
        "Post": [1, 2, 3, 4],
    }
    df = nvt.dispatch._make_df(data)
    cat_names = ["Authors", "Reviewers", "Engaging User"]
    cont_names = ["Embedding"]
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(cats + cont_names + label_name)

    data_itr = tf_dataloader.KerasSequenceLoader(
        workflow.fit_transform(nvt.Dataset(df)),
        cat_names=cat_names,
        cont_names=cont_names,
        label_names=label_name,
        batch_size=batch_size,
        shuffle=False,
    )
    nnzs = None
    idx = 0
    for X, y in data_itr:
        assert len(X) == 4
        n_samples = y.shape[0]

        for mh_name in ["Authors", "Reviewers", "Embedding"]:
            # assert (mh_name) in X
            array, nnzs = X[mh_name]
            nnzs = nnzs.numpy()[:, 0]
            array = array.numpy()[:, 0]

            if mh_name == "Embedding":
                assert (nnzs == 3).all()
            else:
                lens = [
                    len(x) for x in data[mh_name][idx * batch_size : idx * batch_size + n_samples]
                ]
                assert (nnzs == np.array(lens)).all()

            if mh_name == "Embedding":
                assert len(array) == (n_samples * 3)
            else:
                assert len(array) == sum(lens)
        idx += 1
    assert idx == (3 // batch_size + 1)
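In this variant the multihot columns come back from the Keras loader as a (values, nnzs) pair instead of separate __values/__nnzs keys. A standalone sketch of turning such a pair into a tf.RaggedTensor, one variable-length row of bucket ids per sample (the ids are made up; the list lengths [1, 2, 2, 1] mirror the Authors column above):

import tensorflow as tf

values = tf.constant([3, 3, 7, 1, 4, 1], dtype=tf.int64)  # hashed ids, all rows concatenated
nnzs = tf.constant([1, 2, 2, 1], dtype=tf.int64)          # list length per sample

ragged = tf.RaggedTensor.from_row_lengths(values, row_lengths=nnzs)
# ragged == [[3], [3, 7], [1, 4], [1]]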