Example #1
def test_fill_missing(tmpdir, df, dataset, engine):
    cont_names = ["x", "y"]
    cont_features = cont_names >> nvt.ops.FillMissing(fill_val=42)

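    # Null out ~20% of each continuous column so FillMissing has missing values to replace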
    for col in cont_names:
        idx = np.random.choice(df.shape[0] - 1, int(df.shape[0] * 0.2))
        df[col].iloc[idx] = None

    df = df.reset_index()
    dataset = nvt.Dataset(df)
    processor = nvt.Workflow(cont_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()
    for col in cont_names:
        assert np.all((df[col].fillna(42) - new_gdf[col]).abs().values <= 1e-2)
        assert new_gdf[col].isna().sum() == 0
Example #2
def test_joingroupby_dependency(tmpdir):
    df = pd.DataFrame({
        "Author": ["User_A", "User_A", "User_A", "User_B", "User_B"],
        "Cost": [100.0, 200.0, 300.0, 400.0, 400.0],
    })

    normalized_cost = ["Cost"] >> nvt.ops.NormalizeMinMax() >> nvt.ops.Rename(
        postfix="_normalized")
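    # The normalized column is passed to JoinGroupby as a dependency via cont_cols, so the
    # aggregation runs on 'Cost_normalized' (yielding 'Author_Cost_normalized_sum') rather than the raw 'Cost'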
    groupby_features = ["Author"] >> ops.JoinGroupby(
        out_path=str(tmpdir), stats=["sum"], cont_cols=normalized_cost)
    workflow = nvt.Workflow(groupby_features)

    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    assert df_out["Author_Cost_normalized_sum"].to_arrow().to_pylist() == [
        1.0, 1.0, 1.0, 2.0, 2.0
    ]
Example #3
def test_workflow_fit_op_rename(tmpdir, dataset, engine):
    # NVT
    schema = dataset.schema
    for name in schema.column_names:
        dataset.schema.column_schemas[name] = dataset.schema.column_schemas[
            name].with_tags([nvt.graph.tags.Tags.USER])
    selector = nvt.ColumnSelector(tags=[nvt.graph.tags.Tags.USER])

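    # Rename appends the '_1' postfix to every column selected by the USER tag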
    workflow_ops_1 = selector >> nvt.ops.Rename(postfix="_1")
    workflow_1 = nvt.Workflow(workflow_ops_1)
    workflow_1.fit(dataset)
    workflow_1.save(str(tmpdir / "one"))
    new_dataset = workflow_1.transform(dataset).to_ddf().compute()

    assert len(new_dataset.columns) > 0
    assert all("_1" in col for col in new_dataset.columns)
Example #4
def test_target_encode_multi(tmpdir, npartitions, cpu):

    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({
        "cat": cat_1,
        "cat2": cat_2,
        "num": num_1,
        "num_2": num_2
    })
    if cpu:
        df = dd.from_pandas(df.to_pandas(), npartitions=npartitions)
    else:
        df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
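    # One encoding is produced per (category group, target) pair, named TE_<group>_<target>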
    te_features = cat_groups >> ops.TargetEncoding(["num", "num_2"],
                                                   out_path=str(tmpdir),
                                                   kfold=1,
                                                   p_smooth=5,
                                                   out_dtype="float32")

    workflow = nvt.Workflow(te_features)

    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values,
              df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0],
                        num_1.mean(),
                        abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0],
                        num_2.mean(),
                        abs_tol=1e-3)
Example #5
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(),
            npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features +
                            ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
Example #6
def test_large_strings(tmpdir, output_model):
    strings = ["a" * (2**exp) for exp in range(1, 17)]
    df = _make_df({"description": strings})
    features = ["description"] >> ops.Categorify()
    workflow = nvt.Workflow(features)

    if output_model == "pytorch":
        model_info = {
            "description": {
                "columns": ["description"],
                "dtype": "int64"
            }
        }
    else:
        model_info = None
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "test_large_string",
                                     output_model, model_info)
Example #7
def test_mh_support(tmpdir):
    df = nvt.dispatch._make_df({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors", "Reviewers"]  # , "Engaging User"]
    cont_names = []
    label_name = ["Post"]
    if HAS_GPU:
        cats = cat_names >> ops.HashBucket(num_buckets=10)
    else:
        cats = cat_names >> ops.Categorify()

    processor = nvt.Workflow(cats + label_name)
    df_out = processor.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # check to make sure that the same strings are hashed the same
    if HAS_GPU:
        authors = df_out["Authors"].to_arrow().to_pylist()
    else:
        authors = df_out["Authors"]
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name)
    idx = 0
    for batch in data_itr:
        idx = idx + 1
        cats_conts, labels = batch
        assert "Reviewers" in cats_conts
        # check it is multihot
        assert isinstance(cats_conts["Reviewers"], tuple)
        # mh is a tuple of dictionaries {Column name: (values, offsets)}
        assert "Authors" in cats_conts
        assert isinstance(cats_conts["Authors"], tuple)
    assert idx > 0
Example #8
def test_target_encode_multi(tmpdir, npartitions):

    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_names = ["cat", "cat2"]
    cont_names = ["num", "num_2"]
    label_name = []
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]

    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            ["num", "num_2"],  # cont_target
            out_path=str(tmpdir),
            kfold=1,
            p_smooth=5,
            out_dtype="float32",
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
Example #9
def test_categorify_size(tmpdir, cpu, include_nulls):
    num_rows = 50
    num_distinct = 10

    possible_session_ids = list(range(num_distinct))
    if include_nulls:
        possible_session_ids.append(None)

    df = dispatch._make_df(
        {
            "session_id":
            [random.choice(possible_session_ids) for _ in range(num_rows)]
        },
        device="cpu" if cpu else None,
    )

    cat_features = ["session_id"] >> nvt.ops.Categorify(out_path=str(tmpdir))
    workflow = nvt.Workflow(cat_features)
    workflow.fit_transform(nvt.Dataset(df, cpu=cpu)).to_ddf().compute()

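    # Categorify writes its vocabulary to <out_path>/categories/unique.<col>.parquet;
    # compare the stored category sizes against the raw value counts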
    vals = df["session_id"].value_counts()
    vocab = dispatch._read_dispatch(cpu=cpu)(os.path.join(
        tmpdir, "categories", "unique.session_id.parquet"))

    if cpu:
        expected = dict(zip(vals.index, vals))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"],
                                     vocab["session_id_size"]) if size
        }
    else:
        expected = dict(zip(vals.index.values_host, vals.values_host))
        computed = {
            session: size
            for session, size in zip(vocab["session_id"].values_host,
                                     vocab["session_id_size"].values_host)
            if size
        }
    first_key = list(computed.keys())[0]
    if pd.isna(first_key):
        computed.pop(first_key)
    assert computed == expected
Example #10
def test_join_external_workflow(tmpdir, df, dataset, engine):

    # Define "external" table
    how = "left"
    drop_duplicates = True
    cache = "device"
    shift = 100
    df_ext = df[["id"]].copy().sort_values("id")
    df_ext["new_col"] = df_ext["id"] + shift
    df_ext["new_col_2"] = "keep"
    df_ext["new_col_3"] = "ignore"
    df_ext_check = df_ext.copy()

    # Define Op
    on = "id"
    columns_left = list(df.columns)
    columns_ext = ["id", "new_col", "new_col_2"]
    df_ext_check = df_ext_check[columns_ext]
    if drop_duplicates:
        df_ext_check.drop_duplicates(ignore_index=True, inplace=True)
    joined = nvt.ColumnGroup(columns_left) >> nvt.ops.JoinExternal(
        df_ext,
        on,
        how=how,
        columns_ext=columns_ext,
        cache=cache,
        drop_duplicates_ext=drop_duplicates,
    )
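    # columns_ext limits which external columns are joined, so 'new_col_3' is dropped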

    # Define Workflow
    gdf = df.reset_index()
    dataset = nvt.Dataset(gdf)
    processor = nvt.Workflow(joined)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute().reset_index()

    # Validate
    check_gdf = gdf.merge(df_ext_check, how=how, on=on)
    assert len(check_gdf) == len(new_gdf)
    assert ((new_gdf["id"] + shift) == new_gdf["new_col"]).all()
    assert gdf["id"].all() == new_gdf["id"].all()
    assert "new_col_2" in new_gdf.columns
    assert "new_col_3" not in new_gdf.columns
Example #11
def test_generate_triton_model(tmpdir, engine, df):
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
Example #12
def test_hash_bucket(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-string"]

    if op_columns is None:
        num_buckets = 10
    else:
        num_buckets = {column: 10 for column in op_columns}

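    # num_buckets can be a single int shared by all columns or a per-column dict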
    hash_features = cat_names >> ops.HashBucket(num_buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check sums for determinism
    assert np.all(new_gdf[cat_names].values >= 0)
    assert np.all(new_gdf[cat_names].values <= 9)
    checksum = new_gdf[cat_names].sum().values
    new_gdf = processor.transform(dataset).to_ddf().compute()
    assert np.all(new_gdf[cat_names].sum().values == checksum)
Example #13
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"

    assert os.path.isfile(outdir + "/file_list.txt")
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))
Example #14
def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [
        [0.0, 1.0, 2.0],
        [
            3.0,
            4.0,
        ],
        [5.0],
    ]

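    # Normalize is applied to a list column: the standardization statistics are
    # computed over the flattened element values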
    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))
Example #15
def test_categorify_single_table():
    df = dispatch._make_df({
        "Authors": [None, "User_A", "User_A", "User_E", "User_B", "User_C"],
        "Engaging_User":
        [None, "User_B", "User_B", "User_A", "User_D", "User_D"],
        "Post": [1, 2, 3, 4, None, 5],
    })
    cat_names = ["Authors", "Engaging_User"]
    dataset = nvt.Dataset(df)
    features = cat_names >> ops.Categorify(single_table=True)
    processor = nvt.Workflow(features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

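    # With single_table=True the encoded columns share one id space, so their
    # value ranges must not overlap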
    old_max = 0
    for name in cat_names:
        curr_min = new_gdf[name].min()
        assert old_max <= curr_min
        curr_max = new_gdf[name].max()
        old_max += curr_max
Example #16
def test_categorify_multi_combo(tmpdir):
    cat_names = [["Author", "Engaging User"], ["Author"], "Engaging User"]
    kind = "combo"
    df = pd.DataFrame(
        {
            "Author": ["User_A", "User_E", "User_B", "User_C"],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )

    label_name = ["Post"]
    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)
    workflow = nvt.Workflow(cats + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    # Column combinations are encoded
    assert df_out["Author"].to_arrow().to_pylist() == [1, 4, 2, 3]
    assert df_out["Engaging User"].to_arrow().to_pylist() == [2, 2, 1, 3]
    assert df_out["Author_Engaging User"].to_arrow().to_pylist() == [1, 4, 2, 3]
Example #17
def test_categorify_multi(tmpdir, cat_names, kind, cpu):
    df = pd.DataFrame({
        "Author": ["User_A", "User_E", "User_B", "User_C"],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })

    label_name = ["Post"]

    cats = cat_names >> ops.Categorify(out_path=str(tmpdir), encode_type=kind)

    workflow = nvt.Workflow(cats + label_name)

    df_out = (workflow.fit_transform(nvt.Dataset(
        df, cpu=cpu)).to_ddf().compute(scheduler="synchronous"))

    if len(cat_names) == 1:
        if kind == "joint":
            # Columns are encoded jointly
            compare_authors = (df_out["Author"].to_list() if cpu else
                               df_out["Author"].to_arrow().to_pylist())
            compare_engaging = (df_out["Engaging User"].to_list() if cpu else
                                df_out["Engaging User"].to_arrow().to_pylist())
            # User_B has the highest frequency, so it receives the lowest encoding
            assert compare_authors == [2, 5, 1, 3]
            assert compare_engaging == [1, 1, 2, 4]
        else:
            # Column combinations are encoded
            compare_engaging = (
                df_out["Author_Engaging User"].to_list() if cpu else
                df_out["Author_Engaging User"].to_arrow().to_pylist())
            assert compare_engaging == [1, 4, 2, 3]
    else:
        # Columns are encoded independently
        compare_authors = (df_out["Author"].to_list()
                           if cpu else df_out["Author"].to_arrow().to_pylist())
        compare_engaging = (df_out["Engaging User"].to_list() if cpu else
                            df_out["Engaging User"].to_arrow().to_pylist())
        assert compare_authors == [1, 4, 2, 3]
        # User_B is first in the frequency-based ordering
        assert compare_engaging == [1, 1, 2, 3]
Example #18
def test_moments(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Moments(columns=op_columns)]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )

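    # update_stats runs the statistics pass; counts, means, and stds land in processor.stats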
    processor.update_stats(dataset)

    assert df.x.count() == processor.stats["counts"]["x"]
    assert df.x.count() == 4321

    # Check mean and std
    assert math.isclose(df.x.mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-4)
    assert math.isclose(df.x.std(), processor.stats["stds"]["x"], rel_tol=1e-3)
    if not op_columns:
        assert math.isclose(df.y.mean(),
                            processor.stats["means"]["y"],
                            rel_tol=1e-4)
        assert math.isclose(df.id.mean(),
                            processor.stats["means"]["id"],
                            rel_tol=1e-4)

        assert math.isclose(df.y.std(),
                            processor.stats["stds"]["y"],
                            rel_tol=1e-3)
        assert math.isclose(df.id.std(),
                            processor.stats["stds"]["id"],
                            rel_tol=1e-3)
Example #19
def test_na_value_count(tmpdir):
    gdf = dispatch._make_df({
        "productID": ["B00406YHLI"] * 5 + ["B002YXS8E6"] * 5 +
        ["B00011KM38"] * 2 + [np.nan] * 3,
        "brand":
        ["Coby"] * 5 + [np.nan] * 5 + ["Cooler Master"] * 2 + ["Asus"] * 3,
    })

    cat_features = ["brand", "productID"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cat_features)
    train_dataset = nvt.Dataset(gdf, engine="parquet")
    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_ddf().compute()

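    # The first vocabulary row tracks nulls: brand has 5 NaNs and productID has 3,
    # matching the *_size assertions below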
    single_cat = dispatch._read_dispatch("./categories/unique.brand.parquet")(
        "./categories/unique.brand.parquet")
    second_cat = dispatch._read_dispatch(
        "./categories/unique.productID.parquet")(
            "./categories/unique.productID.parquet")
    assert single_cat["brand_size"][0] == 5
    assert second_cat["productID_size"][0] == 3
Example #20
def test_categorify_lists(tmpdir, freq_threshold):
    df = cudf.DataFrame(
        {
            "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"], ["User_C"]],
            "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
            "Post": [1, 2, 3, 4],
        }
    )
    cat_names = ["Authors", "Engaging User"]
    label_name = ["Post"]

    cat_features = cat_names >> ops.Categorify(out_path=str(tmpdir), freq_threshold=freq_threshold)

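    # With freq_threshold >= 2, categories appearing fewer than 2 times are mapped to 0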
    workflow = nvt.Workflow(cat_features + label_name)
    df_out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    # Columns are encoded independently
    if freq_threshold < 2:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 4], [2, 3], [3]]
    else:
        assert df_out["Authors"].to_arrow().to_pylist() == [[1], [1, 0], [0, 2], [2]]
Example #21
def test_chaining_2():
    gdf = cudf.DataFrame(
        {
            "A": [1, 2, 2, 9, 6, np.nan, 3],
            "B": [2, np.nan, 4, 7, 7, 2, 5],
            "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
        }
    )
    proc = nvt.Workflow(cat_names=["C"], cont_names=["A", "B"], label_name=[])

    proc.add_feature(
        nvt.ops.LambdaOp(op_name="isnull", f=lambda col, gdf: col.isnull(), replace=False)
    )

    proc.add_cat_preprocess(nvt.ops.Categorify())
    train_dataset = nvt.Dataset(gdf, engine="parquet")

    proc.apply(train_dataset, apply_offline=True, record_stats=True, output_path=None)
    result = proc.get_ddf().compute()
    assert all(x in list(result.columns) for x in ["A_isnull", "B_isnull", "C_isnull"])
    assert (x in result["C"].unique() for x in set(gdf["C"].dropna().to_arrow()))
Example #22
def test_categorify_hash_bucket(cpu):
    df = dispatch._make_df({
        "Authors": ["User_A", "User_A", "User_E", "User_B", "User_C"],
        "Engaging_User": ["User_B", "User_B", "User_A", "User_D", "User_D"],
        "Post": [1, 2, 3, 4, 5],
    })
    cat_names = ["Authors", "Engaging_User"]
    buckets = 10
    dataset = nvt.Dataset(df, cpu=cpu)
    hash_features = cat_names >> ops.Categorify(num_buckets=buckets)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check hashed values
    assert new_gdf["Authors"].max() <= (buckets - 1)
    assert new_gdf["Engaging_User"].max() <= (buckets - 1)
    # check embedding size is equal to the num_buckets after hashing
    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == buckets
    assert nvt.ops.get_embedding_sizes(
        processor)["Engaging_User"][0] == buckets
Example #23
def test_hash_bucket_lists(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    })
    cat_names = ["Authors"]  # , "Engaging User"]

    dataset = nvt.Dataset(df)
    hash_features = cat_names >> ops.HashBucket(num_buckets=10)
    processor = nvt.Workflow(hash_features)
    processor.fit(dataset)
    new_gdf = processor.transform(dataset).to_ddf().compute()

    # check to make sure that the same strings are hashed the same
    authors = new_gdf["Authors"].to_arrow().to_pylist()
    assert authors[0][0] == authors[1][0]  # 'User_A'
    assert authors[2][1] == authors[3][0]  # 'User_C'

    assert nvt.ops.get_embedding_sizes(processor)["Authors"][0] == 10
Example #24
def test_empty_cols(tmpdir, df, dataset, engine, cat_names, cont_names,
                    label_name):
    # test out https://github.com/NVIDIA/NVTabular/issues/149 making sure we can iterate over
    # empty cats/conts
    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    if cont_names:
        processor.add_feature([ops.FillMedian()])
        processor.add_feature(ops.Normalize())
    if cat_names:
        processor.add_feature(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_format=None,
    )
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    data_itr = torch_dataloader.TorchAsyncItr(nvt.Dataset(df_out),
                                              cats=cat_names,
                                              conts=cont_names,
                                              labels=label_name,
                                              batch_size=1)

    for nvt_batch in data_itr:
        cats, conts, labels = nvt_batch
        if cat_names:
            assert cats.shape[-1] == len(cat_names)
        if cont_names:
            assert conts.shape[-1] == len(cont_names)
        if label_name:
            assert labels.shape[-1] == len(label_name)
Example #25
def test_joingroupby_multi(tmpdir, groups):

    df = pd.DataFrame({
        "Author": ["User_A", "User_A", "User_A", "User_B"],
        "Engaging-User": ["User_B", "User_B", "User_C", "User_C"],
        "Cost": [100.0, 200.0, 300.0, 400.0],
        "Post": [1, 2, 3, 4],
    })

    cat_names = ["Author", "Engaging-User"]
    cont_names = ["Cost"]
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)

    processor.add_preprocess(
        ops.JoinGroupby(columns=groups,
                        out_path=str(tmpdir),
                        stats=["sum"],
                        cont_names=["Cost"]))
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    if isinstance(groups, list):
        # Join on ["Author", "Engaging-User"]
        assert df_out["Author_Engaging-User_Cost_sum"].to_arrow().to_pylist(
        ) == [
            300.0,
            300.0,
            300.0,
            400.0,
        ]
    else:
        # Join on ["Author"]
        assert df_out["Author_Cost_sum"].to_arrow().to_pylist() == [
            600.0, 600.0, 600.0, 400.0
        ]
Example #26
def test_chaining_3():
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    proc = nvt.Workflow(cat_names=["ad_id", "source_id", "platform"],
                        cont_names=[],
                        label_name=["clicked"])
    # apply dropna
    proc.add_feature([
        nvt.ops.Dropna(["platform"]),
        nvt.ops.JoinGroupby(columns=["ad_id"],
                            cont_names=["clicked"],
                            stats=["sum", "count"]),
        nvt.ops.LambdaOp(
            op_name="ctr",
            f=lambda col, gdf: col / gdf["ad_id_count"],
            columns=["ad_id_clicked_sum"],
            replace=False,
        ),
    ])

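    # The LambdaOp reads the JoinGroupby outputs, so finalize() splits the workflow into two phases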
    proc.finalize()
    assert len(proc.phases) == 2
    GPU_MEMORY_FRAC = 0.2
    train_dataset = nvt.Dataset(gdf_test,
                                engine="parquet",
                                part_mem_fraction=GPU_MEMORY_FRAC)
    proc.apply(train_dataset,
               apply_offline=True,
               record_stats=True,
               output_path=None,
               shuffle=False)
    result = proc.get_ddf().compute()
    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"])
Example #27
def test_groupby_model(tmpdir, output_model):
    size = 20
    df = _make_df({
        "id": np.random.choice([0, 1], size=size),
        "ts": np.linspace(0.0, 10.0, num=size),
        "x": np.arange(size),
        "y": np.linspace(0.0, 10.0, num=size),
    })

    groupby_features = ColumnSelector(["id", "ts", "x", "y"]) >> ops.Groupby(
        groupby_cols=["id"],
        sort_cols=["ts"],
        aggs={
            "x": ["sum"],
            "y": ["first"],
        },
        name_sep="-",
    )
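    # Aggregated outputs are named <col><name_sep><agg>, e.g. 'x-sum' and 'y-first'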
    workflow = nvt.Workflow(groupby_features)

    if output_model == "pytorch":
        model_info = {
            "x-sum": {
                "columns": ["x-sum"],
                "dtype": "int64"
            },
            "y-first": {
                "columns": ["y-first"],
                "dtype": "float64"
            },
            "id": {
                "columns": ["id"],
                "dtype": "int64"
            },
        }
    else:
        model_info = None

    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "groupby",
                                     output_model, model_info)
Example #28
def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    encoder = ops.CategoryStatistics(columns=op_columns)
    config = nvt.workflow.get_new_config()
    config["PP"]["categorical"] = [encoder]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )
    processor.update_stats(dataset)

    if engine == "parquet" and not op_columns:
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        assert cats0.tolist() == [None] + cats_expected0.tolist()

    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    assert cats1.tolist() == [None] + cats_expected1.tolist()
Example #29
def test_median(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.Median(columns=op_columns)]

    processor = nvt.Workflow(
        cat_names=cat_names, cont_names=cont_names, label_name=label_name, config=config
    )

    processor.update_stats(dataset)

    # Check median (TODO: Improve the accuracy)
    x_median = df.x.dropna().quantile(0.5, interpolation="linear")
    assert math.isclose(x_median, processor.stats["medians"]["x"], rel_tol=1e1)
    if not op_columns:
        y_median = df.y.dropna().quantile(0.5, interpolation="linear")
        id_median = df.id.dropna().quantile(0.5, interpolation="linear")
        assert math.isclose(y_median, processor.stats["medians"]["y"], rel_tol=1e1)
        assert math.isclose(id_median, processor.stats["medians"]["id"], rel_tol=1e1)
Example #30
def test_numeric_dtypes(tmpdir, output_model):
    if output_model == "pytorch":
        model_info = dict()
    else:
        model_info = None

    dtypes = []
    for width in [8, 16, 32, 64]:
        dtype = f"int{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

        dtype = f"uint{width}"
        dtypes.append((dtype, np.iinfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    for width in [32, 64]:
        dtype = f"float{width}"
        dtypes.append((dtype, np.finfo(dtype)))
        if output_model == "pytorch":
            model_info[dtype] = {"columns": [dtype], "dtype": dtype}

    def check_dtypes(col):
        assert str(col.dtype) == col.name
        return col

    # simple transform to make sure we can round-trip the min/max values for each dtype,
    # through triton, with the 'transform' here just checking that the dtypes are correct
    df = _make_df({
        dtype: np.array([limits.max, 0, limits.min], dtype=dtype)
        for dtype, limits in dtypes
    })
    features = nvt.ColumnSelector(df.columns) >> check_dtypes
    workflow = nvt.Workflow(features)
    _verify_workflow_on_tritonserver(tmpdir, workflow, df,
                                     "test_numeric_dtypes", output_model,
                                     model_info)