def test_concatenate_dataframe(tmpdir, output_model):
    """Regression test for columns being dropped from a workflow's output.

    The problem was first seen in the rossmann workflow, see
    https://github.com/NVIDIA/NVTabular/issues/961
    """
    frame = _make_df(
        {
            "cat": ["aaaa", "bbbb", "cccc", "aaaa", "bbbb", "aaaa"],
            "cont": [0.0, 1.0, 2.0, 3.0, 4.0, 5],
        }
    )

    # The bug only showed up with a dataframe representation; routing the
    # categorical column through a lambda forces that representation.
    hashed_cats = ["cat"] >> ops.LambdaOp(lambda col: _hash_series(col) % 1000)
    scaled_conts = (
        ["cont"] >> ops.Normalize() >> ops.FillMissing() >> ops.LogOp()
    )

    source = Dataset(frame)
    graph = hashed_cats + scaled_conts
    workflow = nvt.Workflow(graph).fit_schema(source.infer_schema())

    # Only the pytorch output model is given explicit column/dtype info;
    # every other output model is passed None.
    model_info = None
    if output_model == "pytorch":
        model_info = {
            "cat": {"columns": ["cat"], "dtype": "int32"},
            "cont": {"columns": ["cont"], "dtype": "float32"},
        }

    _verify_workflow_on_tritonserver(
        tmpdir,
        workflow,
        frame,
        "test_concatenate_dataframe",
        output_model,
        model_info,
    )
# Example no. 2
# 0
def test_nested_workflow_node():
    """Check 'combo' Categorify over a nested column group, and its depth limit.

    Builds a workflow that categorifies a derived country column, the user
    column, and the (country, user) cross, then verifies that equal inputs map
    to equal codes. Finally asserts that nesting column groups one level
    deeper raises ``ValueError``.
    """
    df = dispatch._make_df(
        {
            "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
            "user": ["User_A", "User_A", "User_A", "User_B"],
        }
    )
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    # The first two characters of "geo" are the country code.
    country = (
        geo_selector
        >> LambdaOp(lambda col: col.str.slice(0, 2))
        >> Rename(postfix="_country")
    )
    # Must name the real "user" column of the input frame: the assertions
    # below read the "user" and "geo_country_user" output columns.
    user = "user"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = (
        workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")
    )

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user_codes = df_out["user"]
    assert user_codes[0] == user_codes[1] == user_codes[2]
    assert user_codes[3] != user_codes[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"] >> Categorify(
            encode_type="combo"
        )