Example #1
def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    if on_ddf:
        dataset = dataset.to_ddf()

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    data_loader = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
    )

    # Materialize the buffered iterator chunks into DataFrames
    df1 = _concat(list(data_loader._buff.itr))
    df2 = _concat(list(data_loader.epochs(epochs)._buff.itr))

    # Check that the DataFrame sizes and rows make sense
    assert len(df2) == epochs * len(df1)
    assert_eq(
        _concat([df1 for i in range(epochs)]).reset_index(drop=True),
        df2.reset_index(drop=True),
    )
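These examples are excerpted from a test suite, so their imports and pytest fixtures are not shown. A minimal sketch of the imports the snippets assume is given below; the exact module paths follow the NVTabular release these tests target and may differ in newer versions, and assert_eq is assumed to be a DataFrame comparison helper such as the one shipped with dask.

import math
import os
import shutil
import string
from io import BytesIO

import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq  # assumed comparison helper

import nvtabular as nvt
from nvtabular import ColumnSelector, Dataset, Workflow, dispatch, ops

# GPU-only dependencies used by some of the examples
try:
    import cudf
    import dask_cudf
except ImportError:  # CPU-only environments
    cudf = dask_cudf = None

# mycols_csv / mycols_pq are column-name lists defined in the suite's conftest (not shown here);
# the Triton examples additionally rely on nvtabular.inference.triton and tritonclient.grpc.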
Example #2
def test_workflow_node_select():
    df = dispatch._make_df({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
Example #3
def test_gpu_file_iterator_ds(df, dataset, batch, engine):
    df_itr = nvt.dispatch._make_df({})
    for data_gd in dataset.to_iter(columns=mycols_csv):
        df_itr = nvt.dispatch._concat(
            [df_itr, data_gd], axis=0) if df_itr is not None else data_gd

    assert_eq(df_itr.reset_index(drop=True), df.reset_index(drop=True))
Example #4
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo": raw})

    geo_location = ColumnSelector(["geo"])
    state = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 5)) >>
             ops.Rename(postfix="_state"))
    country = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 2)) >>
               ops.Rename(postfix="_country"))
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
Example #5
def test_log(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns, cpu):
    cont_features = op_columns >> nvt.ops.LogOp()
    processor = nvt.Workflow(cont_features)
    processor.fit(dataset)
    new_df = processor.transform(dataset).to_ddf().compute()
    for col in op_columns:
        values = dispatch._array(new_df[col])
        original = dispatch._array(df[col])
        assert_eq(values, np.log(original.astype(np.float32) + 1))
Example #6
def test_logop_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [[np.exp(0) - 1, np.exp(1) - 1], [np.exp(2) - 1], []]

    features = ["vals"] >> nvt.ops.LogOp()
    workflow = nvt.Workflow(features)
    new_df = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = dispatch._make_df(device="cpu" if cpu else "gpu")
    expected["vals"] = [[0.0, 1.0], [2.0], []]

    assert_eq(expected, new_df)
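The input lists above are built from exp(k) - 1 so that a log(1 + x) transform maps them back to whole numbers, which is exactly what the expected frame encodes (Example #5 asserts the same np.log(x + 1) behaviour). A standalone NumPy check of that identity:

import numpy as np

vals = np.array([np.exp(0) - 1, np.exp(1) - 1, np.exp(2) - 1])
# log1p(exp(k) - 1) == k, so a log(1 + x) transform recovers 0, 1, 2
print(np.log1p(vals))  # [0. 1. 2.]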
Example #7
def _verify_workflow_on_tritonserver(
    tmpdir,
    workflow,
    df,
    model_name,
    output_model="tensorflow",
    model_info=None,
    sparse_max=None,
):
    """tests that the nvtabular workflow produces the same results when run locally in the
    process, and when run in tritonserver"""
    # fit the workflow and test on the input
    dataset = nvt.Dataset(df)
    workflow.fit(dataset)

    local_df = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    for col in workflow.output_node.output_columns.names:
        if sparse_max and col in sparse_max.keys():
            workflow.output_dtypes[col] = workflow.output_dtypes.get(
                col).element_type

    triton.generate_nvtabular_model(
        workflow=workflow,
        name=model_name,
        output_path=tmpdir + f"/{model_name}",
        version=1,
        output_model=output_model,
        output_info=model_info,
        sparse_max=sparse_max,
        backend=BACKEND,
    )

    inputs = triton.convert_df_to_triton_input(df.columns, df)
    outputs = [
        grpcclient.InferRequestedOutput(col)
        for col in workflow.output_dtypes.keys()
    ]
    with run_triton_server(tmpdir) as client:
        response = client.infer(model_name, inputs, outputs=outputs)

        for col in workflow.output_dtypes.keys():
            features = response.as_numpy(col)
            if sparse_max and col in sparse_max:
                features = features.tolist()
                triton_df = _make_df()
                triton_df[col] = features
            else:
                triton_df = _make_df(
                    {col: features.reshape(features.shape[0])})
            assert_eq(triton_df, local_df[[col]])
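A hypothetical call site for the helper above, relying on the default tensorflow output model; the workflow and column names here are illustrative and not taken from the original test suite:

def test_example_workflow_on_triton(tmpdir, df):
    # build a small workflow and verify it round-trips through Triton
    workflow = nvt.Workflow(["x", "y"] >> nvt.ops.FillMissing() >> nvt.ops.Normalize())
    _verify_workflow_on_tritonserver(tmpdir, workflow, df, "example_workflow")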
Example #8
def test_generate_triton_model(tmpdir, engine, output_model, df):
    tmpdir = "./tmp"
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    if output_model == "pytorch":
        model_info = {
            "name-cat": {
                "columns": ["name-cat"],
                "dtype": "int64"
            },
            "name-string": {
                "columns": ["name-string"],
                "dtype": "int64"
            },
            "id": {
                "columns": ["id"],
                "dtype": "float32"
            },
            "x": {
                "columns": ["x"],
                "dtype": "float32"
            },
            "y": {
                "columns": ["y"],
                "dtype": "float32"
            },
        }
    else:
        model_info = None

    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(
        workflow=workflow,
        name="model",
        output_path=repo,
        version=1,
        output_model=output_model,
        output_info=model_info,
    )
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
    assert_eq(expected, transformed)
Example #9
def test_rename(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame
    df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [6, 7, 8, 9, 10]})

    selector = ColumnSelector(["x", "y"])

    op = ops.Rename(f=lambda name: name.upper())
    transformed = op.transform(selector, df)
    expected = DataFrame({"X": [1, 2, 3, 4, 5], "Y": [6, 7, 8, 9, 10]})
    assert_eq(transformed, expected)

    op = ops.Rename(postfix="_lower")
    transformed = op.transform(selector, df)
    expected = DataFrame({"x_lower": [1, 2, 3, 4, 5], "y_lower": [6, 7, 8, 9, 10]})
    assert_eq(transformed, expected)

    selector = ColumnSelector(["x"])

    op = ops.Rename(name="z")
    transformed = op.transform(selector, df)
    expected = DataFrame({"z": [1, 2, 3, 4, 5]})
    assert_eq(transformed, expected)

    op = nvt.ops.Rename(f=lambda name: name.upper())
    transformed = op.transform(selector, df)
    expected = DataFrame({"X": [1, 2, 3, 4, 5]})
    assert_eq(transformed, expected)
Example #10
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df):

    # Copy files to mock s3 bucket
    files = {}
    for i, path in enumerate(paths):
        with open(path, "rb") as f:
            fbytes = f.read()
        fn = path.split(os.path.sep)[-1]
        files[fn] = BytesIO()
        files[fn].write(fbytes)
        files[fn].seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        fn = "_metadata"
        files[fn] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[fn])
        files[fn].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files):

        # Create nvt.Dataset from mock s3 paths
        url = f"s3://{engine}" if engine == "parquet" else f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        gdf = cudf.concat(list(dataset.to_iter()))[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"
                     ] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        conts = cont_names >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()
        cats = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(conts + cats + label_name)
        processor.fit(dataset)
Example #11
def test_target_encode_multi(tmpdir, npartitions, cpu):
    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = dispatch._make_df({
        "cat": cat_1,
        "cat2": cat_2,
        "num": num_1,
        "num_2": num_2
    })
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(),
            npartitions=npartitions)
    else:
        df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(["num", "num_2"],
                                                   out_path=str(tmpdir),
                                                   kfold=1,
                                                   p_smooth=5,
                                                   out_dtype="float32")

    workflow = nvt.Workflow(te_features)

    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values,
              df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0],
                        num_1.mean(),
                        abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0],
                        num_2.mean(),
                        abs_tol=1e-3)
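For reference, a common form of smoothed target encoding that is consistent with the assertions above; this is an assumption about what ops.TargetEncoding does with p_smooth, so treat it as a sketch rather than the library's exact formula. Because the "cat" column holds a single value for every row, the group statistics equal the global statistics and the encoding collapses to the column mean, which is what the math.isclose checks verify.

import pandas as pd

def smoothed_target_encode(df, cat_cols, target, p_smooth=5):
    # (group_sum + p_smooth * global_mean) / (group_count + p_smooth), computed per row
    global_mean = df[target].mean()
    grp = df.groupby(cat_cols)[target]
    return (grp.transform("sum") + p_smooth * global_mean) / (grp.transform("count") + p_smooth)

demo = pd.DataFrame({"cat": ["baaaa"] * 12,
                     "num": [1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]})
print(smoothed_target_encode(demo, ["cat"], "num").iloc[0])  # equals demo["num"].mean()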
Example #12
def test_list_slice_pad(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame
    df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})

    # 0 pad to 5 elements
    op = ops.ListSlice(5, pad=True)
    selector = ColumnSelector(["y"])
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3, 0], [1, 223, 4, 0, 0]]})
    assert_eq(transformed, expected)

    # make sure we can also pad when start != 0, and when pad_value is set
    op = ops.ListSlice(1, 6, pad=True, pad_value=123)
    selector = ColumnSelector(["y"])
    transformed = op.transform(selector, df)
    expected = DataFrame({"y": [[1, 2, 2, 767, 123], [2, 2, 3, 123, 123], [223, 4, 123, 123, 123]]})
    assert_eq(transformed, expected)
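A standalone sketch of the slice-and-pad behaviour the assertions above expect (a hypothetical helper, not the library implementation): slice each row, then right-pad with pad_value up to the width of the requested slice.

def slice_and_pad(rows, start, end, pad_value=0):
    width = end - start
    return [row[start:end] + [pad_value] * (width - len(row[start:end])) for row in rows]

rows = [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]
assert slice_and_pad(rows, 0, 5) == [[0, 1, 2, 2, 767], [1, 2, 2, 3, 0], [1, 223, 4, 0, 0]]
assert slice_and_pad(rows, 1, 6, pad_value=123) == [
    [1, 2, 2, 767, 123], [2, 2, 3, 123, 123], [223, 4, 123, 123, 123]]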
Example #13
def test_fit_simple():
    data = cudf.DataFrame({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    assert_eq(expected, transformed)
Example #14
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = dispatch._make_df({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        df = dd.from_pandas(
            df if isinstance(df, pd.DataFrame) else df.to_pandas(),
            npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features +
                            ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    df_lib = dispatch.get_lib()
    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]

        check = df_lib.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check, check_dtype=False)
Example #15
def test_generate_triton_model(tmpdir, engine, df):
    tmpdir = "./tmp"
    conts = ["x", "y", "id"] >> ops.FillMissing() >> ops.Normalize()
    cats = ["name-cat", "name-string"] >> ops.Categorify(cat_cache="host")
    workflow = nvt.Workflow(conts + cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
Example #16
def test_fit_simple():
    data = nvt.dispatch._make_df({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)
    assert_eq(expected, transformed)
Example #17
def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [
        [0.0, 1.0, 2.0],
        [
            3.0,
            4.0,
        ],
        [5.0],
    ]

    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))
Example #18
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass-through cols, so they should keep their original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # A follow-up dask-cudf sort used to throw an exception because of dtype issues;
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
Example #19
def test_column_group_select():
    df = cudf.DataFrame({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })

    input_features = ColumnGroup(["a", "b", "c"])
    sqrt_features = input_features[["a", "c"]] >> cudf.sqrt
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    df_out = workflow.fit_transform(
        Dataset(df)).to_ddf().compute(scheduler="synchronous")

    expected = cudf.DataFrame()
    expected["a"] = cudf.sqrt(df["a"])
    expected["c"] = cudf.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
Example #20
def test_convert_format(_from, _to):
    convert_format = data_conversions.convert_format

    # we want to test conversion from '_from' to '_to', which requires round-tripping
    # from a known format; pd -> _from -> _to -> pandas is chosen somewhat arbitrarily
    df = pd.DataFrame({
        "float": [0.0, 1.0, 2.0],
        "int": [10, 11, 12],
        "multihot": [[0, 1, 2, 3], [3, 4], [5]]
    })

    if _from != Supports.GPU_DICT_ARRAY and _to != Supports.GPU_DICT_ARRAY:
        df["string"] = ["aa", "bb", "cc"]
        df["multihot_string"] = [["aaaa", "bb", "cc"], ["dd", "ee"],
                                 ["fffffff"]]

    start, kind = convert_format(df, Supports.CPU_DATAFRAME, _from)
    assert kind == _from
    mid, kind = convert_format(start, kind, _to)
    assert kind == _to
    final, kind = convert_format(mid, kind, Supports.CPU_DATAFRAME)
    assert kind == Supports.CPU_DATAFRAME
    assert_eq(df, final)
Example #21
def test_generate_triton_multihot(tmpdir):
    df = _make_df({
        "userId": ["a", "a", "b"],
        "movieId": ["1", "2", "2"],
        "genres": [["action", "adventure"], ["action", "comedy"], ["comedy"]],
    })

    cats = ["userId", "movieId", "genres"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(cats)
    workflow.fit(nvt.Dataset(df))
    expected = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    # save workflow to triton / verify we see some expected output
    repo = os.path.join(tmpdir, "models")
    triton.generate_nvtabular_model(workflow, "model", repo)
    workflow = None

    assert os.path.exists(os.path.join(repo, "config.pbtxt"))

    workflow = nvt.Workflow.load(os.path.join(repo, "1", "workflow"))
    transformed = workflow.transform(nvt.Dataset(df)).to_ddf().compute()

    assert_eq(expected, transformed)
Example #22
def _verify_workflow_on_tritonserver(tmpdir, workflow, df, model_name):
    """tests that the nvtabular workflow produces the same results when run locally in the
    process, and when run in tritonserver"""
    # fit the workflow and test on the input
    dataset = nvt.Dataset(df)
    workflow.fit(dataset)

    local_df = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")
    triton.generate_nvtabular_model(workflow,
                                    model_name,
                                    tmpdir + f"/{model_name}",
                                    backend=BACKEND)

    inputs = triton.convert_df_to_triton_input(df.columns, df)
    with run_triton_server(tmpdir) as client:
        response = client.infer(model_name, inputs)

        for col in df.columns:
            features = response.as_numpy(col)
            triton_df = cudf.DataFrame(
                {col: features.reshape(features.shape[0])})
            assert_eq(triton_df, local_df[[col]])
Example #23
def test_transform_geolocation():
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = nvt.dispatch._make_df({"geo_location": raw})

    geo_location = ColumnSelector(["geo_location"])
    state = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 5)) >>
             ops.Rename(postfix="_state"))
    country = (geo_location >> ops.LambdaOp(lambda col: col.str.slice(0, 2)) >>
               ops.Rename(postfix="_country"))
    geo_features = state + country + geo_location >> ops.HashBucket(
        num_buckets=100)

    # this workflow has no stat operators, so it can be transformed without fitting
    workflow = Workflow(geo_features)
    transformed = workflow.transform(Dataset(data)).to_ddf().compute()

    expected = nvt.dispatch._make_df()
    expected["geo_location_state"] = data["geo_location"].str.slice(
        0, 5).hash_values() % 100
    expected["geo_location_country"] = data["geo_location"].str.slice(
        0, 2).hash_values() % 100
    expected["geo_location"] = data["geo_location"].hash_values() % 100
    assert_eq(expected, transformed)
Example #24
def test_list_slice(cpu):
    DataFrame = pd.DataFrame if cpu else cudf.DataFrame

    df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})

    op = ops.ListSlice(0, 2)
    transformed = op.transform(["y"], df)
    expected = DataFrame({"y": [[0, 1], [1, 2], [1, 223]]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(3, 5)
    transformed = op.transform(["y"], df)
    expected = DataFrame({"y": [[2, 767], [3], []]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(4, 10)
    transformed = op.transform(["y"], df)
    expected = DataFrame({"y": [[767], [], []]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(100, 20000)
    transformed = op.transform(["y"], df)
    expected = DataFrame({"y": [[], [], []]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(-4)
    transformed = op.transform(["y"], df)
    expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
    assert_eq(transformed, expected)

    op = ops.ListSlice(-3, -1)
    transformed = op.transform(["y"], df)
    expected = DataFrame({"y": [[2, 2], [2, 2], [1, 223]]})
    assert_eq(transformed, expected)
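The expected frames above follow plain Python slicing applied row by row, with a single negative argument meaning "keep that many trailing elements". A minimal standalone check, using a hypothetical helper rather than the library op:

def list_slice(rows, start, end=None):
    # mirrors row[start:end] for each row; end=None means "to the end of the row"
    return [row[start:end] for row in rows]

rows = [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]
assert list_slice(rows, 0, 2) == [[0, 1], [1, 2], [1, 223]]
assert list_slice(rows, 3, 5) == [[2, 767], [3], []]
assert list_slice(rows, -4) == [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]  # like ops.ListSlice(-4)
assert list_slice(rows, -3, -1) == [[2, 2], [2, 2], [1, 223]]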
Example #25
def test_s3_dataset(s3_base, s3so, paths, datasets, engine, df,
                    patch_aiobotocore):
    # Copy files to mock s3 bucket
    files = {}
    for i, path in enumerate(paths):
        with open(path, "rb") as f:
            fbytes = f.read()
        fn = path.split(os.path.sep)[-1]
        files[fn] = BytesIO()
        files[fn].write(fbytes)
        files[fn].seek(0)

    if engine == "parquet":
        # Workaround for nvt#539. In order to avoid the
        # bug in Dask's `create_metadata_file`, we need
        # to manually generate a "_metadata" file here.
        # This can be removed after dask#7295 is merged
        # (see https://github.com/dask/dask/pull/7295)
        fn = "_metadata"
        files[fn] = BytesIO()
        meta = create_metadata_file(
            paths,
            engine="pyarrow",
            out_dir=False,
        )
        meta.write_metadata_file(files[fn])
        files[fn].seek(0)

    with s3_context(s3_base=s3_base, bucket=engine, files=files) as s3fs:
        # Create nvt.Dataset from mock s3 paths
        url = f"s3://{engine}" if engine == "parquet" else f"s3://{engine}/*"
        dataset = nvt.Dataset(url, engine=engine, storage_options=s3so)

        # Check that the iteration API works
        columns = mycols_pq if engine == "parquet" else mycols_csv
        gdf = nvt.dispatch._concat(list(dataset.to_iter()))[columns]
        assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

        cat_names = ["name-cat", "name-string"
                     ] if engine == "parquet" else ["name-string"]
        cont_names = ["x", "y", "id"]
        label_name = ["label"]

        conts = cont_names >> ops.FillMissing() >> ops.Clip(
            min_value=0) >> ops.LogOp()
        cats = cat_names >> ops.Categorify(cat_cache="host")

        processor = nvt.Workflow(conts + cats + label_name)
        processor.fit(dataset)

        # make sure we can write out the dataset back to S3
        # (https://github.com/NVIDIA-Merlin/NVTabular/issues/1214)
        processor.transform(dataset).to_parquet(f"s3://{engine}/output")
        expected = processor.transform(dataset).to_ddf().compute()

        # make sure we can write out the workflow to s3
        processor.save(f"s3://{engine}/saved_workflow/")

        # make sure the workflow got saved to the right spot in S3
        workflow_files = s3fs.ls(f"/{engine}/saved_workflow/")
        assert workflow_files

        # finally make sure we can read in the workflow from S3, and use it
        # to transform values and get the same result as on the local fs
        reloaded = nvt.Workflow.load(f"s3://{engine}/saved_workflow/")
        from_s3 = reloaded.transform(dataset).to_ddf().compute()
        assert_eq(expected, from_s3)
Example #26
def test_gpu_dataset_iterator_csv(df, dataset, engine):
    df_itr = nvt.dispatch._concat(list(dataset.to_iter(columns=mycols_csv)),
                                  axis=0)
    assert_eq(df_itr.reset_index(drop=True), df.reset_index(drop=True))
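Many of these tests receive df, dataset and engine as pytest fixtures defined in the suite's conftest, which this page does not show. A deliberately simplified, hypothetical version of such fixtures is sketched below so the snippets can be read in context; the column names mirror the ones the tests reference, but none of this is the project's actual conftest.

import pandas as pd
import pytest

import nvtabular as nvt

@pytest.fixture(params=["csv", "parquet"])
def engine(request):
    return request.param

@pytest.fixture
def df():
    # a tiny synthetic frame with the column names used throughout the examples
    return pd.DataFrame({
        "name-cat": ["apple", "pear", "apple"],
        "name-string": ["aa", "bb", "cc"],
        "x": [0.1, 0.2, 0.3],
        "y": [1.0, 2.0, 3.0],
        "id": [970, 971, 972],
        "label": [0, 1, 0],
    })

@pytest.fixture
def dataset(df):
    return nvt.Dataset(df)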