Exemplo n.º 1
0
def test_normalize_minmax(tmpdir, df, dataset, gpu_memory_frac, engine,
                          op_columns):
    """Check ``ops.NormalizeMinMax`` against a hand-computed min-max scaling.

    ``ops.MinMax`` statistics are gathered over ``dataset`` through a
    workflow, ``NormalizeMinMax`` is applied to ``df`` using those
    statistics, and column ``"x"`` of the result is compared with
    ``(x - min) / (max - min)`` computed from the same statistics.

    The arguments are presumably pytest fixtures/parametrizations defined
    outside this snippet; ``tmpdir``, ``gpu_memory_frac`` and ``op_columns``
    are accepted but not used in the body.
    """
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_name = ["label"]

    # Refer to the package as ``nvtabular`` throughout, matching the
    # ``nvtabular.Workflow`` call below, so only one module name is needed.
    config = nvtabular.workflow.get_new_config()
    config["PP"]["continuous"] = [ops.MinMax()]

    processor = nvtabular.Workflow(cat_names=cat_names,
                                   cont_names=cont_names,
                                   label_name=label_name,
                                   config=config)

    # Populate processor.stats (read below under "mins"/"maxs").
    processor.update_stats(dataset)

    op = ops.NormalizeMinMax()

    # Column context handed to the op: continuous columns under "base".
    columns_ctx = {"continuous": {"base": cont_names}}

    new_gdf = op.apply_op(df,
                          columns_ctx,
                          "continuous",
                          stats_context=processor.stats)

    # Reference result computed by hand from the collected statistics.
    # NOTE(review): if apply_op modifies ``df`` in place, this comparison is
    # not independent of the op's output -- confirm against the op.
    x_min = processor.stats["mins"]["x"]
    x_max = processor.stats["maxs"]["x"]
    df["x"] = (df["x"] - x_min) / (x_max - x_min)
    assert new_gdf["x"].equals(df["x"])
Exemplo n.º 2
0
def test_minmax(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                op_columns):
    """Compare ``ops.MinMax`` statistics with minima/maxima taken from ``df``.

    Column ``"x"`` is always checked, within a relative tolerance of 1e-2.
    When ``op_columns`` is falsy, ``"y"`` and ``"name-string"`` are also
    checked, with exact equality.
    """
    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]

    config = nvtabular.workflow.get_new_config()
    config["PP"]["all"] = [ops.MinMax(columns=op_columns)]

    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=["x", "y"],
        label_name=["label"],
        config=config,
    )
    processor.update_stats(dataset)

    # "x" is checked regardless of which columns the op was restricted to.
    smallest_x = df["x"].min()
    assert smallest_x == pytest.approx(processor.stats["mins"]["x"],
                                       rel=1e-2)
    largest_x = df["x"].max()
    assert largest_x == pytest.approx(processor.stats["maxs"]["x"],
                                      rel=1e-2)

    if op_columns:
        return

    # The remaining columns are only verified when op_columns is falsy.
    names = df["name-string"].tolist()
    first_name = min(names)
    last_name = max(names)
    assert first_name == processor.stats["mins"]["name-string"]

    largest_y = df["y"].max()
    smallest_y = df["y"].min()
    assert largest_y == processor.stats["maxs"]["y"]
    assert last_name == processor.stats["maxs"]["name-string"]
    assert smallest_y == processor.stats["mins"]["y"]
Exemplo n.º 3
0
def test_minmax(tmpdir, datasets, gpu_memory_frac, engine, op_columns):
    """Compare ``ops.MinMax`` statistics with values computed via cudf.

    The first two files matching the engine's extension are read and
    concatenated into a reference frame. The statistics themselves are
    gathered by a workflow fed from a ``GPUDatasetIterator`` over all
    matching paths. Column ``"x"`` is checked within a relative tolerance
    of 1e-2; ``"y"`` and ``"name-string"`` are checked exactly when
    ``op_columns`` is falsy.

    Returns ``processor.ds_exports``.
    """
    extension = engine.split("-")[0]
    paths = glob.glob(str(datasets[engine]) + "/*." + extension)

    # Engine-specific reader, column selection and categorical names.
    if engine == "parquet":
        columns = mycols_pq
        cat_names = ["name-cat", "name-string"]
        frames = [cudf.read_parquet(paths[i])[mycols_pq] for i in (0, 1)]
    else:
        columns = mycols_csv
        cat_names = ["name-string"]
        frames = [
            cudf.read_csv(paths[i], header=False,
                          names=allcols_csv)[mycols_csv] for i in (0, 1)
        ]

    df = cudf.concat(frames, axis=0)
    df["id"] = df["id"].astype("int64")

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    config = nvtabular.workflow.get_new_config()
    config["PP"]["all"] = [ops.MinMax(columns=op_columns)]

    processor = nvtabular.Workflow(
        cat_names=cat_names,
        cont_names=["x", "y", "id"],
        label_name=["label"],
        config=config,
        to_cpu=False,
    )
    processor.update_stats(data_itr)

    # "x" is checked regardless of which columns the op was restricted to.
    smallest_x = df["x"].min()
    assert smallest_x == pytest.approx(processor.stats["mins"]["x"],
                                       rel=1e-2)
    largest_x = df["x"].max()
    assert largest_x == pytest.approx(processor.stats["maxs"]["x"],
                                      rel=1e-2)

    if not op_columns:
        names = df["name-string"].tolist()
        first_name = min(names)
        last_name = max(names)
        assert first_name == processor.stats["mins"]["name-string"]

        largest_y = df["y"].max()
        smallest_y = df["y"].min()
        assert largest_y == processor.stats["maxs"]["y"]
        assert last_name == processor.stats["maxs"]["name-string"]
        assert smallest_y == processor.stats["mins"]["y"]

    return processor.ds_exports
Exemplo n.º 4
0
 def req_stats(self):
     """Return a one-element list holding a new ``ops.MinMax`` instance."""
     required = [ops.MinMax()]
     return required