Example No. 1
def test_chaining_2():
    gdf = cudf.DataFrame({
        "A": [1, 2, 2, 9, 6, np.nan, 3],
        "B": [2, np.nan, 4, 7, 7, 2, 5],
        "C": ["a", "b", "c", np.nan, np.nan, "g", "k"],
    })

    cat_names = ["C"]
    cont_names = ["A", "B"]
    label_name = []

    all_features = (cat_names + cont_names >> ops.LambdaOp(
        f=lambda col: col.isnull()) >> ops.Rename(postfix="_isnull"))
    cat_features = cat_names >> ops.Categorify()

    workflow = Workflow(all_features + cat_features + label_name)

    dataset = nvt.Dataset(gdf, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(x in list(result.columns)
               for x in ["A_isnull", "B_isnull", "C_isnull"])
    # note: this wraps a generator expression, so the assert is always truthy;
    # it does not actually compare the original "C" categories with the encoded output
    assert (x in result["C"].unique()
            for x in set(gdf["C"].dropna().to_arrow()))
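
A note on the pattern used throughout these examples: a plain list of column names chained through `>>` produces a graph node, further `>>` calls append operators, and `+` merges branches into a single output node that `Workflow` consumes; `fit` collects whatever statistics the operators need and `transform` applies them lazily, returning a dask collection. Below is a minimal, self-contained sketch of Example 1's structure. It is a hypothetical illustration assuming a recent NVTabular build with CPU (pandas) support; with cuDF installed, a `cudf.DataFrame` works the same way.

import pandas as pd
import nvtabular as nvt
from nvtabular import ops

# Hypothetical input frame with missing values in both columns.
df = pd.DataFrame({"A": [1.0, None, 3.0], "C": ["a", None, "c"]})

# One branch flags nulls and renames the outputs, the other encodes "C".
flags = ["A", "C"] >> ops.LambdaOp(f=lambda col: col.isnull()) >> ops.Rename(postfix="_isnull")
cats = ["C"] >> ops.Categorify()

workflow = nvt.Workflow(flags + cats)
workflow.fit(nvt.Dataset(df))                         # collect stats (here: category mappings)
result = workflow.transform(nvt.Dataset(df)).to_ddf().compute()
print(sorted(result.columns))                         # ['A_isnull', 'C', 'C_isnull']
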
Example No. 2
def test_chaining_3():
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    platform_features = ["platform"] >> ops.Dropna()
    joined = ["ad_id"] >> ops.JoinGroupby(cont_cols=["clicked"],
                                          stats=["sum", "count"])
    joined_lambda = (
        joined >> ops.LambdaOp(f=lambda col, gdf: col / gdf["ad_id_count"]) >>
        ops.Rename(postfix="_ctr"))

    workflow = Workflow(platform_features + joined + joined_lambda)

    dataset = nvt.Dataset(gdf_test, engine="parquet")

    workflow.fit(dataset)

    result = workflow.transform(dataset).to_ddf().compute()

    assert all(
        x in result.columns
        for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"])
Example No. 3
def test_workflow_node_select():
    df = dispatch._make_df({
        "a": [1, 4, 9, 16, 25],
        "b": [0, 1, 2, 3, 4],
        "c": [25, 16, 9, 4, 1]
    })
    dataset = Dataset(df)

    input_features = WorkflowNode(ColumnSelector(["a", "b", "c"]))
    # pylint: disable=unnecessary-lambda
    sqrt_features = input_features[["a", "c"]] >> (lambda col: np.sqrt(col))
    plus_one_features = input_features["b"] >> (lambda col: col + 1)
    features = sqrt_features + plus_one_features

    workflow = Workflow(features)
    workflow.fit(dataset)

    df_out = workflow.transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    expected = dispatch._make_df()
    expected["a"] = np.sqrt(df["a"])
    expected["c"] = np.sqrt(df["c"])
    expected["b"] = df["b"] + 1

    assert_eq(expected, df_out)
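
Example 3 shows two node features reused in several later tests: indexing a `WorkflowNode` (`input_features[["a", "c"]]`, `input_features["b"]`) branches on a subset of its columns, and a bare callable after `>>` is treated as a lambda operator, so small transforms do not need an explicit `ops.LambdaOp`.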
Example No. 4
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    normalize = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> normalize
    workflow = Workflow(conts + cat_names + label_name, client=client)

    dataset = Dataset(paths, engine)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    for name in cont_names:
        assert math.isclose(means[name], normalize.means[name], rel_tol=1e-3)
        assert math.isclose(stds[name], normalize.stds[name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
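
Keeping a reference to a stateful operator, as Example 4 does with `normalize`, is the usual way to inspect fitted statistics afterwards. A minimal sketch of that pattern, assuming the same `means`/`stds` attributes on `ops.Normalize` and hypothetical input data:

import pandas as pd
import nvtabular as nvt
from nvtabular import ops

df = pd.DataFrame({"x": [1.0, 2.0, None, 4.0]})

normalize = ops.Normalize()                        # keep a handle on the op
conts = ["x"] >> ops.FillMissing() >> normalize

workflow = nvt.Workflow(conts)
workflow.fit(nvt.Dataset(df))

# statistics gathered during fit, keyed by column name
print(normalize.means["x"], normalize.stds["x"])
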
Example No. 5
def test_workflow_node_subtraction():
    schema = Schema(["a", "b", "c", "d", "e", "f"])

    node1 = ["a", "b", "c", "d"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["b"] >> Operator()

    output_node = node1 - ["c", "d"]
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 0
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = ["a", "b", "c", "d"] - node2
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a", "b"]

    output_node = node1 - ["c", "d"] - node3
    workflow = Workflow(output_node).fit_schema(schema)
    assert len(output_node.parents) == 1
    assert len(output_node.dependencies) == 1
    assert workflow.output_node.output_columns.names == ["a"]
Example No. 6
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum", "std", "min"], out_path=str(tmpdir)
    )

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name, client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Example No. 7
def test_fit_schema_works_with_raw_column_dependencies():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y"]) >> ops.TargetEncoding("cost")

    workflow = Workflow(cat_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == ["TE_x_cost", "TE_y_cost"]
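
`TargetEncoding` names its outputs `TE_<column>_<target>`, and passing the raw target column name ("cost") registers it as a dependency of the node, which is why `fit_schema` can resolve the output column names from the schema alone, without reading any data.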
Example No. 8
def test_dask_normalize(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert new_means[name] < 1e-3
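
Examples 8, 9, 10 and 20 appear to come from an older NVTabular release: the `Workflow(cat_names=..., cont_names=..., label_name=...)` constructor together with `add_preprocess`, `finalize`, `apply`, `get_ddf` and the `processor.stats[...]` dictionary belongs to the pre-operator-graph API, whereas Example 4 is the same normalization check rewritten in the newer `columns >> op` style used by most of the other examples.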
Example No. 9
def test_dask_median_dummyop(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "t-digest" with crick could help,
    #       but the current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1)
    assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1)
    assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2)
Example No. 10
def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3)
    assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3)
Example No. 11
def test_fit_schema_works_with_grouped_node_inputs():
    schema = Schema(["x", "y", "cost"])

    cat_features = ColumnSelector(["x", "y",
                                   ("x", "y")]) >> ops.TargetEncoding("cost")

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert sorted(workflow1.output_schema.column_names) == sorted(
        ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"])
Example No. 12
def test_addition_nodes_are_combined():
    schema = Schema(["a", "b", "c", "d", "e", "f", "g", "h"])

    node1 = ["a", "b"] >> Operator()
    node2 = ["c", "d"] >> Operator()
    node3 = ["e", "f"] >> Operator()
    node4 = ["g", "h"] >> Operator()

    add_node = node1 + node2 + node3
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3}
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "c", "d", "e", "f"
    }

    add_node = node1 + "c" + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(
        workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = "c" + node1 + "d"
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert set(
        workflow.output_node.output_columns.names) == {"a", "b", "c", "d"}

    add_node = node1 + "e" + node2
    workflow = Workflow(add_node).fit_schema(schema)
    assert set(workflow.output_node.parents) == {node1}
    assert node2 in workflow.output_node.dependencies
    assert set(workflow.output_node.output_columns.names) == {
        "a", "b", "e", "c", "d"
    }

    add_node1 = node1 + node2
    add_node2 = node3 + node4

    add_node = add_node1 + add_node2
    workflow = Workflow(add_node).fit_schema(schema)

    assert set(workflow.output_node.parents) == {node1}
    assert set(workflow.output_node.dependencies) == {node2, node3, node4}
    assert set(workflow.output_node.output_columns.names) == {
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
    }
Example No. 13
def test_fit_schema_works_when_subtracting_column_names():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(
        ["x", "y"]) >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp
                     >> ops.Normalize() >> ops.Rename(postfix="_renamed"))

    workflow1 = Workflow(cont_features - "y_renamed")
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == ["x_renamed"]
Example No. 14
def test_filtered_partition(tmpdir, cpu):
    # Toy DataFrame example
    df = pd.DataFrame({"col": range(100)})
    ddf = dd_from_pandas(df, npartitions=5)
    dataset = Dataset(ddf, cpu=cpu)

    # Workflow
    filtered = ["col"] >> ops.Filter(lambda df: df["col"] < 75)
    workflow = Workflow(filtered)

    # Write result to disk
    workflow.transform(dataset).to_parquet(str(tmpdir))
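
With the predicate `col < 75` and five 20-row partitions, the last partition in Example 14 ends up empty after filtering, so the test effectively exercises writing a dataset whose partitions shrink (or vanish entirely) to Parquet.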
Example No. 15
def test_workflow_select_by_tags(op):
    schema1 = ColumnSchema("col1", tags=["b", "c", "d"])
    schema2 = ColumnSchema("col2", tags=["c", "d"])
    schema3 = ColumnSchema("col3", tags=["d"])
    schema = Schema([schema1, schema2, schema3])

    cont_features = ColumnSelector(tags=["c"]) >> op
    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    output_cols = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(workflow.output_schema.column_names) == len(output_cols.names)
Example No. 16
def test_fit_schema_works_with_node_dependencies():
    schema = Schema(["x", "y", "cost"])

    cont_features = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    cat_features = ColumnSelector(["x", "y"
                                   ]) >> ops.TargetEncoding(cont_features)

    workflow1 = Workflow(cat_features)
    workflow1.fit_schema(schema)

    assert workflow1.output_schema.column_names == [
        "TE_x_cost_renamed", "TE_y_cost_renamed"
    ]
Example No. 17
def test_fit_schema():
    schema = Schema(["x", "y", "id"])

    cont_features = (ColumnSelector(schema.column_names) >> ops.FillMissing()
                     >> ops.Clip(min_value=0) >> ops.LogOp >> ops.Normalize()
                     >> ops.Rename(postfix="_renamed"))

    workflow = Workflow(cont_features)
    workflow.fit_schema(schema)

    assert workflow.output_schema.column_names == [
        "x_renamed", "y_renamed", "id_renamed"
    ]
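
`fit_schema`, used in several of the examples above, only propagates column metadata through the operator graph, so output column names can be asserted without running any data. A minimal sketch of the same schema-only pass, assuming the `Dataset.infer_schema` helper seen in Example 26 and hypothetical input data:

import pandas as pd
import nvtabular as nvt
from nvtabular import ops

df = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
dataset = nvt.Dataset(df)

conts = ["x", "y"] >> ops.FillMissing() >> ops.Normalize() >> ops.Rename(postfix="_renamed")
workflow = nvt.Workflow(conts)

# no statistics are computed here; only names and dtypes flow through the graph
workflow.fit_schema(dataset.infer_schema())
print(workflow.output_schema.column_names)   # ['x_renamed', 'y_renamed']
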
Example No. 18
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    features = cat_names >> ops.JoinGroupby(
        cont_cols=cont_names,
        stats=["count", "sum", "std", "min"],
        out_path=str(tmpdir))

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    workflow = Workflow(features + cat_names + cont_names + label_name,
                        client=client)
    result = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check results.  Need to sort for direct comparison
    expect = df0.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y",
                              "id"]).reset_index(drop=True).reset_index()
    gb_e = expect.groupby("name-cat").aggregate({
        "name-cat": "count",
        "x": ["sum", "min", "std"]
    })
    gb_e.columns = ["count", "sum", "min", "std"]
    df_check = got.merge(gb_e,
                         left_on="name-cat",
                         right_index=True,
                         how="left")
    assert_eq(df_check["name-cat_count"],
              df_check["count"].astype("int64"),
              check_names=False)
    assert_eq(df_check["name-cat_x_sum"], df_check["sum"], check_names=False)
    assert_eq(df_check["name-cat_x_min"], df_check["min"], check_names=False)
    assert_eq(df_check["name-cat_x_std"], df_check["std"], check_names=False)
Example No. 19
def test_workflow_input_output_dtypes():
    df = cudf.DataFrame({
        "genre": ["drama", "comedy"],
        "user": ["a", "b"],
        "unneeded": [1, 2]
    })
    features = [["genre", "user"], "genre"
                ] >> ops.Categorify(encode_type="combo")
    workflow = Workflow(features)
    workflow.fit(Dataset(df))

    assert "unneeded" not in workflow.input_dtypes
    assert set(workflow.input_dtypes.keys()) == {"genre", "user"}
    assert set(workflow.output_dtypes.keys()) == {"genre_user", "genre"}
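
In Example 19, `encode_type="combo"` makes `Categorify` treat the list `["genre", "user"]` as a single crossed feature, producing one `genre_user` output column while `genre` is also encoded on its own; `workflow.input_dtypes` and `output_dtypes` cover only the columns the graph actually consumes or produces, which is why `unneeded` appears in neither.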
Example No. 20
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
Example No. 21
def test_fit_simple():
    data = cudf.DataFrame({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = cudf.DataFrame({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    assert_eq(expected, transformed)
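
The expected frame in Example 21 can be checked by hand: during fit, `FillMedian` learns the median of the non-null values (1 for `x`, 4 for `y`), transform fills the nulls with those medians, and the trailing lambda squares everything, giving `[0, 1, 4, 1, 0, 1, 4]` for `x` and `[16, 9, 16, 25, 9, 16, 25]` for `y`.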
Example No. 22
def test_chaining_1():
    df = cudf.DataFrame({
        "cont01": np.random.randint(1, 100, 100),
        "cont02": np.random.random(100) * 100,
        "cat01": np.random.randint(0, 10, 100),
        "label": np.random.randint(0, 3, 100),
    })
    df["cont01"][:10] = None

    cont1 = "cont01" >> ops.FillMissing()
    conts = cont1 + "cont02" >> ops.NormalizeMinMax()
    workflow = Workflow(conts + "cat01" + "label")

    result = workflow.fit_transform(Dataset(df)).to_ddf().compute()

    assert result["cont01"].max() <= 1.0
    assert result["cont02"].max() <= 1.0
Example No. 23
def test_spec_set(tmpdir, client):
    gdf_test = cudf.DataFrame({
        "ad_id": [1, 2, 2, 6, 6, 8, 3, 3],
        "source_id": [2, 4, 4, 7, 5, 2, 5, 2],
        "platform": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "cont": [1, 2, np.nan, 2, 1, 3, 3, 1],
        "clicked": [1, 0, 1, 0, 0, 1, 1, 0],
    })

    cats = ColumnGroup(["ad_id", "source_id", "platform"])
    cat_features = cats >> ops.Categorify
    cont_features = ColumnGroup(["cont"]) >> ops.FillMissing >> ops.Normalize
    te_features = cats >> ops.TargetEncoding(
        "clicked", kfold=5, fold_seed=42, p_smooth=20)

    p = Workflow(cat_features + cont_features + te_features, client=client)
    p.fit_transform(nvt.Dataset(gdf_test)).to_ddf().compute()
Example No. 24
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    norms = ops.Normalize()
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> norms
    cats = cat_names >> ops.Categorify()
    workflow = nvt.Workflow(conts + cats + label_name)

    workflow.fit(dataset)
    if dump:
        workflow_dir = os.path.join(tmpdir, "workflow")
        workflow.save(workflow_dir)
        workflow = None

        workflow = Workflow.load(workflow_dir)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(), norms.means["x"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(), norms.means["y"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(), norms.stds["x"], rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(), norms.stds["y"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(workflow, "name-cat")
        # adding the None entry as a string because of move from gpu
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(workflow, "name-string")
    # adding the None entry as a string because of move from gpu
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    workflow.transform(dataset).to_parquet(
        output_path=tmpdir,
        out_files_per_proc=10,
        shuffle=nvt.io.Shuffle.PER_PARTITION)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example No. 25
def test_grab_additional_input_columns(dataset, engine):
    schema = Schema(["x", "y"])
    node1 = ["x"] >> ops.FillMissing()
    node2 = node1 >> ops.Clip(min_value=0)

    add_node = node2 + ["y"]

    workflow = Workflow(add_node).fit_schema(schema)
    output_df = workflow.transform(dataset).to_ddf().compute()

    assert len(workflow.output_node.input_columns.names) == 2
    assert workflow.output_node.input_columns.names == ["x", "y"]

    assert len(workflow.output_node.output_columns.names) == 2
    assert workflow.output_node.output_columns.names == ["x", "y"]

    assert len(output_df.columns) == 2
    assert output_df.columns.tolist() == ["x", "y"]
Example No. 26
def test_nested_workflow_node():
    df = dispatch._make_df({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
               Rename(postfix="_country"))
    # country1 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country1")
    # country2 = geo_selector >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country2")
    user = "******"
    # user2 = "user2"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    user = df_out["user"]
    assert user[0] == user[1] == user[2]
    assert user[3] != user[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in Canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"
                ] >> Categorify(encode_type="combo")
Example No. 27
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction, use_client):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]

    cats = ColumnGroup(cat_names)
    cat_features = cats >> ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True)
    groupby_features = cats >> ops.JoinGroupby(
        cont_names=cont_names, stats=["count", "sum"], out_path=str(tmpdir)
    )

    workflow = Workflow(cat_features + groupby_features, client=client)
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    result = workflow.fit_transform(dataset).to_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns
Example No. 28
def test_fit_simple():
    data = nvt.dispatch._make_df({
        "x": [0, 1, 2, None, 0, 1, 2],
        "y": [None, 3, 4, 5, 3, 4, 5]
    })
    dataset = Dataset(data)

    workflow = Workflow(["x", "y"] >> ops.FillMedian() >> (lambda x: x * x))

    workflow.fit(dataset)
    transformed = workflow.transform(dataset).to_ddf().compute()

    expected = nvt.dispatch._make_df({
        "x": [0, 1, 4, 1, 0, 1, 4],
        "y": [16, 9, 16, 25, 9, 16, 25]
    })
    if not HAS_GPU:
        transformed["x"] = transformed["x"].astype(expected["x"].dtype)
        transformed["y"] = transformed["y"].astype(expected["y"].dtype)
    assert_eq(expected, transformed)
Example No. 29
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
Example No. 30
def test_workflow_move_saved(tmpdir):
    raw = """US>SC>519 US>CA>807 US>MI>505 US>CA>510 CA>NB US>CA>534""".split()
    data = cudf.DataFrame({"geo": raw})

    geo_location = ColumnGroup(["geo"])
    state = geo_location >> (lambda col: col.str.slice(0, 5)) >> ops.Rename(
        postfix="_state")
    country = geo_location >> (lambda col: col.str.slice(0, 2)) >> ops.Rename(
        postfix="_country")
    geo_features = state + country + geo_location >> ops.Categorify()

    # create the workflow and transform the input
    workflow = Workflow(geo_features)
    expected = workflow.fit_transform(Dataset(data)).to_ddf().compute()

    # save the workflow (including categorical mapping parquet files)
    # and then verify we can load the saved workflow after moving the directory
    out_path = os.path.join(tmpdir, "output", "workflow")
    workflow.save(out_path)

    moved_path = os.path.join(tmpdir, "output", "workflow2")
    shutil.move(out_path, moved_path)
    workflow2 = Workflow.load(moved_path)

    # also check that when transforming our input we get the same results after loading
    transformed = workflow2.transform(Dataset(data)).to_ddf().compute()
    assert_eq(expected, transformed)
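
The save/load round trip in Examples 24 and 30 is how a fitted workflow (including its category-mapping parquet files) gets reused later, for example at inference time. A minimal sketch under the same assumptions as the earlier snippets (recent NVTabular, hypothetical data and paths):

import pandas as pd
import nvtabular as nvt
from nvtabular import ops

df = pd.DataFrame({"geo": ["US>CA", "US>NY", "CA>BC"]})

workflow = nvt.Workflow(["geo"] >> ops.Categorify())
workflow.fit(nvt.Dataset(df))

workflow.save("geo_workflow")                    # persists the graph and fitted statistics
reloaded = nvt.Workflow.load("geo_workflow")     # the directory can be moved first, as above
out = reloaded.transform(nvt.Dataset(df)).to_ddf().compute()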