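# NOTE: minimal reconstruction of this test module's header so the excerpt is
# self-contained; treat it as a sketch. `client`, `datasets`, `engine`,
# `part_mem_fraction`, `use_client`, etc. are pytest fixtures/parameters whose
# definitions (and @pytest.mark.parametrize decorators) are omitted here.
import glob
import math
import os

import cudf
import dask_cudf
from dask.dataframe.utils import assert_eq

from nvtabular import Dataset, Workflow, ops

# Column-name lists for the fixture datasets; assumed importable from the
# suite's conftest, matching how they are used in the tests below.
from tests.conftest import allcols_csv, mycols_csv, mycols_pq


def _dummy_op_logic(gdf, target_columns, _id="dummy", **kwargs):
    # Minimal pass-through sketch of the helper the DummyOp tests call; only
    # the signature is assumed, and the real helper may subset or rename
    # columns. A no-op keeps the median/min-max assertions below meaningful.
    return gdf

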
def test_cats_and_groupby_stats(client, tmpdir, datasets, part_mem_fraction,
                                use_client):
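    """Categorify (freq_threshold=10, host-side) combined with a JoinGroupby
    computing "count"/"sum" stats, run with and without a distributed client
    (the `use_client` toggle); the joined aggregation columns such as
    "name-cat_x_sum" should appear in the output."""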

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client if use_client else None,
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
    )

    processor.add_preprocess(
        ops.Categorify(out_path=str(tmpdir), freq_threshold=10, on_host=True))

    processor.add_cat_feature(
        ops.JoinGroupby(cont_names=cont_names,
                        stats=["count", "sum"],
                        out_path=str(tmpdir)))

    processor.finalize()
    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)

    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert "name-cat_x_sum" in result.columns
    assert "name-string_x_sum" in result.columns


def test_dask_normalize(client, tmpdir, datasets, engine):
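    """Normalize the continuous columns, checking that the collected
    mean/std/count statistics match a single-GPU cudf reference and that the
    normalized output has near-zero means."""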

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(ops.Normalize())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # Make sure we collected accurate statistics
    means = df0[cont_names].mean()
    stds = df0[cont_names].std()
    counts = df0[cont_names].count()
    for name in cont_names:
        assert math.isclose(means[name], processor.stats["means"][name], rel_tol=1e-3)
        assert math.isclose(stds[name], processor.stats["stds"][name], rel_tol=1e-3)
        assert math.isclose(counts[name], processor.stats["counts"][name], rel_tol=1e-3)

    # New (normalized) means should all be close to zero
    new_means = result[cont_names].mean()
    for name in cont_names:
        assert abs(new_means[name]) < 1e-3


def test_dask_median_dummyop(client, tmpdir, datasets, engine):
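    """Drive stats collection through a custom DFOperator that requires
    ops.Median(), then compare the collected medians against cudf
    quantiles on the output."""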

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.Median()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    # TODO: Improve the accuracy! "t-digest" (via the crick library) could
    #       help, but the current version seems to have cupy/numpy problems here
    medians = result[cont_names].quantile(q=0.5)
    assert math.isclose(medians["x"], processor.stats["medians"]["x"], abs_tol=1e-1)
    assert math.isclose(medians["y"], processor.stats["medians"]["y"], abs_tol=1e-1)
    assert math.isclose(medians["id"], processor.stats["medians"]["id"], rel_tol=1e-2)


def test_dask_minmax_dummyop(client, tmpdir, datasets, engine):
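    """Drive stats collection through a custom DFOperator that requires
    ops.MinMax(), then compare the collected mins/maxs against the output."""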

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    class DummyOp(ops.DFOperator):

        default_in, default_out = "continuous", "continuous"

        @property
        def req_stats(self):
            return [ops.MinMax()]

        def op_logic(self, *args, **kwargs):
            return _dummy_op_logic(*args, _id=self._id, **kwargs)

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )
    processor.add_preprocess(DummyOp())
    processor.finalize()

    dataset = Dataset(paths, engine)
    processor.apply(dataset)
    result = processor.get_ddf().compute()

    assert math.isclose(result.x.min(), processor.stats["mins"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.min(), processor.stats["mins"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.min(), processor.stats["mins"]["id"], rel_tol=1e-3)
    assert math.isclose(result.x.max(), processor.stats["maxs"]["x"], rel_tol=1e-3)
    assert math.isclose(result.y.max(), processor.stats["maxs"]["y"], rel_tol=1e-3)
    assert math.isclose(result.id.max(), processor.stats["maxs"]["id"], rel_tol=1e-3)


def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
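    """GroupBy collecting "count"/"sum"/"std" per category: validate the row
    count, that "_std" columns exist while "_var" columns do not, and that the
    per-category "count" and "std" values match a cudf groupby reference."""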

    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_name
    )

    processor.add_preprocess(
        ops.GroupBy(cont_names=cont_names, stats=["count", "sum", "std"], out_path=str(tmpdir))
    )
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
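    # dask's "synchronous" scheduler runs single-threaded in-process, which
    # keeps any failure traceback in this test easy to follow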
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates()
        .sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"],
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates()
        .sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )


def test_dask_workflow_api_dlrm(client, tmpdir, datasets, freq_threshold,
                                part_mem_fraction, engine, cat_cache, on_host,
                                shuffle):
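    """End-to-end DLRM-style workflow: FillMissing -> Clip -> LogOp features
    plus Categorify, across the parquet/csv dataset engines (the final read
    branch appears to handle a headerless-CSV variant, hence the explicit
    column names). When not shuffled, validates the in-memory ddf, the
    category counts under `freq_threshold`, and the parquet written to disk;
    when shuffled, only the on-disk row count."""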
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_feature(
        [ops.FillMissing(),
         ops.Clip(min_value=0),
         ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            freq_threshold=freq_threshold,
            out_path=str(tmpdir),
            cat_cache=cat_cache,
            on_host=on_host,
        ))
    processor.finalize()

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths,
                          names=allcols_csv,
                          part_mem_fraction=part_mem_fraction)
    output_path = os.path.join(str(tmpdir), "processed")
    processor.apply(dataset, output_path=output_path, shuffle=shuffle)

    # Can still access the final ddf if we didn't shuffle
    if not shuffle:
        result = processor.get_ddf().compute()
        assert len(df0) == len(result)
        assert result["x"].min() == 0.0
        assert result["x"].isna().sum() == 0
        assert result["y"].min() == 0.0
        assert result["y"].isna().sum() == 0

        # Check category counts
        cat_expect = df0.groupby("name-string").agg({
            "name-string": "count"
        }).reset_index(drop=True)
        cat_result = (result.groupby("name-string").agg({
            "name-string": "count"
        }).reset_index(drop=True))
        if freq_threshold:
            cat_expect = cat_expect[
                cat_expect["name-string"] >= freq_threshold]
            # Note that we may need to skip the 0th element in result (null mapping)
            assert_eq(
                cat_expect,
                cat_result.iloc[1:]
                if len(cat_result) > len(cat_expect) else cat_result,
                check_index=False,
            )
        else:
            assert_eq(cat_expect, cat_result)

        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        for col in df_disk:
            assert_eq(result[col], df_disk[col])

    else:
        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)