Example #1
def test_s3_dataset(s3, paths, engine, df):
    # create a mocked out bucket here
    bucket = "testbucket"
    s3.create_bucket(Bucket=bucket)

    s3_paths = []
    for path in paths:
        s3_path = f"s3://{bucket}/{path}"
        with fsspec.open(s3_path, "wb") as f, open(path, "rb") as src:
            f.write(src.read())
        s3_paths.append(s3_path)

    # create a basic s3 dataset
    dataset = nvt.Dataset(s3_paths)

    # make sure the iteration API works
    columns = mycols_pq if engine == "parquet" else mycols_csv
    gdf = cudf.concat(list(dataset.to_iter()))[columns]
    assert_eq(gdf.reset_index(drop=True), df.reset_index(drop=True))

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)
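
Example #1 exercises the legacy NVTabular Workflow API against mocked S3 paths; the fixtures (s3, paths, engine, df) and column lists (mycols_pq, mycols_csv) come from the test suite's conftest. A minimal, self-contained sketch of the same Workflow pattern on local files, assuming the 0.x API shown above (the input path and column names are placeholders):

# Minimal sketch of the legacy NVTabular 0.x Workflow pattern used above.
# The parquet path and column names are hypothetical placeholders.
import nvtabular as nvt
from nvtabular import ops

dataset = nvt.Dataset(["data/part_0.parquet"])  # assumed local input file(s)

processor = nvt.Workflow(
    cat_names=["name-cat", "name-string"],  # categorical columns
    cont_names=["x", "y", "id"],            # continuous columns
    label_name=["label"],
)
processor.add_feature([ops.ZeroFill(), ops.LogOp()])        # feature engineering
processor.add_preprocess(ops.Normalize())                   # standardize continuous columns
processor.add_preprocess(ops.Categorify(cat_cache="host"))  # encode categoricals
processor.finalize()

processor.update_stats(dataset)  # offline statistics pass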
Example #2
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check files
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
        assert os.path.isfile(outdir + "/metadata.json")
    elif output_format == "hugectr":
        ext = "data"

    assert os.path.isfile(outdir + "/file_list.txt")
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))
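
The snippets in this listing are shown without their pytest decorators; the extra arguments (output_format, engine, op_columns) suggest parametrization roughly like the following sketch, whose concrete value lists are assumptions rather than the repository's originals:

# Hypothetical parametrization matching the fixture arguments used above;
# the value lists are illustrative assumptions.
import pytest

@pytest.mark.parametrize("engine", ["parquet", "csv"])
@pytest.mark.parametrize("output_format", ["hugectr", "parquet"])
@pytest.mark.parametrize("op_columns", [["x"], None])
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    ...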
Example #3
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns, num_io_threads):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, "0.parquet"))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
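
The assertions above document the structure of _metadata.json: the cats, conts, and labels lists each hold entries with an index and a col_name, alongside per-file statistics. A short sketch of how a consumer could rebuild the output column order from that file (the path is a placeholder):

# Sketch: rebuild the output column order from _metadata.json.
# "out_dir/_metadata.json" is a placeholder path.
import json

with open("out_dir/_metadata.json", "r") as fil:
    meta = json.load(fil)

columns_by_index = {
    entry["index"]: entry["col_name"]
    for entry in meta["cats"] + meta["conts"] + meta["labels"]
}
ordered_columns = [columns_by_index[i] for i in sorted(columns_by_index)]
print(ordered_columns)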
Example #4
def test_dask_workflow_api_dlrm(dask_cluster, tmpdir, datasets, freq_threshold,
                                part_mem_fraction, engine):
    # The Workflow below needs a distributed client, but this snippet never
    # defines one; assume the dask_cluster fixture yields a cluster object
    # and wrap it in a distributed.Client here.
    client = Client(dask_cluster)

    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(freq_threshold=freq_threshold,
                       out_path=str(tmpdir),
                       split_out=2))
    processor.finalize()

    if engine in ("parquet", "csv"):
        dataset = DaskDataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = DaskDataset(paths,
                              names=allcols_csv,
                              part_mem_fraction=part_mem_fraction)
    processor.apply(dataset, output_path=str(tmpdir))
    result = processor.get_ddf().compute()

    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check category counts
    cat_expect = df0.groupby("name-string").agg({
        "name-string": "count"
    }).reset_index(drop=True)
    cat_result = result.groupby("name-string").agg({
        "name-string": "count"
    }).reset_index(drop=True)
    if freq_threshold:
        cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
        # Note that we may need to skip the 0th element in result (null mapping)
        assert_eq(
            cat_expect,
            cat_result.iloc[1:]
            if len(cat_result) > len(cat_expect) else cat_result,
            check_index=False,
        )
    else:
        assert_eq(cat_expect, cat_result)

    # Read back from disk
    df_disk = dask_cudf.read_parquet("/".join([str(tmpdir), "processed"]),
                                     index=False).compute()
    for col in df_disk:
        assert_eq(result[col], df_disk[col])
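
Example #4 depends on a dask_cluster fixture that is not shown. A plausible conftest-style sketch, modeled on the LocalCUDACluster setup in Example #5 (fixture name, scope, and worker count are assumptions):

# Hypothetical conftest fixture backing the dask_cluster argument above,
# modeled on the LocalCUDACluster setup shown in Example #5.
import pytest
from dask_cuda import LocalCUDACluster


@pytest.fixture(scope="module")
def dask_cluster():
    cluster = LocalCUDACluster(n_workers=2, protocol="tcp")  # assumed worker count
    yield cluster
    cluster.close()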
Example #5
def main(args):

    # Input
    data_path = args.data_path
    out_path = args.out_path
    freq_limit = args.freq_limit
    out_files_per_proc = args.splits
    if args.protocol == "ucx":
        os.environ["UCX_TLS"] = "tcp,cuda_copy,cuda_ipc,sockcm"

    # Use Criteo dataset by default (for now)
    cont_names = (args.cont_names.split(",")
                  if args.cont_names else ["I" + str(x) for x in range(1, 14)])
    cat_names = (args.cat_names.split(",")
                 if args.cat_names else ["C" + str(x) for x in range(1, 27)])
    label_name = ["label"]

    if args.cat_splits:
        tree_width = {
            name: int(s)
            for name, s in zip(cat_names, args.cat_splits.split(","))
        }
    else:
        tree_width = {col: 1 for col in cat_names}
        if args.cat_names is None:
            # Using Criteo... Use more hash partitions for
            # known high-cardinality columns
            tree_width["C20"] = 8
            tree_width["C1"] = 8
            tree_width["C22"] = 4
            tree_width["C10"] = 4
            tree_width["C21"] = 2
            tree_width["C11"] = 2
            tree_width["C23"] = 2
            tree_width["C12"] = 2

    # Specify categorical caching location
    cat_cache = None
    if args.cat_cache:
        cat_cache = args.cat_cache.split(",")
        if len(cat_cache) == 1:
            cat_cache = cat_cache[0]
        else:
            # If user is specifying a list of options,
            # they must specify an option for every cat column
            assert len(cat_names) == len(cat_cache)
    if isinstance(cat_cache, str):
        cat_cache = {col: cat_cache for col in cat_names}
    elif isinstance(cat_cache, list):
        cat_cache = {name: c for name, c in zip(cat_names, cat_cache)}
    else:
        # Criteo/DLRM Defaults
        cat_cache = {col: "device" for col in cat_names}
        if args.cat_names is None:
            cat_cache["C20"] = "host"
            cat_cache["C1"] = "host"
            # Only need to cache the largest two on a dgx-2
            if args.n_workers < 16:
                cat_cache["C22"] = "host"
                cat_cache["C10"] = "host"

    # Use the total device memory size to turn the fractional args into byte sizes
    device_size = device_mem_size(kind="total")
    device_limit = int(args.device_limit_frac * device_size)
    device_pool_size = int(args.device_pool_frac * device_size)
    part_size = int(args.part_mem_frac * device_size)

    # Setup LocalCUDACluster
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    else:
        cluster = LocalCUDACluster(
            protocol=args.protocol,
            n_workers=args.n_workers,
            CUDA_VISIBLE_DEVICES=args.devs,
            enable_nvlink=True,
            device_memory_limit=device_limit,
            local_directory=args.dask_workspace,
            dashboard_address=":3787",
        )
    client = Client(cluster)

    # Setup RMM pool
    if not args.no_rmm_pool:
        setup_rmm_pool(client, device_pool_size)

    # Define Dask NVTabular "Workflow"
    processor = Workflow(cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name,
                         client=client)
    processor.add_feature([ops.ZeroFill(), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            out_path=out_path,
            tree_width=tree_width,
            cat_cache=cat_cache,
            freq_threshold=freq_limit,
            on_host=args.cat_on_host,
        ))
    processor.finalize()

    dataset = Dataset(data_path, "parquet", part_size=part_size)

    # Execute the dask graph
    runtime = time.time()
    if args.profile is not None:
        with performance_report(filename=args.profile):
            processor.apply(
                dataset,
                shuffle="full" if args.worker_shuffle else "partial",
                out_files_per_proc=out_files_per_proc,
                output_path=out_path,
            )
    else:
        processor.apply(
            dataset,
            shuffle="full" if args.worker_shuffle else "partial",
            out_files_per_proc=out_files_per_proc,
            output_path=out_path,
        )
    runtime = time.time() - runtime

    print("\nDask-NVTabular DLRM/Criteo benchmark")
    print("--------------------------------------")
    print(f"partition size     | {part_size}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devs}")
    print(f"rmm-pool           | {(not args.no_rmm_pool)}")
    print(f"out_files_per_proc | {args.splits}")
    print(f"worker-shuffle     | {args.worker_shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")

    client.close()
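
main(args) reads many attributes from args, but the parser is not shown. A hedged sketch of an argparse setup that would supply every attribute used above (flag spellings and defaults are assumptions):

# Hypothetical argument parser supplying the attributes main(args) reads;
# names mirror the attribute accesses above, defaults are assumptions.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Dask-NVTabular DLRM/Criteo benchmark")
    parser.add_argument("--data-path", dest="data_path", required=True)
    parser.add_argument("--out-path", dest="out_path", required=True)
    parser.add_argument("--freq-limit", dest="freq_limit", type=int, default=0)
    parser.add_argument("--splits", type=int, default=24)
    parser.add_argument("--protocol", choices=["tcp", "ucx"], default="tcp")
    parser.add_argument("--devs", default="0,1,2,3")
    parser.add_argument("--n-workers", dest="n_workers", type=int, default=4)
    parser.add_argument("--device-limit-frac", dest="device_limit_frac", type=float, default=0.8)
    parser.add_argument("--device-pool-frac", dest="device_pool_frac", type=float, default=0.9)
    parser.add_argument("--part-mem-frac", dest="part_mem_frac", type=float, default=0.125)
    parser.add_argument("--cont-names", dest="cont_names", default=None)
    parser.add_argument("--cat-names", dest="cat_names", default=None)
    parser.add_argument("--cat-splits", dest="cat_splits", default=None)
    parser.add_argument("--cat-cache", dest="cat_cache", default=None)
    parser.add_argument("--cat-on-host", dest="cat_on_host", action="store_true")
    parser.add_argument("--no-rmm-pool", dest="no_rmm_pool", action="store_true")
    parser.add_argument("--dask-workspace", dest="dask_workspace", default=None)
    parser.add_argument("--profile", default=None)
    parser.add_argument("--worker-shuffle", dest="worker_shuffle", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())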
Example #6
def test_gpu_workflow_api(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                          dump, op_columns, use_client):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        client=client if use_client else None,
    )

    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify(cat_cache="host"))
    processor.finalize()

    processor.update_stats(dataset)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std; get_norms replicates the ZeroFill and LogOp transforms
    # applied above so the raw df can be compared against the workflow stats

    if not op_columns:
        assert math.isclose(get_norms(df.y).mean(),
                            processor.stats["means"]["y"],
                            rel_tol=1e-1)
        assert math.isclose(get_norms(df.y).std(),
                            processor.stats["stds"]["y"],
                            rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-1)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-1)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # a null (None) entry is prepended when the categories are moved off the GPU
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # a null (None) entry is prepended when the categories are moved off the GPU
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
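
Examples #6 and #7 call a get_cats(processor, column) helper defined elsewhere in the test suite. A hypothetical sketch of what it might do, assuming Categorify stores the path of a per-column parquet of unique category values under processor.stats["categories"]:

# Hypothetical sketch of the get_cats helper referenced above; it assumes
# processor.stats["categories"][col] holds the path to the parquet file of
# unique category values written by ops.Categorify.
import cudf


def get_cats(processor, col):
    filename = processor.stats["categories"][col]
    gdf = cudf.read_parquet(filename)
    return gdf[col].values_host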
Example #7
def test_gpu_workflow(tmpdir, client, df, dataset, gpu_memory_frac, engine,
                      dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        client=client,
    )

    processor.update_stats(dataset)
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"]["y"],
                        rel_tol=1e-4)
    #     assert math.isclose(get_norms(df.id).mean(),
    #                         processor.stats["means"]["id_ZeroFill_LogOp"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"]["y"],
                        rel_tol=1e-3)
    #     assert math.isclose(get_norms(df.id).std(),
    #                         processor.stats["stds"]["id_ZeroFill_LogOp"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_host
        cats0 = get_cats(processor, "name-cat")
        # a null (None) entry is prepended when the categories are moved off the GPU
        assert cats0.tolist() == [None] + cats_expected0.tolist()
    cats_expected1 = df["name-string"].unique().values_host
    cats1 = get_cats(processor, "name-string")
    # a null (None) entry is prepended when the categories are moved off the GPU
    assert cats1.tolist() == [None] + cats_expected1.tolist()

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               out_files_per_proc=10,
                               shuffle="partial",
                               apply_ops=True)

    dataset_2 = Dataset(glob.glob(str(tmpdir) + "/*.parquet"),
                        part_mem_fraction=gpu_memory_frac)

    df_pp = cudf.concat(list(dataset_2.to_iter()), axis=0)

    if engine == "parquet":
        assert is_integer_dtype(df_pp["name-cat"].dtype)
    assert is_integer_dtype(df_pp["name-string"].dtype)

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
Example #8
def test_gpu_workflow_api(tmpdir, datasets, dump, gpu_memory_frac, engine,
                          op_columns):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=None,
                            names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=None,
                            names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        to_cpu=False,
    )

    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    data_itr = nvtabular.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std; get_norms replicates the ZeroFill and LogOp transforms
    # applied above so the raw df can be compared against the workflow stats

    if not op_columns:
        assert math.isclose(
            get_norms(df.y).mean(),
            processor.stats["means"]["y"],
            rel_tol=1e-1,
        )
        assert math.isclose(
            get_norms(df.y).std(),
            processor.stats["stds"]["y"],
            rel_tol=1e-1,
        )
    assert math.isclose(
        get_norms(df.x).mean(),
        processor.stats["means"]["x"],
        rel_tol=1e-1,
    )
    assert math.isclose(
        get_norms(df.x).std(),
        processor.stats["stds"]["x"],
        rel_tol=1e-1,
    )

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats(
        ).values_to_string()
        # a "None" entry is added as a string when the categories are moved off the GPU
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats(
    ).values_to_string()
    # a "None" entry is added as a string when the categories are moved off the GPU
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               data_itr,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    data_itr_2 = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
    )

    df_pp = None
    for chunk in data_itr_2:
        df_pp = cudf.concat([df_pp, chunk], axis=0) if df_pp is not None else chunk

    if engine == "parquet":
        assert df_pp["name-cat"].dtype == "int64"
    assert df_pp["name-string"].dtype == "int64"

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
    return processor.ds_exports
Example #9
def test_gpu_workflow(tmpdir, df, dataset, gpu_memory_frac, engine, dump):
    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    config["FE"]["continuous"] = [ops.ZeroFill()]
    config["PP"]["continuous"] = [[ops.ZeroFill(), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
    )

    processor.update_stats(dataset)
    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        gdf = tar.fillna(0)
        gdf = gdf * (gdf >= 0).astype("int")
        return gdf

    assert math.isclose(get_norms(df.x).mean(),
                        processor.stats["means"]["x"],
                        rel_tol=1e-4)
    assert math.isclose(get_norms(df.y).mean(),
                        processor.stats["means"]["y"],
                        rel_tol=1e-4)
    #     assert math.isclose(get_norms(df.id).mean(),
    #                         processor.stats["means"]["id_ZeroFill_LogOp"], rel_tol=1e-4)
    assert math.isclose(get_norms(df.x).std(),
                        processor.stats["stds"]["x"],
                        rel_tol=1e-3)
    assert math.isclose(get_norms(df.y).std(),
                        processor.stats["stds"]["y"],
                        rel_tol=1e-3)
    #     assert math.isclose(get_norms(df.id).std(),
    #                         processor.stats["stds"]["id_ZeroFill_LogOp"], rel_tol=1e-3)

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats(
        ).values_to_string()
        # a "None" entry is added as a string when the categories are moved off the GPU
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats(
    ).values_to_string()
    # a "None" entry is added as a string when the categories are moved off the GPU
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir,
                               dataset,
                               nfiles=10,
                               shuffle=True,
                               apply_ops=True)

    data_itr_2 = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
    )

    df_pp = None
    for chunk in data_itr_2:
        df_pp = cudf.concat([df_pp, chunk], axis=0) if df_pp is not None else chunk

    if engine == "parquet":
        assert df_pp["name-cat"].dtype == "int64"
    assert df_pp["name-string"].dtype == "int64"

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
    return processor.ds_exports
Example #10
def test_hugectr(tmpdir, client, df, dataset, output_format, engine,
                 op_columns, num_io_threads, use_client):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"
                 ] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(client=client,
                             cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # apply the workflow and write out the dataset
    processor.apply(
        dataset,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=None,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles if not client else nfiles * len(
        client.cluster.workers)
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"

    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir)
        if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(data_files[0])  # entries are already full paths
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
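
For the hugectr output format the tests only assert that _file_list.txt, _metadata.json, and the numbered .data files exist. A small sketch that inventories such an output directory after processor.apply (the directory name is a placeholder):

# Sketch: inventory a HugeCTR-format output directory written by
# processor.apply(..., output_format="hugectr"); "out_dir" is a placeholder.
import glob
import os

out_dir = "out_dir"
print("file list present:", os.path.isfile(os.path.join(out_dir, "_file_list.txt")))
print("metadata present: ", os.path.isfile(os.path.join(out_dir, "_metadata.json")))
for path in sorted(glob.glob(os.path.join(out_dir, "*.data"))):
    print(path, os.path.getsize(path), "bytes")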