Example #1
    def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
        import dask_cudf
        with Client(local_cuda_cluster) as client:
            X_, y_, w_ = generate_array(with_weights=True)
            y_ = (y_ * 10).astype(np.int32)
            X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
            y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
            w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
            run_dask_classifier(X, y, w, model, client)
Example #2
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
        run_categorical(client, "gpu_hist", X, X_onehot, y)
Example #3
def test_categorical(local_cuda_cluster: LocalCUDACluster) -> None:
    with Client(local_cuda_cluster) as client:
        import dask_cudf

        rounds = 10
        X, y = make_categorical(client, 10000, 30, 13)
        X = dask_cudf.from_dask_dataframe(X)

        X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)

        parameters = {"tree_method": "gpu_hist"}

        m = dxgb.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
        by_etl_results = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )["history"]

        m = dxgb.DaskDMatrix(client, X, y, enable_categorical=True)
        output = dxgb.train(
            client,
            parameters,
            m,
            num_boost_round=rounds,
            evals=[(m, "Train")],
        )
        by_builtin_results = output["history"]

        np.testing.assert_allclose(
            np.array(by_etl_results["Train"]["rmse"]),
            np.array(by_builtin_results["Train"]["rmse"]),
            rtol=1e-3,
        )
        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

        model = output["booster"]
        with tempfile.TemporaryDirectory() as tempdir:
            path = os.path.join(tempdir, "model.json")
            model.save_model(path)
            with open(path, "r") as fd:
                categorical = json.load(fd)

            categories_sizes = np.array(
                categorical["learner"]["gradient_booster"]["model"]["trees"]
                [-1]["categories_sizes"])
            assert categories_sizes.shape[0] != 0
            np.testing.assert_allclose(categories_sizes, 1)
Example #4
def main(client):
    import dask_cudf

    product_reviews_df = read_tables()
    product_reviews_df = product_reviews_df[
        product_reviews_df.pr_item_sk == q27_pr_item_sk
    ]

    sentences = product_reviews_df.map_partitions(
        create_sentences_from_reviews,
        review_column="pr_review_content",
        end_of_line_char=EOL_CHAR,
    )

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    sentences = sentences.persist()
    wait(sentences)

    # Do the NER
    sentences = sentences.to_dask_dataframe()
    ner_parsed = sentences.map_partitions(ner_parser, "sentence")
    ner_parsed = dask_cudf.from_dask_dataframe(ner_parsed)
    ner_parsed = ner_parsed.persist()
    wait(ner_parsed)

    ner_parsed = ner_parsed[ner_parsed.company_name_list != ""]

    # separate NER results into one row per found company
    repeated_names = ner_parsed.map_partitions(
        create_words_from_sentences,
        sentence_column="company_name_list",
        global_position_column="sentence_tokenized_global_pos",
        delimiter="é",
    )

    # recombine
    recombined = repeated_names.merge(
        ner_parsed,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    recombined["pr_item_sk"] = q27_pr_item_sk
    recombined = recombined[["review_idx_global_pos", "pr_item_sk", "word", "sentence"]]

    recombined = recombined.persist()
    wait(recombined)

    recombined = recombined.sort_values(
        ["review_idx_global_pos", "pr_item_sk", "word", "sentence"]
    ).persist()

    recombined.columns = ["review_sk", "item_sk", "company_name", "review_sentence"]
    recombined = recombined.persist()
    wait(recombined)
    return recombined
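The NER step above hops off the GPU because ner_parser is a CPU-only function: the dask_cudf collection is converted to a pandas-backed Dask DataFrame, the UDF is applied per partition, and the result is moved back with from_dask_dataframe. A minimal sketch of that pattern in isolation (run_cpu_only_udf is a hypothetical helper name, not part of the example above):

import dask_cudf

def run_cpu_only_udf(gddf, udf, column):
    # Move partitions from cudf to pandas so a CPU-only library can process them.
    cpu_ddf = gddf.to_dask_dataframe()
    parsed = cpu_ddf.map_partitions(udf, column)
    # Bring the result back into GPU memory for the rest of the pipeline.
    return dask_cudf.from_dask_dataframe(parsed)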
Example #5
def test_roundtrip_from_dask_cudf(tmpdir, write_meta):
    tmpdir = str(tmpdir)
    gddf = dask_cudf.from_dask_dataframe(ddf)
    gddf.to_parquet(tmpdir, write_metadata_file=write_meta)

    gddf2 = dask_cudf.read_parquet(tmpdir)
    dd.assert_eq(gddf, gddf2, check_divisions=write_meta)
Example #6
    def dask_gpu_parquet_ingest(self, target_files, columns=None):
        if self.rapids_version < 15:
            # rapids 0.14 has a known issue with read_parquet:
            # https://github.com/rapidsai/cudf/issues/5579
            return dask_cudf.from_dask_dataframe(
                self.dask_cpu_parquet_ingest(target_files, columns=columns))
        else:
            return dask_cudf.read_parquet(target_files, columns=columns)
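dask_cpu_parquet_ingest is not shown in this snippet; presumably it reads the files with the pandas-backed Dask parquet reader before the conversion. A minimal sketch of that fallback path as a standalone function (the function name and the dd.read_parquet choice are assumptions, not taken from the example):

import dask.dataframe as dd
import dask_cudf

def cpu_parquet_then_to_gpu(target_files, columns=None):
    # Read on the CPU with the pandas-backed parquet reader, sidestepping the
    # cudf read_parquet issue referenced above, then move partitions to the GPU.
    ddf = dd.read_parquet(target_files, columns=columns)
    return dask_cudf.from_dask_dataframe(ddf)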
Example #7
    def _move_ddf(self, destination):
        """Move the collection between cpu and gpu memory."""
        _ddf = self._ddf
        if (self.moved_collection and isinstance(_ddf.dask, HighLevelGraph)
                and hasattr(_ddf.dask, "key_dependencies")):
            # If our collection has already been moved, and if the
            # underlying graph is a `HighLevelGraph`, we can just
            # drop the last "from_pandas-..." layer if the current
            # destination is "cpu", or we can drop the last
            # "to_pandas-..." layer if the destination is "gpu".
            search_name = "from_pandas-" if destination == "cpu" else "to_pandas-"

            pandas_conversion_layer = None
            pandas_conversion_dep = None
            for k, v in _ddf.dask.dependents.items():
                if k.startswith(search_name) and v == set():
                    pandas_conversion_layer = k
                    break
            if pandas_conversion_layer:
                deps = [
                    d for d in _ddf.dask.dependencies[pandas_conversion_layer]
                ]
                if len(deps) == 1:
                    pandas_conversion_dep = deps[0]

            if pandas_conversion_layer and pandas_conversion_dep:
                # We have met the criteria to remove the last "from/to_pandas-" layer
                new_layers = {
                    k: v
                    for k, v in _ddf.dask.layers.items()
                    if k != pandas_conversion_layer
                }
                new_deps = {
                    k: v
                    for k, v in _ddf.dask.dependencies.items()
                    if k != pandas_conversion_layer
                }
                hlg = HighLevelGraph(
                    layers=new_layers,
                    dependencies=new_deps,
                    key_dependencies=_ddf.dask.key_dependencies,
                )

                _meta = (_ddf._meta.to_pandas() if destination == "cpu" else
                         cudf.from_pandas(_ddf._meta))
                return new_dd_object(hlg, pandas_conversion_dep, _meta,
                                     _ddf.divisions)

        if destination == "cpu":
            # Just extend the existing graph to move the collection to cpu
            return _ddf.to_dask_dataframe()

        elif destination == "gpu":
            # Just extend the existing graph to move the collection to gpu
            return dask_cudf.from_dask_dataframe(_ddf)

        else:
            raise ValueError(f"destination {destination} not recognized.")
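The short-circuit above only applies when the collection's graph is a HighLevelGraph whose most recent layer is the earlier pandas/cudf conversion; that layer is then dropped instead of stacking a second (inverse) conversion on top of it. A small sketch for inspecting those layers, assuming a recent dask version (the exact layer names, e.g. "from_pandas-*", vary between releases):

import pandas as pd
import dask.dataframe as dd
import dask_cudf
from dask.highlevelgraph import HighLevelGraph

ddf = dd.from_pandas(pd.DataFrame({"a": range(4)}), npartitions=2)
gddf = dask_cudf.from_dask_dataframe(ddf)

# Each conversion adds a layer to the collection's task graph.
assert isinstance(gddf.dask, HighLevelGraph)
print(list(gddf.dask.layers))        # layer names, including the conversion layer
print(dict(gddf.dask.dependencies))  # which layers each layer depends on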
Example #8
def test_roundtrip_from_dask_cudf(tmpdir):
    tmpdir = str(tmpdir)
    gddf = dask_cudf.from_dask_dataframe(ddf)
    gddf.to_parquet(tmpdir)

    # NOTE: Need `.compute()` to resolve correct index
    #       name after `from_dask_dataframe`
    gddf2 = dask_cudf.read_parquet(tmpdir)
    assert_eq(gddf.compute(), gddf2)
Example #9
def test_from_dask_dataframe():
    np.random.seed(0)
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=20),
        "y": np.random.normal(size=20)
    })
    ddf = dd.from_pandas(df, npartitions=2)
    dgdf = dgd.from_dask_dataframe(ddf)
    got = dgdf.compute().to_pandas()
    expect = df

    np.testing.assert_array_equal(got.index.values, expect.index.values)
    np.testing.assert_array_equal(got.x.values, expect.x.values)
    np.testing.assert_array_equal(got.y.values, expect.y.values)
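Example #9 exercises the CPU-to-GPU direction; the inverse conversion back to a pandas-backed collection appears in later examples via to_dask_dataframe. A minimal round-trip sketch, assuming a working cudf/dask_cudf installation (dgd is the dask_cudf alias used in the test above):

import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask_cudf as dgd

np.random.seed(0)
pdf = pd.DataFrame({"x": np.random.randint(0, 5, size=20),
                    "y": np.random.normal(size=20)})
ddf = dd.from_pandas(pdf, npartitions=2)

gddf = dgd.from_dask_dataframe(ddf)  # pandas partitions -> cudf partitions
back = gddf.to_dask_dataframe()      # cudf partitions -> pandas partitions

np.testing.assert_array_equal(back.compute().x.values, pdf.x.values)
np.testing.assert_array_equal(back.compute().y.values, pdf.y.values)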
Example #10
def using_quantile_device_dmatrix(client: Client, X, y):
    '''`DaskDeviceQuantileDMatrix` is a data type specialized for the `gpu_hist`
    tree method that reduces memory overhead.  When training in a GPU pipeline,
    it is preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    '''
    # Input must be on GPU for `DaskDeviceQuantileDMatrix`.
    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`; note that it
    # cannot be used for anything other than training.
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    output = xgb.dask.train(client, {
        'verbosity': 2,
        'tree_method': 'gpu_hist'
    },
                            dtrain,
                            num_boost_round=4)

    prediction = xgb.dask.predict(client, output, X)
    return prediction
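For comparison, the general-purpose DaskDMatrix accepts the same dask_cudf (or plain Dask) inputs and, unlike DaskDeviceQuantileDMatrix, can be passed back to predict. A hedged sketch of the equivalent training function, reusing the imports assumed by the example above (xgb, dxgb, dd, dask_cudf, Client); this is an illustration, not code from the source project:

def using_dask_dmatrix(client: Client, X, y):
    # `DaskDMatrix` works with CPU or GPU inputs; converting to dask_cudf here
    # keeps the whole pipeline on the GPU, matching the example above.
    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))

    dtrain = dxgb.DaskDMatrix(client, X, y)
    output = xgb.dask.train(client,
                            {'verbosity': 2, 'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=4)

    # Unlike `DaskDeviceQuantileDMatrix`, this DMatrix can be reused for prediction.
    prediction = xgb.dask.predict(client, output, dtrain)
    return prediction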
Example #11
def test_set_index(nelem):
    np.random.seed(0)
    # Use a unique index range as the sort may not be stable.
    x = np.arange(nelem)
    np.random.shuffle(x)
    df = pd.DataFrame({'x': x, 'y': np.random.randint(0, nelem, size=nelem)})
    ddf = dd.from_pandas(df, npartitions=2)
    dgdf = dgd.from_dask_dataframe(ddf)

    expect = ddf.set_index('x').compute()
    got = dgdf.set_index('x').compute().to_pandas()

    np.testing.assert_array_equal(got.index.values, expect.index.values)
    np.testing.assert_array_equal(got.y.values, expect.y.values)
    assert got.columns == expect.columns
Example #12
    def to_dc(self,
              input_item: Any,
              table_name: str,
              format: str = None,
              gpu: bool = False,
              **kwargs):
        if gpu:  # pragma: no cover
            try:
                import dask_cudf
            except ImportError:
                raise ModuleNotFoundError(
                    "Setting `gpu=True` for table creation requires dask_cudf")
            if not isinstance(input_item, dask_cudf.DataFrame):
                input_item = dask_cudf.from_dask_dataframe(
                    input_item, **kwargs)
        return input_item
Example #13
def test_groupby_categorical_key():
    # See https://github.com/rapidsai/cudf/issues/4608
    df = dask.datasets.timeseries()
    gddf = dask_cudf.from_dask_dataframe(df)
    gddf["name"] = gddf["name"].astype("category")
    ddf = gddf.to_dask_dataframe()

    got = (gddf.groupby("name").agg({
        "x": ["mean", "max"],
        "y": ["mean", "count"]
    }).compute())
    expect = (ddf.groupby("name").agg({
        "x": ["mean", "max"],
        "y": ["mean", "count"]
    }).compute())
    dd.assert_eq(expect, got)
Example #14
    def process(self, inputs):
        """
        Generate fake data for classification.
        Arguments
        -------
         inputs: list
             empty list
        Returns
        -------
        cudf.DataFrame
        """
        output = {}

        def get_cudf(offset=None):
            conf = copy.copy(self.conf)
            if 'n_parts' in conf:
                del conf['n_parts']
            x, y = cuml.datasets.make_classification(**conf)
            df = cudf.DataFrame({'x'+str(i): x[:, i]
                                 for i in range(x.shape[1])})
            df['y'] = y
            if offset is not None:
                df.index += offset
            return df

        if self.outport_connected(CUDF_PORT_NAME):
            df = get_cudf()
            output.update({CUDF_PORT_NAME: df})
        if self.outport_connected(DASK_CUDF_PORT_NAME):

            def mapfun(x):
                return x.get()

            x, y = cuml.dask.datasets.classification.make_classification(
                **self.conf)
            ddf = x.map_blocks(mapfun,
                               dtype=x.dtype).to_dask_dataframe()
            out = dask_cudf.from_dask_dataframe(ddf)
            out.columns = ['x'+str(i) for i in range(x.shape[1])]
            out['y'] = y.astype('int64')
            output.update({DASK_CUDF_PORT_NAME: out})
        return output
Example #15
def test_take(nelem, nparts):
    np.random.seed(0)

    # Use a unique index range as the sort may not be stable.
    x = np.random.randint(0, nelem, size=nelem)
    y = np.random.random(nelem)

    selected = np.random.randint(0, nelem - 1, size=nelem // 2)

    df = pd.DataFrame({'x': x, 'y': y})

    ddf = dd.from_pandas(df, npartitions=nparts)
    dgdf = dgd.from_dask_dataframe(ddf)
    out = dgdf.take(gd.Series(selected), npartitions=5)
    got = out.compute().to_pandas()

    expect = df.take(selected)
    assert 1 < out.npartitions <= 5
    np.testing.assert_array_equal(got.index, np.arange(len(got)))
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
Example #16
    def process(self, inputs):
        input_df = inputs[self.INPUT_PORT_NAME]
        bst_model = inputs[self.INPUT_PORT_MODEL_NAME]
        input_meta = self.get_input_meta()
        required_cols = input_meta[
            self.INPUT_PORT_MODEL_NAME]['train']
        required_cols = list(required_cols.keys())
        # required_cols.sort()
        predict_col = self.conf.get('prediction', 'predict')
        pred_contribs: bool = self.conf.get('pred_contribs', False)
        if isinstance(input_df, dask_cudf.DataFrame):
            # get the client
            client = dask.distributed.client.default_client()
            dtrain = xgb.dask.DaskDMatrix(client, input_df[required_cols])
            prediction = xgb.dask.predict(client,
                                          bst_model,
                                          dtrain,
                                          pred_contribs=pred_contribs)
            pred_df = dask_cudf.from_dask_dataframe(
                prediction.to_dask_dataframe())
            pred_df.index = input_df.index
            if not pred_contribs:
                input_df[predict_col] = pred_df
            else:
                input_df = pred_df
        else:
            infer_dmatrix = xgb.DMatrix(input_df[required_cols])
            if not pred_contribs:
                prediction = cudf.Series(bst_model.predict(infer_dmatrix),
                                         nan_as_null=False,
                                         index=input_df.index)
                input_df[predict_col] = prediction
            else:
                prediction = cudf.DataFrame(
                    bst_model.predict(infer_dmatrix,
                                      pred_contribs=pred_contribs),
                    index=input_df.index)
                input_df = prediction
        return {self.OUTPUT_PORT_NAME: input_df}
Example #17
def gpu_training_df(c):
    if dask_cudf:
        df = timeseries(freq="1d").reset_index(drop=True)
        df = dask_cudf.from_dask_dataframe(df)
        c.create_table("timeseries", input_table=df)
    return None
Example #18
        df = sp.GeoDataFrame(*args, **kwargs)
    else:
        df = pd.DataFrame(*args, **kwargs)
    return dd.from_pandas(df, npartitions=2)


try:
    import cudf
    import cupy
    import dask_cudf

    if test_gpu is False:
        # GPU testing disabled even though cudf/cupy are available
        raise ImportError

    ddfs = [_ddf, dask_cudf.from_dask_dataframe(_ddf)]

    def dask_cudf_DataFrame(*args, **kwargs):
        assert not kwargs.pop("geo", False)
        cdf = cudf.DataFrame.from_pandas(
            pd.DataFrame(*args, **kwargs), nan_as_null=False
        )
        return dask_cudf.from_cudf(cdf, npartitions=2)

    DataFrames = [dask_DataFrame, dask_cudf_DataFrame]
except ImportError:
    cudf = cupy = dask_cudf = None
    ddfs = [_ddf]
    DataFrames = [dask_DataFrame]
    dask_cudf_DataFrame = None
Example #19
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    import dask_cudf

    query = f"""
        SELECT pr_review_sk, pr_item_sk, pr_review_content
        FROM product_reviews
        WHERE pr_item_sk = {q27_pr_item_sk}
    """
    product_reviews_df = bc.sql(query)

    sentences = product_reviews_df.map_partitions(
        create_sentences_from_reviews,
        review_column="pr_review_content",
        end_of_line_char=EOL_CHAR,
    )

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]
    del product_reviews_df

    # Do the NER
    sentences = sentences.to_dask_dataframe()
    ner_parsed = sentences.map_partitions(ner_parser, "sentence")
    ner_parsed = dask_cudf.from_dask_dataframe(ner_parsed)
    ner_parsed = ner_parsed.persist()
    wait(ner_parsed)

    ner_parsed = ner_parsed[ner_parsed.company_name_list != ""]

    # separate NER results into one row per found company
    repeated_names = ner_parsed.map_partitions(
        create_words_from_sentences,
        sentence_column="company_name_list",
        global_position_column="sentence_tokenized_global_pos",
        delimiter="é",
    )
    del sentences

    # recombine
    repeated_names = repeated_names.persist()
    wait(repeated_names)
    bc.create_table('repeated_names', repeated_names)

    ner_parsed = ner_parsed.persist()
    wait(ner_parsed)
    bc.create_table('ner_parsed', ner_parsed)

    query = f"""
        SELECT review_idx_global_pos as review_sk,
            CAST({q27_pr_item_sk} AS BIGINT) as item_sk,
            word as company_name,
            sentence as review_sentence
        FROM repeated_names left join ner_parsed
        ON sentence_idx_global_pos = sentence_tokenized_global_pos
        ORDER BY review_idx_global_pos, item_sk, word, sentence
    """
    recombined = bc.sql(query)

    bc.drop_table("repeated_names")
    bc.drop_table("ner_parsed")
    del ner_parsed
    del repeated_names
    return recombined
Example #20
    def __init__(
        self,
        path_or_source,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        dtypes=None,
        **kwargs,
    ):
        self.dtypes = dtypes
        if isinstance(
                path_or_source,
            (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)):
            # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
            # Use DataFrameDatasetEngine
            if isinstance(path_or_source, cudf.DataFrame):
                path_or_source = dask_cudf.from_cudf(path_or_source,
                                                     npartitions=1)
            elif isinstance(path_or_source, pd.DataFrame):
                path_or_source = dask_cudf.from_cudf(
                    cudf.from_pandas(path_or_source), npartitions=1)
            elif not isinstance(path_or_source, dask_cudf.DataFrame):
                path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
            if part_size:
                warnings.warn("part_size is ignored for DataFrame input.")
            if part_mem_fraction:
                warnings.warn(
                    "part_mem_fraction is ignored for DataFrame input.")
            self.engine = DataFrameDatasetEngine(path_or_source)
        else:
            if part_size:
                # If a specific partition size is given, use it directly
                part_size = parse_bytes(part_size)
            else:
                # If a fractional partition size is given, calculate part_size
                part_mem_fraction = part_mem_fraction or 0.125
                assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
                if part_mem_fraction > 0.25:
                    warnings.warn(
                        "Using very large partitions sizes for Dask. "
                        "Memory-related errors are likely.")
                part_size = int(
                    device_mem_size(kind="total") * part_mem_fraction)

            # Engine-agnostic path handling
            paths = path_or_source
            if hasattr(paths, "name"):
                paths = stringify_path(paths)
            if isinstance(paths, str):
                paths = [paths]

            storage_options = storage_options or {}
            # If engine is not provided, try to infer from end of paths[0]
            if engine is None:
                engine = paths[0].split(".")[-1]
            if isinstance(engine, str):
                if engine == "parquet":
                    self.engine = ParquetDatasetEngine(
                        paths,
                        part_size,
                        storage_options=storage_options,
                        **kwargs)
                elif engine == "csv":
                    self.engine = CSVDatasetEngine(
                        paths,
                        part_size,
                        storage_options=storage_options,
                        **kwargs)
                else:
                    raise ValueError(
                        "Only parquet and csv supported (for now).")
            else:
                self.engine = engine(paths,
                                     part_size,
                                     storage_options=storage_options)
Example #21
    def process(self, inputs):
        """
        This process does the following:
            1. Split the data into training and testing sets based on the
               provided conf['train_date']. If it is not provided, all the
               data is treated as training data.
            2. Train an XGBoost model on the training data.
            3. Make predictions for all the data points, including training
               and testing.
            4. From the predicted returns, compute the trading signals that
               can be used in backtesting.
        Arguments
        -------
         inputs: list
            list of input dataframes.
        Returns
        -------
        dataframe
        """
        dxgb_params = {
            'max_depth': 8,
            'max_leaves': 2**8,
            'tree_method': 'gpu_hist',
            'objective': 'reg:squarederror',
            'grow_policy': 'lossguide',
        }
        # num_of_rounds = 100
        if 'xgboost_parameters' in self.conf:
            dxgb_params.update(self.conf['xgboost_parameters'])
        input_df = inputs[self.INPUT_PORT_NAME]
        model_df = input_df
        train_cols = set(model_df.columns) - set(self.conf['no_feature'])
        train_cols = list(train_cols - set([self.conf['target']]))

        if isinstance(input_df, dask_cudf.DataFrame):
            # get the client
            client = dask.distributed.client.default_client()
            if 'train_date' in self.conf:
                train_date = datetime.datetime.strptime(
                    self.conf['train_date'],  # noqa: F841, E501
                    '%Y-%m-%d')
                model_df = model_df[model_df.datetime < train_date]
            train = model_df[train_cols]
            target = model_df[self.conf['target']]
            dmatrix = xgb.dask.DaskDMatrix(client, train, label=target)
            bst = xgb.dask.train(client,
                                 dxgb_params,
                                 dmatrix,
                                 num_boost_round=self.conf["num_of_rounds"])

            dtrain = xgb.dask.DaskDMatrix(client, input_df[train_cols])
            prediction = xgb.dask.predict(client, bst, dtrain)
            pred_df = dask_cudf.from_dask_dataframe(
                prediction.to_dask_dataframe())
            pred_df.index = input_df.index
            input_df['signal'] = pred_df
        elif isinstance(input_df, cudf.DataFrame):
            if 'train_date' in self.conf:
                train_date = datetime.datetime.strptime(
                    self.conf['train_date'],  # noqa: F841, E501
                    '%Y-%m-%d')
                model_df = model_df.query('datetime<@train_date')
            train = model_df[train_cols]
            target = model_df[self.conf['target']]
            dmatrix = xgb.DMatrix(train, label=target)
            bst = xgb.train(dxgb_params,
                            dmatrix,
                            num_boost_round=self.conf["num_of_rounds"])
            infer_dmatrix = xgb.DMatrix(input_df[train_cols])
            prediction = cudf.Series(bst.predict(infer_dmatrix),
                                     nan_as_null=False,
                                     index=input_df.index).astype('float64')
            input_df['signal'] = prediction

        input_df['tmp'] = (input_df['asset'] -
                           input_df['asset'].shift(1)).fillna(1)
        input_df['tmp'] = (input_df['tmp'] != 0).astype('int32')
        tmp = input_df['tmp']
        input_df['tmp'] = tmp.where(tmp != 1, None)
        input_df = input_df.dropna(subset=['tmp'])
        input_df = input_df.drop('tmp', axis=1)

        # Convert the signal to a trading action: 1 is buy and -1 is sell.
        # The model predicts the next day's return (shift -1), so we shift the
        # trading action by 1 so that it acts on the following day.
        input_df['signal'] = ((input_df['signal'] >= 0).astype('float') * 2 -
                              1).shift(1)

        # Remove the bad data points
        input_df = input_df.dropna()
        remaining = list(self.conf['no_feature']) + ['signal']
        return {self.OUTPUT_PORT_NAME: input_df[remaining]}
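The signal-to-action conversion at the end is worth a tiny worked example: the model predicts the next day's return, so the derived buy/sell action is shifted by one day before it is acted on. A toy illustration with made-up numbers (plain pandas, just to show the arithmetic):

import pandas as pd

# Hypothetical predicted next-day returns for one asset.
pred = pd.Series([0.02, -0.01, 0.03, -0.02])

# Map to +1 (buy) / -1 (sell), then shift so the action is taken the next day.
signal = ((pred >= 0).astype('float') * 2 - 1).shift(1)
print(signal.tolist())  # [nan, 1.0, -1.0, 1.0]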
Example #22
    def __init__(
        self,
        path_or_source,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        dtypes=None,
        client=None,
        cpu=None,
        base_dataset=None,
        **kwargs,
    ):
        self.dtypes = dtypes
        self.client = client

        # Check if we are keeping data in cpu memory
        self.cpu = cpu or False

        # Keep track of base dataset (optional)
        self.base_dataset = base_dataset or self

        # For now, let's warn the user that "cpu mode" is experimental
        if self.cpu:
            warnings.warn(
                "Initializing an NVTabular Dataset in CPU mode."
                "This is an experimental feature with extremely limited support!"
            )

        if isinstance(path_or_source, (dask.dataframe.DataFrame, cudf.DataFrame, pd.DataFrame)):
            # User is passing in a <dask.dataframe|cudf|pd>.DataFrame
            # Use DataFrameDatasetEngine
            moved_collection = (
                False  # Whether a pd-backed collection was moved to cudf (or vice versa)
            )
            if self.cpu:
                if isinstance(path_or_source, pd.DataFrame):
                    # Convert pandas DataFrame to pandas-backed dask.dataframe.DataFrame
                    path_or_source = dask.dataframe.from_pandas(path_or_source, npartitions=1)
                elif isinstance(path_or_source, cudf.DataFrame):
                    # Convert cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                    path_or_source = dask.dataframe.from_pandas(
                        path_or_source.to_pandas(), npartitions=1
                    )
                elif isinstance(path_or_source, dask_cudf.DataFrame):
                    # Convert dask_cudf DataFrame to pandas-backed dask.dataframe.DataFrame
                    path_or_source = path_or_source.to_dask_dataframe()
                    moved_collection = True
            else:
                if isinstance(path_or_source, cudf.DataFrame):
                    # Convert cudf DataFrame to dask_cudf.DataFrame
                    path_or_source = dask_cudf.from_cudf(path_or_source, npartitions=1)
                elif isinstance(path_or_source, pd.DataFrame):
                    # Convert pandas DataFrame to dask_cudf.DataFrame
                    path_or_source = dask_cudf.from_cudf(
                        cudf.from_pandas(path_or_source), npartitions=1
                    )
                elif not isinstance(path_or_source, dask_cudf.DataFrame):
                    # Convert dask.dataframe.DataFrame to dask_cudf.DataFrame
                    path_or_source = dask_cudf.from_dask_dataframe(path_or_source)
                    moved_collection = True
            if part_size:
                warnings.warn("part_size is ignored for DataFrame input.")
            if part_mem_fraction:
                warnings.warn("part_mem_fraction is ignored for DataFrame input.")
            self.engine = DataFrameDatasetEngine(
                path_or_source, cpu=self.cpu, moved_collection=moved_collection
            )
        else:
            if part_size:
                # If a specific partition size is given, use it directly
                part_size = parse_bytes(part_size)
            else:
                # If a fractional partition size is given, calculate part_size
                part_mem_fraction = part_mem_fraction or 0.125
                assert 0.0 < part_mem_fraction < 1.0
                if part_mem_fraction > 0.25:
                    warnings.warn(
                        "Using very large partitions sizes for Dask. "
                        "Memory-related errors are likely."
                    )
                part_size = int(device_mem_size(kind="total") * part_mem_fraction)

            # Engine-agnostic path handling
            paths = path_or_source
            if hasattr(paths, "name"):
                paths = stringify_path(paths)
            if isinstance(paths, str):
                paths = [paths]
            paths = sorted(paths, key=natural_sort_key)

            storage_options = storage_options or {}
            # If engine is not provided, try to infer from end of paths[0]
            if engine is None:
                engine = paths[0].split(".")[-1]
            if isinstance(engine, str):
                if engine == "parquet":
                    self.engine = ParquetDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                elif engine == "csv":
                    self.engine = CSVDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                elif engine == "avro":
                    try:
                        from .avro import AvroDatasetEngine
                    except ImportError as e:
                        raise RuntimeError(
                            "Failed to import AvroDatasetEngine. Make sure uavro is installed."
                        ) from e

                    self.engine = AvroDatasetEngine(
                        paths, part_size, storage_options=storage_options, cpu=self.cpu, **kwargs
                    )
                else:
                    raise ValueError("Only parquet, csv, and avro supported (for now).")
            else:
                self.engine = engine(
                    paths, part_size, cpu=self.cpu, storage_options=storage_options
                )
Example #23
def _(embedding, n_pca, self):
    embedding = dask_cudf.from_dask_dataframe(embedding)
    return _gpu_cluster_wrapper(embedding, n_pca, self)