예제 #1
0
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
    """Train on cuDF-backed dask dataframes and cross-check prediction paths.

    A ``gpu_hist`` booster is trained for four rounds on a matrix built by
    ``DMatrixT``.  Predictions obtained from the distributed matrix, from
    ``inplace_predict`` and from ``predict`` on the dataframe itself are each
    compared against a single-node prediction made with the same booster.

    Parameters
    ----------
    DMatrixT :
        Callable invoked as ``DMatrixT(client, X, y)`` to build the training
        matrix.
    client :
        Client passed to every ``dxgb`` call in this function.
    """
    import cupy as cp
    cp.cuda.runtime.setDevice(0)

    # Dask arrays -> dask dataframes -> cuDF partitions.
    raw_X, raw_y, _ = generate_array()
    features = dd.from_dask_array(raw_X).map_partitions(cudf.from_pandas)
    labels = dd.from_dask_array(raw_y).map_partitions(cudf.from_pandas)

    n_rounds = 4
    params = {'tree_method': 'gpu_hist', 'debug_synchronize': True}
    dtrain = DMatrixT(client, features, labels)
    out = dxgb.train(
        client,
        params,
        dtrain=dtrain,
        evals=[(dtrain, 'X')],
        num_boost_round=n_rounds,
    )

    booster = out['booster']
    assert isinstance(booster, dxgb.Booster)
    # One rmse entry is expected per boosting round.
    assert len(out['history']['X']['rmse']) == n_rounds

    predictions = dxgb.predict(client, out, dtrain)
    assert isinstance(predictions.compute(), np.ndarray)

    series_predictions = dxgb.inplace_predict(client, out, features)
    assert isinstance(series_predictions, dd.Series)

    # Reference result: the same booster applied to the gathered data.
    single_node = booster.predict(xgboost.DMatrix(features.compute()))

    cp.testing.assert_allclose(single_node, predictions.compute())
    inplace_host = series_predictions.compute().to_array()
    np.testing.assert_allclose(single_node, inplace_host)

    predt = dxgb.predict(client, out, features)
    assert isinstance(predt, dd.Series)

    T = TypeVar('T')

    def is_df(part: T) -> T:
        assert isinstance(part, cudf.DataFrame), part
        return part

    # NOTE(review): the mapped collection is discarded without ``.compute()``,
    # so presumably ``is_df`` never actually runs on a partition -- confirm.
    expected_meta = dd.utils.make_meta({'prediction': 'f4'})
    predt.map_partitions(is_df, meta=expected_meta)

    cp.testing.assert_allclose(predt.values.compute(), single_node)

    # Both prediction collections must be assignable as new columns of the
    # original dataframe without producing any null entry.
    features["predict"] = predictions
    features["inplace_predict"] = series_predictions

    has_null = features.isnull().values.any().compute()
    assert not bool(has_null)
예제 #2
0
    def test_dask_dataframe(self):
        """Train on cuDF-backed dask dataframes inside a local CUDA cluster.

        A four-round ``gpu_hist`` booster is trained on a ``DaskDMatrix``; the
        distributed, in-place and dataframe predictions are then compared with
        a single-node prediction from the same booster.
        """
        with LocalCUDACluster() as cluster, Client(cluster) as client:
            import cupy as cp
            cp.cuda.runtime.setDevice(0)

            # Dask arrays -> dask dataframes -> cuDF partitions.
            raw_X, raw_y = generate_array()
            features = dd.from_dask_array(raw_X).map_partitions(
                cudf.from_pandas)
            labels = dd.from_dask_array(raw_y).map_partitions(
                cudf.from_pandas)

            n_rounds = 4
            params = {
                'tree_method': 'gpu_hist',
                'debug_synchronize': True
            }
            dtrain = dxgb.DaskDMatrix(client, features, labels)
            out = dxgb.train(client,
                             params,
                             dtrain=dtrain,
                             evals=[(dtrain, 'X')],
                             num_boost_round=n_rounds)

            booster = out['booster']
            assert isinstance(booster, dxgb.Booster)
            # One rmse entry is expected per boosting round.
            assert len(out['history']['X']['rmse']) == n_rounds

            predictions = dxgb.predict(client, out, dtrain).compute()
            assert isinstance(predictions, np.ndarray)

            lazy_inplace = dxgb.inplace_predict(client, out, features)
            assert isinstance(lazy_inplace, dd.Series)
            series_predictions = lazy_inplace.compute()

            # Reference result: the same booster applied to gathered data.
            single_node = booster.predict(
                xgboost.DMatrix(features.compute()))

            cp.testing.assert_allclose(single_node, predictions)
            np.testing.assert_allclose(single_node,
                                       series_predictions.to_array())

            predt = dxgb.predict(client, out, features)
            assert isinstance(predt, dd.Series)

            def is_df(part):
                assert isinstance(part, cudf.DataFrame), part
                return part

            # NOTE(review): the mapped collection is discarded without
            # ``.compute()``, so presumably ``is_df`` never runs -- confirm.
            expected_meta = dd.utils.make_meta({'prediction': 'f4'})
            predt.map_partitions(is_df, meta=expected_meta)

            cp.testing.assert_allclose(predt.values.compute(), single_node)
예제 #3
0
def run_with_dask_array(DMatrixT, client):
    """Train on cupy-backed dask arrays and compare the prediction paths.

    A two-round ``gpu_hist`` booster is trained on a matrix built by
    ``DMatrixT(client, X, y)``.  The prediction from the distributed matrix is
    compared with a single-node prediction, and the in-place prediction is
    checked to sit on the current CUDA device and to match the same values.
    """
    import cupy as cp
    cp.cuda.runtime.setDevice(0)

    # Convert every block of the generated arrays to a cupy array.
    host_X, host_y = generate_array()
    gpu_X = host_X.map_blocks(cp.asarray)
    gpu_y = host_y.map_blocks(cp.asarray)

    params = {'tree_method': 'gpu_hist', 'debug_synchronize': True}
    dtrain = DMatrixT(client, gpu_X, gpu_y)
    out = dxgb.train(
        client,
        params,
        dtrain=dtrain,
        evals=[(dtrain, 'X')],
        num_boost_round=2,
    )

    from_dmatrix = dxgb.predict(client, out, dtrain).compute()
    inplace_predictions = dxgb.inplace_predict(client, out, gpu_X).compute()
    gathered = xgboost.DMatrix(gpu_X.compute())
    single_node = out['booster'].predict(gathered)
    np.testing.assert_allclose(single_node, from_dmatrix)

    # The in-place result and the copied reference must share the device
    # reported by the CUDA runtime.
    device = cp.cuda.runtime.getDevice()
    assert device == inplace_predictions.device.id
    single_node_gpu = cp.array(single_node)
    assert device == single_node_gpu.device.id
    cp.testing.assert_allclose(single_node_gpu, inplace_predictions)
예제 #4
0
    def test_dask_array(self):
        """Train on cupy-backed dask arrays inside a local CUDA cluster.

        A two-round ``gpu_hist`` booster is trained on a ``DaskDMatrix``.  The
        distributed prediction is compared with a single-node prediction, and
        the in-place prediction is checked to sit on the current CUDA device
        and to match the same values.
        """
        with LocalCUDACluster() as cluster, Client(cluster) as client:
            import cupy as cp
            cp.cuda.runtime.setDevice(0)

            # Convert every block of the generated arrays to a cupy array.
            host_X, host_y = generate_array()
            gpu_X = host_X.map_blocks(cp.asarray)
            gpu_y = host_y.map_blocks(cp.asarray)

            params = {'tree_method': 'gpu_hist'}
            dtrain = dxgb.DaskDMatrix(client, gpu_X, gpu_y)
            out = dxgb.train(client,
                             params,
                             dtrain=dtrain,
                             evals=[(dtrain, 'X')],
                             num_boost_round=2)

            from_dmatrix = dxgb.predict(client, out, dtrain).compute()
            inplace_predictions = dxgb.inplace_predict(
                client, out, gpu_X).compute()
            gathered = xgboost.DMatrix(gpu_X.compute())
            single_node = out['booster'].predict(gathered)
            np.testing.assert_allclose(single_node, from_dmatrix)

            # The in-place result and the copied reference must share the
            # device reported by the CUDA runtime.
            device = cp.cuda.runtime.getDevice()
            assert device == inplace_predictions.device.id
            single_node_gpu = cp.array(single_node)
            assert device == single_node_gpu.device.id
            cp.testing.assert_allclose(single_node_gpu, inplace_predictions)
예제 #5
0
    def test_dask_dataframe(self):
        """Train on cuDF-backed dask dataframes and compare predictions.

        A two-round ``gpu_hist`` booster is trained on a ``DaskDMatrix``; the
        distributed and in-place predictions are each compared with a
        single-node prediction from the same booster.
        """
        with LocalCUDACluster() as cluster, Client(cluster) as client:
            import cupy

            # Dask arrays -> dask dataframes -> cuDF partitions.
            raw_X, raw_y = generate_array()
            features = dd.from_dask_array(raw_X).map_partitions(
                cudf.from_pandas)
            labels = dd.from_dask_array(raw_y).map_partitions(
                cudf.from_pandas)

            n_rounds = 2
            params = {'tree_method': 'gpu_hist'}
            dtrain = dxgb.DaskDMatrix(client, features, labels)
            out = dxgb.train(client,
                             params,
                             dtrain=dtrain,
                             evals=[(dtrain, 'X')],
                             num_boost_round=n_rounds)

            booster = out['booster']
            assert isinstance(booster, dxgb.Booster)
            # One rmse entry is expected per boosting round.
            assert len(out['history']['X']['rmse']) == n_rounds

            predictions = dxgb.predict(client, out, dtrain).compute()
            assert isinstance(predictions, np.ndarray)

            lazy_inplace = dxgb.inplace_predict(client, out, features)
            assert isinstance(lazy_inplace, dd.Series)
            series_predictions = lazy_inplace.compute()

            # Reference result: the same booster applied to gathered data.
            single_node = booster.predict(
                xgboost.DMatrix(features.compute()))

            cupy.testing.assert_allclose(single_node, predictions)
            cupy.testing.assert_allclose(single_node, series_predictions)
예제 #6
0
    def test_empty_dmatrix(self):
        """Train and predict with one-row and two-row dask inputs.

        First a single-row matrix is used for both training and evaluation;
        then a two-row matrix is used for training while the earlier
        single-row matrix serves as the evaluation and prediction set.
        NOTE(review): presumably such tiny inputs leave some workers without
        any rows, which is the "empty" case in the test name -- confirm.
        """
        def _check_outputs(result, predt):
            # Two boosting rounds were requested and the predicted matrix
            # always holds exactly one row.
            assert isinstance(result['booster'], dxgb.Booster)
            assert len(result['history']['validation']['rmse']) == 2
            assert isinstance(predt, np.ndarray)
            assert predt.shape[0] == 1

        params = {
            'tree_method': 'gpu_hist',
            'verbosity': 3,
            'debug_synchronize': True
        }

        with LocalCUDACluster() as cluster, Client(cluster) as client:
            n_rows = 1
            n_cols = 97
            X = dd.from_array(np.random.randn(n_rows, n_cols))
            y = dd.from_array(np.random.rand(n_rows))
            single_row = dxgb.DaskDMatrix(client, X, y)

            out = dxgb.train(client,
                             params,
                             dtrain=single_row,
                             evals=[(single_row, 'validation')],
                             num_boost_round=2)
            predictions = dxgb.predict(
                client=client, model=out, data=single_row).compute()
            _check_outputs(out, predictions)

            # The training set now has more rows than the evaluation set.
            valid = single_row
            n_rows += 1
            X = dd.from_array(np.random.randn(n_rows, n_cols))
            y = dd.from_array(np.random.rand(n_rows))
            dtrain = dxgb.DaskDMatrix(client, X, y)

            out = dxgb.train(client,
                             params,
                             dtrain=dtrain,
                             evals=[(valid, 'validation')],
                             num_boost_round=2)
            predictions = dxgb.predict(
                client=client, model=out, data=valid).compute()
            _check_outputs(out, predictions)
예제 #7
0
    def test_empty_partition(self,
                             local_cuda_cluster: LocalCUDACluster) -> None:
        """Check that a leading empty partition does not change predictions.

        The same data is trained twice with ``DaskDeviceQuantileDMatrix``:
        once with a zero-row partition concatenated in front, once without.
        Predictions from the two runs must agree.
        """
        import dask_cudf
        import cudf
        import cupy
        with Client(local_cuda_cluster) as client:
            repeat = 100
            df = cudf.DataFrame({
                "a": [1, 2, 3, 4, 5.1] * repeat,
                "b": [10, 15, 29.3, 30, 31] * repeat,
                "y": [10, 20, 30, 40., 50] * repeat,
            })
            parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}

            # First run: a zero-row slice of ``df`` becomes its own partition
            # ahead of two three-partition copies of the data.
            empty = df.iloc[:0]
            ddf = dask_cudf.concat([
                dask_cudf.from_cudf(empty, npartitions=1),
                dask_cudf.from_cudf(df, npartitions=3),
                dask_cudf.from_cudf(df, npartitions=3),
            ])
            feature_cols = ddf.columns.difference(["y"])
            X = ddf[feature_cols]
            y = ddf[["y"]]
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
            bst_empty = xgb.dask.train(
                client, parameters, dtrain, evals=[(dtrain, "train")])
            predt_empty = dxgb.predict(client, bst_empty, X).compute().values

            # Second run: identical data without the empty partition.
            ddf = dask_cudf.concat([
                dask_cudf.from_cudf(df, npartitions=3),
                dask_cudf.from_cudf(df, npartitions=3),
            ])
            feature_cols = ddf.columns.difference(["y"])
            X = ddf[feature_cols]
            y = ddf[["y"]]
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
            bst = xgb.dask.train(
                client, parameters, dtrain, evals=[(dtrain, "train")])
            predt = dxgb.predict(client, bst, X).compute().values

            cupy.testing.assert_allclose(predt, predt_empty)
예제 #8
0
def test_dask_dataframe(client):
    """Train a two-round ``gpu_hist`` booster on cuDF-backed dask dataframes.

    ``client`` is forwarded to ``dxgb.DaskDMatrix`` and ``dxgb.train``.
    NOTE(review): presumably ``client`` is supplied by a pytest fixture --
    confirm against the fixture definition.
    """
    X, y = generate_array()

    # Wrap the generated dask arrays as dask dataframes.
    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    # Convert every partition to a cuDF object.
    X = X.map_partitions(cudf.from_pandas)
    y = y.map_partitions(cudf.from_pandas)

    dtrain = dxgb.DaskDMatrix(client, X, y)
    # The training matrix doubles as the evaluation set, registered as 'X'.
    out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                     dtrain=dtrain,
                     evals=[(dtrain, 'X')],
                     num_boost_round=2)

    assert isinstance(out['booster'], dxgb.Booster)
    # One rmse entry is expected per boosting round.
    assert len(out['history']['X']['rmse']) == 2

    # NOTE(review): unlike ``train`` above, ``predict`` is called without
    # ``client`` here -- confirm this matches the dxgb version in use.
    predictions = dxgb.predict(out, dtrain)
    predictions = predictions.compute()