Пример #1
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    """Check that a fitted cuKNN predicts its own training labels.

    Blobs with ``cluster_std=0.01`` are generated, the model is fitted on
    them and then asked to predict the very same points. With
    ``datatype == "dataframe"`` the inputs are cuDF frames and the result
    must be a ``cudf.Series``; otherwise the result must be a NumPy array.
    """
    use_frames = datatype == "dataframe"

    features, labels = make_blobs(n_samples=nrows,
                                  centers=n_clusters,
                                  n_features=ncols,
                                  cluster_std=0.01,
                                  random_state=0)
    features = features.astype(np.float32)

    if use_frames:
        # Copy to the device and wrap as cuDF frames; the labels become a
        # single-column frame.
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        labels = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(labels.reshape(nrows, 1)))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, labels)
    predicted = model.predict(features)

    if not use_frames:
        assert isinstance(predicted, np.ndarray)
        assert array_equal(predicted.astype(np.int32),
                           labels.astype(np.int32))
        return

    assert isinstance(predicted, cudf.Series)
    assert array_equal(predicted.to_frame().astype(np.int32),
                       labels.astype(np.int32))
Пример #2
0
def test_predict_multioutput(input_type, output_type):
    """Fit cuKNN on two samples with two targets each and predict them back.

    The container type of the prediction is checked against ``output_type``
    and its values against the training targets.
    """
    X = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if input_type == "cudf":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))
        y = cudf.DataFrame.from_gpu_matrix(rmm.to_device(y))
    elif input_type == "cupy":
        X, y = cp.asarray(X), cp.asarray(y)

    model = cuKNN(n_neighbors=1, output_type=output_type)
    model.fit(X, y)
    predicted = model.predict(X)

    # Resolve the expected container lazily so that only the branch that
    # applies touches its library.
    expected_type = None
    if output_type == "cudf":
        expected_type = cudf.DataFrame
    elif output_type == "numpy":
        expected_type = np.ndarray
    elif output_type == "cupy":
        expected_type = cp.core.core.ndarray

    if expected_type is not None:
        assert isinstance(predicted, expected_type)

    assert array_equal(predicted.astype(np.int32), y)
Пример #3
0
def test_predict_proba_multioutput(input_type, output_type):
    """Check ``predict_proba`` for a two-target problem.

    The result must be a tuple whose members have the container type
    selected by ``output_type``; the first two members are compared with
    the hard-coded expected probability tables.
    """

    def _check_container(item):
        # One type assertion per supported output_type; other values of
        # output_type are not type-checked.
        if output_type == "cudf":
            assert isinstance(item, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(item, np.ndarray)
        elif output_type == "cupy":
            assert isinstance(item, cp.core.core.ndarray)

    X = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if input_type == "cudf":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))
        y = cudf.DataFrame.from_gpu_matrix(rmm.to_device(y))
    elif input_type == "cupy":
        X, y = cp.asarray(X), cp.asarray(y)

    expected_first = np.array([[0., 1.], [1., 0.]]).astype(np.float32)
    expected_second = np.array([[1., 0.], [0., 1.]]).astype(np.float32)

    model = cuKNN(n_neighbors=1, output_type=output_type)
    model.fit(X, y)
    probabilities = model.predict_proba(X)

    assert isinstance(probabilities, tuple)

    for table in probabilities:
        _check_container(table)

    assert array_equal(probabilities[0].astype(np.float32), expected_first)
    assert array_equal(probabilities[1].astype(np.float32), expected_second)
Пример #4
0
def test_predict_proba(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Check class probabilities predicted for the training points.

    The arg-max of every probability row must equal the training label, and
    every row must sum to one.
    """
    use_frames = datatype == "dataframe"

    features, labels = make_blobs(n_samples=nrows,
                                  centers=n_clusters,
                                  n_features=ncols,
                                  cluster_std=0.01,
                                  random_state=0)
    features = features.astype(np.float32)

    if use_frames:
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        labels = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(labels.reshape(nrows, 1)))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, labels)
    probabilities = model.predict_proba(features)

    if use_frames:
        assert isinstance(probabilities, cudf.DataFrame)
        # Bring both the result and the labels back to host arrays so the
        # NumPy checks below apply to either datatype.
        probabilities = probabilities.as_gpu_matrix().copy_to_host()
        labels = labels.as_gpu_matrix().copy_to_host().reshape(nrows)
    else:
        assert isinstance(probabilities, np.ndarray)

    most_likely = np.argmax(probabilities, axis=1)

    assert array_equal(most_likely.astype(np.int32), labels.astype(np.int32))
    assert array_equal(probabilities.sum(axis=1), np.ones(nrows))
Пример #5
0
def test_array_split(type, test_size, train_size, shuffle):
    """Check ``train_test_split`` on host, CuPy, Numba and RMM arrays.

    ``X`` is a (100, 10) array whose row ``i`` is filled with ``i`` and ``y``
    is the matching (100, 1) column of row numbers. The split results are
    checked for container type and for the requested train/test sizes.

    Note that the ``type`` parameter shadows the builtin of the same name
    inside this function.
    """
    X = np.zeros((100, 10)) + np.arange(100).reshape(100, 1)
    y = np.arange(100).reshape(100, 1)

    # Move the inputs into the container selected by ``type``; any other
    # value leaves them as NumPy arrays.
    if type == 'cupy':
        X = cp.asarray(X)
        y = cp.asarray(y)

    if type == 'numba':
        X = cuda.to_device(X)
        y = cuda.to_device(y)

    if type == 'rmm':
        X = rmm.to_device(X)
        y = rmm.to_device(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size,
                                                        test_size=test_size,
                                                        shuffle=shuffle,
                                                        random_state=0)

    # The outputs must come back in the same kind of container as the inputs.
    if type == 'cupy':
        assert isinstance(X_train, cp.ndarray)
        assert isinstance(X_test, cp.ndarray)
        assert isinstance(y_train, cp.ndarray)
        assert isinstance(y_test, cp.ndarray)

    if type in ['numba', 'rmm']:
        assert cuda.devicearray.is_cuda_ndarray(X_train)
        assert cuda.devicearray.is_cuda_ndarray(X_test)
        assert cuda.devicearray.is_cuda_ndarray(y_train)
        assert cuda.devicearray.is_cuda_ndarray(y_test)

    # NOTE(review): these are exact equality checks against
    # ``shape[0] * size``; they presumably expect fractional sizes whose
    # product with 100 is exact -- confirm against the parametrization.
    if train_size is not None:
        assert X_train.shape[0] == X.shape[0] * train_size
        assert y_train.shape[0] == y.shape[0] * train_size

    if test_size is not None:
        assert X_test.shape[0] == X.shape[0] * test_size
        assert y_test.shape[0] == y.shape[0] * test_size

    # NOTE(review): this branch only runs when ``shuffle`` is None. It
    # asserts on array ``==`` results, uses ``train_size`` / ``test_size``
    # directly as slice bounds, and calls ``cp.concatenate`` with two
    # positional arrays rather than a sequence of arrays. Confirm whether it
    # is ever exercised and what it is meant to check.
    if shuffle is None:
        assert X_train == X[0:train_size]
        assert y_train == y[0:train_size]
        assert X_test == X[-1 * test_size:]
        assert y_test == y[-1 * test_size:]

        # ``tnc`` and ``PatchedNumbaDeviceArray`` are defined outside this
        # block; presumably they wrap Numba device arrays so CuPy can
        # consume them -- TODO confirm.
        if tnc(X_train):
            X_train = PatchedNumbaDeviceArray(X_train)
            X_test = PatchedNumbaDeviceArray(X_test)
            y_train = PatchedNumbaDeviceArray(y_train)
            y_test = PatchedNumbaDeviceArray(y_test)

        X_rec = cp.sort(cp.concatenate(X_train, X_test))
        y_rec = cp.sort(cp.concatenate(y_train, y_test))

        assert X_rec == X
        assert y_rec == y
Пример #6
0
def input_to_device_arrays(X, params):
    """
    Allocate zero-filled output buffers on the device for one input chunk.

    :param X:
        Indexable whose first element is the input frame; only ``X[0]`` is
        read here.
    :param params:
        Mapping that provides ``"k"``; the output buffers hold
        ``rows * params["k"]`` elements.
    :return:
        ``None`` when ``X[0]`` is empty. Otherwise a 3-tuple of
        ``[(X_mat, indices, distances)]``, the value returned by
        ``device_of_devicendarray`` for ``X_mat``, and the
        ``(first, last)`` index labels of ``X[0]``.
    """
    frame = X[0]
    if len(frame) == 0:
        return None

    first_label = frame.index[0]
    last_label = frame.index[-1]

    matrix = numba_utils.row_matrix(frame)
    # Presumably a canonical device id for the matrix -- verify against
    # ``device_of_devicendarray``.
    device = device_of_devicendarray(matrix)

    n_out = matrix.shape[0] * params["k"]

    # Flat int64 buffer for neighbor indices, float32 buffer for distances.
    indices = rmm.to_device(np.zeros(n_out, dtype=np.int64, order="C"))
    distances = rmm.to_device(np.zeros(n_out, dtype=np.float32, order="C"))

    return [(matrix, indices, distances)], device, (first_label, last_label)
Пример #7
0
def test_scatter_count():
    """Exercise ``nvtext.scatter_count`` with list, pointer and null counts."""
    dstrings = nvstrings.to_device(["Dickens", "Einstein", "Christie"])
    repeated = ["Dickens"] + ["Einstein"] * 2 + ["Christie"] * 3

    # Counts given as a host list.
    from_list = nvtext.scatter_count(dstrings, [1, 2, 3])
    assert from_list.to_host() == repeated

    # Counts given as the address of an int32 device array.
    counts_dev = rmm.to_device(np.array([1, 2, 3], dtype="int32"))
    counts_ptr = counts_dev.device_ctypes_pointer.value
    from_pointer = nvtext.scatter_count(dstrings, counts_ptr)
    assert from_pointer.to_host() == repeated

    # Counts containing a zero and a None entry.
    with_nulls = nvtext.scatter_count(dstrings, [2, 0, None])
    assert with_nulls.to_host() == ["Dickens", "Dickens"]
Пример #8
0
def read_data():
    """Load the ``ipums.pkl`` fixture and stage it as serialized Arrow data.

    The pickle is read from the ``data`` directory next to this file. Each
    column is converted to an Arrow array and packed into a record batch,
    whose schema and body are serialized separately.

    Returns
    -------
    tuple
        ``(df, schema, darr)`` where ``df`` is the pandas frame read from
        the pickle, ``schema`` is the serialized Arrow schema as a host
        ``np.byte`` array, and ``darr`` is the serialized record batch
        copied to the device with ``rmm.to_device``.

    Notes
    -----
    The calling test is skipped when the pickle file does not exist. Any
    other error while reading it now propagates; previously it was printed
    and the function then failed with a ``NameError`` on the unbound ``df``.
    """
    import pandas as pd

    basedir = os.path.dirname(__file__)
    datapath = os.path.join(basedir, "data", "ipums.pkl")
    try:
        df = pd.read_pickle(datapath)
    except FileNotFoundError:
        pytest.skip(".pkl file is not found")

    names = list(df.columns)
    arrays = [pa.Array.from_pandas(df[k]) for k in names]
    batch = pa.RecordBatch.from_arrays(arrays, names)

    # Wrap mutable copies of the serialized bytes as flat byte arrays.
    schema = batch.schema.serialize().to_pybytes()
    schema = np.ndarray(shape=len(schema),
                        dtype=np.byte,
                        buffer=bytearray(schema))
    data = batch.serialize().to_pybytes()
    data = np.ndarray(shape=len(data), dtype=np.byte, buffer=bytearray(data))
    darr = rmm.to_device(data)
    return df, schema, darr
Пример #9
0
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Fit cuKNN on blobs with ``cluster_std=0.01`` and check its self-score.

    The score on the training data must be at least ``1.0 - 0.004``.
    """
    features, labels = make_blobs(n_samples=nrows, centers=n_clusters,
                                  n_features=ncols, random_state=0,
                                  cluster_std=0.01)
    features = features.astype(np.float32)

    if datatype == "dataframe":
        # The labels become a single-column cuDF frame.
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        labels = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(labels.reshape(nrows, 1)))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, labels)

    lowest_acceptable = 1.0 - 0.004
    assert model.score(features, labels) >= lowest_acceptable
Пример #10
0
def test_gpu_parse_arrow_int(dtype):
    """Round-trip two integer columns through ``GpuArrowReader``.

    A two-column record batch is serialized on the host, its body is copied
    to the device, and the parsed ``depdelay`` column is checked for dtype
    and values.
    """

    def _as_byte_array(raw):
        # Flat np.byte array over a mutable copy of the serialized bytes.
        return np.ndarray(shape=len(raw), dtype=np.byte,
                          buffer=bytearray(raw))

    dep_values = [0, 0, -3, -2, 11, 6, -7, -4, 4, -3]
    arr_values = [5, -3, 1, -2, 22, 11, -12, -5, 4, -9]

    depdelay = np.array(dep_values, dtype=dtype)
    arrdelay = np.array(arr_values, dtype=dtype)
    batch = pa.RecordBatch.from_arrays(
        [pa.array(depdelay), pa.array(arrdelay)], ["depdelay", "arrdelay"]
    )

    schema = _as_byte_array(batch.schema.serialize().to_pybytes())
    host_batch = _as_byte_array(batch.serialize().to_pybytes())

    device_batch = rmm.to_device(host_batch)
    columns = GpuArrowReader(schema, device_batch).to_dict()

    assert columns["depdelay"].dtype == dtype
    assert set(columns) == {"depdelay", "arrdelay"}
    assert list(columns["depdelay"]) == [0, 0, -3, -2, 11, 6, -7, -4, 4, -3]
Пример #11
0
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    """Check labels derived from ``kneighbors`` indices against the truth.

    The model is fitted without labels; the neighbor indices it returns are
    passed to the external ``predict`` helper together with the true labels,
    and the resulting labels must match. The test is skipped when
    ``has_scipy()`` is false.
    """
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because '
                    'Scipy is missing')

    use_frames = datatype == "dataframe"

    features, y = make_blobs(n_samples=nrows, centers=n_clusters,
                             n_features=ncols, random_state=0)
    features = features.astype(np.float32)

    if use_frames:
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))

    model = cuKNN()
    model.fit(features)
    neighbor_ids = model.kneighbors(features, n_neighbors=n_neighbors,
                                    return_distance=False)

    if use_frames:
        assert isinstance(neighbor_ids, cudf.DataFrame)
        neighbor_ids = neighbor_ids.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neighbor_ids, np.ndarray)

    # Only the labels are checked; the second return value is unused.
    labels, _ = predict(neighbor_ids, y, n_neighbors)

    assert array_equal(labels, y)
Пример #12
0
def test_gpu_parse_arrow_data():
    """Parse a serialized lat/lon record batch with ``GpuArrowReader``.

    Checks column names, sizes and value bounds, and that ``to_dict``
    returns the same data as direct column access.
    """
    batch = make_gpu_parse_arrow_data_batch()

    # To ensure compatibility for OmniSci the host arrays are created with
    # np.frombuffer, which yields read-only arrays; that is how NumPy arrays
    # created from foreign memory buffers will be set.
    cpu_schema = np.frombuffer(batch.schema.serialize(), dtype=np.uint8)
    cpu_data = np.frombuffer(batch.serialize(), dtype=np.uint8)
    gpu_data = rmm.to_device(cpu_data)
    del cpu_data

    reader = GpuArrowReader(cpu_schema, gpu_data)

    lat_node = reader[0]
    assert lat_node.name == "dest_lat"
    lon_node = reader[1]
    assert lon_node.name == "dest_lon"

    lat = lat_node.data.copy_to_host()
    lon = lon_node.data.copy_to_host()
    assert lat.size == 23
    assert lon.size == 23

    # Strict bounds: 27 < lat < 42 and -105 < lon < -76.
    np.testing.assert_array_less(lat, 42)
    np.testing.assert_array_less(27, lat)
    np.testing.assert_array_less(lon, -76)
    np.testing.assert_array_less(-105, lon)

    as_dict = reader.to_dict()
    np.testing.assert_array_equal(lat, as_dict["dest_lat"].to_array())
    np.testing.assert_array_equal(lon, as_dict["dest_lon"].to_array())
Пример #13
0
def test_cuml_against_sklearn(input_type, nrows, n_feats, k):
    """Compare cuKNN ``kneighbors`` output with scikit-learn's.

    Both models are fitted on the same blobs and queried with the same
    points; distances are compared with a 1e-2 tolerance.
    """
    use_frames = input_type == "dataframe"

    features, _ = make_blobs(n_samples=nrows,
                             n_features=n_feats, random_state=0)

    reference = skKNN(metric="euclidean")
    reference.fit(features)
    D_sk, I_sk = reference.kneighbors(features, k)

    if use_frames:
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))

    model = cuKNN()
    model.fit(features)
    D_cuml, I_cuml = model.kneighbors(features, k)

    if use_frames:
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(D_cuml, np.ndarray)
        assert isinstance(I_cuml, np.ndarray)
        D_cuml_arr, I_cuml_arr = D_cuml, I_cuml

    assert array_equal(D_cuml_arr, D_sk, 1e-2, with_sign=True)
    # NOTE(review): ``.all()`` reduces each index array to a single boolean,
    # so this compares two truth values rather than the indices themselves.
    assert I_cuml_arr.all() == I_sk.all()
Пример #14
0
def _request_transfer(key, remoteinfo):
    """Request the data stored under *key* from a remote holder.

    A ZeroMQ REQ socket is connected to ``tcp://remoteinfo[0]:remoteinfo[1]``.
    When the remote address equals this process's address
    (``_global_addr[0]``) the data is requested with an ``"IPC"`` message and
    copied into a new RMM device array; otherwise it is requested with a
    ``"NET"`` message and the unpickled payload is copied to the device.
    In both cases ``_request_drop(socket, key)`` is called before returning.

    Parameters
    ----------
    key
        Identifier sent to the remote side; also logged.
    remoteinfo
        Sequence whose first two items are the remote host and port.

    Returns
    -------
    The device array holding the transferred data.

    Notes
    -----
    SECURITY: the replies are passed to ``pickle.loads``. Unpickling can
    execute arbitrary code, so this must only be used with trusted peers.
    """
    logger.info("rebuild from: %s for %r", remoteinfo, key)

    # NOTE(review): neither the context nor the socket is closed in this
    # function -- confirm whether that is handled elsewhere.
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("tcp://{0}:{1}".format(*remoteinfo))

    myaddr = _global_addr[0]
    theiraddr = remoteinfo[0]
    if myaddr == theiraddr:
        # Same machine go by IPC
        logger.info("request by IPC")
        socket.send(pickle.dumps(("IPC", key)))
        rcv = socket.recv()
        # SECURITY: unpickles bytes received from the socket.
        ipch = pickle.loads(rcv)
        # Open IPC and copy to local context

        # The copy is made inside the ``with`` block, while the opened
        # handle is still active.
        with ipch as data:
            copied = rmm.device_array_like(data)
            copied.copy_to_device(data)

        # Release
        _request_drop(socket, key)
        return copied
    else:
        # Different machine go by NET
        logger.info("request by NET: %s->%s", theiraddr, myaddr)
        socket.send(pickle.dumps(("NET", key)))
        rcv = socket.recv()
        # SECURITY: unpickles bytes received from the socket.
        output = rmm.to_device(pickle.loads(rcv))
        # Release
        _request_drop(socket, key)
        return output
Пример #15
0
def test_gpu_parse_arrow_timestamps(dtype):
    """Round-trip a timestamp column of ``dtype`` through ``GpuArrowReader``.

    The column parsed from the serialized batch must equal the source cuDF
    column, both through direct access and through ``to_dict``.
    """
    series = cudf.datasets.timeseries(
        start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={}
    )
    series = series.reset_index()["timestamp"]
    series = series.reset_index(drop=True)

    gdf = cudf.DataFrame({"timestamp": series.astype(dtype)})
    table = gdf.to_arrow(preserve_index=False)
    schema_data = table.schema.serialize()
    recbatch_data = table.to_batches()[0].serialize()

    # To ensure compatibility for OmniSci the host arrays are created with
    # np.frombuffer, which yields read-only arrays; that is how NumPy arrays
    # created from foreign memory buffers will be set.
    cpu_schema = np.frombuffer(schema_data, dtype=np.uint8)
    cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8)
    gpu_data = rmm.to_device(cpu_data)
    del cpu_data

    reader = GpuArrowReader(cpu_schema, gpu_data)
    first_node = reader[0]
    assert first_node.name == "timestamp"

    parsed = first_node.data.copy_to_host()
    np.testing.assert_array_equal(parsed, gdf["timestamp"].to_array())

    as_dict = reader.to_dict()
    np.testing.assert_array_equal(parsed, as_dict["timestamp"].to_array())
Пример #16
0
    def __getitem__(self, index):
        """Return the element(s) of this index selected by *index*.

        Parameters
        ----------
        index : slice, Number, list, numpy.ndarray or other
            * slice -- returns a ``RangeIndex`` (empty, or unit-step) or the
              result of ``index_from_range`` for other steps.
            * Number -- normalized against ``len(self)`` and returned as a
              plain value offset by ``self._start``.
            * list / ndarray -- copied to the device and used to index
              ``self._values``.
            * anything else -- converted with ``column.as_column`` (scalars
              are first cast with ``min_signed_type``) and used to index
              ``self._values``.

        Returns
        -------
        A scalar for ``Number`` input, otherwise an index object.
        """
        from numbers import Number

        if isinstance(index, slice):
            start, stop, step = index.indices(len(self))
            # Exact number of selected positions. Floor-dividing the span by
            # the step under-counts whenever the span is not a multiple of
            # the step (e.g. ``[0:1:2]`` selects one element, not zero).
            sln = len(range(start, stop, step))
            start += self._start
            stop += self._start
            if sln == 0:
                return RangeIndex(0, None, self.name)
            elif step == 1:
                return RangeIndex(start, stop, self.name)
            else:
                return index_from_range(start, stop, step)

        elif isinstance(index, Number):
            index = utils.normalize_index(index, len(self))
            index += self._start
            return index
        elif isinstance(index, (list, np.ndarray)):
            index = np.asarray(index)
            index = rmm.to_device(index)

        else:
            if is_scalar(index):
                index = min_signed_type(index)(index)
            index = column.as_column(index)

        return as_index(self._values[index], name=self.name)
Пример #17
0
def host_to_device(s: DeviceSerialized) -> object:
    """Rebuild an object from *s*, moving flagged frames to the device.

    Each entry of ``s.parts`` whose matching ``s.is_cuda`` flag is truthy is
    passed through ``cuda_memory_manager.to_device``; the others are used
    unchanged. The frames are then deserialized with ``s.header``.
    """
    frames = []
    for on_device, part in zip(s.is_cuda, s.parts):
        if on_device:
            part = cuda_memory_manager.to_device(part)
        frames.append(part)

    return deserialize(s.header, frames)
Пример #18
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Build cuDF mask and data buffers from a pyarrow array.

    Parameters
    ----------
    pa_arr : pyarrow array
        Source array; its first two buffers are read.
    dtype : optional
        Dtype used to view the data buffer. When falsy it is taken from the
        Arrow type (from the indices type for dictionary arrays).

    Returns
    -------
    tuple
        ``(pamask, padata)``; ``pamask`` is ``None`` when the first Arrow
        buffer is falsy.
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    arrow_buffers = pa_arr.buffers()

    pamask = None
    if arrow_buffers[0]:
        # Copy the first Arrow buffer into a freshly made device mask.
        device_mask = make_mask(len(pa_arr))
        device_source = rmm.to_device(
            np.array(arrow_buffers[0]).view("int8"))
        copy_array(device_source, device_mask)
        pamask = Buffer(device_mask)

    if dtype:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if not arrow_buffers[1]:
        return (pamask, Buffer(np.empty(0, dtype=new_dtype)))

    # Keep only the elements that belong to this (possibly offset) array.
    typed = np.array(arrow_buffers[1]).view(new_dtype)
    window = slice(pa_arr.offset, pa_arr.offset + len(pa_arr))
    return (pamask, Buffer(typed[window]))
Пример #19
0
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
        Compute the row order that sorts the given column(s).

        Parameters
        ----------
        by : Column or list of Column
            Key column(s) to order by.
        ascending : bool or list of bool, default True
            Sort direction; a list gives one direction per key column.
        na_position : {'first' or 'last'}, default 'last'
            'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
            Only honoured when ``ascending`` is exactly ``True`` or
            ``False``; otherwise a warning is logged and it is ignored.

        Returns
        -------
        col_inds : cuDF Column of indices sorted based on input

        Raises
        ------
        ValueError
            If ``ascending`` is neither a scalar nor a sequence.

        Difference from pandas:
          * Support axis='index' only.
          * Not supporting: inplace, kind
          * Ascending can be a list of bools to control per column
    """
    keys = [by] if isinstance(by, ColumnBase) else by

    col_inds = column.as_column(
        cudautils.arange(len(keys[0]), dtype="int32"))

    # Translate na_position into the 0/1 flag passed to libcudf. The flag
    # flips with the sort direction.
    if ascending is True or ascending is False:
        if na_position == "last":
            na_position = 0 if ascending else 1
        elif na_position == "first":
            na_position = 1 if ascending else 0
    else:
        # This needs to be updated to handle list of bools for ascending
        logging.warning(
            "When using a sequence of booleans for `ascending`, `na_position` "
            "flag is not yet supported and defaults to treating nulls as "
            "greater than all numbers")
        na_position = 0

    # A scalar direction applies to every key column.
    if np.isscalar(ascending):
        ascending = [ascending] * len(keys)

    if not isinstance(ascending, collections.abc.Sequence):
        raise ValueError("Must use a boolean or list of booleans")

    # Need to flip the boolean here since libcudf has 0 as ascending
    flipped = [not flag for flag in ascending]
    ascending = rmm.to_device(np.array(flipped, dtype="int8"))

    libcudf.sort.order_by(keys, col_inds, ascending, na_position)

    return col_inds
Пример #20
0
def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
    """Fit cuKNN on blobs with float32 labels and check its self-score.

    The score on the training data must be at least 0.9999.
    """
    # Using make_blobs here to check averages and neighborhoods
    features, targets = make_blobs(n_samples=nrows, centers=n_clusters,
                                   cluster_std=0.01,
                                   n_features=ncols, random_state=0)

    features = features.astype(np.float32)
    targets = targets.astype(np.float32)

    if datatype == "dataframe":
        # The targets become a single-column cuDF frame.
        features = cudf.DataFrame.from_gpu_matrix(rmm.to_device(features))
        targets = cudf.DataFrame.from_gpu_matrix(
            rmm.to_device(targets.reshape(nrows, 1)))

    model = cuKNN(n_neighbors=n_neighbors)
    model.fit(features, targets)

    achieved = model.score(features, targets)
    assert achieved >= 0.9999
Пример #21
0
def test_gather_single_col():
    """Gather rows of a 0..99 int32 column and expect the map itself back.

    Because the column holds its own row numbers, the gathered values must
    equal the gather map.
    """
    source = column.as_column(np.arange(100), dtype=np.int32)
    positions = np.array([0, 1, 2, 3, 5, 8, 13, 21], dtype=np.int32)

    device_positions = rmm.to_device(positions)
    gathered = libcudf.copying.gather(source, device_positions)

    np.testing.assert_array_equal(gathered.to_array(), positions)
Пример #22
0
def test_from_offsets_dev_data():
    """Build an nvstrings object from device-resident chars/offsets/bitmask.

    The character buffer spells ``applepear``; the offsets ``[0, 5, 5, 9]``
    split it into three entries, and the result is expected to be
    ``["apple", None, "pear"]``.
    """
    chars = rmm.to_device(np.array(list(b"applepear"), dtype=np.int8))
    offsets = rmm.to_device(np.array([0, 5, 5, 9], dtype=np.int32))
    bitmask = rmm.to_device(np.array([5], dtype=np.int8))

    strings = nvstrings.from_offsets(
        chars.device_ctypes_pointer.value,
        offsets.device_ctypes_pointer.value,
        3,
        bitmask.device_ctypes_pointer.value,
        1,
        True,
    )

    assert_eq(strings, ["apple", None, "pear"])
Пример #23
0
def test_rf_classification_multi_class(datatype, column_info, nrows, n_classes,
                                       type):
    """Compare cuML random-forest accuracy with scikit-learn's.

    ``datatype`` is a pair of dtypes (training features, test features) and
    ``column_info`` a pair (number of columns, number of informative
    columns). For fewer than 500000 rows the cuML accuracy must be within
    0.07 of the scikit-learn accuracy. ``type`` selects cuDF or NumPy input
    and shadows the builtin of the same name inside this function.
    """
    ncols, n_info = column_info
    train_dtype, test_dtype = datatype[0], datatype[1]

    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=0,
                               n_classes=n_classes)
    X = X.astype(train_dtype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    X_test = X_test.astype(test_dtype)

    # cuML model: fit and predict, with prediction run on the CPU path.
    cuml_model = curfc()
    if type != 'dataframe':
        cuml_model.fit(X_train, y_train)
        cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    else:
        train_frame = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X_train))
        train_labels = cudf.Series(y_train)
        test_frame = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X_test))
        cuml_model.fit(train_frame, train_labels)
        cu_preds = cuml_model.predict(test_frame,
                                      predict_model="CPU").to_array()

    cu_acc = accuracy_score(y_test, cu_preds)

    # The scikit-learn comparison is only run below 500000 rows.
    if nrows >= 500000:
        return

    sk_model = skrfc(max_depth=16, random_state=10)
    sk_model.fit(X_train, y_train)
    sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
    assert cu_acc >= (sk_acc - 0.07)
Пример #24
0
def column_hash_values(column0, *other_columns, initial_hash_values=None):
    """Hash all values in the given columns.

    Parameters
    ----------
    column0, *other_columns
        Columns to hash together; the result has ``len(column0)`` entries.
    initial_hash_values : sequence or array, optional
        When given and non-empty it is copied to the device with
        ``rmm.to_device`` before being passed to libcudf. ``None`` and empty
        inputs are passed through unchanged.

    Returns
    -------
    A new NumericalColumn[int32]
    """
    columns = [column0] + list(other_columns)
    buf = Buffer(rmm.device_array(len(column0), dtype=np.int32))
    result = NumericalColumn(data=buf, dtype=buf.dtype)
    # Test for presence explicitly: plain truthiness raises for
    # multi-element NumPy arrays and is False for a one-element array
    # holding zero.
    if initial_hash_values is not None and len(initial_hash_values) > 0:
        initial_hash_values = rmm.to_device(initial_hash_values)
    libcudf.hash.hash_columns(columns, result, initial_hash_values)
    return result
Пример #25
0
def test_predict_multioutput(datatype):
    """Fit cuKNN on two samples with two targets each and predict them back.

    With ``datatype == "dataframe"`` the inputs are cuDF frames and the
    prediction must be a ``cudf.DataFrame``; otherwise it must be a NumPy
    array. The predicted values must equal the training targets.
    """
    use_frames = datatype == "dataframe"

    X = np.array([[0, 0, 1], [1, 0, 1]]).astype(np.float32)
    y = np.array([[15, 2], [5, 4]]).astype(np.int32)

    if use_frames:
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))
        y = cudf.DataFrame.from_gpu_matrix(rmm.to_device(y))

    model = cuKNN(n_neighbors=1)
    model.fit(X, y)
    predicted = model.predict(X)

    if use_frames:
        assert isinstance(predicted, cudf.DataFrame)
    else:
        assert isinstance(predicted, np.ndarray)

    assert array_equal(predicted.astype(np.int32), y)
Пример #26
0
def array_tester(dtype, nelem):
    """Round-trip an array through RMM device memory and check it.

    A host array of ``nelem`` elements of ``dtype`` filled with 3.2 is
    copied to the device, copied device-to-device into a second buffer, and
    copied back to the host, where it must equal the original.
    """
    # data
    h_in = np.full(nelem, 3.2, dtype)

    d_in = rmm.to_device(h_in)
    d_result = rmm.device_array_like(d_in)

    d_result.copy_to_device(d_in)
    # copy_to_host() allocates the host array itself, so no host buffer is
    # pre-allocated here.
    h_result = d_result.copy_to_host()

    np.testing.assert_array_equal(h_result, h_in)
Пример #27
0
def column_hash_values(column0, *other_columns, initial_hash_values=None):
    """Hash all values in the given columns.

    Parameters
    ----------
    column0, *other_columns
        Columns to hash together; the result has ``len(column0)`` entries.
    initial_hash_values : sequence or array, optional
        When given and non-empty it is copied to the device with
        ``rmm.to_device`` before being passed to libcudf. ``None`` and empty
        inputs are passed through unchanged.

    Returns
    -------
    A new NumericalColumn[int32]
    """
    from cudf.core.column import column_empty

    columns = [column0] + list(other_columns)
    result = column_empty(len(column0), dtype=np.int32, masked=False)
    # Test for presence explicitly: plain truthiness raises for
    # multi-element NumPy arrays and is False for a one-element array
    # holding zero.
    if initial_hash_values is not None and len(initial_hash_values) > 0:
        initial_hash_values = rmm.to_device(initial_hash_values)
    libcudf.hash.hash_columns(columns, result, initial_hash_values)
    return result
Пример #28
0
def _build_train_test_data(X, y, datatype, train_ratio=0.9):
    """Split ``X`` / ``y`` into train and test parts with a fixed seed.

    Every row is assigned to the training set with probability
    ``train_ratio``, drawn from ``RandomState(42)``, so the split is
    reproducible but the exact train fraction varies. With
    ``datatype == "dataframe"`` all four parts are copied to the device and
    wrapped as cuDF frames, the label parts as single-column frames.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """

    def _as_frame(host_array):
        return cudf.DataFrame.from_gpu_matrix(rmm.to_device(host_array))

    def _as_column_frame(host_vector):
        return _as_frame(host_vector.reshape(host_vector.shape[0], 1))

    rng = np.random.RandomState(42)
    in_train = rng.choice(
        [True, False],
        X.shape[0],
        replace=True,
        p=[train_ratio, 1.0 - train_ratio])
    in_test = ~in_train

    X_train, y_train = X[in_train], y[in_train]
    X_test, y_test = X[in_test], y[in_test]

    if datatype != "dataframe":
        return X_train, X_test, y_train, y_test

    X_train = _as_frame(X_train)
    y_train = _as_column_frame(y_train)
    X_test = _as_frame(X_test)
    y_test = _as_column_frame(y_test)

    return X_train, X_test, y_train, y_test
Пример #29
0
def test_compare():
    """Check ``nvstrings.compare`` with host output and a device buffer."""
    words = ["hello", "there", "world", "accéntéd", None, ""]
    strs = nvstrings.to_device(words)

    # Result returned to the host: the null entry stays None.
    assert_eq(strs.compare("there"), [-12, 0, 3, -19, None, -1])

    # device array
    out_dev = rmm.to_device(np.arange(strs.size(), dtype=np.int32))
    strs.compare("there", out_dev.device_ctypes_pointer.value)
    # Read back from the device buffer the null entry is expected as -1.
    assert_eq(out_dev.copy_to_host().tolist(), [-12, 0, 3, -19, -1, -1])
Пример #30
0
def test_rmm_csv_log(dtype, nelem):
    """Check that the RMM CSV log contains the expected header line.

    Some device activity (a host-to-device copy and a device-to-device copy)
    is performed before the log is read.
    """
    expected_header = ("Event Type,Device ID,Address,Stream,Size (bytes),"
                       "Free Memory,Total Memory,Current Allocs,Start,End,"
                       "Elapsed,Location")

    # data
    host_values = np.full(nelem, 3.2, dtype)

    device_values = rmm.to_device(host_values)
    device_copy = rmm.device_array_like(device_values)
    device_copy.copy_to_device(device_values)

    log_text = rmm.csv_log()

    assert expected_header in log_text