Exemplo n.º 1
def test_series_indexing(i1, i2, i3):
    """Chain three ``iloc`` selections and check scalar integer lookups.

    ``i1``, ``i2`` and ``i3`` are applied one after another; the successive
    results must be null-free and equal ``a[:12]``, ``a[3:12]`` and
    ``a[3:12:2]`` of the source array ``a = np.arange(20)``.
    """
    host = np.arange(20)
    series = Series(host)

    # Each stage indexes into the result of the previous stage.
    stages = (
        (i1, host[:12]),
        (i2, host[3:12]),
        (i3, host[3:12:2]),  # strided selection
    )
    current = series
    for indexer, expected in stages:
        current = current.iloc[indexer]
        assert current.null_count == 0
        np.testing.assert_equal(current.to_array(), expected)

    # Scalar lookups: Python ints come from a range, NumPy integers from an
    # ndarray whose dtype is listed in index_dtypes.
    yields_scalars = isinstance(i1, range) or (
        isinstance(i1, np.ndarray) and i1.dtype in index_dtypes
    )
    if yields_scalars:
        for pos in i1:
            assert series[pos] == host[pos]
def test_vectorizer_min_df():
    """Check that ``min_df`` moves rare characters into ``stop_words_``.

    The vectorizer is (re-)fitted on ``test_data`` after every change to
    ``min_df``; the fitted attributes ``vocabulary_`` and ``stop_words_``
    only reflect a setting once ``fit`` has run with it.
    """
    test_data = Series(['abc', 'dea', 'eat'])
    vect = CountVectorizer(analyzer='char', min_df=1)
    vect.fit(test_data)
    assert 'a' in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(test_data)
    assert 'c' not in vect.vocabulary_.to_arrow().to_pylist()  # {bcdt} ignored
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 2  # {ae} remain
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    # {bcdet} ignored
    assert 'c' not in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 1  # {a} remains
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 5
def test_count_binary_occurrences():
    """Check raw counts, binary occurrence output and the ``dtype`` option."""
    # by default multiple occurrences are counted as longs
    test_data = Series(['aaabc', 'abbde'])
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    X = cp.asnumpy(vect.fit_transform(test_data).todense())
    # NOTE(review): the second argument was cut off in the original; the
    # fitted vocabulary is assumed to be the feature list being compared.
    assert_array_equal(['a', 'b', 'c', 'd', 'e'],
                       vect.vocabulary_.to_arrow().to_pylist())
    assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    X = cp.asnumpy(vect.fit_transform(test_data).todense())
    assert_array_equal([[1, 1, 1, 0, 0], [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    # NOTE(review): the original call was truncated after analyzer='char';
    # dtype=cp.float32 is required by the assertion below, the other keyword
    # arguments mirror the previous construction.
    vect = CountVectorizer(analyzer='char', max_df=1.0,
                           binary=True, dtype=cp.float32)
    X = vect.fit_transform(test_data)
    assert X.dtype == cp.float32
Exemplo n.º 4
def test_series_nsmallest(data, n):
    """Indirectly tests Series.sort_values()"""
    sr = Series(data)
    psr = pd.Series(data)
    # The result on the Series under test must match the pandas result.
    assert_eq(sr.nsmallest(n), psr.nsmallest(n))
    # NOTE(review): the two expressions below look like the arguments of a
    # second comparison (keep="last", order-normalised with sort_index) whose
    # opening and closing lines are missing from this excerpt -- confirm
    # against the upstream test.
        sr.nsmallest(n, keep="last").sort_index(),
        psr.nsmallest(n, keep="last").sort_index(),

    # NOTE(review): the keyword arguments below presumably belong to a helper
    # call that checks both libraries reject keep="what" with the same error;
    # the call head and the closing brackets are missing here -- confirm.
        lfunc_args_and_kwargs=([], {
            "n": 3,
            "keep": "what"
        rfunc_args_and_kwargs=([], {
            "n": 3,
            "keep": "what"
        expected_error_message='keep must be either "first", "last"',
Exemplo n.º 5
def test_sum_of_squares(dtype, nelem):
    """Compare ``sum_of_squares`` of a Series and of a DataFrame built from
    it against ``(data ** 2).sum()`` computed on the generated host data.
    """
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)
    df = cudf.DataFrame(sr)

    got = sr.sum_of_squares()
    got_df = df.sum_of_squares()
    expect = (data ** 2).sum()

    if cudf.dtype(dtype).kind in {"u", "i"}:
        # Integer dtypes: only compare when the expected sum is representable
        # in the dtype; otherwise the result has overflowed.
        if 0 <= expect <= np.iinfo(dtype).max:
            np.testing.assert_array_almost_equal(expect, got)
            np.testing.assert_array_almost_equal(expect, got_df.iloc[0])
        else:
            print("overflow, passing")
    else:
        # Non-integer dtypes: compare to a per-dtype number of significant
        # digits taken from accuracy_for_dtype.
        np.testing.assert_approx_equal(
            expect, got, significant=accuracy_for_dtype[dtype]
        )
        np.testing.assert_approx_equal(
            expect, got_df.iloc[0], significant=accuracy_for_dtype[dtype]
        )
Exemplo n.º 6
def test_onehot_masked():
    """One-hot encode a masked integer column and check every indicator."""
    high = 5
    size = 100
    arr = np.random.randint(low=0, high=high, size=size)
    bitmask = utils.random_bitmask(size)
    bytemask = np.asarray(
        utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_
    )
    # Positions whose mask entry is False get -1, which matches none of the
    # categories 0..high-1 compared below.
    arr[~bytemask] = -1

    df = DataFrame()
    df["a"] = Series(arr).set_mask(bitmask)

    out = df.one_hot_encoding(
        "a", cats=list(range(high)), prefix="a", dtype=np.int32
    )

    assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
    # Column a_<k> must be 1 exactly where the host array equals k.
    for cat in range(high):
        np.testing.assert_array_equal(
            (out["a_{}".format(cat)] == 1).to_array(), arr == cat
        )
Exemplo n.º 7
def test_len(data):
    """``list.len()`` via ``dgd.from_cudf`` must match the plain Series."""
    expect = Series(data).list.len()
    partitioned = dgd.from_cudf(Series(data), 5)
    got = partitioned.list.len().compute()
    assert_eq(expect, got)
Exemplo n.º 8
def test_datetime_accessor_initialization(data):
    """Accessing the datetime accessor on this data must raise."""
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_cudf(sr, npartitions=5)
    with pytest.raises(AttributeError):
        # NOTE(review): the body of this block was missing; the accessor
        # named in the test title is assumed -- confirm against upstream.
        dsr.dt
Exemplo n.º 9
def test_create_list_series(data):
    """A Series passed through ``dgd.from_cudf`` computes back to the
    pandas Series built from the same data."""
    partitioned = dgd.from_cudf(Series(data), 4)
    assert_eq(pd.Series(data), partitioned.compute())
from cudf import Series

import cuspatial

# Sample trajectories, each a 2-column array of points.
in_trajs = []
in_trajs.append(np.array([[1, 0], [2, 1], [3, 2], [5, 3], [7, 1]]))
in_trajs.append(np.array([[0, 3], [2, 5], [3, 6], [6, 5]]))
in_trajs.append(np.array([[1, 4], [3, 7], [6, 4]]))
# Stack all points row-wise, then split column 0 / column 1 into x / y.
out_trajs = np.concatenate([np.asarray(traj) for traj in in_trajs], 0)
py_x = np.array(out_trajs[:, 0])
py_y = np.array(out_trajs[:, 1])
py_cnt = []
# NOTE(review): the body of this loop is missing from the excerpt; it
# presumably fills py_cnt with one entry per trajectory -- confirm.
for traj in in_trajs:
pnt_x = Series(py_x)
pnt_y = Series(py_y)
cnt = Series(py_cnt)
distance = cuspatial.directed_hausdorff_distance(pnt_x, pnt_y, cnt)

matrix = distance.as_matrix()

# clustering using AgglomerativeClustering
# NOTE(review): the remaining arguments and closing parenthesis of this call
# are missing from the excerpt.
agg1 = AgglomerativeClustering(n_clusters=2,
label1 = agg1.fit(matrix)
print("AgglomerativeClustering results={}".format(label1.labels_))

# clustering using DBSCAN; as the minimum distance is ~1.4,
# using eps=1.5 will generate the same two clusters as AgglomerativeClustering
Exemplo n.º 11
def test_categorical_accessor_initialization2(data):
    """Accessing the categorical accessor on this data must raise."""
    sr = Series(data.copy())
    dsr = dgd.from_cudf(sr, npartitions=5)
    with pytest.raises(AttributeError):
        # NOTE(review): the body of this block was missing; the accessor
        # named in the test title is assumed -- confirm against upstream.
        dsr.cat
Exemplo n.º 12
def test_leaves(data):
    """``list.leaves`` via ``dgd.from_cudf`` must match the plain Series
    once the computed result's index has been reset."""
    expect = Series(data).list.leaves
    partitioned = dgd.from_cudf(Series(data), 5)
    computed = partitioned.list.leaves.compute()
    assert_eq(expect, computed.reset_index(drop=True))
Exemplo n.º 13
def gen_rand_series(dtype, size, **kwargs):
    """Build a Series from ``gen_rand(dtype, size, **kwargs)``.

    When ``has_nulls`` is passed and truthy, the values are combined with
    ``random_bitmask(size)`` through ``Series.from_masked_array``.
    """
    payload = gen_rand(dtype, size, **kwargs)
    if not kwargs.get("has_nulls", False):
        return Series(payload)
    return Series.from_masked_array(payload, random_bitmask(size))
Exemplo n.º 14
def read_uint(filename):
    """Reads a binary file of uint32s into a `cudf.Series`.

    ``filename`` is forwarded unchanged to ``cpp_read_uint_soa`` and the
    result is wrapped in a ``Series``.
    """
    return Series(cpp_read_uint_soa(filename))
Exemplo n.º 15
def test_countvectorizer_custom_vocabulary():
Exemplo n.º 16
import time

from cudf import Series, read_csv

import cuspatial

# Time the CSV ingest.
start = time.time()
# data downloaded from
# https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2009-01.csv
df = read_csv("data/yellow_tripdata_2009-01.csv")
end = time.time()
print("data ingesting time (from SSD) in ms={}".format((end - start) * 1000))

# Time the extraction of the four coordinate columns.
start = time.time()
x1 = Series(df["Start_Lon"])
y1 = Series(df["Start_Lat"])
x2 = Series(df["End_Lon"])
y2 = Series(df["End_Lat"])
end = time.time()
print(
    "data frame to column conversion time in ms={}".format(
        (end - start) * 1000
    )
)

# Time the distance computation between start and end points.
start = time.time()
h_dist = cuspatial.haversine_distance(x1, y1, x2, y2)
end = time.time()
print("python computing distance time in ms={}".format((end - start) * 1000))
# h_dist.data.to_array()
Exemplo n.º 17
def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
Exemplo n.º 18
def read_its_timestamps(filename):
    """Reads a binary formatted its_timestamp file into a Series of uint64s.

    ``filename`` is forwarded unchanged to ``cpp_read_ts_soa`` and the
    result is wrapped in a ``Series``.
    """
    return Series(cpp_read_ts_soa(filename))
Exemplo n.º 19
    skX = from_df_to_numpy(X)
    X = dask_cudf.from_cudf(X, npartitions=2)

    enc = OneHotEncoder(sparse=False)
    skohe = SkOneHotEncoder(sparse=False)

    ohe = enc.fit_transform(X)
    ref = skohe.fit_transform(skX)

    cp.testing.assert_array_equal(ohe.compute(), ref)

                         [None, 'first', {
                             'g': Series('F'),
                             'i': Series(3)
def test_onehot_inverse_transform(client, drop):
    """Encoding then inverting a partitioned frame must give back the
    original frame, for the given ``drop`` setting."""
    df = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    partitioned = dask_cudf.from_cudf(df, npartitions=2)

    encoder = OneHotEncoder(drop=drop)
    encoded = encoder.fit_transform(partitioned)
    restored = encoder.inverse_transform(encoded).compute()

    assert_frame_equal(restored.to_pandas(), df.to_pandas())

def test_onehot_categories(client):
    X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
Exemplo n.º 20
def test_numpy_non_contiguious():
    """``DataFrame.from_records`` must accept a record array whose field
    is not C-contiguous and preserve the field's values."""
    layout = np.dtype([("index", np.int64), ("a", np.int32)])
    rec = np.recarray(10, dtype=layout)
    rec.index = np.arange(30, 40)

    # Every second element of 0..19.
    aa = np.arange(20, dtype=np.int32)[::2]
    rec.a = aa
    assert rec.a.flags["C_CONTIGUOUS"] is False

    gdf = DataFrame.from_records(rec, index="index")
    assert_eq(aa, gdf["a"].values)

        Series([1, 2, 3, -12, 12, 44]),
        Series([1, 2, 3, -12, 12, 44], dtype="str"),
        Series([1, 2, 3, -12, 12, 44]).index,
            "a": [1, 2, 3, -1234],
            "b": [0.1, 0.2222, 0.4, -3.14]
            "a": [1, 2, 3, -1234],
            "b": [0.1, 0.2222, 0.4, -3.14]
@pytest.mark.parametrize("dtype", [None, "float", "int", "str"])
def test_series_dataframe__array__(data, dtype):
    gs = data
Exemplo n.º 21
def test_contains(data, search_key):
    """``list.contains(search_key)`` via ``dgd.from_cudf`` must match the
    plain Series."""
    expect = Series(data).list.contains(search_key)
    partitioned = dgd.from_cudf(Series(data), 5)
    got = partitioned.list.contains(search_key).compute()
    assert_eq(expect, got)
Exemplo n.º 22
    def remove_categories(self, removals, **kwargs):
        """
        Remove the specified categories.

        `removals` must be included in the
        old categories. Values which were in the
        removed categories will be set to null.

        Parameters
        ----------
        removals : category or list-like of category
            The categories which should be removed.

        inplace : bool, default False
            Whether or not to remove the categories
            inplace or return a copy of this categorical
            with removed categories.

        Returns
        -------
        cat
            Categorical with removed categories or None
            if inplace.

        Raises
        ------
        ValueError
            If any entry of `removals` is not one of the current
            categories.

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category")
        >>> s
        0    10
        1     1
        2     1
        3     2
        4    10
        5     2
        6    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.remove_categories([1])
        0     10
        1   null
        2   null
        3      2
        4     10
        5      2
        6     10
        dtype: category
        Categories (2, int64): [2, 10]
        >>> s
        0    10
        1     1
        2     1
        3     2
        4    10
        5     2
        6    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.remove_categories([10], inplace=True)
        >>> s
        0   null
        1      1
        2      1
        3      2
        4   null
        5      2
        6   null
        dtype: category
        Categories (2, int64): [1, 2]
        """
        from cudf import Series

        cats = self.categories.to_series()
        # Coerce the removals to the categories' dtype so membership tests
        # compare like with like.
        removals = Series(removals, dtype=cats.dtype)
        removals_mask = removals.isin(cats)

        # ensure all the removals are in the current categories
        # list. If not, raise an error to match Pandas behavior
        if not removals_mask.all():
            vals = removals[~removals_mask].to_array()
            msg = "removals must all be in old categories: {}".format(vals)
            raise ValueError(msg)

        new_categories = cats[~cats.isin(removals)]._column
        out_col = self._column
        # Only rebuild the column when the category set actually changes.
        if not self._categories_equal(new_categories, **kwargs):
            out_col = self._set_categories(new_categories, **kwargs)

        return self._return_or_inplace(out_col, **kwargs)
Exemplo n.º 23
def test_series(data):
    """A Series passed through ``dgd.from_cudf`` must compute back to the
    values of the pandas Series it was built from."""
    pdsr = pd.Series(data.copy())
    partitioned = dgd.from_cudf(Series(pdsr), npartitions=5)

    expected = np.array(pdsr)
    np.testing.assert_equal(expected, partitioned.compute().to_array())
Exemplo n.º 24
def test_only_delimiters():
    """A document made only of whitespace must be counted the same way by
    ``CountVectorizer`` and the ``SkCountVect`` reference."""
    docs = ['abc def. 123', '   ', '456 789']
    got = CountVectorizer().fit_transform(Series(docs))
    expected = SkCountVect().fit_transform(docs)
    cp.testing.assert_array_equal(got.todense(), expected.toarray())
Exemplo n.º 25
def test_categorical_accessor_initialization1(data):
    """Accessing the categorical accessor on this data must not raise."""
    sr = Series(data.copy())
    dsr = dgd.from_cudf(sr, npartitions=5)
    # NOTE(review): the original stopped after building dsr and never touched
    # the accessor it is named after; this access is assumed from the sibling
    # test that expects AttributeError -- confirm against upstream.
    dsr.cat
Exemplo n.º 26
    def inverse_transform(self, X):
        # NOTE(review): the prose lines below read like a docstring (summary,
        # parameter X, return value X_tr) whose triple-quote delimiters and
        # section headers were lost in this excerpt.
        Convert the data back to the original representation.
        In case unknown categories are encountered (all zeros in the
        one-hot encoding), ``None`` is used to represent this category.

        The return type is the same as the type of the input used by the first
        call to fit on this estimator instance.
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.
        X_tr : cudf.DataFrame or cupy.ndarray
            Inverse transformed array.
        if cp.sparse.issparse(X):
            # cupy.sparse 7.x does not support argmax. When we upgrade cupy
            # to 8.x, we should add `and not cp.sparse.issparsecsc(X)` to the
            # if clause above and replace the following line with
            # `X = X.tocsc()`
            X = X.toarray()
        result = DataFrame(columns=self._encoders.keys())
        # j is the first encoded column of the feature being decoded.
        j = 0
        for feature in self._encoders.keys():
            feature_enc = self._encoders[feature]
            cats = feature_enc.classes_

            if self.drop is not None:
                # Remove dropped categories
                dropped_class_idx = Series(self.drop_idx_[feature])
                dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
                if len(cats) == 1:
                    # Single category: every row decodes to that category.
                    # NOTE(review): as written, control falls through to the
                    # generic decoding below -- a line may be missing here.
                    inv = Series(GenericIndex(cats[0]).repeat(X.shape[0]))
                    result[feature] = inv
                cats = cats[~dropped_class_mask]

            # Slice this feature's columns and pick the category whose
            # column holds the row maximum.
            enc_size = len(cats)
            x_feature = X[:, j:j + enc_size]
            idx = cp.argmax(x_feature, axis=1)
            inv = Series(cats.iloc[idx]).reset_index(drop=True)

            if self.handle_unknown == 'ignore':
                # Rows with no non-zero entry are unknown categories -> None.
                not_null_idx = x_feature.any(axis=1)
                inv.iloc[~not_null_idx] = None
            elif self.drop is not None:
                # drop will either be None or handle_unknown will be error. If
                # self.drop is not None, then we can safely assume that all of
                # the nulls in each column are the dropped value
                dropped_mask = cp.asarray(x_feature.sum(axis=1) == 0).flatten()
                if dropped_mask.any():
                    # NOTE(review): the argument list of this call is missing
                    # from the excerpt.
                    inv[dropped_mask] = feature_enc.inverse_transform(

            result[feature] = inv
            j += enc_size
        if self.input_type == 'array':
                # NOTE(review): the `try:` matching the `except` below is
                # missing from the excerpt.
                result = cp.asarray(result.as_gpu_matrix())
            except ValueError:
                # NOTE(review): the end of this message and the closing
                # parenthesis are missing from the excerpt.
                warnings.warn("The input one hot encoding contains rows with "
                              "unknown categories. Arrays do not support null "
                              "values. Returning output as a DataFrame "
        return result
Exemplo n.º 27
    return pdf, gdf

    "i1, i2, i3",
        (slice(None, 12), slice(3, None), slice(None, None, 2)),
        (range(12), range(3, 12), range(0, 9, 2)),
        (np.arange(12), np.arange(3, 12), np.arange(0, 9, 2)),
        (list(range(12)), list(range(3, 12)), list(range(0, 9, 2))),
            pd.Series(range(3, 12)),
            pd.Series(range(0, 9, 2)),
        (Series(range(12)), Series(range(3, 12)), Series(range(0, 9, 2))),
            [i in range(12) for i in range(20)],
            [i in range(3, 12) for i in range(12)],
            [i in range(0, 9, 2) for i in range(9)],
            np.array([i in range(12) for i in range(20)], dtype=bool),
            np.array([i in range(3, 12) for i in range(12)], dtype=bool),
            np.array([i in range(0, 9, 2) for i in range(9)], dtype=bool),
    ] + [(
        np.arange(12, dtype=t),
        np.arange(3, 12, dtype=t),
        np.arange(0, 9, 2, dtype=t),
    ) for t in index_dtypes]),
Exemplo n.º 28
    def __init__(self, t, y, ids=None, size=None, prefixes=None):
        """
        Computes various error preconditions on the input data, then
        uses CUDA to compute cubic splines for each set of input
        coordinates on the GPU in parallel.

        Parameters
        ----------
        t : cudf.Series
            time sample values. Must be monotonically increasing.
        y : cudf.Series
            columns to have curves fit to according to x
        ids (Optional) : cudf.Series
            ids of each spline
        size (Optional) : cudf.Series
            fixed size of each spline
        prefixes (Optional) : cudf.Series
            alternative to `size`, allows splines of varying
            length. Not yet fully supported.

        Returns
        -------
        CubicSpline : callable `o`
            ``o.c`` contains the coefficients that can be used to compute new
            points along the spline fitting the original ``t`` data. ``o(n)``
            interpolates the spline coordinates along new input values ``n``.

        Raises
        ------
        ValueError
            If ``t`` has fewer than 5 entries or its length is not a
            multiple of ``size``.
        TypeError
            If an input has the wrong container type or dtype, if ``size``
            is not an int, or if ``t`` and ``y`` differ in length.
        """
        # error protections:
        if len(t) < 5:
            raise ValueError(
                "Use of GPU cubic spline requires splines of length > 4"
            )
        if not isinstance(t, Series):
            raise TypeError(
                "Error: input independent vars must be cudf Series"
            )
        if not isinstance(y, (Series, DataFrame)):
            raise TypeError(
                "Error: input dependent vars must be cudf Series or DataFrame"
            )
        if not len(t) == len(y):
            raise TypeError(
                "Error: dependent and independent vars have different length"
            )
        if ids is None:
            self.ids = Series([0, 0]).astype("int32")
        else:
            if not isinstance(ids, Series):
                raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
            if not ids.dtype == np.int32:
                raise TypeError("Error: int32 only supported at this time.")
            self.ids = ids
        self.size = size if size is not None else len(t)
        if not isinstance(self.size, int):
            raise TypeError("Error: size must be an integer")
        if not ((len(t) % self.size) == 0):
            raise ValueError(
                "Error: length of input is not a multiple of size"
            )
        # t was already verified to be a Series above, so only its dtype
        # still needs checking here.
        if not t.dtype == np.float32:
            raise TypeError("Error: float32 only supported at this time.")
        # A DataFrame passes the earlier container check but is rejected
        # here: only a single Series of dependent values is accepted.
        if not isinstance(y, Series):
            raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
        if not y.dtype == np.float32:
            raise TypeError("Error: float32 only supported at this time.")
        self.t = t
        self.y = y
        if prefixes is None:
            # Evenly spaced offsets 0, size, 2*size, ... covering len(t);
            # cast to int32 to match the dtype required of user prefixes.
            self.prefix = Series(
                cp.arange((len(t) / self.size) + 1) * self.size
            ).astype("int32")
        else:
            if not isinstance(prefixes, Series):
                raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
            if not prefixes.dtype == np.int32:
                raise TypeError("Error: int32 only supported at this time.")
            self.prefix = prefixes

        self.c = self._compute_coefficients()
Exemplo n.º 29
def test_countvectorizer_empty_vocabulary():
    """Fitting on documents made only of stop words must raise ValueError."""
    docs = Series(["to be or not to be", "and me too", "and so do you"])
    vectorizer = CountVectorizer(max_df=1.0, stop_words="english")
    # With every token filtered out as a stop word the vocabulary is empty.
    with pytest.raises(ValueError):
        vectorizer.fit(docs)
Exemplo n.º 30
def test_empty_doc_after_limit_features():
    """With ``min_df=2`` the counts must match the ``SkCountVect``
    reference for the same documents."""
    docs = ['abc abc def', 'def abc', 'ghi']
    got = CountVectorizer(min_df=2).fit_transform(Series(docs))
    expected = SkCountVect(min_df=2).fit_transform(docs)
    cp.testing.assert_array_equal(got.todense(), expected.toarray())