Example #1
def test_series_indexing(i1, i2, i3):
    a1 = np.arange(20)
    series = Series(a1)
    # Indexing
    sr1 = series.iloc[i1]
    assert sr1.null_count == 0
    np.testing.assert_equal(sr1.to_array(), a1[:12])
    sr2 = sr1.iloc[i2]
    assert sr2.null_count == 0
    np.testing.assert_equal(sr2.to_array(), a1[3:12])
    # Index with stride
    sr3 = sr2.iloc[i3]
    assert sr3.null_count == 0
    np.testing.assert_equal(sr3.to_array(), a1[3:12:2])

    # Integer indexing
    if isinstance(i1, range):
        for i in i1:  # Python ints
            assert series[i] == a1[i]
    if isinstance(i1, np.ndarray) and i1.dtype in index_dtypes:
        for i in i1:  # numpy integers
            assert series[i] == a1[i]
Example #2
def test_vectorizer_min_df():
    test_data = Series(['abc', 'dea', 'eat'])
    vect = CountVectorizer(analyzer='char', min_df=1)
    vect.fit(test_data)
    assert 'a' in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(test_data)
    assert 'c' not in vect.vocabulary_.to_arrow().to_pylist()  # {bcdt} ignored
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 2  # {ae} remain
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    # {bcdet} ignored
    assert 'c' not in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 1  # {a} remains
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 5
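
The fractional min_df arithmetic above can be checked by hand. A small
pure-Python sketch of the per-character document frequencies the asserts rely
on (the threshold rule mirrors sklearn's CountVectorizer):

docs = ['abc', 'dea', 'eat']
df = {c: sum(c in doc for doc in docs) for doc in docs for c in doc}
# df == {'a': 3, 'b': 1, 'c': 1, 'd': 1, 'e': 2, 't': 1}
keep_int = {c for c, n in df.items() if n >= 2}         # min_df=2   -> {'a', 'e'}
keep_frac = {c for c, n in df.items() if n >= 0.8 * 3}  # min_df=0.8 -> {'a'}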
Example #3
def test_count_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = Series(['aaabc', 'abbde'])
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    X = cp.asnumpy(vect.fit_transform(test_data).todense())
    assert_array_equal(['a', 'b', 'c', 'd', 'e'],
                       vect.get_feature_names().to_arrow().to_pylist())
    assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    X = cp.asnumpy(vect.fit_transform(test_data).todense())
    assert_array_equal([[1, 1, 1, 0, 0], [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    vect = CountVectorizer(analyzer='char',
                           max_df=1.0,
                           binary=True,
                           dtype=cp.float32)
    X = vect.fit_transform(test_data)
    assert X.dtype == cp.float32
Example #4
def test_series_nsmallest(data, n):
    """Indirectly tests Series.sort_values()"""
    sr = Series(data)
    psr = pd.Series(data)
    assert_eq(sr.nsmallest(n), psr.nsmallest(n))
    assert_eq(
        sr.nsmallest(n, keep="last").sort_index(),
        psr.nsmallest(n, keep="last").sort_index(),
    )

    assert_exceptions_equal(
        lfunc=psr.nsmallest,
        rfunc=sr.nsmallest,
        lfunc_args_and_kwargs=([], {
            "n": 3,
            "keep": "what"
        }),
        rfunc_args_and_kwargs=([], {
            "n": 3,
            "keep": "what"
        }),
        expected_error_message='keep must be either "first", "last"',
    )
Example #5
def test_sum_of_squares(dtype, nelem):
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)
    df = cudf.DataFrame(sr)

    got = sr.sum_of_squares()
    got_df = df.sum_of_squares()
    expect = (data ** 2).sum()

    if cudf.dtype(dtype).kind in {"u", "i"}:
        if 0 <= expect <= np.iinfo(dtype).max:
            np.testing.assert_array_almost_equal(expect, got)
            np.testing.assert_array_almost_equal(expect, got_df.iloc[0])
        else:
            print("overflow, passing")
    else:
        np.testing.assert_approx_equal(
            expect, got, significant=accuracy_for_dtype[dtype]
        )
        np.testing.assert_approx_equal(
            expect, got_df.iloc[0], significant=accuracy_for_dtype[dtype]
        )
Example #6
def test_onehot_masked():
    np.random.seed(0)
    high = 5
    size = 100
    arr = np.random.randint(low=0, high=high, size=size)
    bitmask = utils.random_bitmask(size)
    bytemask = np.asarray(
        utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_
    )
    arr[~bytemask] = -1

    df = DataFrame()
    df["a"] = Series(arr).set_mask(bitmask)

    out = df.one_hot_encoding(
        "a", cats=list(range(high)), prefix="a", dtype=np.int32
    )

    assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
    np.testing.assert_array_equal((out["a_0"] == 1).to_array(), arr == 0)
    np.testing.assert_array_equal((out["a_1"] == 1).to_array(), arr == 1)
    np.testing.assert_array_equal((out["a_2"] == 1).to_array(), arr == 2)
    np.testing.assert_array_equal((out["a_3"] == 1).to_array(), arr == 3)
    np.testing.assert_array_equal((out["a_4"] == 1).to_array(), arr == 4)
Example #7
def test_len(data):
    expect = Series(data).list.len()
    ds = dgd.from_cudf(Series(data), 5)
    assert_eq(expect, ds.list.len().compute())
Example #8
def test_datetime_accessor_initialization(data):
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_cudf(sr, npartitions=5)
    with pytest.raises(AttributeError):
        dsr.dt
Example #9
def test_create_list_series(data):
    expect = pd.Series(data)
    ds_got = dgd.from_cudf(Series(data), 4)
    assert_eq(expect, ds_got.compute())
Example #10
import numpy as np

from cudf import Series
from sklearn.cluster import AgglomerativeClustering

import cuspatial

in_trajs = []
in_trajs.append(np.array([[1, 0], [2, 1], [3, 2], [5, 3], [7, 1]]))
in_trajs.append(np.array([[0, 3], [2, 5], [3, 6], [6, 5]]))
in_trajs.append(np.array([[1, 4], [3, 7], [6, 4]]))
out_trajs = np.concatenate([np.asarray(traj) for traj in in_trajs], 0)
py_x = np.array(out_trajs[:, 0])
py_y = np.array(out_trajs[:, 1])
py_cnt = []
for traj in in_trajs:
    py_cnt.append(len(traj))
pnt_x = Series(py_x)
pnt_y = Series(py_y)
cnt = Series(py_cnt)
distance = cuspatial.directed_hausdorff_distance(pnt_x, pnt_y, cnt)

matrix = distance.as_matrix()

# clustering using AgglomerativeClustering
agg1 = AgglomerativeClustering(n_clusters=2,
                               affinity="precomputed",
                               linkage="average")
label1 = agg1.fit(matrix)
print("AgglomerativeClustering results={}".format(label1.labels_))

# clustering using DBSCAN; as the minimum distance is ~1.4,
# using eps=1.5 will generate the same two clusters as AgglomerativeClustering
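
The DBSCAN call itself is cut off in this listing. A minimal completion under
the eps=1.5 stated above, assuming sklearn.cluster.DBSCAN fit on the same
precomputed distance matrix (min_samples=1 is an illustrative choice, not from
the original):

from sklearn.cluster import DBSCAN

agg2 = DBSCAN(eps=1.5, min_samples=1, metric="precomputed")
label2 = agg2.fit(matrix)
print("DBSCAN results={}".format(label2.labels_))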
Example #11
def test_categorical_accessor_initialization2(data):
    sr = Series(data.copy())
    dsr = dgd.from_cudf(sr, npartitions=5)
    with pytest.raises(AttributeError):
        dsr.cat
Example #12
def test_leaves(data):
    expect = Series(data).list.leaves
    ds = dgd.from_cudf(Series(data), 5)
    got = ds.list.leaves.compute().reset_index(drop=True)
    assert_eq(expect, got)
Example #13
File: utils.py  Project: zivzone/cudf
def gen_rand_series(dtype, size, **kwargs):
    values = gen_rand(dtype, size, **kwargs)
    if kwargs.get("has_nulls", False):
        return Series.from_masked_array(values, random_bitmask(size))

    return Series(values)
Example #14
def read_uint(filename):
    """Reads a binary file of uint32s into a `cudf.Series`
    """
    return Series(cpp_read_uint_soa(filename))
Example #15
    "the coke burger coke copyright",
    "the coke burger burger",
)

NOTJUNK_FOOD_DOCS = (
    "the salad celeri copyright",
    "the salad salad sparkling water copyright",
    "the the celeri celeri copyright",
    "the tomato tomato salad water",
    "the tomato salad water copyright",
)

EMPTY_DOCS = ("",)

DOCS = JUNK_FOOD_DOCS + EMPTY_DOCS + NOTJUNK_FOOD_DOCS + EMPTY_DOCS
DOCS_GPU = Series(DOCS)

NGRAM_RANGES = [(1, 1), (1, 2), (2, 3)]
NGRAM_IDS = [f'ngram_range={str(r)}' for r in NGRAM_RANGES]


@pytest.mark.parametrize('ngram_range', NGRAM_RANGES, ids=NGRAM_IDS)
def test_word_analyzer(ngram_range):
    v = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU)
    ref = SkCountVect(ngram_range=ngram_range).fit(DOCS)
    assert (
        ref.get_feature_names() == v.get_feature_names().to_arrow().to_pylist()
    )


def test_countvectorizer_custom_vocabulary():
Example #16
import time

from cudf import Series, read_csv

import cuspatial

start = time.time()
# data downloaded from
# https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2009-01.csv
df = read_csv("data/yellow_tripdata_2009-01.csv")
end = time.time()
print("data ingesting time (from SSD) in ms={}".format((end - start) * 1000))
df.head().to_pandas().columns

start = time.time()
x1 = Series(df["Start_Lon"])
y1 = Series(df["Start_Lat"])
x2 = Series(df["End_Lon"])
y2 = Series(df["End_Lat"])
end = time.time()
print(
    "data frame to column conversion time in ms={}".format(
        (end - start) * 1000
    )
)

start = time.time()
h_dist = cuspatial.haversine_distance(x1, y1, x2, y2)
end = time.time()
print("python computing distance time in ms={}".format((end - start) * 1000))
# h_dist.data.to_array()
    "the coke burger coke copyright",
    "the coke burger burger",
)

NOTJUNK_FOOD_DOCS = (
    "the salad celeri copyright",
    "the salad salad sparkling water copyright",
    "the the celeri celeri copyright",
    "the tomato tomato salad water",
    "the tomato salad water copyright",
)

EMPTY_DOCS = ("", )

DOCS = JUNK_FOOD_DOCS + EMPTY_DOCS + NOTJUNK_FOOD_DOCS + EMPTY_DOCS
DOCS_GPU = Series(DOCS)

NGRAM_RANGES = [(1, 1), (1, 2), (2, 3)]
NGRAM_IDS = [f'ngram_range={str(r)}' for r in NGRAM_RANGES]


@pytest.mark.parametrize('ngram_range', NGRAM_RANGES, ids=NGRAM_IDS)
def test_word_analyzer(ngram_range):
    v = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU)
    ref = SkCountVect(ngram_range=ngram_range).fit(DOCS)
    assert (ref.get_feature_names() ==
            v.get_feature_names().to_arrow().to_pylist())


def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
Example #18
def read_its_timestamps(filename):
    """Reads a binary formatted its_timestamp file into a Series of uint64s.
    """
    return Series(cpp_read_ts_soa(filename))
Example #19
    skX = from_df_to_numpy(X)
    X = dask_cudf.from_cudf(X, npartitions=2)

    enc = OneHotEncoder(sparse=False)
    skohe = SkOneHotEncoder(sparse=False)

    ohe = enc.fit_transform(X)
    ref = skohe.fit_transform(skX)

    cp.testing.assert_array_equal(ohe.compute(), ref)


@pytest.mark.mg
@pytest.mark.parametrize('drop',
                         [None, 'first', {
                             'g': Series('F'),
                             'i': Series(3)
                         }])
def test_onehot_inverse_transform(client, drop):
    df = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]})
    X = dask_cudf.from_cudf(df, npartitions=2)

    enc = OneHotEncoder(drop=drop)
    ohe = enc.fit_transform(X)
    inv = enc.inverse_transform(ohe)
    assert_frame_equal(inv.compute().to_pandas(), df.to_pandas())


@pytest.mark.mg
def test_onehot_categories(client):
    X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
Example #20
def test_numpy_non_contiguious():
    recdtype = np.dtype([("index", np.int64), ("a", np.int32)])
    rec = np.recarray(10, dtype=recdtype)
    rec.index = np.arange(30, 40)
    rec.a = aa = np.arange(20, dtype=np.int32)[::2]
    assert rec.a.flags["C_CONTIGUOUS"] is False

    gdf = DataFrame.from_records(rec, index="index")
    assert_eq(aa, gdf["a"].values)


@pytest.mark.parametrize(
    "data",
    [
        Series([1, 2, 3, -12, 12, 44]),
        Series([1, 2, 3, -12, 12, 44], dtype="str"),
        Series([1, 2, 3, -12, 12, 44]).index,
        DataFrame({
            "a": [1, 2, 3, -1234],
            "b": [0.1, 0.2222, 0.4, -3.14]
        }),
        DataFrame({
            "a": [1, 2, 3, -1234],
            "b": [0.1, 0.2222, 0.4, -3.14]
        }).index,
    ],
)
@pytest.mark.parametrize("dtype", [None, "float", "int", "str"])
def test_series_dataframe__array__(data, dtype):
    gs = data
Example #21
def test_contains(data, search_key):
    expect = Series(data).list.contains(search_key)
    ds = dgd.from_cudf(Series(data), 5)
    assert_eq(expect, ds.list.contains(search_key).compute())
Example #22
    def remove_categories(self, removals, **kwargs):
        """
        Remove the specified categories.

        `removals` must be included in the
        old categories. Values which were in the
        removed categories will be set to null.

        Parameters
        ----------
        removals : category or list-like of category
            The categories which should be removed.

        inplace : bool, default False
            Whether or not to remove the categories
            inplace or return a copy of this categorical
            with removed categories.

        Returns
        -------
        cat
            Categorical with removed categories or None
            if inplace.

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category")
        >>> s
        0    10
        1     1
        2     1
        3     2
        4    10
        5     2
        6    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.remove_categories([1])
        0     10
        1   null
        2   null
        3      2
        4     10
        5      2
        6     10
        dtype: category
        Categories (2, int64): [2, 10]
        >>> s
        0    10
        1     1
        2     1
        3     2
        4    10
        5     2
        6    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.remove_categories([10], inplace=True)
        >>> s
        0   null
        1      1
        2      1
        3      2
        4   null
        5      2
        6   null
        dtype: category
        Categories (2, int64): [1, 2]
        """
        from cudf import Series

        cats = self.categories.to_series()
        removals = Series(removals, dtype=cats.dtype)
        removals_mask = removals.isin(cats)

        # ensure all the removals are in the current categories
        # list. If not, raise an error to match Pandas behavior
        if not removals_mask.all():
            vals = removals[~removals_mask].to_array()
            msg = "removals must all be in old categories: {}".format(vals)
            raise ValueError(msg)

        new_categories = cats[~cats.isin(removals)]._column
        out_col = self._column
        if not self._categories_equal(new_categories, **kwargs):
            out_col = self._set_categories(new_categories, **kwargs)

        return self._return_or_inplace(out_col, **kwargs)
Example #23
def test_series(data):
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_cudf(sr, npartitions=5)

    np.testing.assert_equal(np.array(pdsr), dsr.compute().to_array())
Example #24
def test_only_delimiters():
    data = ['abc def. 123', '   ', '456 789']
    data_gpu = Series(data)
    res = CountVectorizer().fit_transform(data_gpu)
    ref = SkCountVect().fit_transform(data)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())
Example #25
def test_categorical_accessor_initialization1(data):
    sr = Series(data.copy())
    dsr = dgd.from_cudf(sr, npartitions=5)
    dsr.cat
Example #26
File: encoders.py  Project: teju85/cuml
    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.
        In case unknown categories are encountered (all zeros in the
        one-hot encoding), ``None`` is used to represent this category.

        The return type is the same as the type of the input used by the first
        call to fit on this estimator instance.
        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.
        Returns
        -------
        X_tr : cudf.DataFrame or cupy.ndarray
            Inverse transformed array.
        """
        self._check_is_fitted()
        if cp.sparse.issparse(X):
            # cupy.sparse 7.x does not support argmax; when we upgrade cupy to
            # 8.x, we should add `and not cp.sparse.issparsecsc(X)` to the
            # `if` clause above and replace the following line with
            # `X = X.tocsc()`
            X = X.toarray()
        result = DataFrame(columns=self._encoders.keys())
        j = 0
        for feature in self._encoders.keys():
            feature_enc = self._encoders[feature]
            cats = feature_enc.classes_

            if self.drop is not None:
                # Remove dropped categories
                dropped_class_idx = Series(self.drop_idx_[feature])
                dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
                if len(cats) == 1:
                    inv = Series(GenericIndex(cats[0]).repeat(X.shape[0]))
                    result[feature] = inv
                    continue
                cats = cats[~dropped_class_mask]

            enc_size = len(cats)
            x_feature = X[:, j:j + enc_size]
            idx = cp.argmax(x_feature, axis=1)
            inv = Series(cats.iloc[idx]).reset_index(drop=True)

            if self.handle_unknown == 'ignore':
                not_null_idx = x_feature.any(axis=1)
                inv.iloc[~not_null_idx] = None
            elif self.drop is not None:
                # drop will either be None or handle_unknown will be error. If
                # self.drop is not None, then we can safely assume that all of
                # the nulls in each column are the dropped value
                dropped_mask = cp.asarray(x_feature.sum(axis=1) == 0).flatten()
                if dropped_mask.any():
                    inv[dropped_mask] = feature_enc.inverse_transform(
                        Series(self.drop_idx_[feature]))[0]

            result[feature] = inv
            j += enc_size
        if self.input_type == 'array':
            try:
                result = cp.asarray(result.as_gpu_matrix())
            except ValueError:
                warnings.warn("The input one hot encoding contains rows with "
                              "unknown categories. Arrays do not support null "
                              "values. Returning output as a DataFrame "
                              "instead.")
        return result
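
For orientation, a hedged round-trip sketch of the API above (this assumes the
class is exposed as cuml.preprocessing.OneHotEncoder; column names and values
are illustrative):

import cudf
from cuml.preprocessing import OneHotEncoder

df = cudf.DataFrame({"g": ["M", "F", "F"], "i": [1, 3, 2]})
enc = OneHotEncoder(sparse=False, handle_unknown="ignore")
ohe = enc.fit_transform(df)       # dense one-hot blocks, one per input column
inv = enc.inverse_transform(ohe)  # recovers the original g/i columns

Because fit saw a cudf.DataFrame, inverse_transform returns a cudf.DataFrame,
matching the return-type contract in the docstring.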
Example #27
    return pdf, gdf


@pytest.mark.parametrize(
    "i1, i2, i3",
    ([
        (slice(None, 12), slice(3, None), slice(None, None, 2)),
        (range(12), range(3, 12), range(0, 9, 2)),
        (np.arange(12), np.arange(3, 12), np.arange(0, 9, 2)),
        (list(range(12)), list(range(3, 12)), list(range(0, 9, 2))),
        (
            pd.Series(range(12)),
            pd.Series(range(3, 12)),
            pd.Series(range(0, 9, 2)),
        ),
        (Series(range(12)), Series(range(3, 12)), Series(range(0, 9, 2))),
        (
            [i in range(12) for i in range(20)],
            [i in range(3, 12) for i in range(12)],
            [i in range(0, 9, 2) for i in range(9)],
        ),
        (
            np.array([i in range(12) for i in range(20)], dtype=bool),
            np.array([i in range(3, 12) for i in range(12)], dtype=bool),
            np.array([i in range(0, 9, 2) for i in range(9)], dtype=bool),
        ),
    ] + [(
        np.arange(12, dtype=t),
        np.arange(3, 12, dtype=t),
        np.arange(0, 9, 2, dtype=t),
    ) for t in index_dtypes]),
Example #28
    def __init__(self, t, y, ids=None, size=None, prefixes=None):
        """
        Computes various error preconditions on the input data, then
        uses CUDA to compute cubic splines for each set of input
        coordinates on the GPU in parallel.

        Parameters
        ----------
        t : cudf.Series
            time sample values. Must be monotonically increasing.
        y : cudf.Series
            columns to have curves fit to according to x
        ids (Optional) : cudf.Series
            ids of each spline
        size (Optional) : cudf.Series
            fixed size of each spline
        prefixes (Optional) : cudf.Series
            alternative to `size`, allows splines of varying
            length. Not yet fully supported.

        Returns
        -------
        CubicSpline : callable `o`
            ``o.c`` contains the coefficients that can be used to compute new
            points along the spline fitting the original ``t`` data. ``o(n)``
            interpolates the spline coordinates along new input values ``n``.
        """

        # error protections:
        if len(t) < 5:
            raise ValueError(
                "Use of GPU cubic spline requires splines of length > 4"
            )
        if not isinstance(t, Series):
            raise TypeError(
                "Error: input independent vars must be cudf Series"
            )
        if not isinstance(y, (Series, DataFrame)):
            raise TypeError(
                "Error: input dependent vars must be cudf Series or DataFrame"
            )
        if not len(t) == len(y):
            raise TypeError(
                "Error: dependent and independent vars have different length"
            )
        if ids is None:
            self.ids = Series([0, 0]).astype("int32")
        else:
            if not isinstance(ids, Series):
                raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
            if not ids.dtype == np.int32:
                raise TypeError("Error: int32 only supported at this time.")
            self.ids = ids
        self.size = size if size is not None else len(t)
        if not isinstance(self.size, int):
            raise TypeError("Error: size must be an integer")
        if not ((len(t) % self.size) == 0):
            raise ValueError(
                "Error: length of input is not a multiple of size"
            )
        if not isinstance(t, Series):
            raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
        if not t.dtype == np.float32:
            raise TypeError("Error: float32 only supported at this time.")
        if not isinstance(y, Series):
            raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
        if not y.dtype == np.float32:
            raise TypeError("Error: float32 only supported at this time.")
        self.t = t
        self.y = y
        if prefixes is None:
            self.prefix = Series(
                cp.arange((len(t) / self.size) + 1) * self.size
            ).astype("int32")
        else:
            if not isinstance(prefixes, Series):
                raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
            if not prefixes.dtype == np.int32:
                raise TypeError("Error: int32 only supported at this time.")
            self.prefix = prefixes

        self.c = self._compute_coefficients()
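
A minimal usage sketch for this constructor (values are illustrative: float32
inputs whose length is a multiple of the default size, as the checks above
require; it assumes the class is exported as cuspatial.CubicSpline):

import cuspatial
from cudf import Series

t = Series([0.0, 1.0, 2.0, 3.0, 4.0]).astype("float32")
y = Series([0.0, 1.0, 0.0, -1.0, 0.0]).astype("float32")
spline = cuspatial.CubicSpline(t, y)  # single spline, default ids
new_t = Series([0.5, 1.5, 2.5]).astype("float32")
print(spline(new_t))                  # interpolate at the new sample points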
Example #29
def test_countvectorizer_empty_vocabulary():
    v = CountVectorizer(max_df=1.0, stop_words="english")
    # fitting only on stopwords will result in an empty vocabulary
    with pytest.raises(ValueError):
        v.fit(Series(["to be or not to be", "and me too", "and so do you"]))
Example #30
def test_empty_doc_after_limit_features():
    data = ['abc abc def', 'def abc', 'ghi']
    data_gpu = Series(data)
    count = CountVectorizer(min_df=2).fit_transform(data_gpu)
    ref = SkCountVect(min_df=2).fit_transform(data)
    cp.testing.assert_array_equal(count.todense(), ref.toarray())