def test_series_indexing(i1, i2, i3):
    """Check chained ``iloc`` indexing and scalar ``[]`` lookup on a Series.

    ``i1``, ``i2`` and ``i3`` are applied one after another; the results are
    compared against the slices ``[:12]``, ``[3:12]`` and ``[3:12:2]`` of the
    host array the Series was built from.
    """
    host = np.arange(20)
    series = Series(host)

    # Apply each indexer to the result of the previous one; the last step
    # is the strided selection.
    steps = (
        (i1, host[:12]),
        (i2, host[3:12]),
        (i3, host[3:12:2]),
    )
    current = series
    for indexer, expected in steps:
        current = current.iloc[indexer]
        assert current.null_count == 0
        np.testing.assert_equal(current.to_array(), expected)

    # Scalar indexing is exercised with Python ints (range) and with numpy
    # integers (ndarray whose dtype is one of index_dtypes).
    has_python_ints = isinstance(i1, range)
    has_numpy_ints = isinstance(i1, np.ndarray) and i1.dtype in index_dtypes
    if has_python_ints or has_numpy_ints:
        for key in i1:
            assert series[key] == host[key]
def test_vectorizer_min_df():
    """Check how ``min_df`` splits characters between vocabulary and stop words.

    The same three documents are fitted with ``min_df`` set to 1, 2 and 0.8,
    and the vocabulary / stop-word contents are checked after each fit.
    """

    def as_pylist(strings):
        # Bring a device string collection back to a plain Python list.
        return strings.to_arrow().to_pylist()

    docs = Series(['abc', 'dea', 'eat'])
    vect = CountVectorizer(analyzer='char', min_df=1)

    vect.fit(docs)
    vocab = as_pylist(vect.vocabulary_)
    assert 'a' in vocab
    assert len(vocab) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(docs)
    vocab = as_pylist(vect.vocabulary_)
    assert 'c' not in vocab  # {bcdt} ignored
    assert len(vocab) == 2  # {ae} remain
    assert 'c' in as_pylist(vect.stop_words_)
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(docs)
    vocab = as_pylist(vect.vocabulary_)
    assert 'c' not in vocab  # {bcdet} ignored
    assert len(vocab) == 1  # {a} remains
    assert 'c' in as_pylist(vect.stop_words_)
    assert len(vect.stop_words_) == 5
def test_count_binary_occurrences():
    """Compare plain counts, ``binary=True`` output and the ``dtype`` option."""
    docs = Series(['aaabc', 'abbde'])

    # Default behaviour: every occurrence of a character is counted.
    counting = CountVectorizer(analyzer='char', max_df=1.0)
    counts = cp.asnumpy(counting.fit_transform(docs).todense())
    feature_names = counting.get_feature_names().to_arrow().to_pylist()
    assert_array_equal(['a', 'b', 'c', 'd', 'e'], feature_names)
    assert_array_equal([[3, 1, 1, 0, 0],
                        [1, 2, 0, 1, 1]], counts)

    # With binary=True only presence/absence is reported.
    presence = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    flags = cp.asnumpy(presence.fit_transform(docs).todense())
    assert_array_equal([[1, 1, 1, 0, 0],
                        [1, 1, 0, 1, 1]], flags)

    # The requested dtype is used for the output matrix.
    typed = CountVectorizer(analyzer='char', max_df=1.0, binary=True,
                            dtype=cp.float32)
    typed_out = typed.fit_transform(docs)
    assert typed_out.dtype == cp.float32
def test_series_nsmallest(data, n):
    """Indirectly tests Series.sort_values().

    Compares ``nsmallest`` between cudf and pandas for the default ``keep``
    and for ``keep="last"``, then checks that an invalid ``keep`` raises the
    same error in both libraries.
    """
    gpu_series = Series(data)
    cpu_series = pd.Series(data)

    assert_eq(gpu_series.nsmallest(n), cpu_series.nsmallest(n))

    gpu_last = gpu_series.nsmallest(n, keep="last").sort_index()
    cpu_last = cpu_series.nsmallest(n, keep="last").sort_index()
    assert_eq(gpu_last, cpu_last)

    bad_kwargs = {"n": 3, "keep": "what"}
    assert_exceptions_equal(
        lfunc=cpu_series.nsmallest,
        rfunc=gpu_series.nsmallest,
        # Each side gets its own argument containers.
        lfunc_args_and_kwargs=([], dict(bad_kwargs)),
        rfunc_args_and_kwargs=([], dict(bad_kwargs)),
        expected_error_message='keep must be either "first", "last"',
    )
def test_sum_of_squares(dtype, nelem):
    """Compare ``sum_of_squares`` on a Series and a DataFrame with numpy.

    Integer dtypes are only compared when the reference value lies within
    ``[0, iinfo(dtype).max]``; otherwise the case is reported as an overflow
    and not checked. Other dtypes are compared to the number of significant
    digits given by ``accuracy_for_dtype``.
    """
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)
    df = cudf.DataFrame(sr)

    got = sr.sum_of_squares()
    got_df = df.sum_of_squares()
    expect = (data ** 2).sum()

    is_integer = cudf.dtype(dtype).kind in {"u", "i"}
    if not is_integer:
        digits = accuracy_for_dtype[dtype]
        approx = np.testing.assert_approx_equal
        approx(expect, got, significant=digits)
        approx(expect, got_df.iloc[0], significant=digits)
        return

    if not (0 <= expect <= np.iinfo(dtype).max):
        print("overflow, passing")
        return

    almost_equal = np.testing.assert_array_almost_equal
    almost_equal(expect, got)
    almost_equal(expect, got_df.iloc[0])
def test_onehot_masked():
    """One-hot encode a masked integer column and check every category.

    Host entries whose mask bit is unset are overwritten with -1, so they
    match none of the categories ``0..high-1``.
    """
    np.random.seed(0)
    high = 5
    size = 100
    arr = np.random.randint(low=0, high=high, size=size)

    bitmask = utils.random_bitmask(size)
    expanded = utils.expand_bits_to_bytes(bitmask)[:size]
    bytemask = np.asarray(expanded, dtype=np.bool_)
    arr[~bytemask] = -1

    df = DataFrame()
    df["a"] = Series(arr).set_mask(bitmask)

    out = df.one_hot_encoding(
        "a", cats=list(range(high)), prefix="a", dtype=np.int32
    )

    assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
    for category in range(high):
        encoded = out[f"a_{category}"] == 1
        np.testing.assert_array_equal(encoded.to_array(), arr == category)
def test_len(data):
    """``list.len()`` on a dask_cudf series matches the cudf result."""
    expected_lengths = Series(data).list.len()
    distributed = dgd.from_cudf(Series(data), 5)
    computed_lengths = distributed.list.len().compute()
    assert_eq(expected_lengths, computed_lengths)
def test_datetime_accessor_initialization(data):
    """Accessing ``.dt`` on this dask_cudf series must raise AttributeError."""
    host_series = pd.Series(data.copy())
    distributed = dgd.from_cudf(Series(host_series), npartitions=5)
    with pytest.raises(AttributeError):
        distributed.dt
def test_create_list_series(data):
    """A list Series round-tripped through dask_cudf equals the pandas one."""
    reference = pd.Series(data)
    distributed = dgd.from_cudf(Series(data), 4)
    materialized = distributed.compute()
    assert_eq(reference, materialized)
from cudf import Series
import cuspatial

# Three sample trajectories, each an (n_points, 2) array of coordinates.
in_trajs = []
in_trajs.append(np.array([[1, 0], [2, 1], [3, 2], [5, 3], [7, 1]]))
in_trajs.append(np.array([[0, 3], [2, 5], [3, 6], [6, 5]]))
in_trajs.append(np.array([[1, 4], [3, 7], [6, 4]]))

# Stack all points into one array; column 0 is used as x and column 1 as y.
out_trajs = np.concatenate([np.asarray(traj) for traj in in_trajs], 0)
py_x = np.array(out_trajs[:, 0])
py_y = np.array(out_trajs[:, 1])

# Number of points in each trajectory, in the same order as in_trajs.
py_cnt = []
for traj in in_trajs:
    py_cnt.append(len(traj))

pnt_x = Series(py_x)
pnt_y = Series(py_y)
cnt = Series(py_cnt)
distance = cuspatial.directed_hausdorff_distance(pnt_x, pnt_y, cnt)
matrix = distance.as_matrix()

# clustering using AgglomerativeClustering
# NOTE(review): AgglomerativeClustering is not imported in the visible part
# of this script; presumably it is scikit-learn's, where
# affinity="precomputed" means `matrix` is treated as a distance matrix.
agg1 = AgglomerativeClustering(n_clusters=2, affinity="precomputed", linkage="average")
label1 = agg1.fit(matrix)
print("AgglomerativeClustering results={}".format(label1.labels_))

# clustering using DBSCAN; as the minimum distance is ~1.4,
# using eps=1.5 will generate the same two clusters as AgglomerativeClustering
def test_categorical_accessor_initialization2(data):
    """Accessing ``.cat`` on this dask_cudf series must raise AttributeError."""
    distributed = dgd.from_cudf(Series(data.copy()), npartitions=5)
    with pytest.raises(AttributeError):
        distributed.cat
def test_leaves(data):
    """``list.leaves`` on a dask_cudf series matches the cudf result.

    The computed result's index is reset before the comparison.
    """
    expected_leaves = Series(data).list.leaves
    distributed = dgd.from_cudf(Series(data), 5)
    computed = distributed.list.leaves.compute()
    actual_leaves = computed.reset_index(drop=True)
    assert_eq(expected_leaves, actual_leaves)
def gen_rand_series(dtype, size, **kwargs):
    """Build a Series of random values, optionally with a random null mask.

    All keyword arguments are forwarded to ``gen_rand``. When ``has_nulls``
    is passed and truthy, the Series is created with
    ``Series.from_masked_array`` using a mask from ``random_bitmask(size)``.
    """
    host_values = gen_rand(dtype, size, **kwargs)
    wants_nulls = kwargs.get("has_nulls", False)
    if not wants_nulls:
        return Series(host_values)
    null_mask = random_bitmask(size)
    return Series.from_masked_array(host_values, null_mask)
def read_uint(filename):
    """Read a binary file of uint32s and return it as a `cudf.Series`.

    The file is read by ``cpp_read_uint_soa``; its result is wrapped in a
    Series unchanged.
    """
    raw_column = cpp_read_uint_soa(filename)
    return Series(raw_column)
"the coke burger coke copyright", "the coke burger burger", ) NOTJUNK_FOOD_DOCS = ( "the salad celeri copyright", "the salad salad sparkling water copyright", "the the celeri celeri copyright", "the tomato tomato salad water", "the tomato salad water copyright", ) EMPTY_DOCS = ("",) DOCS = JUNK_FOOD_DOCS + EMPTY_DOCS + NOTJUNK_FOOD_DOCS + EMPTY_DOCS DOCS_GPU = Series(DOCS) NGRAM_RANGES = [(1, 1), (1, 2), (2, 3)] NGRAM_IDS = [f'ngram_range={str(r)}' for r in NGRAM_RANGES] @pytest.mark.parametrize('ngram_range', NGRAM_RANGES, ids=NGRAM_IDS) def test_word_analyzer(ngram_range): v = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU) ref = SkCountVect(ngram_range=ngram_range).fit(DOCS) assert ( ref.get_feature_names() == v.get_feature_names().to_arrow().to_pylist() ) def test_countvectorizer_custom_vocabulary():
import time

from cudf import Series, read_csv
import cuspatial

# --- Step 1: load the trip data and time the ingest ---
start = time.time()
# data downloaded from
# https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2009-01.csv
df = read_csv("data/yellow_tripdata_2009-01.csv")
end = time.time()
print("data ingesting time (from SSD) in ms={}".format((end - start) * 1000))

# NOTE(review): the value of this expression is discarded; it looks like
# leftover interactive inspection of the column names.
df.head().to_pandas().columns

# --- Step 2: pull the start/end coordinate columns out as Series ---
start = time.time()
x1 = Series(df["Start_Lon"])
y1 = Series(df["Start_Lat"])
x2 = Series(df["End_Lon"])
y2 = Series(df["End_Lat"])
end = time.time()
print(
    "data frame to column conversion time in ms={}".format(
        (end - start) * 1000
    )
)

# --- Step 3: compute the haversine distance between start and end points ---
start = time.time()
h_dist = cuspatial.haversine_distance(x1, y1, x2, y2)
end = time.time()
print("python computing distance time in ms={}".format((end - start) * 1000))
# h_dist.data.to_array()
"the coke burger coke copyright", "the coke burger burger", ) NOTJUNK_FOOD_DOCS = ( "the salad celeri copyright", "the salad salad sparkling water copyright", "the the celeri celeri copyright", "the tomato tomato salad water", "the tomato salad water copyright", ) EMPTY_DOCS = ("", ) DOCS = JUNK_FOOD_DOCS + EMPTY_DOCS + NOTJUNK_FOOD_DOCS + EMPTY_DOCS DOCS_GPU = Series(DOCS) NGRAM_RANGES = [(1, 1), (1, 2), (2, 3)] NGRAM_IDS = [f'ngram_range={str(r)}' for r in NGRAM_RANGES] @pytest.mark.parametrize('ngram_range', NGRAM_RANGES, ids=NGRAM_IDS) def test_word_analyzer(ngram_range): v = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU) ref = SkCountVect(ngram_range=ngram_range).fit(DOCS) assert (ref.get_feature_names() == v.get_feature_names().to_arrow().to_pylist()) def test_countvectorizer_custom_vocabulary(): vocab = {"pizza": 0, "beer": 1}
def read_its_timestamps(filename):
    """Read a binary its_timestamp file and return a Series of uint64s.

    The file is read by ``cpp_read_ts_soa``; its result is wrapped in a
    Series unchanged.
    """
    raw_timestamps = cpp_read_ts_soa(filename)
    return Series(raw_timestamps)
skX = from_df_to_numpy(X) X = dask_cudf.from_cudf(X, npartitions=2) enc = OneHotEncoder(sparse=False) skohe = SkOneHotEncoder(sparse=False) ohe = enc.fit_transform(X) ref = skohe.fit_transform(skX) cp.testing.assert_array_equal(ohe.compute(), ref) @pytest.mark.mg @pytest.mark.parametrize('drop', [None, 'first', { 'g': Series('F'), 'i': Series(3) }]) def test_onehot_inverse_transform(client, drop): df = DataFrame({'g': ['M', 'F', 'F'], 'i': [1, 3, 2]}) X = dask_cudf.from_cudf(df, npartitions=2) enc = OneHotEncoder(drop=drop) ohe = enc.fit_transform(X) inv = enc.inverse_transform(ohe) assert_frame_equal(inv.compute().to_pandas(), df.to_pandas()) @pytest.mark.mg def test_onehot_categories(client): X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
def test_numpy_non_contiguious(): recdtype = np.dtype([("index", np.int64), ("a", np.int32)]) rec = np.recarray(10, dtype=recdtype) rec.index = np.arange(30, 40) rec.a = aa = np.arange(20, dtype=np.int32)[::2] assert rec.a.flags["C_CONTIGUOUS"] is False gdf = DataFrame.from_records(rec, index="index") assert_eq(aa, gdf["a"].values) @pytest.mark.parametrize( "data", [ Series([1, 2, 3, -12, 12, 44]), Series([1, 2, 3, -12, 12, 44], dtype="str"), Series([1, 2, 3, -12, 12, 44]).index, DataFrame({ "a": [1, 2, 3, -1234], "b": [0.1, 0.2222, 0.4, -3.14] }), DataFrame({ "a": [1, 2, 3, -1234], "b": [0.1, 0.2222, 0.4, -3.14] }).index, ], ) @pytest.mark.parametrize("dtype", [None, "float", "int", "str"]) def test_series_dataframe__array__(data, dtype): gs = data
def test_contains(data, search_key):
    """``list.contains`` on a dask_cudf series matches the cudf result."""
    expected_hits = Series(data).list.contains(search_key)
    distributed = dgd.from_cudf(Series(data), 5)
    computed_hits = distributed.list.contains(search_key).compute()
    assert_eq(expected_hits, computed_hits)
def remove_categories(self, removals, **kwargs):
    """
    Remove the specified categories.

    Every entry of `removals` must be one of the current categories.
    Values that belonged to a removed category become null.

    Parameters
    ----------
    removals : category or list-like of category
        The categories which should be removed.
    inplace : bool, default False
        Whether or not to remove the categories inplace or return a copy
        of this categorical with removed categories. Passed through
        ``**kwargs``.

    Returns
    -------
    cat
        Categorical with removed categories or None if inplace.

    Raises
    ------
    ValueError
        If any entry of `removals` is not among the current categories.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category")
    >>> s
    0    10
    1     1
    2     1
    3     2
    4    10
    5     2
    6    10
    dtype: category
    Categories (3, int64): [1, 2, 10]
    >>> s.cat.remove_categories([1])
    0      10
    1    null
    2    null
    3       2
    4      10
    5       2
    6      10
    dtype: category
    Categories (2, int64): [2, 10]
    >>> s
    0    10
    1     1
    2     1
    3     2
    4    10
    5     2
    6    10
    dtype: category
    Categories (3, int64): [1, 2, 10]
    >>> s.cat.remove_categories([10], inplace=True)
    >>> s
    0    null
    1       1
    2       1
    3       2
    4    null
    5       2
    6    null
    dtype: category
    Categories (2, int64): [1, 2]
    """
    from cudf import Series

    current = self.categories.to_series()
    to_remove = Series(removals, dtype=current.dtype)

    # Every requested removal has to be an existing category; otherwise
    # raise, matching Pandas behavior.
    is_known = to_remove.isin(current)
    if not is_known.all():
        unknown = to_remove[~is_known].to_array()
        raise ValueError(
            "removals must all be in old categories: {}".format(unknown)
        )

    kept = current[~current.isin(to_remove)]._column

    # Only rebuild the column when the category set actually changes.
    result_col = self._column
    if not self._categories_equal(kept, **kwargs):
        result_col = self._set_categories(kept, **kwargs)

    return self._return_or_inplace(result_col, **kwargs)
def test_series(data):
    """Values survive pandas -> cudf -> dask_cudf -> compute unchanged."""
    host_series = pd.Series(data.copy())
    distributed = dgd.from_cudf(Series(host_series), npartitions=5)
    expected = np.array(host_series)
    actual = distributed.compute().to_array()
    np.testing.assert_equal(expected, actual)
def test_only_delimiters():
    """A document made only of whitespace gives the same counts as sklearn."""
    docs = ['abc def. 123', ' ', '456 789']
    gpu_counts = CountVectorizer().fit_transform(Series(docs))
    cpu_counts = SkCountVect().fit_transform(docs)
    cp.testing.assert_array_equal(gpu_counts.todense(), cpu_counts.toarray())
def test_categorical_accessor_initialization1(data):
    """Accessing ``.cat`` on this dask_cudf series must not raise."""
    distributed = dgd.from_cudf(Series(data.copy()), npartitions=5)
    # The attribute access itself is the check; its value is not used.
    distributed.cat
def inverse_transform(self, X):
    """
    Convert the data back to the original representation.

    In case unknown categories are encountered (all zeros in the one-hot
    encoding), ``None`` is used to represent this category.

    The return type is the same as the type of the input used by the first
    call to fit on this estimator instance.

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
        The transformed data.

    Returns
    -------
    X_tr : cudf.DataFrame or cupy.ndarray
        Inverse transformed array. A DataFrame is also returned (with a
        warning) when ``input_type`` is ``'array'`` but the conversion to an
        array raises ``ValueError``.
    """
    self._check_is_fitted()
    if cp.sparse.issparse(X):
        # cupy.sparse 7.x does not support argmax, when we upgrade cupy to
        # 8.x, we should add a condition in the
        # if clause: `and not cp.sparse.issparsecsc(X)`
        # and change the following line by `X = X.tocsc()`
        X = X.toarray()
    result = DataFrame(columns=self._encoders.keys())
    # j is the first column of X that belongs to the current feature; it is
    # advanced by the number of encoded columns after each feature.
    j = 0
    for feature in self._encoders.keys():
        feature_enc = self._encoders[feature]
        cats = feature_enc.classes_

        if self.drop is not None:
            # Remove dropped categories
            dropped_class_idx = Series(self.drop_idx_[feature])
            dropped_class_mask = Series(cats).isin(cats[dropped_class_idx])
            if len(cats) == 1:
                # Only one category exists for this feature: every row gets
                # that category and no columns of X are consumed (j is not
                # advanced on this path).
                inv = Series(GenericIndex(cats[0]).repeat(X.shape[0]))
                result[feature] = inv
                continue
            cats = cats[~dropped_class_mask]

        enc_size = len(cats)
        x_feature = X[:, j:j + enc_size]
        # Position of the largest entry in each row selects the category.
        idx = cp.argmax(x_feature, axis=1)
        inv = Series(cats.iloc[idx]).reset_index(drop=True)

        if self.handle_unknown == 'ignore':
            # Rows with no non-zero entry are unknown categories -> null.
            not_null_idx = x_feature.any(axis=1)
            inv.iloc[~not_null_idx] = None
        elif self.drop is not None:
            # drop will either be None or handle_unknown will be error. If
            # self.drop is not None, then we can safely assume that all of
            # the nulls in each column are the dropped value
            dropped_mask = cp.asarray(x_feature.sum(axis=1) == 0).flatten()
            if dropped_mask.any():
                inv[dropped_mask] = feature_enc.inverse_transform(
                    Series(self.drop_idx_[feature]))[0]

        result[feature] = inv
        j += enc_size
    if self.input_type == 'array':
        try:
            result = cp.asarray(result.as_gpu_matrix())
        except ValueError:
            # Conversion failed; keep the DataFrame result and warn instead
            # of raising.
            warnings.warn("The input one hot encoding contains rows with "
                          "unknown categories. Arrays do not support null "
                          "values. Returning output as a DataFrame "
                          "instead.")
    return result
return pdf, gdf @pytest.mark.parametrize( "i1, i2, i3", ([ (slice(None, 12), slice(3, None), slice(None, None, 2)), (range(12), range(3, 12), range(0, 9, 2)), (np.arange(12), np.arange(3, 12), np.arange(0, 9, 2)), (list(range(12)), list(range(3, 12)), list(range(0, 9, 2))), ( pd.Series(range(12)), pd.Series(range(3, 12)), pd.Series(range(0, 9, 2)), ), (Series(range(12)), Series(range(3, 12)), Series(range(0, 9, 2))), ( [i in range(12) for i in range(20)], [i in range(3, 12) for i in range(12)], [i in range(0, 9, 2) for i in range(9)], ), ( np.array([i in range(12) for i in range(20)], dtype=bool), np.array([i in range(3, 12) for i in range(12)], dtype=bool), np.array([i in range(0, 9, 2) for i in range(9)], dtype=bool), ), ] + [( np.arange(12, dtype=t), np.arange(3, 12, dtype=t), np.arange(0, 9, 2, dtype=t), ) for t in index_dtypes]),
def __init__(self, t, y, ids=None, size=None, prefixes=None):
    """
    Validate the inputs, store them, and compute the spline coefficients.

    Parameters
    ----------
    t : cudf.Series
        Time sample values (float32, at least 5 entries). Expected to be
        monotonically increasing; this is not validated here.
    y : cudf.Series
        Values to fit curves to (float32, same length as ``t``). A
        DataFrame passes the first type check but is rejected by the later
        Series-only check.
    ids (Optional) : cudf.Series
        int32 ids of each spline. Defaults to ``Series([0, 0])`` cast to
        int32.
    size (Optional) : int
        Fixed size of each spline; ``len(t)`` must be a multiple of it.
        Defaults to ``len(t)``.
    prefixes (Optional) : cudf.Series
        int32 alternative to `size`, allows splines of varying length.
        Not yet fully supported. Defaults to
        ``arange(len(t) / size + 1) * size`` cast to int32.

    Raises
    ------
    ValueError
        If ``t`` has fewer than 5 entries, or ``len(t)`` is not a multiple
        of ``size``.
    TypeError
        If an argument has the wrong type or dtype, or ``t`` and ``y``
        differ in length.

    Notes
    -----
    After construction ``self.c`` holds the result of
    ``self._compute_coefficients()``; per the class's original
    documentation these are the coefficients used to evaluate new points
    along the spline.
    """

    def _require_series(candidate, dtype, dtype_message):
        # Shared validation: must be a cudf Series of the expected dtype.
        if not isinstance(candidate, Series):
            raise TypeError("cuspatial.CubicSpline requires a cudf.Series")
        if not candidate.dtype == dtype:
            raise TypeError(dtype_message)
        return candidate

    int32_only = "Error: int32 only supported at this time."
    float32_only = "Error: float32 only supported at this time."

    # error protections:
    n_samples = len(t)
    if n_samples < 5:
        raise ValueError(
            "Use of GPU cubic spline requires splines of length > 4"
        )
    if not isinstance(t, Series):
        raise TypeError(
            "Error: input independent vars must be cudf Series"
        )
    if not isinstance(y, (Series, DataFrame)):
        raise TypeError(
            "Error: input dependent vars must be cudf Series or DataFrame"
        )
    if n_samples != len(y):
        raise TypeError(
            "Error: dependent and independent vars have different length"
        )

    if ids is not None:
        self.ids = _require_series(ids, np.int32, int32_only)
    else:
        self.ids = Series([0, 0]).astype("int32")

    self.size = n_samples if size is None else size
    if not isinstance(self.size, int):
        raise TypeError("Error: size must be an integer")
    if n_samples % self.size != 0:
        raise ValueError(
            "Error: length of input is not a multiple of size"
        )

    # Validate both inputs before storing either of them.
    _require_series(t, np.float32, float32_only)
    _require_series(y, np.float32, float32_only)
    self.t = t
    self.y = y

    if prefixes is not None:
        self.prefix = _require_series(prefixes, np.int32, int32_only)
    else:
        # Evenly spaced offsets 0, size, 2 * size, ..., len(t).
        offsets = cp.arange((n_samples / self.size) + 1) * self.size
        self.prefix = Series(offsets).astype("int32")

    self.c = self._compute_coefficients()
def test_countvectorizer_empty_vocabulary():
    """Fitting on documents made only of stop words raises ValueError."""
    vectorizer = CountVectorizer(max_df=1.0, stop_words="english")
    stopword_only_docs = ["to be or not to be", "and me too", "and so do you"]

    # fitting only on stopwords will result in an empty vocabulary
    with pytest.raises(ValueError):
        vectorizer.fit(Series(stopword_only_docs))
def test_empty_doc_after_limit_features():
    """With ``min_df=2`` the counts match scikit-learn's for the same docs."""
    docs = ['abc abc def', 'def abc', 'ghi']
    gpu_counts = CountVectorizer(min_df=2).fit_transform(Series(docs))
    cpu_counts = SkCountVect(min_df=2).fit_transform(docs)
    cp.testing.assert_array_equal(gpu_counts.todense(), cpu_counts.toarray())