def test_dataframe_as_gpu_matrix_null_values():
    df = DataFrame()

    nelem = 123
    na = -10000

    refvalues = {}
    for k in 'abcd':
        df[k] = data = np.random.random(nelem)
        bitmask = utils.random_bitmask(nelem)
        df[k] = df[k].set_mask(bitmask)
        boolmask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:nelem],
                              dtype=np.bool_)
        data[~boolmask] = na
        refvalues[k] = data

    # Check null value causes error
    with pytest.raises(ValueError) as raises:
        df.as_gpu_matrix()
    raises.match("column 'a' has null values")

    for k in df.columns:
        df[k] = df[k].fillna(na)

    mat = df.as_gpu_matrix().copy_to_host()
    for i, k in enumerate(df.columns):
        np.testing.assert_array_equal(refvalues[k], mat[:, i])
def test_dataframe_setitem_from_masked_object():
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    test2 = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert test2['a'].has_null_mask
    assert test2['a'].null_count == 20

    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4['lst'] = lst
    assert test4['lst'].has_null_mask
    assert test4['lst'].null_count == 2
def test_dataframe_column_name_indexing():
    df = DataFrame()
    data = np.asarray(range(10), dtype=np.int32)
    df['a'] = data
    df[1] = data
    np.testing.assert_equal(df['a'].to_array(),
                            np.asarray(range(10), dtype=np.int32))
    np.testing.assert_equal(df[1].to_array(),
                            np.asarray(range(10), dtype=np.int32))

    pdf = pd.DataFrame()
    nelem = 10
    pdf['key1'] = np.random.randint(0, 5, nelem)
    pdf['key2'] = np.random.randint(0, 3, nelem)
    pdf[1] = np.arange(1, 1 + nelem)
    pdf[2] = np.random.random(nelem)
    df = DataFrame.from_pandas(pdf)
    for i in range(1, len(pdf.columns) + 1):
        for idx in combinations(pdf.columns, i):
            assert pdf[list(idx)].equals(df[list(idx)].to_pandas())

    # Test a dataframe with only numeric column names
    df = pd.DataFrame()
    for i in range(0, 10):
        df[i] = range(nelem)
    gdf = DataFrame.from_pandas(df)
    assert_eq(gdf, df)
def agg(self, agg_types):
    df = DataFrame()
    by = []
    if self.level is not None:
        if isinstance(self.source_series.index, MultiIndex):
            # Add the index columns specified by the multiindex into df
            # and record their names for the groupby
            for col in self.source_series.index.codes:
                df[self.group_name + col] = \
                    self.source_series.index.codes[col]
                by.append(self.group_name + col)
    else:
        if isinstance(self.group_keys, Series):
            df[self.group_name] = self.group_keys
            by = self.group_name
        else:
            df = self.group_keys
            by = self._by

    df[self.source_name] = self.source_series
    groupby = df.groupby(by).agg(agg_types)
    idx = groupby.index
    if len(groupby.columns) == 1:
        result = groupby[self.source_name]
        result.name = self.source_series.name
        idx.name = None
        result = result.set_index(idx)
    else:
        idx.name = self.group_name
        result = groupby.set_index(idx)
    if len(result) == 0 and self._by is not None:
        empties = [[] for x in range(len(self._by))]
        mi = MultiIndex(empties, empties, names=self._by)
        result = result.set_index(mi)
    return result
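# A hedged usage sketch of the ``agg`` path above (the names and the public
# ``Series.groupby`` entry point are assumptions based on the rest of this
# module): a single aggregation takes the ``len(groupby.columns) == 1``
# branch and collapses to a Series, while a list of aggregations keeps a
# DataFrame result.
def _example_series_groupby_agg():
    values = Series([1, 2, 3, 4])
    keys = Series([0, 0, 1, 1])
    single = values.groupby(keys).agg('max')           # Series result
    multi = values.groupby(keys).agg(['min', 'max'])   # DataFrame result
    return single, multi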
def __getattr__(self, attr):
    df = DataFrame()
    df[self.source_name] = self.source_series
    by = []
    if self.level is not None:
        if isinstance(self.source_series.index, MultiIndex):
            # Add the index columns specified by the multiindex into df
            # and record their names for the groupby
            for col in self.source_series.index.codes:
                df[self.group_name + col] = \
                    self.source_series.index.codes[col]
                by.append(self.group_name + col)
    else:
        df[self.group_name] = self.group_series
        by = self.group_name

    groupby = df.groupby(by, level=self.level, sort=self.sort)
    result_df = getattr(groupby, attr)()

    def get_result():
        result_series = result_df[self.source_name]
        result_series.name = self.source_name if self.source_name != \
            '_x' else None
        idx = result_df.index
        idx.name = self.group_name if self.group_name != '_y' else None
        # set_index returns a new Series; keep the indexed result
        result_series = result_series.set_index(idx)
        return result_series

    return get_result
def test_assign():
    gdf = DataFrame({'x': [1, 2, 3]})
    gdf2 = gdf.assign(y=gdf.x + 1)
    assert list(gdf.columns) == ['x']
    assert list(gdf2.columns) == ['x', 'y']
    np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4])
def test_dataframe_hash_partition(nrows, nparts, nkeys):
    np.random.seed(123)
    gdf = DataFrame()
    keycols = []
    for i in range(nkeys):
        keyname = 'key{}'.format(i)
        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
        keycols.append(keyname)
    gdf['val1'] = np.random.randint(0, nrows * 2, nrows)

    got = gdf.partition_by_hash(keycols, nparts=nparts)
    # Must return a list
    assert isinstance(got, list)
    # Must have correct number of partitions
    assert len(got) == nparts
    # All partitions must be DataFrame type
    assert all(isinstance(p, DataFrame) for p in got)
    # Check that all partitions have unique keys
    part_unique_keys = set()
    for p in got:
        if len(p):
            # Take the rows of the key columns and build a set of the
            # key-values
            unique_keys = set(map(tuple, p.as_matrix(columns=keycols)))
            # Ensure that none of the key-values have occurred in other
            # partitions
            assert not (unique_keys & part_unique_keys)
            part_unique_keys |= unique_keys
    assert len(part_unique_keys)
def test_dataframe_empty_to_string():
    # Test for printing an empty dataframe
    df = DataFrame()
    got = df.to_string()
    print(got)

    expect = "Empty DataFrame\nColumns: []\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_dataframe_iloc_setitem():
    gdf = DataFrame()
    nelem = 123
    gdf['a'] = np.random.randint(low=0, high=100, size=nelem) \
        .astype(np.int32)
    gdf['b'] = np.random.random(nelem).astype(np.float32)

    gdf.iloc[0] = nelem
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    axis : int
        Concatenation axis: 0 - index (rows), 1 - columns.
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    """
    if sort not in (None, False):
        raise NotImplementedError("sort parameter is not yet supported")

    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    allowed_typs = {Series, DataFrame}

    # when axis is 1 (columns) we can concat Series and DataFrames
    if axis == 1:
        assert typs.issubset(allowed_typs)
        df = DataFrame()
        for idx, o in enumerate(objs):
            if isinstance(o, Series):
                name = o.name
                if o.name is None:
                    # pandas names unnamed Series by their 0-based position
                    name = idx
                df[name] = o
            else:
                for col in o.columns:
                    df[col] = o[col]
        return df

    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." %
                         [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, axis=axis, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs, axis=axis)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
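# A minimal sketch of the axis=1 branch above, assuming the ``concat`` and
# ``DataFrame`` definitions in this module: frames are laid out side by side,
# column by column. Names here are illustrative.
def _example_concat_axis1():
    left = DataFrame()
    left['a'] = [1, 2, 3]
    right = DataFrame()
    right['b'] = [4, 5, 6]
    wide = concat([left, right], axis=1)
    assert list(wide.columns) == ['a', 'b']
    return wide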
def test_dataframe_append_to_empty():
    pdf = pd.DataFrame()
    pdf['a'] = []
    pdf['b'] = [1, 2, 3]

    gdf = DataFrame()
    gdf['a'] = []
    gdf['b'] = [1, 2, 3]

    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_dataframe_emptycolumns_to_string():
    # Test for printing a dataframe that has empty columns
    df = DataFrame()
    df['a'] = []
    df['b'] = []
    got = df.to_string()
    print(got)

    expect = "Empty DataFrame\nColumns: ['a', 'b']\nIndex: []\n"
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
def agg(self, agg_types):
    df = DataFrame()
    df['x'] = self.source_series
    if self.level is not None:
        df['y'] = self.source_series.index
    else:
        df['y'] = self.group_series
    groupby = df.groupby('y').agg(agg_types)
    idx = groupby.index
    idx.name = None
    # set_index returns a new frame; keep the re-indexed result
    groupby = groupby.set_index(idx)
    return groupby
def _apply_basic_agg(self, agg_type, sort_results=False):
    """
    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    """
    result = DataFrame()
    add_col_values = True

    ctx = ffi.new('gdf_context*')
    ctx.flag_sorted = 0
    ctx.flag_method = self._method
    ctx.flag_distinct = 0

    val_columns = self._val_columns
    val_columns_out = self._val_columns

    result = self._apply_agg(agg_type, result, add_col_values,
                             ctx, val_columns, val_columns_out,
                             sort_result=sort_results)

    # If a groupby has one index column and one value column
    # and as_index is set, return a Series instead of a DataFrame
    if isinstance(val_columns, (str, Number)) and self._as_index:
        result_series = result[val_columns]
        idx = index.as_index(result[self._by[0]])
        if self.level == 0:
            idx.name = self._original_index_name
        else:
            idx.name = self._by[0]
        result_series = result_series.set_index(idx)
        return result_series

    # TODO: Do MultiIndex here
    if self._as_index:
        idx = index.as_index(result[self._by[0]])
        idx.name = self._by[0]
        result.drop_column(idx.name)
        if self.level == 0:
            idx.name = self._original_index_name
        else:
            idx.name = self._by[0]
        result = result.set_index(idx)

    nvtx_range_pop()
    return result
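# An illustrative sketch of the ``as_index`` behaviour implemented above
# (the public ``DataFrame.groupby`` signature is an assumption drawn from
# the tests in this repo): with one key and one value column,
# ``as_index=True`` collapses the result to a Series indexed by the key,
# while ``as_index=False`` keeps a two-column DataFrame.
def _example_groupby_as_index():
    df = DataFrame()
    df['key'] = [0, 0, 1]
    df['val'] = [10, 20, 30]
    as_series = df.groupby('key', as_index=True)['val'].max()
    as_frame = df.groupby('key', as_index=False).max()
    return as_series, as_frame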
def test_nonmatching_index_setitem(nrows):
    np.random.seed(0)

    gdf = DataFrame()
    gdf['a'] = np.random.randint(2147483647, size=nrows)
    gdf['b'] = np.random.randint(2147483647, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(2147483647, size=nrows)
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])
    assert gdf['c'].to_pandas().equals(
        Series(test_values).set_index(gdf._index).to_pandas())
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    gdf = DataFrame()
    gdf['a'] = list(range(nelem))
    gdf['b'] = list(range(nelem, 2 * nelem))
    gdf['a'] = gdf['a'].set_mask(utils.random_bitmask(nelem))
    gdf['b'] = gdf['b'].set_mask(utils.random_bitmask(nelem))

    def do_slice(x):
        return x[slice_start:slice_end]

    expect = do_slice(gdf.to_pandas())
    got = do_slice(gdf).to_pandas()

    pd.testing.assert_frame_equal(expect, got)
def __getitem__(self, arg):
    if isinstance(arg, (str, Number)):
        if arg not in self._val_columns:
            raise KeyError("Column not found: " + str(arg))
    else:
        for val in arg:
            if val not in self._val_columns:
                raise KeyError("Column not found: " + str(val))

    result = self.copy(deep=False)
    result._df = DataFrame()
    if isinstance(self._by, (str, Number)):
        result._df[self._by] = self._df[self._by]
    else:
        for by in self._by:
            result._df[by] = self._df[by]

    result._val_columns = arg
    if isinstance(arg, (str, Number)):
        result._df[arg] = self._df[arg]
    else:
        for a in arg:
            result._df[a] = self._df[a]

    if isinstance(result._val_columns, (str, Number)):
        new_by = [result._by] if isinstance(result._by, (str, Number)) \
            else list(result._by)
        new_val_columns = [result._val_columns] if \
            isinstance(result._val_columns, (str, Number)) \
            else list(result._val_columns)
        new_val_series = result._df[new_val_columns[0]]
        new_val_series.name = new_val_columns[0]
        new_by_keys = result._df[new_by]
        new_by_keys.name = new_by[0]
        return SeriesGroupBy(new_val_series, new_by_keys, by=result._by)

    return result
def read_feather(path, *args, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read feather dataset, this may "
                  "be GPU accelerated in the future")
    pa_table = feather.read_table(path, *args, **kwargs)

    return DataFrame.from_arrow(pa_table)
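# A hypothetical round-trip for ``read_feather`` (the path and the use of
# pandas to produce the input file are assumptions; any pyarrow-compatible
# feather file should work).
def _example_read_feather(path='example.feather'):
    import pandas as pd
    pd.DataFrame({'x': [1, 2, 3]}).to_feather(path)
    gdf = read_feather(path)
    assert list(gdf.columns) == ['x']
    return gdf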
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None)
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = cpp_read_orc(filepath_or_buffer, columns, stripe, skip_rows,
                          num_rows, use_index)
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = DataFrame.from_arrow(pa_table)

    return df
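# A usage sketch for the two engine paths above ('example.orc' is a
# hypothetical file): the default 'cudf' engine decodes on the GPU, while
# any other value falls back to pyarrow on the CPU.
def _example_read_orc_engines(path='example.orc'):
    gpu_df = read_orc(path, engine='cudf')
    cpu_df = read_orc(path, engine='pyarrow', columns=['x'])
    return gpu_df, cpu_df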
def concat(objs, ignore_index=False):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    ignore_index : bool
        Set True to ignore the index of the *objs* and provide a
        default range index instead.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.
    """
    if not objs:
        raise ValueError("Need at least one object to concatenate")

    # no-op for single object
    if len(objs) == 1:
        return objs[0]

    typs = set(type(o) for o in objs)
    if len(typs) > 1:
        raise ValueError("`concat` expects all objects to be of the same "
                         "type. Got mix of %r." %
                         [t.__name__ for t in typs])
    typ = list(typs)[0]

    if typ is DataFrame:
        return DataFrame._concat(objs, ignore_index=ignore_index)
    elif typ is Series:
        return Series._concat(objs)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError("Unknown type %r" % typ)
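# A minimal sketch of the row-wise concatenation above (names are
# illustrative): frames with matching columns are stacked, and
# ``ignore_index=True`` replaces the index with a fresh default range.
def _example_concat_rows():
    top = DataFrame()
    top['a'] = [1, 2]
    bottom = DataFrame()
    bottom['a'] = [3, 4]
    stacked = concat([top, bottom], ignore_index=True)
    assert len(stacked) == 4
    return stacked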
def test_dataframe_iloc(nelem):
    gdf = DataFrame()
    gdf['a'] = ha = np.random.randint(low=0, high=100, size=nelem) \
        .astype(np.int32)
    gdf['b'] = hb = np.random.random(nelem).astype(np.float32)

    pdf = pd.DataFrame()
    pdf['a'] = ha
    pdf['b'] = hb

    # Positive tests for slicing using iloc
    def assert_col(g, p):
        np.testing.assert_equal(g['a'].to_array(), p['a'])
        np.testing.assert_equal(g['b'].to_array(), p['b'])

    assert_col(gdf.iloc[-1:1], pdf.iloc[-1:1])
    assert_col(gdf.iloc[nelem - 1:-1], pdf.iloc[nelem - 1:-1])
    assert_col(gdf.iloc[0:nelem - 1], pdf.iloc[0:nelem - 1])
    assert_col(gdf.iloc[0:nelem], pdf.iloc[0:nelem])
    assert_col(gdf.iloc[1:1], pdf.iloc[1:1])
    assert_col(gdf.iloc[1:2], pdf.iloc[1:2])
    assert_col(gdf.iloc[nelem - 1:nelem + 1], pdf.iloc[nelem - 1:nelem + 1])
    assert_col(gdf.iloc[nelem:nelem * 2], pdf.iloc[nelem:nelem * 2])

    # Positive tests for int indexing
    def assert_series(g, p):
        np.testing.assert_equal(g.to_array(), p)

    assert_series(gdf.iloc[-1 * nelem], pdf.iloc[-1 * nelem])
    assert_series(gdf.iloc[-1], pdf.iloc[-1])
    assert_series(gdf.iloc[0], pdf.iloc[0])
    assert_series(gdf.iloc[1], pdf.iloc[1])
    assert_series(gdf.iloc[nelem - 1], pdf.iloc[nelem - 1])
def _apply_basic_agg(self, agg_type):
    """
    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    """
    result = DataFrame()
    add_col_values = True

    ctx = ffi.new('gdf_context*')
    ctx.flag_sorted = 0
    ctx.flag_method = self._method
    ctx.flag_distinct = 0

    val_columns = self._val_columns
    val_columns_out = [agg_type + "_" + column for column in val_columns]

    result = self._apply_agg(agg_type, result, add_col_values,
                             ctx, val_columns, val_columns_out,
                             sort_result=False)

    nvtx_range_pop()
    return result
def test_dataframe_to_string_wide():
    # Test basic
    df = DataFrame()
    for i in range(100):
        df['a{}'.format(i)] = list(range(3))
    got = df.to_string(ncols=8)
    print(got)

    expect = '''
    a0   a1   a2   a3   a4   a5   a6 ...  a99
0    0    0    0    0    0    0    0 ...    0
1    1    1    1    1    1    1    1 ...    1
2    2    2    2    2    2    2    2 ...    2
[92 more columns]
'''
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def test_dataframe_hash_partition_masked_value(nrows):
    gdf = DataFrame()
    gdf['key'] = np.arange(nrows)
    gdf['val'] = np.arange(nrows) + 100
    bitmask = utils.random_bitmask(nrows)
    bytemask = utils.expand_bits_to_bytes(bitmask)
    gdf['val'] = gdf['val'].set_mask(bitmask)
    parted = gdf.partition_by_hash(['key'], nparts=3)
    # Verify that the valid mask is correct
    for p in parted:
        df = p.to_pandas()
        for row in df.itertuples():
            valid = bool(bytemask[row.key])
            expected_value = row.key + 100 if valid else -1
            got_value = row.val
            assert expected_value == got_value
def test_dataframe_copy_shallow():
    # Test for shallow copy of a dataframe using the class method;
    # adding a column to the copy must not affect the original
    df = DataFrame()
    df['a'] = [1, 2, 3]
    df2 = df.copy(deep=False)
    df2['b'] = [4, 2, 3]
    got = df.to_string()
    print(got)

    expect = '''
     a
0    1
1    2
2    3
'''
    # values should match despite whitespace difference
    assert got.split() == expect.split()
def read_parquet(path, *args, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read Parquet dataset, this will "
                  "be GPU accelerated in the future")
    pa_table = pq.read_pandas(path, *args, **kwargs)

    return DataFrame.from_arrow(pa_table)
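# A hypothetical round-trip for ``read_parquet`` (the path and the pandas
# writer are assumptions; any parquet file readable by pyarrow should work).
def _example_read_parquet(path='example.parquet'):
    import pandas as pd
    pd.DataFrame({'x': [1.0, 2.0]}).to_parquet(path)
    return read_parquet(path)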
def read_orc(path, columns=None, **kwargs):
    """{docstring}"""

    warnings.warn("Using CPU via PyArrow to read ORC dataset, this will "
                  "be GPU accelerated in the future")
    orc_file = orc.ORCFile(path)
    pa_table = orc_file.read(columns=columns)

    return DataFrame.from_arrow(pa_table)
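# A usage sketch for the CPU ORC reader above ('example.orc' and the column
# names are hypothetical): ``columns`` is forwarded to pyarrow's ORC reader
# for column projection.
def _example_read_orc(path='example.orc'):
    return read_orc(path, columns=['x', 'y'])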
def test_dataframe_to_string():
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        string = str(df)
        print(string)
        assert string.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        string = df.to_string(ncols=3)
        print(string)
        assert string.splitlines()[-2] == '[1 more rows]'
        assert string.splitlines()[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])

        data = np.arange(6)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid positions are correct
        for i in validids:
            assert data[i] == values[i]
        # null positions are correct
        for i in range(len(values)):
            if i not in validids:
                assert values[i] is None

        got = df.to_string(nrows=None)
        print(got)
        expect = '''
  a  b  c
0 1 11  0
1 2 12
2 3 13  2
3 4 14  3
4 5 15
5 6 16  5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
def test_kernel_shallow_copy():
    pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                       columns=['a', 'b', 'c'])
    gdf = DataFrame.from_pandas(pdf)
    cdf = gdf.copy(deep=False)
    sr = gdf['a']
    add_one[1, len(sr)](sr.to_gpu_array())
    assert_eq(gdf, cdf)