def test_constructor_ndarray(self): # no index or columns sp = SparseDataFrame(self.frame.values) # 1d sp = SparseDataFrame(self.data['A'], index=self.dates, columns=['A']) tm.assert_sp_frame_equal(sp, self.frame.reindex(columns=['A'])) # raise on level argument pytest.raises(TypeError, self.frame.reindex, columns=['A'], level=1) # wrong length index / columns with tm.assert_raises_regex(ValueError, "^Index length"): SparseDataFrame(self.frame.values, index=self.frame.index[:-1]) with tm.assert_raises_regex(ValueError, "^Column length"): SparseDataFrame(self.frame.values, columns=self.frame.columns[:-1])
def test_pickle(self): def _test_roundtrip(frame, orig): result = tm.round_trip_pickle(frame) tm.assert_sp_frame_equal(frame, result) tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False) _test_roundtrip(SparseDataFrame(), DataFrame()) self._check_all(_test_roundtrip)
def test_fill_value_when_combine_const(self): # GH12723 dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') df = SparseDataFrame({'foo': dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) tm.assert_sp_frame_equal(res, exp)
def test_constructor_from_unknown_type(self): # GH 19393 class Unknown(object): pass with pytest.raises(TypeError, message='SparseDataFrame called with unknown type ' '"Unknown" for data argument'): SparseDataFrame(Unknown())
def test_constructor(self, float_frame, float_frame_int_kind, float_frame_fill0): for col, series in compat.iteritems(float_frame): assert isinstance(series, SparseSeries) assert isinstance(float_frame_int_kind['A'].sp_index, IntIndex) # constructed zframe from matrix above assert float_frame_fill0['A'].fill_value == 0 tm.assert_numpy_array_equal(pd.SparseArray([1., 2., 3., 4., 5., 6.]), float_frame_fill0['A'].values) tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2., 3., 4., 5., 6.]), float_frame_fill0['A'].to_dense().values) # construct no data sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) for col, series in compat.iteritems(sdf): assert isinstance(series, SparseSeries) # construct from nested dict data = {} for c, s in compat.iteritems(float_frame): data[c] = s.to_dict() sdf = SparseDataFrame(data) tm.assert_sp_frame_equal(sdf, float_frame) # TODO: test data is copied from inputs # init dict with different index idx = float_frame.index[:5] cons = SparseDataFrame( float_frame, index=idx, columns=float_frame.columns, default_fill_value=float_frame.default_fill_value, default_kind=float_frame.default_kind, copy=True) reindexed = float_frame.reindex(idx) tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) # assert level parameter breaks reindex with pytest.raises(TypeError): float_frame.reindex(idx, level=0) repr(float_frame)
def setup_method(self, method): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10, dtype=float), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) self.frame = SparseDataFrame(self.data, index=self.dates)
def test_getitem(self): # 1585 select multiple columns sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c']) result = sdf[['a', 'b']] exp = sdf.reindex(columns=['a', 'b']) tm.assert_sp_frame_equal(result, exp) pytest.raises(Exception, sdf.__getitem__, ['a', 'd'])
def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0)
def setup_method(self, method): self.data = { 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10, dtype=np.float64), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan] } self.dates = bdate_range('1/1/2011', periods=10) self.orig = pd.DataFrame(self.data, index=self.dates) self.iorig = pd.DataFrame(self.data, index=self.dates) self.frame = SparseDataFrame(self.data, index=self.dates) self.iframe = SparseDataFrame(self.data, index=self.dates, default_kind='integer') self.mixed_frame = self.frame.copy(False) self.mixed_frame['foo'] = pd.SparseArray(['bar'] * len(self.dates)) values = self.frame.values.copy() values[np.isnan(values)] = 0 self.zorig = pd.DataFrame(values, columns=['A', 'B', 'C', 'D'], index=self.dates) self.zframe = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], default_fill_value=0, index=self.dates) values = self.frame.values.copy() values[np.isnan(values)] = 2 self.fill_orig = pd.DataFrame(values, columns=['A', 'B', 'C', 'D'], index=self.dates) self.fill_frame = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], default_fill_value=2, index=self.dates) self.empty = SparseDataFrame()
def test_as_blocks(self): df = SparseDataFrame({ 'A': [1.1, 3.3], 'B': [nan, -3.9] }, dtype='float64') df_blocks = df.blocks assert list(df_blocks.keys()) == ['float64'] tm.assert_frame_equal(df_blocks['float64'], df)
def test_density(self): df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) self.assertEqual(df.density, 0.7) df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) self.assertEqual(df.density, 0.75)
def test_as_blocks(self): df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]}, dtype='float64') # deprecated 0.21.0 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df_blocks = df.blocks assert list(df_blocks.keys()) == ['float64'] tm.assert_frame_equal(df_blocks['float64'], df)
def test_density(self): df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) assert df.density == 0.7 df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) assert df.density == 0.75
def test_cumsum(self, float_frame): expected = SparseDataFrame(float_frame.to_dense().cumsum()) result = float_frame.cumsum() tm.assert_sp_frame_equal(result, expected) result = float_frame.cumsum(axis=None) tm.assert_sp_frame_equal(result, expected) result = float_frame.cumsum(axis=0) tm.assert_sp_frame_equal(result, expected)
def test_iloc(self): # 2227 result = self.frame.iloc[:, 0] assert isinstance(result, SparseSeries) tm.assert_sp_series_equal(result, self.frame['A']) # preserve sparse index type. #2251 data = {'A': [0, 1]} iframe = SparseDataFrame(data, default_kind='integer') tm.assert_class_equal(iframe['A'].sp_index, iframe.iloc[:, 0].sp_index)
def setUp(self): self.data = { 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10, dtype=np.float64), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan] } self.dates = bdate_range('1/1/2011', periods=10) self.orig = pd.DataFrame(self.data, index=self.dates) self.iorig = pd.DataFrame(self.data, index=self.dates) self.frame = SparseDataFrame(self.data, index=self.dates) self.iframe = SparseDataFrame(self.data, index=self.dates, default_kind='integer') values = self.frame.values.copy() values[np.isnan(values)] = 0 self.zorig = pd.DataFrame(values, columns=['A', 'B', 'C', 'D'], index=self.dates) self.zframe = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], default_fill_value=0, index=self.dates) values = self.frame.values.copy() values[np.isnan(values)] = 2 self.fill_orig = pd.DataFrame(values, columns=['A', 'B', 'C', 'D'], index=self.dates) self.fill_frame = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], default_fill_value=2, index=self.dates) self.empty = SparseDataFrame()
def test_numpy_cumsum(self, float_frame): result = np.cumsum(float_frame) expected = SparseDataFrame(float_frame.to_dense().cumsum()) tm.assert_sp_frame_equal(result, expected) msg = "the 'dtype' parameter is not supported" tm.assert_raises_regex(ValueError, msg, np.cumsum, float_frame, dtype=np.int64) msg = "the 'out' parameter is not supported" tm.assert_raises_regex(ValueError, msg, np.cumsum, float_frame, out=result)
def test_pickle(self, float_frame, float_frame_int_kind, float_frame_dense, float_frame_fill0, float_frame_fill0_dense, float_frame_fill2, float_frame_fill2_dense): def _test_roundtrip(frame, orig): result = tm.round_trip_pickle(frame) tm.assert_sp_frame_equal(frame, result) tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False) _test_roundtrip(SparseDataFrame(), DataFrame()) _test_roundtrip(float_frame, float_frame_dense) _test_roundtrip(float_frame_int_kind, float_frame_dense) _test_roundtrip(float_frame_fill0, float_frame_fill0_dense) _test_roundtrip(float_frame_fill2, float_frame_fill2_dense)
def test_quantile(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] q = 0.1 sparse_df = SparseDataFrame(data) result = sparse_df.quantile(q) dense_df = DataFrame(data) dense_expected = dense_df.quantile(q) sparse_expected = SparseSeries(dense_expected) tm.assert_series_equal(result, dense_expected) tm.assert_sp_series_equal(result, sparse_expected)
def test_constructor_from_series(self): # GH 2873 x = Series(np.random.randn(10000), name='a') x = x.to_sparse(fill_value=0) assert isinstance(x, SparseSeries) df = SparseDataFrame(x) assert isinstance(df, SparseDataFrame) x = Series(np.random.randn(10000), name='a') y = Series(np.random.randn(10000), name='b') x2 = x.astype(float) x2.loc[:9998] = np.NaN # TODO: x_sparse is unused...fix x_sparse = x2.to_sparse(fill_value=np.NaN) # noqa # Currently fails too with weird ufunc error # df1 = SparseDataFrame([x_sparse, y]) y.loc[:9998] = 0 # TODO: y_sparse is unsused...fix y_sparse = y.to_sparse(fill_value=0) # noqa
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_strs = [ u'{prefix}{sep}{level}' if isinstance(v, text_type) else '{prefix}{sep}{level}' for v in levels ] dummy_cols = [ dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) for dummy_str, v in zip(dummy_strs, levels) ] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=np.uint8) return out else: dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def test_reindex_method(self): sparse = SparseDataFrame(data=[[11., 12., 14.], [21., 22., 24.], [41., 42., 44.]], index=[1, 2, 4], columns=[1, 2, 4], dtype=float) # Over indices # default method result = sparse.reindex(index=range(6)) expected = SparseDataFrame(data=[[nan, nan, nan], [11., 12., 14.], [21., 22., 24.], [nan, nan, nan], [41., 42., 44.], [nan, nan, nan]], index=range(6), columns=[1, 2, 4], dtype=float) tm.assert_sp_frame_equal(result, expected) # method='bfill' result = sparse.reindex(index=range(6), method='bfill') expected = SparseDataFrame(data=[[11., 12., 14.], [11., 12., 14.], [21., 22., 24.], [41., 42., 44.], [41., 42., 44.], [nan, nan, nan]], index=range(6), columns=[1, 2, 4], dtype=float) tm.assert_sp_frame_equal(result, expected) # method='ffill' result = sparse.reindex(index=range(6), method='ffill') expected = SparseDataFrame(data=[[nan, nan, nan], [11., 12., 14.], [21., 22., 24.], [21., 22., 24.], [41., 42., 44.], [41., 42., 44.]], index=range(6), columns=[1, 2, 4], dtype=float) tm.assert_sp_frame_equal(result, expected) # Over columns # default method result = sparse.reindex(columns=range(6)) expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan], [nan, 21., 22., nan, 24., nan], [nan, 41., 42., nan, 44., nan]], index=[1, 2, 4], columns=range(6), dtype=float) tm.assert_sp_frame_equal(result, expected) # method='bfill' with pytest.raises(NotImplementedError): sparse.reindex(columns=range(6), method='bfill') # method='ffill' with pytest.raises(NotImplementedError): sparse.reindex(columns=range(6), method='ffill')
def test_constructor_empty(self): sp = SparseDataFrame() self.assertEqual(len(sp.index), 0) self.assertEqual(len(sp.columns), 0)
def test_constructor_convert_index_once(self): arr = np.array([1.5, 2.5, 3.5]) sdf = SparseDataFrame(columns=lrange(4), index=arr) assert sdf[0].index is sdf[1].index
def test_constructor_dataframe(self, float_frame): dense = float_frame.to_dense() sp = SparseDataFrame(dense) tm.assert_sp_frame_equal(sp, float_frame)
def test_constructor_empty(self): sp = SparseDataFrame() assert len(sp.index) == 0 assert len(sp.columns) == 0
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = '{prefix}{prefix_sep}{level}' if PY2 and (isinstance(prefix, text_type) or isinstance( prefix_sep, text_type) or isinstance(level, text_type)): fstr = u(fstr) return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [ _make_col_name(prefix, prefix_sep, level) for level in levels ] if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=dtype) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)