def shift(self, periods, freq=None): """ Analogous to Series.shift """ # no special handling of fill values yet if not isnull(self.fill_value): # TODO: kwds is not defined...should this work? dense_shifted = self.to_dense().shift(periods, freq=freq, **kwds) # noqa return dense_shifted.to_sparse(fill_value=self.fill_value, kind=self.kind) if periods == 0: return self.copy() if freq is not None: return self._constructor( self.sp_values, sparse_index=self.sp_index, index=self.index.shift(periods, freq), fill_value=self.fill_value).__finalize__(self) int_index = self.sp_index.to_int_index() new_indices = int_index.indices + periods start, end = new_indices.searchsorted([0, int_index.length]) new_indices = new_indices[start:end] new_sp_index = IntIndex(len(self), new_indices) if isinstance(self.sp_index, BlockIndex): new_sp_index = new_sp_index.to_block_index() return self._constructor(self.sp_values[start:end].copy(), index=self.index, sparse_index=new_sp_index, fill_value=self.fill_value).__finalize__(self)
def _check_with_fill_value(values, first, second, fill_value=nan): i_index1 = IntIndex(length, first) i_index2 = IntIndex(length, second) b_index1 = i_index1.to_block_index() b_index2 = i_index2.to_block_index() _check(values, i_index1, i_index2, fill_value) _check(values, b_index1, b_index2, fill_value)
def make_sparse(arr, kind='block', fill_value=nan): """ Convert ndarray to sparse format Parameters ---------- arr : ndarray kind : {'block', 'integer'} fill_value : NaN or another value Returns ------- (sparse_values, index) : (ndarray, SparseIndex) """ arr = np.asarray(arr) length = len(arr) if np.isnan(fill_value): mask = -np.isnan(arr) else: mask = arr != fill_value indices = np.arange(length, dtype=np.int32)[mask] if kind == 'block': locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) elif kind == 'integer': index = IntIndex(length, indices) else: # pragma: no cover raise ValueError('must be block or integer type') sparsified_values = arr[mask] return sparsified_values, index
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories # if all NaN if not dummy_na and len(levels) == 0: if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index) codes = cat.codes.copy() if dummy_na: codes[codes == -1] = len(cat.categories) levels = np.append(cat.categories, np.nan) number_of_cols = len(levels) if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [ [] for _ in range(len(dummy_cols)) ] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), fill_value=0) sparse_series[col] = SparseSeries(data=sarr, index=index) return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) else: dummy_mat = np.eye(number_of_cols).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _make_index(length, indices, kind): if kind == 'block' or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) elif kind == 'integer' or isinstance(kind, IntIndex): index = IntIndex(length, indices) else: # pragma: no cover raise ValueError('must be block or integer type') return index
def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) self.assertEqual(arr.dtype, np.float64) self.assertTrue(np.isnan(arr.fill_value)) arr = SparseArray(data=[0, 1, 2, 3], sparse_index=IntIndex(4, [0, 1, 2, 3]), dtype=np.int64) exp = SparseArray([0, 1, 2, 3], dtype=np.int64) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) self.assertTrue(np.isnan(arr.fill_value)) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[0, 1, 2, 3], sparse_index=IntIndex(4, [0, 1, 2, 3]), dtype=None) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) self.assertTrue(np.isnan(arr.fill_value)) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0)
def test_sparse_reindex(self): length = 10 def _check(values, index1, index2, fill_value): first_series = SparseSeries(values, sparse_index=index1, fill_value=fill_value) reindexed = first_series.sparse_reindex(index2) self.assertIs(reindexed.sp_index, index2) int_indices1 = index1.to_int_index().indices int_indices2 = index2.to_int_index().indices expected = Series(values, index=int_indices1) expected = expected.reindex(int_indices2).fillna(fill_value) tm.assert_almost_equal(expected.values, reindexed.sp_values) # make sure level argument asserts # TODO: expected is not used anywhere...remove? expected = expected.reindex(int_indices2).fillna( fill_value) # noqa def _check_with_fill_value(values, first, second, fill_value=nan): i_index1 = IntIndex(length, first) i_index2 = IntIndex(length, second) b_index1 = i_index1.to_block_index() b_index2 = i_index2.to_block_index() _check(values, i_index1, i_index2, fill_value) _check(values, b_index1, b_index2, fill_value) def _check_all(values, first, second): _check_with_fill_value(values, first, second, fill_value=nan) _check_with_fill_value(values, first, second, fill_value=0) index1 = [2, 4, 5, 6, 8, 9] values1 = np.arange(6.) _check_all(values1, index1, [2, 4, 5]) _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9]) _check_all(values1, index1, [0, 1]) _check_all(values1, index1, [0, 1, 7, 8, 9]) _check_all(values1, index1, []) first_series = SparseSeries(values1, sparse_index=IntIndex(length, index1), fill_value=nan) with tm.assertRaisesRegexp(TypeError, 'new index must be a SparseIndex'): reindexed = first_series.sparse_reindex(0) # noqa
def shift(self, periods, offset=None, timeRule=None): """ Analogous to Series.shift """ # no special handling of fill values yet if not isnull(self.fill_value): dense_shifted = self.to_dense().shift(periods, offset=offset, timeRule=timeRule) return dense_shifted.to_sparse(fill_value=self.fill_value, kind=self.kind) if periods == 0: return self.copy() if timeRule is not None and offset is None: offset = datetools.getOffset(timeRule) if offset is not None: return SparseSeries(self.sp_values, sparse_index=self.sp_index, index=self.index.shift(periods, offset), fill_value=self.fill_value) int_index = self.sp_index.to_int_index() new_indices = int_index.indices + periods start, end = new_indices.searchsorted([0, int_index.length]) new_indices = new_indices[start:end] new_sp_index = IntIndex(len(self), new_indices) if isinstance(self.sp_index, BlockIndex): new_sp_index = new_sp_index.to_block_index() return SparseSeries(self.sp_values[start:end].copy(), index=self.index, sparse_index=new_sp_index, fill_value=self.fill_value)
def shift(self, periods, freq=None, **kwds): """ Analogous to Series.shift """ from pandas.core.datetools import _resolve_offset offset = _resolve_offset(freq, kwds) # no special handling of fill values yet if not isnull(self.fill_value): dense_shifted = self.to_dense().shift(periods, freq=freq, **kwds) return dense_shifted.to_sparse(fill_value=self.fill_value, kind=self.kind) if periods == 0: return self.copy() if offset is not None: return self._constructor(self.sp_values, sparse_index=self.sp_index, index=self.index.shift(periods, offset), fill_value=self.fill_value).__finalize__(self) int_index = self.sp_index.to_int_index() new_indices = int_index.indices + periods start, end = new_indices.searchsorted([0, int_index.length]) new_indices = new_indices[start:end] new_sp_index = IntIndex(len(self), new_indices) if isinstance(self.sp_index, BlockIndex): new_sp_index = new_sp_index.to_block_index() return self._constructor(self.sp_values[start:end].copy(), index=self.index, sparse_index=new_sp_index, fill_value=self.fill_value).__finalize__(self)
def test_equals(self): index = IntIndex(10, [0, 1, 2, 3, 4]) self.assert_(index.equals(index)) self.assert_(not index.equals(IntIndex(10, [0, 1, 2, 3])))
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, dtype=np.uint8) return out else: dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def test_equals(self): index = IntIndex(10, [0, 1, 2, 3, 4]) self.assertTrue(index.equals(index)) self.assertFalse(index.equals(IntIndex(10, [0, 1, 2, 3])))
def test_to_int_index(self): index = IntIndex(10, [2, 3, 4, 5, 6]) self.assertIs(index.to_int_index(), index)
def test_to_int_index(self): index = IntIndex(10, [2, 3, 4, 5, 6]) self.assert_(index.to_int_index() is index)
def __new__(cls, data, index=None, sparse_index=None, kind='block', fill_value=None, name=None, copy=False): is_sparse_array = isinstance(data, SparseArray) if fill_value is None: if is_sparse_array: fill_value = data.fill_value else: fill_value = nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: index = data.index elif index is not None: assert(len(index) == len(data)) sparse_index = data.sp_index values = np.asarray(data) elif isinstance(data, (Series, dict)): if index is None: index = data.index data = Series(data) values, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: values, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) else: values = data assert(len(values) == sparse_index.npoints) else: if index is None: raise Exception('must pass index!') length = len(index) if data == fill_value or (isnull(data) and isnull(fill_value)): if kind == 'block': sparse_index = BlockIndex(length, [], []) else: sparse_index = IntIndex(length, []) values = np.array([]) else: if kind == 'block': locs, lens = ([0], [length]) if length else ([], []) sparse_index = BlockIndex(length, locs, lens) else: sparse_index = IntIndex(length, index) values = np.empty(length) values.fill(data) if index is None: index = com._default_index(sparse_index.length) index = _ensure_index(index) # Create array, do *not* copy data by default if copy: subarr = np.array(values, dtype=np.float64, copy=True) else: subarr = np.asarray(values, dtype=np.float64) if index.is_all_dates: cls = SparseTimeSeries # Change the class of the array to be the subclass type. output = subarr.view(cls) output.sp_index = sparse_index output.fill_value = np.float64(fill_value) output.index = index output.name = name return output
def __init__(self, data=None, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False): # we are called internally, so short-circuit if fastpath: # data is an ndarray, index is defined if not isinstance(data, SingleBlockManager): data = SingleBlockManager(data, index, fastpath=True) if copy: data = data.copy() else: if data is None: data = [] if isinstance(data, Series) and name is None: name = data.name is_sparse_array = isinstance(data, SparseArray) if fill_value is None: if is_sparse_array: fill_value = data.fill_value else: fill_value = np.nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: index = data.index.view() elif index is not None: assert (len(index) == len(data)) sparse_index = data.sp_index data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() # extract the SingleBlockManager data = data._data elif isinstance(data, (Series, dict)): if index is None: index = data.index.view() data = Series(data) data, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: data, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) else: assert (len(data) == sparse_index.npoints) elif isinstance(data, SingleBlockManager): if dtype is not None: data = data.astype(dtype) if index is None: index = data.index.view() else: data = data.reindex(index, copy=False) else: length = len(index) if data == fill_value or (isnull(data) and isnull(fill_value)): if kind == 'block': sparse_index = BlockIndex(length, [], []) else: sparse_index = IntIndex(length, []) data = np.array([]) else: if kind == 'block': locs, lens = ([0], [length]) if length else ([], []) sparse_index = BlockIndex(length, locs, lens) else: sparse_index = IntIndex(length, index) v = data data = np.empty(length) data.fill(v) if index is None: index = com._default_index(sparse_index.length) index = _ensure_index(index) # create/copy the manager if isinstance(data, SingleBlockManager): if copy: data = data.copy() else: # create a sparse array if not isinstance(data, SparseArray): data = SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy) data = SingleBlockManager(data, index) generic.NDFrame.__init__(self, data) self.index = index self.name = name