def _concat_sparse(to_concat, axis=0, typs=None): """ provide concatenation of an sparse/dense array of arrays each of which is a single dtype Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation typs : set of to_concat dtypes Returns ------- a single array, preserving the combined dtypes """ from pandas.core.arrays import SparseArray fill_values = [ x.fill_value for x in to_concat if isinstance(x, SparseArray) ] fill_value = fill_values[0] # TODO: Fix join unit generation so we aren't passed this. to_concat = [ x if isinstance(x, SparseArray) else SparseArray(x.squeeze(), fill_value=fill_value) for x in to_concat ] return SparseArray._concat_same_type(to_concat)
def test_setitem_with_unaligned_sparse_value(self): df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) df["new_column"] = sp_series expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected)
def test_abs_operator(self):
    # abs() with a NaN fill value: only the stored values change sign.
    arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
    tm.assert_sp_array_equal(
        SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8),
        abs(arr),
    )

    # abs() with an integer fill value: the fill value is mapped as well.
    arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
    tm.assert_sp_array_equal(
        SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8),
        abs(arr),
    )
def test_invert_operator(self):
    # ~ on a boolean sparse array flips both sp_values and fill_value.
    arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
    expected = SparseArray(
        np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
    )
    tm.assert_sp_array_equal(expected, ~arr)

    # ~ on an integer sparse array is bitwise NOT, including the fill.
    arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
    expected = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
    tm.assert_sp_array_equal(expected, ~arr)
def test_invert(fill_value): arr = np.array([True, False, False, True]) sparray = SparseArray(arr, fill_value=fill_value) result = ~sparray expected = SparseArray(~arr, fill_value=not fill_value) tm.assert_sp_array_equal(result, expected) result = ~pd.Series(sparray) expected = pd.Series(expected) tm.assert_series_equal(result, expected) result = ~pd.DataFrame({"A": sparray}) expected = pd.DataFrame({"A": expected}) tm.assert_frame_equal(result, expected)
def notna(self):
    """
    Return a same-shaped sparse boolean series marking non-NA positions.
    """
    # Apply notna() only to the materialized sp_values, reusing the existing
    # sparse index so the layout is unchanged; the fill_value is mapped
    # through notna() too, so gap positions report correctly.
    arr = SparseArray(
        notna(self.values.sp_values),
        sparse_index=self.values.sp_index,
        fill_value=notna(self.fill_value),
    )
    # Rewrap in the same series type (index preserved) and propagate
    # metadata from self via __finalize__.
    return self._constructor(arr, index=self.index).__finalize__(self)
def _unpickle_series_compat(self, state): nd_state, own_state = state # recreate the ndarray data = np.empty(nd_state[1], dtype=nd_state[2]) np.ndarray.__setstate__(data, nd_state) index, fill_value, sp_index = own_state[:3] name = None if len(own_state) > 3: name = own_state[3] # create a sparse array if not isinstance(data, SparseArray): data = SparseArray(data, sparse_index=sp_index, fill_value=fill_value, copy=False) # recreate data = SingleBlockManager(data, index, fastpath=True) generic.NDFrame.__init__(self, data) self._set_axis(0, index) self.name = name
def test_concat_sparse():
    # GH 23557: concatenating sparse Series along columns must keep the
    # sparse dtype in the resulting DataFrame.
    ser = Series(SparseArray([0, 1, 2]))

    result = pd.concat([ser, ser], axis=1)

    expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
        pd.SparseDtype(np.int64, 0)
    )
    tm.assert_frame_equal(result, expected)
def _concat_sparse(to_concat, axis=0, typs=None): """ provide concatenation of an sparse/dense array of arrays each of which is a single dtype Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation typs : set of to_concat dtypes Returns ------- a single array, preserving the combined dtypes """ from pandas.core.arrays import SparseArray fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] fill_value = fill_values[0] # TODO: Fix join unit generation so we aren't passed this. to_concat = [x if isinstance(x, SparseArray) else SparseArray(x.squeeze(), fill_value=fill_value) for x in to_concat] return SparseArray._concat_same_type(to_concat)
def test_setitem_with_sparse_value(self): # GH#8131 df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) sp_array = SparseArray([0, 0, 1]) df["new_column"] = sp_array expected = Series(sp_array, name="new_column") tm.assert_series_equal(df["new_column"], expected)
def __init__(self, *args, **kwargs):
    """
    A pandas DataFrame wrapper for one of Table's numpy arrays:
        - sets index values corresponding to Orange's global row indices
          e.g. ['_o1', '_o2'] (allows Orange to handle selection)
        - remembers the array's role in the Table (attribute, class var, meta)
        - keeps the Variable objects, and uses them in back-to-table conversion,
          should a column name match a variable's name
        - stores weight values (legacy)

    Parameters
    ----------
    table : Table
    orange_role : Role, (default=Role.Attribute)
        When converting back to an orange table, the DataFrame will
        convert to the right role (attrs, class vars, or metas)
    """
    # Not constructed from a Table -> behave exactly like a plain DataFrame.
    if len(args) <= 0 or not isinstance(args[0], Table):
        super().__init__(*args, **kwargs)
        return
    table = args[0]

    # Role may arrive as a keyword or as the second positional argument.
    if 'orange_role' in kwargs:
        role = kwargs.pop('orange_role')
    elif len(args) >= 2:
        role = args[1]
    else:
        role = Role.Attribute

    # Pick the Table array and matching variables for the requested role.
    if role == Role.Attribute:
        data = table.X
        vars_ = table.domain.attributes
    elif role == Role.ClassAttribute:
        data = table.Y
        vars_ = table.domain.class_vars
    else:  # if role == Role.Meta:
        data = table.metas
        vars_ = table.domain.metas

    # Global Orange row ids become the DataFrame index ('_o<id>').
    index = ['_o' + str(id_) for id_ in table.ids]
    varsdict = {var._name: var for var in vars_}
    columns = varsdict.keys()

    if sp.issparse(data):
        # Column-wise CSC access is cheap; each column becomes a pandas
        # SparseArray so the frame stays sparse end-to-end.
        data = data.asformat('csc')
        sparrays = [SparseArray.from_spmatrix(data[:, i])
                    for i in range(data.shape[1])]
        data = dict(enumerate(sparrays))
        super().__init__(data, index=index, **kwargs)
        self.columns = columns
        # a hack to keep Orange df _metadata in sparse->dense conversion
        self.sparse.to_dense = self.__patch_constructor(self.sparse.to_dense)
    else:
        super().__init__(data=data, index=index, columns=columns, **kwargs)

    # Orange-specific metadata carried along for back-to-Table conversion.
    self.orange_role = role
    self.orange_variables = varsdict
    self.orange_weights = (dict(zip(index, table.W))
                           if table.W.size > 0 else {})
    self.orange_attributes = table.attributes
def __init__( self, data=None, index=None, sparse_index=None, kind="block", fill_value=None, name=None, dtype=None, copy=False, fastpath=False, ): warnings.warn(depr_msg, FutureWarning, stacklevel=2) # TODO: Most of this should be refactored and shared with Series # 1. BlockManager -> array # 2. Series.index, Series.name, index, name reconciliation # 3. Implicit reindexing # 4. Implicit broadcasting # 5. Dict construction if data is None: data = [] elif isinstance(data, SingleBlockManager): index = data.index data = data.blocks[0].values elif isinstance(data, (ABCSeries, ABCSparseSeries)): index = data.index if index is None else index dtype = data.dtype if dtype is None else dtype name = data.name if name is None else name if index is not None: data = data.reindex(index) elif isinstance(data, abc.Mapping): data, index = Series()._init_dict(data, index=index) elif is_scalar(data) and index is not None: data = np.full(len(index), fill_value=data) if isinstance(data, SingleBlockManager): # SparseArray doesn't accept SingleBlockManager index = data.index data = data.blocks[0].values super().__init__( SparseArray( data, sparse_index=sparse_index, kind=kind, dtype=dtype, fill_value=fill_value, copy=copy, ), index=index, name=name, copy=False, fastpath=fastpath, )
def as_sparse_array(self, kind=None, fill_value=None, copy=False):
    """
    Return my values as a SparseArray; no copy is made by default.
    """
    # Fall back to this series' own kind/fill_value when not overridden.
    resolved_fill = self.fill_value if fill_value is None else fill_value
    resolved_kind = self.kind if kind is None else kind
    return SparseArray(
        self.values,
        sparse_index=self.sp_index,
        fill_value=resolved_fill,
        kind=resolved_kind,
        copy=copy,
    )
def _set_value(self, label, value, takeable=False):
    # Work in dense space: sparse storage cannot be mutated in place.
    dense = self.to_dense()

    # If the label doesn't exist, _set_value may hand back a brand-new
    # object with a grown index.
    maybe_new = dense._set_value(label, value, takeable=takeable)
    if maybe_new is not None:
        dense = maybe_new
    new_index = dense.index

    # Re-sparsify with the original fill_value/kind and swap in the data.
    sparse_vals = SparseArray(dense, fill_value=self.fill_value, kind=self.kind)
    self._data = SingleBlockManager(sparse_vals, new_index)
    self._index = new_index
def _set_values(self, key, value):
    # This might be inefficient as we have to recreate the sparse array
    # rather than setting individual elements, but we have to convert
    # the passed slice/boolean that's in dense space into a sparse
    # indexer, and it's not clear how to do that.
    if isinstance(key, Series):
        key = key.values

    dense = self.values.to_dense()
    dense[key] = libindex.convert_scalar(dense, value)

    rebuilt = SparseArray(dense, fill_value=self.fill_value, kind=self.kind)
    self._data = SingleBlockManager(rebuilt, self.index)
def test_getitem_sparse_column_return_type_and_dtype(self): # https://github.com/pandas-dev/pandas/issues/23559 data = SparseArray([0, 1]) df = DataFrame({"A": data}) expected = Series(data, name="A") result = df["A"] tm.assert_series_equal(result, expected) # Also check iloc and loc while we're here result = df.iloc[:, 0] tm.assert_series_equal(result, expected) result = df.loc[:, "A"] tm.assert_series_equal(result, expected)
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):
    """
    Build a SparseSeries, normalizing the accepted input types (manager,
    Series, mapping, scalar, array-like) down to something SparseArray
    can consume.
    """
    # TODO: Most of this should be refactored and shared with Series
    # 1. BlockManager -> array
    # 2. Series.index, Series.name, index, name reconciliation
    # 3. Implicit reindexing
    # 4. Implicit broadcasting
    # 5. Dict construction
    if data is None:
        data = []
    elif isinstance(data, SingleBlockManager):
        # Unwrap the manager: take its index and raw block values.
        index = data.index
        data = data.blocks[0].values
    elif isinstance(data, (ABCSeries, ABCSparseSeries)):
        # Explicit arguments win over the source series' attributes.
        index = data.index if index is None else index
        dtype = data.dtype if dtype is None else dtype
        name = data.name if name is None else name
        if index is not None:
            data = data.reindex(index)
    elif isinstance(data, compat.Mapping):
        data, index = Series()._init_dict(data, index=index)
    elif is_scalar(data) and index is not None:
        # Broadcast the scalar across the given index.
        data = np.full(len(index), fill_value=data)

    super(SparseSeries, self).__init__(SparseArray(data,
                                                   sparse_index=sparse_index,
                                                   kind=kind,
                                                   dtype=dtype,
                                                   fill_value=fill_value,
                                                   copy=copy),
                                       index=index, name=name,
                                       copy=False, fastpath=fastpath)
def sparse_reindex(self, new_index):
    """
    Conform sparse values to new SparseIndex

    Parameters
    ----------
    new_index : {BlockIndex, IntIndex}

    Returns
    -------
    reindexed : SparseSeries
    """
    if not isinstance(new_index, splib.SparseIndex):
        raise TypeError("new index must be a SparseIndex")
    values = self.values
    # Reindex at the C level: the IntIndex form of the current sp_index
    # maps the stored values onto new_index; sp_values are upcast to
    # float64 because the C routine works on doubles.
    values = values.sp_index.to_int_index().reindex(
        values.sp_values.astype("float64"), values.fill_value, new_index)
    values = SparseArray(values,
                         sparse_index=new_index,
                         fill_value=self.values.fill_value)
    # Preserve the (dense) row index and propagate metadata from self.
    return self._constructor(values, index=self.index).__finalize__(self)
def _set_value(self, label, value, takeable=False):
    """
    Quickly set single value at passed label. If label is not contained,
    a new object is created with the label placed at the end of the
    result index

    .. deprecated:: 0.21.0

    Please use .at[] or .iat[] accessors.

    Parameters
    ----------
    label : object
        Partial indexing with MultiIndex not allowed
    value : object
        Scalar value
    takeable : interpret the index as indexers, default False

    Notes
    -----
    This method *always* returns a new object. It is not particularly
    efficient but is provided for API compatibility with Series

    Returns
    -------
    series : SparseSeries
    """
    # Work in dense space: sparse storage cannot be mutated in place.
    values = self.to_dense()

    # if the label doesn't exist, we will create a new object here
    # and possibly change the index
    new_values = values._set_value(label, value, takeable=takeable)
    if new_values is not None:
        values = new_values
    new_index = values.index
    # Re-sparsify with the original fill_value/kind and swap in the data.
    values = SparseArray(values, fill_value=self.fill_value, kind=self.kind)
    self._data = SingleBlockManager(values, new_index)
    self._index = new_index
def test_unary_op(op, fill_value): arr = np.array([0, 1, np.nan, 2]) sparray = SparseArray(arr, fill_value=fill_value) result = op(sparray) expected = SparseArray(op(arr), fill_value=op(fill_value)) tm.assert_sp_array_equal(result, expected)
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    """
    Build the one-hot (dummy) DataFrame for a single 1-D input.

    Columns are named ``{prefix}{prefix_sep}{level}`` when a prefix is
    given; with ``sparse=True`` each column is a SparseArray-backed
    Series, otherwise a dense identity-matrix take is used.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Zero-column frame preserving the input's row labels.
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        # NaN gets its own level appended at the end.
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Choose the fill_value matching the requested dtype so "absent"
        # entries cost no storage.
        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        # One list of row positions per dummy column.
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    """
    Build the one-hot (dummy) DataFrame for a single 1-D input.

    Columns are named ``{prefix}{prefix_sep}{level}`` when a prefix is
    given; with ``sparse=True`` each column is a SparseArray-backed
    Series, otherwise a dense identity-matrix take is used.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        # Zero-column frame preserving the input's row labels.
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        # NaN gets its own level appended at the end.
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = "{prefix}{prefix_sep}{level}"
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [_make_col_name(prefix, prefix_sep, level)
                      for level in levels]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # Choose the fill_value matching the requested dtype so "absent"
        # entries cost no storage.
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        # One list of row positions per dummy column.
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def create_block(typestr, placement, item_shape=None, num_offset=0,
                 maker=new_block):
    """
    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    # Default each item to N rows (module-level fixture length).
    if item_shape is None:
        item_shape = (N, )

    shape = (num_items, ) + item_shape

    mat = get_numeric_mat(shape)

    # Dispatch on typestr to fabricate values of the requested dtype;
    # num_offset shifts numeric values so distinct blocks don't collide.
    if typestr in (
        "float", "f8", "f4", "f2",
        "int", "i8", "i4", "i2", "i1",
        "uint", "u8", "u4", "u2", "u1",
    ):
        values = mat.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset],
                            shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz; the tz name is embedded in the typestr.
        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert m is not None, f"incompatible typestr -> {typestr}"
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)._data
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category", ):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2", ):
        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c",
                              "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith("_na"):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value,
             6],
            fill_value=fill_value,
        )
        # Shift the stored values in place through a view of sp_values.
        arr = values.sp_values.view()
        arr += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return maker(values, placement=placement, ndim=len(shape))
if is_datetime64_dtype(any_numpy_dtype): assert isinstance(result, DatetimeArray) elif is_timedelta64_dtype(any_numpy_dtype): assert isinstance(result, TimedeltaArray) else: assert isinstance(result, PandasArray) @pytest.mark.parametrize( "array, attr", [ (pd.Categorical(["a", "b"]), "_codes"), (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), (pd.core.arrays.integer_array([0, np.nan]), "_data"), (IntervalArray.from_breaks([0, 1]), "_left"), (SparseArray([0, 1]), "_sparse_values"), (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), # tz-aware Datetime ( DatetimeArray( np.array(["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]"), dtype=DatetimeTZDtype(tz="US/Central"), ), "_data", ), ], ) def test_array(array, attr, index_or_series): box = index_or_series if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: