def _init_data(self, data, copy, dtype, **kwargs):
    """
    Build the block manager for an N-dimensional container and pass it to
    ``NDFrame.__init__``.

    Axis labels are read from ``kwargs`` using the names listed in
    ``self._AXIS_ORDERS``; a name that is absent counts as "not supplied".

    ``data`` may be ``None`` (treated as an empty dict), a ``BlockManager``,
    a dict, an ``ndarray`` or a list.  Any other type raises ``PandasError``.
    """
    data = {} if data is None else data
    if dtype is not None:
        dtype = self._validate_dtype(dtype)

    passed_axes = [kwargs.get(name) for name in self._AXIS_ORDERS]
    axes = None

    if isinstance(data, BlockManager):
        mgr = data
        # Override the manager's axes only if the caller supplied at least
        # one; missing entries fall back to the manager's own axes.
        if not all(given is None for given in passed_axes):
            axes = [existing if given is None else given
                    for given, existing in zip(passed_axes, data.axes)]
    elif isinstance(data, dict):
        mgr = self._init_dict(data, passed_axes, dtype=dtype)
        # dtype was handed to the helper, so it is not forwarded again
        copy, dtype = False, None
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
        # dtype/copy were handed to the helper, so they are not forwarded again
        copy, dtype = False, None
    else:  # pragma: no cover
        raise PandasError('Panel constructor not properly called!')

    NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
def _init_data(self, data, copy, dtype, **kwargs):
    """
    Generic N-dimensional initialisation.

    The axes are looked up in ``kwargs`` by the names in
    ``self._AXIS_ORDERS``.  A manager is built (or reused) according to the
    type of ``data`` and then passed to ``NDFrame.__init__``.

    Raises ``PandasError`` when ``data`` is not ``None``, a ``BlockManager``,
    a dict, an ``ndarray`` or a list.
    """
    if data is None:
        data = {}
    if dtype is not None:
        dtype = self._validate_dtype(dtype)

    passed_axes = [kwargs.get(axis_name) for axis_name in self._AXIS_ORDERS]
    axes = None

    if isinstance(data, BlockManager):
        mgr = data
        if any(supplied is not None for supplied in passed_axes):
            # fill the gaps in the supplied axes from the manager itself
            axes = []
            for supplied, current in zip(passed_axes, data.axes):
                axes.append(current if supplied is None else supplied)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, passed_axes, dtype=dtype)
        copy = False
        dtype = None
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
        copy = False
        dtype = None
    else:  # pragma: no cover
        raise PandasError('Panel constructor not properly called!')

    NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
def _set_new_item(self, name: str, value: np.ndarray) -> None:
    """Store ``value`` under ``name`` by calling ``NDFrame._set_item`` directly.

    Using the pandas-internal setter is meant to avoid
    ``SettingWithCopyWarning``; see
    https://github.com/pandas-dev/pandas/blob/v1.1.0/pandas/core/frame.py#L3114
    """  # noqa: E501
    internal_setter = NDFrame._set_item
    internal_setter(self, name, value)
def encode(self, obj: NDFrame, description: Optional[str], params: Optional[Dict]) -> FrameData:
    """Serialise ``obj`` to CSV and wrap the bytes in a ``FrameData``.

    The CSV is written with every field quoted (``QUOTE_ALL``), using this
    encoder's ``index``, ``header`` and ``encoding`` settings.  The settings
    dict stored alongside the content records those options, the schema
    (prefixed by the index dtype when the index is written) and the pandas
    version.
    """
    csv_text = StringIO()
    obj.to_csv(
        csv_text,
        index=self.index,
        header=self.header,
        encoding=self.encoding,
        quoting=QUOTE_ALL,
    )
    # the index dtype leads the schema only when the index is written out
    index_type = [str(obj.index.dtype)] if self.index else []

    content = BytesContent(csv_text.getvalue().encode(self.encoding))
    media_type = MediaType("text/csv", self.application())
    settings = {
        "header": self.header,
        "index": self.index,
        "schema": index_type + self.schema(obj),
        "encoding": self.encoding,
        "version": pandas_version,
    }
    return FrameData(content, media_type, description, params, settings)
class TestNDFrame(unittest.TestCase):
    """Basic checks of ``NDFrame`` construction, casting and ``squeeze``."""

    _multiprocess_can_split_ = True

    def setUp(self):
        tdf = t.makeTimeDataFrame()
        self.ndf = NDFrame(tdf._data)

    def test_constructor(self):
        # with cast
        ndf = NDFrame(self.ndf._data, dtype=np.int64)
        # assertEqual replaces the ``assert_`` alias (removed in Python 3.12)
        self.assertEqual(ndf.values.dtype, np.int64)

    def test_ndim(self):
        # assertEqual replaces the ``assertEquals`` alias (removed in 3.12)
        self.assertEqual(self.ndf.ndim, 2)

    def test_astype(self):
        casted = self.ndf.astype(int)
        self.assertEqual(casted.values.dtype, np.int_)

        casted = self.ndf.astype(np.int32)
        self.assertEqual(casted.values.dtype, np.int32)

    def test_squeeze(self):
        # noop: nothing to squeeze when no axis has length 1
        for s in [t.makeFloatSeries(), t.makeStringSeries(),
                  t.makeObjectSeries()]:
            t.assert_series_equal(s.squeeze(), s)
        for df in [t.makeTimeDataFrame()]:
            t.assert_frame_equal(df.squeeze(), df)
        for p in [t.makePanel()]:
            t.assert_panel_equal(p.squeeze(), p)
        for p4d in [t.makePanel4D()]:
            t.assert_panel4d_equal(p4d.squeeze(), p4d)

        # squeezing: length-1 axes are dropped
        df = t.makeTimeDataFrame().reindex(columns=['A'])
        t.assert_series_equal(df.squeeze(), df['A'])

        p = t.makePanel().reindex(items=['ItemA'])
        t.assert_frame_equal(p.squeeze(), p['ItemA'])

        p = t.makePanel().reindex(items=['ItemA'], minor_axis=['A'])
        t.assert_series_equal(p.squeeze(), p.ix['ItemA', :, 'A'])

        p4d = t.makePanel4D().reindex(labels=['label1'])
        t.assert_panel_equal(p4d.squeeze(), p4d['label1'])

        p4d = t.makePanel4D().reindex(labels=['label1'], items=['ItemA'])
        t.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA'])
def tprint(df: NDFrame, head=0, to_latex=False):
    """Print ``df`` as a pipe-style table, optionally followed by LaTeX.

    :param df: Series or DataFrame; a Series is first wrapped in a DataFrame
    :param head: positive -> show the first ``head`` rows, negative -> show
        the last ``-head`` rows, zero -> show everything
    :param to_latex: also print ``DataFrame.to_latex(bold_rows=True)``
    """
    frame = pd.DataFrame(df) if isinstance(df, pd.Series) else df

    if head > 0:
        frame = frame.head(head)
    elif head < 0:
        frame = frame.tail(-head)

    table = tabulate(frame, headers="keys", tablefmt="pipe", floatfmt=".3f")
    print(table + '\n')

    if to_latex:
        print(frame.to_latex(bold_rows=True))
class TestNDFrame(unittest.TestCase):
    """Basic checks of ``NDFrame`` construction, casting and ``squeeze``."""

    _multiprocess_can_split_ = True

    def setUp(self):
        tdf = t.makeTimeDataFrame()
        self.ndf = NDFrame(tdf._data)

    def test_constructor(self):
        # with cast
        ndf = NDFrame(self.ndf._data, dtype=np.int64)
        # ``assert_`` is a deprecated alias that Python 3.12 removed
        self.assertEqual(ndf.values.dtype, np.int64)

    def test_ndim(self):
        # ``assertEquals`` is a deprecated alias that Python 3.12 removed
        self.assertEqual(self.ndf.ndim, 2)

    def test_astype(self):
        casted = self.ndf.astype(int)
        self.assertEqual(casted.values.dtype, np.int_)

        casted = self.ndf.astype(np.int32)
        self.assertEqual(casted.values.dtype, np.int32)

    def test_squeeze(self):
        # noop
        for s in [t.makeFloatSeries(), t.makeStringSeries(),
                  t.makeObjectSeries()]:
            t.assert_series_equal(s.squeeze(), s)
        for df in [t.makeTimeDataFrame()]:
            t.assert_frame_equal(df.squeeze(), df)
        for p in [t.makePanel()]:
            t.assert_panel_equal(p.squeeze(), p)
        for p4d in [t.makePanel4D()]:
            t.assert_panel4d_equal(p4d.squeeze(), p4d)

        # squeezing
        df = t.makeTimeDataFrame().reindex(columns=['A'])
        t.assert_series_equal(df.squeeze(), df['A'])

        p = t.makePanel().reindex(items=['ItemA'])
        t.assert_frame_equal(p.squeeze(), p['ItemA'])

        p = t.makePanel().reindex(items=['ItemA'], minor_axis=['A'])
        t.assert_series_equal(p.squeeze(), p.ix['ItemA', :, 'A'])

        p4d = t.makePanel4D().reindex(labels=['label1'])
        t.assert_panel_equal(p4d.squeeze(), p4d['label1'])

        p4d = t.makePanel4D().reindex(labels=['label1'], items=['ItemA'])
        t.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA'])
def __setitem__(self, key, value):
    """
    Store ``value`` as item ``key`` of this 3-dimensional container.

    ``value`` may be a ``DataFrame`` (reindexed to this object's major and
    minor axes), an ``ndarray`` whose shape equals the last two entries of
    ``self.shape``, or a scalar that is broadcast to that shape.

    Raises
    ------
    AssertionError
        If an ``ndarray`` has the wrong shape.
    TypeError
        If ``value`` is none of the supported types.
    """
    _, N, K = self.shape
    if isinstance(value, DataFrame):
        value = value.reindex(index=self.major_axis,
                              columns=self.minor_axis)
        mat = value.values
    elif isinstance(value, np.ndarray):
        # explicit raise (same exception type as the former ``assert``) so
        # the check is not stripped when running under ``python -O``
        if value.shape != (N, K):
            raise AssertionError('Shape of values must be (%d, %d), not %s'
                                 % (N, K, value.shape))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype = _infer_dtype(value)
        mat = np.empty((N, K), dtype=dtype)
        mat.fill(value)
    else:
        # previously fell through and failed on the unbound name ``mat``
        raise TypeError('Cannot set item of type: %s' % str(type(value)))

    mat = mat.reshape((1, N, K))
    NDFrame._set_item(self, key, mat)
def __init__(self, data=None, items=None, major_axis=None, minor_axis=None,
             copy=False, dtype=None):
    """
    Represents wide format panel data, stored as 3-dimensional array

    Parameters
    ----------
    data : BlockManager, dict, ndarray (items x major x minor) or list
        ``None`` is treated as an empty dict
    items : Index or array-like
        axis=0
    major_axis : Index or array-like
        axis=1
    minor_axis : Index or array-like
        axis=2
    dtype : dtype, default None
        Data type to force, otherwise infer
    copy : boolean, default False
        Forwarded to the matrix initialiser for ndarray / list input and to
        ``NDFrame.__init__`` for BlockManager input; not used for dict input
    """
    data = {} if data is None else data

    passed_axes = [items, major_axis, minor_axis]
    axes = None

    if isinstance(data, BlockManager):
        mgr = data
        # explicit axes win; the manager's own axes fill the gaps
        if not all(given is None for given in passed_axes):
            axes = [existing if given is None else given
                    for given, existing in zip(passed_axes, data.axes)]
    elif isinstance(data, dict):
        mgr = self._init_dict(data, passed_axes, dtype=dtype)
        copy, dtype = False, None
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
        copy, dtype = False, None
    else:  # pragma: no cover
        raise PandasError('Panel constructor not properly called!')

    NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
def to_json(
    path_or_buf,
    obj: NDFrame,
    orient: str | None = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Callable[[Any], JSONSerializable] | None = None,
    lines: bool = False,
    compression: CompressionOptions = "infer",
    index: bool = True,
    indent: int = 0,
    storage_options: StorageOptions = None,
):
    """
    Serialise a Series or DataFrame to JSON.

    The text is returned when ``path_or_buf`` is ``None``; otherwise it is
    written through ``get_handle`` (which receives ``compression`` and
    ``storage_options``) and ``None`` is returned.

    Raises
    ------
    ValueError
        If ``index=False`` is combined with an orient other than
        ``"split"`` / ``"table"``, or ``lines=True`` with an orient other
        than ``"records"``.
    NotImplementedError
        If ``obj`` is neither a Series nor a DataFrame.
    """
    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'")

    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    as_table = orient == "table"
    if as_table and isinstance(obj, Series):
        # the table orient works on frames; unnamed series get "values"
        obj = obj.to_frame(name=obj.name or "values")

    writer: type[Writer]
    if isinstance(obj, DataFrame):
        writer = JSONTableWriter if as_table else FrameWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    serializer = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    )
    s = serializer.write()

    if lines:
        s = convert_to_line_delimits(s)

    if path_or_buf is None:
        return s

    # apply compression and byte/text conversion
    with get_handle(path_or_buf, "w", compression=compression,
                    storage_options=storage_options) as handles:
        handles.handle.write(s)
def __setitem__(self, key, value):
    """
    Set item ``key`` of this 3-dimensional container to ``value``.

    Accepted values: a ``DataFrame`` (reindexed to ``major_axis`` /
    ``minor_axis``), an ``ndarray`` of shape ``self.shape[1:]``, or a scalar
    that is broadcast to that shape.

    Raises
    ------
    AssertionError
        If an ``ndarray`` does not have shape ``self.shape[1:]``.
    TypeError
        If ``value`` is of an unsupported type.
    """
    _, N, K = self.shape
    if isinstance(value, DataFrame):
        value = value.reindex(index=self.major_axis,
                              columns=self.minor_axis)
        mat = value.values
    elif isinstance(value, np.ndarray):
        # raised explicitly rather than via ``assert`` so that the check
        # still runs under ``python -O``; the exception type is unchanged
        if value.shape != (N, K):
            raise AssertionError('Shape of values must be (%d, %d), not %s'
                                 % (N, K, value.shape))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype = _infer_dtype(value)
        mat = np.empty((N, K), dtype=dtype)
        mat.fill(value)
    else:
        # without this branch ``mat`` was unbound below
        raise TypeError('Cannot set item of type: %s' % str(type(value)))

    mat = mat.reshape((1, N, K))
    NDFrame._set_item(self, key, mat)
def _get_sharpe_ratio(price_series: NDFrame, rf_series: Series, scale_to_annualise: bool):
    """
    Return the Sharpe ratio from asset prices and risk-free asset prices.

    Period-on-period price ratios are computed for both inputs; the ratio is
    the mean of their difference divided by the standard deviation of that
    difference (arithmetic mean of actual returns).

    :param price_series - a pandas series or data frame of prices
    :param rf_series - a pandas series representing the total return/price
        series of the risk free rate
    :param scale_to_annualise - when true, the result is multiplied by the
        annualisation factor derived from ``price_series.index``

    Notes
    -------
    https://en.wikipedia.org/wiki/Sharpe_ratio
    """
    asset_growth = (price_series / price_series.shift(1)).dropna()
    rf_growth = (rf_series / rf_series.shift(1)).dropna()
    excess = asset_growth.subtract(rf_growth, axis=0)

    avg_excess = excess.mean()
    vol = excess.std()

    if scale_to_annualise:
        annualising_scaling = TSeriesHelper._get_annualisation_factor(price_series.index)
    else:
        annualising_scaling = 1

    return avg_excess * annualising_scaling / vol
def panel4d_reindex(self, labs=None, labels=None, items=None, major_axis=None,
                    minor_axis=None, axis=None, **kwargs):
    """
    Reindex wrapper kept as a hack for the ``reindex_axis`` deprecation.

    ``labs`` is the optional positional-style argument; ``labels``,
    ``items``, ``major_axis``, ``minor_axis`` and ``axis`` are forwarded as
    keywords when they are not ``None``.  The arguments are normalised by
    ``validate_axis_style_args`` and the result is merged into ``kwargs``
    before delegating to ``NDFrame.reindex``.
    """
    # "labels" is used for two different things here, hence the separate
    # ``labs`` argument
    args = () if labs is None else (labs,)

    named = (
        ('labels', labels),
        ('items', items),
        ('major_axis', major_axis),
        ('minor_axis', minor_axis),
        ('axis', axis),
    )
    kwargs_ = {name: val for name, val in named if val is not None}

    axes = validate_axis_style_args(self, args, kwargs_, 'labs', 'reindex')
    kwargs.update(axes)
    return NDFrame.reindex(self, **kwargs)
def price_to_return(price_series: NDFrame):
    """
    Convert a series of asset prices to a series of returns.

    Each value is divided by the previous one and 1 is subtracted; rows
    containing NaN after the division (e.g. the first row) are dropped.

    :param price_series: pandas price series or dataframe of price series
    :return: the period-on-period returns
    :raises ValueError: if any price equals zero
    """
    contains_zero = (price_series == 0).any(axis=None)
    if contains_zero.any():
        raise ValueError("Cannot convert price series with zeroes to a return")

    growth = price_series / price_series.shift(1)
    growth.dropna(inplace=True)
    return growth - 1
def __setitem__(self, key, value):
    """
    Store ``value`` as item ``key`` of this 3-dimensional container.

    ``value`` may be a ``DataFrame`` (reindexed to this object's major and
    minor axes), an ``ndarray`` of shape ``self.shape[1:]``, or a scalar that
    is broadcast to that shape.

    Raises
    ------
    AssertionError
        If an ``ndarray`` has the wrong shape.
    TypeError
        If ``value`` is none of the supported types.
    """
    _, N, K = self.shape
    if isinstance(value, DataFrame):
        value = value.reindex(index=self.major_axis,
                              columns=self.minor_axis)
        mat = value.values
    elif isinstance(value, np.ndarray):
        if value.shape != (N, K):
            # The message used to reference the undefined name ``values``
            # (NameError) and a fixed ``(%d, %d)`` pattern that failed for
            # arrays that are not 2-D; ``%s`` formats any shape tuple.
            raise AssertionError("Shape of values must be (%d, %d), not %s"
                                 % (N, K, value.shape))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype = _infer_dtype(value)
        mat = np.empty((N, K), dtype=dtype)
        mat.fill(value)
    else:
        raise TypeError("Cannot set item of type: %s" % str(type(value)))

    mat = mat.reshape((1, N, K))
    NDFrame._set_item(self, key, mat)
def remove_empty_values(data_frame: NDFrame) -> NDFrame:
    """Return a copy of ``data_frame`` with missing values replaced by ``" "``.

    After filling, the result is checked for remaining nulls; if any are
    found a message is printed and an ``Exception`` is raised.
    """
    modified_data_set = data_frame.fillna(" ")
    remaining_nulls = panda.isnull(modified_data_set).sum()

    if not remaining_nulls.any():
        return modified_data_set

    print("Has some empties values in the data frame")
    raise Exception("Problem in load data set, we need to remove them")
def __setitem__(self, key, value):
    """
    Store ``value`` as item ``key`` of this N-dimensional container.

    ``value`` may be an instance of ``self._constructor_sliced`` (reindexed
    to all axes but the first), an ``ndarray`` of shape ``self.shape[1:]``,
    or a scalar that is broadcast to that shape.

    Raises
    ------
    AssertionError
        If an ``ndarray`` has the wrong shape.
    TypeError
        If ``value`` is none of the supported types.
    """
    shape = tuple(self.shape)
    if isinstance(value, self._constructor_sliced):
        value = value.reindex(
            **self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]))
        mat = value.values
    elif isinstance(value, np.ndarray):
        if value.shape != shape[1:]:
            # same exception type as before, now with a diagnostic message
            raise AssertionError(
                'shape of value must be {0}, shape of given '
                'object was {1}'.format(shape[1:], value.shape))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype, value = _infer_dtype_from_scalar(value)
        mat = np.empty(shape[1:], dtype=dtype)
        mat.fill(value)
    else:
        raise TypeError('Cannot set item of type: %s' % str(type(value)))

    # add the leading length-1 item axis expected by the setter
    mat = mat.reshape((1,) + shape[1:])
    NDFrame._set_item(self, key, mat)
def __setitem__(self, key, value):
    """
    Store ``value`` as item ``key`` of this N-dimensional container.

    ``value`` may be an instance of ``self._constructor_sliced`` (reindexed
    to all axes but the first), an ``ndarray`` of shape ``self.shape[1:]``,
    or a scalar that is broadcast to that shape.

    Raises
    ------
    ValueError
        If an ``ndarray`` has the wrong shape.
    TypeError
        If ``value`` is none of the supported types.
    """
    full_shape = tuple(self.shape)
    slice_shape = full_shape[1:]

    if isinstance(value, self._constructor_sliced):
        slice_axes = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:])
        value = value.reindex(**slice_axes)
        mat = value.values
    elif isinstance(value, np.ndarray):
        if not value.shape == slice_shape:
            raise ValueError(
                "shape of value must be {0}, shape of given "
                "object was {1}".format(slice_shape, value.shape)
            )
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype, value = _infer_dtype_from_scalar(value)
        mat = np.empty(slice_shape, dtype=dtype)
        mat.fill(value)
    else:
        raise TypeError("Cannot set item of type: %s" % str(type(value)))

    # prepend the length-1 item axis
    mat = mat.reshape((1,) + slice_shape)
    NDFrame._set_item(self, key, mat)
def __init__(self, data=None, items=None, major_axis=None, minor_axis=None,
             copy=False, dtype=None):
    """
    Represents wide format panel data, stored as 3-dimensional array

    Parameters
    ----------
    data : BlockManager, dict, ndarray (items x major x minor) or list
        ``None`` is treated as an empty dict
    items : Index or array-like
        axis=0
    major_axis : Index or array-like
        axis=1
    minor_axis : Index or array-like
        axis=2
    dtype : dtype, default None
        Data type to force, otherwise infer
    copy : boolean, default False
        Forwarded to the matrix initialiser for ndarray / list input and to
        ``NDFrame.__init__`` for BlockManager input; not used for dict input
    """
    if data is None:
        data = {}

    passed_axes = [items, major_axis, minor_axis]
    axes = None

    if isinstance(data, BlockManager):
        mgr = data
        if any(supplied is not None for supplied in passed_axes):
            # keep supplied axes, take the rest from the manager
            axes = []
            for supplied, current in zip(passed_axes, data.axes):
                axes.append(current if supplied is None else supplied)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, passed_axes, dtype=dtype)
        copy = False
        dtype = None
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
        copy = False
        dtype = None
    else:  # pragma: no cover
        raise PandasError('Panel constructor not properly called!')

    NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
def __setitem__(self, key, value):
    """
    Store ``value`` as item ``key`` of this N-dimensional container.

    ``value`` may be an instance of ``self._constructor_sliced`` (reindexed
    to all axes but the first), an ``ndarray`` of shape ``self.shape[1:]``,
    or a scalar that is broadcast to that shape.

    Raises
    ------
    ValueError
        If an ``ndarray`` has the wrong shape.
    TypeError
        If ``value`` is none of the supported types.
    """
    shape = tuple(self.shape)
    expected = shape[1:]

    if isinstance(value, self._constructor_sliced):
        reindex_kwargs = self._construct_axes_dict_for_slice(
            self._AXIS_ORDERS[1:])
        value = value.reindex(**reindex_kwargs)
        mat = value.values
    elif isinstance(value, np.ndarray):
        if value.shape != expected:
            # plain ints keep the reported shape free of numpy scalar reprs
            given = tuple(map(int, value.shape))
            raise ValueError('shape of value must be {0}, shape of given '
                             'object was {1}'.format(expected, given))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype, value = _infer_dtype_from_scalar(value)
        mat = np.empty(expected, dtype=dtype)
        mat.fill(value)
    else:
        raise TypeError('Cannot set item of type: %s' % str(type(value)))

    mat = mat.reshape((1,) + expected)
    NDFrame._set_item(self, key, mat)
def __setitem__(self, key, value):
    """
    Set item ``key`` of this 3-dimensional container to ``value``.

    Accepted values: a ``DataFrame`` (reindexed to ``major_axis`` /
    ``minor_axis``), an ``ndarray`` of shape ``self.shape[1:]``, or a scalar
    that is broadcast to that shape.

    Raises
    ------
    AssertionError
        If an ``ndarray`` does not have shape ``self.shape[1:]``.
    TypeError
        If ``value`` is of an unsupported type.
    """
    _, N, K = self.shape
    if isinstance(value, DataFrame):
        value = value.reindex(index=self.major_axis,
                              columns=self.minor_axis)
        mat = value.values
    elif isinstance(value, np.ndarray):
        if value.shape != (N, K):
            # Fixed: the message referenced the undefined name ``values``,
            # so a NameError masked the intended error.  ``%s`` also copes
            # with arrays that are not 2-D.
            raise AssertionError('Shape of values must be (%d, %d), not %s'
                                 % (N, K, value.shape))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype = _infer_dtype(value)
        mat = np.empty((N, K), dtype=dtype)
        mat.fill(value)
    else:
        raise TypeError('Cannot set item of type: %s' % str(type(value)))

    mat = mat.reshape((1, N, K))
    NDFrame._set_item(self, key, mat)
def pop(self, item):
    """
    Return item slice from panel and delete from panel

    Delegates to ``NDFrame.pop``.

    Parameters
    ----------
    item : object
        Must be contained in panel's items

    Returns
    -------
    y : DataFrame
    """
    popped = NDFrame.pop(self, item)
    return popped
def get_annualised_vol(price_series: NDFrame):
    """
    Return the annualised volatility of returns from a stream of prices.

    The standard deviation of the NATURAL LOG of the period-on-period price
    ratios is multiplied by the annualisation factor that
    ``TSeriesHelper._get_annualisation_factor`` derives from
    ``price_series.index``.

    :param price_series - a pandas series or dataframe of prices

    Notes
    --------
    https://en.wikipedia.org/wiki/Volatility_(finance)
    """
    price_ratios = price_series / price_series.shift(1)
    log_returns = np.log(price_ratios)
    annualising_scaling = TSeriesHelper._get_annualisation_factor(price_series.index)
    period_vol = np.std(log_returns)
    return period_vol * annualising_scaling
def __setitem__(self, key, value):
    """
    Store ``value`` as item ``key`` of this 3-dimensional container.

    ``value`` may be a single-item ``LongPanel`` (converted to wide format
    first), a ``DataFrame`` (reindexed to this object's major and minor
    axes), an ``ndarray`` of shape ``self.shape[1:]``, or a scalar that is
    broadcast to that shape.

    Raises
    ------
    ValueError
        If a ``LongPanel`` does not have exactly one item.
    AssertionError
        If an ``ndarray`` has the wrong shape.
    TypeError
        If ``value`` is none of the supported types.
    """
    _, N, K = self.shape

    # XXX
    if isinstance(value, LongPanel):
        if len(value.items) != 1:
            raise ValueError('Input panel must have only one item!')

        # from here on the value is handled by the DataFrame branch below
        value = value.to_wide()[value.items[0]]

    if isinstance(value, DataFrame):
        value = value.reindex(index=self.major_axis,
                              columns=self.minor_axis)
        mat = value.values
    elif isinstance(value, np.ndarray):
        # explicit raise (same exception type as the former ``assert``) so
        # the check is not stripped when running under ``python -O``
        if value.shape != (N, K):
            raise AssertionError('Shape of values must be (%d, %d), not %s'
                                 % (N, K, value.shape))
        mat = np.asarray(value)
    elif np.isscalar(value):
        dtype = _infer_dtype(value)
        mat = np.empty((N, K), dtype=dtype)
        mat.fill(value)
    else:
        # previously fell through and failed on the unbound name ``mat``
        raise TypeError('Cannot set item of type: %s' % str(type(value)))

    mat = mat.reshape((1, N, K))
    NDFrame._set_item(self, key, mat)
class TestNDFrame(unittest.TestCase):
    """Basic checks of ``NDFrame`` construction and casting."""

    def setUp(self):
        tdf = t.makeTimeDataFrame()
        self.ndf = NDFrame(tdf._data)

    def test_constructor(self):
        # with cast
        ndf = NDFrame(self.ndf._data, dtype=np.int64)
        # ``assert_`` is a deprecated alias that Python 3.12 removed
        self.assertEqual(ndf.values.dtype, np.int64)

    def test_ndim(self):
        # ``assertEquals`` is a deprecated alias that Python 3.12 removed
        self.assertEqual(self.ndf.ndim, 2)

    def test_astype(self):
        casted = self.ndf.astype(int)
        # ``int`` maps to the platform default integer (``np.int_``), which
        # is not 64-bit everywhere
        self.assertEqual(casted.values.dtype, np.int_)
def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
    """
    Compute the mean via ``NDFrame.mean`` and wrap the result.

    :param axis: forwarded unchanged to ``NDFrame.mean``
    :param skipna: forwarded unchanged to ``NDFrame.mean``
    :param level: forwarded unchanged to ``NDFrame.mean``
    :param numeric_only: forwarded unchanged to ``NDFrame.mean``
    :param kwargs: forwarded unchanged to ``NDFrame.mean``
    :return: OneSeries
    """
    raw_mean = NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs)
    return OneSeries(raw_mean)
def final_matching(self, gsp_results: NDFrame) -> DataFrame:
    """
    Assign a label to every column of ``gsp_results`` and return it.

    Two passes fill the new column names:

    1. Series with more than one auto-correlation coefficient are ranked by
       ``std / mean`` of those coefficients (ascending) and paired, in that
       order, with the labels in ``self.always_on``.
    2. The remaining columns are assigned from the transposed
       ``self.pairing_table``: rows are visited in descending order of
       ``sum(row) / self.compute_entropy(row)`` and each takes the unmasked
       column with the largest value, which is then masked.

    ``gsp_results.columns`` is overwritten in place and the sorted consensus
    list is stored on ``self.past_consensus``.
    """
    n_columns = len(gsp_results.columns)
    final_columns = [''] * n_columns

    period_scores = []
    for position, coeffs in enumerate(self.auto_coreffs):
        avg = np.average(coeffs)
        std = np.std(coeffs)
        # The std of a single value is 0, so entries with at most one
        # coefficient carry no information and are skipped.
        if len(coeffs) <= 1:
            continue
        period_scores.append((position, std / avg))
    period_scores.sort(key=lambda pair: pair[1])

    # Pass 1: always-on appliances, lowest relative spread first
    used_series = []
    for (index, _score), label in zip(period_scores, self.always_on):
        final_columns[index] = label
        used_series.append(index)

    # Pass 2: match the remaining appliances
    remaining_power_indices = set(range(n_columns)) - set(used_series)
    pt = np.transpose(self.pairing_table)
    mask = [i not in remaining_power_indices for i in range(len(pt[0]))]

    consensus = []
    for label_index, pairings in enumerate(pt):
        weight = sum(pairings) / self.compute_entropy(pairings)
        consensus.append(
            (label_index, pairings, weight, Matcher.get_ordering(pairings)))
    consensus.sort(key=lambda entry: -entry[2])
    self.past_consensus = consensus

    for label_index, pairings, _weight, _ordering in consensus:
        masked = ma.masked_array(pairings, mask)
        choice = masked.argmax()
        # a chosen column is no longer available to later labels
        mask[choice] = True
        final_columns[choice] = self.labels[label_index]

    gsp_results.columns = final_columns
    return gsp_results
def get_iterator(self, data: NDFrame, axis: int = 0):
    """
    Groupby iterator

    Walks ``self.bins`` / ``self.binlabels`` in parallel, slicing ``data``
    along ``axis`` from the previous bin edge to the current one.  Bins
    labelled ``NaT`` are skipped.  Anything left after the last edge is
    yielded under the last bin label.

    Returns
    -------
    Generator yielding sequence of (name, subsetted object)
    for each group
    """
    def take(lo, hi):
        return data._slice(slice(lo, hi), axis=axis)

    length = len(data.axes[axis])

    cursor = 0
    for stop, label in zip(self.bins, self.binlabels):
        if label is not NaT:
            yield label, take(cursor, stop)
        cursor = stop

    if cursor < length:
        yield self.binlabels[-1], take(cursor, None)
def _get_downside_deviation(asset_returns: NDFrame, threshold: int = 1):
    """
    Return the downside deviation of ``asset_returns`` relative to
    ``threshold``.

    Returns above the threshold are replaced by 0 before squaring, so only
    shortfalls contribute; the sum of squares is divided by the total number
    of observations and the square root is returned.

    :param asset_returns - a pandas series or data frame of returns in the
        form (1+r)
    :param threshold - the threshold below which returns should be included
        in the calculation. Default is a zero return (ie a threshold of 1).
    """
    relative = asset_returns.subtract(threshold, axis=0)
    shortfall = relative.where(relative < 0, 0)
    squared = np.power(shortfall, 2)
    mean_square = np.sum(squared) / len(shortfall)
    return np.sqrt(mean_square)
def _get_sortino_ratio(price_series: NDFrame, benchmark_series: Series, scale_to_annualise: bool):
    """
    Return the Sortino ratio.

    Period-on-period price ratios are computed for both inputs.  The ratio is
    the mean of their difference divided by the downside deviation of the
    asset ratios measured against the benchmark ratios.

    :param price_series - a pandas series or data frame of prices
    :param benchmark_series - a pandas series representing the total
        return/price series of the chosen benchmark
    :param scale_to_annualise - when true, the result is multiplied by the
        annualisation factor derived from ``price_series.index``

    Notes
    ------
    https://en.wikipedia.org/wiki/Sortino_ratio
    """
    asset_growth = (price_series / price_series.shift(1)).dropna()
    benchmark_growth = (benchmark_series / benchmark_series.shift(1)).dropna()

    excess = asset_growth.subtract(benchmark_growth, axis=0)
    avg_excess = excess.mean()
    vol = TSeriesHelper._get_downside_deviation(asset_growth, benchmark_growth)

    if scale_to_annualise:
        annualising_scaling = TSeriesHelper._get_annualisation_factor(price_series.index)
    else:
        annualising_scaling = 1

    return avg_excess * annualising_scaling / vol
def setUp(self):
    """Build the ``NDFrame`` fixture from a time-indexed DataFrame's data."""
    source_frame = t.makeTimeDataFrame()
    self.ndf = NDFrame(source_frame._data)
def __init__(self, data=None, index=None, columns=None, default_kind=None,
             default_fill_value=None, dtype=None, copy=False):
    """
    Initialise a sparse frame from one of several input types.

    Parameters
    ----------
    data : SparseDataFrame, SparseSeries, SparseArray, dict, ndarray, list,
        DataFrame, BlockManager or None
    index, columns : axis labels; taken from ``data`` when it is a sparse
        structure and they are not given
    default_kind : sparse kind, defaults to ``data``'s kind or ``'block'``
    default_fill_value : defaults to ``data``'s fill value or ``np.nan``
    dtype : if given, the resulting manager is cast to it
    copy : forwarded to ``_init_mgr`` for SparseDataFrame / BlockManager
        input only

    Raises
    ------
    Exception
        If a sparse series/array is passed without a name or ``columns``.
    TypeError
        If ``data`` is of an unsupported type.
    """
    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, 'name'):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        # from here on the single series is handled by the dict branch
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = 'block'

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if isinstance(data, dict):
        mgr = self._init_dict(data, index, columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = {}

        if index is None:
            index = Index([])
        else:
            index = _ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            # one all-NaN sparse column per requested label
            for c in columns:
                data[c] = SparseArray(np.nan, index=index,
                                      kind=self._default_kind,
                                      fill_value=self._default_fill_value)
        mgr = dict_to_manager(data, columns, index)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    else:
        # previously fell through and failed on the unbound name ``mgr``
        raise TypeError('Cannot initialize from data of type: %s'
                        % str(type(data)))

    NDFrame.__init__(self, mgr)
def test_ndframe_indexing_raises(idxr, error, error_message):
    """Indexing an ``NDFrame`` built from a 3-d array raises ``error``."""
    # GH 25567
    values = np.random.randint(5, size=(2, 2, 2))
    frame = NDFrame(values)
    with pytest.raises(error, match=error_message):
        # the indexer is built inside the block: that step may raise too
        idxr(frame)[0]
def __init__(self, data=None, index=None, columns=None, default_kind=None,
             default_fill_value=None, dtype=None, copy=False):
    """
    Initialise a sparse frame from one of several input types.

    Parameters
    ----------
    data : SparseDataFrame, SparseSeries, SparseArray, dict, ndarray, list,
        DataFrame, BlockManager or None
    index, columns : axis labels; taken from ``data`` when it is a sparse
        structure and they are not given
    default_kind : sparse kind, defaults to ``data``'s kind or ``'block'``
    default_fill_value : defaults to ``data``'s fill value or ``np.nan``
    dtype : if given, the resulting manager is cast to it
    copy : forwarded to ``_init_mgr`` for SparseDataFrame / BlockManager
        input only

    Raises
    ------
    Exception
        If a sparse series/array is passed without a name or ``columns``.
    TypeError
        If ``data`` is of an unsupported type.
    """
    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, 'name'):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        # wrap the single series so that the dict branch below handles it
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = 'block'

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if isinstance(data, dict):
        mgr = self._init_dict(data, index, columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = {}

        if index is None:
            index = Index([])
        else:
            index = _ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            # create an all-NaN sparse column for each requested label
            for c in columns:
                data[c] = SparseArray(np.nan, index=index,
                                      kind=self._default_kind,
                                      fill_value=self._default_fill_value)
        mgr = dict_to_manager(data, columns, index)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    else:
        # an unsupported type used to surface as an UnboundLocalError on
        # ``mgr``; report the actual problem instead
        raise TypeError('Cannot initialize from data of type: %s'
                        % str(type(data)))

    NDFrame.__init__(self, mgr)
def _get_grouper(
    obj: NDFrame,
    key=None,
    axis=0,
    level=None,
    sort=True,
    observed=False,
    mutated=False,
    validate=True,
):
    """
    create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values

    If validate, then check for key/level overlaps

    Returns
    -------
    tuple of (grouper, exclusions, obj)
        ``exclusions`` collects the names/keys of groupers that were found
        in ``obj`` (a list, or a one-element set when ``key`` is a Grouper
        with a key).  ``obj`` is the input object, or the object handed back
        by ``key._get_grouper`` when ``key`` is a Grouper.

    Raises
    ------
    ValueError
        For an invalid ``level``, when no group keys are passed for a
        non-empty ``obj``, or when a categorical grouper's length differs
        from the axis length.
    KeyError
        When a label-like key is neither in ``obj`` nor a level reference.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError(
                        "multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj.index.name != level:
                    raise ValueError(
                        "level name {} is not the name of the index".format(
                            level))
            elif level > 0 or level < -1:
                raise ValueError(
                    "level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, {key.key}, obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    # In the future, a tuple key will always mean an actual key,
    # not an iterable of keys. In the meantime, we attempt to provide
    # a warning. We can assume that the user wanted a list of keys when
    # the key is not in the index. We just have to be careful with
    # unhashable elements of `key`. Any unhashable elements implies that
    # they wanted a list of keys.
    # https://github.com/pandas-dev/pandas/issues/18314
    is_tuple = isinstance(key, tuple)
    all_hashable = is_tuple and is_hashable(key)

    if is_tuple:
        if (all_hashable and key not in obj
                and set(key).issubset(obj)) or not all_hashable:
            # column names ('a', 'b') -> ['a', 'b']
            # arrays like (a, b) -> [a, b]
            msg = ("Interpreting tuple 'by' as a list of keys, rather than "
                   "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
                   "the future, a tuple will always mean a single key.")
            warnings.warn(msg, FutureWarning, stacklevel=5)
            key = list(key)

    # normalise ``key`` into a list of keys
    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys)

    # is this an index replacement?
    if (not any_callable and not any_arraylike and not any_groupers
            and match_axis_length and level is None):
        # NOTE(review): `all_in_columns_index` is only assigned for DataFrame
        # and Series inputs; any other `obj` type would fail below.
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(g in obj.columns or g in obj.index.names
                                       for g in keys)
        elif isinstance(obj, Series):
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            # the list is treated as a single array-like key
            keys = [com.asarray_tuplesafe(keys)]

    # pair every key with a level
    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key):
        if not _is_label_like(key):
            items = obj._data.items
            try:
                items.get_loc(key)
            except (KeyError, TypeError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError):
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                ("Length of grouper ({len_gpr}) and axis ({len_axis})"
                 " must be same length".format(len_gpr=len(gpr),
                                               len_axis=obj.shape[axis])))

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (Grouping(
            group_axis,
            gpr,
            obj=obj,
            name=name,
            level=level,
            sort=sort,
            observed=observed,
            in_axis=in_axis,
        ) if not isinstance(gpr, Grouping) else gpr)

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        # empty object: fall back to a single empty Grouping
        groupings.append(
            Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj
def test_constructor(self):
    """Constructing with ``dtype`` casts the underlying values."""
    # with cast
    ndf = NDFrame(self.ndf._data, dtype=np.int64)
    # assertEqual replaces the ``assert_`` alias (removed in Python 3.12)
    # and reports both dtypes on failure
    self.assertEqual(ndf.values.dtype, np.int64)
def _set_grouper(self, obj: NDFrame, sort: bool = False):
    """
    given an object and the specifications, setup the internal grouper
    for this particular specification

    The grouping index is taken from ``self.key`` (a column of ``obj`` or a
    previously stored grouper), or otherwise from the axis ``self.axis`` of
    ``obj``, optionally narrowed to ``self.level``.  When sorting is
    requested and the index is not monotonic, both the index and ``obj`` are
    reordered with a stable sort.  The (possibly reordered) object is stored
    on ``self.obj`` and the index on ``self._gpr_index``.

    Parameters
    ----------
    obj : Series or DataFrame
    sort : bool, default False
        whether the resulting grouper should be sorted

    Returns
    -------
    The index stored on ``self._gpr_index``.

    Raises
    ------
    ValueError
        If both ``self.key`` and ``self.level`` are set, or if the level is
        not valid for a non-MultiIndex axis.
    KeyError
        If ``self.key`` is not found in ``obj._info_axis``.
    """
    assert obj is not None

    if self.key is not None and self.level is not None:
        raise ValueError(
            "The Grouper cannot specify both a key and a level!")

    # Keep self.grouper value before overriding
    if self._grouper is None:
        # TODO: What are we assuming about subsequent calls?
        self._grouper = self._gpr_index
        self._indexer = self.indexer

    # the key must be a valid info item
    if self.key is not None:
        key = self.key
        # The 'on' is already defined
        if getattr(self._gpr_index, "name", None) == key and isinstance(
                obj, Series):
            # Sometimes self._grouper will have been resorted while
            # obj has not. In this case there is a mismatch when we
            # call self._grouper.take(obj.index) so we need to undo the sorting
            # before we call _grouper.take.
            assert self._grouper is not None
            if self._indexer is not None:
                # argsort of the stored indexer gives its inverse ordering
                reverse_indexer = self._indexer.argsort()
                unsorted_ax = self._grouper.take(reverse_indexer)
                ax = unsorted_ax.take(obj.index)
            else:
                ax = self._grouper.take(obj.index)
        else:
            if key not in obj._info_axis:
                raise KeyError(f"The grouper name {key} is not found")
            ax = Index(obj[key], name=key)

    else:
        ax = obj._get_axis(self.axis)
        if self.level is not None:
            level = self.level

            # if a level is given it must be a mi level or
            # equivalent to the axis name
            if isinstance(ax, MultiIndex):
                level = ax._get_level_number(level)
                ax = Index(ax._get_level_values(level), name=ax.names[level])

            else:
                if level not in (0, ax.name):
                    raise ValueError(f"The level {level} is not valid")

    # possibly sort
    if (self.sort or sort) and not ax.is_monotonic:
        # use stable sort to support first, last, nth
        # TODO: why does putting na_position="first" fix datetimelike cases?
        indexer = self.indexer = ax.array.argsort(kind="mergesort",
                                                  na_position="first")
        ax = ax.take(indexer)
        obj = obj.take(indexer, axis=self.axis)

    # error: Incompatible types in assignment (expression has type
    # "NDFrameT", variable has type "None")
    self.obj = obj  # type: ignore[assignment]
    self._gpr_index = ax
    return self._gpr_index