def _wrap_aggregated_output(self, output, mask, comp_ids):
    """
    Assemble per-label aggregation results into a DataFrame

    Parameters
    ----------
    output : dict-like
        Aggregated values keyed by label along the aggregated axis
    mask, comp_ids
        Passed through unchanged to ``self._get_group_levels`` (when
        ``self.as_index`` is False) or to ``self._get_multi_index``
        (otherwise) to obtain the group labels of the result

    Returns
    -------
    wrapped : DataFrame
        Transposed before being returned when ``self.axis == 1``
    """
    agg_axis = 0 if self.axis == 1 else 1
    agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

    if len(output) == len(agg_labels):
        # every label produced a result: reuse the original labels as-is
        output_keys = agg_labels
    else:
        # Only some labels were aggregated. Ordering them is best-effort:
        # keys that cannot be compared with each other keep the iteration
        # order of `output` instead of aborting the aggregation.
        try:
            output_keys = sorted(output)
        except TypeError:
            output_keys = list(output)

        if isinstance(agg_labels, MultiIndex):
            output_keys = MultiIndex.from_tuples(output_keys,
                                                 names=agg_labels.names)

    if not self.as_index:
        result = DataFrame(output, columns=output_keys)
        group_levels = self._get_group_levels(mask, comp_ids)
        # the group labels become the leading ordinary columns
        for i, (name, labels) in enumerate(group_levels):
            result.insert(i, name, labels)
        result = result.consolidate()
    else:
        index = self._get_multi_index(mask, comp_ids)
        result = DataFrame(output, index=index, columns=output_keys)

    if self.axis == 1:
        result = result.T

    return result
def aggregate(self, arg, *args, **kwargs):
    """
    Aggregate using input function or dict of {column -> function}

    Parameters
    ----------
    arg : function, string, list or dict
        Function to use for aggregating groups. If a function, must either
        work when passed a DataFrame or when passed to DataFrame.apply. If
        a string, the method of this object with that name is called with
        ``*args`` and ``**kwargs``. If a list, it is handed to
        ``_aggregate_multiple_funcs``. If pass a dict, the keys must be
        DataFrame column names; when a single column is selected
        (``self._column`` is not None) the keys name the output columns
        and every function is applied to that one column
    *args, **kwargs
        Forwarded to the aggregation invoked for a string or function
        `arg`

    Returns
    -------
    aggregated : DataFrame

    Raises
    ------
    ValueError
        If a dict is passed while ``self.axis`` is not 0
    """
    if isinstance(arg, basestring):
        # forward the extra arguments instead of silently dropping them
        return getattr(self, arg)(*args, **kwargs)

    if isinstance(arg, dict):
        if self.axis != 0:  # pragma: no cover
            raise ValueError('Can only pass dict with axis=0')

        obj = self._obj_with_exclusions
        result = {}
        if self._column is not None:
            series_obj = obj[self._column]
            for fname, func in arg.iteritems():
                colg = SeriesGroupBy(series_obj, column=self._column,
                                     grouper=self.grouper)
                result[fname] = colg.aggregate(func)
        else:
            for col, func in arg.iteritems():
                colg = SeriesGroupBy(obj[col], column=col,
                                     grouper=self.grouper)
                result[col] = colg.aggregate(func)

        result = DataFrame(result)
    elif isinstance(arg, list):
        return self._aggregate_multiple_funcs(arg)
    else:
        if len(self.grouper.groupings) > 1:
            return self._python_agg_general(arg, *args, **kwargs)
        result = self._aggregate_generic(arg, *args, **kwargs)

    if not self.as_index:
        # move the group keys out of the index into leading columns and
        # replace the index with plain positions
        if isinstance(result.index, MultiIndex):
            zipped = zip(result.index.levels, result.index.labels,
                         result.index.names)
            for i, (lev, lab, name) in enumerate(zipped):
                result.insert(i, name, lev.values.take(lab))
            result = result.consolidate()
        else:
            values = result.index.values
            name = self.grouper.groupings[0].name
            result.insert(0, name, values)
        result.index = np.arange(len(result))

    return result
def _cython_agg_general(self, how):
    """
    Aggregate the numeric blocks of the grouped object via
    ``self.grouper.aggregate`` and assemble the pieces into a DataFrame

    Parameters
    ----------
    how
        Passed through to ``self.grouper.aggregate`` for every block

    Returns
    -------
    aggregated : DataFrame
        Transposed before being returned when ``self.axis == 1``

    Raises
    ------
    GroupByError
        If no block has a numeric or boolean dtype
    """
    frame = self._obj_with_exclusions
    if self.axis == 1:
        frame = frame.T

    numeric_kinds = (np.number, np.bool_)
    agg_blocks = []
    for blk in frame._data.blocks:
        block_values = blk.values.T
        # only numeric / boolean blocks take part in the aggregation
        if issubclass(block_values.dtype.type, numeric_kinds):
            block_values = com._ensure_float64(block_values)
            aggregated, counts = self.grouper.aggregate(block_values, how)

            # keep only the rows whose count is positive
            observed = counts > 0
            if len(observed) > 0:
                aggregated = aggregated[observed]

            agg_blocks.append(make_block(aggregated.T, blk.items,
                                         blk.ref_items))

    if not agg_blocks:
        raise GroupByError('No numeric types to aggregate')

    agg_labels = self._obj_with_exclusions._get_axis(
        0 if self.axis == 1 else 1)

    n_items = sum(len(b.items) for b in agg_blocks)
    if n_items == len(agg_labels):
        output_keys = agg_labels
    else:
        # some labels were skipped: keep the surviving ones in the
        # order they have in agg_labels
        kept = [item for b in agg_blocks for item in b.items]
        output_keys = agg_labels[agg_labels.isin(kept)]

    if self.as_index:
        index = self.grouper.result_index
        result = DataFrame(BlockManager(agg_blocks, [output_keys, index]))
    else:
        positions = np.arange(agg_blocks[0].values.shape[1])
        result = DataFrame(BlockManager(agg_blocks,
                                        [output_keys, positions]))

        # the group labels become the leading ordinary columns
        levels = self.grouper.get_group_levels()
        for pos, pair in enumerate(zip(self.grouper.names, levels)):
            name, labels = pair
            result.insert(pos, name, labels)
        result = result.consolidate()

    return result.T if self.axis == 1 else result
def aggregate(self, arg, *args, **kwargs):
    """
    Aggregate using input function or dict of {column -> function}

    Parameters
    ----------
    arg : function, string, list or dict
        Function to use for aggregating groups. If a function, must either
        work when passed a DataFrame or when passed to DataFrame.apply. If
        a string, the method of this object with that name is called with
        ``*args`` and ``**kwargs``. If a list, it is handed to
        ``_aggregate_multiple_funcs``. If pass a dict, the keys must be
        DataFrame column names
    *args, **kwargs
        Forwarded to the aggregation invoked for a string or function
        `arg`

    Returns
    -------
    aggregated : DataFrame

    Raises
    ------
    ValueError
        If a dict is passed while ``self.axis`` is not 0
    """
    if isinstance(arg, basestring):
        # forward the extra arguments instead of silently dropping them
        return getattr(self, arg)(*args, **kwargs)

    if isinstance(arg, dict):
        if self.axis != 0:  # pragma: no cover
            raise ValueError('Can only pass dict with axis=0')

        obj = self._obj_with_exclusions
        result = {}
        for col, func in arg.iteritems():
            colg = SeriesGroupBy(obj[col], column=col,
                                 groupings=self.groupings)
            result[col] = colg.agg(func)

        result = DataFrame(result)
    elif isinstance(arg, list):
        return self._aggregate_multiple_funcs(arg)
    else:
        if len(self.groupings) > 1:
            return self._python_agg_general(arg, *args, **kwargs)
        result = self._aggregate_generic(arg, *args, **kwargs)

    if not self.as_index:
        # move the group keys out of the index into leading columns and
        # replace the index with plain positions
        if isinstance(result.index, MultiIndex):
            zipped = zip(result.index.levels, result.index.labels,
                         result.index.names)
            for i, (lev, lab, name) in enumerate(zipped):
                result.insert(i, name, lev.values.take(lab))
            result = result.consolidate()
        else:
            values = result.index.values
            name = self.groupings[0].name
            result.insert(0, name, values)
        result.index = np.arange(len(result))

    return result
def aggregate(self, arg, *args, **kwargs):
    """
    Aggregate using input function or dict of {column -> function}

    Parameters
    ----------
    arg : function, string or dict
        Function to use for aggregating groups. If a function, must either
        work when passed a DataFrame or when passed to DataFrame.apply. If
        a string, the method of this object with that name is called with
        ``*args`` and ``**kwargs``. If pass a dict, the keys must be
        DataFrame column names
    *args, **kwargs
        Forwarded to the aggregation invoked for a string or function
        `arg`

    Returns
    -------
    aggregated : DataFrame

    Raises
    ------
    ValueError
        If a dict is passed while ``self.axis`` is not 0
    """
    if isinstance(arg, basestring):
        # forward the extra arguments instead of silently dropping them
        return getattr(self, arg)(*args, **kwargs)

    if isinstance(arg, dict):
        if self.axis != 0:  # pragma: no cover
            raise ValueError('Can only pass dict with axis=0')

        result = {}
        for col, func in arg.iteritems():
            result[col] = self[col].agg(func)
        result = DataFrame(result)
    else:
        if len(self.groupings) > 1:
            try:
                return self._python_agg_general(arg, *args, **kwargs)
            except Exception:
                # deliberate best-effort: when the general path fails for
                # any reason, retry through the item-by-item path
                return self._aggregate_item_by_item(arg, *args, **kwargs)
        result = self._aggregate_generic(arg, *args, **kwargs)

    if not self.as_index:
        # move the group keys out of the index into leading columns and
        # replace the index with plain positions
        if isinstance(result.index, MultiIndex):
            zipped = zip(result.index.levels, result.index.labels,
                         result.index.names)
            for i, (lev, lab, name) in enumerate(zipped):
                result.insert(i, name, lev.values.take(lab))
            result = result.consolidate()
        else:
            values = result.index.values
            name = self.groupings[0].name
            result.insert(0, name, values)
        result.index = np.arange(len(result))

    return result
def _cython_agg_general(self, how):
    """
    Aggregate the numeric blocks of the grouped object with
    ``cython_aggregate`` and assemble the pieces into a DataFrame

    Parameters
    ----------
    how
        Passed through to ``cython_aggregate`` as its ``how`` keyword

    Returns
    -------
    aggregated : DataFrame
        Transposed before being returned when ``self.axis == 1``

    Raises
    ------
    GroupByError
        If no block has a numeric or boolean dtype
    """
    comp_ids, obs_group_ids, max_group = self._group_info

    frame = self._obj_with_exclusions
    if self.axis == 1:
        frame = frame.T

    numeric_kinds = (np.number, np.bool_)
    agg_blocks = []
    for blk in frame._data.blocks:
        block_values = blk.values.T
        # only numeric / boolean blocks take part in the aggregation
        if issubclass(block_values.dtype.type, numeric_kinds):
            block_values = com._ensure_float64(block_values)
            aggregated, counts = cython_aggregate(block_values, comp_ids,
                                                  max_group, how=how)

            # keep only the rows whose count is positive; the mask of the
            # last aggregated block is reused below for the group labels
            mask = counts > 0
            if len(mask) > 0:
                aggregated = aggregated[mask]

            agg_blocks.append(make_block(aggregated.T, blk.items,
                                         blk.ref_items))

    if not agg_blocks:
        raise GroupByError('No numeric types to aggregate')

    agg_labels = self._obj_with_exclusions._get_axis(
        0 if self.axis == 1 else 1)

    n_items = sum(len(b.items) for b in agg_blocks)
    if n_items == len(agg_labels):
        output_keys = agg_labels
    else:
        # some labels were skipped: collect the surviving ones and sort
        # them when they are comparable
        output_keys = [item for b in agg_blocks for item in b.items]
        try:
            output_keys.sort()
        except TypeError:  # pragma: no cover
            pass

        if isinstance(agg_labels, MultiIndex):
            output_keys = MultiIndex.from_tuples(output_keys,
                                                 names=agg_labels.names)

    if self.as_index:
        index = self._get_multi_index(mask, obs_group_ids)
        result = DataFrame(BlockManager(agg_blocks, [output_keys, index]))
    else:
        positions = np.arange(agg_blocks[0].values.shape[1])
        result = DataFrame(BlockManager(agg_blocks,
                                        [output_keys, positions]))

        # the group labels become the leading ordinary columns
        levels = self._get_group_levels(mask, obs_group_ids)
        for pos, pair in enumerate(levels):
            name, labels = pair
            result.insert(pos, name, labels)
        result = result.consolidate()

    return result.T if self.axis == 1 else result