示例#1
0
文件: groupby.py 项目: kfatyas/pandas
    def _wrap_aggregated_output(self, output, mask, comp_ids):
        """
        Assemble aggregated values into a DataFrame.

        Parameters
        ----------
        output : dict-like
            Aggregated values keyed by label on the aggregated axis; handed
            to the DataFrame constructor as-is.
        mask, comp_ids
            Forwarded unchanged to ``_get_group_levels`` (``as_index`` false)
            or ``_get_multi_index`` (``as_index`` true).

        Returns
        -------
        result : DataFrame
            Group keys are inserted as leading columns when ``as_index`` is
            false, otherwise they form the index. The frame is transposed
            when ``self.axis == 1``.
        """
        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        if len(output) == len(agg_labels):
            # Nothing was dropped: reuse the labels in their existing order.
            output_keys = agg_labels
        else:
            # Copy the keys first and sort in place *inside* the guard, so
            # keys that cannot be ordered fall back to the mapping's own
            # order instead of raising before the guard is reached.
            output_keys = list(output)
            try:
                output_keys.sort()
            except TypeError:  # pragma: no cover
                pass

            if isinstance(agg_labels, MultiIndex):
                output_keys = MultiIndex.from_tuples(output_keys,
                                                     names=agg_labels.names)

        if not self.as_index:
            result = DataFrame(output, columns=output_keys)
            group_levels = self._get_group_levels(mask, comp_ids)
            # Group keys become the leading columns, in grouping order.
            for i, (name, labels) in enumerate(group_levels):
                result.insert(i, name, labels)
            result = result.consolidate()
        else:
            index = self._get_multi_index(mask, comp_ids)
            result = DataFrame(output, index=index, columns=output_keys)

        if self.axis == 1:
            result = result.T

        return result
示例#2
0
    def aggregate(self, arg, *args, **kwargs):
        """
        Aggregate using a method name, function, list of functions or dict.

        Parameters
        ----------
        arg : basestring, function, list or dict
            * string: name of a method on this object, called with no
              arguments
            * list: handed to ``_aggregate_multiple_funcs``
            * dict: one aggregation per entry (only valid with axis=0); the
              keys are column names, or output names when a single column
              has been selected through ``_column``
            * anything else: treated as the aggregating function
        *args, **kwargs
            Forwarded to the function-based aggregation paths only.

        Returns
        -------
        aggregated : DataFrame
        """
        if isinstance(arg, basestring):
            return getattr(self, arg)()

        if isinstance(arg, list):
            return self._aggregate_multiple_funcs(arg)

        if isinstance(arg, dict):
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            frame = self._obj_with_exclusions
            pieces = {}

            if self._column is None:
                # One aggregation per named column.
                for key, func in arg.iteritems():
                    grouped = SeriesGroupBy(frame[key], column=key,
                                            grouper=self.grouper)
                    pieces[key] = grouped.aggregate(func)
            else:
                # A single column is selected: every entry aggregates that
                # same column and the key only names the output.
                selected = frame[self._column]
                for label, func in arg.iteritems():
                    grouped = SeriesGroupBy(selected, column=self._column,
                                            grouper=self.grouper)
                    pieces[label] = grouped.aggregate(func)

            result = DataFrame(pieces)
        elif len(self.grouper.groupings) > 1:
            return self._python_agg_general(arg, *args, **kwargs)
        else:
            result = self._aggregate_generic(arg, *args, **kwargs)

        if self.as_index:
            return result

        # as_index=False: move the group keys out of the index into leading
        # columns and replace the index with a plain integer range.
        idx = result.index
        if isinstance(idx, MultiIndex):
            triples = zip(idx.levels, idx.labels, idx.names)
            for pos, (level, codes, level_name) in enumerate(triples):
                result.insert(pos, level_name, level.values.take(codes))
            result = result.consolidate()
        else:
            key_values = idx.values
            key_name = self.grouper.groupings[0].name
            result.insert(0, key_name, key_values)
        result.index = np.arange(len(result))

        return result
示例#3
0
    def _cython_agg_general(self, how):
        """
        Aggregate every numeric/boolean block through ``self.grouper``.

        Parameters
        ----------
        how
            Passed through to ``self.grouper.aggregate`` for each block.

        Returns
        -------
        result : DataFrame
            Built from the aggregated blocks. Group keys are inserted as
            leading columns when ``as_index`` is false, otherwise
            ``self.grouper.result_index`` is used as the index. Transposed
            when ``self.axis == 1``.

        Raises
        ------
        GroupByError
            If no block has a numeric or boolean dtype.
        """
        source = self._obj_with_exclusions
        if self.axis == 1:
            source = source.T

        aggregated = []
        for blk in source._data.blocks:
            block_values = blk.values.T
            if issubclass(block_values.dtype.type, (np.number, np.bool_)):
                block_values = com._ensure_float64(block_values)
                agg_values, counts = self.grouper.aggregate(block_values, how)

                # Keep only the rows whose count is positive.
                observed = counts > 0
                if len(observed) > 0:
                    agg_values = agg_values[observed]
                aggregated.append(
                    make_block(agg_values.T, blk.items, blk.ref_items))

        if not aggregated:
            raise GroupByError('No numeric types to aggregate')

        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        n_items = sum(len(b.items) for b in aggregated)
        if n_items == len(agg_labels):
            output_keys = agg_labels
        else:
            # Some labels were skipped: keep the survivors in label order.
            surviving = [item for b in aggregated for item in b.items]
            output_keys = agg_labels[agg_labels.isin(surviving)]

        if self.as_index:
            index = self.grouper.result_index
            result = DataFrame(BlockManager(aggregated, [output_keys, index]))
        else:
            index = np.arange(aggregated[0].values.shape[1])
            result = DataFrame(BlockManager(aggregated, [output_keys, index]))

            levels = self.grouper.get_group_levels()
            named_levels = zip(self.grouper.names, levels)
            for pos, (key_name, key_labels) in enumerate(named_levels):
                result.insert(pos, key_name, key_labels)
            result = result.consolidate()

        if self.axis == 1:
            result = result.T

        return result
示例#4
0
    def _cython_agg_general(self, how):
        """
        Run the grouper's aggregation over each numeric/boolean block.

        Parameters
        ----------
        how
            Forwarded to ``self.grouper.aggregate``.

        Returns
        -------
        result : DataFrame
            With ``as_index`` true the index is
            ``self.grouper.result_index``; otherwise the group levels are
            inserted as the first columns over an integer range index. The
            frame is transposed when ``self.axis == 1``.

        Raises
        ------
        GroupByError
            When none of the blocks is numeric or boolean.
        """
        data = self._obj_with_exclusions
        if self.axis == 1:
            data = data.T

        accepted_types = (np.number, np.bool_)
        out_blocks = []

        for blk in data._data.blocks:
            vals = blk.values.T
            if not issubclass(vals.dtype.type, accepted_types):
                # Non-numeric blocks are left out of the result.
                continue

            reduced, counts = self.grouper.aggregate(
                com._ensure_float64(vals), how)

            nonempty = counts > 0
            if len(nonempty) > 0:
                # Drop the rows whose count is zero.
                reduced = reduced[nonempty]
            out_blocks.append(make_block(reduced.T, blk.items,
                                         blk.ref_items))

        if len(out_blocks) == 0:
            raise GroupByError('No numeric types to aggregate')

        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        if sum(len(b.items) for b in out_blocks) != len(agg_labels):
            kept_items = []
            for b in out_blocks:
                kept_items.extend(b.items)
            # Restrict the labels to those that were actually aggregated.
            output_keys = agg_labels[agg_labels.isin(kept_items)]
        else:
            output_keys = agg_labels

        if not self.as_index:
            row_index = np.arange(out_blocks[0].values.shape[1])
            manager = BlockManager(out_blocks, [output_keys, row_index])
            result = DataFrame(manager)

            level_values = self.grouper.get_group_levels()
            pairs = zip(self.grouper.names, level_values)

            for position, (name, labels) in enumerate(pairs):
                result.insert(position, name, labels)
            result = result.consolidate()
        else:
            row_index = self.grouper.result_index
            manager = BlockManager(out_blocks, [output_keys, row_index])
            result = DataFrame(manager)

        return result.T if self.axis == 1 else result
示例#5
0
文件: groupby.py 项目: kfatyas/pandas
    def aggregate(self, arg, *args, **kwargs):
        """
        Aggregate using a method name, function, list of functions or dict.

        Parameters
        ----------
        arg : basestring, function, list or dict
            * string: name of a method on this object, called with no
              arguments
            * list: handed to ``_aggregate_multiple_funcs``
            * dict of {column -> function}: each named column is aggregated
              with its own function (only valid with axis=0)
            * anything else: treated as the aggregating function
        *args, **kwargs
            Forwarded to the function-based aggregation paths only.

        Returns
        -------
        aggregated : DataFrame
        """
        if isinstance(arg, basestring):
            return getattr(self, arg)()

        if isinstance(arg, list):
            return self._aggregate_multiple_funcs(arg)

        if isinstance(arg, dict):
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            frame = self._obj_with_exclusions
            per_column = {}
            for column, func in arg.iteritems():
                grouped = SeriesGroupBy(frame[column], column=column,
                                        groupings=self.groupings)
                per_column[column] = grouped.agg(func)

            result = DataFrame(per_column)
        elif len(self.groupings) > 1:
            return self._python_agg_general(arg, *args, **kwargs)
        else:
            result = self._aggregate_generic(arg, *args, **kwargs)

        if self.as_index:
            return result

        # as_index=False: the group keys become leading columns and the
        # index is reset to a plain integer range.
        old_index = result.index
        if isinstance(old_index, MultiIndex):
            parts = zip(old_index.levels, old_index.labels, old_index.names)
            for pos, (level, codes, level_name) in enumerate(parts):
                result.insert(pos, level_name, level.values.take(codes))
            result = result.consolidate()
        else:
            key_values = old_index.values
            key_name = self.groupings[0].name
            result.insert(0, key_name, key_values)
        result.index = np.arange(len(result))

        return result
示例#6
0
文件: groupby.py 项目: hammer/pandas
    def aggregate(self, arg, *args, **kwargs):
        """
        Aggregate using a method name, function or dict.

        Parameters
        ----------
        arg : basestring, function or dict
            * string: name of a method on this object, called with no
              arguments
            * dict of {column -> function}: each ``self[column]`` is
              aggregated with its own function (only valid with axis=0)
            * anything else: treated as the aggregating function
        *args, **kwargs
            Forwarded to the function-based aggregation paths only.

        Returns
        -------
        aggregated : DataFrame
        """
        if isinstance(arg, basestring):
            return getattr(self, arg)()

        if isinstance(arg, dict):
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            per_column = {}
            for column, func in arg.iteritems():
                per_column[column] = self[column].agg(func)
            result = DataFrame(per_column)
        elif len(self.groupings) > 1:
            # With several groupings try the general path first; if it
            # raises, fall back to aggregating item by item.
            try:
                return self._python_agg_general(arg, *args, **kwargs)
            except Exception:
                return self._aggregate_item_by_item(arg, *args, **kwargs)
        else:
            result = self._aggregate_generic(arg, *args, **kwargs)

        if self.as_index:
            return result

        # as_index=False: the group keys become leading columns and the
        # index is reset to a plain integer range.
        current = result.index
        if isinstance(current, MultiIndex):
            pieces = zip(current.levels, current.labels, current.names)
            for offset, (level, codes, level_name) in enumerate(pieces):
                result.insert(offset, level_name, level.values.take(codes))
            result = result.consolidate()
        else:
            key_values = current.values
            key_name = self.groupings[0].name
            result.insert(0, key_name, key_values)
        result.index = np.arange(len(result))

        return result
示例#7
0
    def _cython_agg_general(self, how):
        """
        Aggregate every numeric/boolean block with ``cython_aggregate``.

        Parameters
        ----------
        how
            Passed as the ``how`` keyword to ``cython_aggregate``.

        Returns
        -------
        result : DataFrame
            Group keys are inserted as leading columns when ``as_index`` is
            false, otherwise the index comes from ``_get_multi_index``.
            Transposed when ``self.axis == 1``.

        Raises
        ------
        GroupByError
            If no block has a numeric or boolean dtype.
        """
        comp_ids, obs_group_ids, max_group = self._group_info

        source = self._obj_with_exclusions
        if self.axis == 1:
            source = source.T

        aggregated = []
        for blk in source._data.blocks:
            block_values = blk.values.T
            if issubclass(block_values.dtype.type, (np.number, np.bool_)):
                block_values = com._ensure_float64(block_values)
                agg_values, counts = cython_aggregate(block_values, comp_ids,
                                                      max_group, how=how)

                # ``mask`` is read again after the loop, so it holds the
                # value from the last block that was aggregated.
                mask = counts > 0
                if len(mask) > 0:
                    agg_values = agg_values[mask]
                aggregated.append(
                    make_block(agg_values.T, blk.items, blk.ref_items))

        if not aggregated:
            raise GroupByError('No numeric types to aggregate')

        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        n_items = sum(len(b.items) for b in aggregated)
        if n_items == len(agg_labels):
            output_keys = agg_labels
        else:
            output_keys = [item for b in aggregated for item in b.items]
            # Best-effort ordering, sorted in place; keys that cannot be
            # compared are left as the failed sort leaves them.
            try:
                output_keys.sort()
            except TypeError:  # pragma: no cover
                pass

            if isinstance(agg_labels, MultiIndex):
                output_keys = MultiIndex.from_tuples(output_keys,
                                                     names=agg_labels.names)

        if self.as_index:
            index = self._get_multi_index(mask, obs_group_ids)
            result = DataFrame(BlockManager(aggregated, [output_keys, index]))
        else:
            index = np.arange(aggregated[0].values.shape[1])
            result = DataFrame(BlockManager(aggregated, [output_keys, index]))
            levels = self._get_group_levels(mask, obs_group_ids)
            for pos, (key_name, key_labels) in enumerate(levels):
                result.insert(pos, key_name, key_labels)
            result = result.consolidate()

        if self.axis == 1:
            result = result.T

        return result