예제 #1
0
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """Count how often each distinct value occurs in this column.

        Parameters
        ----------
        normalize : boolean, default False
            If True, every count is divided by the number of rows that were
            counted (i.e. after the optional null filter), giving relative
            frequencies.
        sort : boolean, default True
            Order the result by count.
        ascending : boolean, default False
            When sorting, use ascending instead of descending order.
        bins : Not Yet Supported
            Any value other than None raises ``NotImplementedError``.
        dropna : boolean, default True
            Filter out rows where this column is null before counting.

        Returns
        -------
        The result of ``_col`` applied to a frame whose index column holds the
        distinct values and whose single data column, named after this
        column, holds the counts (presumably a Series -- confirm against
        ``_col``).

        Raises
        ------
        NotImplementedError
            If ``bins`` is not None.
        """
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        # Rows that take part in the count; nulls are removed on request.
        counted_rows = self._kdf._sdf
        if dropna:
            counted_rows = counted_rows.filter(self.notna()._scol)

        per_value = counted_rows.groupby(self._scol).count()

        if sort:
            count_col = F.col('count')
            ordering = count_col if ascending else count_col.desc()
            per_value = per_value.orderBy(ordering)

        if normalize:
            # Eagerly computes the row total used as the denominator.
            total = counted_rows.count()
            per_value = per_value.withColumn(
                'count', F.col('count') / F.lit(total))

        # Avoid a clash between the index column and a data column
        # that is itself called 'index'.
        index_name = 'level_0' if self.name == 'index' else 'index'
        kdf = DataFrame(per_value)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
예제 #2
0
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Apply one aggregate function per column and return a :class:`DataFrame`.

        Aggregate functions are given by name; built-in aggregations such as
        `avg`, `max`, `min`, `sum` and `count` can be used.

        :param func_or_funcs: a dict mapping from column name (string) to aggregate
                              function name (string).
        :param args: accepted but not used by this implementation.
        :param kwargs: accepted but not used by this implementation.
        :return: a :class:`DataFrame` with one column per dict entry, named after
                 the dict key and ordered like the dict.
        :raises ValueError: if ``func_or_funcs`` is not a dict of string keys and
                            string values.
        """
        # `and` short-circuits, so the per-item check only runs for real dicts.
        is_valid_spec = isinstance(func_or_funcs, dict) and all(
            isinstance(column, string_types) and isinstance(func, string_types)
            for column, func in func_or_funcs.items())
        if not is_valid_spec:
            raise ValueError(
                "aggs must be a dict mapping from column name (string) to aggregate "
                "functions (string).")

        aggregated = self._groupdata.agg(func_or_funcs)

        # The aggregated frame is expected to expose each result under the
        # name "<function>(<column>)"; select them in the dict's order and
        # rename them back to the plain column names.
        agg_column_names = []
        output_names = []
        for column, func in func_or_funcs.items():
            agg_column_names.append('%s(%s)' % (func, column))
            output_names.append(column)

        kdf = DataFrame(aggregated.select(agg_column_names))
        kdf.columns = output_names

        return kdf
예제 #3
0
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """
        Return a Series holding the number of occurrences of each unique value.

        By default the result is ordered by descending count, so the most
        frequent value comes first, and NA values are left out.

        Parameters
        ----------
        normalize : boolean, default False
            If True, return relative frequencies: each count is divided by
            the number of rows that were counted.
        sort : boolean, default True
            Sort the result by count.
        ascending : boolean, default False
            Sort in ascending order instead of descending order.
        bins : Not Yet Supported
            Passing anything other than None raises NotImplementedError.
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        Raises
        ------
        NotImplementedError
            If `bins` is not None.

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**
        With `dropna` set to `False` we can also see NaN index values.

        >>> df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64
        """
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        # Rows that take part in the count; nulls are removed on request.
        counted_rows = self._kdf._sdf
        if dropna:
            counted_rows = counted_rows.filter(self.notna()._scol)

        per_value = counted_rows.groupby(self._scol).count()

        if sort:
            count_col = F.col('count')
            ordering = count_col if ascending else count_col.desc()
            per_value = per_value.orderBy(ordering)

        if normalize:
            # Eagerly computes the row total used as the denominator.
            total = counted_rows.count()
            per_value = per_value.withColumn(
                'count', F.col('count') / F.lit(total))

        # Avoid a clash between the index column and a data column
        # that is itself called 'index'.
        index_name = 'level_0' if self.name == 'index' else 'index'
        kdf = DataFrame(per_value)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
예제 #4
0
파일: groupby.py 프로젝트: the7day/koalas
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Aggregate each listed column of the grouped data with one function.

        Parameters
        ----------
        func_or_funcs : dict
             a dict mapping from column name (string) to aggregate functions (string).
        *args, **kwargs
             Accepted but not used by this implementation.

        Returns
        -------
        DataFrame
            One column per entry of `func_or_funcs`, named after the dict key
            and ordered like the dict.

        Raises
        ------
        ValueError
            If `func_or_funcs` is not a dict whose keys and values are all
            strings.

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        Examples
        --------

        >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]})

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> df.groupby('A').agg({'B': 'min', 'C': 'sum'})
           B      C
        0  1  0.589
        1  3  0.705

        """
        # `and` short-circuits, so the per-item check only runs for real dicts.
        is_valid_spec = isinstance(func_or_funcs, dict) and all(
            isinstance(column, string_types) and isinstance(func, string_types)
            for column, func in func_or_funcs.items())
        if not is_valid_spec:
            raise ValueError(
                "aggs must be a dict mapping from column name (string) to aggregate "
                "functions (string).")

        source_sdf = self._kdf._sdf

        # Group by the key columns, each aliased to a positional
        # '__index_level_<i>__' name.
        key_columns = []
        for level, key_series in enumerate(self._groupkeys):
            alias = '__index_level_{}__'.format(level)
            key_columns.append(key_series._scol.alias(alias))
        grouped = source_sdf.groupby(*key_columns).agg(func_or_funcs)

        # The aggregated frame is expected to expose each result under the
        # name "<function>(<column>)"; select them in the dict's order and
        # rename them back to the plain column names. Only these aggregate
        # columns are selected, so the aliased key columns are not carried over.
        agg_column_names = []
        output_names = []
        for column, func in func_or_funcs.items():
            agg_column_names.append('%s(%s)' % (func, column))
            output_names.append(column)

        kdf = DataFrame(grouped.select(agg_column_names))
        kdf.columns = output_names

        return kdf