def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """Return a Series containing counts of unique values.

    The resulting object is in descending order by default, so the first
    element is the most frequently-occurring value. Excludes NA values by
    default.

    Parameters
    ----------
    normalize : boolean, default False
        If True, return relative frequencies (each count divided by the
        total number of counted rows) instead of raw counts.
    sort : boolean, default True
        Sort by counts.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series
    """
    if bins is not None:
        raise NotImplementedError(
            "value_counts currently does not support bins")

    # Optionally drop rows where this column is NA before counting.
    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        # Denominator comes from the (possibly NA-filtered) frame so that
        # relative frequencies sum to 1 over the counted rows.
        # NOTE: renamed local from `sum` to avoid shadowing the builtin.
        total = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

    # Avoid a column-name clash when the Series itself is named 'index'.
    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name],
                             index_info=[(index_name, None)])
    return _col(kdf)
def aggregate(self, func_or_funcs, *args, **kwargs):
    """Compute aggregates and returns the result as a :class:`DataFrame`.

    The available aggregate functions can be built-in aggregation
    functions, such as `avg`, `max`, `min`, `sum`, `count`.

    Parameters
    ----------
    func_or_funcs : dict
        a dict mapping from column name (string) to aggregate functions
        (string).

    Returns
    -------
    DataFrame
        One column per entry in ``func_or_funcs``, named after the original
        column, holding the aggregated values.

    Raises
    ------
    ValueError
        If ``func_or_funcs`` is not a dict of string -> string.
    """
    if not isinstance(func_or_funcs, dict) or \
        not all(isinstance(key, string_types) and isinstance(value, string_types)
                for key, value in func_or_funcs.items()):
        raise ValueError(
            "aggs must be a dict mapping from column name (string) to aggregate "
            "functions (string).")

    sdf = self._groupdata.agg(func_or_funcs)

    # Spark names each aggregate column like 'min(B)'; select them back in
    # the order the caller specified, then restore the plain column names.
    reorder = ['%s(%s)' % (value, key) for key, value in func_or_funcs.items()]
    kdf = DataFrame(sdf.select(reorder))
    kdf.columns = list(func_or_funcs)
    return kdf
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**
    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64
    """
    if bins is not None:
        raise NotImplementedError(
            "value_counts currently does not support bins")

    # Optionally drop rows where this column is NA before counting.
    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        # Denominator comes from the (possibly NA-filtered) frame so that
        # relative frequencies sum to 1 over the counted rows.
        # NOTE: renamed local from `sum` to avoid shadowing the builtin.
        total = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

    # Avoid a column-name clash when the Series itself is named 'index'.
    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name],
                             index_info=[(index_name, None)])
    return _col(kdf)
def aggregate(self, func_or_funcs, *args, **kwargs):
    """Aggregate using one or more operations over the specified axis.

    Parameters
    ----------
    func : dict
        a dict mapping from column name (string) to aggregate functions
        (string).

    Returns
    -------
    Series or DataFrame
        The return can be:

        * Series : when DataFrame.agg is called with a single function
        * DataFrame : when DataFrame.agg is called with several functions

        Return Series or DataFrame.

    Notes
    -----
    `agg` is an alias for `aggregate`. Use the alias.

    Examples
    --------

    >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
    ...                    'B': [1, 2, 3, 4],
    ...                    'C': [0.362, 0.227, 1.267, -0.562]})

    >>> df
       A  B      C
    0  1  1  0.362
    1  1  2  0.227
    2  2  3  1.267
    3  2  4 -0.562

    Different aggregations per column

    >>> df.groupby('A').agg({'B': 'min', 'C': 'sum'})
       B      C
    0  1  0.589
    1  3  0.705
    """
    if not isinstance(func_or_funcs, dict) or \
        not all(isinstance(key, string_types) and isinstance(value, string_types)
                for key, value in func_or_funcs.items()):
        raise ValueError(
            "aggs must be a dict mapping from column name (string) to aggregate "
            "functions (string).")

    sdf = self._kdf._sdf
    groupkeys = self._groupkeys
    # Alias each grouping column to a positional index name so grouping keys
    # with duplicate or missing names don't collide.
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    gdf = sdf.groupby(*groupkey_cols).agg(func_or_funcs)

    # Spark names each aggregate column like 'min(B)'; select them back in
    # the order the caller specified, then restore the plain column names.
    reordered = ['%s(%s)' % (value, key) for key, value in func_or_funcs.items()]
    kdf = DataFrame(gdf.select(reordered))
    kdf.columns = list(func_or_funcs)
    return kdf