def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        sum = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name],
                             index_info=[(index_name, None)])
    return _col(kdf)
def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf

    column_fields = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places, it should be in utils.
            # Special-case floating point types because Spark's count treats nan as a valid
            # value, whereas pandas' count doesn't include nan.
            if isinstance(spark_type, (DoubleType, FloatType)):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                column_fields.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                column_fields.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    metadata = Metadata(column_fields=column_fields,
                        index_info=[('__index_level_{}__'.format(i), s.name)
                                    for i, s in enumerate(groupkeys)])
    return DataFrame(sdf, metadata)
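# Hedged sketch, not part of the class above: it only illustrates why the float
# columns are wrapped in F.nanvl(col, F.lit(None)). Spark's count() includes NaN
# (a valid double value), while pandas' count() excludes it; turning NaN into null
# first makes the Spark aggregate match pandas semantics. The SparkSession and the
# column name 'x' are assumptions made for this demonstration.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([(1.0,), (float('nan'),)], ['x'])

demo.select(F.count('x')).show()                        # 2 -- NaN is counted
demo.select(F.count(F.nanvl('x', F.lit(None)))).show()  # 1 -- null is not counted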
def _init_from_spark(self, sdf, metadata=None, *args):
    self._sdf = sdf
    if metadata is None:
        self._metadata = Metadata(column_fields=self._sdf.schema.fieldNames())
    else:
        self._metadata = metadata
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        df_dropna = self._pandas_anchor._spark_filter(self.notna())
    else:
        df_dropna = self._pandas_anchor
    df = df_dropna._spark_groupby(self).count()
    if sort:
        if ascending:
            df = df._spark_orderBy(F._spark_col('count'))
        else:
            df = df._spark_orderBy(F._spark_col('count')._spark_desc())

    if normalize:
        sum = df_dropna._spark_count()
        df = df._spark_withColumn('count', F._spark_col('count') / F._spark_lit(sum))

    index_name = 'index' if self.name != 'index' else 'level_0'
    df.columns = [index_name, self.name]
    df._metadata = Metadata(column_fields=[self.name],
                            index_info=[(index_name, None)])
    return _col(df)
def to_dataframe(self):
    sdf = self._kdf._sdf.select([field for field, _ in self._index_info] + [self._scol])
    metadata = Metadata(column_fields=[sdf.schema[-1].name],
                        index_info=self._index_info)
    return DataFrame(sdf, metadata)
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64
    """
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        sum = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name],
                             index_info=[(index_name, None)])
    return _col(kdf)
def to_dataframe(self):
    sdf = self._kdf._sdf.select([field for field, _ in self._index_map] + [self._scol])
    metadata = Metadata(data_columns=[sdf.schema[-1].name],
                        index_map=self._index_map)
    return DataFrame(sdf, metadata)
def aggregate(self, func_or_funcs, *args, **kwargs):
    """Aggregate using one or more operations over the specified axis.

    Parameters
    ----------
    func_or_funcs : dict
        a dict mapping from column name (string) to aggregate functions (string).

    Returns
    -------
    Series or DataFrame
        The return can be:

        * Series : when DataFrame.agg is called with a single function
        * DataFrame : when DataFrame.agg is called with several functions

        Return Series or DataFrame.

    Notes
    -----
    `agg` is an alias for `aggregate`. Use the alias.

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
    ...                    'B': [1, 2, 3, 4],
    ...                    'C': [0.362, 0.227, 1.267, -0.562]})
    >>> df = df[['A', 'B', 'C']]
    >>> df
       A  B      C
    0  1  1  0.362
    1  1  2  0.227
    2  2  3  1.267
    3  2  4 -0.562

    Different aggregations per column

    >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
    >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
       B      C
    A
    1  1  0.589
    2  3  0.705
    """
    if not isinstance(func_or_funcs, dict) or \
            not all(isinstance(key, string_types) and isinstance(value, string_types)
                    for key, value in func_or_funcs.items()):
        raise ValueError("aggs must be a dict mapping from column name (string) to aggregate "
                         "functions (string).")

    sdf = self._kdf._sdf
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    reordered = [F.expr('{1}({0}) as {0}'.format(key, value))
                 for key, value in func_or_funcs.items()]
    sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
    metadata = Metadata(column_fields=[key for key, _ in func_or_funcs.items()],
                        index_info=[('__index_level_{}__'.format(i), s.name)
                                    for i, s in enumerate(groupkeys)])
    return DataFrame(sdf, metadata)
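# Hedged illustration of the strings `aggregate` feeds to F.expr above: for the
# mapping {'B': 'min', 'C': 'sum'} the template '{1}({0}) as {0}' produces one SQL
# aggregate expression per column, each aliased back to the input column name.
# Plain Python, no Spark needed; the mapping is the one from the docstring example.
for key, value in {'B': 'min', 'C': 'sum'}.items():
    print('{1}({0}) as {0}'.format(key, value))
# min(B) as B
# sum(C) as C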
def _metadata(self):
    if not hasattr(self, '_pandas_metadata') or self._pandas_metadata is None:
        self._pandas_metadata = Metadata(column_fields=self.schema.fieldNames())
    return self._pandas_metadata