def __getitem__(self, key):
    from pyspark.sql.functions import lit
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Series):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self._kdf._index_columns) == 0:
            raiseNotImplemented("Cannot use slice for Spark if no index provided.")
        elif len(self._kdf._index_columns) == 1:
            start = rows_sel.start
            stop = rows_sel.stop
            index_column = self._kdf.index
            index_data_type = index_column.schema[0].dataType
            cond = []
            if start is not None:
                cond.append(index_column._scol >= lit(start).cast(index_data_type))
            if stop is not None:
                cond.append(index_column._scol <= lit(stop).cast(index_data_type))
            if len(cond) > 0:
                sdf = sdf.where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, str):
        raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
    else:
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            sdf = sdf.where(lit(False))
        elif len(self._kdf._index_columns) == 1:
            index_column = self._kdf.index
            index_data_type = index_column.schema[0].dataType
            if len(rows_sel) == 1:
                sdf = sdf.where(
                    index_column._scol == lit(rows_sel[0]).cast(index_data_type))
            else:
                sdf = sdf.where(index_column._scol.isin(
                    [lit(r).cast(index_data_type) for r in rows_sel]))
        else:
            raiseNotImplemented("Cannot select with MultiIndex with Spark.")

    if cols_sel is None:
        columns = [_make_col(c) for c in self._kdf._metadata.data_columns]
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
    else:
        columns = [_make_col(c) for c in cols_sel]
    try:
        kdf = DataFrame(sdf.select(self._kdf._metadata.index_columns + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))
    kdf._metadata = self._kdf._metadata.copy(
        data_columns=kdf._metadata.data_columns[-len(columns):])
    if cols_sel is not None and isinstance(cols_sel, spark.Column):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
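
# A minimal usage sketch of the `.loc` branches above, assuming a koalas
# DataFrame with a single-column index (the data below is hypothetical, and
# exact behavior may vary across koalas versions):
#
#     >>> import pandas as pd
#     >>> import databricks.koalas as ks
#     >>> kdf = ks.DataFrame(pd.DataFrame({'a': [1, 2, 3]}, index=[10, 20, 30]))
#     >>> kdf.loc[kdf.a > 1]        # boolean Series filter -> sdf.where(...)
#     >>> kdf.loc[10:20]            # index slice; both bounds are inclusive (>= and <=)
#     >>> kdf.loc[[10, 30], ['a']]  # label list -> isin(...); column list -> select(...)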
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64
    """
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        total = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name],
                             index_info=[(index_name, None)])
    return _col(kdf)
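
# For illustration, the Spark plan assembled above is roughly equivalent to the
# following plain PySpark (a sketch only; `df` and column 'x' are hypothetical,
# and the real method additionally rebuilds the koalas index metadata):
#
#     >>> from pyspark.sql import functions as F
#     >>> counts = (df.filter(F.col('x').isNotNull())   # dropna=True
#     ...             .groupby(F.col('x')).count()
#     ...             .orderBy(F.col('count').desc()))  # sort=True, ascending=False
#     >>> total = df.filter(F.col('x').isNotNull()).count()
#     >>> counts = counts.withColumn('count', F.col('count') / F.lit(total))  # normalize=True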
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.indexes import Index
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".iloc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Index):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif (rows_sel.start is not None) or (rows_sel.step is not None):
            raiseNotImplemented("Cannot use start or step with Spark.")
        elif not isinstance(rows_sel.stop, int):
            raise TypeError(
                "cannot do slice indexing with these indexers [{}] of {}"
                .format(rows_sel.stop, type(rows_sel.stop)))
        elif rows_sel.stop >= 0:
            sdf = sdf.limit(rows_sel.stop)
        else:
            sdf = sdf.limit(sdf.count() + rows_sel.stop)
    else:
        raiseNotImplemented(
            ".iloc requires numeric slice or conditional boolean Index, "
            "got {}".format(rows_sel))

    # make cols_sel a 1-tuple of string if a single string
    if isinstance(cols_sel, Series):
        columns = [_make_col(cols_sel)]
    elif isinstance(cols_sel, int):
        columns = [_make_col(self._kdf.columns[cols_sel])]
    elif cols_sel is None or cols_sel == slice(None):
        columns = [_make_col(col) for col in self._kdf.columns]
    elif isinstance(cols_sel, slice):
        if all(s is None or isinstance(s, int)
               for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
            columns = [_make_col(col) for col in self._kdf.columns[cols_sel]]
        else:
            not_none = cols_sel.start if cols_sel.start is not None \
                else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
            raise TypeError(
                'cannot do slice indexing with these indexers {} of {}'
                .format(not_none, type(not_none)))
    elif is_list_like(cols_sel):
        if all(isinstance(s, int) for s in cols_sel):
            columns = [_make_col(col) for col in self._kdf.columns[cols_sel]]
        else:
            raise TypeError('cannot perform reduce with flexible type')
    else:
        raise ValueError(
            "Location based indexing can only have [integer, integer slice, "
            "listlike of integers, boolean array] types, got {}".format(cols_sel))

    try:
        kdf = DataFrame(sdf.select(self._kdf._metadata.index_columns + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))
    kdf._metadata = self._kdf._metadata.copy(
        data_columns=kdf._metadata.data_columns[-len(columns):])
    if cols_sel is not None and isinstance(cols_sel, (Series, int)):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
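
# A minimal usage sketch of the `.iloc` branches above (hypothetical data;
# note that row selection only supports `:stop` slices, since arbitrary
# positional access has no cheap Spark equivalent):
#
#     >>> import pandas as pd
#     >>> import databricks.koalas as ks
#     >>> kdf = ks.DataFrame(pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}))
#     >>> kdf.iloc[:2]      # non-negative stop -> sdf.limit(2)
#     >>> kdf.iloc[:-1]     # negative stop -> sdf.limit(sdf.count() - 1)
#     >>> kdf.iloc[:, [0]]  # integer list -> select first column, returns a DataFrame
#     >>> kdf.iloc[:, 0]    # single int -> returns a Series via _col(kdf)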