def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    # Optionally drop null values before counting, then group by this column.
    if dropna:
        df_dropna = self.to_dataframe()._spark_filter(self.notna())
    else:
        df_dropna = self.to_dataframe()
    df = df_dropna._spark_groupby(self).count()
    if sort:
        if ascending:
            df = df._spark_orderBy(F._spark_col('count'))
        else:
            df = df._spark_orderBy(F._spark_col('count')._spark_desc())

    if normalize:
        # Divide each count by the total number of counted rows.
        sum = df_dropna._spark_count()
        df = df._spark_withColumn(
            'count', F._spark_col('count') / F._spark_lit(sum))

    return _col(df.set_index([self.name]))
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    # Optionally drop null values before counting, then group by this column.
    if dropna:
        df_dropna = self._pandas_anchor._spark_filter(self.notna())
    else:
        df_dropna = self._pandas_anchor
    df = df_dropna._spark_groupby(self).count()
    if sort:
        if ascending:
            df = df._spark_orderBy(F._spark_col('count'))
        else:
            df = df._spark_orderBy(F._spark_col('count')._spark_desc())

    if normalize:
        # Divide each count by the total number of counted rows.
        sum = df_dropna._spark_count()
        df = df._spark_withColumn(
            'count', F._spark_col('count') / F._spark_lit(sum))

    # Use the grouped values as the index of the resulting Series, avoiding a
    # name clash when the column itself is called 'index'.
    index_name = 'index' if self.name != 'index' else 'level_0'
    df.columns = [index_name, self.name]
    df._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)])
    return _col(df)
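# Usage sketch (added, not from the source): both value_counts variants above aim to
# mirror pandas' Series.value_counts on a Spark-backed Series. The plain-pandas
# behaviour they target, on a made-up example Series:
import pandas as pd

data = pd.Series(['a', 'b', 'a', None, 'a'])
print(data.value_counts())                  # counts per value, sorted descending, nulls dropped
print(data.value_counts(normalize=True))    # counts divided by the number of non-null rows
print(data.value_counts(dropna=False))      # keep a row counting the null values
# value_counts(bins=...) is the one argument both Spark versions reject with NotImplementedError.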
def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
    if axis == 0 or axis == 'index':
        if subset is not None:
            if isinstance(subset, string_types):
                columns = [subset]
            else:
                columns = list(subset)
            invalids = [column for column in columns
                        if column not in self._metadata.column_fields]
            if len(invalids) > 0:
                raise KeyError(invalids)
        else:
            columns = list(self.columns)

        # Count, per row, how many of the considered columns are non-null.
        cnt = reduce(lambda x, y: x + y,
                     [F._spark_when(self[column].notna(), 1)._spark_otherwise(0)
                      for column in columns],
                     F._spark_lit(0))
        if thresh is not None:
            # Keep rows with at least `thresh` non-null values.
            pred = cnt >= F._spark_lit(int(thresh))
        elif how == 'any':
            # Keep rows where every considered column is non-null.
            pred = cnt == F._spark_lit(len(columns))
        elif how == 'all':
            # Keep rows with at least one non-null value.
            pred = cnt > F._spark_lit(0)
        else:
            if how is not None:
                raise ValueError('invalid how option: {h}'.format(h=how))
            else:
                raise TypeError('must specify how or thresh')

        df = self._spark_filter(pred)
        df._metadata = self._metadata.copy()
        if inplace:
            _reassign_jdf(self, df)
        else:
            return df
    else:
        raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")
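# Usage sketch (added, not from the source): the dropna above counts the non-null
# columns per row and keeps rows according to how/thresh, mirroring
# pandas.DataFrame.dropna with axis=0. The pandas behaviour it targets:
import numpy as np
import pandas as pd

pdf = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, np.nan, 6.0]})
print(pdf.dropna(how='any'))      # keep rows where every column is non-null
print(pdf.dropna(how='all'))      # keep rows with at least one non-null value
print(pdf.dropna(thresh=1))       # keep rows with at least `thresh` non-null values
print(pdf.dropna(subset=['a']))   # only consider column 'a' when counting nulls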
def __invert__(self):
    return anchor_wrap(self, self._spark_cast("boolean") == F._spark_lit(False))
def __invert__(self):
    return anchor_wrap(self, self.astype(bool) == F._spark_lit(False))
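# Usage sketch (added, not from the source): both __invert__ variants implement the
# unary `~` operator by casting to boolean and comparing against False. The pandas
# behaviour they approximate:
import pandas as pd

s = pd.Series([True, False, True])
print(~s)   # -> False, True, False; the Spark versions express this as `astype(bool) == lit(False)`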
def __getitem__(self, key):
    from pyspark.sql.functions import _spark_lit

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self.col)

    df = self.df
    if isinstance(rows_sel, Column):
        # Boolean mask: make sure the expression really is boolean, then filter.
        df_for_check_schema = self.df._spark_select(rows_sel)
        assert isinstance(df_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(df_for_check_schema), df_for_check_schema.schema.fields[0].dataType)
        df = df._spark_where(rows_sel)
    elif isinstance(rows_sel, slice):
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self.df._index_columns) == 0:
            raiseNotImplemented("Cannot use slice for Spark if no index provided.")
        elif len(self.df._index_columns) == 1:
            # Translate an inclusive start/stop slice into range predicates on the index.
            start = rows_sel.start
            stop = rows_sel.stop
            index_column = self.df._index_columns[0]
            index_data_type = index_column.schema[0].dataType
            cond = []
            if start is not None:
                cond.append(index_column >= _spark_lit(start)._spark_cast(index_data_type))
            if stop is not None:
                cond.append(index_column <= _spark_lit(stop)._spark_cast(index_data_type))
            if len(cond) > 0:
                df = df._spark_where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, string_types):
        raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
    else:
        # A list (or other iterable) of index values: filter with equality or isin.
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            df = df._spark_where(_spark_lit(False))
        elif len(self.df._index_columns) == 1:
            index_column = self.df._index_columns[0]
            index_data_type = index_column.schema[0].dataType
            if len(rows_sel) == 1:
                df = df._spark_where(
                    index_column == _spark_lit(rows_sel[0])._spark_cast(index_data_type))
            else:
                df = df._spark_where(index_column._spark_isin(
                    [_spark_lit(r)._spark_cast(index_data_type) for r in rows_sel]))
        else:
            raiseNotImplemented("Cannot select with MultiIndex with Spark.")

    # Column selection: default to all data columns, keeping the index columns around.
    if cols_sel is None:
        columns = [_make_col(c) for c in self.df._metadata.column_fields]
    elif isinstance(cols_sel, Column):
        columns = [cols_sel]
    else:
        columns = [_make_col(c) for c in cols_sel]
    try:
        df = df._spark_select(self.df._metadata.index_fields + columns)
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'.format(
            [col._jc.toString() for col in columns]))
    df._metadata = self.df._metadata.copy(
        column_fields=df._metadata.column_fields[-len(columns):])
    if cols_sel is not None and isinstance(cols_sel, Column):
        # A single Column selector returns a Series rather than a DataFrame.
        from .series import _col
        return _col(df)
    else:
        return df
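# Usage sketch (added, not from the source): the locator __getitem__ above supports a
# subset of pandas' .loc - boolean masks, start/stop slices over a single index, and
# lists of index values - and raises SparkPandasNotImplementedError for steps, scalar
# row labels, and MultiIndex. The pandas behaviour it targets:
import pandas as pd

pdf = pd.DataFrame({'x': [1, 2, 3, 4]}, index=[10, 20, 30, 40])
print(pdf.loc[pdf['x'] > 2])        # boolean mask over rows (a Column predicate in the Spark version)
print(pdf.loc[20:30])               # inclusive slice on the index (start/stop only, no step)
print(pdf.loc[[10, 40], ['x']])     # explicit lists of index values and column names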