Example #1
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            df_dropna = self.to_dataframe()._spark_filter(self.notna())
        else:
            df_dropna = self.to_dataframe()
        df = df_dropna._spark_groupby(self).count()
        if sort:
            if ascending:
                df = df._spark_orderBy(F._spark_col('count'))
            else:
                df = df._spark_orderBy(F._spark_col('count')._spark_desc())

        if normalize:
            sum = df_dropna._spark_count()
            df = df._spark_withColumn(
                'count',
                F._spark_col('count') / F._spark_lit(sum))

        return _col(df.set_index([self.name]))
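
The implementation above mirrors pandas' Series.value_counts on top of Spark: optionally filter out nulls, group by the column and count, optionally order by the count, and optionally divide each count by the total row count to normalize. A minimal pandas-only sketch of the semantics it reproduces (plain pandas, not the Spark-backed classes above):

import pandas as pd

s = pd.Series([1, 1, 2, None, 2, 2])

# Nulls are dropped and counts are sorted in descending order by default,
# matching the dropna/sort/ascending defaults above.
print(s.value_counts())

# normalize=True corresponds to dividing each count by df_dropna.count().
print(s.value_counts(normalize=True))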
Example #2
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            df_dropna = self._pandas_anchor._spark_filter(self.notna())
        else:
            df_dropna = self._pandas_anchor
        df = df_dropna._spark_groupby(self).count()
        if sort:
            if ascending:
                df = df._spark_orderBy(F._spark_col('count'))
            else:
                df = df._spark_orderBy(F._spark_col('count')._spark_desc())

        if normalize:
            sum = df_dropna._spark_count()
            df = df._spark_withColumn(
                'count',
                F._spark_col('count') / F._spark_lit(sum))

        # Rename the result columns and rebuild the metadata so that the
        # grouped values become the index of the returned Series.
        index_name = 'index' if self.name != 'index' else 'level_0'
        df.columns = [index_name, self.name]
        df._metadata = Metadata(column_fields=[self.name],
                                index_info=[(index_name, None)])
        return _col(df)
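
This variant does the same grouping and counting, but anchors on the parent frame (_pandas_anchor) and then rebuilds the column names and Metadata by hand so the counted values end up as the index of the returned Series. A short pandas-only sketch of the remaining options, shown for comparison only:

import pandas as pd

s = pd.Series(['a', 'b', 'b', None], name='letters')

# dropna=False keeps the null group, i.e. the notna() filter above is skipped.
print(s.value_counts(dropna=False))

# ascending=True flips the direction of the orderBy on the count column.
print(s.value_counts(ascending=True))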
Example #3
    def dropna(self,
               axis=0,
               how='any',
               thresh=None,
               subset=None,
               inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [
                    column for column in columns
                    if column not in self._metadata.column_fields
                ]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            # cnt holds the number of non-null values in each row across the
            # selected columns.
            cnt = reduce(lambda x, y: x + y, [
                F._spark_when(self[column].notna(), 1)._spark_otherwise(0)
                for column in columns
            ], F._spark_lit(0))
            if thresh is not None:
                # Keep rows with at least `thresh` non-null values.
                pred = cnt >= F._spark_lit(int(thresh))
            elif how == 'any':
                # Keep rows where every selected column is non-null.
                pred = cnt == F._spark_lit(len(columns))
            elif how == 'all':
                # Keep rows with at least one non-null value.
                pred = cnt > F._spark_lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            df = self._spark_filter(pred)
            df._metadata = self._metadata.copy()
            if inplace:
                _reassign_jdf(self, df)
            else:
                return df

        else:
            raise NotImplementedError(
                "dropna currently only works for axis=0 or axis='index'")
Example #4
    def __invert__(self):
        return anchor_wrap(self,
                           self._spark_cast("boolean") == F._spark_lit(False))
Example #5
    def __invert__(self):
        return anchor_wrap(self, self.astype(bool) == F._spark_lit(False))
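
Both variants negate a column by casting it to boolean and comparing against a literal False instead of using Spark's bitwise NOT; the second simply routes the cast through the library's own astype(bool). The same trick written with the public PySpark API, as a standalone sketch (assumes a local SparkSession, not the wrapped _spark_ helpers above):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0,), (1,), (2,)], ['x'])

# Cast to boolean, then compare with lit(False): 0 becomes True, non-zero False.
df.select((F.col('x').cast('boolean') == F.lit(False)).alias('inverted')).show()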
Example #6
    def __getitem__(self, key):
        from pyspark.sql.functions import _spark_lit

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".loc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self.col)

        df = self.df
        # A boolean Column selects rows directly through a where clause.
        if isinstance(rows_sel, Column):
            df_for_check_schema = self.df._spark_select(rows_sel)
            assert isinstance(df_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(df_for_check_schema), df_for_check_schema.schema.fields[0].dataType)
            df = df._spark_where(rows_sel)
        # A slice is turned into range conditions on the (single) index column.
        elif isinstance(rows_sel, slice):
            if rows_sel.step is not None:
                raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                pass
            elif len(self.df._index_columns) == 0:
                raiseNotImplemented(
                    "Cannot use slice for Spark if no index provided.")
            elif len(self.df._index_columns) == 1:
                start = rows_sel.start
                stop = rows_sel.stop

                index_column = self.df._index_columns[0]
                index_data_type = index_column.schema[0].dataType
                cond = []
                if start is not None:
                    cond.append(index_column >= _spark_lit(start)._spark_cast(
                        index_data_type))
                if stop is not None:
                    cond.append(index_column <= _spark_lit(stop)._spark_cast(
                        index_data_type))

                if len(cond) > 0:
                    df = df._spark_where(reduce(lambda x, y: x & y, cond))
            else:
                raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, string_types):
            raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        else:
            # Anything else is treated as an iterable of index labels.
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                raiseNotImplemented(
                    "Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                df = df._spark_where(_spark_lit(False))
            elif len(self.df._index_columns) == 1:
                index_column = self.df._index_columns[0]
                index_data_type = index_column.schema[0].dataType
                if len(rows_sel) == 1:
                    df = df._spark_where(index_column == _spark_lit(
                        rows_sel[0])._spark_cast(index_data_type))
                else:
                    df = df._spark_where(
                        index_column._spark_isin([
                            _spark_lit(r)._spark_cast(index_data_type)
                            for r in rows_sel
                        ]))
            else:
                raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        if cols_sel is None:
            columns = [_make_col(c) for c in self.df._metadata.column_fields]
        elif isinstance(cols_sel, Column):
            columns = [cols_sel]
        else:
            columns = [_make_col(c) for c in cols_sel]
        try:
            df = df._spark_select(self.df._metadata.index_fields + columns)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))
        df._metadata = self.df._metadata.copy(
            column_fields=df._metadata.column_fields[-len(columns):])
        if cols_sel is not None and isinstance(cols_sel, Column):
            from .series import _col
            return _col(df)
        else:
            return df
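
__getitem__ unfolds the .loc key into a row selector and a column selector: a boolean Column becomes a where clause, a slice becomes >=/<= conditions on the single index column, a list of labels becomes an isin filter, and the selected columns are then projected together with the index fields. A pandas-only sketch of the .loc behaviours being reproduced (plain pandas, for illustration):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[10, 20, 30])

print(df.loc[df['a'] > 1, 'b'])    # boolean selector -> where clause
print(df.loc[10:20, ['a', 'b']])   # slice on the index -> >= / <= conditions
print(df.loc[[10, 30], 'a'])       # list of labels -> isin filter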