Exemplo n.º 1
0
class DeferredSeries(frame_base.DeferredFrame):
    """Deferred (lazy) counterpart of ``pd.Series``.

    Each operation wraps a new expression node over ``self._expr`` instead of
    computing eagerly; operations whose results cannot be deferred raise
    ``frame_base.WontImplementError``.
    """

    def __array__(self, dtype=None):
        # Materializing to a numpy array would force eager evaluation.
        # (Fixed error-message grammar: was "a non-deferred a numpy array".)
        raise frame_base.WontImplementError(
            'Conversion to a non-deferred numpy array.')

    isna = frame_base._elementwise_method('isna')
    notnull = notna = frame_base._elementwise_method('notna')

    # transform is elementwise only along axis 0.
    transform = frame_base._elementwise_method('transform',
                                               restrictions={'axis': 0})

    def agg(self, *args, **kwargs):
        """Deferred ``Series.agg``; requires all data on a single partition."""
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'agg',
                lambda df: df.agg(*args, **kwargs), [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Singleton()))

    # Associative aggregations can be combined across partitions.
    all = frame_base._associative_agg_method('all')
    any = frame_base._associative_agg_method('any')
    min = frame_base._associative_agg_method('min')
    max = frame_base._associative_agg_method('max')
    prod = product = frame_base._associative_agg_method('prod')
    sum = frame_base._associative_agg_method('sum')

    # Order-sensitive operations are rejected (no global row order).
    cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
        'order-sensitive')
    diff = frame_base.wont_implement_method('order-sensitive')

    def replace(self,
                to_replace=None,
                value=None,
                inplace=False,
                limit=None,
                *args,
                **kwargs):
        """Deferred ``Series.replace``.

        A non-None ``limit`` requires all data on a single partition
        (presumably because the replacement count spans rows — see the
        partitioning choice below).  ``inplace`` is emulated here by swapping
        this frame's expression; the underlying ``df.replace`` is always
        invoked with ``inplace=False``.
        """
        if limit is None:
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        result = frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'replace',
                lambda df: df.replace(to_replace, value, False, limit, *args,
                                      **kwargs), [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))
        if inplace:
            self._expr = result._expr
        else:
            return result

    def unstack(self, *args, **kwargs):
        # Resulting columns would depend on the (non-deferred) data values.
        raise frame_base.WontImplementError('non-deferred column values')
Exemplo n.º 2
0
 def __getattr__(self, name):
     """Forward unknown attributes to pandas.

     ``read_*`` accessors and top-level pandas functions are stubbed out;
     everything else is passed through unchanged.
     """
     if name.startswith('read_'):
         return frame_base.wont_implement_method(
             'Use p | apache_beam.dataframe.io.%s' % name)
     attr = getattr(pd, name)
     if _is_top_level_function(attr):
         return frame_base.not_implemented_method(name)
     return attr
Exemplo n.º 3
0
class DeferredDataFrame(frame_base.DeferredFrame):
    """Deferred (lazy) counterpart of ``pd.DataFrame``.

    Methods wrap expression nodes (``expressions.ComputedExpression``) over
    ``self._expr`` rather than executing eagerly; results that cannot be
    deferred raise ``frame_base.WontImplementError``.
    """

    @property
    def T(self):
        # Alias for transpose(), which raises WontImplementError below.
        return self.transpose()

    def groupby(self, cols):
        """Deferred groupby: groups by ``cols`` after making them the index."""
        # TODO: what happens to the existing index?
        # We set the columns to index as we have a notion of being partitioned by
        # index, but not partitioned by an arbitrary subset of columns.
        return DeferredGroupBy(
            expressions.ComputedExpression(
                'groupbyindex',
                lambda df: df.groupby(level=list(range(df.index.nlevels))),
                [self.set_index(cols)._expr],
                requires_partition_by=partitionings.Index(),
                preserves_partition_by=partitionings.Singleton()))

    def __getattr__(self, name):
        # Column attribute access.
        if name in self._expr.proxy().columns:
            return self[name]
        else:
            return object.__getattribute__(self, name)

    def __getitem__(self, key):
        # Only single-column selection by name is supported.
        if key in self._expr.proxy().columns:
            return self._elementwise(lambda df: df[key], 'get_column')
        else:
            raise NotImplementedError(key)

    def __setitem__(self, key, value):
        # Only assignment to a single string-named column is supported.
        if isinstance(key, str):
            # yapf: disable
            return self._elementwise(
                lambda df, key, value: df.__setitem__(key, value),
                'set_column',
                (key, value),
                inplace=True)
        else:
            raise NotImplementedError(key)

    def set_index(self, keys, **kwargs):
        """Deferred ``set_index``; ``keys`` must name existing columns."""
        if isinstance(keys, str):
            keys = [keys]
        if not set(keys).issubset(self._expr.proxy().columns):
            raise NotImplementedError(keys)
        return self._elementwise(
            lambda df: df.set_index(keys, **kwargs),
            'set_index',
            inplace=kwargs.get('inplace', False))

    def at(self, *args, **kwargs):
        # Scalar access is not supported in deferred mode.
        raise NotImplementedError()

    @property
    def loc(self):
        # Deferred label-based indexer.
        return _DeferredLoc(self)

    def aggregate(self, *args, **kwargs):
        """Deferred ``agg``; ``axis=None`` aggregates columns then rows."""
        if 'axis' in kwargs and kwargs['axis'] is None:
            return self.agg(*args, **dict(kwargs, axis=1)).agg(
                *args, **dict(kwargs, axis=0))
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'aggregate',
                lambda df: df.agg(*args, **kwargs),
                [self._expr],
                # TODO(robertwb): Sub-aggregate when possible.
                requires_partition_by=partitionings.Singleton()))

    agg = aggregate

    applymap = frame_base._elementwise_method('applymap')

    memory_usage = frame_base.wont_implement_method('non-deferred value')

    all = frame_base._associative_agg_method('all')
    any = frame_base._associative_agg_method('any')

    # Order-sensitive operations are rejected (no global row order).
    cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
        'order-sensitive')
    diff = frame_base.wont_implement_method('order-sensitive')

    max = frame_base._associative_agg_method('max')
    min = frame_base._associative_agg_method('min')
    mode = frame_base._agg_method('mode')

    def dropna(
        self,
        axis=0,
        how='any',
        thresh=None,
        subset=None,
        inplace=False,
        *args,
        **kwargs):
        """Deferred ``dropna``; dropping columns requires colocated data."""
        # TODO(robertwb): This is a common pattern. Generalize?
        if axis == 1 or axis == 'columns':
            requires_partition_by = partitionings.Singleton()
        else:
            requires_partition_by = partitionings.Nothing()
        result = frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'dropna',
                lambda df: df.dropna(
                    axis, how, thresh, subset, False, *args, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))
        if inplace:
            self._expr = result._expr
        else:
            return result

    items = itertuples = iterrows = iteritems = frame_base.wont_implement_method(
        'non-lazy')

    isna = frame_base._elementwise_method('isna')
    notnull = notna = frame_base._elementwise_method('notna')

    prod = product = frame_base._associative_agg_method('prod')

    def quantile(self, q=0.5, axis=0, *args, **kwargs):
        """Deferred ``quantile`` along axis 0 only."""
        if axis != 0:
            raise frame_base.WontImplementError('non-deferred column values')
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'quantile',
                lambda df: df.quantile(q, axis, *args, **kwargs),
                [self._expr],
                #TODO(robertwb): Approximate quantiles?
                requires_partition_by=partitionings.Singleton(),
                preserves_partition_by=partitionings.Singleton()))

    query = frame_base._elementwise_method('query')

    def replace(self, to_replace=None,
        value=None,
        inplace=False,
        limit=None, *args, **kwargs):
        """Deferred ``replace``; a non-None ``limit`` forces colocation."""
        if limit is None:
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        result = frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'replace',
                lambda df: df.replace(
                    to_replace, value, False, limit, *args, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))
        if inplace:
            self._expr = result._expr
        else:
            return result

    def reset_index(self, level=None, drop=False, inplace=False, *args, **kwargs):
        """Deferred ``reset_index``; resetting all levels forces colocation."""
        if level is not None and not isinstance(level, (tuple, list)):
            level = [level]
        if level is None or len(level) == len(self._expr.proxy().index.levels):
            # TODO: Could do distributed re-index with offsets.
            requires_partition_by = partitionings.Singleton()
        else:
            requires_partition_by = partitionings.Nothing()
        result = frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'reset_index',
                lambda df: df.reset_index(level, drop, False, *args, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))
        if inplace:
            self._expr = result._expr
        else:
            return result

    round = frame_base._elementwise_method('round')
    select_dtypes = frame_base._elementwise_method('select_dtypes')

    def shift(self, periods=1, freq=None, axis=0, *args, **kwargs):
        """Deferred ``shift``; shifting along the index forces colocation."""
        if axis == 1 or axis == 'columns':
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'shift',
                lambda df: df.shift(periods, freq, axis, *args, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    @property
    def shape(self):
        # The row count is a non-deferred scalar.
        raise frame_base.WontImplementError('scalar value')

    def sort_values(
        self, by, axis=0, ascending=True, inplace=False, *args, **kwargs):
        """Deferred ``sort_values``; sorting rows forces colocation."""
        if axis == 1 or axis == 'columns':
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        result = frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'sort_values',
                lambda df: df.sort_values(
                    by, axis, ascending, False, *args, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))
        if inplace:
            self._expr = result._expr
        else:
            return result

    stack = frame_base._elementwise_method('stack')

    sum = frame_base._associative_agg_method('sum')

    to_records = to_dict = to_numpy = to_string = (
        frame_base.wont_implement_method('non-deferred value'))

    to_sparse = to_string # frame_base._elementwise_method('to_sparse')

    transform = frame_base._elementwise_method(
        'transform', restrictions={'axis': 0})

    def transpose(self, *args, **kwargs):
        # The transposed columns would be the (non-deferred) index values.
        raise frame_base.WontImplementError('non-deferred column values')

    def unstack(self, *args, **kwargs):
        """Deferred ``unstack``, only for a single-level index."""
        if self._expr.proxy().index.nlevels == 1:
            return frame_base.DeferredFrame.wrap(
              expressions.ComputedExpression(
                  'unstack',
                  lambda df: df.unstack(*args, **kwargs),
                  [self._expr],
                  requires_partition_by=partitionings.Index()))
        else:
            raise frame_base.WontImplementError('non-deferred column values')

    update = frame_base._proxy_method(
        'update',
        inplace=True,
        requires_partition_by=partitionings.Index(),
        preserves_partition_by=partitionings.Index())
Exemplo n.º 4
0
    globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
        _binary_reader(format))
    globals()['to_%s' % format] = frame_base.with_docs_from(pd.DataFrame)(
        _binary_writer(format))

# Readers for formats whose availability depends on the installed pandas
# version.  NOTE: loop variable renamed from `format` to `fmt` to stop
# shadowing the `format` builtin.
for fmt in ('sas', 'spss'):
    if hasattr(pd, 'read_%s' % fmt):  # Depends on pandas version.
        globals()['read_%s' % fmt] = frame_base.with_docs_from(pd)(
            _binary_reader(fmt))

read_clipboard = frame_base.not_implemented_method('read_clipboard',
                                                   base_type=pd)
to_clipboard = frame_base.not_implemented_method('to_clipboard',
                                                 base_type=pd.DataFrame)
# msgpack I/O is deprecated in pandas (see `reason` below).
read_msgpack = frame_base.wont_implement_method(pd,
                                                'read_msgpack',
                                                reason="deprecated")
to_msgpack = frame_base.wont_implement_method(pd.DataFrame,
                                              'to_msgpack',
                                              reason="deprecated")
# HDF5 needs random access, which a record-streaming backend cannot offer.
read_hdf = frame_base.wont_implement_method(
    pd, 'read_hdf', explanation="because HDF5 is a random access file format")
to_hdf = frame_base.wont_implement_method(
    pd.DataFrame,
    'to_hdf',
    explanation="because HDF5 is a random access file format")

# Any remaining pd.read_* entry point gets an explicit not-implemented stub.
for name in dir(pd):
    if name.startswith('read_') and name not in globals():
        globals()[name] = frame_base.not_implemented_method(name, base_type=pd)
Exemplo n.º 5
0

def _binary_writer(format):
    """Return a writer callable for the given binary ``format``.

    The returned callable pipes the deferred frame's PCollection into a
    ``_WriteToPandas`` transform for ``to_<format>``.
    """
    def writer(df, path, *args, **kwargs):
        return _as_pc(df) | _WriteToPandas(f'to_{format}', path, args, kwargs)

    return writer


# Register deferred read_/to_ entry points for the standard binary formats.
# NOTE: loop variable renamed from `format` to `fmt` to stop shadowing the
# `format` builtin.
for fmt in ('excel', 'feather', 'parquet', 'stata'):
    globals()['read_%s' % fmt] = _binary_reader(fmt)
    globals()['to_%s' % fmt] = _binary_writer(fmt)

for fmt in ('sas', 'spss'):
    if hasattr(pd, 'read_%s' % fmt):  # Depends on pandas version.
        globals()['read_%s' % fmt] = _binary_reader(fmt)

read_clipboard = to_clipboard = frame_base.wont_implement_method('clipboard')
read_msgpack = to_msgpack = frame_base.wont_implement_method('deprecated')
read_hdf = to_hdf = frame_base.wont_implement_method('random access files')

# Everything else pandas can read is declared not-implemented (for now).
for name in dir(pd):
    if name.startswith('read_') and name not in globals():
        globals()[name] = frame_base.not_implemented_method(name)


def _prefix_range_index_with(prefix, df):
    if isinstance(df.index, pd.RangeIndex):
        return df.set_index(prefix + df.index.map(str).astype(str))
    else:
        return df

Exemplo n.º 6
0
class DeferredSeries(frame_base.DeferredFrame):
    """Deferred (lazy) counterpart of ``pd.Series``.

    Methods build expression nodes over ``self._expr``; results that cannot
    be deferred raise ``frame_base.WontImplementError``.
    """

    def __array__(self, dtype=None):
        # Materializing to a numpy array would force eager evaluation.
        # (Fixed error-message grammar: was "a non-deferred a numpy array".)
        raise frame_base.WontImplementError(
            'Conversion to a non-deferred numpy array.')

    astype = frame_base._elementwise_method('astype')

    between = frame_base._elementwise_method('between')

    @frame_base.args_to_kwargs(pd.Series)
    @frame_base.populate_defaults(pd.Series)
    @frame_base.maybe_inplace
    def dropna(self, **kwargs):
        """Deferred ``Series.dropna``; purely per-partition."""
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'dropna',
                lambda df: df.dropna(**kwargs), [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Nothing()))

    items = iteritems = frame_base.wont_implement_method('non-lazy')

    isin = frame_base._elementwise_method('isin')

    isna = frame_base._elementwise_method('isna')
    notnull = notna = frame_base._elementwise_method('notna')

    @frame_base.args_to_kwargs(pd.Series)
    @frame_base.populate_defaults(pd.Series)
    @frame_base.maybe_inplace
    def fillna(self, value, method):
        """Deferred ``fillna``; a non-None ``method`` is order-sensitive."""
        if method is not None:
            raise frame_base.WontImplementError('order-sensitive')
        # A deferred fill value becomes a second input expression; a plain
        # value is wrapped as a constant.
        if isinstance(value, frame_base.DeferredBase):
            value_expr = value._expr
        else:
            value_expr = expressions.ConstantExpression(value)
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'fillna',
                lambda df, value: df.fillna(value, method=method),
                [self._expr, value_expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Nothing()))

    # transform is elementwise only along axis 0.
    transform = frame_base._elementwise_method('transform',
                                               restrictions={'axis': 0})

    def aggregate(self, func, axis=0, *args, **kwargs):
        """Deferred ``agg``.

        A single associative aggregation is pre-aggregated per partition
        before the final (non-parallel) combine; a list of aggregators is
        computed one-by-one and concatenated.
        """
        if isinstance(func, list) and len(func) > 1:
            # Aggregate each column separately, then stick them all together.
            rows = [self.agg([f], *args, **kwargs) for f in func]
            return frame_base.DeferredFrame.wrap(
                expressions.ComputedExpression('join_aggregate',
                                               lambda *rows: pd.concat(rows),
                                               [row._expr for row in rows]))
        else:
            # We're only handling a single column.
            base_func = func[0] if isinstance(func, list) else func
            if _is_associative(base_func) and not args and not kwargs:
                intermediate = expressions.elementwise_expression(
                    'pre_aggregate',
                    lambda s: s.agg([base_func], *args, **kwargs),
                    [self._expr])
                allow_nonparallel_final = True
            else:
                intermediate = self._expr
                allow_nonparallel_final = None  # i.e. don't change the value
            with expressions.allow_non_parallel_operations(
                    allow_nonparallel_final):
                return frame_base.DeferredFrame.wrap(
                    expressions.ComputedExpression(
                        'aggregate',
                        lambda s: s.agg(func, *args, **kwargs), [intermediate],
                        preserves_partition_by=partitionings.Singleton(),
                        requires_partition_by=partitionings.Singleton()))

    agg = aggregate

    all = frame_base._agg_method('all')
    any = frame_base._agg_method('any')
    min = frame_base._agg_method('min')
    max = frame_base._agg_method('max')
    prod = product = frame_base._agg_method('prod')
    sum = frame_base._agg_method('sum')

    # Order-sensitive operations are rejected (no global row order).
    cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
        'order-sensitive')
    diff = frame_base.wont_implement_method('order-sensitive')

    @frame_base.args_to_kwargs(pd.Series)
    @frame_base.populate_defaults(pd.Series)
    @frame_base.maybe_inplace
    def replace(self, limit, **kwargs):
        """Deferred ``replace``; a non-None ``limit`` forces colocation."""
        if limit is None:
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'replace',
                lambda df: df.replace(limit=limit, **kwargs), [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    round = frame_base._elementwise_method('round')

    searchsorted = frame_base.wont_implement_method('order-sensitive')

    shift = frame_base.wont_implement_method('order-sensitive')

    take = frame_base.wont_implement_method('deprecated')

    to_dict = frame_base.wont_implement_method('non-deferred')

    to_frame = frame_base._elementwise_method('to_frame')

    def unique(self, as_series=False):
        """Deferred ``unique``; callers must opt in to a Series result."""
        if not as_series:
            raise frame_base.WontImplementError(
                'pass as_series=True to get the result as a (deferred) Series')
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'unique',
                lambda df: pd.Series(df.unique()), [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Singleton()))

    def update(self, other):
        """In-place deferred ``update`` from another deferred Series."""
        # Series.update mutates in place and returns None, hence the `or df`.
        self._expr = expressions.ComputedExpression(
            'update',
            lambda df, other: df.update(other) or df,
            [self._expr, other._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Index())

    unstack = frame_base.wont_implement_method('non-deferred column values')

    values = property(frame_base.wont_implement_method('non-deferred'))

    view = frame_base.wont_implement_method('memory sharing semantics')
Exemplo n.º 7
0
class DeferredDataFrame(frame_base.DeferredFrame):
    """Deferred (lazy) counterpart of ``pd.DataFrame``.

    Methods wrap expression nodes over ``self._expr``; results that cannot
    be deferred raise ``frame_base.WontImplementError``.  Pandas-signature
    compatibility is provided by the ``args_to_kwargs`` / ``populate_defaults``
    / ``maybe_inplace`` decorators.
    """

    @property
    def T(self):
        # Alias for transpose(), which raises WontImplementError below.
        return self.transpose()

    def groupby(self, cols):
        """Deferred groupby: groups by ``cols`` after making them the index."""
        # TODO: what happens to the existing index?
        # We set the columns to index as we have a notion of being partitioned by
        # index, but not partitioned by an arbitrary subset of columns.
        return DeferredGroupBy(
            expressions.ComputedExpression(
                'groupbyindex',
                lambda df: df.groupby(level=list(range(df.index.nlevels))),
                [self.set_index(cols)._expr],
                requires_partition_by=partitionings.Index(),
                preserves_partition_by=partitionings.Singleton()))

    def __getattr__(self, name):
        # Column attribute access.
        if name in self._expr.proxy().columns:
            return self[name]
        else:
            return object.__getattribute__(self, name)

    def __getitem__(self, key):
        # Only single-column selection by name is supported.
        if key in self._expr.proxy().columns:
            return self._elementwise(lambda df: df[key], 'get_column')
        else:
            raise NotImplementedError(key)

    def __setitem__(self, key, value):
        # Only assignment to a single string-named column is supported.
        if isinstance(key, str):
            # yapf: disable
            return self._elementwise(
                lambda df, key, value: df.__setitem__(key, value),
                'set_column',
                (key, value),
                inplace=True)
        else:
            raise NotImplementedError(key)

    def set_index(self, keys, **kwargs):
        """Deferred ``set_index``; ``keys`` must name existing columns."""
        if isinstance(keys, str):
            keys = [keys]
        if not set(keys).issubset(self._expr.proxy().columns):
            raise NotImplementedError(keys)
        return self._elementwise(
            lambda df: df.set_index(keys, **kwargs),
            'set_index',
            inplace=kwargs.get('inplace', False))

    def at(self, *args, **kwargs):
        # Scalar access is not supported in deferred mode.
        raise NotImplementedError()

    @property
    def loc(self):
        # Deferred label-based indexer.
        return _DeferredLoc(self)

    def aggregate(self, func, axis=0, *args, **kwargs):
        """Deferred ``agg`` over rows, columns, or both (``axis=None``)."""
        if axis is None:
            # Aggregate across all elements by first aggregating across columns,
            # then across rows.
            return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
                func, *args, **dict(kwargs, axis=0))
        elif axis in (1, 'columns'):
            # This is an easy elementwise aggregation.
            return frame_base.DeferredFrame.wrap(
                expressions.ComputedExpression(
                    'aggregate',
                    lambda df: df.agg(func, axis=1, *args, **kwargs),
                    [self._expr],
                    requires_partition_by=partitionings.Nothing()))
        elif len(self._expr.proxy().columns) == 0 or args or kwargs:
            # For these corner cases, just colocate everything.
            return frame_base.DeferredFrame.wrap(
              expressions.ComputedExpression(
                  'aggregate',
                  lambda df: df.agg(func, *args, **kwargs),
                  [self._expr],
                  requires_partition_by=partitionings.Singleton()))
        else:
            # In the general case, compute the aggregation of each column separately,
            # then recombine.
            if not isinstance(func, dict):
                col_names = list(self._expr.proxy().columns)
                func = {col: func for col in col_names}
            else:
                col_names = list(func.keys())
            aggregated_cols = []
            for col in col_names:
                funcs = func[col]
                if not isinstance(funcs, list):
                    funcs = [funcs]
                aggregated_cols.append(self[col].agg(funcs, *args, **kwargs))
            # The final shape is different depending on whether any of the columns
            # were aggregated by a list of aggregators.
            with expressions.allow_non_parallel_operations():
                if any(isinstance(funcs, list) for funcs in func.values()):
                    return frame_base.DeferredFrame.wrap(
                        expressions.ComputedExpression(
                            'join_aggregate',
                            lambda *cols: pd.DataFrame(
                                {col: value for col, value in zip(col_names, cols)}),
                            [col._expr for col in aggregated_cols],
                            requires_partition_by=partitionings.Singleton()))
                else:
                    return frame_base.DeferredFrame.wrap(
                      expressions.ComputedExpression(
                          'join_aggregate',
                            lambda *cols: pd.Series(
                                {col: value[0] for col, value in zip(col_names, cols)}),
                          [col._expr for col in aggregated_cols],
                          requires_partition_by=partitionings.Singleton(),
                          proxy=self._expr.proxy().agg(func, *args, **kwargs)))

    agg = aggregate

    applymap = frame_base._elementwise_method('applymap')

    memory_usage = frame_base.wont_implement_method('non-deferred value')

    all = frame_base._agg_method('all')
    any = frame_base._agg_method('any')

    # Order-sensitive operations are rejected (no global row order).
    cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
        'order-sensitive')
    diff = frame_base.wont_implement_method('order-sensitive')

    max = frame_base._agg_method('max')
    min = frame_base._agg_method('min')

    def mode(self, axis=0, *args, **kwargs):
        """Deferred ``mode`` along axis 0 only; requires colocated data."""
        if axis == 1 or axis == 'columns':
            raise frame_base.WontImplementError('non-deferred column values')
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'mode',
                lambda df: df.mode(*args, **kwargs),
                [self._expr],
                #TODO(robertwb): Approximate?
                requires_partition_by=partitionings.Singleton(),
                preserves_partition_by=partitionings.Singleton()))

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    @frame_base.maybe_inplace
    def dropna(self, axis, **kwargs):
        """Deferred ``dropna``; dropping columns requires colocated data."""
        # TODO(robertwb): This is a common pattern. Generalize?
        if axis == 1 or axis == 'columns':
            requires_partition_by = partitionings.Singleton()
        else:
            requires_partition_by = partitionings.Nothing()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'dropna',
                lambda df: df.dropna(axis=axis, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    items = itertuples = iterrows = iteritems = frame_base.wont_implement_method(
        'non-lazy')

    isna = frame_base._elementwise_method('isna')
    notnull = notna = frame_base._elementwise_method('notna')

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    @frame_base.maybe_inplace
    def fillna(self, value, method, axis, **kwargs):
        """Deferred ``fillna``; fill ``method`` along the index is
        order-sensitive and therefore rejected."""
        if method is not None and axis in (0, 'index'):
            raise frame_base.WontImplementError('order-sensitive')
        # A deferred fill value becomes a second input expression; a plain
        # value is wrapped as a constant.
        if isinstance(value, frame_base.DeferredBase):
            value_expr = value._expr
        else:
            value_expr = expressions.ConstantExpression(value)
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'fillna',
                lambda df, value: df.fillna(
                    value, method=method, axis=axis, **kwargs),
                [self._expr, value_expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Nothing()))

    prod = product = frame_base._agg_method('prod')

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    def quantile(self, axis, **kwargs):
        """Deferred ``quantile`` along axis 0 only."""
        if axis == 1 or axis == 'columns':
            raise frame_base.WontImplementError('non-deferred column values')
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'quantile',
                lambda df: df.quantile(axis=axis, **kwargs),
                [self._expr],
                #TODO(robertwb): Approximate quantiles?
                requires_partition_by=partitionings.Singleton(),
                preserves_partition_by=partitionings.Singleton()))

    query = frame_base._elementwise_method('query')

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    @frame_base.maybe_inplace
    def replace(self, limit, **kwargs):
        """Deferred ``replace``; a non-None ``limit`` forces colocation."""
        if limit is None:
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'replace',
                lambda df: df.replace(limit=limit, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    @frame_base.maybe_inplace
    def reset_index(self, level, **kwargs):
        """Deferred ``reset_index``; resetting all levels forces colocation."""
        if level is not None and not isinstance(level, (tuple, list)):
            level = [level]
        if level is None or len(level) == len(self._expr.proxy().index.levels):
            # TODO: Could do distributed re-index with offsets.
            requires_partition_by = partitionings.Singleton()
        else:
            requires_partition_by = partitionings.Nothing()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'reset_index',
                lambda df: df.reset_index(level=level, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    round = frame_base._elementwise_method('round')
    select_dtypes = frame_base._elementwise_method('select_dtypes')

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    def shift(self, axis, **kwargs):
        """Deferred ``shift``; shifting along the index forces colocation."""
        if axis == 1 or axis == 'columns':
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'shift',
                lambda df: df.shift(axis=axis, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    @property
    def shape(self):
        # The row count is a non-deferred scalar.
        raise frame_base.WontImplementError('scalar value')

    @frame_base.args_to_kwargs(pd.DataFrame)
    @frame_base.populate_defaults(pd.DataFrame)
    @frame_base.maybe_inplace
    def sort_values(self, axis, **kwargs):
        """Deferred ``sort_values``; sorting rows forces colocation."""
        if axis == 1 or axis == 'columns':
            requires_partition_by = partitionings.Nothing()
        else:
            requires_partition_by = partitionings.Singleton()
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'sort_values',
                lambda df: df.sort_values(axis=axis, **kwargs),
                [self._expr],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=requires_partition_by))

    stack = frame_base._elementwise_method('stack')

    sum = frame_base._agg_method('sum')

    take = frame_base.wont_implement_method('deprecated')

    to_records = to_dict = to_numpy = to_string = (
        frame_base.wont_implement_method('non-deferred value'))

    to_sparse = to_string # frame_base._elementwise_method('to_sparse')

    transform = frame_base._elementwise_method(
        'transform', restrictions={'axis': 0})

    def transpose(self, *args, **kwargs):
        # The transposed columns would be the (non-deferred) index values.
        raise frame_base.WontImplementError('non-deferred column values')

    def unstack(self, *args, **kwargs):
        """Deferred ``unstack``, only for a single-level index."""
        if self._expr.proxy().index.nlevels == 1:
            return frame_base.DeferredFrame.wrap(
              expressions.ComputedExpression(
                  'unstack',
                  lambda df: df.unstack(*args, **kwargs),
                  [self._expr],
                  requires_partition_by=partitionings.Index()))
        else:
            raise frame_base.WontImplementError('non-deferred column values')

    update = frame_base._proxy_method(
        'update',
        inplace=True,
        requires_partition_by=partitionings.Index(),
        preserves_partition_by=partitionings.Index())
Exemplo n.º 8
0
class DeferredPandasModule(object):
    """Deferred stand-in for the top-level ``pd`` module namespace.

    Functions that construct concrete values defer straight to pandas;
    functions whose first argument is a deferred frame forward to the
    corresponding method on that argument.
    """
    # NOTE: the original listed `melt` and `notna` twice; the duplicate
    # (identical) assignments were dead code and have been removed.
    array = _defer_to_pandas('array')
    bdate_range = _defer_to_pandas('bdate_range')

    @staticmethod
    @frame_base.args_to_kwargs(pd)
    @frame_base.populate_defaults(pd)
    def concat(objs, axis, join, ignore_index, keys, levels, names,
               verify_integrity, sort, copy):
        """Deferred version of ``pd.concat``.

        ``ignore_index`` and ``levels`` are not supported.  ``sort`` and
        ``copy`` are accepted for signature compatibility but not forwarded.
        """
        if ignore_index:
            raise NotImplementedError('concat(ignore_index)')
        if levels:
            raise NotImplementedError('concat(levels)')

        if isinstance(objs, Mapping):
            if keys is None:
                keys = list(objs.keys())
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)
        # pd.concat accepts None entries; model them as constant inputs.
        deferred_none = expressions.ConstantExpression(None)
        exprs = [deferred_none if o is None else o._expr for o in objs]

        if axis in (1, 'columns'):
            # Rows must be aligned by index value to be glued side by side.
            required_partitioning = partitionings.Index()
        elif verify_integrity:
            # Duplicate-index detection needs co-located index values.
            required_partitioning = partitionings.Index()
        else:
            required_partitioning = partitionings.Nothing()

        return frame_base.DeferredBase.wrap(
            expressions.ComputedExpression(
                'concat',
                lambda *objs: pd.concat(objs,
                                        axis=axis,
                                        join=join,
                                        ignore_index=ignore_index,
                                        keys=keys,
                                        levels=levels,
                                        names=names,
                                        verify_integrity=verify_integrity
                                        ),  # yapf break
                exprs,
                requires_partition_by=required_partitioning,
                preserves_partition_by=partitionings.Index()))

    date_range = _defer_to_pandas('date_range')
    describe_option = _defer_to_pandas('describe_option')
    factorize = _call_on_first_arg('factorize')
    get_option = _defer_to_pandas('get_option')
    interval_range = _defer_to_pandas('interval_range')
    isna = _call_on_first_arg('isna')
    isnull = _call_on_first_arg('isnull')
    json_normalize = _defer_to_pandas('json_normalize')
    melt = _call_on_first_arg('melt')
    merge = _call_on_first_arg('merge')
    merge_ordered = frame_base.wont_implement_method('order-sensitive')
    notna = _call_on_first_arg('notna')
    notnull = _call_on_first_arg('notnull')
    option_context = _defer_to_pandas('option_context')
    period_range = _defer_to_pandas('period_range')
    pivot = _call_on_first_arg('pivot')
    pivot_table = _call_on_first_arg('pivot_table')
    show_versions = _defer_to_pandas('show_versions')
    test = frame_base.wont_implement_method('test')
    timedelta_range = _defer_to_pandas('timedelta_range')
    to_pickle = frame_base.wont_implement_method('order-sensitive')

    def __getattr__(self, name):
        """Fall back to pandas for anything not declared explicitly above."""
        if name.startswith('read_'):
            # File/stream IO should use the Beam-native dataframe IO sources.
            return frame_base.wont_implement_method(
                'Use p | apache_beam.dataframe.io.%s' % name)
        res = getattr(pd, name)
        if _is_top_level_function(res):
            return frame_base.not_implemented_method(name)
        else:
            return res
# Exemplo n.º 9
# 0
class DeferredDataFrame(frame_base.DeferredFrame):
  @property
  def T(self):
    """Alias for transpose() (which is itself unsupported)."""
    return self.transpose()

  @property
  def columns(self):
    """Column labels, read from the proxy without materializing any data."""
    return self._expr.proxy().columns

  def groupby(self, by):
    """Deferred groupby over the given column(s)."""
    # TODO: what happens to the existing index?
    # We set the columns to index as we have a notion of being partitioned by
    # index, but not partitioned by an arbitrary subset of columns.
    return DeferredGroupBy(
        expressions.ComputedExpression(
            'groupbyindex',
            lambda df: df.groupby(level=list(range(df.index.nlevels))),
            [self.set_index(by)._expr],
            requires_partition_by=partitionings.Index(),
            preserves_partition_by=partitionings.Singleton()))

  def __getattr__(self, name):
    """Support attribute-style column access (``df.col``)."""
    proxy_columns = self._expr.proxy().columns
    if name not in proxy_columns:
      # Not a column name; defer to ordinary attribute lookup.
      return object.__getattribute__(self, name)
    return self[name]

  def __getitem__(self, key):
    """Select a column (or list of columns) as a deferred frame."""
    # TODO: Replicate pd.DataFrame.__getitem__ logic
    if isinstance(key, frame_base.DeferredBase):
      # Fail early if key is a DeferredBase as it interacts surprisingly with
      # key in self._expr.proxy().columns
      raise NotImplementedError(
          "Indexing with a deferred frame is not yet supported. Consider "
          "using df.loc[...]")

    # Selecting known column label(s) is a purely elementwise operation.
    if (isinstance(key, list) and
        all(key_column in self._expr.proxy().columns
            for key_column in key)) or key in self._expr.proxy().columns:
      return self._elementwise(lambda df: df[key], 'get_column')
    else:
      raise NotImplementedError(key)

  def __contains__(self, key):
    """Return True if *key* is a column, checked against the proxy only."""
    return key in self._expr.proxy()

  def __setitem__(self, key, value):
    """Deferred in-place column assignment; only string keys supported."""
    if isinstance(key, str):
      # yapf: disable
      return self._elementwise(
          lambda df, key, value: df.__setitem__(key, value),
          'set_column',
          (key, value),
          inplace=True)
    else:
      raise NotImplementedError(key)

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def set_index(self, keys, **kwargs):
    """Deferred set_index; keys must name existing columns."""
    if isinstance(keys, str):
      keys = [keys]
    if not set(keys).issubset(self._expr.proxy().columns):
      # Arrays/callables as index keys are not supported.
      raise NotImplementedError(keys)
    return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'set_index',
          lambda df: df.set_index(keys, **kwargs),
          [self._expr],
          requires_partition_by=partitionings.Nothing(),
          preserves_partition_by=partitionings.Nothing()))

  at = frame_base.not_implemented_method('at')

  @property
  def loc(self):
    """Label-based indexer, mirroring pd.DataFrame.loc."""
    return _DeferredLoc(self)

  # Reading or replacing the index is not yet implemented.
  _get_index = _set_index = frame_base.not_implemented_method('index')
  index = property(_get_index, _set_index)

  @property
  def axes(self):
    """(index, columns), as in pandas."""
    return (self.index, self.columns)

  # Recognized but not yet implemented operations.
  apply = frame_base.not_implemented_method('apply')
  explode = frame_base.not_implemented_method('explode')
  isin = frame_base.not_implemented_method('isin')
  assign = frame_base.not_implemented_method('assign')
  append = frame_base.not_implemented_method('append')
  combine = frame_base.not_implemented_method('combine')
  combine_first = frame_base.not_implemented_method('combine_first')
  cov = frame_base.not_implemented_method('cov')
  corr = frame_base.not_implemented_method('corr')
  count = frame_base.not_implemented_method('count')
  drop = frame_base.not_implemented_method('drop')
  eval = frame_base.not_implemented_method('eval')
  reindex = frame_base.not_implemented_method('reindex')
  melt = frame_base.not_implemented_method('melt')
  pivot = frame_base.not_implemented_method('pivot')
  pivot_table = frame_base.not_implemented_method('pivot_table')

  def aggregate(self, func, axis=0, *args, **kwargs):
    """Deferred DataFrame.agg/aggregate.

    axis=1 is elementwise; other cases colocate data or aggregate each
    column independently and then recombine the per-column results.
    """
    if axis is None:
      # Aggregate across all elements by first aggregating across columns,
      # then across rows.
      return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
          func, *args, **dict(kwargs, axis=0))
    elif axis in (1, 'columns'):
      # This is an easy elementwise aggregation.
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'aggregate',
              lambda df: df.agg(func, axis=1, *args, **kwargs),
              [self._expr],
              requires_partition_by=partitionings.Nothing()))
    elif len(self._expr.proxy().columns) == 0 or args or kwargs:
      # For these corner cases, just colocate everything.
      return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'aggregate',
            lambda df: df.agg(func, *args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Singleton()))
    else:
      # In the general case, compute the aggregation of each column separately,
      # then recombine.
      if not isinstance(func, dict):
        col_names = list(self._expr.proxy().columns)
        func = {col: func for col in col_names}
      else:
        col_names = list(func.keys())
      aggregated_cols = []
      for col in col_names:
        funcs = func[col]
        if not isinstance(funcs, list):
          funcs = [funcs]
        aggregated_cols.append(self[col].agg(funcs, *args, **kwargs))
      # The final shape is different depending on whether any of the columns
      # were aggregated by a list of aggregators.
      with expressions.allow_non_parallel_operations():
        if any(isinstance(funcs, list) for funcs in func.values()):
          return frame_base.DeferredFrame.wrap(
              expressions.ComputedExpression(
                  'join_aggregate',
                  lambda *cols: pd.DataFrame(
                      {col: value for col, value in zip(col_names, cols)}),
                  [col._expr for col in aggregated_cols],
                  requires_partition_by=partitionings.Singleton()))
        else:
          return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'join_aggregate',
                  lambda *cols: pd.Series(
                      {col: value[0] for col, value in zip(col_names, cols)}),
                [col._expr for col in aggregated_cols],
                requires_partition_by=partitionings.Singleton(),
                proxy=self._expr.proxy().agg(func, *args, **kwargs)))

  agg = aggregate

  # applymap is a per-cell operation and therefore elementwise.
  applymap = frame_base._elementwise_method('applymap')

  # These expose a concrete (non-deferred) value.
  memory_usage = frame_base.wont_implement_method('non-deferred value')
  info = frame_base.wont_implement_method('non-deferred value')

  all = frame_base._agg_method('all')
  any = frame_base._agg_method('any')

  # Cumulative/diff operations depend on element order.
  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')

  def dot(self, other):
    """Deferred matrix multiplication (df @ other)."""
    # We want to broadcast the right hand side to all partitions of the left.
    # This is OK, as its index must be the same size as the columns set of self,
    # so cannot be too large.
    class AsScalar(object):
      # Wrapper marking the value as broadcast (not partitioned) data.
      def __init__(self, value):
        self.value = value

    if isinstance(other, frame_base.DeferredFrame):
      proxy = other._expr.proxy()
      with expressions.allow_non_parallel_operations():
        side = expressions.ComputedExpression(
            'as_scalar',
            lambda df: AsScalar(df),
            [other._expr],
            requires_partition_by=partitionings.Singleton())
    else:
      # Plain array-like: column count comes from the first row's length.
      proxy = pd.DataFrame(columns=range(len(other[0])))
      side = expressions.ConstantExpression(AsScalar(other))

    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'dot',
            lambda left, right: left @ right.value,
            [self._expr, side],
            requires_partition_by=partitionings.Nothing(),
            preserves_partition_by=partitionings.Index(),
            proxy=proxy))

  __matmul__ = dot

  # head/tail depend on row order, which is undefined here.
  head = tail = frame_base.wont_implement_method('order-sensitive')

  max = frame_base._agg_method('max')
  min = frame_base._agg_method('min')

  def mode(self, axis=0, *args, **kwargs):
    """Deferred DataFrame.mode; axis=1 is not supported.

    All data is gathered onto a single worker to compute the modes.
    """
    if axis in (1, 'columns'):
      # Number of columns is max(number mode values for each row), so we can't
      # determine how many there will be before looking at the data.
      raise frame_base.WontImplementError('non-deferred column values')
    expr = expressions.ComputedExpression(
        'mode',
        lambda df: df.mode(*args, **kwargs),
        [self._expr],
        #TODO(robertwb): Approximate?
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(expr)

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def dropna(self, axis, **kwargs):
    """Deferred DataFrame.dropna."""
    # TODO(robertwb): This is a common pattern. Generalize?
    # Dropping whole columns needs a global view; dropping rows is local.
    requires = (
        partitionings.Singleton()
        if axis in (1, 'columns')
        else partitionings.Nothing())
    expr = expressions.ComputedExpression(
        'dropna',
        lambda df: df.dropna(axis=axis, **kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=requires)
    return frame_base.DeferredFrame.wrap(expr)

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def fillna(self, value, method, axis, **kwargs):
    """Deferred DataFrame.fillna; fill-along-rows methods are unsupported."""
    # ffill/bfill along the index depend on row order.
    if method is not None and axis in (0, 'index'):
      raise frame_base.WontImplementError('order-sensitive')
    # A deferred fill value becomes a second input to the expression graph.
    if isinstance(value, frame_base.DeferredBase):
      value_expr = value._expr
    else:
      value_expr = expressions.ConstantExpression(value)
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'fillna',
            lambda df, value: df.fillna(
                value, method=method, axis=axis, **kwargs),
            [self._expr, value_expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing()))

  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  # Eager iteration over a deferred frame is not possible.
  items = itertuples = iterrows = iteritems = frame_base.wont_implement_method(
      'non-lazy')

  def _cols_as_temporary_index(self, cols, suffix=''):
    """Return (reindex, revert) helpers that temporarily move *cols* into
    the index and back, preserving the original index under unique names."""
    original_index_names = list(self._expr.proxy().index.names)
    # Unique names so the original index can be restored afterwards.
    new_index_names = [
        '__apache_beam_temp_%d_%s' % (ix, suffix)
        for (ix, _) in enumerate(original_index_names)]
    def reindex(df):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'reindex',
              lambda df:
                  df.rename_axis(index=new_index_names, copy=False)
                  .reset_index().set_index(cols),
              [df._expr],
              preserves_partition_by=partitionings.Nothing(),
              requires_partition_by=partitionings.Nothing()))
    def revert(df):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'join_restoreindex',
              lambda df:
                  df.reset_index().set_index(new_index_names)
                  .rename_axis(index=original_index_names, copy=False),
              [df._expr],
              preserves_partition_by=partitionings.Nothing(),
              requires_partition_by=partitionings.Nothing()))
    return reindex, revert

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def join(self, other, on, **kwargs):
    """Deferred DataFrame.join; joins happen on co-located index values."""
    if on is not None:
      # Temporarily re-index on the join columns, join, then restore.
      reindex, revert = self._cols_as_temporary_index(on)
      return revert(reindex(self).join(other, **kwargs))
    if isinstance(other, list):
      other_is_list = True
    else:
      other = [other]
      other_is_list = False
    # Deferred frames become expression inputs; concrete frames are captured
    # in const_others with a placeholder marking each deferred slot.
    placeholder = object()
    other_exprs = [
        df._expr for df in other if isinstance(df, frame_base.DeferredFrame)]
    const_others = [
        placeholder if isinstance(df, frame_base.DeferredFrame) else df
        for df in other]
    def fill_placeholders(values):
      # Re-interleave computed values into their original positions.
      values = iter(values)
      filled = [
          next(values) if df is placeholder else df for df in const_others]
      if other_is_list:
        return filled
      else:
        return filled[0]
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'join',
            lambda df, *deferred_others: df.join(
                fill_placeholders(deferred_others), **kwargs),
            [self._expr] + other_exprs,
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Index()))

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def merge(
      self,
      right,
      on,
      left_on,
      right_on,
      left_index,
      right_index,
      **kwargs):
    """Deferred DataFrame.merge.

    Both sides are re-indexed on their join keys so that the actual merge
    can run partition-locally over co-located index values.
    """
    self_proxy = self._expr.proxy()
    right_proxy = right._expr.proxy()
    # Validate with a pandas call.
    _ = self_proxy.merge(
        right_proxy,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        **kwargs)
    if not any([on, left_on, right_on, left_index, right_index]):
      # Natural join on shared column names. Fix: `columns` is an Index
      # attribute, not a method — `self_proxy.columns()` raised TypeError.
      on = [col for col in self_proxy.columns if col in right_proxy.columns]
    if not left_on:
      left_on = on
    elif not isinstance(left_on, list):
      left_on = [left_on]
    if not right_on:
      right_on = on
    elif not isinstance(right_on, list):
      right_on = [right_on]

    if left_index:
      indexed_left = self
    else:
      indexed_left = self.set_index(left_on, drop=False)

    if right_index:
      indexed_right = right
    else:
      indexed_right = right.set_index(right_on, drop=False)

    merged = frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'merge',
            lambda left, right: left.merge(
                right, left_index=True, right_index=True, **kwargs),
            [indexed_left._expr, indexed_right._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Index()))

    if left_index or right_index:
      return merged
    else:
      # The temporary key index is an artifact of our implementation.
      return merged.reset_index(drop=True)

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def nlargest(self, keep, **kwargs):
    """Deferred DataFrame.nlargest; keep='any' replaces the order-sensitive
    'first'/'last' options."""
    if keep == 'any':
      keep = 'first'
    elif keep != 'all':
      raise frame_base.WontImplementError('order-sensitive')
    kwargs['keep'] = keep
    # Pre-select candidates per partition, then combine on a single worker.
    per_partition = expressions.ComputedExpression(
            'nlargest-per-partition',
            lambda df: df.nlargest(**kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing())
    with expressions.allow_non_parallel_operations(True):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'nlargest',
              lambda df: df.nlargest(**kwargs),
              [per_partition],
              preserves_partition_by=partitionings.Singleton(),
              requires_partition_by=partitionings.Singleton()))

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def nsmallest(self, keep, **kwargs):
    """Deferred DataFrame.nsmallest; keep='any' replaces the order-sensitive
    'first'/'last' options."""
    if keep == 'any':
      keep = 'first'
    elif keep != 'all':
      raise frame_base.WontImplementError('order-sensitive')
    kwargs['keep'] = keep
    # Take the n smallest per partition, then combine those candidates.
    candidates = expressions.ComputedExpression(
        'nsmallest-per-partition',
        lambda df: df.nsmallest(**kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    with expressions.allow_non_parallel_operations(True):
      combined = expressions.ComputedExpression(
          'nsmallest',
          lambda df: df.nsmallest(**kwargs),
          [candidates],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=partitionings.Singleton())
      return frame_base.DeferredFrame.wrap(combined)

  @frame_base.args_to_kwargs(pd.DataFrame)
  def nunique(self, **kwargs):
    """Deferred DataFrame.nunique."""
    # Counting distinct values within each row (axis=1) is partition-local;
    # per-column counting needs all data in one place.
    axis = kwargs.get('axis', None)
    requires = (
        partitionings.Nothing()
        if axis in (1, 'columns')
        else partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'nunique',
            lambda df: df.nunique(**kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires))

  prod = product = frame_base._agg_method('prod')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def quantile(self, axis, **kwargs):
    """Deferred DataFrame.quantile; axis=1 is not supported."""
    if axis in (1, 'columns'):
      raise frame_base.WontImplementError('non-deferred column values')
    #TODO(robertwb): Approximate quantiles?
    expr = expressions.ComputedExpression(
        'quantile',
        lambda df: df.quantile(axis=axis, **kwargs),
        [self._expr],
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(expr)

  query = frame_base._elementwise_method('query')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.maybe_inplace
  def rename(self, **kwargs):
    """Deferred DataFrame.rename."""
    # With no columns/axis arguments, pandas renames the index by default.
    rename_index = (
        'index' in kwargs
        or kwargs.get('axis', None) in (0, 'index')
        or ('columns' not in kwargs and 'axis' not in kwargs))
    if rename_index:
      # Technically, it's still partitioned by index, but it's no longer
      # partitioned by the hash of the index.
      preserves_partition_by = partitionings.Nothing()
    else:
      preserves_partition_by = partitionings.Singleton()
    if kwargs.get('errors', None) == 'raise' and rename_index:
      # Renaming index with checking.
      requires_partition_by = partitionings.Singleton()
      proxy = self._expr.proxy()
    else:
      requires_partition_by = partitionings.Nothing()
      proxy = None
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'rename',
            lambda df: df.rename(**kwargs),
            [self._expr],
            proxy=proxy,
            preserves_partition_by=preserves_partition_by,
            requires_partition_by=requires_partition_by))

  rename_axis = frame_base._elementwise_method('rename_axis')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def replace(self, limit, **kwargs):
    """Deferred DataFrame.replace.

    A non-None limit is order-sensitive across rows, so it forces all
    data onto a single worker.
    """
    requires = (
        partitionings.Nothing()
        if limit is None
        else partitionings.Singleton())
    expr = expressions.ComputedExpression(
        'replace',
        lambda df: df.replace(limit=limit, **kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=requires)
    return frame_base.DeferredFrame.wrap(expr)

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def reset_index(self, level=None, **kwargs):
    """Deferred DataFrame.reset_index.

    Dropping only some levels of a MultiIndex preserves distribution;
    resetting the entire index must renumber rows globally.
    """
    if level is not None and not isinstance(level, (tuple, list)):
      level = [level]
    # Use index.nlevels rather than len(index.levels): `.levels` exists only
    # on MultiIndex and raised AttributeError for a flat index.
    if level is None or len(level) == self._expr.proxy().index.nlevels:
      # TODO: Could do distributed re-index with offsets.
      requires_partition_by = partitionings.Singleton()
    else:
      requires_partition_by = partitionings.Nothing()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'reset_index',
            lambda df: df.reset_index(level=level, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  round = frame_base._elementwise_method('round')
  select_dtypes = frame_base._elementwise_method('select_dtypes')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def shift(self, axis, **kwargs):
    """Deferred DataFrame.shift; frequency-based shifts are unsupported."""
    if 'freq' in kwargs:
      raise frame_base.WontImplementError('data-dependent')
    # Shifting across columns is row-local; shifting rows needs global order.
    if axis in (1, 'columns'):
      requires = partitionings.Nothing()
    else:
      requires = partitionings.Singleton()
    expr = expressions.ComputedExpression(
        'shift',
        lambda df: df.shift(axis=axis, **kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=requires)
    return frame_base.DeferredFrame.wrap(expr)

  @property
  def shape(self):
    """Not implemented: the shape is a concrete (non-deferred) scalar tuple."""
    raise frame_base.WontImplementError('scalar value')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def sort_values(self, axis, **kwargs):
    """Deferred DataFrame.sort_values; row sorting gathers all data."""
    # Sorting within each row (axis=1) is partition-local.
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'sort_values',
            lambda df: df.sort_values(axis=axis, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  # stack works row-by-row, so it is elementwise.
  stack = frame_base._elementwise_method('stack')

  sum = frame_base._agg_method('sum')

  # take is deprecated in pandas itself.
  take = frame_base.wont_implement_method('deprecated')

  # These all materialize the full dataset into local memory.
  to_records = to_dict = to_numpy = to_string = (
      frame_base.wont_implement_method('non-deferred value'))

  to_sparse = to_string # frame_base._elementwise_method('to_sparse')

  # Only per-column (axis=0) transforms are elementwise-safe.
  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  transpose = frame_base.wont_implement_method('non-deferred column values')

  def unstack(self, *args, **kwargs):
    """Deferred DataFrame.unstack, supported only for flat indexes."""
    if self._expr.proxy().index.nlevels == 1:
      return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'unstack',
            lambda df: df.unstack(*args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Index()))
    else:
      # With a MultiIndex the resulting column labels depend on the data.
      raise frame_base.WontImplementError('non-deferred column values')

  # In-place update; both frames must be co-located by index value.
  update = frame_base._proxy_method(
      'update',
      inplace=True,
      requires_partition_by=partitionings.Index(),
      preserves_partition_by=partitionings.Index())
# Exemplo n.º 10
# 0
class DeferredSeries(frame_base.DeferredFrame):
  def __array__(self, dtype=None):
    """Refuse numpy conversion: it would materialize the deferred data."""
    # Fixed garbled message ("a non-deferred a numpy array").
    raise frame_base.WontImplementError(
        'Conversion to a non-deferred numpy array.')

  # Per-element operations.
  astype = frame_base._elementwise_method('astype')

  between = frame_base._elementwise_method('between')

  def dot(self, other):
    """Deferred Series.dot against a DeferredSeries or DeferredDataFrame."""
    left = self._expr
    if isinstance(other, DeferredSeries):
      # Promote the Series to a one-column DataFrame so both cases share
      # the same code path below.
      right = expressions.ComputedExpression(
          'to_dataframe',
          pd.DataFrame, [other._expr],
          requires_partition_by=partitionings.Nothing(),
          preserves_partition_by=partitionings.Index())
      right_is_series = True
    elif isinstance(other, DeferredDataFrame):
      right = other._expr
      right_is_series = False
    else:
      raise frame_base.WontImplementError('non-deferred result')

    # Partial products are computed per partition, then summed globally.
    dots = expressions.ComputedExpression(
        'dot',
        # Transpose so we can sum across rows.
        (lambda left, right: pd.DataFrame(left @ right).T),
        [left, right],
        requires_partition_by=partitionings.Index())
    with expressions.allow_non_parallel_operations(True):
      sums = expressions.ComputedExpression(
          'sum',
          lambda dots: dots.sum(),  #
          [dots],
          requires_partition_by=partitionings.Singleton())

      if right_is_series:
        # Series . Series yields a scalar: pull it out of the length-1 frame.
        result = expressions.ComputedExpression(
            'extract',
            lambda df: df[0], [sums],
            requires_partition_by=partitionings.Singleton())
      else:
        result = sums
      return frame_base.DeferredFrame.wrap(result)

  __matmul__ = dot

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def dropna(self, **kwargs):
    """Deferred Series.dropna; rows are filtered partition-locally."""
    expr = expressions.ComputedExpression(
        'dropna',
        lambda s: s.dropna(**kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    return frame_base.DeferredFrame.wrap(expr)

  # Eager iteration over a deferred series is not possible.
  items = iteritems = frame_base.wont_implement_method('non-lazy')

  isin = frame_base._elementwise_method('isin')

  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def fillna(self, value, method):
    """Deferred Series.fillna.

    Only value-based filling is supported; ffill/bfill-style methods
    depend on element order.
    """
    if method is not None:
      raise frame_base.WontImplementError('order-sensitive')
    # A deferred fill value becomes a second graph input; a concrete one
    # is wrapped as a constant expression.
    if isinstance(value, frame_base.DeferredBase):
      value_expr = value._expr
    else:
      value_expr = expressions.ConstantExpression(value)
    expr = expressions.ComputedExpression(
        'fillna',
        lambda s, val: s.fillna(val, method=method),
        [self._expr, value_expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    return frame_base.DeferredFrame.wrap(expr)

  reindex = frame_base.not_implemented_method('reindex')

  # These materialize the full dataset into local memory.
  to_numpy = to_string = frame_base.wont_implement_method('non-deferred value')

  # Only per-column (axis=0) transforms are elementwise-safe.
  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  def aggregate(self, func, axis=0, *args, **kwargs):
    """Deferred Series.agg/aggregate.

    Associative aggregators are pre-aggregated per partition before the
    final (single-worker) combination step.
    """
    if isinstance(func, list) and len(func) > 1:
      # Aggregate each column separately, then stick them all together.
      rows = [self.agg([f], *args, **kwargs) for f in func]
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'join_aggregate',
              lambda *rows: pd.concat(rows), [row._expr for row in rows]))
    else:
      # We're only handling a single column.
      base_func = func[0] if isinstance(func, list) else func
      if _is_associative(base_func) and not args and not kwargs:
        intermediate = expressions.elementwise_expression(
            'pre_aggregate',
            lambda s: s.agg([base_func], *args, **kwargs), [self._expr])
        allow_nonparallel_final = True
      else:
        intermediate = self._expr
        allow_nonparallel_final = None  # i.e. don't change the value
      with expressions.allow_non_parallel_operations(allow_nonparallel_final):
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'aggregate',
                lambda s: s.agg(func, *args, **kwargs), [intermediate],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Singleton()))

  agg = aggregate

  all = frame_base._agg_method('all')
  any = frame_base._agg_method('any')
  min = frame_base._agg_method('min')
  max = frame_base._agg_method('max')
  prod = product = frame_base._agg_method('prod')
  sum = frame_base._agg_method('sum')

  # Cumulative/diff/head/tail operations depend on element order.
  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')

  head = tail = frame_base.wont_implement_method('order-sensitive')

  memory_usage = frame_base.wont_implement_method('non-deferred value')

  # In Series __contains__ checks the index
  __contains__ = frame_base.wont_implement_method('non-deferred value')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  def nlargest(self, keep, **kwargs):
    """Deferred Series.nlargest; keep='any' replaces the order-sensitive
    'first'/'last' options."""
    # TODO(robertwb): Document 'any' option.
    # TODO(robertwb): Consider (conditionally) defaulting to 'any' if no
    # explicit keep parameter is requested.
    if keep == 'any':
      keep = 'first'
    elif keep != 'all':
      raise frame_base.WontImplementError('order-sensitive')
    kwargs['keep'] = keep
    # Pre-select candidates per partition, then combine on a single worker.
    per_partition = expressions.ComputedExpression(
        'nlargest-per-partition',
        lambda df: df.nlargest(**kwargs), [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    with expressions.allow_non_parallel_operations(True):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'nlargest',
              lambda df: df.nlargest(**kwargs), [per_partition],
              preserves_partition_by=partitionings.Singleton(),
              requires_partition_by=partitionings.Singleton()))

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  def nsmallest(self, keep, **kwargs):
    """Deferred Series.nsmallest; keep='any' replaces the order-sensitive
    'first'/'last' options."""
    if keep == 'any':
      keep = 'first'
    elif keep != 'all':
      raise frame_base.WontImplementError('order-sensitive')
    kwargs['keep'] = keep
    # Take the n smallest per partition first, then combine the candidates.
    candidates = expressions.ComputedExpression(
        'nsmallest-per-partition',
        lambda s: s.nsmallest(**kwargs), [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    with expressions.allow_non_parallel_operations(True):
      combined = expressions.ComputedExpression(
          'nsmallest',
          lambda s: s.nsmallest(**kwargs), [candidates],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=partitionings.Singleton())
      return frame_base.DeferredFrame.wrap(combined)

  rename_axis = frame_base._elementwise_method('rename_axis')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def replace(self, limit, **kwargs):
    """Deferred Series.replace; a non-None limit is order-sensitive and
    forces all data onto a single worker."""
    if limit is None:
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'replace',
            lambda df: df.replace(limit=limit, **kwargs), [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  round = frame_base._elementwise_method('round')

  # Order-sensitive operations are unsupported on distributed data.
  searchsorted = frame_base.wont_implement_method('order-sensitive')

  shift = frame_base.wont_implement_method('order-sensitive')

  take = frame_base.wont_implement_method('deprecated')

  # Materializes the full dataset into local memory.
  to_dict = frame_base.wont_implement_method('non-deferred')

  to_frame = frame_base._elementwise_method('to_frame')

  def unique(self, as_series=False):
    """Deferred Series.unique.

    Must be requested as a Series (as_series=True); pandas' plain ndarray
    result would be a non-deferred value.
    """
    if not as_series:
      raise frame_base.WontImplementError(
          'pass as_series=True to get the result as a (deferred) Series')
    expr = expressions.ComputedExpression(
        'unique',
        lambda s: pd.Series(s.unique()),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(expr)

  def update(self, other):
    """Deferred, in-place Series.update; co-locates both series by index."""
    # pd.Series.update mutates and returns None, so `or df` yields the
    # updated frame as the expression's result.
    self._expr = expressions.ComputedExpression(
        'update',
        lambda df,
        other: df.update(other) or df, [self._expr, other._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Index())

  # The unstacked column labels would depend on the (deferred) data.
  unstack = frame_base.wont_implement_method('non-deferred column values')

  values = property(frame_base.wont_implement_method('non-deferred'))

  view = frame_base.wont_implement_method('memory sharing semantics')

  @property
  def str(self):
    """Deferred accessor for pandas' vectorized string methods (s.str)."""
    expr = expressions.ComputedExpression(
        'str',
        lambda df: df.str, [self._expr],
        requires_partition_by=partitionings.Nothing(),
        preserves_partition_by=partitionings.Singleton())
    return _DeferredStringMethods(expr)
class DeferredPandasModule(object):
    """Deferred stand-in for the top-level ``pandas`` module namespace.

    Known functions are mapped to deferred implementations (or explicit
    won't-implement stubs); unknown attributes fall through to ``pd``
    via ``__getattr__``.
    """
    array = _defer_to_pandas('array')
    bdate_range = _defer_to_pandas('bdate_range')

    @staticmethod
    @frame_base.args_to_kwargs(pd)
    @frame_base.populate_defaults(pd)
    def concat(objs, axis, join, ignore_index, keys, levels, names,
               verify_integrity, sort, copy):
        """Deferred version of ``pd.concat``.

        ``ignore_index`` and ``levels`` are rejected (they would require
        a global ordering / eager evaluation). ``objs`` may be a mapping,
        in which case its keys become the concat ``keys``.
        """
        # NOTE(review): `sort` and `copy` are accepted but not forwarded
        # to pd.concat below — confirm this is intentional.
        if ignore_index:
            raise NotImplementedError('concat(ignore_index)')
        if levels:
            raise NotImplementedError('concat(levels)')

        if isinstance(objs, Mapping):
            if keys is None:
                keys = list(objs.keys())
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)

        if keys is None:
            preserves_partitioning = partitionings.Arbitrary()
        else:
            # Index 0 will be a new index for keys; only partitioning by
            # the original indexes (1 to N) will be preserved.
            nlevels = min(o._expr.proxy().index.nlevels for o in objs)
            preserves_partitioning = partitionings.Index(
                [i for i in range(1, nlevels + 1)])

        deferred_none = expressions.ConstantExpression(None)
        exprs = [deferred_none if o is None else o._expr for o in objs]

        # Column-wise concat and integrity verification both need all
        # rows with the same index co-located.
        if axis in (1, 'columns'):
            required_partitioning = partitionings.Index()
        elif verify_integrity:
            required_partitioning = partitionings.Index()
        else:
            required_partitioning = partitionings.Arbitrary()

        return frame_base.DeferredBase.wrap(
            expressions.ComputedExpression(
                'concat',
                lambda *objs: pd.concat(objs,
                                        axis=axis,
                                        join=join,
                                        ignore_index=ignore_index,
                                        keys=keys,
                                        levels=levels,
                                        names=names,
                                        verify_integrity=verify_integrity
                                        ),  # yapf break
                exprs,
                requires_partition_by=required_partitioning,
                preserves_partition_by=preserves_partitioning))

    date_range = _defer_to_pandas('date_range')
    describe_option = _defer_to_pandas('describe_option')
    factorize = _call_on_first_arg('factorize')
    get_option = _defer_to_pandas('get_option')
    interval_range = _defer_to_pandas('interval_range')
    isna = _call_on_first_arg('isna')
    isnull = _call_on_first_arg('isnull')
    json_normalize = _defer_to_pandas('json_normalize')
    melt = _call_on_first_arg('melt')
    merge = _call_on_first_arg('merge')
    merge_ordered = frame_base.wont_implement_method(pd,
                                                     'merge_ordered',
                                                     reason='order-sensitive')
    notna = _call_on_first_arg('notna')
    notnull = _call_on_first_arg('notnull')
    option_context = _defer_to_pandas('option_context')
    period_range = _defer_to_pandas('period_range')
    pivot = _call_on_first_arg('pivot')
    pivot_table = _call_on_first_arg('pivot_table')
    show_versions = _defer_to_pandas('show_versions')
    test = frame_base.wont_implement_method(
        pd,
        'test',
        explanation="because it is an internal pandas testing utility.")
    timedelta_range = _defer_to_pandas('timedelta_range')
    to_pickle = frame_base.wont_implement_method(pd,
                                                 'to_pickle',
                                                 reason='order-sensitive')
    to_datetime = _defer_to_pandas_maybe_elementwise('to_datetime')

    def __getattr__(self, name):
        """Fall back to ``pd`` for attributes not declared above.

        ``read_*`` functions are redirected to the Beam IO API; other
        top-level functions become not-implemented stubs; everything
        else (constants, submodules, classes) is returned unchanged.
        """
        if name.startswith('read_'):

            def func(*args, **kwargs):
                raise frame_base.WontImplementError(
                    'Use p | apache_beam.dataframe.io.%s' % name)

            return func
        res = getattr(pd, name)
        if _is_top_level_function(res):
            return frame_base.not_implemented_method(name, base_type=pd)
        else:
            return res