示例#1
0
    def test_only_caches_same_input(self):
        # One cached constant feeds both the 'ident' computation and, directly,
        # the 'add' computation. After replacement, every placeholder found in
        # the trace must equal the one recorded for that constant.
        shared_input = expressions.ConstantExpression(0)
        passthrough = expressions.ComputedExpression(
            'ident', lambda x: x, [shared_input])
        summed = expressions.ComputedExpression(
            'add', lambda x, y: x + y, [passthrough, shared_input])

        self.mock_cache(shared_input)
        replaced = self.cache.replace_with_cached(summed)

        # Gather the distinct placeholders before checking the trace shape.
        placeholders = {
            node
            for node in self.create_trace(summed)
            if isinstance(node, expressions.PlaceholderExpression)
        }
        self.assertTraceTypes(summed, [
            expressions.ComputedExpression,
            expressions.ComputedExpression,
            expressions.PlaceholderExpression,
            expressions.PlaceholderExpression,
        ])
        self.assertTrue(
            all(node == replaced[shared_input._id] for node in placeholders))
        self.assertIn(shared_input._id, replaced)
示例#2
0
  def dot(self, other):
    """Build a deferred 'dot' expression computing ``self @ other``.

    ``other`` may be a DeferredFrame or a plain (non-deferred) value. Either
    way it is boxed in a small holder object before being handed to the 'dot'
    computation, which unboxes it via ``.value``.

    Returns:
      A DeferredFrame wrapping the 'dot' ComputedExpression.
    """
    # We want to broadcast the right hand side to all partitions of the left.
    # This is OK, as its index must be the same size as the columns set of
    # self, so cannot be too large.
    class AsScalar(object):
      def __init__(self, value):
        self.value = value

    def box(df):
      return AsScalar(df)

    def matmul(left, right):
      return left @ right.value

    if not isinstance(other, frame_base.DeferredFrame):
      # Plain value: the proxy is an empty frame with one column per entry of
      # the first element of `other`.
      proxy = pd.DataFrame(columns=range(len(other[0])))
      side = expressions.ConstantExpression(AsScalar(other))
    else:
      proxy = other._expr.proxy()
      # Requiring Singleton partitioning is a non-parallel operation, so it has
      # to be explicitly allowed here.
      with expressions.allow_non_parallel_operations():
        side = expressions.ComputedExpression(
            'as_scalar',
            box,
            [other._expr],
            requires_partition_by=partitionings.Singleton())

    dot_expr = expressions.ComputedExpression(
        'dot',
        matmul,
        [self._expr, side],
        requires_partition_by=partitionings.Nothing(),
        preserves_partition_by=partitionings.Index(),
        proxy=proxy)
    return frame_base.DeferredFrame.wrap(dot_expr)
示例#3
0
    def test_only_replaces_cached(self):
        # Only expressions that are in the cache get swapped for placeholders.
        source = expressions.ConstantExpression(0)
        computed = expressions.ComputedExpression(
            'test', lambda x: x, [source])

        # Nothing is cached yet, so the trace must look the same before and
        # after asking for replacements.
        untouched_trace = [
            expressions.ComputedExpression,
            expressions.ConstantExpression,
        ]
        self.assertTraceTypes(computed, untouched_trace)
        self.cache.replace_with_cached(computed)
        self.assertTraceTypes(computed, untouched_trace)

        # Now "cache" the input and check that it was replaced with a
        # placeholder.
        self.mock_cache(source)
        replaced = self.cache.replace_with_cached(computed)

        self.assertTraceTypes(computed, [
            expressions.ComputedExpression,
            expressions.PlaceholderExpression,
        ])
        self.assertIn(source._id, replaced)
    def test_preserves_index_output_partitioning(self):
        # An expression declared to preserve Index([0, 1]) should hand back
        # Singleton and any subset of those two levels unchanged, and report
        # Arbitrary for every other input partitioning.

        # Empty DataFrame with two columns and two index levels
        empty_frame = pd.DataFrame(columns=["foo", "bar"], index=[[], []])
        input_expr = expressions.ConstantExpression(empty_frame)

        def append_foo_level(df):
            # This adds an additional index level, so we'd only preserve
            # partitioning on the two index levels that existed before.
            return df.set_index('foo', append=True)

        preserves_partial_index = expressions.ComputedExpression(
            'preserves_partial_index',
            append_foo_level,
            [input_expr],
            requires_partition_by=partitionings.Arbitrary(),
            preserves_partition_by=partitionings.Index([0, 1]))

        kept = [
            partitionings.Singleton(),
            partitionings.Index([0]),
            partitionings.Index([1]),
            partitionings.Index([0, 1]),
        ]
        for partitioning in kept:
            result = expressions.output_partitioning(
                preserves_partial_index, partitioning)
            self.assertEqual(
                result, partitioning, f"Should preserve {partitioning}")

        dropped = [
            partitionings.Index([0, 1, 2]),
            partitionings.Index(),
            partitionings.Arbitrary(),
        ]
        for partitioning in dropped:
            result = expressions.output_partitioning(
                preserves_partial_index, partitioning)
            self.assertEqual(
                result,
                partitionings.Arbitrary(),
                f"Should NOT preserve {partitioning}")
    def test_preserves_singleton_output_partitioning(self):
        # An expression declared to preserve only Singleton should hand back
        # Singleton unchanged and report Arbitrary for every other input
        # partitioning.

        # Empty DataFrame with one column and two index levels
        empty_frame = pd.DataFrame(columns=["column"], index=[[], []])
        input_expr = expressions.ConstantExpression(empty_frame)

        def replace_index(df):
            # index is replaced with an entirely new one, so
            # if we were partitioned by Index we're not anymore.
            return df.set_index('column')

        preserves_only_singleton = expressions.ComputedExpression(
            'preserves_only_singleton',
            replace_index,
            [input_expr],
            requires_partition_by=partitionings.Arbitrary(),
            preserves_partition_by=partitionings.Singleton())

        kept = [partitionings.Singleton()]
        for partitioning in kept:
            result = expressions.output_partitioning(
                preserves_only_singleton, partitioning)
            self.assertEqual(
                result, partitioning, f"Should preserve {partitioning}")

        dropped = [
            partitionings.Index([0]),
            partitionings.Index(),
            partitionings.Arbitrary(),
        ]
        for partitioning in dropped:
            result = expressions.output_partitioning(
                preserves_only_singleton, partitioning)
            self.assertEqual(
                result,
                partitionings.Arbitrary(),
                f"Should NOT preserve {partitioning}")
示例#6
0
 def wrapper(*args, **kwargs):
     # Call the pandas function called `name` (taken from the enclosing scope).
     # If the result's exact type is a key of DeferredBase._pandas_type_map,
     # return it wrapped as a deferred constant whose second argument is the
     # empty slice result[0:0]; otherwise return the raw result.
     result = getattr(pd, name)(*args, **kwargs)
     if type(result) not in frame_base.DeferredBase._pandas_type_map:
         return result
     constant = expressions.ConstantExpression(result, result[0:0])
     return frame_base.DeferredBase.wrap(constant)
示例#7
0
  def concat(
      objs,
      axis,
      join,
      ignore_index,
      keys,
      levels,
      names,
      verify_integrity,
      sort,
      copy):
    """Build a deferred 'concat' expression that applies ``pd.concat``.

    Args:
      objs: a sequence, or a Mapping, of deferred objects. Entries may be
        None; those are passed through to ``pd.concat`` as a constant None.
      axis, join, keys, names, verify_integrity: forwarded to ``pd.concat``.
        When ``objs`` is a Mapping and ``keys`` is None, the mapping's keys
        are used.
      ignore_index, levels: must be falsy.
      sort, copy: accepted but not forwarded to ``pd.concat``.
        NOTE(review): confirm that ignoring these is intentional.

    Returns:
      The wrapped 'concat' ComputedExpression.

    Raises:
      NotImplementedError: if ``ignore_index`` or ``levels`` is truthy.
    """
    if ignore_index:
      raise NotImplementedError('concat(ignore_index)')
    if levels:
      raise NotImplementedError('concat(levels)')

    if isinstance(objs, Mapping):
      if keys is None:
        keys = list(objs.keys())
      objs = [objs[k] for k in keys]
    else:
      objs = list(objs)

    if keys is None:
      preserves_partitioning = partitionings.Arbitrary()
    else:
      # Index 0 will be a new index for keys, only partitioning by the original
      # indexes (1 to N) will be preserved.
      # None entries are allowed in objs (see exprs below) and have no
      # expression to inspect, so they are skipped here.
      nlevels = min(
          o._expr.proxy().index.nlevels for o in objs if o is not None)
      preserves_partitioning = partitionings.Index(
          list(range(1, nlevels + 1)))

    deferred_none = expressions.ConstantExpression(None)
    exprs = [deferred_none if o is None else o._expr for o in objs]

    if axis in (1, 'columns'):
      required_partitioning = partitionings.Index()
    elif verify_integrity:
      required_partitioning = partitionings.Index()
    else:
      required_partitioning = partitionings.Arbitrary()

    return frame_base.DeferredBase.wrap(
        expressions.ComputedExpression(
            'concat',
            lambda *objs: pd.concat(
                objs,
                axis=axis,
                join=join,
                ignore_index=ignore_index,
                keys=keys,
                levels=levels,
                names=names,
                verify_integrity=verify_integrity),  # yapf break
            exprs,
            requires_partition_by=required_partitioning,
            preserves_partition_by=preserves_partitioning))
示例#8
0
 def _run_test(self, func, *args):
   """Check that `func` gives equal results on plain and deferred inputs.

   Each argument is wrapped as a deferred constant (with the empty slice
   arg[0:0] as second argument), `func` is applied to both the plain and the
   wrapped arguments, and the deferred result is evaluated in an empty
   Session and compared with ``expected.equals``.
   """
   deferred_args = []
   for arg in args:
     constant = expressions.ConstantExpression(arg, arg[0:0])
     deferred_args.append(frame_base.DeferredFrame.wrap(constant))

   expected = func(*args)
   deferred_result = func(*deferred_args)
   actual = expressions.Session({}).evaluate(deferred_result._expr)

   failure_message = 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual)
   self.assertTrue(expected.equals(actual), failure_message)
示例#9
0
    def wrapper(*args, **kwargs):
        # Deferred stand-in for `func`. `func`, `name`, `inplace` and the two
        # partitioning settings come from the enclosing scope. Positional
        # arguments that are deferred objects or pandas NDFrames become inputs
        # of a ComputedExpression; everything else is captured as a constant.

        # NOTE: restriction checking is currently switched off. The loop below
        # iterates an empty tuple instead of restrictions.items(), so its body
        # never runs.
        for key, values in ():  #restrictions.items():
            if key in kwargs:
                value = kwargs[key]
            else:
                try:
                    # pylint: disable=deprecated-method
                    ix = inspect.getargspec(func).args.index(key)
                except ValueError:
                    # TODO: fix for delegation?
                    continue
                if len(args) <= ix:
                    continue
                value = args[ix]
            if not isinstance(values, list):
                values = [values]
            if value not in values:
                raise NotImplementedError('%s=%s not supported for %s' %
                                          (key, value, name))

        deferred_positions = []
        deferred_inputs = []
        constant_args = [None] * len(args)
        for position, arg in enumerate(args):
            if isinstance(arg, DeferredBase):
                input_expr = arg._expr
            elif isinstance(arg, pd.core.generic.NDFrame):
                input_expr = expressions.ConstantExpression(arg, arg[0:0])
            else:
                constant_args[position] = arg
                continue
            deferred_positions.append(position)
            deferred_inputs.append(input_expr)

        actual_func = copy_and_mutate(func) if inplace else func

        def apply(*actual_args):
            # Rebuild the full positional list: constants stay where they
            # were, evaluated inputs fill the deferred slots.
            full_args = list(constant_args)
            for position, evaluated in zip(deferred_positions, actual_args):
                full_args[position] = evaluated
            return actual_func(*full_args, **kwargs)

        result_expr = expressions.ComputedExpression(
            name,
            apply,
            deferred_inputs,
            requires_partition_by=requires_partition_by,
            preserves_partition_by=preserves_partition_by)

        if not inplace:
            return DeferredFrame.wrap(result_expr)
        # In-place: rebind the first argument's expression; nothing returned.
        args[0]._expr = result_expr
示例#10
0
    def test_only_replaces_inputs(self):
        # Caching an intermediate computation should replace that node with a
        # placeholder and remove its own inputs from the expression tree.
        zero = expressions.ConstantExpression(0)
        cached_ident = expressions.ComputedExpression(
            'ident', lambda x: x, [zero])

        one = expressions.ConstantExpression(1)
        total = expressions.ComputedExpression(
            'add', lambda x, y: x + y, [cached_ident, one])

        self.mock_cache(cached_ident)
        replaced = self.cache.replace_with_cached(total)

        self.assertTraceTypes(total, [
            expressions.ComputedExpression,
            expressions.PlaceholderExpression,
            expressions.ConstantExpression,
        ])
        self.assertIn(cached_ident._id, replaced)
        self.assertNotIn(zero, self.create_trace(total))
示例#11
0
    def _run_test(self, func, *args, distributed=True, expect_error=False):
        """Check that `func` behaves the same on plain and deferred inputs.

        Args:
          func: callable applied to ``*args`` and to their deferred wrappers.
          *args: inputs; each is wrapped as a deferred constant with the
            empty slice ``arg[0:0]`` as second argument.
          distributed: evaluate with ``expressions.PartitioningSession`` when
            true, otherwise ``expressions.Session``. When true and the
            expected value has ``equals``, both sides are index-sorted before
            comparison.
          expect_error: when true, both the plain and the deferred
            computation must raise, with the same exception type and the same
            ``str()``.

        Raises:
          AssertionError: if an error was expected but one of the
            computations returned normally, or if the results differ.
        """
        deferred_args = [
            frame_base.DeferredFrame.wrap(
                expressions.ConstantExpression(arg, arg[0:0])) for arg in args
        ]
        try:
            expected = func(*args)
        except Exception as e:
            if not expect_error:
                raise
            expected = e
        else:
            if expect_error:
                raise AssertionError(
                    "Expected an error but computing expected result successfully "
                    f"returned: {expected}")

        session_type = (expressions.PartitioningSession
                        if distributed else expressions.Session)
        try:
            actual = session_type({}).evaluate(func(*deferred_args)._expr)
        except Exception as e:
            if not expect_error:
                raise
            actual = e
        else:
            if expect_error:
                # Both fragments need the f prefix; without it on the first
                # one, "{expected}" is printed literally.
                raise AssertionError(
                    f"Expected an error:\n{expected}\nbut successfully "
                    f"returned:\n{actual}")

        if not expect_error:
            if hasattr(expected, 'equals'):
                if distributed:
                    cmp = lambda df: expected.sort_index().equals(df.
                                                                  sort_index())
                else:
                    cmp = expected.equals
            elif isinstance(expected, float):
                # Equal if both are NaN, both are zero, or the difference
                # relative to the sum of magnitudes is below 1e-8.
                cmp = lambda x: (math.isnan(x) and math.isnan(
                    expected)) or x == expected == 0 or abs(expected - x) / (
                        abs(expected) + abs(x)) < 1e-8
            else:
                cmp = expected.__eq__
            self.assertTrue(
                cmp(actual),
                'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
        else:
            self.assertIsInstance(actual, type(expected))
            self.assertEqual(str(actual), str(expected))
示例#12
0
 def fillna(self, value, method):
     """Build a deferred 'fillna' expression.

     Args:
       value: fill value; a DeferredBase contributes its expression, and
         anything else is wrapped in a ConstantExpression.
       method: must be None.

     Returns:
       A DeferredFrame wrapping the 'fillna' ComputedExpression.

     Raises:
       frame_base.WontImplementError: if ``method`` is not None.
     """
     if method is not None:
         raise frame_base.WontImplementError('order-sensitive')

     value_expr = (
         value._expr if isinstance(value, frame_base.DeferredBase) else
         expressions.ConstantExpression(value))

     def fill(df, value):
         return df.fillna(value, method=method)

     fill_expr = expressions.ComputedExpression(
         'fillna',
         fill,
         [self._expr, value_expr],
         preserves_partition_by=partitionings.Singleton(),
         requires_partition_by=partitionings.Nothing())
     return frame_base.DeferredFrame.wrap(fill_expr)
示例#13
0
    def test_dataframe_eval_query(self):
        # Run eval and query through the deferred test helpers on a random
        # 20x3 frame, then check that '@' references are rejected.
        df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])

        def assign_foo(frame):
            return frame.eval('foo = a + b - c')

        def filter_rows(frame):
            return frame.query('a > b + c')

        self._run_test(assign_foo, df)
        self._run_test(filter_rows, df)

        self._run_inplace_test(assign_foo, df)

        # Verify that attempting to access locals raises a useful error
        deferred_df = frame_base.DeferredFrame.wrap(
            expressions.ConstantExpression(df, df[0:0]))
        with self.assertRaises(NotImplementedError):
            deferred_df.eval('foo = a + @b - c')
        with self.assertRaises(NotImplementedError):
            deferred_df.query('a > @b + c')
示例#14
0
  def concat(
      objs,
      axis,
      join,
      ignore_index,
      keys,
      levels,
      names,
      verify_integrity,
      sort,
      copy):
    """Build a deferred 'concat' expression that applies ``pd.concat``.

    ``objs`` may be a sequence or a Mapping of deferred objects (None entries
    become a constant None). For a Mapping with ``keys`` unset, the mapping's
    keys are used. ``sort`` and ``copy`` are accepted but not forwarded.

    Raises:
      NotImplementedError: if ``ignore_index`` or ``levels`` is truthy.
    """
    if ignore_index:
      raise NotImplementedError('concat(ignore_index)')
    if levels:
      raise NotImplementedError('concat(levels)')

    if not isinstance(objs, Mapping):
      objs = list(objs)
    else:
      if keys is None:
        keys = list(objs.keys())
      objs = [objs[k] for k in keys]

    deferred_none = expressions.ConstantExpression(None)
    exprs = []
    for o in objs:
      exprs.append(deferred_none if o is None else o._expr)

    # Column-wise concatenation and integrity verification both require
    # Index partitioning; otherwise no partitioning is required.
    if axis in (1, 'columns') or verify_integrity:
      required_partitioning = partitionings.Index()
    else:
      required_partitioning = partitionings.Nothing()

    def do_concat(*objs):
      return pd.concat(
          objs,
          axis=axis,
          join=join,
          ignore_index=ignore_index,
          keys=keys,
          levels=levels,
          names=names,
          verify_integrity=verify_integrity)

    concat_expr = expressions.ComputedExpression(
        'concat',
        do_concat,
        exprs,
        requires_partition_by=required_partitioning,
        preserves_partition_by=partitionings.Index())
    return frame_base.DeferredBase.wrap(concat_expr)
示例#15
0
 def _run_test(self, func, *args, distributed=False):
   """Check that `func` gives matching results on plain and deferred inputs.

   Arguments are wrapped as deferred constants, `func` is applied to both
   forms, and the deferred result is evaluated in a PartitioningSession
   (when `distributed`) or a plain Session. How the results are compared
   depends on the expected value: ``equals`` if it has one (after sorting
   both sides by index when distributed), a tolerance check for floats,
   and ``__eq__`` otherwise.
   """
   deferred_args = []
   for arg in args:
     constant = expressions.ConstantExpression(arg, arg[0:0])
     deferred_args.append(frame_base.DeferredFrame.wrap(constant))
   expected = func(*args)

   if distributed:
     session_type = expressions.PartitioningSession
   else:
     session_type = expressions.Session
   actual = session_type({}).evaluate(func(*deferred_args)._expr)

   if hasattr(expected, 'equals'):
     if distributed:
       def cmp(df):
         return expected.sort_index().equals(df.sort_index())
     else:
       cmp = expected.equals
   elif isinstance(expected, float):
     def cmp(x):
       # Equal if both are NaN, both are zero, or the difference relative
       # to the sum of magnitudes is below 1e-8.
       if math.isnan(x) and math.isnan(expected):
         return True
       if x == expected == 0:
         return True
       return abs(expected - x) / (abs(expected) + abs(x)) < 1e-8
   else:
     cmp = expected.__eq__

   failure_message = 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual)
   self.assertTrue(cmp(actual), failure_message)
def _maybe_wrap_constant_expr(res):
    """Wrap `res` as a deferred constant if its type is a known pandas type.

    "Known" means the exact type of `res` is a key of
    ``frame_base.DeferredBase._pandas_type_map``. The constant's second
    argument is the empty slice ``res[0:0]``. Any other value is returned
    unchanged.
    """
    if type(res) not in frame_base.DeferredBase._pandas_type_map:
        return res
    constant = expressions.ConstantExpression(res, res[0:0])
    return frame_base.DeferredBase.wrap(constant)
示例#17
0
    def wrapper(*args, **kwargs):
        # Deferred stand-in for `func`. `func`, `name`, `restrictions`,
        # `inplace` and the two partitioning settings come from the enclosing
        # scope. Steps:
        #   1. Reject argument values that violate `restrictions`
        #      (raises NotImplementedError).
        #   2. Split positional and keyword arguments into deferred inputs
        #      (deferred objects, deferred indexes, pandas NDFrames) and
        #      plain constants.
        #   3. Build a ComputedExpression that re-assembles the call once the
        #      deferred inputs have been evaluated.
        # Returns a wrapped DeferredFrame, or, when `inplace`, rebinds
        # args[0]._expr and returns None.
        for key, values in restrictions.items():
            if key in kwargs:
                value = kwargs[key]
            else:
                try:
                    ix = _getargspec(func).args.index(key)
                except ValueError:
                    # TODO: fix for delegation?
                    continue
                if len(args) <= ix:
                    continue
                value = args[ix]
            # A restriction is a predicate, a list of allowed values, or a
            # single allowed value.
            if callable(values):
                check = values
            elif isinstance(values, list):
                check = lambda x, values=values: x in values
            else:
                # Compare against the single permitted value. Binding the
                # supplied argument here instead would compare it with itself
                # and never reject anything.
                check = lambda x, values=values: x == values

            if not check(value):
                raise NotImplementedError('%s=%s not supported for %s' %
                                          (key, value, name))
        deferred_arg_indices = []
        deferred_arg_exprs = []
        constant_args = [None] * len(args)
        # Imported inside the function rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from apache_beam.dataframe.frames import _DeferredIndex
        for ix, arg in enumerate(args):
            if isinstance(arg, DeferredBase):
                deferred_arg_indices.append(ix)
                deferred_arg_exprs.append(arg._expr)
            elif isinstance(arg, _DeferredIndex):
                # TODO(robertwb): Consider letting indices pass through as indices.
                # This would require updating the partitioning code, as indices don't
                # have indices.
                deferred_arg_indices.append(ix)
                deferred_arg_exprs.append(
                    expressions.ComputedExpression(
                        'index_as_series',
                        lambda ix: ix.index.to_series(),  # yapf break
                        [arg._frame._expr],
                        preserves_partition_by=partitionings.Singleton(),
                        requires_partition_by=partitionings.Nothing()))
            elif isinstance(arg, pd.core.generic.NDFrame):
                deferred_arg_indices.append(ix)
                deferred_arg_exprs.append(
                    expressions.ConstantExpression(arg, arg[0:0]))
            else:
                constant_args[ix] = arg

        deferred_kwarg_keys = []
        deferred_kwarg_exprs = []
        constant_kwargs = {key: None for key in kwargs}
        for key, arg in kwargs.items():
            if isinstance(arg, DeferredBase):
                deferred_kwarg_keys.append(key)
                deferred_kwarg_exprs.append(arg._expr)
            elif isinstance(arg, pd.core.generic.NDFrame):
                deferred_kwarg_keys.append(key)
                deferred_kwarg_exprs.append(
                    expressions.ConstantExpression(arg, arg[0:0]))
            else:
                constant_kwargs[key] = arg

        # Positional inputs come first, then keyword inputs; apply() relies
        # on this order to split them apart again.
        deferred_exprs = deferred_arg_exprs + deferred_kwarg_exprs

        if inplace:
            actual_func = copy_and_mutate(func)
        else:
            actual_func = func

        def apply(*actual_args):
            actual_args, actual_kwargs = (
                actual_args[:len(deferred_arg_exprs)],
                actual_args[len(deferred_arg_exprs):])

            full_args = list(constant_args)
            for ix, arg in zip(deferred_arg_indices, actual_args):
                full_args[ix] = arg

            full_kwargs = dict(constant_kwargs)
            for key, arg in zip(deferred_kwarg_keys, actual_kwargs):
                full_kwargs[key] = arg

            return actual_func(*full_args, **full_kwargs)

        if (not requires_partition_by.is_subpartitioning_of(
                partitionings.Index()) and sum(
                    isinstance(arg.proxy(), pd.core.generic.NDFrame)
                    for arg in deferred_exprs) > 1):
            # Implicit join on index if there is more than one indexed input.
            actual_requires_partition_by = partitionings.Index()
        else:
            actual_requires_partition_by = requires_partition_by

        result_expr = expressions.ComputedExpression(
            name,
            apply,
            deferred_exprs,
            requires_partition_by=actual_requires_partition_by,
            preserves_partition_by=preserves_partition_by)
        if inplace:
            args[0]._expr = result_expr

        else:
            return DeferredFrame.wrap(result_expr)
示例#18
0
def _get_deferred_args(*args):
    """Return a list with each argument wrapped as a deferred constant.

    Each constant's second argument is the empty slice ``arg[0:0]``.
    """
    def defer(value):
        constant = expressions.ConstantExpression(value, value[0:0])
        return frame_base.DeferredFrame.wrap(constant)

    return list(map(defer, args))
示例#19
0
 def test_constant_expresion(self):
   # Evaluating a ConstantExpression in an empty Session yields its value.
   constant = expressions.ConstantExpression(2)
   empty_session = expressions.Session({})
   evaluated = empty_session.evaluate(constant)
   self.assertEqual(evaluated, 2)
示例#20
0
    def _run_test(self, func, *args, distributed=True, expect_error=False):
        """Check that `func` behaves the same on plain and deferred inputs.

        Args:
          func: callable applied to ``*args`` and to their deferred wrappers.
          *args: inputs; each is wrapped as a deferred constant with the
            empty slice ``arg[0:0]`` as second argument.
          distributed: evaluate with ``expressions.PartitioningSession`` when
            true, otherwise ``expressions.Session``. When true, pandas
            results are sorted before comparison.
          expect_error: when true, both computations must raise, with the
            same exception type and the same ``str()``.

        Raises:
          AssertionError: if an error was expected but a computation
            returned normally, if the raised errors differ, or if the
            results differ.
          ValueError: if the expected value is an NDFrame that is neither a
            Series nor a DataFrame.
        """
        deferred_args = [
            frame_base.DeferredFrame.wrap(
                expressions.ConstantExpression(arg, arg[0:0])) for arg in args
        ]
        try:
            expected = func(*args)
        except Exception as e:
            if not expect_error:
                raise
            expected = e
        else:
            if expect_error:
                raise AssertionError(
                    "Expected an error but computing expected result successfully "
                    f"returned: {expected}")

        session_type = (expressions.PartitioningSession
                        if distributed else expressions.Session)
        try:
            actual = session_type({}).evaluate(func(*deferred_args)._expr)
        except Exception as e:
            if not expect_error:
                raise
            actual = e
        else:
            if expect_error:
                # Both fragments need the f prefix; without it on the first
                # one, "{expected}" is printed literally.
                raise AssertionError(
                    f"Expected an error:\n{expected}\nbut successfully "
                    f"returned:\n{actual}")

        if expect_error:
            if not isinstance(
                    actual,
                    type(expected)) or not str(actual) == str(expected):
                raise AssertionError(
                    f'Expected {expected!r} to be raised, but got {actual!r}'
                ) from actual
        else:
            if isinstance(expected, pd.core.generic.NDFrame):
                if distributed:
                    # Sort both sides the same way before comparing.
                    if expected.index.is_unique:
                        expected = expected.sort_index()
                        actual = actual.sort_index()
                    elif isinstance(expected, pd.Series):
                        # A Series has no `.columns`; sort by its values.
                        expected = expected.sort_values()
                        actual = actual.sort_values()
                    else:
                        expected = expected.sort_values(list(expected.columns))
                        actual = actual.sort_values(list(actual.columns))

                if isinstance(expected, pd.Series):
                    pd.testing.assert_series_equal(expected, actual)
                elif isinstance(expected, pd.DataFrame):
                    pd.testing.assert_frame_equal(expected, actual)
                else:
                    raise ValueError(f"Expected value is a {type(expected)}, "
                                     "not a Series or DataFrame.")
            else:
                # Expectation is not a pandas object
                if isinstance(expected, float):
                    cmp = lambda x: np.isclose(expected, x)
                else:
                    cmp = expected.__eq__
                self.assertTrue(
                    cmp(actual),
                    'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
示例#21
0
  def wrapper(*args, **kwargs):
    # Deferred stand-in for `func`. `func`, `name`, `restrictions`, `inplace`
    # and the two partitioning settings come from the enclosing scope. Steps:
    #   1. Reject argument values that violate `restrictions`
    #      (raises NotImplementedError).
    #   2. Split positional and keyword arguments into deferred inputs
    #      (deferred objects, pandas NDFrames) and plain constants.
    #   3. Build a ComputedExpression that re-assembles the call once the
    #      deferred inputs have been evaluated.
    # Returns a wrapped DeferredFrame, or, when `inplace`, rebinds
    # args[0]._expr and returns None.
    for key, values in restrictions.items():
      if key in kwargs:
        value = kwargs[key]
      else:
        try:
          ix = _getargspec(func).args.index(key)
        except ValueError:
          # TODO: fix for delegation?
          continue
        if len(args) <= ix:
          continue
        value = args[ix]
      # A restriction is a predicate, a list of allowed values, or a single
      # allowed value.
      if callable(values):
        check = values
      elif isinstance(values, list):
        check = lambda x, values=values: x in values
      else:
        # Compare against the single permitted value. Binding the supplied
        # argument here instead would compare it with itself and never
        # reject anything.
        check = lambda x, values=values: x == values

      if not check(value):
        raise NotImplementedError(
            '%s=%s not supported for %s' % (key, value, name))
    deferred_arg_indices = []
    deferred_arg_exprs = []
    constant_args = [None] * len(args)
    for ix, arg in enumerate(args):
      if isinstance(arg, DeferredBase):
        deferred_arg_indices.append(ix)
        deferred_arg_exprs.append(arg._expr)
      elif isinstance(arg, pd.core.generic.NDFrame):
        deferred_arg_indices.append(ix)
        deferred_arg_exprs.append(expressions.ConstantExpression(arg, arg[0:0]))
      else:
        constant_args[ix] = arg

    deferred_kwarg_keys = []
    deferred_kwarg_exprs = []
    constant_kwargs = {key: None for key in kwargs}
    for key, arg in kwargs.items():
      if isinstance(arg, DeferredBase):
        deferred_kwarg_keys.append(key)
        deferred_kwarg_exprs.append(arg._expr)
      elif isinstance(arg, pd.core.generic.NDFrame):
        deferred_kwarg_keys.append(key)
        deferred_kwarg_exprs.append(
            expressions.ConstantExpression(arg, arg[0:0]))
      else:
        constant_kwargs[key] = arg

    # Positional inputs come first, then keyword inputs; apply() relies on
    # this order to split them apart again.
    deferred_exprs = deferred_arg_exprs + deferred_kwarg_exprs

    if inplace:
      actual_func = copy_and_mutate(func)
    else:
      actual_func = func

    def apply(*actual_args):
      actual_args, actual_kwargs = (actual_args[:len(deferred_arg_exprs)],
                                    actual_args[len(deferred_arg_exprs):])

      full_args = list(constant_args)
      for ix, arg in zip(deferred_arg_indices, actual_args):
        full_args[ix] = arg

      full_kwargs = dict(constant_kwargs)
      for key, arg in zip(deferred_kwarg_keys, actual_kwargs):
        full_kwargs[key] = arg

      return actual_func(*full_args, **full_kwargs)

    if any(isinstance(arg.proxy(), pd.core.generic.NDFrame) for arg in
           deferred_exprs) and not requires_partition_by.is_subpartitioning_of(
               partitionings.Index()):
      # Implicit join on index.
      actual_requires_partition_by = partitionings.Index()
    else:
      actual_requires_partition_by = requires_partition_by

    result_expr = expressions.ComputedExpression(
        name,
        apply,
        deferred_exprs,
        requires_partition_by=actual_requires_partition_by,
        preserves_partition_by=preserves_partition_by)
    if inplace:
      args[0]._expr = result_expr

    else:
      return DeferredFrame.wrap(result_expr)