def run_scenario(self, input, func):
    """Check ``func`` eagerly, through a deferred expression, and in a pipeline.

    ``func`` is first applied directly to ``input`` to obtain the expected
    result. The same function is then applied to a deferred frame wrapping a
    placeholder and evaluated in a session, and finally run inside a Beam
    pipeline over two interleaved slices of ``input``. Each result is
    compared against the expected value with ``check_correct`` (defined
    outside this method, as is ``concat``).

    Args:
        input: object supporting ``.iloc`` slicing; a zero-row slice of it
            is used as the proxy.
        func: callable applied both to ``input`` and to its deferred
            counterparts.
    """
    expected = func(input)

    # A zero-row slice stands in as the proxy for the real data.
    proxy = input.iloc[0:0]
    placeholder = expressions.PlaceholderExpression(proxy)
    deferred_input = frame_base.DeferredFrame.wrap(placeholder)
    deferred_expr = func(deferred_input)._expr
    session = expressions.Session({placeholder: input})

    check_correct(expected, deferred_expr.evaluate_at(session))

    def matches_expected(actual):
        # Glue the pipeline's partitions back together before comparing.
        return check_correct(expected, concat(actual))

    with beam.Pipeline() as p:
        # Feed the input as two interleaved partitions.
        partitions = [input.iloc[::2], input.iloc[1::2]]
        input_pcoll = p | beam.Create(partitions)
        output_df = func(convert.to_dataframe(input_pcoll, proxy=proxy))

        output_proxy = output_df._expr.proxy()
        if not isinstance(output_proxy, pd.core.generic.NDFrame):
            # Non-frame proxies are only compared by type.
            self.assertEqual(type(output_proxy), type(expected))
        else:
            proxy_head = output_proxy.iloc[:0]
            expected_head = expected.iloc[:0]
            self.assertTrue(
                proxy_head.equals(expected_head),
                ('Output proxy is incorrect:\n'
                 f'Expected:\n{expected_head}\n\n'
                 f'Actual:\n{proxy_head}'))

        output_pcoll = convert.to_pcollection(
            output_df, yield_elements='pandas')

        assert_that(output_pcoll, matches_expected)
Exemplo n.º 2
0
 def evaluate(partition, stage=self.stage):
     """Yield every output of ``stage`` evaluated against ``partition``.

     Each input expression of the stage is bound to ``partition[expr._id]``
     in a fresh session; each output expression is then evaluated in that
     session and emitted as a ``TaggedOutput`` tagged with its ``_id``.
     """
     bindings = {}
     for source in stage.inputs:
         bindings[source] = partition[source._id]
     session = expressions.Session(bindings)

     for target in stage.outputs:
         yield beam.pvalue.TaggedOutput(
             target._id, target.evaluate_at(session))
Exemplo n.º 3
0
  def run_scenario(self, input, func):
    """Verify ``func`` gives the same frame eagerly, deferred, and in Beam.

    The expected value is ``func(input)``. The deferred result (evaluated
    in a session binding a placeholder to ``input``) and the concatenated
    output of a ``DataframeTransform`` pipeline over two interleaved slices
    of ``input`` must both equal it after sorting by index.

    Args:
      input: sliceable object; ``input[0:0]`` is used as the proxy.
      func: callable applied to ``input`` and to the deferred frame.

    Raises:
      AssertionError: if a result is missing or differs from the expected
        value.
    """
    expected = func(input)

    def verify(actual):
      # None signals that there was nothing to concatenate.
      if actual is None:
        raise AssertionError('Empty frame but expected: \n\n%s' % expected)
      actual_sorted = actual.sort_index()
      expected_sorted = expected.sort_index()
      if actual_sorted.equals(expected_sorted):
        return
      raise AssertionError(
          'Dataframes not equal: \n\n%s\n\n%s' %
          (actual_sorted, expected_sorted))

    def verify_partitions(parts):
      combined = pd.concat(parts) if parts else None
      return verify(combined)

    proxy = input[0:0]
    placeholder = expressions.PlaceholderExpression(proxy)
    deferred_input = frame_base.DeferredFrame.wrap(placeholder)
    deferred_expr = func(deferred_input)._expr
    session = expressions.Session({placeholder: input})

    verify(deferred_expr.evaluate_at(session))

    with beam.Pipeline() as p:
      # Two interleaved slices act as separate partitions of the input.
      halves = [input[::2], input[1::2]]
      input_pcoll = p | beam.Create(halves)
      output_pcoll = input_pcoll | transforms.DataframeTransform(
          func, proxy=proxy)
      assert_that(output_pcoll, verify_partitions)
Exemplo n.º 4
0
 def test_computed_expression(self):
     """An 'add' expression over two bound placeholders evaluates to 3."""
     lhs = expressions.PlaceholderExpression(0)
     rhs = expressions.PlaceholderExpression(0)
     total = expressions.ComputedExpression(
         'add', lambda a, b: a + b, [lhs, rhs])

     # Bind the placeholders to concrete values and evaluate the sum.
     bindings = {lhs: 1, rhs: 2}
     session = expressions.Session(bindings)
     result = session.evaluate(total)
     self.assertEqual(result, 3)
Exemplo n.º 5
0
 def compute_using_session(self, to_compute):
   """Evaluate each frame in ``to_compute`` against the environment inputs.

   A single session is built from ``self._env._inputs`` and reused for
   every frame.

   Args:
     to_compute: mapping of name to an object exposing ``_expr``.

   Returns:
     A dict mapping each name to its expression's evaluated value.
   """
   session = expressions.Session(self._env._inputs)
   results = {}
   for name, frame in to_compute.items():
     results[name] = frame._expr.evaluate_at(session)
   return results
Exemplo n.º 6
0
 def evaluate(partition, stage=self.stage, **side_inputs):
     """Yield every output of ``stage``, tagged with its expression id.

     Expressions in ``tabular_inputs`` are bound to ``partition[expr._id]``
     and expressions in ``scalar_inputs`` to ``side_inputs[expr._id]``
     (both collections come from the enclosing scope). Scalar bindings are
     applied last, so they win if an expression appears in both.
     """
     bindings = {expr: partition[expr._id] for expr in tabular_inputs}
     bindings.update(
         (expr, side_inputs[expr._id]) for expr in scalar_inputs)
     session = expressions.Session(bindings)

     for target in stage.outputs:
         yield beam.pvalue.TaggedOutput(
             target._id, target.evaluate_at(session))
Exemplo n.º 7
0
 def _run_test(self, func, *args):
   """Assert ``func`` gives equal results on eager and deferred arguments.

   Each argument is wrapped as a constant expression whose proxy is the
   argument's zero-length slice. ``func`` is applied to the wrapped
   arguments and the resulting expression is evaluated in an empty
   session; the outcome must ``equals`` the eager ``func(*args)``.
   """
   deferred_args = []
   for arg in args:
     constant = expressions.ConstantExpression(arg, arg[0:0])
     deferred_args.append(frame_base.DeferredFrame.wrap(constant))

   expected = func(*args)
   session = expressions.Session({})
   actual = session.evaluate(func(*deferred_args)._expr)

   matches = expected.equals(actual)
   self.assertTrue(
       matches, 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
Exemplo n.º 8
0
        def evaluate(partition, stage=self.stage, **side_inputs):
          """Yield every output of ``stage``, tagged with its expression id.

          Tabular inputs are read from ``partition`` and scalar inputs from
          ``side_inputs``; ``tabular_inputs`` and ``scalar_inputs`` come
          from the enclosing scope.
          """
          def lookup(expr):
            # Use an empty slice of the proxy if there's no data in this
            # partition.
            data = partition[expr._id]
            if data is None:
              return expr.proxy().iloc[:0]
            return data

          bindings = {expr: lookup(expr) for expr in tabular_inputs}
          bindings.update(
              (expr, side_inputs[expr._id]) for expr in scalar_inputs)
          session = expressions.Session(bindings)

          for target in stage.outputs:
            yield beam.pvalue.TaggedOutput(
                target._id, target.evaluate_at(session))
Exemplo n.º 9
0
  def test_elementwise_func(self):
    """An elementwise subtraction accepts deferred, scalar and Series args.

    Two deferred series are bound to concrete series in a session; the
    wrapped subtraction is then checked for every tested combination of
    deferred, scalar, and concrete pandas operands.
    """
    a = pd.Series([1, 2, 3])
    b = pd.Series([100, 200, 300])
    proxy = a[:0]
    x = frames.DeferredSeries(expressions.PlaceholderExpression(proxy))
    y = frames.DeferredSeries(expressions.PlaceholderExpression(proxy))
    sub = frame_base._elementwise_function(lambda x, y: x - y)

    session = expressions.Session({x._expr: a, y._expr: b})

    def evaluated(deferred):
      return deferred._expr.evaluate_at(session)

    # deferred - deferred
    self.assertTrue(evaluated(sub(x, y)).equals(a - b))
    # deferred - scalar, and scalar - deferred
    self.assertTrue(evaluated(sub(x, 1)).equals(a - 1))
    self.assertTrue(evaluated(sub(1, x)).equals(1 - a))
    # mixing deferred and concrete Series on either side
    self.assertTrue(evaluated(sub(x, b)).equals(a - b))
    self.assertTrue(evaluated(sub(a, y)).equals(a - b))
Exemplo n.º 10
0
    def run_scenario(self, input, func):
        """Compare ``func`` run eagerly, deferred, and via DataframeTransform.

        ``func(input)`` is the expected value. The deferred result and the
        concatenated pipeline output (requested with
        ``yield_elements='pandas'``) are each compared against it using
        ``check_correct``; both ``check_correct`` and ``concat`` are
        defined outside this method.

        Args:
            input: sliceable object; ``input[0:0]`` is used as the proxy.
            func: callable applied to ``input`` and to the deferred frame.
        """
        expected = func(input)

        proxy = input[0:0]
        placeholder = expressions.PlaceholderExpression(proxy)
        deferred_input = frame_base.DeferredFrame.wrap(placeholder)
        deferred_expr = func(deferred_input)._expr
        session = expressions.Session({placeholder: input})

        check_correct(expected, deferred_expr.evaluate_at(session))

        def matches_expected(actual):
            # Glue the pipeline's partitions back together before comparing.
            return check_correct(expected, concat(actual))

        with beam.Pipeline() as p:
            # Two interleaved slices act as separate input partitions.
            halves = [input[::2], input[1::2]]
            input_pcoll = p | beam.Create(halves)
            transform = transforms.DataframeTransform(
                func, proxy=proxy, yield_elements='pandas')
            output_pcoll = input_pcoll | transform
            assert_that(output_pcoll, matches_expected)
Exemplo n.º 11
0
    def run_scenario(self, input, func):
        """Verify ``func`` agrees when run eagerly, deferred, and in Beam.

        ``func(input)`` is the expected value. The same function is applied
        to a deferred frame wrapping a placeholder and evaluated in a
        session, then run through ``DataframeTransform`` over two
        interleaved slices of ``input``. Frame results are compared after
        sorting by index; anything else is compared with ``!=``.

        Args:
            input: sliceable object; ``input[0:0]`` is used as the proxy.
            func: callable applied to ``input`` and to the deferred frame.

        Raises:
            AssertionError: if a result is missing or differs from the
                expected value.
        """
        expected = func(input)

        empty = input[0:0]
        input_placeholder = expressions.PlaceholderExpression(empty)
        input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
        actual_deferred = func(input_deferred)._expr.evaluate_at(
            expressions.Session({input_placeholder: input}))

        def concat(parts):
            """Combine partition outputs; return None when there are none."""
            if len(parts) > 1:
                return pd.concat(parts)
            elif len(parts) == 1:
                return parts[0]
            else:
                return None

        def check_correct(actual):
            """Raise AssertionError unless ``actual`` matches ``expected``."""
            if actual is None:
                # Wrap in a 1-tuple: ``expected`` may be a non-frame value
                # here, and a bare tuple operand would be unpacked by ``%``
                # (or raise TypeError), hiding the real failure.
                raise AssertionError('Empty frame but expected: \n\n%s' %
                                     (expected,))
            if isinstance(expected, pd.core.generic.NDFrame):
                # Compare by sorted index so row order does not matter.
                sorted_actual = actual.sort_index()
                sorted_expected = expected.sort_index()
                if not sorted_actual.equals(sorted_expected):
                    raise AssertionError('Dataframes not equal: \n\n%s\n\n%s' %
                                         (sorted_actual, sorted_expected))
            else:
                if actual != expected:
                    raise AssertionError('Scalars not equal: %s != %s' %
                                         (actual, expected))

        check_correct(actual_deferred)

        with beam.Pipeline() as p:
            # Two interleaved slices act as separate input partitions.
            input_pcoll = p | beam.Create([input[::2], input[1::2]])
            output_pcoll = input_pcoll | transforms.DataframeTransform(
                func, proxy=empty)
            assert_that(output_pcoll,
                        lambda actual: check_correct(concat(actual)))
Exemplo n.º 12
0
 def test_constant_expresion(self):
   """A constant expression evaluates to its value in an empty session."""
   constant = expressions.ConstantExpression(2)
   empty_session = expressions.Session({})
   value = empty_session.evaluate(constant)
   self.assertEqual(value, 2)
Exemplo n.º 13
0
 def test_placeholder_expression(self):
   """Each placeholder evaluates to the value the session binds it to."""
   first = expressions.PlaceholderExpression(None)
   second = expressions.PlaceholderExpression(None)
   session = expressions.Session({first: 1, second: 2})
   for placeholder, bound in ((first, 1), (second, 2)):
     self.assertEqual(session.evaluate(placeholder), bound)