Example #1
 def test_timestamped_with_combiners(self):
   p = TestPipeline()
   result = (p
             # Create some initial test values.
             | 'start' >> Create([(k, k) for k in range(10)])
              # The purpose of the WindowInto transform is to establish a
              # FixedWindows windowing function for the PCollection.
              # It does not by itself bucket elements into separate windows:
              # the timestamps assigned by Create are all identical, so at
              # this point every element falls into the same window.
              | 'w' >> WindowInto(FixedWindows(5))
              # Generate timestamped values using the values as timestamps.
              # Now the values are spaced one second apart and, since Map
              # propagates the windowing function from input to output, the
              # output PCollection has elements falling into different
              # 5-second windows.
              | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
              # We add a 'key' to each value representing the index of the
              # window. This is important since there is no guarantee of
              # order for the elements of a PCollection.
              | Map(lambda v: (v // 5, v)))
   # Sum all elements associated with a key and window. Although it
   # is called CombinePerKey it is really CombinePerKeyAndWindow the
   # same way GroupByKey is really GroupByKeyAndWindow.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
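The window assignment behind the expected sums can be modeled in a few lines of plain Python; this is a sketch of the bucketing rule only, not Beam's implementation (FixedWindows also accepts an offset argument):

  def fixed_window(timestamp, size):
    # The [start, end) interval of the fixed window containing timestamp.
    start = timestamp - timestamp % size
    return (start, start + size)

  # Timestamps 0..9 with size=5 fall into exactly two windows, whose
  # sums match the assertions above: 0+1+2+3+4 and 5+6+7+8+9.
  assert sum(v for v in range(10) if fixed_window(v, 5) == (0, 5)) == 10
  assert sum(v for v in range(10) if fixed_window(v, 5) == (5, 10)) == 35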
Example #2
  def test_create(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'label1' >> Create([1, 2, 3])
    assert_that(pcoll, equal_to([1, 2, 3]))

    # Test if initial value is an iterator object.
    pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
    pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
    assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
    pipeline.run()
Example #3
 def test_reuse_cloned_custom_transform_instance(self):
   pipeline = TestPipeline()
   pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
   pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
   transform = PipelineTest.CustomTransform()
   result1 = pcoll1 | transform
   result2 = pcoll2 | 'new_label' >> transform
   assert_that(result1, equal_to([2, 3, 4]), label='r1')
   assert_that(result2, equal_to([5, 6, 7]), label='r2')
   pipeline.run()
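PipelineTest.CustomTransform is defined elsewhere in the test module. Judging from the asserted outputs (every element shifted up by one), a plausible reconstruction is the following hypothetical sketch, not the actual definition:

  import apache_beam as beam

  class CustomTransform(beam.PTransform):
    # Hypothetical reconstruction: increments every element by one.
    def expand(self, pcoll):
      return pcoll | 'AddOne' >> beam.Map(lambda x: x + 1)

The point of the test is that reusing the same transform instance requires a fresh label ('new_label' >> transform), because labels must be unique within a pipeline.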
Example #4
    def test_pardo_side_outputs(self):
        def tee(elem, *tags):
            for tag in tags:
                if tag in elem:
                    yield beam.pvalue.TaggedOutput(tag, elem)

        with self.create_pipeline() as p:
            xy = (p
                  | 'Create' >> beam.Create(['x', 'y', 'xy'])
                  | beam.FlatMap(tee, 'x', 'y').with_outputs())
            assert_that(xy.x, equal_to(['x', 'xy']), label='x')
            assert_that(xy.y, equal_to(['y', 'xy']), label='y')
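Note that 'xy' matches both tags and so appears under both outputs. A related hedged sketch, assuming the same tee function: with_outputs also accepts explicit tags plus a main keyword naming the untagged output, which is empty here because tee only emits tagged values:

    with self.create_pipeline() as p:
        xyr = (p
               | beam.Create(['x', 'y', 'xy'])
               | beam.FlatMap(tee, 'x', 'y').with_outputs('x', 'y', main='rest'))
        assert_that(xyr.rest, equal_to([]), label='rest')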
Example #5
  def test_combine_globally_with_default_side_input(self):
    class CombineWithSideInput(PTransform):
      def expand(self, pcoll):
        side = pcoll | CombineGlobally(sum).as_singleton_view()
        main = pcoll.pipeline | Create([None])
        return main | Map(lambda _, s: s, side)

    p = TestPipeline()
    result1 = p | 'i1' >> Create([]) | 'c1' >> CombineWithSideInput()
    result2 = p | 'i2' >> Create([1, 2, 3, 4]) | 'c2' >> CombineWithSideInput()
    assert_that(result1, equal_to([0]), label='r1')
    assert_that(result2, equal_to([10]), label='r2')
    p.run()
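The empty-input case yields [0] because the view built by as_singleton_view() carries the combine's default value, and for sum that default is the result of combining nothing:

  assert sum([]) == 0             # the default seen by 'c1' on empty input
  assert sum([1, 2, 3, 4]) == 10  # the value seen by 'c2'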
Example #6
  def test_flatmap_builtin(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'label1' >> Create([1, 2, 3])
    assert_that(pcoll, equal_to([1, 2, 3]))

    pcoll2 = pcoll | 'do' >> FlatMap(lambda x: [x + 10])
    assert_that(pcoll2, equal_to([11, 12, 13]), label='pcoll2')

    pcoll3 = pcoll2 | 'm1' >> Map(lambda x: [x, 12])
    assert_that(pcoll3,
                equal_to([[11, 12], [12, 12], [13, 12]]), label='pcoll3')

    pcoll4 = pcoll3 | 'do2' >> FlatMap(set)
    assert_that(pcoll4, equal_to([11, 12, 12, 12, 13]), label='pcoll4')
    pipeline.run()
Example #7
 def test_group_by_key(self):
     with self.create_pipeline() as p:
         res = (p
                | beam.Create([('a', 1), ('a', 2), ('b', 3)])
                | beam.GroupByKey()
                 | beam.Map(lambda k_vs: (k_vs[0], sorted(k_vs[1]))))
         assert_that(res, equal_to([('a', [1, 2]), ('b', [3])]))
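Within a single window, GroupByKey behaves like this plain-Python model (a sketch for intuition, not Beam's shuffle implementation):

  from collections import defaultdict

  def group_by_key(pairs):
    grouped = defaultdict(list)
    for k, v in pairs:
      grouped[k].append(v)
    return sorted(grouped.items())

  assert group_by_key([('a', 1), ('a', 2), ('b', 3)]) == [('a', [1, 2]), ('b', [3])]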
Example #8
    def test_run_direct(self):
        file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
        pipeline = TestPipeline()
        pcoll = pipeline | beam.io.Read(LineSource(file_name))
        assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

        pipeline.run()
Example #9
 def test_read(self):
     with tempfile.NamedTemporaryFile() as temp_file:
          temp_file.write(b'a\nb\nc')  # the temp file is opened in binary mode
         temp_file.flush()
         with self.create_pipeline() as p:
             assert_that(p | beam.io.ReadFromText(temp_file.name),
                         equal_to(['a', 'b', 'c']))
Example #10
 def test_pardo(self):
     with self.create_pipeline() as p:
         res = (p
                | beam.Create(['a', 'bc'])
                | beam.Map(lambda e: e * 2)
                | beam.Map(lambda e: e + 'x'))
         assert_that(res, equal_to(['aax', 'bcbcx']))
Example #11
 def run_pipeline(self, count_implementation, factor=1):
   p = TestPipeline()
   words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
   result = words | count_implementation
   assert_that(
       result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
   p.run()
Example #12
 def test_dataflow_single_file(self):
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(file_name)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
Example #13
 def test_read_gzip_empty_file(self):
     file_name = self._create_temp_file()
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #14
 def test_dataflow_file_pattern(self):
     pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
     assert len(expected_data) == 40
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(pattern)
     assert_that(pcoll, equal_to(expected_data))
     pipeline.run()
Example #15
  def test_top_shorthands(self):
    pipeline = TestPipeline()

    pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
    result_top = pcoll | 'top' >> beam.CombineGlobally(combine.Largest(5))
    result_bot = pcoll | 'bot' >> beam.CombineGlobally(combine.Smallest(4))
    assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
    assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

    pcoll = pipeline | 'start-perkey' >> Create(
        [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
    result_ktop = pcoll | 'top-perkey' >> beam.CombinePerKey(combine.Largest(5))
    result_kbot = pcoll | 'bot-perkey' >> beam.CombinePerKey(
        combine.Smallest(4))
    assert_that(result_ktop, equal_to([('a', [9, 6, 6, 5, 3])]), label='k:top')
    assert_that(result_kbot, equal_to([('a', [0, 1, 1, 1])]), label='k:bot')
    pipeline.run()
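The expected lists can be sanity-checked with heapq, which performs the same largest-n/smallest-n selection these combiners compute per window:

  import heapq

  vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
  assert heapq.nlargest(5, vals) == [9, 6, 6, 5, 3]
  assert heapq.nsmallest(4, vals) == [0, 1, 1, 1]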
Example #16
 def test_compute_points(self):
   p = TestPipeline()
   records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
   result = (records
             | 'points' >> beam.FlatMap(coders.compute_points)
             | beam.CombinePerKey(sum))
   assert_that(result, equal_to([('Italy', 0), ('Brasil', 6), ('Germany', 3)]))
   p.run()
Example #17
 def test_default_value_singleton_side_input(self):
   pipeline = self.create_pipeline()
   pcol = pipeline | 'start' >> beam.Create([1, 2])
   side = pipeline | 'side' >> beam.Create([])  # 0 values in side input.
   result = pcol | beam.FlatMap(
       lambda x, s: [x * s], beam.pvalue.AsSingleton(side, 10))
   assert_that(result, equal_to([10, 20]))
   pipeline.run()
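AsSingleton(side, default) reads as "the sole value of the side input, or the default when it is empty"; a plain-Python model of that semantics (a sketch, since the real side input is materialized by the runner):

  def as_singleton(side_values, default):
    if not side_values:
      return default  # empty side input: fall back to the default
    if len(side_values) == 1:
      return side_values[0]
    raise ValueError('singleton side input must hold at most one value')

  # Matches the asserted output: [1 * 10, 2 * 10].
  assert [x * as_singleton([], 10) for x in [1, 2]] == [10, 20]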
Example #18
 def test_read_gzip_empty_file(self):
     filename = tempfile.NamedTemporaryFile(delete=False,
                                            prefix=tempfile.template).name
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #19
 def test_windowing(self):
     with self.create_pipeline() as p:
         res = (p
                | beam.Create([1, 2, 100, 101, 102])
                | beam.Map(lambda t: TimestampedValue(('k', t), t))
                | beam.WindowInto(beam.transforms.window.Sessions(10))
                | beam.GroupByKey()
                 | beam.Map(lambda k_vs: (k_vs[0], sorted(k_vs[1]))))
         assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
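Sessions(10) merges timestamps that fall within 10 seconds of one another, so 1 and 2 form one session and 100, 101, 102 another. A plain-Python model of the merge (a sketch assuming a single key and point timestamps):

  def sessions(timestamps, gap):
    out = []
    for t in sorted(timestamps):
      if out and t < out[-1][-1] + gap:
        out[-1].append(t)  # within the gap: extend the current session
      else:
        out.append([t])    # otherwise start a new session
    return out

  assert sessions([1, 2, 100, 101, 102], 10) == [[1, 2], [100, 101, 102]]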
Example #20
  def test_element(self):
    class TestDoFn(DoFn):
      def process(self, element):
        yield element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #21
 def test_tuple_combine_fn(self):
     p = TestPipeline()
     result = (p
               | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
               | beam.CombineGlobally(
                   combine.TupleCombineFn(max, combine.MeanCombineFn(),
                                          sum)).without_defaults())
     assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
     p.run()
Example #22
  def test_timestamp_param(self):
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        yield timestamp

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    pipeline.run()
Example #23
  def test_context_param(self):
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        yield context.element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #24
 def test_iterable_side_input(self):
   pipeline = self.create_pipeline()
   pcol = pipeline | 'start' >> beam.Create([1, 2])
   side = pipeline | 'side' >> beam.Create([3, 4])  # 2 values in side input.
   result = pcol | 'compute' >> beam.FlatMap(
       lambda x, s: [x * y for y in s],
       beam.pvalue.AsIter(side))
   assert_that(result, equal_to([3, 4, 6, 8]))
   pipeline.run()
Example #25
 def test_metrics_in_source(self):
   pipeline = TestPipeline()
   pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
   assert_that(pcoll, equal_to([1, 2, 3, 4, 5, 6]))
   res = pipeline.run()
   metric_results = res.metrics().query()
   outputs_counter = metric_results['counters'][0]
   self.assertEqual(outputs_counter.key.step, 'Read')
   self.assertEqual(outputs_counter.key.metric.name, 'outputs')
   self.assertEqual(outputs_counter.committed, 6)
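FakeSource is a test double defined elsewhere in the test module; the 'outputs' counter it increments is created through the Metrics API. A hedged sketch of the usual pattern, shown here in a DoFn rather than a source:

  import apache_beam as beam
  from apache_beam.metrics.metric import Metrics

  class CountingDoFn(beam.DoFn):
    def __init__(self):
      # A user counter, queryable from the pipeline result as above.
      self.outputs = Metrics.counter(self.__class__, 'outputs')

    def process(self, element):
      self.outputs.inc()  # one tick per element that passes through
      yield element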
Example #26
 def test_sink_transform(self):
   with tempfile.NamedTemporaryFile() as dst:
     path = dst.name
     with TestPipeline() as p:
       # pylint: disable=expression-not-assigned
       p | beam.Create(self.RECORDS) | avroio.WriteToAvro(path, self.SCHEMA)
     with TestPipeline() as p:
       # json used for stable sortability
       readback = p | avroio.ReadFromAvro(path + '*') | beam.Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example #27
 def test_tuple_combine_fn_without_defaults(self):
   p = TestPipeline()
   result = (
       p
       | Create([1, 1, 2, 3])
       | beam.CombineGlobally(
           combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
           .with_common_input()).without_defaults())
   assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
   p.run()
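with_common_input() feeds every element to each component CombineFn, unlike the plain TupleCombineFn form in Example #21 where the i-th slot of each tuple feeds the i-th CombineFn; the expected triple is just three aggregates of the same list:

  vals = [1, 1, 2, 3]
  assert (min(vals), sum(vals) / float(len(vals)), max(vals)) == (1, 7.0 / 4, 3)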
Example #28
  def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
       f.write('\n'.join(lines).encode('utf-8'))  # BZ2File('wb') expects bytes

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
Example #29
    def test_flattened_side_input(self):
        pipeline = self.create_pipeline()
        main_input = pipeline | 'main input' >> beam.Create([None])
        side_input = (
            pipeline | 'side1' >> beam.Create(['a']),
            pipeline | 'side2' >> beam.Create(['b'])) | beam.Flatten()
        results = main_input | beam.FlatMap(lambda _, ab: ab,
                                            beam.pvalue.AsList(side_input))

        assert_that(results, equal_to(['a', 'b']))
        pipeline.run()
Example #30
  def test_run_concat_direct(self):
    source = ConcatSource([RangeSource(0, 10),
                           RangeSource(10, 100),
                           RangeSource(100, 1000),
                          ])
    pipeline = TestPipeline()
     pcoll = pipeline | beam.io.Read(source)
     assert_that(pcoll, equal_to(list(range(1000))))

    pipeline.run()
Example #31
  def test_deterministic_key(self):
    p = TestPipeline()
    lines = (p | beam.Create(
        ['banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2', 'zucchini,veg,3']))

    # For pickling
    global Player  # pylint: disable=global-variable-not-assigned

    # [START type_hints_deterministic_key]
    class Player(object):
      def __init__(self, team, name):
        self.team = team
        self.name = name

    class PlayerCoder(beam.coders.Coder):
      def encode(self, player):
        # Coders must produce bytes under Python 3.
        return ('%s:%s' % (player.team, player.name)).encode('utf-8')

      def decode(self, s):
        return Player(*s.decode('utf-8').split(':'))

      def is_deterministic(self):
        return True

    beam.coders.registry.register_coder(Player, PlayerCoder)

    def parse_player_and_score(csv):
      name, team, score = csv.split(',')
      return Player(team, name), int(score)

    totals = (
        lines
        | beam.Map(parse_player_and_score)
        | beam.CombinePerKey(sum).with_input_types(
            beam.typehints.Tuple[Player, int]))
    # [END type_hints_deterministic_key]

    assert_that(
        totals | beam.Map(lambda k_v: (k_v[0].name, k_v[1])),
        equal_to([('banana', 3), ('kiwi', 4), ('zucchini', 3)]))

    p.run()
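A quick round trip through the coder shows why is_deterministic() can safely return True: encoding depends only on the team and name fields:

  player = Player('fruit', 'kiwi')
  coder = PlayerCoder()
  decoded = coder.decode(coder.encode(player))
  assert (decoded.team, decoded.name) == ('fruit', 'kiwi')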
Example #32
 def test_timestamped_value(self):
   p = TestPipeline()
   result = (p
             | 'start' >> Create([(k, k) for k in range(10)])
              | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             | 'w' >> WindowInto(FixedWindows(5))
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                 ('key', [5, 6, 7, 8, 9])]))
   p.run()
Example #33
 def test_sliding_windows(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
   result = (pcoll
             | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
             | GroupByKey()
             | reify_windows)
   expected = [('key @ [-2.0, 2.0)', [1]),
               ('key @ [0.0, 4.0)', [1, 2, 3]),
               ('key @ [2.0, 6.0)', [2, 3])]
   assert_that(result, equal_to(expected))
   p.run()
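Each element lands in size/period = 2 sliding windows, which is where the expected window list comes from. A plain-Python model of the assignment (a sketch ignoring Beam's offset handling):

  def sliding_windows(timestamp, size, period):
    last_start = timestamp - timestamp % period
    return [(float(s), float(s + size))
            for s in range(last_start, last_start - size, -period)]

  # Element 1 is in [0.0, 4.0) and [-2.0, 2.0); elements 2 and 3 add [2.0, 6.0).
  assert sliding_windows(1, size=4, period=2) == [(0.0, 4.0), (-2.0, 2.0)]
  assert sliding_windows(2, size=4, period=2) == [(2.0, 6.0), (0.0, 4.0)]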
Example #34
  def test_builtin_combines(self):
    pipeline = TestPipeline()

    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    mean = sum(vals) / float(len(vals))
    size = len(vals)

    # First for global combines.
    pcoll = pipeline | 'start' >> Create(vals)
    result_mean = pcoll | 'mean' >> combine.Mean.Globally()
    result_count = pcoll | 'count' >> combine.Count.Globally()
    assert_that(result_mean, equal_to([mean]), label='assert:mean')
    assert_that(result_count, equal_to([size]), label='assert:size')

    # Again for per-key combines.
    pcoll = pipeline | 'start-perkey' >> Create([('a', x) for x in vals])
    result_key_mean = pcoll | 'mean-perkey' >> combine.Mean.PerKey()
    result_key_count = pcoll | 'count-perkey' >> combine.Count.PerKey()
    assert_that(result_key_mean, equal_to([('a', mean)]), label='key:mean')
    assert_that(result_key_count, equal_to([('a', size)]), label='key:size')
    pipeline.run()
Example #35
  def test_window_param(self):
    class TestDoFn(DoFn):
      def process(self, element, window=DoFn.WindowParam):
        yield (element, (float(window.start), float(window.end)))

    pipeline = TestPipeline()
    pcoll = (pipeline
             | Create([1, 7])
             | Map(lambda x: TimestampedValue(x, x))
             | WindowInto(windowfn=SlidingWindows(10, 5))
             | ParDo(TestDoFn()))
    assert_that(pcoll, equal_to([(1, (-5, 5)), (1, (0, 10)),
                                 (7, (0, 10)), (7, (5, 15))]))
    pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
    assert_that(
        pcoll2,
        equal_to([
            ((1, (-5, 5)), (-5, 5)), ((1, (0, 10)), (0, 10)),
            ((7, (0, 10)), (0, 10)), ((7, (5, 15)), (5, 15))]),
        label='doubled windows')
    pipeline.run()
Example #36
  def test_read_gzip_with_skip_lines(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
      f.write('\n'.join(lines).encode('utf-8'))  # GzipFile('wb') expects bytes

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP,
        True, coders.StrUtf8Coder(), skip_header_lines=2)
    assert_that(pcoll, equal_to(lines[2:]))
    pipeline.run()
Example #37
 def test_sessions(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
   result = (pcoll
             | 'w' >> WindowInto(Sessions(10))
             | GroupByKey()
             | sort_values
             | reify_windows)
   expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
               ('key @ [20.0, 45.0)', [20, 27, 35])]
   assert_that(result, equal_to(expected))
   p.run()