Example #1
def pipeline_monitoring(renames):
    """Using monitoring interface snippets."""

    import re
    import apache_beam as beam
    # Newer Beam releases expose this as apache_beam.options.pipeline_options.
    from apache_beam.utils.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='Output for the pipeline',
                                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (pcoll
                    # Convert lines of text into individual words.
                    | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                    # Count the number of times each word occurs.
                    | beam.combiners.Count.PerElement()
                    # Format each word and count into a printable string.
                    | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
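These snippets are lifted from test modules, so their module-level imports are not shown. A minimal import block that covers most of the examples on this page might look like the following; the module paths are assumptions based on current Apache Beam (older releases exposed TestPipeline, assert_that, and equal_to from different modules):

# Assumed imports for the snippets on this page (paths per current
# Apache Beam; older releases located some of these helpers elsewhere).
import apache_beam as beam
from apache_beam import Create, DoFn, FlatMap, GroupByKey, Map, ParDo, WindowInto
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to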
Example #2
 def test_read_gzip_empty_file(self):
   file_name = self._create_temp_file()
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(
       file_name,
       0, CompressionTypes.GZIP,
       True, coders.StrUtf8Coder())
   assert_that(pcoll, equal_to([]))
   pipeline.run()
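The positional arguments in this call line up with ReadFromText's signature of that era. Spelled out with keywords (parameter names taken from Beam's textio source; worth double-checking against your release), the intent is clearer:

pcoll = pipeline | 'Read' >> ReadFromText(
    file_name,
    min_bundle_size=0,
    compression_type=CompressionTypes.GZIP,
    strip_trailing_newlines=True,
    coder=coders.StrUtf8Coder())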
Example #3
 def test_compute_points(self):
     p = TestPipeline()
     records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
     result = (records
               | 'points' >> beam.FlatMap(coders.compute_points)
               | beam.CombinePerKey(sum))
     assert_that(result,
                 equal_to([('Italy', 0), ('Brasil', 6), ('Germany', 3)]))
     p.run()
Example #4
  def test_end2end_example_proto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(range(3))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with TestPipeline() as p:
      _ = p | beam.Create([example]) | WriteToTFRecord(
          file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = (p | ReadFromTFRecord(
          file_path_prefix + '-*',
          coder=beam.coders.ProtoCoder(example.__class__)))
      beam.assert_that(actual_data, beam.equal_to([example]))
Example #5
  def test_element(self):
    class TestDoFn(DoFn):
      def process(self, element):
        yield element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
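For a per-element transform this small, the DoFn is interchangeable with Map, which wraps a plain callable for you; a sketch of the same test body in that style:

pipeline = TestPipeline()
# Map(fn) is shorthand for ParDo over a DoFn that yields fn(element).
pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> Map(lambda x: x + 10)
assert_that(pcoll, equal_to([11, 12]))
pipeline.run()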
Example #6
 def test_tuple_combine_fn(self):
     p = TestPipeline()
     result = (p
               | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
               | beam.CombineGlobally(
                   combine.TupleCombineFn(max, combine.MeanCombineFn(),
                                          sum)).without_defaults())
     assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
     p.run()
Example #7
 def test_runtime_checks_on(self):
     # pylint: disable=expression-not-assigned
     p = TestPipeline()
     with self.assertRaises(typehints.TypeCheckError):
         # [START type_hints_runtime_on]
         p.options.view_as(TypeOptions).runtime_type_check = True
         p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
         p.run()
Example #8
  def test_timestamp_param(self):
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        yield timestamp

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    pipeline.run()
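Both outputs are MIN_TIMESTAMP because Create assigns no event-time timestamps of its own. A sketch (not part of the original test) of how real values would reach DoFn.TimestampParam, reusing the TimestampedValue pattern from Example #22 and assuming Timestamp from apache_beam.utils.timestamp:

from apache_beam.utils.timestamp import Timestamp

pipeline = TestPipeline()
pcoll = (pipeline
         | 'Create' >> Create([1, 2])
         # Returning TimestampedValue sets the element's event-time timestamp.
         | 'Stamp' >> Map(lambda x: TimestampedValue(x, x))
         | 'Do' >> ParDo(TestDoFn()))
assert_that(pcoll, equal_to([Timestamp(1), Timestamp(2)]))
pipeline.run()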
Example #9
  def test_context_param(self):
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        yield context.element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2])| 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #10
    def test_top(self):
        pipeline = TestPipeline()

        # A parameter we'll be sharing with a custom comparator.
        names = {
            0: 'zo',
            1: 'one',
            2: 'twoo',
            3: 'three',
            5: 'fiiive',
            6: 'sssssix',
            9: 'nniiinne'
        }

        # First for global combines.
        pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
        result_top = pcoll | 'top' >> combine.Top.Largest(5)
        result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
        result_cmp = pcoll | 'cmp' >> combine.Top.Of(
            'cmp', 6, lambda a, b, names: len(names[a]) < len(names[b]),
            names)  # Note parameter passed to comparator.
        result_cmp_rev = pcoll | 'cmp_rev' >> combine.Top.Of(
            'cmp',
            3,
            lambda a, b, names: len(names[a]) < len(names[b]),
            names,  # Note parameter passed to comparator.
            reverse=True)
        assert_that(result_top,
                    equal_to([[9, 6, 6, 5, 3]]),
                    label='assert:top')
        assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')
        assert_that(result_cmp,
                    equal_to([[9, 6, 6, 5, 3, 2]]),
                    label='assert:cmp')
        assert_that(result_cmp_rev,
                    equal_to([[0, 1, 1]]),
                    label='assert:cmp_rev')

        # Again for per-key combines.
        pcoll = pipeline | 'start-perkey' >> Create(
            [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
        result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(5)
        result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(4)
        result_key_cmp = pcoll | 'cmp-perkey' >> combine.Top.PerKey(
            6, lambda a, b, names: len(names[a]) < len(names[b]),
            names)  # Note parameter passed to comparator.
        assert_that(result_key_top,
                    equal_to([('a', [9, 6, 6, 5, 3])]),
                    label='key:top')
        assert_that(result_key_bot,
                    equal_to([('a', [0, 1, 1, 1])]),
                    label='key:bot')
        assert_that(result_key_cmp,
                    equal_to([('a', [9, 6, 6, 5, 3, 2])]),
                    label='key:cmp')
        pipeline.run()
Example #11
 def test_reuse_cloned_custom_transform_instance(self):
     pipeline = TestPipeline()
     pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
     pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
     transform = PipelineTest.CustomTransform()
     result1 = pcoll1 | transform
     result2 = pcoll2 | 'new_label' >> transform
     assert_that(result1, equal_to([2, 3, 4]), label='r1')
     assert_that(result2, equal_to([5, 6, 7]), label='r2')
     pipeline.run()
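PipelineTest.CustomTransform is defined elsewhere in pipeline_test.py; judging from the asserted outputs it shifts each element by one. A hypothetical reconstruction, only so the example reads self-contained:

class CustomTransform(beam.PTransform):
    # Hypothetical stand-in that matches the assertions above; the real
    # class lives in pipeline_test.py and may differ in detail.
    def expand(self, pcoll):
        return pcoll | 'AddOne' >> Map(lambda x: x + 1)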
Example #12
 def test_metrics_in_source(self):
     pipeline = TestPipeline()
     pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
     assert_that(pcoll, equal_to([1, 2, 3, 4, 5, 6]))
     res = pipeline.run()
     metric_results = res.metrics().query()
     outputs_counter = metric_results['counters'][0]
     self.assertEqual(outputs_counter.key.step, 'Read')
     self.assertEqual(outputs_counter.key.metric.name, 'outputs')
     self.assertEqual(outputs_counter.committed, 6)
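The 'outputs' counter queried here is reported by the test's FakeSource. User code can publish the same kind of counter from a DoFn through the Metrics API; a minimal sketch (namespace and counter name are arbitrary):

from apache_beam.metrics import Metrics

class CountingDoFn(DoFn):
    def __init__(self):
        # Queryable afterwards via result.metrics().query(), as above.
        self.outputs = Metrics.counter(self.__class__, 'outputs')

    def process(self, element):
        self.outputs.inc()
        yield element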
Example #13
File: textio_test.py  Project: xgong/beam
    def test_read_auto_bzip2(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file(suffix='.bz2')
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #14
    def test_create(self):
        pipeline = TestPipeline()
        pcoll = pipeline | 'label1' >> Create([1, 2, 3])
        assert_that(pcoll, equal_to([1, 2, 3]))

        # Also test the case where the initial value is an iterator object.
        pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
        pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
        assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
        pipeline.run()
Example #15
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileSink(temp_path,
                       file_name_suffix='.output',
                       coder=coders.ToStringCoder())
     p = TestPipeline()
     p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
     p.run()
     self.assertEqual(
         open(temp_path + '-00000-of-00001.output').read(), '[start][end]')
Example #16
 def test_tuple_combine_fn_without_defaults(self):
     p = TestPipeline()
     result = (p
               | Create([1, 1, 2, 3])
               | beam.CombineGlobally(
                   combine.TupleCombineFn(
                       min, combine.MeanCombineFn(),
                       max).with_common_input()).without_defaults())
     assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
     p.run()
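What with_common_input() changes: instead of splitting an input tuple across the component CombineFns (as in Example #6), it feeds every element to each of min, MeanCombineFn, and max. The asserted tuple is then just, in plain Python:

values = [1, 1, 2, 3]
# min, arithmetic mean, and max over the same input stream.
assert (min(values), sum(values) / 4.0, max(values)) == (1, 7.0 / 4, 3)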
Example #17
  def test_run_concat_direct(self):
    source = ConcatSource([RangeSource(0, 10),
                           RangeSource(10, 100),
                           RangeSource(100, 1000),
                          ])
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Read(source)
    assert_that(pcoll, equal_to(range(1000)))

    pipeline.run()
Example #18
 def test_hourly_team_score(self):
     with TestPipeline() as p:
         result = (p
                   | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
                   | hourly_team_score.HourlyTeamScore(
                       start_min='2015-11-16-15-20',
                       stop_min='2015-11-16-17-20',
                       window_duration=60))
         beam.assert_that(
             result,
             beam.equal_to([('team1', 18), ('team2', 2), ('team3', 13)]))
Example #19
File: textio_test.py  Project: xgong/beam
    def test_read_gzip(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #20
File: textio_test.py  Project: xgong/beam
    def test_read_bzip2(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #21
 def test_tfidf_transform(self):
   p = TestPipeline()
   uri_to_line = p | 'create sample' >> beam.Create(
       [('1.txt', 'abc def ghi'),
        ('2.txt', 'abc def'),
        ('3.txt', 'abc')])
   result = (
       uri_to_line
       | tfidf.TfIdf()
       # Each element is (word, (uri, tfidf)); flatten to (word, uri, tfidf).
       | beam.Map(lambda kv: (kv[0], kv[1][0], kv[1][1])))
Example #22
 def test_timestamped_value(self):
   p = TestPipeline()
   result = (p
             | 'start' >> Create([(k, k) for k in range(10)])
             # (x, t): the value x becomes the element, t its event-time timestamp.
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             | 'w' >> WindowInto(FixedWindows(5))
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                 ('key', [5, 6, 7, 8, 9])]))
   p.run()
Example #23
 def test_process_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #24
 def test_process_single(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file(path, FOO_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.io.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.AUTO,
                           validate=True)))
         beam.assert_that(result, beam.equal_to(['foo']))
Example #25
 def test_sliding_windows(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
   result = (pcoll
             | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
             | GroupByKey()
             | reify_windows)
   expected = [('key @ [-2.0, 2.0)', [1]),
               ('key @ [0.0, 4.0)', [1, 2, 3]),
               ('key @ [2.0, 6.0)', [2, 3])]
   assert_that(result, equal_to(expected))
   p.run()
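The expected lists follow from how SlidingWindows(period=2, size=4) assigns windows: a timestamp t lands in every window [s, s + 4) where s is a multiple of 2 and s <= t < s + 4. A plain-Python sanity check of that rule against the timestamps 1, 2, 3 used above:

def sliding_windows(t, period=2, size=4):
    # All window starts s (multiples of `period`) with s <= t < s + size.
    latest = t - (t % period)
    return [(s, s + size)
            for s in range(latest - size + period, latest + period, period)]

assert sliding_windows(1) == [(-2, 2), (0, 4)]
assert sliding_windows(2) == [(0, 4), (2, 6)]
assert sliding_windows(3) == [(0, 4), (2, 6)]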
Example #26
 def test_sessions(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
   result = (pcoll
             | 'w' >> WindowInto(Sessions(10))
             | GroupByKey()
             | sort_values
             | reify_windows)
   expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
               ('key @ [20.0, 45.0)', [20, 27, 35])]
   assert_that(result, equal_to(expected))
   p.run()
Example #27
    def test_read_auto_gzip(self):
        _, lines = write_data(15)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template,
                                                suffix='.gz').name
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #28
    def test_read_bzip2(self):
        _, lines = write_data(15)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template).name
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #29
    def test_read_gzip_large(self):
        _, lines = write_data(10000)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template).name
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #30
    def test_write_dataflow_auto_compression(self):
        pipeline = TestPipeline()
        pcoll = pipeline | beam.core.Create(self.lines)
        pcoll | 'Write' >> WriteToText(self.path, file_name_suffix='.gz')  # pylint: disable=expression-not-assigned
        pipeline.run()

        read_result = []
        for file_name in glob.glob(self.path + '*'):
            with gzip.GzipFile(file_name, 'r') as f:
                read_result.extend(f.read().splitlines())

        self.assertEqual(read_result, self.lines)
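The read half of this round trip can also go through Beam: because the shard names end in '.gz', ReadFromText's automatic compression detection (as in Example #27) decompresses them. A sketch against the same test fixtures:

pipeline = TestPipeline()
# AUTO compression is inferred from the '.gz' suffix of the shards.
pcoll = pipeline | 'ReadBack' >> ReadFromText(self.path + '*.gz')
assert_that(pcoll, equal_to(self.lines))
pipeline.run()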