def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.utils.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | 'ReadLines' >> beam.io.ReadFromText(options.input)
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | 'WriteCounts' >> beam.io.WriteToText(options.output))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
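# For illustration only (not part of the original snippet): a minimal,
# hedged sketch of populating a WordCountOptions subclass like the one above
# from an explicit flags list, since PipelineOptions accepts such a list.
# The gs:// paths are placeholders, not real buckets.
flags = ['--input', 'gs://example-bucket/in.txt',
         '--output', 'gs://example-bucket/out']
word_count_options = PipelineOptions(flags).view_as(WordCountOptions)
assert word_count_options.input == 'gs://example-bucket/in.txt'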
def test_read_gzip_empty_file(self):
  file_name = self._create_temp_file()
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name,
      0, CompressionTypes.GZIP,
      True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to([]))
  pipeline.run()
def test_compute_points(self):
  p = TestPipeline()
  records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
  result = (records
            | 'points' >> beam.FlatMap(coders.compute_points)
            | beam.CombinePerKey(sum))
  assert_that(result,
              equal_to([('Italy', 0), ('Brasil', 6), ('Germany', 3)]))
  p.run()
def test_end2end_example_proto(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  example = tf.train.Example()
  example.features.feature['int'].int64_list.value.extend(range(3))
  example.features.feature['bytes'].bytes_list.value.extend(
      [b'foo', b'bar'])

  with TestPipeline() as p:
    _ = p | beam.Create([example]) | WriteToTFRecord(
        file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = (p | ReadFromTFRecord(
        file_path_prefix + '-*',
        coder=beam.coders.ProtoCoder(example.__class__)))
    beam.assert_that(actual_data, beam.equal_to([example]))
def test_element(self):
  class TestDoFn(DoFn):

    def process(self, element):
      yield element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()
def test_tuple_combine_fn(self):
  p = TestPipeline()
  # TupleCombineFn applies each combiner to the corresponding tuple
  # position: max over the first elements, mean over the second, sum over
  # the third.
  result = (p
            | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
            | beam.CombineGlobally(combine.TupleCombineFn(
                max, combine.MeanCombineFn(), sum)).without_defaults())
  assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
  p.run()
def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline()
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p.options.view_as(TypeOptions).runtime_type_check = True
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()
    # [END type_hints_runtime_on]
def test_timestamp_param(self):
  class TestDoFn(DoFn):

    def process(self, element, timestamp=DoFn.TimestampParam):
      yield timestamp

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
  pipeline.run()
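# A related, hypothetical sketch (not from the original tests), assuming the
# same era's DoFn API: DoFn.WindowParam can be requested alongside
# DoFn.TimestampParam, so a process method sees both the element's event
# timestamp and the window it belongs to.
class TimestampAndWindowDoFn(DoFn):

  def process(self, element, timestamp=DoFn.TimestampParam,
              window=DoFn.WindowParam):
    # The runner fills in both parameters at execution time.
    yield element, timestamp, window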
def test_context_param(self):
  class TestDoFn(DoFn):

    def process(self, element, context=DoFn.ContextParam):
      yield context.element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()
def test_top(self):
  pipeline = TestPipeline()

  # A parameter we'll be sharing with a custom comparator.
  names = {0: 'zo',
           1: 'one',
           2: 'twoo',
           3: 'three',
           5: 'fiiive',
           6: 'sssssix',
           9: 'nniiinne'}

  # First for global combines.
  pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
  result_top = pcoll | 'top' >> combine.Top.Largest(5)
  result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
  result_cmp = pcoll | 'cmp' >> combine.Top.Of(
      6,
      lambda a, b, names: len(names[a]) < len(names[b]),
      names)  # Note parameter passed to comparator.
  result_cmp_rev = pcoll | 'cmp_rev' >> combine.Top.Of(
      3,
      lambda a, b, names: len(names[a]) < len(names[b]),
      names,  # Note parameter passed to comparator.
      reverse=True)
  assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
  assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')
  assert_that(result_cmp, equal_to([[9, 6, 6, 5, 3, 2]]),
              label='assert:cmp')
  assert_that(result_cmp_rev, equal_to([[0, 1, 1]]), label='assert:cmp_rev')

  # Again for per-key combines.
  pcoll = pipeline | 'start-perkey' >> Create(
      [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
  result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(5)
  result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(4)
  result_key_cmp = pcoll | 'cmp-perkey' >> combine.Top.PerKey(
      6,
      lambda a, b, names: len(names[a]) < len(names[b]),
      names)  # Note parameter passed to comparator.
  assert_that(result_key_top, equal_to([('a', [9, 6, 6, 5, 3])]),
              label='key:top')
  assert_that(result_key_bot, equal_to([('a', [0, 1, 1, 1])]),
              label='key:bot')
  assert_that(result_key_cmp, equal_to([('a', [9, 6, 6, 5, 3, 2])]),
              label='key:cmp')
  pipeline.run()
def test_reuse_cloned_custom_transform_instance(self):
  pipeline = TestPipeline()
  pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
  pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
  transform = PipelineTest.CustomTransform()
  result1 = pcoll1 | transform
  result2 = pcoll2 | 'new_label' >> transform
  assert_that(result1, equal_to([2, 3, 4]), label='r1')
  assert_that(result2, equal_to([5, 6, 7]), label='r2')
  pipeline.run()
def test_metrics_in_source(self):
  pipeline = TestPipeline()
  pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5, 6]))
  res = pipeline.run()
  metric_results = res.metrics().query()
  outputs_counter = metric_results['counters'][0]
  self.assertEqual(outputs_counter.key.step, 'Read')
  self.assertEqual(outputs_counter.key.metric.name, 'outputs')
  self.assertEqual(outputs_counter.committed, 6)
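# A hedged sketch of narrowing the same metrics query, assuming
# MetricsFilter is importable from apache_beam.metrics.metric in this
# version of Beam; 'outputs' matches the counter name asserted above.
from apache_beam.metrics.metric import MetricsFilter

filtered_results = res.metrics().query(
    MetricsFilter().with_name('outputs'))
outputs_counter = filtered_results['counters'][0]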
def test_read_auto_bzip2(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file(suffix='.bz2')
  with bz2.BZ2File(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_create(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label1' >> Create([1, 2, 3])
  assert_that(pcoll, equal_to([1, 2, 3]))

  # Test if the initial value is an iterator object.
  pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
  pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
  assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
  pipeline.run()
def test_empty_write(self):
  temp_path = tempfile.NamedTemporaryFile().name
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()

  self.assertEqual(
      open(temp_path + '-00000-of-00001.output').read(), '[start][end]')
def test_tuple_combine_fn_without_defaults(self):
  p = TestPipeline()
  # with_common_input() feeds every input element to all three combiners,
  # rather than distributing tuple positions among them.
  result = (p
            | Create([1, 1, 2, 3])
            | beam.CombineGlobally(
                combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
                .with_common_input()).without_defaults())
  assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
  p.run()
def test_run_concat_direct(self):
  source = ConcatSource([RangeSource(0, 10),
                         RangeSource(10, 100),
                         RangeSource(100, 1000)])
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Read(source)
  assert_that(pcoll, equal_to(range(1000)))
  pipeline.run()
def test_hourly_team_score(self):
  with TestPipeline() as p:
    result = (p
              | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
              | hourly_team_score.HourlyTeamScore(
                  start_min='2015-11-16-15-20',
                  stop_min='2015-11-16-17-20',
                  window_duration=60))
    beam.assert_that(result, beam.equal_to(
        [('team1', 18), ('team2', 2), ('team3', 13)]))
def test_read_gzip(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file()
  with gzip.GzipFile(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name,
      0, CompressionTypes.GZIP,
      True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_read_bzip2(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file()
  with bz2.BZ2File(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name, compression_type=CompressionTypes.BZIP2)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_tfidf_transform(self):
  p = TestPipeline()
  uri_to_line = p | 'create sample' >> beam.Create(
      [('1.txt', 'abc def ghi'), ('2.txt', 'abc def'), ('3.txt', 'abc')])
  result = (uri_to_line
            | tfidf.TfIdf()
            | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
  p.run()
def test_timestamped_value(self):
  p = TestPipeline()
  result = (p
            | 'start' >> Create([(k, k) for k in range(10)])
            | Map(lambda (x, t): TimestampedValue(x, t))
            | 'w' >> WindowInto(FixedWindows(5))
            | Map(lambda v: ('key', v))
            | GroupByKey())
  assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                ('key', [5, 6, 7, 8, 9])]))
  p.run()
def test_process_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(_TFRecordSource(
                  path,
                  coder=coders.BytesCoder(),
                  compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_single(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file(path, FOO_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.io.Read(_TFRecordSource(
                  path,
                  coder=coders.BytesCoder(),
                  compression_type=CompressionTypes.AUTO,
                  validate=True)))
    beam.assert_that(result, beam.equal_to(['foo']))
def test_sliding_windows(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
  result = (pcoll
            | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
            | GroupByKey()
            | reify_windows)
  expected = [('key @ [-2.0, 2.0)', [1]),
              ('key @ [0.0, 4.0)', [1, 2, 3]),
              ('key @ [2.0, 6.0)', [2, 3])]
  assert_that(result, equal_to(expected))
  p.run()
def test_sessions(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
  result = (pcoll
            | 'w' >> WindowInto(Sessions(10))
            | GroupByKey()
            | sort_values
            | reify_windows)
  expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
              ('key @ [20.0, 45.0)', [20, 27, 35])]
  assert_that(result, equal_to(expected))
  p.run()
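# The timestamped_key_values helper used by the two window tests above is
# not shown in this excerpt. A plausible sketch, inferred from the expected
# window assignments (an assumption, not the original helper): emit
# (key, t) pairs whose event time equals the value t.
def timestamped_key_values(self, pipeline, key, *timestamps):
  return (pipeline
          | 'start-tkv' >> Create(timestamps)
          | Map(lambda t: TimestampedValue((key, t), t)))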
def test_read_auto_gzip(self):
  _, lines = write_data(15)
  file_name = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template, suffix='.gz').name
  with gzip.GzipFile(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_read_bzip2(self):
  _, lines = write_data(15)
  file_name = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template).name
  with bz2.BZ2File(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name, compression_type=CompressionTypes.BZIP2)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_read_gzip_large(self):
  _, lines = write_data(10000)
  file_name = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template).name
  with gzip.GzipFile(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name,
      0, CompressionTypes.GZIP,
      True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_write_dataflow_auto_compression(self):
  pipeline = TestPipeline()
  pcoll = pipeline | beam.core.Create(self.lines)
  pcoll | 'Write' >> WriteToText(self.path, file_name_suffix='.gz')  # pylint: disable=expression-not-assigned
  pipeline.run()

  read_result = []
  for file_name in glob.glob(self.path + '*'):
    with gzip.GzipFile(file_name, 'r') as f:
      read_result.extend(f.read().splitlines())
  self.assertEqual(read_result, self.lines)