def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | 'ReadLines' >> beam.io.ReadFromText(options.input)
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | 'WriteCounts' >> beam.io.WriteToText(options.output))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()

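# The sketch below is not part of the snippets above; it is a minimal,
# self-contained illustration of how a custom PipelineOptions subclass such as
# WordCountOptions is typically parsed from command-line style flags. The class
# name, function name, and flag value are placeholders.
def _custom_options_sketch():
  from apache_beam.options.pipeline_options import PipelineOptions

  class _SketchOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input', default='gs://my-bucket/input')

  # view_as() re-interprets the already-parsed flags as the custom subclass,
  # so one underlying options object can back several option "views".
  options = PipelineOptions(['--input=gs://example-bucket/input'])
  return options.view_as(_SketchOptions).input
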
def model_textio_compressed(renames, expected):
  """Using a Read Transform to read compressed text files."""
  p = TestPipeline()

  # [START model_textio_write_compressed]
  lines = p | 'ReadFromText' >> beam.io.ReadFromText(
      '/path/to/input-*.csv.gz',
      compression_type=beam.io.filesystem.CompressionTypes.GZIP)
  # [END model_textio_write_compressed]

  assert_that(lines, equal_to(expected))
  p.visit(SnippetUtils.RenameFiles(renames))
  p.run().wait_until_finish()

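# A minimal sketch, not part of the snippet above: the write-side counterpart.
# WriteToText accepts the same compression_type argument, so output files can
# be gzip-compressed as well. The function name and paths are placeholders.
def _write_compressed_sketch(lines):
  import apache_beam as beam
  return lines | 'WriteCompressed' >> beam.io.WriteToText(
      '/path/to/output',
      file_name_suffix='.csv.gz',
      compression_type=beam.io.filesystem.CompressionTypes.GZIP)
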
def construct_pipeline(renames):
  """A reverse words snippet as an example for constructing a pipeline."""
  import re

  # This is a duplicate of the import statement in the
  # pipelines_constructing_creating tag below, but it is required to avoid an
  # unresolved reference to beam inside the ReverseWords class.
  import apache_beam as beam

  class ReverseWords(beam.PTransform):
    """A PTransform that reverses individual elements in a PCollection."""

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda e: e[::-1])

  def filter_words(unused_x):
    """Pass-through filter that selects everything."""
    return True

  # [START pipelines_constructing_creating]
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  p = beam.Pipeline(options=PipelineOptions())
  # [END pipelines_constructing_creating]

  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipelines_constructing_reading]
  lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt')
  # [END pipelines_constructing_reading]

  # [START pipelines_constructing_applying]
  words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
  reversed_words = words | ReverseWords()
  # [END pipelines_constructing_applying]

  # [START pipelines_constructing_writing]
  filtered_words = reversed_words | 'FilterWords' >> beam.Filter(filter_words)
  filtered_words | 'WriteMyFile' >> beam.io.WriteToText(
      'gs://some/outputData.txt')
  # [END pipelines_constructing_writing]

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START pipelines_constructing_running]
  p.run()
  # [END pipelines_constructing_running]

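# A minimal sketch, not part of the snippet above: run() returns a
# PipelineResult, which can be used to block until the pipeline finishes and
# to inspect its final state. The function name is a placeholder.
def _run_and_wait_sketch(pipeline):
  result = pipeline.run()
  result.wait_until_finish()
  return result.state
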
def test_gbk_then_flatten_input_visitor(self):
  p = TestPipeline(
      runner=DataflowRunner(),
      options=PipelineOptions(self.default_properties))
  none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
  none_int_pc = p | 'c2' >> beam.Create({None: 3})
  flat = (none_str_pc, none_int_pc) | beam.Flatten()
  _ = flat | beam.GroupByKey()

  # This may change if type inference changes, but we assert it here
  # to make sure the check below is not vacuous.
  self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

  p.visit(DataflowRunner.group_by_key_input_visitor())
  p.visit(DataflowRunner.flatten_input_visitor())

  # The Dataflow runner requires GBK input to be tuples *and* flatten
  # inputs to be equal to their outputs. Assert that both hold.
  self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
  self.assertEqual(flat.element_type, none_str_pc.element_type)
  self.assertEqual(flat.element_type, none_int_pc.element_type)

def test_ptransform_override_multiple_outputs(self):

  class MultiOutputComposite(PTransform):

    def __init__(self):
      self.output_tags = set()

    def expand(self, pcoll):

      def mux_input(x):
        x = x * 2
        if isinstance(x, int):
          yield TaggedOutput('numbers', x)
        else:
          yield TaggedOutput('letters', x)

      multi = pcoll | 'MyReplacement' >> beam.ParDo(mux_input).with_outputs()
      letters = multi.letters | 'LettersComposite' >> beam.Map(lambda x: x * 3)
      numbers = multi.numbers | 'NumbersComposite' >> beam.Map(lambda x: x * 5)
      return {
          'letters': letters,
          'numbers': numbers,
      }

  class MultiOutputOverride(PTransformOverride):

    def matches(self, applied_ptransform):
      return applied_ptransform.full_label == 'MyMultiOutput'

    def get_replacement_transform(self, ptransform):
      return MultiOutputComposite()

  def mux_input(x):
    if isinstance(x, int):
      yield TaggedOutput('numbers', x)
    else:
      yield TaggedOutput('letters', x)

  p = TestPipeline()
  multi = (p
           | beam.Create([1, 2, 3, 'a', 'b', 'c'])
           | 'MyMultiOutput' >> beam.ParDo(mux_input).with_outputs())
  letters = multi.letters | 'MyLetters' >> beam.Map(lambda x: x)
  numbers = multi.numbers | 'MyNumbers' >> beam.Map(lambda x: x)

  # Assert that the PCollection replacement worked correctly and that elements
  # are flowing through. The replacement transform first multiplies by 2, then
  # the leaf nodes inside the composite multiply by an additional 3 and 5. Use
  # prime numbers to ensure that each transform is executed exactly once.
  assert_that(letters,
              equal_to(['a' * 2 * 3, 'b' * 2 * 3, 'c' * 2 * 3]),
              label='assert letters')
  assert_that(numbers,
              equal_to([1 * 2 * 5, 2 * 2 * 5, 3 * 2 * 5]),
              label='assert numbers')

  # Do the replacement and run the element assertions.
  p.replace_all([MultiOutputOverride()])
  p.run()

  # The following checks the graph to make sure the replacement occurred.
  visitor = PipelineTest.Visitor(visited=[])
  p.visit(visitor)
  pcollections = visitor.visited
  composites = visitor.enter_composite

  # Assert that the replacement is in the composite list and retrieve the
  # AppliedPTransform.
  self.assertIn(MultiOutputComposite,
                [t.transform.__class__ for t in composites])
  multi_output_composite = list(
      filter(lambda t: t.transform.__class__ == MultiOutputComposite,
             composites))[0]

  # Assert that all of the replacement PCollections are in the graph.
  for output in multi_output_composite.outputs.values():
    self.assertIn(output, pcollections)

  # Assert that all of the "old"/replaced PCollections are not in the graph.
  self.assertNotIn(multi[None], visitor.visited)
  self.assertNotIn(multi.letters, visitor.visited)
  self.assertNotIn(multi.numbers, visitor.visited)

def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets."""
  import re
  import apache_beam as beam
  from apache_beam.metrics import Metrics

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics matched_words and unmatched_words.
      self.matched_words = Metrics.counter(self.__class__, 'matched_words')
      self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')

    def process(self, element):
      word, _ = element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this
        # pipeline using the Dataflow service, these log lines will appear
        # in the Cloud Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom metric counter matched_words.
        self.matched_words.inc()
        yield element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging, providing
        # an effective mechanism to filter out less important information. Note
        # that currently only "INFO" and higher level logs are emitted to the
        # Cloud Logger, so this log message will not be visible there.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom metric counter unmatched_words.
        self.unmatched_words.inc()
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = TestPipeline()  # Use TestPipeline for testing.
  filtered_words = (
      p
      | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      | beam.combiners.Count.PerElement()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  beam.testing.util.assert_that(
      filtered_words, beam.testing.util.equal_to(
          [('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  output = (filtered_words
            # Each element is a (word, count) tuple; format it as 'word: count'.
            | 'format' >> beam.Map(lambda word_count: '%s: %s' % word_count)
            | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()

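# A minimal sketch, not part of the snippet above: after the pipeline has run,
# the custom counters created with Metrics.counter() can be queried from the
# PipelineResult. The function name is a placeholder.
def _query_matched_words_sketch(pipeline):
  from apache_beam.metrics.metric import MetricsFilter
  result = pipeline.run()
  result.wait_until_finish()
  query = result.metrics().query(MetricsFilter().with_name('matched_words'))
  # Each MetricResult carries the committed (and attempted) counter value.
  return [counter.committed for counter in query['counters']]
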