# Example #1
def pipeline_monitoring(renames):
    """Build the WordCount pipeline used by the monitoring-interface snippets.

    Defines the option class, the two DoFns and the ``CountWords`` composite
    shown in the documentation, then wires them into a ``TestPipeline`` that
    reads text, counts words and writes the formatted counts.

    Args:
        renames: Not referenced by the code visible in this copy of the
            snippet.  NOTE(review): presumably consumed by a file-renaming
            step that was lost from this copy -- confirm before relying on it.
    """

    import re
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        """Adds the ``--input`` and ``--output`` flags read further below."""

        # The hook receives ``cls`` rather than ``self``, so it has to be
        # declared as a classmethod.
        @classmethod
        def _add_argparse_args(cls, parser):
            # NOTE(review): the ``default=`` values were missing from this
            # copy; the bucket paths below are placeholders -- confirm.
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='output for the pipeline',
                                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        """Splits a line of text into words (letters and apostrophes)."""

        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        """Formats a ``(word, count)`` pair as ``'word: count'``."""

        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (pcoll
                    # Convert lines of text into individual words.
                    | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                    # Count the number of times each word occurs.
                    | beam.combiners.Count.PerElement()
                    # Format each word and count into a printable string.
                    | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    # NOTE(review): the pipeline is only constructed here; no run() call is
    # visible in this copy of the snippet.

# Example #4
def construct_pipeline(renames):
    """A reverse words snippet as an example for constructing a pipeline.

    Reads lines of text, splits them into words, reverses every word, passes
    the result through a keep-everything filter, writes it out and runs the
    pipeline.

    Args:
        renames: Not referenced by the code visible in this copy of the
            snippet.  NOTE(review): presumably consumed by a file-renaming
            step that was lost from this copy -- confirm before relying on it.
    """
    import re

    # This is duplicate of the import statement in
    # pipelines_constructing_creating tag below, but required to avoid
    # Unresolved reference in ReverseWords class
    import apache_beam as beam

    class ReverseWords(beam.PTransform):
        """A PTransform that reverses individual elements in a PCollection."""
        def expand(self, pcoll):
            return pcoll | beam.Map(lambda e: e[::-1])

    def filter_words(unused_x):
        """Pass through filter to select everything."""
        return True

    # [START pipelines_constructing_creating]
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    p = beam.Pipeline(options=PipelineOptions())
    # [END pipelines_constructing_creating]

    # The pipeline created above only illustrates the documented call; the
    # rest of the snippet is built on a TestPipeline.
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipelines_constructing_reading]
    lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt')
    # [END pipelines_constructing_reading]

    # [START pipelines_constructing_applying]
    words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
    reversed_words = words | ReverseWords()
    # [END pipelines_constructing_applying]

    # NOTE(review): both call arguments in the next region were cut off in
    # this copy. ``filter_words`` is the only predicate defined here; the
    # output path was chosen to mirror the input path above -- confirm.
    # [START pipelines_constructing_writing]
    filtered_words = reversed_words | 'FilterWords' >> beam.Filter(
        filter_words)
    filtered_words | 'WriteMyFile' >> beam.io.WriteToText(
        'gs://some/outputData.txt')
    # [END pipelines_constructing_writing]

    # [START pipelines_constructing_running]
    p.run()
    # [END pipelines_constructing_running]
# Example #6
    def test_gbk_then_flatten_input_visitor(self):
        # Flattens two PCollections created from ``{None: ...}`` dicts, feeds
        # the result to GroupByKey, and checks the flattened element type
        # before and after a rewrite step.
        #
        # NOTE(review): the TestPipeline(...) call below is cut off after its
        # first keyword argument in this copy; the remaining argument(s) and
        # the closing parenthesis are missing and must be restored.
        p = TestPipeline(runner=DataflowRunner(),
        none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
        none_int_pc = p | 'c2' >> beam.Create({None: 3})
        flat = (none_str_pc, none_int_pc) | beam.Flatten()
        _ = flat | beam.GroupByKey()

        # This may change if type inference changes, but we assert it here
        # to make sure the check below is not vacuous.
        self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)


        # NOTE(review): the assertion above and the first assertion below
        # contradict each other unless something rewrites the element types
        # in between. That step is missing from this copy -- presumably the
        # runner's GroupByKey-input and Flatten-input visitors were applied
        # to ``p`` here; restore it from the original test.
        # The dataflow runner requires gbk input to be tuples *and* flatten
        # inputs to be equal to their outputs. Assert both hold.
        self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
        self.assertEqual(flat.element_type, none_str_pc.element_type)
        self.assertEqual(flat.element_type, none_int_pc.element_type)
# Example #8
    def test_ptransform_override_multiple_outputs(self):
        # Replaces a multi-output ParDo labelled 'MyMultiOutput' with a
        # composite transform, runs the pipeline to check the elements, and
        # then walks the graph to check that only the replacement's
        # PCollections remain.
        class MultiOutputComposite(PTransform):
            """Replacement: doubles each element, routes it, then scales it."""

            def __init__(self):
                self.output_tags = set()

            def expand(self, pcoll):
                def mux_input(x):
                    x = x * 2
                    # Route by type; without the else branch every int would
                    # also be emitted on the 'letters' output.
                    if isinstance(x, int):
                        yield TaggedOutput('numbers', x)
                    else:
                        yield TaggedOutput('letters', x)

                # with_outputs() is what exposes the tagged outputs as the
                # .letters / .numbers attributes used on the next lines.
                multi = pcoll | 'MyReplacement' >> beam.ParDo(
                    mux_input).with_outputs()
                letters = multi.letters | 'LettersComposite' >> beam.Map(
                    lambda x: x * 3)
                numbers = multi.numbers | 'NumbersComposite' >> beam.Map(
                    lambda x: x * 5)

                return {
                    'letters': letters,
                    'numbers': numbers,
                }

        class MultiOutputOverride(PTransformOverride):
            """Matches the 'MyMultiOutput' step and swaps in the composite."""

            def matches(self, applied_ptransform):
                return applied_ptransform.full_label == 'MyMultiOutput'

            def get_replacement_transform(self, ptransform):
                return MultiOutputComposite()

        def mux_input(x):
            if isinstance(x, int):
                yield TaggedOutput('numbers', x)
            else:
                yield TaggedOutput('letters', x)

        p = TestPipeline()
        multi = (p
                 | beam.Create([1, 2, 3, 'a', 'b', 'c'])
                 | 'MyMultiOutput' >> beam.ParDo(mux_input).with_outputs())
        letters = multi.letters | 'MyLetters' >> beam.Map(lambda x: x)
        numbers = multi.numbers | 'MyNumbers' >> beam.Map(lambda x: x)

        # Assert that the PCollection replacement worked correctly and that elements
        # are flowing through. The replacement transform first multiples by 2 then
        # the leaf nodes inside the composite multiply by an additional 3 and 5. Use
        # prime numbers to ensure that each transform is getting executed once.
        assert_that(letters,
                    equal_to(['a' * 2 * 3, 'b' * 2 * 3, 'c' * 2 * 3]),
                    label='assert letters')
        assert_that(numbers,
                    equal_to([1 * 2 * 5, 2 * 2 * 5, 3 * 2 * 5]),
                    label='assert numbers')

        # Do the replacement and run the element assertions.
        p.replace_all([MultiOutputOverride()])
        p.run()

        # The following checks the graph to make sure the replacement occurred.
        visitor = PipelineTest.Visitor(visited=[])
        p.visit(visitor)
        pcollections = visitor.visited
        composites = visitor.enter_composite

        # Assert the replacement is in the composite list and retrieve the
        # AppliedPTransform.
        self.assertIn(MultiOutputComposite,
                      [t.transform.__class__ for t in composites])
        multi_output_composite = list(
            filter(lambda t: t.transform.__class__ == MultiOutputComposite,
                   composites))[0]

        # Assert that all of the replacement PCollections are in the graph.
        for output in multi_output_composite.outputs.values():
            self.assertIn(output, pcollections)

        # Assert that all of the "old"/replaced PCollections are not in the graph.
        self.assertNotIn(multi[None], visitor.visited)
        self.assertNotIn(multi.letters, visitor.visited)
        self.assertNotIn(multi.numbers, visitor.visited)
# Example #10
def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets.

  Builds a pipeline that counts words, keeps only the words matching a
  regular expression (logging and counting matches and non-matches), asserts
  the expected counts and writes the formatted result.

  Args:
    renames: Not referenced by the code visible in this copy of the snippet.
      NOTE(review): presumably consumed by a file-renaming step that was lost
      from this copy -- confirm before relying on it.
  """
  import re

  import apache_beam as beam

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics matched_word and unmatched_words.
      self.matched_words = Metrics.counter(self.__class__, 'matched_words')
      self.umatched_words = Metrics.counter(self.__class__, 'umatched_words')

    def process(self, element):
      word, _ = element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this pipeline
        # using the Dataflow service, these log lines will appear in the Cloud
        # Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom metric counter matched_words
        self.matched_words.inc()
        yield element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging providing
        # an effective mechanism to filter less important information. Note
        # currently only "INFO" and higher level logs are emitted to the Cloud
        # Logger. This log message will not be visible in the Cloud Logger.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom metric counter umatched_words
        self.umatched_words.inc()
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = TestPipeline()  # Use TestPipeline for testing.
  # NOTE(review): the ReadFromText argument was cut off in this copy; the
  # King Lear sample path below is a best guess based on the asserted word
  # counts -- confirm.
  filtered_words = (
      p
      | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      | beam.combiners.Count.PerElement()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  beam.testing.util.assert_that(
      filtered_words, beam.testing.util.equal_to(
          [('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  def format_result(word_count):
    """Render a (word, count) pair as 'word: count'."""
    # Unpack in the body: tuple parameters such as ``lambda (word, c): ...``
    # are a syntax error on Python 3.
    word, count = word_count
    return '%s: %s' % (word, count)

  output = (filtered_words
            | 'format' >> beam.Map(format_result)
            | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

