Example #1
File: snippets.py Project: gyamxxx/beam
def pipeline_monitoring(renames):
    """Using monitoring interface snippets."""

    import re
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='Output for the pipeline',
                                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (pcoll
                    # Convert lines of text into individual words.
                    | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                    # Count the number of times each word occurs.
                    | beam.combiners.Count.PerElement()
                    # Format each word and count into a printable string.
                    | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
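
Note: these snippets assume module-level imports from the surrounding snippets.py. A minimal sketch of the names the examples rely on, assuming the current Beam package layout (SnippetUtils is a test helper defined in snippets.py itself, not an importable Beam module):

import apache_beam as beam
# Metrics backs the custom counters used in the debugging examples below.
from apache_beam.metrics import Metrics
# TestPipeline is the test harness used here in place of beam.Pipeline.
from apache_beam.testing.test_pipeline import TestPipeline
# assert_that/equal_to are Beam's matchers for asserting PCollection contents.
from apache_beam.testing.util import assert_that, equal_to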
Example #2
File: snippets.py Project: gyamxxx/beam
def model_textio_compressed(renames, expected):
    """Using a Read Transform to read compressed text files."""
    p = TestPipeline()

    # [START model_textio_write_compressed]
    lines = p | 'ReadFromText' >> beam.io.ReadFromText(
        '/path/to/input-*.csv.gz',
        compression_type=beam.io.filesystem.CompressionTypes.GZIP)
    # [END model_textio_write_compressed]

    assert_that(lines, equal_to(expected))
    p.visit(SnippetUtils.RenameFiles(renames))
    p.run().wait_until_finish()
Example #3
def model_textio_compressed(renames, expected):
  """Using a Read Transform to read compressed text files."""
  p = TestPipeline()

  # [START model_textio_write_compressed]
  lines = p | 'ReadFromText' >> beam.io.ReadFromText(
      '/path/to/input-*.csv.gz',
      compression_type=beam.io.filesystem.CompressionTypes.GZIP)
  # [END model_textio_write_compressed]

  assert_that(lines, equal_to(expected))
  p.visit(SnippetUtils.RenameFiles(renames))
  p.run().wait_until_finish()
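
Note: the snippet tag says "write_compressed" but the code demonstrates a compressed read. For completeness, a sketch of the matching compressed write, assuming WriteToText's compression_type parameter (the output path and suffix are placeholders):

lines | 'WriteToText' >> beam.io.WriteToText(
    '/path/to/output',
    file_name_suffix='.csv.gz',
    compression_type=beam.io.filesystem.CompressionTypes.GZIP)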
Example #4
File: snippets.py Project: zoyahav/beam
def construct_pipeline(renames):
    """A reverse words snippet as an example for constructing a pipeline."""
    import re

    # This is a duplicate of the import statement in the
    # pipelines_constructing_creating tag below, but it is required to avoid
    # an unresolved reference in the ReverseWords class.
    import apache_beam as beam

    class ReverseWords(beam.PTransform):
        """A PTransform that reverses individual elements in a PCollection."""
        def expand(self, pcoll):
            return pcoll | beam.Map(lambda e: e[::-1])

    def filter_words(unused_x):
        """Pass through filter to select everything."""
        return True

    # [START pipelines_constructing_creating]
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    p = beam.Pipeline(options=PipelineOptions())
    # [END pipelines_constructing_creating]

    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipelines_constructing_reading]
    lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt')
    # [END pipelines_constructing_reading]

    # [START pipelines_constructing_applying]
    words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
    reversed_words = words | ReverseWords()
    # [END pipelines_constructing_applying]

    # [START pipelines_constructing_writing]
    filtered_words = reversed_words | 'FilterWords' >> beam.Filter(
        filter_words)
    filtered_words | 'WriteMyFile' >> beam.io.WriteToText(
        'gs://some/outputData.txt')
    # [END pipelines_constructing_writing]

    p.visit(SnippetUtils.RenameFiles(renames))

    # [START pipelines_constructing_running]
    p.run()
    # [END pipelines_constructing_running]
Example #5
File: snippets.py Project: eralmas7/beam
def construct_pipeline(renames):
  """A reverse words snippet as an example for constructing a pipeline."""
  import re

  # This is a duplicate of the import statement in the
  # pipelines_constructing_creating tag below, but it is required to avoid
  # an unresolved reference in the ReverseWords class.
  import apache_beam as beam

  class ReverseWords(beam.PTransform):
    """A PTransform that reverses individual elements in a PCollection."""

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda e: e[::-1])

  def filter_words(unused_x):
    """Pass through filter to select everything."""
    return True

  # [START pipelines_constructing_creating]
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  p = beam.Pipeline(options=PipelineOptions())
  # [END pipelines_constructing_creating]

  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipelines_constructing_reading]
  lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt')
  # [END pipelines_constructing_reading]

  # [START pipelines_constructing_applying]
  words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
  reversed_words = words | ReverseWords()
  # [END pipelines_constructing_applying]

  # [START pipelines_constructing_writing]
  filtered_words = reversed_words | 'FilterWords' >> beam.Filter(filter_words)
  filtered_words | 'WriteMyFile' >> beam.io.WriteToText(
      'gs://some/outputData.txt')
  # [END pipelines_constructing_writing]

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START pipelines_constructing_running]
  p.run()
  # [END pipelines_constructing_running]
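
Outside a test harness, the idiomatic way to build and run this pipeline is the context-manager form, which calls run() and waits for completion when the block exits. A minimal sketch (the GCS paths are placeholders, as above):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions()) as p:
  lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt')
  # Transforms applied here run automatically when the with-block exits.
  lines | 'WriteMyFile' >> beam.io.WriteToText('gs://some/outputData.txt')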
Example #6
    def test_gbk_then_flatten_input_visitor(self):
        p = TestPipeline(runner=DataflowRunner(),
                         options=PipelineOptions(self.default_properties))
        none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
        none_int_pc = p | 'c2' >> beam.Create({None: 3})
        flat = (none_str_pc, none_int_pc) | beam.Flatten()
        _ = flat | beam.GroupByKey()

        # This may change if type inference changes, but we assert it here
        # to make sure the check below is not vacuous.
        self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

        p.visit(DataflowRunner.group_by_key_input_visitor())
        p.visit(DataflowRunner.flatten_input_visitor())

        # The Dataflow runner requires GroupByKey input to be tuples *and*
        # Flatten inputs' element types to equal their output's. Assert both
        # hold.
        self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
        self.assertEqual(flat.element_type, none_str_pc.element_type)
        self.assertEqual(flat.element_type, none_int_pc.element_type)
Example #7
  def test_gbk_then_flatten_input_visitor(self):
    p = TestPipeline(
        runner=DataflowRunner(),
        options=PipelineOptions(self.default_properties))
    none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
    none_int_pc = p | 'c2' >> beam.Create({None: 3})
    flat = (none_str_pc, none_int_pc) | beam.Flatten()
    _ = flat | beam.GroupByKey()

    # This may change if type inference changes, but we assert it here
    # to make sure the check below is not vacuous.
    self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

    p.visit(DataflowRunner.group_by_key_input_visitor())
    p.visit(DataflowRunner.flatten_input_visitor())

    # The Dataflow runner requires GroupByKey input to be tuples *and* Flatten
    # inputs' element types to equal their output's. Assert both hold.
    self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
    self.assertEqual(flat.element_type, none_str_pc.element_type)
    self.assertEqual(flat.element_type, none_int_pc.element_type)
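
For context on the test inputs: beam.Create over a dict emits its (key, value) items, which is what makes these PCollections valid GroupByKey input. A standalone sketch on the default runner (not part of the test above):

import apache_beam as beam

with beam.Pipeline() as p:
  kv = p | beam.Create({None: 'a'})  # a dict yields its (key, value) items
  kv | beam.Map(print)               # prints: (None, 'a')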
Example #8
    def test_ptransform_override_multiple_outputs(self):
        class MultiOutputComposite(PTransform):
            def __init__(self):
                self.output_tags = set()

            def expand(self, pcoll):
                def mux_input(x):
                    x = x * 2
                    if isinstance(x, int):
                        yield TaggedOutput('numbers', x)
                    else:
                        yield TaggedOutput('letters', x)

                multi = pcoll | 'MyReplacement' >> beam.ParDo(
                    mux_input).with_outputs()
                letters = multi.letters | 'LettersComposite' >> beam.Map(
                    lambda x: x * 3)
                numbers = multi.numbers | 'NumbersComposite' >> beam.Map(
                    lambda x: x * 5)

                return {
                    'letters': letters,
                    'numbers': numbers,
                }

        class MultiOutputOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return applied_ptransform.full_label == 'MyMultiOutput'

            def get_replacement_transform(self, ptransform):
                return MultiOutputComposite()

        def mux_input(x):
            if isinstance(x, int):
                yield TaggedOutput('numbers', x)
            else:
                yield TaggedOutput('letters', x)

        p = TestPipeline()
        multi = (p
                 | beam.Create([1, 2, 3, 'a', 'b', 'c'])
                 | 'MyMultiOutput' >> beam.ParDo(mux_input).with_outputs())
        letters = multi.letters | 'MyLetters' >> beam.Map(lambda x: x)
        numbers = multi.numbers | 'MyNumbers' >> beam.Map(lambda x: x)

        # Assert that the PCollection replacement worked correctly and that
        # elements are flowing through. The replacement transform first
        # multiplies by 2, then the leaf nodes inside the composite multiply
        # by an additional 3 and 5. Use prime numbers to ensure that each
        # transform is executed exactly once.
        assert_that(letters,
                    equal_to(['a' * 2 * 3, 'b' * 2 * 3, 'c' * 2 * 3]),
                    label='assert letters')
        assert_that(numbers,
                    equal_to([1 * 2 * 5, 2 * 2 * 5, 3 * 2 * 5]),
                    label='assert numbers')

        # Do the replacement and run the element assertions.
        p.replace_all([MultiOutputOverride()])
        p.run()

        # The following checks the graph to make sure the replacement occurred.
        visitor = PipelineTest.Visitor(visited=[])
        p.visit(visitor)
        pcollections = visitor.visited
        composites = visitor.enter_composite

        # Assert the replacement is in the composite list and retrieve the
        # AppliedPTransform.
        self.assertIn(MultiOutputComposite,
                      [t.transform.__class__ for t in composites])
        multi_output_composite = list(
            filter(lambda t: t.transform.__class__ == MultiOutputComposite,
                   composites))[0]

        # Assert that all of the replacement PCollections are in the graph.
        for output in multi_output_composite.outputs.values():
            self.assertIn(output, pcollections)

        # Assert that all of the "old"/replaced PCollections are not in the graph.
        self.assertNotIn(multi[None], visitor.visited)
        self.assertNotIn(multi.letters, visitor.visited)
        self.assertNotIn(multi.numbers, visitor.visited)
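
For contrast with the multi-output case, a single-output override follows the same matches/get_replacement_transform/replace_all pattern. A minimal sketch under the same test-harness assumptions (the labels and transforms are illustrative):

class DoubleComposite(PTransform):
  def expand(self, pcoll):
    # Double each element so the assertion below can detect the replacement.
    return pcoll | 'Double' >> beam.Map(lambda x: x * 2)

class DoubleOverride(PTransformOverride):
  def matches(self, applied_ptransform):
    return applied_ptransform.full_label == 'MyIdentity'

  def get_replacement_transform(self, ptransform):
    return DoubleComposite()

p = TestPipeline()
result = (p
          | beam.Create([1, 2, 3])
          | 'MyIdentity' >> beam.Map(lambda x: x))
assert_that(result, equal_to([2, 4, 6]), label='assert doubled')
p.replace_all([DoubleOverride()])
p.run()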
Example #9
File: snippets.py Project: gyamxxx/beam
def examples_wordcount_debugging(renames):
    """DebuggingWordCount example snippets."""
    import re

    import apache_beam as beam

    # [START example_wordcount_debugging_logging]
    # [START example_wordcount_debugging_aggregators]
    import logging

    class FilterTextFn(beam.DoFn):
        """A DoFn that filters for a specific key based on a regular expression."""
        def __init__(self, pattern):
            self.pattern = pattern
            # A custom metric can track values in your pipeline as it runs.
            # Create custom metrics matched_words and unmatched_words.
            self.matched_words = Metrics.counter(
                self.__class__, 'matched_words')
            self.unmatched_words = Metrics.counter(
                self.__class__, 'unmatched_words')

        def process(self, element):
            word, _ = element
            if re.match(self.pattern, word):
                # Log at INFO level each element we match. When executing this pipeline
                # using the Dataflow service, these log lines will appear in the Cloud
                # Logging UI.
                logging.info('Matched %s', word)

                # Add 1 to the custom metric counter matched_words
                self.matched_words.inc()
                yield element
            else:
                # Log at the "DEBUG" level each element that is not matched. Different
                # log levels can be used to control the verbosity of logging providing
                # an effective mechanism to filter less important information. Note
                # currently only "INFO" and higher level logs are emitted to the Cloud
                # Logger. This log message will not be visible in the Cloud Logger.
                logging.debug('Did not match %s', word)

                # Add 1 to the custom metric counter umatched_words
                self.umatched_words.inc()

    # [END example_wordcount_debugging_logging]
    # [END example_wordcount_debugging_aggregators]

    p = TestPipeline()  # Use TestPipeline for testing.
    filtered_words = (
        p
        | beam.io.ReadFromText(
            'gs://dataflow-samples/shakespeare/kinglear.txt')
        | 'ExtractWords' >> beam.FlatMap(
            lambda x: re.findall(r'[A-Za-z\']+', x))
        | beam.combiners.Count.PerElement()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # [START example_wordcount_debugging_assert]
    assert_that(filtered_words,
                equal_to([('Flourish', 3), ('stomach', 1)]))
    # [END example_wordcount_debugging_assert]

    output = (filtered_words
              | 'format' >> beam.Map(lambda word_count: '%s: %s' % word_count)
              | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
Example #10
def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets."""
  import re

  import apache_beam as beam

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics matched_words and unmatched_words.
      self.matched_words = Metrics.counter(self.__class__, 'matched_words')
      self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')

    def process(self, element):
      word, _ = element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this pipeline
        # using the Dataflow service, these log lines will appear in the Cloud
        # Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom metric counter matched_words
        self.matched_words.inc()
        yield element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging providing
        # an effective mechanism to filter less important information. Note
        # currently only "INFO" and higher level logs are emitted to the Cloud
        # Logger. This log message will not be visible in the Cloud Logger.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom metric counter umatched_words
        self.umatched_words.inc()
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = TestPipeline()  # Use TestPipeline for testing.
  filtered_words = (
      p
      | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      | beam.combiners.Count.PerElement()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  beam.testing.util.assert_that(
      filtered_words, beam.testing.util.equal_to(
          [('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  output = (filtered_words
            | 'format' >> beam.Map(lambda word_count: '%s: %s' % word_count)
            | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
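
After the pipeline runs, the custom counters defined in FilterTextFn can be read back from the pipeline result. A sketch using Beam's metrics query API (re-running the pipeline here purely for illustration):

from apache_beam.metrics.metric import MetricsFilter

result = p.run()
result.wait_until_finish()
# Query the matched_words counter created in FilterTextFn above.
query = result.metrics().query(MetricsFilter().with_name('matched_words'))
for counter in query['counters']:
  print('%s: %s' % (counter.key.metric.name, counter.committed))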
Example #11
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | 'ReadLines' >> beam.io.ReadFromText(options.input)
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | 'WriteCounts' >> beam.io.WriteToText(options.output))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
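
WordCountOptions above picks up --input/--output from the command line. A minimal sketch of supplying explicit flags instead of reading sys.argv, assuming WordCountOptions from the snippet is in scope (the paths are the snippet's placeholder defaults):

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    '--input', 'gs://my-bucket/input',
    '--output', 'gs://my-bucket/output',
]).view_as(WordCountOptions)
print(options.input, options.output)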