Example #1
 def test_dataflow_file_pattern(self):
   pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
   assert len(expected_data) == 40
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(pattern)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
Example #2
  def test_setting_timestamp(self):
    p = TestPipeline()
    unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
    items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

    def extract_timestamp_from_log_entry(entry):
      return entry[1]

    # [START setting_timestamp]
    class AddTimestampDoFn(beam.DoFn):

      def process(self, element):
        # Extract the numeric Unix seconds-since-epoch timestamp to be
        # associated with the current log entry.
        unix_timestamp = extract_timestamp_from_log_entry(element)
        # Wrap and emit the current entry and new timestamp in a
        # TimestampedValue.
        yield beam.TimestampedValue(element, unix_timestamp)

    timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
    # [END setting_timestamp]
    fixed_windowed_items = (
        timestamped_items | 'window' >> beam.WindowInto(
            beam.window.FixedWindows(60)))
    summed = (fixed_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    beam.assert_that(unkeyed, beam.equal_to([42, 187]))
    p.run()
Example #3
    def test_setting_timestamp(self):
        p = TestPipeline()
        unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
        items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

        def extract_timestamp_from_log_entry(entry):
            return entry[1]

        # [START setting_timestamp]
        class AddTimestampDoFn(beam.DoFn):
            def process(self, element):
                # Extract the numeric Unix seconds-since-epoch timestamp to be
                # associated with the current log entry.
                unix_timestamp = extract_timestamp_from_log_entry(element)
                # Wrap and emit the current entry and new timestamp in a
                # TimestampedValue.
                yield beam.TimestampedValue(element, unix_timestamp)

        timestamped_items = items | 'timestamp' >> beam.ParDo(
            AddTimestampDoFn())
        # [END setting_timestamp]
        fixed_windowed_items = (
            timestamped_items
            | 'window' >> beam.WindowInto(beam.window.FixedWindows(60)))
        summed = (fixed_windowed_items
                  | 'group' >> beam.GroupByKey()
                  | 'combine' >> beam.CombineValues(sum))
        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
        beam.assert_that(unkeyed, beam.equal_to([42, 187]))
        p.run()
Example #4
def model_multiple_pcollections_partition(contents, output_path):
    """Splitting a PCollection with Partition."""
    some_hash_fn = lambda s: ord(s[0])

    def get_percentile(i):
        """Assume i in [0,100)."""
        return i

    import apache_beam as beam
    p = TestPipeline()  # Use TestPipeline for testing.

    students = p | beam.Create(contents)

    # [START model_multiple_pcollections_partition]
    def partition_fn(student, num_partitions):
        return int(get_percentile(student) * num_partitions / 100)

    by_decile = students | beam.Partition(partition_fn, 10)
    # [END model_multiple_pcollections_partition]
    # [START model_multiple_pcollections_partition_40th]
    fortieth_percentile = by_decile[4]
    # [END model_multiple_pcollections_partition_40th]

    ([by_decile[d] for d in xrange(10) if d != 4] + [fortieth_percentile]
     | beam.Flatten()
     | beam.io.WriteToText(output_path))

    p.run()
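Note: `xrange` in the flatten step above is a Python 2 built-in; under Python 3 the same list comprehension would simply use `range(10)`.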
Example #5
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  ([by_decile[d] for d in xrange(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))

  p.run()
Example #6
 def test_basics(self):
     p = TestPipeline()
     rows = (p | 'create' >> beam.Create([{
         'month': 1,
         'day': 1,
         'tornado': False
     }, {
         'month': 1,
         'day': 2,
         'tornado': True
     }, {
         'month': 1,
         'day': 3,
         'tornado': True
     }, {
         'month': 2,
         'day': 1,
         'tornado': True
     }]))
     results = bigquery_tornadoes.count_tornadoes(rows)
     beam.assert_that(
         results,
         beam.equal_to([{
             'month': 1,
             'tornado_count': 2
         }, {
             'month': 2,
             'tornado_count': 1
         }]))
     p.run().wait_until_finish()
Example #7
    def test_compute_top_sessions(self):
        p = TestPipeline()
        edits = p | beam.Create(self.EDITS)
        result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

        beam.assert_that(result, beam.equal_to(self.EXPECTED))
        p.run()
Example #8
 def test_dataflow_file_pattern(self):
     pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
     assert len(expected_data) == 40
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(pattern)
     assert_that(pcoll, equal_to(expected_data))
     pipeline.run()
Example #9
def model_composite_transform_example(contents, output_path):
    """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To override the apply method, define a method "apply" that
  takes a PCollection as its only parameter and returns a PCollection.
  """
    import re

    import apache_beam as beam

    # [START composite_transform_example]
    # [START composite_ptransform_apply_method]
    # [START composite_ptransform_declare]
    class CountWords(beam.PTransform):
        # [END composite_ptransform_declare]

        def expand(self, pcoll):
            return (pcoll
                    | beam.FlatMap(lambda x: re.findall(r'\w+', x))
                    | beam.combiners.Count.PerElement()
                    | beam.Map(lambda (word, c): '%s: %s' % (word, c)))

    # [END composite_ptransform_apply_method]
    # [END composite_transform_example]

    p = TestPipeline()  # Use TestPipeline for testing.
    (p
     | beam.Create(contents)
     | CountWords()
     | beam.io.WriteToText(output_path))
    p.run()
Example #10
  def test_compute_top_sessions(self):
    p = TestPipeline()
    edits = p | beam.Create(self.EDITS)
    result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

    beam.assert_that(result, beam.equal_to(self.EXPECTED))
    p.run()
Example #11
File: textio_test.py  Project: xgong/beam
 def test_read_gzip_empty_file(self):
     file_name = self._create_temp_file()
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #12
 def test_timestamped_with_combiners(self):
   p = TestPipeline()
   result = (p
             # Create some initial test values.
             | 'start' >> Create([(k, k) for k in range(10)])
             # The purpose of the WindowInto transform is to establish a
             # FixedWindows windowing function for the PCollection.
             # It does not bucket elements into windows since the timestamps
             # from Create are not spaced 5 ms apart and very likely they all
             # fall into the same window.
             | 'w' >> WindowInto(FixedWindows(5))
             # Generate timestamped values using the values as timestamps.
             # Now there are values 5 ms apart and since Map propagates the
             # windowing function from input to output the output PCollection
             # will have elements falling into different 5ms windows.
             | Map(lambda (x, t): TimestampedValue(x, t))
             # We add a 'key' to each value representing the index of the
             # window. This is important since there is no guarantee of
             # order for the elements of a PCollection.
             | Map(lambda v: (v / 5, v)))
   # Sum all elements associated with a key and window. Although it
   # is called CombinePerKey it is really CombinePerKeyAndWindow the
   # same way GroupByKey is really GroupByKeyAndWindow.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
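Note: the two `Map` steps above rely on Python 2 only features: a tuple-parameter lambda (`lambda (x, t): ...`) and integer division (`v / 5`). A rough Python 3 equivalent of just those two steps, keeping the same imports and using `pairs` as a stand-in name for the preceding PCollection, would be:

   # Sketch only: unpack the (value, timestamp) pair by indexing instead of
   # a tuple parameter, and use floor division to keep an integer window key.
   timestamped = pairs | Map(lambda kv: TimestampedValue(kv[0], kv[1]))
   keyed = timestamped | Map(lambda v: (v // 5, v))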
Example #13
def pipeline_logging(lines, output):
  """Logging Pipeline Messages."""

  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(lines)
   | beam.ParDo(ExtractWordsFn())
   | beam.io.WriteToText(output))

  p.run()
Example #14
    def test_bad_types(self):
        # [START type_hints_missing_define_numbers]
        p = TestPipeline(options=PipelineOptions(pipeline_type_check=True))

        numbers = p | beam.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # pylint: disable=expression-not-assigned
        # pylint: disable=unused-variable
        # [START type_hints_missing_apply]
        evens = numbers | beam.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as [snippet above].
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            evens = numbers | beam.Filter(
                lambda x: x % 2 == 0).with_input_types(int)
            # [END type_hints_takes]

        # Type hints can be declared on DoFns and callables as well, rather
        # than where they're used, to be more self contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @beam.typehints.with_input_types(int)
            class FilterEvensDoFn(beam.DoFn):
                def process(self, element):
                    if element % 2 == 0:
                        yield element

            evens = numbers | beam.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
        # One can assert outputs and apply them to transforms as well.
        # Helps document the contract and checks it at pipeline construction time.
        # [START type_hints_transform]
        T = beam.typehints.TypeVariable('T')

        @beam.typehints.with_input_types(T)
        @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
        class MyTransform(beam.PTransform):
            def expand(self, pcoll):
                return pcoll | beam.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        # pylint: disable=expression-not-assigned
        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | beam.Map(lambda x: x).with_input_types(
                beam.typehints.Tuple[int, int])
Example #15
    def test_to_list_and_to_dict(self):
        pipeline = TestPipeline()
        the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
        pcoll = pipeline | 'start' >> Create(the_list)
        result = pcoll | 'to list' >> combine.ToList()

        def matcher(expected):
            def match(actual):
                equal_to(expected[0])(actual[0])

            return match

        assert_that(result, matcher([the_list]))
        pipeline.run()

        pipeline = TestPipeline()
        pairs = [(1, 2), (3, 4), (5, 6)]
        pcoll = pipeline | 'start-pairs' >> Create(pairs)
        result = pcoll | 'to dict' >> combine.ToDict()

        def matcher():
            def match(actual):
                equal_to([1])([len(actual)])
                equal_to(pairs)(actual[0].iteritems())

            return match

        assert_that(result, matcher())
        pipeline.run()
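Note: `iteritems()` in the dict matcher above is Python 2 only; a Python 3 version of the same check would call `actual[0].items()` instead.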
Example #16
    def test_create_groups(self):
        p = TestPipeline()

        group_ids_pcoll = p | 'CreateGroupIds' >> beam.Create(['A', 'B', 'C'])
        corpus_pcoll = p | 'CreateCorpus' >> beam.Create([{
            'f': 'corpus1'
        }, {
            'f': 'corpus2'
        }, {
            'f': 'corpus3'
        }])
        words_pcoll = p | 'CreateWords' >> beam.Create([{
            'f': 'word1'
        }, {
            'f': 'word2'
        }, {
            'f': 'word3'
        }])
        ignore_corpus_pcoll = p | 'CreateIgnoreCorpus' >> beam.Create(
            ['corpus1'])
        ignore_word_pcoll = p | 'CreateIgnoreWord' >> beam.Create(['word1'])

        groups = bigquery_side_input.create_groups(group_ids_pcoll,
                                                   corpus_pcoll, words_pcoll,
                                                   ignore_corpus_pcoll,
                                                   ignore_word_pcoll)

        beam.assert_that(
            groups,
            beam.equal_to([('A', 'corpus2', 'word2'),
                           ('B', 'corpus2', 'word2'),
                           ('C', 'corpus2', 'word2')]))
        p.run()
Example #17
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
    """Applying a CoGroupByKey Transform to a tuple."""
    import apache_beam as beam
    p = TestPipeline()  # Use TestPipeline for testing.
    # [START model_group_by_key_cogroupbykey_tuple]
    # Each data set is represented by key-value pairs in separate PCollections.
    # Both data sets share a common key type (in this example str).
    # The email_list contains values such as: ('joe', '*****@*****.**') with
    # multiple possible values for each key.
    # The phone_list contains values such as: ('mary', '111-222-3333') with
    # multiple possible values for each key.
    emails = p | 'email' >> beam.Create(email_list)
    phones = p | 'phone' >> beam.Create(phone_list)
    # The result PCollection contains one key-value element for each key in the
    # input PCollections. The key of the pair will be the key from the input and
    # the value will be a dictionary with two entries: 'emails' - an iterable of
    # all values for the current key in the emails PCollection and 'phones': an
    # iterable of all values for the current key in the phones PCollection.
    # For instance, if 'emails' contained ('joe', '*****@*****.**') and
    # ('joe', '*****@*****.**'), then 'result' will contain the element
    # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
    result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

    def join_info((name, info)):
        return '; '.join([
            '%s' % name,
            '%s' % ','.join(info['emails']),
            '%s' % ','.join(info['phones'])
        ])

    contact_lines = result | beam.Map(join_info)
    # [END model_group_by_key_cogroupbykey_tuple]
    contact_lines | beam.io.WriteToText(output_path)
    p.run()
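Note: `def join_info((name, info)):` above uses Python 2 tuple-parameter syntax, which was removed in Python 3 (PEP 3113). A sketch of the same helper for Python 3, intended to behave identically, would unpack inside the body:

    def join_info(name_info):
        # Unpack the (name, info-dict) pair explicitly; tuple parameters are
        # no longer allowed in function signatures under Python 3.
        name, info = name_info
        return '; '.join([
            '%s' % name,
            '%s' % ','.join(info['emails']),
            '%s' % ','.join(info['phones'])
        ])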
Example #18
 def test_dataflow_single_file(self):
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(file_name)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
Example #19
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

  # Partition into deciles
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1 = partitioned[0]
  pcoll2 = partitioned[1]
  pcoll3 = partitioned[2]

  # Flatten them back into 1

  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)
  # [START model_multiple_pcollections_flatten]
  merged = (
      # [START model_multiple_pcollections_tuple]
      (pcoll1, pcoll2, pcoll3)
      # [END model_multiple_pcollections_tuple]
      # A list of tuples can be "piped" directly into a Flatten transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)

  p.run()
Example #20
 def run_pipeline(self, count_implementation, factor=1):
   p = TestPipeline()
   words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
   result = words | count_implementation
   assert_that(
       result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
   p.run()
Example #21
  def test_run_direct(self):
    file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    pipeline = TestPipeline()
    pcoll = pipeline | beam.io.Read(LineSource(file_name))
    assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

    pipeline.run()
Example #22
    def test_run_direct(self):
        file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
        pipeline = TestPipeline()
        pcoll = pipeline | beam.io.Read(LineSource(file_name))
        assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

        pipeline.run()
Example #23
 def test_timestamped_with_combiners(self):
   p = TestPipeline()
   result = (p
             # Create some initial test values.
             | 'start' >> Create([(k, k) for k in range(10)])
             # The purpose of the WindowInto transform is to establish a
             # FixedWindows windowing function for the PCollection.
             # It does not bucket elements into windows since the timestamps
             # from Create are not spaced 5 ms apart and very likely they all
             # fall into the same window.
             | 'w' >> WindowInto(FixedWindows(5))
             # Generate timestamped values using the values as timestamps.
             # Now there are values 5 ms apart and since Map propagates the
             # windowing function from input to output the output PCollection
             # will have elements falling into different 5ms windows.
             | Map(lambda (x, t): TimestampedValue(x, t))
             # We add a 'key' to each value representing the index of the
             # window. This is important since there is no guarantee of
             # order for the elements of a PCollection.
             | Map(lambda v: (v / 5, v)))
   # Sum all elements associated with a key and window. Although it
   # is called CombinePerKey it is really CombinePerKeyAndWindow the
   # same way GroupByKey is really GroupByKeyAndWindow.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
Example #24
def model_multiple_pcollections_flatten(contents, output_path):
    """Merging a PCollection with Flatten."""
    some_hash_fn = lambda s: ord(s[0])
    import apache_beam as beam
    p = TestPipeline()  # Use TestPipeline for testing.
    partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

    # Partition into deciles
    partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
    pcoll1 = partitioned[0]
    pcoll2 = partitioned[1]
    pcoll3 = partitioned[2]

    # Flatten them back into 1

    # A collection of PCollection objects can be represented simply
    # as a tuple (or list) of PCollections.
    # (The SDK for Python has no separate type to store multiple
    # PCollection objects, whether containing the same or different
    # types.)
    # [START model_multiple_pcollections_flatten]
    merged = (
        # [START model_multiple_pcollections_tuple]
        (pcoll1, pcoll2, pcoll3)
        # [END model_multiple_pcollections_tuple]
        # A list of tuples can be "piped" directly into a Flatten transform.
        | beam.Flatten())
    # [END model_multiple_pcollections_flatten]
    merged | beam.io.WriteToText(output_path)

    p.run()
Example #25
 def run_pipeline(self, count_implementation, factor=1):
   p = TestPipeline()
   words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
   result = words | count_implementation
   assert_that(
       result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
   p.run()
Example #26
  def test_to_list_and_to_dict(self):
    pipeline = TestPipeline()
    the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    pcoll = pipeline | 'start' >> Create(the_list)
    result = pcoll | 'to list' >> combine.ToList()

    def matcher(expected):
      def match(actual):
        equal_to(expected[0])(actual[0])
      return match
    assert_that(result, matcher([the_list]))
    pipeline.run()

    pipeline = TestPipeline()
    pairs = [(1, 2), (3, 4), (5, 6)]
    pcoll = pipeline | 'start-pairs' >> Create(pairs)
    result = pcoll | 'to dict' >> combine.ToDict()

    def matcher():
      def match(actual):
        equal_to([1])([len(actual)])
        equal_to(pairs)(actual[0].iteritems())
      return match
    assert_that(result, matcher())
    pipeline.run()
Example #27
 def test_dataflow_single_file(self):
     file_name, expected_data = write_data(5)
     assert len(expected_data) == 5
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(file_name)
     assert_that(pcoll, equal_to(expected_data))
     pipeline.run()
Example #28
  def test_bad_types(self):
    p = TestPipeline()
    evens = None  # pylint: disable=unused-variable

    # [START type_hints_missing_define_numbers]
    numbers = p | beam.Create(['1', '2', '3'])
    # [END type_hints_missing_define_numbers]

    # Consider the following code.
    # pylint: disable=expression-not-assigned
    # pylint: disable=unused-variable
    # [START type_hints_missing_apply]
    evens = numbers | beam.Filter(lambda x: x % 2 == 0)
    # [END type_hints_missing_apply]

    # Now suppose numbers was defined as [snippet above].
    # When running this pipeline, you'd get a runtime error,
    # possibly on a remote machine, possibly very late.

    with self.assertRaises(TypeError):
      p.run()

    # To catch this early, we can assert what types we expect.
    with self.assertRaises(typehints.TypeCheckError):
      # [START type_hints_takes]
      p.options.view_as(TypeOptions).pipeline_type_check = True
      evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
      # [END type_hints_takes]

    # Type hints can be declared on DoFns and callables as well, rather
    # than where they're used, to be more self contained.
    with self.assertRaises(typehints.TypeCheckError):
      # [START type_hints_do_fn]
      @beam.typehints.with_input_types(int)
      class FilterEvensDoFn(beam.DoFn):
        def process(self, element):
          if element % 2 == 0:
            yield element
      evens = numbers | beam.ParDo(FilterEvensDoFn())
      # [END type_hints_do_fn]

    words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
    # One can assert outputs and apply them to transforms as well.
    # Helps document the contract and checks it at pipeline construction time.
    # [START type_hints_transform]
    T = beam.typehints.TypeVariable('T')

    @beam.typehints.with_input_types(T)
    @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
    class MyTransform(beam.PTransform):
      def expand(self, pcoll):
        return pcoll | beam.Map(lambda x: (len(x), x))

    words_with_lens = words | MyTransform()
    # [END type_hints_transform]

    # pylint: disable=expression-not-assigned
    with self.assertRaises(typehints.TypeCheckError):
      words_with_lens | beam.Map(lambda x: x).with_input_types(
          beam.typehints.Tuple[int, int])
Example #29
def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To override the expand method, define a method "expand" that
  takes a PCollection as its only parameter and returns a PCollection.
  """
  import re

  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              | beam.Map(lambda (word, c): '%s: %s' % (word, c)))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()
Example #30
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple."""
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', '*****@*****.**') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary', '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones': an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', '*****@*****.**') and
  # ('joe', '*****@*****.**'), then 'result' will contain the element
  # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info((name, info)):
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
Example #31
def pipeline_logging(lines, output):
    """Logging Pipeline Messages."""

    import re
    import apache_beam as beam

    # [START pipeline_logging]
    # import Python logging module.
    import logging

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

                if word.lower() == 'love':
                    # Log using the root logger at info or higher levels
                    logging.info('Found : %s', word.lower())

    # Remaining WordCount example code ...
    # [END pipeline_logging]

    p = TestPipeline()  # Use TestPipeline for testing.
    (p
     | beam.Create(lines)
     | beam.ParDo(ExtractWordsFn())
     | beam.io.WriteToText(output))

    p.run()
Example #32
 def test_runtime_checks_on(self):
   # pylint: disable=expression-not-assigned
   p = TestPipeline()
   with self.assertRaises(typehints.TypeCheckError):
     # [START type_hints_runtime_on]
     p.options.view_as(TypeOptions).runtime_type_check = True
     p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
     p.run()
Example #33
 def test_runtime_checks_on(self):
     # pylint: disable=expression-not-assigned
     p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
     with self.assertRaises(typehints.TypeCheckError):
         # [START type_hints_runtime_on]
         p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
         p.run()
Example #34
    def test_basics(self):
        p = TestPipeline()
        result = p | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

        # Note: Probabilistically speaking this test can fail with a probability
        # that is very small (VERY) given that we run at least 500 thousand trials.
        assert_that(result, in_between(3.125, 3.155))
        p.run()
Example #35
  def test_basics(self):
    p = TestPipeline()
    result = p | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

    # Note: Probabilistically speaking this test can fail with a probability
    # that is very small (VERY) given that we run at least 500 thousand trials.
    assert_that(result, in_between(3.125, 3.155))
    p.run()
Example #36
 def test_read_gzip_empty_file(self):
     filename = tempfile.NamedTemporaryFile(delete=False,
                                            prefix=tempfile.template).name
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #37
def pipeline_monitoring(renames):
    """Using monitoring interface snippets."""

    import re
    import apache_beam as beam
    from apache_beam.utils.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='output for the pipeline',
                                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (pcoll
                    # Convert lines of text into individual words.
                    | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                    # Count the number of times each word occurs.
                    | beam.combiners.Count.PerElement()
                    # Format each word and count into a printable string.
                    | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
Example #38
 def test_compute_points(self):
     p = TestPipeline()
     records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
     result = (records
               | 'points' >> beam.FlatMap(coders.compute_points)
               | beam.CombinePerKey(sum))
     assert_that(result,
                 equal_to([('Italy', 0), ('Brasil', 6), ('Germany', 3)]))
     p.run()
Example #39
  def test_timestamp_param(self):
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        yield timestamp

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    pipeline.run()
Example #40
  def test_context_param(self):
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        yield context.element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2])| 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #41
  def test_element(self):
    class TestDoFn(DoFn):
      def process(self, element):
        yield element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #42
 def test_empty_write(self):
   temp_path = tempfile.NamedTemporaryFile().name
   sink = MyFileSink(
       temp_path, file_name_suffix='.foo', coder=coders.ToStringCoder())
   p = TestPipeline()
   p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
   p.run()
   self.assertEqual(
       open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
Example #43
 def test_tuple_combine_fn(self):
     p = TestPipeline()
     result = (p
               | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
               | beam.CombineGlobally(
                   combine.TupleCombineFn(max, combine.MeanCombineFn(),
                                          sum)).without_defaults())
     assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
     p.run()
Example #44
  def test_timestamp_param(self):
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        yield timestamp

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    pipeline.run()
Example #45
 def test_read_gzip_empty_file(self):
   file_name = self._create_temp_file()
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(
       file_name,
       0, CompressionTypes.GZIP,
       True, coders.StrUtf8Coder())
   assert_that(pcoll, equal_to([]))
   pipeline.run()
Example #46
  def test_element(self):
    class TestDoFn(DoFn):
      def process(self, element):
        yield element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #47
  def test_context_param(self):
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        yield context.element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2])| 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #48
    def test_top(self):
        pipeline = TestPipeline()

        # A parameter we'll be sharing with a custom comparator.
        names = {
            0: 'zo',
            1: 'one',
            2: 'twoo',
            3: 'three',
            5: 'fiiive',
            6: 'sssssix',
            9: 'nniiinne'
        }

        # First for global combines.
        pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
        result_top = pcoll | 'top' >> combine.Top.Largest(5)
        result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
        result_cmp = pcoll | 'cmp' >> combine.Top.Of(
            'cmp', 6, lambda a, b, names: len(names[a]) < len(names[b]),
            names)  # Note parameter passed to comparator.
        result_cmp_rev = pcoll | 'cmp_rev' >> combine.Top.Of(
            'cmp',
            3,
            lambda a, b, names: len(names[a]) < len(names[b]),
            names,  # Note parameter passed to comparator.
            reverse=True)
        assert_that(result_top,
                    equal_to([[9, 6, 6, 5, 3]]),
                    label='assert:top')
        assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')
        assert_that(result_cmp,
                    equal_to([[9, 6, 6, 5, 3, 2]]),
                    label='assert:cmp')
        assert_that(result_cmp_rev,
                    equal_to([[0, 1, 1]]),
                    label='assert:cmp_rev')

        # Again for per-key combines.
        pcoll = pipeline | 'start-perkye' >> Create(
            [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
        result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(5)
        result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(4)
        result_key_cmp = pcoll | 'cmp-perkey' >> combine.Top.PerKey(
            6, lambda a, b, names: len(names[a]) < len(names[b]),
            names)  # Note parameter passed to comparator.
        assert_that(result_key_top,
                    equal_to([('a', [9, 6, 6, 5, 3])]),
                    label='key:top')
        assert_that(result_key_bot,
                    equal_to([('a', [0, 1, 1, 1])]),
                    label='key:bot')
        assert_that(result_key_cmp,
                    equal_to([('a', [9, 6, 6, 5, 3, 2])]),
                    label='key:cmp')
        pipeline.run()
Example #49
  def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
      f.write('\n'.join(lines))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
Example #50
 def test_tuple_combine_fn_without_defaults(self):
   p = TestPipeline()
   result = (
       p
       | Create([1, 1, 2, 3])
       | beam.CombineGlobally(
           combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
           .with_common_input()).without_defaults())
   assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
   p.run()
Example #51
  def test_run_concat_direct(self):
    source = ConcatSource([RangeSource(0, 10),
                           RangeSource(10, 100),
                           RangeSource(100, 1000),
                          ])
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Read(source)
    assert_that(pcoll, equal_to(range(1000)))

    pipeline.run()
Example #52
 def test_tuple_combine_fn(self):
   p = TestPipeline()
   result = (
       p
       | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
       | beam.CombineGlobally(combine.TupleCombineFn(max,
                                                     combine.MeanCombineFn(),
                                                     sum)).without_defaults())
   assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
   p.run()
Example #53
  def test_create(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'label1' >> Create([1, 2, 3])
    assert_that(pcoll, equal_to([1, 2, 3]))

    # Test if initial value is an iterator object.
    pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
    pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
    assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
    pipeline.run()
Example #54
 def test_reuse_cloned_custom_transform_instance(self):
   pipeline = TestPipeline()
   pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
   pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
   transform = PipelineTest.CustomTransform()
   result1 = pcoll1 | transform
   result2 = pcoll2 | 'new_label' >> transform
   assert_that(result1, equal_to([2, 3, 4]), label='r1')
   assert_that(result2, equal_to([5, 6, 7]), label='r2')
   pipeline.run()
Example #55
 def test_reuse_cloned_custom_transform_instance(self):
     pipeline = TestPipeline()
     pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
     pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
     transform = PipelineTest.CustomTransform()
     result1 = pcoll1 | transform
     result2 = pcoll2 | 'new_label' >> transform
     assert_that(result1, equal_to([2, 3, 4]), label='r1')
     assert_that(result2, equal_to([5, 6, 7]), label='r2')
     pipeline.run()
Example #56
    def test_create(self):
        pipeline = TestPipeline()
        pcoll = pipeline | 'label1' >> Create([1, 2, 3])
        assert_that(pcoll, equal_to([1, 2, 3]))

        # Test if initial value is an iterator object.
        pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
        pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
        assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
        pipeline.run()
Example #57
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileSink(temp_path,
                       file_name_suffix='.output',
                       coder=coders.ToStringCoder())
     p = TestPipeline()
     p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
     p.run()
     self.assertEqual(
         open(temp_path + '-00000-of-00001.output').read(), '[start][end]')
Example #58
 def test_timestamped_value(self):
   p = TestPipeline()
   result = (p
             | 'start' >> Create([(k, k) for k in range(10)])
             | Map(lambda (x, t): TimestampedValue(x, t))
             | 'w' >> WindowInto(FixedWindows(5))
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                 ('key', [5, 6, 7, 8, 9])]))
   p.run()
Example #59
 def test_sliding_windows(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
   result = (pcoll
             | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
             | GroupByKey()
             | reify_windows)
   expected = [('key @ [-2.0, 2.0)', [1]),
               ('key @ [0.0, 4.0)', [1, 2, 3]),
               ('key @ [2.0, 6.0)', [2, 3])]
   assert_that(result, equal_to(expected))
   p.run()
Example #60
 def test_sessions(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
   result = (pcoll
             | 'w' >> WindowInto(Sessions(10))
             | GroupByKey()
             | sort_values
             | reify_windows)
   expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
               ('key @ [20.0, 45.0)', [20, 27, 35])]
   assert_that(result, equal_to(expected))
   p.run()