Example #1
 def test_dataflow_single_file(self):
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(file_name)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
Example #2
  def test_setting_timestamp(self):
    p = TestPipeline()
    unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
    items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

    def extract_timestamp_from_log_entry(entry):
      return entry[1]

    # [START setting_timestamp]
    class AddTimestampDoFn(beam.DoFn):

      def process(self, element):
        # Extract the numeric Unix seconds-since-epoch timestamp to be
        # associated with the current log entry.
        unix_timestamp = extract_timestamp_from_log_entry(element)
        # Wrap and emit the current entry and new timestamp in a
        # TimestampedValue.
        yield beam.window.TimestampedValue(element, unix_timestamp)

    timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
    # [END setting_timestamp]
    fixed_windowed_items = (
        timestamped_items | 'window' >> beam.WindowInto(
            beam.window.FixedWindows(60)))
    summed = (fixed_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    # Elements 12 and 30 fall into the window [0, 60) (sum 42); 60, 61 and 66
    # fall into [60, 120) (sum 187).
    beam.assert_that(unkeyed, beam.equal_to([42, 187]))
    p.run()
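A side note (not part of the original test): the same timestamping can be done without a DoFn by returning a TimestampedValue from a Map, the pattern later examples in this collection use:

    timestamped_items = items | 'timestamp' >> beam.Map(
        lambda entry: beam.window.TimestampedValue(
            entry, extract_timestamp_from_log_entry(entry)))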
Example #3
 def test_dataflow_file_pattern(self):
   pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
   assert len(expected_data) == 40
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(pattern)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
Example #4
 def run_pipeline(self, count_implementation, factor=1):
   p = TestPipeline()
   words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
   result = words | count_implementation
   assert_that(
       result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
   p.run()
Example #5
  def test_bad_types(self):
    p = TestPipeline()
    evens = None  # pylint: disable=unused-variable

    # [START type_hints_missing_define_numbers]
    numbers = p | beam.Create(['1', '2', '3'])
    # [END type_hints_missing_define_numbers]

    # Consider the following code.
    # pylint: disable=expression-not-assigned
    # pylint: disable=unused-variable
    # [START type_hints_missing_apply]
    evens = numbers | beam.Filter(lambda x: x % 2 == 0)
    # [END type_hints_missing_apply]

    # Now suppose numbers was defined as [snippet above].
    # When running this pipeline, you'd get a runtime error,
    # possibly on a remote machine, possibly very late.

    with self.assertRaises(TypeError):
      p.run()

    # To catch this early, we can assert what types we expect.
    with self.assertRaises(typehints.TypeCheckError):
      # [START type_hints_takes]
      p.options.view_as(TypeOptions).pipeline_type_check = True
      evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
      # [END type_hints_takes]

    # Type hints can be declared on DoFns and callables as well, rather
    # than where they're used, to be more self-contained.
    with self.assertRaises(typehints.TypeCheckError):
      # [START type_hints_do_fn]
      @beam.typehints.with_input_types(int)
      class FilterEvensDoFn(beam.DoFn):
        def process(self, element):
          if element % 2 == 0:
            yield element
      evens = numbers | beam.ParDo(FilterEvensDoFn())
      # [END type_hints_do_fn]

    words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
    # One can assert outputs and apply them to transforms as well.
    # Helps document the contract and checks it at pipeline construction time.
    # [START type_hints_transform]
    T = beam.typehints.TypeVariable('T')

    @beam.typehints.with_input_types(T)
    @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
    class MyTransform(beam.PTransform):
      def expand(self, pcoll):
        return pcoll | beam.Map(lambda x: (len(x), x))

    words_with_lens = words | MyTransform()
    # [END type_hints_transform]

    # pylint: disable=expression-not-assigned
    with self.assertRaises(typehints.TypeCheckError):
      words_with_lens | beam.Map(lambda x: x).with_input_types(
          beam.typehints.Tuple[int, int])
Example #6
def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To override the expand method, define a method "expand" that
  takes a PCollection as its only parameter and returns a PCollection.
  """
  import re

  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              | beam.Map(lambda (word, c): '%s: %s' % (word, c)))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()
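A compatibility note (not from the original snippet): the tuple-unpacking lambda in the final Map step is Python 2 only; on Python 3 the same formatting step could be written, for example, as:

              | beam.Map(lambda word_count: '%s: %s' % word_count)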
Example #7
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  ([by_decile[d] for d in xrange(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))

  p.run()
Example #8
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

  # Partition into three parts based on a hash of each element.
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1 = partitioned[0]
  pcoll2 = partitioned[1]
  pcoll3 = partitioned[2]

  # Flatten them back into a single PCollection.

  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)
  # [START model_multiple_pcollections_flatten]
  merged = (
      # [START model_multiple_pcollections_tuple]
      (pcoll1, pcoll2, pcoll3)
      # [END model_multiple_pcollections_tuple]
      # A list of tuples can be "piped" directly into a Flatten transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)

  p.run()
Example #9
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple."""
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', '*****@*****.**') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary', '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones': an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', '*****@*****.**') and
  # ('joe', '*****@*****.**'), then 'result' will contain the element
  # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info((name, info)):
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
Example #10
 def test_timestamped_with_combiners(self):
   p = TestPipeline()
   result = (p
             # Create some initial test values.
             | 'start' >> Create([(k, k) for k in range(10)])
             # The purpose of the WindowInto transform is to establish a
             # FixedWindows windowing function for the PCollection.
             # It does not yet bucket elements into separate windows, since
             # the timestamps assigned by Create are all identical, so every
             # element falls into the same window.
             | 'w' >> WindowInto(FixedWindows(5))
             # Generate timestamped values using the values as timestamps.
             # The timestamps are now one second apart, and since Map
             # propagates the windowing function from input to output, the
             # output PCollection will have elements falling into different
             # 5-second windows.
             | Map(lambda (x, t): TimestampedValue(x, t))
             # We add a 'key' to each value representing the index of the
             # window. This is important since there is no guarantee of
             # order for the elements of a PCollection.
             | Map(lambda v: (v / 5, v)))
   # Sum all elements associated with a key and window. Although it
   # is called CombinePerKey it is really CombinePerKeyAndWindow the
   # same way GroupByKey is really GroupByKeyAndWindow.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
Example #11
  def test_compute_top_sessions(self):
    p = TestPipeline()
    edits = p | beam.Create(self.EDITS)
    result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

    beam.assert_that(result, beam.equal_to(self.EXPECTED))
    p.run()
Example #12
def pipeline_logging(lines, output):
  """Logging Pipeline Messages."""

  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(lines)
   | beam.ParDo(ExtractWordsFn())
   | beam.io.WriteToText(output))

  p.run()
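Note (an addition, not from the original snippet): when running locally, the root logger defaults to the WARNING level, so the logging.info call above is only visible if the level is lowered first, e.g.:

  logging.getLogger().setLevel(logging.INFO)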
Example #13
  def test_run_direct(self):
    file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    pipeline = TestPipeline()
    pcoll = pipeline | beam.io.Read(LineSource(file_name))
    assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

    pipeline.run()
Example #14
 def test_runtime_checks_on(self):
   # pylint: disable=expression-not-assigned
   p = TestPipeline()
   with self.assertRaises(typehints.TypeCheckError):
     # [START type_hints_runtime_on]
     p.options.view_as(TypeOptions).runtime_type_check = True
     p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
     p.run()
Example #15
  def test_basics(self):
    p = TestPipeline()
    result = p | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

    # Note: Probabilistically speaking this test can fail with a probability
    # that is very small (VERY) given that we run at least 500 thousand trials.
    assert_that(result, in_between(3.125, 3.155))
    p.run()
Example #16
 def test_empty_write(self):
   temp_path = tempfile.NamedTemporaryFile().name
   sink = MyFileSink(
       temp_path, file_name_suffix='.foo', coder=coders.ToStringCoder())
   p = TestPipeline()
   p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
   p.run()
   self.assertEqual(
       open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
Example #17
  def test_element(self):
    class TestDoFn(DoFn):
      def process(self, element):
        yield element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #18
  def test_context_param(self):
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        yield context.element + 10

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2])| 'Do' >> ParDo(TestDoFn())
    assert_that(pcoll, equal_to([11, 12]))
    pipeline.run()
Example #19
  def test_timestamp_param(self):
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        yield timestamp

    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
    # Create assigns MIN_TIMESTAMP to every element, which is what
    # DoFn.TimestampParam yields here.
    assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    pipeline.run()
Example #20
 def test_read_gzip_empty_file(self):
   file_name = self._create_temp_file()
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(
       file_name,
       0, CompressionTypes.GZIP,
       True, coders.StrUtf8Coder())
   assert_that(pcoll, equal_to([]))
   pipeline.run()
Example #21
  def test_run_concat_direct(self):
    source = ConcatSource([RangeSource(0, 10),
                           RangeSource(10, 100),
                           RangeSource(100, 1000),
                          ])
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Read(source)
    assert_that(pcoll, equal_to(range(1000)))

    pipeline.run()
Example #22
  def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
      f.write('\n'.join(lines))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
Example #23
  def test_create(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'label1' >> Create([1, 2, 3])
    assert_that(pcoll, equal_to([1, 2, 3]))

    # Test if initial value is an iterator object.
    pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
    pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
    assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
    pipeline.run()
Example #24
 def test_reuse_cloned_custom_transform_instance(self):
   pipeline = TestPipeline()
   pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
   pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
   transform = PipelineTest.CustomTransform()
   result1 = pcoll1 | transform
   result2 = pcoll2 | 'new_label' >> transform
   assert_that(result1, equal_to([2, 3, 4]), label='r1')
   assert_that(result2, equal_to([5, 6, 7]), label='r2')
   pipeline.run()
Example #25
 def test_tuple_combine_fn(self):
   p = TestPipeline()
   result = (
       p
       | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
        # TupleCombineFn applies one combiner per tuple position: max over the
        # first values, MeanCombineFn over the second, and sum over the third.
        | beam.CombineGlobally(combine.TupleCombineFn(max,
                                                      combine.MeanCombineFn(),
                                                      sum)).without_defaults())
   assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
   p.run()
Example #26
 def test_metrics_in_source(self):
   pipeline = TestPipeline()
   pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
   assert_that(pcoll, equal_to([1, 2, 3, 4, 5, 6]))
   res = pipeline.run()
   metric_results = res.metrics().query()
   outputs_counter = metric_results['counters'][0]
   self.assertEqual(outputs_counter.key.step, 'Read')
   self.assertEqual(outputs_counter.key.metric.name, 'outputs')
   self.assertEqual(outputs_counter.committed, 6)
Example #27
 def test_tuple_combine_fn_without_defaults(self):
   p = TestPipeline()
   result = (
       p
       | Create([1, 1, 2, 3])
        | beam.CombineGlobally(
            # with_common_input() feeds every input element to each of the
            # combiners, so the result is the (min, mean, max) of [1, 1, 2, 3].
            combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
            .with_common_input()).without_defaults())
   assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
   p.run()
Example #28
 def test_timestamped_value(self):
   p = TestPipeline()
   result = (p
             | 'start' >> Create([(k, k) for k in range(10)])
             | Map(lambda (x, t): TimestampedValue(x, t))
             | 'w' >> WindowInto(FixedWindows(5))
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                 ('key', [5, 6, 7, 8, 9])]))
   p.run()
Example #29
 def test_sessions(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
   result = (pcoll
             | 'w' >> WindowInto(Sessions(10))
             | GroupByKey()
             | sort_values
             | reify_windows)
   expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
               ('key @ [20.0, 45.0)', [20, 27, 35])]
   assert_that(result, equal_to(expected))
   p.run()
Example #30
 def test_sliding_windows(self):
   p = TestPipeline()
   pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
   result = (pcoll
             | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
             | GroupByKey()
             | reify_windows)
   expected = [('key @ [-2.0, 2.0)', [1]),
               ('key @ [0.0, 4.0)', [1, 2, 3]),
               ('key @ [2.0, 6.0)', [2, 3])]
   assert_that(result, equal_to(expected))
   p.run()
Example #31
 def test_setting_sliding_windows(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([2, 16, 23])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_sliding_windows]
     from apache_beam import window
     sliding_windowed_items = (
         items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
     # [END setting_sliding_windows]
     summed = (sliding_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
      # Each element falls into six 30-second sliding windows (period 5),
      # producing the ten per-window sums asserted below.
      beam.assert_that(unkeyed,
                       beam.equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
     p.run()
Example #32
  def test_end2end_example_proto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(range(3))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with TestPipeline() as p:
      _ = p | beam.Create([example]) | WriteToTFRecord(
          file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = (p | ReadFromTFRecord(
          file_path_prefix + '-*',
          coder=beam.coders.ProtoCoder(example.__class__)))
      beam.assert_that(actual_data, beam.equal_to([example]))
Example #33
 def test_top_prefixes(self):
     p = TestPipeline()
     words = p | beam.Create(self.WORDS)
     result = words | autocomplete.TopPerPrefix(5)
     # values must be hashable for now
     result = result | beam.Map(lambda (k, vs): (k, tuple(vs)))
     assert_that(
         result,
         equal_to([
             ('t', ((3, 'to'), (2, 'this'), (1, 'that'))),
             ('to', ((3, 'to'), )),
             ('th', ((2, 'this'), (1, 'that'))),
             ('thi', ((2, 'this'), )),
             ('this', ((2, 'this'), )),
             ('tha', ((1, 'that'), )),
             ('that', ((1, 'that'), )),
         ]))
     p.run()
Example #34
 def test_read_auto_pattern(self):
   _, lines = write_data(200)
   splits = [0, 34, 100, 140, 164, 188, 200]
   chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
   compressed_chunks = []
   for c in chunks:
     out = cStringIO.StringIO()
     with gzip.GzipFile(fileobj=out, mode="w") as f:
       f.write('\n'.join(c))
     compressed_chunks.append(out.getvalue())
   file_pattern = write_prepared_pattern(
       compressed_chunks, suffixes=['.gz']*len(chunks))
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> beam.Read(LineSource(
       file_pattern,
       compression_type=CompressionTypes.AUTO))
   assert_that(pcoll, equal_to(lines))
   pipeline.run()
Example #35
 def test_rewindow(self):
     p = TestPipeline()
     result = (
         p
         | Create([(k, k) for k in range(10)])
         | Map(lambda (x, t): TimestampedValue(x, t))
         | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
         # Per the model, each element is now duplicated across
         # three windows. Rewindowing must preserve this duplication.
         | 'rewindow' >> WindowInto(FixedWindows(5))
         | 'rewindow2' >> WindowInto(FixedWindows(5))
         | Map(lambda v: ('key', v))
         | GroupByKey())
     assert_that(
         result,
         equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                   ('key', sorted([5, 6, 7, 8, 9] * 3))]))
     p.run()
Example #36
    def test_per_key_sample(self):
        pipeline = TestPipeline()
        pcoll = pipeline | 'start-perkey' >> Create(
            sum(([(i, 1), (i, 1), (i, 2), (i, 2)] for i in xrange(9)), []))
        result = pcoll | 'sample' >> combine.Sample.FixedSizePerKey(3)

        def matcher():
            def match(actual):
                for _, samples in actual:
                    equal_to([3])([len(samples)])
                    num_ones = sum(1 for x in samples if x == 1)
                    num_twos = sum(1 for x in samples if x == 2)
                    equal_to([1, 2])([num_ones, num_twos])

            return match

        assert_that(result, matcher())
        pipeline.run()
Example #37
    def test_memory_usage(self):
        try:
            import resource
        except ImportError:
            # Skip the test if the resource module is not available
            # (e.g. on a non-Unix OS).
            self.skipTest('resource module not available.')
        if platform.mac_ver()[0]:
            # Skip the test on macOS; depending on the version, it reports
            # ru_maxrss in different units.
            self.skipTest('ru_maxrss is not in standard units.')

        def get_memory_usage_in_bytes():
            return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * (2**10)

        def check_memory(value, memory_threshold):
            memory_usage = get_memory_usage_in_bytes()
            if memory_usage > memory_threshold:
                raise RuntimeError('High memory usage: %d > %d' %
                                   (memory_usage, memory_threshold))
            return value

        len_elements = 1000000
        num_elements = 10
        num_maps = 100

        pipeline = TestPipeline()

        # Consumed memory should not be proportional to the number of maps.
        memory_threshold = (get_memory_usage_in_bytes() +
                            (5 * len_elements * num_elements))

        # Plus small additional slack for memory fluctuations during the test.
        memory_threshold += 10 * (2**20)

        biglist = pipeline | 'oom:create' >> Create(
            ['x' * len_elements] * num_elements)
        for i in range(num_maps):
            biglist = biglist | ('oom:addone-%d' % i) >> Map(lambda x: x + 'y')
        result = biglist | 'oom:check' >> Map(check_memory, memory_threshold)
        assert_that(
            result,
            equal_to(['x' * len_elements + 'y' * num_maps] * num_elements))

        pipeline.run()
Example #38
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    output_table = ('BigQueryTornadoesIT'
                    '.monthly_tornadoes_%s' % int(round(time.time() * 1000)))
    query = 'SELECT month, tornado_count FROM [%s]' % output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=test_pipeline.get_option('project'),
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Example #39
 def test_read_pattern_bzip2(self):
     _, lines = write_data(200)
     splits = [0, 34, 100, 140, 164, 188, 200]
     chunks = [
         lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))
     ]
     compressed_chunks = []
     for c in chunks:
         compressobj = bz2.BZ2Compressor()
         compressed_chunks.append(
             compressobj.compress('\n'.join(c)) + compressobj.flush())
     file_pattern = write_prepared_pattern(compressed_chunks)
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> beam.io.Read(
         LineSource(file_pattern,
                    splittable=False,
                    compression_type=CompressionTypes.BZIP2))
     assert_that(pcoll, equal_to(lines))
     pipeline.run()
Example #40
  def test_deterministic_key(self):
    p = TestPipeline()
    lines = (p | beam.Create(
        ['banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2', 'zucchini,veg,3']))

    # For pickling
    global Player  # pylint: disable=global-variable-not-assigned

    # [START type_hints_deterministic_key]
    class Player(object):
      def __init__(self, team, name):
        self.team = team
        self.name = name

    class PlayerCoder(beam.coders.Coder):
      def encode(self, player):
        return '%s:%s' % (player.team, player.name)

      def decode(self, s):
        return Player(*s.split(':'))

      def is_deterministic(self):
        return True

    beam.coders.registry.register_coder(Player, PlayerCoder)

    def parse_player_and_score(csv):
      name, team, score = csv.split(',')
      return Player(team, name), int(score)

    totals = (
        lines
        | beam.Map(parse_player_and_score)
        | beam.CombinePerKey(sum).with_input_types(
            beam.typehints.Tuple[Player, int]))
    # [END type_hints_deterministic_key]

    assert_that(
        totals | beam.Map(lambda (k, v): (k.name, v)),
        equal_to([('banana', 3), ('kiwi', 4), ('zucchini', 3)]))

    p.run()
Example #41
 def test_process_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #42
 def test_tfidf_transform(self):
   p = TestPipeline()
   uri_to_line = p | 'create sample' >> beam.Create(
       [('1.txt', 'abc def ghi'),
        ('2.txt', 'abc def'),
        ('3.txt', 'abc')])
   result = (
       uri_to_line
       | tfidf.TfIdf()
       | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
Example #43
 def test_hourly_team_score(self):
     with TestPipeline() as p:
         result = (p
                   | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
                   | hourly_team_score.HourlyTeamScore(
                       start_min='2015-11-16-15-20',
                       stop_min='2015-11-16-17-20',
                       window_duration=60))
         beam.assert_that(
             result,
             beam.equal_to([('team1', 18), ('team2', 2), ('team3', 13)]))
Example #44
    def test_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)

        # Set extra options to the pipeline for test purpose
        output = '/'.join([
            test_pipeline.get_option('output'),
            test_pipeline.get_option('job_name'), 'results'
        ])
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM)
        ]
        extra_opts = {
            'output': output,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #45
  def test_create_groups(self):
    p = TestPipeline()

    group_ids_pcoll = p | 'CreateGroupIds' >> beam.Create(['A', 'B', 'C'])
    corpus_pcoll = p | 'CreateCorpus' >> beam.Create(
        [{'f': 'corpus1'}, {'f': 'corpus2'}, {'f': 'corpus3'}])
    words_pcoll = p | 'CreateWords' >> beam.Create(
        [{'f': 'word1'}, {'f': 'word2'}, {'f': 'word3'}])
    ignore_corpus_pcoll = p | 'CreateIgnoreCorpus' >> beam.Create(['corpus1'])
    ignore_word_pcoll = p | 'CreateIgnoreWord' >> beam.Create(['word1'])

    groups = bigquery_side_input.create_groups(group_ids_pcoll, corpus_pcoll,
                                               words_pcoll, ignore_corpus_pcoll,
                                               ignore_word_pcoll)

    beam.assert_that(groups, beam.equal_to(
        [('A', 'corpus2', 'word2'),
         ('B', 'corpus2', 'word2'),
         ('C', 'corpus2', 'word2')]))
    p.run()
Example #46
 def test_process_single(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file(path, FOO_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.io.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.AUTO,
                           validate=True)))
         beam.assert_that(result, beam.equal_to(['foo']))
Example #47
    def test_window_param(self):
        class TestDoFn(DoFn):
            def process(self, element, window=DoFn.WindowParam):
                yield (element, (float(window.start), float(window.end)))

        pipeline = TestPipeline()
        pcoll = (pipeline
                 | Create([1, 7])
                 | Map(lambda x: TimestampedValue(x, x))
                 | WindowInto(windowfn=SlidingWindows(10, 5))
                 | ParDo(TestDoFn()))
        assert_that(
            pcoll,
            equal_to([(1, (-5, 5)), (1, (0, 10)), (7, (0, 10)), (7, (5, 15))]))
        pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
        assert_that(pcoll2,
                    equal_to([((1, (-5, 5)), (-5, 5)), ((1, (0, 10)), (0, 10)),
                              ((7, (0, 10)), (0, 10)),
                              ((7, (5, 15)), (5, 15))]),
                    label='doubled windows')
        pipeline.run()
Example #48
 def test_group_by_key_input_visitor_with_invalid_inputs(self):
     p = TestPipeline()
     pcoll1 = PCollection(p)
     pcoll2 = PCollection(p)
     for transform in [beam.GroupByKeyOnly(), beam.GroupByKey()]:
         pcoll1.element_type = typehints.TupleSequenceConstraint
         pcoll2.element_type = typehints.Set
         err_msg = "Input to GroupByKey must be of Tuple or Any type"
         for pcoll in [pcoll1, pcoll2]:
             with self.assertRaisesRegexp(ValueError, err_msg):
                 runner.group_by_key_input_visitor().visit_transform(
                     AppliedPTransform(None, transform, "label", [pcoll]))
Example #49
  def test_builtin_combines(self):
    pipeline = TestPipeline()

    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    mean = sum(vals) / float(len(vals))
    size = len(vals)

    # First for global combines.
    pcoll = pipeline | 'start' >> Create(vals)
    result_mean = pcoll | 'mean' >> combine.Mean.Globally()
    result_count = pcoll | 'count' >> combine.Count.Globally()
    assert_that(result_mean, equal_to([mean]), label='assert:mean')
    assert_that(result_count, equal_to([size]), label='assert:size')

    # Again for per-key combines.
    pcoll = pipeline | 'start-perkey' >> Create([('a', x) for x in vals])
    result_key_mean = pcoll | 'mean-perkey' >> combine.Mean.PerKey()
    result_key_count = pcoll | 'count-perkey' >> combine.Count.PerKey()
    assert_that(result_key_mean, equal_to([('a', mean)]), label='key:mean')
    assert_that(result_key_count, equal_to([('a', size)]), label='key:size')
    pipeline.run()
Example #50
def model_group_by_key(contents, output_path):
    """Applying a GroupByKey Transform."""
    import re

    import apache_beam as beam
    p = TestPipeline()  # Use TestPipeline for testing.
    words_and_counts = (p
                        | beam.Create(contents)
                        | beam.FlatMap(lambda x: re.findall(r'\w+', x))
                        | 'one word' >> beam.Map(lambda w: (w, 1)))
    # GroupByKey accepts a PCollection of (w, 1) and
    # outputs a PCollection of (w, (1, 1, ...)).
    # (A key/value pair is just a tuple in Python.)
    # This is a somewhat forced example, since one could
    # simply use beam.combiners.Count.PerElement here.
    # [START model_group_by_key_transform]
    grouped_words = words_and_counts | beam.GroupByKey()
    # [END model_group_by_key_transform]
    (grouped_words
     | 'count words' >> beam.Map(lambda (word, counts): (word, len(counts)))
     | beam.io.WriteToText(output_path))
    p.run()
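As the comment in the example notes, beam.combiners.Count.PerElement can replace the Map/GroupByKey/len sequence; a minimal sketch of that alternative (not part of the original example):

    word_counts = (p
                   | 'create words' >> beam.Create(contents)
                   | 'split words' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
                   | beam.combiners.Count.PerElement())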
Example #51
    def test_global_sample(self):
        def is_good_sample(actual):
            assert len(actual) == 1
            assert sorted(actual[0]) in [[1, 1, 2], [1, 2, 2]], actual

        with TestPipeline() as pipeline:
            pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])
            for ix in xrange(9):
                assert_that(
                    pcoll
                    | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3),
                    is_good_sample,
                    label='check-%d' % ix)
Example #52
    def test_pardo_side_input(self):
        p = TestPipeline()
        words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = (words
                        | beam.Map(len)
                        | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

        # Call with explicit side inputs.
        small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0,
                                                      3)

        # A single deferred side input.
        larger_than_average = (words | 'large' >> beam.FlatMap(
            filter_using_length, lower_bound=pvalue.AsSingleton(avg_word_len)))

        # Mix and match.
        small_but_nontrivial = words | beam.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        beam.assert_that(small_words, beam.equal_to(['a', 'bb', 'ccc']))
        beam.assert_that(larger_than_average,
                         beam.equal_to(['ccc', 'dddd']),
                         label='larger_than_average')
        beam.assert_that(small_but_nontrivial,
                         beam.equal_to(['bb']),
                         label='small_but_not_trivial')
        p.run()
Example #53
def pipeline_monitoring(renames):
    """Using monitoring interface snippets."""

    import re
    import apache_beam as beam
    from apache_beam.utils.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='output for the pipeline',
                                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (pcoll
                    # Convert lines of text into individual words.
                    | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                    # Count the number of times each word occurs.
                    | beam.combiners.Count.PerElement()
                    # Format each word and count into a printable string.
                    | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
Example #54
def model_join_using_side_inputs(name_list, email_list, phone_list,
                                 output_path):
    """Joining PCollections using side inputs."""

    import apache_beam as beam
    from apache_beam.pvalue import AsIter

    p = TestPipeline()  # Use TestPipeline for testing.
    # [START model_join_using_side_inputs]
    # This code performs a join by receiving the set of names as an input and
    # passing PCollections that contain emails and phone numbers as side inputs
    # instead of using CoGroupByKey.
    names = p | 'names' >> beam.Create(name_list)
    emails = p | 'email' >> beam.Create(email_list)
    phones = p | 'phone' >> beam.Create(phone_list)

    def join_info(name, emails, phone_numbers):
        filtered_emails = []
        for name_in_list, email in emails:
            if name_in_list == name:
                filtered_emails.append(email)

        filtered_phone_numbers = []
        for name_in_list, phone_number in phone_numbers:
            if name_in_list == name:
                filtered_phone_numbers.append(phone_number)

        return '; '.join([
            '%s' % name,
            '%s' % ','.join(filtered_emails),
            '%s' % ','.join(filtered_phone_numbers)
        ])

    contact_lines = names | 'CreateContacts' >> beam.core.Map(
        join_info, AsIter(emails), AsIter(phones))
    # [END model_join_using_side_inputs]
    contact_lines | beam.io.WriteToText(output_path)
    p.run()
Example #55
  def test_write_record_auto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')
    with TestPipeline() as p:
      input_data = ['foo', 'bar']
      _ = p | beam.Create(input_data) | WriteToTFRecord(
          file_path_prefix, file_name_suffix='.gz')

    actual = []
    file_name = glob.glob(file_path_prefix + '-*.gz')[0]
    for r in tf.python_io.tf_record_iterator(
        file_name, options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.GZIP)):
      actual.append(r)
    self.assertEqual(actual, input_data)
Example #56
def pipeline_options_remote(argv):
    """Creating a Pipeline using a PipelineOptions object for remote execution."""

    from apache_beam import Pipeline
    from apache_beam.utils.pipeline_options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)

    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')

    # [END pipeline_options_define_custom]

    from apache_beam.utils.pipeline_options import GoogleCloudOptions
    from apache_beam.utils.pipeline_options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    p = TestPipeline()  # Use TestPipeline for testing.

    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)

    p.run()
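A usage sketch with hypothetical flag values (not part of the original snippet):

    pipeline_options_remote([
        '--input=gs://my-bucket/input.txt',
        '--output=gs://my-bucket/output',
    ])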
Example #57
 def test_group_by_key_input_visitor_with_valid_inputs(self):
     p = TestPipeline()
     pcoll1 = PCollection(p)
     pcoll2 = PCollection(p)
     pcoll3 = PCollection(p)
     for transform in [beam.GroupByKeyOnly(), beam.GroupByKey()]:
         pcoll1.element_type = None
         pcoll2.element_type = typehints.Any
         pcoll3.element_type = typehints.KV[typehints.Any, typehints.Any]
         for pcoll in [pcoll1, pcoll2, pcoll3]:
             DataflowRunner.group_by_key_input_visitor().visit_transform(
                 AppliedPTransform(None, transform, "label", [pcoll]))
             self.assertEqual(pcoll.element_type,
                              typehints.KV[typehints.Any, typehints.Any])
Example #58
 def test_read_auto_pattern_compressed_and_uncompressed(self):
     _, lines = write_data(200)
     splits = [0, 34, 100, 140, 164, 188, 200]
     chunks = [
         lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))
     ]
     chunks_to_write = []
     for i, c in enumerate(chunks):
         if i % 2 == 0:
             out = cStringIO.StringIO()
             with gzip.GzipFile(fileobj=out, mode="w") as f:
                 f.write('\n'.join(c))
             chunks_to_write.append(out.getvalue())
         else:
             chunks_to_write.append('\n'.join(c))
     file_pattern = write_prepared_pattern(chunks_to_write,
                                           suffixes=(['.gz', ''] * 3))
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> beam.Read(
         LineSource(file_pattern,
                    compression_type=fileio.CompressionTypes.AUTO))
     assert_that(pcoll, equal_to(lines))
     pipeline.run()
Example #59
    def test_top_shorthands(self):
        pipeline = TestPipeline()

        pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
        result_top = pcoll | 'top' >> beam.CombineGlobally(combine.Largest(5))
        result_bot = pcoll | 'bot' >> beam.CombineGlobally(combine.Smallest(4))
        assert_that(result_top,
                    equal_to([[9, 6, 6, 5, 3]]),
                    label='assert:top')
        assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

        pcoll = pipeline | 'start-perkey' >> Create(
            [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
        result_ktop = pcoll | 'top-perkey' >> beam.CombinePerKey(
            combine.Largest(5))
        result_kbot = pcoll | 'bot-perkey' >> beam.CombinePerKey(
            combine.Smallest(4))
        assert_that(result_ktop,
                    equal_to([('a', [9, 6, 6, 5, 3])]),
                    label='k:top')
        assert_that(result_kbot,
                    equal_to([('a', [0, 1, 1, 1])]),
                    label='k:bot')
        pipeline.run()
Example #60
    def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
        p = TestPipeline()
        inputs = []
        for _ in range(num_inputs):
            input_pcoll = PCollection(p)
            input_pcoll.element_type = input_type
            inputs.append(input_pcoll)
        output_pcoll = PCollection(p)
        output_pcoll.element_type = output_type

        flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
        flatten.add_output(output_pcoll, None)
        DataflowRunner.flatten_input_visitor().visit_transform(flatten)
        for _ in range(num_inputs):
            self.assertEqual(inputs[0].element_type, output_type)