def test_dataflow_file_pattern(self):
  pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
  assert len(expected_data) == 40
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(pattern)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()

def test_setting_timestamp(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
  items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

  def extract_timestamp_from_log_entry(entry):
    return entry[1]

  # [START setting_timestamp]
  class AddTimestampDoFn(beam.DoFn):

    def process(self, element):
      # Extract the numeric Unix seconds-since-epoch timestamp to be
      # associated with the current log entry.
      unix_timestamp = extract_timestamp_from_log_entry(element)
      # Wrap and emit the current entry and new timestamp in a
      # TimestampedValue.
      yield beam.TimestampedValue(element, unix_timestamp)

  timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
  # [END setting_timestamp]

  fixed_windowed_items = (
      timestamped_items | 'window' >> beam.WindowInto(
          beam.window.FixedWindows(60)))
  summed = (fixed_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed, beam.equal_to([42, 187]))
  p.run()

def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i

  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  ([by_decile[d] for d in xrange(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))

  p.run()

def test_basics(self):
  p = TestPipeline()
  rows = (p | 'create' >> beam.Create([
      {'month': 1, 'day': 1, 'tornado': False},
      {'month': 1, 'day': 2, 'tornado': True},
      {'month': 1, 'day': 3, 'tornado': True},
      {'month': 2, 'day': 1, 'tornado': True}]))
  results = bigquery_tornadoes.count_tornadoes(rows)
  beam.assert_that(results, beam.equal_to([
      {'month': 1, 'tornado_count': 2},
      {'month': 2, 'tornado_count': 1}]))
  p.run().wait_until_finish()

def test_compute_top_sessions(self):
  p = TestPipeline()
  edits = p | beam.Create(self.EDITS)
  result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)
  beam.assert_that(result, beam.equal_to(self.EXPECTED))
  p.run()

def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform and
  override its expand method: define a method "expand" that takes a
  PCollection as its only parameter and returns a PCollection.
  """
  import re

  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              | beam.Map(lambda (word, c): '%s: %s' % (word, c)))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()

def test_read_gzip_empty_file(self):
  file_name = self._create_temp_file()
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to([]))
  pipeline.run()

def test_timestamped_with_combiners(self):
  p = TestPipeline()
  result = (p
            # Create some initial test values.
            | 'start' >> Create([(k, k) for k in range(10)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows since the timestamps
            # from Create are not spaced 5 ms apart and very likely they all
            # fall into the same window.
            | 'w' >> WindowInto(FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # Now there are values 5 ms apart and since Map propagates the
            # windowing function from input to output the output PCollection
            # will have elements falling into different 5ms windows.
            | Map(lambda (x, t): TimestampedValue(x, t))
            # We add a 'key' to each value representing the index of the
            # window. This is important since there is no guarantee of
            # order for the elements of a PCollection.
            | Map(lambda v: (v / 5, v)))

  # Sum all elements associated with a key and window. Although it
  # is called CombinePerKey it is really CombinePerKeyAndWindow the
  # same way GroupByKey is really GroupByKeyAndWindow.
  sum_per_window = result | CombinePerKey(sum)
  # Compute mean per key and window.
  mean_per_window = result | combiners.Mean.PerKey()

  assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
              label='assert:sum')
  assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
              label='assert:mean')
  p.run()

def pipeline_logging(lines, output):
  """Logging Pipeline Messages."""
  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(lines)
   | beam.ParDo(ExtractWordsFn())
   | beam.io.WriteToText(output))
  p.run()

def test_bad_types(self):
  # [START type_hints_missing_define_numbers]
  p = TestPipeline(options=PipelineOptions(pipeline_type_check=True))

  numbers = p | beam.Create(['1', '2', '3'])
  # [END type_hints_missing_define_numbers]

  # Consider the following code.
  # pylint: disable=expression-not-assigned
  # pylint: disable=unused-variable
  # [START type_hints_missing_apply]
  evens = numbers | beam.Filter(lambda x: x % 2 == 0)
  # [END type_hints_missing_apply]

  # Now suppose numbers was defined as [snippet above].
  # When running this pipeline, you'd get a runtime error,
  # possibly on a remote machine, possibly very late.
  with self.assertRaises(TypeError):
    p.run()

  # To catch this early, we can assert what types we expect.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_takes]
    evens = numbers | beam.Filter(
        lambda x: x % 2 == 0).with_input_types(int)
    # [END type_hints_takes]

  # Type hints can be declared on DoFns and callables as well, rather
  # than where they're used, to be more self contained.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_do_fn]
    @beam.typehints.with_input_types(int)
    class FilterEvensDoFn(beam.DoFn):

      def process(self, element):
        if element % 2 == 0:
          yield element

    evens = numbers | beam.ParDo(FilterEvensDoFn())
    # [END type_hints_do_fn]

  words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
  # One can assert outputs and apply them to transforms as well.
  # Helps document the contract and checks it at pipeline construction time.
  # [START type_hints_transform]
  T = beam.typehints.TypeVariable('T')

  @beam.typehints.with_input_types(T)
  @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
  class MyTransform(beam.PTransform):

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: (len(x), x))

  words_with_lens = words | MyTransform()
  # [END type_hints_transform]

  # pylint: disable=expression-not-assigned
  with self.assertRaises(typehints.TypeCheckError):
    words_with_lens | beam.Map(lambda x: x).with_input_types(
        beam.typehints.Tuple[int, int])

def test_to_list_and_to_dict(self):
  pipeline = TestPipeline()
  the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
  pcoll = pipeline | 'start' >> Create(the_list)
  result = pcoll | 'to list' >> combine.ToList()

  def matcher(expected):
    def match(actual):
      equal_to(expected[0])(actual[0])
    return match
  assert_that(result, matcher([the_list]))
  pipeline.run()

  pipeline = TestPipeline()
  pairs = [(1, 2), (3, 4), (5, 6)]
  pcoll = pipeline | 'start-pairs' >> Create(pairs)
  result = pcoll | 'to dict' >> combine.ToDict()

  def matcher():
    def match(actual):
      equal_to([1])([len(actual)])
      equal_to(pairs)(actual[0].iteritems())
    return match
  assert_that(result, matcher())
  pipeline.run()

def test_create_groups(self):
  p = TestPipeline()

  group_ids_pcoll = p | 'CreateGroupIds' >> beam.Create(['A', 'B', 'C'])
  corpus_pcoll = p | 'CreateCorpus' >> beam.Create(
      [{'f': 'corpus1'}, {'f': 'corpus2'}, {'f': 'corpus3'}])
  words_pcoll = p | 'CreateWords' >> beam.Create(
      [{'f': 'word1'}, {'f': 'word2'}, {'f': 'word3'}])
  ignore_corpus_pcoll = p | 'CreateIgnoreCorpus' >> beam.Create(['corpus1'])
  ignore_word_pcoll = p | 'CreateIgnoreWord' >> beam.Create(['word1'])

  groups = bigquery_side_input.create_groups(
      group_ids_pcoll, corpus_pcoll, words_pcoll,
      ignore_corpus_pcoll, ignore_word_pcoll)

  beam.assert_that(groups, beam.equal_to([
      ('A', 'corpus2', 'word2'),
      ('B', 'corpus2', 'word2'),
      ('C', 'corpus2', 'word2')]))
  p.run()

def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple."""
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', '*****@*****.**') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary', '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones' - an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', '*****@*****.**') and
  # ('joe', '*****@*****.**'), then 'result' will contain the element
  # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info((name, info)):
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()

def test_dataflow_single_file(self):
  file_name, expected_data = write_data(5)
  assert len(expected_data) == 5
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()

def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

  # Partition into deciles
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1 = partitioned[0]
  pcoll2 = partitioned[1]
  pcoll3 = partitioned[2]

  # Flatten them back into 1

  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)
  # [START model_multiple_pcollections_flatten]
  merged = (
      # [START model_multiple_pcollections_tuple]
      (pcoll1, pcoll2, pcoll3)
      # [END model_multiple_pcollections_tuple]
      # A list of tuples can be "piped" directly into a Flatten transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)
  p.run()

def run_pipeline(self, count_implementation, factor=1):
  p = TestPipeline()
  words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
  result = words | count_implementation
  assert_that(
      result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
  p.run()

def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
  pipeline = TestPipeline()
  pcoll = pipeline | beam.io.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
  pipeline.run()

def test_bad_types(self):
  p = TestPipeline()
  evens = None  # pylint: disable=unused-variable

  # [START type_hints_missing_define_numbers]
  numbers = p | beam.Create(['1', '2', '3'])
  # [END type_hints_missing_define_numbers]

  # Consider the following code.
  # pylint: disable=expression-not-assigned
  # pylint: disable=unused-variable
  # [START type_hints_missing_apply]
  evens = numbers | beam.Filter(lambda x: x % 2 == 0)
  # [END type_hints_missing_apply]

  # Now suppose numbers was defined as [snippet above].
  # When running this pipeline, you'd get a runtime error,
  # possibly on a remote machine, possibly very late.
  with self.assertRaises(TypeError):
    p.run()

  # To catch this early, we can assert what types we expect.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_takes]
    p.options.view_as(TypeOptions).pipeline_type_check = True
    evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
    # [END type_hints_takes]

  # Type hints can be declared on DoFns and callables as well, rather
  # than where they're used, to be more self contained.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_do_fn]
    @beam.typehints.with_input_types(int)
    class FilterEvensDoFn(beam.DoFn):

      def process(self, element):
        if element % 2 == 0:
          yield element

    evens = numbers | beam.ParDo(FilterEvensDoFn())
    # [END type_hints_do_fn]

  words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
  # One can assert outputs and apply them to transforms as well.
  # Helps document the contract and checks it at pipeline construction time.
  # [START type_hints_transform]
  T = beam.typehints.TypeVariable('T')

  @beam.typehints.with_input_types(T)
  @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
  class MyTransform(beam.PTransform):

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: (len(x), x))

  words_with_lens = words | MyTransform()
  # [END type_hints_transform]

  # pylint: disable=expression-not-assigned
  with self.assertRaises(typehints.TypeCheckError):
    words_with_lens | beam.Map(lambda x: x).with_input_types(
        beam.typehints.Tuple[int, int])

def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline()
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p.options.view_as(TypeOptions).runtime_type_check = True
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()

def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()

def test_basics(self):
  p = TestPipeline()
  result = p | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

  # Note: Probabilistically speaking this test can fail with a probability
  # that is very small (VERY) given that we run at least 500 thousand
  # trials.
  assert_that(result, in_between(3.125, 3.155))
  p.run()

def test_read_gzip_empty_file(self):
  filename = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template).name
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to([]))
  pipeline.run()

def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.utils.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | 'ReadLines' >> beam.io.ReadFromText(options.input)
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | 'WriteCounts' >> beam.io.WriteToText(options.output))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()

def test_compute_points(self):
  p = TestPipeline()
  records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
  result = (records
            | 'points' >> beam.FlatMap(coders.compute_points)
            | beam.CombinePerKey(sum))
  assert_that(result,
              equal_to([('Italy', 0), ('Brasil', 6), ('Germany', 3)]))
  p.run()

def test_timestamp_param(self):
  class TestDoFn(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield timestamp

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
  pipeline.run()

def test_context_param(self):
  class TestDoFn(DoFn):
    def process(self, element, context=DoFn.ContextParam):
      yield context.element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()

def test_element(self):
  class TestDoFn(DoFn):
    def process(self, element):
      yield element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()

def test_empty_write(self):
  temp_path = tempfile.NamedTemporaryFile().name
  sink = MyFileSink(
      temp_path, file_name_suffix='.foo', coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()
  self.assertEqual(
      open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')

def test_tuple_combine_fn(self):
  p = TestPipeline()
  result = (p
            | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
            | beam.CombineGlobally(combine.TupleCombineFn(
                max, combine.MeanCombineFn(), sum)).without_defaults())
  assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
  p.run()

def test_top(self):
  pipeline = TestPipeline()

  # A parameter we'll be sharing with a custom comparator.
  names = {0: 'zo',
           1: 'one',
           2: 'twoo',
           3: 'three',
           5: 'fiiive',
           6: 'sssssix',
           9: 'nniiinne'}

  # First for global combines.
  pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
  result_top = pcoll | 'top' >> combine.Top.Largest(5)
  result_bot = pcoll | 'bot' >> combine.Top.Smallest(4)
  result_cmp = pcoll | 'cmp' >> combine.Top.Of(
      'cmp',
      6,
      lambda a, b, names: len(names[a]) < len(names[b]),
      names)  # Note parameter passed to comparator.
  result_cmp_rev = pcoll | 'cmp_rev' >> combine.Top.Of(
      'cmp',
      3,
      lambda a, b, names: len(names[a]) < len(names[b]),
      names,  # Note parameter passed to comparator.
      reverse=True)
  assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
  assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')
  assert_that(result_cmp, equal_to([[9, 6, 6, 5, 3, 2]]), label='assert:cmp')
  assert_that(result_cmp_rev, equal_to([[0, 1, 1]]), label='assert:cmp_rev')

  # Again for per-key combines.
  pcoll = pipeline | 'start-perkey' >> Create(
      [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
  result_key_top = pcoll | 'top-perkey' >> combine.Top.LargestPerKey(5)
  result_key_bot = pcoll | 'bot-perkey' >> combine.Top.SmallestPerKey(4)
  result_key_cmp = pcoll | 'cmp-perkey' >> combine.Top.PerKey(
      6,
      lambda a, b, names: len(names[a]) < len(names[b]),
      names)  # Note parameter passed to comparator.
  assert_that(result_key_top, equal_to([('a', [9, 6, 6, 5, 3])]),
              label='key:top')
  assert_that(result_key_bot, equal_to([('a', [0, 1, 1, 1])]),
              label='key:bot')
  assert_that(result_key_cmp, equal_to([('a', [9, 6, 6, 5, 3, 2])]),
              label='key:cmp')
  pipeline.run()

def test_read_auto_bzip2(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file(suffix='.bz2')
  with bz2.BZ2File(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()

def test_tuple_combine_fn_without_defaults(self):
  p = TestPipeline()
  result = (p
            | Create([1, 1, 2, 3])
            | beam.CombineGlobally(
                combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
                .with_common_input()).without_defaults())
  assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
  p.run()

def test_run_concat_direct(self):
  source = ConcatSource([RangeSource(0, 10),
                         RangeSource(10, 100),
                         RangeSource(100, 1000)])
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Read(source)
  assert_that(pcoll, equal_to(range(1000)))
  pipeline.run()

def test_create(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label1' >> Create([1, 2, 3])
  assert_that(pcoll, equal_to([1, 2, 3]))

  # Test if initial value is an iterator object.
  pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
  pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
  assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
  pipeline.run()

def test_reuse_cloned_custom_transform_instance(self):
  pipeline = TestPipeline()
  pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
  pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
  transform = PipelineTest.CustomTransform()
  result1 = pcoll1 | transform
  result2 = pcoll2 | 'new_label' >> transform
  assert_that(result1, equal_to([2, 3, 4]), label='r1')
  assert_that(result2, equal_to([5, 6, 7]), label='r2')
  pipeline.run()

def test_empty_write(self):
  temp_path = tempfile.NamedTemporaryFile().name
  sink = MyFileSink(temp_path,
                    file_name_suffix='.output',
                    coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()
  self.assertEqual(
      open(temp_path + '-00000-of-00001.output').read(), '[start][end]')

def test_timestamped_value(self):
  p = TestPipeline()
  result = (p
            | 'start' >> Create([(k, k) for k in range(10)])
            | Map(lambda (x, t): TimestampedValue(x, t))
            | 'w' >> WindowInto(FixedWindows(5))
            | Map(lambda v: ('key', v))
            | GroupByKey())
  assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                ('key', [5, 6, 7, 8, 9])]))
  p.run()

def test_sliding_windows(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
  result = (pcoll
            | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
            | GroupByKey()
            | reify_windows)
  expected = [('key @ [-2.0, 2.0)', [1]),
              ('key @ [0.0, 4.0)', [1, 2, 3]),
              ('key @ [2.0, 6.0)', [2, 3])]
  assert_that(result, equal_to(expected))
  p.run()

def test_sessions(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
  result = (pcoll
            | 'w' >> WindowInto(Sessions(10))
            | GroupByKey()
            | sort_values
            | reify_windows)
  expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
              ('key @ [20.0, 45.0)', [20, 27, 35])]
  assert_that(result, equal_to(expected))
  p.run()