def test_dataflow_single_file(self):
  file_name, expected_data = write_data(5)
  assert len(expected_data) == 5
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def test_setting_timestamp(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
  items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

  def extract_timestamp_from_log_entry(entry):
    return entry[1]

  # [START setting_timestamp]
  class AddTimestampDoFn(beam.DoFn):

    def process(self, element):
      # Extract the numeric Unix seconds-since-epoch timestamp to be
      # associated with the current log entry.
      unix_timestamp = extract_timestamp_from_log_entry(element)
      # Wrap and emit the current entry and new timestamp in a
      # TimestampedValue.
      yield beam.TimestampedValue(element, unix_timestamp)

  timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
  # [END setting_timestamp]

  fixed_windowed_items = (
      timestamped_items | 'window' >> beam.WindowInto(
          beam.window.FixedWindows(60)))
  summed = (fixed_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed, beam.equal_to([42, 187]))
  p.run()
def test_dataflow_file_pattern(self):
  pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
  assert len(expected_data) == 40
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(pattern)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def run_pipeline(self, count_implementation, factor=1):
  p = TestPipeline()
  words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
  result = words | count_implementation
  assert_that(
      result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
  p.run()
def test_bad_types(self):
  p = TestPipeline()
  evens = None  # pylint: disable=unused-variable

  # [START type_hints_missing_define_numbers]
  numbers = p | beam.Create(['1', '2', '3'])
  # [END type_hints_missing_define_numbers]

  # Consider the following code.
  # pylint: disable=expression-not-assigned
  # pylint: disable=unused-variable
  # [START type_hints_missing_apply]
  evens = numbers | beam.Filter(lambda x: x % 2 == 0)
  # [END type_hints_missing_apply]

  # Now suppose numbers was defined as [snippet above].
  # When running this pipeline, you'd get a runtime error,
  # possibly on a remote machine, possibly very late.
  with self.assertRaises(TypeError):
    p.run()

  # To catch this early, we can assert what types we expect.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_takes]
    p.options.view_as(TypeOptions).pipeline_type_check = True
    evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
    # [END type_hints_takes]

  # Type hints can be declared on DoFns and callables as well, rather
  # than where they're used, to be more self contained.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_do_fn]
    @beam.typehints.with_input_types(int)
    class FilterEvensDoFn(beam.DoFn):

      def process(self, element):
        if element % 2 == 0:
          yield element

    evens = numbers | beam.ParDo(FilterEvensDoFn())
    # [END type_hints_do_fn]

  words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
  # One can assert outputs and apply them to transforms as well.
  # Helps document the contract and checks it at pipeline construction time.
  # [START type_hints_transform]
  T = beam.typehints.TypeVariable('T')

  @beam.typehints.with_input_types(T)
  @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
  class MyTransform(beam.PTransform):

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: (len(x), x))

  words_with_lens = words | MyTransform()
  # [END type_hints_transform]

  # pylint: disable=expression-not-assigned
  with self.assertRaises(typehints.TypeCheckError):
    words_with_lens | beam.Map(lambda x: x).with_input_types(
        beam.typehints.Tuple[int, int])
def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To override the expansion behavior, define an "expand" method that takes a
  PCollection as its only parameter and returns a PCollection.
  """
  import re

  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              | beam.Map(lambda (word, c): '%s: %s' % (word, c)))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i

  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  ([by_decile[d] for d in xrange(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))

  p.run()
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

  # Partition into three pieces.
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1 = partitioned[0]
  pcoll2 = partitioned[1]
  pcoll3 = partitioned[2]

  # Flatten them back into one PCollection.

  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)

  # [START model_multiple_pcollections_flatten]
  merged = (
      # [START model_multiple_pcollections_tuple]
      (pcoll1, pcoll2, pcoll3)
      # [END model_multiple_pcollections_tuple]
      # A list of tuples can be "piped" directly into a Flatten transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)

  p.run()
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple."""
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', '*****@*****.**') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary', '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones' - an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', '*****@*****.**') and
  # ('joe', '*****@*****.**'), then 'result' will contain the element
  # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info((name, info)):
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
def test_timestamped_with_combiners(self):
  p = TestPipeline()
  result = (p
            # Create some initial test values.
            | 'start' >> Create([(k, k) for k in range(10)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into separate windows yet, since
            # the timestamps assigned by Create are all identical, so every
            # element would fall into the same window.
            | 'w' >> WindowInto(FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # The values are now spaced one second apart, and since Map
            # propagates the windowing function from input to output, the
            # output PCollection has elements falling into different
            # 5-second windows.
            | Map(lambda (x, t): TimestampedValue(x, t))
            # We add a 'key' to each value representing the index of the
            # window. This is important since there is no guarantee of
            # order for the elements of a PCollection.
            | Map(lambda v: (v / 5, v)))
  # Sum all elements associated with a key and window. Although it
  # is called CombinePerKey it is really CombinePerKeyAndWindow the
  # same way GroupByKey is really GroupByKeyAndWindow.
  sum_per_window = result | CombinePerKey(sum)
  # Compute mean per key and window.
  mean_per_window = result | combiners.Mean.PerKey()
  assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
              label='assert:sum')
  assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
              label='assert:mean')
  p.run()
def test_compute_top_sessions(self):
  p = TestPipeline()
  edits = p | beam.Create(self.EDITS)
  result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

  beam.assert_that(result, beam.equal_to(self.EXPECTED))
  p.run()
def pipeline_logging(lines, output):
  """Logging Pipeline Messages."""

  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(lines)
   | beam.ParDo(ExtractWordsFn())
   | beam.io.WriteToText(output))

  p.run()
def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
  pipeline = TestPipeline()
  pcoll = pipeline | beam.io.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

  pipeline.run()
def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline()
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p.options.view_as(TypeOptions).runtime_type_check = True
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()
    # [END type_hints_runtime_on]
def test_basics(self):
  p = TestPipeline()
  result = p | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

  # Note: Probabilistically speaking this test can fail with a probability
  # that is very small (VERY) given that we run at least 500 thousand trials.
  assert_that(result, in_between(3.125, 3.155))
  p.run()
def test_empty_write(self):
  temp_path = tempfile.NamedTemporaryFile().name
  sink = MyFileSink(
      temp_path, file_name_suffix='.foo', coder=coders.ToStringCoder())
  p = TestPipeline()
  p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
  p.run()
  self.assertEqual(
      open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
def test_element(self):
  class TestDoFn(DoFn):

    def process(self, element):
      yield element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()
def test_context_param(self):
  class TestDoFn(DoFn):

    def process(self, element, context=DoFn.ContextParam):
      yield context.element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()
def test_timestamp_param(self):
  class TestDoFn(DoFn):

    def process(self, element, timestamp=DoFn.TimestampParam):
      yield timestamp

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
  pipeline.run()
def test_read_gzip_empty_file(self):
  file_name = self._create_temp_file()
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to([]))
  pipeline.run()
def test_run_concat_direct(self):
  source = ConcatSource([RangeSource(0, 10),
                         RangeSource(10, 100),
                         RangeSource(100, 1000),
                        ])
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Read(source)
  assert_that(pcoll, equal_to(range(1000)))

  pipeline.run()
def test_read_auto_bzip2(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file(suffix='.bz2')
  with bz2.BZ2File(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_create(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label1' >> Create([1, 2, 3])
  assert_that(pcoll, equal_to([1, 2, 3]))

  # Test if initial value is an iterator object.
  pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
  pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
  assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
  pipeline.run()
def test_reuse_cloned_custom_transform_instance(self):
  pipeline = TestPipeline()
  pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
  pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
  transform = PipelineTest.CustomTransform()
  result1 = pcoll1 | transform
  result2 = pcoll2 | 'new_label' >> transform
  assert_that(result1, equal_to([2, 3, 4]), label='r1')
  assert_that(result2, equal_to([5, 6, 7]), label='r2')
  pipeline.run()
def test_tuple_combine_fn(self):
  p = TestPipeline()
  result = (
      p
      | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
      | beam.CombineGlobally(combine.TupleCombineFn(
          max, combine.MeanCombineFn(), sum)).without_defaults())
  assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
  p.run()
def test_metrics_in_source(self):
  pipeline = TestPipeline()
  pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5, 6]))
  res = pipeline.run()
  metric_results = res.metrics().query()
  outputs_counter = metric_results['counters'][0]
  self.assertEqual(outputs_counter.key.step, 'Read')
  self.assertEqual(outputs_counter.key.metric.name, 'outputs')
  self.assertEqual(outputs_counter.committed, 6)
def test_tuple_combine_fn_without_defaults(self):
  p = TestPipeline()
  result = (
      p
      | Create([1, 1, 2, 3])
      | beam.CombineGlobally(
          combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
          .with_common_input()).without_defaults())
  assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
  p.run()
def test_timestamped_value(self):
  p = TestPipeline()
  result = (p
            | 'start' >> Create([(k, k) for k in range(10)])
            | Map(lambda (x, t): TimestampedValue(x, t))
            | 'w' >> WindowInto(FixedWindows(5))
            | Map(lambda v: ('key', v))
            | GroupByKey())
  assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                ('key', [5, 6, 7, 8, 9])]))
  p.run()
def test_sessions(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
  result = (pcoll
            | 'w' >> WindowInto(Sessions(10))
            | GroupByKey()
            | sort_values
            | reify_windows)
  expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
              ('key @ [20.0, 45.0)', [20, 27, 35])]
  assert_that(result, equal_to(expected))
  p.run()
def test_sliding_windows(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
  result = (pcoll
            | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
            | GroupByKey()
            | reify_windows)
  expected = [('key @ [-2.0, 2.0)', [1]),
              ('key @ [0.0, 4.0)', [1, 2, 3]),
              ('key @ [2.0, 6.0)', [2, 3])]
  assert_that(result, equal_to(expected))
  p.run()
def test_setting_sliding_windows(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([2, 16, 23])
  items = (unkeyed_items
           | 'key' >> beam.Map(
               lambda x: beam.window.TimestampedValue(('k', x), x)))
  # [START setting_sliding_windows]
  from apache_beam import window
  sliding_windowed_items = (
      items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
  # [END setting_sliding_windows]
  summed = (sliding_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed,
                   beam.equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
  p.run()
def test_end2end_example_proto(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  example = tf.train.Example()
  example.features.feature['int'].int64_list.value.extend(range(3))
  example.features.feature['bytes'].bytes_list.value.extend(
      [b'foo', b'bar'])

  with TestPipeline() as p:
    _ = p | beam.Create([example]) | WriteToTFRecord(
        file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = (p | ReadFromTFRecord(
        file_path_prefix + '-*',
        coder=beam.coders.ProtoCoder(example.__class__)))
    beam.assert_that(actual_data, beam.equal_to([example]))
def test_top_prefixes(self):
  p = TestPipeline()
  words = p | beam.Create(self.WORDS)
  result = words | autocomplete.TopPerPrefix(5)
  # values must be hashable for now
  result = result | beam.Map(lambda (k, vs): (k, tuple(vs)))
  assert_that(result, equal_to(
      [
          ('t', ((3, 'to'), (2, 'this'), (1, 'that'))),
          ('to', ((3, 'to'),)),
          ('th', ((2, 'this'), (1, 'that'))),
          ('thi', ((2, 'this'),)),
          ('this', ((2, 'this'),)),
          ('tha', ((1, 'that'),)),
          ('that', ((1, 'that'),)),
      ]))
  p.run()
def test_read_auto_pattern(self):
  _, lines = write_data(200)
  splits = [0, 34, 100, 140, 164, 188, 200]
  chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
  compressed_chunks = []
  for c in chunks:
    out = cStringIO.StringIO()
    with gzip.GzipFile(fileobj=out, mode="w") as f:
      f.write('\n'.join(c))
    compressed_chunks.append(out.getvalue())
  file_pattern = write_prepared_pattern(
      compressed_chunks, suffixes=['.gz'] * len(chunks))
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(LineSource(
      file_pattern,
      compression_type=CompressionTypes.AUTO))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_rewindow(self):
  p = TestPipeline()
  result = (p
            | Create([(k, k) for k in range(10)])
            | Map(lambda (x, t): TimestampedValue(x, t))
            | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
            # Per the model, each element is now duplicated across
            # three windows. Rewindowing must preserve this duplication.
            | 'rewindow' >> WindowInto(FixedWindows(5))
            | 'rewindow2' >> WindowInto(FixedWindows(5))
            | Map(lambda v: ('key', v))
            | GroupByKey())
  assert_that(result, equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                                ('key', sorted([5, 6, 7, 8, 9] * 3))]))
  p.run()
def test_per_key_sample(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'start-perkey' >> Create(
      sum(([(i, 1), (i, 1), (i, 2), (i, 2)] for i in xrange(9)), []))
  result = pcoll | 'sample' >> combine.Sample.FixedSizePerKey(3)

  def matcher():
    def match(actual):
      for _, samples in actual:
        equal_to([3])([len(samples)])
        num_ones = sum(1 for x in samples if x == 1)
        num_twos = sum(1 for x in samples if x == 2)
        equal_to([1, 2])([num_ones, num_twos])
    return match
  assert_that(result, matcher())
  pipeline.run()
def test_memory_usage(self):
  try:
    import resource
  except ImportError:
    # Skip the test if resource module is not available (e.g. non-Unix os).
    self.skipTest('resource module not available.')
  if platform.mac_ver()[0]:
    # Skip the test on macos, depending on version it returns ru_maxrss in
    # different units.
    self.skipTest('ru_maxrss is not in standard units.')

  def get_memory_usage_in_bytes():
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * (2**10)

  def check_memory(value, memory_threshold):
    memory_usage = get_memory_usage_in_bytes()
    if memory_usage > memory_threshold:
      raise RuntimeError(
          'High memory usage: %d > %d' % (memory_usage, memory_threshold))
    return value

  len_elements = 1000000
  num_elements = 10
  num_maps = 100

  pipeline = TestPipeline()

  # Consumed memory should not be proportional to the number of maps.
  memory_threshold = (
      get_memory_usage_in_bytes() + (5 * len_elements * num_elements))
  # Plus small additional slack for memory fluctuations during the test.
  memory_threshold += 10 * (2**20)

  biglist = pipeline | 'oom:create' >> Create(
      ['x' * len_elements] * num_elements)
  for i in range(num_maps):
    biglist = biglist | ('oom:addone-%d' % i) >> Map(lambda x: x + 'y')
  result = biglist | 'oom:check' >> Map(check_memory, memory_threshold)
  assert_that(result, equal_to(
      ['x' * len_elements + 'y' * num_maps] * num_elements))

  pipeline.run()
def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  output_table = ('BigQueryTornadoesIT'
                  '.monthly_tornadoes_%s' % int(round(time.time() * 1000)))
  query = 'SELECT month, tornado_count FROM [%s]' % output_table
  pipeline_verifiers = [PipelineStateMatcher(),
                        BigqueryMatcher(
                            project=test_pipeline.get_option('project'),
                            query=query,
                            checksum=self.DEFAULT_CHECKSUM)]
  extra_opts = {'output': output_table,
                'on_success_matcher': all_of(*pipeline_verifiers)}

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
def test_read_pattern_bzip2(self):
  _, lines = write_data(200)
  splits = [0, 34, 100, 140, 164, 188, 200]
  chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
  compressed_chunks = []
  for c in chunks:
    compressobj = bz2.BZ2Compressor()
    compressed_chunks.append(
        compressobj.compress('\n'.join(c)) + compressobj.flush())
  file_pattern = write_prepared_pattern(compressed_chunks)
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.io.Read(
      LineSource(
          file_pattern,
          splittable=False,
          compression_type=CompressionTypes.BZIP2))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_deterministic_key(self):
  p = TestPipeline()
  lines = (p | beam.Create(
      ['banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2', 'zucchini,veg,3']))

  # For pickling
  global Player  # pylint: disable=global-variable-not-assigned

  # [START type_hints_deterministic_key]
  class Player(object):
    def __init__(self, team, name):
      self.team = team
      self.name = name

  class PlayerCoder(beam.coders.Coder):
    def encode(self, player):
      return '%s:%s' % (player.team, player.name)

    def decode(self, s):
      return Player(*s.split(':'))

    def is_deterministic(self):
      return True

  beam.coders.registry.register_coder(Player, PlayerCoder)

  def parse_player_and_score(csv):
    name, team, score = csv.split(',')
    return Player(team, name), int(score)

  totals = (
      lines
      | beam.Map(parse_player_and_score)
      | beam.CombinePerKey(sum).with_input_types(
          beam.typehints.Tuple[Player, int]))
  # [END type_hints_deterministic_key]

  assert_that(
      totals | beam.Map(lambda (k, v): (k.name, v)),
      equal_to([('banana', 3), ('kiwi', 4), ('zucchini', 3)]))
  p.run()
def test_process_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_tfidf_transform(self):
  p = TestPipeline()
  uri_to_line = p | 'create sample' >> beam.Create(
      [('1.txt', 'abc def ghi'),
       ('2.txt', 'abc def'),
       ('3.txt', 'abc')])
  result = (
      uri_to_line
      | tfidf.TfIdf()
      | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
def test_hourly_team_score(self):
  with TestPipeline() as p:
    result = (p
              | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
              | hourly_team_score.HourlyTeamScore(
                  start_min='2015-11-16-15-20',
                  stop_min='2015-11-16-17-20',
                  window_duration=60))
    beam.assert_that(result, beam.equal_to([
        ('team1', 18), ('team2', 2), ('team3', 13)]))
def test_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  output = '/'.join([test_pipeline.get_option('output'),
                     test_pipeline.get_option('job_name'),
                     'results'])
  pipeline_verifiers = [PipelineStateMatcher(),
                        FileChecksumMatcher(output + '*-of-*',
                                            self.DEFAULT_CHECKSUM)]
  extra_opts = {'output': output,
                'on_success_matcher': all_of(*pipeline_verifiers)}

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
def test_create_groups(self):
  p = TestPipeline()

  group_ids_pcoll = p | 'CreateGroupIds' >> beam.Create(['A', 'B', 'C'])
  corpus_pcoll = p | 'CreateCorpus' >> beam.Create(
      [{'f': 'corpus1'}, {'f': 'corpus2'}, {'f': 'corpus3'}])
  words_pcoll = p | 'CreateWords' >> beam.Create(
      [{'f': 'word1'}, {'f': 'word2'}, {'f': 'word3'}])
  ignore_corpus_pcoll = p | 'CreateIgnoreCorpus' >> beam.Create(['corpus1'])
  ignore_word_pcoll = p | 'CreateIgnoreWord' >> beam.Create(['word1'])

  groups = bigquery_side_input.create_groups(group_ids_pcoll, corpus_pcoll,
                                             words_pcoll, ignore_corpus_pcoll,
                                             ignore_word_pcoll)

  beam.assert_that(groups, beam.equal_to(
      [('A', 'corpus2', 'word2'),
       ('B', 'corpus2', 'word2'),
       ('C', 'corpus2', 'word2')]))
  p.run()
def test_process_single(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file(path, FOO_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.io.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.AUTO,
                      validate=True)))
    beam.assert_that(result, beam.equal_to(['foo']))
def test_window_param(self):
  class TestDoFn(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      yield (element, (float(window.start), float(window.end)))

  pipeline = TestPipeline()
  pcoll = (pipeline
           | Create([1, 7])
           | Map(lambda x: TimestampedValue(x, x))
           | WindowInto(windowfn=SlidingWindows(10, 5))
           | ParDo(TestDoFn()))
  assert_that(pcoll, equal_to([(1, (-5, 5)), (1, (0, 10)),
                               (7, (0, 10)), (7, (5, 15))]))
  pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
  assert_that(
      pcoll2,
      equal_to([
          ((1, (-5, 5)), (-5, 5)),
          ((1, (0, 10)), (0, 10)),
          ((7, (0, 10)), (0, 10)),
          ((7, (5, 15)), (5, 15))]),
      label='doubled windows')
  pipeline.run()
def test_group_by_key_input_visitor_with_invalid_inputs(self):
  p = TestPipeline()
  pcoll1 = PCollection(p)
  pcoll2 = PCollection(p)
  for transform in [beam.GroupByKeyOnly(), beam.GroupByKey()]:
    pcoll1.element_type = typehints.TupleSequenceConstraint
    pcoll2.element_type = typehints.Set
    err_msg = "Input to GroupByKey must be of Tuple or Any type"
    for pcoll in [pcoll1, pcoll2]:
      with self.assertRaisesRegexp(ValueError, err_msg):
        runner.group_by_key_input_visitor().visit_transform(
            AppliedPTransform(None, transform, "label", [pcoll]))
def test_builtin_combines(self):
  pipeline = TestPipeline()

  vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
  mean = sum(vals) / float(len(vals))
  size = len(vals)

  # First for global combines.
  pcoll = pipeline | 'start' >> Create(vals)
  result_mean = pcoll | 'mean' >> combine.Mean.Globally()
  result_count = pcoll | 'count' >> combine.Count.Globally()
  assert_that(result_mean, equal_to([mean]), label='assert:mean')
  assert_that(result_count, equal_to([size]), label='assert:size')

  # Again for per-key combines.
  pcoll = pipeline | 'start-perkey' >> Create([('a', x) for x in vals])
  result_key_mean = pcoll | 'mean-perkey' >> combine.Mean.PerKey()
  result_key_count = pcoll | 'count-perkey' >> combine.Count.PerKey()
  assert_that(result_key_mean, equal_to([('a', mean)]), label='key:mean')
  assert_that(result_key_count, equal_to([('a', size)]), label='key:size')
  pipeline.run()
def model_group_by_key(contents, output_path):
  """Applying a GroupByKey Transform."""
  import re

  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  words_and_counts = (p
                      | beam.Create(contents)
                      | beam.FlatMap(lambda x: re.findall(r'\w+', x))
                      | 'one word' >> beam.Map(lambda w: (w, 1)))
  # GroupByKey accepts a PCollection of (w, 1) and
  # outputs a PCollection of (w, (1, 1, ...)).
  # (A key/value pair is just a tuple in Python.)
  # This is a somewhat forced example, since one could
  # simply use beam.combiners.Count.PerElement here.
  # [START model_group_by_key_transform]
  grouped_words = words_and_counts | beam.GroupByKey()
  # [END model_group_by_key_transform]
  (grouped_words
   | 'count words' >> beam.Map(lambda (word, counts): (word, len(counts)))
   | beam.io.WriteToText(output_path))
  p.run()
def test_global_sample(self):
  def is_good_sample(actual):
    assert len(actual) == 1
    assert sorted(actual[0]) in [[1, 1, 2], [1, 2, 2]], actual

  with TestPipeline() as pipeline:
    pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])
    for ix in xrange(9):
      assert_that(
          pcoll | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3),
          is_good_sample,
          label='check-%d' % ix)
def test_pardo_side_input(self):
  p = TestPipeline()
  words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

  # [START model_pardo_side_input]
  # Callable takes additional arguments.
  def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
      yield word

  # Construct a deferred side input.
  avg_word_len = (words
                  | beam.Map(len)
                  | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

  # Call with explicit side inputs.
  small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)

  # A single deferred side input.
  larger_than_average = (words | 'large' >> beam.FlatMap(
      filter_using_length,
      lower_bound=pvalue.AsSingleton(avg_word_len)))

  # Mix and match.
  small_but_nontrivial = words | beam.FlatMap(
      filter_using_length,
      lower_bound=2,
      upper_bound=pvalue.AsSingleton(avg_word_len))
  # [END model_pardo_side_input]

  beam.assert_that(small_words, beam.equal_to(['a', 'bb', 'ccc']))
  beam.assert_that(larger_than_average, beam.equal_to(['ccc', 'dddd']),
                   label='larger_than_average')
  beam.assert_that(small_but_nontrivial, beam.equal_to(['bb']),
                   label='small_but_not_trivial')
  p.run()
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.utils.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | 'ReadLines' >> beam.io.ReadFromText(options.input)
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | 'WriteCounts' >> beam.io.WriteToText(options.output))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
def model_join_using_side_inputs(
    name_list, email_list, phone_list, output_path):
  """Joining PCollections using side inputs."""

  import apache_beam as beam
  from apache_beam.pvalue import AsIter

  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_join_using_side_inputs]
  # This code performs a join by receiving the set of names as an input and
  # passing PCollections that contain emails and phone numbers as side inputs
  # instead of using CoGroupByKey.
  names = p | 'names' >> beam.Create(name_list)
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)

  def join_info(name, emails, phone_numbers):
    filtered_emails = []
    for name_in_list, email in emails:
      if name_in_list == name:
        filtered_emails.append(email)

    filtered_phone_numbers = []
    for name_in_list, phone_number in phone_numbers:
      if name_in_list == name:
        filtered_phone_numbers.append(phone_number)

    return '; '.join(['%s' % name,
                      '%s' % ','.join(filtered_emails),
                      '%s' % ','.join(filtered_phone_numbers)])

  contact_lines = names | 'CreateContacts' >> beam.core.Map(
      join_info, AsIter(emails), AsIter(phones))
  # [END model_join_using_side_inputs]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
def test_write_record_auto(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')
  with TestPipeline() as p:
    input_data = ['foo', 'bar']
    _ = p | beam.Create(input_data) | WriteToTFRecord(
        file_path_prefix, file_name_suffix='.gz')

  actual = []
  file_name = glob.glob(file_path_prefix + '-*.gz')[0]
  for r in tf.python_io.tf_record_iterator(
      file_name, options=tf.python_io.TFRecordOptions(
          tf.python_io.TFRecordCompressionType.GZIP)):
    actual.append(r)
  self.assertEqual(actual, input_data)
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
def test_group_by_key_input_visitor_with_valid_inputs(self):
  p = TestPipeline()
  pcoll1 = PCollection(p)
  pcoll2 = PCollection(p)
  pcoll3 = PCollection(p)
  for transform in [beam.GroupByKeyOnly(), beam.GroupByKey()]:
    pcoll1.element_type = None
    pcoll2.element_type = typehints.Any
    pcoll3.element_type = typehints.KV[typehints.Any, typehints.Any]
    for pcoll in [pcoll1, pcoll2, pcoll3]:
      DataflowRunner.group_by_key_input_visitor().visit_transform(
          AppliedPTransform(None, transform, "label", [pcoll]))
      self.assertEqual(pcoll.element_type,
                       typehints.KV[typehints.Any, typehints.Any])
def test_read_auto_pattern_compressed_and_uncompressed(self):
  _, lines = write_data(200)
  splits = [0, 34, 100, 140, 164, 188, 200]
  chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
  chunks_to_write = []
  for i, c in enumerate(chunks):
    if i % 2 == 0:
      out = cStringIO.StringIO()
      with gzip.GzipFile(fileobj=out, mode="w") as f:
        f.write('\n'.join(c))
      chunks_to_write.append(out.getvalue())
    else:
      chunks_to_write.append('\n'.join(c))
  file_pattern = write_prepared_pattern(chunks_to_write,
                                        suffixes=(['.gz', ''] * 3))
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> beam.Read(
      LineSource(
          file_pattern,
          compression_type=fileio.CompressionTypes.AUTO))
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_top_shorthands(self):
  pipeline = TestPipeline()

  pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
  result_top = pcoll | 'top' >> beam.CombineGlobally(combine.Largest(5))
  result_bot = pcoll | 'bot' >> beam.CombineGlobally(combine.Smallest(4))
  assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
  assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

  pcoll = pipeline | 'start-perkey' >> Create(
      [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
  result_ktop = pcoll | 'top-perkey' >> beam.CombinePerKey(
      combine.Largest(5))
  result_kbot = pcoll | 'bot-perkey' >> beam.CombinePerKey(
      combine.Smallest(4))
  assert_that(result_ktop, equal_to([('a', [9, 6, 6, 5, 3])]), label='k:top')
  assert_that(result_kbot, equal_to([('a', [0, 1, 1, 1])]), label='k:bot')
  pipeline.run()
def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
  p = TestPipeline()
  inputs = []
  for _ in range(num_inputs):
    input_pcoll = PCollection(p)
    input_pcoll.element_type = input_type
    inputs.append(input_pcoll)
  output_pcoll = PCollection(p)
  output_pcoll.element_type = output_type

  flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
  flatten.add_output(output_pcoll, None)

  DataflowRunner.flatten_input_visitor().visit_transform(flatten)
  # The visitor should propagate the output element type to every input.
  for input_pcoll in inputs:
    self.assertEqual(input_pcoll.element_type, output_type)