def test_setting_timestamp(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
  items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))

  def extract_timestamp_from_log_entry(entry):
    return entry[1]

  # [START setting_timestamp]
  class AddTimestampDoFn(beam.DoFn):

    def process(self, element):
      # Extract the numeric Unix seconds-since-epoch timestamp to be
      # associated with the current log entry.
      unix_timestamp = extract_timestamp_from_log_entry(element)
      # Wrap and emit the current entry and new timestamp in a
      # TimestampedValue.
      yield beam.TimestampedValue(element, unix_timestamp)

  timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
  # [END setting_timestamp]

  fixed_windowed_items = (
      timestamped_items
      | 'window' >> beam.WindowInto(beam.window.FixedWindows(60)))
  summed = (fixed_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed, beam.equal_to([42, 187]))
  p.run()
def test_create_groups(self):
  p = TestPipeline()

  group_ids_pcoll = p | 'CreateGroupIds' >> beam.Create(['A', 'B', 'C'])
  corpus_pcoll = p | 'CreateCorpus' >> beam.Create(
      [{'f': 'corpus1'}, {'f': 'corpus2'}, {'f': 'corpus3'}])
  words_pcoll = p | 'CreateWords' >> beam.Create(
      [{'f': 'word1'}, {'f': 'word2'}, {'f': 'word3'}])
  ignore_corpus_pcoll = p | 'CreateIgnoreCorpus' >> beam.Create(['corpus1'])
  ignore_word_pcoll = p | 'CreateIgnoreWord' >> beam.Create(['word1'])

  groups = bigquery_side_input.create_groups(
      group_ids_pcoll, corpus_pcoll, words_pcoll, ignore_corpus_pcoll,
      ignore_word_pcoll)

  beam.assert_that(groups, beam.equal_to(
      [('A', 'corpus2', 'word2'),
       ('B', 'corpus2', 'word2'),
       ('C', 'corpus2', 'word2')]))
  p.run()
def test_basics(self):
  p = TestPipeline()
  rows = (p | 'create' >> beam.Create([
      {'month': 1, 'day': 1, 'tornado': False},
      {'month': 1, 'day': 2, 'tornado': True},
      {'month': 1, 'day': 3, 'tornado': True},
      {'month': 2, 'day': 1, 'tornado': True}]))
  results = bigquery_tornadoes.count_tornadoes(rows)
  beam.assert_that(results, beam.equal_to([
      {'month': 1, 'tornado_count': 2},
      {'month': 2, 'tornado_count': 1}]))
  p.run().wait_until_finish()
def test_user_score(self):
  with TestPipeline() as p:
    result = (
        p
        | beam.Create(UserScoreTest.SAMPLE_DATA)
        | user_score.UserScore())
    beam.assert_that(result, beam.equal_to([
        ('user1_team1', 50), ('user2_team2', 2), ('user3_team3', 8),
        ('user4_team3', 5)]))
def test_compute_top_sessions(self):
  p = TestPipeline()
  edits = p | beam.Create(self.EDITS)
  result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)

  beam.assert_that(result, beam.equal_to(self.EXPECTED))
  p.run()
def test_basic(self):
  """Test that the correct result is returned for a simple dataset."""
  results = self._get_result_for_month(1)
  beam.assert_that(
      results,
      beam.equal_to([{'year': 2010, 'month': 1, 'day': 1, 'mean_temp': 3},
                     {'year': 2012, 'month': 1, 'day': 2, 'mean_temp': 3}]))
  results.pipeline.run()
def test_process_gzip_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | ReadFromTFRecord(
                  path, compression_type=fileio.CompressionTypes.AUTO))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_gzip(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | ReadFromTFRecord(
                  path, compression_type=CompressionTypes.GZIP))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_gzip_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with beam.Pipeline(DirectRunner()) as p:
    result = (p
              | ReadFromTFRecord(
                  path, compression_type=fileio.CompressionTypes.AUTO))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_hourly_team_score(self):
  with TestPipeline() as p:
    result = (
        p
        | beam.Create(HourlyTeamScoreTest.SAMPLE_DATA)
        | hourly_team_score.HourlyTeamScore(
            start_min='2015-11-16-15-20',
            stop_min='2015-11-16-17-20',
            window_duration=60))
    beam.assert_that(result, beam.equal_to([
        ('team1', 18), ('team2', 2), ('team3', 13)]))
def test_process_gzip(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with beam.Pipeline(DirectRunner()) as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.GZIP)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_gzip(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.GZIP)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_single(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file(path, FOO_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo']))
def test_process_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.io.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.AUTO,
                      validate=True)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_single(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file(path, FOO_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.AUTO,
                      validate=True)))
    beam.assert_that(result, beam.equal_to(['foo']))
def test_end2end(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  # Generate a TFRecord file.
  with TestPipeline() as p:
    expected_data = [self.create_inputs() for _ in range(0, 10)]
    _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
    beam.assert_that(actual_data, beam.equal_to(expected_data))
def model_textio_compressed(renames, expected):
  """Using a Read Transform to read compressed text files."""
  p = TestPipeline()

  # [START model_textio_write_compressed]
  lines = p | 'ReadFromText' >> beam.io.ReadFromText(
      '/path/to/input-*.csv.gz',
      compression_type=beam.io.filesystem.CompressionTypes.GZIP)
  # [END model_textio_write_compressed]

  beam.assert_that(lines, beam.equal_to(expected))
  p.visit(SnippetUtils.RenameFiles(renames))
  p.run().wait_until_finish()
def test_end2end_auto_compression_unsharded(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  # Generate a TFRecord file.
  with beam.Pipeline(DirectRunner()) as p:
    expected_data = [self.create_inputs() for _ in range(0, 10)]
    _ = p | beam.Create(expected_data) | WriteToTFRecord(
        file_path_prefix + '.gz', shard_name_template='')

  # Read the file back and compare.
  with beam.Pipeline(DirectRunner()) as p:
    actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
    beam.assert_that(actual_data, beam.equal_to(expected_data))
def test_pardo_side_input(self):
  p = TestPipeline()
  words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

  # [START model_pardo_side_input]
  # Callable takes additional arguments.
  def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
      yield word

  # Construct a deferred side input.
  avg_word_len = (words
                  | beam.Map(len)
                  | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

  # Call with explicit side inputs.
  small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)

  # A single deferred side input.
  larger_than_average = (words | 'large' >> beam.FlatMap(
      filter_using_length,
      lower_bound=pvalue.AsSingleton(avg_word_len)))

  # Mix and match.
  small_but_nontrivial = words | beam.FlatMap(
      filter_using_length,
      lower_bound=2,
      upper_bound=pvalue.AsSingleton(avg_word_len))
  # [END model_pardo_side_input]

  beam.assert_that(small_words, beam.equal_to(['a', 'bb', 'ccc']))
  beam.assert_that(larger_than_average,
                   beam.equal_to(['ccc', 'dddd']),
                   label='larger_than_average')
  beam.assert_that(small_but_nontrivial,
                   beam.equal_to(['bb']),
                   label='small_but_not_trivial')
  p.run()
def test_end2end_read_write_read(self):
  path = os.path.join(self._new_tempdir(), 'result')
  with TestPipeline() as p:
    # Initial read to validate the pipeline doesn't fail before the file is
    # created.
    _ = p | ReadFromTFRecord(path + '-*', validate=False)
    expected_data = [self.create_inputs() for _ in range(0, 10)]
    _ = p | beam.Create(expected_data) | WriteToTFRecord(
        path, file_name_suffix='.gz')

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = p | ReadFromTFRecord(path + '-*', validate=True)
    beam.assert_that(actual_data, beam.equal_to(expected_data))
def test_composite(self):
  # [START model_composite_transform]
  class ComputeWordLengths(beam.PTransform):

    def expand(self, pcoll):
      # Transform logic goes here.
      return pcoll | beam.Map(lambda x: len(x))
  # [END model_composite_transform]

  p = TestPipeline()
  lengths = p | beam.Create(["a", "ab", "abc"]) | ComputeWordLengths()
  beam.assert_that(lengths, beam.equal_to([1, 2, 3]))
  p.run()
def test_combine_per_key_with_custom_callable(self):
  """CombinePerKey using a custom function reducing iterables."""
  def multiply(values):
    result = 1
    for v in values:
      result *= v
    return result

  result = (TestPipeline()
            | beam.Create(CombinersTest.SAMPLE_DATA)
            | beam.CombinePerKey(multiply))
  beam.assert_that(result, beam.equal_to([('a', 6), ('b', 200), ('c', 100)]))
  result.pipeline.run()
def test_combine_per_key_with_callable(self):
  """CombinePerKey using a standard callable reducing iterables.

  A common case for Dataflow combiners is to sum (or take the max or min of)
  the values of each key. Such standard functions can be used directly as
  combiner functions. In fact, any function "reducing" an iterable to a
  single value can be used.
  """
  result = (TestPipeline()
            | beam.Create(CombinersTest.SAMPLE_DATA)
            | beam.CombinePerKey(sum))
  beam.assert_that(result, beam.equal_to([('a', 6), ('b', 30), ('c', 100)]))
  result.pipeline.run()
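# The docstring above notes that any callable reducing an iterable to a single
# value works as a CombinePerKey combiner. The sketch below illustrates this
# with the built-in max. It is not part of the original test suite, and the
# inline data is an assumption standing in for CombinersTest.SAMPLE_DATA
# (chosen to be consistent with the sum and multiply expectations above).
def test_combine_per_key_with_max_sketch(self):
  """Hypothetical variant: CombinePerKey using the built-in max."""
  result = (TestPipeline()
            | beam.Create([('a', 1), ('a', 2), ('a', 3),
                           ('b', 10), ('b', 20), ('c', 100)])
            | beam.CombinePerKey(max))
  beam.assert_that(result, beam.equal_to([('a', 3), ('b', 20), ('c', 100)]))
  result.pipeline.run()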
def test_tfidf_transform(self):
  p = TestPipeline()
  uri_to_line = p | 'create sample' >> beam.Create(
      [('1.txt', 'abc def ghi'),
       ('2.txt', 'abc def'),
       ('3.txt', 'abc')])
  result = (
      uri_to_line
      | tfidf.TfIdf()
      | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
  beam.assert_that(result, beam.equal_to(EXPECTED_RESULTS))
  # Run the pipeline. Note that the assert_that above adds to the pipeline
  # a check that the result PCollection contains expected values.
  # To actually trigger the check the pipeline must be run.
  p.run()
def run(argv=None):
  """Runs the debugging wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection, count the occurrences of
  # each word, and filter by a list of words.
  filtered_words = (
      p
      | 'read' >> ReadFromText(known_args.input)
      | CountWords()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # assert_that is a convenient PTransform that checks that a PCollection has
  # an expected value. Asserts are best used in unit tests with small data
  # sets, but one is demonstrated here as a teaching tool.
  #
  # Note that assert_that does not produce any output and that successful
  # completion of the pipeline implies that the expectations were met. See
  # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline for more
  # on how to test your pipeline.
  beam.assert_that(filtered_words,
                   beam.equal_to([('Flourish', 3), ('stomach', 1)]))

  # Format the counts into a PCollection of strings and write the output using
  # a "Write" transform that has side effects.
  # pylint: disable=unused-variable
  output = (filtered_words
            | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
            | 'write' >> WriteToText(known_args.output))

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
def test_setting_global_window(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([2, 11, 16, 27])
  items = (unkeyed_items
           | 'key' >> beam.Map(
               lambda x: beam.window.TimestampedValue(('k', x), x)))

  # [START setting_global_window]
  from apache_beam import window
  session_windowed_items = (
      items | 'window' >> beam.WindowInto(window.GlobalWindows()))
  # [END setting_global_window]

  summed = (session_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed, beam.equal_to([56]))
  p.run()
def test_setting_fixed_windows(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
  items = (unkeyed_items
           | 'key' >> beam.Map(
               lambda x: beam.window.TimestampedValue(('k', x), x)))

  # [START setting_fixed_windows]
  from apache_beam import window
  fixed_windowed_items = (
      items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
  # [END setting_fixed_windows]

  summed = (fixed_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed, beam.equal_to([110, 215, 120]))
  p.run()
def test_end2end_example_proto(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  example = tf.train.Example()
  example.features.feature['int'].int64_list.value.extend(range(3))
  example.features.feature['bytes'].bytes_list.value.extend(
      [b'foo', b'bar'])

  with TestPipeline() as p:
    _ = p | beam.Create([example]) | WriteToTFRecord(
        file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = (p | ReadFromTFRecord(
        file_path_prefix + '-*',
        coder=beam.coders.ProtoCoder(example.__class__)))
    beam.assert_that(actual_data, beam.equal_to([example]))
def test_setting_sliding_windows(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([2, 16, 23])
  items = (unkeyed_items
           | 'key' >> beam.Map(
               lambda x: beam.window.TimestampedValue(('k', x), x)))

  # [START setting_sliding_windows]
  from apache_beam import window
  sliding_windowed_items = (
      items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
  # [END setting_sliding_windows]

  summed = (sliding_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  beam.assert_that(unkeyed,
                   beam.equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
  p.run()
def run():
  pipeline = beam.Pipeline()

  filtered_words = (
      pipeline
      | 'read' >> ReadFromText('data/king_arthur.txt')
      | CountWords()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Camelot|Excalibur')))

  beam.assert_that(
      filtered_words,
      beam.equal_to([('Camelot', 33), ('Excalibur', 17)]))

  output = filtered_words | 'format' >> beam.Map(
      lambda (word, count): '{}: {}'.format(word, count))
  output | 'write' >> WriteToText('debugging-wordcount', '.txt')

  pipeline.run().wait_until_finish()
def test_create_transform(self):
  with TestPipeline() as p:
    assert_that(p | Create(range(10)), equal_to(range(10)))
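# For reference, a minimal self-contained sketch of the same Create test,
# assuming a current Apache Beam release where the test matchers live in
# apache_beam.testing (the snippets in this section use an older API surface
# where assert_that/equal_to were available on the beam module itself).
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


def test_create_transform_modern_api():
  # TestPipeline runs the pipeline on exiting the context manager, which is
  # what triggers the assert_that check.
  with TestPipeline() as p:
    assert_that(p | beam.Create(list(range(10))), equal_to(list(range(10))))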
def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets."""
  import re

  import apache_beam as beam

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # custom metrics matched_words and unmatched_words.
      self.matched_words = Metrics.counter(self.__class__, 'matched_words')
      self.unmatched_words = Metrics.counter(self.__class__,
                                             'unmatched_words')

    def process(self, element):
      word, _ = element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this
        # pipeline using the Dataflow service, these log lines will appear
        # in the Cloud Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom metric counter matched_words.
        self.matched_words.inc()
        yield element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging,
        # providing an effective mechanism to filter less important
        # information. Note that currently only "INFO" and higher-level logs
        # are emitted to the Cloud Logger, so this log message will not be
        # visible there.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom metric counter unmatched_words.
        self.unmatched_words.inc()
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = TestPipeline()  # Use TestPipeline for testing.
  filtered_words = (
      p
      | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      | beam.combiners.Count.PerElement()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  beam.assert_that(filtered_words,
                   beam.equal_to([('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  output = (filtered_words
            | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))
            | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
def model_custom_source(count):
  """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting
  from 0 up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly via the
      ``beam.io.Read`` transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.
  """
  import apache_beam as beam
  from apache_beam.io import iobase
  from apache_beam.io.range_trackers import OffsetRangeTracker
  from apache_beam.transforms.core import PTransform
  from apache_beam.utils.pipeline_options import PipelineOptions

  # Defining a new source.
  # [START model_custom_source_new_source]
  class CountingSource(iobase.BoundedSource):

    def __init__(self, count):
      self._count = count

    def estimate_size(self):
      return self._count

    def get_range_tracker(self, start_position, stop_position):
      if start_position is None:
        start_position = 0
      if stop_position is None:
        stop_position = self._count

      return OffsetRangeTracker(start_position, stop_position)

    def read(self, range_tracker):
      for i in range(self._count):
        if not range_tracker.try_claim(i):
          return
        yield i

    def split(self, desired_bundle_size, start_position=None,
              stop_position=None):
      if start_position is None:
        start_position = 0
      if stop_position is None:
        stop_position = self._count

      bundle_start = start_position
      while bundle_start < self._count:
        # Cap each bundle at desired_bundle_size elements.
        bundle_stop = min(self._count, bundle_start + desired_bundle_size)
        yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=bundle_stop)
        bundle_start = bundle_stop
  # [END model_custom_source_new_source]

  # Using the source in an example pipeline.
  # [START model_custom_source_use_new_source]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
  # [END model_custom_source_use_new_source]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()

  # We recommend users to start Source classes with an underscore to
  # discourage using the Source class directly when a PTransform for the
  # source is available. We simulate that here by simply extending the
  # previous Source class.
  class _CountingSource(CountingSource):
    pass

  # [START model_custom_source_new_ptransform]
  class ReadFromCountingSource(PTransform):

    def __init__(self, count, **kwargs):
      super(ReadFromCountingSource, self).__init__(**kwargs)
      self._count = count

    def expand(self, pcoll):
      return pcoll | iobase.Read(_CountingSource(count))
  # [END model_custom_source_new_ptransform]

  # [START model_custom_source_use_ptransform]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
  # [END model_custom_source_use_ptransform]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  # Don't test the runner API due to pickling errors.
  p.run(test_runner_api=False).wait_until_finish()
def test_basic_empty_missing(self):
  """Test that the correct empty result is returned for a missing month."""
  results = self._get_result_for_month(4)
  beam.assert_that(results, beam.equal_to([]))
  results.pipeline.run()
def model_custom_source(count):
  """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting
  from 0 up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly via the
      ``beam.io.Read`` transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.
  """

  # Using the source in an example pipeline.
  # [START model_custom_source_use_new_source]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
  # [END model_custom_source_use_new_source]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()

  # We recommend users to start Source classes with an underscore to
  # discourage using the Source class directly when a PTransform for the
  # source is available. We simulate that here by simply extending the
  # previous Source class.
  class _CountingSource(CountingSource):
    pass

  # [START model_custom_source_new_ptransform]
  class ReadFromCountingSource(PTransform):

    def __init__(self, count, **kwargs):
      super(ReadFromCountingSource, self).__init__(**kwargs)
      self._count = count

    def expand(self, pcoll):
      return pcoll | iobase.Read(_CountingSource(count))
  # [END model_custom_source_new_ptransform]

  # [START model_custom_source_use_ptransform]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
  # [END model_custom_source_use_ptransform]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  # Don't test the runner API due to pickling errors.
  p.run(test_runner_api=False).wait_until_finish()
class TfIdfTest(unittest.TestCase):

  def create_file(self, path, contents):
    logging.info('Creating temp file: %s', path)
    with open(path, 'w') as f:
      f.write(contents)

  def test_tfidf_transform(self):
    p = TestPipeline()
    uri_to_line = p | 'create sample' >> beam.Create(
        [('1.txt', 'abc def ghi'),
         ('2.txt', 'abc def'),
         ('3.txt', 'abc')])
    result = (
        uri_to_line
        | tfidf.TfIdf()
        | beam.Map(lambda (word, (uri, tfidf)): (word, uri, tfidf)))
    beam.assert_that(result, beam.equal_to(EXPECTED_RESULTS))
    # Run the pipeline. Note that the assert_that above adds to the pipeline
    # a check that the result PCollection contains expected values.
    # To actually trigger the check the pipeline must be run.
    p.run()

  def test_basics(self):
    # Setup the files with expected content.
    temp_folder = tempfile.mkdtemp()
    self.create_file(os.path.join(temp_folder, '1.txt'), 'abc def ghi')
    self.create_file(os.path.join(temp_folder, '2.txt'), 'abc def')
    self.create_file(os.path.join(temp_folder, '3.txt'), 'abc')
    tfidf.run([
        '--uris=%s/*' % temp_folder,
        '--output', os.path.join(temp_folder, 'result')])
def test_basic_empty(self):
  """Test that the correct empty result is returned for a simple dataset."""
  results = self._get_result_for_month(3)
  beam.assert_that(results, beam.equal_to([]))
  results.pipeline.run()
def run(argv=None, assert_results=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_email',
      required=True,
      help='Email database, with each line formatted as "name<TAB>email".')
  parser.add_argument(
      '--input_phone',
      required=True,
      help='Phonebook, with each line formatted as "name<TAB>phone number".')
  parser.add_argument(
      '--input_snailmail',
      required=True,
      help='Address database, with each line formatted as "name<TAB>address".')
  parser.add_argument('--output_tsv',
                      required=True,
                      help='Tab-delimited output file.')
  parser.add_argument('--output_stats',
                      required=True,
                      help='Output file for statistics about the input.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Helper: read a tab-separated key-value mapping from a text file, escape all
  # quotes/backslashes, and convert it to a PCollection of (key, value) pairs.
  def read_kv_textfile(label, textfile):
    return (p
            | 'Read: %s' % label >> ReadFromText(textfile)
            | 'Backslash: %s' % label >> beam.Map(
                lambda x: re.sub(r'\\', r'\\\\', x))
            | 'EscapeQuotes: %s' % label >> beam.Map(
                lambda x: re.sub(r'"', r'\"', x))
            | 'Split: %s' % label >> beam.Map(
                lambda x: re.split(r'\t+', x, 1)))

  # Read input databases.
  email = read_kv_textfile('email', known_args.input_email)
  phone = read_kv_textfile('phone', known_args.input_phone)
  snailmail = read_kv_textfile('snailmail', known_args.input_snailmail)

  # Group together all entries under the same name.
  grouped = (email, phone, snailmail) | 'group_by_name' >> beam.CoGroupByKey()

  # Prepare tab-delimited output; something like this:
  # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
  tsv_lines = grouped | beam.Map(
      lambda (name, (email, phone, snailmail)): '\t'.join(
          ['"%s"' % name,
           '"%s"' % ','.join(email),
           '"%s"' % ','.join(phone),
           '"%s"' % next(iter(snailmail), '')]))

  # Compute some stats about our database of people.
  luddites = grouped | beam.Filter(  # People without email.
      lambda (name, (email, phone, snailmail)): not next(iter(email), None))
  writers = grouped | beam.Filter(  # People without phones.
      lambda (name, (email, phone, snailmail)): not next(iter(phone), None))
  nomads = grouped | beam.Filter(  # People without addresses.
      lambda (name, (email, phone, snailmail)): not next(iter(snailmail),
                                                         None))

  num_luddites = luddites | 'Luddites' >> beam.combiners.Count.Globally()
  num_writers = writers | 'Writers' >> beam.combiners.Count.Globally()
  num_nomads = nomads | 'Nomads' >> beam.combiners.Count.Globally()

  # Write tab-delimited output.
  # pylint: disable=expression-not-assigned
  tsv_lines | 'WriteTsv' >> WriteToText(known_args.output_tsv)

  # TODO(silviuc): Move the assert_results logic to the unit test.
  if assert_results is not None:
    expected_luddites, expected_writers, expected_nomads = assert_results
    beam.assert_that(num_luddites, beam.equal_to([expected_luddites]),
                     label='assert:luddites')
    beam.assert_that(num_writers, beam.equal_to([expected_writers]),
                     label='assert:writers')
    beam.assert_that(num_nomads, beam.equal_to([expected_nomads]),
                     label='assert:nomads')

  # Execute pipeline.
  return p.run()
def model_custom_source(count):
  """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting
  from 0 up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to
  be distributed to a large number of end users.

  This method runs two pipelines.

  (1) A pipeline that uses ``CountingSource`` directly via the
      ``beam.io.Read`` transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.
  """
  import apache_beam as beam
  from apache_beam.io import iobase
  from apache_beam.io.range_trackers import OffsetRangeTracker
  from apache_beam.transforms.core import PTransform
  from apache_beam.utils.pipeline_options import PipelineOptions

  # Defining a new source.
  # [START model_custom_source_new_source]
  class CountingSource(iobase.BoundedSource):

    def __init__(self, count):
      self._count = count

    def estimate_size(self):
      return self._count

    def get_range_tracker(self, start_position, stop_position):
      if start_position is None:
        start_position = 0
      if stop_position is None:
        stop_position = self._count

      return OffsetRangeTracker(start_position, stop_position)

    def read(self, range_tracker):
      for i in range(self._count):
        if not range_tracker.try_claim(i):
          return
        yield i

    def split(self, desired_bundle_size, start_position=None,
              stop_position=None):
      if start_position is None:
        start_position = 0
      if stop_position is None:
        stop_position = self._count

      bundle_start = start_position
      while bundle_start < self._count:
        # Cap each bundle at desired_bundle_size elements.
        bundle_stop = min(self._count, bundle_start + desired_bundle_size)
        yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=bundle_stop)
        bundle_start = bundle_stop
  # [END model_custom_source_new_source]

  # Using the source in an example pipeline.
  # [START model_custom_source_use_new_source]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
  # [END model_custom_source_use_new_source]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()

  # We recommend users to start Source classes with an underscore to
  # discourage using the Source class directly when a PTransform for the
  # source is available. We simulate that here by simply extending the
  # previous Source class.
  class _CountingSource(CountingSource):
    pass

  # [START model_custom_source_new_ptransform]
  class ReadFromCountingSource(PTransform):

    def __init__(self, count, **kwargs):
      super(ReadFromCountingSource, self).__init__(**kwargs)
      self._count = count

    def expand(self, pcoll):
      return pcoll | iobase.Read(_CountingSource(count))
  # [END model_custom_source_new_ptransform]

  # [START model_custom_source_use_ptransform]
  p = beam.Pipeline(options=PipelineOptions())
  numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
  # [END model_custom_source_use_ptransform]

  lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
  beam.assert_that(
      lines, beam.equal_to(
          ['line ' + str(number) for number in range(0, count)]))

  p.run().wait_until_finish()