def test_bounded_sum(self):
  # [START combine_bounded_sum]
  pc = [1, 10, 100, 1000]

  def bounded_sum(values, bound=500):
    return min(sum(values), bound)

  small_sum = pc | df.CombineGlobally(bounded_sum)              # [500]
  large_sum = pc | df.CombineGlobally(bounded_sum, bound=5000)  # [1111]
  # [END combine_bounded_sum]
  self.assertEqual([500], small_sum)
  self.assertEqual([1111], large_sum)
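# A minimal per-key companion sketch (an assumption, not from the original
# snippets), relying on df.CombinePerKey forwarding extra keyword arguments
# to the callable the same way df.CombineGlobally does above.
def test_bounded_sum_perkey_sketch(self):
  def bounded_sum(values, bound=500):
    return min(sum(values), bound)

  keyed = [('a', 400), ('a', 400), ('b', 100)]
  bounded = keyed | df.CombinePerKey(bounded_sum)  # [('a', 500), ('b', 100)]
  self.assertEqual({('a', 500), ('b', 100)}, set(bounded))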
def test_combine_concat(self):
  pc = ['a', 'b']
  # [START combine_concat]
  def concat(values, separator=', '):
    return separator.join(values)

  with_commas = pc | df.CombineGlobally(concat)
  with_dashes = pc | df.CombineGlobally(concat, separator='-')
  # [END combine_concat]
  self.assertEqual(1, len(with_commas))
  self.assertTrue(with_commas[0] in {'a, b', 'b, a'})
  self.assertEqual(1, len(with_dashes))
  self.assertTrue(with_dashes[0] in {'a-b', 'b-a'})
def run(argv=None):
  # pylint: disable=expression-not-assigned
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file pattern to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file pattern to write results to.')
  parser.add_argument('--checksum_output',
                      required=True,
                      help='Checksum output file pattern.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read the text file[pattern] into a PCollection.
  lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

  # Shuffle the records: group the lines by their first ten characters,
  # then reassemble each group into output lines.
  output = (lines
            | df.Map('split', lambda x: (x[:10], x[10:99]))
            | df.GroupByKey('group')
            | df.FlatMap(
                'format',
                lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

  # Compute and write a checksum over the input lines.
  input_csum = (lines
                | df.Map('input-csum', crc32line)
                | df.CombineGlobally('combine-input-csum', sum)
                | df.Map('hex-format', lambda x: '%x' % x))
  input_csum | df.io.Write(
      'write-input-csum',
      df.io.TextFileSink(known_args.checksum_output + '-input'))

  # Write the output using a "Write" transform that has side effects.
  output | df.io.Write('write', df.io.TextFileSink(known_args.output))

  # Write the output checksum.
  output_csum = (output
                 | df.Map('output-csum', crc32line)
                 | df.CombineGlobally('combine-output-csum', sum)
                 | df.Map('hex-format-output', lambda x: '%x' % x))
  output_csum | df.io.Write(
      'write-output-csum',
      df.io.TextFileSink(known_args.checksum_output + '-output'))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
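# A minimal sketch of the crc32line helper referenced above; its definition
# lies outside this section, so this is an assumption. It computes a per-line
# CRC32 checksum masked to an unsigned 32-bit value (requires a module-level
# `import zlib`).
def crc32line(line):
  return zlib.crc32(line) & 0xffffffff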
def test_tuple_combine_fn(self):
  p = Pipeline('DirectPipelineRunner')
  # TupleCombineFn combines each tuple position with the corresponding
  # CombineFn: max of the strings, mean of the middle values, sum of the
  # last values.
  result = (p
            | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
            | df.CombineGlobally(combine.TupleCombineFn(
                max, combine.MeanCombineFn(), sum)).without_defaults())
  assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
  p.run()
def test_combine_reduce(self):
  factors = [2, 3, 5, 7]
  # [START combine_reduce]
  import functools
  import operator
  product = factors | df.CombineGlobally(
      functools.partial(reduce, operator.mul), 1)
  # [END combine_reduce]
  self.assertEqual([210], product)
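# An equivalent sketch (an assumption, not from the original snippets): the
# functools.partial above is shorthand for an explicit lambda, with 1 passed
# through as reduce's initial value so an empty batch still yields 1.
def test_combine_reduce_lambda_sketch(self):
  import operator
  factors = [2, 3, 5, 7]
  product = factors | df.CombineGlobally(
      lambda values: reduce(operator.mul, values, 1))
  self.assertEqual([210], product)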
def test_tuple_combine_fn_without_defaults(self):
  p = Pipeline('DirectPipelineRunner')
  # with_common_input() feeds every input element to each component
  # CombineFn, yielding (min, mean, max) over the same values.
  result = (p
            | Create([1, 1, 2, 3])
            | df.CombineGlobally(
                combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
                .with_common_input()).without_defaults())
  assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
  p.run()
def test_top_shorthands(self):
  pipeline = Pipeline('DirectPipelineRunner')

  pcoll = pipeline | Create('start', [6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
  result_top = pcoll | df.CombineGlobally('top', combiners.Largest(5))
  result_bot = pcoll | df.CombineGlobally('bot', combiners.Smallest(4))
  assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
  assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

  pcoll = pipeline | Create(
      'start-perkey', [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
  result_ktop = pcoll | df.CombinePerKey('top-perkey', combiners.Largest(5))
  result_kbot = pcoll | df.CombinePerKey('bot-perkey', combiners.Smallest(4))
  assert_that(result_ktop, equal_to([('a', [9, 6, 6, 5, 3])]), label='k:top')
  assert_that(result_kbot, equal_to([('a', [0, 1, 1, 1])]), label='k:bot')
  pipeline.run()
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # A hundred work items of a hundred thousand tries each.
  (p  # pylint: disable=expression-not-assigned
   | df.Create('Initialize', [100000] * 100).with_output_types(int)
   | df.Map('Run trials', run_trials)
   | df.CombineGlobally('Sum', combine_results).without_defaults()
   | df.io.Write('Write',
                 df.io.TextFileSink(known_args.output, coder=JsonCoder())))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
def test_pardo_side_input(self):
  p = df.Pipeline('DirectPipelineRunner')
  words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

  # [START model_pardo_side_input]
  # Callable takes additional arguments.
  def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
      yield word

  # Construct a deferred side input.
  avg_word_len = words | df.Map(len) | df.CombineGlobally(
      df.combiners.MeanCombineFn())

  # Call with explicit side inputs.
  small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

  # A single deferred side input.
  larger_than_average = words | df.FlatMap(
      'large', filter_using_length,
      lower_bound=pvalue.AsSingleton(avg_word_len))

  # Mix and match.
  small_but_nontrivial = words | df.FlatMap(
      filter_using_length,
      lower_bound=2,
      upper_bound=pvalue.AsSingleton(avg_word_len))
  # [END model_pardo_side_input]

  df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
  df.assert_that(larger_than_average, df.equal_to(['ccc', 'dddd']),
                 label='larger_than_average')
  df.assert_that(small_but_nontrivial, df.equal_to(['bb']),
                 label='small_but_not_trivial')
  p.run()
def test_global_sum(self):
  pc = [1, 2, 3]
  # [START global_sum]
  result = pc | df.CombineGlobally(sum)
  # [END global_sum]
  self.assertEqual([6], result)
def test_combine_custom_average(self):
  # Input values chosen so the mean is 4.25, matching the assertion below.
  pc = [2, 3, 5, 7]

  # [START combine_custom_average]
  class AverageFn(df.CombineFn):

    def create_accumulator(self):
      return (0.0, 0)

    def add_input(self, (sum, count), input):
      return sum + input, count + 1

    def merge_accumulators(self, accumulators):
      sums, counts = zip(*accumulators)
      return sum(sums), sum(counts)

    def extract_output(self, (sum, count)):
      return sum / count if count else float('NaN')

  average = pc | df.CombineGlobally(AverageFn())
  # [END combine_custom_average]
  self.assertEqual([4.25], average)

def test_keys(self):
  occurrences = [('cat', 1), ('cat', 5), ('dog', 5), ('cat', 9), ('dog', 2)]
  unique_keys = occurrences | snippets.Keys()
  self.assertEqual({'cat', 'dog'}, set(unique_keys))

def test_count(self):
  occurrences = ['cat', 'dog', 'cat', 'cat', 'dog']
  perkey_counts = occurrences | snippets.Count()
  self.assertEqual({('cat', 3), ('dog', 2)}, set(perkey_counts))
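# A minimal per-key sketch (an assumption, not from the original snippets):
# the built-in df.combiners.MeanCombineFn seen in test_pardo_side_input,
# applied to the values of each key with CombinePerKey.
def test_mean_perkey_sketch(self):
  keyed = [('a', 2), ('a', 6), ('b', 10)]
  means = keyed | df.CombinePerKey(df.combiners.MeanCombineFn())
  self.assertEqual({('a', 4.0), ('b', 10.0)}, set(means))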