def preprocess(self, input_path, input_dict, output_path):
  """
  Args:
    input_path: Input specified as a uri to a CSV file. Each line of the CSV
      file contains a colon-separated GCS uri to an image and its labels.
    input_dict: Input dictionary, specified as a text file uri. Each line of
      the file stores one label.
  """
  opt = self.pipeline_options.view_as(PrepareImagesOptions)
  p = df.Pipeline(options=self.pipeline_options)

  # Read input data.
  csv_data = df.io.TextFileSource(input_path, strip_trailing_newlines=True)
  dict_data = df.io.TextFileSource(input_dict, strip_trailing_newlines=True)
  labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
  content = (p
             | df.Read(StageName.READ_CSV, csv_data)
             | df.Map(StageName.PARSE_CSV,
                      lambda line: csv.reader([line]).next())
             | df.ParDo(StageName.EXTRACT_LABEL_IDS, ExtractLabelIdsDoFn(),
                        df.pvalue.AsIter(labels))
             | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

  # Process input data using common transformations.
  image_graph_uri = os.path.join(opt.input_data_location,
                                 Default.IMAGE_GRAPH_FILENAME)
  examples = (content
              | df.ParDo(StageName.CONVERT_IMAGE,
                         ResizeImageDoFn(Default.IMAGE_TYPE,
                                         opt.max_image_width,
                                         opt.max_image_height))
              | df.ParDo(StageName.ENCODE_EXAMPLE,
                         EncodeExampleDoFn(image_graph_uri,
                                           opt.image_graph_jpeg_input_tensor,
                                           opt.image_graph_output_tensor,
                                           opt.training_data_percentage)))

  # Write in JSON format to text files.
  # Remove redundant whitespace for a more compact representation.
  # Images/labels are base64 encoded so they will not contain spaces.
  to_json = lambda x: re.sub(r'\s+', ' ', json_format.MessageToJson(x[0]))

  for dataset in Dataset.ALL:
    _ = (examples
         | df.Filter(StageName.FILTER + dataset,
                     lambda x, dataset=dataset: x[1] == dataset)
         | df.Map(StageName.TO_JSON + dataset, to_json)
         | df.Write(StageName.SAVE + dataset,
                    df.io.TextFileSink(
                        '{}.{}.json'.format(output_path, dataset),
                        num_shards=opt.output_shard_count)))

  # Execute the pipeline.
  p.run()
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input_topic', required=True, help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".') parser.add_argument( '--output_topic', required=True, help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read('read', df.io.PubSubSource(known_args.input_topic)) # Capitalize the characters in each line. transformed = (lines | (df.FlatMap('split', lambda x: re.findall( r'[A-Za-z\']+', x)).with_output_types(unicode)) | df.Map('pair_with_one', lambda x: (x, 1)) | df.WindowInto(window.FixedWindows(15, 0)) | df.GroupByKey('group') | df.Map('count', lambda (word, ones): (word, sum(ones))) | df.Map('format', lambda tup: '%s: %d' % tup)) # Write to PubSub. # pylint: disable=expression-not-assigned transformed | df.io.Write('pubsub_write', df.io.PubSubSink(known_args.output_topic)) p.run()
def read_kv_textfile(label, textfile):
  # Note: relies on a Pipeline `p` defined in the enclosing scope.
  # Escapes backslashes and double quotes, then splits each line into a
  # (key, value) pair on the first run of tabs.
  return (p
          | df.io.Read('read_%s' % label, textfile)
          | df.Map('backslash_%s' % label,
                   lambda x: re.sub(r'\\', r'\\\\', x))
          | df.Map('escape_quotes_%s' % label,
                   lambda x: re.sub(r'"', r'\"', x))
          | df.Map('split_%s' % label, lambda x: re.split(r'\t+', x, 1)))
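# A minimal usage sketch (not from the original source): the file uri and
# pipeline setup below are hypothetical. A Pipeline `p` must exist at the
# scope read_kv_textfile resolves it from.
p = df.Pipeline(argv=pipeline_args)
entries = read_kv_textfile(
    'entries', df.io.TextFileSource('gs://my-bucket/entries.txt'))
# Each element of `entries` is a [key, value] pair split on the first tab.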
def _run_repeat_test_good(self, repeat):
  # As a positional argument.
  result = ['a', 'bb', 'c'] | df.Map(repeat, 3)
  self.assertEqual(['aaa', 'bbbbbb', 'ccc'], sorted(result))

  # As a keyword argument.
  result = ['a', 'bb', 'c'] | df.Map(repeat, times=3)
  self.assertEqual(['aaa', 'bbbbbb', 'ccc'], sorted(result))
def _run_repeat_test_bad(self, repeat):
  # Various mismatches.
  with self.assertRaises(typehints.TypeCheckError):
    ['a', 'bb', 'c'] | df.Map(repeat, 'z')
  with self.assertRaises(typehints.TypeCheckError):
    ['a', 'bb', 'c'] | df.Map(repeat, times='z')
  with self.assertRaises(typehints.TypeCheckError):
    ['a', 'bb', 'c'] | df.Map(repeat, 3, 4)
  if not inspect.getargspec(repeat).defaults:
    with self.assertRaises(typehints.TypeCheckError):
      ['a', 'bb', 'c'] | df.Map(repeat)
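# A sketch of the kind of `repeat` callable these helpers exercise, mirroring
# the hinted functions defined in the tests further down; the default value
# is an assumption that makes the no-argument case in the bad-path helper
# pass the getargspec check rather than raise.
@typehints.with_input_types(str, int)
def repeat(s, times=3):
  return s * times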
def run(argv=None):
  # pylint: disable=expression-not-assigned
  parser = argparse.ArgumentParser()
  parser.add_argument('--input', required=True,
                      help='Input file pattern to process.')
  parser.add_argument('--output', required=True,
                      help='Output file pattern to write results to.')
  parser.add_argument('--checksum_output', required=True,
                      help='Checksum output file pattern.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read the text file[pattern] into a PCollection.
  lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

  # Parse each line into a (key, value) record, group by key, and
  # reassemble the lines.
  output = (lines
            | df.Map('split', lambda x: (x[:10], x[10:99]))
            | df.GroupByKey('group')
            | df.FlatMap('format',
                         lambda (key, vals): ['%s%s' % (key, val)
                                              for val in vals]))

  # Compute and write a checksum over the input lines.
  input_csum = (lines
                | df.Map('input-csum', crc32line)
                | df.CombineGlobally('combine-input-csum', sum)
                | df.Map('hex-format', lambda x: '%x' % x))
  input_csum | df.io.Write(
      'write-input-csum',
      df.io.TextFileSink(known_args.checksum_output + '-input'))

  # Write the output using a "Write" transform that has side effects.
  output | df.io.Write('write', df.io.TextFileSink(known_args.output))

  # Write the output checksum.
  output_csum = (output
                 | df.Map('output-csum', crc32line)
                 | df.CombineGlobally('combine-output-csum', sum)
                 | df.Map('hex-format-output', lambda x: '%x' % x))
  output_csum | df.io.Write(
      'write-output-csum',
      df.io.TextFileSink(known_args.checksum_output + '-output'))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
def test_deferred_side_input_iterable(self):
  @typehints.with_input_types(str, typehints.Iterable[str])
  def concat(glue, items):
    return glue.join(sorted(items))

  p = df.Pipeline(options=PipelineOptions([]))
  main_input = p | df.Create(['a', 'bb', 'c'])
  side_input = p | df.Create('side', ['x', 'y', 'z'])
  result = main_input | df.Map(concat, pvalue.AsIter(side_input))
  assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
  p.run()

  bad_side_input = p | df.Create('bad_side', [1, 2, 3])
  with self.assertRaises(typehints.TypeCheckError):
    main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
def test_non_function(self):
  result = ['a', 'bb', 'c'] | df.Map(str.upper)
  self.assertEqual(['A', 'BB', 'C'], sorted(result))

  result = ['xa', 'bbx', 'xcx'] | df.Map(str.strip, 'x')
  self.assertEqual(['a', 'bb', 'c'], sorted(result))

  result = ['1', '10', '100'] | df.Map(int)
  self.assertEqual([1, 10, 100], sorted(result))

  result = ['1', '10', '100'] | df.Map(int, 16)
  self.assertEqual([1, 16, 256], sorted(result))

  with self.assertRaises(typehints.TypeCheckError):
    [1, 2, 3] | df.Map(str.upper)
def test_pardo_using_map(self):
  words = ['aa', 'bbb', 'c']
  # [START model_pardo_using_map]
  word_lengths = words | df.Map(len)
  # [END model_pardo_using_map]
  self.assertEqual({2, 3, 1}, set(word_lengths))
def test_runtime_checks_on(self):
  p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p.options.view_as(TypeOptions).runtime_type_check = True
    p | df.Create(['a']) | df.Map(lambda x: 3).with_output_types(str)
    p.run()
    # [END type_hints_runtime_on]
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)
  lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))
  counts = (lines
            | (df.ParDo('split', WordExtractingDoFn())
               .with_output_types(unicode))
            | df.Map('pair_with_one', lambda x: (x, 1))
            | df.GroupByKey('group')
            | df.Map('count', lambda (word, ones): (word, sum(ones))))
  output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
  # pylint: disable=expression-not-assigned
  output | df.io.Write('write', df.io.TextFileSink(known_args.output))

  result = p.run()
  empty_line_values = result.aggregated_values(empty_line_aggregator)
  logging.info('number of empty lines: %d', sum(empty_line_values.values()))
  word_length_values = result.aggregated_values(average_word_size_aggregator)
  logging.info('average word lengths: %s', word_length_values.values())
def run(argv=sys.argv[1:]):
  """Runs the workflow computing total points from a collection of matches."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input', required=True,
                      help='Input file to process.')
  parser.add_argument('--output', required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Register the custom coder for the Player class, so that it will be used
  # in the computation.
  coders.registry.register_coder(Player, PlayerCoder)

  (p  # pylint: disable=expression-not-assigned
   | df.io.Read('read', df.io.TextFileSource(known_args.input))
   # The get_players function is annotated with a type hint above, so the
   # type system knows the output type of the following operation is a
   # key-value pair of a Player and an int. Please see the documentation for
   # details on types that are inferred automatically as well as other ways
   # to specify type hints.
   | df.Map('get players', get_players)
   # The output type hint of the previous step is used to infer that the key
   # type of the following operation is the Player type. Since a custom
   # coder is registered for the Player class above, a PlayerCoder will be
   # used to encode Player objects as keys for this combine operation.
   | df.CombinePerKey(sum)
   | df.Map(lambda (k, v): '%s,%d' % (k.name, v))
   | df.io.Write('write', df.io.TextFileSink(known_args.output)))
  p.run()
def test_bad_main_input(self):
  @typehints.with_input_types(str, int)
  def repeat(s, times):
    return s * times

  with self.assertRaises(typehints.TypeCheckError):
    [1, 2, 3] | df.Map(repeat, 3)
def test_deferred_side_inputs(self):
  @typehints.with_input_types(str, int)
  def repeat(s, times):
    return s * times

  p = df.Pipeline(options=PipelineOptions([]))
  main_input = p | df.Create(['a', 'bb', 'c'])
  side_input = p | df.Create('side', [3])
  result = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
  assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))
  p.run()

  bad_side_input = p | df.Create('bad_side', ['z'])
  with self.assertRaises(typehints.TypeCheckError):
    main_input | df.Map('again', repeat, pvalue.AsSingleton(bad_side_input))
def test_varargs_side_input_hint(self):
  @typehints.with_input_types(str, int)
  def repeat(s, *times):
    return s * times[0]

  result = ['a', 'bb', 'c'] | df.Map(repeat, 3)
  self.assertEqual(['aaa', 'bbbbbb', 'ccc'], sorted(result))
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input_topic', dest='input_topic', required=True, help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".') parser.add_argument( '--output_topic', dest='output_topic', required=True, help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read( 'read', df.io.PubSubSource(known_args.input_topic)) # Capitalize the characters in each line. transformed = (lines | (df.Map('capitalize', lambda x: x.upper()))) # Write to PubSub. # pylint: disable=expression-not-assigned transformed | df.io.Write( 'pubsub_write', df.io.PubSubSink(known_args.output_topic)) p.run()
def apply(self, pcoll):
  return (pcoll
          | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x))
             .with_output_types(unicode))
          | df.Map('pair_with_one', lambda x: (x, 1))
          | df.GroupByKey('group')
          | df.Map('count', lambda (word, ones): (word, sum(ones))))
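# A minimal usage sketch (an assumption, not from the original source): the
# apply method above presumably lives on a df.PTransform subclass named
# CountWords, matching the transform applied elsewhere in this collection;
# the input uri is hypothetical.
counts = (p
          | df.io.Read('read', df.io.TextFileSource('gs://my-bucket/in.txt'))
          | CountWords('count words'))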
def test_loose_bounds(self):
  @typehints.with_input_types(typehints.Union[int, float, long])
  @typehints.with_output_types(basestring)
  def format_number(x):
    return '%g' % x

  result = [1, 2, 3] | df.Map(format_number)
  self.assertEqual(['1', '2', '3'], sorted(result))
def test_pardo_with_label(self):
  words = ['aa', 'bbc', 'defg']
  # [START model_pardo_with_label]
  result = words | df.Map('CountUniqueLetters',
                          lambda word: len(set(word)))
  # [END model_pardo_with_label]
  self.assertEqual({1, 2, 4}, set(result))
def test_bad_types(self):
  p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)

  # [START type_hints_missing_define_numbers]
  numbers = p | df.Create(['1', '2', '3'])
  # [END type_hints_missing_define_numbers]

  # Consider the following code.
  # [START type_hints_missing_apply]
  evens = numbers | df.Filter(lambda x: x % 2 == 0)
  # [END type_hints_missing_apply]

  # Now suppose numbers was defined as [snippet above].
  # When running this pipeline, you'd get a runtime error,
  # possibly on a remote machine, possibly very late.
  with self.assertRaises(TypeError):
    p.run()

  # To catch this early, we can assert what types we expect.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_takes]
    p.options.view_as(TypeOptions).pipeline_type_check = True
    evens = numbers | df.Filter(lambda x: x % 2 == 0).with_input_types(int)
    # [END type_hints_takes]

  # Type hints can be declared on DoFns and callables as well, rather
  # than where they're used, to be more self-contained.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_do_fn]
    @df.typehints.with_input_types(int)
    class FilterEvensDoFn(df.DoFn):
      def process(self, context):
        if context.element % 2 == 0:
          yield context.element
    evens = numbers | df.ParDo(FilterEvensDoFn())
    # [END type_hints_do_fn]

  words = p | df.Create('words', ['a', 'bb', 'c'])

  # One can assert outputs and apply them to transforms as well.
  # Helps document the contract and checks it at pipeline construction time.
  # [START type_hints_transform]
  T = df.typehints.TypeVariable('T')

  @df.typehints.with_input_types(T)
  @df.typehints.with_output_types(df.typehints.Tuple[int, T])
  class MyTransform(df.PTransform):
    def apply(self, pcoll):
      return pcoll | df.Map(lambda x: (len(x), x))

  words_with_lens = words | MyTransform()
  # [END type_hints_transform]

  with self.assertRaises(typehints.TypeCheckError):
    words_with_lens | df.Map(lambda x: x).with_input_types(
        df.typehints.Tuple[int, int])
def read_documents(pipeline, uris):
  """Reads the documents at the provided uris and returns (uri, line) pairs."""
  pcolls = []
  for uri in uris:
    pcolls.append(
        pipeline
        | df.io.Read('read: %s' % uri, df.io.TextFileSource(uri))
        | df.Map('withkey: %s' % uri, lambda v, uri: (uri, v), uri))
  return pcolls | df.Flatten('flatten read pcolls')
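# A minimal usage sketch (hypothetical URIs, not from the original source):
# each output element is tagged with the file it came from.
p = df.Pipeline(argv=pipeline_args)
docs = read_documents(
    p, ['gs://my-bucket/doc1.txt', 'gs://my-bucket/doc2.txt'])
# docs contains e.g. ('gs://my-bucket/doc1.txt', 'first line of doc1'), ...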
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DirectPipelineRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=SET_YOUR_PROJECT_ID_HERE', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--job_name=your-wordcount-job', ]) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input)) # Count the occurrences of each word. counts = (lines | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)). with_output_types(unicode)) | df.Map('pair_with_one', lambda x: (x, 1)) | df.GroupByKey('group') | df.Map('count', lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | df.io.Write('write', df.io.TextFileSink(known_args.output)) # Actually run the pipeline (all operations above are deferred). p.run()
def run(argv=None): """Runs the workflow counting the long words and short words separately.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', required=True, help='Output prefix for files to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) lines = p | df.Read('read', df.io.TextFileSource(known_args.input)) # with_outputs allows accessing the side outputs of a DoFn. split_lines_result = ( lines | df.ParDo(SplitLinesToWordsFn()).with_outputs( SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS, SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT, main='words')) # split_lines_result is an object of type DoOutputsTuple. It supports # accessing result in alternative ways. words, _, _ = split_lines_result short_words = split_lines_result[ SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS] character_count = split_lines_result.tag_character_count # pylint: disable=expression-not-assigned (character_count | df.Map('pair_with_key', lambda x: ('chars_temp_key', x)) | df.GroupByKey() | df.Map('count chars', lambda (_, counts): sum(counts)) | df.Write('write chars', df.io.TextFileSink(known_args.output + '-chars'))) # pylint: disable=expression-not-assigned (short_words | CountWords('count short words') | df.Write('write short words', df.io.TextFileSink(known_args.output + '-short-words'))) # pylint: disable=expression-not-assigned (words | CountWords('count words') | df.Write('write words', df.io.TextFileSink(known_args.output + '-words'))) p.run()
def generate_julia_set_colors(pipeline, c, n, max_iterations):
  """Compute julia set coordinates for each point in our set."""
  def point_set(n):
    for x in range(n):
      for y in range(n):
        yield (x, y)

  julia_set_colors = (pipeline
                      | df.Create('add points', point_set(n))
                      | df.Map(get_julia_set_point_color, c, n,
                               max_iterations))
  return julia_set_colors
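# A minimal usage sketch (not from the original source): the grid size,
# iteration cap, and the Julia constant below are illustrative choices.
p = df.Pipeline(argv=pipeline_args)
colors = generate_julia_set_colors(p, complex(-0.62772, 0.42193), 100, 100)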
def filter_cold_days(input_data, month_filter):
  """Workflow computing rows in a specific month with low temperatures.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary must have the keys ['year', 'month', 'day', 'mean_temp'].
    month_filter: an int representing the month for which colder-than-average
      days should be returned.

  Returns:
    A PCollection of dictionaries with the same keys described above. Each
    row represents a day in the specified month where temperatures were
    colder than the global mean temperature in the entire dataset.
  """
  # Project to only the desired fields from a complete input row.
  # E.g., SELECT f1, f2, f3, ... FROM InputTable.
  projection_fields = ['year', 'month', 'day', 'mean_temp']
  fields_of_interest = (
      input_data
      | df.Map('projected',
               lambda row: {f: row[f] for f in projection_fields}))

  # Compute the global mean temperature.
  global_mean = AsSingleton(
      fields_of_interest
      | df.Map('extract mean', lambda row: row['mean_temp'])
      | df.combiners.Mean.Globally('global mean'))

  # Filter to the rows representing days in the month of interest
  # in which the mean daily temperature is below the global mean.
  return (
      fields_of_interest
      | df.Filter('desired month', lambda row: row['month'] == month_filter)
      | df.Filter('below mean',
                  lambda row, mean: row['mean_temp'] < mean, global_mean))
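# A minimal usage sketch (hypothetical rows, not from the original source):
# keep June days colder than the dataset-wide mean; extra keys are dropped
# by the projection step.
rows = p | df.Create('make rows', [
    {'year': 2010, 'month': 6, 'day': 1, 'mean_temp': 3.2, 'extra': 'x'},
    {'year': 2010, 'month': 6, 'day': 2, 'mean_temp': 25.0},
])
cold_june_days = filter_cold_days(rows, 6)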
def Count(label, pcoll, factor=1):
  """Count as a decorated function with a side input.

  Args:
    label: optional label for this transform
    pcoll: the PCollection passed in from the previous transform
    factor: the amount by which to count

  Returns:
    A PCollection counting the number of times each unique element occurs.
  """
  return (
      pcoll
      | df.Map('Init', lambda v: (v, factor))
      | df.CombinePerKey(sum))
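# A minimal usage sketch (assumptions, not from the original source): the
# docstring calls this a decorated function, so the original module
# presumably wraps it with something like @df.ptransform_fn, which lets the
# pipe operator supply the pcoll argument.
counted = p | df.Create('items', ['a', 'b', 'a']) | Count('count', factor=2)
# yields ('a', 4) and ('b', 2)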
def run(argv=None):
  # pylint: disable=expression-not-assigned
  parser = argparse.ArgumentParser()
  parser.add_argument('--output', required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # A hundred work items of a hundred thousand tries each.
  (p
   | df.Create('Initialize', [100000] * 100).with_output_types(int)
   | df.Map('Run trials', run_trials)
   | df.CombineGlobally('Sum', combine_results).without_defaults()
   | df.io.Write('Write', df.io.TextFileSink(known_args.output,
                                             coder=JsonCoder())))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input)) # Count the occurrences of each word. counts = (lines | (df.ParDo('split', WordExtractingDoFn()).with_output_types(unicode)) | df.Map('pair_with_one', lambda x: (x, 1)) | df.GroupByKey('group') | df.Map('count', lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | df.io.Write('write', df.io.TextFileSink(known_args.output)) # Actually run the pipeline (all operations above are deferred). result = p.run() empty_line_values = result.aggregated_values(empty_line_aggregator) logging.info('number of empty lines: %d', sum(empty_line_values.values())) word_length_values = result.aggregated_values(average_word_size_aggregator) logging.info('average word lengths: %s', word_length_values.values())
def test_pardo_side_input(self):
  p = df.Pipeline('DirectPipelineRunner')
  words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

  # [START model_pardo_side_input]
  # Callable takes additional arguments.
  def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
      yield word

  # Construct a deferred side input.
  avg_word_len = words | df.Map(len) | df.CombineGlobally(
      df.combiners.MeanCombineFn())

  # Call with explicit side inputs.
  small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

  # A single deferred side input.
  larger_than_average = words | df.FlatMap(
      'large', filter_using_length,
      lower_bound=pvalue.AsSingleton(avg_word_len))

  # Mix and match.
  small_but_nontrivial = words | df.FlatMap(
      filter_using_length,
      lower_bound=2,
      upper_bound=pvalue.AsSingleton(avg_word_len))
  # [END model_pardo_side_input]

  df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
  df.assert_that(larger_than_average, df.equal_to(['ccc', 'dddd']),
                 label='larger_than_average')
  df.assert_that(small_but_nontrivial, df.equal_to(['bb']),
                 label='small_but_not_trivial')
  p.run()
def test_deterministic_key(self):
  p = df.Pipeline('DirectPipelineRunner', argv=sys.argv)
  lines = ['banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2',
           'zucchini,veg,3']

  # [START type_hints_deterministic_key]
  class Player(object):
    def __init__(self, team, name):
      self.team = team
      self.name = name

  class PlayerCoder(df.coders.Coder):
    def encode(self, player):
      return '%s:%s' % (player.team, player.name)

    def decode(self, s):
      return Player(*s.split(':'))

    def is_deterministic(self):
      return True

  df.coders.registry.register_coder(Player, PlayerCoder)

  def parse_player_and_score(csv):
    name, team, score = csv.split(',')
    return Player(team, name), int(score)

  totals = (lines
            | df.Map(parse_player_and_score)
            | df.CombinePerKey(sum).with_input_types(
                df.typehints.Tuple[Player, int]))
  # [END type_hints_deterministic_key]

  self.assertEquals({('banana', 3), ('kiwi', 4), ('zucchini', 3)},
                    set(totals | df.Map(lambda (k, v): (k.name, v))))