def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read the text file[pattern] into a PCollection.
  lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

  # Count the occurrences of each word.
  counts = (lines
            | (df.ParDo('split', WordExtractingDoFn())
               .with_output_types(unicode))
            | df.Map('pair_with_one', lambda x: (x, 1))
            | df.GroupByKey('group')
            | df.Map('count', lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

  # Write the output; Write is a transform with side effects.
  output | df.io.Write('write', df.io.TextFileSink(known_args.output))

  # Run the pipeline (all operations above are deferred), then read back the
  # aggregator values collected during execution.
  result = p.run()
  empty_line_values = result.aggregated_values(empty_line_aggregator)
  logging.info('number of empty lines: %d', sum(empty_line_values.values()))
  word_length_values = result.aggregated_values(average_word_size_aggregator)
  logging.info('average word lengths: %s', word_length_values.values())
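# --- Sketch (not part of the original wordcount snippet above): the run()
# functions in this file assume module-level imports (argparse, logging, re,
# and `import google.cloud.dataflow as df`) plus a WordExtractingDoFn and two
# aggregators. A minimal version of those helpers might look like the
# following; the real example may differ in its details.
import re

import google.cloud.dataflow as df

# Aggregators whose values are read back via result.aggregated_values(...)
# after p.run() in the wordcount above.
empty_line_aggregator = df.Aggregator('emptyLines')
average_word_size_aggregator = df.Aggregator('averageWordLength',
                                             df.combiners.MeanCombineFn(),
                                             float)


class WordExtractingDoFn(df.DoFn):
  """Parses each line of input text into words."""

  def process(self, context):
    text_line = context.element.strip()
    if not text_line:
      # Track the number of empty lines seen by the pipeline.
      context.aggregate_to(empty_line_aggregator, 1)
    words = re.findall(r'[A-Za-z\']+', text_line)
    for w in words:
      # Track the running average word length.
      context.aggregate_to(average_word_size_aggregator, len(w))
    return words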
def test_combine_values(self):
  occurences = [('cat', 1), ('cat', 5), ('cat', 9), ('dog', 5), ('dog', 2)]
  # [START combine_values]
  first_occurences = occurences | df.GroupByKey() | df.CombineValues(min)
  # [END combine_values]
  self.assertEqual({('cat', 1), ('dog', 2)}, set(first_occurences))
def apply(self, pcoll):
  # Split lines into words, pair each word with 1, group, and sum the ones.
  return (pcoll
          | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x))
             .with_output_types(unicode))
          | df.Map('pair_with_one', lambda x: (x, 1))
          | df.GroupByKey('group')
          | df.Map('count', lambda (word, ones): (word, sum(ones))))
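# --- Sketch (not part of the original snippet): the apply() above reads like
# the body of a composite PTransform, e.g. a CountWords class in the pre-Beam
# Dataflow SDK. Wrapped up and used, it could look roughly like this (the
# class name and the usage line are assumptions):
import re

import google.cloud.dataflow as df


class CountWords(df.PTransform):
  """Composite transform: lines of text in, (word, count) pairs out."""

  def apply(self, pcoll):
    return (pcoll
            | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x))
               .with_output_types(unicode))
            | df.Map('pair_with_one', lambda x: (x, 1))
            | df.GroupByKey('group')
            | df.Map('count', lambda (word, ones): (word, sum(ones))))

# Usage inside a pipeline:
#   counts = lines | CountWords('count words')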
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input_topic', required=True, help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".') parser.add_argument( '--output_topic', required=True, help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read('read', df.io.PubSubSource(known_args.input_topic)) # Capitalize the characters in each line. transformed = (lines | (df.FlatMap('split', lambda x: re.findall( r'[A-Za-z\']+', x)).with_output_types(unicode)) | df.Map('pair_with_one', lambda x: (x, 1)) | df.WindowInto(window.FixedWindows(15, 0)) | df.GroupByKey('group') | df.Map('count', lambda (word, ones): (word, sum(ones))) | df.Map('format', lambda tup: '%s: %d' % tup)) # Write to PubSub. # pylint: disable=expression-not-assigned transformed | df.io.Write('pubsub_write', df.io.PubSubSink(known_args.output_topic)) p.run()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DirectPipelineRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=SET_YOUR_PROJECT_ID_HERE', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--job_name=your-wordcount-job', ]) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input)) # Count the occurrences of each word. counts = (lines | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)). with_output_types(unicode)) | df.Map('pair_with_one', lambda x: (x, 1)) | df.GroupByKey('group') | df.Map('count', lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | df.io.Write('write', df.io.TextFileSink(known_args.output)) # Actually run the pipeline (all operations above are deferred). p.run()
def run(argv=None): """Runs the workflow counting the long words and short words separately.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', required=True, help='Output prefix for files to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) lines = p | df.Read('read', df.io.TextFileSource(known_args.input)) # with_outputs allows accessing the side outputs of a DoFn. split_lines_result = ( lines | df.ParDo(SplitLinesToWordsFn()).with_outputs( SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS, SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT, main='words')) # split_lines_result is an object of type DoOutputsTuple. It supports # accessing result in alternative ways. words, _, _ = split_lines_result short_words = split_lines_result[ SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS] character_count = split_lines_result.tag_character_count # pylint: disable=expression-not-assigned (character_count | df.Map('pair_with_key', lambda x: ('chars_temp_key', x)) | df.GroupByKey() | df.Map('count chars', lambda (_, counts): sum(counts)) | df.Write('write chars', df.io.TextFileSink(known_args.output + '-chars'))) # pylint: disable=expression-not-assigned (short_words | CountWords('count short words') | df.Write('write short words', df.io.TextFileSink(known_args.output + '-short-words'))) # pylint: disable=expression-not-assigned (words | CountWords('count words') | df.Write('write words', df.io.TextFileSink(known_args.output + '-words'))) p.run()
def run(argv=None):
  # pylint: disable=expression-not-assigned
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file pattern to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file pattern to write results to.')
  parser.add_argument('--checksum_output',
                      required=True,
                      help='Checksum output file pattern.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  # Read the text file[pattern] into a PCollection.
  lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

  # Shuffle the records: split each line into a 10-byte key and a payload,
  # group by key, then reassemble the records as lines.
  output = (lines
            | df.Map('split', lambda x: (x[:10], x[10:99]))
            | df.GroupByKey('group')
            | df.FlatMap(
                'format',
                lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

  # Compute and write a checksum of the input lines.
  input_csum = (lines
                | df.Map('input-csum', crc32line)
                | df.CombineGlobally('combine-input-csum', sum)
                | df.Map('hex-format', lambda x: '%x' % x))
  input_csum | df.io.Write(
      'write-input-csum',
      df.io.TextFileSink(known_args.checksum_output + '-input'))

  # Write the output using a "Write" transform that has side effects.
  output | df.io.Write('write', df.io.TextFileSink(known_args.output))

  # Write the output checksum.
  output_csum = (output
                 | df.Map('output-csum', crc32line)
                 | df.CombineGlobally('combine-output-csum', sum)
                 | df.Map('hex-format-output', lambda x: '%x' % x))
  output_csum | df.io.Write(
      'write-output-csum',
      df.io.TextFileSink(known_args.checksum_output + '-output'))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
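# --- Sketch (not part of the original snippet): crc32line is not defined
# here. A minimal version, assuming the checksum is the sum of per-line CRC32
# values formatted as hex above:
import zlib


def crc32line(line):
  # Mask so the result is a non-negative 32-bit value.
  return zlib.crc32(line) & 0xffffffff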
def test_after_count(self):
  p = Pipeline('DirectPipelineRunner')
  result = (p
            | df.Create([1, 2, 3, 4, 5, 10, 11])
            | df.FlatMap(lambda t: [('A', t), ('B', t + 5)])
            | df.Map(lambda (k, t): TimestampedValue((k, t), t))
            | df.WindowInto(FixedWindows(10),
                            trigger=AfterCount(3),
                            accumulation_mode=AccumulationMode.DISCARDING)
            | df.GroupByKey()
            | df.Map(lambda (k, v): ('%s-%s' % (k, len(v)), set(v))))
  assert_that(result, equal_to(
      {
          'A-5': {1, 2, 3, 4, 5},
          # A-10, A-11 never emitted due to AfterCount(3) never firing.
          'B-4': {6, 7, 8, 9},
          'B-3': {10, 15, 16},
      }.iteritems()))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) p = df.Pipeline(argv=pipeline_args) # Read the text file[pattern] into a PCollection. lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input)) # Count the occurrences of each word. counts = (lines | (df.ParDo('split', WordExtractingDoFn()).with_output_types(unicode)) | df.Map('pair_with_one', lambda x: (x, 1)) | df.GroupByKey('group') | df.Map('count', lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | df.io.Write('write', df.io.TextFileSink(known_args.output)) # Actually run the pipeline (all operations above are deferred). result = p.run() empty_line_values = result.aggregated_values(empty_line_aggregator) logging.info('number of empty lines: %d', sum(empty_line_values.values())) word_length_values = result.aggregated_values(average_word_size_aggregator) logging.info('average word lengths: %s', word_length_values.values())
def run(argv=None):  # pylint: disable=missing-docstring
  parser = argparse.ArgumentParser()
  parser.add_argument('--grid_size',
                      dest='grid_size',
                      default=1000,
                      help='Size of the NxN matrix')
  parser.add_argument(
      '--coordinate_output',
      dest='coordinate_output',
      required=True,
      help='Output file to write the color coordinates of the image to.')
  parser.add_argument('--image_output',
                      dest='image_output',
                      default=None,
                      help='Output file to write the resulting image to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)
  n = int(known_args.grid_size)

  coordinates = generate_julia_set_colors(p, complex(-.62772, .42193), n, 100)

  # Group each coordinate triplet by its x value, then write the coordinates
  # to the output file with an x-coordinate grouping per line.
  # pylint: disable=expression-not-assigned
  # pylint: disable=g-long-lambda
  (coordinates
   | df.Map('x coord key', lambda (x, y, i): (x, (x, y, i)))
   | df.GroupByKey('x coord')
   | df.Map('format',
            lambda (k, coords): ' '.join(
                '(%s, %s, %s)' % coord for coord in coords))
   | df.io.Write('write', df.io.TextFileSink(known_args.coordinate_output)))
  # pylint: enable=g-long-lambda
  # pylint: enable=expression-not-assigned

  p.run()
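# --- Sketch (not part of the original snippet): one way the
# generate_julia_set_colors helper used above could be written. The escape-time
# helper get_julia_set_point_color and the mapping of grid points onto the
# complex plane are assumptions for illustration.
import google.cloud.dataflow as df


def get_julia_set_point_color(point, c, n, max_iterations):
  """Returns (x, y, iterations-to-escape) for one grid point."""
  x, y = point
  # Map the grid point onto the square [-1, 1] x [-1, 1] of the complex plane.
  z = (2.0 * x / n - 1.0) + (2.0 * y / n - 1.0) * 1j
  i = 0
  for i in xrange(max_iterations):
    if abs(z) > 2.0:
      break
    z = z * z + c
  return x, y, i


def generate_julia_set_colors(pipeline, c, n, max_iterations):
  """Produces a PCollection of (x, y, color) triplets for an n x n grid."""
  def point_set(points):
    # Materialize every (x, y) coordinate of the grid as the pipeline input.
    return [(x, y) for x in xrange(points) for y in xrange(points)]

  return (pipeline
          | df.Create('add points', point_set(n))
          | df.Map('compute point color',
                   lambda point: get_julia_set_point_color(
                       point, c, n, max_iterations)))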
def apply(self, pcoll):
  return (pcoll
          | df.Map('pair_with_one', lambda x: (x, 1))
          | df.GroupByKey('group')
          | df.Map('count', lambda (word, ones): (word, sum(ones)))
          | df.Map('format', lambda (word, c): '%s: %s' % (word, c)))