Example #1
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))
    counts = (lines
              | (df.ParDo('split',
                          WordExtractingDoFn()).with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    result = p.run()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
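
Example #1 references WordExtractingDoFn and two aggregators that are defined elsewhere in the original file. A minimal sketch of what they plausibly look like, assuming the legacy SDK's context-based DoFn and Aggregator APIs (the details below are a reconstruction, not the original code):

import re

import google.cloud.dataflow as df

empty_line_aggregator = df.Aggregator('emptyLines')
average_word_size_aggregator = df.Aggregator('averageWordLength',
                                             df.combiners.MeanCombineFn(),
                                             float)


class WordExtractingDoFn(df.DoFn):
    """Parses each line of input text into words, feeding the aggregators."""

    def process(self, context):
        text_line = context.element.strip()
        if not text_line:
            # Track how many input lines are empty.
            context.aggregate_to(empty_line_aggregator, 1)
        words = re.findall(r'[A-Za-z\']+', text_line)
        for w in words:
            # Feed each word's length into the running mean.
            context.aggregate_to(average_word_size_aggregator, len(w))
        return words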
Example #2
 def test_combine_values(self):
     occurrences = [('cat', 1), ('cat', 5), ('cat', 9), ('dog', 5),
                    ('dog', 2)]
     # [START combine_values]
     first_occurrences = occurrences | df.GroupByKey() | df.CombineValues(min)
     # [END combine_values]
     self.assertEqual({('cat', 1), ('dog', 2)}, set(first_occurrences))
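
For reference, df.GroupByKey() turns the input into pairs like ('cat', [1, 5, 9]) and ('dog', [5, 2]), and df.CombineValues(min) then reduces each value list with min. A roughly equivalent one-step form, sketched here rather than taken from the original test, is df.CombinePerKey:

first_occurrences = occurrences | df.CombinePerKey(min)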
Example #3
 def apply(self, pcoll):
     return (pcoll
             | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)
                           ).with_output_types(unicode))
             | df.Map('pair_with_one', lambda x: (x, 1))
             | df.GroupByKey('group')
             | df.Map('count', lambda (word, ones): (word, sum(ones))))
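
The apply method above is the body of a composite transform. A minimal sketch of the enclosing class and its use, assuming the legacy SDK's df.PTransform base class (the name CountWords is an assumption, chosen to match its use in Example #6):

class CountWords(df.PTransform):
    """Composite transform: text lines in, (word, count) pairs out."""

    def apply(self, pcoll):
        return (pcoll
                | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)
                              ).with_output_types(unicode))
                | df.Map('pair_with_one', lambda x: (x, 1))
                | df.GroupByKey('group')
                | df.Map('count', lambda (word, ones): (word, sum(ones))))

# Usage: counts = lines | CountWords('count words')

Example #4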
def run(argv=None):
    """Build and run the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        required=True,
        help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    parser.add_argument(
        '--output_topic',
        required=True,
        help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.PubSubSource(known_args.input_topic))

    # Count the occurrences of each word, within fixed 15-second windows.
    transformed = (lines
                   | (df.FlatMap('split', lambda x: re.findall(
                       r'[A-Za-z\']+', x)).with_output_types(unicode))
                   | df.Map('pair_with_one', lambda x: (x, 1))
                   | df.WindowInto(window.FixedWindows(15, 0))
                   | df.GroupByKey('group')
                   | df.Map('count', lambda (word, ones): (word, sum(ones)))
                   | df.Map('format', lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | df.io.Write('pubsub_write',
                              df.io.PubSubSink(known_args.output_topic))

    p.run()
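
This streaming variant relies on a window module for FixedWindows(15, 0), which assigns each element to a 15-second fixed window (offset 0), so the GroupByKey that follows emits one count per word per window. A sketch of the imports the snippet appears to assume (module paths are a guess based on the legacy SDK layout):

import argparse
import re

import google.cloud.dataflow as df
# The window module path may differ by SDK version.
from google.cloud.dataflow.transforms import window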
Example #5
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectPipelineRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    counts = (lines
              | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)).
                 with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
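
The legacy examples conventionally end with a main guard like the following, so the file can be run directly once the placeholder values above are filled in (a conventional sketch, not part of this snippet):

import logging

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()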
Example #6
def run(argv=None):
    """Runs the workflow counting the long words and short words separately."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output prefix for files to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.Read('read', df.io.TextFileSource(known_args.input))

    # with_outputs allows accessing the side outputs of a DoFn.
    split_lines_result = (
        lines
        | df.ParDo(SplitLinesToWordsFn()).with_outputs(
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
            main='words'))

    # split_lines_result is an object of type DoOutputsTuple. It supports
    # accessing the results in several alternative ways, shown below.
    words, _, _ = split_lines_result
    short_words = split_lines_result[
        SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
    character_count = split_lines_result.tag_character_count

    # pylint: disable=expression-not-assigned
    (character_count
     | df.Map('pair_with_key', lambda x: ('chars_temp_key', x))
     | df.GroupByKey()
     | df.Map('count chars', lambda (_, counts): sum(counts))
     | df.Write('write chars',
                df.io.TextFileSink(known_args.output + '-chars')))

    # pylint: disable=expression-not-assigned
    (short_words
     | CountWords('count short words')
     | df.Write('write short words',
                df.io.TextFileSink(known_args.output + '-short-words')))

    # pylint: disable=expression-not-assigned
    (words
     | CountWords('count words')
     | df.Write('write words',
                df.io.TextFileSink(known_args.output + '-words')))

    p.run()
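
This workflow assumes a SplitLinesToWordsFn that declares the two side-output tags it references. A minimal sketch, assuming the legacy SDK's pvalue.SideOutputValue wrapper for tagged outputs (the three-character cutoff for "short" words is an assumption):

from google.cloud.dataflow import pvalue


class SplitLinesToWordsFn(df.DoFn):
    """Splits lines into words, emitting two side outputs."""

    SIDE_OUTPUT_TAG_SHORT_WORDS = 'tag_short_words'
    SIDE_OUTPUT_TAG_CHARACTER_COUNT = 'tag_character_count'

    def process(self, context):
        # The line's character count goes to one side output.
        yield pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                                     len(context.element))
        for word in re.findall(r'[A-Za-z\']+', context.element):
            if len(word) <= 3:
                # Short words go to the other side output.
                yield pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_SHORT_WORDS,
                                             word)
            else:
                # Longer words form the main output.
                yield word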
Example #7
def run(argv=None):
    # pylint: disable=expression-not-assigned

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file pattern to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file pattern to write results to.')
    parser.add_argument('--checksum_output',
                        required=True,
                        help='Checksum output file pattern.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Split each line into a key (its first 10 characters) and a value,
    # group by key, and re-emit one formatted line per value.
    output = (lines
              | df.Map('split', lambda x: (x[:10], x[10:99]))
              | df.GroupByKey('group')
              | df.FlatMap('format',
                           lambda (key, vals):
                           ['%s%s' % (key, val) for val in vals]))

    input_csum = (lines
                  | df.Map('input-csum', crc32line)
                  | df.CombineGlobally('combine-input-csum', sum)
                  | df.Map('hex-format', lambda x: '%x' % x))
    input_csum | df.io.Write(
        'write-input-csum',
        df.io.TextFileSink(known_args.checksum_output + '-input'))

    # Write the output using a "Write" transform that has side effects.
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))
    # Write the output checksum
    output_csum = (output
                   | df.Map('output-csum', crc32line)
                   | df.CombineGlobally('combine-output-csum', sum)
                   | df.Map('hex-format-output', lambda x: '%x' % x))
    output_csum | df.io.Write(
        'write-output-csum',
        df.io.TextFileSink(known_args.checksum_output + '-output'))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
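
crc32line is not defined in the snippet. A one-line sketch of what it plausibly does, using the standard library's zlib.crc32 masked to an unsigned 32-bit value so the '%x' formatting is stable:

import zlib


def crc32line(line):
    # Mask because Python 2's zlib.crc32 can return a signed value.
    return zlib.crc32(line) & 0xffffffff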
Example #8
 def test_after_count(self):
     p = Pipeline('DirectPipelineRunner')
     result = (p
               | df.Create([1, 2, 3, 4, 5, 10, 11])
               | df.FlatMap(lambda t: [('A', t), ('B', t + 5)])
               | df.Map(lambda (k, t): TimestampedValue((k, t), t))
               | df.WindowInto(
                   FixedWindows(10),
                   trigger=AfterCount(3),
                   accumulation_mode=AccumulationMode.DISCARDING)
               | df.GroupByKey()
               | df.Map(lambda (k, v): ('%s-%s' % (k, len(v)), set(v))))
     assert_that(
         result,
         equal_to({
             'A-5': {1, 2, 3, 4, 5},
              # The 'A' values 10 and 11 land in a window with only two
              # elements, so AfterCount(3) never fires and no pane is emitted.
             'B-4': {6, 7, 8, 9},
             'B-3': {10, 15, 16},
         }.iteritems()))
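
The bare names in this test (Pipeline, TimestampedValue, FixedWindows, AfterCount, AccumulationMode, assert_that, equal_to) come from imports elsewhere in the test file. A sketch of plausible import lines, assuming the legacy SDK mirrors the module layout later used by Apache Beam (exact paths are an assumption):

from google.cloud.dataflow import Pipeline
from google.cloud.dataflow.transforms.trigger import AccumulationMode
from google.cloud.dataflow.transforms.trigger import AfterCount
from google.cloud.dataflow.transforms.util import assert_that, equal_to
from google.cloud.dataflow.transforms.window import FixedWindows
from google.cloud.dataflow.transforms.window import TimestampedValue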
Example #9
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file[pattern] into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    counts = (lines
              | (df.ParDo('split',
                          WordExtractingDoFn()).with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    empty_line_values = result.aggregated_values(empty_line_aggregator)
    logging.info('number of empty lines: %d', sum(empty_line_values.values()))
    word_length_values = result.aggregated_values(average_word_size_aggregator)
    logging.info('average word lengths: %s', word_length_values.values())
Example #10
def run(argv=None):  # pylint: disable=missing-docstring

    parser = argparse.ArgumentParser()
    parser.add_argument('--grid_size',
                        dest='grid_size',
                        default=1000,
                        help='Size of the NxN matrix')
    parser.add_argument(
        '--coordinate_output',
        dest='coordinate_output',
        required=True,
        help='Output file to write the color coordinates of the image to.')
    parser.add_argument('--image_output',
                        dest='image_output',
                        default=None,
                        help='Output file to write the resulting image to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    n = int(known_args.grid_size)

    coordinates = generate_julia_set_colors(p, complex(-.62772, .42193), n,
                                            100)

    # Group each coordinate triplet by its x value, then write the coordinates to
    # the output file with an x-coordinate grouping per line.
    # pylint: disable=expression-not-assigned
    # pylint: disable=g-long-lambda
    (coordinates
     | df.Map('x coord key', lambda (x, y, i): (x, (x, y, i)))
     | df.GroupByKey('x coord')
     | df.Map('format',
              lambda (k, coords): ' '.join('(%s, %s, %s)' % coord
                                           for coord in coords))
     | df.io.Write('write', df.io.TextFileSink(known_args.coordinate_output)))
    # pylint: enable=g-long-lambda
    # pylint: enable=expression-not-assigned
    p.run()
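
generate_julia_set_colors is defined elsewhere in the original file. A simplified, illustrative reconstruction (not the original code): seed the pipeline with every (x, y) grid point, then map each point to the number of iterations before z -> z*z + c escapes.

def get_escape_iterations(xy, c, n, max_iterations):
    # Map the grid cell onto the square [-1, 1] x [-1, 1] in the complex plane.
    x, y = xy
    z = complex(2.0 * x / n - 1.0, 2.0 * y / n - 1.0)
    i = 0
    while i < max_iterations and abs(z) <= 2.0:
        z = z * z + c
        i += 1
    return (x, y, i)


def generate_julia_set_colors(pipeline, c, n, max_iterations):
    points = [(x, y) for x in xrange(n) for y in xrange(n)]
    return (pipeline
            | df.Create('create points', points)
            | df.Map('compute iterations',
                     lambda xy: get_escape_iterations(xy, c, n, max_iterations)))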
Example #11
 def apply(self, pcoll):
     return (pcoll
             | df.Map('pair_with_one', lambda x: (x, 1))
             | df.GroupByKey('group')
             | df.Map('count', lambda (word, ones): (word, sum(ones)))
             | df.Map('format', lambda (word, c): '%s: %s' % (word, c)))
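
Unlike the composite in Example #3, this apply expects its input PCollection to already contain individual words; it only pairs, counts, and formats them. A hedged usage sketch (CountAndFormatWords is a hypothetical name for the enclosing class):

words = lines | df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x))
formatted = words | CountAndFormatWords('count and format')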