Example #1
def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    """Generate groups given the input PCollections."""
    def attach_corpus_fn(group, corpus, ignore):
        selected = None
        len_corpus = len(corpus)
        while not selected:
            # randrange excludes its stop value, so pass len_corpus directly
            # to allow every entry (including the last) to be selected.
            c = corpus[randrange(len_corpus)].values()[0]
            if c != ignore:
                selected = c

        yield (group, selected)

    def attach_word_fn(group, words, ignore):
        selected = None
        len_words = len(words)
        while not selected:
            c = words[randrange(len_words)].values()[0]
            if c != ignore:
                selected = c

        yield group + (selected, )

    return (group_ids
            | df.FlatMap('attach corpus', attach_corpus_fn, AsList(corpus),
                         AsSingleton(ignore_corpus))
            | df.FlatMap('attach word', attach_word_fn, AsIter(word),
                         AsSingleton(ignore_word)))
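For context, here is a minimal driver sketch. It is not part of the original example (which builds these PCollections elsewhere) and assumes the snippet's imports (google.cloud.dataflow as df, randrange, and the AsList/AsIter/AsSingleton wrappers) are already in scope; the literal values are invented.

# Sketch (assumption): drive create_groups with small in-memory inputs.
# Each corpus/word row is a one-field dict, so .values()[0] inside the
# helpers picks out its only value.
p = df.Pipeline('DirectPipelineRunner')
group_ids = p | df.Create('create group ids', ['g1', 'g2', 'g3'])
corpus = p | df.Create('create corpus',
                       [{'f': 'hamlet'}, {'f': 'kinglear'}, {'f': 'othello'}])
word = p | df.Create('create words',
                     [{'f': 'brief'}, {'f': 'thane'}, {'f': 'fool'}])
ignore_corpus = p | df.Create('create ignore corpus', ['hamlet'])
ignore_word = p | df.Create('create ignore word', ['brief'])
groups = create_groups(group_ids, corpus, word, ignore_corpus, ignore_word)
p.run()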
Example #2
    def test_pardo_using_flatmap(self):
        words = ['aa', 'bbb', 'c']
        # [START model_pardo_using_flatmap]
        word_lengths = words | df.FlatMap(lambda word: [len(word)])
        # [END model_pardo_using_flatmap]

        self.assertEqual({2, 3, 1}, set(word_lengths))
Example #3
 def apply(self, pcoll):
     return (pcoll
             | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)
                           ).with_output_types(unicode))
             | df.Map('pair_with_one', lambda x: (x, 1))
             | df.GroupByKey('group')
             | df.Map('count', lambda (word, ones): (word, sum(ones))))
Example #4
def run(argv=None):
    """Build and run the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        required=True,
        help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    parser.add_argument(
        '--output_topic',
        required=True,
        help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    # Read from the PubSub input topic into a PCollection of lines.
    lines = p | df.io.Read('read', df.io.PubSubSource(known_args.input_topic))

    # Split the lines into words and count them within 15-second fixed windows.
    transformed = (lines
                   | (df.FlatMap('split', lambda x: re.findall(
                       r'[A-Za-z\']+', x)).with_output_types(unicode))
                   | df.Map('pair_with_one', lambda x: (x, 1))
                   | df.WindowInto(window.FixedWindows(15, 0))
                   | df.GroupByKey('group')
                   | df.Map('count', lambda (word, ones): (word, sum(ones)))
                   | df.Map('format', lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | df.io.Write('pubsub_write',
                              df.io.PubSubSink(known_args.output_topic))

    p.run()
Example #5
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to BlockingDataflowPipelineRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectPipelineRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    p = df.Pipeline(argv=pipeline_args)

    # Read the text file (or file pattern) into a PCollection.
    lines = p | df.io.Read('read', df.io.TextFileSource(known_args.input))

    # Count the occurrences of each word.
    counts = (lines
              | (df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x)).
                 with_output_types(unicode))
              | df.Map('pair_with_one', lambda x: (x, 1))
              | df.GroupByKey('group')
              | df.Map('count', lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | df.Map('format', lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | df.io.Write('write', df.io.TextFileSink(known_args.output))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
Example #6
def assert_that(pcoll, matcher):
    """Asserts that the give PCollection satisfies the constraints of the matcher
  in a way that is runnable locally or on a remote service.
  """
    singleton = pcoll.pipeline | df.Create('create_singleton', [None])

    def check_matcher(_, side_value):
        assert matcher(side_value)
        return []

    singleton | df.FlatMap(check_matcher, AsIter(pcoll))
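A possible way to call this helper (a sketch, not from the original snippet): the matcher is any callable applied to the materialized contents of pcoll, so a plain lambda works.

# Sketch (assumption): check a tiny pipeline with a lambda matcher.
p = df.Pipeline('DirectPipelineRunner')
squares = p | df.Create('start', [1, 2, 3]) | df.Map('square', lambda x: x * x)
assert_that(squares, lambda actual: sorted(actual) == [1, 4, 9])
p.run()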
Example #7
    def test_pardo_using_flatmap_yield(self):
        words = ['aA', 'bbb', 'C']

        # [START model_pardo_using_flatmap_yield]
        def capitals(word):
            for letter in word:
                if 'A' <= letter <= 'Z':
                    yield letter

        all_capitals = words | df.FlatMap(capitals)
        # [END model_pardo_using_flatmap_yield]

        self.assertEqual({'A', 'C'}, set(all_capitals))
Example #8
    def test_pardo_side_input(self):
        p = df.Pipeline('DirectPipelineRunner')
        words = p | df.Create('start', ['a', 'bb', 'ccc', 'dddd'])

        # [START model_pardo_side_input]
        # Callable takes additional arguments.
        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
            if lower_bound <= len(word) <= upper_bound:
                yield word

        # Construct a deferred side input.
        avg_word_len = words | df.Map(len) | df.CombineGlobally(
            df.combiners.MeanCombineFn())

        # Call with explicit side inputs.
        small_words = words | df.FlatMap('small', filter_using_length, 0, 3)

        # A single deferred side input.
        larger_than_average = words | df.FlatMap(
            'large',
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len))

        # Mix and match.
        small_but_nontrivial = words | df.FlatMap(
            filter_using_length,
            lower_bound=2,
            upper_bound=pvalue.AsSingleton(avg_word_len))
        # [END model_pardo_side_input]

        df.assert_that(small_words, df.equal_to(['a', 'bb', 'ccc']))
        df.assert_that(larger_than_average,
                       df.equal_to(['ccc', 'dddd']),
                       label='larger_than_average')
        df.assert_that(small_but_nontrivial,
                       df.equal_to(['bb']),
                       label='small_but_not_trivial')
        p.run()
Example #9
    def apply(self, words):
        """Compute the most common words for each possible prefixes.

    Args:
      words: a PCollection of strings

    Returns:
      A PCollection of the most common words for each prefix, in the form
          (prefix, [(count, word), (count, word), ...])
    """
        return (words
                | df.combiners.Count.PerElement()
                | df.FlatMap(extract_prefixes)
                | df.combiners.Top.LargestPerKey(self._count))
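The extract_prefixes helper is defined elsewhere in the example. A plausible sketch of it (an assumption, shaped to match the docstring and Top.LargestPerKey, in the snippet's Python 2 style) turns each (word, count) pair from Count.PerElement() into one (prefix, (count, word)) pair per prefix:

def extract_prefixes((word, count)):
    # Hypothetical helper: for ('cat', 3) it yields ('c', (3, 'cat')),
    # ('ca', (3, 'cat')) and ('cat', (3, 'cat')).
    for k in range(1, len(word) + 1):
        yield word[:k], (count, word)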
Example #10
    def apply(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | df.Keys('get uris')
                           | df.RemoveDuplicates('get unique uris')
                           | df.combiners.Count.Globally(' count uris'))

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words((uri, line)):
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | df.FlatMap('split words', split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | df.RemoveDuplicates('get unique words per doc')
            | df.Values('get words')
            | df.combiners.Count.PerElement('count docs per word'))

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | df.Keys(' get uris')
            | df.combiners.Count.PerElement('count words in doc'))

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | df.combiners.Count.PerElement('count word-doc pairs'))

        # Reshape the above mapping from (URI, word) pairs to counts into an
        # isomorphic mapping from URI to (word, count) pairs, to prepare for
        # a join keyed by URI.
        uri_to_word_and_count = (uri_and_word_to_count
                                 | df.Map(
                                     'shift keys', lambda ((uri, word), count):
                                     (uri, (word, count))))
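The snippet stops just before the join; Example #16 below resumes from it. Reconstructed as a sketch from the 'word counts' and 'word totals' keys that Example #16 reads, the missing step is roughly:

        # Sketch (reconstruction): join the two URI-keyed collections.
        uri_to_word_and_count_and_total = (
            {'word totals': uri_to_word_total,
             'word counts': uri_to_word_and_count}
            | df.CoGroupByKey('cogroup by uri'))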
Example #11
    def test_pardo_with_undeclared_side_outputs(self):
        numbers = [1, 2, 3, 4, 5, 10, 20]

        # [START model_pardo_with_side_outputs_undeclared]
        def even_odd(x):
            yield pvalue.SideOutputValue('odd' if x % 2 else 'even', x)
            if x % 10 == 0:
                yield x

        results = numbers | df.FlatMap(even_odd).with_outputs()

        evens = results.even
        odds = results.odd
        tens = results[None]  # the undeclared main output
        # [END model_pardo_with_side_outputs_undeclared]

        self.assertEqual({2, 4, 10, 20}, set(evens))
        self.assertEqual({1, 3, 5}, set(odds))
        self.assertEqual({10, 20}, set(tens))
Example #12
def run(argv=None):
    """Runs the workflow computing total points from a collection of matches."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)
    (p  # pylint: disable=expression-not-assigned
     | df.io.Read('read',
                  df.io.TextFileSource(known_args.input, coder=JsonCoder()))
     | df.FlatMap('points', compute_points) | df.CombinePerKey(sum)
     | df.io.Write('write',
                   df.io.TextFileSink(known_args.output, coder=JsonCoder())))
    p.run()
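compute_points and JsonCoder are defined elsewhere in this example. Purely as an illustration (the record shape below is hypothetical, not the original one), compute_points would emit key/value pairs for CombinePerKey(sum) to total:

def compute_points(record):
    # Hypothetical record shape, e.g. {'player': 'alice', 'points': 12}:
    # emit one (player, points) pair per record so that CombinePerKey(sum)
    # adds up each player's points.
    yield record['player'], record['points']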
Example #13
 def test_after_count(self):
     p = Pipeline('DirectPipelineRunner')
     result = (p
               | df.Create([1, 2, 3, 4, 5, 10, 11])
               | df.FlatMap(lambda t: [('A', t), ('B', t + 5)])
               | df.Map(lambda (k, t): TimestampedValue((k, t), t))
               | df.WindowInto(
                   FixedWindows(10),
                   trigger=AfterCount(3),
                   accumulation_mode=AccumulationMode.DISCARDING)
               | df.GroupByKey()
               | df.Map(lambda (k, v): ('%s-%s' % (k, len(v)), set(v))))
     assert_that(
         result,
         equal_to({
             'A-5': {1, 2, 3, 4, 5},
             # A-10, A-11 never emitted due to AfterCount(3) never firing.
             'B-4': {6, 7, 8, 9},
             'B-3': {10, 15, 16},
         }.iteritems()))
Example #14
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    (p  # pylint: disable=expression-not-assigned
     | df.io.Read('read', df.io.TextFileSource(known_args.input))
     | df.FlatMap('split', lambda x: re.findall(r'[A-Za-z\']+', x))
     | TopPerPrefix('TopPerPrefix', 5)
     | df.Map('format', lambda (prefix, candidates): '%s: %s' %
              (prefix, candidates))
     | df.io.Write('write', df.io.TextFileSink(known_args.output)))
    p.run()
Example #15
def count_tornadoes(input_data):
    """Workflow computing the number of tornadoes for each month that had one.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary will have a 'month' and a 'tornado' key as described in the
      module comment.

  Returns:
    A PCollection of dictionaries containing 'month' and 'tornado_count' keys.
    Months without tornadoes are skipped.
  """

    return (input_data
            | df.FlatMap(
                'months with tornadoes', lambda row: [(int(row['month']), 1)]
                if row['tornado'] else [])
            | df.CombinePerKey('monthly count', sum)
            | df.Map('format', lambda (k, v): {
                'month': k,
                'tornado_count': v
            }))
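A small local check (a sketch, reusing df.assert_that and df.equal_to as in Example #8) of what count_tornadoes produces for a few in-memory rows:

# Sketch (assumption): verify count_tornadoes on in-memory rows.
p = df.Pipeline('DirectPipelineRunner')
rows = p | df.Create('rows', [
    {'month': 1, 'tornado': True},
    {'month': 1, 'tornado': True},
    {'month': 2, 'tornado': False},
])
df.assert_that(count_tornadoes(rows),
               df.equal_to([{'month': 1, 'tornado_count': 2}]))
p.run()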
Example #16
                                           | df.CoGroupByKey('cogroup by uri'))

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency((uri, count_and_total)):
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (uri_to_word_and_count_and_total
                              | df.FlatMap('compute term frequencies',
                                           compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our MapFn lambda. The second argument to
        # the lambda (called total---note that we are unpacking the first argument)
        # receives the value we listed after the lambda in Map(). Additional side
        # inputs (and ordinary Python values, too) can be provided to MapFns and
        # DoFns in this way.
        word_to_df = (word_to_doc_count
                      | df.Map(
                          'compute doc frequencies', lambda