from random import randrange

import google.cloud.dataflow as df
from google.cloud.dataflow.pvalue import AsIter
from google.cloud.dataflow.pvalue import AsList
from google.cloud.dataflow.pvalue import AsSingleton


def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
  """Generate groups given the input PCollections."""

  def attach_corpus_fn(group, corpus, ignore):
    selected = None
    len_corpus = len(corpus)
    while not selected:
      # randrange(len_corpus) covers every index; randrange(0, len_corpus - 1)
      # would never select the last element.
      c = corpus[randrange(len_corpus)].values()[0]
      if c != ignore:
        selected = c
    yield (group, selected)

  def attach_word_fn(group, words, ignore):
    selected = None
    len_words = len(words)
    while not selected:
      c = words[randrange(len_words)].values()[0]
      if c != ignore:
        selected = c
    yield group + (selected,)

  return (group_ids
          | df.FlatMap('attach corpus', attach_corpus_fn,
                       AsList(corpus), AsSingleton(ignore_corpus))
          | df.FlatMap('attach word', attach_word_fn,
                       AsIter(word), AsSingleton(ignore_word)))
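# A minimal usage sketch for create_groups. The labels and element values
# below are assumptions for illustration, not the original pipeline; each
# corpus/word element is a one-key dict, matching the .values()[0] access
# above.
p = df.Pipeline('DirectPipelineRunner')
group_ids = p | df.Create('create group ids', ['A', 'B', 'C'])
corpus = p | df.Create('create corpus', [{'f': 'king_lear'}, {'f': 'hamlet'}])
words = p | df.Create('create words', [{'f': 'brave'}, {'f': 'new'}])
ignore_corpus = p | df.Create('ignore corpus', ['hamlet'])
ignore_word = p | df.Create('ignore word', ['new'])
# Each output element is a tuple: (group_id, corpus_name, word).
groups = create_groups(group_ids, corpus, words, ignore_corpus, ignore_word)
p.run()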
def apply(self, pcoll):
  # Expand a sink write: create a one-element PCollection holding the sink,
  # initialize it once, write bundles with the init result as a side input,
  # and finalize with all per-bundle results.
  sink_coll = pcoll.pipeline | core.Create('create_sink_collection',
                                           [self.sink])
  init_result_coll = sink_coll | core.Map(
      'initialize_write', lambda sink: sink.initialize_write())
  write_result_coll = pcoll | core.ParDo(
      'write_bundles', _WriteBundleDoFn(),
      AsSingleton(sink_coll),
      AsSingleton(init_result_coll))
  return sink_coll | core.FlatMap(
      'finalize_write',
      lambda sink, init_result, write_results: sink.finalize_write(
          init_result, write_results),
      AsSingleton(init_result_coll),
      AsIter(write_result_coll))
def apply(self, pcoll):
  # Same expansion as above, except that any values emitted by
  # finalize_write() are wrapped as TimestampedValues at MAX_TIMESTAMP.
  sink_coll = pcoll.pipeline | core.Create('create_sink_collection',
                                           [self.sink])
  init_result_coll = sink_coll | core.Map(
      'initialize_write', lambda sink: sink.initialize_write())
  write_result_coll = pcoll | core.ParDo(
      'write_bundles', _WriteBundleDoFn(),
      AsSingleton(sink_coll),
      AsSingleton(init_result_coll))
  return sink_coll | core.FlatMap(
      'finalize_write',
      lambda sink, init_result, write_results: (
          window.TimestampedValue(v, window.MAX_TIMESTAMP)
          for v in sink.finalize_write(init_result, write_results) or ()),
      AsSingleton(init_result_coll),
      AsIter(write_result_coll))
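# A minimal sketch of the sink protocol the apply() methods above rely on:
# initialize_write() runs once before any bundle, its result reaches every
# bundle writer as a side input, and finalize_write() sees all per-bundle
# results. The class below is an illustrative stand-in, not the SDK's Sink
# API.
class _FakeSink(object):

  def initialize_write(self):
    # Shared setup (e.g. a temporary location); becomes init_result.
    return 'init-token'

  def finalize_write(self, init_result, write_results):
    # write_results iterates over whatever _WriteBundleDoFn emitted per
    # bundle. Returning an iterable is optional; the second apply() variant
    # wraps anything returned here in TimestampedValues at MAX_TIMESTAMP.
    return ['done:%s:%d' % (init_result, len(list(write_results)))]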
def test_as_singleton_with_different_defaults_without_unique_labels(self):
  # This should fail as AsSingleton with distinct default values should
  # create distinct PCollectionViews with the same full_label.
  a_list = [2]
  pipeline = Pipeline('DirectPipelineRunner')
  main_input = pipeline | Create('main input', [1])
  side_list = pipeline | Create('side list', a_list)
  with self.assertRaises(RuntimeError) as e:
    _ = main_input | FlatMap(
        'test',
        lambda x, s1, s2: [[x, s1, s2]],
        AsSingleton(side_list),
        AsSingleton(side_list, default_value=3))
  self.assertTrue(
      e.exception.message.startswith(
          'Transform "ViewAsSingleton(side list.None)" does not have a '
          'stable unique label.'))
def test_multi_valued_singleton_side_input(self):
  pipeline = Pipeline('DirectPipelineRunner')
  pcol = pipeline | Create('start', [1, 2])
  side = pipeline | Create('side', [3, 4])  # 2 values in side input.
  pcol | FlatMap('compute', lambda x, s: [x * s], AsSingleton(side))
  with self.assertRaises(ValueError):
    pipeline.run()
def test_default_value_singleton_side_input(self):
  pipeline = Pipeline('DirectPipelineRunner')
  pcol = pipeline | Create('start', [1, 2])
  side = pipeline | Create('side', [])  # 0 values in side input.
  result = pcol | FlatMap('compute', lambda x, s: [x * s],
                          AsSingleton(side, 10))
  assert_that(result, equal_to([10, 20]))
  pipeline.run()
def test_par_do_with_side_input_as_arg(self):
  pipeline = Pipeline('DirectPipelineRunner')
  words_list = ['aa', 'bb', 'cc']
  words = pipeline | Create('SomeWords', words_list)
  prefix = pipeline | Create('SomeString', ['xyz'])  # side input
  suffix = 'zyx'
  result = words | FlatMap(
      'DecorateWords',
      lambda x, pfx, sfx: ['%s-%s-%s' % (pfx, x, sfx)],
      AsSingleton(prefix), suffix)
  assert_that(result, equal_to(['xyz-%s-zyx' % x for x in words_list]))
  pipeline.run()
def test_as_singleton_with_different_defaults_with_unique_labels(self):
  a_list = []
  pipeline = Pipeline('DirectPipelineRunner')
  main_input = pipeline | Create('main input', [1])
  side_list = pipeline | Create('side list', a_list)
  results = main_input | FlatMap(
      'test',
      lambda x, s1, s2: [[x, s1, s2]],
      AsSingleton('si1', side_list, default_value=2),
      AsSingleton('si2', side_list, default_value=3))

  def matcher(expected_elem, expected_singleton1, expected_singleton2):
    def match(actual):
      [[actual_elem, actual_singleton1, actual_singleton2]] = actual
      equal_to([expected_elem])([actual_elem])
      equal_to([expected_singleton1])([actual_singleton1])
      equal_to([expected_singleton2])([actual_singleton2])
    return match

  assert_that(results, matcher(1, 2, 3))
  pipeline.run()
def test_empty_singleton_side_input(self):
  pipeline = Pipeline('DirectPipelineRunner')
  pcol = pipeline | Create('start', [1, 2])
  side = pipeline | Create('side', [])  # Empty side input.

  def my_fn(k, s):
    v = ('empty' if isinstance(s, EmptySideInput) else 'full')
    return [(k, v)]

  result = pcol | FlatMap('compute', my_fn, AsSingleton(side))
  assert_that(result, equal_to([(1, 'empty'), (2, 'empty')]))
  pipeline.run()
def test_as_singleton_without_unique_labels(self):
  # This should succeed as calling AsSingleton on the same PCollection
  # twice with the same defaults will return the same PCollectionView.
  a_list = [2]
  pipeline = Pipeline('DirectPipelineRunner')
  main_input = pipeline | Create('main input', [1])
  side_list = pipeline | Create('side list', a_list)
  results = main_input | FlatMap(
      'test',
      lambda x, s1, s2: [[x, s1, s2]],
      AsSingleton(side_list), AsSingleton(side_list))

  def matcher(expected_elem, expected_singleton):
    def match(actual):
      [[actual_elem, actual_singleton1, actual_singleton2]] = actual
      equal_to([expected_elem])([actual_elem])
      equal_to([expected_singleton])([actual_singleton1])
      equal_to([expected_singleton])([actual_singleton2])
    return match

  assert_that(results, matcher(1, 2))
  pipeline.run()
def test_pcollectionview_not_recreated(self):
  pipeline = Pipeline('DirectPipelineRunner')
  value = pipeline | Create('create1', [1, 2, 3])
  value2 = pipeline | Create('create2', [(1, 1), (2, 2), (3, 3)])
  self.assertEqual(AsSingleton(value), AsSingleton(value))
  self.assertEqual(AsSingleton('new', value, default_value=1),
                   AsSingleton('new', value, default_value=1))
  self.assertNotEqual(AsSingleton(value),
                      AsSingleton('new', value, default_value=1))
  self.assertEqual(AsIter(value), AsIter(value))
  self.assertEqual(AsList(value), AsList(value))
  self.assertEqual(AsDict(value2), AsDict(value2))
  self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
  self.assertNotEqual(AsIter(value), AsIter(value2))
  self.assertNotEqual(AsList(value), AsList(value2))
  self.assertNotEqual(AsDict(value), AsDict(value2))
def test_par_do_with_do_fn_object(self):

  class SomeDoFn(DoFn):
    """A custom DoFn for a FlatMap transform."""

    def process(self, context, prefix, suffix):
      return ['%s-%s-%s' % (prefix, context.element, suffix)]

  pipeline = Pipeline('DirectPipelineRunner')
  words_list = ['aa', 'bb', 'cc']
  words = pipeline | Create('SomeWords', words_list)
  prefix = 'zyx'
  suffix = pipeline | Create('SomeString', ['xyz'])  # side input
  result = words | ParDo('DecorateWordsDoFn', SomeDoFn(), prefix,
                         suffix=AsSingleton(suffix))
  assert_that(result, equal_to(['zyx-%s-xyz' % x for x in words_list]))
  pipeline.run()
def filter_cold_days(input_data, month_filter):
  """Workflow computing rows in a specific month with low temperatures.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary must have the keys 'year', 'month', 'day', and 'mean_temp'.
    month_filter: an int representing the month for which colder-than-average
      days should be returned.

  Returns:
    A PCollection of dictionaries with the same keys described above. Each
    row represents a day in the specified month where temperatures were
    colder than the global mean temperature in the entire dataset.
  """
  # Project to only the desired fields from a complete input row.
  # E.g., SELECT f1, f2, f3, ... FROM InputTable.
  projection_fields = ['year', 'month', 'day', 'mean_temp']
  fields_of_interest = (
      input_data
      | df.Map('projected',
               lambda row: {f: row[f] for f in projection_fields}))

  # Compute the global mean temperature.
  global_mean = AsSingleton(
      fields_of_interest
      | df.Map('extract mean', lambda row: row['mean_temp'])
      | df.combiners.Mean.Globally('global mean'))

  # Filter to the rows representing days in the month of interest
  # in which the mean daily temperature is below the global mean.
  return (
      fields_of_interest
      | df.Filter('desired month', lambda row: row['month'] == month_filter)
      | df.Filter('below mean',
                  lambda row, mean: row['mean_temp'] < mean, global_mean))
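# A minimal sketch of driving filter_cold_days with in-memory rows; the
# pipeline wiring and sample values are assumptions, not the original main().
p = df.Pipeline('DirectPipelineRunner')
rows = p | df.Create('test rows', [
    {'year': 2000, 'month': 1, 'day': 1, 'mean_temp': 3.0},
    {'year': 2000, 'month': 1, 'day': 2, 'mean_temp': 9.0},
    {'year': 2000, 'month': 2, 'day': 1, 'mean_temp': 5.0},
])
cold_days = filter_cold_days(rows, 1)
# Only January rows colder than the global mean (17.0 / 3, about 5.67)
# survive both filters, i.e. the January 1 row.
p.run()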
# Compute a mapping from each word to its document frequency.
# A word's document frequency in a corpus is the number of
# documents in which the word appears divided by the total
# number of documents in the corpus.
#
# This calculation uses a side input, a Dataflow-computed auxiliary value
# presented to each invocation of our MapFn lambda. The second argument to
# the lambda (called total; note that we are unpacking the first argument)
# receives the value we listed after the lambda in Map(). Additional side
# inputs (and ordinary Python values, too) can be provided to MapFns and
# DoFns in this way.
word_to_df = (
    word_to_doc_count
    | df.Map('compute doc frequencies',
             lambda (word, count), total: (word, float(count) / total),
             AsSingleton(total_documents)))

# Join the term frequency and document frequency collections,
# each keyed on the word.
word_to_uri_and_tf_and_df = (
    {'tf': word_to_uri_and_tf, 'df': word_to_df}
    | df.CoGroupByKey('cogroup words by tf-df'))

# Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
# There are a variety of definitions of TF-IDF
# ("term frequency - inverse document frequency") score; here we use a
# basic version that is the term frequency divided by the log of the
# document frequency.
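# A minimal sketch of the scoring step the comment above describes. The
# helper name and exact formula are assumptions: the original code is not
# shown here, so this uses tf * log(1/df), a common reading of "tf divided
# by the log of the document frequency" that keeps scores non-negative for
# document frequencies in (0, 1].
import math


def compute_tf_idf((word, tf_and_df)):
  # After the CoGroupByKey above, 'df' holds exactly one document-frequency
  # value per word and 'tf' holds (uri, term frequency) pairs.
  [doc_frequency] = tf_and_df['df']
  for uri, tf in tf_and_df['tf']:
    yield word, (uri, tf * math.log(1.0 / doc_frequency))

word_to_uri_and_tfidf = (
    word_to_uri_and_tf_and_df
    | df.FlatMap('compute tf-idf', compute_tf_idf))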