def model_join_using_side_inputs(
        name_list, email_list, phone_list, output_path):
    """Join names with their emails and phone numbers via side inputs.

    Each name becomes one output line of the form
    ``name; email1,email2; phone1,phone2`` which is written as text to
    ``output_path``.

    Args:
        name_list: Names used to create the main input PCollection.
        email_list: Items unpacked as ``(name, email)`` pairs.
        phone_list: Items unpacked as ``(name, phone_number)`` pairs.
        output_path: Destination handed to ``beam.io.WriteToText``.
    """
    import apache_beam as beam
    from apache_beam.pvalue import AsIter

    with TestPipeline() as p:  # Use TestPipeline for testing.
        # [START model_join_using_side_inputs]
        # This code performs a join by receiving the set of names as an
        # input and passing PCollections that contain emails and phone
        # numbers as side inputs instead of using CoGroupByKey.
        names = p | 'names' >> beam.Create(name_list)
        emails = p | 'email' >> beam.Create(email_list)
        phones = p | 'phone' >> beam.Create(phone_list)

        def join_info(name, emails, phone_numbers):
            # Keep only the entries whose key matches the current name.
            matching_emails = [
                email for owner, email in emails if owner == name]
            matching_phones = [
                phone for owner, phone in phone_numbers if owner == name]
            fields = [
                '%s' % name,
                ','.join(matching_emails),
                ','.join(matching_phones),
            ]
            return '; '.join(fields)

        contact_lines = names | 'CreateContacts' >> beam.core.Map(
            join_info, AsIter(emails), AsIter(phones))
        # [END model_join_using_side_inputs]
        contact_lines | beam.io.WriteToText(output_path)
def test(self):
    """Build the pipeline that merges a main input with an iterable side input.

    Two synthetic sources are read from ``self.pipeline``; each is passed
    through a ``MeasureTime`` step. The first is the main input, the second
    is supplied to ``join_fn`` as an ``AsIter`` side input, and the merged
    output is measured again.
    """

    def join_fn(element, side_input, iterations):
        """Walk the side input ``iterations`` times and yield one list.

        Only the final pass contributes to the output: for every
        ``(key, value)`` in the side input it records
        ``{key: element[1] + value}``. The earlier passes just iterate.
        """
        merged = []
        final_pass = iterations - 1
        for pass_index in range(iterations):
            on_final_pass = pass_index == final_pass
            for key, value in side_input:
                if on_final_pass:
                    merged.append({key: element[1] + value})
        yield merged

    raw_main = self.pipeline | "Read pcoll 1" >> beam.io.Read(
        SyntheticSource(self.parse_synthetic_source_options()))
    main_input = raw_main | 'Measure time: Start pcoll 1' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    raw_side = self.pipeline | "Read pcoll 2" >> beam.io.Read(
        SyntheticSource(self.parse_synthetic_source_options()))
    side_input = raw_side | 'Measure time: Start pcoll 2' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    merged_pcoll = main_input | "Merge" >> beam.ParDo(
        join_fn, AsIter(side_input), self.iterations)

    # pylint: disable=expression-not-assigned
    merged_pcoll | 'Measure time' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))
def testSideInput(self):
    """Run the side-input merge pipeline and publish its metrics.

    Reads a main input and a side input from synthetic sources, merges them
    with ``join_fn`` (side input passed as ``AsIter``), runs the pipeline,
    waits for it to finish and, when a metrics monitor is configured, sends
    the run result to it.
    """

    def join_fn(element, side_input, iterations):
        """Walk the side input ``iterations`` times and yield one list.

        Only the final pass appends ``{key: element[1] + value}`` for each
        ``(key, value)`` pair of the side input.
        """
        # Named ``result`` rather than ``list`` so the builtin is not shadowed.
        result = []
        for i in range(iterations):
            for key, value in side_input:
                if i == iterations - 1:
                    result.append({key: element[1] + value})
        yield result

    with self.pipeline as p:
        main_input = (
            p
            | "Read pcoll 1" >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self._parseTestPipelineOptions()))
            | 'Measure time: Start pcoll 1' >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))

        side_input = (
            p
            | "Read pcoll 2" >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(self._getSideInput()))
            | 'Measure time: Start pcoll 2' >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))

        # pylint: disable=expression-not-assigned
        (
            main_input
            | "Merge" >> beam.ParDo(
                join_fn, AsIter(side_input), self.iterations)
            | 'Measure time' >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))

        # NOTE(review): if self.pipeline is a Beam Pipeline, leaving the
        # ``with`` block presumably runs the pipeline as well, so this
        # explicit run may execute it a second time -- confirm intent.
        result = p.run()
        result.wait_until_finish()

        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
def expand(self, pcoll):
    """Expand the write into initialize, write-bundles and finalize steps.

    A single-element PCollection triggers ``sink.initialize_write()`` once;
    its result is fed as a singleton side input to the bundle-writing step
    and to ``_finalize_write``. When the sink exposes a truthy
    ``num_shards``, elements are keyed and grouped before writing;
    otherwise bundles are written directly and their results are gathered
    under a single key.

    Args:
        pcoll: The PCollection to write.

    Returns:
        The output of the 'FinalizeWrite' step.
    """
    seed = pcoll.pipeline | 'DoOnce' >> core.Create([None])
    init_result = seed | 'InitializeWrite' >> core.Map(
        lambda _, sink: sink.initialize_write(), self.sink)

    if not getattr(self.sink, 'num_shards', 0):
        # No fixed shard count: write each bundle as it comes, then collect
        # all write results under one key in the global window.
        min_shards = 1
        written = pcoll | 'WriteBundles' >> core.ParDo(
            _WriteBundleDoFn(self.sink), AsSingleton(init_result))
        paired = written | 'Pair' >> core.Map(lambda x: (None, x))
        windowed = paired | core.WindowInto(window.GlobalWindows())
        grouped = windowed | core.GroupByKey()
        write_results = grouped | 'Extract' >> core.FlatMap(lambda x: x[1])
    else:
        # Fixed shard count: key the elements first so that grouping yields
        # the groups that are then written by the keyed DoFn.
        min_shards = self.sink.num_shards
        key_step = (
            core.Map(lambda x: (None, x))
            if min_shards == 1
            else core.ParDo(_RoundRobinKeyFn(min_shards)))
        keyed = pcoll | key_step
        windowed = keyed | core.WindowInto(window.GlobalWindows())
        grouped = windowed | core.GroupByKey()
        write_results = grouped | 'WriteBundles' >> core.ParDo(
            _WriteKeyedBundleDoFn(self.sink), AsSingleton(init_result))

    return seed | 'FinalizeWrite' >> core.FlatMap(
        _finalize_write,
        self.sink,
        AsSingleton(init_result),
        AsIter(write_results),
        min_shards)
def test_pcollectionview_not_recreated(self):
    """Check equality of side-input wrappers built from PCollections.

    Wrappers of the same kind built from the same PCollection with the same
    arguments must compare equal; wrappers that differ in their arguments
    or in their source PCollection must compare unequal.
    """
    pipeline = Pipeline('DirectRunner')
    scalars = pipeline | 'create1' >> Create([1, 2, 3])
    pairs_a = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
    pairs_b = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])

    # AsSingleton: plain form, labelled form with a default, and the two mixed.
    self.assertEqual(AsSingleton(scalars), AsSingleton(scalars))
    self.assertEqual(
        AsSingleton('new', scalars, default_value=1),
        AsSingleton('new', scalars, default_value=1))
    self.assertNotEqual(
        AsSingleton(scalars),
        AsSingleton('new', scalars, default_value=1))

    # Same wrapper kind over the same PCollection -> equal.
    same_source_cases = (
        (AsIter, scalars),
        (AsList, scalars),
        (AsDict, pairs_a),
    )
    for wrapper, source in same_source_cases:
        self.assertEqual(wrapper(source), wrapper(source))

    # Same wrapper kind over different PCollections -> not equal.
    different_source_cases = (
        (AsSingleton, scalars, pairs_a),
        (AsIter, scalars, pairs_a),
        (AsList, scalars, pairs_a),
        (AsDict, pairs_a, pairs_b),
    )
    for wrapper, left, right in different_source_cases:
        self.assertNotEqual(wrapper(left), wrapper(right))