def _test_flatten_input_visitor(self, input_type, output_type, num_inputs): p = TestPipeline() inputs = [] for _ in range(num_inputs): input_pcoll = PCollection(p) input_pcoll.element_type = input_type inputs.append(input_pcoll) output_pcoll = PCollection(p) output_pcoll.element_type = output_type flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs) flatten.add_output(output_pcoll, None) DataflowRunner.flatten_input_visitor().visit_transform(flatten) for _ in range(num_inputs): self.assertEqual(inputs[0].element_type, output_type)
def test_gbk_then_flatten_input_visitor(self): p = TestPipeline(runner=DataflowRunner(), options=PipelineOptions(self.default_properties)) none_str_pc = p | 'c1' >> beam.Create({None: 'a'}) none_int_pc = p | 'c2' >> beam.Create({None: 3}) flat = (none_str_pc, none_int_pc) | beam.Flatten() _ = flat | beam.GroupByKey() # This may change if type inference changes, but we assert it here # to make sure the check below is not vacuous. self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint) p.visit(DataflowRunner.group_by_key_input_visitor()) p.visit(DataflowRunner.flatten_input_visitor()) # The dataflow runner requires gbk input to be tuples *and* flatten # inputs to be equal to their outputs. Assert both hold. self.assertIsInstance(flat.element_type, typehints.TupleConstraint) self.assertEqual(flat.element_type, none_str_pc.element_type) self.assertEqual(flat.element_type, none_int_pc.element_type)
def test_gbk_then_flatten_input_visitor(self): p = TestPipeline( runner=DataflowRunner(), options=PipelineOptions(self.default_properties)) none_str_pc = p | 'c1' >> beam.Create({None: 'a'}) none_int_pc = p | 'c2' >> beam.Create({None: 3}) flat = (none_str_pc, none_int_pc) | beam.Flatten() _ = flat | beam.GroupByKey() # This may change if type inference changes, but we assert it here # to make sure the check below is not vacuous. self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint) p.visit(DataflowRunner.group_by_key_input_visitor()) p.visit(DataflowRunner.flatten_input_visitor()) # The dataflow runner requires gbk input to be tuples *and* flatten # inputs to be equal to their outputs. Assert both hold. self.assertIsInstance(flat.element_type, typehints.TupleConstraint) self.assertEqual(flat.element_type, none_str_pc.element_type) self.assertEqual(flat.element_type, none_int_pc.element_type)