def test_bqfl_streaming(self):
  if isinstance(self.test_pipeline.runner, TestDataflowRunner):
    self.skipTest("TestStream is not supported on TestDataflowRunner")
  output_table = '%s_%s' % (self.output_table, 'ints')
  _SIZE = 100
  schema = self.BIG_QUERY_STREAMING_SCHEMA
  l = [{'Integr': i} for i in range(_SIZE)]

  state_matcher = PipelineStateMatcher(PipelineState.RUNNING)
  bq_matcher = BigqueryFullResultStreamingMatcher(
      project=self.project,
      query="SELECT Integr FROM %s" % output_table,
      data=[(i, ) for i in range(100)])

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=all_of(state_matcher, bq_matcher),
      streaming=True)
  with beam.Pipeline(argv=args) as p:
    stream_source = (
        TestStream()
        .advance_watermark_to(0)
        .advance_processing_time(100)
        .add_elements(l[:_SIZE // 4])
        .advance_processing_time(100)
        .advance_watermark_to(100)
        .add_elements(l[_SIZE // 4:2 * _SIZE // 4])
        .advance_processing_time(100)
        .advance_watermark_to(200)
        .add_elements(l[2 * _SIZE // 4:3 * _SIZE // 4])
        .advance_processing_time(100)
        .advance_watermark_to(300)
        .add_elements(l[3 * _SIZE // 4:])
        .advance_processing_time(100)
        .advance_watermark_to_infinity())
    _ = (
        p
        | stream_source
        | bigquery.WriteToBigQuery(
            output_table,
            schema=schema,
            method=bigquery.WriteToBigQuery.Method.FILE_LOADS,
            triggering_frequency=100))
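
# A minimal, self-contained sketch (not from the test suite above) of the
# TestStream pattern test_bqfl_streaming relies on: processing-time advances
# are interleaved with element batches so that a processing-time trigger such
# as triggering_frequency=100 can fire between batches. The function name and
# the streaming pipeline options below are illustrative assumptions.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream

def _test_stream_sketch():
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=options) as p:
    _ = (
        p
        | TestStream()
        .advance_watermark_to(0)
        .add_elements([{'Integr': 1}, {'Integr': 2}])  # first batch
        .advance_processing_time(100)  # lets a 100s processing-time trigger fire
        .advance_watermark_to_infinity()
        | beam.Map(print))
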
def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
  l = [i for i in range(self._SIZE)]

  matchers = [
      PipelineStateMatcher(PipelineState.RUNNING),
      BigqueryFullResultStreamingMatcher(
          project=self.project,
          query="SELECT number FROM %s" % self.output_table,
          data=[(i, ) for i in l])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*matchers),
      wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
      experiments='use_beam_bq_sink',
      streaming=True)

  def add_schema_info(element):
    yield {'number': element}

  messages = [str(i).encode('utf-8') for i in l]
  for message in messages:
    self.pub_client.publish(self.input_topic.name, message)

  with beam.Pipeline(argv=args) as p:
    parsed_messages = (
        p
        | ReadFromPubSub(subscription=self.input_sub.name)
        | beam.ParDo(add_schema_info))
    _ = parsed_messages | WriteToBigQuery(
        self.output_table,
        schema=self.SCHEMA,
        method=method,
        triggering_frequency=triggering_frequency)
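
# A sketch of how callers might drive the helper above; the test names and the
# triggering_frequency value are illustrative assumptions, not taken from the
# original file. FILE_LOADS in a streaming pipeline needs a
# triggering_frequency so that load jobs are cut on a schedule rather than
# never.
def test_streaming_inserts(self):  # illustrative caller
  self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.STREAMING_INSERTS)

def test_file_loads(self):  # illustrative caller
  self._run_pubsub_bq_pipeline(
      WriteToBigQuery.Method.FILE_LOADS, triggering_frequency=20)
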
def test_multiple_destinations_transform(self):
  streaming = self.test_pipeline.options.view_as(StandardOptions).streaming
  if streaming and isinstance(self.test_pipeline.runner, TestDataflowRunner):
    self.skipTest("TestStream is not supported on TestDataflowRunner")

  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  full_output_table_1 = '%s:%s' % (self.project, output_table_1)
  full_output_table_2 = '%s:%s' % (self.project, output_table_2)

  schema1 = {
      'fields': [{
          'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'
      }]
  }
  schema2 = {
      'fields': [{
          'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'
      }]
  }

  bad_record = {'language': 1, 'manguage': 2}

  if streaming:
    pipeline_verifiers = [
        PipelineStateMatcher(PipelineState.RUNNING),
        BigqueryFullResultStreamingMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultStreamingMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d])
    ]
  else:
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d])
    ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    if streaming:
      _SIZE = len(_ELEMENTS)
      test_stream = (
          TestStream()
          .advance_watermark_to(0)
          .add_elements(_ELEMENTS[:_SIZE // 2])
          .advance_watermark_to(100)
          .add_elements(_ELEMENTS[_SIZE // 2:])
          .advance_watermark_to_infinity())
      input = p | test_stream
    else:
      input = p | beam.Create(_ELEMENTS)

    schema_table_pcv = beam.pvalue.AsDict(
        p | "MakeSchemas" >> beam.Create([(full_output_table_1, schema1),
                                          (full_output_table_2, schema2)]))

    table_record_pcv = beam.pvalue.AsDict(
        p | "MakeTables" >> beam.Create([('table1', full_output_table_1),
                                         ('table2', full_output_table_2)]))

    input2 = p | "Broken record" >> beam.Create([bad_record])

    input = (input, input2) | beam.Flatten()

    r = (
        input
        | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=lambda x, tables:
            (tables['table1'] if 'language' in x else tables['table2']),
            table_side_inputs=(table_record_pcv, ),
            schema=lambda dest, table_map: table_map.get(dest, None),
            schema_side_inputs=(schema_table_pcv, ),
            insert_retry_strategy=RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
            method='STREAMING_INSERTS'))

    assert_that(
        r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
        equal_to([(full_output_table_1, bad_record)]))
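
# The destination routing used above, isolated as a sketch so it can be
# checked without a pipeline (_route and the literal table names are
# illustrative): WriteToBigQuery calls the `table=` callable once per element,
# passing the materialized table side input as the second argument.
def _route(record, tables):
  return tables['table1'] if 'language' in record else tables['table2']

assert _route({'name': 'beam', 'language': 'py'},
              {'table1': 't1', 'table2': 't2'}) == 't1'
assert _route({'name': 'asf', 'foundation': 'apache'},
              {'table1': 't1', 'table2': 't2'}) == 't2'
# Note that bad_record contains a 'language' key, so it routes to table 1; its
# INTEGER value in a STRING field then fails the streaming insert and surfaces
# in FAILED_ROWS keyed by full_output_table_1, which is exactly what the
# assert_that above checks.
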
def test_pubsub_pipe_it(self):
  # Build expected dataset.
  expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]

  # Set extra options on the pipeline for test purposes.
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
  pubsub_msg_verifier = PubSubMessageMatcher(
      self.project,
      self.output_sub.name,
      expected_msg,
      timeout=60 * 7)  # in seconds

  # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
  EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
  validation_query = (
      f'SELECT text FROM '
      f'`{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`')
  # NOTE: constructed here but not attached to on_success_matcher below.
  bq_sessions_verifier = BigqueryMatcher(
      self.project, validation_query, EXPECTED_BQ_CHECKSUM)

  # The expected result must be a tuple with a trailing comma.
  expected_bq_msg = [('conall_0 - 1608051184', )]

  # Fetch BigQuery data with the given query and compare it to the expected
  # data. This matcher polls BigQuery until the number of records in BigQuery
  # equals the number of records in the expected data; a timeout is optional.
  bigquery_streaming_verifier = BigqueryFullResultStreamingMatcher(
      project=self.project,
      query=validation_query,
      data=expected_bq_msg,
      timeout=60 * 7)

  extra_opts = {
      'bigquery_dataset': self.dataset_ref.dataset_id,
      'bigquery_table': OUTPUT_TABLE,
      'input_subscription': self.input_sub.name,
      'output_topic': self.output_topic.name,
      'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
      'on_success_matcher': all_of(
          bigquery_streaming_verifier, state_verifier, pubsub_msg_verifier)
  }

  # Generate input data and inject it into PubSub.
  self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

  # Get pipeline options from the command argument --test-pipeline-options
  # and start the pipeline job by calling the pipeline main function.
  pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))

  # Clean up PubSub and the BigQuery dataset.
  self.addCleanup(self._cleanup_pubsub)
  self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)
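
# A sketch of how an expected checksum like EXPECTED_BQ_CHECKSUM above could
# be precomputed locally with hashlib. The canonicalization (sorting rows and
# joining with newlines) is an assumption here; verify it against the
# matcher's own hashing before depending on it.
import hashlib

def sha1_of_rows(rows):
  # Sort for determinism, join with newlines, hash the UTF-8 bytes.
  return hashlib.sha1('\n'.join(sorted(rows)).encode('utf-8')).hexdigest()

# An empty result set hashes to the SHA-1 of the empty string, which is the
# placeholder value used above:
assert sha1_of_rows([]) == 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
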