def test_bqfl_streaming(self):
    if isinstance(self.test_pipeline.runner, TestDataflowRunner):
      self.skipTest("TestStream is not supported on TestDataflowRunner")
    output_table = '%s_%s' % (self.output_table, 'ints')
    _SIZE = 100
    schema = self.BIG_QUERY_STREAMING_SCHEMA
    l = [{'Integr': i} for i in range(_SIZE)]

    state_matcher = PipelineStateMatcher(PipelineState.RUNNING)
    bq_matcher = BigqueryFullResultStreamingMatcher(
        project=self.project,
        query="SELECT Integr FROM %s" % output_table,
        data=[(i, ) for i in range(_SIZE)])

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(state_matcher, bq_matcher), streaming=True)
    with beam.Pipeline(argv=args) as p:
      # Emit the elements in four batches, advancing processing time and the
      # watermark between batches.
      stream_source = (
          TestStream()
          .advance_watermark_to(0)
          .advance_processing_time(100)
          .add_elements(l[:_SIZE // 4])
          .advance_processing_time(100)
          .advance_watermark_to(100)
          .add_elements(l[_SIZE // 4:2 * _SIZE // 4])
          .advance_processing_time(100)
          .advance_watermark_to(200)
          .add_elements(l[2 * _SIZE // 4:3 * _SIZE // 4])
          .advance_processing_time(100)
          .advance_watermark_to(300)
          .add_elements(l[3 * _SIZE // 4:])
          .advance_processing_time(100)
          .advance_watermark_to_infinity())
      _ = (
          p
          | stream_source
          | bigquery.WriteToBigQuery(
              output_table,
              schema=schema,
              method=bigquery.WriteToBigQuery.Method.FILE_LOADS,
              triggering_frequency=100))
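
The snippet above leans on a test-class attribute `BIG_QUERY_STREAMING_SCHEMA` and several imports that are not shown. A minimal sketch of that assumed context (shown at module level for brevity), with the column type and mode inferred from the "SELECT Integr" verification query:

import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.io.gcp.tests.bigquery_matcher import (
    BigqueryFullResultStreamingMatcher)
from apache_beam.runners.dataflow.test_dataflow_runner import TestDataflowRunner
from apache_beam.runners.runner import PipelineState
from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
from apache_beam.testing.test_stream import TestStream
from hamcrest.core.core.allof import all_of

# Assumed: a single integer column matching the "SELECT Integr" query above.
BIG_QUERY_STREAMING_SCHEMA = {
    'fields': [{'name': 'Integr', 'type': 'INTEGER', 'mode': 'NULLABLE'}]
}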
Example #2
    def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
        l = [i for i in range(self._SIZE)]

        matchers = [
            PipelineStateMatcher(PipelineState.RUNNING),
            BigqueryFullResultStreamingMatcher(
                project=self.project,
                query="SELECT number FROM %s" % self.output_table,
                data=[(i, ) for i in l])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*matchers),
            wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
            experiments='use_beam_bq_sink',
            streaming=True)

        def add_schema_info(element):
            yield {'number': element}

        messages = [str(i).encode('utf-8') for i in l]
        for message in messages:
            self.pub_client.publish(self.input_topic.name, message)

        with beam.Pipeline(argv=args) as p:
            rows = (
                p
                | ReadFromPubSub(subscription=self.input_sub.name)
                | beam.ParDo(add_schema_info))
            _ = rows | WriteToBigQuery(
                self.output_table,
                schema=self.SCHEMA,
                method=method,
                triggering_frequency=triggering_frequency)
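
The helper above relies on a class-level `SCHEMA` attribute and is presumably called once per write method. A sketch of that assumed context (the column type/mode, the test method names, and the FILE_LOADS triggering frequency are illustrative, not taken from the original):

    # Assumed: a single integer column matching the "SELECT number" query above.
    SCHEMA = {
        'fields': [{'name': 'number', 'type': 'INTEGER', 'mode': 'NULLABLE'}]
    }

    def test_streaming_inserts(self):
        self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.STREAMING_INSERTS)

    def test_file_loads(self):
        self._run_pubsub_bq_pipeline(
            WriteToBigQuery.Method.FILE_LOADS, triggering_frequency=20)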
Example #3
    def test_multiple_destinations_transform(self):
        streaming = self.test_pipeline.options.view_as(
            StandardOptions).streaming
        if streaming and isinstance(self.test_pipeline.runner,
                                    TestDataflowRunner):
            self.skipTest("TestStream is not supported on TestDataflowRunner")

        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)

        full_output_table_1 = '%s:%s' % (self.project, output_table_1)
        full_output_table_2 = '%s:%s' % (self.project, output_table_2)

        schema1 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }
        schema2 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'foundation',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        # Deliberately malformed record, expected to surface in FAILED_ROWS below.
        bad_record = {'language': 1, 'manguage': 2}

        if streaming:
            pipeline_verifiers = [
                PipelineStateMatcher(PipelineState.RUNNING),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]
        else:
            pipeline_verifiers = [
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            if streaming:
                _SIZE = len(_ELEMENTS)
                test_stream = (
                    TestStream()
                    .advance_watermark_to(0)
                    .add_elements(_ELEMENTS[:_SIZE // 2])
                    .advance_watermark_to(100)
                    .add_elements(_ELEMENTS[_SIZE // 2:])
                    .advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)

            schema_table_pcv = beam.pvalue.AsDict(
                p
                | "MakeSchemas" >> beam.Create([
                    (full_output_table_1, schema1),
                    (full_output_table_2, schema2)]))

            table_record_pcv = beam.pvalue.AsDict(
                p
                | "MakeTables" >> beam.Create([
                    ('table1', full_output_table_1),
                    ('table2', full_output_table_2)]))

            input2 = p | "Broken record" >> beam.Create([bad_record])

            input = (input, input2) | beam.Flatten()

            r = (
                input
                | "WriteWithMultipleDests" >>
                beam.io.gcp.bigquery.WriteToBigQuery(
                    # Rows with a 'language' key go to table1, the rest to
                    # table2; tables and schemas come from the side inputs.
                    table=lambda x, tables: (
                        tables['table1'] if 'language' in x else tables['table2']),
                    table_side_inputs=(table_record_pcv, ),
                    schema=lambda dest, table_map: table_map.get(dest, None),
                    schema_side_inputs=(schema_table_pcv, ),
                    insert_retry_strategy=RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
                    method='STREAMING_INSERTS'))

            assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                        equal_to([(full_output_table_1, bad_record)]))
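
The `_ELEMENTS` constant used above is not shown. From the matcher expressions it must be a list of dicts, each carrying a 'name' plus either a 'language' or a 'foundation' key; an illustrative (assumed) definition:

# Assumed shape only; the actual test data may differ.
_ELEMENTS = [
    {'name': 'beam', 'language': 'py'},
    {'name': 'beam', 'language': 'java'},
    {'name': 'flink', 'language': 'java'},
    {'name': 'spark', 'language': 'scala'},
    {'name': 'pubsub', 'foundation': 'gcp'},
    {'name': 'bigquery', 'foundation': 'gcp'},
]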
Example #4
    def test_pubsub_pipe_it(self):
        # Build expected dataset.
        expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]

        # Set extra options to the pipeline for test purpose
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            self.output_sub.name,
            expected_msg,
            timeout=60 * 7)  # timeout in seconds

        EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'  # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
        validation_query = f'SELECT text FROM `{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`'
        bq_sessions_verifier = BigqueryMatcher(self.project, validation_query,
                                               EXPECTED_BQ_CHECKSUM)

        # The expected result must be a list of row tuples; note the trailing
        # comma needed for a one-element tuple.
        expected_bq_msg = [('conall_0 - 1608051184', )]
        # Fetch Bigquery data with given query, compare to the expected data.
        # bigquery_verifier = BigqueryFullResultMatcher(
        #     project=self.project,
        #     query=validation_query,
        #     data=expected_bq_msg)

        # Fetch BigQuery data with the given query and compare it to the
        # expected data. This matcher polls BigQuery until the number of
        # records returned equals the number of expected records.
        # Specifying a timeout is optional.
        bigquery_streaming_verifier = BigqueryFullResultStreamingMatcher(
            project=self.project,
            query=validation_query,
            data=expected_bq_msg,
            timeout=60 * 7)

        extra_opts = {
            'bigquery_dataset': self.dataset_ref.dataset_id,
            'bigquery_table': OUTPUT_TABLE,
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(
                bigquery_streaming_verifier,
                state_verifier,
                pubsub_msg_verifier)  # bigquery_verifier
        }

        # Generate input data and inject to PubSub.
        self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))

        # Cleanup PubSub
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)
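
The `_inject_numbers` helper used above is not shown; a plausible sketch, assuming it publishes `DEFAULT_INPUT_NUMBERS` string-encoded integers to the given topic with the same Pub/Sub publisher client used elsewhere in these tests:

    def _inject_numbers(self, topic, num_messages):
        # Publish num_messages integers (as UTF-8 strings) to the topic.
        for n in range(num_messages):
            self.pub_client.publish(topic.name, str(n).encode('utf-8'))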