Python BigqueryMatcher示例，apache_beam.io.gcp.tests.bigquery_matcher.BigqueryMatcher Python示例

示例#1

0

显示文件

    def test_leader_board_it(self):
        # Injects generated game events into PubSub, starts leader_board.run()
        # and hands it matchers for the RUNNING pipeline state and for the
        # users and teams BigQuery output tables.
        running_matcher = PipelineStateMatcher(PipelineState.RUNNING)

        # Both output tables are queried with the same filter and compared
        # against the same expected checksum.
        row_filter = 'total_score=5000 LIMIT 1'
        output_tables = (self.OUTPUT_TABLE_USERS, self.OUTPUT_TABLE_TEAMS)
        users_matcher, teams_matcher = [
            BigqueryMatcher(
                self.project,
                'SELECT total_score FROM [%s:%s.%s] '
                'WHERE %s' % (
                    self.project, self.dataset.name, table_name, row_filter),
                self.DEFAULT_EXPECTED_CHECKSUM)
            for table_name in output_tables
        ]

        pipeline_opts = {
            'subscription': self.input_sub.full_name,
            'dataset': self.dataset.name,
            'topic': self.input_topic.full_name,
            'team_window_duration': 1,
            'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(
                running_matcher, users_matcher, teams_matcher),
        }

        # Cleanups are registered before the pipeline is started; unittest
        # runs them in reverse order of registration.
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(self._cleanup_dataset)
        for table_name in output_tables:
            self.addCleanup(utils.delete_bq_table, self.project,
                            self.dataset.name, table_name)

        # Wait for the topic/subscription, then publish the input events.
        pubsub_resources = [self.input_topic, self.input_sub]
        test_utils.wait_for_subscriptions_created(pubsub_resources)
        self._inject_pubsub_game_events(
            self.input_topic, self.DEFAULT_INPUT_COUNT)

        # Merge the options above with those given on the command line via
        # --test-pipeline-options and call the pipeline's main function.
        run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
        leader_board.run(run_args)

示例#2

0

显示文件

文件： hourly_team_score_it_test.py 项目： xsm110/Beam15.0

    def test_hourly_team_score_it(self):
        # Runs hourly_team_score.run() on DEFAULT_INPUT_FILE and hands it
        # matchers for the DONE pipeline state and for a COUNT(*) query over
        # the output table.
        done_matcher = PipelineStateMatcher(PipelineState.DONE)

        count_sql = 'SELECT COUNT(*) FROM `%s.%s.%s`' % (
            self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE)
        count_matcher = BigqueryMatcher(
            self.project, count_sql, self.DEFAULT_EXPECTED_CHECKSUM)

        pipeline_opts = dict(
            input=self.DEFAULT_INPUT_FILE,
            dataset=self.dataset_ref.dataset_id,
            window_duration=1,
            on_success_matcher=all_of(done_matcher, count_matcher),
        )

        # Dataset removal is registered before the pipeline is started.
        self.addCleanup(
            utils.delete_bq_dataset, self.project, self.dataset_ref)

        # Merge the options above with those given on the command line via
        # --test-pipeline-options and call the pipeline's main function.
        run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
        hourly_team_score.run(run_args)

示例#3

0

显示文件

    def test_big_query_standard_sql_kms_key_native(self):
        # Runs the query-to-table pipeline with STANDARD_QUERY, the 'native'
        # option and the KMS key taken from the 'kms_key_name' pipeline
        # option, then checks the key reported by the table fetched from
        # BigQuery. Skipped when the runner is a TestDirectRunner.
        if isinstance(self.test_pipeline.runner, TestDirectRunner):
            self.skipTest("This test doesn't work on DirectRunner.")

        output_check_sql = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        output_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        state_matcher = PipelineStateMatcher()
        bq_matcher = BigqueryMatcher(
            project=self.project,
            query=output_check_sql,
            checksum=output_checksum)

        # The test requires a truthy KMS key option.
        kms_key_name = self.test_pipeline.get_option('kms_key_name')
        self.assertTrue(kms_key_name)

        pipeline_opts = dict(
            query=STANDARD_QUERY,
            output=self.output_table,
            output_schema=DIALECT_OUTPUT_SCHEMA,
            use_standard_sql=True,
            wait_until_finish_duration=WAIT_UNTIL_FINISH_DURATION_MS,
            on_success_matcher=all_of(state_matcher, bq_matcher),
            kms_key=kms_key_name,
            native=True,
            experiments='use_legacy_bq_sink',
        )
        run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(run_args)

        # After the run, the table must carry an encryption configuration
        # naming the same key.
        table_info = self.bigquery_client.get_table(
            self.project, self.dataset_id, 'output_table')
        encryption = table_info.encryptionConfiguration
        self.assertIsNotNone(
            encryption, 'No encryption configuration found: %s' % table_info)
        self.assertEqual(
            kms_key_name, table_info.encryptionConfiguration.kmsKeyName)

示例#4

0

显示文件

 def test_big_query_new_types_native(self):
     # Runs the query-to-table pipeline with NEW_TYPES_QUERY in legacy-SQL
     # mode with the 'native' and 'use_json_exports' options, and hands it
     # matchers for the pipeline state and for the verification query.
     output_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
     output_check_sql = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
     state_matcher = PipelineStateMatcher()
     bq_matcher = BigqueryMatcher(
         project=self.project,
         query=output_check_sql,
         checksum=output_checksum,
         timeout_secs=30,
     )

     # Prepare the input environment before building the source query.
     self._setup_new_types_env()
     source_sql = NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE)

     pipeline_opts = dict(
         query=source_sql,
         output=self.output_table,
         output_schema=NEW_TYPES_OUTPUT_SCHEMA,
         use_standard_sql=False,
         native=True,
         use_json_exports=True,
         wait_until_finish_duration=WAIT_UNTIL_FINISH_DURATION_MS,
         on_success_matcher=all_of(state_matcher, bq_matcher),
         experiments='use_legacy_bq_sink',
     )
     run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
     big_query_query_to_table_pipeline.run_bq_pipeline(run_args)

示例#5

0

显示文件

文件： bigquery_tornadoes_it_test.py 项目： kobisalant/incubator-beam

  def test_bigquery_tornadoes_it(self):
    # Runs bigquery_tornadoes.run() against a freshly named output table and
    # hands it matchers for the pipeline state and for a query over that
    # table.
    it_pipeline = TestPipeline(is_integration_test=True)
    gcp_project = it_pipeline.get_option('project')

    # The table name carries the current time in milliseconds.
    dataset_name = 'BigQueryTornadoesIT'
    table_name = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    qualified_table = '.'.join([dataset_name, table_name])
    check_sql = 'SELECT month, tornado_count FROM [%s]' % qualified_table

    state_matcher = PipelineStateMatcher()
    bq_matcher = BigqueryMatcher(
        project=gcp_project,
        query=check_sql,
        checksum=self.DEFAULT_CHECKSUM)
    pipeline_opts = dict(
        output=qualified_table,
        on_success_matcher=all_of(state_matcher, bq_matcher),
    )

    # Table removal is registered before the pipeline is started.
    self.addCleanup(
        utils.delete_bq_table, gcp_project, dataset_name, table_name)

    # Merge the options above with those given on the command line via
    # --test-pipeline-options and call the pipeline's main function.
    run_args = it_pipeline.get_full_options_as_args(**pipeline_opts)
    bigquery_tornadoes.run(run_args)

示例#6

0

显示文件

  def test_filters_output_bigquery_matcher(self):
    # Runs filters.run() against a freshly named output table and hands it
    # matchers for the pipeline state and for a query over that table.
    it_pipeline = TestPipeline(is_integration_test=True)
    gcp_project = it_pipeline.get_option('project')

    # The table name carries the current time in milliseconds.
    dataset_name = 'FiltersTestIT'
    table_name = 'cold_days_%s' % int(round(time.time() * 1000))
    qualified_table = '.'.join([dataset_name, table_name])
    check_sql = (
        'SELECT year, month, day, mean_temp FROM `%s`' % qualified_table)

    state_matcher = PipelineStateMatcher()
    bq_matcher = BigqueryMatcher(
        project=gcp_project,
        query=check_sql,
        checksum=self.DEFAULT_CHECKSUM)
    pipeline_opts = dict(
        output=qualified_table,
        on_success_matcher=all_of(state_matcher, bq_matcher),
    )

    # Table removal is registered before the pipeline is started.
    self.addCleanup(
        utils.delete_bq_table, gcp_project, dataset_name, table_name)

    # Merge the options above with those given on the command line via
    # --test-pipeline-options and call the pipeline's main function.
    run_args = it_pipeline.get_full_options_as_args(**pipeline_opts)
    filters.run(run_args)

示例#7

0

显示文件

文件： hourly_team_score_it_test.py 项目： roger-mike/beam

    def test_hourly_team_score_output_checksum_on_small_input(self):
        # Runs hourly_team_score.run() on a small input file and hands it
        # matchers for the DONE pipeline state and for a COUNT(*) query over
        # the output table.
        # Small dataset to prevent Out of Memory when running in local runners
        small_input = 'gs://apache-beam-samples/game/small/gaming_data.csv'
        small_input_checksum = '91143e81622aa391eb62eaa3f3a5123401edb07d'

        done_matcher = PipelineStateMatcher(PipelineState.DONE)
        count_sql = 'SELECT COUNT(*) FROM `%s.%s.%s`' % (
            self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE)
        count_matcher = BigqueryMatcher(
            self.project, count_sql, small_input_checksum)

        pipeline_opts = dict(
            input=small_input,
            dataset=self.dataset_ref.dataset_id,
            window_duration=1,
            on_success_matcher=all_of(done_matcher, count_matcher),
        )

        # Dataset removal is registered before the pipeline is started.
        self.addCleanup(
            utils.delete_bq_dataset, self.project, self.dataset_ref)

        # Merge the options above with those given on the command line via
        # --test-pipeline-options and call the pipeline's main function.
        run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
        hourly_team_score.run(run_args)

示例#8

0

显示文件

文件： big_query_query_to_table_it_test.py 项目： wscheep/beam

 def test_big_query_standard_sql(self):
   # Runs the query-to-table pipeline with STANDARD_QUERY in standard-SQL
   # mode and hands it matchers for the pipeline state and for the
   # verification query over the output table.
   output_check_sql = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
   output_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)

   state_matcher = PipelineStateMatcher()
   bq_matcher = BigqueryMatcher(
       project=self.project,
       query=output_check_sql,
       checksum=output_checksum)

   pipeline_opts = dict(
       query=STANDARD_QUERY,
       output=self.output_table,
       output_schema=DIALECT_OUTPUT_SCHEMA,
       use_standard_sql=True,
       on_success_matcher=all_of(state_matcher, bq_matcher),
   )
   run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
   big_query_query_to_table_pipeline.run_bq_pipeline(run_args)

示例#9

0

显示文件

文件： big_query_query_to_table_it_test.py 项目： wscheep/beam

 def test_big_query_new_types(self):
   # Runs the query-to-table pipeline with NEW_TYPES_QUERY in legacy-SQL mode
   # and hands it matchers for the pipeline state and for the verification
   # query over the output table.
   output_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
   output_check_sql = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table

   state_matcher = PipelineStateMatcher()
   bq_matcher = BigqueryMatcher(
       project=self.project,
       query=output_check_sql,
       checksum=output_checksum)

   # Prepare the input environment before building the source query.
   self._setup_new_types_env()
   source_sql = NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE)

   pipeline_opts = dict(
       query=source_sql,
       output=self.output_table,
       output_schema=NEW_TYPES_OUTPUT_SCHEMA,
       use_standard_sql=False,
       on_success_matcher=all_of(state_matcher, bq_matcher),
   )
   run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
   big_query_query_to_table_pipeline.run_bq_pipeline(run_args)

示例#10

0

显示文件

文件： big_query_query_to_table_it_test.py 项目： yaoshi1994/beam

  def test_big_query_legacy_sql(self):
    # Runs the query-to-table pipeline with LEGACY_QUERY in legacy-SQL mode
    # and hands it matchers for the pipeline state and for the verification
    # query over the output table.
    output_check_sql = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    output_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)

    state_matcher = PipelineStateMatcher()
    bq_matcher = BigqueryMatcher(
        project=self.project,
        query=output_check_sql,
        checksum=output_checksum)

    pipeline_opts = dict(
        query=LEGACY_QUERY,
        output=self.output_table,
        output_schema=DIALECT_OUTPUT_SCHEMA,
        use_standard_sql=False,
        wait_until_finish_duration=WAIT_UNTIL_FINISH_DURATION_MS,
        on_success_matcher=all_of(state_matcher, bq_matcher),
    )
    run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(run_args)

示例#11

0

显示文件

文件： bigquery_tornadoes_it_test.py 项目： davidthinh/BeamPipelinePython

  def test_bigquery_tornadoes_it(self):
    # Runs bigquery_tornadoes.run() against a freshly named output table and
    # hands it matchers for the pipeline state and for a query over that
    # table.
    it_pipeline = TestPipeline(is_integration_test=True)

    # The table name carries the current time in milliseconds.
    table_suffix = int(round(time.time() * 1000))
    qualified_table = ('BigQueryTornadoesIT'
                       '.monthly_tornadoes_%s' % table_suffix)
    check_sql = 'SELECT month, tornado_count FROM [%s]' % qualified_table

    state_matcher = PipelineStateMatcher()
    bq_matcher = BigqueryMatcher(
        project=it_pipeline.get_option('project'),
        query=check_sql,
        checksum=self.DEFAULT_CHECKSUM)
    pipeline_opts = dict(
        output=qualified_table,
        on_success_matcher=all_of(state_matcher, bq_matcher),
    )

    # Merge the options above with those given on the command line via
    # --test-pipeline-options and call the pipeline's main function.
    run_args = it_pipeline.get_full_options_as_args(**pipeline_opts)
    bigquery_tornadoes.run(run_args)

示例#12

0

显示文件

  def test_big_query_standard_sql_kms_key(self):
    # Runs the query-to-table pipeline with STANDARD_QUERY and the 'kms_key'
    # option set to KMS_KEY, then checks that the table fetched from BigQuery
    # reports KMS_KEY as its encryption key.
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum),
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'kms_key': KMS_KEY,
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    # Fail with a readable assertion message, rather than an AttributeError
    # on None, when the table has no encryption configuration at all.
    self.assertIsNotNone(
        table.encryptionConfiguration,
        'No encryption configuration found: %s' % (table,))
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

示例#13

0

显示文件

文件： big_query_query_to_table_it_test.py 项目： ptphuy/beam

    def test_big_query_legacy_sql(self):
        # Runs the query-to-table pipeline with LEGACY_QUERY in legacy-SQL
        # mode, passing a 'bq_temp_location' derived from the output table
        # name, and hands it matchers for the pipeline state and for the
        # verification query over the output table.
        output_check_sql = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        output_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)

        state_matcher = PipelineStateMatcher()
        bq_matcher = BigqueryMatcher(
            project=self.project,
            query=output_check_sql,
            checksum=output_checksum)

        temp_location = (
            'gs://temp-storage-for-upload-tests/%s' % self.output_table)
        pipeline_opts = dict(
            query=LEGACY_QUERY,
            output=self.output_table,
            bq_temp_location=temp_location,
            output_schema=DIALECT_OUTPUT_SCHEMA,
            use_standard_sql=False,
            on_success_matcher=all_of(state_matcher, bq_matcher),
        )
        run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(run_args)

示例#14

0

显示文件

    def test_game_stats_it(self):
        # Injects generated game events into PubSub, starts game_stats.run()
        # and hands it matchers for the RUNNING pipeline state and for the
        # sessions BigQuery output table.
        running_matcher = PipelineStateMatcher(PipelineState.RUNNING)

        row_filter = 'mean_duration=300 LIMIT 1'
        sessions_sql = ('SELECT mean_duration FROM [%s:%s.%s] '
                        'WHERE %s' % (
                            self.project,
                            self.dataset.name,
                            self.OUTPUT_TABLE_SESSIONS,
                            row_filter,
                        ))
        sessions_matcher = BigqueryMatcher(
            self.project, sessions_sql, self.DEFAULT_EXPECTED_CHECKSUM)

        # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

        pipeline_opts = dict(
            subscription=self.input_sub.name,
            dataset=self.dataset.name,
            topic=self.input_topic.name,
            fixed_window_duration=1,
            user_activity_window_duration=1,
            wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
            on_success_matcher=all_of(running_matcher, sessions_matcher),
        )

        # Cleanups are registered before the pipeline is started; unittest
        # runs them in reverse order of registration.
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(self._cleanup_dataset)
        self.addCleanup(
            utils.delete_bq_table,
            self.project,
            self.dataset.name,
            self.OUTPUT_TABLE_SESSIONS)
        self.addCleanup(
            utils.delete_bq_table,
            self.project,
            self.dataset.name,
            self.OUTPUT_TABLE_TEAMS)

        # Publish the input events to PubSub.
        self._inject_pubsub_game_events(
            self.input_topic, self.DEFAULT_INPUT_COUNT)

        # Merge the options above with those given on the command line via
        # --test-pipeline-options and call the pipeline's main function.
        run_args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
        game_stats.run(run_args)

示例#15

0

显示文件

文件： pubsub_it_test.py 项目： tunnelWithAC/apache-beam-python-integration-test

    def test_pubsub_pipe_it(self):
        # Injects input into PubSub, starts pipeline.run() and hands it
        # matchers for the RUNNING pipeline state and for the message expected
        # on the output subscription.

        # Build expected dataset.
        expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]

        # Set extra options to the pipeline for test purpose
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            self.output_sub.name,
            expected_msg,
            timeout=60 * 7)  # in seconds

        # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
        # NOTE(review): this value equals the SHA-1 digest of empty input;
        # confirm it is the intended checksum.
        EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        validation_query = (
            f'SELECT text FROM '
            f'`{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`')
        # NOTE(review): this verifier is constructed but is not part of
        # 'on_success_matcher' below; confirm whether it should be.
        bq_sessions_verifier = BigqueryMatcher(self.project, validation_query,
                                               EXPECTED_BQ_CHECKSUM)

        extra_opts = {
            'bigquery_dataset': self.dataset_ref.dataset_id,
            'bigquery_table': OUTPUT_TABLE,
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Register cleanup before injecting data or running the pipeline, so
        # the PubSub resources and the BigQuery dataset are still removed
        # when either step raises. Cleanups run in reverse order.
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Generate input data and inject to PubSub.
        self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))

示例#16

0

显示文件

    def test_pubsub_pipe_it(self):
        # Injects input into PubSub, starts pipeline.run() and hands it
        # matchers for the RUNNING pipeline state, for the message expected on
        # the output subscription, and for the rows expected in BigQuery.

        # Build expected dataset.
        expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]

        # Set extra options to the pipeline for test purpose
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            self.output_sub.name,
            expected_msg,
            timeout=60 * 7)  # in seconds

        # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
        # NOTE(review): this value equals the SHA-1 digest of empty input;
        # confirm it is the intended checksum.
        EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
        validation_query = (
            f'SELECT text FROM '
            f'`{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`')
        # NOTE(review): this verifier is constructed but is not part of
        # 'on_success_matcher' below; confirm whether it should be.
        bq_sessions_verifier = BigqueryMatcher(self.project, validation_query,
                                               EXPECTED_BQ_CHECKSUM)

        # make sure you put the expected result in a tuple with a trailing comma
        expected_bq_msg = [('conall_0 - 1608051184', )]
        # Fetch Bigquery data with given query, compare to the expected data.
        # bigquery_verifier = BigqueryFullResultMatcher(
        #     project=self.project,
        #     query=validation_query,
        #     data=expected_bq_msg)

        # Fetch Bigquery data with given query, compare to the expected data.
        # This matcher polls BigQuery until the no. of records in BigQuery is
        # equal to the no. of records in expected data.
        # Specifying a timeout is optional
        bigquery_streaming_verifier = BigqueryFullResultStreamingMatcher(
            project=self.project,
            query=validation_query,
            data=expected_bq_msg,
            timeout=60 * 7)

        extra_opts = {
            'bigquery_dataset': self.dataset_ref.dataset_id,
            'bigquery_table': OUTPUT_TABLE,
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(
                bigquery_streaming_verifier, state_verifier,
                pubsub_msg_verifier)  # bigquery_verifier
        }

        # Register cleanup before injecting data or running the pipeline, so
        # the PubSub resources and the BigQuery dataset are still removed
        # when either step raises. Cleanups run in reverse order.
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Generate input data and inject to PubSub.
        self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))