def test_big_query_new_types_native(self):
    """Run the new-types query-to-table pipeline through the native (legacy)
    BigQuery sink with JSON exports, then verify the output table by checksum.
    """
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=expected_checksum,
            timeout_secs=30,
        ),
    ]
    self._setup_new_types_env()
    pipeline_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'native': True,
        'use_json_exports': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*verifiers),
        'experiments': 'use_legacy_bq_sink',
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def test_big_query_standard_sql_kms_key_native(self):
    """Run the standard-SQL pipeline via the native sink with a KMS key and
    verify both the row checksum and the table's encryption configuration.
    """
    if isinstance(self.test_pipeline.runner, TestDirectRunner):
        self.skipTest("This test doesn't work on DirectRunner.")
    checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum),
    ]
    kms_key = self.test_pipeline.get_option('kms_key_name')
    # A KMS key must be supplied via pipeline options for this test to run.
    self.assertTrue(kms_key)
    pipeline_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*verifiers),
        'kms_key': kms_key,
        'native': True,
        'experiments': 'use_legacy_bq_sink',
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertIsNotNone(
        table.encryptionConfiguration,
        'No encryption configuration found: %s' % table)
    self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)
def _matches(self, _):
    """Hamcrest matcher hook: compare the checksum of the query results
    against the expected checksum.

    The checksum is computed lazily on first call and cached on
    ``self.checksum`` so that repeated matching does not re-run the query.
    """
    if self.checksum is None:
        rows = self._query_with_retry()
        _LOGGER.info(
            'Read from given query (%s), total rows %d', self.query, len(rows))
        self.checksum = compute_hash(rows)
        _LOGGER.info('Generate checksum: %s', self.checksum)
    return self.expected_checksum == self.checksum
def get_checksum():
    """Query BigQuery, cache the checksum of the rows on ``self``, and raise
    ValueError on a mismatch.

    NOTE(review): the raise presumably feeds a retry wrapper in the enclosing
    scope rather than propagating to callers — confirm against the enclosing
    method.
    """
    rows = self._query_with_retry()
    _LOGGER.info(
        'Read from given query (%s), total rows %d', self.query, len(rows))
    self.checksum = compute_hash(rows)
    _LOGGER.info('Generate checksum: %s', self.checksum)
    if self.checksum == self.expected_checksum:
        return
    raise ValueError(
        'Checksums do not match. Expected: %s, got: %s' %
        (self.expected_checksum, self.checksum))
def test_autocomplete_it(self):
    """End-to-end check of TopPerPrefix over the King Lear corpus, verified
    by a summed hash fingerprint of the top completions."""
    with TestPipeline(is_integration_test=True) as p:
        lines = p | beam.io.ReadFromText(self.KINGLEAR_INPUT)
        top = lines | autocomplete.TopPerPrefix(10)
        # values must be hashable for now
        flattened = top | beam.Map(
            lambda kv: [kv[0], kv[1][0][0], kv[1][0][1]])
        digest = (
            flattened
            | beam.Map(lambda row: int(compute_hash(row)[:8], 16))
            | beam.CombineGlobally(sum))
        assert_that(digest, equal_to([self.KINGLEAR_HASH_SUM]))
def test_autocomplete_it(self):
    """Integration test for the autocomplete example: compute the top-10
    completions per prefix and compare a hash-sum fingerprint of the output
    against a known value."""
    with TestPipeline(is_integration_test=True) as pipeline:
        prefixes = (
            pipeline
            | beam.io.ReadFromText(self.KINGLEAR_INPUT)
            | autocomplete.TopPerPrefix(10))
        # values must be hashable for now
        rows = prefixes | beam.Map(
            lambda item: [item[0], item[1][0][0], item[1][0][1]])
        hash_sum = (
            rows
            | beam.Map(lambda r: int(compute_hash(r)[:8], 16))
            | beam.CombineGlobally(sum))
        assert_that(hash_sum, equal_to([self.KINGLEAR_HASH_SUM]))
def test_big_query_standard_sql(self):
    """Run the query-to-table pipeline with standard SQL and verify the
    written table contents by checksum."""
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=expected_checksum),
    ]
    pipeline_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*verifiers),
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def test_big_query_standard_sql(self):
    """Execute STANDARD_QUERY through the pipeline under test; a state
    matcher plus a BigQuery checksum matcher validate the result."""
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    bq_matcher = BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED))
    success_matcher = all_of(PipelineStateMatcher(), bq_matcher)
    args = self.test_pipeline.get_full_options_as_args(
        query=STANDARD_QUERY,
        output=self.output_table,
        output_schema=DIALECT_OUTPUT_SCHEMA,
        use_standard_sql=True,
        on_success_matcher=success_matcher)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def _matches(self, _):
    """Hamcrest matcher hook: run the verification query against BigQuery
    and compare the checksum of the rows with the expected checksum."""
    logging.info('Start verify Bigquery data.')
    # Run query
    client = bigquery.Client(project=self.project)
    rows = self._query_with_retry(client)
    logging.info(
        'Read from given query (%s), total rows %d', self.query, len(rows))
    # Compute checksum
    self.checksum = compute_hash(rows)
    logging.info('Generate checksum: %s', self.checksum)
    # Verify result
    return self.expected_checksum == self.checksum
def _matches(self, _):
    """Hamcrest matcher hook: read the output file(s), checksum the lines,
    and compare against the expected checksum, optionally sleeping first so
    the files have time to appear on the filesystem."""
    if self.sleep_secs:
        # Wait to have output file ready on FS
        logging.info('Wait %d seconds...', self.sleep_secs)
        time.sleep(self.sleep_secs)
    # Read from given file(s) path, then fingerprint the contents.
    lines = self._read_with_retry()
    self.checksum = utils.compute_hash(lines)
    logging.info(
        'Read from given path %s, %d lines, checksum: %s.',
        self.file_path,
        len(lines),
        self.checksum)
    return self.expected_checksum == self.checksum
def _matches(self, _):
    """Verify BigQuery data: fetch the rows for ``self.query`` and check
    that their checksum equals ``self.expected_checksum``."""
    logging.info('Start verify Bigquery data.')
    # Run query
    bq_client = bigquery.Client(project=self.project)
    response = self._query_with_retry(bq_client)
    logging.info(
        'Read from given query (%s), total rows %d',
        self.query,
        len(response))
    # Compute checksum
    self.checksum = compute_hash(response)
    logging.info('Generate checksum: %s', self.checksum)
    # Verify result
    return self.checksum == self.expected_checksum
def test_big_query_new_types(self):
    """Run the new-types query-to-table pipeline (legacy SQL) and verify the
    output table by checksum."""
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)),
    ]
    self._setup_new_types_env()
    pipeline_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*verifiers),
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def test_big_query_new_types(self):
    """Populate the new-types input table, run the pipeline over it with
    legacy SQL, and checksum-verify the resulting output table."""
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    success_matcher = all_of(
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum))
    self._setup_new_types_env()
    args = self.test_pipeline.get_full_options_as_args(
        query=NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        output=self.output_table,
        output_schema=NEW_TYPES_OUTPUT_SCHEMA,
        use_standard_sql=False,
        on_success_matcher=success_matcher)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def test_big_query_legacy_sql(self):
    """Run the query-to-table pipeline with legacy SQL (bounded by a
    wait-until-finish duration) and verify the output table by checksum."""
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)),
    ]
    pipeline_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*verifiers),
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def test_big_query_legacy_sql(self):
    """Run the query-to-table pipeline with legacy SQL, staging BigQuery
    temp data in GCS, and verify the output table by checksum."""
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    success_matcher = all_of(
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=expected_checksum))
    gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
    args = self.test_pipeline.get_full_options_as_args(
        query=LEGACY_QUERY,
        output=self.output_table,
        bq_temp_location=gs_location,
        output_schema=DIALECT_OUTPUT_SCHEMA,
        use_standard_sql=False,
        on_success_matcher=success_matcher)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
def test_big_query_standard_sql_kms_key(self):
    """Run the standard-SQL pipeline with a fixed KMS key, verify the rows
    by checksum, then assert the output table uses that key."""
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)),
    ]
    pipeline_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*verifiers),
        'kms_key': KMS_KEY,
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)
def test_big_query_standard_sql_kms_key(self):
    """Checksum-verify a standard-SQL pipeline run that writes through a
    CMEK, and confirm the table's encryption key matches KMS_KEY."""
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    bq_matcher = BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED))
    success_matcher = all_of(PipelineStateMatcher(), bq_matcher)
    args = self.test_pipeline.get_full_options_as_args(
        query=STANDARD_QUERY,
        output=self.output_table,
        output_schema=DIALECT_OUTPUT_SCHEMA,
        use_standard_sql=True,
        on_success_matcher=success_matcher,
        kms_key=KMS_KEY)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)
    written_table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(
        KMS_KEY, written_table.encryptionConfiguration.kmsKeyName)
def test_big_query_legacy_sql(self):
    """Legacy-SQL query-to-table run with a GCS temp location; the output
    table is verified against a known checksum."""
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    bq_matcher = BigqueryMatcher(
        project=self.project,
        query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
        checksum=expected_checksum)
    gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
    pipeline_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'bq_temp_location': gs_location,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(PipelineStateMatcher(), bq_matcher),
    }
    args = self.test_pipeline.get_full_options_as_args(**pipeline_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(args)