class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for ' +
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(sys.version_info[0] == 3 and
                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                   'This test still needs to be fixed on Python 3. '
                   'TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
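# The test classes in this section rely on a module-level import block and on
# constants (BIG_QUERY_DATASET_ID, LEGACY_QUERY, STANDARD_QUERY, the
# DIALECT_*/NEW_TYPES_* queries and schemas, KMS_KEY,
# WAIT_UNTIL_FINISH_DURATION_MS, _LOGGER) that are defined elsewhere in the
# original files and are not reproduced here. The sketch below lists the
# imports these tests appear to assume; exact module paths vary across Beam
# releases, so treat this as an illustration rather than the original header.
import base64
import logging
import os
import random
import sys
import time
import unittest
from decimal import Decimal

from future.utils import iteritems
from hamcrest.core.core.allof import all_of
from nose.plugins.attrib import attr

import apache_beam as beam
from apache_beam.io.gcp import big_query_query_to_table_pipeline
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.runners.direct.test_direct_runner import TestDirectRunner
from apache_beam.testing import test_utils
from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

# GCP extras are optional dependencies in Beam, so these imports are commonly
# guarded; the paths below are assumptions (BigQueryWrapper in particular has
# moved between modules over time).
try:
  from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
  from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryMatcher
  from apitools.base.py.exceptions import HttpError
except ImportError:
  BigQueryWrapper = None
  BigqueryMatcher = None
  HttpError = None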
class BigQueryReadIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_read_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

  def create_table(self, tablename):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'number'
    table_field.type = 'INTEGER'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'str'
    table_field.type = 'STRING'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, tablename, table_data)

  def create_table_new_types(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'float'
    table_field.type = 'FLOAT'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'numeric'
    table_field.type = 'NUMERIC'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'datetime'
    table_field.type = 'DATETIME'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'timestamp'
    table_field.type = 'TIMESTAMP'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'geo'
    table_field.type = 'GEOGRAPHY'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    row_data = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }
    table_data = [row_data]
    # add rows with only one key value pair and None values for all other keys
    for key, value in iteritems(row_data):
      table_data.append({key: value})
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, table_name, table_data)

  @attr('IT')
  def test_big_query_read(self):
    table_name = 'python_write_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      result = (p | 'read' >> beam.io.Read(
          beam.io.BigQuerySource(
              query='SELECT number, str FROM `%s`' % table_id,
              use_standard_sql=True)))
      assert_that(result, equal_to([
          {'number': 1, 'str': 'abc'},
          {'number': 2, 'str': 'def'},
          {'number': 3, 'str': u'你好'},
          {'number': 4, 'str': u'привет'}
      ]))

  @attr('IT')
  def test_big_query_read_new_types(self):
    table_name = 'python_new_types'
    self.create_table_new_types(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    args = self.test_pipeline.get_full_options_as_args()

    expected_row = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    expected_data = [expected_row]
    # add rows with only one key value pair and None values for all other keys
    for key, value in iteritems(expected_row):
      row = {k: None for k in expected_row}
      row[key] = value
      expected_data.append(row)

    with beam.Pipeline(argv=args) as p:
      result = (p | 'read' >> beam.io.Read(
          beam.io.BigQuerySource(
              query='SELECT float, numeric, bytes, date, time, datetime,'
                    'timestamp, geo FROM `%s`' % table_id,
              use_standard_sql=True)))
      assert_that(result, equal_to(expected_data))
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59.990000'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # the API Tools bigquery client expects byte values to be base-64 encoded
    # TODO BEAM-4850: upgrade to google-cloud-bigquery which does not require
    # handling the encoding in beam
    for row in table_data:
      row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*pipeline_verifiers),
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*pipeline_verifiers),
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql_kms_key_native(self):
    if isinstance(self.test_pipeline.runner, TestDirectRunner):
      self.skipTest("This test doesn't work on DirectRunner.")
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    kms_key = self.test_pipeline.get_option('kms_key_name')
    self.assertTrue(kms_key)
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'kms_key': kms_key,
        'native': True,
        'experiments': 'use_legacy_bq_sink',
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertIsNotNone(
        table.encryptionConfiguration,
        'No encryption configuration found: %s' % table)
    self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'use_json_exports': True,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types_avro(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*pipeline_verifiers),
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types_native(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'native': True,
        'use_json_exports': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'experiments': 'use_legacy_bq_sink',
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
    extra_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'bq_temp_location': gs_location,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'bq_temp_location': gs_location,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip(
      'This test requires BQ Dataflow native source support for ' +
      'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'kms_key': KMS_KEY
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(sys.version_info[0] == 3 and
                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                   'This test still needs to be fixed on Python 3. '
                   'TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    self._setup_new_types_env()
    gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'bq_temp_location': gs_location,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
class BigQueryReadIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_read_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

  def create_table(self, tablename):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'number'
    table_field.type = 'INTEGER'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'str'
    table_field.type = 'STRING'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, tablename, table_data)

  def create_table_new_types(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery client expects base64 encoded bytes
    for row in table_data:
      row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, table_name, table_data)

  @attr('IT')
  def test_big_query_read(self):
    table_name = 'python_write_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      result = (p | 'read' >> beam.io.Read(
          beam.io.BigQuerySource(
              query='SELECT number, str FROM `%s`' % table_id,
              use_standard_sql=True)))
      assert_that(result, equal_to([
          {'number': 1, 'str': 'abc'},
          {'number': 2, 'str': 'def'},
          {'number': 3, 'str': u'你好'},
          {'number': 4, 'str': u'привет'}
      ]))

  @attr('IT')
  def test_big_query_read_new_types(self):
    table_name = 'python_new_types_table'
    self.create_table_new_types(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    args = self.test_pipeline.get_full_options_as_args()

    expected_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io returns bytes as base64 encoded values
    for row in expected_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    with beam.Pipeline(argv=args) as p:
      result = (p | 'read' >> beam.io.Read(
          beam.io.BigQuerySource(
              query='SELECT bytes, date, time FROM `%s`' % table_id,
              use_standard_sql=True)))
      assert_that(result, equal_to(expected_data))
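# A minimal standalone sketch (not part of the original test module) of the
# BYTES round trip the comments above describe: rows handed to insert_rows()
# carry base64 text because the apitools-based client expects it, while rows
# read back through BigQuerySource are asserted against base64-encoded bytes.
import base64

raw = b'\xab\xac\xad'
inserted = base64.b64encode(raw).decode('utf-8')  # value sent on the write path
expected = base64.b64encode(raw)                  # value asserted on the read path
assert base64.b64decode(inserted) == raw
assert base64.b64decode(expected) == raw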
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)