def test_descriptions(self):
  """Test that differences in description are ignored when
  ignore_descriptions=True.
  """
  schema1 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name="a",
          mode="REQUIRED",
          type="FLOAT64",
          description="Field A",
      ),
      bigquery.TableFieldSchema(
          name="b",
          mode="REQUIRED",
          type="INT64",
      ),
  ])

  schema2 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name="a",
          mode="REQUIRED",
          type="FLOAT64",
          description="Field A is for Apple"),
      bigquery.TableFieldSchema(
          name="b",
          mode="REQUIRED",
          type="INT64",
          description="Field B",
      ),
  ])

  self.assertFalse(check_schema_equal(schema1, schema2))
  self.assertTrue(
      check_schema_equal(schema1, schema2, ignore_descriptions=True))

def test_field_order(self):
  """Test that field order is ignored when ignore_field_order=True."""
  schema1 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(name="a", mode="REQUIRED", type="FLOAT64"),
      bigquery.TableFieldSchema(name="b", mode="REQUIRED", type="INT64"),
  ])

  schema2 = bigquery.TableSchema(fields=list(reversed(schema1.fields)))

  self.assertFalse(check_schema_equal(schema1, schema2))
  self.assertTrue(
      check_schema_equal(schema1, schema2, ignore_field_order=True))

def test_simple_schemas(self):
  """Test equality checks on empty, flat, and nested record schemas."""
  schema1 = bigquery.TableSchema(fields=[])
  self.assertTrue(check_schema_equal(schema1, schema1))

  schema2 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(name="a", mode="NULLABLE", type="INT64")
  ])
  self.assertTrue(check_schema_equal(schema2, schema2))
  self.assertFalse(check_schema_equal(schema1, schema2))

  schema3 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name="b",
          mode="REPEATED",
          type="RECORD",
          fields=[
              bigquery.TableFieldSchema(
                  name="c", mode="REQUIRED", type="BOOL")
          ])
  ])
  self.assertTrue(check_schema_equal(schema3, schema3))
  self.assertFalse(check_schema_equal(schema2, schema3))

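# Illustrative sketch (not part of the original suite): the two relaxation
# flags can also be combined, as they are in UpdateDestinationSchema.process
# below. The test name is hypothetical.
def test_ignore_descriptions_and_field_order(self):
  """Sketch: ignore_descriptions and ignore_field_order used together."""
  schema1 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name="a", mode="REQUIRED", type="FLOAT64", description="Field A"),
      bigquery.TableFieldSchema(name="b", mode="REQUIRED", type="INT64"),
  ])
  schema2 = bigquery.TableSchema(fields=[
      bigquery.TableFieldSchema(
          name="b", mode="REQUIRED", type="INT64", description="Field B"),
      bigquery.TableFieldSchema(name="a", mode="REQUIRED", type="FLOAT64"),
  ])

  # Differ in both ordering and descriptions, so the strict comparison fails.
  self.assertFalse(check_schema_equal(schema1, schema2))
  # With both relaxations enabled, the schemas compare as equal.
  self.assertTrue(
      check_schema_equal(
          schema1,
          schema2,
          ignore_descriptions=True,
          ignore_field_order=True))
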
def process(self, element, schema_mod_job_name_prefix):
  destination = element[0]
  temp_table_load_job_reference = element[1]

  if callable(self._additional_bq_parameters):
    additional_parameters = self._additional_bq_parameters(destination)
  elif isinstance(self._additional_bq_parameters, vp.ValueProvider):
    additional_parameters = self._additional_bq_parameters.get()
  else:
    additional_parameters = self._additional_bq_parameters

  # When writing to normal tables WRITE_TRUNCATE will overwrite the schema,
  # but when writing to a partition, care needs to be taken to update the
  # schema even on WRITE_TRUNCATE.
  if (self._write_disposition not in ('WRITE_TRUNCATE', 'WRITE_APPEND') or
      not additional_parameters or
      not additional_parameters.get("schemaUpdateOptions")):
    # No need to modify schema of destination table
    return

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  try:
    # Check if destination table exists
    destination_table = self._bq_wrapper.get_table(
        project_id=table_reference.projectId,
        dataset_id=table_reference.datasetId,
        table_id=table_reference.tableId)
  except HttpError as exn:
    if exn.status_code == 404:
      # Destination table does not exist, so no need to modify its schema
      # ahead of the copy jobs.
      return
    else:
      raise

  temp_table_load_job = self._bq_wrapper.get_job(
      project=temp_table_load_job_reference.projectId,
      job_id=temp_table_load_job_reference.jobId,
      location=temp_table_load_job_reference.location)
  temp_table_schema = temp_table_load_job.configuration.load.schema

  if bigquery_tools.check_schema_equal(temp_table_schema,
                                       destination_table.schema,
                                       ignore_descriptions=True,
                                       ignore_field_order=True):
    # Destination table schema is already the same as the temp table schema,
    # so no need to run a job to update the destination table schema.
    return

  destination_hash = _bq_uuid(
      '%s:%s.%s' % (
          table_reference.projectId,
          table_reference.datasetId,
          table_reference.tableId))
  uid = _bq_uuid()
  job_name = '%s_%s_%s' % (schema_mod_job_name_prefix, destination_hash, uid)

  _LOGGER.debug(
      'Triggering schema modification job %s on %s', job_name, table_reference)
  # Trigger potential schema modification by loading zero rows into the
  # destination table with the temporary table schema.
  schema_update_job_reference = self._bq_wrapper.perform_load_job(
      destination=table_reference,
      source_stream=io.BytesIO(),  # file with zero rows
      job_id=job_name,
      schema=temp_table_schema,
      write_disposition='WRITE_APPEND',
      create_disposition='CREATE_NEVER',
      additional_load_parameters=additional_parameters,
      job_labels=self._bq_io_metadata.add_additional_bq_job_labels())

  yield (destination, schema_update_job_reference)
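
# Illustrative sketch (assumption, not part of this module): the schema
# modification path above only runs when the pipeline author passes
# schemaUpdateOptions through additional_bq_parameters. A hypothetical
# WriteToBigQuery call enabling it could look like:
#
#   beam.io.WriteToBigQuery(
#       table='project:dataset.table',
#       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
#       additional_bq_parameters={
#           'schemaUpdateOptions': [
#               'ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'
#           ],
#       })
#
# ALLOW_FIELD_ADDITION and ALLOW_FIELD_RELAXATION are the standard BigQuery
# load-job schemaUpdateOptions values.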