def create_external_table_hive_partitioning(
        bq_client: bigquery.Client,
        dataset: bigquery.Dataset,
        table_id: str,
        gcs_directory_path: str) -> bigquery.Table:
    """
    Creates an external Parquet table with AUTO hive partitioning in GCS.

    :param bq_client: Client object to bigquery
    :param dataset: dataset object. Check 'get_or_create_dataset' method
    :param table_id: Table to be created
    :param gcs_directory_path: Directory of GCS with the data.
        For example: If you have a structure like this:
        "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
        You should pass the common prefix above the "key=value"
        partition directories: "gs://bucket/images_metadata"
    :return: The created (or already existing) Table object.
    """
    table = bigquery.Table(dataset.table(table_id))

    external_config = bigquery.ExternalConfig(bigquery.SourceFormat.PARQUET)
    # Match every object under the directory; partition keys/values are
    # inferred from the "key=value" path segments (mode AUTO below).
    external_config.source_uris = [f"{gcs_directory_path}/*"]

    hive_part_opt = HivePartitioningOptions()
    hive_part_opt.mode = "AUTO"
    hive_part_opt.source_uri_prefix = gcs_directory_path

    external_config.hive_partitioning = hive_part_opt
    table.external_data_configuration = external_config

    # exists_ok=True makes this idempotent: re-running simply returns the
    # existing table instead of raising Conflict.
    table = bq_client.create_table(table, exists_ok=True)
    return table
def test_hive_partitioning_setter(self):
    """Assigning hive_partitioning stores its API repr under
    load.hivePartitioningOptions; assigning None clears it."""
    from google.cloud.bigquery.external_config import HivePartitioningOptions

    options = HivePartitioningOptions()
    options.source_uri_prefix = "http://foo/bar"
    options.mode = "AUTO"

    config = self._get_target_class()()
    config.hive_partitioning = options

    expected = {"sourceUriPrefix": "http://foo/bar", "mode": "AUTO"}
    self.assertEqual(
        config._properties["load"]["hivePartitioningOptions"], expected
    )

    # Setting the property back to None must reset the stored options.
    config.hive_partitioning = None
    self.assertIsNone(config._properties["load"]["hivePartitioningOptions"])
def create_bq_table(project, gcs_uri, dataset, table,
                    require_hive_partition_filter=True):
    """Create an external CSV table over *gcs_uri* with AUTO hive partitioning.

    The table reads each line verbatim into a single STRING column by using
    a field delimiter (U+00FF) that should never appear in the data and an
    empty quote character.

    :param project: GCP project that owns the dataset.
    :param gcs_uri: GCS directory prefix containing the hive-partitioned data.
    :param dataset: Dataset ID the table is created in.
    :param table: ID of the table to create.
    :param require_hive_partition_filter: When True, queries against the
        table must supply a partition filter, preventing accidental scans
        of the whole table.
    """
    bq_client = bigquery.Client(project=project)
    table_ref = bq_client.dataset(dataset).table(table)
    bq_table = bigquery.Table(table_ref)

    hive_partition_options = HivePartitioningOptions()
    hive_partition_options.mode = "AUTO"
    hive_partition_options.source_uri_prefix = gcs_uri

    # To prevent one from accidentally scanning the whole table, set the
    # partition filter requirement.
    #
    # table.require_partition_filter = True is not supported by the class
    # yet, and neither is
    # hive_partition_options.require_partition_filter = True,
    # so poke the API representation directly.
    # NOTE: _properties holds the REST payload, so the key must be the
    # camelCase API field name "requirePartitionFilter" — the previous
    # snake_case key was not a valid API field and the option never
    # took effect.
    hive_partition_options._properties[
        "requirePartitionFilter"] = require_hive_partition_filter

    extconfig = bigquery.ExternalConfig('CSV')
    extconfig.schema = [bigquery.SchemaField('line', 'STRING')]
    extconfig.options.field_delimiter = u'\u00ff'
    extconfig.options.quote_character = ''
    extconfig.options.allow_jagged_rows = False
    extconfig.options.allow_quoted_newlines = False
    # Tolerate a very large number of malformed rows rather than failing
    # the whole load.
    extconfig.max_bad_records = 10000000
    extconfig.source_uris = [os.path.join(gcs_uri, "*")]
    extconfig.hive_partitioning = hive_partition_options

    bq_table.external_data_configuration = extconfig
    bq_client.create_table(bq_table)