from google.cloud import bigquery
from google.cloud.bigquery.external_config import HivePartitioningOptions


def create_external_table_hive_partitioning(
        bq_client: bigquery.Client, dataset: bigquery.Dataset,
        table_id: str, gcs_directory_path: str) -> bigquery.Table:
    """Creates an external table with AUTO hive partitioning over data in GCS.

    :param bq_client: BigQuery client object.
    :param dataset: Dataset object. See the 'get_or_create_dataset' method.
    :param table_id: ID of the table to be created.
    :param gcs_directory_path: GCS directory containing the data. For example,
        given a structure like
        "gs://bucket/images_metadata/source_id=abc/date=2018-02-20",
        you should pass "gs://bucket/images_metadata".
    :return: The created external table.
    """
    table = bigquery.Table(dataset.table(table_id))

    external_config = bigquery.ExternalConfig(
        bigquery.SourceFormat.PARQUET)
    external_config.source_uris = [f"{gcs_directory_path}/*"]
    hive_part_opt = HivePartitioningOptions()
    hive_part_opt.mode = "AUTO"
    hive_part_opt.source_uri_prefix = gcs_directory_path

    external_config.hive_partitioning = hive_part_opt
    table.external_data_configuration = external_config
    table = bq_client.create_table(table, exists_ok=True)

    return table
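A minimal usage sketch for the function above; the project, dataset, table, and bucket names here are hypothetical placeholders, not values from the original example:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # hypothetical project
dataset = client.create_dataset("images", exists_ok=True)  # hypothetical dataset
table = create_external_table_hive_partitioning(
    client, dataset, "images_metadata", "gs://my-bucket/images_metadata")
# With mode="AUTO", the key=value path segments (source_id, date) are
# inferred as partition columns and become queryable on the external table.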
Example #2
import unittest


class TestLoadJobConfig(unittest.TestCase):
    # Minimal scaffolding so this excerpt runs standalone: the original test
    # suite resolves the class under test via a _get_target_class() helper.
    @staticmethod
    def _get_target_class():
        from google.cloud.bigquery.job import LoadJobConfig

        return LoadJobConfig

    def test_hive_partitioning_setter(self):
        from google.cloud.bigquery.external_config import HivePartitioningOptions

        hive_partitioning = HivePartitioningOptions()
        hive_partitioning.source_uri_prefix = "http://foo/bar"
        hive_partitioning.mode = "AUTO"

        config = self._get_target_class()()
        config.hive_partitioning = hive_partitioning
        self.assertEqual(
            config._properties["load"]["hivePartitioningOptions"],
            {
                "sourceUriPrefix": "http://foo/bar",
                "mode": "AUTO"
            },
        )

        config.hive_partitioning = None
        self.assertIsNone(
            config._properties["load"]["hivePartitioningOptions"])
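The setter under test writes into the "load" section of the job resource, i.e. it belongs to LoadJobConfig. A hedged sketch of the same pattern in an actual load configuration (the URI prefix is a hypothetical placeholder):

from google.cloud import bigquery
from google.cloud.bigquery.external_config import HivePartitioningOptions

hive_partitioning = HivePartitioningOptions()
hive_partitioning.mode = "AUTO"
hive_partitioning.source_uri_prefix = "gs://my-bucket/data"  # hypothetical

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.PARQUET
job_config.hive_partitioning = hive_partitioning
# A subsequent load_table_from_uri(..., job_config=job_config) call would
# serialize this as configuration.load.hivePartitioningOptions, which is
# exactly what the assertions above verify.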
Example #3

import os

from google.cloud import bigquery
from google.cloud.bigquery.external_config import HivePartitioningOptions


def create_bq_table(project,
                    gcs_uri,
                    dataset,
                    table,
                    require_hive_partition_filter=True):
    bq_client = bigquery.Client(project=project)
    table_ref = bq_client.dataset(dataset).table(table)
    table = bigquery.Table(table_ref)

    hive_partition_options = HivePartitioningOptions()
    hive_partition_options.mode = "AUTO"
    hive_partition_options.source_uri_prefix = gcs_uri

    # To prevent accidentally scanning the whole table, require queries to
    # filter on a partition column.
    #
    # Neither table.require_partition_filter = True nor
    # hive_partition_options.require_partition_filter = True is supported
    # by the client library classes yet, so write the raw API property
    # directly (camelCase, matching the REST resource field name):
    hive_partition_options._properties[
        "requirePartitionFilter"] = require_hive_partition_filter

    # Treat each raw line as a single STRING column by choosing a field
    # delimiter and quote character that never appear in the data.
    extconfig = bigquery.ExternalConfig('CSV')
    extconfig.schema = [bigquery.SchemaField('line', 'STRING')]
    extconfig.options.field_delimiter = u'\u00ff'
    extconfig.options.quote_character = ''
    # extconfig.compression = 'GZIP'
    extconfig.options.allow_jagged_rows = False
    extconfig.options.allow_quoted_newlines = False
    extconfig.max_bad_records = 10000000
    extconfig.source_uris = [os.path.join(gcs_uri, "*")]
    extconfig.hive_partitioning = hive_partition_options

    table.external_data_configuration = extconfig

    bq_client.create_table(table)
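A hedged usage sketch for create_bq_table; all names are hypothetical placeholders. Because the table is created with requirePartitionFilter set, queries against it must constrain at least one inferred partition column:

create_bq_table(
    project="my-project",
    gcs_uri="gs://my-bucket/logs",
    dataset="raw",
    table="logs_external",
)

# Assuming the GCS layout contains date=... path segments, a query must
# filter on that inferred column or BigQuery rejects it:
bq_client = bigquery.Client(project="my-project")
query = """
    SELECT line
    FROM `my-project.raw.logs_external`
    WHERE date = "2018-02-20"
"""
rows = bq_client.query(query).result()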