    def _process_response_for_bigquery(self, table_reference: TableReference,
                                       rankings: Sequence[dict]):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')

        job_config.schema = (
            SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
            SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('project_id', SqlTypeNames.INTEGER, 'REQUIRED'),
            SchemaField('prev_rank', SqlTypeNames.INTEGER),
            SchemaField('keyword', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('keyword_group', SqlTypeNames.STRING, 'REPEATED'),
            SchemaField('ranking', SqlTypeNames.INTEGER),
            SchemaField('ranking_change', SqlTypeNames.INTEGER, 'REQUIRED'),
            SchemaField('rank', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('scalar_type', SqlTypeNames.STRING, 'REQUIRED'),
        )

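        # datetime values are not JSON-serializable, so format them as
        # BigQuery DATETIME strings before passing rows to load_table_from_json.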
        for ranking in rankings:
            ranking['date'] = ranking['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')

        load_job = self.bigquery.client.load_table_from_json(
            rankings, table_reference, job_config=job_config)
        load_job.result()
Example #2
def create_bq_table(table_id,
                    dataset,
                    schema=None,
                    partition_field=None,
                    cluster_by=()):
    """
    Create a BigQuery table.
    """

    client, table_ref = get_bq_client(table_id, dataset)
    table_def = bigquery.Table(table_ref, schema=schema)

    if partition_field:
        _tp = TimePartitioning(type_=TimePartitioningType.DAY,
                               field=partition_field)
        table_def.time_partitioning = _tp

    if cluster_by:
        table_def.clustering_fields = cluster_by

    try:
        client.create_table(table_def)
    except google.api_core.exceptions.Conflict:
        logging.info('{}: BigQuery table already exists.'.format(table_id))
        return

    logging.info('{}: table created.'.format(table_id))
Example #3
    def test_time_partitioning_setter(self):
        from google.cloud.bigquery.table import TimePartitioning
        from google.cloud.bigquery.table import TimePartitioningType

        field = "creation_date"
        year_ms = 86400 * 1000 * 365

        with warnings.catch_warnings(record=True) as warned:
            time_partitioning = TimePartitioning(
                type_=TimePartitioningType.DAY,
                field=field,
                expiration_ms=year_ms,
                require_partition_filter=False,
            )

        config = self._get_target_class()()
        config.time_partitioning = time_partitioning
        expected = {
            "type": TimePartitioningType.DAY,
            "field": field,
            "expirationMs": str(year_ms),
            "requirePartitionFilter": False,
        }
        self.assertEqual(config._properties["load"]["timePartitioning"],
                         expected)

        assert len(warned) == 1
        warning = warned[0]
        assert "TimePartitioning.require_partition_filter" in str(warning)
Example #4
    def _process_responses_for_bigquery(self, data: Sequence[dict], table_reference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(type_=TimePartitioningType.DAY, field='date')

        job_config.schema = (
            SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('cluster', SqlTypeNames.STRING),
            SchemaField('name', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('query', SqlTypeNames.STRING, 'REQUIRED'),
            SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
            SchemaField('elements', SqlTypeNames.RECORD, 'REPEATED', fields=(
                SchemaField('content', SqlTypeNames.STRING),
                SchemaField('operation', SqlTypeNames.STRING),
                SchemaField('result', SqlTypeNames.STRING),
            )),
        )

        for data_item in data:
            data_item['date'] = data_item['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')

            for element in data_item['elements']:
                if element['result'] is not None:
                    element['result'] = str(element['result'])

        load_job = self.bigquery.client.load_table_from_json(data, table_reference, job_config=job_config)
        load_job.result()
Example #5
    def time_partitioning(self):
        """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
        partitioning for the destination table.

        Only specify at most one of
        :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or
        :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
        """
        prop = self._get_sub_prop("timePartitioning")
        if prop is not None:
            prop = TimePartitioning.from_api_repr(prop)
        return prop
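The docstring above notes that time_partitioning and range_partitioning are mutually exclusive on a job config. As a rough usage sketch (not taken from the projects above; the 'created_at' field name is illustrative):

from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType

job_config = LoadJobConfig()
# The setter stores the API representation on the config; the getter rebuilds
# a TimePartitioning object from it via from_api_repr.
job_config.time_partitioning = TimePartitioning(
    type_=TimePartitioningType.DAY, field='created_at')
print(job_config.time_partitioning.field)  # created_at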
Example #6
def _preprocess_config(cfg):
    destination_encryption_configuration = cfg.get(
        'destination_encryption_configuration')
    time_partitioning = cfg.get('time_partitioning')

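    # Replace the raw config values with the client-library objects the job config expects.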
    if destination_encryption_configuration is not None:
        cfg['destination_encryption_configuration'] = EncryptionConfiguration(
            kms_key_name=destination_encryption_configuration)

    if time_partitioning is not None:
        cfg['time_partitioning'] = TimePartitioning(**time_partitioning)

    return cfg
Example #7
    def _process_response_rows_for_bigquery(self, dataframe: DataFrame,
                                            table_reference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column)
            for column in list(dataframe.columns.values)
        ]

        load_job = self.bigquery.client.load_table_from_dataframe(
            dataframe, table_reference, job_config=job_config)

        load_job.result()
Example #8
def test_get_indexes(faux_conn):
    from google.cloud.bigquery.table import TimePartitioning

    cursor = faux_conn.connection.cursor()
    cursor.execute("create table foo (x INT64)")
    assert faux_conn.dialect.get_indexes(faux_conn, "foo") == []

    client = faux_conn.connection._client
    client.tables.foo.time_partitioning = TimePartitioning(field="tm")
    client.tables.foo.clustering_fields = ["user_email", "store_code"]

    assert faux_conn.dialect.get_indexes(faux_conn, "foo") == [
        dict(name="partition", column_names=["tm"], unique=False,),
        dict(
            name="clustering", column_names=["user_email", "store_code"], unique=False,
        ),
    ]
Example #9
def main():
    args = parse_args()
    dest_table = '{}.{}.{}'.format(args.project, args.dataset, args.table)

    # Need to do this for each folder separately since we might have different
    # schemas. 2013 doesn't have enough data, so we skip it for now.
    folders = list(
        set([
            blob.id.split('/')[2]
            for blob in storage_client.list_blobs(args.bucket,
                                                  prefix='hard-drive-failure')
            if '2013' not in blob.id
        ]))

    file_patterns = [
        "gs://{0}/hard-drive-failure/{1}/*.csv".format(args.bucket, folder)
        for folder in folders
    ]

    for file_pattern, folder in zip(file_patterns, folders):
        print("{:=^80}".format(" Loading Data for {} ".format(folder)))
        # Update the schema for every folder
        schema = get_schema(args.bucket, folder)

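        # Folders may carry slightly different schemas, so let the load job add
        # or relax fields instead of failing on a mismatch.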
        job_config = bigquery.LoadJobConfig(
            skip_leading_rows=1,
            schema_update_options=[
                'ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'
            ],
            schema=schema,
            clustering_fields=['model', 'serial_number'])
        # Only works when I defer setting it for some reason?
        job_config.time_partitioning = TimePartitioning(field='date')

        load_job = client.load_table_from_uri(file_pattern,
                                              dest_table,
                                              job_config=job_config)
        load_job.result()  # Wait for job to finish

        print("{:-^80}".format(" Data Successfully Loaded "))

    print("SUCCESS")
Example #10
    def _process_responses_for_bigquery(self, responses: Sequence[dict],
                                        schema: Sequence[SchemaField],
                                        table_reference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='request_date')
        job_config.schema = schema

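        # Normalize whichever datetime fields are present into DATETIME-formatted strings.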
        for response in responses:
            if 'request_date' in response:
                response['request_date'] = response['request_date'].strftime(
                    '%Y-%m-%dT%H:%M:%S.%f')
            if 'date' in response:
                response['date'] = response['date'].strftime(
                    '%Y-%m-%dT%H:%M:%S.%f')

        load_job = self.bigquery.client.load_table_from_json(
            responses, table_reference, job_config=job_config)
        load_job.result()
Example #11
    def _process_response_rows_for_bigquery(self, rows: Sequence[dict],
                                            methods: Sequence[dict],
                                            table_reference: TableReference):
        rows_dataframe = DataFrame.from_records(rows)
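        # Drop the time component so the 'date' partitioning column holds calendar dates.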
        rows_dataframe['date'] = rows_dataframe['date'].apply(
            lambda x: x.date())

        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column, methods)
            for column in list(rows_dataframe.columns.values)
        ]

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                rows_dataframe, table_reference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example #12
    def apply(self, bq_resource: BQTable) -> BQTable:
        new_resource = deepcopy(bq_resource)
        new_resource.time_partitioning = TimePartitioning(
            type_=self.type_.value, field=self.field)
        return new_resource