def _process_response_for_bigquery(self, table_reference: TableReference,
                                   rankings: Sequence[dict]):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = (
        SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
        SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('project_id', SqlTypeNames.INTEGER, 'REQUIRED'),
        SchemaField('prev_rank', SqlTypeNames.INTEGER),
        SchemaField('keyword', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('keyword_group', SqlTypeNames.STRING, 'REPEATED'),
        SchemaField('ranking', SqlTypeNames.INTEGER),
        SchemaField('ranking_change', SqlTypeNames.INTEGER, 'REQUIRED'),
        SchemaField('rank', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('scalar_type', SqlTypeNames.STRING, 'REQUIRED'),
    )
    # The JSON loader cannot serialize datetime objects directly, so
    # convert the DATETIME column to an ISO-format string first.
    for ranking in rankings:
        ranking['date'] = ranking['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')
    load_job = self.bigquery.client.load_table_from_json(
        rankings, table_reference, job_config=job_config)
    load_job.result()  # Block until the load job completes.
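# A hypothetical `rankings` row shaped for the schema above (all values
# invented for illustration); the datetime is converted to an ISO string
# by the strftime loop before load_table_from_json runs.
from datetime import datetime

sample_ranking = {
    'date': datetime(2021, 6, 1, 12, 30),  # becomes '2021-06-01T12:30:00.000000'
    'url': 'https://example.com/page',
    'project_id': 42,
    'prev_rank': 3,                        # NULLABLE, may be None
    'keyword': 'bigquery partitioning',
    'keyword_group': ['seo', 'rankings'],  # REPEATED -> list of strings
    'ranking': 2,
    'ranking_change': 1,
    'rank': '2',
    'scalar_type': 'organic',
}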
def create_bq_table(table_id, dataset, schema=None, partition_field=None,
                    cluster_by=()):
    """Create a BigQuery table."""
    client, table_ref = get_bq_client(table_id, dataset)
    table_def = bigquery.Table(table_ref, schema=schema)
    if partition_field:
        table_def.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field=partition_field)
    if cluster_by:
        table_def.clustering_fields = list(cluster_by)
    try:
        client.create_table(table_def)
    except google.api_core.exceptions.Conflict:
        logging.info('{}: BigQuery table already exists.'.format(table_id))
    else:
        # Only log success when the table was actually created.
        logging.info('{}: table created.'.format(table_id))
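# A minimal usage sketch for create_bq_table; the table/dataset names and
# fields are assumptions, and get_bq_client is expected to return a
# (client, TableReference) pair as the function requires.
from google.cloud import bigquery

create_bq_table(
    'events',
    'analytics',
    schema=[
        bigquery.SchemaField('event_ts', 'TIMESTAMP', mode='REQUIRED'),
        bigquery.SchemaField('user_id', 'STRING'),
    ],
    partition_field='event_ts',
    cluster_by=['user_id'],
)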
def test_time_partitioning_setter(self):
    from google.cloud.bigquery.table import TimePartitioning
    from google.cloud.bigquery.table import TimePartitioningType

    field = "creation_date"
    year_ms = 86400 * 1000 * 365

    with warnings.catch_warnings(record=True) as warned:
        time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY,
            field=field,
            expiration_ms=year_ms,
            require_partition_filter=False,
        )

    config = self._get_target_class()()
    config.time_partitioning = time_partitioning

    expected = {
        "type": TimePartitioningType.DAY,
        "field": field,
        "expirationMs": str(year_ms),
        "requirePartitionFilter": False,
    }
    self.assertEqual(config._properties["load"]["timePartitioning"], expected)

    assert len(warned) == 1
    warning = warned[0]
    assert "TimePartitioning.require_partition_filter" in str(warning)
def _process_responses_for_bigquery(self, data: Sequence[dict],
                                    table_reference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = (
        SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('cluster', SqlTypeNames.STRING),
        SchemaField('name', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('query', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
        SchemaField('elements', SqlTypeNames.RECORD, 'REPEATED', fields=(
            SchemaField('content', SqlTypeNames.STRING),
            SchemaField('operation', SqlTypeNames.STRING),
            SchemaField('result', SqlTypeNames.STRING),
        )),
    )
    # Serialize datetimes and coerce nested results to strings so the
    # rows are JSON-encodable.
    for data_item in data:
        data_item['date'] = data_item['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')
        for element in data_item['elements']:
            if element['result'] is not None:
                element['result'] = str(element['result'])
    load_job = self.bigquery.client.load_table_from_json(
        data, table_reference, job_config=job_config)
    load_job.result()
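# A hypothetical `data` item for the nested schema above (values invented),
# showing that the REPEATED RECORD column maps to a list of dicts whose keys
# match the sub-fields content/operation/result.
from datetime import datetime

sample_item = {
    'url': 'https://example.com/',
    'cluster': 'landing-pages',    # NULLABLE
    'name': 'title-check',
    'query': 'site title',
    'date': datetime(2021, 6, 1),  # serialized by the strftime loop
    'elements': [
        {'content': '<title>Example</title>', 'operation': 'extract',
         'result': 'Example'},
        {'content': None, 'operation': 'count', 'result': None},
    ],
}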
def time_partitioning(self):
    """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies
    time-based partitioning for the destination table.

    Only specify at most one of
    :attr:`~google.cloud.bigquery.job.LoadJobConfig.time_partitioning` or
    :attr:`~google.cloud.bigquery.job.LoadJobConfig.range_partitioning`.
    """
    prop = self._get_sub_prop("timePartitioning")
    if prop is not None:
        prop = TimePartitioning.from_api_repr(prop)
    return prop
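# A small round-trip sketch of the API representation this getter decodes,
# using only public TimePartitioning methods from google-cloud-bigquery.
from google.cloud.bigquery.table import TimePartitioning

api_repr = {'type': 'DAY', 'field': 'date', 'expirationMs': '86400000'}
tp = TimePartitioning.from_api_repr(api_repr)
assert tp.field == 'date'
assert tp.expiration_ms == 86400000  # stored as a string, surfaced as an int
assert tp.to_api_repr() == api_repr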
def _preprocess_config(cfg):
    destination_encryption_configuration = cfg.get(
        'destination_encryption_configuration')
    time_partitioning = cfg.get('time_partitioning')

    if destination_encryption_configuration is not None:
        cfg['destination_encryption_configuration'] = EncryptionConfiguration(
            kms_key_name=destination_encryption_configuration)

    if time_partitioning is not None:
        cfg['time_partitioning'] = TimePartitioning(**time_partitioning)

    return cfg
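# A usage sketch for _preprocess_config with invented values, showing plain
# dicts being upgraded to client-library objects (the KMS key path is a
# placeholder).
from google.cloud.bigquery import EncryptionConfiguration, TimePartitioning

cfg = _preprocess_config({
    'time_partitioning': {'type_': 'DAY', 'field': 'created_at'},
    'destination_encryption_configuration':
        'projects/my-project/locations/us/keyRings/ring/cryptoKeys/key',
})
assert isinstance(cfg['time_partitioning'], TimePartitioning)
assert isinstance(cfg['destination_encryption_configuration'],
                  EncryptionConfiguration)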
def _process_response_rows_for_bigquery(self, dataframe: DataFrame,
                                        table_reference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = [
        self._get_schema_for_field(column)
        for column in list(dataframe.columns.values)
    ]
    load_job = self.bigquery.client.load_table_from_dataframe(
        dataframe, table_reference, job_config=job_config)
    load_job.result()
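# _get_schema_for_field is not shown above; a hypothetical version of such a
# method might map column names to SchemaField types like this (the mapping
# itself is an assumption, not the original helper).
def _get_schema_for_field(self, column: str) -> SchemaField:
    type_by_column = {
        'date': SqlTypeNames.DATE,
        'clicks': SqlTypeNames.INTEGER,
        'ctr': SqlTypeNames.FLOAT,
    }
    # Fall back to STRING for columns without an explicit mapping.
    return SchemaField(column, type_by_column.get(column, SqlTypeNames.STRING))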
def test_get_indexes(faux_conn):
    from google.cloud.bigquery.table import TimePartitioning

    cursor = faux_conn.connection.cursor()
    cursor.execute("create table foo (x INT64)")
    assert faux_conn.dialect.get_indexes(faux_conn, "foo") == []

    client = faux_conn.connection._client
    client.tables.foo.time_partitioning = TimePartitioning(field="tm")
    client.tables.foo.clustering_fields = ["user_email", "store_code"]

    assert faux_conn.dialect.get_indexes(faux_conn, "foo") == [
        dict(name="partition", column_names=["tm"], unique=False),
        dict(
            name="clustering",
            column_names=["user_email", "store_code"],
            unique=False,
        ),
    ]
def main():
    args = parse_args()
    dest_table = '{}.{}.{}'.format(args.project, args.dataset, args.table)

    # Need to do this for each folder separately since we might have
    # different schemas. 2013 doesn't have enough data, so we skip it
    # for now.
    folders = list(
        set([
            blob.id.split('/')[2]
            for blob in storage_client.list_blobs(
                args.bucket, prefix='hard-drive-failure')
            if '2013' not in blob.id
        ]))
    file_patterns = [
        "gs://{0}/hard-drive-failure/{1}/*.csv".format(args.bucket, folder)
        for folder in folders
    ]

    for file_pattern, folder in zip(file_patterns, folders):
        print("{:=^80}".format(" Loading Data for {} ".format(folder)))
        # Update the schema for every folder.
        schema = get_schema(args.bucket, folder)
        job_config = bigquery.LoadJobConfig(
            skip_leading_rows=1,
            schema_update_options=[
                'ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'
            ],
            schema=schema,
            clustering_fields=['model', 'serial_number'])
        # Only works when I defer setting it, for some reason?
        job_config.time_partitioning = TimePartitioning(field='date')
        load_job = client.load_table_from_uri(file_pattern, dest_table,
                                              job_config=job_config)
        load_job.result()  # Wait for job to finish
        print("{:-^80}".format(" Data Successfully Loaded "))

    print("SUCCESS")
def _process_responses_for_bigquery(self, responses: Sequence[dict],
                                    schema: Sequence[SchemaField],
                                    table_reference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='request_date')
    job_config.schema = schema

    for response in responses:
        if 'request_date' in response:
            response['request_date'] = response['request_date'].strftime(
                '%Y-%m-%dT%H:%M:%S.%f')
        if 'date' in response:
            response['date'] = response['date'].strftime(
                '%Y-%m-%dT%H:%M:%S.%f')

    load_job = self.bigquery.client.load_table_from_json(
        responses, table_reference, job_config=job_config)
    load_job.result()
def _process_response_rows_for_bigquery(self, rows: Sequence[dict],
                                        methods: Sequence[dict],
                                        table_reference: TableReference):
    rows_dataframe = DataFrame.from_records(rows)
    # Truncate datetimes to plain dates for the partition column.
    rows_dataframe['date'] = rows_dataframe['date'].apply(lambda x: x.date())

    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = [
        self._get_schema_for_field(column, methods)
        for column in list(rows_dataframe.columns.values)
    ]

    try:
        load_job = self.bigquery.client.load_table_from_dataframe(
            rows_dataframe, table_reference, job_config=job_config)
        load_job.result()
    except BadRequest as error:
        print(error.errors)
def apply(self, bq_resource: BQTable) -> BQTable:
    new_resource = deepcopy(bq_resource)
    new_resource.time_partitioning = TimePartitioning(
        type_=self.type_.value, field=self.field)
    return new_resource
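# apply() above is a method; a minimal sketch of an enclosing transformer
# (the class and enum names here are assumptions, not the original code) and
# how it might be used:
import enum
from copy import deepcopy
from dataclasses import dataclass

from google.cloud.bigquery.table import Table as BQTable
from google.cloud.bigquery.table import TimePartitioning, TimePartitioningType


class PartitioningType(enum.Enum):
    DAY = TimePartitioningType.DAY
    HOUR = TimePartitioningType.HOUR


@dataclass
class SetTimePartitioning:
    type_: PartitioningType
    field: str

    def apply(self, bq_resource: BQTable) -> BQTable:
        # Copy the table definition so the original object is left untouched.
        new_resource = deepcopy(bq_resource)
        new_resource.time_partitioning = TimePartitioning(
            type_=self.type_.value, field=self.field)
        return new_resource


# e.g. partitioned = SetTimePartitioning(PartitioningType.DAY, 'date').apply(table)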