def execute(self, context):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s ; Max Results: %s',
                  self.dataset_id, self.table_id, self.max_results)

    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)

    conn = hook.get_conn()
    cursor = conn.cursor()
    response = cursor.get_tabledata(dataset_id=self.dataset_id,
                                    table_id=self.table_id,
                                    max_results=self.max_results,
                                    selected_fields=self.selected_fields)

    self.log.info('Total Extracted rows: %s', response['totalRows'])
    rows = response['rows']

    table_data = []
    for dict_row in rows:
        single_row = []
        for fields in dict_row['f']:
            single_row.append(fields['v'])
        table_data.append(single_row)

    return table_data
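# For reference, a minimal sketch (assumed, based on the tabledata.list-style
# response unpacked above) of the input and output shapes; the values are
# illustrative only.
sample_response = {
    'totalRows': '2',
    'rows': [
        {'f': [{'v': 'alice'}, {'v': '42'}]},
        {'f': [{'v': 'bob'}, {'v': '7'}]},
    ],
}
# The nested loop above flattens this to [['alice', '42'], ['bob', '7']].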
def execute(self, context):
    logging.info('Deleting: %s', self.deletion_dataset_table)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_table_delete(self.deletion_dataset_table, self.ignore_if_missing)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_external_table(
        external_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        compression=self.compression,
        skip_leading_rows=self.skip_leading_rows,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        src_fmt_configs=self.src_fmt_configs,
        labels=self.labels
    )
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    schema_fields = self.schema_fields if self.schema_fields \
        else json.loads(gcs_hook.download(self.bucket, self.schema_object))
    # the original lambda bound each source object to a variable misleadingly
    # named schema_object; a list comprehension makes the intent explicit
    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]

    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        logging.info('Loaded BQ data with max {}.{}={}'.format(
            self.destination_project_dataset_table, self.max_id_key, max_id))
        return max_id
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.sql)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to,
            location=self.location,
        )
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        sql=self.sql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        flatten_results=self.flatten_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        maximum_bytes_billed=self.maximum_bytes_billed,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        labels=self.labels,
        schema_update_options=self.schema_update_options,
        priority=self.priority,
        time_partitioning=self.time_partitioning,
        api_resource_configs=self.api_resource_configs,
        cluster_fields=self.cluster_fields,
    )
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.gcs_schema_object:
        gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            gcs_bucket, gcs_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.create_empty_table(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        schema_fields=schema_fields,
        time_partitioning=self.time_partitioning,
        labels=self.labels
    )
def poke(self, context):
    table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
    self.log.info('Sensor checks existence of table: %s', table_uri)
    hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to)
    return hook.table_exists(self.project_id, self.dataset_id, self.table_id)
def execute(self, context):
    logging.info('Executing: %s', self.bql)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(self.bql, self.destination_dataset_table,
                     self.write_disposition, self.allow_large_results,
                     self.udf_config, self.use_legacy_sql)
def execute(self, context):
    logging.info('Executing copy of %s into: %s',
                 self.source_project_dataset_tables,
                 self.destination_project_dataset_table)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_copy(
        self.source_project_dataset_tables,
        self.destination_project_dataset_table,
        self.write_disposition,
        self.create_disposition)
def execute(self, context):
    logging.info('Executing extract of %s into: %s',
                 self.source_dataset_table,
                 self.destination_cloud_storage_uris)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    hook.run_extract(
        self.source_dataset_table,
        self.destination_cloud_storage_uris,
        self.compression,
        self.export_format,
        self.field_delimiter,
        self.print_header)
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.bql)
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        self.bql,
        self.destination_dataset_table,
        self.write_disposition,
        self.allow_large_results,
        self.udf_config,
        self.use_legacy_sql,
        self.maximum_billing_tier,
        self.create_disposition,
        self.query_params)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.create_empty_dataset(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        dataset_reference=self.dataset_reference)
def execute(self, context):
    self.log.info('Executing extract of %s into: %s',
                  self.source_project_dataset_table,
                  self.destination_cloud_storage_uris)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
        self.compression,
        self.export_format,
        self.field_delimiter,
        self.print_header)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    datasets = cursor.get_datasets_list(project_id=self.project_id)
    dataset_ids = [d['datasetReference']['datasetId'] for d in datasets]

    if self.dataset_id not in dataset_ids:
        cursor.create_empty_dataset(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            dataset_reference=self.dataset_reference)
def init(self):
    self.log.info("init() started")

    # bucket connection
    self.gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.gcs_conn_id, delegate_to=None)

    # bigquery connection (reuses the same GCP connection id as the bucket)
    self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcs_conn_id,
                                use_legacy_sql=False)
    bq_conn = self.bq_hook.get_conn()
    self.bq_cursor = bq_conn.cursor()

    # geotab connection (credentials are masked in the log output)
    self.geotab_hook = GeotabHook(geotab_conn_id=self.geotab_conn_id)
    params = self.geotab_hook.get_connection(self.geotab_conn_id)
    self.log.info("login: ******, password: ******, schema: %s", params.schema)
def execute(self, context):
    self.log.info('Fetching last partition from tables: %s', str(self.table_lst))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    return_dict = {}
    for each_table in self.table_lst:
        # each_table is fully qualified as 'project:dataset.table'
        project = each_table.split(':')[0]
        dataset = each_table.split(':')[1].split('.')[0]
        table_name = each_table.split(':')[1].split('.')[1]
        lp = sorted(hook.table_list_partition(project, dataset, table_name))[-1]
        return_dict[each_table] = lp
        self.log.info("Table %s has latest partition: %s", each_table, lp)
    return return_dict
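# Illustrative input for the snippet above: table_lst holds fully qualified
# names in 'project:dataset.table' form (these example values are made up).
table_lst = ['my-project:analytics.events', 'my-project:analytics.sessions']
# The returned mapping would then look roughly like
# {'my-project:analytics.events': '20240101', ...}.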
def execute(self, context):
    self.log.info(
        'Executing copy of %s into: %s',
        self.source_project_dataset_tables,
        self.destination_project_dataset_table
    )
    hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_copy(
        source_project_dataset_tables=self.source_project_dataset_tables,
        destination_project_dataset_table=self.destination_project_dataset_table,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        labels=self.labels,
        encryption_configuration=self.encryption_configuration)
def execute(self, context):
    self.log.info('Executing extract of %s into: %s',
                  self.source_project_dataset_table,
                  self.destination_cloud_storage_uris)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(
        source_project_dataset_table=self.source_project_dataset_table,
        destination_cloud_storage_uris=self.destination_cloud_storage_uris,
        compression=self.compression,
        export_format=self.export_format,
        field_delimiter=self.field_delimiter,
        print_header=self.print_header,
        labels=self.labels)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(
            gcs_hook.download(self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = [
        'gs://{}/{}'.format(self.bucket, source_object)
        for source_object in self.source_objects
    ]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        schema_update_options=self.schema_update_options,
        src_fmt_configs=self.src_fmt_configs,
        time_partitioning=self.time_partitioning)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info('Loaded BQ data with max %s.%s=%s',
                      self.destination_project_dataset_table,
                      self.max_id_key, max_id)
        return max_id
def save_hash_reference(input_rows, project_id=None, dataset=None, table=None, schema=None):
    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default', use_legacy_sql=False)
    gcp_credentials = bq_hook._get_credentials()
    bq_client = bigquery.Client(credentials=gcp_credentials, project=project_id)

    target_dataset_ref = bigquery.DatasetReference(project=project_id, dataset_id=dataset)
    try:
        target_dataset = bq_client.get_dataset(dataset_ref=target_dataset_ref)
        print("Dataset found: ", target_dataset)
    except NotFound:
        # LOGGER.info(f"Dataset '{target_dataset_ref}' not found, attempting to create.")
        print("Dataset not found")
        target_dataset = bq_client.create_dataset(dataset=target_dataset_ref)
        print("Dataset created: ", target_dataset)

    target_table_ref = bigquery.TableReference(dataset_ref=target_dataset, table_id=table)
    try:
        target_table = bq_client.get_table(table=target_table_ref)
        print("Table found: ", target_table)
    except NotFound:
        print("Table not found")
        t = bigquery.Table(table_ref=target_table_ref, schema=schema)
        target_table = bq_client.create_table(table=t)
        print("Table created: ", target_table)

    insert_rows = {
        "timestamp": datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f'),
        "s3_data": input_rows
    }
    print("Rows to insert: ", input_rows)
    print("Target Table: ", target_table)
    # insert_rows_json returns a list of per-row errors (empty on success)
    errors = bq_client.insert_rows_json(table=target_table, json_rows=[insert_rows])
    print(errors)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        schema_update_options=self.schema_update_options,
        src_fmt_configs=self.src_fmt_configs,
        time_partitioning=self.time_partitioning)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table, self.max_id_key, max_id
        )
        return max_id
class GoogleDisplayVideo360ERFToBigQueryOperator(BaseOperator):
    """Upload Multiple Entity Read Files to specified big query dataset."""

    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 report_body=None,
                 yesterday=False,
                 entity_type=None,
                 file_creation_date=None,
                 cloud_project_id=None,
                 bq_table=None,
                 schema=None,
                 gcs_bucket=None,
                 erf_bucket=None,
                 partner_ids=None,
                 write_disposition='WRITE_TRUNCATE',
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360ERFToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.service = None
        self.bq_hook = None
        self.gcs_hook = None
        self.report_body = report_body
        self.erf_bucket = erf_bucket
        self.yesterday = yesterday
        self.cloud_project_id = cloud_project_id
        self.bq_table = bq_table
        self.gcs_bucket = gcs_bucket
        self.schema = schema
        self.entity_type = entity_type
        self.erf_object = 'entity/%s.0.%s.json' % (file_creation_date, entity_type)
        # avoid a mutable default argument; fall back to an empty list
        self.partner_ids = partner_ids or []
        self.write_disposition = write_disposition
        self.file_creation_date = file_creation_date

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)

        for i, partner_id in enumerate(self.partner_ids):
            filename = erf_utils.download_and_transform_erf(self, partner_id)
            entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
            if i > 0:
                self.write_disposition = 'WRITE_APPEND'

            bq_base_cursor = self.bq_hook.get_conn().cursor()
            bq_base_cursor.run_load(
                destination_project_dataset_table=self.bq_table,
                schema_fields=self.schema,
                source_uris=[entity_read_file_ndj],
                source_format='NEWLINE_DELIMITED_JSON',
                write_disposition=self.write_disposition)
            self.gcs_hook.delete(self.gcs_bucket, filename)
def execute_big_queries(bigquery_conn_id, multi_sqls, sql_separator=";",
                        use_legacy_sql=False, **kwargs):
    hook = BigQueryHook(bigquery_conn_id=bigquery_conn_id)
    conn = hook.get_conn()
    logging.info("Execute : " + multi_sqls)
    for sql in multi_sqls.split(sql_separator):
        cursor = conn.cursor()
        # pass the caller's use_legacy_sql through instead of hard-coding False
        cursor.run_query(bql=sql,
                         destination_dataset_table=None,
                         allow_large_results=True,
                         use_legacy_sql=use_legacy_sql)
    logging.info("Execute : Done")
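# A hypothetical call to the helper above, e.g. from a PythonOperator's
# python_callable; the connection id and statements are illustrative only.
execute_big_queries(
    bigquery_conn_id='bigquery_default',
    multi_sqls="DELETE FROM `proj.ds.t` WHERE dt < '2020-01-01'; "
               "INSERT INTO `proj.ds.t_archive` SELECT * FROM `proj.ds.t`",
)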
def execute(self, context):
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        use_legacy_sql=self.use_legacy_sql,
        location=self.location,
    )
    records = self.run_query(project=hook._get_field("project"),
                             credentials=hook._get_credentials())
    if not records:
        raise AirflowException("Query returned no results.")
    elif not all(bool(record) for record in records):
        raise AirflowException(
            f"Test failed\nQuery: {self.sql}\nRecords: {records}")
    self.log.info(f"Test passed\nQuery: {self.sql}\nRecords: {records}")
def execute(self, context):
    self.log.info('Fetching Data from:')
    self.log.info('Query: %s', self.sql)

    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)

    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    response = cursor.fetchmany(self.max_rows)

    self.log.info('Total Extracted rows: %s', len(response))
    self.log.info('Response: %s', response)
    return response
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.bql)
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            use_legacy_sql=self.use_legacy_sql,
                            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        self.bql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        schema_update_options=self.schema_update_options)
def execute(self, context):
    # TODO: fetch the schema from the hub if no schema is passed as an argument
    sql = self.sql.format(table=self.destination_project_dataset_table_id,
                          staging=self.staging_project_dataset_table_id,
                          cols=','.join(self.schema),
                          hash=self.schema[0])
    self.log.info('Executing SQL:\n%s', sql)
    self.hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                             use_legacy_sql=False)
    self.conn = self.hook.get_conn()
    self.cursor = self.conn.cursor()
    self.cursor.run_query(sql)
def bq_to_pubsub_query_executor(**kwargs):
    """Executes a custom detector query in BigQuery and passes the results to the next task."""
    query = kwargs['templates_dict']['query']
    logging.info(query)
    bigquery_hook = BigQueryHook(use_legacy_sql=False)
    df = bigquery_hook.get_pandas_df(sql=query)
    messages = [{
        'data': b64e(row.to_json().encode()).decode()
    } for index, row in df.iterrows()]
    # split the messages into chunks of 1000 (PubSub publish limit)
    messages_chunks = chunks(messages, 1000)
    pubsub_hook = PubSubHook()
    for chunk in messages_chunks:
        pubsub_hook.publish(project=gcp_project, topic=pubsub_topic, messages=chunk)
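# The chunks() helper used above is not defined in this snippet; a minimal
# sketch of the assumed behaviour (split a list into fixed-size slices,
# matching the 1000-message PubSub publish limit mentioned above):
def chunks(items, size):
    """Yield successive slices of at most `size` elements from items."""
    for start in range(0, len(items), size):
        yield items[start:start + size]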
def execute(self, context): """ This method check the quality of the data given the input test cases indicated in the sql_test_cases dictionary. """ bigquery = BigQueryHook(bigquery_conn_id=self.conn_id) found_errors = [] for query, expected_result in self.sql_test_cases.items(): records = bigquery.run_query(sql=query) if len(records) < 1 or records[0][0] != expected_result: found_errors.append(query) if len(found_errors) > 0: raise ValueError( f"The following query test cases were not successful {found_errors}" ) self.log.info('DataQualityOperator has been executed')
def execute(self, context):
    dst_table_name = format_table_name(self.dst_table_name, is_staging=True)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    bucket = get_bucket()
    src_uris = f"{bucket}/{self.src_uris}"
    # run_load expects source_uris as a list, so wrap the single URI
    cursor.run_load(
        dst_table_name,
        source_uris=[src_uris],
        schema_fields=self.schema_fields,
        autodetect=self.autodetect,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
    )
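# format_table_name() and get_bucket() are project helpers that are not
# shown here; minimal sketches of the assumed behaviour (the Variable name
# and staging convention are guesses for illustration):
from airflow.models import Variable

def format_table_name(table_name, is_staging=False):
    # route staging loads into a dedicated staging dataset
    return 'staging.' + table_name if is_staging else table_name

def get_bucket():
    # e.g. 'gs://my-bucket', read from an Airflow Variable
    return Variable.get('gcs_bucket')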
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.bql)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        self.bql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        schema_update_options=self.schema_update_options)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object:
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(
            gcs_hook.download(self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = [
        'gs://{}/{}'.format(self.bucket, source_object)
        for source_object in self.source_objects
    ]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        schema_update_options=self.schema_update_options)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        logging.info('Loaded BQ data with max {}.{}={}'.format(
            self.destination_project_dataset_table, self.max_id_key, max_id))
        return max_id
def bq_get_last_modified(self):
    logging.info("Connecting to Big Query")
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    bq_conn = bq_hook.get_connection(self.bigquery_conn_id)
    bq_conn_extra_json = bq_conn.extra
    bq_conn_extra = json.loads(bq_conn_extra_json)
    service_dict = bq_conn_extra['extra__google_cloud_platform__keyfile_dict']

    sql = """
    #standardSQL
    SELECT last_modified_time AS TS
    FROM `{0}.{1}.__TABLES__`
    WHERE table_id = '{2}'
    """.format(self.project_id, self.dataset, self.table_name)

    logging.info("Getting table last_modified_time from BQ with SQL:\n{0}".format(sql))
    df = read_gbq(sql, dialect='standard', project_id=self.project_id,
                  private_key=service_dict)
    logging.info("Got table!")
    ts = str(df['TS'][0])
    return ts
def execute(self, context):
    if self.gcs_hook is None:
        self.gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id)
    if self.bq_hook is None:
        self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)

    for i, partner_id in enumerate(self.partner_ids):
        filename = erf_utils.download_and_transform_erf(self, partner_id)
        entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
        if i > 0:
            self.write_disposition = 'WRITE_APPEND'

        bq_base_cursor = self.bq_hook.get_conn().cursor()
        bq_base_cursor.run_load(
            destination_project_dataset_table=self.bq_table,
            schema_fields=self.schema,
            source_uris=[entity_read_file_ndj],
            source_format='NEWLINE_DELIMITED_JSON',
            write_disposition=self.write_disposition)
        self.gcs_hook.delete(self.gcs_bucket, filename)
def bq_cursor(self):
    remote_conn_id = conf.get('core', 'REMOTE_LOG_CONN_ID')
    try:
        from airflow.contrib.hooks.bigquery_hook import BigQueryHook
        return BigQueryHook(bigquery_conn_id=remote_conn_id,
                            use_legacy_sql=False).get_conn().cursor()
    except Exception as e:
        self.log.error(
            'Could not create a BigQueryHook with connection id '
            '"%s". %s\n\nPlease make sure that the BigQuery '
            'connection exists.', remote_conn_id, str(e))
def execute(self, context):
    for i in range(len(self.source_project_dataset_table)):
        try:
            self.log.info('Executing %d/%d extracts',
                          i + 1, len(self.source_project_dataset_table))
            self.log.info('Executing extract of %s into: %s',
                          self.source_project_dataset_table[i],
                          self.destination_cloud_storage_uris[i])
            hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                delegate_to=self.delegate_to)
            conn = hook.get_conn()
            cursor = conn.cursor()
            cursor.run_extract(
                self.source_project_dataset_table[i],
                self.destination_cloud_storage_uris[i],
                self.compression,
                self.export_format,
                self.field_delimiter,
                self.print_header,
                self.labels)
        except Exception as e:
            # one lazy retry after a fixed wait
            self.log.error('Exception: %s', e)
            self.log.info('Wait %d seconds and retry', self.lazy_retry_wait)
            time.sleep(self.lazy_retry_wait)
            hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                delegate_to=self.delegate_to)
            conn = hook.get_conn()
            cursor = conn.cursor()
            cursor.run_extract(
                self.source_project_dataset_table[i],
                self.destination_cloud_storage_uris[i],
                self.compression,
                self.export_format,
                self.field_delimiter,
                self.print_header,
                self.labels)
def get_hook(self):
    try:
        if self.conn_type == 'mysql':
            from airflow.hooks.mysql_hook import MySqlHook
            return MySqlHook(mysql_conn_id=self.conn_id)
        elif self.conn_type == 'google_cloud_platform':
            from airflow.contrib.hooks.bigquery_hook import BigQueryHook
            return BigQueryHook(bigquery_conn_id=self.conn_id)
        elif self.conn_type == 'postgres':
            from airflow.hooks.postgres_hook import PostgresHook
            return PostgresHook(postgres_conn_id=self.conn_id)
        elif self.conn_type == 'hive_cli':
            from airflow.hooks.hive_hooks import HiveCliHook
            return HiveCliHook(hive_cli_conn_id=self.conn_id)
        elif self.conn_type == 'presto':
            from airflow.hooks.presto_hook import PrestoHook
            return PrestoHook(presto_conn_id=self.conn_id)
        elif self.conn_type == 'hiveserver2':
            from airflow.hooks.hive_hooks import HiveServer2Hook
            return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
        elif self.conn_type == 'sqlite':
            from airflow.hooks.sqlite_hook import SqliteHook
            return SqliteHook(sqlite_conn_id=self.conn_id)
        elif self.conn_type == 'jdbc':
            from airflow.hooks.jdbc_hook import JdbcHook
            return JdbcHook(jdbc_conn_id=self.conn_id)
        elif self.conn_type == 'mssql':
            from airflow.hooks.mssql_hook import MsSqlHook
            return MsSqlHook(mssql_conn_id=self.conn_id)
        elif self.conn_type == 'oracle':
            from airflow.hooks.oracle_hook import OracleHook
            return OracleHook(oracle_conn_id=self.conn_id)
        elif self.conn_type == 'vertica':
            from airflow.contrib.hooks.vertica_hook import VerticaHook
            return VerticaHook(vertica_conn_id=self.conn_id)
        elif self.conn_type == 'cloudant':
            from airflow.contrib.hooks.cloudant_hook import CloudantHook
            return CloudantHook(cloudant_conn_id=self.conn_id)
        elif self.conn_type == 'jira':
            from airflow.contrib.hooks.jira_hook import JiraHook
            return JiraHook(jira_conn_id=self.conn_id)
        elif self.conn_type == 'redis':
            from airflow.contrib.hooks.redis_hook import RedisHook
            return RedisHook(redis_conn_id=self.conn_id)
        elif self.conn_type == 'wasb':
            from airflow.contrib.hooks.wasb_hook import WasbHook
            return WasbHook(wasb_conn_id=self.conn_id)
        elif self.conn_type == 'docker':
            from airflow.hooks.docker_hook import DockerHook
            return DockerHook(docker_conn_id=self.conn_id)
    except:
        pass
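# Hypothetical use of the dispatcher above on an Airflow Connection object
# (it reads like the Connection.get_hook() helper from older Airflow); the
# connection id is illustrative only.
from airflow.models import Connection

conn = Connection(conn_id='my_gcp', conn_type='google_cloud_platform')
hook = conn.get_hook()  # would return BigQueryHook(bigquery_conn_id='my_gcp')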
def execute(self, context):
    dest = self.destination_file
    sql = self.sql

    logging.info("Connecting to Big Query")
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    bq_conn = bq_hook.get_connection(self.bigquery_conn_id)
    bq_conn_extra_json = bq_conn.extra
    bq_conn_extra = json.loads(bq_conn_extra_json)
    service_dict = bq_conn_extra['extra__google_cloud_platform__keyfile_dict']

    logging.info("Getting table from BQ with SQL:\n{0}".format(sql))
    df = read_gbq(sql, dialect='standard', private_key=service_dict)
    logging.info("Got table!")

    # logging.info('\tSaving to... {}'.format(save_dir))
    # if not os.path.isdir(save_dir):
    #     os.mkdir(save_dir)

    logging.info("Writing table to disk in feather format")
    feather.write_dataframe(df, dest)
    logging.info("Table written to {0}".format(dest))
    return df.info()
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.conn_id)
    partner_ids = models.Variable.get('partner_ids').split(',')
    for i, partner_id in enumerate(partner_ids):
        filename = download_and_transform_erf(self, partner_id)
        entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
        hook = BigQueryHook(bigquery_conn_id=self.conn_id)
        self.service = hook.get_service()
        if i == 0:
            write_disposition = 'WRITE_TRUNCATE'
        else:
            write_disposition = 'WRITE_APPEND'
        bq_base_cursor = BigQueryBaseCursor(self.service, self.cloud_project_id)
        bq_base_cursor.run_load(
            self.bq_table,
            [entity_read_file_ndj],
            schema_fields=self.schema,
            source_format='NEWLINE_DELIMITED_JSON',
            write_disposition=write_disposition,
            ignore_unknown_values=True)
        gcs_hook.delete(self.gcs_bucket, filename)
def execute(self, context=None):
    self.log.info('Executing SQL check: %s', self.sql)
    hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        use_legacy_sql=self.use_legacy_sql,
        delegate_to=self.delegate_to)
    records = hook.get_first(self.sql)
    self.log.info('Record: %s', records)

    branch_to_follow = self.pass_task
    if not records:
        self.log.info('The query returned None')
        branch_to_follow = self.fail_task
    elif not all([bool(r) for r in records]):
        exceptstr = 'Test failed.\nQuery:\n{q}\nResults:\n{r!s}'
        self.log.info(exceptstr.format(q=self.sql, r=records))
        branch_to_follow = self.fail_task

    downstream_tasks = context['task'].downstream_list
    self.log.info('Following branch %s', branch_to_follow)
    self.log.info('Downstream task_ids %s', downstream_tasks)

    skip_tasks = [t for t in downstream_tasks if t.task_id != branch_to_follow]
    if downstream_tasks:
        self.skip(context['dag_run'], context['ti'].execution_date, skip_tasks)
def execute(self, context):
    # create the hook unconditionally: get_pandas_df below needs it even
    # when a cursor already exists from a previous call
    hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        use_legacy_sql=self.use_legacy_sql,
        delegate_to=self.delegate_to,
        location=self.location,
    )
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.sql)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    job_id = self.bq_cursor.run_query(
        sql=self.sql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        flatten_results=self.flatten_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        maximum_bytes_billed=self.maximum_bytes_billed,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        labels=self.labels,
        schema_update_options=self.schema_update_options,
        priority=self.priority,
        time_partitioning=self.time_partitioning,
        api_resource_configs=self.api_resource_configs,
        cluster_fields=self.cluster_fields,
    )
    context['task_instance'].xcom_push(key='job_id', value=job_id)

    df = hook.get_pandas_df(self.sql)
    if self.sort_by is not None:
        # sort_values returns a new frame; assign it back and pass the
        # attribute's value, not the literal string 'self.sort_by'
        df = df.sort_values(self.sort_by)
    list_to_return = df.astype(str).to_dict('index')
    print(list_to_return)
    return list_to_return
def create_big_query_table():
    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default', use_legacy_sql=False)
    gcp_credentials = bq_hook._get_credentials()
    bq_client = bigquery.Client(credentials=gcp_credentials, project=bigquery_project)

    target_dataset_ref = bigquery.DatasetReference(
        project=bigquery_project, dataset_id=reference_dataset)
    try:
        target_dataset = bq_client.get_dataset(dataset_ref=target_dataset_ref)
    except NotFound:
        # LOGGER.info(f"Dataset '{target_dataset_ref}' not found, attempting to create.")
        target_dataset = bq_client.create_dataset(dataset=target_dataset_ref)

    target_table_ref = bigquery.TableReference(dataset_ref=target_dataset,
                                               table_id=reference_table)
    # delete_table() returns None; drop the table so it can be recreated fresh
    bq_client.delete_table(table=target_table_ref)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(
            gcs_hook.download(self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = [
        'gs://{}/{}'.format(self.bucket, source_object)
        for source_object in self.source_objects
    ]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.create_external_table(
        external_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        compression=self.compression,
        skip_leading_rows=self.skip_leading_rows,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        src_fmt_configs=self.src_fmt_configs,
        labels=self.labels,
        encryption_configuration=self.encryption_configuration)
def _bq_get_data(self):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id)

    hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        location=self.location)

    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        response = cursor.get_tabledata(
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size)

        if 'rows' in response:
            rows = response['rows']
        else:
            self.log.info('Job Finished')
            return

        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)

        table_data = []
        for dict_row in rows:
            single_row = []
            for fields in dict_row['f']:
                single_row.append(fields['v'])
            table_data.append(single_row)

        yield table_data
        i += 1
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    logging.info('start_date_str = %s', self.start_date_str)
    logging.info('end_date_str = %s', self.end_date_str)
    logging.info('Date conversion starts')
    start = str2date(self.start_date_str)
    end = str2date(self.end_date_str)
    logging.info('Date conversion ends')
    logging.info('time_partitioning = %s', self.time_partitioning)

    for i in daterange(start, end):
        date_no_dash = i.strftime("%Y%m%d")
        partitioned_table_id = self.table_id + date_no_dash
        logging.info("Partitioned table %s", partitioned_table_id)

        logging.info('Checking if table <%s:%s.%s> exists',
                     self.project_id, self.dataset_id, partitioned_table_id)
        table_exists = bq_hook.table_exists(self.project_id,
                                            self.dataset_id,
                                            partitioned_table_id)
        if not table_exists:
            logging.info('Table <%s> does not exist', partitioned_table_id)
            logging.info('Connecting to BigQuery')
            cursor = BigQueryHelperCursor(bq_hook.get_service(), self.project_id)

            logging.info('Creating the empty table %s with the schema %s',
                         partitioned_table_id, self.schema_fields)
            cursor.create_empty_table(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                table_id=partitioned_table_id,
                schema_fields=self.schema_fields,
                time_partitioning=self.time_partitioning)
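# str2date() and daterange() are referenced above but not defined in this
# snippet; minimal sketches of the assumed behaviour (the 'YYYY-MM-DD'
# input format is a guess):
import datetime

def str2date(date_str):
    return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()

def daterange(start, end):
    # yield each date from start (inclusive) up to end (exclusive)
    for offset in range((end - start).days):
        yield start + datetime.timedelta(days=offset)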
def execute(self, context):
    logging.info('Executing: %s', str(self.bql))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    hook.run(self.bql, self.destination_dataset_table)
def execute(self, context):
    logging.info('Executing: %s', str(self.bql))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    hook.run(self.bql, self.destination_dataset_table, self.write_disposition)