def poke(self, context):
    # Returns True once the dataset exists; a 404 means "not yet", anything
    # else is treated as a real error.
    hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)
    service = hook.get_service()
    try:
        service.datasets().get(datasetId=self.dataset_id,
                               projectId=self.project_id).execute()
        return True
    except HttpError as e:
        if e.resp["status"] == "404":
            return False
        raise AirflowException(f"Error: {e}")
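# --- Illustrative sketch (not from the original source) ---------------------
# A minimal example of how the poke() above might be wrapped in a sensor and
# wired into a DAG. The class name, constructor arguments, and DAG values are
# assumptions made for this sketch; only the poke() logic itself comes from
# the snippet above.
from datetime import datetime

from airflow import DAG
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.exceptions import AirflowException
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from googleapiclient.errors import HttpError


class BigQueryDatasetExistenceSensor(BaseSensorOperator):
    """Hypothetical sensor that waits until a BigQuery dataset exists."""

    def __init__(self, project_id, dataset_id,
                 gcp_conn_id='google_cloud_default', *args, **kwargs):
        super(BigQueryDatasetExistenceSensor, self).__init__(*args, **kwargs)
        self.project_id = project_id
        self.dataset_id = dataset_id
        self.gcp_conn_id = gcp_conn_id

    def poke(self, context):
        # Same check as the poke() above: 404 -> keep waiting, else fail.
        hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)
        service = hook.get_service()
        try:
            service.datasets().get(datasetId=self.dataset_id,
                                   projectId=self.project_id).execute()
            return True
        except HttpError as e:
            if e.resp["status"] == "404":
                return False
            raise AirflowException("Error: {}".format(e))


dag = DAG('wait_for_bq_dataset', start_date=datetime(2024, 1, 1),
          schedule_interval=None)

wait_for_dataset = BigQueryDatasetExistenceSensor(
    task_id='wait_for_dataset',
    project_id='my-project',      # placeholder
    dataset_id='my_dataset',      # placeholder
    poke_interval=60,
    timeout=60 * 60,
    dag=dag)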
def execute(self, context):
    result_export_success = True
    dataset_creation_success = False
    query_execution_success = False
    err_msg = ""
    try:
        hook = BigQueryHook(use_legacy_sql=False,
                            bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to,
                            location=self.location)
        service = hook.get_service()
        cursor = BigQueryBaseCursor(project_id=self.project_id, service=service)

        # Create a temporary dataset whose tables expire automatically.
        cursor.create_empty_dataset(
            dataset_id=self.dataset_id,
            project_id=self.project_id,
            dataset_reference={
                'defaultTableExpirationMs': self.default_table_expiry_in_ms
            })
        dataset_creation_success = True

        # Materialize the query result into the temporary table.
        cursor.run_query(destination_dataset_table=self.temp_table_name,
                         write_disposition='WRITE_TRUNCATE',
                         allow_large_results=True,
                         sql=self.query,
                         use_legacy_sql=False)
        query_execution_success = True

        # Export the temporary table to Cloud Storage.
        cursor.run_extract(
            source_project_dataset_table=self.temp_table_name,
            destination_cloud_storage_uris=self.destination_cloud_storage_uris,
            compression=self.compression,
            export_format=self.export_format,
            field_delimiter=self.field_delimiter,
            print_header=self.print_header)
    except Exception as e:
        err_msg = e
        logging.error(e)
        result_export_success = False
    finally:
        # Clean up whatever was created, even if the export failed.
        if query_execution_success:
            cursor.run_table_delete(
                deletion_dataset_table=self.temp_table_name)
        if dataset_creation_success:
            cursor.delete_dataset(dataset_id=self.dataset_id,
                                  project_id=self.project_id)

    if not result_export_success:
        raise AirflowException(
            "Query export failed. Error: {}".format(err_msg))
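# --- Illustrative sketch (not from the original source) ---------------------
# The execute() above reads a number of instance attributes; a hypothetical
# operator constructor that supplies them could look like this. The class
# name and the default values are assumptions made for the sketch;
# execute(self, context) is assumed to be the method shown above.
from airflow.models import BaseOperator


class BigQueryQueryToGCSOperator(BaseOperator):
    """Hypothetical operator: run a query into a temp table, export to GCS."""

    def __init__(self,
                 project_id,
                 dataset_id,
                 temp_table_name,
                 query,
                 destination_cloud_storage_uris,
                 default_table_expiry_in_ms=3600000,   # assumed 1-hour expiry
                 compression='NONE',
                 export_format='CSV',
                 field_delimiter=',',
                 print_header=True,
                 bigquery_conn_id='bigquery_default',
                 delegate_to=None,
                 location=None,
                 *args, **kwargs):
        super(BigQueryQueryToGCSOperator, self).__init__(*args, **kwargs)
        self.project_id = project_id
        self.dataset_id = dataset_id
        self.temp_table_name = temp_table_name  # e.g. '<dataset>.<table>'
        self.query = query
        self.destination_cloud_storage_uris = destination_cloud_storage_uris
        self.default_table_expiry_in_ms = default_table_expiry_in_ms
        self.compression = compression
        self.export_format = export_format
        self.field_delimiter = field_delimiter
        self.print_header = print_header
        self.bigquery_conn_id = bigquery_conn_id
        self.delegate_to = delegate_to
        self.location = location

    # execute(self, context) is assumed to be the method shown above.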
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.conn_id)
    partner_ids = models.Variable.get('partner_ids').split(',')
    for i, partner_id in enumerate(partner_ids):
        filename = download_and_transform_erf(self, partner_id)
        entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
        hook = BigQueryHook(bigquery_conn_id=self.conn_id)
        self.service = hook.get_service()
        # Overwrite the table with the first partner, then append the rest.
        if i == 0:
            write_disposition = 'WRITE_TRUNCATE'
        else:
            write_disposition = 'WRITE_APPEND'
        bq_base_cursor = BigQueryBaseCursor(self.service, self.cloud_project_id)
        bq_base_cursor.run_load(
            self.bq_table,
            [entity_read_file_ndj],
            schema_fields=self.schema,
            source_format='NEWLINE_DELIMITED_JSON',
            write_disposition=write_disposition,
            ignore_unknown_values=True)
        # Remove the staging file once it has been loaded.
        gcs_hook.delete(self.gcs_bucket, filename)
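# --- Illustrative sketch (not from the original source) ---------------------
# download_and_transform_erf() is not shown in this snippet. The loop above
# only relies on it returning the name of a newline-delimited JSON object
# already uploaded to self.gcs_bucket. A minimal sketch under that assumption
# follows; the source bucket/object naming and the transformation details are
# guesses made for the example, not the author's implementation.
import json
import tempfile

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook


def download_and_transform_erf(task, partner_id):
    """Hypothetical helper: fetch a partner's Entity Read File, convert it to
    newline-delimited JSON, upload it to task.gcs_bucket, and return the
    uploaded object name."""
    gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=task.conn_id)

    # Assumed source location of the raw Entity Read File (a JSON array).
    source_bucket = 'gdbm-%s' % partner_id
    source_object = 'entity/entity.json'
    raw = gcs_hook.download(source_bucket, source_object)

    # Rewrite the JSON array as one JSON object per line (NDJSON).
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ndjson',
                                     delete=False) as out:
        for record in json.loads(raw):
            out.write(json.dumps(record))
            out.write('\n')
        local_path = out.name

    destination_object = 'erf/%s.ndjson' % partner_id
    gcs_hook.upload(task.gcs_bucket, destination_object, local_path)
    return destination_object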
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    logging.info('start_date_str = %s', self.start_date_str)
    logging.info('end_date_str = %s', self.end_date_str)
    logging.info('Date conversion starts')
    start = str2date(self.start_date_str)
    end = str2date(self.end_date_str)
    logging.info('Date conversion ends')
    logging.info('time_partitioning = %s', self.time_partitioning)
    for i in daterange(start, end):
        date_no_dash = i.strftime("%Y%m%d")
        partitioned_table_id = self.table_id + date_no_dash
        logging.info('Partitioned table %s', partitioned_table_id)
        logging.info('Checking if table exists <%s:%s.%s>',
                     self.project_id, self.dataset_id, partitioned_table_id)
        table_exists = bq_hook.table_exists(self.project_id,
                                            self.dataset_id,
                                            partitioned_table_id)
        if not table_exists:
            logging.info('Table <%s> does not exist', partitioned_table_id)
            logging.info('Connecting to BigQuery')
            cursor = BigQueryHelperCursor(bq_hook.get_service(),
                                          self.project_id)
            logging.info('Creating the empty table %s with the schema %s',
                         partitioned_table_id, self.schema_fields)
            cursor.create_empty_table(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                table_id=partitioned_table_id,
                schema_fields=self.schema_fields,
                time_partitioning=self.time_partitioning)
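# --- Illustrative sketch (not from the original source) ---------------------
# str2date() and daterange() are not defined in this snippet. A minimal sketch,
# assuming dates arrive as 'YYYY-MM-DD' strings and the range is inclusive of
# the end date, could look like this:
from datetime import datetime, timedelta


def str2date(date_str):
    """Hypothetical parser: assumes 'YYYY-MM-DD' input."""
    return datetime.strptime(date_str, '%Y-%m-%d').date()


def daterange(start, end):
    """Hypothetical generator: yields each date from start to end, inclusive."""
    current = start
    while current <= end:
        yield current
        current += timedelta(days=1)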