# Imports shared by both versions of execute() below.
from typing import TYPE_CHECKING, Any, Dict

from google.api_core.exceptions import Conflict
from google.cloud.bigquery import ExtractJob, TableReference

from airflow.exceptions import AirflowException
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
from airflow.providers.google.cloud.links.bigquery import BigQueryTableLink

if TYPE_CHECKING:
    from airflow.utils.context import Context


# Original implementation: build the extract configuration inline and submit it.
def execute(self, context: 'Context'):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(self.source_project_dataset_table, hook.project_id)
    configuration: Dict[str, Any] = {
        'extract': {
            'sourceTable': table_ref.to_api_repr(),
            'compression': self.compression,
            'destinationUris': self.destination_cloud_storage_uris,
            'destinationFormat': self.export_format,
        }
    }
    if self.labels:
        configuration['labels'] = self.labels
    if self.export_format == 'CSV':
        # Only set fieldDelimiter and printHeader if using CSV; BigQuery
        # rejects these fields for other export formats.
        configuration['extract']['fieldDelimiter'] = self.field_delimiter
        configuration['extract']['printHeader'] = self.print_header
    hook.insert_job(configuration=configuration)
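
# For illustration, a minimal sketch of the Jobs API payload the code above
# produces for a gzip-compressed CSV export. The project, dataset, table, and
# bucket names are hypothetical placeholders, not values from this operator.
example_configuration: Dict[str, Any] = {
    'extract': {
        'sourceTable': {
            'projectId': 'my-project',  # hypothetical
            'datasetId': 'my_dataset',  # hypothetical
            'tableId': 'my_table',  # hypothetical
        },
        'compression': 'GZIP',
        'destinationUris': ['gs://my-bucket/exports/my_table-*.csv'],  # hypothetical bucket
        'destinationFormat': 'CSV',
        'fieldDelimiter': ',',
        'printHeader': True,
    }
}
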
# Reworked implementation: deterministic job id, reattach/force-rerun handling,
# and persistence of a link to the source table.
def execute(self, context: 'Context'):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    self.hook = hook
    configuration = self._prepare_configuration()
    job_id = hook.generate_job_id(
        job_id=self.job_id,
        dag_id=self.dag_id,
        task_id=self.task_id,
        logical_date=context["logical_date"],
        configuration=configuration,
        force_rerun=self.force_rerun,
    )
    try:
        self.log.info("Executing: %s", configuration)
        job: ExtractJob = hook.insert_job(
            job_id=job_id,
            configuration=configuration,
            project_id=self.project_id,
            location=self.location,
            timeout=self.result_timeout,
            retry=self.result_retry,
        )
        self._handle_job_error(job)
    except Conflict:
        # If the job already exists, retrieve it.
        job = hook.get_job(
            project_id=self.project_id,
            location=self.location,
            job_id=job_id,
        )
        if job.state in self.reattach_states:
            # Reattach to the existing job and wait for it to finish.
            job.result(timeout=self.result_timeout, retry=self.result_retry)
            self._handle_job_error(job)
        else:
            # Same job configuration, so the rerun has to be forced.
            raise AirflowException(
                f"Job with id: {job_id} already exists and is in {job.state} state. "
                f"If you want to force rerun it consider setting `force_rerun=True`. "
                f"Or, if you want to reattach in this scenario, add {job.state} to `reattach_states`."
            )
    source_table = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
    dataset_id, project_id, table_id = source_table["datasetId"], source_table["projectId"], source_table["tableId"]
    BigQueryTableLink.persist(
        context=context,
        task_instance=self,
        dataset_id=dataset_id,
        project_id=project_id,
        table_id=table_id,
    )
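
# The reworked execute() delegates to two helpers that are not shown above.
# A minimal sketch of both, assuming _prepare_configuration simply factors out
# the configuration-building logic of the earlier version and _handle_job_error
# surfaces the job's error_result; these are plausible implementations, not
# necessarily the exact provider code.
def _prepare_configuration(self) -> Dict[str, Any]:
    table_ref = TableReference.from_string(self.source_project_dataset_table, self.hook.project_id)
    configuration: Dict[str, Any] = {
        'extract': {
            'sourceTable': table_ref.to_api_repr(),
            'compression': self.compression,
            'destinationUris': self.destination_cloud_storage_uris,
            'destinationFormat': self.export_format,
        }
    }
    if self.labels:
        configuration['labels'] = self.labels
    if self.export_format == 'CSV':
        # fieldDelimiter and printHeader are only valid for CSV exports.
        configuration['extract']['fieldDelimiter'] = self.field_delimiter
        configuration['extract']['printHeader'] = self.print_header
    return configuration


@staticmethod
def _handle_job_error(job: ExtractJob) -> None:
    # Raise if BigQuery reported a terminal error for the job.
    if job.error_result:
        raise AirflowException(f"BigQuery job {job.job_id} failed: {job.error_result}")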