Example #1
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        table_ref = TableReference.from_string(self.source_project_dataset_table, hook.project_id)

        configuration: Dict[str, Any] = {
            'extract': {
                'sourceTable': table_ref.to_api_repr(),
                'compression': self.compression,
                'destinationUris': self.destination_cloud_storage_uris,
                'destinationFormat': self.export_format,
            }
        }

        if self.labels:
            configuration['labels'] = self.labels

        if self.export_format == 'CSV':
            # Only set fieldDelimiter and printHeader fields if using CSV.
            # Setting them for other export formats can cause the BigQuery API
            # to reject the job.
            configuration['extract']['fieldDelimiter'] = self.field_delimiter
            configuration['extract']['printHeader'] = self.print_header

        hook.insert_job(configuration=configuration)
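
For reference, the configuration dict built above maps directly onto BigQuery's extract job API. A minimal standalone sketch of the same export using the google-cloud-bigquery client (outside Airflow) might look like the following; the project, dataset, table, and bucket names are placeholders:

    from google.cloud import bigquery
    from google.cloud.bigquery import TableReference

    client = bigquery.Client()
    # Placeholder source table; in the operator, hook.project_id supplies the default project.
    table_ref = TableReference.from_string("my-project.my_dataset.my_table")

    job_config = bigquery.ExtractJobConfig(
        compression="NONE",
        destination_format="CSV",
        field_delimiter=",",  # only meaningful for CSV exports
        print_header=True,    # only meaningful for CSV exports
    )
    extract_job = client.extract_table(
        table_ref,
        "gs://my-bucket/exports/my_table-*.csv",  # placeholder destination URI
        job_config=job_config,
    )
    extract_job.result()  # block until the extract job finishes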
Example #2
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        self.hook = hook

        configuration = self._prepare_configuration()
        job_id = hook.generate_job_id(
            job_id=self.job_id,
            dag_id=self.dag_id,
            task_id=self.task_id,
            logical_date=context["logical_date"],
            configuration=configuration,
            force_rerun=self.force_rerun,
        )

        try:
            self.log.info("Executing: %s", configuration)
            job: ExtractJob = hook.insert_job(
                job_id=job_id,
                configuration=configuration,
                project_id=self.project_id,
                location=self.location,
                timeout=self.result_timeout,
                retry=self.result_retry,
            )
            self._handle_job_error(job)
        except Conflict:
            # If the job already exists, retrieve it
            job = hook.get_job(
                project_id=self.project_id,
                location=self.location,
                job_id=job_id,
            )
            if job.state in self.reattach_states:
                # We are reattaching to a job
                job.result(timeout=self.result_timeout, retry=self.result_retry)
                self._handle_job_error(job)
            else:
                # Same job configuration so we need force_rerun
                raise AirflowException(
                    f"Job with id: {job_id} already exists and is in {job.state} state. If you "
                    f"want to force rerun it consider setting `force_rerun=True`."
                    f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
                )

        conf = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
        dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
        BigQueryTableLink.persist(
            context=context,
            task_instance=self,
            dataset_id=dataset_id,
            project_id=project_id,
            table_id=table_id,
        )
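
The helpers _prepare_configuration and _handle_job_error are referenced in Example #2 but not shown in the excerpt; _prepare_configuration presumably builds the same extract configuration dict as Example #1. A minimal sketch of _handle_job_error, assuming it only needs to surface a failed job, could be:

    from airflow.exceptions import AirflowException
    from google.cloud.bigquery.job import ExtractJob

    def _handle_job_error(job: ExtractJob) -> None:
        # Raise if BigQuery reported a terminal error for the job; this is a sketch
        # of the helper referenced above, not necessarily the operator's actual code.
        if job.error_result:
            raise AirflowException(f"BigQuery job {job.job_id} failed: {job.error_result}")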