Example #1
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        job_id = hook.run_extract(
            source_project_dataset_table=self.source_project_dataset_table,
            destination_cloud_storage_uris=self.destination_cloud_storage_uris,
            compression=self.compression,
            export_format=self.export_format,
            field_delimiter=self.field_delimiter,
            print_header=self.print_header,
            labels=self.labels,
        )

        job = hook.get_job(job_id=job_id).to_api_repr()
        conf = job["configuration"]["extract"]["sourceTable"]
        dataset_id, project_id, table_id = (
            conf["datasetId"],
            conf["projectId"],
            conf["tableId"],
        )
        BigQueryTableLink.persist(
            context=context,
            task_instance=self,
            dataset_id=dataset_id,
            project_id=project_id,
            table_id=table_id,
        )
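
This execute method looks like BigQueryToGCSOperator from the Google provider package: it logs the extract, builds a BigQueryHook, runs the extract job, and persists a BigQueryTableLink for the source table. A minimal usage sketch, assuming Airflow 2.4+ and the standard provider import path; every resource name below is a placeholder:

import pendulum
from airflow import DAG
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator

# Hypothetical DAG; project, dataset, table, and bucket are placeholders.
with DAG(
    dag_id="example_bq_extract",
    start_date=pendulum.datetime(2023, 1, 1, tz="UTC"),
    schedule=None,
) as dag:
    extract_sales = BigQueryToGCSOperator(
        task_id="extract_sales",
        source_project_dataset_table="my-project.my_dataset.sales",
        destination_cloud_storage_uris=["gs://my-bucket/exports/sales-*.csv"],
        export_format="CSV",
        field_delimiter=",",
        print_header=True,
    )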
Example #2
    def execute(self, context: 'Context') -> None:
        self.log.info(
            'Executing copy of %s into: %s',
            self.source_project_dataset_tables,
            self.destination_project_dataset_table,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            job_id = hook.run_copy(
                source_project_dataset_tables=self.source_project_dataset_tables,
                destination_project_dataset_table=self.destination_project_dataset_table,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                labels=self.labels,
                encryption_configuration=self.encryption_configuration,
            )

            job = hook.get_job(job_id=job_id).to_api_repr()
            conf = job["configuration"]["copy"]["destinationTable"]
            BigQueryTableLink.persist(
                context=context,
                task_instance=self,
                dataset_id=conf["datasetId"],
                project_id=conf["projectId"],
                table_id=conf["tableId"],
            )
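
This variant is presumably BigQueryToBigQueryOperator.execute: it wraps the deprecated run_copy in a warnings filter, then persists a link to the destination table. A hedged sketch of how such a copy task might be declared (placeholder names, standard provider import path assumed):

from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator

# Hypothetical task: copy a staging table over the production table.
copy_events = BigQueryToBigQueryOperator(
    task_id="copy_staging_to_prod",
    source_project_dataset_tables="my-project.staging.events",  # a list of tables is also accepted
    destination_project_dataset_table="my-project.prod.events",
    write_disposition="WRITE_TRUNCATE",  # replace existing rows in the destination
    create_disposition="CREATE_IF_NEEDED",  # create the destination table if missing
)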
Example #3
    def execute(self, context: 'Context'):
        bq_hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GCSHook(
                    gcp_conn_id=self.gcp_conn_id,
                    delegate_to=self.delegate_to,
                    impersonation_chain=self.impersonation_chain,
                )
                blob = gcs_hook.download(
                    bucket_name=self.bucket,
                    object_name=self.schema_object,
                )
                schema_fields = json.loads(blob.decode("utf-8"))
            else:
                schema_fields = None
        else:
            schema_fields = self.schema_fields

        self.source_objects = (
            self.source_objects if isinstance(self.source_objects, list) else [self.source_objects]
        )
        source_uris = [
            f'gs://{self.bucket}/{source_object}'
            for source_object in self.source_objects
        ]

        if self.external_table:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                bq_hook.create_external_table(
                    external_project_dataset_table=self.destination_project_dataset_table,
                    schema_fields=schema_fields,
                    source_uris=source_uris,
                    source_format=self.source_format,
                    autodetect=self.autodetect,
                    compression=self.compression,
                    skip_leading_rows=self.skip_leading_rows,
                    field_delimiter=self.field_delimiter,
                    max_bad_records=self.max_bad_records,
                    quote_character=self.quote_character,
                    ignore_unknown_values=self.ignore_unknown_values,
                    allow_quoted_newlines=self.allow_quoted_newlines,
                    allow_jagged_rows=self.allow_jagged_rows,
                    encoding=self.encoding,
                    src_fmt_configs=self.src_fmt_configs,
                    encryption_configuration=self.encryption_configuration,
                    labels=self.labels,
                    description=self.description,
                )
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                bq_hook.run_load(
                    destination_project_dataset_table=self.destination_project_dataset_table,
                    schema_fields=schema_fields,
                    source_uris=source_uris,
                    source_format=self.source_format,
                    autodetect=self.autodetect,
                    create_disposition=self.create_disposition,
                    skip_leading_rows=self.skip_leading_rows,
                    write_disposition=self.write_disposition,
                    field_delimiter=self.field_delimiter,
                    max_bad_records=self.max_bad_records,
                    quote_character=self.quote_character,
                    ignore_unknown_values=self.ignore_unknown_values,
                    allow_quoted_newlines=self.allow_quoted_newlines,
                    allow_jagged_rows=self.allow_jagged_rows,
                    encoding=self.encoding,
                    schema_update_options=self.schema_update_options,
                    src_fmt_configs=self.src_fmt_configs,
                    time_partitioning=self.time_partitioning,
                    cluster_fields=self.cluster_fields,
                    encryption_configuration=self.encryption_configuration,
                    labels=self.labels,
                    description=self.description,
                )

        if self.max_id_key:
            select_command = f'SELECT MAX({self.max_id_key}) FROM `{self.destination_project_dataset_table}`'
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                job_id = bq_hook.run_query(
                    sql=select_command,
                    use_legacy_sql=False,
                )
            row = list(bq_hook.get_job(job_id).result())
            if row:
                max_id = row[0] if row[0] else 0
                self.log.info(
                    'Loaded BQ data with max %s.%s=%s',
                    self.destination_project_dataset_table,
                    self.max_id_key,
                    max_id,
                )
            else:
                raise RuntimeError(f"The query {select_command} returned no rows!")
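
This load path matches GCSToBigQueryOperator: it resolves the schema (inline schema_fields, a schema_object downloaded from GCS, or autodetect), builds gs:// source URIs, then either defines an external table or runs a load job; setting max_id_key triggers the trailing SELECT MAX(...) query. A minimal sketch, assuming the provider's gcs_to_bigquery module; all resource names are placeholders:

from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

# Hypothetical load task; bucket, objects, and table are placeholders.
load_events = GCSToBigQueryOperator(
    task_id="load_events_csv",
    bucket="my-bucket",
    source_objects=["data/events-*.csv"],  # a single string also works; execute() wraps it in a list
    destination_project_dataset_table="my-project.my_dataset.events",
    source_format="CSV",
    skip_leading_rows=1,
    autodetect=True,  # infer the schema when schema_fields/schema_object are unset
    write_disposition="WRITE_APPEND",
    max_id_key="event_id",  # after the load, log SELECT MAX(event_id)
)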
Example #4
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        self.hook = hook

        configuration = self._prepare_configuration()
        job_id = hook.generate_job_id(
            job_id=self.job_id,
            dag_id=self.dag_id,
            task_id=self.task_id,
            logical_date=context["logical_date"],
            configuration=configuration,
            force_rerun=self.force_rerun,
        )

        try:
            self.log.info("Executing: %s", configuration)
            job: ExtractJob = hook.insert_job(
                job_id=job_id,
                configuration=configuration,
                project_id=self.project_id,
                location=self.location,
                timeout=self.result_timeout,
                retry=self.result_retry,
            )
            self._handle_job_error(job)
        except Conflict:
            # If the job already exists, retrieve it
            job = hook.get_job(
                project_id=self.project_id,
                location=self.location,
                job_id=job_id,
            )
            if job.state in self.reattach_states:
                # We are reattaching to a job
                job.result(timeout=self.result_timeout, retry=self.result_retry)
                self._handle_job_error(job)
            else:
                # Same job configuration, so force_rerun is required to submit a new job
                raise AirflowException(
                    f"Job with id: {job_id} already exists and is in {job.state} state. If you "
                    f"want to force rerun it, consider setting `force_rerun=True`. "
                    f"Or, if you want to reattach in this scenario, add {job.state} to `reattach_states`."
                )

        conf = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
        dataset_id, project_id, table_id = (
            conf["datasetId"],
            conf["projectId"],
            conf["tableId"],
        )
        BigQueryTableLink.persist(
            context=context,
            task_instance=self,
            dataset_id=dataset_id,
            project_id=project_id,
            table_id=table_id,
        )
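
Unlike Example #1, this version submits the extract through insert_job with a deterministic job_id, so re-running the same task raises Conflict: the operator then reattaches if the existing job's state is in reattach_states, and otherwise fails with a hint to set force_rerun=True. In recent provider versions both knobs are constructor arguments; a hedged sketch with placeholder names:

from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator

# Hypothetical task tuning the rerun/reattach behavior shown above.
extract_avro = BigQueryToGCSOperator(
    task_id="extract_with_reattach",
    source_project_dataset_table="my-project.my_dataset.sales",
    destination_cloud_storage_uris=["gs://my-bucket/exports/sales-*.avro"],
    export_format="AVRO",
    force_rerun=False,  # reuse the deterministic job_id across retries
    reattach_states={"PENDING", "RUNNING"},  # reattach instead of failing while the job is in flight
)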