def execute(self, context):
    self.log.info('Executing extract of %s into: %s',
                  self.source_project_dataset_table,
                  self.destination_cloud_storage_uris)
    hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        location=self.location)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(
        source_project_dataset_table=self.source_project_dataset_table,
        destination_cloud_storage_uris=self.destination_cloud_storage_uris,
        compression=self.compression,
        export_format=self.export_format,
        field_delimiter=self.field_delimiter,
        print_header=self.print_header,
        labels=self.labels)
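The extract job above is driven entirely through the hook's cursor. A minimal standalone sketch of the same call, assuming the pre-2.0-style BigQueryHook cursor API and an Airflow connection named google_cloud_default; the table name and gs:// URI are placeholder values:

# Minimal sketch: trigger the same extract job directly from a hook.
# The import path depends on the Airflow version (airflow.contrib.hooks.bigquery_hook
# in 1.10, airflow.providers.google.cloud.hooks.bigquery in the providers package).
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

hook = BigQueryHook(bigquery_conn_id='google_cloud_default')
cursor = hook.get_conn().cursor()
cursor.run_extract(
    source_project_dataset_table='my-project.my_dataset.my_table',        # placeholder
    destination_cloud_storage_uris=['gs://my-bucket/exports/part-*.csv'],  # placeholder
    compression='GZIP',
    export_format='CSV',
    field_delimiter=',',
    print_header=True)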
Example #2
def execute(self, context):
    self.log.info('Executing copy of %s into: %s',
                  self.source_project_dataset_tables,
                  self.destination_project_dataset_table)
    hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to,
                        location=self.location)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_copy(
        source_project_dataset_tables=self.source_project_dataset_tables,
        destination_project_dataset_table=self.destination_project_dataset_table,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        labels=self.labels,
        encryption_configuration=self.encryption_configuration)
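The copy variant takes one or more fully qualified source tables and a single destination. A minimal sketch under the same assumptions as above; all table names are placeholders:

# Minimal sketch: copy two staging tables into one destination table.
hook = BigQueryHook(bigquery_conn_id='google_cloud_default')
cursor = hook.get_conn().cursor()
cursor.run_copy(
    source_project_dataset_tables=['my-project.staging.events_a',   # placeholders
                                   'my-project.staging.events_b'],
    destination_project_dataset_table='my-project.warehouse.events',
    write_disposition='WRITE_APPEND',       # append to existing rows
    create_disposition='CREATE_IF_NEEDED')  # create the table if it is missing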
Example #3
    def _bq_get_data(self):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s', self.dataset_id,
                      self.table_id)

        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        # Page through the table one batch of batch_size rows at a time.
        while True:
            response = cursor.get_tabledata(
                dataset_id=self.dataset_id,
                table_id=self.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )

            if 'rows' in response:
                rows = response['rows']
            else:
                self.log.info('Job Finished')
                return

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            table_data = []
            for dict_row in rows:
                single_row = []
                for fields in dict_row['f']:
                    single_row.append(fields['v'])
                table_data.append(single_row)

            yield table_data
            i += 1
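_bq_get_data is a generator: each iteration fetches one page of batch_size rows and yields it as a list of value lists. A hypothetical caller (the method name and logging are illustrative, not from the source) could consume it like this:

    # Hypothetical consumer of the generator above; illustrative only.
    def execute(self, context):
        for batch in self._bq_get_data():
            # each batch is a list of rows, each row a list of field values
            self.log.info('Received a batch of %d rows', len(batch))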
Example #4
    def _bq_get_data(self):

        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        table_ref = TableReference.from_string(
            self.source_project_dataset_table)
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id,
                      table_ref.table_id)

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        while True:
            response = cursor.get_tabledata(
                dataset_id=table_ref.dataset_id,
                table_id=table_ref.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )

            if 'rows' not in response:
                self.log.info('Job Finished')
                return

            rows = response['rows']

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            table_data = [[fields['v'] for fields in dict_row['f']]
                          for dict_row in rows]

            yield table_data
            i += 1
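This variant first parses the fully qualified source table with TableReference.from_string from the google-cloud-bigquery client library. For illustration (placeholder table id), the parse yields the project, dataset, and table parts that feed the get_tabledata call:

from google.cloud.bigquery.table import TableReference

# Illustration only: split "project.dataset.table" into its parts.
ref = TableReference.from_string('my-project.my_dataset.my_table')
print(ref.project)     # my-project
print(ref.dataset_id)  # my_dataset
print(ref.table_id)    # my_table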
Example #5
    def execute(self, context):
        bq_hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        # Resolve the schema: explicit schema_fields take precedence, then a JSON
        # schema file in GCS, otherwise fall back to BigQuery autodetection.
        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GCSHook(
                    gcp_conn_id=self.google_cloud_storage_conn_id,
                    delegate_to=self.delegate_to,
                    impersonation_chain=self.impersonation_chain,
                )
                blob = gcs_hook.download(
                    bucket_name=self.bucket,
                    object_name=self.schema_object,
                )
                schema_fields = json.loads(blob.decode("utf-8"))
            elif self.schema_object is None and self.autodetect is False:
                raise AirflowException(
                    'At least one of `schema_fields`, `schema_object`, or `autodetect` must be passed.'
                )
            else:
                schema_fields = None

        else:
            schema_fields = self.schema_fields

        source_uris = [
            f'gs://{self.bucket}/{source_object}'
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        # Either expose the GCS files as an external table or run a load job into a native table.
        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                src_fmt_configs=self.src_fmt_configs,
                encryption_configuration=self.encryption_configuration,
                labels=self.labels,
                description=self.description,
            )
        else:
            cursor.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration,
                labels=self.labels,
                description=self.description,
            )

        # Quote the destination table in the style the active SQL dialect expects.
        if cursor.use_legacy_sql:
            escaped_table_name = f'[{self.destination_project_dataset_table}]'
        else:
            escaped_table_name = f'`{self.destination_project_dataset_table}`'

        if self.max_id_key:
            cursor.execute(
                f'SELECT MAX({self.max_id_key}) FROM {escaped_table_name}')
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table,
                self.max_id_key,
                max_id,
            )
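When schema_object is used, the blob downloaded from GCS is expected to hold a JSON array of BigQuery field definitions. An illustrative (not source-derived) example of what json.loads(blob.decode("utf-8")) would return:

# Illustrative schema_object contents; field names and types are placeholders.
schema_fields = [
    {'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'created_at', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
]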