Example #1
 def execute(self, context):
     for i in range(len(self.source_project_dataset_table)):
         try:
             self.log.info('Executing %d/%d extracts', i+1, len(self.source_project_dataset_table))
             self.log.info('Executing extract of %s into: %s',
                           self.source_project_dataset_table[i],
                           self.destination_cloud_storage_uris[i])
             hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                 delegate_to=self.delegate_to)
             conn = hook.get_conn()
             cursor = conn.cursor()
             cursor.run_extract(
                 self.source_project_dataset_table[i],
                 self.destination_cloud_storage_uris[i],
                 self.compression,
                 self.export_format,
                 self.field_delimiter,
                 self.print_header,
                 self.labels)
         except Exception as e:
             self.log.error('Exception: %s', e)
              self.log.info('Waiting %d seconds before retrying', self.lazy_retry_wait)
             time.sleep(self.lazy_retry_wait)
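              # Rebuild the hook and connection, then retry the extract once after the wait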
             hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                 delegate_to=self.delegate_to)
             conn = hook.get_conn()
             cursor = conn.cursor()
             cursor.run_extract(
                 self.source_project_dataset_table[i],
                 self.destination_cloud_storage_uris[i],
                 self.compression,
                 self.export_format,
                 self.field_delimiter,
                 self.print_header,
                 self.labels)
Example #2
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)

        # Remove the table if it already exists
        full_table_name = '%s.%s.%s' % (self.project_id, self.dataset_id,
                                        self.table_id)
        bq_hook.get_conn().cursor().run_table_delete(full_table_name,
                                                     ignore_if_missing=True)
Example #3
def count_rows(project, dataset, table):
    hook = BigQueryHook()
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.execute(f"SELECT COUNT(*) FROM `{PROJECT}.{dataset}.{table}`")
    res = cursor.fetchone()
    return res[0]
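A minimal usage sketch for the helper above (hypothetical project, dataset and table names; assumes a default BigQuery connection is configured in Airflow):

n = count_rows('my-project', 'my_dataset', 'my_table')
print(f'table holds {n} rows')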
Example #4
 def _load_bq_cursor(self):
     if self.bq_cursor is None:
         hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                             use_legacy_sql=self.use_legacy_sql,
                             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
Example #5
        def dry_run_bql(task):
            """
            Call the BigQuery dry run API to run the rendered query.

            :param task: BigQueryOperator task that need to be rendered
            :type task: BigQueryOperator
            :return: query reply from the API
            :rtype: json
            """
            query = getattr(task, 'bql', None)
            if query is None:
                query = getattr(task, 'sql')

            hook = BigQueryHook(bigquery_conn_id=task.bigquery_conn_id,
                                delegate_to=task.delegate_to)
            conn = hook.get_conn()
            cursor = conn.cursor()

            job_data = {
                'configuration': {
                    'dryRun': True,
                    'query': {
                        'query': query,
                        'useLegacySql': task.use_legacy_sql,
                        'maximumBillingTier': task.maximum_billing_tier
                    }
                }
            }

            jobs = cursor.service.jobs()
            query_reply = jobs \
                .insert(projectId=cursor.project_id, body=job_data) \
                .execute()

            return query_reply
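A hedged usage sketch for the dry-run helper above: the reply is a BigQuery job resource, so the estimated bytes scanned should be available under its statistics (field names assumed from the BigQuery v2 jobs API):

reply = dry_run_bql(task)
estimated_bytes = reply.get('statistics', {}).get('query', {}).get('totalBytesProcessed')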
Example #6
 def execute(self, context):
     self.log.info('Deleting: %s', self.deletion_dataset_table)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_table_delete(self.deletion_dataset_table, self.ignore_if_missing)
Example #7
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.sql)
         hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                             use_legacy_sql=self.use_legacy_sql,
                             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(
         self.sql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         allow_large_results=self.allow_large_results,
         flatten_results=self.flatten_results,
         udf_config=self.udf_config,
         maximum_billing_tier=self.maximum_billing_tier,
         maximum_bytes_billed=self.maximum_bytes_billed,
         create_disposition=self.create_disposition,
         query_params=self.query_params,
         labels=self.labels,
         schema_update_options=self.schema_update_options,
         priority=self.priority,
         time_partitioning=self.time_partitioning,
         api_resource_configs=self.api_resource_configs,
         cluster_fields=self.cluster_fields,
     )
Example #8
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.gcs_schema_object:

            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)

            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                gcs_bucket,
                gcs_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_table(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            schema_fields=schema_fields,
            time_partitioning=self.time_partitioning
        )
Example #9
    def _run_bq_query(self, context):
        self.log.info('Running BigQuery query: %s', self.sql)
        if self.bq_cursor is None:
            hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                use_legacy_sql=self.use_legacy_sql,
                                delegate_to=self.delegate_to,
                                location=None)
            conn = hook.get_conn()
            self.bq_cursor = conn.cursor()

        job_id = self.bq_cursor.run_query(
            sql=self.sql,
            destination_dataset_table=self.tmp_dataset_table,
            write_disposition='WRITE_TRUNCATE',
            allow_large_results=self.allow_large_results,
            flatten_results=None,
            udf_config=None,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            query_params=None,
            labels=self.labels,
            schema_update_options=(),
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
        )

        context['task_instance'].xcom_push(key='job_id', value=job_id)
Example #10
    def execute(self, context):
        if self.dataset:
            raw_tables = [
                f"{self.dataset}.{tbl}" for tbl in self.dst_table_names
            ]
        else:
            raw_tables = self.dst_table_names

        dst_table_names = [format_table_name(x) for x in raw_tables]

        src_table_names = [
            format_table_name(x, is_staging=True) for x in raw_tables
        ]

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        for src, dst in zip(src_table_names, dst_table_names):
            cursor.run_copy(src, dst, write_disposition=self.write_disposition)

        # once all tables moved, then delete staging
        for src in src_table_names:
            cursor.run_table_delete(src)

        return dst_table_names
Example #11
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs
        )
Example #12
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs,
            labels=self.labels
        )
Example #13
 def execute(self, context):
     logging.info('Deleting: %s', self.deletion_dataset_table)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_table_delete(self.deletion_dataset_table, self.ignore_if_missing)
Example #14
    def execute(self, context):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s ; Max Results: %s',
                      self.dataset_id, self.table_id, self.max_results)

        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to)

        conn = hook.get_conn()
        cursor = conn.cursor()
        response = cursor.get_tabledata(dataset_id=self.dataset_id,
                                        table_id=self.table_id,
                                        max_results=self.max_results,
                                        selected_fields=self.selected_fields)

        self.log.info('Total Extracted rows: %s', response['totalRows'])
        rows = response['rows']

        table_data = []
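        # Flatten the nested tabledata row format ({'f': [{'v': value}, ...]}) into plain lists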
        for dict_row in rows:
            single_row = []
            for fields in dict_row['f']:
                single_row.append(fields['v'])
            table_data.append(single_row)

        return table_data
Example #16
    def _bq_get_data(self):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s',
                      self.dataset_id, self.table_id)

        hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to)

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
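        # Page through the table: fetch batch_size rows per call, advancing start_index each pass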
        while True:
            response = cursor.get_tabledata(dataset_id=self.dataset_id,
                                            table_id=self.table_id,
                                            max_results=self.batch_size,
                                            selected_fields=self.selected_fields,
                                            start_index=i * self.batch_size)

            if 'rows' in response:
                rows = response['rows']
            else:
                self.log.info('Job Finished')
                return

            self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)

            table_data = []
            for dict_row in rows:
                single_row = []
                for fields in dict_row['f']:
                    single_row.append(fields['v'])
                table_data.append(single_row)

            yield table_data
            i += 1
Example #17
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        schema_fields = self.schema_fields if self.schema_fields else json.loads(
            gcs_hook.download(self.bucket, self.schema_object))
        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_dataset_table=self.destination_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            logging.info('Loaded BQ data with max {}.{}={}'.format(
                self.destination_dataset_table, self.max_id_key, max_id))
            return max_id
Example #18
    def execute(self, context):
        full_table_name = format_table_name(self.dst_table_name)
        print(full_table_name)

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        print(self.sql)

        # table_resource = {
        #    "tableReference": {"table_id": table_id},
        #    "materializedView": {"query": self.sql}
        # }

        # bigquery.Table.from_api_repr(table_resource)

        try:
            cursor.run_query(
                sql=self.sql,
                destination_dataset_table=full_table_name,
                write_disposition="WRITE_TRUNCATE",
                create_disposition=self.create_disposition,
                use_legacy_sql=False,
            )

            self.log.info("Query table as created successfully: {}".format(
                full_table_name))
        except HttpError as err:
            raise AirflowException("BigQuery error: %s" % err.content)
Example #19
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.gcs_schema_object:

            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)

            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                gcs_bucket,
                gcs_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_table(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            schema_fields=schema_fields,
            time_partitioning=self.time_partitioning,
            labels=self.labels
        )
Example #20
    def execute(self, context):
        full_table_name = format_table_name(self.src_table)
        dataset_id, table_id = full_table_name.split(".")

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        table_resource = {
            "tableReference": {
                "table_id": table_id
            },
            "materializedView": {
                "query": self.sql
            },
        }

        # bigquery.Table.from_api_repr(table_resource)
        project_id = get_project_id()

        try:
            cursor.service.tables().insert(
                projectId=project_id,
                datasetId=dataset_id,
                body=table_resource).execute(num_retries=self.num_retries)

            self.log.info("Table created successfully: %s:%s.%s", project_id,
                          dataset_id, table_id)
        except HttpError as err:
            raise AirflowException("BigQuery error: %s" % err.content)
Example #21
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.sql)
         hook = BigQueryHook(
             bigquery_conn_id=self.bigquery_conn_id,
             use_legacy_sql=self.use_legacy_sql,
             delegate_to=self.delegate_to,
             location=self.location,
         )
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(
         sql=self.sql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         allow_large_results=self.allow_large_results,
         flatten_results=self.flatten_results,
         udf_config=self.udf_config,
         maximum_billing_tier=self.maximum_billing_tier,
         maximum_bytes_billed=self.maximum_bytes_billed,
         create_disposition=self.create_disposition,
         query_params=self.query_params,
         labels=self.labels,
         schema_update_options=self.schema_update_options,
         priority=self.priority,
         time_partitioning=self.time_partitioning,
         api_resource_configs=self.api_resource_configs,
         cluster_fields=self.cluster_fields,
     )
Example #22
 def execute(self, context):
     self.log.info(f'Executing query """\n{self.sql}\n""" '
                   f'and save to table: "{self.destination_dataset_table}"')
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(
         sql=self.sql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         create_disposition=self.create_disposition,
         labels=self.labels,
         encryption_configuration=self.encryption_configuration,
         allow_large_results=self.allow_large_results,
         flatten_results=self.flatten_results,
         udf_config=self.udf_config,
         use_legacy_sql=self.use_legacy_sql,
         maximum_billing_tier=self.maximum_billing_tier,
         maximum_bytes_billed=self.maximum_bytes_billed,
         query_params=self.query_params,
         schema_update_options=self.schema_update_options,
         priority=self.priority,
         time_partitioning=self.time_partitioning,
         api_resource_configs=self.api_resource_configs,
         cluster_fields=self.cluster_fields,
         location=self.location)
Example #23
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                          delegate_to=self.delegate_to)
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        schema_fields = self.schema_fields if self.schema_fields else json.loads(gcs_hook.download(self.bucket, self.schema_object))
        source_uris = ['gs://{}/{}'.format(self.bucket, source_object) for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            logging.info('Loaded BQ data with max {}.{}={}'.format(self.destination_project_dataset_table, self.max_id_key, max_id))
            return max_id
Example #24
class GoogleDisplayVideo360ERFToBigQueryOperator(BaseOperator):
    """Upload Multiple Entity Read Files to specified big query dataset.
    """
    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 report_body=None,
                 yesterday=False,
                 entity_type=None,
                 file_creation_date=None,
                 cloud_project_id=None,
                 bq_table=None,
                 schema=None,
                 gcs_bucket=None,
                 erf_bucket=None,
                 partner_ids=None,
                 write_disposition='WRITE_TRUNCATE',
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360ERFToBigQueryOperator,
              self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.service = None
        self.bq_hook = None
        self.gcs_hook = None
        self.report_body = report_body
        self.erf_bucket = erf_bucket
        self.yesterday = yesterday
        self.cloud_project_id = cloud_project_id
        self.bq_table = bq_table
        self.gcs_bucket = gcs_bucket
        self.schema = schema
        self.entity_type = entity_type
        self.erf_object = 'entity/%s.0.%s.json' % (file_creation_date,
                                                   entity_type)
        self.partner_ids = partner_ids or []
        self.write_disposition = write_disposition
        self.file_creation_date = file_creation_date

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)

        for i, partner_id in enumerate(self.partner_ids):
            filename = erf_utils.download_and_transform_erf(self, partner_id)
            entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
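            # After the first partner's load, switch to append so earlier rows are kept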
            if i > 0:
                self.write_disposition = 'WRITE_APPEND'

            bq_base_cursor = self.bq_hook.get_conn().cursor()
            bq_base_cursor.run_load(
                destination_project_dataset_table=self.bq_table,
                schema_fields=self.schema,
                source_uris=[entity_read_file_ndj],
                source_format='NEWLINE_DELIMITED_JSON',
                write_disposition=self.write_disposition)
            self.gcs_hook.delete(self.gcs_bucket, filename)
Example #25
 def execute(self, context):
     logging.info('Executing: %s', self.bql)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition,
                      self.allow_large_results, self.udf_config, self.use_legacy_sql)
Example #26
 def execute(self, context):
     logging.info('Executing: %s', str(self.bql))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(self.bql, self.destination_dataset_table,
                      self.write_disposition, self.allow_large_results)
Example #27
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.sql)
         hook = BigQueryHook(
             bigquery_conn_id=self.bigquery_conn_id,
             use_legacy_sql=self.use_legacy_sql,
             delegate_to=self.delegate_to,
             location=self.location,
         )
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     if isinstance(self.sql, str):
         job_id = self.bq_cursor.run_query(
             sql=self.sql,
             destination_dataset_table=self.destination_dataset_table,
             write_disposition=self.write_disposition,
             allow_large_results=self.allow_large_results,
             flatten_results=self.flatten_results,
             udf_config=self.udf_config,
             maximum_billing_tier=self.maximum_billing_tier,
             maximum_bytes_billed=self.maximum_bytes_billed,
             create_disposition=self.create_disposition,
             query_params=self.query_params,
             labels=self.labels,
             schema_update_options=self.schema_update_options,
             priority=self.priority,
             time_partitioning=self.time_partitioning,
             api_resource_configs=self.api_resource_configs,
             cluster_fields=self.cluster_fields,
             encryption_configuration=self.encryption_configuration
         )
     elif isinstance(self.sql, Iterable):
         job_id = [
             self.bq_cursor.run_query(
                 sql=s,
                 destination_dataset_table=self.destination_dataset_table,
                 write_disposition=self.write_disposition,
                 allow_large_results=self.allow_large_results,
                 flatten_results=self.flatten_results,
                 udf_config=self.udf_config,
                 maximum_billing_tier=self.maximum_billing_tier,
                 maximum_bytes_billed=self.maximum_bytes_billed,
                 create_disposition=self.create_disposition,
                 query_params=self.query_params,
                 labels=self.labels,
                 schema_update_options=self.schema_update_options,
                 priority=self.priority,
                 time_partitioning=self.time_partitioning,
                 api_resource_configs=self.api_resource_configs,
                 cluster_fields=self.cluster_fields,
                 encryption_configuration=self.encryption_configuration
             )
             for s in self.sql]
     else:
         raise AirflowException(
             "argument 'sql' of type {} is neither a string nor an iterable".format(type(str)))
     context['task_instance'].xcom_push(key='job_id', value=job_id)
Example #28
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.delete_dataset(project_id=self.project_id,
                              dataset_id=self.dataset_id)
Example #29
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                               delegate_to=self.delegate_to)

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_dataset(project_id=self.project_id,
                                    dataset_id=self.dataset_id,
                                    dataset_reference=self.dataset_reference)
Example #30
 def execute(self, context):
     self.log.info('Executing: %s', self.bql)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition,
                      self.allow_large_results, self.udf_config,
                      self.use_legacy_sql, self.maximum_billing_tier,
                      self.create_disposition, self.query_params)
Example #31
 def execute(self, context):
     logging.info('Executing copy of %s into: %s', self.source_project_dataset_tables, self.destination_project_dataset_table)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_copy(
         self.source_project_dataset_tables,
         self.destination_project_dataset_table,
         self.write_disposition,
         self.create_disposition)
Example #32
 def execute(self, context):
     bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to,
                            use_legacy_sql=False)
     bq_cursor = bq_hook.get_conn().cursor()
     sql = self.SQL_TEMPLATE.format(**self.sql_template_params)
     bq_cursor.execute(sql)
     result = bq_cursor.fetchall()
     # getting the 1st cell of the 1st row of the resultset
     return result[0][0]
Example #33
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.bql)
         hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition,
                      self.allow_large_results, self.udf_config,
                      self.use_legacy_sql, self.maximum_billing_tier,
                      self.create_disposition, self.query_params)
Example #34
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                               delegate_to=self.delegate_to)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        self.log.info('Start getting dataset: %s:%s', self.project_id, self.dataset_id)

        return cursor.get_dataset(
            dataset_id=self.dataset_id,
            project_id=self.project_id)
Example #35
def bq_to_gcs(**kwargs):
    date_stamp = kwargs['ts']

    # get the last current date from Postgres
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()

    cursor.execute('SELECT MAX(last_update_date) FROM airflow.austin_service_reports;')
    
    recent_ds = cursor.fetchone()[0]
    if recent_ds is not None:
        recent_ds+=timedelta(seconds=1)
        last = recent_ds
    else:
        last = kwargs['start_date']-timedelta(days=1)
    
    cursor.close()
    conn.close()

    # open connection to BigQuery
    hook = BigQueryHook(
        bigquery_conn_id='my_gcp_connection',
        use_legacy_sql=False
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    with open(SQL_PATH + 'query_bq_dataset.sql', 'r') as f:
        query = f.read()
    query = query.format(last, date_stamp)

    cursor.execute(query)

    # write to gcs bucket
    # Each returned row of the result gives:
    # result = [unique_key, complaint_type, complaint_description, owning_department, source,
    #           status, created_date, last_update_date, close_date, city]
    with BUCKET.open('bq_bucket/bq_dataset.txt', 'w') as f:
        while True:
            result = cursor.fetchone()
            if result is None:
                break
            
            if result[8] is None:
                result[8] = ''
            else:
                result[8] = datetime.utcfromtimestamp(result[8])
            if result[9] is None:
                result[9] = ''
            result[7] = datetime.utcfromtimestamp(result[7])
            result[6] = datetime.utcfromtimestamp(result[6])
            f.write('|'.join([str(val) for val in result]) + '\n')

    cursor.close()
    conn.close()
Example #36
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_dataset(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            dataset_reference=self.dataset_reference)
Example #37
 def execute(self, context):
     logging.info('Executing extract of %s into: %s', self.source_project_dataset_table, self.destination_cloud_storage_uris)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_extract(
         self.source_project_dataset_table,
         self.destination_cloud_storage_uris,
         self.compression,
         self.export_format,
         self.field_delimiter,
         self.print_header)
Example #38
 def execute(self, context):
     logging.info('Executing extract of %s into: %s',
                  self.source_project_dataset_table,
                  self.destination_cloud_storage_uris)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_extract(self.source_project_dataset_table,
                        self.destination_cloud_storage_uris,
                        self.compression, self.export_format,
                        self.field_delimiter, self.print_header)
Example #39
def bq_to_gcs(**kwargs):
    ds = kwargs['ds']
    previous = datetime.strptime(kwargs['prev_ds'], '%Y-%m-%d').date()

    # get the last current date from Postgres
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()

    cursor.execute('SELECT MAX(CAST(created_date AS DATE)) FROM airflow.austin_service_reports;')
    
    recent_ds = cursor.fetchone()[0]
    if recent_ds is not None:
        recent_ds+=timedelta(days=1)
        if recent_ds < previous:
            prev_ds = datetime.strftime(recent_ds, '%Y-%m-%d')
        else:
            prev_ds = kwargs['prev_ds']
    else:
        prev_ds = datetime.strftime(kwargs['start_date']-timedelta(days=1), '%Y-%m-%d')
    
    cursor.close()
    conn.close()

    # open connection to BigQuery
    hook = BigQueryHook(
        bigquery_conn_id='my_gcp_connection',
        use_legacy_sql=False
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    with open(SQL_PATH + 'query_bq_dataset.sql', 'r') as f:
        query = f.read()
        query = query.format(prev_ds,ds)

    cursor.execute(query)

    # write to gcs bucket
    with BUCKET.open('bq_bucket/bq_dataset.csv', 'w') as f:
        while True:
            result = cursor.fetchone()
            if result is None:
                break
            
            if result[6] is None:
                result[6]= ''
            else:
                result[6] = datetime.utcfromtimestamp(result[6])
            result[5] = datetime.utcfromtimestamp(result[5])
            f.write(','.join([str(val) for val in result]) + '\n')

    cursor.close()
    conn.close()
Example #40
    def execute(self, context):
        self.log.info('Dataset id: %s Project id: %s', self.dataset_id, self.project_id)

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_dataset(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            dataset_reference=self.dataset_reference)
Example #41
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                                  and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key,
                self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table, self.max_id_key, max_id
            )
            return max_id
Example #42
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.bql)
         hook = BigQueryHook(
             bigquery_conn_id=self.bigquery_conn_id,
             use_legacy_sql=self.use_legacy_sql,
             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(
         self.bql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         allow_large_results=self.allow_large_results,
         udf_config=self.udf_config,
         maximum_billing_tier=self.maximum_billing_tier,
         create_disposition=self.create_disposition,
         query_params=self.query_params,
         schema_update_options=self.schema_update_options)
Example #43
 def execute(self, context):
     logging.info('Executing: %s', str(self.bql))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition)