Example #1
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if (not self.schema_fields and self.schema_object
                and self.source_format != 'DATASTORE_BACKUP'):
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(
                gcs_hook.download(self.bucket,
                                  self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                src_fmt_configs=self.src_fmt_configs)
        else:
            cursor.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] or 0
            self.log.info('Loaded BQ data with max %s.%s=%s',
                          self.destination_project_dataset_table,
                          self.max_id_key, max_id)
            return max_id
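For context, this execute() matches the signature of the contrib GoogleCloudStorageToBigQueryOperator found in older Airflow releases. A minimal, hedged sketch of how such an operator might be instantiated; every bucket, object, and table name below is a placeholder, not taken from the original code:

from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

load_events = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_events',
    bucket='example-bucket',                                # placeholder bucket
    source_objects=['events/2018-09-14/*.csv'],             # placeholder objects
    destination_project_dataset_table='example-project.analytics.events',
    schema_object='schemas/events.json',                    # schema JSON read from GCS, as in execute()
    source_format='CSV',
    skip_leading_rows=1,
    write_disposition='WRITE_TRUNCATE',
    max_id_key='event_id',                                  # makes execute() return MAX(event_id)
    bigquery_conn_id='bigquery_default',
    google_cloud_storage_conn_id='google_cloud_default',
)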
Example #2
 def get_db_hook(self):
     return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         use_legacy_sql=self.use_legacy_sql)
Example #3
 def get_db_hook(self):
     return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
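Examples #2 and #3 are get_db_hook() overrides of the kind used by the BigQuery check/value-check operators. A hedged sketch of how the returned hook is typically consumed; get_first() comes from DbApiHook, which BigQueryHook subclasses, and the helper name and SQL are hypothetical:

from airflow.exceptions import AirflowException

def run_simple_check(check_operator, sql):
    # Mirrors how a CheckOperator-style class would use get_db_hook().
    records = check_operator.get_db_hook().get_first(sql)
    if not records or not all(bool(r) for r in records):
        raise AirflowException('Check failed, query returned: {}'.format(records))
    return records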
Example #4
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.contrib.hooks.bigquery_hook import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.contrib.hooks.gcp_sql_hook import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
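This dispatch mirrors Connection.get_hook() from Airflow 1.x, so a BigQueryHook can be obtained generically from connection metadata. A hedged usage sketch; the connection id is a placeholder and must have conn_type 'google_cloud_platform' to hit the branch above:

from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('bigquery_default')
hook = conn.get_hook()                        # returns BigQueryHook via the branch above
df = hook.get_pandas_df('SELECT 1')           # DbApiHook-style convenience method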
Example #5
 def init(self):
     self.bq_hook = BigQueryHook(bigquery_conn_id=self.bq_conn_id,
                                 use_legacy_sql=False)
     bq_conn = self.bq_hook.get_conn()
     self.bq_cursor = bq_conn.cursor()
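A hedged sketch of what the cached cursor from init() is typically used for afterwards. The SQL and table names are placeholders, and the keyword names follow the 1.10-era contrib BigQueryBaseCursor (older releases used bql instead of sql):

from airflow.contrib.hooks.bigquery_hook import BigQueryHook

cursor = BigQueryHook(bigquery_conn_id='bigquery_default',
                      use_legacy_sql=False).get_conn().cursor()
cursor.run_query(
    sql='SELECT event_id, event_ts FROM `example-project.analytics.events`',
    destination_dataset_table='analytics.events_copy',   # written with the disposition below
    write_disposition='WRITE_TRUNCATE',
)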
Example #6
def create_dag(pipeline):
    #Request Configuration
    DAG_NAME = pipeline['dag']['dag_name']
    ARGS = pipeline['args']
    START_DATE = pipeline['start_date']
    SCHEDULE_INTERVAL = pipeline['dag']['schedule_interval'] 
    BQ_CONN_ID = pipeline['connection']['bq_conn_id']
    BQ_PROJECT_DESTINATION = pipeline['connection']['bq_project_destination']
    BQ_DATASET_DESTINATION = pipeline['connection']['bq_dataset_destination']
    BQ_TABLE_DESTINATION = pipeline['connection']['bq_table_destination']
    BUCKET_DESTINATION = pipeline['connection']['bucket_destination']
    FOLDER_IN_BUCKET = pipeline['connection']['folder_in_bucket']
    IS_USING_ML = pipeline['model']['is_using_ml']
    TARGET_FORECAST = pipeline['model']['target_forecast']
    CATEGORY = pipeline['model']['category']
    YLABEL = pipeline['viz']['ylabel']
    QUERY = pipeline['query']
    START_DATETIME_QUERY = pipeline['start_datetime_query'] 

    if IS_USING_ML:
        ATTACHED_FILE = ['forecast_result.pdf']
    else:
        ATTACHED_FILE = None

    
    
    # Connection Hook
    bq_hook = BigQueryHook(bigquery_conn_id=BQ_CONN_ID, use_legacy_sql=False)
    # _get_field() / _get_credentials() are private GoogleCloudBaseHook helpers, reused
    # here so the native google-cloud clients share the Airflow connection's credentials.
    storage_client = storage.Client(project=bq_hook._get_field("project"),
                                    credentials=bq_hook._get_credentials())
    bq_client = bigquery.Client(project=bq_hook._get_field("project"),
                                credentials=bq_hook._get_credentials())

    dag = DAG(
        dag_id=DAG_NAME,
        default_args=ARGS,
        schedule_interval=timedelta(days=SCHEDULE_INTERVAL),
        start_date= datetime.strptime(START_DATE, '%Y-%m-%d'),
        dagrun_timeout=timedelta(minutes=60),
        max_active_runs=1, 
        catchup=False
        )

    def if_table_exists(**kwargs):
        bq_table = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+'*')
        #read file 
        try:
            bq_client.get_table(bq_table)
            kwargs['ti'].xcom_push(key='created_flag', value=True)
        except Exception:  # table not found (or lookup failed): mark as not created
            kwargs['ti'].xcom_push(key='created_flag', value=False)

    def branch_table(**kwargs):
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag', task_ids='chck_table')
        print('is_table_exists: ' + str(is_table_exists))
        if not IS_USING_ML:
            return 'mail'
        elif IS_USING_ML and is_table_exists: 
            return 'predict'
        else:
            return 'train'

    def to_table(**kwargs):
        is_table_exists=kwargs['ti'].xcom_pull(key='created_flag', task_ids='chck_table')

        #table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+str(kwargs['execution_date'].date().strftime('%Y%m%d')))
        if is_table_exists:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+'20180914')
        else:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+'20180831')


        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = 'CREATE_IF_NEEDED'
        job_config.write_disposition = 'WRITE_TRUNCATE'    
        job_config.destination = table_ref
        # -- Uncomment to create partitioned table
        #job_config.time_partitioning = bigquery.table.TimePartitioning()
        
        #is_table_exists = False
        if is_table_exists:
            #sql=QUERY.format(str(datetime.fromtimestamp(kwargs['execution_date'].timestamp()) - timedelta(SCHEDULE_INTERVAL - 1)) ,str(datetime.fromtimestamp(kwargs['execution_date'].timestamp())))
            sql=QUERY.format('2018-09-01 00:00:00','2018-09-14 23:59:59')
            kwargs['ti'].xcom_push(key='csv_file', value='20180914')
        else:
            #sql=QUERY.format(START_DATETIME_QUERY,str(datetime.fromtimestamp(kwargs['execution_date'].timestamp())))
            sql=QUERY.format('2018-01-01 00:00:00','2018-08-31 23:59:59')
            kwargs['ti'].xcom_push(key='csv_file', value='20180831')

        query_job = bq_client.query(
            sql,
            location='US',
            job_config=job_config,
        )
        result = query_job.result()
        kwargs['ti'].xcom_push(key='row_num', value=result.total_rows)

    def query_to_csv(**kwargs):    
        #table_ref = bq_client.dataset(dataset_id=BQ_DATASET_DESTINATION).table(table_id=BQ_TABLE_DESTINATION+'_'+str(kwargs['execution_date'].date().strftime('%Y%m%d')))

        is_table_exists=kwargs['ti'].xcom_pull(key='created_flag', task_ids='chck_table')

        #table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+str(kwargs['execution_date'].date().strftime('%Y%m%d')))
        if is_table_exists:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+'20180914')
            uri = 'gs://'+BUCKET_DESTINATION+'/'+FOLDER_IN_BUCKET+BQ_TABLE_DESTINATION+'_'+'20180914.csv'
        else:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION+'_'+'20180831')
            uri = 'gs://'+BUCKET_DESTINATION+'/'+FOLDER_IN_BUCKET+BQ_TABLE_DESTINATION+'_'+'20180831.csv'

        csv_job = bq_client.extract_table(
            source = table_ref,
            #Change the destination uri if needed
            # 'gs://'+BUCKET_DESTINATION+'/'+FOLDER_IN_BUCKET+BQ_TABLE_DESTINATION+'_'+str(kwargs['execution_date'].date().strftime('%Y%m%d'))+'.csv' 
            destination_uris = uri,
            location='US',
        )
        csv_job.result()

    def gcs_csv_to_df(bq_hook, execution_date):
        #todo : change the blob folder to be filled with the request.json 
        #blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(FOLDER_IN_BUCKET+BQ_TABLE_DESTINATION+'_'+str(kwargs['execution_date'].date().strftime('%Y%m%d'))+'.csv') 
        blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(FOLDER_IN_BUCKET+BQ_TABLE_DESTINATION+'_'+execution_date+'.csv') 
        byte_stream = io.BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)
        df = pd.read_csv(byte_stream)
        return df

    def save_to_gcs(bq_hook, stream, path_file):
        blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(FOLDER_IN_BUCKET+path_file)
        byte_stream = io.BytesIO()
        pickle.dump(stream, byte_stream, pickle.HIGHEST_PROTOCOL)
        byte_stream.seek(0)
        blob.upload_from_file(byte_stream)

    def train_mdl(**kwargs):
        #df = gcs_csv_to_df(bq_hook, kwargs['execution_date'].date())
        df = gcs_csv_to_df(bq_hook, '20180831')

        df = df.rename(columns={TARGET_FORECAST: 'y'})  # rename() returns a copy; reassign it
        model = prophet.train(df, category_cols=CATEGORY)
        save_to_gcs(bq_hook,  model, 'model/model.pickle')
        
        #prediction = predict_mdl(kwargs['execution_date'].date())
        prediction = predict_mdl(datetime(2018,8,31).date())
        return prediction

    def predict_mdl(execution_date):
        blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(FOLDER_IN_BUCKET+'model/model.pickle')
        byte_stream = io.BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)
        models = pickle.load(byte_stream)
        prediction = prophet.predict(execution_date, models=models, schedule_interval=SCHEDULE_INTERVAL, category_cols=CATEGORY)
        plot.plotvis(prediction, YLABEL, SCHEDULE_INTERVAL, category_cols=CATEGORY)
        prediction.to_csv('prediction.csv', index=False)

        #blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(FOLDER_IN_BUCKET+'prediction/prediction_'+str((kwargs['execution_date']+timedelta(days=1)).date().strftime('%Y%m%d'))+'.csv')
        blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(FOLDER_IN_BUCKET+'prediction/prediction_'+execution_date.strftime('%Y%m%d')+'.csv')

        blob.upload_from_filename('prediction.csv')
        return prediction

    def predict(**kwargs):
        #prediction = predict_mdl(kwargs['execution_date'].date())
        prediction = predict_mdl(datetime(2018,9,14).date())
        return prediction
    

    with dag: 
        chck_table = PythonOperator(
            task_id = 'chck_table',
            dag=dag,
            python_callable=if_table_exists,
        )

        chck_table_branch = BranchPythonOperator(
            task_id = 'chck_table_branch',
            dag=dag,
            python_callable=branch_table,
        )

        crt_table = PythonOperator(
            task_id='crt_table',
            dag=dag,
            python_callable=to_table
        )

        save_to_csv = PythonOperator(
            task_id='save_to_csv',
            dag=dag,
            python_callable=query_to_csv
        )

        train = PythonOperator(
            task_id='train',
            dag=dag,
            python_callable=train_mdl,
        )

        #Change the 2018031 to {{ds_nodash}}
        mail = EmailOperator(
            task_id='mail',
            dag=dag,
            trigger_rule='none_failed',
            to='*****@*****.**',
            subject='Reporting: {{ params.dag_name }} {{ ds }}',
            params={
                'dag_name': DAG_NAME,
                'table': BQ_TABLE_DESTINATION, 
                'dataset': BQ_DATASET_DESTINATION,
                'project': BQ_PROJECT_DESTINATION,
                'bucket': BUCKET_DESTINATION,
                'using_ml': IS_USING_ML
                },
            html_content=''' 
            DAG name : {{ params.dag_name }}
            <br>
            Table : {{ params.table }}
            <br>
            Dataset : {{ params.dataset }} 
            <br>
            Project : {{ params.project }}
            <br>
            CSV link in GCS : https://storage.cloud.google.com/{{ params.bucket }}/{{ params.dataset }}/{{ params.table }}_{{task_instance.xcom_pull(task_ids='crt_table', key='csv_file')}}.csv
            <br>
            Number of recorded rows : {{task_instance.xcom_pull(task_ids='crt_table', key='row_num')}}
            ''',
            files = ATTACHED_FILE,
            cc=['*****@*****.**', '*****@*****.**'],
        )

        # NOTE: `predict` is rebound here from the callable defined above to this
        # operator; python_callable still captures the original function object.
        predict = PythonOperator(
            task_id='predict',
            dag=dag,
            python_callable=predict,
        )

    chck_table >> crt_table >> save_to_csv >> chck_table_branch >> [train, predict, mail] 
    train >> mail 
    predict >> mail 

    return dag
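A hedged sketch of how create_dag() is commonly wired up so the scheduler picks up one DAG per pipeline definition; the JSON path and list structure are assumptions based on the keys read at the top of create_dag():

import json

with open('/home/airflow/gcs/dags/config/pipelines.json') as f:   # hypothetical config path
    pipelines = json.load(f)

for pipeline in pipelines:
    dag = create_dag(pipeline)
    globals()[dag.dag_id] = dag   # module-level binding so Airflow discovers the DAG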
Example #7
from __future__ import print_function
import json
from airflow import AirflowException
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

# Global Variables for library
g_source_composer_bucket_path = '/home/airflow/gcs/dags/'
g_json_composer_bucket_path = g_source_composer_bucket_path + 'json/'
g_sql_composer_bucket_path = g_source_composer_bucket_path + 'sql/'
g_conn_id = 'bigquery_default'
g_bq_hook = BigQueryHook(bigquery_conn_id=g_conn_id)
g_config_file = "/home/airflow/gcs/dags/json/DPLF_DAG_Generator_Config.json"


# Access configuration
def DPLF_Access_ReadConfig(
    p_config_file="/home/airflow/gcs/dags/json/DPLF_DAG_Generator_Config.json"
):
    with open(p_config_file) as v_json_file:
        v_config = json.load(v_json_file)
    return v_config


# Extractor of files and tasks from Json
def DPLF_GetValueByKey(p_list, p_key):
    if p_key in p_list:
        return p_list[p_key]
    else:
        return ""
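A hedged usage sketch for the two helpers above; the configuration key is an assumption, not taken from DPLF_DAG_Generator_Config.json:

v_config = DPLF_Access_ReadConfig()                     # reads the default config path
v_project = DPLF_GetValueByKey(v_config, 'project_id')  # returns '' if the key is absent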

Example #8
    def patch_bq_cols(self, bq_table, sf_cols):
        """
        Used to decide whether we need to run an ALTER or CREATE
        table command. Leverages alter_tbl_ddl() and create_tbl_ddl()
        to create the DDL that will be run.
        """
        bq_service = BigQueryHook(
            bigquery_conn_id=self.bq_conn_id).get_service()
        bq_conn = BigQueryBaseCursor(bq_service, self.bq_project)

        missing_cols = []

        try:
            bq_cols = bq_conn.get_schema(self.bq_dataset, bq_table)
            print(bq_cols)
            bq_cols = [col for col in bq_cols['fields']]
            missing_cols = [x for x in sf_cols if x['name'] not in bq_cols]

        except Exception:  # table/schema not found: build the full schema and create the table
            bq_cols = []
            for col in sf_cols:
                bq_cols.append({
                    "type": col['type'],
                    "name": col["name"].lower(),
                    "mode": "NULLABLE"
                })

            self.create_tbl_ddl(bq_table, bq_cols)

        if missing_cols:

            bq_cols = []
            for col in sf_cols:
                bq_cols.append({
                    "type": col['type'],
                    "name": col["name"].lower(),
                    "mode": "NULLABLE"
                })
            bq_cols.append({
                "name": "partition_date",
                "type": "DATE",
                "mode": "NULLABLE"
            })

            print('new schema is ' + str(bq_cols))

            table_resource = {}

            table_resource['schema'] = {'fields': bq_cols}

            try:
                bq_service.tables().patch(projectId=self.bq_project,
                                          datasetId=self.bq_dataset,
                                          tableId=bq_table,
                                          body=table_resource).execute()

                self.log.info('Table patched successfully')

            except HttpError as err:
                raise AirflowException(
                    'BigQuery job failed. Error was: {}'.format(err.content))

        return bq_cols
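A hedged invocation sketch for patch_bq_cols() (from within the same class); sf_cols mirrors the source column descriptors the method expects, with 'name' and 'type' keys, and the values are placeholders:

sf_cols = [
    {'name': 'Id', 'type': 'STRING'},
    {'name': 'Amount', 'type': 'FLOAT'},
    {'name': 'CreatedDate', 'type': 'TIMESTAMP'},
]
final_schema = self.patch_bq_cols(bq_table='opportunity', sf_cols=sf_cols)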
Example #9
 def execute(self, context):
     logging.info('Executing: %s', str(self.bql))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
     hook.run(self.bql, self.destination_dataset_table, self.write_disposition)
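This execute() reads only bql, destination_dataset_table, write_disposition, and bigquery_conn_id, so it resembles the early contrib BigQueryOperator. A hedged, minimal instantiation sketch; the query and table names are placeholders:

from airflow.contrib.operators.bigquery_operator import BigQueryOperator

daily_rollup = BigQueryOperator(
    task_id='daily_rollup',
    bql='SELECT date, COUNT(*) AS c FROM [example-project:analytics.events] GROUP BY date',
    destination_dataset_table='analytics.events_daily',
    write_disposition='WRITE_TRUNCATE',
    bigquery_conn_id='bigquery_default',
)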
Example #10
 def get_db_hook(self):
     return BigQueryHook(bigquery_conn_id=self.conn_id).get_conn().cursor()
Example #11
#  ----------------------------------------------------------------------------

conn_id_gcp = 'conn_id_gcp'
conn_id_gcp_cros = 'conn_id_gcp_cros'
sa_cross = Variable.get("config_sa-cross")

slack_token_variable_name = "slack_token"
slack_user_name_daf_airflow = "slack_user_name_to_users"
slack_channel_daf_airflow = "slack_channel_daf_airflow"

github_auth = Variable.get("github_auth", deserialize_json=True)
#  ----------------------------------------------------------------------------
#  HOOKS - VARIABLES
#  ----------------------------------------------------------------------------

bq_hook = BigQueryHook(bigquery_conn_id=conn_id_gcp_cros, use_legacy_sql=False)

#  ----------------------------------------------------------------------------
#  DATE - VARIABLES
#  ----------------------------------------------------------------------------

macro_yesterday_date = '{{ ds }}'
macro_today_date = '{{ next_ds }}'

#  ----------------------------------------------------------------------------
#  CLUSTER CONFIG - VARIABLES
#  ----------------------------------------------------------------------------
predict_config_script = \
    Variable.get("var-food_ontology-predict-configuration",
                 deserialize_json=True)
predict_properties_daily = \
Example #12
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GoogleCloudStorageHook(
                    google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                    delegate_to=self.delegate_to)
                schema_fields = json.loads(gcs_hook.download(
                    self.bucket,
                    self.schema_object).decode("utf-8"))
            elif self.schema_object is None and self.autodetect is False:
                raise ValueError('At least one of `schema_fields`, `schema_object`, '
                                 'or `autodetect` must be passed.')
            else:
                schema_fields = None

        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                src_fmt_configs=self.src_fmt_configs,
                encryption_configuration=self.encryption_configuration
            )
        else:
            cursor.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration)

        if cursor.use_legacy_sql:
            escaped_table_name = '[{}]'.format(self.destination_project_dataset_table)
        else:
            escaped_table_name = '`{}`'.format(self.destination_project_dataset_table)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key,
                escaped_table_name))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table, self.max_id_key, max_id
            )
            return max_id
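Example #12 is a later revision of Example #1: it adds autodetect, encoding, cluster_fields and encryption_configuration pass-through, a ValueError when no schema source is available, and legacy/standard-SQL-aware escaping for the MAX() query. A hedged illustration of that escaping branch (the table name is a placeholder):

table = 'example-project.analytics.events'
legacy_sql = 'SELECT MAX(id) FROM [{}]'.format(table)     # branch taken when cursor.use_legacy_sql is True
standard_sql = 'SELECT MAX(id) FROM `{}`'.format(table)   # branch taken when it is False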
Example #13
 def execute(self, context):
     logging.info('Executing: %s', str(self.bql))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition, self.allow_large_results, self.udf_config)
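Compared with Example #9, this revision goes through the DB-API layer (get_conn().cursor().run_query()) and additionally reads allow_large_results and udf_config. A hedged sketch of how those extras are typically supplied; the UDF resource URI is a placeholder and the dict keys follow BigQuery's legacy userDefinedFunctionResources format:

extra_query_kwargs = dict(
    allow_large_results=True,                                         # requires a destination table
    udf_config=[{'resourceUri': 'gs://example-bucket/udfs/parse_ua.js'}],
)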