def test_execute_bad_type(self, mock_hook):
    operator = BigQueryOperator(
        task_id=TASK_ID,
        sql=1,
        destination_dataset_table=None,
        write_disposition='WRITE_EMPTY',
        allow_large_results=False,
        flatten_results=None,
        bigquery_conn_id='google_cloud_default',
        udf_config=None,
        use_legacy_sql=True,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=(),
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
    )

    with self.assertRaises(AirflowException):
        operator.execute(MagicMock())
def test_bigquery_operator_defaults(self, mock_hook):
    operator = BigQueryOperator(
        task_id=TASK_ID,
        sql='Select * from test_table',
    )
    operator.execute(None)
    mock_hook.return_value \
        .get_conn() \
        .cursor() \
        .run_query \
        .assert_called_once_with(
            sql='Select * from test_table',
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            udf_config=None,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
        )
def test_bigquery_operator_extra_link(self, mock_hook):
    bigquery_task = BigQueryOperator(
        task_id=TASK_ID,
        sql='SELECT * FROM test_table',
        dag=self.dag,
    )
    self.dag.clear()

    ti = TaskInstance(
        task=bigquery_task,
        execution_date=DEFAULT_DATE,
    )

    job_id = '12345'
    ti.xcom_push(key='job_id', value=job_id)

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j={job_id}'.format(job_id=job_id),
        bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
    )

    self.assertEqual(
        '',
        bigquery_task.get_extra_links(datetime(2019, 1, 1), BigQueryConsoleLink.name),
    )
def test_bigquery_operator_extra_link_when_multiple_query(self, mock_hook, session):
    bigquery_task = BigQueryOperator(
        task_id=TASK_ID,
        sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
        dag=self.dag,
    )
    self.dag.clear()
    session.query(XCom).delete()

    ti = TaskInstance(
        task=bigquery_task,
        execution_date=DEFAULT_DATE,
    )

    job_id = ['123', '45']
    ti.xcom_push(key='job_id', value=job_id)

    self.assertEqual(
        {'BigQuery Console #1', 'BigQuery Console #2'},
        bigquery_task.operator_extra_link_dict.keys(),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=123',
        bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
    )

    self.assertEqual(
        'https://console.cloud.google.com/bigquery?j=45',
        bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
    )
def execute(self, context):
    # TODO: check whether 'hasattr' covers this case
    try:
        self.sql = self.SQL_TEMPLATE.format(**self.sql_template_params)
    except AttributeError:
        self.sql = self.sql.format(**self.sql_template_params)
    BigQueryOperator.execute(self, context)
def test_bigquery_operator_extra_link_when_missing_job_id(self, mock_hook, session):
    bigquery_task = BigQueryOperator(
        task_id=TASK_ID,
        sql='SELECT * FROM test_table',
        dag=self.dag,
    )
    self.dag.clear()
    session.query(XCom).delete()

    self.assertEqual(
        '',
        bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
    )
def test_execute(self, mock_hook):
    encryption_configuration = {'key': 'kk'}
    operator = BigQueryOperator(
        task_id=TASK_ID,
        sql='Select * from test_table',
        destination_dataset_table=None,
        write_disposition='WRITE_EMPTY',
        allow_large_results=False,
        flatten_results=None,
        gcp_conn_id='google_cloud_default',
        udf_config=None,
        use_legacy_sql=True,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=(),
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
        encryption_configuration=encryption_configuration,
    )

    operator.execute(MagicMock())
    mock_hook.return_value \
        .get_conn.return_value \
        .cursor.return_value \
        .run_query \
        .assert_called_once_with(
            sql='Select * from test_table',
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            udf_config=None,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
            encryption_configuration=encryption_configuration,
        )
def execute_sql(task_id: str, sql_file_path: str) -> BigQueryOperator:
    return BigQueryOperator(
        task_id=task_id,
        sql=sql_file_path,
        bigquery_conn_id=BIG_QUERY_CONN_ID,
        write_disposition='WRITE_APPEND',
        use_legacy_sql=False,
        location='US',
    )
def gc_tasks(name, schema, next_task=DummyOperator(task_id="Done")):
    bq_staging = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_source }}}}.{name}"
    bq_warehouse = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_target }}}}.{name}"

    t1 = GoogleCloudStorageToBigQueryOperator(
        task_id=f"staging_{name}",
        bucket="{{var.value.gcs_bucket}}",
        source_objects=[f"{name}*"],
        destination_project_dataset_table=bq_staging,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=schema,
        skip_leading_rows=1,
    )

    t2 = BigQueryOperator(
        task_id=f"merge_{name}_into_warehouse",
        sql=_create_merge_sql(bq_staging, bq_warehouse, schema),
        use_legacy_sql=False,
    )

    t3 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=f"move_{name}_to_processed",
        source_bucket="{{var.value.gcs_bucket}}",
        source_object=f"{name}*",
        destination_bucket="{{var.value.gcs_bucket}}",
        destination_object=f"processed/{name}",
        move_object=True,
    )

    t1 >> t2 >> t3 >> next_task
    return t1
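A minimal usage sketch for the gc_tasks factory above, assuming a surrounding DAG context; the terminator task, schedule, and "orders" schema below are illustrative assumptions, not part of the original.

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

with DAG("gcs_to_bq_warehouse", start_date=datetime(2021, 1, 1), schedule_interval="@daily") as dag:
    # Hypothetical terminator task that each table's chain fans into.
    done = DummyOperator(task_id="all_done")

    # Illustrative schema for a hypothetical "orders" feed.
    orders_schema = [
        {"name": "order_id", "type": "STRING", "mode": "REQUIRED"},
        {"name": "amount", "type": "NUMERIC", "mode": "NULLABLE"},
    ]

    # gc_tasks returns the first task of the staging >> merge >> archive chain.
    staging_orders = gc_tasks("orders", orders_schema, next_task=done)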
def Call_BQ_Load_Proc(proj, dset, tgt_tab):
    return BigQueryOperator(
        task_id='load_' + tgt_tab + '_via_sproc',
        sql='CALL `' + proj + '.' + dset + '.load_' + tgt_tab + '`()',
        use_legacy_sql=False,
        trigger_rule='none_failed',
    )
def Truncate_BQ_Table(proj, dset, tgt_tab):
    return BigQueryOperator(
        task_id='truncate_' + tgt_tab,
        sql='TRUNCATE TABLE `' + proj + '.' + dset + '.' + tgt_tab + '`',
        use_legacy_sql=False,
        trigger_rule='none_failed',
    )
def createTaskHelper(table):
    return BigQueryOperator(
        task_id='materialize__{0}'.format(table),
        bql='{0}.sql'.format(table),
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table='{0}.{1}'.format(BQ_DATASET_NAME, table),
        dag=dag_daily)
def createTaskHelper(table):
    return BigQueryOperator(
        task_id='materialize_{0}'.format(table),
        bql='{0}.sql'.format(table),
        params={"partition_date": "{0}".format(job_run_date)},
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table='{0}.{1}${2}'.format(
            BQ_DATASET_NAME, table, job_run_date.replace('-', '')),
        dag=dag_daily)
def test_bql_deprecation_warning(self):
    with warnings.catch_warnings(record=True) as w:
        BigQueryOperator(
            task_id='test_deprecation_warning_for_bql',
            bql='select * from test_table',
        )
    self.assertIn(
        'Deprecated parameter `bql`',
        w[0].message.args[0])
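For contrast, a minimal sketch of the non-deprecated construction the warning steers users toward: passing the same query through `sql` instead of `bql`.

# Equivalent construction with the supported `sql` parameter; no deprecation warning is expected here.
operator = BigQueryOperator(
    task_id='test_no_deprecation_warning_for_sql',
    sql='select * from test_table',
)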
def deleteStagingTablesTask(table):
    return BigQueryOperator(
        task_id='delete_staging_{0}'.format(table),
        bql='''
        DROP TABLE IF EXISTS {{params.table}}
        ''',
        params={"table": "{0}.{1}".format(BQ_STAGING_DATASET_NAME, table)},
        use_legacy_sql=False,
        dag=dag_daily)
def insert_overwrite(task_id: str, sql_file_path: str, destination_table: str) -> BigQueryOperator:
    return BigQueryOperator(
        task_id=task_id,
        sql=sql_file_path,
        bigquery_conn_id=BIG_QUERY_CONN_ID,
        write_disposition='WRITE_TRUNCATE',
        destination_dataset_table=destination_table,
        use_legacy_sql=False,
        location='US',
    )
def Load_Within_BQ(mode, proj, dset, tgt_tab, src_tab, src_cols='*'):
    t = 'load_' + tgt_tab + '_from_' + src_tab
    # t = 'load_' + tgt_tab + '_via_' + ('custom_op' + str(mode) if mode in (1, 2, 3) else 'subdag' if mode == 4 else 'std_op')
    d = proj + '.' + dset + '.' + tgt_tab
    s = 'SELECT ' + src_cols + ' FROM `' + proj + '.' + dset + '.' + src_tab + '`'
    c = 'CREATE_NEVER'
    w = 'WRITE_TRUNCATE' if mode in (1, 2, 3) else 'WRITE_EMPTY'
    l = False
    r = 'none_failed'

    if mode == 1:
        return CustomBigQueryOperator(task_id=t, sql=s, destination_dataset_table=d,
                                      create_disposition=c, write_disposition=w,
                                      use_legacy_sql=l, trigger_rule=r)
    elif mode == 2:
        return AnotherCustomBigQueryOperator(task_id=t, sql=s, destination_dataset_table=d,
                                             create_disposition=c, write_disposition=w,
                                             use_legacy_sql=l, trigger_rule=r)
    elif mode == 3:
        return DodgyCustomBigQueryOperator(task_id=t, sql=s, destination_dataset_table=d,
                                           create_disposition=c, write_disposition=w,
                                           use_legacy_sql=l, trigger_rule=r)
    elif mode == 4:
        return SubDagOperator(subdag=Load_Subdag(tgt_tab, t, s, d, c, w, l, r, dag.default_args),
                              task_id=t, dag=dag)
    else:
        return BigQueryOperator(task_id=t, sql=s, destination_dataset_table=d,
                                create_disposition=c, write_disposition=w,
                                use_legacy_sql=l, trigger_rule=r)
def _get_bigquery_task():
    dag = DAG(dag_id='TestBigQueryExtractorE2E')
    task = BigQueryOperator(
        sql='select first_name, last_name from customers;',
        task_id="task_id",
        project_id="project_id",
        dag_id="dag_id",
        dag=dag,
        start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
    return task
def __init__(self, project, table, sql_template_params, task_id=None, sql=None, *args, **kwargs):
    self.project = project
    self.table = table
    self.sql_template_params = sql_template_params
    BigQueryOperator.__init__(
        self,
        task_id=task_id if task_id else '{}-table-{}'.format(self.operation, self.table),
        sql=sql if sql else 'SELECT 1',
        allow_large_results=True,
        use_legacy_sql=False,
        *args, **kwargs)
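A hedged sketch of how a concrete subclass of this custom operator might look, assuming the __init__ above and the execute override shown earlier belong to a base class called here TemplatedBigQueryOperator; the base-class name, operation, template, and values are illustrative only.

class DeduplicateTableOperator(TemplatedBigQueryOperator):  # hypothetical base-class name
    # Class-level hooks that the __init__/execute above rely on.
    operation = 'deduplicate'
    SQL_TEMPLATE = 'SELECT DISTINCT * FROM `{project}.{table}`'

dedup_events = DeduplicateTableOperator(
    project='my-project',  # illustrative values
    table='analytics.events',
    sql_template_params={'project': 'my-project', 'table': 'analytics.events'},
    destination_dataset_table='analytics.events_dedup',
    write_disposition='WRITE_TRUNCATE',
)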
def test_extract_error(self, mock_client, mock_hook):
    bq_job_id = "foo.bq.job_id"

    mock_hook.return_value \
        .get_conn.return_value \
        .cursor.return_value \
        .run_query.return_value = bq_job_id

    mock_client.return_value \
        .get_job.side_effects = [Exception("bq error")]

    # To make sure hasattr "sees" close and calls it
    mock_client.return_value.close.return_value

    mock.seal(mock_hook)
    mock.seal(mock_client)

    dag = DAG(dag_id='TestBigQueryExtractorE2E')
    task = BigQueryOperator(
        sql='select first_name, last_name from customers;',
        task_id="task_id",
        project_id="project_id",
        dag_id="dag_id",
        dag=dag,
        start_date=timezone.datetime(2016, 2, 1, 0, 0, 0)
    )

    task_instance = TaskInstance(
        task=task,
        execution_date=datetime.utcnow().replace(tzinfo=pytz.utc))

    bq_extractor = BigQueryExtractor(task)
    steps_meta_extract = bq_extractor.extract()
    assert steps_meta_extract is None

    task_instance.run()

    step_meta = bq_extractor.extract_on_complete(task_instance)
    assert step_meta.context['bigquery.extractor.error'] is not None

    mock_client.return_value \
        .get_job.assert_called_once_with(job_id=bq_job_id)

    assert step_meta.inputs is not None
    assert len(step_meta.inputs) == 0
    assert step_meta.outputs is not None
    assert len(step_meta.outputs) == 0

    assert step_meta.context['sql'] == task.sql

    mock_client.return_value.close.assert_called()
def Load_Subdag(tgt_tab, t, s, d, c, w, l, r, args):
    subdag = models.DAG(dag_id='Skating_ELT.' + t, default_args=args, schedule_interval="@daily")

    s01 = BigQueryOperator(task_id='truncate_' + tgt_tab,
                           sql='TRUNCATE TABLE `' + d + '`',
                           use_legacy_sql=l, trigger_rule=r, dag=subdag)

    s02 = BigQueryOperator(task_id='load_' + tgt_tab, sql=s,
                           destination_dataset_table=d,
                           create_disposition=c, write_disposition=w,
                           use_legacy_sql=l, trigger_rule=r, dag=subdag)

    s01 >> s02
    return subdag
def test_bigquery_operator_defaults(self, mock_hook):
    operator = BigQueryOperator(
        task_id=TASK_ID,
        sql='Select * from test_table',
        dag=self.dag,
        default_args=self.args,
    )
    operator.execute(None)
    mock_hook.return_value \
        .get_conn() \
        .cursor() \
        .run_query \
        .assert_called_once_with(
            sql='Select * from test_table',
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            udf_config=None,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
        )

    self.assertTrue(isinstance(operator.sql, six.string_types))

    ti = TaskInstance(task=operator, execution_date=DEFAULT_DATE)
    ti.render_templates()
    self.assertTrue(isinstance(ti.task.sql, six.string_types))
def view_redefinition_task_factory(table_config, **kwargs):
    # Load the values if needed in the command you plan to execute
    dataset = table_config['dataset']
    table_name = table_config['table_name']
    table_suffix = table_config['table_suffix']

    return BigQueryOperator(
        task_id=f'view_redeploy_{table_name}',
        sql=f'create or replace view `{dataset}.{table_name}` as select * from `{dataset}.{table_name + table_suffix}`',
        # destination_dataset_table=False,
        bigquery_conn_id='bigquery_default',              # <-- Need both of these
        google_cloud_storage_conn_id='bigquery_default',  # <-- because of inheritance
        use_legacy_sql=False,
        dag=dag)
def add_verify_tasks(task, dependencies=None):
    # The queries in verify/sqls will fail when the condition is not met.
    # Have to use this trick since the Python 2 version of BigQueryCheckOperator
    # doesn't support standard SQL and legacy SQL can't be used to query
    # partitioned tables.
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/verify/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path)

    verify_task = BigQueryOperator(
        task_id='verify_{task}'.format(task=task),
        bql=sql,
        use_legacy_sql=False,
        dag=dag)

    if dependencies is not None and len(dependencies) > 0:
        for dependency in dependencies:
            dependency >> verify_task

    return verify_task
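A brief usage sketch for the helper above, assuming load_blocks and load_transactions are loading tasks defined elsewhere in the same DAG; the task names are illustrative.

# Each verify task runs its SQL check only after its upstream load tasks succeed.
verify_blocks = add_verify_tasks('blocks_have_latest', dependencies=[load_blocks])
verify_transactions = add_verify_tasks(
    'transactions_have_latest', dependencies=[load_blocks, load_transactions])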
def get_bq_to_bq_operator(
        sql_or_filename, dst_table_name, dag=None, params={},
        table_expiration_seconds=None, partition_expiration_seconds=None):
    """Get templated BigQueryOperator.

    Args:
        sql_or_filename (string): Valid SQL statement or a path to a sql file.
            It can be templated using Jinja in either case.
        dag (airflow.models.DAG): DAG used by context_manager, e.g.
            `with get_dag() as dag: get_bq_to_bq_operator(..., dag=dag)`.
            Defaults to None.

    Returns:
        airflow.contrib.operators.bigquery_operator.BigQueryOperator
    """
    dag = dag or models._CONTEXT_MANAGER_DAG
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG nor appear in the Web UI')

    dst_table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=dst_table_name,
            date_descriptor='{{ ds_nodash }}')

    dataset_name = '{experiment_name}_database'.format(
        experiment_name=get_config('experiment_name'))

    return BigQueryOperator(
        dag=dag,
        task_id='{experiment_name}.{table_name}.bq_to_bq'.format(
            experiment_name=get_config('experiment_name'),
            table_name=dst_table_name),
        sql=sql_or_filename,
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table="{gcp_project_name}:{dataset_name}.{table_name}".format(
            gcp_project_name=get_config('gcp_project_name'),
            dataset_name=dataset_name,
            table_name=dst_table_name_with_date_descriptor),
        params=params)
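Following the context-manager pattern the docstring describes, a minimal usage sketch; get_dag is assumed to be a project helper that returns a DAG, and the SQL file path and params are illustrative.

with get_dag() as dag:  # hypothetical helper that yields the project's DAG
    daily_revenue = get_bq_to_bq_operator(
        sql_or_filename='sql/daily_revenue.sql',  # Jinja-templated SQL file (illustrative)
        dst_table_name='daily_revenue',
        dag=dag,
        params={'lookback_days': 7},
    )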
def test_extract_cached(self, mock_client, mock_hook):
    bq_job_id = "foo.bq.job_id"

    mock_hook.return_value \
        .get_conn.return_value \
        .cursor.return_value \
        .run_query.return_value = bq_job_id

    job_details = self.read_file_json(
        "tests/extractors/cached_job_details.json"
    )
    mock_client.return_value.get_job.return_value._properties = job_details

    # To make sure hasattr "sees" close and calls it
    mock_client.return_value.close.return_value

    mock.seal(mock_hook)
    mock.seal(mock_client)

    dag = DAG(dag_id='TestBigQueryExtractorE2E')
    task = BigQueryOperator(
        sql='select first_name, last_name from customers;',
        task_id="task_id",
        project_id="project_id",
        dag_id="dag_id",
        dag=dag,
        start_date=timezone.datetime(2016, 2, 1, 0, 0, 0)
    )
    task_instance = TaskInstance(
        task=task,
        execution_date=datetime.utcnow().replace(tzinfo=pytz.utc))

    bq_extractor = BigQueryExtractor(task)
    steps_meta_extract = bq_extractor.extract()
    assert steps_meta_extract is None

    task_instance.run()

    step_meta = bq_extractor.extract_on_complete(task_instance)

    assert step_meta.inputs is not None
    assert step_meta.outputs is not None

    assert len(step_meta.run_facets) == 1
    assert step_meta.run_facets['bigQuery_statistics'] \
        == BigQueryStaticticsRunFacet(cached=True)
def insert_overwrite(date):
    str_date = re.sub("-", '', date)
    print('str_date : %s' % str_date)

    obj = BigQueryOperator(
        task_id='insertOverwrite_{}'.format(date),
        write_disposition='WRITE_TRUNCATE',  # WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
        create_disposition='CREATE_IF_NEEDED',
        # priority="BATCH",
        allow_large_results=True,
        use_legacy_sql=False,
        location=bq_location,
        sql="""
            SELECT CAST(cyymmdd AS DATE) AS cyymmdd,
                   un, rgn_cd, cnty_cd, tcom_cd, dvc_gp_id, dvc_modl_id,
                   fw_ver, cp_ver, hw_ver, os_ver
            FROM sa-bigdata-dev.hive_test.device_origin
            WHERE cyymmdd = "{}"
        """.format(date),
        destination_dataset_table=pj_bigquery + '.' + ds_demo + '.' + tb_profile + '$' + str_date,
        # maximum_billing_tier=1,
        # trigger_rule=TriggerRule.ALL_SUCCESS,
        retries=5,
        retry_delay=timedelta(seconds=5),
        dag=dag)
    return obj
t3 = BigQueryOperator(
    task_id='bq_write_to_github_daily_metrics',
    use_legacy_sql=False,
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bql='''
    #standardSQL
    SELECT
      date,
      repo,
      SUM(IF(type = 'WatchEvent', 1, NULL)) AS stars,
      SUM(IF(type = 'ForkEvent', 1, NULL)) AS forks
    FROM (
      SELECT
        FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date,
        actor.id AS actor_id,
        repo.name AS repo,
        type
      FROM `githubarchive.day.{{ yesterday_ds_nodash }}`
      WHERE type IN ('WatchEvent', 'ForkEvent')
    )
    GROUP BY date, repo
    ''',
    destination_dataset_table='my-project.github_trends.github_daily_metrics${{ yesterday_ds_nodash }}',
    dag=dag)
        {
            'name': 'timestamp',
            'type': 'integer',
            'mode': 'nullable'
        },
        {
            'name': 'window_start',
            'type': 'string',
            'mode': 'nullable'
        },
    ],
    write_disposition='WRITE_TRUNCATE')

# Run example query (http://shortn/_BdF1UTEYOb) and save result to the
# destination table.
t3 = BigQueryOperator(
    task_id='bq_example_query',
    bql="""
        SELECT name, team, total_score
        FROM [bq_example.foobar]
        WHERE total_score > 15
        LIMIT 100;
    """,
    destination_dataset_table='{0}.gcp_example_query_result'.format(BQ_DATASET_NAME),
    write_disposition='WRITE_TRUNCATE')

t1 >> t2 >> t3
    source_objects=['data/{}.csv'.format(table_name)],
    destination_project_dataset_table='{}:{}.{}'.format(
        params['GCP_PROJECT_ID'], params['BQ_DATASET_ID'], table_name),
    schema_fields=params_bq_schema[table_name],
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

list_gcs_to_bq.append(gcs_to_bq)

# Read the query from the Airflow variables to build the final grouped dataset
# and store it in BigQuery.
execute_bq_sql = BigQueryOperator(
    task_id='execute_bq_sql',
    sql=query_sql,
    use_legacy_sql=False,
    destination_dataset_table=bq_recent_questions_table_id,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

# Export the result of the temporary table to GCS.
export_data_groupby = BigQueryToCloudStorageOperator(
    task_id='export_table_temp_to_gcs',
    source_project_dataset_table=bq_recent_questions_table_id,
    destination_cloud_storage_uris='gs://{}/data/archivo_final_agrupado.csv'.format(params['BUCKET_ID']),
    export_format='CSV',
    dag=dag)
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="clients_daily",
        dataset_version="v6",
        gke_cluster_name="bq-load-gke-1",
        reprocess=True,
    ),
    task_id="clients_daily_v6_bigquery_load",
    dag=dag)

clients_last_seen = BigQueryOperator(
    task_id='clients_last_seen',
    bql='sql/clients_last_seen_v1.sql',
    destination_dataset_table='telemetry.clients_last_seen_v1${{ds_nodash}}',
    write_disposition='WRITE_TRUNCATE',
    use_legacy_sql=False,
    bigquery_conn_id="google_cloud_derived_datasets",
    depends_on_past=True,
    start_date=datetime(2019, 4, 15),
    dag=dag,
)

clients_last_seen_export = SubDagOperator(
    subdag=export_to_parquet(
        table="clients_last_seen_v1",
        arguments=["--submission-date={{ds}}"],
        parent_dag_name=dag.dag_id,
        dag_name="clients_last_seen_export",
        default_args=default_args,
        num_preemptible_workers=10),
    task_id="clients_last_seen_export",