def test_sql_sensor_postgres(self):
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
        dag=self.dag
    )
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_sql_sensor(self):
    op = SqlSensor(
        task_id='hdfs_sensor_check',
        conn_id='presto_default',
        sql="SELECT 'x' FROM airflow.static_babynames LIMIT 1;",
        dag=self.dag
    )
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_unsupported_conn_type(self):
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='redis_default',
        sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
        dag=self.dag
    )
    # redis is not a SQL-capable connection type, so running the sensor raises
    with self.assertRaises(AirflowException):
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_sql_sensor_postgres_poke(self, mock_hook):
    # mock_hook is injected by a @mock.patch of BaseHook on this test
    # (the decorator is not shown in this snippet)
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT 1",
    )
    mock_hook.get_connection('postgres_default').conn_type = "postgres"
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

    mock_get_records.return_value = []
    self.assertFalse(t.poke(None))   # empty result set: keep poking

    mock_get_records.return_value = [[None]]
    self.assertFalse(t.poke(None))   # NULL first cell is falsy

    mock_get_records.return_value = [['None']]
    self.assertTrue(t.poke(None))    # any non-empty string is truthy

    mock_get_records.return_value = [[0.0]]
    self.assertFalse(t.poke(None))

    mock_get_records.return_value = [[0]]
    self.assertFalse(t.poke(None))

    mock_get_records.return_value = [['0']]
    self.assertTrue(t.poke(None))    # '0' is a non-empty string

    mock_get_records.return_value = [['1']]
    self.assertTrue(t.poke(None))
def test_sql_sensor_postgres_poke_failure(self, mock_hook):
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT 1",
        failure=lambda x: x in [1]
    )
    mock_hook.get_connection('postgres_default').conn_type = "postgres"
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

    mock_get_records.return_value = []
    self.assertFalse(t.poke(None))

    mock_get_records.return_value = [[1]]
    self.assertRaises(AirflowException, t.poke, None)
def test_sql_sensor_postgres_poke(self, mock_hook):
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT 1",
    )
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

    mock_get_records.return_value = []
    self.assertFalse(t.poke(None))

    mock_get_records.return_value = [['1']]
    self.assertTrue(t.poke(None))
def test_Operator(self, test_dag):
    sensor_task = SqlSensor(
        task_id='postgres_sensor',
        conn_id='postgres_dwh',
        sql='select * from staging.audit_runs limit 1;',
        poke_interval=3,
        timeout=10,
        dag=test_dag
    )
    test_operator = multiplyby5_operator.MultiplyBy5Operator(
        task_id='test_operator',
        my_operator_param=10,
        dag=test_dag
    )
    get_value = PythonOperator(
        task_id='get_value',
        python_callable=self.xcom_value,
        dag=test_dag
    )
    # sensor_task >> test_operator >> get_value
    self.run_task(sensor_task, test_dag)
    self.run_task(test_operator, test_dag)
    self.run_task(get_value, test_dag)
    assert str(self.value) == '50'
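The MultiplyBy5Operator above comes from the project's own multiplyby5_operator module, which is not shown here. A plausible minimal sketch, assuming (from the final assertion of 50) that it pushes my_operator_param * 5 to XCom as its return value:

from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults

class MultiplyBy5Operator(BaseOperator):
    """Toy operator: multiplies its parameter by 5 and returns the result."""

    @apply_defaults
    def __init__(self, my_operator_param, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.operator_param = my_operator_param

    def execute(self, context):
        # The return value is pushed to XCom under the 'return_value' key,
        # which is what the test's get_value task reads back.
        return 5 * self.operator_param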
def test_sql_sensor_postgres_poke_success(self, mock_hook):
    op = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT 1",
        success=lambda x: x in [1]
    )
    mock_hook.get_connection('postgres_default').conn_type = "postgres"
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

    mock_get_records.return_value = []
    self.assertFalse(op.poke(None))

    mock_get_records.return_value = [[1]]
    self.assertTrue(op.poke(None))

    # the string '1' is not in [1], so the success criterion is not met
    mock_get_records.return_value = [['1']]
    self.assertFalse(op.poke(None))
def test_sql_sensor_postgres_poke_fail_on_empty(self, mock_hook):
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT 1",
        fail_on_empty=True
    )
    mock_hook.get_connection('postgres_default').conn_type = "postgres"
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records

    mock_get_records.return_value = []
    self.assertRaises(AirflowException, t.poke, None)
def test_sql_sensor_postgres_poke_invalid_success(self, mock_hook):
    # success must be callable; passing a list raises at poke time
    t = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT 1",
        success=[1],
    )
    mock_hook.get_connection('postgres_default').conn_type = "postgres"
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records
    mock_get_records.return_value = [[1]]
    self.assertRaises(AirflowException, t.poke, None)
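Taken together, the poke tests above pin down the sensor's evaluation order: an empty result set either raises (fail_on_empty) or keeps poking, a failure callable is checked before success, and non-callable criteria raise. A standalone sketch of that logic, paraphrasing the behavior the tests encode rather than reproducing the upstream implementation:

from airflow.exceptions import AirflowException

def poke_criterion(records, success=None, failure=None, fail_on_empty=False):
    """Illustrative mirror of SqlSensor's poke decision on fetched records."""
    if not records:
        if fail_on_empty:
            raise AirflowException("No rows returned; failing as fail_on_empty is set")
        return False  # keep poking
    first_cell = records[0][0]
    # The failure criterion is evaluated first and raises on a match.
    if failure is not None:
        if not callable(failure):
            raise AirflowException("failure is present but not callable")
        if failure(first_cell):
            raise AirflowException("Failure criteria met: failure(%r) returned True" % first_cell)
    # The success criterion, if given, decides the outcome; otherwise
    # plain truthiness of the first cell does.
    if success is not None:
        if not callable(success):
            raise AirflowException("success is present but not callable")
        return bool(success(first_cell))
    return bool(first_cell)

Running the test fixtures through this function reproduces every assertion above, e.g. poke_criterion([['0']]) is True while poke_criterion([[0]]) is False.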
def test_sql_sensor_mysql(self):
    t1 = SqlSensor(
        task_id='sql_sensor_check_1',
        conn_id='mysql_default',
        sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
        dag=self.dag
    )
    t1.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    # distinct task_id, since both sensors are attached to the same DAG
    t2 = SqlSensor(
        task_id='sql_sensor_check_2',
        conn_id='mysql_default',
        sql="SELECT count(%s) FROM INFORMATION_SCHEMA.TABLES",
        parameters=["table_name"],
        dag=self.dag
    )
    t2.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_sql_sensor_postgres(self):
    op1 = SqlSensor(
        task_id='sql_sensor_check_1',
        conn_id='postgres_default',
        sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
        dag=self.dag
    )
    op1.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    op2 = SqlSensor(
        task_id='sql_sensor_check_2',
        conn_id='postgres_default',
        sql="SELECT count(%s) FROM INFORMATION_SCHEMA.TABLES",
        parameters=["table_name"],
        dag=self.dag
    )
    op2.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_sql_sensor_postgres_check_allow_null(self, mock_hook):
    mock_hook.get_connection('postgres_default').conn_type = "postgres"
    mock_get_records = mock_hook.get_connection.return_value.get_hook.return_value.get_records
    mock_get_records.return_value = [[None]]

    t1 = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT NULL",
        allow_null=True
    )
    self.assertTrue(t1.poke(None))

    t2 = SqlSensor(
        task_id='sql_sensor_check',
        conn_id='postgres_default',
        sql="SELECT NULL",
        allow_null=False
    )
    self.assertFalse(t2.poke(None))
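allow_null is not an argument of the mainline SqlSensor, so this pair of checks presumably targets a patched or forked sensor. Under that assumption, the flag replaces plain truthiness when the first cell is NULL, roughly:

def poke_criterion_with_allow_null(records, allow_null=False):
    # Hypothetical variant of the default criterion sketched earlier:
    # a NULL first cell resolves to the allow_null flag instead of
    # bool(None), which would always be False.
    if not records:
        return False
    first_cell = records[0][0]
    if first_cell is None:
        return allow_null
    return bool(first_cell)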
    group by user_id_uuid, user_id
), all_past_pageviews as (
    select user_id_uuid, user_id, max(date(time_received)) as ymd
    from staging.user_page_log
    where date(time_received) <= '%(YYDY)s-%(YYDM)s-%(YYDD)s'
    group by user_id_uuid, user_id
)
select
    daily_pageviews.user_id_uuid,
    daily_pageviews.user_id,
    '%(YDY)s-%(YDM)s-%(YDD)s'::date as ymd,
    has_wallet,
    pageviews,
    coalesce(all_past_pageviews.ymd, '-Infinity'::date) as last_visit
from daily_pageviews
left join all_past_pageviews
    on daily_pageviews.user_id_uuid = all_past_pageviews.user_id_uuid
"""

delete_cmd = "DELETE FROM agg_visitor_previous_visit WHERE ymd = '%(YDY)s-%(YDM)s-%(YDD)s' "

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % MACROS,
    poke_interval=15 * 60,  # 15 minutes, in seconds
    timeout=60 * 60 * 13,   # 13h; the S3 dag has a 12h timeout
    dag=dag)

delete = PostgresOperator(
    task_id="delete",
    postgres_conn_id='postgres_data_warehouse',
    sql=delete_cmd % MACROS,
    dag=dag)

insert = PostgresOperator(
    task_id="insert",
    postgres_conn_id='postgres_data_warehouse',
    sql=insert_cmd % MACROS,
    dag=dag)

delete.set_upstream(sensor)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('generate_bounty_view_board',
          default_args=default_args,
          schedule_interval='@hourly')

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % {
        'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH,
        'lhdy': LHDY, 'lhdm': LHDM, 'lhdd': LHDD, 'lhdh': LHDH
    },
    dag=dag)

move_rows_to_partitions = PostgresOperator(
    task_id="move_rows_to_partitions",
    postgres_conn_id='postgres_data_warehouse',
    sql=insert_sql % {
        'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH,
dag = airflow.DAG('opos_source_sqoop_di',
                  schedule_interval=schedule_interval,
                  concurrency=15,
                  max_active_runs=1,
                  default_args=args)

dag_monitor = airflow.DAG('opos_source_sqoop_di_monitor',
                          schedule_interval=schedule_interval,
                          default_args=args)

##----------------------------------------- Validate source metadata before ingestion ---------------------------------------##

import_data_validate = SqlSensor(
    task_id="import_data_validate",
    conn_id='mysql_pre_ptsp_db',
    sql='''
        select count(1)
        from pre_ptsp_db.pre_opos_payment_order_sync_status
        where DATE_FORMAT(sync_date, "%Y-%m-%d") = '{{ ds }}'
          and sync_status = 1
    ''',
    dag=dag)

import_order_bd_validate = SqlSensor(
    task_id="import_order_bd_validate",
    conn_id='mysql_pre_ptsp_db',
    sql='''
        select count(1)
        from pre_ptsp_db.pre_opos_payment_order_bd_sync_status
        where DATE_FORMAT(sync_date, "%Y-%m-%d") = '{{ ds }}'
          and sync_status = 1
    ''',
    dag=dag)
dag = airflow.DAG('oride_source_sqoop_df',
                  schedule_interval=schedule_interval,
                  concurrency=40,
                  max_active_runs=1,
                  default_args=args)

dag_monitor = airflow.DAG('oride_source_sqoop_df_monitor',
                          schedule_interval=schedule_interval,
                          default_args=args)

check_data_driver_records_finish = SqlSensor(
    task_id="check_data_driver_records_finish",
    conn_id='sqoop_db',
    sql='''
        select count(1)
        from oride_data.data_driver_records_finish
        where from_unixtime(day, "%Y-%m-%d") = '{{ macros.ds_add(ds, -1) }}'
    ''',
    dag=dag)

##----------------------------------------- Task timeout monitoring ---------------------------------------##

def fun_task_timeout_monitor(ds, db_name, table_name, **op_kwargs):
    tb = [{
        "db": db_name,
        "table": table_name,
        "partition": "dt={pt}".format(pt=ds),
        "timeout": "7200"
def dbt_dags_factory(
        dwh_engine,
        dwh_conn_id,
        project_name,
        dbt_schema_name,
        airflow_conn_id,
        dag_base_name='DBT_run',
        analytics_reader=None,  # list of users of DWH who are read-only
        schedule_interval=timedelta(hours=1),
        start_date=datetime(2019, 1, 1),
        default_args=None,
        folder=None,
        models=None,
        exclude=None,
):
    if analytics_reader:
        # guard against SQL injection via the reader names
        for statement in ('insert', 'update', 'delete', 'drop',
                          'create', 'select', ';', 'grant'):
            for reader in analytics_reader:
                if statement in reader.lower():
                    raise Exception(
                        'Error! The analytics reader {0} is invalid.'.format(reader)
                    )

        # analytics_reader = analytics_reader.split(',')
        analytics_reader_sql = f'\nGRANT USAGE ON SCHEMA "{dbt_schema_name}"'
        analytics_reader_sql += ' TO "{0}";'
        analytics_reader_sql += (
            f'\nGRANT SELECT ON ALL TABLES IN SCHEMA "{dbt_schema_name}"'
            + ' TO "{0}";'
        )
        analytics_reader_sql = ''.join([
            analytics_reader_sql.format(i) for i in analytics_reader
        ])

    if models and not isinstance(models, str):
        models = ' --models ' + ' '.join(models)
    else:
        models = ''

    if exclude and not isinstance(exclude, str):
        exclude = ' --exclude ' + ' '.join(exclude)
    else:
        exclude = ''

    flags = models + exclude

    dag = DAG(
        dag_base_name,
        catchup=False,
        max_active_runs=1,
        schedule_interval=schedule_interval,
        start_date=start_date,
        default_args=default_args,
    )

    dag_full_refresh = DAG(
        dag_base_name + '_full_refresh',
        catchup=False,
        max_active_runs=1,
        schedule_interval=None,
        start_date=start_date,
        default_args=default_args,
    )

    folder = folder or (
        os.environ.get('AIRFLOW_HOME')
        or conf.get('core', 'airflow_home')
    ).replace(
        'airflow_home/airflow',
        'dbt_home',
    )

    bash_command = '''
    cd {1}
    source env/bin/activate
    cd {2}
    dbt {0}
    '''.format(
        '{0}',
        folder,
        project_name,
    )

    sensor_sql = """
        SELECT
            CASE WHEN COUNT(*) = 0 THEN 1 ELSE 0 END -- only run if exactly equal to 0
        FROM public.dag_run
        WHERE dag_id IN ('{0}', '{1}')
          AND state = 'running'
          AND NOT (run_id = '{2}')
    """.format(
        dag.dag_id,
        dag_full_refresh.dag_id,
        '{{ run_id }}',
    )  # refactor?! not coupled to values in profiles.yml!
    if dwh_engine == EC.DWH_ENGINE_POSTGRES:
        conn = BaseHook.get_connection(dwh_conn_id)
        env = {
            'DBT_DWH_HOST': str(conn.host),
            'DBT_DWH_USER': str(conn.login),
            'DBT_DWH_PASS': str(conn.password),
            'DBT_DWH_PORT': str(conn.port),
            'DBT_DWH_DBNAME': str(conn.schema),
            'DBT_DWH_SCHEMA': dbt_schema_name,
            'DBT_PROFILES_DIR': folder,
        }
    elif dwh_engine == EC.DWH_ENGINE_SNOWFLAKE:
        analytics_conn = BaseHook.get_connection(dwh_conn_id)
        analytics_conn_extra = analytics_conn.extra_dejson
        env = {
            'DBT_ACCOUNT': analytics_conn_extra.get(
                'account',
                analytics_conn.host,
            ),
            'DBT_USER': analytics_conn.login,
            'DBT_PASS': analytics_conn.password,
            'DBT_ROLE': analytics_conn_extra.get('role'),
            'DBT_DB': analytics_conn_extra.get('database'),
            'DBT_WH': analytics_conn_extra.get('warehouse'),
            'DBT_SCHEMA': dbt_schema_name,
            'DBT_PROFILES_DIR': folder,
        }
    else:
        raise ValueError('DWH type not implemented!')

    # with dag:
    snsr = SqlSensor(
        task_id='sense_dbt_conflict_avoided',
        conn_id=airflow_conn_id,
        sql=sensor_sql,
        dag=dag,
    )
    dbt_seed = BashOperator(
        task_id='run_dbt_seed',
        bash_command=bash_command.format('seed'),
        env=env,
        dag=dag,
    )
    dbt_run = BashOperator(
        task_id='run_dbt',
        bash_command=bash_command.format('run' + flags),
        env=env,
        dag=dag,
    )
    dbt_test = BashOperator(
        task_id='test_dbt',
        bash_command=bash_command.format('test' + flags),
        env=env,
        dag=dag,
    )
    dbt_docs = BashOperator(
        task_id='create_dbt_docs',
        bash_command=bash_command.format('docs generate'),
        env=env,
        dag=dag,
    )

    snsr >> dbt_seed >> dbt_run >> dbt_test
    if analytics_reader:  # This should not occur when using Snowflake
        read_rights = PostgresOperator(
            task_id='grant_access_to_read_users',
            sql=analytics_reader_sql,
            postgres_conn_id=dwh_conn_id,
            dag=dag,
        )
        dbt_test >> read_rights >> dbt_docs
    else:
        dbt_test >> dbt_docs

    # with dag_full_refresh:
    snsr = SqlSensor(
        task_id='sense_dbt_conflict_avoided',
        conn_id=airflow_conn_id,
        sql=sensor_sql,
        dag=dag_full_refresh,
    )
    dbt_seed = BashOperator(
        task_id='run_dbt_seed',
        bash_command=bash_command.format('seed'),
        env=env,
        dag=dag_full_refresh,
    )
    dbt_run = BashOperator(
        task_id='run_dbt',
        bash_command=bash_command.format('run --full-refresh' + flags),
        env=env,
        dag=dag_full_refresh,
    )
    dbt_test = BashOperator(
        task_id='test_dbt',
        bash_command=bash_command.format('test' + flags),
        env=env,
        dag=dag_full_refresh,
    )
    dbt_docs = BashOperator(
        task_id='create_dbt_docs',
        bash_command=bash_command.format('docs generate'),
        env=env,
        dag=dag_full_refresh,
    )

    snsr >> dbt_seed >> dbt_run >> dbt_test
    if analytics_reader:
        read_rights = PostgresOperator(
            task_id='grant_access_to_read_users',
            sql=analytics_reader_sql,
            postgres_conn_id=dwh_conn_id,
            dag=dag_full_refresh,
        )
        dbt_test >> read_rights >> dbt_docs
    else:
        dbt_test >> dbt_docs

    return (dag, dag_full_refresh)
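A hypothetical invocation of the factory, for orientation; the connection ids, schema, and reader names below are placeholders, not values from the original project:

dag, dag_full_refresh = dbt_dags_factory(
    dwh_engine=EC.DWH_ENGINE_POSTGRES,      # or EC.DWH_ENGINE_SNOWFLAKE
    dwh_conn_id='dwh_postgres',             # placeholder connection id
    project_name='analytics',               # dbt project directory name
    dbt_schema_name='analytics',
    airflow_conn_id='airflow_metadata_db',  # used by the SqlSensor to query public.dag_run
    analytics_reader=['reporting_user'],    # read-only grantees (Postgres only)
    models=['staging', 'marts'],
)
# Both returned DAGs must be bound to module-level names so the scheduler picks them up.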
    schedule_interval='@hourly',
    max_active_runs=1,
    catchup=False)

# select posts that include 'china' or 'hong kong' in the title within the previous hour
QUERY = """
    SELECT * FROM {table}
    WHERE (post_title ILIKE '%hong kong%' OR post_title ILIKE '%china%')
      AND creation_datetime BETWEEN DATE_TRUNC('hour', NOW() - interval '1 hour')
                                AND DATE_TRUNC('hour', NOW())
""".format(table=config['reddit_news_db']['table'])

sensor = SqlSensor(
    task_id="news_sensor",
    soft_fail=True,  # mark task as SKIPPED on failure
    conn_id=config['Airflow']['postgres_conn_id'],
    sql=QUERY,
    poke_interval=1,  # fail right away if no records are found
    timeout=1,
    dag=dag)

email_alert = EmailOperator(
    task_id='email_alert',
    to=['*****@*****.**'],
    subject='China/Hong Kong news available',
    html_content='<h1>There is news about China or Hong Kong within the previous hour</h1>',
    dag=dag)

sensor >> email_alert
    'start_date': datetime(2019, 1, 21, 0),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('generate_user_page_log',
          default_args=default_args,
          schedule_interval='@hourly')

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % {'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH,
                     'lhdy': LHDY, 'lhdm': LHDM, 'lhdd': LHDD, 'lhdh': LHDH},
    timeout=60 * 60 * 13,   # 13h timeout (S3 load dag has 12h timeout)
    poke_interval=15 * 60,  # 15m between pokes
    dag=dag
)

create_partitions = PostgresOperator(
    task_id="create_table",
    postgres_conn_id='postgres_data_warehouse',
    sql=create_sql % {'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH,
                      'lhdy': LHDY, 'lhdm': LHDM, 'lhdd': LHDD, 'lhdh': LHDH},
    dag=dag
)

move_rows_to_partitions = PostgresOperator(
    task_id="move_rows_to_partitions",
def fun_task_timeout_monitor(ds, db_name, table_name, **op_kwargs):
    tb = [{
        "db": db_name,
        "table": table_name,
        "partition": "dt={pt}".format(pt=ds),
        "timeout": "7200"
    }]
    TaskTimeoutMonitor().set_task_monitor(tb)

import_data_validate = SqlSensor(
    task_id="import_data_validate",
    conn_id='ofood_db',
    sql='''
        select count(1)
        from food_operapay_co.jh_crontab_log
        where day = '{{ ds_nodash }}'
          and crontab_status = 'bi-ok'
    ''',
    dag=dag)

'''
Tables to import:
db_name, table_name, conn_id, prefix_name
'''
table_list = [
    # order-related data
    ("food_operapay_co", "jh_order", "ofood_db", "base"),
    ("food_operapay_co", "jh_waimai_order", "ofood_db", "base"),