def setUp(self):
    """Provision a clean test database and a fresh DAG for each test."""
    configuration.test_mode()
    utils.initdb()
    default_args = {'owner': 'airflow', 'start_date': datetime(2015, 1, 1)}
    test_dag = DAG(TEST_DAG_ID, default_args=default_args)
    # Wipe any task-instance state left behind by previous test runs.
    test_dag.clear(start_date=DEFAULT_DATE, end_date=datetime.now())
    self.dag = test_dag
def test_schedule_dag_no_previous_runs(self):
    """Scheduling a DAG with no prior runs creates its very first run."""
    dag = DAG(self.TEST_SCHEDULE_WITH_NO_PREVIOUS_RUNS_DAG_ID)
    dag.add_task(
        BaseOperator(
            task_id="faketastic",
            owner='Also fake',
            start_date=datetime(2015, 1, 2, 0, 0),
        )
    )

    processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())
    created_run = processor.create_dag_run(dag)

    # A run must exist, belong to this DAG, and carry a non-empty run id.
    self.assertIsNotNone(created_run)
    self.assertEqual(dag.dag_id, created_run.dag_id)
    self.assertIsNotNone(created_run.run_id)
    self.assertNotEqual('', created_run.run_id)
    # The first run's execution date is the task start date.
    self.assertEqual(
        datetime(2015, 1, 2, 0, 0),
        created_run.execution_date,
        msg='dag_run.execution_date did not match expectation: {0}'
        .format(created_run.execution_date)
    )
    self.assertEqual(State.RUNNING, created_run.state)
    self.assertFalse(created_run.external_trigger)
    dag.clear()
class BranchOperatorTest(unittest.TestCase):
    """Tests that BranchPythonOperator follows the chosen branch and skips
    the other, both with and without a backing DagRun.

    Fixes applied: ``assertEquals`` (deprecated alias) -> ``assertEqual``;
    bare ``raise`` with no active exception (which errors with
    RuntimeError rather than failing the test) -> ``self.fail``; the
    task-instance query is materialized before the session is closed.
    """

    def setUp(self):
        # make_choice always picks branch_1, so branch_2 must be skipped.
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')
        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        # Materialize the results before closing the session; iterating a
        # lazy Query after close relied on implicit reconnection.
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        ).all()
        session.close()

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                # should exist with state None
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                self.fail('Unexpected task id: {}'.format(ti.task_id))

    def test_with_dag_run(self):
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                self.fail('Unexpected task id: {}'.format(ti.task_id))
def test_with_dag_run(self):
    """ShortCircuitOperator with a real DagRun: downstream tasks are
    SKIPPED while the condition is False, then left unset (ready to run)
    after the condition flips to True and the run is cleared.

    Fixes applied: deprecated ``assertEquals`` -> ``assertEqual``; bare
    ``raise`` (RuntimeError: no active exception) -> ``self.fail``.
    """
    value = False
    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    # late-binding closure: re-reads `value`
                                    # on every execution
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    logging.error("Tasks {}".format(dag.tasks))
    dr = dag.create_dagrun(
        run_id="manual__",
        start_date=datetime.datetime.now(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            self.fail('Unexpected task id: {}'.format(ti.task_id))

    # Flip the condition and re-run: downstream states are reset to None.
    value = True
    dag.clear()
    dr.verify_integrity()
    upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    tis = dr.get_task_instances()
    self.assertEqual(len(tis), 4)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            self.fail('Unexpected task id: {}'.format(ti.task_id))
def test_without_dag_run(self):
    """This checks the defensive against non existent tasks in a dag run

    Fixes applied: deprecated ``assertEquals`` -> ``assertEqual``; bare
    ``raise`` (RuntimeError: no active exception) -> ``self.fail``.
    """
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    # late-binding closure: re-reads `value`
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    session = Session()
    # Deliberately left lazy: iterating the Query re-executes it, so the
    # second loop below observes the refreshed task states.
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date == DEFAULT_DATE
    )

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist: upstream was never run
            self.fail('upstream task instance should not exist')
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.SKIPPED)
        else:
            self.fail('Unexpected task id: {}'.format(ti.task_id))

    # Flip the condition and re-run: downstream states reset to None.
    value = True
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist: upstream was never run
            self.fail('upstream task instance should not exist')
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEqual(ti.state, State.NONE)
        else:
            self.fail('Unexpected task id: {}'.format(ti.task_id))

    session.close()
def test_operator():
    """Build a throwaway DAG holding one templated HTTP-to-GCS task and
    return the task for inspection."""
    dag = DAG(dag_id='test', default_args={'start_date': datetime(2019, 1, 1)})
    with dag:
        task = HttpToGoogleCloudStorageOperator(
            endpoint='/test?ds={{ ds }}/',
            bucket='test-bucket',
            filename='/data/ds={{ ds }}/file',
            task_id='test')
    # Start from a clean slate before handing the task back.
    dag.clear()
    return task
def test_schedule_dag_once(self):
    """An @once DAG is scheduled exactly one time: the first
    create_dag_run call yields a run, the second yields nothing."""
    dag = DAG(self.TEST_SCHEDULE_ONCE_DAG_ID)
    dag.schedule_interval = '@once'
    dag.add_task(BaseOperator(
        task_id="faketastic",
        owner='Also fake',
        start_date=datetime(2015, 1, 2, 0, 0)))

    first = DagFileProcessor(dag_ids=[], log=mock.MagicMock()).create_dag_run(dag)
    second = DagFileProcessor(dag_ids=[], log=mock.MagicMock()).create_dag_run(dag)

    self.assertIsNotNone(first)
    self.assertIsNone(second)
    dag.clear()
def test_schedule_dag_once(self):
    """An @once DAG is scheduled exactly one time: the first
    create_dag_run call yields a run, the second yields nothing."""
    dag = DAG(self.TEST_SCHEDULE_ONCE_DAG_ID)
    dag.schedule_interval = '@once'
    dag.add_task(BaseOperator(
        task_id="faketastic",
        owner='Also fake',
        start_date=datetime(2015, 1, 2, 0, 0)))

    first = jobs.SchedulerJob(**self.default_scheduler_args).create_dag_run(dag)
    second = jobs.SchedulerJob(**self.default_scheduler_args).create_dag_run(dag)

    self.assertIsNotNone(first)
    self.assertIsNone(second)
    dag.clear()
def test_schedule_dag_relativedelta(self):
    """A relativedelta schedule_interval advances successive runs by
    exactly that delta."""
    delta = relativedelta(hours=+1)
    dag = DAG(self.TEST_SCHEDULE_RELATIVEDELTA_DAG_ID,
              schedule_interval=delta)
    dag.add_task(BaseOperator(
        task_id="faketastic",
        owner='Also fake',
        start_date=datetime(2015, 1, 2, 0, 0)))

    processor = DagFileProcessor(dag_ids=[], log=mock.MagicMock())

    # First run lands on the task start date.
    first_run = processor.create_dag_run(dag)
    self.assertIsNotNone(first_run)
    self.assertEqual(dag.dag_id, first_run.dag_id)
    self.assertIsNotNone(first_run.run_id)
    self.assertNotEqual('', first_run.run_id)
    self.assertEqual(
        datetime(2015, 1, 2, 0, 0),
        first_run.execution_date,
        msg='dag_run.execution_date did not match expectation: {0}'
        .format(first_run.execution_date)
    )
    self.assertEqual(State.RUNNING, first_run.state)
    self.assertFalse(first_run.external_trigger)

    # Second run is exactly one delta later.
    second_run = processor.create_dag_run(dag)
    self.assertIsNotNone(second_run)
    self.assertEqual(dag.dag_id, second_run.dag_id)
    self.assertIsNotNone(second_run.run_id)
    self.assertNotEqual('', second_run.run_id)
    self.assertEqual(
        datetime(2015, 1, 2, 0, 0) + delta,
        second_run.execution_date,
        msg='dag_run2.execution_date did not match expectation: {0}'
        .format(second_run.execution_date)
    )
    self.assertEqual(State.RUNNING, second_run.state)
    self.assertFalse(second_run.external_trigger)

    dag.clear()
f'The two values differ {pulled_value_1} and {value_1}') if pulled_value_2 != value_2: raise ValueError( f'The two values differ {pulled_value_2} and {value_2}') push1 = PythonOperator( task_id='push', dag=dag, python_callable=push, ) push2 = PythonOperator( task_id='push_by_returning', dag=dag, python_callable=push_by_returning, ) pull = PythonOperator( task_id='puller', dag=dag, python_callable=puller, ) pull << [push1, push2] if __name__ == '__main__': from airflow.utils.state import State dag.clear(dag_run_state=State.NONE) dag.run()
def test_already_running_then_skip(env, setup_teardown, airflow_session):
    """A 409 from the Dataflow templates.launch call marks the operator
    task SUCCESS (job already running) and skips everything downstream."""

    def datafile(filename):
        return os.path.join('/workspace/airflow/dags/libs/shared/data', filename)

    # Save these snippets for later in case we need to mock an success. - Stu M. 4/29/19
    # http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    # requestBuilder = RequestMockBuilder(
    #     {'dataflow.projects.templates.launch': (None, '{"job": ""}')}
    # )
    # with pytest.raises(HttpError) as e:
    #     job = task.execute(ti.get_template_context())
    # assert e.resp.status == 409

    http = HttpMock(datafile('dataflow.json'), {'status': '200'})
    conflict = httplib2.Response({
        'status': '409',
        'reason': 'Server Error'
    })
    requestBuilder = RequestMockBuilder(
        {'dataflow.projects.templates.launch': (conflict, b'')})

    dag = DAG('shortcircuit_operator_test_with_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    task = ScheduleDataflowJobOperator(
        project=env['project'],
        template_name='load_vibe_to_lake',
        job_name='schedule-dataflow-test-{}'.format(int(time.time())),
        job_parameters={
            'client': 'bluesun',
            'table': 'pyr_bluesun_local.tree_user_types',
            'dest': '{}:lake.tree_user_types'.format(env['project'])
        },
        dag=dag,
        task_id='schedule_dataflow_operation',
        http=http,
        requestBuilder=requestBuilder)
    middle_task = DummyOperator(task_id='middle_task', dag=dag)
    finish_task = DummyOperator(task_id='finish', dag=dag)
    task >> middle_task >> finish_task
    dag.clear()

    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    expected = {
        'schedule_dataflow_operation': State.SUCCESS,
        'middle_task': State.SKIPPED,
        'finish': State.SKIPPED,
    }
    with airflow_session() as session:
        tis = session.query(TaskInstance).filter(
            TaskInstance.dag_id == dag.dag_id,
            TaskInstance.execution_date == DEFAULT_DATE)
        for ti in tis:
            if ti.task_id in expected:
                assert ti.state == expected[ti.task_id]
import os
from utils import default_args, resolve_file
from airflow import DAG
from airflow_kubernetes_job_operator.kubernetes_job_operator import KubernetesJobOperator

# Exercises the job operator with a custom body file (<this file>.yaml).
dag = DAG(
    "kub-job-op-custom",
    default_args=default_args,
    description="Test base job operator",
    schedule_interval=None,
    catchup=False,
)

with dag:
    test_job = KubernetesJobOperator(
        task_id="test_dbl_log",
        body_filepath=__file__ + ".yaml",
    )

# Allow running the DAG directly as a script for local debugging.
if __name__ == "__main__":
    dag.clear()
    dag.run()
class ShortCircuitOperatorTest(unittest.TestCase):
    """Tests that ShortCircuitOperator skips its downstream chain while its
    condition is False and releases it once the condition becomes True.

    Fixes applied: ``assertEquals`` (deprecated alias) -> ``assertEqual``;
    bare ``raise`` with no active exception (which errors with
    RuntimeError rather than failing the test) -> ``self.fail``.
    """

    def setUp(self):
        # The callable reads self.value at execution time (late binding),
        # letting each test flip the short-circuit condition.
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)
        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_1)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()
        self.value = True

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        # Deliberately left lazy: iterating the Query re-executes it, so
        # the second loop below observes the refreshed task states.
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist: upstream was never run
                self.fail('upstream task instance should not exist')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                self.fail('Unexpected task id: {}'.format(ti.task_id))

        self.value = True
        self.dag.clear()

        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist: upstream was never run
                self.fail('upstream task instance should not exist')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                self.fail('Unexpected task id: {}'.format(ti.task_id))

        session.close()

    def test_with_dag_run(self):
        self.value = False
        logging.error("Tasks {}".format(self.dag.tasks))
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )
        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                self.fail('Unexpected task id: {}'.format(ti.task_id))

        # Flip the condition and re-run: downstream states reset to None.
        self.value = True
        self.dag.clear()
        dr.verify_integrity()
        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                self.fail('Unexpected task id: {}'.format(ti.task_id))
'team_col_name': TEAM_COL_NAME, 'validation_results_path': VALIDATION_RESULTS_PATH, 'parse_results_path': PARSE_RESULTS_PATH, 'error_stats_path': ERROR_STATS_PATH, 'blacklist_conn_id': PROJECT_DB_CONN_ID, 'temp_table_name': TEMP_TABLE_NAME }, templates_dict={'load_data_query': 'load_data.sql'}, provide_context=True, dag=dag) send_report_task = PythonOperator(task_id='send_report', python_callable=send_report, op_kwargs={ 'token_id': Variable.get('HW3_TELEGRAM_BOT_TOKEN'), 'chat_id': Variable.get('HW3_TELEGRAM_CHAT_ID'), 'error_stats_path': ERROR_STATS_PATH }, provide_context=True, dag=dag) (process_urls_task >> parse_urls_task >> calculate_stats_task >> send_report_task) if __name__ == '__main__': dag.clear(reset_dag_runs=True) dag.run()
schedule_interval='0 3 * * *') blp_logs = EmrCreateJobFlowOperator(task_id='blp_create_job_flow', job_flow_overrides={'Steps': BLP_STEPS}, aws_conn_id='aws_data_iam', emr_conn_id='emr_data_iam_mango', dag=blp_dag) blp_job_sensor = EmrJobFlowSensor( task_id='blp_check_job_flow', job_flow_id= "{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}", aws_conn_id='aws_data_iam', dag=blp_dag, on_retry_callback=lambda context: blp_dag.clear( start_date=context['execution_date'], end_date=context['execution_date']), ) gcp_conn_id = "google_cloud_derived_datasets" connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) gcstj_object_conditions = {'includePrefixes': 'blpadi/{{ ds }}'} gcstj_transfer_options = {'deleteObjectsUniqueInSink': True} bq_args = [ 'bq', '--location=US', 'load', '--source_format=CSV',
blp_logs = EmrCreateJobFlowOperator( task_id='blp_create_job_flow', job_flow_overrides={'Steps': BLP_STEPS}, aws_conn_id='aws_data_iam', emr_conn_id='emr_data_iam_mango', dag=blp_dag ) blp_job_sensor = EmrJobFlowSensor( task_id='blp_check_job_flow', job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}", aws_conn_id='aws_data_iam', dag=blp_dag, on_retry_callback=lambda context: blp_dag.clear( start_date=context['execution_date'], end_date=context['execution_date']), ) gcp_conn_id = "google_cloud_derived_datasets" connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id) gcstj_object_conditions = { 'includePrefixes': 'blpadi/{{ ds }}' } gcstj_transfer_options = { 'deleteObjectsUniqueInSink': True } bq_args = [