def test_scheduler_process_check_heartrate(self):
    """
    Test if process dag honors the heartrate
    """
    dag = DAG(
        dag_id='test_scheduler_process_check_heartrate',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.last_scheduler_run = datetime.datetime.now()
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    scheduler.heartrate = 1000

    dag.clear()

    dr = scheduler.schedule_dag(dag)
    self.assertIsNotNone(dr)

    queue = mock.Mock()
    scheduler.process_dag(dag, queue=queue)

    queue.put.assert_not_called()
def setUp(self): configuration.load_test_config() from airflow.contrib.hooks.ssh_hook import SSHHook hook = SSHHook(ssh_conn_id='ssh_default') hook.no_host_key_check = True args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'provide_context': True } dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args) dag.schedule_interval = '@once' self.hook = hook self.dag = dag self.test_dir = "/tmp" self.test_local_dir = "/tmp/tmp2" self.test_remote_dir = "/tmp/tmp1" self.test_local_filename = 'test_local_file' self.test_remote_filename = 'test_remote_file' self.test_local_filepath = '{0}/{1}'.format(self.test_dir, self.test_local_filename) # Local Filepath with Intermediate Directory self.test_local_filepath_int_dir = '{0}/{1}'.format(self.test_local_dir, self.test_local_filename) self.test_remote_filepath = '{0}/{1}'.format(self.test_dir, self.test_remote_filename) # Remote Filepath with Intermediate Directory self.test_remote_filepath_int_dir = '{0}/{1}'.format(self.test_remote_dir, self.test_remote_filename)
def test_scheduler_do_not_schedule_removed_task(self): dag = DAG( dag_id='test_scheduler_do_not_schedule_removed_task', start_date=DEFAULT_DATE) dag_task1 = DummyOperator( task_id='dummy', dag=dag, owner='airflow') session = settings.Session() orm_dag = DagModel(dag_id=dag.dag_id) session.merge(orm_dag) session.commit() session.close() scheduler = SchedulerJob() dag.clear() dr = scheduler.schedule_dag(dag) self.assertIsNotNone(dr) dag = DAG( dag_id='test_scheduler_do_not_schedule_removed_task', start_date=DEFAULT_DATE) queue = mock.Mock() scheduler.process_dag(dag, queue=queue) queue.put.assert_not_called()
def test_clear_task_instances_without_task(self): dag = DAG('test_clear_task_instances_without_task', start_date=DEFAULT_DATE, end_date=DEFAULT_DATE + datetime.timedelta(days=10)) task0 = DummyOperator(task_id='task0', owner='test', dag=dag) task1 = DummyOperator(task_id='task1', owner='test', dag=dag, retries=2) ti0 = TI(task=task0, execution_date=DEFAULT_DATE) ti1 = TI(task=task1, execution_date=DEFAULT_DATE) ti0.run() ti1.run() # Remove the task from dag. dag.task_dict = {} self.assertFalse(dag.has_task(task0.task_id)) self.assertFalse(dag.has_task(task1.task_id)) session = settings.Session() qry = session.query(TI).filter( TI.dag_id == dag.dag_id).all() clear_task_instances(qry, session) session.commit() # When dag is None, max_tries will be maximum of original max_tries or try_number. ti0.refresh_from_db() ti1.refresh_from_db() # Next try to run will be try 2 self.assertEqual(ti0.try_number, 2) self.assertEqual(ti0.max_tries, 1) self.assertEqual(ti1.try_number, 2) self.assertEqual(ti1.max_tries, 2)
def test_following_previous_schedule_daily_dag_CET_to_CEST(self): """ Make sure DST transitions are properly observed """ local_tz = pendulum.timezone('Europe/Zurich') start = local_tz.convert(datetime.datetime(2018, 3, 25, 2), dst_rule=pendulum.PRE_TRANSITION) utc = timezone.convert_to_utc(start) dag = DAG('tz_dag', start_date=start, schedule_interval='0 3 * * *') prev = dag.previous_schedule(utc) prev_local = local_tz.convert(prev) self.assertEqual(prev_local.isoformat(), "2018-03-24T03:00:00+01:00") self.assertEqual(prev.isoformat(), "2018-03-24T02:00:00+00:00") _next = dag.following_schedule(utc) next_local = local_tz.convert(_next) self.assertEqual(next_local.isoformat(), "2018-03-25T03:00:00+02:00") self.assertEqual(_next.isoformat(), "2018-03-25T01:00:00+00:00") prev = dag.previous_schedule(_next) prev_local = local_tz.convert(prev) self.assertEqual(prev_local.isoformat(), "2018-03-24T03:00:00+01:00") self.assertEqual(prev.isoformat(), "2018-03-24T02:00:00+00:00")
def test_sync_to_db(self, mock_now): dag = DAG( 'dag', start_date=DEFAULT_DATE, ) with dag: DummyOperator(task_id='task', owner='owner1') SubDagOperator( task_id='subtask', owner='owner2', subdag=DAG( 'dag.subtask', start_date=DEFAULT_DATE, ) ) now = datetime.datetime.utcnow().replace(tzinfo=pendulum.timezone('UTC')) mock_now.return_value = now session = settings.Session() dag.sync_to_db(session=session) orm_dag = session.query(DagModel).filter(DagModel.dag_id == 'dag').one() self.assertEqual(set(orm_dag.owners.split(', ')), {'owner1', 'owner2'}) self.assertEqual(orm_dag.last_scheduler_run, now) self.assertTrue(orm_dag.is_active) self.assertIsNone(orm_dag.default_view) self.assertEqual(orm_dag.get_default_view(), configuration.conf.get('webserver', 'dag_default_view').lower()) self.assertEqual(orm_dag.safe_dag_id, 'dag') orm_subdag = session.query(DagModel).filter( DagModel.dag_id == 'dag.subtask').one() self.assertEqual(set(orm_subdag.owners.split(', ')), {'owner1', 'owner2'}) self.assertEqual(orm_subdag.last_scheduler_run, now) self.assertTrue(orm_subdag.is_active) self.assertEqual(orm_subdag.safe_dag_id, 'dag__dot__subtask')
def test_skip(self, mock_now): session = settings.Session() now = datetime.datetime.utcnow().replace(tzinfo=pendulum.timezone('UTC')) mock_now.return_value = now dag = DAG( 'dag', start_date=DEFAULT_DATE, ) with dag: tasks = [DummyOperator(task_id='task')] dag_run = dag.create_dagrun( run_id='manual__' + now.isoformat(), state=State.FAILED, ) SkipMixin().skip( dag_run=dag_run, execution_date=now, tasks=tasks, session=session) session.query(TI).filter( TI.dag_id == 'dag', TI.task_id == 'task', TI.state == State.SKIPPED, TI.start_date == now, TI.end_date == now, ).one()
def test_following_previous_schedule(self): """ Make sure DST transitions are properly observed """ local_tz = pendulum.timezone('Europe/Zurich') start = local_tz.convert(datetime.datetime(2018, 10, 28, 2, 55), dst_rule=pendulum.PRE_TRANSITION) self.assertEqual(start.isoformat(), "2018-10-28T02:55:00+02:00", "Pre-condition: start date is in DST") utc = timezone.convert_to_utc(start) dag = DAG('tz_dag', start_date=start, schedule_interval='*/5 * * * *') _next = dag.following_schedule(utc) next_local = local_tz.convert(_next) self.assertEqual(_next.isoformat(), "2018-10-28T01:00:00+00:00") self.assertEqual(next_local.isoformat(), "2018-10-28T02:00:00+01:00") prev = dag.previous_schedule(utc) prev_local = local_tz.convert(prev) self.assertEqual(prev_local.isoformat(), "2018-10-28T02:50:00+02:00") prev = dag.previous_schedule(_next) prev_local = local_tz.convert(prev) self.assertEqual(prev_local.isoformat(), "2018-10-28T02:55:00+02:00") self.assertEqual(prev, utc)
def test_scheduler_process_task_instances(self): """ Test if _process_task_instances puts the right task instances into the queue. """ dag = DAG( dag_id='test_scheduler_process_execute_task', start_date=DEFAULT_DATE) dag_task1 = DummyOperator( task_id='dummy', dag=dag, owner='airflow') session = settings.Session() orm_dag = DagModel(dag_id=dag.dag_id) session.merge(orm_dag) session.commit() session.close() scheduler = SchedulerJob() dag.clear() dr = scheduler.create_dag_run(dag) self.assertIsNotNone(dr) queue = mock.Mock() scheduler._process_task_instances(dag, queue=queue) queue.append.assert_called_with( (dag.dag_id, dag_task1.task_id, DEFAULT_DATE) )
def setUp(self): configuration.load_test_config() from airflow.contrib.hooks.ssh_hook import SSHHook from airflow.hooks.S3_hook import S3Hook hook = SSHHook(ssh_conn_id='ssh_default') s3_hook = S3Hook('aws_default') hook.no_host_key_check = True args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'provide_context': True } dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args) dag.schedule_interval = '@once' self.hook = hook self.s3_hook = s3_hook self.ssh_client = self.hook.get_conn() self.sftp_client = self.ssh_client.open_sftp() self.dag = dag self.s3_bucket = BUCKET self.sftp_path = SFTP_PATH self.s3_key = S3_KEY
def test_scheduler_does_not_run_excluded(self): dag = DAG( dag_id='test_scheduler_does_not_run_excluded', start_date=DEFAULT_DATE) dag_task1 = DummyOperator( task_id='dummy', dag=dag, owner='airflow') session = settings.Session() orm_dag = DagModel(dag_id=dag.dag_id) session.merge(orm_dag) session.commit() scheduler = SchedulerJob() dag.clear() dr = scheduler.create_dag_run(dag) self.assertIsNotNone(dr) tis = dr.get_task_instances(session=session) for ti in tis: ti.state = State.EXCLUDED session.commit() session.close() queue = mock.Mock() scheduler._process_task_instances(dag, queue=queue) queue.put.assert_not_called()
def test_scheduler_do_not_schedule_too_early(self):
    dag = DAG(
        dag_id='test_scheduler_do_not_schedule_too_early',
        start_date=datetime.datetime(2200, 1, 1))
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()

    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)

    queue = mock.Mock()
    scheduler._process_task_instances(dag, queue=queue)

    queue.put.assert_not_called()
def test_scheduler_verify_max_active_runs(self):
    """
    Test that a dagrun will not be scheduled if max_dag_runs
    has been reached
    """
    dag = DAG(
        dag_id='test_scheduler_verify_max_active_runs',
        start_date=DEFAULT_DATE)
    dag.max_active_runs = 1
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()

    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)

    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)
def test_scheduler_fail_dagrun_timeout(self):
    """
    Test that a dagrun will be marked failed if it times out
    """
    dag = DAG(
        dag_id='test_scheduler_fail_dagrun_timeout',
        start_date=DEFAULT_DATE)
    dag.dagrun_timeout = datetime.timedelta(seconds=60)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
    session.merge(dr)
    session.commit()

    dr2 = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr2)

    dr.refresh_from_db(session=session)
    self.assertEquals(dr.state, State.FAILED)
def test_dag_clear(self):
    dag = DAG('test_dag_clear', start_date=DEFAULT_DATE,
              end_date=DEFAULT_DATE + datetime.timedelta(days=10))
    task0 = DummyOperator(task_id='test_dag_clear_task_0', owner='test', dag=dag)
    ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
    # Next try to run will be try 1
    self.assertEqual(ti0.try_number, 1)
    ti0.run()
    self.assertEqual(ti0.try_number, 2)
    dag.clear()
    ti0.refresh_from_db()
    self.assertEqual(ti0.try_number, 2)
    self.assertEqual(ti0.state, State.NONE)
    self.assertEqual(ti0.max_tries, 1)

    task1 = DummyOperator(task_id='test_dag_clear_task_1', owner='test',
                          dag=dag, retries=2)
    ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
    self.assertEqual(ti1.max_tries, 2)
    ti1.try_number = 1
    # Next try will be 2
    ti1.run()
    self.assertEqual(ti1.try_number, 3)
    self.assertEqual(ti1.max_tries, 2)

    dag.clear()
    ti0.refresh_from_db()
    ti1.refresh_from_db()
    # After clearing the dag, ti1 should show attempt 3 of 5
    self.assertEqual(ti1.max_tries, 4)
    self.assertEqual(ti1.try_number, 3)
    # After clearing the dag, ti0 should show attempt 2 of 2
    self.assertEqual(ti0.try_number, 2)
    self.assertEqual(ti0.max_tries, 1)
def test_scheduler_process_execute_task(self): """ Test if process dag sends a task to the executor """ dag = DAG( dag_id='test_scheduler_process_execute_task', start_date=DEFAULT_DATE) dag_task1 = DummyOperator( task_id='dummy', dag=dag, owner='airflow') session = settings.Session() orm_dag = DagModel(dag_id=dag.dag_id) session.merge(orm_dag) session.commit() session.close() scheduler = SchedulerJob() dag.clear() dr = scheduler.schedule_dag(dag) self.assertIsNotNone(dr) queue = mock.Mock() scheduler.process_dag(dag, queue=queue) queue.put.assert_called_with( ((dag.dag_id, dag_task1.task_id, DEFAULT_DATE), None) ) tis = dr.get_task_instances(state=State.SCHEDULED) self.assertIsNotNone(tis)
def test_scheduler_verify_pool_full(self, mock_pool_full): """ Test task instances not queued when pool is full """ mock_pool_full.return_value = False dag = DAG( dag_id='test_scheduler_verify_pool_full', start_date=DEFAULT_DATE) DummyOperator( task_id='dummy', dag=dag, owner='airflow', pool='test_scheduler_verify_pool_full') session = settings.Session() pool = Pool(pool='test_scheduler_verify_pool_full', slots=1) session.add(pool) orm_dag = DagModel(dag_id=dag.dag_id) orm_dag.is_paused = False session.merge(orm_dag) session.commit() scheduler = SchedulerJob() dag.clear() # Create 2 dagruns, which will create 2 task instances. dr = scheduler.create_dag_run(dag) self.assertIsNotNone(dr) self.assertEquals(dr.execution_date, DEFAULT_DATE) dr = scheduler.create_dag_run(dag) self.assertIsNotNone(dr) queue = [] scheduler._process_task_instances(dag, queue=queue) self.assertEquals(len(queue), 2) dagbag = SimpleDagBag([dag]) # Recreated part of the scheduler here, to kick off tasks -> executor for ti_key in queue: task = dag.get_task(ti_key[1]) ti = models.TaskInstance(task, ti_key[2]) # Task starts out in the scheduled state. All tasks in the # scheduled state will be sent to the executor ti.state = State.SCHEDULED # Also save this task instance to the DB. session.merge(ti) session.commit() scheduler._execute_task_instances(dagbag, (State.SCHEDULED, State.UP_FOR_RETRY)) self.assertEquals(len(scheduler.executor.queued_tasks), 1)
def test_without_dag_run(self):
    """This checks the defensive handling of task instances that do not
    belong to an existing dag run"""
    value = False
    dag = DAG('shortcircuit_operator_test_without_dag_run',
              default_args={
                  'owner': 'airflow',
                  'start_date': DEFAULT_DATE
              },
              schedule_interval=INTERVAL)
    short_op = ShortCircuitOperator(task_id='make_choice',
                                    dag=dag,
                                    python_callable=lambda: value)
    branch_1 = DummyOperator(task_id='branch_1', dag=dag)
    branch_1.set_upstream(short_op)
    branch_2 = DummyOperator(task_id='branch_2', dag=dag)
    branch_2.set_upstream(branch_1)
    upstream = DummyOperator(task_id='upstream', dag=dag)
    upstream.set_downstream(short_op)
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    session = Session()
    tis = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.execution_date == DEFAULT_DATE
    )

    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEquals(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEquals(ti.state, State.SKIPPED)
        else:
            raise

    value = True
    dag.clear()

    short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
    for ti in tis:
        if ti.task_id == 'make_choice':
            self.assertEquals(ti.state, State.SUCCESS)
        elif ti.task_id == 'upstream':
            # should not exist
            raise
        elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
            self.assertEquals(ti.state, State.NONE)
        else:
            raise

    session.close()
def dag(mocker):
    clear_session()
    configuration.load_test_config()
    dag = DAG(
        "test_dag",
        default_args=dict(owner="airflow", start_date=DEFAULT_DATE),
        schedule_interval=INTERVAL,
    )
    yield dag
    dag.clear()
    clear_session()
def test_dag_get_active_runs(self):
    """
    Test to check that a DAG returns its active runs
    """

    now = datetime.datetime.now()
    six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)).replace(
        minute=0, second=0, microsecond=0)

    START_DATE = six_hours_ago_to_the_hour
    DAG_NAME1 = 'get_active_runs_test'

    default_args = {
        'owner': 'airflow',
        'depends_on_past': False,
        'start_date': START_DATE
    }
    dag1 = DAG(DAG_NAME1,
               schedule_interval='* * * * *',
               max_active_runs=1,
               default_args=default_args
               )

    run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
    run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
    run_this_3.set_upstream(run_this_2)

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag1.dag_id)
    session.merge(orm_dag)
    session.commit()
    session.close()

    scheduler = SchedulerJob()
    dag1.clear()

    dr = scheduler.create_dag_run(dag1)

    # We had better get a dag run
    self.assertIsNotNone(dr)

    execution_date = dr.execution_date

    running_dates = dag1.get_active_runs()

    try:
        running_date = running_dates[0]
    except IndexError:
        running_date = 'Except'

    self.assertEqual(execution_date, running_date,
                     'Running Date must match Execution Date')
def setUp(self):
    from airflow.www_rbac.views import dagbag
    from airflow.utils.state import State

    dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
    dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)

    self.runs = []
    for rd in self.RUNS_DATA:
        run = dag.create_dagrun(
            run_id=rd[0],
            execution_date=rd[1],
            state=State.SUCCESS,
            external_trigger=True
        )
        self.runs.append(run)
def setUp(self):
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook
    hook = SSHHook(ssh_conn_id='ssh_default')
    hook.no_host_key_check = True
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
        'provide_context': True
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.dag = dag
def test_dagrun_success_conditions(self): session = settings.Session() dag = DAG( 'test_dagrun_success_conditions', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'}) # A -> B # A -> C -> D # ordered: B, D, C, A or D, B, C, A or D, C, B, A with dag: op1 = DummyOperator(task_id='A') op2 = DummyOperator(task_id='B') op3 = DummyOperator(task_id='C') op4 = DummyOperator(task_id='D') op1.set_upstream([op2, op3]) op3.set_upstream(op4) dag.clear() now = datetime.datetime.now() dr = dag.create_dagrun(run_id='test_dagrun_success_conditions', state=State.RUNNING, execution_date=now, start_date=now) # op1 = root ti_op1 = dr.get_task_instance(task_id=op1.task_id) ti_op1.set_state(state=State.SUCCESS, session=session) ti_op2 = dr.get_task_instance(task_id=op2.task_id) ti_op3 = dr.get_task_instance(task_id=op3.task_id) ti_op4 = dr.get_task_instance(task_id=op4.task_id) # root is successful, but unfinished tasks state = dr.update_state() self.assertEqual(State.RUNNING, state) # one has failed, but root is successful ti_op2.set_state(state=State.FAILED, session=session) ti_op3.set_state(state=State.SUCCESS, session=session) ti_op4.set_state(state=State.SUCCESS, session=session) state = dr.update_state() self.assertEqual(State.SUCCESS, state) # upstream dependency failed, root has not run ti_op1.set_state(State.NONE, session) state = dr.update_state() self.assertEqual(State.FAILED, state)
def test_scheduler_reschedule(self):
    """
    Checks if tasks that are not taken up by the executor
    get rescheduled
    """
    executor = TestExecutor()

    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(
        dag_id='test_scheduler_reschedule',
        start_date=DEFAULT_DATE)
    dag_task1 = DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        # Use an empty file since the above mock will return the
        # expected DAGs. Also specify only a single file so that it doesn't
        # try to schedule the above DAG repeatedly.
        scheduler = SchedulerJob(num_runs=1,
                                 executor=executor,
                                 subdir=os.path.join(models.DAGS_FOLDER,
                                                     "no_dags.py"))
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEquals(1, len(executor.queued_tasks))
    executor.queued_tasks.clear()

    do_schedule()
    self.assertEquals(2, len(executor.queued_tasks))
def test_scheduler_dagrun_once(self):
    """
    Test if the scheduler does not create multiple dagruns
    if a dag is scheduled with @once and a start_date
    """
    dag = DAG(
        'test_scheduler_dagrun_once',
        start_date=datetime.datetime(2015, 1, 1),
        schedule_interval="@once")

    scheduler = SchedulerJob()
    dag.clear()
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    dr = scheduler.create_dag_run(dag)
    self.assertIsNone(dr)
def test_dagstats_crud(self): DagStat.create(dag_id='test_dagstats_crud') session = settings.Session() qry = session.query(DagStat).filter(DagStat.dag_id == 'test_dagstats_crud') self.assertEqual(len(qry.all()), len(State.dag_states)) DagStat.set_dirty(dag_id='test_dagstats_crud') res = qry.all() for stat in res: self.assertTrue(stat.dirty) # create missing DagStat.set_dirty(dag_id='test_dagstats_crud_2') qry2 = session.query(DagStat).filter(DagStat.dag_id == 'test_dagstats_crud_2') self.assertEqual(len(qry2.all()), len(State.dag_states)) dag = DAG( 'test_dagstats_crud', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'}) with dag: op1 = DummyOperator(task_id='A') now = datetime.datetime.now() dr = dag.create_dagrun( run_id='manual__' + now.isoformat(), execution_date=now, start_date=now, state=State.FAILED, external_trigger=False, ) DagStat.update(dag_ids=['test_dagstats_crud']) res = qry.all() for stat in res: if stat.state == State.FAILED: self.assertEqual(stat.count, 1) else: self.assertEqual(stat.count, 0) DagStat.update() res = qry2.all() for stat in res: self.assertFalse(stat.dirty)
def setUp(self):
    self.dag = DAG('branch_operator_test',
                   default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE},
                   schedule_interval=INTERVAL)
    self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
    self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
def setUp(self):
    configuration.load_test_config()
    app = application.create_app(testing=True)
    app.config['WTF_CSRF_METHODS'] = []
    self.app = app.test_client()
    self.session = Session()
    from airflow.www.views import dagbag
    from airflow.utils.state import State

    dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
    dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)

    self.runs = []
    for rd in self.RUNS_DATA:
        run = dag.create_dagrun(
            run_id=rd[0],
            execution_date=rd[1],
            state=State.SUCCESS,
            external_trigger=True
        )
        self.runs.append(run)
def test_scheduler_reschedule(self): """ Checks if tasks that are not taken up by the executor get rescheduled """ executor = TestExecutor() dagbag = DagBag(executor=executor) dagbag.dags.clear() dagbag.executor = executor dag = DAG( dag_id='test_scheduler_reschedule', start_date=DEFAULT_DATE) dag_task1 = DummyOperator( task_id='dummy', dag=dag, owner='airflow') dag.clear() dag.is_subdag = False session = settings.Session() orm_dag = DagModel(dag_id=dag.dag_id) orm_dag.is_paused = False session.merge(orm_dag) session.commit() dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag) @mock.patch('airflow.models.DagBag', return_value=dagbag) @mock.patch('airflow.models.DagBag.collect_dags') def do_schedule(function, function2): scheduler = SchedulerJob(num_runs=1, executor=executor,) scheduler.heartrate = 0 scheduler.run() do_schedule() self.assertEquals(1, len(executor.queued_tasks)) executor.queued_tasks.clear() do_schedule() self.assertEquals(2, len(executor.queued_tasks))
def test_get_num_task_instances(self): test_dag_id = 'test_get_num_task_instances_dag' test_task_id = 'task_1' test_dag = DAG(dag_id=test_dag_id, start_date=DEFAULT_DATE) test_task = DummyOperator(task_id=test_task_id, dag=test_dag) ti1 = TI(task=test_task, execution_date=DEFAULT_DATE) ti1.state = None ti2 = TI(task=test_task, execution_date=DEFAULT_DATE + datetime.timedelta(days=1)) ti2.state = State.RUNNING ti3 = TI(task=test_task, execution_date=DEFAULT_DATE + datetime.timedelta(days=2)) ti3.state = State.QUEUED ti4 = TI(task=test_task, execution_date=DEFAULT_DATE + datetime.timedelta(days=3)) ti4.state = State.RUNNING session = settings.Session() session.merge(ti1) session.merge(ti2) session.merge(ti3) session.merge(ti4) session.commit() self.assertEqual( 0, DAG.get_num_task_instances(test_dag_id, ['fakename'], session=session) ) self.assertEqual( 4, DAG.get_num_task_instances(test_dag_id, [test_task_id], session=session) ) self.assertEqual( 4, DAG.get_num_task_instances( test_dag_id, ['fakename', test_task_id], session=session) ) self.assertEqual( 1, DAG.get_num_task_instances( test_dag_id, [test_task_id], states=[None], session=session) ) self.assertEqual( 2, DAG.get_num_task_instances( test_dag_id, [test_task_id], states=[State.RUNNING], session=session) ) self.assertEqual( 3, DAG.get_num_task_instances( test_dag_id, [test_task_id], states=[None, State.RUNNING], session=session) ) self.assertEqual( 4, DAG.get_num_task_instances( test_dag_id, [test_task_id], states=[None, State.QUEUED, State.RUNNING], session=session) ) session.close()
def generate_dag_run(): return [DagRunOrder(payload={'timeout': i}) for i in range(10)] def after_dags_handler(): print("All target DAGs are finished") args = { 'start_date': days_ago(1), 'owner': 'airflow', } dag = DAG( dag_id='trigger_with_multi_dagrun_sensor', max_active_runs=1, schedule_interval='@hourly', default_args=args, ) gen_target_dag_run = TriggerMultiDagRunOperator( task_id='gen_target_dag_run', dag=dag, trigger_dag_id='common_target', python_callable=generate_dag_run, ) # Wait until there is no running instance of target DAG wait_target_dag = MultiDagRunSensor(task_id='wait_target_dag', dag=dag) wait_target_dag.set_upstream(gen_target_dag_run) after_dags_handler_op = PythonOperator(task_id='after_dags_handler',
def _get_task_instance(self, state):
    dag = DAG('test_dag')
    task = Mock(dag=dag)
    ti = TaskInstance(task=task, state=state, execution_date=None)
    return ti
def nested_subdags(): from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators.subdag_operator import SubDagOperator import datetime DAG_NAME = 'master' DEFAULT_ARGS = { 'owner': 'owner1', 'start_date': datetime.datetime(2016, 1, 1) } dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS) # master: # A -> opSubdag_0 # master.opSubdag_0: # -> opSubDag_A # master.opSubdag_0.opSubdag_A: # -> subdag_A.task # -> opSubdag_B # master.opSubdag_0.opSubdag_B: # -> subdag_B.task # A -> opSubdag_1 # master.opSubdag_1: # -> opSubdag_C # master.opSubdag_1.opSubdag_C: # -> subdag_C.task # -> opSubDag_D # master.opSubdag_1.opSubdag_D: # -> subdag_D.task with dag: def subdag_A(): subdag_A = DAG('master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_A.task', dag=subdag_A) return subdag_A def subdag_B(): subdag_B = DAG('master.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_B.task', dag=subdag_B) return subdag_B def subdag_C(): subdag_C = DAG('master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_C.task', dag=subdag_C) return subdag_C def subdag_D(): subdag_D = DAG('master.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_D.task', dag=subdag_D) return subdag_D def subdag_0(): subdag_0 = DAG('master.opSubdag_0', default_args=DEFAULT_ARGS) SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A()) SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B()) return subdag_0 def subdag_1(): subdag_1 = DAG('master.opSubdag_1', default_args=DEFAULT_ARGS) SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C()) SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D()) return subdag_1 opSubdag_0 = SubDagOperator(task_id='opSubdag_0', dag=dag, subdag=subdag_0()) opSubdag_1 = SubDagOperator(task_id='opSubdag_1', dag=dag, subdag=subdag_1()) opA = DummyOperator(task_id='A') opA.set_downstream(opSubdag_0) opA.set_downstream(opSubdag_1) return dag
def subdag_C():
    subdag_C = DAG('master.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_C.task', dag=subdag_C)
    return subdag_C
def subdag_B():
    subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
    return subdag_B
from datetime import datetime from airflow.models import DAG from airflow.operators.bash import BashOperator from airflow.operators.python import PythonOperator from airflow.operators.subdag_operator import SubDagOperator DEFAULT_DATE = datetime(2016, 1, 1) default_args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'run_as_user': '******' } dag = DAG(dag_id='impersonation_subdag', default_args=default_args) def print_today(): print('Today is {}'.format(datetime.utcnow())) subdag = DAG('impersonation_subdag.test_subdag_operation', default_args=default_args) PythonOperator(python_callable=print_today, task_id='exec_python_fn', dag=subdag) BashOperator(task_id='exec_bash_operator', bash_command='echo "Running within SubDag"',
import pathlib
import posixpath

import airflow
import requests
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

from airflow_breakfast.utils.slack import send_slack_message

args = {
    "owner": "godatadriven",
    "start_date": airflow.utils.dates.days_ago(10)
}

dag = DAG(
    dag_id="2_rockets",
    default_args=args,
    description="DAG downloading rocket launches from Launch Library.",
    # e.g. https://launchlibrary.net/1.4/launch?startdate=2019-04-10&enddate=2019-04-21
    schedule_interval="0 0 * * *",
)


def _download_rocket_launches(ds, next_ds, **_):
    query = f"https://launchlibrary.net/1.4/launch?startdate={ds}&enddate={next_ds}"
    result_path = f"/data/rocket_launches/ds={ds}"
    pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)
    result_file = posixpath.join(result_path, "launches.json")

    response = requests.get(query)
    with open(result_file, "w") as f:
        f.write(response.text)
    print(f"Wrote result to file {result_file}")
import yaml
import glob
from datetime import datetime

from airflow.models import DAG
from airflow.operators.postgres_operator import PostgresOperator

YAML_DIR = '/usr/local/airflow/dags'

default_args = {'start_date': datetime(2019, 1, 1)}

dag = DAG(dag_id='example_yaml', default_args=default_args)

with dag:
    for filename in glob.glob(YAML_DIR + '/*.yaml'):
        with open(filename, 'r') as stream:
            yaml_data = yaml.safe_load(stream)
            incremental_task = PostgresOperator(
                task_id=yaml_data['task_id'],
                sql=yaml_data['sql'],
            )
class TestSqlBranch(TestHiveEnvironment, unittest.TestCase): """ Test for SQL Branch Operator """ @classmethod def setUpClass(cls): super().setUpClass() with create_session() as session: session.query(DagRun).delete() session.query(TI).delete() def setUp(self): super().setUp() self.dag = DAG( "sql_branch_operator_test", default_args={ "owner": "airflow", "start_date": DEFAULT_DATE }, schedule_interval=INTERVAL, ) self.branch_1 = DummyOperator(task_id="branch_1", dag=self.dag) self.branch_2 = DummyOperator(task_id="branch_2", dag=self.dag) self.branch_3 = None def tearDown(self): super().tearDown() with create_session() as session: session.query(DagRun).delete() session.query(TI).delete() def test_unsupported_conn_type(self): """ Check if BranchSqlOperator throws an exception for unsupported connection type """ op = BranchSqlOperator( task_id="make_choice", conn_id="redis_default", sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) with self.assertRaises(AirflowException): op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_invalid_conn(self): """ Check if BranchSqlOperator throws an exception for invalid connection """ op = BranchSqlOperator( task_id="make_choice", conn_id="invalid_connection", sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) with self.assertRaises(AirflowException): op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_invalid_follow_task_true(self): """ Check if BranchSqlOperator throws an exception for invalid connection """ op = BranchSqlOperator( task_id="make_choice", conn_id="invalid_connection", sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES", follow_task_ids_if_true=None, follow_task_ids_if_false="branch_2", dag=self.dag, ) with self.assertRaises(AirflowException): op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_invalid_follow_task_false(self): """ Check if BranchSqlOperator throws an exception for invalid connection """ op = BranchSqlOperator( task_id="make_choice", conn_id="invalid_connection", sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES", follow_task_ids_if_true="branch_1", follow_task_ids_if_false=None, dag=self.dag, ) with self.assertRaises(AirflowException): op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) @pytest.mark.backend("mysql") def test_sql_branch_operator_mysql(self): """ Check if BranchSqlOperator works with backend """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) @pytest.mark.backend("postgres") def test_sql_branch_operator_postgres(self): """ Check if BranchSqlOperator works with backend """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="postgres_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_branch_single_value_with_dag_run(self, mock_hook): """ Check BranchSqlOperator branch operation """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", 
sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) self.branch_1.set_upstream(branch_op) self.branch_2.set_upstream(branch_op) self.dag.clear() dr = self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. return_value.get_first) mock_get_records.return_value = 1 branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == "make_choice": self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == "branch_1": self.assertEqual(ti.state, State.NONE) elif ti.task_id == "branch_2": self.assertEqual(ti.state, State.SKIPPED) else: raise ValueError(f"Invalid task id {ti.task_id} found!") @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_branch_true_with_dag_run(self, mock_hook): """ Check BranchSqlOperator branch operation """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) self.branch_1.set_upstream(branch_op) self.branch_2.set_upstream(branch_op) self.dag.clear() dr = self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. return_value.get_first) for true_value in SUPPORTED_TRUE_VALUES: mock_get_records.return_value = true_value branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == "make_choice": self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == "branch_1": self.assertEqual(ti.state, State.NONE) elif ti.task_id == "branch_2": self.assertEqual(ti.state, State.SKIPPED) else: raise ValueError(f"Invalid task id {ti.task_id} found!") @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_branch_false_with_dag_run(self, mock_hook): """ Check BranchSqlOperator branch operation """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) self.branch_1.set_upstream(branch_op) self.branch_2.set_upstream(branch_op) self.dag.clear() dr = self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. 
return_value.get_first) for false_value in SUPPORTED_FALSE_VALUES: mock_get_records.return_value = false_value branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == "make_choice": self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == "branch_1": self.assertEqual(ti.state, State.SKIPPED) elif ti.task_id == "branch_2": self.assertEqual(ti.state, State.NONE) else: raise ValueError(f"Invalid task id {ti.task_id} found!") @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_branch_list_with_dag_run(self, mock_hook): """ Checks if the BranchSqlOperator supports branching off to a list of tasks.""" branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true=["branch_1", "branch_2"], follow_task_ids_if_false="branch_3", dag=self.dag, ) self.branch_1.set_upstream(branch_op) self.branch_2.set_upstream(branch_op) self.branch_3 = DummyOperator(task_id="branch_3", dag=self.dag) self.branch_3.set_upstream(branch_op) self.dag.clear() dr = self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. return_value.get_first) mock_get_records.return_value = [["1"]] branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == "make_choice": self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == "branch_1": self.assertEqual(ti.state, State.NONE) elif ti.task_id == "branch_2": self.assertEqual(ti.state, State.NONE) elif ti.task_id == "branch_3": self.assertEqual(ti.state, State.SKIPPED) else: raise ValueError(f"Invalid task id {ti.task_id} found!") @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_invalid_query_result_with_dag_run(self, mock_hook): """ Check BranchSqlOperator branch operation """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) self.branch_1.set_upstream(branch_op) self.branch_2.set_upstream(branch_op) self.dag.clear() self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. return_value.get_first) mock_get_records.return_value = ["Invalid Value"] with self.assertRaises(AirflowException): branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_with_skip_in_branch_downstream_dependencies(self, mock_hook): """ Test SQL Branch with skipping all downstream dependencies """ branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) branch_op >> self.branch_1 >> self.branch_2 branch_op >> self.branch_2 self.dag.clear() dr = self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. 
return_value.get_first) for true_value in SUPPORTED_TRUE_VALUES: mock_get_records.return_value = [true_value] branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == "make_choice": self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == "branch_1": self.assertEqual(ti.state, State.NONE) elif ti.task_id == "branch_2": self.assertEqual(ti.state, State.NONE) else: raise ValueError(f"Invalid task id {ti.task_id} found!") @mock.patch("airflow.operators.sql_branch_operator.BaseHook") def test_with_skip_in_branch_downstream_dependencies2(self, mock_hook): """ Test skipping downstream dependency for false condition""" branch_op = BranchSqlOperator( task_id="make_choice", conn_id="mysql_default", sql="SELECT 1", follow_task_ids_if_true="branch_1", follow_task_ids_if_false="branch_2", dag=self.dag, ) branch_op >> self.branch_1 >> self.branch_2 branch_op >> self.branch_2 self.dag.clear() dr = self.dag.create_dagrun( run_id="manual__", start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING, ) mock_hook.get_connection("mysql_default").conn_type = "mysql" mock_get_records = (mock_hook.get_connection.return_value.get_hook. return_value.get_first) for false_value in SUPPORTED_FALSE_VALUES: mock_get_records.return_value = [false_value] branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == "make_choice": self.assertEqual(ti.state, State.SUCCESS) elif ti.task_id == "branch_1": self.assertEqual(ti.state, State.SKIPPED) elif ti.task_id == "branch_2": self.assertEqual(ti.state, State.NONE) else: raise ValueError(f"Invalid task id {ti.task_id} found!")
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

args = {
    'owner': 'shitao',
    # 'start_date': airflow.utils.dates.days_ago(2),
    'start_date': datetime(2018, 1, 1),
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
}

dag = DAG(
    dag_id='shopping_cart',
    default_args=args,
    schedule_interval='0 5 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

daily = SSHOperator(
    ssh_conn_id='ws@hdp-0',
    task_id='daily',
    command='cd /usr/local/bigdata/jobtaskh0/pythonjob/pyspark_template/ && spark-submit \
    --num-executors 4 \
    --executor-memory 4G \
    --executor-cores 4 \
    --driver-memory 4G \
    --driver-cores 4 \
    --jars /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
    --driver-class-path /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
import airflow
from datetime import timedelta

from airflow.models import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

args = {
    'owner': 'mayx',
    'start_date': airflow.utils.dates.days_ago(2),
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
}

# Daily job: depending on the job type you need, comment or uncomment
# the relevant DAG definition below
dag = DAG(
    dag_id='device_total_day',
    default_args=args,
    schedule_interval='30 5 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

# # Weekly job
# dag = DAG(
#     dag_id='airflow_pyspark_template_week',
#     default_args=args,
#     schedule_interval='50 6 * * 1',
#     dagrun_timeout=timedelta(minutes=60),
# )
#
# # Monthly job (the dag_id needs to be changed)
# dag = DAG(
#     dag_id='airflow_pyspark_template_week',
# with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from datetime import datetime from time import sleep from airflow.models import DAG from airflow.operators.python import PythonOperator DEFAULT_DATE = datetime(2016, 1, 1) args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, } dag = DAG(dag_id='test_mark_success', default_args=args) task = PythonOperator( task_id='task1', python_callable=lambda x: sleep(x), # pylint: disable=W0108 op_args=[600], dag=dag)
dag_id = 'salesforce_recommendation_reason' # independent_reasons = { # 'hot_location_longterm': hot_location_longterm, # 'hot_location_occupancy': hot_location_occupancy, # 'hot_location_shortterm': hot_location_shortterm, # } """ Create a DAG to execute tasks """ dag = DAG( dag_id=dag_id, default_args=args, schedule_interval=None, ) main_op = DummyOperator( task_id = 'main_entrance', dag= dag, ) generate_pair_op = PythonOperator( task_id='generate_pairs', python_callable=generate_pairs, dag=dag, ) merging_op = PythonOperator(
@stakeholders: People who learn
"""
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.models import DAG
import datetime
import logging


def say_hello(**context):
    """
    Function that puts an example string into the task log.

    :param context:
    :return:
    """
    logging.info(f'Everything Works! {datetime.datetime.now()}')


dag = DAG(dag_id='hello_world',
          schedule_interval=None,
          start_date=datetime.datetime(2020, 1, 1),
          default_args={"owner": "airflow_lesson"})

start = DummyOperator(task_id='start_dag', dag=dag)
hello = PythonOperator(task_id='say_hello', python_callable=say_hello, dag=dag)
end = DummyOperator(task_id='end_dag', dag=dag)

start >> hello >> end
def setUp(self):
    dag = DAG('dag_for_testing_filename_rendering', start_date=DEFAULT_DATE)
    task = DummyOperator(task_id='task_for_testing_filename_rendering', dag=dag)
    self.ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.models import DAG

args = {
    'owner': 'Freddy Drennan',
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True
}

dag = DAG(dag_id='update_ip',
          default_args=args,
          schedule_interval='@hourly',
          concurrency=1,
          max_active_runs=1,
          catchup=False)

task_1 = BashOperator(
    task_id='update_ip',
    bash_command='. /home/scripts/R/shell/update_ip',
    dag=dag
)
# assemble env vars env_vars = Variable.get("atd_knack_services_postgrest", deserialize_json=True) atd_knack_auth = Variable.get("atd_knack_auth", deserialize_json=True) env_vars["KNACK_APP_ID"] = atd_knack_auth[app_name][env]["app_id"] env_vars["KNACK_API_KEY"] = atd_knack_auth[app_name][env]["api_key"] env_vars["SOCRATA_API_KEY_ID"] = Variable.get("atd_service_bot_socrata_api_key_id") env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get( "atd_service_bot_socrata_api_key_secret" ) env_vars["SOCRATA_APP_TOKEN"] = Variable.get("atd_service_bot_socrata_app_token") with DAG( dag_id="atd_knack_mmc_activities_to_s3_to_socrata", default_args=default_args, schedule_interval="20 6 * * *", dagrun_timeout=timedelta(minutes=300), tags=["production", "knack"], catchup=False, ) as dag: date = "{{ prev_execution_date_success or '1970-01-01' }}" t1 = DockerOperator( task_id="atd_knack_mmc_activities_to_postgrest", image=docker_image, api_version="auto", auto_remove=True, command=f'./atd-knack-services/services/{task_1_script}.py -a {app_name} -c {container} -d "{date}"', # noqa docker_url="tcp://localhost:2376", network_mode="bridge", environment=env_vars,
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

args = {
    'owner': 'noah',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(dag_id='mock_loan_predicting',
          default_args=args,
          schedule_interval="@once")

# Tasks
load_unseen_data = PythonOperator(
    task_id='load_raw_unseen_prediction_data',
    provide_context=True,
    python_callable=dag_functions.load_unseen_prediction_data_from_db,
    dag=dag)

wrangle_unseen_data = PythonOperator(
    task_id='wrangle_unseen_data',
    provide_context=True,
    python_callable=dag_functions.wrangle_unseen_data,
    dag=dag)
def subdag_D():
    subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
    return subdag_D
from pprint import pprint

import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

from scr.quality.prod import ppt_email

args = {
    'owner': 'lishulong',
    'start_date': airflow.utils.dates.days_ago(2),
    'depends_on_past': False,
}


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    ppt_email()
    return 'Whatever you return gets printed in the logs'


dag = DAG(dag_id='python_quality',
          default_args=args,
          schedule_interval='0 0 * * *')

s2 = PythonOperator(task_id='prod_email',
                    provide_context=True,
                    python_callable=print_context,
                    dag=dag)
def nested_subdag_cycle(): from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators.subdag_operator import SubDagOperator import datetime DAG_NAME = 'nested_cycle' DEFAULT_ARGS = { 'owner': 'owner1', 'start_date': datetime.datetime(2016, 1, 1) } dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS) # cycle: # A -> opSubdag_0 # cycle.opSubdag_0: # -> opSubDag_A # cycle.opSubdag_0.opSubdag_A: # -> subdag_A.task # -> opSubdag_B # cycle.opSubdag_0.opSubdag_B: # -> subdag_B.task # A -> opSubdag_1 # cycle.opSubdag_1: # -> opSubdag_C # cycle.opSubdag_1.opSubdag_C: # -> subdag_C.task -> subdag_C.task >Invalid Loop< # -> opSubDag_D # cycle.opSubdag_1.opSubdag_D: # -> subdag_D.task with dag: def subdag_A(): subdag_A = DAG('nested_cycle.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_A.task', dag=subdag_A) return subdag_A def subdag_B(): subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_B.task', dag=subdag_B) return subdag_B def subdag_C(): subdag_C = DAG('nested_cycle.opSubdag_1.opSubdag_C', default_args=DEFAULT_ARGS) opSubdag_C_task = DummyOperator(task_id='subdag_C.task', dag=subdag_C) # introduce a loop in opSubdag_C opSubdag_C_task.set_downstream(opSubdag_C_task) return subdag_C def subdag_D(): subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D', default_args=DEFAULT_ARGS) DummyOperator(task_id='subdag_D.task', dag=subdag_D) return subdag_D def subdag_0(): subdag_0 = DAG('nested_cycle.opSubdag_0', default_args=DEFAULT_ARGS) SubDagOperator(task_id='opSubdag_A', dag=subdag_0, subdag=subdag_A()) SubDagOperator(task_id='opSubdag_B', dag=subdag_0, subdag=subdag_B()) return subdag_0 def subdag_1(): subdag_1 = DAG('nested_cycle.opSubdag_1', default_args=DEFAULT_ARGS) SubDagOperator(task_id='opSubdag_C', dag=subdag_1, subdag=subdag_C()) SubDagOperator(task_id='opSubdag_D', dag=subdag_1, subdag=subdag_D()) return subdag_1 opSubdag_0 = SubDagOperator(task_id='opSubdag_0', dag=dag, subdag=subdag_0()) opSubdag_1 = SubDagOperator(task_id='opSubdag_1', dag=dag, subdag=subdag_1()) opA = DummyOperator(task_id='A') opA.set_downstream(opSubdag_0) opA.set_downstream(opSubdag_1) return dag
ALERT_EMAIL_ADDRESSES = [ ] # List of email address to send email alerts to if this job fails ENABLE_DELETE = True # Whether the job should delete the logs or not. Included if you want to temporarily avoid deleting the logs default_args = { 'owner': DAG_OWNER_NAME, 'email': ALERT_EMAIL_ADDRESSES, 'email_on_failure': False, 'email_on_retry': False, 'start_date': START_DATE, 'retries': 1, 'retry_delay': timedelta(minutes=1) } dag = DAG(DAG_ID, default_args=default_args, schedule_interval=SCHEDULE_INTERVAL, start_date=START_DATE) if hasattr(dag, 'doc_md'): dag.doc_md = __doc__ if hasattr(dag, 'catchup'): dag.catchup = False def clear_missing_dags_fn(**context): logging.info("Starting to run Clear Process") try: host_name = socket.gethostname() host_ip = socket.gethostbyname(host_name) logging.info("Running on Machine with Host Name: " + host_name)
def subdag_A():
    subdag_A = DAG('master.opSubdag_0.opSubdag_A', default_args=DEFAULT_ARGS)
    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
    return subdag_A
from builtins import range from datetime import timedelta import airflow from airflow.models import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.dummy_operator import DummyOperator args = { 'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2), } dag = DAG( dag_id='example_bash_operator', default_args=args, schedule_interval='0 0 * * *', dagrun_timeout=timedelta(minutes=60), ) run_this_last = DummyOperator( task_id='run_this_last', dag=dag, ) # [START howto_operator_bash] run_this = BashOperator( task_id='run_after_loop', bash_command='echo 1', dag=dag, ) # [END howto_operator_bash]
from datetime import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'ivanfdz',
    'start_date': datetime(2020, 5, 20, 11, 0, 0)
}


def hello_world_loop():
    for palabra in ['hello', 'world']:
        print(palabra)


with DAG('dag_prueba', default_args=default_args, schedule_interval='@daily') as dag:
    start = DummyOperator(task_id='start')
    prueba_python = PythonOperator(task_id='prueba_python',
                                   python_callable=hello_world_loop)
    prueba_bash = BashOperator(task_id='prueba_bash',
                               bash_command='echo prueba_bash')

    start >> prueba_python >> prueba_bash
import pprint
from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

pp = pprint.PrettyPrinter(indent=4)

args = {
    # TODO use current date without harming dag functionality
    'start_date': datetime(2019, 6, 15),
    'owner': 'airflow',
}

dag = DAG(dag_id='proxyscraper_dag',
          default_args=args,
          schedule_interval='*/15 * * * *',
          catchup=False)

collect_proxies = BashOperator(
    task_id='get_proxies',
    bash_command="cd /FIFA/fifa_data/ && python3 -m fifa_data.spiders.fate_proxy ",
    dag=dag
)
from airflow.models import DAG from airflow.providers.apache.spark.operators.spark_sql import SparkSqlOperator from airflow.providers.jdbc.operators.jdbc import JdbcOperator from airflow.utils.dates import days_ago from datetime import timedelta, datetime as dt args = { 'owner': 'Seshu Edala', } with DAG( dag_id='album_external_to_album', default_args=args, schedule_interval='*/30 * * * *', dagrun_timeout=timedelta(minutes=5), start_date=days_ago(1), tags=['album_external_to_album', 'load_data', 'aluminium'], catchup=False, ) as dag: ''' -- create control table create database if not exists meta; create external table if not exists meta.control (data_file string, al_table string, process_time timestamp) using delta location "s3a://spark/warehouse/control"; -- switch to correct database use music; -- drop previous table drop view if exists global_temp.album_{__signature__};
def _get_task(self, **kwargs):
    return BaseOperator(task_id='test_task', dag=DAG('test_dag'), **kwargs)
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.sensors import ExternalTaskSensor

# https://towardsdatascience.com/dependencies-between-dags-in-apache-airflow-2f5935cde3f0
dag = DAG(
    dag_id='dependencia_tres',
    schedule_interval='@once',
    # owner: 'test',
    start_date=days_ago(0),
    catchup=False)


def print_success_message(**kwargs):
    print("Success!!")


def print_end_message(**kwargs):
    print("END")


externalsensor1 = ExternalTaskSensor(
    task_id='dependencia_dos_completed_Status',
    external_dag_id='dependencia_dos',
    external_task_id=None,
    check_existence=True)

success = PythonOperator(task_id='success',
                         python_callable=print_success_message,
"""_dags file for 'council districts' sde extraction.""" from airflow.models import DAG from trident.util import general from dags.sde.parks_jobs import sde_to_shp from trident.util.sde_extract_tasks import create_sde_tasks args = general.args conf = general.config schedule = general.schedule['gis_weekly'] start_date = general.start_date['gis_weekly'] folder = 'parks' layer = 'parks' datasd_name = 'parks_datasd' md = 'park-locations' path_to_file = conf['prod_data_dir'] + '/' + datasd_name dag = DAG(dag_id='gis_{layer}'.format(layer=layer), default_args=args, start_date=start_date, schedule_interval=schedule) #: Create tasks dynamically create_sde_tasks(dag=dag, folder=folder, layer=layer, datasd_name=datasd_name, md=md, path_to_file=path_to_file, sde_to_shp=sde_to_shp)