def test_failure_callback_only_called_once(self, mock_return_code, _check_call):
    """
    Test that ensures that when a task exits with failure by itself,
    failure callback is only called once.
    """
    # Use a shared memory value so we can properly track the value change
    # even if it's been updated across processes.
    failure_callback_called = Value('i', 0)
    callback_count_lock = Lock()

    def failure_callback(context):
        with callback_count_lock:
            failure_callback_called.value += 1
        assert context['dag_run'].dag_id == 'test_failure_callback_race'
        assert isinstance(context['exception'], AirflowFailException)

    def task_function(ti):
        raise AirflowFailException()

    dag = DAG(dag_id='test_failure_callback_race', start_date=DEFAULT_DATE)
    task = PythonOperator(
        task_id='test_exit_on_failure',
        python_callable=task_function,
        on_failure_callback=failure_callback,
        dag=dag,
    )

    dag.clear()
    with create_session() as session:
        dag.create_dagrun(
            run_id="test",
            state=State.RUNNING,
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            session=session,
        )
    ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
    ti.refresh_from_db()

    job1 = LocalTaskJob(task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())

    # Simulate a race condition where the job1 heartbeat ran right after the task
    # state got set to failed by ti.handle_failure, but before the task process
    # fully exited. See the _execute loop in airflow/jobs/local_task_job.py.
    # In this case, we have:
    #  * task_runner.return_code() is None
    #  * ti.state == State.FAILED
    #
    # We also need to set return_code to a valid int after job1.terminating
    # is set to True so the _execute loop won't loop forever.
    def dummy_return_code(*args, **kwargs):
        return None if not job1.terminating else -9

    mock_return_code.side_effect = dummy_return_code

    with timeout(10):
        # This should be _much_ shorter to run.
        # If you change this limit, make the timeout in the callable above bigger.
        job1.run()

    ti.refresh_from_db()
    assert ti.state == State.FAILED  # task exits with failure state
    assert failure_callback_called.value == 1
def setUp(self):
    args = {'owner': 'airflow', 'start_date': datetime.datetime(2017, 1, 1)}
    self.dag = DAG('test_dag_id', default_args=args)
def dag_backfill(args, dag=None):
    """Creates backfill job or dry run for a DAG"""
    logging.basicConfig(level=settings.LOGGING_LEVEL, format=settings.SIMPLE_LOG_FORMAT)

    signal.signal(signal.SIGTERM, sigint_handler)

    import warnings

    warnings.warn(
        '--ignore-first-depends-on-past is deprecated as the value is always set to True',
        category=PendingDeprecationWarning,
    )

    if args.ignore_first_depends_on_past is False:
        args.ignore_first_depends_on_past = True

    dag = dag or get_dag(args.subdir, args.dag_id)

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    # If only one date is passed, use it for both the start and end date
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.sub_dag(task_regex=args.task_regex, include_upstream=not args.ignore_dependencies)

    run_conf = None
    if args.conf:
        run_conf = json.loads(args.conf)

    if args.dry_run:
        print("Dry run of DAG {0} on {1}".format(args.dag_id, args.start_date))
        for task in dag.tasks:
            print("Task {0}".format(task.task_id))
            ti = TaskInstance(task, args.start_date)
            ti.dry_run()
    else:
        if args.reset_dagruns:
            DAG.clear_dags(
                [dag],
                start_date=args.start_date,
                end_date=args.end_date,
                confirm_prompt=not args.yes,
                include_subdags=True,
            )

        dag.run(
            start_date=args.start_date,
            end_date=args.end_date,
            mark_success=args.mark_success,
            local=args.local,
            donot_pickle=(args.donot_pickle or conf.getboolean('core', 'donot_pickle')),
            ignore_first_depends_on_past=args.ignore_first_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            pool=args.pool,
            delay_on_limit_secs=args.delay_on_limit,
            verbose=args.verbose,
            conf=run_conf,
            rerun_failed_tasks=args.rerun_failed_tasks,
            run_backwards=args.run_backwards,
        )
def dag_bag_ext():
    """
    Create a DagBag with DAGs looking like this. The dotted lines represent external
    dependencies set up using ExternalTaskMarker and ExternalTaskSensor.

    dag_0:   task_a_0 >> task_b_0
                             |
                             |
    dag_1:               ---> task_a_1 >> task_b_1
                                              |
                                              |
    dag_2:                                ---> task_a_2 >> task_b_2
                                                               |
                                                               |
    dag_3:                                                 ---> task_a_3 >> task_b_3
    """
    dag_bag = DagBag(dag_folder=DEV_NULL, include_examples=False)

    dag_0 = DAG("dag_0", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_0 = DummyOperator(task_id="task_a_0", dag=dag_0)
    task_b_0 = ExternalTaskMarker(
        task_id="task_b_0",
        external_dag_id="dag_1",
        external_task_id="task_a_1",
        recursion_depth=3,
        dag=dag_0,
    )
    task_a_0 >> task_b_0

    dag_1 = DAG("dag_1", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_1 = ExternalTaskSensor(
        task_id="task_a_1",
        external_dag_id=dag_0.dag_id,
        external_task_id=task_b_0.task_id,
        dag=dag_1,
    )
    task_b_1 = ExternalTaskMarker(
        task_id="task_b_1",
        external_dag_id="dag_2",
        external_task_id="task_a_2",
        recursion_depth=2,
        dag=dag_1,
    )
    task_a_1 >> task_b_1

    dag_2 = DAG("dag_2", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_2 = ExternalTaskSensor(
        task_id="task_a_2",
        external_dag_id=dag_1.dag_id,
        external_task_id=task_b_1.task_id,
        dag=dag_2,
    )
    task_b_2 = ExternalTaskMarker(
        task_id="task_b_2",
        external_dag_id="dag_3",
        external_task_id="task_a_3",
        recursion_depth=1,
        dag=dag_2,
    )
    task_a_2 >> task_b_2

    dag_3 = DAG("dag_3", start_date=DEFAULT_DATE, schedule_interval=None)
    task_a_3 = ExternalTaskSensor(
        task_id="task_a_3",
        external_dag_id=dag_2.dag_id,
        external_task_id=task_b_2.task_id,
        dag=dag_3,
    )
    task_b_3 = DummyOperator(task_id="task_b_3", dag=dag_3)
    task_a_3 >> task_b_3

    for dag in [dag_0, dag_1, dag_2, dag_3]:
        dag_bag.bag_dag(dag, None, dag)

    return dag_bag
    )
    for i in range(2):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag


with DAG(
    dag_id=DAG_NAME,
    start_date=datetime(2019, 1, 1),
    max_active_runs=1,
    default_args=DEFAULT_TASK_ARGS,
    schedule_interval=timedelta(minutes=1),
) as dag:
    start = DummyOperator(task_id='start')

    section_1 = SubDagOperator(
        task_id='section-1',
        subdag=subdag(DAG_NAME, 'section-1', DEFAULT_TASK_ARGS),
        default_args=DEFAULT_TASK_ARGS,
    )

    some_other_task = DummyOperator(task_id='some-other-task')

    start >> section_1 >> some_other_task  # pylint: disable=W0104
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from airflow.models.dag import DAG
from airflow.utils import timezone
from airflow.ti_deps.met_handlers.aiflow_met_handler import AIFlowMetHandler
from airflow.operators.dummy_operator import DummyOperator
from airflow.models.event import Event
from airflow.operators.send_event_operator import SendEventOperator
from airflow.operators.bash_operator import BashOperator

dag = DAG(dag_id='test_projec1', start_date=timezone.utcnow(), schedule_interval="@once")
env = {
    'PYTHONPATH': '/Users/chenwuchao/code/ali/ai_flow/python_ai_flow/test/python_codes/simple_python:/Users/chenwuchao/code/ali/ai_flow:/Users/chenwuchao/code/ali/ai_flow/flink_ai_flow/tests/python_codes:/Users/chenwuchao/code/ali/ai_flow/flink_ai_flow/tests:/Applications/PyCharm CE.app/Contents/helpers/pycharm:/anaconda3/lib/python37.zip:/anaconda3/lib/python3.7:/anaconda3/lib/python3.7/lib-dynload:/Users/chenwuchao/.local/lib/python3.7/site-packages:/anaconda3/lib/python3.7/site-packages:/anaconda3/lib/python3.7/site-packages/aeosa://anaconda3/lib/python3.7/site-packages:/Users/chenwuchao/airflow/dags:/Users/chenwuchao/airflow/config:/Users/chenwuchao/airflow/plugins:/Users/chenwuchao/code/ali/ai_flow/python_ai_flow:/Users/chenwuchao/code/ali/ai_flow/python_ai_flow/test/python_codes'
}
op_0 = BashOperator(
    task_id='None',
    dag=dag,
    bash_command='/anaconda3/bin/python /Users/chenwuchao/code/ali/ai_flow/python_ai_flow/local_job_run.py /Users/chenwuchao/code/ali/ai_flow/python_ai_flow/test tmp_funca533b537-8e45-439c-8f71-0ad8dd9409c0LocalPythonJob_0 tmp_args713c2a6b-c023-4340-96ee-22f7c62f15b3LocalPythonJob_0 test_simple_python',
    env=env,
)
    return DingdingOperator(
        task_id='dingding_success_callback',
        dingding_conn_id='dingding_default',
        message_type='text',
        message=message,
        at_all=True,
    ).execute(context)


args['on_failure_callback'] = failure_callback
# [END howto_operator_dingding_failure_callback]

with DAG(
    dag_id='example_dingding_operator',
    default_args=args,
    schedule_interval='@once',
    dagrun_timeout=timedelta(minutes=60),
    tags=['example'],
) as dag:

    # [START howto_operator_dingding]
    text_msg_remind_none = DingdingOperator(
        task_id='text_msg_remind_none',
        dingding_conn_id='dingding_default',
        message_type='text',
        message='Airflow dingding text message remind none',
        at_mobiles=None,
        at_all=False,
    )
    # [END howto_operator_dingding]

    text_msg_remind_specific = DingdingOperator(
def sync_to_db(self, session: Optional[Session] = None):
    """Save attributes about list of DAG to the DB."""
    # To avoid circular import - airflow.models.dagbag -> airflow.models.dag -> airflow.models.dagbag
    from airflow.models.dag import DAG
    from airflow.models.serialized_dag import SerializedDagModel

    def _serialize_dag_capturing_errors(dag, session):
        """
        Try to serialize the dag to the DB, but make a note of any errors.

        We can't place them directly in import_errors, as this may be retried,
        and work the next time.
        """
        if dag.is_subdag:
            return []
        try:
            # We can't use bulk_write_to_db as we want to capture each error individually
            SerializedDagModel.write_dag(
                dag,
                min_update_interval=settings.MIN_SERIALIZED_DAG_UPDATE_INTERVAL,
                session=session,
            )
            return []
        except OperationalError:
            raise
        except Exception:  # pylint: disable=broad-except
            return [(dag.fileloc, traceback.format_exc(limit=-self.dagbag_import_error_traceback_depth))]

    # Retry 'DAG.bulk_write_to_db' & 'SerializedDagModel.bulk_sync_to_db' in case
    # of any Operational Errors
    # In case of failures, provide_session handles rollback
    for attempt in tenacity.Retrying(
        retry=tenacity.retry_if_exception_type(exception_types=OperationalError),
        wait=tenacity.wait_random_exponential(multiplier=0.5, max=5),
        stop=tenacity.stop_after_attempt(settings.MAX_DB_RETRIES),
        before_sleep=tenacity.before_sleep_log(self.log, logging.DEBUG),
        reraise=True,
    ):
        with attempt:
            serialize_errors = []
            self.log.debug(
                "Running dagbag.sync_to_db with retries. Try %d of %d",
                attempt.retry_state.attempt_number,
                settings.MAX_DB_RETRIES,
            )
            self.log.debug("Calling the DAG.bulk_sync_to_db method")
            try:
                # Write Serialized DAGs to DB, capturing errors
                for dag in self.dags.values():
                    serialize_errors.extend(_serialize_dag_capturing_errors(dag, session))
                DAG.bulk_write_to_db(self.dags.values(), session=session)
            except OperationalError:
                session.rollback()
                raise
            # Only now we are "complete" do we update import_errors - don't want to record errors from
            # previous failed attempts
            self.import_errors.update(dict(serialize_errors))
def set_dag_run_state_to_failed(
    *,
    dag: DAG,
    execution_date: Optional[datetime] = None,
    run_id: Optional[str] = None,
    commit: bool = False,
    session: SASession = NEW_SESSION,
):
    """
    Set the dag run for a specific execution date or run_id and its running task instances
    to failed.

    :param dag: the DAG of which to alter state
    :param execution_date: the execution date from which to start looking (deprecated)
    :param run_id: the DAG run_id to start looking from
    :param commit: commit DAG and tasks to be altered to the database
    :param session: database session
    :return: If commit is true, list of tasks that have been updated,
        otherwise list of tasks that will be updated
    :raises: ValueError if dag or execution_date is invalid
    """
    if not exactly_one(execution_date, run_id):
        return []

    if not dag:
        return []

    if execution_date:
        if not timezone.is_localized(execution_date):
            raise ValueError(f"Received non-localized date {execution_date}")
        dag_run = dag.get_dagrun(execution_date=execution_date)
        if not dag_run:
            raise ValueError(f'DagRun with execution_date: {execution_date} not found')
        run_id = dag_run.run_id

    if not run_id:
        raise ValueError(f'Invalid dag_run_id: {run_id}')

    # Mark the dag run to failed.
    if commit:
        _set_dag_run_state(dag.dag_id, run_id, DagRunState.FAILED, session)

    # Mark only RUNNING task instances.
    task_ids = [task.task_id for task in dag.tasks]
    tis = session.query(TaskInstance).filter(
        TaskInstance.dag_id == dag.dag_id,
        TaskInstance.run_id == run_id,
        TaskInstance.task_id.in_(task_ids),
        TaskInstance.state.in_(State.running),
    )
    task_ids_of_running_tis = [task_instance.task_id for task_instance in tis]

    tasks = []
    for task in dag.tasks:
        if task.task_id not in task_ids_of_running_tis:
            continue
        task.dag = dag
        tasks.append(task)

    return set_state(tasks=tasks, dag_run_id=run_id, state=State.FAILED, commit=commit, session=session)
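# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original source): how the helper
# above might be invoked. The `dag` object and run_id here are assumptions;
# with commit=False the call only previews the task instances that would be
# failed, while commit=True persists the state change.
#
# tasks_to_fail = set_dag_run_state_to_failed(
#     dag=dag,                      # a DAG already loaded from a DagBag (assumed)
#     run_id="manual__2021-01-01",  # hypothetical run_id
#     commit=False,
# )
# print([t.task_id for t in tasks_to_fail])
# set_dag_run_state_to_failed(dag=dag, run_id="manual__2021-01-01", commit=True)
# ---------------------------------------------------------------------------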
    Mode,
    OpenMLDBLoadDataOperator,
    OpenMLDBSelectIntoOperator,
    OpenMLDBSQLOperator,
    OpenMLDBDeployOperator,
)
import xgboost_train_sample

# cp example_dags/train_sample.csv to /tmp first
PATH_TO_DATA_FILE = os.environ.get('OPENMLDB_PATH_TO_DATA_FILE', '/tmp/train_sample.csv')

ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID")
DAG_ID = "example_openmldb_complex"

with DAG(
    dag_id=DAG_ID,
    start_date=datetime(2021, 1, 1),
    default_args={'openmldb_conn_id': 'openmldb_conn_id'},
    max_active_runs=1,
    tags=['example'],
    catchup=False,
) as dag:
    database = "example_db"
    table = "example_table"

    create_database = OpenMLDBSQLOperator(
        task_id='create-db',
        db=database,
        mode=Mode.OFFSYNC,
        sql=f'create database if not exists {database}',
    )

    create_table = OpenMLDBSQLOperator(
        task_id='create-table',
        db=database,
        mode=Mode.OFFSYNC,
class TestCore(unittest.TestCase):
    default_scheduler_args = {"num_runs": 1}

    def setUp(self):
        self.dagbag = DagBag(dag_folder=DEV_NULL, include_examples=True, read_dags_from_db=False)
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG(TEST_DAG_ID, default_args=self.args)
        self.dag_bash = self.dagbag.dags['example_bash_operator']
        self.runme_0 = self.dag_bash.get_task('runme_0')
        self.run_after_loop = self.dag_bash.get_task('run_after_loop')
        self.run_this_last = self.dag_bash.get_task('run_this_last')

    def tearDown(self):
        session = Session()
        session.query(DagRun).filter(DagRun.dag_id == TEST_DAG_ID).delete(synchronize_session=False)
        session.query(TaskInstance).filter(TaskInstance.dag_id == TEST_DAG_ID).delete(
            synchronize_session=False
        )
        session.query(TaskFail).filter(TaskFail.dag_id == TEST_DAG_ID).delete(synchronize_session=False)
        session.commit()
        session.close()
        clear_db_dags()
        clear_db_runs()

    def test_check_operators(self):
        conn_id = "sqlite_default"
        captain_hook = BaseHook.get_hook(conn_id=conn_id)  # quite funny :D
        captain_hook.run("CREATE TABLE operator_test_table (a, b)")
        captain_hook.run("insert into operator_test_table values (1,2)")

        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op = CheckOperator(
            task_id='check',
            sql="select count(*) from operator_test_table",
            conn_id=conn_id,
            dag=self.dag,
        )
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        op = ValueCheckOperator(
            task_id='value_check',
            pass_value=95,
            tolerance=0.1,
            conn_id=conn_id,
            sql="SELECT 100",
            dag=self.dag,
        )
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        captain_hook.run("drop table operator_test_table")

    def test_clear_api(self):
        task = self.dag_bash.tasks[0]
        task.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, upstream=True, downstream=True)
        ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
        ti.are_dependents_done()

    def test_illegal_args(self):
        """
        Tests that Operators reject illegal arguments
        """
        msg = 'Invalid arguments were passed to BashOperator (task_id: test_illegal_args).'
        with conf_vars({('operators', 'allow_illegal_arguments'): 'True'}):
            with self.assertWarns(PendingDeprecationWarning) as warning:
                BashOperator(
                    task_id='test_illegal_args',
                    bash_command='echo success',
                    dag=self.dag,
                    illegal_argument_1234='hello?',
                )
            assert any(msg in str(w) for w in warning.warnings)

    def test_illegal_args_forbidden(self):
        """
        Tests that operators raise exceptions on illegal arguments when illegal
        arguments are not allowed.
""" with self.assertRaises(AirflowException) as ctx: BashOperator( task_id='test_illegal_args', bash_command='echo success', dag=self.dag, illegal_argument_1234='hello?', ) self.assertIn( 'Invalid arguments were passed to BashOperator (task_id: test_illegal_args).', str(ctx.exception), ) def test_bash_operator(self): op = BashOperator(task_id='test_bash_operator', bash_command="echo success", dag=self.dag) self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE) op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_bash_operator_multi_byte_output(self): op = BashOperator( task_id='test_multi_byte_bash_operator', bash_command="echo \u2600", dag=self.dag, output_encoding='utf-8', ) self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE) op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_bash_operator_kill(self): import psutil sleep_time = "100%d" % os.getpid() op = BashOperator( task_id='test_bash_operator_kill', execution_timeout=timedelta(seconds=1), bash_command="/bin/bash -c 'sleep %s'" % sleep_time, dag=self.dag, ) self.assertRaises(AirflowTaskTimeout, op.run, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) sleep(2) pid = -1 for proc in psutil.process_iter(): if proc.cmdline() == ['sleep', sleep_time]: pid = proc.pid if pid != -1: os.kill(pid, signal.SIGTERM) self.fail( "BashOperator's subprocess still running after stopping on timeout!" ) def test_on_failure_callback(self): # Annoying workaround for nonlocal not existing in python 2 data = {'called': False} def check_failure(context, test_case=self): data['called'] = True error = context.get('exception') test_case.assertIsInstance(error, AirflowException) op = BashOperator( task_id='check_on_failure_callback', bash_command="exit 1", dag=self.dag, on_failure_callback=check_failure, ) self.assertRaises(AirflowException, op.run, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) self.assertTrue(data['called']) def test_dryrun(self): op = BashOperator(task_id='test_dryrun', bash_command="echo success", dag=self.dag) op.dry_run() def test_sqlite(self): import airflow.providers.sqlite.operators.sqlite op = airflow.providers.sqlite.operators.sqlite.SqliteOperator( task_id='time_sqlite', sql="CREATE TABLE IF NOT EXISTS unitest (dummy VARCHAR(20))", dag=self.dag) self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE) op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_timeout(self): op = PythonOperator( task_id='test_timeout', execution_timeout=timedelta(seconds=1), python_callable=lambda: sleep(5), dag=self.dag, ) self.assertRaises(AirflowTaskTimeout, op.run, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_python_op(self): def test_py_op(templates_dict, ds, **kwargs): if not templates_dict['ds'] == ds: raise Exception("failure") op = PythonOperator(task_id='test_py_op', python_callable=test_py_op, templates_dict={'ds': "{{ ds }}"}, dag=self.dag) self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE) op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_complex_template(self): def verify_templated_field(context): self.assertEqual(context['ti'].task.some_templated_field['bar'][1], context['ds']) op = OperatorSubclass( task_id='test_complex_template', some_templated_field={ 'foo': '123', 
                'bar': ['baz', '{{ ds }}'],
            },
            dag=self.dag,
        )
        op.execute = verify_templated_field
        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_template_non_bool(self):
        """
        Test templates can handle objects with no sense of truthiness
        """

        class NonBoolObject:
            def __len__(self):  # pylint: disable=invalid-length-returned
                return NotImplemented

            def __bool__(self):  # pylint: disable=invalid-bool-returned, bad-option-value
                return NotImplemented

        op = OperatorSubclass(
            task_id='test_bad_template_obj',
            some_templated_field=NonBoolObject(),
            dag=self.dag,
        )
        op.resolve_template_files()

    def test_task_get_template(self):
        TI = TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        ti.dag = self.dag_bash
        self.dag_bash.create_dagrun(
            run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE
        )
        ti.run(ignore_ti_state=True)
        context = ti.get_template_context()

        # DEFAULT_DATE is 2015-01-01
        self.assertEqual(context['ds'], '2015-01-01')
        self.assertEqual(context['ds_nodash'], '20150101')

        # next_ds is 2015-01-02 as the dag interval is daily
        self.assertEqual(context['next_ds'], '2015-01-02')
        self.assertEqual(context['next_ds_nodash'], '20150102')

        # prev_ds is 2014-12-31 as the dag interval is daily
        self.assertEqual(context['prev_ds'], '2014-12-31')
        self.assertEqual(context['prev_ds_nodash'], '20141231')

        self.assertEqual(context['ts'], '2015-01-01T00:00:00+00:00')
        self.assertEqual(context['ts_nodash'], '20150101T000000')
        self.assertEqual(context['ts_nodash_with_tz'], '20150101T000000+0000')

        self.assertEqual(context['yesterday_ds'], '2014-12-31')
        self.assertEqual(context['yesterday_ds_nodash'], '20141231')
        self.assertEqual(context['tomorrow_ds'], '2015-01-02')
        self.assertEqual(context['tomorrow_ds_nodash'], '20150102')

    def test_local_task_job(self):
        TI = TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        job = LocalTaskJob(task_instance=ti, ignore_ti_state=True)
        job.run()

    def test_raw_job(self):
        TI = TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        ti.dag = self.dag_bash
        self.dag_bash.create_dagrun(
            run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE
        )
        ti.run(ignore_ti_state=True)

    def test_round_time(self):
        rt1 = round_time(datetime(2015, 1, 1, 6), timedelta(days=1))
        self.assertEqual(datetime(2015, 1, 1, 0, 0), rt1)

        rt2 = round_time(datetime(2015, 1, 2), relativedelta(months=1))
        self.assertEqual(datetime(2015, 1, 1, 0, 0), rt2)

        rt3 = round_time(datetime(2015, 9, 16, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 16, 0, 0), rt3)

        rt4 = round_time(datetime(2015, 9, 15, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 15, 0, 0), rt4)

        rt5 = round_time(datetime(2015, 9, 14, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 14, 0, 0), rt5)

        rt6 = round_time(datetime(2015, 9, 13, 0, 0), timedelta(1), datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 14, 0, 0), rt6)

    def test_infer_time_unit(self):
        self.assertEqual('minutes', infer_time_unit([130, 5400, 10]))
        self.assertEqual('seconds', infer_time_unit([110, 50, 10, 100]))
        self.assertEqual('hours', infer_time_unit([100000, 50000, 10000, 20000]))
        self.assertEqual('days', infer_time_unit([200000, 100000]))

    def test_scale_time_units(self):
        # use assert_array_almost_equal from numpy.testing since we are comparing
        # floating point arrays
        arr1 = scale_time_units([130, 5400, 10], 'minutes')
        assert_array_almost_equal(arr1, [2.167, 90.0, 0.167], decimal=3)

        arr2 = scale_time_units([110, 50, 10, 100], 'seconds')
        assert_array_almost_equal(arr2, [110.0, 50.0, 10.0, 100.0], decimal=3)

        arr3 = scale_time_units([100000, 50000, 10000, 20000], 'hours')
        assert_array_almost_equal(arr3, [27.778, 13.889, 2.778, 5.556], decimal=3)

        arr4 = scale_time_units([200000, 100000], 'days')
        assert_array_almost_equal(arr4, [2.315, 1.157], decimal=3)

    def test_bad_trigger_rule(self):
        with self.assertRaises(AirflowException):
            DummyOperator(task_id='test_bad_trigger', trigger_rule="non_existent", dag=self.dag)

    def test_terminate_task(self):
        """If a task instance's db state gets deleted, it should fail"""
        from airflow.executors.sequential_executor import SequentialExecutor

        TI = TaskInstance
        dag = self.dagbag.dags.get('test_utils')
        task = dag.task_dict.get('sleeps_forever')

        ti = TI(task=task, execution_date=DEFAULT_DATE)
        job = LocalTaskJob(task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())

        # Running task instance asynchronously
        proc = multiprocessing.Process(target=job.run)
        proc.start()
        sleep(5)
        settings.engine.dispose()
        session = settings.Session()
        ti.refresh_from_db(session=session)
        # making sure it's actually running
        self.assertEqual(State.RUNNING, ti.state)
        ti = (
            session.query(TI)
            .filter_by(dag_id=task.dag_id, task_id=task.task_id, execution_date=DEFAULT_DATE)
            .one()
        )

        # deleting the instance should result in a failure
        session.delete(ti)
        session.commit()
        # waiting for the async task to finish
        proc.join()

        # making sure that the task ended up as failed
        ti.refresh_from_db(session=session)
        self.assertEqual(State.FAILED, ti.state)
        session.close()

    def test_task_fail_duration(self):
        """If a task fails, the duration should be recorded in TaskFail"""
        op1 = BashOperator(task_id='pass_sleepy', bash_command='sleep 3', dag=self.dag)
        op2 = BashOperator(
            task_id='fail_sleepy',
            bash_command='sleep 5',
            execution_timeout=timedelta(seconds=3),
            retry_delay=timedelta(seconds=0),
            dag=self.dag,
        )
        session = settings.Session()
        try:
            op1.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
        except Exception:  # pylint: disable=broad-except
            pass
        try:
            op2.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
        except Exception:  # pylint: disable=broad-except
            pass
        op1_fails = (
            session.query(TaskFail)
            .filter_by(task_id='pass_sleepy', dag_id=self.dag.dag_id, execution_date=DEFAULT_DATE)
            .all()
        )
        op2_fails = (
            session.query(TaskFail)
            .filter_by(task_id='fail_sleepy', dag_id=self.dag.dag_id, execution_date=DEFAULT_DATE)
            .all()
        )

        self.assertEqual(0, len(op1_fails))
        self.assertEqual(1, len(op2_fails))
        self.assertGreaterEqual(sum([f.duration for f in op2_fails]), 3)

    def test_externally_triggered_dagrun(self):
        TI = TaskInstance

        # Create the dagrun between two "scheduled" execution dates of the DAG
        execution_date = DEFAULT_DATE + timedelta(days=2)
        execution_ds = execution_date.strftime('%Y-%m-%d')
        execution_ds_nodash = execution_ds.replace('-', '')

        dag = DAG(
            TEST_DAG_ID,
            default_args=self.args,
            schedule_interval=timedelta(weeks=1),
            start_date=DEFAULT_DATE,
        )
        task = DummyOperator(task_id='test_externally_triggered_dag_context', dag=dag)
        dag.create_dagrun(
            run_type=DagRunType.SCHEDULED,
            execution_date=execution_date,
            state=State.RUNNING,
            external_trigger=True,
        )
        task.run(start_date=execution_date, end_date=execution_date)

        ti = TI(task=task, execution_date=execution_date)
        context = ti.get_template_context()

        # next_ds/prev_ds should be the execution date for manually triggered runs
        self.assertEqual(context['next_ds'], execution_ds)
        self.assertEqual(context['next_ds_nodash'], execution_ds_nodash)
        self.assertEqual(context['prev_ds'], execution_ds)
        self.assertEqual(context['prev_ds_nodash'], execution_ds_nodash)
def setUp(self):
    self.dag = DAG(TEST_DAG_ID, default_args=self.TRAINING_DEFAULT_ARGS)
def sync_to_db(self, session: Optional[Session] = None):
    """Save attributes about list of DAG to the DB."""
    # To avoid circular import - airflow.models.dagbag -> airflow.models.dag -> airflow.models.dagbag
    from airflow.models.dag import DAG
    from airflow.models.serialized_dag import SerializedDagModel

    def _serialize_dag_capturing_errors(dag, session):
        """
        Try to serialize the dag to the DB, but make a note of any errors.

        We can't place them directly in import_errors, as this may be retried,
        and work the next time.
        """
        if dag.is_subdag:
            return []
        try:
            # We can't use bulk_write_to_db as we want to capture each error individually
            dag_was_updated = SerializedDagModel.write_dag(
                dag,
                min_update_interval=settings.MIN_SERIALIZED_DAG_UPDATE_INTERVAL,
                session=session,
            )
            if dag_was_updated:
                self.log.debug("Syncing DAG permissions: %s to the DB", dag.dag_id)
                from airflow.www.security import ApplessAirflowSecurityManager

                security_manager = ApplessAirflowSecurityManager(session=session)
                security_manager.sync_perm_for_dag(dag.dag_id, dag.access_control)
            return []
        except OperationalError:
            raise
        except Exception:  # pylint: disable=broad-except
            return [(dag.fileloc, traceback.format_exc(limit=-self.dagbag_import_error_traceback_depth))]

    # Retry 'DAG.bulk_write_to_db' & 'SerializedDagModel.bulk_sync_to_db' in case
    # of any Operational Errors
    # In case of failures, provide_session handles rollback
    for attempt in run_with_db_retries(logger=self.log):
        with attempt:
            serialize_errors = []
            self.log.debug(
                "Running dagbag.sync_to_db with retries. Try %d of %d",
                attempt.retry_state.attempt_number,
                MAX_DB_RETRIES,
            )
            self.log.debug("Calling the DAG.bulk_sync_to_db method")
            try:
                # Write Serialized DAGs to DB, capturing errors
                for dag in self.dags.values():
                    serialize_errors.extend(_serialize_dag_capturing_errors(dag, session))
                DAG.bulk_write_to_db(self.dags.values(), session=session)
            except OperationalError:
                session.rollback()
                raise
            # Only now we are "complete" do we update import_errors - don't want to record errors from
            # previous failed attempts
            self.import_errors.update(dict(serialize_errors))
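# ---------------------------------------------------------------------------
# Context note (not part of the original source): `run_with_db_retries` is not
# defined in this excerpt. Judging from the inline `tenacity.Retrying(...)`
# call in the earlier `sync_to_db` variant above, a plausible sketch of such a
# helper is below; the default retry count and the helper's exact home module
# are assumptions.
import logging

import tenacity
from sqlalchemy.exc import OperationalError

MAX_DB_RETRIES = 3  # assumed default; Airflow reads the real value from its settings


def run_with_db_retries(logger: logging.Logger, max_retries: int = MAX_DB_RETRIES) -> tenacity.Retrying:
    """Return a tenacity.Retrying helper that retries DB work on OperationalError."""
    return tenacity.Retrying(
        retry=tenacity.retry_if_exception_type(exception_types=OperationalError),
        wait=tenacity.wait_random_exponential(multiplier=0.5, max=5),
        stop=tenacity.stop_after_attempt(max_retries),
        before_sleep=tenacity.before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
# ---------------------------------------------------------------------------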
def test_mark_success_on_success_callback(self):
    """
    Test that ensures that when a task is marked success in the UI,
    on_success_callback gets executed.
    """
    # Use a shared memory value so we can properly track the value change
    # even if it's been updated across processes.
    success_callback_called = Value('i', 0)
    task_terminated_externally = Value('i', 1)
    shared_mem_lock = Lock()

    def success_callback(context):
        with shared_mem_lock:
            success_callback_called.value += 1
        assert context['dag_run'].dag_id == 'test_mark_success'

    dag = DAG(dag_id='test_mark_success', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    def task_function(ti):  # pylint: disable=unused-argument
        time.sleep(60)
        # This should not happen -- the state change should be noticed and the task should get killed
        with shared_mem_lock:
            task_terminated_externally.value = 0

    task = PythonOperator(
        task_id='test_state_succeeded1',
        python_callable=task_function,
        on_success_callback=success_callback,
        dag=dag,
    )

    session = settings.Session()

    dag.clear()
    dag.create_dagrun(
        run_id="test",
        state=State.RUNNING,
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        session=session,
    )
    ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
    ti.refresh_from_db()

    job1 = LocalTaskJob(task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())
    job1.task_runner = StandardTaskRunner(job1)

    settings.engine.dispose()
    process = multiprocessing.Process(target=job1.run)
    process.start()

    for _ in range(0, 25):
        ti.refresh_from_db()
        if ti.state == State.RUNNING:
            break
        time.sleep(0.2)
    assert ti.state == State.RUNNING
    ti.state = State.SUCCESS
    session.merge(ti)
    session.commit()

    process.join(timeout=10)
    assert success_callback_called.value == 1
    assert task_terminated_externally.value == 1
    assert not process.is_alive()
def task_3(value: str) -> None:
    """Dummy Task3"""
    print(f'[ Task3 {value} ]')


@task
def task_end() -> None:
    """Dummy Task which is the last Task of the Dag"""
    print('[ Task_End ]')


# Creating TaskGroups
@task_group
def task_group_function(value: int) -> None:
    """TaskGroup for grouping related Tasks"""
    return task_3(task_2(task_1(value)))


# Executing Tasks and TaskGroups
with DAG(
    dag_id="example_task_group_decorator",
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=["example"],
) as dag:
    start_task = task_start()
    end_task = task_end()
    for i in range(5):
        current_task_group = task_group_function(i)
        start_task >> current_task_group >> end_task

# [END howto_task_group_decorator]
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from airflow.models.dag import DAG
from airflow.utils import timezone
from airflow.ti_deps.met_handlers.aiflow_met_handler import AIFlowMetHandler
from airflow.operators.dummy_operator import DummyOperator
from airflow.models.event import Event
from airflow.operators.send_event_operator import SendEventOperator

dag = DAG(dag_id='workflow_1', start_date=timezone.utcnow(), schedule_interval="@once")
op_0 = DummyOperator(task_id='0_job', dag=dag)
op_1 = DummyOperator(task_id='1_job', dag=dag)
op_2 = SendEventOperator(
    task_id='2_job',
    dag=dag,
    uri='localhost:50051',
    event=Event(key='key_1', value='value_1', event_type='UNDEFINED'),
)
op_3 = SendEventOperator(
    task_id='3_job',
    dag=dag,
    uri='localhost:50051',
    event=Event(key='key_2', value='value_2', event_type='UNDEFINED'),
)
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    self.mock_context = MagicMock()
CLUSTER_NAME = environ.get('EKS_CLUSTER_NAME', 'eks-demo')
NODEGROUP_NAME = f'{CLUSTER_NAME}-nodegroup'

ROLE_ARN = environ.get('EKS_DEMO_ROLE_ARN', 'arn:aws:iam::123456789012:role/role_name')
SUBNETS = environ.get('EKS_DEMO_SUBNETS', 'subnet-12345ab subnet-67890cd').split(' ')
VPC_CONFIG = {
    'subnetIds': SUBNETS,
    'endpointPublicAccess': True,
    'endpointPrivateAccess': False,
}

with DAG(
    dag_id='example_eks_using_defaults_dag',
    default_args={'cluster_name': CLUSTER_NAME},
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    max_active_runs=1,
    tags=['example'],
) as dag:

    # [START howto_operator_eks_create_cluster_with_nodegroup]
    # Create an Amazon EKS cluster control plane and an EKS nodegroup compute platform in one step.
    create_cluster_and_nodegroup = EksCreateClusterOperator(
        task_id='create_eks_cluster_and_nodegroup',
        nodegroup_name=NODEGROUP_NAME,
        cluster_role_arn=ROLE_ARN,
        nodegroup_role_arn=ROLE_ARN,
        # Opting to use the same ARN for the cluster and the nodegroup here,
        # but a different ARN could be configured and passed if desired.
        resources_vpc_config=VPC_CONFIG,
class TestLatestOnlyOperator(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.dag = DAG(
            'test_dag',
            default_args={'owner': 'airflow', 'start_date': DEFAULT_DATE},
            schedule_interval=INTERVAL,
        )
        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TaskInstance).delete()
        freezer = freeze_time(FROZEN_NOW)
        freezer.start()
        self.addCleanup(freezer.stop)

    def test_run(self):
        task = LatestOnlyOperator(task_id='latest', dag=self.dag)
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    def test_skipping_non_latest(self):
        latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
        downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
        downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)
        downstream_task3 = DummyOperator(
            task_id='downstream_3',
            trigger_rule=TriggerRule.NONE_FAILED,
            dag=self.dag,
        )

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)
        downstream_task3.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="scheduled__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_latest_state,
        )

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'skipped',
                timezone.datetime(2016, 1, 1, 12): 'skipped',
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_downstream_state,
        )

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): None,
                timezone.datetime(2016, 1, 1, 12): None,
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_downstream_state,
        )

        downstream_instances = get_task_instances('downstream_3')
        exec_date_to_downstream_state = {ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_downstream_state,
        )

    def test_not_skipping_external(self):
        latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
        downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
        downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_latest_state,
        )

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_downstream_state,
        )

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success',
            },
            exec_date_to_downstream_state,
        )
from airflow.models.dag import DAG
from datetime import datetime
from airflow.operators.bash_operator import BashOperator

DAG_NAME = "infinite_dag"

default_args = {
    "owner": "airflow",
    "start_date": datetime(2020, 1, 1),
}

with DAG(DAG_NAME, schedule_interval="*/10 * * * *", default_args=default_args) as dag:
    echo_success = BashOperator(task_id="echo_success", bash_command="echo success")
def test_external_task_sensor_fn_multiple_execution_dates(self):
    bash_command_code = """
{% set s=execution_date.time().second %}
echo "second is {{ s }}"
if [[ $(( {{ s }} % 60 )) == 1 ]]
    then
        exit 1
fi
exit 0
"""
    dag_external_id = TEST_DAG_ID + '_external'
    dag_external = DAG(
        dag_external_id,
        default_args=self.args,
        schedule_interval=timedelta(seconds=1),
    )
    task_external_with_failure = BashOperator(
        task_id="task_external_with_failure",
        bash_command=bash_command_code,
        retries=0,
        dag=dag_external,
    )
    task_external_without_failure = DummyOperator(
        task_id="task_external_without_failure",
        retries=0,
        dag=dag_external,
    )

    task_external_without_failure.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE + timedelta(seconds=1),
        ignore_ti_state=True,
    )

    session = settings.Session()
    TI = TaskInstance
    try:
        task_external_with_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + timedelta(seconds=1),
            ignore_ti_state=True,
        )
        # The task_external_with_failure task is expected to fail
        # once per minute (the run on the first second of
        # each minute).
    except Exception as e:  # pylint: disable=broad-except
        failed_tis = (
            session.query(TI)
            .filter(
                TI.dag_id == dag_external_id,
                TI.state == State.FAILED,
                TI.execution_date == DEFAULT_DATE + timedelta(seconds=1),
            )
            .all()
        )
        if len(failed_tis) == 1 and failed_tis[0].task_id == 'task_external_with_failure':
            pass
        else:
            raise e

    dag_id = TEST_DAG_ID
    dag = DAG(dag_id, default_args=self.args, schedule_interval=timedelta(minutes=1))
    task_without_failure = ExternalTaskSensor(
        task_id='task_without_failure',
        external_dag_id=dag_external_id,
        external_task_id='task_external_without_failure',
        execution_date_fn=lambda dt: [dt + timedelta(seconds=i) for i in range(2)],
        allowed_states=['success'],
        retries=0,
        timeout=1,
        poke_interval=1,
        dag=dag,
    )
    task_with_failure = ExternalTaskSensor(
        task_id='task_with_failure',
        external_dag_id=dag_external_id,
        external_task_id='task_external_with_failure',
        execution_date_fn=lambda dt: [dt + timedelta(seconds=i) for i in range(2)],
        allowed_states=['success'],
        retries=0,
        timeout=1,
        poke_interval=1,
        dag=dag,
    )

    task_without_failure.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    with self.assertRaises(AirflowSensorTimeout):
        task_with_failure.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
This dag only runs some simple tasks to test Airflow's task execution.
"""
from airflow.models.dag import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow import utils

DAG_ID = 'test_dag'
DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': utils.dates.days_ago(2),
}

dag = DAG(
    **{
        'dag_id': DAG_ID,
        'schedule_interval': '*/10 * * * *',  # https://crontab.guru/#*/10_*_*_*_*
        'default_args': DEFAULT_ARGS,
    }
)

with dag:
    run_this_1 = DummyOperator(task_id='run_this_1')
    run_this_2 = DummyOperator(task_id='run_this_2')
    run_this_2.set_upstream(run_this_1)
    run_this_3 = DummyOperator(task_id='run_this_3')
    run_this_3.set_upstream(run_this_2)
    BashOperator(task_id='say_hi', bash_command='echo hi')
def setUp(self):
    self.dagbag = DagBag(dag_folder=DEV_NULL, include_examples=True)
    self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG(TEST_DAG_ID, default_args=self.args)
NODEGROUP_SUFFIX = '-nodegroup'
NODEGROUP_NAME = CLUSTER_NAME + NODEGROUP_SUFFIX

ROLE_ARN = environ.get('EKS_DEMO_ROLE_ARN', 'arn:aws:iam::123456789012:role/role_name')
SUBNETS = environ.get('EKS_DEMO_SUBNETS', 'subnet-12345ab subnet-67890cd').split(' ')
VPC_CONFIG = {
    'subnetIds': SUBNETS,
    'endpointPublicAccess': True,
    'endpointPrivateAccess': False,
}

with DAG(
    dag_id='example_eks_with_nodegroups_dag',
    default_args={'cluster_name': CLUSTER_NAME},
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    max_active_runs=1,
    tags=['example'],
) as dag:

    # [START howto_operator_eks_create_cluster]
    # Create an Amazon EKS Cluster control plane without attaching a compute service.
    create_cluster = EKSCreateClusterOperator(
        task_id='create_eks_cluster',
        cluster_role_arn=ROLE_ARN,
        resources_vpc_config=VPC_CONFIG,
        compute=None,
    )
    # [END howto_operator_eks_create_cluster]

    await_create_cluster = EKSClusterStateSensor(
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from datetime import datetime

from airflow.models.dag import DAG
from airflow.providers.influxdb.operators.influxdb import InfluxDBOperator

dag = DAG(
    'example_influxdb_operator',
    start_date=datetime(2021, 1, 1),
    tags=['example'],
    catchup=False,
)

# [START howto_operator_influxdb]
query_influxdb_task = InfluxDBOperator(
    influxdb_conn_id='influxdb_conn_id',
    task_id='query_influxdb',
    sql='from(bucket:"test-influx") |> range(start: -10m, stop: {{ds}})',
    dag=dag,
)
# [END howto_operator_influxdb]
class TestCore(unittest.TestCase):
    default_scheduler_args = {"num_runs": 1}

    def setUp(self):
        self.dagbag = DagBag(dag_folder=DEV_NULL, include_examples=True, read_dags_from_db=False)
        self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG(TEST_DAG_ID, default_args=self.args)
        self.dag_bash = self.dagbag.dags['example_bash_operator']
        self.runme_0 = self.dag_bash.get_task('runme_0')
        self.run_after_loop = self.dag_bash.get_task('run_after_loop')
        self.run_this_last = self.dag_bash.get_task('run_this_last')

    def tearDown(self):
        session = Session()
        session.query(DagRun).filter(DagRun.dag_id == TEST_DAG_ID).delete(synchronize_session=False)
        session.query(TaskInstance).filter(TaskInstance.dag_id == TEST_DAG_ID).delete(
            synchronize_session=False
        )
        session.query(TaskFail).filter(TaskFail.dag_id == TEST_DAG_ID).delete(synchronize_session=False)
        session.commit()
        session.close()
        clear_db_dags()
        clear_db_runs()

    def test_check_operators(self):
        conn_id = "sqlite_default"
        captain_hook = BaseHook.get_hook(conn_id=conn_id)  # quite funny :D
        captain_hook.run("CREATE TABLE operator_test_table (a, b)")
        captain_hook.run("insert into operator_test_table values (1,2)")

        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op = CheckOperator(
            task_id='check',
            sql="select count(*) from operator_test_table",
            conn_id=conn_id,
            dag=self.dag,
        )
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        op = ValueCheckOperator(
            task_id='value_check',
            pass_value=95,
            tolerance=0.1,
            conn_id=conn_id,
            sql="SELECT 100",
            dag=self.dag,
        )
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        captain_hook.run("drop table operator_test_table")

    def test_clear_api(self):
        task = self.dag_bash.tasks[0]
        task.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, upstream=True, downstream=True)
        ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
        ti.are_dependents_done()

    def test_illegal_args(self):
        """
        Tests that Operators reject illegal arguments
        """
        msg = 'Invalid arguments were passed to BashOperator (task_id: test_illegal_args).'
        with conf_vars({('operators', 'allow_illegal_arguments'): 'True'}):
            with pytest.warns(PendingDeprecationWarning) as warnings:
                BashOperator(
                    task_id='test_illegal_args',
                    bash_command='echo success',
                    dag=self.dag,
                    illegal_argument_1234='hello?',
                )
            assert any(msg in str(w) for w in warnings)

    def test_illegal_args_forbidden(self):
        """
        Tests that operators raise exceptions on illegal arguments when illegal
        arguments are not allowed.
        """
        with pytest.raises(AirflowException) as ctx:
            BashOperator(
                task_id='test_illegal_args',
                bash_command='echo success',
                dag=self.dag,
                illegal_argument_1234='hello?',
            )
        assert 'Invalid arguments were passed to BashOperator (task_id: test_illegal_args).' in str(
            ctx.value
        )
    def test_bash_operator(self):
        op = BashOperator(task_id='test_bash_operator', bash_command="echo success", dag=self.dag)
        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_bash_operator_multi_byte_output(self):
        op = BashOperator(
            task_id='test_multi_byte_bash_operator',
            bash_command="echo \u2600",
            dag=self.dag,
            output_encoding='utf-8',
        )
        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_bash_operator_kill(self):
        import psutil

        sleep_time = "100%d" % os.getpid()
        op = BashOperator(
            task_id='test_bash_operator_kill',
            execution_timeout=timedelta(seconds=1),
            bash_command=f"/bin/bash -c 'sleep {sleep_time}'",
            dag=self.dag,
        )
        with pytest.raises(AirflowTaskTimeout):
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        sleep(2)
        pid = -1
        for proc in psutil.process_iter():
            if proc.cmdline() == ['sleep', sleep_time]:
                pid = proc.pid
        if pid != -1:
            os.kill(pid, signal.SIGTERM)
            self.fail("BashOperator's subprocess still running after stopping on timeout!")

    def test_on_failure_callback(self):
        mock_failure_callback = MagicMock()

        op = BashOperator(
            task_id='check_on_failure_callback',
            bash_command="exit 1",
            dag=self.dag,
            on_failure_callback=mock_failure_callback,
        )
        with pytest.raises(AirflowException):
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
        mock_failure_callback.assert_called_once()

    def test_dryrun(self):
        op = BashOperator(task_id='test_dryrun', bash_command="echo success", dag=self.dag)
        op.dry_run()

    def test_sqlite(self):
        import airflow.providers.sqlite.operators.sqlite

        op = airflow.providers.sqlite.operators.sqlite.SqliteOperator(
            task_id='time_sqlite',
            sql="CREATE TABLE IF NOT EXISTS unitest (dummy VARCHAR(20))",
            dag=self.dag,
        )
        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_timeout(self):
        op = PythonOperator(
            task_id='test_timeout',
            execution_timeout=timedelta(seconds=1),
            python_callable=lambda: sleep(5),
            dag=self.dag,
        )
        with pytest.raises(AirflowTaskTimeout):
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_python_op(self):
        def test_py_op(templates_dict, ds, **kwargs):
            if not templates_dict['ds'] == ds:
                raise Exception("failure")

        op = PythonOperator(
            task_id='test_py_op',
            python_callable=test_py_op,
            templates_dict={'ds': "{{ ds }}"},
            dag=self.dag,
        )
        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_complex_template(self):
        def verify_templated_field(context):
            assert context['ti'].task.some_templated_field['bar'][1] == context['ds']

        op = OperatorSubclass(
            task_id='test_complex_template',
            some_templated_field={
                'foo': '123',
                'bar': ['baz', '{{ ds }}'],
            },
            dag=self.dag,
        )
        op.execute = verify_templated_field
        self.dag.create_dagrun(run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_template_non_bool(self):
        """
        Test templates can handle objects with no sense of truthiness
        """

        class NonBoolObject:
            def __len__(self):  # pylint: disable=invalid-length-returned
                return NotImplemented
            def __bool__(self):  # pylint: disable=invalid-bool-returned, bad-option-value
                return NotImplemented

        op = OperatorSubclass(
            task_id='test_bad_template_obj',
            some_templated_field=NonBoolObject(),
            dag=self.dag,
        )
        op.resolve_template_files()

    def test_task_get_template(self):
        TI = TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        ti.dag = self.dag_bash
        self.dag_bash.create_dagrun(
            run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE
        )
        ti.run(ignore_ti_state=True)
        context = ti.get_template_context()

        # DEFAULT_DATE is 2015-01-01
        assert context['ds'] == '2015-01-01'
        assert context['ds_nodash'] == '20150101'

        # next_ds is 2015-01-02 as the dag interval is daily
        assert context['next_ds'] == '2015-01-02'
        assert context['next_ds_nodash'] == '20150102'

        # prev_ds is 2014-12-31 as the dag interval is daily
        assert context['prev_ds'] == '2014-12-31'
        assert context['prev_ds_nodash'] == '20141231'

        assert context['ts'] == '2015-01-01T00:00:00+00:00'
        assert context['ts_nodash'] == '20150101T000000'
        assert context['ts_nodash_with_tz'] == '20150101T000000+0000'

        assert context['yesterday_ds'] == '2014-12-31'
        assert context['yesterday_ds_nodash'] == '20141231'
        assert context['tomorrow_ds'] == '2015-01-02'
        assert context['tomorrow_ds_nodash'] == '20150102'

    def test_local_task_job(self):
        TI = TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        job = LocalTaskJob(task_instance=ti, ignore_ti_state=True)
        job.run()

    def test_raw_job(self):
        TI = TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        ti.dag = self.dag_bash
        self.dag_bash.create_dagrun(
            run_type=DagRunType.MANUAL, state=State.RUNNING, execution_date=DEFAULT_DATE
        )
        ti.run(ignore_ti_state=True)

    def test_bad_trigger_rule(self):
        with pytest.raises(AirflowException):
            DummyOperator(task_id='test_bad_trigger', trigger_rule="non_existent", dag=self.dag)

    def test_terminate_task(self):
        """If a task instance's db state gets deleted, it should fail"""
        from airflow.executors.sequential_executor import SequentialExecutor

        TI = TaskInstance
        dag = self.dagbag.dags.get('test_utils')
        task = dag.task_dict.get('sleeps_forever')

        ti = TI(task=task, execution_date=DEFAULT_DATE)
        job = LocalTaskJob(task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())

        # Running task instance asynchronously
        proc = multiprocessing.Process(target=job.run)
        proc.start()
        sleep(5)
        settings.engine.dispose()
        session = settings.Session()
        ti.refresh_from_db(session=session)
        # making sure it's actually running
        assert State.RUNNING == ti.state
        ti = (
            session.query(TI)
            .filter_by(dag_id=task.dag_id, task_id=task.task_id, execution_date=DEFAULT_DATE)
            .one()
        )

        # deleting the instance should result in a failure
        session.delete(ti)
        session.commit()
        # waiting for the async task to finish
        proc.join()

        # making sure that the task ended up as failed
        ti.refresh_from_db(session=session)
        assert State.FAILED == ti.state
        session.close()

    def test_task_fail_duration(self):
        """If a task fails, the duration should be recorded in TaskFail"""
        op1 = BashOperator(task_id='pass_sleepy', bash_command='sleep 3', dag=self.dag)
        op2 = BashOperator(
            task_id='fail_sleepy',
            bash_command='sleep 5',
            execution_timeout=timedelta(seconds=3),
            retry_delay=timedelta(seconds=0),
            dag=self.dag,
        )
        session = settings.Session()
        try:
            op1.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
        except Exception:  # pylint: disable=broad-except
            pass
        try:
            op2.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
        except Exception:  # pylint: disable=broad-except
            pass
        op1_fails = (
            session.query(TaskFail)
            .filter_by(task_id='pass_sleepy', dag_id=self.dag.dag_id, execution_date=DEFAULT_DATE)
            .all()
        )
        op2_fails = (
            session.query(TaskFail)
            .filter_by(task_id='fail_sleepy', dag_id=self.dag.dag_id, execution_date=DEFAULT_DATE)
            .all()
        )

        assert 0 == len(op1_fails)
        assert 1 == len(op2_fails)
        assert sum([f.duration for f in op2_fails]) >= 3

    def test_externally_triggered_dagrun(self):
        TI = TaskInstance

        # Create the dagrun between two "scheduled" execution dates of the DAG
        execution_date = DEFAULT_DATE + timedelta(days=2)
        execution_ds = execution_date.strftime('%Y-%m-%d')
        execution_ds_nodash = execution_ds.replace('-', '')

        dag = DAG(
            TEST_DAG_ID,
            default_args=self.args,
            schedule_interval=timedelta(weeks=1),
            start_date=DEFAULT_DATE,
        )
        task = DummyOperator(task_id='test_externally_triggered_dag_context', dag=dag)
        dag.create_dagrun(
            run_type=DagRunType.SCHEDULED,
            execution_date=execution_date,
            state=State.RUNNING,
            external_trigger=True,
        )
        task.run(start_date=execution_date, end_date=execution_date)

        ti = TI(task=task, execution_date=execution_date)
        context = ti.get_template_context()

        # next_ds/prev_ds should be the execution date for manually triggered runs
        assert context['next_ds'] == execution_ds
        assert context['next_ds_nodash'] == execution_ds_nodash
        assert context['prev_ds'] == execution_ds
        assert context['prev_ds_nodash'] == execution_ds_nodash

    def test_dag_params_and_task_params(self):
        # This test case guards how params of DAG and Operator work together.
        # - If any key exists in either DAG's or Operator's params,
        #   it is guaranteed to be available eventually.
        # - If any key exists in both DAG's params and Operator's params,
        #   the latter has precedence.
        TI = TaskInstance

        dag = DAG(
            TEST_DAG_ID,
            default_args=self.args,
            schedule_interval=timedelta(weeks=1),
            start_date=DEFAULT_DATE,
            params={'key_1': 'value_1', 'key_2': 'value_2_old'},
        )
        task1 = DummyOperator(
            task_id='task1',
            dag=dag,
            params={'key_2': 'value_2_new', 'key_3': 'value_3'},
        )
        task2 = DummyOperator(task_id='task2', dag=dag)
        dag.create_dagrun(
            run_type=DagRunType.SCHEDULED,
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )
        task1.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        task2.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        ti2 = TI(task=task2, execution_date=DEFAULT_DATE)
        context1 = ti1.get_template_context()
        context2 = ti2.get_template_context()

        assert context1['params'] == {'key_1': 'value_1', 'key_2': 'value_2_new', 'key_3': 'value_3'}
        assert context2['params'] == {'key_1': 'value_1', 'key_2': 'value_2_old'}
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG('test_dag_id', default_args=args)
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}
# [END default_args]

# [START instantiate_dag]
dag = DAG(
    'tutorial',
    default_args=default_args,
    description='A simple tutorial DAG',
    schedule_interval=timedelta(days=1),
    tags=['example'],
)
# [END instantiate_dag]

# t1, t2 and t3 are examples of tasks created by instantiating operators
# [START basic_task]
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)

t2 = BashOperator(
    task_id='sleep',
def setUp(self):
    args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG(TEST_DAG_ID, default_args=args)
def test_render_task_group(self):
    with DAG(dag_id="example_task_group", start_date=START_DATE) as dag:
        start = DummyOperator(task_id="start")

        with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1:
            task_1 = DummyOperator(task_id="task_1")
            task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
            task_3 = DummyOperator(task_id="task_3")

            task_1 >> [task_2, task_3]

        with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2:
            task_1 = DummyOperator(task_id="task_1")

            with TaskGroup("inner_section_2", tooltip="Tasks for inner_section2"):
                task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
                task_3 = DummyOperator(task_id="task_3")
                task_4 = DummyOperator(task_id="task_4")

                [task_2, task_3] >> task_4

        end = DummyOperator(task_id='end')

        start >> section_1 >> section_2 >> end

    dot = dot_renderer.render_dag(dag)

    assert dot.source == '\n'.join(
        [
            'digraph example_task_group {',
            '\tgraph [label=example_task_group labelloc=t rankdir=LR]',
            '\tend [color="#000000" fillcolor="#e8f7e4" label=end shape=rectangle '
            'style="filled,rounded"]',
            '\tsubgraph cluster_section_1 {',
            '\t\tcolor="#000000" fillcolor="#6495ed7f" label=section_1 shape=rectangle style=filled',
            '\t\t"section_1.upstream_join_id" [color="#000000" fillcolor=CornflowerBlue height=0.2 '
            'label="" shape=circle style="filled,rounded" width=0.2]',
            '\t\t"section_1.downstream_join_id" [color="#000000" fillcolor=CornflowerBlue height=0.2 '
            'label="" shape=circle style="filled,rounded" width=0.2]',
            '\t\t"section_1.task_1" [color="#000000" fillcolor="#e8f7e4" label=task_1 shape=rectangle '
            'style="filled,rounded"]',
            '\t\t"section_1.task_2" [color="#000000" fillcolor="#f0ede4" label=task_2 shape=rectangle '
            'style="filled,rounded"]',
            '\t\t"section_1.task_3" [color="#000000" fillcolor="#e8f7e4" label=task_3 shape=rectangle '
            'style="filled,rounded"]',
            '\t}',
            '\tsubgraph cluster_section_2 {',
            '\t\tcolor="#000000" fillcolor="#6495ed7f" label=section_2 shape=rectangle style=filled',
            '\t\t"section_2.upstream_join_id" [color="#000000" fillcolor=CornflowerBlue height=0.2 '
            'label="" shape=circle style="filled,rounded" width=0.2]',
            '\t\t"section_2.downstream_join_id" [color="#000000" fillcolor=CornflowerBlue height=0.2 '
            'label="" shape=circle style="filled,rounded" width=0.2]',
            '\t\tsubgraph "cluster_section_2.inner_section_2" {',
            '\t\t\tcolor="#000000" fillcolor="#6495ed7f" label=inner_section_2 shape=rectangle '
            'style=filled',
            '\t\t\t"section_2.inner_section_2.task_2" [color="#000000" fillcolor="#f0ede4" label=task_2 '
            'shape=rectangle style="filled,rounded"]',
            '\t\t\t"section_2.inner_section_2.task_3" [color="#000000" fillcolor="#e8f7e4" label=task_3 '
            'shape=rectangle style="filled,rounded"]',
            '\t\t\t"section_2.inner_section_2.task_4" [color="#000000" fillcolor="#e8f7e4" label=task_4 '
            'shape=rectangle style="filled,rounded"]',
            '\t\t}',
            '\t\t"section_2.task_1" [color="#000000" fillcolor="#e8f7e4" label=task_1 shape=rectangle '
            'style="filled,rounded"]',
            '\t}',
            '\tstart [color="#000000" fillcolor="#e8f7e4" label=start shape=rectangle '
            'style="filled,rounded"]',
            '\t"section_1.downstream_join_id" -> "section_2.upstream_join_id"',
            '\t"section_1.task_1" -> "section_1.task_2"',
            '\t"section_1.task_1" -> "section_1.task_3"',
            '\t"section_1.task_2" -> "section_1.downstream_join_id"',
            '\t"section_1.task_3" -> "section_1.downstream_join_id"',
            '\t"section_1.upstream_join_id" -> "section_1.task_1"',
            '\t"section_2.downstream_join_id" -> end',
            '\t"section_2.inner_section_2.task_2" -> "section_2.inner_section_2.task_4"',
            '\t"section_2.inner_section_2.task_3" -> "section_2.inner_section_2.task_4"',
            '\t"section_2.inner_section_2.task_4" -> "section_2.downstream_join_id"',
            '\t"section_2.task_1" -> "section_2.downstream_join_id"',
            '\t"section_2.upstream_join_id" -> "section_2.inner_section_2.task_2"',
            '\t"section_2.upstream_join_id" -> "section_2.inner_section_2.task_3"',
            '\t"section_2.upstream_join_id" -> "section_2.task_1"',
            '\tstart -> "section_1.upstream_join_id"',
            '}',
        ]
    )