def test_should_response_200_task_instance_with_sla(self, session):
    self.create_task_instances(session)
    sla_miss = SlaMiss(
        task_id="print_the_context",
        dag_id="example_python_operator",
        execution_date=self.default_time,
        timestamp=self.default_time,
    )
    session.add(sla_miss)
    session.commit()
    response = self.client.get(
        "/api/v1/dags/example_python_operator/dagRuns/TEST_DAG_RUN_ID/taskInstances/print_the_context",
        environ_overrides={"REMOTE_USER": "******"},
    )
    self.assertEqual(response.status_code, 200)
    self.assertDictEqual(
        response.json,
        {
            "dag_id": "example_python_operator",
            "duration": 10000.0,
            "end_date": "2020-01-03T00:00:00+00:00",
            "execution_date": "2020-01-01T00:00:00+00:00",
            "executor_config": "{}",
            "hostname": "",
            "max_tries": 0,
            "operator": "PythonOperator",
            "pid": 100,
            "pool": "default_pool",
            "pool_slots": 1,
            "priority_weight": 6,
            "queue": "default_queue",
            "queued_when": None,
            "sla_miss": {
                "dag_id": "example_python_operator",
                "description": None,
                "email_sent": False,
                "execution_date": "2020-01-01T00:00:00+00:00",
                "notification_sent": False,
                "task_id": "print_the_context",
                "timestamp": "2020-01-01T00:00:00+00:00",
            },
            "start_date": "2020-01-02T00:00:00+00:00",
            "state": "running",
            "task_id": "print_the_context",
            "try_number": 0,
            "unixname": getpass.getuser(),
        },
    )
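
# For context, a hedged sketch of calling the same stable-API endpoint from
# outside the test client, e.g. with `requests`. The base URL and the
# basic-auth credentials are assumptions for illustration (they depend on your
# deployment's auth backend); the DAG/run/task IDs mirror the test above.
import requests

resp = requests.get(
    "http://localhost:8080/api/v1/dags/example_python_operator"
    "/dagRuns/TEST_DAG_RUN_ID/taskInstances/print_the_context",
    auth=("admin", "admin"),  # assumed basic_auth backend and credentials
)
resp.raise_for_status()
# When an SlaMiss row exists for the task instance, the payload carries a
# nested "sla_miss" object, as asserted above; otherwise it is null.
print(resp.json()["sla_miss"])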
def test_task_instance_schema_with_sla(self, session):
    ti = TI(task=self.task, **self.default_ti_init)
    for key, value in self.default_ti_extras.items():
        setattr(ti, key, value)
    sla_miss = SlaMiss(
        task_id="TEST_TASK_ID",
        dag_id="TEST_DAG_ID",
        execution_date=self.default_time,
    )
    session.add(ti)
    session.add(sla_miss)
    session.commit()
    serialized_ti = task_instance_schema.dump((ti, sla_miss))
    expected_json = {
        "dag_id": "TEST_DAG_ID",
        "duration": 10000.0,
        "end_date": "2020-01-03T00:00:00+00:00",
        "execution_date": "2020-01-01T00:00:00+00:00",
        "executor_config": "{}",
        "hostname": "",
        "max_tries": 0,
        "operator": "DummyOperator",
        "pid": 100,
        "pool": "default_pool",
        "pool_slots": 1,
        "priority_weight": 1,
        "queue": "default_queue",
        "queued_when": None,
        "sla_miss": {
            "dag_id": "TEST_DAG_ID",
            "description": None,
            "email_sent": False,
            "execution_date": "2020-01-01T00:00:00+00:00",
            "notification_sent": False,
            "task_id": "TEST_TASK_ID",
            "timestamp": None,
        },
        "start_date": "2020-01-02T00:00:00+00:00",
        "state": "running",
        "task_id": "TEST_TASK_ID",
        "try_number": 0,
        "unixname": getuser(),
    }
    assert serialized_ti == expected_json
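
# A minimal companion sketch (an assumption, not part of the original suite):
# the same schema is also fed task instances with no recorded SLA miss, as
# (ti, None), in which case "sla_miss" should serialize as null. Reuses the
# fixtures (self.task, self.default_ti_init) from the test above.
def test_task_instance_schema_without_sla(self, session):
    ti = TI(task=self.task, **self.default_ti_init)
    session.add(ti)
    session.commit()
    serialized_ti = task_instance_schema.dump((ti, None))
    assert serialized_ti["sla_miss"] is None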
def manage_slas(self, dag: DAG, session: Session = None) -> None:
    """
    Find all tasks that have SLAs defined and send alert emails when needed.
    New SLA misses are also recorded in the database.

    We assume that the scheduler runs often, so we only check for tasks
    that should have succeeded in the past hour.
    """
    self.log.info("Running SLA Checks for %s", dag.dag_id)
    if not any(isinstance(ti.sla, timedelta) for ti in dag.tasks):
        self.log.info("Skipping SLA check for %s because no tasks in DAG have SLAs", dag)
        return

    qry = (
        session.query(TI.task_id, func.max(TI.execution_date).label('max_ti'))
        .with_hint(TI, 'USE INDEX (PRIMARY)', dialect_name='mysql')
        .filter(TI.dag_id == dag.dag_id)
        .filter(or_(TI.state == State.SUCCESS, TI.state == State.SKIPPED))
        .filter(TI.task_id.in_(dag.task_ids))
        .group_by(TI.task_id)
        .subquery('sq')
    )

    max_tis: List[TI] = (
        session.query(TI)
        .filter(
            TI.dag_id == dag.dag_id,
            TI.task_id == qry.c.task_id,
            TI.execution_date == qry.c.max_ti,
        )
        .all()
    )

    ts = timezone.utcnow()
    for ti in max_tis:
        task = dag.get_task(ti.task_id)
        if task.sla and not isinstance(task.sla, timedelta):
            raise TypeError(
                f"SLA is expected to be timedelta object, got "
                f"{type(task.sla)} in {task.dag_id}:{task.task_id}"
            )

        dttm = dag.following_schedule(ti.execution_date)
        while dttm < timezone.utcnow():
            following_schedule = dag.following_schedule(dttm)
            if following_schedule + task.sla < timezone.utcnow():
                session.merge(
                    SlaMiss(task_id=ti.task_id, dag_id=ti.dag_id, execution_date=dttm, timestamp=ts)
                )
            dttm = following_schedule
    session.commit()

    # pylint: disable=singleton-comparison
    slas: List[SlaMiss] = (
        session.query(SlaMiss)
        .filter(SlaMiss.notification_sent == False, SlaMiss.dag_id == dag.dag_id)  # noqa
        .all()
    )
    # pylint: enable=singleton-comparison

    if slas:  # pylint: disable=too-many-nested-blocks
        sla_dates: List[datetime.datetime] = [sla.execution_date for sla in slas]
        fetched_tis: List[TI] = (
            session.query(TI)
            .filter(
                TI.state != State.SUCCESS,
                TI.execution_date.in_(sla_dates),
                TI.dag_id == dag.dag_id,
            )
            .all()
        )
        blocking_tis: List[TI] = []
        for ti in fetched_tis:
            if ti.task_id in dag.task_ids:
                ti.task = dag.get_task(ti.task_id)
                blocking_tis.append(ti)
            else:
                session.delete(ti)
                session.commit()

        task_list = "\n".join(sla.task_id + ' on ' + sla.execution_date.isoformat() for sla in slas)
        blocking_task_list = "\n".join(
            ti.task_id + ' on ' + ti.execution_date.isoformat() for ti in blocking_tis
        )
        # Track whether email or any alert notification sent
        # We consider email or the alert callback as notifications
        email_sent = False
        notification_sent = False
        if dag.sla_miss_callback:
            # Execute the alert callback
            self.log.info('Calling SLA miss callback')
            try:
                dag.sla_miss_callback(dag, task_list, blocking_task_list, slas, blocking_tis)
                notification_sent = True
            except Exception:  # pylint: disable=broad-except
                self.log.exception("Could not call sla_miss_callback for DAG %s", dag.dag_id)
        email_content = f"""\
Here's a list of tasks that missed their SLAs:
<pre><code>{task_list}\n</code></pre>
Blocking tasks:
<pre><code>{blocking_task_list}</code></pre>
Airflow Webserver URL: {conf.get(section='webserver', key='base_url')}
"""

        tasks_missed_sla = []
        for sla in slas:
            try:
                task = dag.get_task(sla.task_id)
            except TaskNotFound:
                # task already deleted from DAG, skip it
                self.log.warning(
                    "Task %s doesn't exist in DAG anymore, skipping SLA miss notification.",
                    sla.task_id,
                )
                continue
            tasks_missed_sla.append(task)

        emails: Set[str] = set()
        for task in tasks_missed_sla:
            if task.email:
                if isinstance(task.email, str):
                    emails |= set(get_email_address_list(task.email))
                elif isinstance(task.email, (list, tuple)):
                    emails |= set(task.email)
        if emails:
            try:
                send_email(emails, f"[airflow] SLA miss on DAG={dag.dag_id}", email_content)
                email_sent = True
                notification_sent = True
            except Exception:  # pylint: disable=broad-except
                Stats.incr('sla_email_notification_failure')
                self.log.exception(
                    "Could not send SLA Miss email notification for DAG %s", dag.dag_id
                )
        # If we sent any notification, update the sla_miss table
        if notification_sent:
            for sla in slas:
                sla.email_sent = email_sent
                sla.notification_sent = True
                session.merge(sla)
        session.commit()
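
# For reference, a hedged sketch of the DAG-author side that manage_slas
# serves: a task declares `sla` as a timedelta, and the DAG may register an
# `sla_miss_callback` with the five-argument signature invoked above. The
# DAG id, task id, and `notify_ops` helper are illustrative names only.
from datetime import timedelta

from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago


def notify_ops(dag, task_list, blocking_task_list, slas, blocking_tis):
    # Invoked once per manage_slas pass with every unnotified miss in the DAG.
    print(f"SLA missed in {dag.dag_id}:\n{task_list}")


with DAG(
    dag_id="sla_example",
    schedule_interval="@hourly",
    start_date=days_ago(1),
    sla_miss_callback=notify_ops,
) as sla_dag:
    DummyOperator(
        task_id="hourly_report",
        sla=timedelta(minutes=30),  # alert if not done 30 minutes past each schedule
    )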