def delete_dag(self, keep_records_in_log: bool = True, session=None):
    """Delete this DAG and all of its metadata rows from the database.

    :param keep_records_in_log: whether to keep records of this dag_id in the
        Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: SQLAlchemy session used for all queries
    :raises DagNotFound: if no DagModel row exists for this dag_id
    """
    dag = session.query(DagModel).filter(
        DagModel.dag_id == self.dag_id).first()
    if dag is None:
        raise DagNotFound(f"Dag id {self.dag_id} not found")

    # so explicitly removes serialized DAG here.
    if STORE_SERIALIZED_DAGS and SerializedDagModel.has_dag(
            dag_id=self.dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=self.dag_id, session=session)

    # noinspection PyUnresolvedReferences,PyProtectedMember
    for model in models.base.Base._decl_class_registry.values():
        if hasattr(model, "dag_id"):
            # BUGFIX: removed leftover debug `print(model.__name__)` that
            # wrote to stdout for every mapped model on each delete.
            if keep_records_in_log and model.__name__ == "Log":
                continue
            # Match the DAG itself and any of its subdags ("<dag_id>.<child>")
            cond = or_(model.dag_id == self.dag_id,
                       model.dag_id.like(self.dag_id + ".%"))
            session.query(model).filter(cond).delete(
                synchronize_session="fetch")

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc).delete(
            synchronize_session="fetch")
def test_remove_stale_dags(self):
    """Serialized DAGs older than the cutoff are purged; fresh ones survive."""
    example_dags = list(self._write_example_dags().values())
    # SubDags are serialized inside the parent DAG's JSON blob rather than
    # stored as separate rows, so drop them from the list.
    top_level_dags = [dag for dag in example_dags if not dag.is_subdag]

    # One DAG to go stale, one to stay fresh.
    stale_dag = SDM(top_level_dags[0])
    fresh_dag = SDM(top_level_dags[1])

    # Backdate the stale DAG's last_updated timestamp by 10 minutes.
    stale_dag.last_updated = timezone.utcnow() - timezone.dt.timedelta(seconds=600)
    with create_session() as session:
        session.merge(stale_dag)
        session.commit()

    # Purge everything not updated within the last 5 minutes.
    SDM.remove_stale_dags(timezone.utcnow() - timezone.dt.timedelta(seconds=300))
    self.assertFalse(SDM.has_dag(stale_dag.dag_id))
    self.assertTrue(SDM.has_dag(fresh_dag.dag_id))
def test_remove_dags(self):
    """DAGs can be removed from database."""
    example_dags_list = list(self._write_example_dags().values())
    # Remove SubDags from the list as they are not stored in DB in a separate row
    # and are directly added in Json blob of the main DAG
    filtered_example_dags_list = [dag for dag in example_dags_list if not dag.is_subdag]

    # Tests removing by dag_id.
    dag_removed_by_id = filtered_example_dags_list[0]
    SDM.remove_dag(dag_removed_by_id.dag_id)
    self.assertFalse(SDM.has_dag(dag_removed_by_id.dag_id))

    # Tests removing by file path.
    dag_removed_by_file = filtered_example_dags_list[1]
    # BUGFIX: deduplicate file paths via a set comprehension. Files defining
    # multiple DAGs previously appeared more than once in the list, and
    # list.remove() only drops one occurrence — so the removed DAG's file
    # stayed in the "alive" list and remove_deleted_dags kept the DAG.
    example_dag_files = list({dag.full_filepath for dag in filtered_example_dags_list})
    example_dag_files.remove(dag_removed_by_file.full_filepath)
    SDM.remove_deleted_dags(example_dag_files)
    self.assertFalse(SDM.has_dag(dag_removed_by_file.dag_id))
def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> int:
    """
    Delete a DAG and every related metadata row from the database.

    :param dag_id: the dag_id of the DAG to delete
    :param keep_records_in_log: whether keep records of the given dag_id
        in the Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: session used
    :return count of deleted dags
    :raises AirflowException: if any task instance of the DAG is still RUNNING
    :raises DagNotFound: if no DagModel row exists for dag_id
    """
    log.info("Deleting DAG: %s", dag_id)
    # Refuse to delete while any of the DAG's task instances is still running.
    running_tis = (session.query(models.TaskInstance.state).filter(
        models.TaskInstance.dag_id == dag_id).filter(
            models.TaskInstance.state == State.RUNNING).first())
    if running_tis:
        raise AirflowException("TaskInstances still running")
    dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first()
    if dag is None:
        raise DagNotFound(f"Dag id {dag_id} not found")

    # deleting a DAG should also delete all of its subdags
    # (subdag dag_ids have the form "<dag_id>.<child>")
    dags_to_delete_query = session.query(DagModel.dag_id).filter(
        or_(
            DagModel.dag_id == dag_id,
            and_(DagModel.dag_id.like(f"{dag_id}.%"), DagModel.is_subdag),
        ))
    dags_to_delete = [dag_id for dag_id, in dags_to_delete_query]

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if SerializedDagModel.has_dag(dag_id=dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=dag_id, session=session)

    count = 0

    # Delete matching rows from every mapped model that has a dag_id column.
    for model in get_sqla_model_classes():
        if hasattr(model, "dag_id"):
            # Optionally preserve the audit trail in the Log table.
            if keep_records_in_log and model.__name__ == 'Log':
                continue
            count += (session.query(model).filter(
                model.dag_id.in_(dags_to_delete)).delete(
                    synchronize_session='fetch'))
    if dag.is_subdag:
        # A subdag's task-level rows are stored under the PARENT dag_id with
        # the subdag's name as the task_id, so delete those separately.
        parent_dag_id, task_id = dag_id.rsplit(".", 1)
        for model in TaskFail, models.TaskInstance:
            count += (session.query(model).filter(
                model.dag_id == parent_dag_id,
                model.task_id == task_id).delete())

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc).delete(
            synchronize_session='fetch')

    return count
def test_remove_dags_by_id(self):
    """DAGs can be removed from database."""
    all_dags = list(self._write_example_dags().values())
    # SubDags live inside the parent DAG's JSON blob rather than in their
    # own rows, so exclude them from the candidates.
    candidates = [dag for dag in all_dags if not dag.is_subdag]

    # Removing by dag_id should make the DAG disappear from the table.
    target = candidates[0]
    SDM.remove_dag(target.dag_id)
    assert not SDM.has_dag(target.dag_id)
def test_write_dag(self):
    """DAGs can be written into database."""
    example_dags = self._write_example_dags()

    with create_session() as session:
        for dag in example_dags.values():
            assert SDM.has_dag(dag.dag_id)
            row = (session.query(SDM.fileloc, SDM.data)
                   .filter(SDM.dag_id == dag.dag_id)
                   .one())
            assert row.fileloc == dag.full_filepath
            # Verifies JSON schema.
            SerializedDAG.validate_schema(row.data)
def test_remove_dags_by_filepath(self):
    """DAGs can be removed from database."""
    example_dags_list = list(self._write_example_dags().values())
    # Remove SubDags from the list as they are not stored in DB in a separate row
    # and are directly added in Json blob of the main DAG
    filtered_example_dags_list = [dag for dag in example_dags_list if not dag.is_subdag]

    # Tests removing by file path.
    dag_removed_by_file = filtered_example_dags_list[0]
    # remove repeated files for those DAGs that define multiple dags in the
    # same file (set comprehension)
    example_dag_files = list({dag.full_filepath for dag in filtered_example_dags_list})
    example_dag_files.remove(dag_removed_by_file.full_filepath)
    SDM.remove_deleted_dags(example_dag_files)
    # Plain pytest-style assert for consistency with the other removal tests.
    assert not SDM.has_dag(dag_removed_by_file.dag_id)
def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> int:
    """
    Delete a DAG and every related metadata row from the database.

    :param dag_id: the dag_id of the DAG to delete
    :param keep_records_in_log: whether keep records of the given dag_id
        in the Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: session used
    :return count of deleted dags
    :raises DagNotFound: if no DagModel row exists for dag_id
    """
    logger = LoggingMixin()
    logger.log.info("Deleting DAG: %s", dag_id)

    dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first()
    if dag is None:
        raise DagNotFound(f"Dag id {dag_id} not found")

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if STORE_SERIALIZED_DAGS and SerializedDagModel.has_dag(dag_id=dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=dag_id, session=session)

    count = 0

    # noinspection PyUnresolvedReferences,PyProtectedMember
    for model in models.base.Base._decl_class_registry.values():  # pylint: disable=protected-access
        if hasattr(model, "dag_id"):
            # Optionally preserve the audit trail in the Log table.
            if keep_records_in_log and model.__name__ == 'Log':
                continue
            # Match the DAG itself and any of its subdags ("<dag_id>.<child>")
            cond = or_(model.dag_id == dag_id, model.dag_id.like(dag_id + ".%"))
            count += session.query(model).filter(cond).delete(
                synchronize_session='fetch')
    if dag.is_subdag:
        # A subdag's task-level rows are stored under the PARENT dag_id with
        # the subdag's name as the task_id, so delete those separately.
        # BUGFIX: models.DagRun was previously included in this loop, but
        # DagRun has no task_id column, so filtering on model.task_id raised
        # AttributeError whenever a subdag was deleted. Only TaskFail and
        # TaskInstance carry (dag_id, task_id) pairs.
        parent_dag_id, task_id = dag_id.rsplit(".", 1)
        for model in TaskFail, models.TaskInstance:
            count += session.query(model).filter(
                model.dag_id == parent_dag_id, model.task_id == task_id).delete()

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc).delete(
            synchronize_session='fetch')

    return count