def set_dirty(dag_id, session=None):
    """Flag every DagStat row of a DAG as out-of-sync (dirty).

    :param dag_id: the dag_id to mark dirty
    :param session: database session
    :return:
    """
    # Ensure a stats row exists for every possible dag state first,
    # so the UPDATE below touches a complete set of rows.
    DagStat.create(dag_id=dag_id, session=session)
    try:
        # Take row-level locks (SELECT ... FOR UPDATE) on all stat rows
        # belonging to this DAG before mutating them.
        rows = (
            session.query(DagStat)
            .filter(DagStat.dag_id == dag_id)
            .with_for_update()
            .all()
        )
        # Mark each locked row dirty so a later update() recomputes it.
        for row in rows:
            row.dirty = True
        session.commit()
    except Exception as e:
        session.rollback()
        log = LoggingMixin().log
        log.warning("Could not update dag stats for %s", dag_id)
        log.exception(e)
def update(dag_ids=None, dirty_only=True, session=None):
    """Recompute per-state DagRun counts for DAGs and clear their dirty flag.

    Updates the stats for dirty/out-of-sync dags

    :param dag_ids: dag_ids to be updated
    :type dag_ids: list
    :param dirty_only: only updated for marked dirty, defaults to True
    :type dirty_only: bool
    :param session: db session to use
    :type session: Session
    """
    try:
        qry = session.query(DagStat)
        if dag_ids:
            qry = qry.filter(DagStat.dag_id.in_(set(dag_ids)))
        # Restrict to rows previously flagged dirty unless told otherwise.
        if dirty_only:
            qry = qry.filter(DagStat.dirty == True)  # noqa
        # Acquire row-level locks on the stat rows we are about to rewrite.
        qry = qry.with_for_update().all()
        # Derive the set of dag ids actually selected (and locked) above.
        dag_ids = set([dag_stat.dag_id for dag_stat in qry])
        # avoid querying with an empty IN clause
        if not dag_ids:
            session.commit()
            return
        # Count dag runs per (dag_id, state), limited to the configured
        # history window so the aggregate query stays bounded.
        begin_time = datetime.now() - timedelta(days=configuration.getint('core', 'sql_query_history_days'))
        dagstat_states = set(itertools.product(dag_ids, State.dag_states))
        qry = (
            session.query(DagRun.dag_id, DagRun.state, func.count('*'))
            .filter(DagRun.dag_id.in_(dag_ids))
            .filter(DagRun.execution_date > begin_time)
            .group_by(DagRun.dag_id, DagRun.state)
        )
        counts = {(dag_id, state): count for dag_id, state, count in qry}
        # Upsert the fresh count for every (dag_id, state) pair; states with
        # no runs in the window get an explicit 0. dirty is reset to False.
        for dag_id, state in dagstat_states:
            count = counts.get((dag_id, state), 0)
            session.merge(
                DagStat(dag_id=dag_id, state=state, count=count, dirty=False)
            )
        session.commit()
    except Exception as e:
        session.rollback()
        log = LoggingMixin().log
        log.warning("Could not update dag stat table")
        log.exception(e)
def execute_command(command):
    """Run a shell command on an airflow Celery worker."""
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command)
    worker_env = os.environ.copy()
    try:
        # The worker executes the shell command carried in the Celery message.
        # NOTE: shell=True is required because the message is a single shell
        # string produced by the scheduler, not an argv list.
        subprocess.check_call(
            command,
            shell=True,
            stderr=subprocess.STDOUT,
            close_fds=True,
            env=worker_env,
        )
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)
        raise AirflowException('Celery command failed')
def create(dag_id, session=None):
    """Insert missing per-state rows into the stats table for a DAG.

    Creates the missing states the stats table for the dag specified

    :param dag_id: dag id of the dag to create stats for
    :param session: database session
    :return:
    """
    # Collect the states that already have a stats row for this DAG.
    existing_states = {
        dag_stat.state
        for dag_stat in session.query(DagStat).filter(DagStat.dag_id == dag_id).all()
    }
    # Insert one row per dag state that is not present yet. Each insert is
    # committed independently so one failure does not abort the rest.
    for missing in set(State.dag_states) - existing_states:
        try:
            session.merge(DagStat(dag_id=dag_id, state=missing))
            session.commit()
        except Exception as e:
            session.rollback()
            log = LoggingMixin().log
            log.warning("Could not create stat record")
            log.exception(e)
# 将所加载模块中的 XToolPlugin 子类添加到 plugins 列表中 # 遍历模块的属性值 for obj in list(m.__dict__.values()): # 判断模块中的类是否为 XToolPlugin 的子类 if ( inspect.isclass(obj) and issubclass(obj, XToolPlugin) and obj is not XToolPlugin): # 验证子类中是否定义了name静态变量 obj.validate() # 将类加入到插件列表中 if obj not in plugins: plugins.append(obj) except Exception as e: log.exception(e) log.error('Failed to import plugin %s', filepath) def make_module(name, objects): """动态创建模块 . :param name: 模块名称 :param objects: 模块中需要包含的对象列表 """ log.debug('Creating module %s', name) name = name.lower() # 创建模块 module = imp.new_module(name) # 给模块设置_name属性 (插件名) module._name = name.split('.')[-1]
def list_py_file_paths(
        directory,
        followlinks=True,
        ignore_filename='.ignore',
        file_ext='.py',
        safe_mode=False,
        safe_filters=(b'xTool', b'XTool')):
    """Recursively walk a directory and return matching Python file paths.

    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param followlinks: whether os.walk follows symlinked directories
    :param ignore_filename: name of the per-directory file holding regex
        ignore patterns (one per line); patterns apply to subdirs too
    :param file_ext: required file extension (zip archives always pass)
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :param safe_filters: byte strings that must ALL appear in the file content
        for the safe-mode heuristic to accept it
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    if directory is None:
        return []
    if os.path.isfile(directory):
        return [directory]
    file_paths = []
    if os.path.isdir(directory):
        patterns_by_dir = {}
        # Walk recursively, following symlinks.
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            patterns = patterns_by_dir.get(root, [])
            # Load this directory's ignore file, if any.
            ignore_file = os.path.join(root, ignore_filename)
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as ignore_fp:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [
                        p for p in ignore_fp.read().split('\n') if p]
            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d for d in dirs
                if not any(re.search(p, os.path.join(root, d))
                           for p in patterns)
            ]
            # We want patterns defined in a parent folder's ignore file to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns
            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    # Check the extension (zip archives are always candidates).
                    mod_name, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Apply the accumulated ignore patterns.
                    if any(re.findall(p, file_path) for p in patterns):
                        continue
                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        # BUG FIX: read via a distinct name instead of ``f`` —
                        # the original shadowed the loop variable, so the
                        # exception logger below reported a closed file object
                        # instead of the file name.
                        with open(file_path, 'rb') as content_fp:
                            content = content_fp.read()
                            might_contain_dag = all(
                                s in content for s in safe_filters)
                    if not might_contain_dag:
                        continue
                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
def list_py_file_paths(directory,
                       followlinks=True,
                       ignore_filename='.ignore',
                       file_ext='.py',
                       safe_mode=False,
                       safe_filters=(b'xTool', b'XTool')):
    """Recursively walk a directory and return matching Python file paths.

    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param followlinks: whether os.walk follows symlinked directories
    :param ignore_filename: name of the per-directory file holding regex
        ignore patterns (one stripped pattern per line)
    :param file_ext: required file extension (zip archives always pass)
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :param safe_filters: byte strings that must ALL appear in the file content
        for the safe-mode heuristic to accept it
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        # NOTE(review): patterns accumulate globally across every walked
        # directory, so an ignore file also affects sibling directories
        # visited later. Preserved as-is for backward compatibility.
        patterns = []
        # Walk recursively, following symlinks.
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            # BUG FIX: open the ignore file with a context manager so the
            # handle is closed even if reading raises (the original used a
            # bare open()/close() pair and leaked the handle on error).
            if ignore_filename in files:
                with open(os.path.join(root, ignore_filename), 'r') as ignore_fp:
                    patterns += [
                        p.strip() for p in ignore_fp.read().split('\n') if p]
            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    # Check the extension (zip archives are always candidates).
                    mod_name, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Apply the accumulated ignore patterns.
                    if any(re.findall(p, file_path) for p in patterns):
                        continue
                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        # BUG FIX: read via a distinct name instead of ``f`` —
                        # the original shadowed the loop variable, so the
                        # exception logger below reported a closed file object
                        # instead of the file name.
                        with open(file_path, 'rb') as content_fp:
                            content = content_fp.read()
                            might_contain_dag = all(
                                s in content for s in safe_filters)
                    if not might_contain_dag:
                        continue
                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths