def _str(s):
    # cloudant-python doesn't support unicode.
    if isinstance(s, unicode):
        log = LoggingMixin().log
        log.debug(
            'cloudant-python does not support unicode. Encoding %s as '
            'ascii using "ignore".', s)
        return s.encode('ascii', 'ignore')

    return s
def handle_failure_retry(context):
    ti = context['ti']
    cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

    if cmd_id is not None:
        cmd = Command.find(cmd_id)
        if cmd is not None:
            if cmd.status == 'running':
                log = LoggingMixin().log
                log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                cmd.cancel()
def set(
        cls,
        key,
        value,
        execution_date,
        task_id,
        dag_id,
        session=None):
    """Store an XCom value (an intermediate result).

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: None
    """
    # Detach all instances from the underlying session.
    session.expunge_all()

    enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
    # Serialize the value.
    if enable_pickling:
        value = pickle.dumps(value)
    else:
        try:
            # Mind the encoding conversion.
            value = json.dumps(value).encode('UTF-8')
        except ValueError:
            log = LoggingMixin().log
            log.error("Could not serialize the XCOM value into JSON. "
                      "If you are using pickles instead of JSON "
                      "for XCOM, then you need to enable pickle "
                      "support for XCOM in your airflow config.")
            raise

    # Remove any duplicate XComs for the same key.
    session.query(cls).filter(
        cls.key == key,
        cls.execution_date == execution_date,
        cls.task_id == task_id,
        cls.dag_id == dag_id).delete()
    session.commit()

    # Insert the new XCom.
    session.add(XCom(
        key=key,
        value=value,
        execution_date=execution_date,
        task_id=task_id,
        dag_id=dag_id))

    session.commit()
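# A minimal standalone sketch (not part of the source above) contrasting the two
# serialization paths used by XCom.set: JSON round-trips plain data but rejects
# arbitrary Python objects, while pickle accepts them at the cost of portability.
# The sample value is illustrative only.
import json
import pickle
from datetime import datetime

value = {'rows': 42, 'loaded_at': datetime(2018, 1, 1)}

try:
    serialized = json.dumps(value).encode('UTF-8')   # fails: datetime is not JSON-serializable
except (TypeError, ValueError) as exc:
    print('JSON serialization failed: %s' % exc)

serialized = pickle.dumps(value)                     # works, but ties the stored bytes to Python
print(pickle.loads(serialized) == value)             # True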
def GetDefaultExecutor():
    """Creates a new instance of the configured executor if none exists and returns it"""
    global DEFAULT_EXECUTOR

    if DEFAULT_EXECUTOR is not None:
        return DEFAULT_EXECUTOR

    executor_name = configuration.conf.get('core', 'EXECUTOR')

    DEFAULT_EXECUTOR = _get_executor(executor_name)

    log = LoggingMixin().log
    log.info("Using executor %s", executor_name)

    return DEFAULT_EXECUTOR
def get_one(cls,
            execution_date,
            key=None,
            task_id=None,
            dag_id=None,
            include_prior_dates=False,
            session=None):
    """Retrieve a single XCom value, optionally meeting certain criteria.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: XCom value
    """
    # Build the filter criteria.
    filters = []
    if key:
        filters.append(cls.key == key)
    if task_id:
        filters.append(cls.task_id == task_id)
    if dag_id:
        filters.append(cls.dag_id == dag_id)
    if include_prior_dates:
        filters.append(cls.execution_date <= execution_date)
    else:
        filters.append(cls.execution_date == execution_date)

    query = (
        session.query(cls.value).filter(and_(*filters))
               .order_by(cls.execution_date.desc(), cls.timestamp.desc()))

    # Take the most recent record.
    result = query.first()
    if result:
        enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
        if enable_pickling:
            return pickle.loads(result.value)
        else:
            try:
                # Mind the encoding conversion.
                return json.loads(result.value.decode('UTF-8'))
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not deserialize the XCOM value from JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise
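# A hedged usage sketch: in day-to-day DAG code XCom.set/get_one are reached
# indirectly through TaskInstance.xcom_push/xcom_pull. The dag id, task ids and
# key below are illustrative only (Airflow 1.x style operators assumed).
from datetime import datetime
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG(dag_id='xcom_example', start_date=datetime(2018, 1, 1),
          schedule_interval=None)

def producer(**context):
    # stores a value via XCom.set under key 'row_count'
    context['ti'].xcom_push(key='row_count', value=42)

def consumer(**context):
    # reads the most recent matching value via XCom.get_one
    row_count = context['ti'].xcom_pull(task_ids='produce', key='row_count')
    print('row_count = %s' % row_count)

produce = PythonOperator(task_id='produce', python_callable=producer,
                         provide_context=True, dag=dag)
consume = PythonOperator(task_id='consume', python_callable=consumer,
                         provide_context=True, dag=dag)
produce >> consume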
def set_dirty(dag_id, session=None):
    """
    :param dag_id: the dag_id to mark dirty
    :param session: database session
    :return:
    """
    # Insert any states missing from the stats table for this dag.
    DagStat.create(dag_id=dag_id, session=session)

    try:
        # Row-lock all state rows of the given dag.
        stats = session.query(DagStat).filter(
            DagStat.dag_id == dag_id
        ).with_for_update().all()

        # Set the dirty flag.
        for stat in stats:
            stat.dirty = True
        session.commit()
    except Exception as e:
        session.rollback()
        log = LoggingMixin().log
        log.warning("Could not update dag stats for %s", dag_id)
        log.exception(e)
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    """Send a MIME email message."""
    log = LoggingMixin().log

    SMTP_HOST = configuration.conf.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.conf.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.conf.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.conf.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.conf.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.conf.get('smtp', 'SMTP_PASSWORD')
    except XToolConfigException:
        log.debug(
            "No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(
            SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
def send_MIME_email(e_from, e_to, mime_msg, host, port, ssl=False,
                    starttls=False, user=None, password=None, dryrun=False):
    """Send a MIME email message."""
    log = LoggingMixin().log

    if not user or not password:
        log.debug(
            "No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(host, port) if ssl else smtplib.SMTP(host, port)
        if starttls:
            s.starttls()
        # Only authenticate when credentials were actually supplied.
        if user and password:
            s.login(user, password)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
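# A hedged usage sketch for the parameterized variant above. The addresses are
# illustrative, and the host/port assume a local debugging SMTP server started
# with `python -m smtpd -n -c DebuggingServer localhost:1025`.
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

msg = MIMEMultipart()
msg['Subject'] = 'Airflow alert'
msg['From'] = 'airflow@example.com'
msg['To'] = 'oncall@example.com'
msg.attach(MIMEText('Task foo failed.', 'plain'))

send_MIME_email(e_from=msg['From'], e_to=[msg['To']], mime_msg=msg,
                host='localhost', port=1025, dryrun=False)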
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        # Use the configured web authentication backend if authentication is on.
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except (AirflowConfigException, XToolConfigException):
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated* behavior of importing airflow_login")
            auth_backend = "airflow_login"

    # Import the authentication module.
    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
def execute_command(command):
    """Execute a shell command on an airflow worker."""
    log = LoggingMixin().log
    log.info("Executing command in Celery: %s", command)
    env = os.environ.copy()
    try:
        # The celery worker runs the shell command it receives in the message.
        subprocess.check_call(command, shell=True, stderr=subprocess.STDOUT,
                              close_fds=True, env=env)
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)
        raise AirflowException('Celery command failed')
def wrapper(*args, **kwargs):
    metrics = {}
    metrics['begin_datetime'] = datetime.now()
    metrics['end_datetime'] = None
    metrics['log'] = LoggingMixin().log

    on_pre_execution(**metrics)
    try:
        return f(*args, **kwargs)
    except Exception as e:
        metrics['error'] = e
        raise
    finally:
        metrics['end_datetime'] = datetime.now()
        on_post_execution(**metrics)
def _to_timestamp(cls, col):
    """
    Convert a column of a dataframe to UNIX timestamps if applicable

    :param col: A Series object representing a column of a dataframe.
    """
    # Try to convert the column to datetimes. The column MUST have a four-digit
    # year somewhere in the string. There should be a better way to do this,
    # but just letting pandas try to convert every column without a format
    # caused it to convert floats as well (for example, a column of integers
    # between 0 and 10 was turned into timestamps).
    # If the column cannot be converted, return the original column untouched.
    try:
        col = pd.to_datetime(col)
    except ValueError:
        log = LoggingMixin().log
        log.warning("Could not convert field to timestamps: %s", col.name)
        return col

    # Now convert the newly created datetimes into timestamps. We have to be
    # careful here because NaT cannot be converted to a timestamp, so we have
    # to return NaN instead.
    converted = []
    for i in col:
        try:
            converted.append(i.timestamp())
        except ValueError:
            converted.append(pd.np.NaN)
        except AttributeError:
            converted.append(pd.np.NaN)

    # return a new series that maintains the same index as the original
    return pd.Series(converted, index=col.index)
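# A minimal standalone sketch of the conversion performed above; the sample
# data and column name are illustrative only. Dates with a four-digit year
# convert cleanly, missing values become NaT and are then mapped to NaN.
import pandas as pd

col = pd.Series(['2018-01-01 12:00:00', '2018-06-15', None], name='created_at')
dt = pd.to_datetime(col)   # datetime64 column with NaT for the missing value
ts = [d.timestamp() if pd.notnull(d) else float('nan') for d in dt]
print(pd.Series(ts, index=col.index))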
def _post_sendgrid_mail(mail_data):
    log = LoggingMixin().log
    sg = sendgrid.SendGridAPIClient(apikey=os.environ.get('SENDGRID_API_KEY'))
    response = sg.client.mail.send.post(request_body=mail_data)
    # 2xx status code.
    if response.status_code >= 200 and response.status_code < 300:
        log.info('Email with subject %s is successfully sent to recipients: %s' %
                 (mail_data['subject'], mail_data['personalizations']))
    else:
        log.warning('Failed to send out email with subject %s, status code: %s' %
                    (mail_data['subject'], response.status_code))
def update(dag_ids=None, dirty_only=True, session=None):
    """Update the per-state DagRun counts for each dag and reset the dirty flag.

    Updates the stats for dirty/out-of-sync dags

    :param dag_ids: dag_ids to be updated
    :type dag_ids: list
    :param dirty_only: only updated for marked dirty, defaults to True
    :type dirty_only: bool
    :param session: db session to use
    :type session: Session
    """
    try:
        qry = session.query(DagStat)
        if dag_ids:
            qry = qry.filter(DagStat.dag_id.in_(set(dag_ids)))
        # Only consider rows marked as dirty.
        if dirty_only:
            qry = qry.filter(DagStat.dirty == True)  # noqa

        # Acquire row-level locks.
        qry = qry.with_for_update().all()

        # Collect the set of dag ids to update.
        dag_ids = set([dag_stat.dag_id for dag_stat in qry])

        # avoid querying with an empty IN clause
        if not dag_ids:
            session.commit()
            return

        # Count the DagRuns per state for every dag.
        begin_time = datetime.now() - timedelta(
            days=configuration.getint('core', 'sql_query_history_days'))
        dagstat_states = set(itertools.product(dag_ids, State.dag_states))
        qry = (
            session.query(DagRun.dag_id, DagRun.state, func.count('*'))
            .filter(DagRun.dag_id.in_(dag_ids))
            .filter(DagRun.execution_date > begin_time)
            .group_by(DagRun.dag_id, DagRun.state)
        )
        counts = {(dag_id, state): count for dag_id, state, count in qry}

        # Upsert the per-state count for every dag_id.
        for dag_id, state in dagstat_states:
            count = counts.get((dag_id, state), 0)
            session.merge(
                DagStat(dag_id=dag_id, state=state, count=count, dirty=False)
            )

        session.commit()
    except Exception as e:
        session.rollback()
        log = LoggingMixin().log
        log.warning("Could not update dag stat table")
        log.exception(e)
def get_query_results(self):
    log = LoggingMixin().log
    if self.cmd is not None:
        cmd_id = self.cmd.id
        log.info("command id: " + str(cmd_id))
        query_result_buffer = StringIO()
        self.cmd.get_results(fp=query_result_buffer, inline=True, delim=COL_DELIM)
        query_result = query_result_buffer.getvalue()
        query_result_buffer.close()
        return query_result
    else:
        log.info("Qubole command not found")
def handle_failure_retry(context):
    ti = context['ti']
    cmd_id = ti.xcom_pull(key='qbol_cmd_id', task_ids=ti.task_id)

    if cmd_id is not None:
        cmd = Command.find(cmd_id)
        if cmd is not None:
            log = LoggingMixin().log
            if cmd.status == 'done':
                log.info('Command ID: %s has succeeded, hence marking this '
                         'TI as Success.', cmd_id)
                ti.state = State.SUCCESS
            elif cmd.status == 'running':
                log.info('Cancelling the Qubole Command Id: %s', cmd_id)
                cmd.cancel()
def filter_for_filesize(result, size=None):
    """
    Filter the ls result, keeping only files whose size is at least ``size`` MB.

    :param result: a list of dicts returned by Snakebite ls
    :param size: the file size in MB a file should be at least to trigger True
    :return: (list) of dicts which passed the size criterion
    """
    if size:
        log = LoggingMixin().log
        log.debug('Filtering for file size >= %s in files: %s',
                  size, map(lambda x: x['path'], result))
        size *= settings.MEGABYTE
        result = [x for x in result if x['length'] >= size]
        log.debug('HdfsSensor.poke: after size filter result is %s', result)
    return result
def get_val(self):
    """Return the value, decrypting it first if it is encrypted."""
    log = LoggingMixin().log
    if self._val and self.is_encrypted:
        try:
            # Decrypt the stored value.
            fernet_key = configuration.conf.get('core', 'FERNET_KEY')
            fernet = get_fernet(fernet_key)
            return fernet.decrypt(bytes(self._val, 'utf-8')).decode()
        except InvalidFernetToken:
            # Return None if decryption fails.
            log.error("Can't decrypt _val for key={}, invalid token "
                      "or value".format(self.key))
            return None
        except Exception:
            log.error("Can't decrypt _val for key={}, FERNET_KEY "
                      "configuration missing".format(self.key))
            return None
    else:
        return self._val
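# A minimal standalone sketch of the Fernet round trip that backs the
# decryption above, using the cryptography package directly. The key here is
# freshly generated; in Airflow it comes from the FERNET_KEY setting.
from cryptography.fernet import Fernet, InvalidToken

fernet_key = Fernet.generate_key()
fernet = Fernet(fernet_key)

token = fernet.encrypt(b'my-secret-value')           # what gets stored in _val
print(fernet.decrypt(token).decode())                # 'my-secret-value'

try:
    Fernet(Fernet.generate_key()).decrypt(token)     # wrong key -> InvalidToken
except InvalidToken:
    print('invalid token or value')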
def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func):
    log = LoggingMixin().log

    for i in range(0, max_n):
        try:
            response = request.execute()
            if is_error_func(response):
                raise ValueError(
                    'The response contained an error: {}'.format(response)
                )
            elif is_done_func(response):
                log.info('Operation is done: %s', response)
                return response
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
        except errors.HttpError as e:
            if e.resp.status != 429:
                log.info('Something went wrong. Not retrying: %s', format(e))
                raise
            else:
                time.sleep((2**i) + (random.randint(0, 1000) / 1000))
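# A standalone sketch of the retry schedule used above: exponential backoff
# (2**i seconds) plus up to one second of random jitter between polls. The
# fake poll function and attempt sequence below are illustrative only.
import random
import time

def poll_until_done(poll, max_n=5):
    for i in range(max_n):
        if poll():
            return True
        time.sleep((2 ** i) + (random.randint(0, 1000) / 1000.0))
    return False

attempts = iter([False, False, True])
print(poll_until_done(lambda: next(attempts)))  # True, after two backoff sleeps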
def create(dag_id, session=None):
    """Insert any states missing from the stats table for the given dag.

    Creates the missing states in the stats table for the dag specified

    :param dag_id: dag id of the dag to create stats for
    :param session: database session
    :return:
    """
    # States already present in DagStat.
    qry = session.query(DagStat).filter(DagStat.dag_id == dag_id).all()
    states = {dag_stat.state for dag_stat in qry}
    # Find the states that are not yet in the database.
    states_not_found = set(State.dag_states) - states
    for state in states_not_found:
        try:
            session.merge(DagStat(dag_id=dag_id, state=state))
            session.commit()
        except Exception as e:
            session.rollback()
            log = LoggingMixin().log
            log.warning("Could not create stat record")
            log.exception(e)
def filter_for_ignored_ext(result, ignored_ext, ignore_copying):
    """
    If instructed to do so, filter the result to remove entries matching the
    ignored extensions.

    :param result: (list) of dicts returned by Snakebite ls
    :param ignored_ext: (list) of ignored extensions
    :param ignore_copying: (bool) whether to apply the filter
    :return: (list) of dicts which were not removed
    """
    if ignore_copying:
        log = LoggingMixin().log
        regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext)
        ignored_extensions_regex = re.compile(regex_builder)
        log.debug(
            'Filtering result for ignored extensions: %s in files %s',
            ignored_extensions_regex.pattern, map(lambda x: x['path'], result))
        result = [
            x for x in result if not ignored_extensions_regex.match(x['path'])
        ]
        log.debug('HdfsSensor.poke: after ext filter result is %s', result)
    return result
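# A standalone sketch of the extension filter above, with illustrative paths.
# Files still being written by copy jobs typically carry extensions such as
# '_COPYING_' or 'tmp', which is what this pattern screens out.
import re

ignored_ext = ['_COPYING_', 'tmp']
pattern = re.compile(r"^.*\.(%s$)$" % '$|'.join(ignored_ext))

result = [
    {'path': '/data/part-00000'},
    {'path': '/data/part-00001._COPYING_'},
    {'path': '/data/part-00002.tmp'},
]
kept = [x for x in result if not pattern.match(x['path'])]
print([x['path'] for x in kept])  # ['/data/part-00000']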
# specific language governing permissions and limitations
# under the License.
import ssl

from airflow import configuration
from airflow.exceptions import AirflowConfigException, AirflowException
from xTool.exceptions import XToolConfigException
from xTool.utils.log.logging_mixin import LoggingMixin


def _broker_supports_visibility_timeout(url):
    return url.startswith("redis://") or url.startswith("sqs://")


log = LoggingMixin().log

broker_url = configuration.conf.get('celery', 'BROKER_URL')

# If a task is not acknowledged within the visibility timeout after being
# dispatched, it is redelivered to another worker.
broker_transport_options = configuration.conf.getsection(
    'celery_broker_transport_options')
if 'visibility_timeout' not in broker_transport_options:
    if _broker_supports_visibility_timeout(broker_url):
        broker_transport_options = {'visibility_timeout': 21600}

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],   # accepted content types
    'event_serializer': 'json',             # serialization format for event messages
    'worker_prefetch_multiplier': 1,        # each worker prefetches only one message at a time
import six
from six import iteritems
import warnings
from zope.deprecation import deprecated as _deprecated

from xTool.crypto.fernet import generate_fernet_key
from xTool.exceptions import AirflowConfigException
from xTool.utils.helpers import expand_env_var
from xTool.utils.configuration import read_config_file
from xTool.utils.configuration import XToolConfigParser
from xTool.utils.file import mkdir_p
from xTool.utils.log.logging_mixin import LoggingMixin

standard_library.install_aliases()

log = LoggingMixin().log

# Control how deprecation warnings are emitted.
# show Airflow's deprecation warnings
warnings.filterwarnings(
    action='default', category=DeprecationWarning, module='airflow')
warnings.filterwarnings(
    action='default', category=PendingDeprecationWarning, module='airflow')


def parameterized_config(template):
    """Render the template string using variables from the global and local scope.

    Generates a configuration from the provided template + variables defined in
    current scope
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from xTool.utils.dates import days_ago
from xTool.utils.log.logging_mixin import LoggingMixin

from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'airflow',
            'start_date': days_ago(2)}

    dag = DAG(dag_id='example_kubernetes_operator',
              default_args=args,
              schedule_interval=None)

    k = KubernetesPodOperator(namespace='default',
                              image="ubuntu:16.04",
                              cmds=["bash", "-cx"],
def list_py_file_paths(directory,
                       followlinks=True,
                       ignore_filename='.ignore',
                       file_ext='.py',
                       safe_mode=False,
                       safe_filters=(b'xTool', b'XTool')):
    """Recursively walk a directory and return the files that match the rules.

    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns = []
        # Walk the directory recursively, following symlinks.
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            # Read the ignore patterns, if an ignore file is present.
            ignore_file = [f for f in files if f == ignore_filename]
            if ignore_file:
                f = open(os.path.join(root, ignore_file[0]), 'r')
                patterns += [p.strip() for p in f.read().split('\n') if p]
                f.close()
            for f in files:
                try:
                    # Full path of the candidate file.
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    # Check the file extension.
                    mod_name, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Apply the ignore patterns.
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition: the file must mention all of the
                    # strings in ``safe_filters``.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in safe_filters])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
# specific language governing permissions and limitations
# under the License.
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from airflow import configuration
from xTool.utils.log.logging_mixin import LoggingMixin
from xTool.utils.module_loading import prepare_classpath
from xTool.utils.module_loading import make_module
from xTool.plugins_manager import XToolPlugin
from xTool.plugins_manager import import_plugins

log = LoggingMixin().log


class AirflowPlugin(XToolPlugin):
    pass


# Resolve the plugins folder.
plugins_folder = configuration.conf.get('core', 'plugins_folder')
if not plugins_folder:
    plugins_folder = configuration.conf.get('core', 'airflow_home') + '/plugins'

# Add the plugins folder to the system path.
prepare_classpath(plugins_folder)

# Import the plugins.
plugins = import_plugins(plugins_folder)
def list_py_file_paths(directory,
                       followlinks=True,
                       ignore_filename='.ignore',
                       file_ext='.py',
                       safe_mode=False,
                       safe_filters=(b'xTool', b'XTool')):
    """Recursively walk a directory and return the files that match the rules.

    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        # Walk the directory recursively, following symlinks.
        for root, dirs, files in os.walk(directory, followlinks=followlinks):
            patterns = patterns_by_dir.get(root, [])
            # Read the ignore patterns, if an ignore file is present.
            ignore_file = os.path.join(root, ignore_filename)
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns = patterns + [p for p in f.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d for d in dirs
                if not any(re.search(p, os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    # Full path of the candidate file.
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    # Check the file extension.
                    mod_name, file_extension = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_extension != file_ext and not zipfile.is_zipfile(
                            file_path):
                        continue
                    # Apply the ignore patterns.
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition: the file must mention all of the
                    # strings in ``safe_filters``.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as f:
                            content = f.read()
                            might_contain_dag = all(
                                [s in content for s in safe_filters])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    return file_paths
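# A hedged usage sketch for list_py_file_paths: scan a DAGs folder that contains
# an ignore file. The temporary folder layout and file names are illustrative;
# each line of the '.ignore' file is treated as a regex that excludes matching
# paths from the result.
import os
import tempfile

dags_folder = tempfile.mkdtemp()
open(os.path.join(dags_folder, 'etl_dag.py'), 'w').close()
open(os.path.join(dags_folder, 'scratch_dag.py'), 'w').close()
with open(os.path.join(dags_folder, '.ignore'), 'w') as f:
    f.write('scratch_.*\n')

print(list_py_file_paths(dags_folder, safe_mode=False))
# only the non-ignored .py file, e.g. ['/tmp/.../etl_dag.py']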
# under the License.
from hdfs import InsecureClient, HdfsError

from airflow import configuration
from airflow.exceptions import AirflowException
from airflow.hooks.base_hook import BaseHook
from xTool.utils.log.logging_mixin import LoggingMixin

_kerberos_security_mode = configuration.conf.get("core", "security") == "kerberos"
if _kerberos_security_mode:
    try:
        from hdfs.ext.kerberos import KerberosClient
    except ImportError:
        log = LoggingMixin().log
        log.error("Could not load the Kerberos extension for the WebHDFSHook.")
        raise


class AirflowWebHDFSHookException(AirflowException):
    pass


class WebHDFSHook(BaseHook):
    """
    Interact with HDFS. This class is a wrapper around the hdfscli library.
    """
    def __init__(self, webhdfs_conn_id='webhdfs_default', proxy_user=None):
        self.webhdfs_conn_id = webhdfs_conn_id
        self.proxy_user = proxy_user
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from builtins import object
import imp
import inspect
import os
import re
import sys

from xTool import configuration
from xTool.utils.log.logging_mixin import LoggingMixin

log = LoggingMixin().log


class XToolPluginException(Exception):
    pass


class XToolPlugin(object):
    name = None
    operators = []
    sensors = []
    hooks = []
    executors = []
    macros = []
    admin_views = []
    flask_blueprints = []
def get_connection(cls, conn_id):
    conn = random.choice(cls.get_connections(conn_id))
    if conn.host:
        log = LoggingMixin().log
        log.info("Using connection to: %s", conn.host)
    return conn