예제 #1
0
    def set_dirty(dag_id, session=None):
        """
        Flag every DagStat row that belongs to ``dag_id`` as dirty.

        ``DagStat.create`` is called first so that rows for states missing
        from the stats table are inserted. Any exception raised while locking,
        updating or committing is rolled back and logged rather than
        propagated.

        :param dag_id: the dag_id to mark dirty
        :param session: database session; it is used directly, so it must be
            a real session when this body runs (presumably injected by a
            decorator outside this block -- confirm)
        :return: None
        """
        # Insert the per-state rows that do not exist yet for this dag.
        DagStat.create(dag_id=dag_id, session=session)

        try:
            # Take row locks on all state rows of the given dag.
            locked_rows = (
                session.query(DagStat)
                .filter(DagStat.dag_id == dag_id)
                .with_for_update()
                .all()
            )

            # Set the dirty flag on each locked row.
            for row in locked_rows:
                row.dirty = True
            session.commit()
        except Exception as err:
            session.rollback()
            logger = LoggingMixin().log
            logger.warning("Could not update dag stats for %s", dag_id)
            logger.exception(err)
예제 #2
0
def load_login():
    """Choose the web authentication backend from config and import it.

    The imported module is stored in the module-level global ``login``.
    The backend defaults to ``airflow.default_login``. When
    ``[webserver] AUTHENTICATE`` is true, ``[webserver] auth_backend`` is
    used instead; if reading the config raises a config exception while
    authentication is enabled, the deprecated ``airflow_login`` module name
    is used.

    :raises AirflowException: if the backend module cannot be imported and
        ``[webserver] AUTHENTICATE`` is true
    """
    global login

    logger = LoggingMixin().log
    backend_name = 'airflow.default_login'

    # Resolve which authentication backend the webserver should use.
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            backend_name = conf.get('webserver', 'auth_backend')
    except (AirflowConfigException, XToolConfigException):
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            logger.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated*  behavior of importing airflow_login")
            backend_name = "airflow_login"

    # Import the authentication module.
    try:
        login = import_module(backend_name)
    except ImportError as import_err:
        logger.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            backend_name, import_err
        )
        # With authentication disabled the failed import is tolerated.
        if not conf.getboolean('webserver', 'AUTHENTICATE'):
            return
        raise AirflowException("Failed to import authentication backend")
예제 #3
0
    def update(dag_ids=None, dirty_only=True, session=None):
        """Recount dag runs per state for the selected dags and clear dirty.

        Updates the stats for dirty/out-of-sync dags. Any exception raised
        inside the body is rolled back and logged instead of propagated.

        :param dag_ids: dag_ids to be updated
        :type dag_ids: list
        :param dirty_only: only updated for marked dirty, defaults to True
        :type dirty_only: bool
        :param session: db session to use
        :type session: Session
        """
        try:
            stat_query = session.query(DagStat)
            if dag_ids:
                stat_query = stat_query.filter(DagStat.dag_id.in_(set(dag_ids)))
            # Only pick up rows flagged dirty.
            if dirty_only:
                stat_query = stat_query.filter(DagStat.dirty == True)  # noqa

            # Take row-level locks on the selected stat rows.
            locked_stats = stat_query.with_for_update().all()

            # Distinct dag ids that actually have matching stat rows.
            dag_ids = {stat.dag_id for stat in locked_stats}

            # avoid querying with an empty IN clause
            if not dag_ids:
                session.commit()
                return

            # Count dag runs per (dag, state), limited to runs whose
            # execution_date is newer than the configured number of days.
            history_days = configuration.getint('core', 'sql_query_history_days')
            begin_time = datetime.now() - timedelta(days=history_days)
            run_counts = (
                session.query(DagRun.dag_id, DagRun.state, func.count('*'))
                .filter(DagRun.dag_id.in_(dag_ids))
                .filter(DagRun.execution_date > begin_time)
                .group_by(DagRun.dag_id, DagRun.state)
            )
            counts = {}
            for run_dag_id, run_state, run_count in run_counts:
                counts[(run_dag_id, run_state)] = run_count

            # Store a count for every (dag, state) pair; pairs without any
            # matching dag run get 0. Each merged row is marked not dirty.
            for pair in set(itertools.product(dag_ids, State.dag_states)):
                dag_id, state = pair
                session.merge(
                    DagStat(
                        dag_id=dag_id,
                        state=state,
                        count=counts.get(pair, 0),
                        dirty=False,
                    )
                )

            session.commit()
        except Exception as err:
            session.rollback()
            logger = LoggingMixin().log
            logger.warning("Could not update dag stat table")
            logger.exception(err)
예제 #4
0
def _post_sendgrid_mail(mail_data):
    """Post ``mail_data`` to the SendGrid mail-send endpoint and log the result.

    The API key is read from the ``SENDGRID_API_KEY`` environment variable.
    A response with a 2xx status code is logged at INFO level; any other
    status code that reaches this code is logged at WARNING level.

    :param mail_data: request body passed to SendGrid; its ``'subject'`` and
        ``'personalizations'`` keys are read for the log messages
    """
    log = LoggingMixin().log
    sg = sendgrid.SendGridAPIClient(apikey=os.environ.get('SENDGRID_API_KEY'))
    response = sg.client.mail.send.post(request_body=mail_data)
    # 2xx status code.
    if 200 <= response.status_code < 300:
        # Pass values as logging arguments so formatting is deferred to the
        # logging framework, matching the other log calls in this file.
        log.info('Email with subject %s is successfully sent to recipients: %s',
                 mail_data['subject'], mail_data['personalizations'])
    else:
        log.warning('Failed to send out email with subject %s, status code: %s',
                    mail_data['subject'], response.status_code)
예제 #5
0
    def create(dag_id, session=None):
        """Insert the states missing from the stats table for one dag.

        Creates the missing states the stats table for the dag specified.
        Each missing state is merged and committed on its own; a failure for
        one state is rolled back and logged, and the loop continues with the
        remaining states.

        :param dag_id: dag id of the dag to create stats for
        :param session: database session
        :return: None
        """
        # States that already have a DagStat row for this dag.
        existing_rows = session.query(DagStat).filter(DagStat.dag_id == dag_id).all()
        known_states = set()
        for row in existing_rows:
            known_states.add(row.state)

        # Walk over every dag state that is not in the database yet.
        for missing_state in set(State.dag_states) - known_states:
            try:
                session.merge(DagStat(dag_id=dag_id, state=missing_state))
                session.commit()
            except Exception as err:
                session.rollback()
                logger = LoggingMixin().log
                logger.warning("Could not create stat record")
                logger.exception(err)
예제 #6
0
    def _to_timestamp(cls, col):
        """
        Convert a column of a dataframe to UNIX timestamps if applicable

        :param col:     A Series object representing a column of a dataframe.
        :return:        A new Series of timestamps (floats) that keeps the
                        index of the converted column, with NaN where a value
                        has no timestamp (e.g. NaT). If ``pd.to_datetime``
                        raises ``ValueError`` the original column is returned
                        untouched.
        """
        # try and convert the column to datetimes
        # the column MUST have a four digit year somewhere in the string
        # there should be a better way to do this,
        # but just letting pandas try and convert every column without a format
        # caused it to convert floats as well
        # For example, a column of integers
        # between 0 and 10 are turned into timestamps
        # if the column cannot be converted,
        # just return the original column untouched
        try:
            col = pd.to_datetime(col)
        except ValueError:
            log = LoggingMixin().log
            log.warning("Could not convert field to timestamps: %s", col.name)
            return col

        # Plain float NaN instead of ``pd.np.NaN``: the ``pandas.np`` alias
        # was deprecated in pandas 1.0 and removed in 2.0, which made the
        # fallback below raise AttributeError itself.
        nan = float('nan')

        # now convert the newly created datetimes into timestamps
        # we have to be careful here
        # because NaT cannot be converted to a timestamp
        # so we have to return NaN
        converted = []
        for value in col:
            try:
                converted.append(value.timestamp())
            except (ValueError, AttributeError):
                converted.append(nan)

        # return a new series that maintains the same index as the original
        return pd.Series(converted, index=col.index)
예제 #7
0
    broker_url,
    'broker_transport_options':
    broker_transport_options,
    # worker执行结果输出的存储介质
    'result_backend':
    configuration.conf.get('celery', 'RESULT_BACKEND'),
    # worker并发执行的数量
    'worker_concurrency':
    configuration.conf.getint('celery', 'WORKER_CONCURRENCY'),
}

# Read the SSL switch from the [celery] config section; when the option
# cannot be read, fall back to running without SSL and log a warning.
try:
    celery_ssl_active = configuration.conf.getboolean('celery', 'SSL_ACTIVE')
except (AirflowConfigException, XToolConfigException):
    celery_ssl_active = False
    log.warning("Celery Executor will run without SSL")

try:
    if celery_ssl_active:
        broker_use_ssl = {
            'keyfile': configuration.conf.get('celery', 'SSL_KEY'),
            'certfile': configuration.conf.get('celery', 'SSL_CERT'),
            'ca_certs': configuration.conf.get('celery', 'SSL_CACERT'),
            'cert_reqs': ssl.CERT_REQUIRED
        }
        DEFAULT_CELERY_CONFIG['broker_use_ssl'] = broker_use_ssl
except (AirflowConfigException, XToolConfigException) as e:
    raise AirflowException('AirflowConfigException: SSL_ACTIVE is True, '
                           'please ensure SSL_KEY, '
                           'SSL_CERT and SSL_CACERT are set')
except Exception as e: