Example #1
File: jobs.py Project: logiclord/airflow
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            logging.error("SIGINT (ctrl-c) received")
            sys.exit(1)
        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        logging.basicConfig(level=logging.DEBUG)
        logging.info("Starting the scheduler")

        dagbag = models.DagBag(self.subdir, sync_to_db=True)
        executor = dagbag.executor
        executor.start()
        i = 0
        while (not self.test_mode) or i < 1:
            try:
                self.prioritize_queued(executor=executor, dagbag=dagbag)
            except Exception as e:
                logging.exception(e)

            i += 1
            try:
                if i % self.refresh_dags_every == 0:
                    dagbag = models.DagBag(self.subdir, sync_to_db=True)
                else:
                    dagbag.collect_dags(only_if_updated=True)
            except:
                logging.error("Failed at reloading the dagbag")
                if statsd:
                    statsd.incr('dag_refresh_error', 1, 1)
                sleep(5)

            if dag_id:
                dags = [dagbag.dags[dag_id]]
            else:
                dags = [
                    dag for dag in dagbag.dags.values() if not dag.parent_dag]
            paused_dag_ids = dagbag.paused_dags()
            for dag in dags:
                logging.debug("Scheduling {}".format(dag.dag_id))
                dag = dagbag.get_dag(dag.dag_id)
                if not dag or (dag.dag_id in paused_dag_ids):
                    continue
                try:
                    self.process_dag(dag, executor)
                except Exception as e:
                    logging.exception(e)
            logging.debug(
                "Done queuing tasks, calling the executor's heartbeat")
            try:
                # We really just want the scheduler to never ever stop.
                executor.heartbeat()
                self.heartbeat()
            except Exception as e:
                logging.exception(e)
                logging.error("Tachycardia!")
        executor.end()
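
Every example on this page calls utils.pessimistic_connection_handling() before touching the metadata database, but none of them show the helper itself. As a point of reference, here is a minimal sketch of what such a pessimistic disconnect handler typically looks like (following the SQLAlchemy pooling docs); the event/Pool wiring below is an assumption for illustration, not code taken from any of these projects:

from sqlalchemy import event, exc
from sqlalchemy.pool import Pool


def pessimistic_connection_handling():
    """Discard dead pooled connections by pinging them on checkout."""
    @event.listens_for(Pool, "checkout")
    def ping_connection(dbapi_connection, connection_record, connection_proxy):
        # Cheap liveness probe; any failure means the connection has gone stale.
        cursor = dbapi_connection.cursor()
        try:
            cursor.execute("SELECT 1")
        except Exception:
            # DisconnectionError tells the pool to drop this connection and
            # transparently retry the checkout with a fresh one.
            raise exc.DisconnectionError()
        cursor.close()

Registered once at process start, as each example does, the listener then runs for every connection checked out of the pool, which is why the call appears before any scheduler loop or task execution begins.
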
Example #2
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            logging.error("SIGINT (ctrl-c) received")
            sys.exit(1)
        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        logging.basicConfig(level=logging.DEBUG)
        logging.info("Starting the scheduler")

        dagbag = models.DagBag(self.subdir, sync_to_db=True)
        executor = dagbag.executor
        executor.start()
        i = 0
        while (not self.test_mode) or i < 1:
            try:
                self.prioritize_queued(executor=executor, dagbag=dagbag)
            except Exception as e:
                logging.exception(e)

            i += 1
            try:
                if i % self.refresh_dags_every == 0:
                    dagbag = models.DagBag(self.subdir, sync_to_db=True)
                else:
                    dagbag.collect_dags(only_if_updated=True)
            except:
                logging.error("Failed at reloading the dagbag")
                if statsd:
                    statsd.incr('dag_refresh_error', 1, 1)
                sleep(5)

            if dag_id:
                dags = [dagbag.dags[dag_id]]
            else:
                dags = [
                    dag for dag in dagbag.dags.values() if not dag.parent_dag]
            paused_dag_ids = dagbag.paused_dags()
            for dag in dags:
                logging.debug("Scheduling {}".format(dag.dag_id))
                dag = dagbag.get_dag(dag.dag_id)
                if not dag or (dag.dag_id in paused_dag_ids):
                    continue
                try:
                    self.process_dag(dag, executor)
                except Exception as e:
                    logging.exception(e)
            logging.debug(
                "Done queuing tasks, calling the executor's heartbeat")
            try:
                # We really just want the scheduler to never ever stop.
                executor.heartbeat()
                self.heartbeat()
            except Exception as e:
                logging.exception(e)
                logging.error("Tachycardia!")
        executor.end()
Example #3
File: jobs.py Project: lyft/Airflow
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            logging.error("SIGINT (ctrl-c) received")
            sys.exit(1)

        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        # Sleep time (seconds) between master runs

        logging.basicConfig(level=logging.DEBUG)
        logging.info("Starting a master scheduler")

        # This should get new code
        dagbag = models.DagBag(self.subdir)
        executor = dagbag.executor
        executor.start()
        i = 0
        while (not self.test_mode) or i < 1:
            i += 1
            if i % self.refresh_dags_every == 0:
                dagbag.collect_dags(only_if_updated=False)
            else:
                dagbag.collect_dags(only_if_updated=True)
            if dag_id:
                dags = [dagbag.dags[dag_id]]
            else:
                dags = [
                    dag for dag in dagbag.dags.values() if not dag.parent_dag
                ]
            paused_dag_ids = dagbag.paused_dags()
            for dag in dags:
                if dag.dag_id in paused_dag_ids:
                    continue
                try:
                    self.process_dag(dag, executor)
                except Exception as e:
                    logging.exception(e)
            self.heartbeat()
        executor.end()
Example #4
File: jobs.py Project: lyft/Airflow
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            logging.error("SIGINT (ctrl-c) received")
            sys.exit(1)
        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        # Sleep time (seconds) between master runs

        logging.basicConfig(level=logging.DEBUG)
        logging.info("Starting a master scheduler")

        # This should get new code
        dagbag = models.DagBag(self.subdir)
        executor = dagbag.executor
        executor.start()
        i = 0
        while (not self.test_mode) or i < 1:
            i += 1
            if i % self.refresh_dags_every == 0:
                dagbag.collect_dags(only_if_updated=False)
            else:
                dagbag.collect_dags(only_if_updated=True)
            if dag_id:
                dags = [dagbag.dags[dag_id]]
            else:
                dags = [
                    dag for dag in dagbag.dags.values() if not dag.parent_dag]
            paused_dag_ids = dagbag.paused_dags()
            for dag in dags:
                if dag.dag_id in paused_dag_ids:
                    continue
                try:
                    self.process_dag(dag, executor)
                except Exception as e:
                    logging.exception(e)
            self.heartbeat()
        executor.end()
Example #5
File: cli.py Project: johnw424/airflow
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())
    subdir = None
    if args.subdir:
        subdir = args.subdir.replace(
            "DAGS_FOLDER", conf.get("core", "DAGS_FOLDER"))
        subdir = os.path.expanduser(subdir)
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found'.format(args.dag_id)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

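    # Three ways to execute the task instance: wrap it in a LocalTaskJob,
    # run it "raw" in-process, or queue it on the default executor
    # (optionally shipping a pickled copy of the DAG along).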
    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()
Example #6
File: cli.py Project: hoanghw/airflow
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = process_subdir(args.subdir)
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found'.format(args.dag_id)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

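    # After the run, push the local log file to S3 when an S3 log folder is configured.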
    if conf.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(log, conf.get('core', 'S3_LOG_FOLDER'))
        bucket, key = s3_log.lstrip('s3:/').split('/', 1)
        if os.path.exists(filename):

            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                s3_key.set_contents_from_string(new_log)
            except:
                print('Could not send logs to S3.')
Example #7
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            self.logger.error("SIGINT (ctrl-c) received")
            sys.exit(1)
        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        logging.basicConfig(level=logging.DEBUG)
        self.logger.info("Starting the scheduler")

        dagbag = models.DagBag(self.subdir, sync_to_db=True)
        executor = dagbag.executor
        executor.start()
        i = 0
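        # Main scheduling loop: iterate until num_runs is reached,
        # or forever when num_runs is falsy.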
        while not self.num_runs or self.num_runs > i:
            try:
                loop_start_dttm = datetime.now()
                try:
                    self.prioritize_queued(executor=executor, dagbag=dagbag)
                except Exception as e:
                    self.logger.exception(e)

                i += 1
                try:
                    if i % self.refresh_dags_every == 0:
                        dagbag = models.DagBag(self.subdir, sync_to_db=True)
                    else:
                        dagbag.collect_dags(only_if_updated=True)
                except:
                    self.logger.error("Failed at reloading the dagbag")
                    Stats.incr('dag_refresh_error', 1, 1)
                    sleep(5)

                if dag_id:
                    dags = [dagbag.dags[dag_id]]
                else:
                    dags = [
                        dag for dag in dagbag.dags.values()
                        if not dag.parent_dag]
                paused_dag_ids = dagbag.paused_dags()
                for dag in dags:
                    self.logger.debug("Scheduling {}".format(dag.dag_id))
                    dag = dagbag.get_dag(dag.dag_id)
                    if not dag or (dag.dag_id in paused_dag_ids):
                        continue
                    try:
                        self.schedule_dag(dag)
                        self.process_dag(dag, executor)
                        self.manage_slas(dag)
                    except Exception as e:
                        self.logger.exception(e)
                self.logger.info("Done queuing tasks, calling the executor's "
                                 "heartbeat")
                duration_sec = (datetime.now() - loop_start_dttm).total_seconds()
                self.logger.info("Loop took: {} seconds".format(duration_sec))
                try:
                    self.import_errors(dagbag)
                except Exception as e:
                    self.logger.exception(e)
                try:
                    dagbag.kill_zombies()
                except Exception as e:
                    self.logger.exception(e)
                try:
                    # We really just want the scheduler to never ever stop.
                    executor.heartbeat()
                    self.heartbeat()
                except Exception as e:
                    self.logger.exception(e)
                    self.logger.error("Tachycardia!")
            except Exception as deep_e:
                self.logger.exception(deep_e)
            finally:
                settings.Session.remove()
        executor.end()
Example #8
def run(args, dag=None):

    utils.pessimistic_connection_handling()
    if dag:
        args.dag_id = dag.dag_id

    # Setting up logging
    log_base = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
    task = dag.get_task(task_id=args.task_id)

    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=args.task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = conf.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and conf.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER conf key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your conf still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = conf.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions

        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3
        if remote_base.startswith('s3:/'):
            utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            utils.GCSLog().write(
                log,
                remote_log_location,
                append=True)
        # Other
        elif remote_base:
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
Example #9
    def run(self): 

        utils.pessimistic_connection_handling()
        # Setting up logging

        self.start_date = datetime.now()
        # pickle = DagPickle(self.dag)

        self.save()
        
        identifier = self.id 
        # logging.basicConfig(
        #     filename=filename,
        #     level=settings.LOGGING_LEVEL,
        #     format=settings.LOG_FORMAT)
        # print("Logging into: " + filename)

        # if we want to log to files. find way to do both!
        import logging 

        log_folder = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
        directory = log_folder + "/job_{self.id}".format(**locals())
        if not os.path.exists(directory):
            os.makedirs(directory)
        log_filename = "{directory}/job_{self.id}.run_log.txt".format(**locals())
        task_logger = logging.getLogger('job_{}_logger'.format(self.id))
        task_logger.setLevel(logging.DEBUG)
        logformat = logging.Formatter("%(asctime)s  %(name)s  %(levelname)s  %(message)s")
        channel = logging.FileHandler(log_filename, mode='a', encoding=None, delay=False)
        channel.setFormatter(logformat)
        task_logger.addHandler(channel)
        # old_logging = logging
        logging = task_logger
        logging.info("logging rerouted to {log_filename}".format(**locals()))


        # session = settings.Session()
        # logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        # dag_pickle = session.query(
        #     DagPickle).filter(DagPickle.id == args.pickle).first()
        # if not dag_pickle:
        #     raise AirflowException("Who hid the pickle!? [missing pickle]")

        # dag = dag_pickle.pickle

        # figure out how to traverse a dependency map 
        session = settings.Session()
        # executor = DEFAULT_EXECUTOR
        from airflow.executors import LocalBashExecutor
        executor = LocalBashExecutor(logging=logging)
        executor.start()
        self.task_instances = []

        for task in self.dag.tasks: 
            #make TI and kick off run 
            ti = models.TaskInstance(task, datetime.now())
            ti.dag_id = self.dag.dag_id
            ti.job_id = self.id 
            ti.state = State.QUEUED
            ti.save()
            self.task_instances.append(ti)
            logging.info("{} saved task instance {}".format(
                self.__class__.__name__, ti.id))


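        # Queue the saved task instances on the executor, highest
        # priority_weight first, heartbeating after each submission.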
        for ti in sorted(self.task_instances,
                         key=lambda x: x.priority_weight, reverse=True):
            logging.info(
                "{} queuing task {} with priority {} on executor {}".format(
                    self.__class__.__name__, ti.id, ti.priority_weight,
                    executor.__class__.__name__))
            # print("Sending to executor.")
            executor.queue_task_instance(ti)
            executor.heartbeat()

        executor.heartbeat()
        executor.end()
Example #10
File: app.py Project: mkroc22/Airflow
from airflow import utils
from airflow.www import utils as wwwutils

from airflow.www.login import login_manager
import flask_login
from flask_login import login_required

QUERY_LIMIT = 100000
CHART_LIMIT = 200000

AUTHENTICATE = conf.getboolean('core', 'AUTHENTICATE')
if AUTHENTICATE is False:
    login_required = lambda x: x

dagbag = models.DagBag(conf.get('core', 'DAGS_FOLDER'))
utils.pessimistic_connection_handling()

app = Flask(__name__)
app.config['SQLALCHEMY_POOL_RECYCLE'] = 3600

login_manager.init_app(app)
app.secret_key = 'airflowified'

cache = Cache(app=app,
              config={
                  'CACHE_TYPE': 'filesystem',
                  'CACHE_DIR': '/tmp'
              })

# Init for chartkick, the python wrapper for highcharts
ck = Blueprint('ck_page',
               __name__,
               static_folder=chartkick.js(),
               static_url_path='/static')
Example #11
File: cli.py Project: xavierp/airflow
def run(args):

    utils.pessimistic_connection_handling()

    # Setting up logging
    log_base = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(filename=filename,
                        level=settings.LOGGING_LEVEL,
                        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(
                args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = configuration.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and configuration.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER configuration key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your configuration still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = configuration.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions

        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3

        if remote_base.startswith('s3:/'):
            utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            utils.GCSLog().write(log, remote_log_location, append=True)
        # Other
        elif remote_base:
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
Example #12
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            logging.error("SIGINT (ctrl-c) received")
            sys.exit(1)
        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        # Sleep time (seconds) between master runs

        logging.basicConfig(level=logging.DEBUG)
        logging.info("Starting a master scheduler")

        session = settings.Session()
        TI = models.TaskInstance

        # This should get new code
        dagbag = models.DagBag(self.subdir)
        executor = dagbag.executor
        executor.start()
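        # Poll forever: for each DAG, fetch the most recent TaskInstance per
        # task and queue either its first run, a retry, or the next scheduled run.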
        while True:
            self.heartbeat()
            dagbag.collect_dags(only_if_updated=True)
            dags = [dagbag.dags[dag_id]] if dag_id else dagbag.dags.values()
            for dag in dags:

                logging.info(
                    "Getting latest instance "
                    "for all tasks in dag " + dag.dag_id)
                sq = session.query(
                    TI.task_id,
                    func.max(TI.execution_date).label('max_ti')
                ).filter(TI.dag_id == dag.dag_id).group_by(TI.task_id).subquery(
                    'sq')

                qry = session.query(TI).filter(
                    TI.dag_id == dag.dag_id,
                    TI.task_id == sq.c.task_id,
                    TI.execution_date == sq.c.max_ti,
                )
                latest_ti = qry.all()
                ti_dict = {ti.task_id: ti for ti in latest_ti}
                session.expunge_all()
                session.commit()

                for task in dag.tasks:
                    if task.task_id not in ti_dict:
                        # Brand new task, let's get started
                        ti = TI(task, task.start_date)
                        ti.refresh_from_db()
                        if ti.is_runnable():
                            logging.debug(
                                'First run for {ti}'.format(**locals()))
                            executor.queue_command(ti.key, ti.command())
                    else:
                        ti = ti_dict[task.task_id]
                        ti.task = task  # Hacky but worky
                        if ti.state == State.RUNNING:
                            continue  # Only one task at a time
                        elif ti.state == State.UP_FOR_RETRY:
                            # If task instance is up for retry, make sure
                            # the retry delay is met
                            if ti.is_runnable():
                                logging.debug('Queuing retry: ' + str(ti))
                                executor.queue_command(ti.key, ti.command())
                        else:
                            # Trying to run the next schedule
                            ti = TI(
                                task=task,
                                execution_date=ti.execution_date +
                                    task.schedule_interval
                            )
                            ti.refresh_from_db()
                            if ti.is_runnable():
                                logging.debug('Queuing next run: ' + str(ti))
                                executor.queue_command(ti.key, ti.command())
                    executor.heartbeat()
                session.close()
        executor.end()
Example #13
from airflow import utils
from airflow.www import utils as wwwutils

from airflow.www.login import login_manager
import flask_login
from flask_login import login_required

QUERY_LIMIT = 100000
CHART_LIMIT = 200000

AUTHENTICATE = conf.getboolean('core', 'AUTHENTICATE')
if AUTHENTICATE is False:
    login_required = lambda x: x

dagbag = models.DagBag(conf.get('core', 'DAGS_FOLDER'))
utils.pessimistic_connection_handling()

app = Flask(__name__)
app.config['SQLALCHEMY_POOL_RECYCLE'] = 3600

login_manager.init_app(app)
app.secret_key = 'airflowified'

cache = Cache(
    app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

# Init for chartkick, the python wrapper for highcharts
ck = Blueprint(
    'ck_page', __name__,
    static_folder=chartkick.js(), static_url_path='/static')
app.register_blueprint(ck, url_prefix='/ck')
Example #14
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())
    subdir = None
    if args.subdir:
        subdir = args.subdir.replace(
            "DAGS_FOLDER", conf.get("core", "DAGS_FOLDER"))
        subdir = os.path.expanduser(subdir)
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found'.format(args.dag_id)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()
Example #15
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            self.logger.error("SIGINT (ctrl-c) received")
            sys.exit(1)

        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        logging.basicConfig(level=logging.DEBUG)
        self.logger.info("Starting the scheduler")

        dagbag = models.DagBag(self.subdir, sync_to_db=True)
        executor = dagbag.executor
        executor.start()
        i = 0
        while not self.num_runs or self.num_runs > i:
            try:
                loop_start_dttm = datetime.now()
                try:
                    self.prioritize_queued(executor=executor, dagbag=dagbag)
                except Exception as e:
                    self.logger.exception(e)

                i += 1
                try:
                    if i % self.refresh_dags_every == 0:
                        dagbag = models.DagBag(self.subdir, sync_to_db=True)
                    else:
                        dagbag.collect_dags(only_if_updated=True)
                except:
                    self.logger.error("Failed at reloading the dagbag")
                    if statsd:
                        statsd.incr('dag_refresh_error', 1, 1)
                    sleep(5)

                if dag_id:
                    dags = [dagbag.dags[dag_id]]
                else:
                    dags = [
                        dag for dag in dagbag.dags.values()
                        if not dag.parent_dag
                    ]
                paused_dag_ids = dagbag.paused_dags()
                for dag in dags:
                    self.logger.debug("Scheduling {}".format(dag.dag_id))
                    dag = dagbag.get_dag(dag.dag_id)
                    if not dag or (dag.dag_id in paused_dag_ids):
                        continue
                    try:
                        self.schedule_dag(dag)
                        self.process_dag(dag, executor)
                        self.manage_slas(dag)
                    except Exception as e:
                        self.logger.exception(e)
                self.logger.info("Done queuing tasks, calling the executor's "
                                 "heartbeat")
                duration_sec = (datetime.now() -
                                loop_start_dttm).total_seconds()
                self.logger.info("Loop took: {} seconds".format(duration_sec))
                try:
                    self.import_errors(dagbag)
                except Exception as e:
                    self.logger.exception(e)
                try:
                    dagbag.kill_zombies()
                except Exception as e:
                    self.logger.exception(e)
                try:
                    # We really just want the scheduler to never ever stop.
                    executor.heartbeat()
                    self.heartbeat()
                except Exception as e:
                    self.logger.exception(e)
                    self.logger.error("Tachycardia!")
            except Exception as deep_e:
                self.logger.exception(deep_e)
        executor.end()
Example #16
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = None
    if args.subdir:
        subdir = args.subdir.replace(
            "DAGS_FOLDER", conf.get("core", "DAGS_FOLDER"))
        subdir = os.path.expanduser(subdir)
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found'.format(args.dag_id)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

    if conf.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(log, conf.get('core', 'S3_LOG_FOLDER'))
        bucket, key = s3_log.lstrip('s3:/').split('/', 1)
        if os.path.exists(filename):

            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                s3_key.set_contents_from_string(new_log)
            except:
                print('Could not send logs to S3.')
Example #17
    def _execute(self):
        dag_id = self.dag_id

        def signal_handler(signum, frame):
            logging.error("SIGINT (ctrl-c) received")
            sys.exit(1)

        signal.signal(signal.SIGINT, signal_handler)

        utils.pessimistic_connection_handling()

        # Sleep time (seconds) between master runs

        logging.basicConfig(level=logging.DEBUG)
        logging.info("Starting a master scheduler")

        session = settings.Session()
        TI = models.TaskInstance

        # This should get new code
        dagbag = models.DagBag(self.subdir)
        executor = dagbag.executor
        executor.start()
        while True:
            self.heartbeat()
            dagbag.collect_dags(only_if_updated=True)
            dags = [dagbag.dags[dag_id]] if dag_id else dagbag.dags.values()
            for dag in dags:

                logging.info("Getting latest instance "
                             "for all tasks in dag " + dag.dag_id)
                sq = session.query(
                    TI.task_id,
                    func.max(TI.execution_date).label('max_ti')).filter(
                        TI.dag_id == dag.dag_id).group_by(
                            TI.task_id).subquery('sq')

                qry = session.query(TI).filter(
                    TI.dag_id == dag.dag_id,
                    TI.task_id == sq.c.task_id,
                    TI.execution_date == sq.c.max_ti,
                )
                latest_ti = qry.all()
                ti_dict = {ti.task_id: ti for ti in latest_ti}
                session.expunge_all()
                session.commit()

                for task in dag.tasks:
                    if task.task_id not in ti_dict:
                        # Brand new task, let's get started
                        ti = TI(task, task.start_date)
                        ti.refresh_from_db()
                        if ti.is_runnable():
                            logging.debug(
                                'First run for {ti}'.format(**locals()))
                            executor.queue_command(ti.key, ti.command())
                    else:
                        ti = ti_dict[task.task_id]
                        ti.task = task  # Hacky but worky
                        if ti.state == State.RUNNING:
                            continue  # Only one task at a time
                        elif ti.state == State.UP_FOR_RETRY:
                            # If task instance is up for retry, make sure
                            # the retry delay is met
                            if ti.is_runnable():
                                logging.debug('Queuing retry: ' + str(ti))
                                executor.queue_command(ti.key, ti.command())
                        else:
                            # Trying to run the next schedule
                            ti = TI(task=task,
                                    execution_date=ti.execution_date +
                                    task.schedule_interval)
                            ti.refresh_from_db()
                            if ti.is_runnable():
                                logging.debug('Queuing next run: ' + str(ti))
                                executor.queue_command(ti.key, ti.command())
                    executor.heartbeat()
                session.close()
        executor.end()