Example #1
def _create_scheduler_job(args):
    job = SchedulerJob(
        subdir=process_subdir(args.subdir),
        num_runs=args.num_runs,
        do_pickle=args.do_pickle,
    )
    return job
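A minimal usage sketch (not from the original snippet; the values are placeholders): the helper only reads args.subdir, args.num_runs and args.do_pickle, so any namespace that carries those three attributes can drive it.

from argparse import Namespace

# Placeholder values; -1 follows Airflow's convention of "run until stopped".
args = Namespace(subdir='~/airflow/dags', num_runs=-1, do_pickle=False)
job = _create_scheduler_job(args)
job.run()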
Example #2
def scheduler(args):
    """Starts Airflow Scheduler"""
    print(settings.HEADER)
    job = SchedulerJob(subdir=process_subdir(args.subdir),
                       num_runs=args.num_runs,
                       do_pickle=args.do_pickle)

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "scheduler", args.pid, args.stdout, args.stderr, args.log_file)
        handle = setup_logging(log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            files_preserve=[handle],
            stdout=stdout,
            stderr=stderr,
        )
        with ctx:
            job.run()

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        signal.signal(signal.SIGQUIT, sigquit_handler)
        job.run()
Example #3
    def test_should_raise_exception_for_multiple_scheduler_on_one_host(self):
        scheduler_jobs = []
        with create_session() as session:
            for _ in range(3):
                scheduler_job = SchedulerJob()
                scheduler_job.state = State.RUNNING
                scheduler_job.hostname = 'HOSTNAME'
                session.add(scheduler_job)
                scheduler_jobs.append(scheduler_job)
            session.commit()
            scheduler_job.heartbeat()

        with pytest.raises(SystemExit,
                           match=r"Found 3 alive jobs. Expected only one."):
            jobs_command.check(
                self.parser.parse_args([
                    'jobs',
                    'check',
                    '--job-type',
                    'SchedulerJob',
                    '--limit',
                    '100',
                ]))
        for scheduler_job in scheduler_jobs:
            if scheduler_job.processor_agent:
                scheduler_job.processor_agent.end()
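Outside the test harness, the same check corresponds to running `airflow jobs check --job-type SchedulerJob --limit 100` against the metadata database: with three RUNNING SchedulerJob rows on one host and no `--allow-multiple` flag, the command exits with the "Found 3 alive jobs. Expected only one." error asserted above.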
Example #4
def main():
    try:
        from airflow import conf
    except ImportError:
        from airflow.configuration import conf

    from airflow.jobs.scheduler_job import SchedulerJob
    from airflow.models import DagBag

    from dbnd import dbnd_bootstrap
    from dbnd._core.log.logging_utils import create_file_handler
    from dbnd_airflow.executors.simple_executor import InProcessExecutor
    from test_dbnd_airflow.scenarios.scheduler_perf_experiment import (
        dag_folder,
        dag_id,
        log_scheduler,
    )

    dbnd_bootstrap()
    conf.set("core", "unit_test_mode", "True")

    logging.root.addHandler(create_file_handler(log_file=log_scheduler))

    dag_bag = DagBag(dag_folder=dag_folder)
    scheduler_job = SchedulerJob(
        dag_ids=[dag_id],
        subdir=dag_folder,
        do_pickle=False,
        num_runs=3,
        executor=InProcessExecutor(dag_bag=dag_bag),
    )

    scheduler_job.run()
Example #5
def scheduler(args):
    """Starts Airflow Scheduler"""
    skip_serve_logs = args.skip_serve_logs

    print(settings.HEADER)
    job = SchedulerJob(
        subdir=process_subdir(args.subdir),
        num_runs=args.num_runs,
        do_pickle=args.do_pickle,
    )

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "scheduler", args.pid, args.stdout, args.stderr, args.log_file)
        handle = setup_logging(log_file)
        with open(stdout, 'w+') as stdout_handle, open(stderr, 'w+') as stderr_handle:
            ctx = daemon.DaemonContext(
                pidfile=TimeoutPIDLockFile(pid, -1),
                files_preserve=[handle],
                stdout=stdout_handle,
                stderr=stderr_handle,
            )
            with ctx:
                sub_proc = _serve_logs(skip_serve_logs)
                job.run()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        signal.signal(signal.SIGQUIT, sigquit_handler)
        sub_proc = _serve_logs(skip_serve_logs)
        job.run()

    if sub_proc:
        sub_proc.terminate()
Example #6
def run_scheduler_job(with_db_reset=False) -> None:
    """
    Run the scheduler job, selectively resetting the db before creating a SchedulerJob instance
    """
    from airflow.jobs.scheduler_job import SchedulerJob

    if with_db_reset:
        reset_db()
    SchedulerJob(subdir=DAG_FOLDER, do_pickle=False, num_runs=3).run()
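Calling run_scheduler_job(with_db_reset=True) wipes the metadata database first and then lets the scheduler perform three scheduling loops over DAG_FOLDER; with the default with_db_reset=False it runs against whatever state is already in the database.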
Example #7
    def test_should_ignore_not_running_jobs(self):
        with create_session() as session:
            for _ in range(3):
                job = SchedulerJob()
                job.state = State.SHUTDOWN
                session.add(job)
            session.commit()
        # No alive jobs found.
        with pytest.raises(SystemExit, match=r"No alive jobs found."):
            jobs_command.check(self.parser.parse_args(['jobs', 'check']))
Example #8
    def test_should_report_success_for_one_working_scheduler(self):
        with create_session() as session:
            job = SchedulerJob()
            job.state = State.RUNNING
            session.add(job)
            session.commit()
            job.heartbeat()

        with contextlib.redirect_stdout(io.StringIO()) as temp_stdout:
            jobs_command.check(
                self.parser.parse_args(
                    ['jobs', 'check', '--job-type', 'SchedulerJob']))
        self.assertIn("Found one alive job.", temp_stdout.getvalue())
Example #9
    def test_should_ignore_not_running_jobs(self):
        scheduler_jobs = []
        with create_session() as session:
            for _ in range(3):
                scheduler_job = SchedulerJob()
                scheduler_job.state = State.SHUTDOWN
                session.add(scheduler_job)
                scheduler_jobs.append(scheduler_job)
            session.commit()
        # No alive jobs found.
        with pytest.raises(SystemExit, match=r"No alive jobs found."):
            jobs_command.check(self.parser.parse_args(['jobs', 'check']))
        for scheduler_job in scheduler_jobs:
            if scheduler_job.processor_agent:
                scheduler_job.processor_agent.end()
Example #10
    def test_should_report_success_for_ha_schedulers(self):
        with create_session() as session:
            for _ in range(3):
                job = SchedulerJob()
                job.state = State.RUNNING
                session.add(job)
            session.commit()
            job.heartbeat()

        with contextlib.redirect_stdout(io.StringIO()) as temp_stdout:
            jobs_command.check(
                self.parser.parse_args([
                    'jobs', 'check', '--job-type', 'SchedulerJob', '--limit',
                    '100', '--allow-multiple'
                ]))
        self.assertIn("Found 3 alive jobs.", temp_stdout.getvalue())
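The `--allow-multiple` flag is what distinguishes this HA case from Example #3: without it, three alive SchedulerJob rows trigger the "Found 3 alive jobs. Expected only one." SystemExit, whereas here the same situation is reported as a success.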
Example #11
def scheduler(args):
    """Starts Airflow Scheduler"""
    print(settings.HEADER)
    job = SchedulerJob(
        subdir=process_subdir(args.subdir),
        num_runs=args.num_runs,
        do_pickle=args.do_pickle,
    )
    scheduler_name = SchedulerFactory.get_scheduler_name()
    if scheduler_name == SchedulerFactory.DEFAULT_SCHEDULER:
        pass
    elif scheduler_name == SchedulerFactory.EVENT_BASED_SCHEDULER:
        job = EventBasedSchedulerJob(dag_directory=process_subdir(args.subdir),
                                     server_uri=args.server_uri)
    else:
        scheduler_class = SchedulerFactory.get_default_scheduler()
        job = scheduler_class()

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "scheduler", args.pid, args.stdout, args.stderr, args.log_file)
        handle = setup_logging(log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            files_preserve=[handle],
            stdout=stdout,
            stderr=stderr,
        )
        with ctx:
            job.run()

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        signal.signal(signal.SIGQUIT, sigquit_handler)
        job.run()
Example #12
def main(num_runs, repeat, pre_create_dag_runs, executor_class, dag_ids):
    """
    This script can be used to measure the total "scheduler overhead" of Airflow.

    By overhead we mean if the tasks executed instantly as soon as they are
    scheduled (i.e. they do nothing) how quickly could we schedule them.

    It will monitor the task completion of the Mock/stub executor (no actual
    tasks are run) and after the required number of dag runs for all the
    specified dags have completed all their tasks, it will cleanly shut down
    the scheduler.

    The dags you run with need to have an early enough start_date to create the
    desired number of runs.

    Care should be taken that other limits (DAG concurrency, pool size etc) are
    not the bottleneck. This script doesn't help you in that regard.

    It is recommended to repeat the test at least 3 times (`--repeat=3`, the
    default) so that you can get somewhat-accurate variance on the reported
    timing numbers, but this can be disabled for longer runs if needed.
    """

    # Turn on unit test mode so that we don't do any sleep() in the scheduler
    # loop - not needed on master, but this script can run against older
    # releases too!
    os.environ['AIRFLOW__CORE__UNIT_TEST_MODE'] = 'True'

    os.environ['AIRFLOW__CORE__DAG_CONCURRENCY'] = '500'

    # Set this so that dags can dynamically configure their end_date
    os.environ['AIRFLOW_BENCHMARK_MAX_DAG_RUNS'] = str(num_runs)
    os.environ['PERF_MAX_RUNS'] = str(num_runs)

    if pre_create_dag_runs:
        os.environ['AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE'] = 'False'

    from airflow.jobs.scheduler_job import SchedulerJob
    from airflow.models.dagbag import DagBag
    from airflow.utils import db

    dagbag = DagBag()

    dags = []

    with db.create_session() as session:
        pause_all_dags(session)
        for dag_id in dag_ids:
            dag = dagbag.get_dag(dag_id)
            dag.sync_to_db(session=session)
            dags.append(dag)
            reset_dag(dag, session)

            next_run_date = dag.normalize_schedule(dag.start_date
                                                   or min(t.start_date
                                                          for t in dag.tasks))

            for _ in range(num_runs - 1):
                next_run_date = dag.following_schedule(next_run_date)

            end_date = dag.end_date or dag.default_args.get('end_date')
            if end_date != next_run_date:
                message = (
                    f"DAG {dag_id} has incorrect end_date ({end_date}) for number of runs! "
                    f"It should be {next_run_date}"
                )
                sys.exit(message)

            if pre_create_dag_runs:
                create_dag_runs(dag, num_runs, session)

    ShortCircuitExecutor = get_executor_under_test(executor_class)

    executor = ShortCircuitExecutor(dag_ids_to_watch=dag_ids,
                                    num_runs=num_runs)
    scheduler_job = SchedulerJob(dag_ids=dag_ids,
                                 do_pickle=False,
                                 executor=executor)
    executor.scheduler_job = scheduler_job

    total_tasks = sum(len(dag.tasks) for dag in dags)

    if 'PYSPY' in os.environ:
        pid = str(os.getpid())
        filename = os.environ.get('PYSPY_O', 'flame-' + pid + '.html')
        os.spawnlp(os.P_NOWAIT, 'sudo', 'sudo', 'py-spy', 'record', '-o',
                   filename, '-p', pid, '--idle')

    times = []

    # Need a lambda to refer to the _latest_ value for scheduler_job, not just
    # the initial one
    code_to_test = lambda: scheduler_job.run()  # pylint: disable=unnecessary-lambda

    for count in range(repeat):
        gc.disable()
        start = time.perf_counter()

        code_to_test()
        times.append(time.perf_counter() - start)
        gc.enable()
        print("Run %d time: %.5f" % (count + 1, times[-1]))

        if count + 1 != repeat:
            with db.create_session() as session:
                for dag in dags:
                    reset_dag(dag, session)

            executor.reset(dag_ids)
            scheduler_job = SchedulerJob(dag_ids=dag_ids,
                                         do_pickle=False,
                                         executor=executor)
            executor.scheduler_job = scheduler_job

    print()
    print()
    msg = "Time for %d dag runs of %d dags with %d total tasks: %.4fs"

    if len(times) > 1:
        print((msg + " (±%.3fs)") %
              (num_runs, len(dags), total_tasks, statistics.mean(times),
               statistics.stdev(times)))
    else:
        print(msg % (num_runs, len(dags), total_tasks, times[0]))

    print()
    print()
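A hedged invocation sketch (the argument values are illustrative, not from the source): the benchmark is driven entirely through main()'s parameters, with executor_class resolved by get_executor_under_test() and dag_ids selecting which DAGs to watch.

main(
    num_runs=10,                    # dag runs each DAG must complete
    repeat=3,                       # repeat the measurement to get mean and stdev
    pre_create_dag_runs=True,       # pre-create the runs and disable use_job_schedule
    executor_class='MockExecutor',  # assumed name, passed to get_executor_under_test()
    dag_ids=['example_dag'],        # the DAGs need an early enough start_date
)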