def dag_show(args):
    """Displays a DAG or saves its graphic representation to a file"""
    dag = get_dag(args.subdir, args.dag_id)
    dot = render_dag(dag)
    filename = args.save
    imgcat = args.imgcat
    if filename and imgcat:
        raise SystemExit(
            "Option --save and --imgcat are mutually exclusive. "
            "Please remove one option to execute the command.",
        )
    elif filename:
        _save_dot_to_file(dot, filename)
    elif imgcat:
        _display_dot_via_imgcat(dot)
    else:
        print(dot.source)

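# Usage sketch (not part of the handler above): `dag_show` backs the
# `airflow dags show` subcommand, so a typical invocation looks roughly like the
# lines below. The DAG id is illustrative.
#
#   airflow dags show example_bash_operator                    # print DOT source
#   airflow dags show example_bash_operator --save graph.png   # render to a file
#   airflow dags show example_bash_operator --imgcat           # display inline via imgcat
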
def dag_state(args):
    """
    Returns the state (and conf, if it exists) of a DagRun at the command line.

    >>> airflow dags state tutorial 2015-01-01T00:00:00.000000
    running
    >>> airflow dags state a_dag_with_conf_passed 2015-01-01T00:00:00.000000
    failed, {"name": "bob", "age": "42"}
    """
    if args.subdir:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        dag = get_dag_by_file_location(args.dag_id)
    dr = DagRun.find(dag.dag_id, execution_date=args.execution_date)
    out = dr[0].state if dr else None
    conf_out = ''
    if out and dr[0].conf:
        conf_out = ', ' + json.dumps(dr[0].conf)
    print(str(out) + conf_out)

def task_test(args, dag=None):
    """Tests task for a given dag_id"""
    # We want to log output from operators etc to show up here. Normally
    # airflow.task would redirect to a file, but here we want it to propagate
    # up to the normal airflow handler.
    handlers = logging.getLogger('airflow.task').handlers
    already_has_stream_handler = False
    for handler in handlers:
        already_has_stream_handler = isinstance(handler, logging.StreamHandler)
        if already_has_stream_handler:
            break
    if not already_has_stream_handler:
        logging.getLogger('airflow.task').propagate = True

    env_vars = {'AIRFLOW_TEST_MODE': 'True'}
    if args.env_vars:
        env_vars.update(args.env_vars)
    os.environ.update(env_vars)

    dag = dag or get_dag(args.subdir, args.dag_id)

    task = dag.get_task(task_id=args.task_id)
    # Add CLI provided task_params to task.params
    if args.task_params:
        passed_in_params = json.loads(args.task_params)
        task.params.update(passed_in_params)
    ti = TaskInstance(task, args.execution_date)

    try:
        if args.dry_run:
            ti.dry_run()
        else:
            ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True)
    except Exception:  # pylint: disable=broad-except
        if args.post_mortem:
            debugger = _guess_debugger()
            debugger.post_mortem()
        else:
            raise
    finally:
        if not already_has_stream_handler:
            # Make sure to reset back to normal. When run for CLI this doesn't
            # matter, but it does for test suite
            logging.getLogger('airflow.task').propagate = False

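# Usage sketch: `task_test` backs `airflow tasks test`, which runs a single task
# in test mode (dependencies and existing TI state are ignored, nothing is recorded
# in the metadata database). The dag/task ids and JSON params below are illustrative.
#
#   airflow tasks test example_python_operator print_the_context 2021-01-01
#   airflow tasks test my_dag my_task 2021-01-01 --task-params '{"foo": "bar"}'
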
def task_run(args, dag=None):
    """Runs a single task instance"""
    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path) as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    settings.MASK_SECRETS_IN_LOGS = True

    # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if dag and args.pickle:
        raise AirflowException("You cannot use the --pickle option when using DAG.cli() method.")
    elif args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = _get_ti(task, args.execution_date_or_run_id)
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()

    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        with _capture_task_logs(ti):
            _run_task_by_selected_method(args, dag, ti)

def generate_pod_yaml(args):
    """Generates yaml files for each task in the DAG. Used for testing output of KubernetesExecutor"""
    from kubernetes.client.api_client import ApiClient

    from airflow.executors.kubernetes_executor import AirflowKubernetesScheduler, KubeConfig
    from airflow.kubernetes import pod_generator
    from airflow.kubernetes.pod_generator import PodGenerator
    from airflow.settings import pod_mutation_hook

    execution_date = args.execution_date
    dag = get_dag(subdir=args.subdir, dag_id=args.dag_id)
    yaml_output_path = args.output_path
    kube_config = KubeConfig()
    for task in dag.tasks:
        ti = TaskInstance(task, execution_date)
        pod = PodGenerator.construct_pod(
            dag_id=args.dag_id,
            task_id=ti.task_id,
            pod_id=AirflowKubernetesScheduler._create_pod_id(  # pylint: disable=W0212
                args.dag_id, ti.task_id
            ),
            try_number=ti.try_number,
            kube_image=kube_config.kube_image,
            date=ti.execution_date,
            command=ti.command_as_list(),
            pod_override_object=PodGenerator.from_obj(ti.executor_config),
            worker_uuid="worker-config",
            namespace=kube_config.executor_namespace,
            base_worker_pod=PodGenerator.deserialize_model_file(kube_config.pod_template_file),
        )
        pod_mutation_hook(pod)
        api_client = ApiClient()
        date_string = pod_generator.datetime_to_label_safe_datestring(execution_date)
        yaml_file_name = f"{args.dag_id}_{ti.task_id}_{date_string}.yml"
        os.makedirs(os.path.dirname(yaml_output_path + "/airflow_yaml_output/"), exist_ok=True)
        with open(yaml_output_path + "/airflow_yaml_output/" + yaml_file_name, "w") as output:
            sanitized_pod = api_client.sanitize_for_serialization(pod)
            output.write(yaml.dump(sanitized_pod))
    print(f"YAML output can be found at {yaml_output_path}/airflow_yaml_output/")

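# Usage sketch: this handler is exposed through the Airflow 2.x kubernetes CLI group,
# roughly as shown below; the exact subcommand name and flags may differ between
# versions, and the dag id and output path are illustrative.
#
#   airflow kubernetes generate-dag-yaml my_dag 2021-01-01 --output-path /tmp/pods
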
def task_render(args):
    """Renders and displays templated fields for a given task"""
    dag = get_dag(args.subdir, args.dag_id)
    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.render_templates()
    for attr in task.__class__.template_fields:
        print(
            textwrap.dedent(
                """\
                # ----------------------------------------------------------
                # property: {}
                # ----------------------------------------------------------
                {}
                """.format(
                    attr, getattr(task, attr)
                )
            )
        )

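# Usage sketch: `task_render` backs `airflow tasks render`, which prints each templated
# field of the task after Jinja rendering for the given execution date. Identifiers
# are illustrative.
#
#   airflow tasks render tutorial templated 2021-01-01
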
def dag_next_execution(args):
    """
    Returns the next execution datetime of a DAG at the command line.

    >>> airflow dags next-execution tutorial
    2018-08-31 10:38:00
    """
    dag = get_dag(args.subdir, args.dag_id)

    if dag.get_is_paused():
        print("[INFO] Please be reminded this DAG is PAUSED now.", file=sys.stderr)

    with create_session() as session:
        max_date_subq = (
            session.query(func.max(DagRun.execution_date).label("max_date"))
            .filter(DagRun.dag_id == dag.dag_id)
            .subquery()
        )
        max_date_run: Optional[DagRun] = (
            session.query(DagRun)
            .filter(DagRun.dag_id == dag.dag_id, DagRun.execution_date == max_date_subq.c.max_date)
            .one_or_none()
        )

        if max_date_run is None:
            print("[WARN] Only applicable when there is execution record found for the DAG.", file=sys.stderr)
            print(None)
            return

    next_info = dag.next_dagrun_info(dag.get_run_data_interval(max_date_run), restricted=False)
    if next_info is None:
        print(
            "[WARN] No following schedule can be found. "
            "This DAG may have schedule interval '@once' or `None`.",
            file=sys.stderr,
        )
        print(None)
        return

    print(next_info.logical_date.isoformat())
    for _ in range(1, args.num_executions):
        next_info = dag.next_dagrun_info(next_info.data_interval, restricted=False)
        print(next_info.logical_date.isoformat())

def test_local_run(self):
    args = self.parser.parse_args([
        'tasks',
        'run',
        'example_python_operator',
        'print_the_context',
        '2018-04-27T08:39:51.298439+00:00',
        '--interactive',
        '--subdir',
        '/root/dags/example_python_operator.py',
    ])

    dag = get_dag(args.subdir, args.dag_id)
    reset(dag.dag_id)

    task_command.task_run(args)
    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.refresh_from_db()
    state = ti.current_state()
    assert state == State.SUCCESS

def dag_next_execution(args):
    """
    Returns the next execution datetime of a DAG at the command line.

    >>> airflow dags next_execution tutorial
    2018-08-31 10:38:00
    """
    dag = get_dag(args.subdir, args.dag_id)

    if dag.is_paused:
        print("[INFO] Please be reminded this DAG is PAUSED now.")

    if dag.latest_execution_date:
        next_execution_dttm = dag.following_schedule(dag.latest_execution_date)

        if next_execution_dttm is None:
            print("[WARN] No following schedule can be found. "
                  + "This DAG may have schedule interval '@once' or `None`.")

        print(next_execution_dttm)
    else:
        print("[WARN] Only applicable when there is execution record found for the DAG.")
        print(None)

def dag_test(args, session=None):
    """Execute one single DagRun for a given DAG and execution date, using the DebugExecutor."""
    dag = get_dag(subdir=args.subdir, dag_id=args.dag_id)
    dag.clear(start_date=args.execution_date, end_date=args.execution_date, dag_run_state=False)
    try:
        dag.run(
            executor=DebugExecutor(),
            start_date=args.execution_date,
            end_date=args.execution_date,
            # Always run the DAG at least once even if no logical runs are
            # available. This does not make a lot of sense, but Airflow has
            # been doing this prior to 2.2 so we keep compatibility.
            run_at_least_once=True,
        )
    except BackfillUnfinished as e:
        print(str(e))

    show_dagrun = args.show_dagrun
    imgcat = args.imgcat_dagrun
    filename = args.save_dagrun
    if show_dagrun or imgcat or filename:
        tis = (
            session.query(TaskInstance)
            .filter(
                TaskInstance.dag_id == args.dag_id,
                TaskInstance.execution_date == args.execution_date,
            )
            .all()
        )

        dot_graph = render_dag(dag, tis=tis)
        print()
        if filename:
            _save_dot_to_file(dot_graph, filename)
        if imgcat:
            _display_dot_via_imgcat(dot_graph)
        if show_dagrun:
            print(dot_graph.source)

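# Usage sketch: `dag_test` backs `airflow dags test`, which executes one DagRun with
# the DebugExecutor; the graph options handled above map to the CLI flags shown below
# (the dag id and output file are illustrative).
#
#   airflow dags test example_bash_operator 2021-01-01
#   airflow dags test example_bash_operator 2021-01-01 --show-dagrun
#   airflow dags test example_bash_operator 2021-01-01 --save-dagrun out.png
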
def task_test(args, dag=None):
    """Tests task for a given dag_id"""
    # We want log output from operators etc to show up here. Normally
    # airflow.task would redirect to a file, but here we want it to propagate
    # up to the normal airflow handler.
    handlers = logging.getLogger('airflow.task').handlers
    already_has_stream_handler = False
    for handler in handlers:
        already_has_stream_handler = isinstance(handler, logging.StreamHandler)
        if already_has_stream_handler:
            break
    if not already_has_stream_handler:
        logging.getLogger('airflow.task').propagate = True

    dag = dag or get_dag(args)

    task = dag.get_task(task_id=args.task_id)
    # Add CLI provided task_params to task.params
    if args.task_params:
        passed_in_params = json.loads(args.task_params)
        task.params.update(passed_in_params)
    ti = TaskInstance(task, args.execution_date)

    try:
        if args.dry_run:
            ti.dry_run()
        else:
            ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True)
    except Exception:  # pylint: disable=broad-except
        if args.post_mortem:
            try:
                debugger = importlib.import_module("ipdb")
            except ImportError:
                debugger = importlib.import_module("pdb")
            debugger.post_mortem()
        else:
            raise

def task_failed_deps(args):
    """
    Returns the unmet dependencies for a task instance from the perspective of the
    scheduler (i.e. why a task instance doesn't get scheduled and then queued by the
    scheduler, and then run by an executor).

    >>> airflow tasks failed-deps tutorial sleep 2015-01-01
    Task instance dependencies not met:
    Dagrun Running: Task instance's dagrun did not exist: Unknown reason
    Trigger Rule: Task's trigger rule 'all_success' requires all upstream tasks
    to have succeeded, but found 1 non-success(es).
    """
    dag = get_dag(args.subdir, args.dag_id)
    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)

    dep_context = DepContext(deps=SCHEDULER_QUEUED_DEPS)
    failed_deps = list(ti.get_failed_dep_statuses(dep_context=dep_context))
    # TODO, Do we want to print or log this
    if failed_deps:
        print("Task instance dependencies not met:")
        for dep in failed_deps:
            print("{}: {}".format(dep.dep_name, dep.reason))
    else:
        print("Task instance dependencies are all met.")

def dag_backfill(args, dag=None):
    """Creates backfill job or dry run for a DAG"""
    logging.basicConfig(level=settings.LOGGING_LEVEL, format=settings.SIMPLE_LOG_FORMAT)

    signal.signal(signal.SIGTERM, sigint_handler)

    import warnings
    warnings.warn(
        '--ignore-first-depends-on-past is deprecated as the value is always set to True',
        category=PendingDeprecationWarning,
    )

    if args.ignore_first_depends_on_past is False:
        args.ignore_first_depends_on_past = True

    dag = dag or get_dag(args.subdir, args.dag_id)

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    # If only one date is passed, using same as start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.partial_subset(
            task_ids_or_regex=args.task_regex, include_upstream=not args.ignore_dependencies
        )

    run_conf = None
    if args.conf:
        run_conf = json.loads(args.conf)

    if args.dry_run:
        print("Dry run of DAG {0} on {1}".format(args.dag_id, args.start_date))
        for task in dag.tasks:
            print("Task {0}".format(task.task_id))
            ti = TaskInstance(task, args.start_date)
            ti.dry_run()
    else:
        if args.reset_dagruns:
            DAG.clear_dags(
                [dag],
                start_date=args.start_date,
                end_date=args.end_date,
                confirm_prompt=not args.yes,
                include_subdags=True,
                dag_run_state=State.NONE,
            )

        dag.run(
            start_date=args.start_date,
            end_date=args.end_date,
            mark_success=args.mark_success,
            local=args.local,
            donot_pickle=(args.donot_pickle or conf.getboolean('core', 'donot_pickle')),
            ignore_first_depends_on_past=args.ignore_first_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            pool=args.pool,
            delay_on_limit_secs=args.delay_on_limit,
            verbose=args.verbose,
            conf=run_conf,
            rerun_failed_tasks=args.rerun_failed_tasks,
            run_backwards=args.run_backwards,
        )

def task_run(args, dag=None):
    """Runs a single task instance"""

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if dag and args.pickle:
        raise AirflowException("You cannot use the --pickle option when using DAG.cli() method.")
    elif args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()

    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        if settings.DONOT_MODIFY_HANDLERS:
            with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                    redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
                _run_task_by_selected_method(args, dag, ti)
        else:
            # Get all the Handlers from 'airflow.task' logger
            # Add these handlers to the root logger so that we can get logs from
            # any custom loggers defined in the DAG
            airflow_logger_handlers = logging.getLogger('airflow.task').handlers
            root_logger = logging.getLogger()
            root_logger_handlers = root_logger.handlers

            # Remove all handlers from Root Logger to avoid duplicate logs
            for handler in root_logger_handlers:
                root_logger.removeHandler(handler)

            for handler in airflow_logger_handlers:
                root_logger.addHandler(handler)
            root_logger.setLevel(logging.getLogger('airflow.task').level)

            with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                    redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
                _run_task_by_selected_method(args, dag, ti)

            # We need to restore the handlers to the loggers as celery worker process
            # can call this command multiple times,
            # so if we don't reset this then logs from next task would go to the wrong place
            for handler in airflow_logger_handlers:
                root_logger.removeHandler(handler)
            for handler in root_logger_handlers:
                root_logger.addHandler(handler)

    logging.shutdown()

def _start_by_fork(self):
    pid = os.fork()
    if pid:
        self.log.info("Started process %d to run task", pid)
        return psutil.Process(pid)
    else:
        # Start a new process group
        os.setpgid(0, 0)
        import signal

        signal.signal(signal.SIGINT, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

        from airflow import settings
        from airflow.cli.cli_parser import get_parser
        from airflow.sentry import Sentry
        from airflow.utils.cli import get_dag

        # Force a new SQLAlchemy session. We can't share open DB handles
        # between processes. The cli code will re-create this as part of its
        # normal startup
        settings.engine.pool.dispose()
        settings.engine.dispose()

        parser = get_parser()
        # [1:] - remove "airflow" from the start of the command
        args = parser.parse_args(self._command[1:])

        # We prefer the job_id passed on the command-line because at this time, the
        # task instance may not have been updated.
        job_id = getattr(args, "job_id", self._task_instance.job_id)
        self.log.info('Running: %s', self._command)
        self.log.info('Job %s: Subtask %s', job_id, self._task_instance.task_id)

        proc_title = "airflow task runner: {0.dag_id} {0.task_id} {0.execution_date_or_run_id}"
        if job_id is not None:
            proc_title += " {0.job_id}"
        setproctitle(proc_title.format(args))

        return_code = 0
        try:
            # parse dag file since `airflow tasks run --local` does not parse dag file
            dag = get_dag(args.subdir, args.dag_id)
            args.func(args, dag=dag)
            return_code = 0
        except Exception as exc:
            return_code = 1
            self.log.error(
                "Failed to execute job %s for task %s (%s; %r)",
                job_id,
                self._task_instance.task_id,
                exc,
                os.getpid(),
            )
        except SystemExit as sys_ex:
            # Someone mistakenly called sys.exit() in the fork. You should not run sys.exit()
            # in the fork because it can mistakenly execute atexit handlers that were set by
            # the parent process before the fork happened
            return_code = sys_ex.code
        except BaseException:
            # While we want to handle BaseExceptions here as well, we do not want to log them
            # (this is the default behaviour anyway). The return code is set to 2 to indicate
            # that this has happened.
            return_code = 2
        finally:
            try:
                # Explicitly flush any pending exception to Sentry and logging if enabled
                Sentry.flush()
                logging.shutdown()
            except BaseException:
                # Also make sure to silently ignore ALL POSSIBLE exceptions thrown in the
                # flush/shutdown, otherwise os._exit() might never be called. We could have
                # used a bare `except:` but `except BaseException` is more explicit (and
                # keeps linters happy).
                pass
        # We run os._exit() making sure it is not run within the `finally` clause.
        # We cannot run os._exit() in the finally clause, because during finally clause
        # processing, the handled exception is held in memory, together with the stack trace
        # and possibly some objects that might need to be finalized. Running os._exit() inside
        # the `finally` clause might cause effects similar to
        # https://github.com/apache/airflow/issues/22404, where a temporary file had not been
        # deleted at os._exit()
        os._exit(return_code)

def task_run(args, dag=None):
    """Run a single task instance.

    Note that there must be at least one DagRun for this to start,
    i.e. it must have been scheduled and/or triggered previously.
    Alternatively, if you just need to run it for testing then use
    "airflow tasks test ..." command instead.
    """
    # Load custom airflow config

    if args.local and args.raw:
        raise AirflowException(
            "Option --raw and --local are mutually exclusive. "
            "Please remove one option to execute the command."
        )

    if args.raw:
        unsupported_options = [o for o in RAW_TASK_UNSUPPORTED_OPTION if getattr(args, o)]

        if unsupported_options:
            unsupported_raw_task_flags = ', '.join(f'--{o}' for o in RAW_TASK_UNSUPPORTED_OPTION)
            unsupported_flags = ', '.join(f'--{o}' for o in unsupported_options)
            raise AirflowException(
                "Option --raw does not work with some of the other options on this command. "
                "You can't use --raw option and the following options: "
                f"{unsupported_raw_task_flags}. "
                f"You provided the option {unsupported_flags}. "
                "Delete it to execute the command."
            )

    if dag and args.pickle:
        raise AirflowException("You cannot use the --pickle option when using DAG.cli() method.")

    if args.cfg_path:
        with open(args.cfg_path) as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    settings.MASK_SECRETS_IN_LOGS = True

    # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = _get_ti(task, args.execution_date_or_run_id)
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()

    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        with _capture_task_logs(ti):
            _run_task_by_selected_method(args, dag, ti)

def dag_backfill(args, dag=None):
    """Creates backfill job or dry run for a DAG"""
    logging.basicConfig(level=settings.LOGGING_LEVEL, format=settings.SIMPLE_LOG_FORMAT)

    signal.signal(signal.SIGTERM, sigint_handler)

    import warnings
    warnings.warn(
        '--ignore-first-depends-on-past is deprecated as the value is always set to True',
        category=PendingDeprecationWarning,
    )

    if args.ignore_first_depends_on_past is False:
        args.ignore_first_depends_on_past = True

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    dag = dag or get_dag(args.subdir, args.dag_id)

    # If only one date is passed, using same as start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.partial_subset(
            task_ids_or_regex=args.task_regex, include_upstream=not args.ignore_dependencies
        )
        if not dag.task_dict:
            raise AirflowException(
                f"There are no tasks that match '{args.task_regex}' regex. Nothing to run, exiting..."
            )

    run_conf = None
    if args.conf:
        run_conf = json.loads(args.conf)

    if args.dry_run:
        print(f"Dry run of DAG {args.dag_id} on {args.start_date}")
        dr = DagRun(dag.dag_id, execution_date=args.start_date)
        for task in dag.tasks:
            print(f"Task {task.task_id}")
            ti = TaskInstance(task, run_id=None)
            ti.dag_run = dr
            ti.dry_run()
    else:
        if args.reset_dagruns:
            DAG.clear_dags(
                [dag],
                start_date=args.start_date,
                end_date=args.end_date,
                confirm_prompt=not args.yes,
                include_subdags=True,
                dag_run_state=DagRunState.QUEUED,
            )

        try:
            dag.run(
                start_date=args.start_date,
                end_date=args.end_date,
                mark_success=args.mark_success,
                local=args.local,
                donot_pickle=(args.donot_pickle or conf.getboolean('core', 'donot_pickle')),
                ignore_first_depends_on_past=args.ignore_first_depends_on_past,
                ignore_task_deps=args.ignore_dependencies,
                pool=args.pool,
                delay_on_limit_secs=args.delay_on_limit,
                verbose=args.verbose,
                conf=run_conf,
                rerun_failed_tasks=args.rerun_failed_tasks,
                run_backwards=args.run_backwards,
                continue_on_failures=args.continue_on_failures,
            )
        except ValueError as vr:
            print(str(vr))
            sys.exit(1)

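# Usage sketch: `dag_backfill` backs `airflow dags backfill`; a typical run and a dry
# run look roughly like the lines below (the dag id and dates are illustrative).
#
#   airflow dags backfill my_dag --start-date 2021-01-01 --end-date 2021-01-07
#   airflow dags backfill my_dag --start-date 2021-01-01 --dry-run
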
def dag_test(args):
    """Execute one single DagRun for a given DAG and execution date, using the DebugExecutor."""
    dag = get_dag(subdir=args.subdir, dag_id=args.dag_id)
    dag.clear(start_date=args.execution_date, end_date=args.execution_date, reset_dag_runs=True)
    dag.run(
        executor=DebugExecutor(),
        start_date=args.execution_date,
        end_date=args.execution_date,
    )