def _capture_task_logs(ti): """Manage logging context for a task run - Replace the root logger configuration with the airflow.task configuration so we can capture logs from any custom loggers used in the task. - Redirect stdout and stderr to the task instance log, as INFO and WARNING level messages, respectively. """ modify = not settings.DONOT_MODIFY_HANDLERS if modify: root_logger, task_logger = logging.getLogger(), logging.getLogger( 'airflow.task') orig_level = root_logger.level root_logger.setLevel(task_logger.level) orig_handlers = root_logger.handlers.copy() root_logger.handlers[:] = task_logger.handlers try: info_writer = StreamLogWriter(ti.log, logging.INFO) warning_writer = StreamLogWriter(ti.log, logging.WARNING) with redirect_stdout(info_writer), redirect_stderr(warning_writer): yield finally: if modify: # Restore the root logger to its original state. root_logger.setLevel(orig_level) root_logger.handlers[:] = orig_handlers
def task_run(args, dag=None): """Runs a single task instance""" if dag: args.dag_id = dag.dag_id # Load custom airflow config if args.cfg_path: with open(args.cfg_path, 'r') as conf_file: conf_dict = json.load(conf_file) if os.path.exists(args.cfg_path): os.remove(args.cfg_path) conf.read_dict(conf_dict, source=args.cfg_path) settings.configure_vars() # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave # behind multiple open sleeping connections while heartbeating, which could # easily exceed the database connection limit when # processing hundreds of simultaneous tasks. settings.configure_orm(disable_connection_pool=True) if not args.pickle and not dag: dag = get_dag(args) elif not dag: with db.create_session() as session: print(f'Loading pickle id {args.pickle}') dag_pickle = session.query(DagPickle).filter( DagPickle.id == args.pickle).first() if not dag_pickle: raise AirflowException("Who hid the pickle!? [missing pickle]") dag = dag_pickle.pickle task = dag.get_task(task_id=args.task_id) ti = TaskInstance(task, args.execution_date) ti.refresh_from_db() ti.init_run_context(raw=args.raw) hostname = get_hostname() print(f"Running {ti} on host {hostname}") if args.interactive: _run(args, dag, ti) else: with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \ redirect_stderr(StreamLogWriter(ti.log, logging.WARN)): _run(args, dag, ti) logging.shutdown()
def task_run(args, dag=None): """Runs a single task instance""" # Load custom airflow config if args.cfg_path: with open(args.cfg_path, 'r') as conf_file: conf_dict = json.load(conf_file) if os.path.exists(args.cfg_path): os.remove(args.cfg_path) conf.read_dict(conf_dict, source=args.cfg_path) settings.configure_vars() # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave # behind multiple open sleeping connections while heartbeating, which could # easily exceed the database connection limit when # processing hundreds of simultaneous tasks. settings.configure_orm(disable_connection_pool=True) if dag and args.pickle: raise AirflowException( "You cannot use the --pickle option when using DAG.cli() method.") elif args.pickle: print(f'Loading pickle id: {args.pickle}') dag = get_dag_by_pickle(args.pickle) elif not dag: dag = get_dag(args.subdir, args.dag_id) else: # Use DAG from parameter pass task = dag.get_task(task_id=args.task_id) ti = TaskInstance(task, args.execution_date) ti.refresh_from_db() ti.init_run_context(raw=args.raw) hostname = get_hostname() print(f"Running {ti} on host {hostname}") if args.interactive: _run_task_by_selected_method(args, dag, ti) else: with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \ redirect_stderr(StreamLogWriter(ti.log, logging.WARN)): _run_task_by_selected_method(args, dag, ti) logging.shutdown()
def test_flush(self): logger = mock.MagicMock() logger.log = mock.MagicMock() log = StreamLogWriter(logger, 1) msg = "test_message" log.write(msg) assert log._buffer == msg log.flush() logger.log.assert_called_once_with(1, msg) assert log._buffer == ""
def test_write(self): logger = mock.MagicMock() logger.log = mock.MagicMock() log = StreamLogWriter(logger, 1) msg = "test_message" log.write(msg) self.assertEqual(log._buffer, msg) log.write(" \n") logger.log.assert_called_once_with(1, msg) self.assertEqual(log._buffer, "")
def _run_file_processor( result_channel: MultiprocessingConnection, parent_channel: MultiprocessingConnection, file_path: str, pickle_dags: bool, dag_ids: Optional[List[str]], thread_name: str, callback_requests: List[CallbackRequest], ) -> None: """ Process the given file. :param result_channel: the connection to use for passing back the result :type result_channel: multiprocessing.Connection :param parent_channel: the parent end of the channel to close in the child :type parent_channel: multiprocessing.Connection :param file_path: the file to process :type file_path: str :param pickle_dags: whether to pickle the DAGs found in the file and save them to the DB :type pickle_dags: bool :param dag_ids: if specified, only examine DAG ID's that are in this list :type dag_ids: list[str] :param thread_name: the name to use for the process that is launched :type thread_name: str :param callback_requests: failure callback to execute :type callback_requests: List[airflow.utils.callback_requests.CallbackRequest] :return: the process that was launched :rtype: multiprocessing.Process """ # This helper runs in the newly created process log: logging.Logger = logging.getLogger("airflow.processor") # Since we share all open FDs from the parent, we need to close the parent side of the pipe here in # the child, else it won't get closed properly until we exit. log.info("Closing parent pipe") parent_channel.close() del parent_channel set_context(log, file_path) setproctitle(f"airflow scheduler - DagFileProcessor {file_path}") try: # redirect stdout/stderr to log with redirect_stdout(StreamLogWriter( log, logging.INFO)), redirect_stderr( StreamLogWriter(log, logging.WARN)), Stats.timer() as timer: # Re-configure the ORM engine as there are issues with multiple processes settings.configure_orm() # Change the thread name to differentiate log lines. This is # really a separate process, but changing the name of the # process doesn't work, so changing the thread name instead. threading.current_thread().name = thread_name log.info("Started process (PID=%s) to work on %s", os.getpid(), file_path) dag_file_processor = DagFileProcessor(dag_ids=dag_ids, log=log) result: Tuple[int, int] = dag_file_processor.process_file( file_path=file_path, pickle_dags=pickle_dags, callback_requests=callback_requests, ) result_channel.send(result) log.info("Processing %s took %.3f seconds", file_path, timer.duration) except Exception: # pylint: disable=broad-except # Log exceptions through the logging framework. log.exception("Got an exception! Propagating...") raise finally: # We re-initialized the ORM within this Process above so we need to # tear it down manually here settings.dispose_orm() result_channel.close()
def task_run(args, dag=None): """Runs a single task instance""" # Load custom airflow config if args.cfg_path: with open(args.cfg_path, 'r') as conf_file: conf_dict = json.load(conf_file) if os.path.exists(args.cfg_path): os.remove(args.cfg_path) conf.read_dict(conf_dict, source=args.cfg_path) settings.configure_vars() # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave # behind multiple open sleeping connections while heartbeating, which could # easily exceed the database connection limit when # processing hundreds of simultaneous tasks. settings.configure_orm(disable_connection_pool=True) if dag and args.pickle: raise AirflowException("You cannot use the --pickle option when using DAG.cli() method.") elif args.pickle: print(f'Loading pickle id: {args.pickle}') dag = get_dag_by_pickle(args.pickle) elif not dag: dag = get_dag(args.subdir, args.dag_id) else: # Use DAG from parameter pass task = dag.get_task(task_id=args.task_id) ti = TaskInstance(task, args.execution_date) ti.init_run_context(raw=args.raw) hostname = get_hostname() print(f"Running {ti} on host {hostname}") if args.interactive: _run_task_by_selected_method(args, dag, ti) else: if settings.DONOT_MODIFY_HANDLERS: with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \ redirect_stderr(StreamLogWriter(ti.log, logging.WARN)): _run_task_by_selected_method(args, dag, ti) else: # Get all the Handlers from 'airflow.task' logger # Add these handlers to the root logger so that we can get logs from # any custom loggers defined in the DAG airflow_logger_handlers = logging.getLogger('airflow.task').handlers root_logger = logging.getLogger() root_logger_handlers = root_logger.handlers # Remove all handlers from Root Logger to avoid duplicate logs for handler in root_logger_handlers: root_logger.removeHandler(handler) for handler in airflow_logger_handlers: root_logger.addHandler(handler) root_logger.setLevel(logging.getLogger('airflow.task').level) with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \ redirect_stderr(StreamLogWriter(ti.log, logging.WARN)): _run_task_by_selected_method(args, dag, ti) # We need to restore the handlers to the loggers as celery worker process # can call this command multiple times, # so if we don't reset this then logs from next task would go to the wrong place for handler in airflow_logger_handlers: root_logger.removeHandler(handler) for handler in root_logger_handlers: root_logger.addHandler(handler) logging.shutdown()
def test_encoding(self): logger = mock.MagicMock() logger.log = mock.MagicMock() log = StreamLogWriter(logger, 1) assert log.encoding is None
def test_isatty(self): logger = mock.MagicMock() logger.log = mock.MagicMock() log = StreamLogWriter(logger, 1) assert not log.isatty()
def test_iobase_compatibility(self): log = StreamLogWriter(None, 1) assert not log.closed # has no specific effect log.close()
def test_encoding(self): logger = mock.MagicMock() logger.log = mock.MagicMock() log = StreamLogWriter(logger, 1) self.assertFalse(log.encoding)
def test_isatty(self): logger = mock.MagicMock() logger.log = mock.MagicMock() log = StreamLogWriter(logger, 1) self.assertFalse(log.isatty())