def list_py_file_paths(directory: str,
                       safe_mode: bool = conf.getboolean(
                           'core', 'DAG_DISCOVERY_SAFE_MODE', fallback=True),
                       include_examples: Optional[bool] = None):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions. If not provided, use the
        core.DAG_DISCOVERY_SAFE_MODE configuration setting. If not set,
        default to safe.
    :type safe_mode: bool
    :param include_examples: include example DAGs
    :type include_examples: bool
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    if include_examples is None:
        include_examples = conf.getboolean('core', 'LOAD_EXAMPLES')
    file_paths: List[str] = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir: Dict[str, List[Pattern[str]]] = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns: List[Pattern[str]] = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as file:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    lines_no_comments = [COMMENT_PATTERN.sub("", line)
                                         for line in file.read().split("\n")]
                    patterns += [re.compile(line) for line in lines_no_comments if line]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                subdir for subdir in dirs
                if not any(p.search(os.path.join(root, subdir)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for subdir in dirs:
                patterns_by_dir[os.path.join(root, subdir)] = patterns.copy()

            find_dag_file_paths(file_paths, files, patterns, root, safe_mode)
    if include_examples:
        from airflow import example_dags
        example_dag_folder = example_dags.__path__[0]  # type: ignore
        file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
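# How the ``.airflowignore`` handling above behaves, as a self-contained
# sketch. COMMENT_PATTERN is defined elsewhere in this module; the regex
# below is an assumed stand-in for it, and the ignore rules and paths are
# hypothetical.
import re

COMMENT_PATTERN = re.compile(r"\s*#.*")  # assumption: strips trailing comments

ignore_lines = [
    "project_a  # every path containing 'project_a'",
    r"tenant_[\d]+",
    "# a comment-only line yields an empty string and is skipped",
]
lines_no_comments = [COMMENT_PATTERN.sub("", line) for line in ignore_lines]
patterns = [re.compile(line) for line in lines_no_comments if line]

# Patterns are matched with ``search()`` against the full path, so a rule in
# the DAG folder root also prunes matching subdirectories during os.walk.
for path in ["dags/project_a/etl.py", "dags/tenant_42/load.py", "dags/shared/util.py"]:
    print(path, "ignored" if any(p.search(path) for p in patterns) else "kept")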
def dag_backfill(args, dag=None):
    """Creates backfill job or dry run for a DAG"""
    logging.basicConfig(
        level=settings.LOGGING_LEVEL,
        format=settings.SIMPLE_LOG_FORMAT)

    signal.signal(signal.SIGTERM, sigint_handler)

    dag = dag or get_dag(args.subdir, args.dag_id)

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    # If only one date is passed, use it as both start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.sub_dag(
            task_regex=args.task_regex,
            include_upstream=not args.ignore_dependencies)

    run_conf = None
    if args.conf:
        run_conf = json.loads(args.conf)

    if args.dry_run:
        print("Dry run of DAG {0} on {1}".format(args.dag_id, args.start_date))
        for task in dag.tasks:
            print("Task {0}".format(task.task_id))
            ti = TaskInstance(task, args.start_date)
            ti.dry_run()
    else:
        if args.reset_dagruns:
            DAG.clear_dags(
                [dag],
                start_date=args.start_date,
                end_date=args.end_date,
                confirm_prompt=not args.yes,
                include_subdags=True,
            )

        dag.run(
            start_date=args.start_date,
            end_date=args.end_date,
            mark_success=args.mark_success,
            local=args.local,
            donot_pickle=(args.donot_pickle or
                          conf.getboolean('core', 'donot_pickle')),
            ignore_first_depends_on_past=args.ignore_first_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            pool=args.pool,
            delay_on_limit_secs=args.delay_on_limit,
            verbose=args.verbose,
            conf=run_conf,
            rerun_failed_tasks=args.rerun_failed_tasks,
            run_backwards=args.run_backwards
        )
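# A hedged usage sketch: in Airflow this function is invoked by the CLI
# (``airflow dags backfill``), and ``args`` is the parsed argparse result.
# The Namespace below spells out only the attributes the dry-run path reads;
# the values and the DAG id are hypothetical, not the real parser's output.
import argparse
from datetime import datetime

args = argparse.Namespace(
    dag_id="example_bash_operator",  # hypothetical
    subdir=None,
    start_date=datetime(2020, 1, 1),
    end_date=None,          # filled in from start_date by the function
    task_regex=None,
    ignore_dependencies=False,
    conf=None,
    dry_run=True,           # print the task instances instead of running them
)
# dag_backfill(args)  # requires a configured Airflow environment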
def __init__(self):
    super().__init__()
    self.tasks_to_run: List[TaskInstance] = []
    # Place where we keep information for task instance raw run
    self.tasks_params: Dict[TaskInstanceKeyType, Dict[str, Any]] = {}
    self.fail_fast = conf.getboolean("debug", "fail_fast")
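# ``fail_fast`` is read from the ``[debug]`` section of airflow.cfg and
# belongs to the DebugExecutor. A toy illustration of the fail-fast policy
# (not Airflow's actual sync loop; the helper and its arguments are
# hypothetical):
def drain(tasks, run_task, fail_fast):
    """Run queued tasks in order; with fail_fast, stop at the first failure."""
    for task in tasks:
        if not run_task(task) and fail_fast:
            return False  # remaining tasks are left unrun
    return True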
def webserver(args):
    """Starts Airflow Webserver"""
    print(settings.HEADER)

    access_logfile = args.access_logfile or conf.get('webserver', 'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver', 'error_logfile')
    num_workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'web_server_worker_timeout'))
    ssl_cert = args.ssl_cert or conf.get('webserver', 'web_server_ssl_cert')
    ssl_key = args.ssl_key or conf.get('webserver', 'web_server_ssl_key')
    if not ssl_cert and ssl_key:
        raise AirflowException(
            'An SSL certificate must also be provided for use with ' + ssl_key)
    if ssl_cert and not ssl_key:
        raise AirflowException(
            'An SSL key must also be provided for use with ' + ssl_cert)

    if args.debug:
        print(
            "Starting the web server on port {0} and host {1}.".format(
                args.port, args.hostname))
        app, _ = create_app(None, testing=conf.getboolean('core', 'unit_test_mode'))
        app.run(debug=True, use_reloader=not app.config['TESTING'],
                port=args.port, host=args.hostname,
                ssl_context=(ssl_cert, ssl_key) if ssl_cert and ssl_key else None)
    else:
        os.environ['SKIP_DAGS_PARSING'] = 'True'
        app = cached_app(None)
        pid, stdout, stderr, log_file = setup_locations(
            "webserver", args.pid, args.stdout, args.stderr, args.log_file)
        os.environ.pop('SKIP_DAGS_PARSING')
        if args.daemon:
            handle = setup_logging(log_file)
            stdout = open(stdout, 'w+')
            stderr = open(stderr, 'w+')

        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {num_workers} {workerclass}
                Host: {hostname}:{port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(num_workers=num_workers, workerclass=args.workerclass,
                       hostname=args.hostname, port=args.port,
                       worker_timeout=worker_timeout,
                       access_logfile=access_logfile,
                       error_logfile=error_logfile)))

        run_args = [
            'gunicorn',
            '-w', str(num_workers),
            '-k', str(args.workerclass),
            '-t', str(worker_timeout),
            '-b', args.hostname + ':' + str(args.port),
            '-n', 'airflow-webserver',
            '-p', str(pid),
            '-c', 'python:airflow.www.gunicorn_config',
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ['-D']

        if ssl_cert:
            run_args += ['--certfile', ssl_cert, '--keyfile', ssl_key]

        webserver_module = 'www'
        run_args += ["airflow." + webserver_module + ".app:cached_app()"]

        gunicorn_master_proc = None

        def kill_proc(dummy_signum, dummy_frame):  # pylint: disable=unused-argument
            gunicorn_master_proc.terminate()
            gunicorn_master_proc.wait()
            sys.exit(0)

        def monitor_gunicorn(gunicorn_master_proc):
            # These run forever until SIG{INT, TERM, KILL, ...} signal is sent
            if conf.getint('webserver', 'worker_refresh_interval') > 0:
                master_timeout = conf.getint('webserver', 'web_server_master_timeout')
                restart_workers(gunicorn_master_proc, num_workers, master_timeout)
            else:
                while gunicorn_master_proc.poll() is None:
                    time.sleep(1)

                sys.exit(gunicorn_master_proc.returncode)

        if args.daemon:
            base, ext = os.path.splitext(pid)
            ctx = daemon.DaemonContext(
                pidfile=TimeoutPIDLockFile(base + "-monitor" + ext, -1),
                files_preserve=[handle],
                stdout=stdout,
                stderr=stderr,
                signal_map={
                    signal.SIGINT: kill_proc,
                    signal.SIGTERM: kill_proc
                },
            )
            with ctx:
                subprocess.Popen(run_args, close_fds=True)

                # Reading pid file directly, since Popen#pid doesn't
                # seem to return the right value with DaemonContext.
                while True:
                    try:
                        with open(pid) as file:
                            gunicorn_master_proc_pid = int(file.read())
                            break
                    except OSError:
                        log.debug("Waiting for gunicorn's pid file to be created.")
                        time.sleep(0.1)

                gunicorn_master_proc = psutil.Process(gunicorn_master_proc_pid)
                monitor_gunicorn(gunicorn_master_proc)

            stdout.close()
            stderr.close()
        else:
            gunicorn_master_proc = subprocess.Popen(run_args, close_fds=True)

            signal.signal(signal.SIGINT, kill_proc)
            signal.signal(signal.SIGTERM, kill_proc)

            monitor_gunicorn(gunicorn_master_proc)
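# The pid-file polling in the daemon branch is a general pattern: under a
# DaemonContext, ``Popen.pid`` does not report the gunicorn master, so the
# pid is read back from the file gunicorn writes via ``-p``. The same wait
# loop as a standalone helper (the path and poll interval are hypothetical):
import time

def wait_for_pid_file(path: str, poll_interval: float = 0.1) -> int:
    """Block until ``path`` exists, then return its contents as an int."""
    while True:
        try:
            with open(path) as file:
                return int(file.read())
        except OSError:
            # File not written yet; retry shortly.
            time.sleep(poll_interval)

# master_pid = wait_for_pid_file("/run/airflow-webserver.pid")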
# TODO: Logging format and level should be configured
# in this file instead of from airflow.cfg. Currently
# there are other log format and level configurations in
# settings.py and cli.py. Please see AIRFLOW-1455.
LOG_LEVEL: str = conf.get('logging', 'LOGGING_LEVEL').upper()

# Flask appbuilder's info level log is very verbose,
# so it's set to 'WARN' by default.
FAB_LOG_LEVEL: str = conf.get('logging', 'FAB_LOGGING_LEVEL').upper()

LOG_FORMAT: str = conf.get('logging', 'LOG_FORMAT')

COLORED_LOG_FORMAT: str = conf.get('logging', 'COLORED_LOG_FORMAT')

COLORED_LOG: bool = conf.getboolean('logging', 'COLORED_CONSOLE_LOG')

COLORED_FORMATTER_CLASS: str = conf.get('logging', 'COLORED_FORMATTER_CLASS')

BASE_LOG_FOLDER: str = conf.get('logging', 'BASE_LOG_FOLDER')

PROCESSOR_LOG_FOLDER: str = conf.get('scheduler', 'CHILD_PROCESS_LOG_DIRECTORY')

DAG_PROCESSOR_MANAGER_LOG_LOCATION: str = conf.get(
    'logging', 'DAG_PROCESSOR_MANAGER_LOG_LOCATION')

FILENAME_TEMPLATE: str = conf.get('logging', 'LOG_FILENAME_TEMPLATE')

PROCESSOR_FILENAME_TEMPLATE: str = conf.get('logging', 'LOG_PROCESSOR_FILENAME_TEMPLATE')
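# The constants above are consumed when Airflow assembles its logging
# configuration dict. A minimal sketch of how the LOG_LEVEL and LOG_FORMAT
# values read above could feed ``logging.config.dictConfig`` (the formatter
# and handler names here are illustrative, not Airflow's actual
# DEFAULT_LOGGING_CONFIG):
import logging.config

LOGGING_CONFIG_SKETCH = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'airflow': {'format': LOG_FORMAT},
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'airflow',
            'stream': 'ext://sys.stdout',
        },
    },
    'root': {
        'handlers': ['console'],
        'level': LOG_LEVEL,
    },
}
# logging.config.dictConfig(LOGGING_CONFIG_SKETCH)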