def dbnd_airflow_bootstrap():
    global _airflow_bootstrap_applied
    if _airflow_bootstrap_applied:
        return

    from dbnd._core.configuration.dbnd_config import config as dbnd_config
    from dbnd_airflow.airflow_override import patch_airflow_modules

    if dbnd_config.getboolean("airflow", "enable_dbnd_patches"):
        patch_airflow_modules()

    if os.name == "nt" and dbnd_config.getboolean("airflow", "enable_windows_support"):
        from dbnd_airflow.airflow_override.dbnd_airflow_windows import (
            patch_airflow_windows_support,
        )

        patch_airflow_windows_support()

    from dbnd_airflow.airflow_extensions.airflow_config import (
        init_airflow_sqlconn_by_dbnd,
    )

    init_airflow_sqlconn_by_dbnd()

    from dbnd_airflow.web.single_job_run_support import register_legacy_single_job_run

    register_legacy_single_job_run()

    _airflow_bootstrap_applied = True
def __init__(self):
    self.scheduled_jobs = []

    if (
        config.getboolean("scheduler", "always_file_sync")
        or ("scheduler" in sys.argv)
    ) and not config.getboolean("scheduler", "never_file_sync"):
        self.file_config_loader = SchedulerFileConfigLoader()
        logger.debug("scheduler file syncing active")
    else:
        self.file_config_loader = None
        logger.debug("scheduler file syncing disabled")

    self.default_retries = config.getint("scheduler", "default_retries")
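# A minimal sketch of setting the [scheduler] keys read above. The key names
# come from the getboolean/getint calls; the values here are assumptions, not
# dbnd defaults:
from dbnd._core.configuration.dbnd_config import config

config.set_values(
    config_values={
        "scheduler": {
            "always_file_sync": False,
            "never_file_sync": False,
            "default_retries": 3,
        }
    },
    override=True,
    source="example_scheduler_config",
)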
def initialize_task_id(self, params=None):
    name = self.task_name
    extra = {}
    if config.getboolean("task_build", "sign_with_full_qualified_name"):
        extra["full_task_family"] = self.task_definition.full_task_family
    if config.getboolean("task_build", "sign_with_task_code"):
        extra["task_code_hash"] = user_friendly_signature(
            self.task_definition.task_source_code
        )

    signature = build_signature(name=name, params=params, extra=extra)
    self.task_id, self.task_signature = signature.id, signature.signature
    self.task_signature_source = signature.signature_source
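# Illustrative only: a toy stand-in for build_signature(), to show the idea of
# deriving a stable task id from the name, the params, and the extra dict
# built above (the real dbnd implementation is different):
import hashlib


def _toy_build_signature(name, params, extra):
    # hash a canonical representation of everything that identifies the task
    payload = repr((name, sorted((params or {}).items()), sorted(extra.items())))
    digest = hashlib.md5(payload.encode("utf-8")).hexdigest()
    return "%s__%s" % (name, digest[:8])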
def job_to_dag(self, job):
    # type: (dict) -> Union[DAG, None]
    # use the UTC-converted start date consistently in both default_args and the DAG
    start_date = convert_to_utc(job.get("start_date", None))
    end_date = convert_to_utc(job.get("end_date", None))
    default_args = {
        "owner": job.get("create_user", None),
        "depends_on_past": job.get("depends_on_past", False),
        "start_date": start_date,
        "end_date": end_date,
    }
    job_name = clean_job_name(job["name"])
    dag = DAG(
        "dbnd_launcher__%s" % job_name,
        start_date=start_date,
        default_args=default_args,
        schedule_interval=job.get("schedule_interval", None),
        catchup=job.get("catchup", False),
    )

    DbndSchedulerOperator(
        scheduled_cmd=job["cmd"],
        scheduled_job_name=job_name,
        scheduled_job_uid=job.get("uid", None),
        shell=config.getboolean("scheduler", "shell_cmd"),
        task_id="launcher",
        dag=dag,
        retries=job.get("retries", self.default_retries) or self.default_retries,
    )
    return dag
def reinit_airflow_sql_conn():
    from airflow.settings import configure_orm, configure_vars

    from dbnd._core.configuration.dbnd_config import config as dbnd_config

    configure_vars()
    # the webserver imports this file from models.py with the default settings
    configure_orm()

    if dbnd_config.getboolean("log", "sqlalchemy_trace"):
        # add a query handler before every execute:
        # this will print the query, the code line, and the stack trace
        from airflow import settings as airflow_settings
        from sqlalchemy import event

        from dbnd_airflow.db_utils import trace_sqlalchemy_query

        event.listen(
            airflow_settings.engine, "before_cursor_execute", trace_sqlalchemy_query
        )

    # this will print query execution time
    from airflow import settings as airflow_settings
    from sqlalchemy import event

    from dbnd_airflow.db_utils import (
        profile_after_cursor_execute,
        profile_before_cursor_execute,
    )

    event.listen(
        airflow_settings.engine, "before_cursor_execute", profile_before_cursor_execute
    )
    event.listen(
        airflow_settings.engine, "after_cursor_execute", profile_after_cursor_execute
    )
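# A sketch of what the db_utils handlers registered above might look like,
# based on the standard SQLAlchemy event recipe (not dbnd's actual code):
import logging
import time

sql_logger = logging.getLogger("dbnd.sqlalchemy")


def trace_sqlalchemy_query(conn, cursor, statement, parameters, context, executemany):
    # called before every statement: log the query and its parameters
    sql_logger.debug("query: %s parameters: %s", statement, parameters)


def profile_before_cursor_execute(
    conn, cursor, statement, parameters, context, executemany
):
    # remember when this statement started
    conn.info.setdefault("query_start_time", []).append(time.time())


def profile_after_cursor_execute(
    conn, cursor, statement, parameters, context, executemany
):
    # log the elapsed time for the statement
    total = time.time() - conn.info["query_start_time"].pop(-1)
    sql_logger.debug("query took %.4f seconds", total)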
def set_tracking_config_overide(airflow_context=None, use_dbnd_log=None):
    # 1. create a proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_tracking = {
        # we don't want to "check" completion, as the script runs with task_version="now"
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },
        # do not save any outputs
        "task": {"task_in_memory_outputs": not track_with_cache},
        # do not fail on tracker errors
        "core": {"tracker_raise_on_error": False},
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(
            airflow_context.execution_date, tz=pytz.UTC
        ).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        config_for_tracking["task"]["task_target_date"] = task_target_date

    if use_dbnd_log is not None:
        config_for_tracking["log"] = {"disabled": not use_dbnd_log}

    return config.set_values(
        config_values=config_for_tracking, override=True, source="dbnd_tracking_config"
    )
def dbnd_airflow_bootstrap():
    """All relevant patches for Airflow execution."""
    global _airflow_bootstrap_applied
    if _airflow_bootstrap_applied:
        return
    _airflow_bootstrap_applied = True  # prevent recursive calls

    from dbnd._core.configuration.dbnd_config import config as dbnd_config

    if dbnd_config.getboolean("airflow", "enable_dbnd_context_vars"):
        patch_airflow_context_vars()

    if os.name == "nt" and dbnd_config.getboolean("airflow", "enable_windows_support"):
        from dbnd_airflow.airflow_override.dbnd_airflow_windows import (
            patch_airflow_windows_support,
        )

        patch_airflow_windows_support()
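# The module-level flag makes the bootstrap idempotent, so callers don't need
# to coordinate with each other; repeated calls are cheap no-ops:
dbnd_airflow_bootstrap()
dbnd_airflow_bootstrap()  # returns immediately, patches are applied only once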
def enable_osx_forked_request_calls():
    if not is_osx:
        return

    from dbnd._core.configuration.dbnd_config import config

    if not config.getboolean("core", "fix_env_on_osx"):
        return

    # macOS's Objective-C runtime can crash forked processes that make network
    # requests; these env vars disable the proxy lookup and the fork-safety check
    if "no_proxy" not in os.environ:
        os.environ["no_proxy"] = "*"

    if "OBJC_DISABLE_INITIALIZE_FORK_SAFETY" not in os.environ:
        os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "yes"
def job_to_dag(self, job):
    # type: (dict) -> Union[DAG, None]
    # convert_to_utc can be confusing: Airflow ships a function with the same
    # name, but that one uses pendulum rather than our _vendorized copy
    default_args = {}
    if job.get("depends_on_past"):
        default_args["depends_on_past"] = job.get("depends_on_past")
    start_date = convert_to_utc(job.get("start_date"))
    if start_date:
        default_args["start_date"] = start_date
    if job.get("end_date"):
        default_args["end_date"] = convert_to_utc(job.get("end_date"))
    # the DAG owner comes from the job's create_user
    if job.get("create_user"):
        default_args["owner"] = job.get("create_user")

    job_name = clean_job_name(job["name"])
    dag = DAG(
        "%s" % job_name,
        start_date=start_date,
        default_args=default_args,
        schedule_interval=job.get("schedule_interval", None),
        catchup=job.get("catchup", False),
    )

    custom_operator_class = self.custom_operator_class or DbndSchedulerOperator
    custom_operator_class(
        scheduled_cmd=job["cmd"],
        scheduled_job_name=job_name,
        extra_args=job.get("extra_args", None),
        with_name=False,
        scheduled_job_uid=job.get("uid", None),
        shell=config.getboolean("scheduler", "shell_cmd"),
        task_id="launcher",
        dag=dag,
        retries=job.get("retries") or self.default_retries,
    )
    return dag
def job_to_dag(self, job):
    # type: (dict) -> Union[DAG, None]
    default_args = {}
    if job.get("depends_on_past"):
        default_args["depends_on_past"] = job.get("depends_on_past")
    start_date = convert_to_utc(job.get("start_date"))
    if start_date:
        default_args["start_date"] = start_date
    if job.get("end_date"):
        default_args["end_date"] = convert_to_utc(job.get("end_date"))
    if job.get("owner"):
        default_args["owner"] = job.get("owner")

    job_name = clean_job_name(job["name"])
    dag = DAG(
        "%s" % job_name,
        start_date=start_date,
        default_args=default_args,
        schedule_interval=job.get("schedule_interval", None),
        catchup=job.get("catchup", False),
    )

    DbndSchedulerOperator(
        task_id="launcher",
        dag=dag,
        retries=job.get("retries") or self.default_retries,
        scheduled_cmd=job["cmd"],
        scheduled_job_name=job_name,
        with_name=False,
        scheduled_job_uid=job.get("uid", None),
        shell=config.getboolean("scheduler", "shell_cmd"),
    )
    return dag
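# An example of the job dict job_to_dag() consumes. The key names are taken
# from the lookups above; the values and the provider instance are made up:
example_job = {
    "name": "daily ingest",
    "cmd": "dbnd run my_project.ingest --task-version now",
    "start_date": "2021-01-01T00:00:00",
    "schedule_interval": "@daily",
    "catchup": False,
    "retries": 2,
    "owner": "data-eng",
}
# dag = provider.job_to_dag(example_job)  # `provider` is hypothetical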
def set_tracking_config_overide(use_dbnd_log=None):
    # 1. create a proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_airflow = {
        # we don't want to "check" completion, as the script runs with task_version="now"
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },
        # do not save any outputs
        "task": {"task_in_memory_outputs": not track_with_cache},
        # do not fail on tracker errors
        "core": {"tracker_raise_on_error": False},
    }
    if use_dbnd_log is not None:
        config_for_airflow["log"] = {"disabled": not use_dbnd_log}

    config.set_values(
        config_values=config_for_airflow, override=True, source="dbnd_tracking_config"
    )
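# Typical call site (a sketch): the overrides are applied before any task
# objects are built, so everything created afterwards sees the
# tracking-friendly settings:
set_tracking_config_overide(use_dbnd_log=False)  # disables dbnd log capture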
def get_dags_from_file():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None
    try:
        # make sure the system configs are loaded
        config.load_system_configs()

        config_file = config.get("scheduler", "config_file")
        if not config_file:
            logger.info("No dags file has been defined at scheduler.config_file")
            return {}
        default_retries = config.getint("scheduler", "default_retries")
        active_by_default = config.getboolean("scheduler", "active_by_default")

        dags = DbndAirflowDagsProviderFromFile(
            config_file=config_file,
            active_by_default=active_by_default,
            default_retries=default_retries,
        ).get_dags()
        return {dag.dag_id: dag for dag in dags}
    except Exception:
        logging.exception("Failed to get dags from the file")
        raise
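# The usual way to surface these DAGs to Airflow from a module in the dags
# folder (standard Airflow discovery pattern: the scheduler only picks up DAG
# objects that are module-level globals):
dags = get_dags_from_file()
if dags:
    globals().update(dags)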
def use_airflow_connections():
    from dbnd._core.configuration.dbnd_config import config

    return is_airflow_enabled() and config.getboolean("airflow", "use_connections")
def get_databand_error_message(ex, args=None, sys_exit=True):
    args = args or sys.argv
    please_report = False
    print_source = True

    if isinstance(ex, DatabandRunError):
        # we already printed all the information!
        return (
            "There is an error! Your run has failed!",
            DatabandExitCodes.execution_failed,
        )
    if isinstance(ex, DatabandRuntimeError):
        exit_code = DatabandExitCodes.execution_failed
    elif isinstance(ex, DatabandConfigError):
        exit_code = DatabandExitCodes.configuration_error
    elif isinstance(ex, DatabandSystemError):
        exit_code = DatabandExitCodes.error
        please_report = True
    elif isinstance(ex, DatabandError):
        exit_code = DatabandExitCodes.error
    elif ex.__class__.__name__ == "NoCredentialsError":  # aws
        exit_code = DatabandExitCodes.configuration_error
        ex = friendly_error.config.no_credentials()
        print_source = False
    else:
        please_report = True
        exit_code = DatabandExitCodes.unknown_error

    msg = str(ex)
    extra_msg_lines = []

    nested_exceptions = nested_exceptions_str(ex)
    if nested_exceptions:
        extra_msg_lines.append("Caused by: \n%s\n" % indent(nested_exceptions, "\t"))

    help_msg = get_help_msg(ex)
    if help_msg:
        extra_msg_lines.append(" Help: \n%s\n" % indent(help_msg, "\t"))

    user_frame_info_str = get_user_frame_info_str(ex)
    if user_frame_info_str and print_source:
        extra_msg_lines.append("Source: \n%s\n" % indent(user_frame_info_str, "\t"))

    # if we crashed before finishing bootstrap we probably want the full trace;
    # we could also have failed during config init, in which case the verbose
    # flag does nothing
    if (
        show_exc_info(ex)
        or config.getboolean("databand", "verbose")
        or not bootstrap._dbnd_bootstrap
    ):
        error_info = sys.exc_info()
        extra_msg_lines.append(format_exception_as_str(error_info))

    msg = truncate_msg(msg, ERROR_MESSAGE_HEAD_SIZE, ERROR_MESSAGE_TAIL_SIZE)
    if please_report:
        extra_msg_lines.append(
            " Please report it to [email protected] or the appropriate Slack channel!"
        )

    msg = (
        "There is an error! Your run has failed with {exc_type}\n"
        "{sep}\n"
        " Command line: {command_line}\n"
        " Failure:\n{msg}\n\n"
        "{extra_msg}\n"
        "{sep}\n"
        "".format(
            sep=console_utils.error_separator(),
            command_line=subprocess.list2cmdline(args or []),
            msg=console_utils.bold(indent(msg, "\t")),
            exc_type=ex.__class__.__name__,
            extra_msg="\n ".join(extra_msg_lines),
        )
    )
    return msg, exit_code
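# A sketch of the intended call site: format the failure and exit with the
# mapped code. main() and run_cli() are hypothetical; everything else comes
# from this module:
def main():
    try:
        run_cli()  # hypothetical CLI entry point
    except Exception as ex:
        msg, exit_code = get_databand_error_message(ex, args=sys.argv)
        sys.stderr.write(msg + "\n")
        sys.exit(exit_code)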
def __init__(
    self,
    task_passport,  # type: TaskPassport
    classdict=None,  # type: Optional[Dict[str, Any]]
    base_task_definitions=None,  # type: Optional[List[TaskDefinition]]
    defaults=None,  # type: Optional[Dict[ParameterDefinition, Any]]
    task_decorator=None,  # type: Optional[TaskDecorator]
    source_code=None,  # type: Optional[TaskSourceCode]
    external_parameters=None,  # type: Optional[Parameters]
    task_definition_uid=None,  # type: Optional[UUID]
):
    super(TaskDefinition, self).__init__()

    self.hidden = False

    self.task_passport = task_passport
    self.source_code = source_code
    self.task_decorator = task_decorator
    self.base_task_definitions = (
        base_task_definitions or []
    )  # type: List[TaskDefinition]

    # TODO: maybe use properties or another way to delegate these...
    self.full_task_family = self.task_passport.full_task_family
    self.full_task_family_short = self.task_passport.full_task_family_short
    self.task_family = self.task_passport.task_family
    self.task_config_section = self.task_passport.task_config_section

    # all the attributes that point to a ParameterDefinition
    self.task_param_defs = dict()  # type: Dict[str, ParameterDefinition]
    # the defaults attribute
    self.defaults = dict()  # type: Dict[ParameterDefinition, Any]

    self.task_param_defs = self._calculate_task_class_values(
        classdict, external_parameters
    )
    # if we have output params in the function arguments, like f(some_p=parameter.output),
    # the new function can not return its result as a single output
    self.single_result_output = self._is_result_single_output(self.task_param_defs)

    self.param_defaults = {
        p.name: p.default
        for p in self.task_param_defs.values()
        if is_defined(p.default)
    }

    # TODO: consider joining with task_config
    # TODO: calculate the defaults value as a _ConfigStore and merge it using the standard mechanism
    self.defaults = self._calculate_task_defaults(defaults)
    self.task_defaults_config_store = parse_and_build_config_store(
        source=self.task_passport.format_source_name("task.defaults"),
        config_values=self.defaults,
        priority=ConfigValuePriority.FALLBACK,
    )

    self.task_signature_extra = {}
    if config.getboolean("task_build", "sign_with_full_qualified_name"):
        self.task_signature_extra["full_task_family"] = self.full_task_family
    if config.getboolean("task_build", "sign_with_task_code"):
        self.task_signature_extra["task_code_hash"] = user_friendly_signature(
            self.source_code.task_source_code
        )

    if task_definition_uid:
        self.task_definition_uid = task_definition_uid
    else:
        self.task_definition_uid = get_uuid()
def __init__(self, dag_bag=None):
    super(InProcessExecutor, self).__init__()
    self.tasks_to_run = []
    self.fail_fast = config.getboolean("run", "fail_fast")
    self.dag_bag = dag_bag