Example #1
File: bootstrap.py Project: lbtanh/dbnd
def dbnd_airflow_bootstrap():
    global _airflow_bootstrap_applied
    if _airflow_bootstrap_applied:
        return

    from dbnd._core.configuration.dbnd_config import config as dbnd_config
    from dbnd_airflow.airflow_override import patch_airflow_modules

    if dbnd_config.getboolean("airflow", "enable_dbnd_patches"):
        patch_airflow_modules()

    if os.name == "nt" and dbnd_config.getboolean("airflow",
                                                  "enable_windows_support"):
        from dbnd_airflow.airflow_override.dbnd_airflow_windows import (
            patch_airflow_windows_support, )

        patch_airflow_windows_support()

    from dbnd_airflow.airflow_extensions.airflow_config import (
        init_airflow_sqlconn_by_dbnd, )

    init_airflow_sqlconn_by_dbnd()

    from dbnd_airflow.web.single_job_run_support import register_legacy_single_job_run

    register_legacy_single_job_run()

    _airflow_bootstrap_applied = True
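
All of the examples in this listing follow the same pattern: read a boolean flag with getboolean(section, option) and branch on it before enabling a feature. The standard-library configparser exposes the same call shape, so a minimal, self-contained sketch of the pattern looks like this (illustrative only, not dbnd code; the option names are reused from the surrounding examples):

from configparser import ConfigParser

# Stand-in for dbnd's layered config object (illustrative only).
config = ConfigParser()
config.read_string("""
[airflow]
enable_dbnd_patches = true
enable_windows_support = false
""")

# Same call shape as dbnd_config.getboolean("airflow", "enable_dbnd_patches"):
if config.getboolean("airflow", "enable_dbnd_patches"):
    print("dbnd patches enabled")

# fallback= avoids a NoOptionError when the key is missing entirely.
if config.getboolean("airflow", "use_connections", fallback=False):
    print("using airflow connections")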
Example #2
    def __init__(self):
        self.scheduled_jobs = []

        if (config.getboolean("scheduler", "always_file_sync") or
            ("scheduler" in sys.argv)) and not config.getboolean(
                "scheduler", "never_file_sync"):
            self.file_config_loader = SchedulerFileConfigLoader()
            logger.debug("scheduler file syncing active")
        else:
            self.file_config_loader = None
            logger.debug("scheduler file syncing disabled")

        self.default_retries = config.getint("scheduler", "default_retries")
Example #3
File: task_meta.py Project: lbtanh/dbnd
    def initialize_task_id(self, params=None):
        name = self.task_name
        extra = {}
        if config.getboolean("task_build", "sign_with_full_qualified_name"):
            extra["full_task_family"] = self.task_definition.full_task_family
        if config.getboolean("task_build", "sign_with_task_code"):
            extra["task_code_hash"] = user_friendly_signature(
                self.task_definition.task_source_code)

        signature = build_signature(name=name, params=params, extra=extra)
        self.task_id, self.task_signature = (signature.id, signature.signature)

        self.task_signature_source = signature.signature_source
Example #4
    def job_to_dag(self, job):  # type: (dict) -> Union[DAG, None]
        start_day = convert_to_utc(job.get("start_date", None))
        end_date = convert_to_utc(job.get("end_date", None))

        default_args = {
            "owner": job.get("create_user", None),
            "depends_on_past": job.get("depends_on_past", False),
            "start_date": job["start_date"],
            "end_date": end_date,
        }

        job_name = clean_job_name(job["name"])
        dag = DAG(
            "dbnd_launcher__%s" % job_name,
            start_date=start_day,
            default_args=default_args,
            schedule_interval=job.get("schedule_interval", None),
            catchup=job.get("catchup", False),
        )

        DbndSchedulerOperator(
            scheduled_cmd=job["cmd"],
            scheduled_job_name=job_name,
            scheduled_job_uid=job.get("uid", None),
            shell=config.getboolean("scheduler", "shell_cmd"),
            task_id="launcher",
            dag=dag,
            retries=job.get("retries", self.default_retries)
            or self.default_retries,
        )

        return dag
Example #5
def reinit_airflow_sql_conn():
    from airflow.settings import configure_orm, configure_vars

    from dbnd._core.configuration.dbnd_config import config as dbnd_config

    configure_vars()
    # The webservers import this file from models.py with the default settings.
    configure_orm()
    # add query handler before every execute
    # this will print query, code line and stack trace
    if dbnd_config.getboolean("log", "sqlalchemy_trace"):
        from airflow import settings as airflow_settings
        from sqlalchemy import event

        from dbnd_airflow.db_utils import trace_sqlalchemy_query

        event.listen(airflow_settings.engine, "before_cursor_execute",
                     trace_sqlalchemy_query)

    # this will print query execution time
    from airflow import settings as airflow_settings
    from sqlalchemy import event

    from dbnd_airflow.db_utils import (
        profile_after_cursor_execute,
        profile_before_cursor_execute,
    )

    event.listen(airflow_settings.engine, "before_cursor_execute",
                 profile_before_cursor_execute)
    event.listen(airflow_settings.engine, "after_cursor_execute",
                 profile_after_cursor_execute)
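
For context on what Example #5 wires up: SQLAlchemy's event API lets a handler run before and after every cursor execution on an engine, which is how the hooks in dbnd_airflow.db_utils can trace and profile Airflow's queries. A minimal sketch of that timing pattern against a throwaway engine (the handler bodies below are illustrative stand-ins for profile_before_cursor_execute / profile_after_cursor_execute, not the dbnd implementations):

import time

from sqlalchemy import create_engine, event

engine = create_engine("sqlite://")

@event.listens_for(engine, "before_cursor_execute")
def _before(conn, cursor, statement, parameters, context, executemany):
    # stash a start time on the connection for the matching "after" hook
    conn.info.setdefault("query_start_time", []).append(time.monotonic())

@event.listens_for(engine, "after_cursor_execute")
def _after(conn, cursor, statement, parameters, context, executemany):
    elapsed = time.monotonic() - conn.info["query_start_time"].pop()
    print("query took %.4fs: %s" % (elapsed, statement))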
Example #6
def set_tracking_config_overide(airflow_context=None, use_dbnd_log=None):
    # 1. create proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_tracking = {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },  # we don't want to "check" as script is task_version="now"
        "task": {
            "task_in_memory_outputs": not track_with_cache
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(airflow_context.execution_date,
                                          tz=pytz.UTC).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        config_for_tracking["task"]["task_target_date"] = task_target_date

    if use_dbnd_log is not None:
        config_for_tracking["log"] = {"disabled": not use_dbnd_log}
    return config.set_values(config_values=config_for_tracking,
                             override=True,
                             source="dbnd_tracking_config")
Example #7
def dbnd_airflow_bootstrap():
    """
    all relevant patches for airflow execution
    """
    global _airflow_bootstrap_applied
    if _airflow_bootstrap_applied:
        return
    _airflow_bootstrap_applied = True  # prevent recursive call

    from dbnd._core.configuration.dbnd_config import config as dbnd_config

    if dbnd_config.getboolean("airflow", "enable_dbnd_context_vars"):
        patch_airflow_context_vars()

    if os.name == "nt" and dbnd_config.getboolean("airflow",
                                                  "enable_windows_support"):
        from dbnd_airflow.airflow_override.dbnd_airflow_windows import (
            patch_airflow_windows_support, )

        patch_airflow_windows_support()
Example #8
def enable_osx_forked_request_calls():
    if not is_osx:
        return

    from dbnd._core.configuration.dbnd_config import config

    if not config.getboolean("core", "fix_env_on_osx"):
        return

    if "no_proxy" not in os.environ:
        os.environ["no_proxy"] = "*"

    if "OBJC_DISABLE_INITIALIZE_FORK_SAFETY" not in os.environ:
        os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "yes"
Example #9
    def job_to_dag(self, job):  # type: (dict) -> Union[DAG, None]

        # convert_to_utc usage might be dangerous, as there is a function with the same name in airflow;
        # however, that one uses a pendulum that is not from _vendorized

        default_args = {}
        if job.get("depends_on_past"):
            default_args["depends_on_past"] = job.get("depends_on_past")

        start_date = convert_to_utc(job.get("start_date"))
        if start_date:
            default_args["start_day"] = start_date

        if job.get("end_date"):
            default_args["end_date"] = convert_to_utc(job.get("end_date"))

        if job.get("owner"):
            default_args["owner"] = job.get("create_user")

        job_name = clean_job_name(job["name"])
        dag = DAG(
            "%s" % job_name,
            start_date=start_date,
            default_args=default_args,
            schedule_interval=job.get("schedule_interval", None),
            catchup=job.get("catchup", False),
        )

        custom_operator_class = self.custom_operator_class or DbndSchedulerOperator
        custom_operator_class(
            scheduled_cmd=job["cmd"],
            scheduled_job_name=job_name,
            extra_args=job.get("extra_args", None),
            with_name=False,
            scheduled_job_uid=job.get("uid", None),
            shell=config.getboolean("scheduler", "shell_cmd"),
            task_id="launcher",
            dag=dag,
            retries=job.get("retries") or self.default_retries,
        )

        return dag
Example #10
    def job_to_dag(self, job):  # type: (dict) -> Union[DAG, None]

        default_args = {}
        if job.get("depends_on_past"):
            default_args["depends_on_past"] = job.get("depends_on_past")

        start_date = convert_to_utc(job.get("start_date"))
        if start_date:
            default_args["start_day"] = start_date

        if job.get("end_date"):
            default_args["end_date"] = convert_to_utc(job.get("end_date"))

        if job.get("owner"):
            default_args["owner"] = job.get("owner")

        job_name = clean_job_name(job["name"])
        dag = DAG(
            "%s" % job_name,
            start_date=start_date,
            default_args=default_args,
            schedule_interval=job.get("schedule_interval", None),
            catchup=job.get("catchup", False),
        )

        DbndSchedulerOperator(
            task_id="launcher",
            dag=dag,
            retries=job.get("retries") or self.default_retries,
            scheduled_cmd=job["cmd"],
            scheduled_job_name=job_name,
            with_name=False,
            scheduled_job_uid=job.get("uid", None),
            shell=config.getboolean("scheduler", "shell_cmd"),
        )

        return dag
Example #11
def set_tracking_config_overide(use_dbnd_log=None):
    # 1. create proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_airflow = {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },  # we don't want to "check" as script is task_version="now"
        "task": {
            "task_in_memory_outputs": not track_with_cache
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if use_dbnd_log is not None:
        config_for_airflow["log"] = {"disabled": not use_dbnd_log}
    config.set_values(config_values=config_for_airflow,
                      override=True,
                      source="dbnd_tracking_config")

    return
Example #12
def get_dags_from_file():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    try:
        # make sure the system configs are loaded
        config.load_system_configs()

        config_file = config.get("scheduler", "config_file")
        if not config_file:
            logger.info("No dags file has been defined at scheduler.config_file")
            return {}
        default_retries = config.getint("scheduler", "default_retries")
        active_by_default = config.getboolean("scheduler", "active_by_default")

        dags = DbndAirflowDagsProviderFromFile(
            config_file=config_file,
            active_by_default=active_by_default,
            default_retries=default_retries,
        ).get_dags()
        return {dag.dag_id: dag for dag in dags}
    except Exception as e:
        logging.exception("Failed to get dags from the file")
        raise e
Example #13
def use_airflow_connections():
    from dbnd._core.configuration.dbnd_config import config

    return is_airflow_enabled() and config.getboolean("airflow",
                                                      "use_connections")
Example #14
def get_databand_error_message(ex, args=None, sys_exit=True):
    args = args or sys.argv
    please_report = False
    print_source = True

    if isinstance(ex, DatabandRunError):
        # we already printed all information!
        return (
            "There is an error! Your run has failed!",
            DatabandExitCodes.execution_failed,
        )

    if isinstance(ex, DatabandRuntimeError):
        exit_code = DatabandExitCodes.execution_failed
    elif isinstance(ex, DatabandConfigError):
        exit_code = DatabandExitCodes.configuration_error
    elif isinstance(ex, DatabandSystemError):
        exit_code = DatabandExitCodes.error
        please_report = True
    elif isinstance(ex, DatabandError):
        exit_code = DatabandExitCodes.error
    elif ex.__class__.__name__ == "NoCredentialsError":  # aws
        exit_code = DatabandExitCodes.configuration_error
        ex = friendly_error.config.no_credentials()
        print_source = False
    else:
        please_report = True
        exit_code = DatabandExitCodes.unknown_error

    msg = str(ex)

    extra_msg_lines = []

    nested_exceptions = nested_exceptions_str(ex)
    if nested_exceptions:
        extra_msg_lines.append("Caused by: \n%s\n" %
                               indent(nested_exceptions, "\t"))

    help_msg = get_help_msg(ex)
    if help_msg:
        extra_msg_lines.append(" Help: \n%s\n" % indent(help_msg, "\t"))

    user_frame_info_str = get_user_frame_info_str(ex)
    if user_frame_info_str and print_source:
        extra_msg_lines.append("Source: \n%s\n" %
                               indent(user_frame_info_str, "\t"))

    # if we crashed before finishing bootstrap we probably want to see the full trace, and we could have failed during config init so the verbose flag does nothing
    if (show_exc_info(ex) or config.getboolean("databand", "verbose")
            or not bootstrap._dbnd_bootstrap):
        error_info = sys.exc_info()
        extra_msg_lines.append(format_exception_as_str(error_info))

    msg = truncate_msg(msg, ERROR_MESSAGE_HEAD_SIZE, ERROR_MESSAGE_TAIL_SIZE)

    if please_report:
        extra_msg_lines.append(
            " Please report it to [email protected] or appropriate slack channel!"
        )
    msg = ("There is an error! Your run has failed with {exc_type}\n"
           "{sep}\n"
           " Command line: {command_line}\n"
           " Failure:\n{msg}\n\n"
           "{extra_msg}\n"
           "{sep}\n"
           "".format(
               sep=console_utils.error_separator(),
               command_line=subprocess.list2cmdline(args or []),
               sep_small=console_utils.error_separator_small(),
               msg=console_utils.bold(indent(msg, "\t")),
               exc_type=ex.__class__.__name__,
               extra_msg="\n ".join(extra_msg_lines),
           ))
    return msg, exit_code
Example #15
    def __init__(
            self,
            task_passport,  # type: TaskPassport
            classdict=None,  # type: Optional[Dict[str, Any]]
            base_task_definitions=None,  # type: Optional[List[TaskDefinition]]
            defaults=None,  # type: Optional[Dict[ParameterDefinition, Any]]
            task_decorator=None,  # type: Optional[TaskDecorator]
            source_code=None,  # type: Optional[TaskSourceCode]
            external_parameters=None,  # type: Optional[Parameters]
            task_definition_uid=None,  # type: Optional[UUID]
    ):
        super(TaskDefinition, self).__init__()

        self.hidden = False

        self.task_passport = task_passport
        self.source_code = source_code
        self.task_decorator = task_decorator
        self.base_task_definitions = (base_task_definitions
                                      or [])  # type: List[ TaskDefinition]

        # TODO: maybe use properties or other way to delegate those...
        self.full_task_family = self.task_passport.full_task_family
        self.full_task_family_short = self.task_passport.full_task_family_short
        self.task_family = self.task_passport.task_family
        self.task_config_section = self.task_passport.task_config_section

        # all the attributes that point to a ParameterDefinition
        self.task_param_defs = dict()  # type: Dict[str, ParameterDefinition]

        # the defaults attribute
        self.defaults = dict()  # type: Dict[ParameterDefinition, Any]

        self.task_param_defs = self._calculate_task_class_values(
            classdict, external_parameters)
        # if we have output params in function arguments, like f(some_p=parameter.output),
        # the new function can not use its return value as the single result output
        self.single_result_output = self._is_result_single_output(
            self.task_param_defs)

        self.param_defaults = {
            p.name: p.default
            for p in self.task_param_defs.values() if is_defined(p.default)
        }

        # TODO: consider joining with task_config
        # TODO: calculate defaults value as _ConfigStore and merge using standard mechanism
        self.defaults = self._calculate_task_defaults(defaults)
        self.task_defaults_config_store = parse_and_build_config_store(
            source=self.task_passport.format_source_name("task.defaults"),
            config_values=self.defaults,
            priority=ConfigValuePriority.FALLBACK,
        )

        self.task_signature_extra = {}
        if config.getboolean("task_build", "sign_with_full_qualified_name"):
            self.task_signature_extra[
                "full_task_family"] = self.full_task_family
        if config.getboolean("task_build", "sign_with_task_code"):
            self.task_signature_extra[
                "task_code_hash"] = user_friendly_signature(
                    self.source_code.task_source_code)

        if task_definition_uid:
            self.task_definition_uid = task_definition_uid
        else:
            self.task_definition_uid = get_uuid()
Example #16
    def __init__(self, dag_bag=None):
        super(InProcessExecutor, self).__init__()
        self.tasks_to_run = []
        self.fail_fast = config.getboolean("run", "fail_fast")
        self.dag_bag = dag_bag