def _on_enter(self):
    pm.hook.dbnd_on_pre_init_context(ctx=self)
    run_user_func(config.get("core", "user_pre_init"))
    # if we are deserialized - we don't need to run this code again.
    if not self.initialized_context:
        # noinspection PyTypeChecker
        if self._module:
            load_python_module(self._module, "--module")

        module_from_config = config.get("databand", "module")
        if self._autoload_modules and module_from_config:
            load_python_module(
                module_from_config, "config file (see [databand].module)"
            )

        # will be called from singleton context manager
        # we want to be able to catch all "new" inline airflow operators
        self.system_settings = DatabandSystemConfig()
        if self.system_settings.conf:
            self.config.set_values(self.system_settings.conf, source="dbnd.conf")
        if self.system_settings.conf_file:
            conf_file = read_from_config_files(self.system_settings.conf_file)
            self.config.set_values(conf_file, source="dbnd__databand__conf")

        from dbnd._core.settings import DatabandSettings

        self.settings = DatabandSettings(databand_context=self)
        self.env = self.settings.get_env_config(self.system_settings.env)
        self.config.set_values(
            config_values={"task": {"task_env": self.system_settings.env}},
            source="context",
        )

        pm.hook.dbnd_on_new_context(ctx=self)

        # RUN USER SETUP FUNCTIONS
        _run_user_func(
            self.settings.core.__class__.user_driver_init,
            self.settings.core.user_driver_init,
        )

        self.task_run_env = RunInfoConfig().build_task_run_info()
        self.initialized_context = True
    else:
        # we get here if we are running in a subprocess that recreates the context
        pm.hook.dbnd_on_existing_context(ctx=self)

    # we do this every time we enter databand_config
    self.configure_targets()
    self.settings.log.configure_dbnd_logging()

    _run_user_func(
        self.settings.core.__class__.user_init, self.settings.core.user_init
    )
    pm.hook.dbnd_post_enter_context(ctx=self)
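# Hedged sketch (not from the dbnd sources): _on_enter runs whatever dotted path
# is stored under [core] user_pre_init through run_user_func. A user-side hook
# could look like this hypothetical function, referenced from config as
# "my_project.bootstrap.warm_up":
def warm_up():
    # runs before the DatabandContext finishes initializing
    import logging

    logging.getLogger(__name__).info("user_pre_init hook called")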
def basic_logging_config(
    filename=None,
    log_level=logging.INFO,
    console_stream=sys.stderr,
    console_formatter_name="formatter_colorlog",
    file_formatter_name="formatter_full",
):
    # type: (...) -> Optional[dict]
    config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "formatter_full": {"format": FORMAT_FULL},
            "formatter_simple": {"format": FORMAT_SIMPLE},
            "formatter_colorlog": {
                "()": "dbnd._vendor.colorlog.ColoredFormatter",
                "format": FORMAT_COLORLOG,
                "reset": True,
            },
        },
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "stream": console_stream,
                "formatter": console_formatter_name,
            }
        },
        "root": {"handlers": ["console"], "level": log_level},
    }

    if filename:
        setup_log_file(filename)
        config["handlers"]["file"] = {
            "class": "logging.FileHandler",
            "formatter": file_formatter_name,
            "filename": filename,
            "encoding": "utf-8",
        }
        config["root"]["handlers"].append("file")

    sentry_url = dbnd_config.get("log", "sentry_url")
    if sentry_url:
        sentry_env = dbnd_config.get("log", "sentry_env", default="dev")
        config["handlers"]["sentry"] = get_sentry_logging_config(
            sentry_url=sentry_url, sentry_env=sentry_env
        )
        config["root"]["handlers"].append("sentry")

    return config
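# Hedged usage sketch: the dict returned by basic_logging_config follows the
# standard logging.config dict schema (version 1), so it can be applied with
# dictConfig. "/tmp/dbnd.log" is an arbitrary example path, not a dbnd default.
import logging.config

log_config = basic_logging_config(filename="/tmp/dbnd.log", log_level=logging.DEBUG)
if log_config:
    logging.config.dictConfig(log_config)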
def get_dags_from_databand(custom_operator_class: Optional[type] = None):
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    from dbnd._core.errors.base import DatabandApiError, DatabandConnectionException

    try:
        # make sure system configs are loaded
        config.load_system_configs()
        if not config.get("core", "databand_url"):
            return {}

        default_retries = config.getint("scheduler", "default_retries")

        dags = DbndSchedulerDBDagsProvider(
            default_retries=default_retries,
            custom_operator_class=custom_operator_class,
        ).get_dags()

        if not in_quiet_mode():
            logger.info("providing %s dags from scheduled jobs", len(dags))
        return {dag.dag_id: dag for dag in dags}
    except (DatabandConnectionException, DatabandApiError) as e:
        logger.error(str(e))
        raise e
    except Exception as e:
        logger.exception("Failed to get dags from databand server")
        raise e
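# Hedged usage sketch: a common Airflow pattern is to expose dynamically built
# DAGs by injecting them into a DAG file's module globals. Assuming this code
# runs in a file under Airflow's dags folder:
dags_dict = get_dags_from_databand()
if dags_dict:
    # make each DAG discoverable by the Airflow DagBag
    globals().update(dags_dict)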
def set_airflow_sql_conn_from_dbnd_config():
    logging.debug("updating airflow config from dbnd config")
    from dbnd._core.configuration.dbnd_config import config as dbnd_config

    sql_alchemy_conn = dbnd_config.get("airflow", "sql_alchemy_conn")
    if sql_alchemy_conn == "dbnd":
        logging.debug("updating airflow sql from dbnd core.sql_alchemy_conn")
        sql_alchemy_conn = dbnd_config.get("core", "sql_alchemy_conn")

    if sql_alchemy_conn and "AIRFLOW__CORE__SQL_ALCHEMY_CONN" not in os.environ:
        os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = sql_alchemy_conn

    fernet_key = dbnd_config.get("airflow", "fernet_key")
    if fernet_key == "dbnd":
        fernet_key = dbnd_config.get("core", "fernet_key")

    if fernet_key and "AIRFLOW__CORE__FERNET_KEY" not in os.environ:
        os.environ["AIRFLOW__CORE__FERNET_KEY"] = fernet_key
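# Hedged sketch of the precedence implemented above: an environment variable that
# is already set always wins, and the dbnd config value is only a fallback. The
# connection string below is an example value, not a dbnd default.
import os

os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = "sqlite:////tmp/airflow.db"
set_airflow_sql_conn_from_dbnd_config()
# the pre-existing value is left untouched
assert os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] == "sqlite:////tmp/airflow.db"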
def test_str_interpolation(self):
    with config(
        {
            "b": dict(
                a="@python://%s" % "test_dbnd.configuration.test_config_layers._a"
            )
        }
    ):
        assert config.get("b", "a") == "from_a"
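# Hedged sketch: for the @python:// interpolation above to resolve to "from_a",
# the referenced module presumably exposes a callable along these lines
# (reconstructed from the assert, not copied from test_config_layers):
def _a():
    return "from_a"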
def dbnd_bootstrap():
    global _dbnd_bootstrap
    global _dbnd_bootstrap_started

    if _dbnd_bootstrap_started:
        return
    _dbnd_bootstrap_started = True

    dbnd_system_bootstrap()

    from targets.marshalling import register_basic_data_marshallers

    register_basic_data_marshallers()

    _surpress_loggers()
    _suppress_warnings()
    enable_osx_forked_request_calls()

    if is_airflow_enabled():
        from dbnd_airflow.bootstrap import airflow_bootstrap

        airflow_bootstrap()

    register_dbnd_plugins()

    from dbnd._core.configuration import environ_config
    from dbnd._core.utils.basics.load_python_module import run_user_func
    from dbnd._core.plugin.dbnd_plugins import pm
    from dbnd._core.configuration.dbnd_config import config

    user_plugins = config.get("core", "plugins", None)
    if user_plugins:
        register_dbnd_user_plugins(user_plugins.split(","))

    if is_unit_test_mode():
        pm.hook.dbnd_setup_unittest()

    pm.hook.dbnd_setup_plugin()

    if is_sigquit_handler_on():
        from dbnd._core.utils.basics.signal_utils import (
            register_sigquit_stack_dump_handler,
        )

        register_sigquit_stack_dump_handler()

    # now we can run user code (at driver/task)
    user_preinit = environ_config.get_user_preinit()
    if user_preinit:
        run_user_func(user_preinit)

    # if any code calls dbnd_bootstrap again, this flag prevents endless recursion
    _dbnd_bootstrap = True
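# Hedged illustration (made-up plugin names): per the split(",") above, the
# [core] plugins value is a plain comma-separated string in dbnd's INI-style
# config:
#
# [core]
# plugins = dbnd-plugin-one,dbnd-plugin-two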
def test_layers(self):
    with config({"b": dict(a=2)}):
        config.log_current_config()
        config.set("core", "a", "1")
        config.set("core", "b", "1")
        with config({"core": dict(a=5)}):
            config.log_current_config(as_table=True)
            assert config.get("core", "a") == 5
        config.log_current_config()
        config.log_layers()
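# Hedged follow-up sketch (an assumption about layered-config semantics,
# consistent with the assert above): values set inside a `with config({...})`
# block shadow outer layers and are dropped once the block exits.
def test_layer_pops_on_exit(self):
    config.set("core", "a", "1")
    with config({"core": dict(a=5)}):
        assert config.get("core", "a") == 5
    assert config.get("core", "a") == "1"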
def _on_enter(self):
    pm.hook.dbnd_on_pre_init_context(ctx=self)
    run_user_func(config.get("core", "user_pre_init"))
    # if we are deserialized - we don't need to run this code again.
    if not self._is_initialized:
        # will be called from singleton context manager
        self.system_settings = DatabandSystemConfig()
        if self.system_settings.conf:
            self.config.set_values(self.system_settings.conf, source="[databand]conf")
        if self.system_settings.conf_file:
            conf_file = read_from_config_files(self.system_settings.conf_file)
            self.config.set_values(conf_file, source="[databand]conf")

        from dbnd._core.settings import DatabandSettings

        self.settings = DatabandSettings(databand_context=self)
        self.env = self.settings.get_env_config(self.system_settings.env)
        self.config.set_values(
            config_values={"task": {"task_env": self.system_settings.env}},
            source="context",
        )

        pm.hook.dbnd_on_new_context(ctx=self)

        # RUN USER SETUP FUNCTIONS
        _run_user_func(
            self.settings.core.__class__.user_driver_init,
            self.settings.core.user_driver_init,
        )

        self.task_run_env = RunInfoConfig().build_task_run_info()
        self._is_initialized = True
    else:
        # we get here if we are running in a subprocess that recreates the context
        pm.hook.dbnd_on_existing_context(ctx=self)

    # we do this every time we enter databand_config
    self.configure_targets()
    self.settings.log.configure_dbnd_logging()

    _run_user_func(
        self.settings.core.__class__.user_init, self.settings.core.user_init
    )
    pm.hook.dbnd_post_enter_context(ctx=self)
def get_job_run_uid(airflow_instance_uid, dag_id, execution_date):
    # TODO_CORE: change to source_instance_uid
    if isinstance(execution_date, six.string_types):
        execution_date = pendulum.parse(execution_date)
    if isinstance(execution_date, datetime.datetime):
        # temporary fix for existing databases with uids without microseconds
        algo_threshold = config.get("webserver", "run_uid_execution_date_threshold")
        if algo_threshold and execution_date <= pendulum.parse(algo_threshold):
            execution_date = execution_date.replace(microsecond=0)
        execution_date = execution_date.astimezone(pytz.utc).isoformat()

    if airflow_instance_uid is None:
        return uuid.uuid5(NAMESPACE_DBND_RUN, "{}:{}".format(dag_id, execution_date))
    else:
        return uuid.uuid5(
            NAMESPACE_DBND_RUN,
            "{}:{}:{}".format(airflow_instance_uid, dag_id, execution_date),
        )
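# Hedged sketch of why uuid5 is used above: it is deterministic, so the same
# (dag_id, execution_date) pair always maps to the same run uid. The namespace
# below is a stand-in; dbnd's real NAMESPACE_DBND_RUN constant is defined
# elsewhere.
import uuid

_EXAMPLE_NAMESPACE = uuid.UUID("00000000-0000-0000-0000-000000000000")
u1 = uuid.uuid5(_EXAMPLE_NAMESPACE, "my_dag:2021-01-01T00:00:00+00:00")
u2 = uuid.uuid5(_EXAMPLE_NAMESPACE, "my_dag:2021-01-01T00:00:00+00:00")
assert u1 == u2  # stable across processes and machines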
def get_dags(self):
    # type: () -> List[DAG]
    if not config.get("core", "databand_url"):
        self.scheduled_jobs = []
        return []

    logger.debug("about to get scheduler job dags from dbnd db")
    self.refresh_scheduled_jobs()
    dags = []
    for job in self.scheduled_jobs:
        if "schedule_interval" not in job:
            continue

        dag = self.job_to_dag(job)
        dag.sync_to_db()
        validation_errors = job.get("validation_errors", None)
        DagModel.get_dagmodel(dag.dag_id).set_is_paused(
            is_paused=not job["active"]
            or (validation_errors is not None and len(validation_errors) > 0),
            including_subdags=False,
        )
        dags.append(dag)
    return dags
def get_dags_from_file():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    try:
        # make sure system configs are loaded
        config.load_system_configs()

        config_file = config.get("scheduler", "config_file")
        if not config_file:
            logger.info("No dags file has been defined at scheduler.config_file")
            return {}
        default_retries = config.getint("scheduler", "default_retries")
        active_by_default = config.getboolean("scheduler", "active_by_default")

        dags = DbndAirflowDagsProviderFromFile(
            config_file=config_file,
            active_by_default=active_by_default,
            default_retries=default_retries,
        ).get_dags()
        return {dag.dag_id: dag for dag in dags}
    except Exception as e:
        logger.exception("Failed to get dags from the file")
        raise e
def __init__(self, config_file=None):
    config.load_system_configs()
    self.config_file = (
        config_file if config_file else config.get("scheduler", "config_file")
    )
    self.active_by_default = config.get("scheduler", "active_by_default")
def _get_task_cls(self, task_name):
    from dbnd._core.utils.basics.load_python_module import load_python_module

    task_cls = self._get_registered_task_cls(task_name)
    if task_cls:
        return task_cls

    # check whether we have an override/definition for this task in config
    config_task_type = config.get(task_name, "_type", None)
    if config_task_type:
        _validate_no_recursion_in_config(task_name, config_task_type, "_type")
        try:
            return self._get_task_cls(config_task_type)
        except Exception:
            logger.error(
                "Failed to load type required by [%s] using _type=%s",
                task_name,
                config_task_type,
            )
            raise
    config_task_type = config.get(task_name, "_from", None)
    if config_task_type:
        _validate_no_recursion_in_config(task_name, config_task_type, "_from")
        return self._get_task_cls(config_task_type)

    if "." in task_name:
        parts = task_name.split(".")
        possible_root_task = parts.pop()
        possible_module = ".".join(parts)

        # try to load the module and check again for existence
        load_python_module(possible_module, "task name '%s'" % task_name)

        task_cls = self._get_registered_task_cls(task_name)
        if task_cls:
            return task_cls

        # check if the task exists but the user forgot to decorate it with @task
        task_module = sys.modules.get(possible_module)
        if task_module and hasattr(task_module, possible_root_task):
            user_func = getattr(task_module, possible_root_task)
            if callable(user_func):
                # a non-decorated function was found - decorate and return it
                from dbnd._core.decorator import dbnd_func_proxy

                decorated_task = dbnd_func_proxy.task(user_func)
                setattr(task_module, possible_root_task, decorated_task)
                logger.warning(
                    "Found non-decorated task: %s. "
                    "Please decorate this task with @pipeline or @task.\n"
                    "Auto-decorating and treating it as @task ...",
                    task_name,
                )
                return decorated_task.task

    if is_airflow_enabled():
        from dbnd_airflow.dbnd_task_executor.airflow_operator_as_dbnd import (
            AirflowDagAsDbndTask,
        )

        dag = self._get_aiflow_dag(task_name)
        if dag:
            return AirflowDagAsDbndTask
    return None
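# Hedged illustration of the config lookup above (section and class names are
# made up): a task section can point at a concrete definition via _type, or
# inherit another section's values via _from; the resolution above recurses
# until it reaches a registered task class.
#
# [my_task]
# _type = my_package.tasks.MyConcreteTask
#
# [my_task_staging]
# _from = my_task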
def __init__(self, dbnd_task_type, dbnd_task_id, **kwargs):
    super(DbndOperator, self).__init__(**kwargs)
    self._task_type = dbnd_task_type
    self.dbnd_task_id = dbnd_task_id

    # make sure that we run in a separate pool
    self.pool = dbnd_config.get("airflow", "dbnd_pool")
import json
from collections import namedtuple
from typing import Any, Dict

from dbnd._core.configuration.dbnd_config import config
from dbnd.api.api_utils import ApiClient
from dbnd.api.shared_schemas.scheduled_job_schema import ScheduledJobSchemaV2


config.load_system_configs()
api_client = ApiClient(
    config.get("core", "databand_url"),
    auth=True,
    user=config.get("scheduler", "dbnd_user"),
    password=config.get("scheduler", "dbnd_password"),
)
schema = ScheduledJobSchemaV2(strict=False)

ScheduledJobNamedTuple = namedtuple("ScheduledJobNamedTuple", schema.fields.keys())
ScheduledJobNamedTuple.__new__.__defaults__ = (None,) * len(
    ScheduledJobNamedTuple._fields
)


def post_scheduled_job(scheduled_job_dict):
    data, _ = schema.dump({"DbndScheduledJob": scheduled_job_dict})
    response = api_client.api_request(
        "/api/v1/scheduled_jobs", data, method="POST", no_prefix=True
    )
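# Hedged usage sketch for post_scheduled_job: the exact payload fields are
# dictated by ScheduledJobSchemaV2, so apart from "schedule_interval" (checked
# by the scheduler code elsewhere in this section) the keys below are
# illustrative assumptions only.
example_job = {
    "name": "nightly-report",  # assumed field
    "cmd": "dbnd run my_pipeline",  # assumed field
    "schedule_interval": "0 2 * * *",  # cron-style schedule
}
post_scheduled_job(example_job)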