def __init__(self, task_class, classdict):
    """
    Build the definition object for ``task_class``.

    Populates identity fields from the class' ``TaskPassport``, computes the
    parameter definitions and defaults via ``_calculate_task_class_values``,
    builds the defaults config store and, when the class asks for it,
    captures the task source code.

    :param task_class: the task class this definition describes
    :param classdict: forwarded as-is to ``_calculate_task_class_values``
    """
    super(TaskDefinition, self).__init__()

    self.task_definition_uid = get_uuid()
    self.hidden = False
    self.task_class = task_class  # type: Type[Task]

    passport = TaskPassport.from_task_cls(task_class)
    self.task_passport = passport

    # TODO: maybe use properties or other way to delegate those...
    self.full_task_family = passport.full_task_family
    self.full_task_family_short = passport.full_task_family_short
    self.task_family = passport.task_family
    self.task_config_section = passport.task_config_section

    # attribute name -> parameter definition
    self.task_params = dict()  # type: Dict[str, ParameterDefinition]
    # the defaults attribute
    self.defaults = dict()  # type: Dict[ParameterDefinition, Any]

    calculated_params, calculated_defaults = self._calculate_task_class_values(
        classdict
    )
    self.task_params = calculated_params
    self.defaults = calculated_defaults

    # if we have output params in function arguments, like f(some_p=parameter.output)
    # the new function can not return the result of return
    self.single_result_output = self._is_result_single_output(self.task_params)

    # first layer: per-parameter defaults, applied only where nothing is set yet
    param_defaults = {}
    for param in self.task_params.values():
        if is_defined(param.default):
            param_defaults[param.name] = param.default
    defaults_store = parse_and_build_config_store(
        source=passport.format_source_name("defaults"),
        config_values={self.task_config_section: param_defaults},
        set_if_not_exists_only=True,
    )

    # second layer: the values collected in ``self.defaults``
    section_store = parse_and_build_config_store(
        source=passport.format_source_name("defaults_section"),
        config_values=self.defaults,
    )
    defaults_store.update(section_store)
    self.task_defaults_config_store = defaults_store

    # source code is captured only when the task class enables tracking
    if not self.task_class._conf__track_source_code:
        self.task_source_code = None
        self.task_module_code = ""
        self.task_source_file = None
    else:
        self.task_source_code = _get_task_source_code(self.task_class)
        self.task_module_code = _get_task_module_source_code(self.task_class)
        self.task_source_file = _get_source_file(self.task_class)
def _calculate_and_add_layer_of_task_config(self, param_task_config):
    """
    Resolve the class-level ``task_config`` parameter and apply it to the config.

    The value can be declared in any of these forms:

        @pipeline(task_config={CoreConfig.tracker: ["console"]})
        <or>
        class a(Task):
            task_config = {"core": {"tracker": ["console"]}}
            <or>
            task_config = {CoreConfig.tracker: ["console"]}

    :param param_task_config: definition of the ``task_config`` parameter
    :return: the built parameter value; when it is non-empty its ``value``
        is replaced by the parsed config store
    """
    # resolve `Task.task_config` from its definition
    # (the original notes this walks the inheritance chain)
    task_config_value = self._build_parameter_value(param_task_config)
    if not task_config_value.value:
        # nothing declared - no extra layer is needed
        return task_config_value

    # Two input shapes are supported:
    #   1. Task.param_name: 333
    #   2. {"section": {"key": "value"}}
    # (a dict parameter value can't have a non-string key)
    parsed_store = parse_and_build_config_store(
        config_values=task_config_value.value,
        source=self._source_name("task_config"),
    )
    task_config_value.value = parsed_store

    # merge `Task.task_config` on top of every layer already present
    self.config.set_values(task_config_value.value)
    return task_config_value
def _new_config_layer(self, config_values, source=None, override=False, merge_settings=None):
    """
    Create a new config layer on top of ``self.config_layer``.

    :param config_values: a ``_ConfigStore`` or raw values to be parsed into one;
        when falsy the current layer is returned unchanged
    :param source: name for the layer; falls back to the store's own ``source``
        and finally to the ``id()`` of the values
    :param override: forwarded to ``parse_and_build_config_store`` for raw values
    :param merge_settings: forwarded to ``create_layer``
    :return: the new layer, or ``self.config_layer`` when there is nothing to add
    """
    # make sure we are initialized: the user can call this from anywhere, so a
    # layer may be created here and overridden once config.system_load adds more
    dbnd_system_bootstrap()

    if not config_values:
        return self.config_layer

    store = config_values
    layer_name = source
    if not isinstance(store, _ConfigStore):
        # raw values: give them a name before parsing so the store carries it
        layer_name = layer_name or "{sig}".format(sig=id(store))
        store = parse_and_build_config_store(
            config_values=store, source=layer_name, override=override
        )  # type: _ConfigStore

    # explicit name first, then the store's own source, then its identity
    layer_name = layer_name or store.source or "{sig}".format(sig=id(store))

    return self.config_layer.create_layer(
        name=layer_name, config_values=store, merge_settings=merge_settings
    )
def _parse_cli(configs, source, override=False):
    """
    Parse every item in configs, joining them into one big ConfigStore.

    Each item is parsed with ``auto_section_parse=True``. Later items are
    merged into the store built from the first item, so later values are
    applied on top of earlier ones.

    :param configs: iterable of raw config mappings (e.g. values of a
        repeatable CLI option)
    :param source: source name recorded for every parsed value
    :param override: forwarded to ``parse_and_build_config_store``
    :return: a single config store; an empty store when ``configs`` is empty
    """
    merged = None
    for cli_values in configs:
        store = parse_and_build_config_store(
            config_values=cli_values,
            source=source,
            override=override,
            auto_section_parse=True,
        )
        if merged is None:
            merged = store
        else:
            # update() is used as an in-place merge everywhere else in this
            # code base, so do not depend on its return value here
            merged.update(store)

    if merged is None:
        # no items at all: return an empty store instead of failing the way
        # reduce() does on an empty sequence
        merged = parse_and_build_config_store(
            config_values={},
            source=source,
            override=override,
            auto_section_parse=True,
        )
    return merged
def get_autoloaded_config(self):
    """
    Return a config store that overrides two ``task_auto_config`` values.

    The store carries this object's ``param_int`` and ``param_datetime`` under
    the ``task_auto_config`` section, built with ``override=True`` and the
    source name ``user_config``.
    """
    auto_config_section = {
        "param_int": self.param_int,
        "param_datetime": self.param_datetime,
    }
    autoloaded_values = {"task_auto_config": auto_config_section}
    return parse_and_build_config_store(
        config_values=autoloaded_values, override=True, source="user_config"
    )
def get_autoloaded_config(self):
    """
    Return a config store that overrides two ``task_auto_config`` values.

    The store carries this object's ``param_int`` and ``param_datetime`` under
    the ``task_auto_config`` section, built with
    ``ConfigValuePriority.OVERRIDE`` and the source name ``user_config``.
    """
    auto_config_section = {
        "param_int": self.param_int,
        "param_datetime": self.param_datetime,
    }
    autoloaded_values = {"task_auto_config": auto_config_section}
    return parse_and_build_config_store(
        config_values=autoloaded_values,
        priority=ConfigValuePriority.OVERRIDE,
        source="user_config",
    )
def get_and_process_dbnd_dag_config(self):
    """
    Build the config store for this controller's DAG.

    Reads the optional ``dbnd_config`` entry of ``dag.default_args``, parses
    it into a config store, writes the parsed store back into
    ``default_args`` and returns it merged with
    ``_default_dbnd_dag_context_config``.
    """
    dag = self.dag
    default_args = dag.default_args

    # a DAG without default_args simply has no dbnd config of its own
    raw_dbnd_config = default_args.get("dbnd_config", {}) if default_args else {}

    parsed_store = parse_and_build_config_store(
        source="%s default args" % dag.dag_id,
        config_values=raw_dbnd_config,
        auto_section_parse=True,
    )

    # config can have problems around serialization,
    # so replace the user value with the "normalized" store
    if default_args:
        default_args["dbnd_config"] = parsed_store

    merged_store = _default_dbnd_dag_context_config.merge(parsed_store)
    logger.debug("Config store for %s: %s", self.dag.dag_id, merged_store)
    return merged_store
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks use `dbnd show-tasks` (tab completion is available).
    """
    # NOTE(review): the docstring above is presumably shown as CLI help text,
    # so it is kept verbatim - confirm before rewording it.
    #
    # Flow of this entry point:
    #   1. translate dedicated CLI flags into a "cli" config store
    #   2. fold --set / --set-config / --set-override / --interactive / -v -v in
    #   3. push the result into the global config, bootstrap, configure logging
    #   4. inside a new "run" context: resolve the task class, apply --set-root
    #      values to its config section, then print help or run the task
    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=filter_dict_remove_false_values(
            dict(
                verbose=verbose > 0,
                describe=describe,
                env=env,
                conf_file=conf_file,
                project_name=project_name,
            )
        ),
        run=filter_dict_remove_false_values(
            dict(
                name=name,
                parallel=parallel,
                description=description,
                is_archived=describe,
            )
        ),
    )

    # tri-state flags: None means "not given", so only explicit values are written
    if submit_driver is not None:
        main_switches["run"]["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        main_switches["run"]["submit_tasks"] = bool(submit_tasks)

    if disable_web_tracker:
        main_switches.setdefault("core", {})["tracker_api"] = "disabled"

    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    # copy the option values into lists; the dicts inside _sets are mutated below
    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    # all --set-root values are combined into one mapping for the root task
    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        # iterate over a snapshot because entries are deleted from _set
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    # publish the collected CLI values into the global config
    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # double checking on bootstrap, as we can run from all kind of locations
    # usually we should be bootstraped already as we run from cli.
    dbnd_bootstrap()
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()

        # refresh the completer with the task classes known to the registry
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

        # modules are loaded, we can load the task
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                # run under a different name: build a new class from the
                # original one and use the alternative name from here on
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # now we can get it config, as it's not main task, we can load config after the configuration is loaded
        if task_cls is not None:
            if root_task_config:
                # adding root task to configuration
                config.set_values(
                    {task_cls.task_definition.task_config_section: root_task_config},
                    source="--set-root",
                )

        # help requested, or nothing to run: print help and stop (returns None)
        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )
# dbnd code captures inline operator creation with Catcher, it's not in airflow mode. return not safe_isinstance(context_manager_dag, "DatabandOpCatcherDag") @dbnd_handle_errors(exit_on_error=False) def build_task_at_airflow_dag_context(task_cls, call_args, call_kwargs): dag = safe_get_context_manager_dag() dag_ctrl = DagFuncOperatorCtrl.build_or_get_dag_ctrl(dag) return dag_ctrl.build_airflow_operator( task_cls=task_cls, call_args=call_args, call_kwargs=call_kwargs ) _default_dbnd_dag_context_config = parse_and_build_config_store( source="airflow_defaults", config_values={"log": {"disabled": True, "capture_task_run_log": False}}, ) class DagFuncOperatorCtrl(object): dag_to_context = {} def __init__(self, dag): self.dag = dag self.dbnd_airflow_name = {} config_store = self.get_and_process_dbnd_dag_config() with dbnd_config( config_values=config_store, source="airflow" ) as current_config: self.dbnd_context = DatabandContext(name="airflow__%s" % self.dag.dag_id) with DatabandContext.context(_context=self.dbnd_context):
def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """
    # NOTE(review): the docstring above is presumably shown as CLI help text,
    # so it is kept verbatim - confirm before rewording it.
    #
    # Flow of this entry point:
    #   1. load user modules, translate dedicated CLI flags into a "cli" store
    #   2. fold --set / --set-config / --extend-config / --set-override and
    #      the --set-root values (under the task's config section) into it
    #   3. push the result into the global config, bootstrap, configure logging
    #   4. print help, describe the task, or run it inside a new "run" context
    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # we need to do it before we are looking for the task cls
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=dict(
            verbose=verbose > 0,
            print_task_band=print_task_band,
            describe=describe,
            env=env,
            conf_file=conf_file,
            project=project,
        ),
        run=dict(
            name=name,
            parallel=parallel,
            interactive=interactive,
            description=description,
            is_archived=describe,
            open_web_tracker_in_browser=open_web_tab,
            submit_driver=_nullable_flag(submit_driver),
            submit_tasks=_nullable_flag(submit_tasks),
        ),
        kubernetes=dict(docker_build_tag=docker_build_tag),
        task=dict(task_version=task_version),
        task_build=dict(verbose=True if verbose > 1 else None),
        core=dict(tracker_api="disabled" if disable_web_tracker else None),
    )

    # presumably drops unset switches so they do not shadow configured values
    # (see the comment above main_switches) - confirm in cleanup_empty_switches
    main_switches = cleanup_empty_switches(main_switches)

    # copy the option values into lists; the dicts inside _sets are mutated below
    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    # all --set-root values are combined into one mapping for the root task
    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        # iterate over a snapshot because entries are deleted from _set
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    cmd_line_config = parse_and_build_config_store(source="cli", config_values=main_switches)

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(
            _parse_cli(_extend, source="--extend-config", extend=True))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            ))

    # --set-root
    if root_task_config:
        # NOTE(review): task_name may be empty here (no task given on the CLI);
        # confirm get_task_cls handles that, the help path only runs later
        task_cls = task_registry.get_task_cls(task_name)
        task_section = task_cls.task_definition.task_config_section
        # adding root task to configuration
        cmd_line_config.update(
            parse_and_build_config_store(
                config_values={task_section: root_task_config}, source="--set-root"))

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))
        config.set_values(cmd_line_config, source="cmdline")

    # double checking on bootstrap, as we can run from all kind of locations
    # usually we should be bootstraped already as we run from cli.
    dbnd_bootstrap()

    # initialize basic logging (until we get to the context logging
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date)

    # update completer
    if config.getboolean("databand", "completer"):
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

    # bootstrap and modules are loaded, we can load the task
    task_cls = None
    if task_name:
        task_cls = task_registry.get_task_cls(task_name)

    # nothing to run: print the generic help and stop (returns None)
    if not task_name:
        print_help(ctx, None)
        return

    # help for a specific task
    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
            name="run") as context:  # type: DatabandContext
        if context.settings.system.describe:
            # we want to print describe without triggering real run
            logger.info("Building main task '%s'", task_name)
            root_task = get_task_registry().build_dbnd_task(task_name)
            root_task.ctrl.describe_dag.describe_dag()
            # currently there is bug with the click version we have when using python 2
            # so we don't use the click.echo function
            # https://github.com/pallets/click/issues/564
            print("Task %s has been described!" % task_name)
            return root_task
        # job name falls back to the alternative task name, then the task name;
        # a given run_driver takes precedence over override_run_uid and marks
        # the run as an existing one
        return context.dbnd_run_task(
            task_or_task_name=task_name,
            force_task_name=alternative_task_name,
            job_name=job_name or alternative_task_name or task_name,
            run_uid=run_driver or override_run_uid,
            existing_run=run_driver is not None,
            scheduled_run_info=scheduled_run_info,
            project=project,
        )
def __init__(self, task_class, classdict, namespace_at_class_time):
    """
    Build the definition object for ``task_class``.

    Derives the task family names from the class (or from its decorator spec
    when present), computes parameter definitions and defaults via
    ``_calculate_task_class_values``, builds the defaults config store and,
    when the class asks for it, captures the task source code.

    :param task_class: the task class this definition describes
    :param classdict: forwarded as-is to ``_calculate_task_class_values``
    :param namespace_at_class_time: stored on the definition unchanged
    """
    super(TaskDefinition, self).__init__()

    self.task_definition_uid = get_uuid()
    self.hidden = False

    self.task_class = task_class  # type: Type[Task]
    self.namespace_at_class_time = namespace_at_class_time

    # a decorator spec, when present, supplies the name instead of the class
    decorator_spec = self.task_class._conf__decorator_spec
    cls_name = decorator_spec.name if decorator_spec else self.task_class.__name__

    module_name = task_class.__module__
    self.full_task_family = "%s.%s" % (module_name, cls_name)
    self.full_task_family_short = "%s.%s" % (_short_name(module_name), cls_name)

    # a user-provided family also names the config section; without one the
    # class name is the family and the full family is the section
    user_task_family = self._build_user_task_family()
    if user_task_family:
        self.task_family = user_task_family
        self.task_config_section = user_task_family
    else:
        self.task_family = cls_name
        self.task_config_section = self.full_task_family

    # attribute name -> parameter definition
    self.task_params = dict()  # type: Dict[str, ParameterDefinition]
    # the defaults attribute
    self.defaults = dict()  # type: Dict[ParameterDefinition, Any]

    calculated_params, calculated_defaults = self._calculate_task_class_values(
        classdict
    )
    self.task_params = calculated_params
    self.defaults = calculated_defaults

    # if we have output params in function arguments, like f(some_p=parameter.output)
    # the new function can not return the result of return
    self.single_result_output = self._is_result_single_output(self.task_params)

    # first layer: per-parameter defaults, applied only where nothing is set yet
    param_defaults = {}
    for param in self.task_params.values():
        if is_defined(param.default):
            param_defaults[param.name] = param.default
    defaults_store = parse_and_build_config_store(
        source="%s[defaults]" % self.full_task_family_short,
        config_values={self.task_config_section: param_defaults},
        set_if_not_exists_only=True,
    )

    # second layer: the values collected in ``self.defaults``
    section_store = parse_and_build_config_store(
        source="%s[defaults_section]" % self.full_task_family_short,
        config_values=self.defaults,
    )
    defaults_store.update(section_store)
    self.task_defaults_config_store = defaults_store

    # source code is captured only when the task class enables tracking
    if not self.task_class._conf__track_source_code:
        self.task_source_code = None
        self.task_module_code = ""
        self.task_source_file = None
    else:
        self.task_source_code = _get_task_source_code(self.task_class)
        self.task_module_code = _get_task_module_source_code(self.task_class)
        self.task_source_file = _get_source_file(self.task_class)
def __init__(
    self,
    task_passport,  # type: TaskPassport
    classdict=None,  # type: Optional[Dict[str, Any]]
    base_task_definitions=None,  # type: Optional[List[TaskDefinition]]
    defaults=None,  # type: Optional[Dict[ParameterDefinition, Any]]
    task_decorator=None,  # type: Optional[TaskDecorator]
    source_code=None,  # type: Optional[TaskSourceCode]
    external_parameters=None,  # type: Optional[Parameters]
    task_definition_uid=None,  # type: Optional[UUID]
):
    """
    Build a task definition from a task passport.

    :param task_passport: identity of the task; family names and the config
        section are copied from it
    :param classdict: forwarded to ``_calculate_task_class_values``
    :param base_task_definitions: definitions of base tasks; ``None`` is
        stored as an empty list
    :param defaults: forwarded to ``_calculate_task_defaults``
    :param task_decorator: stored on the definition unchanged
    :param source_code: source code info of the task; may be ``None``, in
        which case no code hash is added to the signature extras
    :param external_parameters: forwarded to ``_calculate_task_class_values``
    :param task_definition_uid: uid to reuse; a new one is generated when falsy
    """
    super(TaskDefinition, self).__init__()

    self.hidden = False

    self.task_passport = task_passport
    self.source_code = source_code
    self.task_decorator = task_decorator
    self.base_task_definitions = (
        base_task_definitions or []
    )  # type: List[TaskDefinition]

    # TODO: maybe use properties or other way to delegate those...
    self.full_task_family = self.task_passport.full_task_family
    self.full_task_family_short = self.task_passport.full_task_family_short
    self.task_family = self.task_passport.task_family
    self.task_config_section = self.task_passport.task_config_section

    # all the attributes that points to_Parameter
    self.task_param_defs = dict()  # type: Dict[str, ParameterDefinition]

    # the defaults attribute
    self.defaults = dict()  # type: Dict[ParameterDefinition, Any]

    self.task_param_defs = self._calculate_task_class_values(
        classdict, external_parameters
    )
    # if we have output params in function arguments, like f(some_p=parameter.output)
    # the new function can not return the result of return
    self.single_result_output = self._is_result_single_output(self.task_param_defs)

    self.param_defaults = {
        p.name: p.default
        for p in self.task_param_defs.values()
        if is_defined(p.default)
    }

    # TODO: consider joining with task_config
    # TODO: calculate defaults value as _ConfigStore and merge using standard mechanism
    self.defaults = self._calculate_task_defaults(defaults)
    self.task_defaults_config_store = parse_and_build_config_store(
        source=self.task_passport.format_source_name("task.defaults"),
        config_values=self.defaults,
        priority=ConfigValuePriority.FALLBACK,
    )

    # extra values that take part in the task signature, driven by config
    self.task_signature_extra = {}
    if config.getboolean("task_build", "sign_with_full_qualified_name"):
        self.task_signature_extra["full_task_family"] = self.full_task_family
    if config.getboolean("task_build", "sign_with_task_code"):
        # source_code is optional: without it there is no code to hash, so
        # skip the entry instead of failing on attribute access of None
        if self.source_code is not None:
            self.task_signature_extra["task_code_hash"] = user_friendly_signature(
                self.source_code.task_source_code
            )

    if task_definition_uid:
        self.task_definition_uid = task_definition_uid
    else:
        self.task_definition_uid = get_uuid()