def parse_value(self, value, load_value=None, target_config=None):
    """
    Parse an individual value from the input.

    probably this is the most important code in user value parsing

    :param str value: the value to parse.
    :param load_value: if None, falls back to ``self.load_on_build``;
        controls whether string input is parsed/loaded or treated as a path.
    :param target_config: optional config forwarded to target construction.
    :return: the parsed value.
    """
    from dbnd._core.utils.task_utils import to_targets
    from targets.inmemory_target import InMemoryTarget
    from targets.values.target_values import _TargetValueType

    if load_value is None:
        # no explicit instruction -> use this value type's default policy
        load_value = self.load_on_build

    value = self._interpolate_from_str(value)
    if value is None:
        return value

    if isinstance(value, six.string_types):
        # we are in the string mode
        # it can be "serialized to string" or a path value
        if load_value:
            # we can just load value from string
            if self.support_from_str:
                value = self.parse_from_str(value)
                value = self.normalize(value)
                return value

        # otherwise - the value is a path!
        target_kwargs = {}
        if target_config:
            target_kwargs["config"] = target_config
        # NOTE(review): the string is decoded via json_utils.loads here, so a
        # JSON list of paths is also supported -- confirm callers always pass
        # JSON-compatible path strings
        return to_targets(json_utils.loads(value), from_string_kwargs=target_kwargs)

    from dbnd._core.task import Task
    from targets import Target

    if isinstance(value, Task):
        # a Task stands for its outputs -> convert it to target(s)
        return to_targets(value)

    if isinstance(value, Target):
        return value

    # so we have a value that is obviously "Data" type,
    # we want to be able to support "load_value" behaviour
    if not load_value and not isinstance(self, _TargetValueType):
        # defer loading: wrap the raw value so it behaves like a target
        return InMemoryTarget(value, value_type=self)

    value = self.normalize(value)
    return value
def initialize_required(self):
    """
    Build the task's input requirements: every input parameter value,
    reduced to the Target objects found inside it, grouped into
    "user" and "system" sections.
    """
    # regular requirements -- just all inputs
    inputs = {"user": {}, "system": {}}

    # we take all parameters that are inputs (not outputs)
    # however Primitive parameters are inputs only if they are Target (deferred)
    # if isinstance(p, _TargetParameter) or isinstance(value, Target)
    for p, value in self.params.get_param_values(input_only=True):
        if value is None:
            continue

        # keep only actual Target objects inside the (possibly nested) value
        value = traverse(value, convert_f=_find_target, filter_none=True, filter_empty=True)
        if not value:
            continue

        inputs[_section(p)][p.name] = value

    def _extend_system_section(key, extra):
        # helper: attach an extra requirement group under "system" if non-empty
        if not extra:
            return
        inputs["system"][key] = extra

    from dbnd import PipelineTask

    if isinstance(self.task, PipelineTask):
        # a pipeline's band outputs become required inputs:
        # the pipeline can run only when all of its "outputs" are ready
        task_output_values = {}
        for p, value in self.params.get_param_values(output_only=True, user_only=True):
            if p.name == "task_band" or isinstance(p, FuncResultParameter):
                continue
            if is_not_defined(value):
                # pipeline outputs must be assigned by this point
                raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                    task=self.task, param=p
                )
            task_output_values[p.name] = value
        _extend_system_section("band", task_output_values)

    # find all child pipelines and make them upstreams to the task
    _extend_system_section(
        "pipelines", {p.task_id: p for p in self._get_all_child_pipelines()}
    )
    # maybe the user still uses the _requires() function - add that to dependencies
    _extend_system_section("required", self.task._requires())

    return to_targets(inputs)
def initialize_required(self):
    """
    Build the task's input requirements: every input parameter value,
    reduced to the Target objects found inside it, grouped into
    "user" and "system" sections.
    """
    # regular requirements -- just all inputs
    inputs = {"user": {}, "system": {}}

    # we take all parameters that are inputs (not outputs)
    # however Primitive parameters are inputs only if they are Target (deferred)
    # if isinstance(p, _TargetParameter) or isinstance(value, Target)
    for p, value in self.params.get_params_with_value(ParameterFilters.INPUTS):
        if value is None:
            continue

        # keep only actual Target objects inside the (possibly nested) value
        value = traverse(value, convert_f=_find_target, filter_none=True, filter_empty=True)
        if not value:
            continue

        inputs[_section(p)][p.name] = value

    def _extend_system_section(key, extra):
        # helper: attach an extra requirement group under "system" if non-empty
        if not extra:
            return
        inputs["system"][key] = extra

    from dbnd import PipelineTask

    if isinstance(self.task, PipelineTask):
        task_output_values = {}
        for p, value in self.params.get_params_with_value(ParameterFilters.USER_OUTPUTS):
            if p.name == "task_band" or isinstance(p, FuncResultParameter):
                continue

            # band outputs are going to be required as inputs!
            # @pipeline can run only when all of it's "outputs" are ready
            task_output_values[p.name] = value

        _extend_system_section("band", task_output_values)

    # find all child pipelines and make them upstreams to the task
    _extend_system_section(
        "pipelines", {p.task_id: p for p in self._get_all_child_pipelines()}
    )
    # maybe the user still uses the _requires() function - add that to dependencies
    _extend_system_section("required", self.task._requires())

    return to_targets(inputs)
def normalize(self, x):
    # type: (T) -> T
    """
    Given a parsed parameter value, normalizes it.

    The value can either be the result of parse(), the default value or
    arguments passed into the task's constructor by instantiation.

    This is very implementation defined, but can be used to validate/clamp
    valid values. For example, if you wanted to only accept even integers,
    and "correct" odd values to the nearest integer, you can implement
    normalize as ``x // 2 * 2``.
    """
    # non-target value types know how to normalize themselves
    if not isinstance(self.value_type, _TargetValueType):
        return self.value_type.normalize(x)

    # target-typed values must be handled at this level:
    # only here do we have access to target_config
    from dbnd._core.utils.task_utils import to_targets

    return to_targets(x, from_string_kwargs={"config": self.target_config})
def initialize_outputs(self):
    """
    The default output that this Task produces. Use outputs!
    Override only if you are writing a "base" class.
    """
    task = self.task
    outputs = {"user": {}, "system": {}}

    for p, value in self.params.get_params_with_value(ParameterFilters.OUTPUTS):
        if is_not_defined(value):
            # output not assigned by the user -> build the default output target
            value = p.build_output(task=task)
            setattr(self.task, p.name, value)

        if isinstance(p, FuncResultParameter):
            # result "wrapper" parameters are not standalone outputs
            continue

        # stamp every target in the (possibly nested) value with its source parameter
        value = traverse_and_set_target(value, p._target_source(self.task))
        outputs[_section(p)][p.name] = value

    custom_outputs = self.task._output()
    if custom_outputs:
        if outputs["user"]:
            warnings.warn(
                "Task %s has custom outputs in _output() function, all other outputs will be removed: %s"
                % (task, outputs["user"]),
                stacklevel=2,
            )
        # custom _output() replaces all regular user outputs
        outputs["user"] = custom_outputs

    # take ownership of all outputs and clean it, just in case
    # usually all outputs are assigned to task
    # just in case we have some "outputs" with Tasks
    outputs = to_targets(outputs)
    self.task_outputs = traverse_and_set_target(
        outputs, target_source=TargetSource(task_id=self.task_id)
    )
def normalize_to_target(self, value):
    """Convert *value* (possibly a nested structure) into Target objects."""
    from dbnd._core.utils.task_utils import to_targets

    normalized = to_targets(value)
    return normalized
def normalize(self, x):
    """Normalize *x* into target(s), using this parameter's target_config."""
    # must stay at this level (not in value_type): target_config lives here
    string_kwargs = dict(config=self.target_config)
    return to_targets(x, from_string_kwargs=string_kwargs)
def parse_value(self, value, load_value=None, target_config=None):
    """
    Parse an individual value from the input.

    probably this is the most important code in user value parsing

    :param str value: the value to parse.
    :param load_value: if None, falls back to ``self.load_on_build``;
        controls whether string input is parsed/loaded or treated as a path.
    :param target_config: optional config forwarded to target construction.
    :return: the parsed value.
    """
    from dbnd._core.utils.task_utils import to_targets
    from targets.inmemory_target import InMemoryTarget
    from targets.values.target_values import _TargetValueType
    from targets import Target

    if load_value is None:
        # no explicit instruction -> use this value type's default policy
        load_value = self.load_on_build

    value = self._interpolate_from_str(value)
    if value is None:
        return value

    if isinstance(value, six.string_types):
        # we are in the string mode
        # it can be "serialized to string" or a path value
        if load_value:
            # in case we have simple type -> just load/parse it
            if self.support_from_str:
                value = self.parse_from_str(value)
                value = self.normalize(value)
                return value

        # otherwise - the data is "Complex object"
        # our assumption is that it can not be loaded from string
        # the value is a path!
        target_kwargs = {}
        if target_config:
            target_kwargs["config"] = target_config

        # Check for glob path
        if _is_glob_path(value):
            from targets import target

            return target(value, config=target_config)

        """
        it's possible that we have a list of targets, or just a single target
        (all targets should be loaded as single object).
        we need to support:
        1. /some/path
        2. /some/path,....
        3. ["/some_path",..]
        we will try to parse it as list, if we get list with one element (1)
        -> we can return it, otherwise we wrap it with MultiTarget
        """
        from targets.values.structure import ListValueType

        # Parse into value type list
        list_of_targets = ListValueType().parse_from_str(value)
        # Apply all values from config
        list_of_targets = to_targets(list_of_targets, from_string_kwargs=target_kwargs)
        if len(list_of_targets) == 1:
            return list_of_targets[0]
        else:
            from targets.multi_target import MultiTarget

            return MultiTarget(list_of_targets)

    from dbnd._core.task import Task

    if isinstance(value, Task):
        # a Task stands for its outputs -> convert it to target(s)
        return to_targets(value)

    if isinstance(value, Target):
        return value

    # so we have a value that is obviously "Data" type,
    # we want to be able to support "load_value" behaviour
    if not load_value and not isinstance(self, _TargetValueType):
        # defer loading: wrap the raw value so it behaves like a target
        return InMemoryTarget(value, value_type=self)

    value = self.normalize(value)
    return value
def initialize_band(self):
    """
    Run the task's .band() to wire the pipeline, then validate and
    normalize any outputs the band call (re)assigned.
    """
    try:
        band_context = []
        if is_airflow_enabled():
            # run band() under an operator-catcher DAG so airflow operators
            # created inside are captured
            from dbnd_airflow.dbnd_task_executor.airflow_operators_catcher import (
                get_databand_op_catcher_dag,
            )

            band_context.append(get_databand_op_catcher_dag())

        # snapshot output values so we can detect changes made by band()
        original_param_values = []
        for param_value in self.task.task_params.get_param_values(
            ParameterFilters.OUTPUTS
        ):
            if param_value.name == "task_band" or isinstance(
                param_value.parameter, FuncResultParameter
            ):
                continue
            original_param_values.append((param_value, param_value.value))

        with nested(*band_context):
            band = self.task.band()
            # this one would be normalized
            self.task._task_band_result = band
        self.task_band_result = band  # real value

        from dbnd import PipelineTask

        if isinstance(self.task, PipelineTask):
            # after .band has finished, all user outputs of the .band should be defined
            for param_value, _ in original_param_values:
                # we want to validate only user facing parameters
                # they should have assigned values by this moment,
                # pipeline task can not have None outputs, after band call
                if param_value.parameter.system:
                    continue
                if is_not_defined(param_value.value):
                    raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                        task=self.task, param=param_value.parameter
                    )

        # now let's normalize if user has changed outputs
        for param_value, original_value in original_param_values:
            if param_value.value is original_value:
                continue
            try:
                from dbnd._core.utils.task_utils import to_targets

                normalized_value = to_targets(param_value.value)
                param_value.update_param_value(normalized_value)
            except Exception as ex:
                raise friendly_error.task_build.failed_to_assign_param_value_at_band(
                    ex, param_value.parameter, param_value.value, self.task
                )
    except Exception as ex:
        # show a friendly banner with the failure before re-raising
        logger.warning(
            self.visualiser.banner(
                msg="Failed to run %s" % _band_call_str(self.task),
                color="red",
                exc_info=sys.exc_info(),
            )
        )
        if self.task.task_decorator:
            # just re-raise, we already have an error from the "run" function
            raise
        raise friendly_error.task_build.failed_to_call_band(ex, self.task)
def auto_load_save_params(self):
    """
    Generator body of an auto read/write context for task parameters:
    snapshot values, yield to the caller, then save or normalize any
    parameter values the caller changed, restoring originals as needed.
    (presumably wrapped with @contextmanager at definition site -- TODO confirm)
    """
    task = self.task
    original_values = task._params.get_param_values()

    # we don't support "nested" calls for now,
    # let's not overcomplicate code for non existing scenario
    if task._task_auto_read is not None:
        logger.warning(
            "You are running in {task} within already existing TaskAutoParamsReadWrite context".format(
                task=task
            )
        )
    if self.auto_read:
        task._task_auto_read_original = {p.name: v for p, v in original_values}
        task._task_auto_read = set()
    dirty = self.save_on_change
    try:
        yield original_values

        # now we disable "auto read"
        task._task_auto_read = None
        task._task_auto_read_original = None

        # from here we are going to read "the value" without autoresolving
        current_values = {
            p.name: value for p, value in task._params.get_param_values()
        }
        if not self.save_on_change and not self.normalize_on_change:
            return

        # collect parameters whose value object was replaced during the yield
        changed = []
        for p, original_value in original_values:
            current_value = current_values[p.name]
            if id(original_value) != id(current_value):
                # same object -> nothing to do; different -> track it
                changed.append((p, original_value, current_value))

        if self.save_on_change:
            try:
                for p, original_value, current_value in changed:
                    # TODO: implement Atomic commit
                    if p.is_output():
                        self.auto_save_param(p, original_value, current_value)
            finally:
                # restore originals regardless of save success
                for p, original_value, current_value in changed:
                    setattr(task, p.name, original_value)
                dirty = False
        elif self.normalize_on_change:
            for p, original_value, current_value in changed:
                try:
                    # probably we are in the band
                    # we are going just to normalize the value
                    if p.is_output():
                        from dbnd._core.utils.task_utils import to_targets

                        normalized_value = to_targets(current_value)
                    else:
                        normalized_value = p.normalize(current_value)
                    if id(normalized_value) != id(current_value):
                        setattr(task, p.name, normalized_value)
                except Exception as ex:
                    raise friendly_error.task_build.failed_to_assign_param_value_at_band(
                        ex, p, current_value, task
                    )
    finally:
        task._task_auto_read = None
        if dirty:
            # an error occurred before values were committed -> roll back
            for p, original_value in original_values:
                setattr(task, p.name, original_value)
def auto_load_save_params(self):
    """
    Generator body of an auto read/write context for task parameters:
    optionally load runtime values, snapshot them, yield to the caller,
    then save or normalize any values the caller changed, restoring
    originals as needed.
    (presumably wrapped with @contextmanager at definition site -- TODO confirm)
    """
    task = self.task
    original_values = task._params.get_param_values()

    if self.auto_read:
        # materialize runtime values first, then re-snapshot
        task.load_task_runtime_values()
        original_values = task._params.get_param_values()
        task._task_auto_read_original = {p.name: v for p, v in original_values}

    dirty = self.save_on_change
    try:
        yield original_values

        # now we disable "auto read"
        task._task_auto_read_original = None

        # from here we are going to read "the value" without autoresolving
        current_values = {
            p.name: value for p, value in task._params.get_param_values()
        }
        if not self.save_on_change and not self.normalize_on_change:
            return

        # collect parameters whose value object was replaced during the yield
        changed = []
        for p, original_value in original_values:
            current_value = current_values[p.name]
            if id(original_value) != id(current_value):
                # same object -> nothing to do; different -> track it
                changed.append((p, original_value, current_value))

        if self.save_on_change:
            try:
                for p, original_value, current_value in changed:
                    # TODO: implement Atomic commit
                    if p.is_output():
                        self.auto_save_param(p, original_value, current_value)
            finally:
                # restore originals regardless of save success
                for p, original_value, current_value in changed:
                    setattr(task, p.name, original_value)
                dirty = False
        elif self.normalize_on_change:
            for p, original_value, current_value in changed:
                try:
                    # probably we are in the band
                    # we are going just to normalize the value
                    if p.is_output():
                        from dbnd._core.utils.task_utils import to_targets

                        normalized_value = to_targets(current_value)
                    else:
                        normalized_value = p.normalize(current_value)
                    if id(normalized_value) != id(current_value):
                        setattr(task, p.name, normalized_value)
                except Exception as ex:
                    raise friendly_error.task_build.failed_to_assign_param_value_at_band(
                        ex, p, current_value, task
                    )
    finally:
        if dirty:
            # an error occurred before values were committed -> roll back
            for p, original_value in original_values:
                setattr(task, p.name, original_value)
def data_combine(inputs, sort=False):
    """Combine *inputs* into a single MultiTarget.

    :param inputs: any (possibly nested) structure convertible to targets.
    :param sort: when True, order the flattened targets by their path.
    :return: a MultiTarget wrapping all discovered targets.
    """
    flat_targets = flatten(to_targets(inputs))
    if sort:
        flat_targets = sorted(flat_targets, key=lambda t: t.path)
    return MultiTarget(flat_targets)