示例#1
0
    def _build_parameter(self, context="inline"):
        """
        Build the final parameter definition from the current builder state.

        Everything that was not set explicitly on the definition is filled in here:
        the value type, the validator (``choices`` win over an explicit validator),
        an empty default, ``load_on_build``, the value meta configuration and the
        description. Every call also bumps ``ParameterDefinition._total_counter`` and
        uses the new counter value as the ``parameter_id``.

        :param context: passed through to ``_build_value_type``
        :return: the ``parameter`` attribute of the object returned by ``self.modify``
        """
        s = self.parameter  # type: ParameterDefinition
        update_kwargs = {}

        value_type = self._build_value_type(context)

        validator = s.validator
        if s.choices:
            validator = ChoiceValidator(s.choices)

        if is_not_defined(s.default):
            if s.empty_default:
                update_kwargs["default"] = value_type._generate_empty_default()

        if not is_defined(s.load_on_build):
            update_kwargs["load_on_build"] = value_type.load_on_build

        # create value meta
        if s.value_meta_conf is None:
            update_kwargs["value_meta_conf"] = ValueMetaConf(
                log_preview=s.log_preview,
                log_preview_size=s.log_preview_size,
                log_schema=s.log_schema,
                log_size=s.log_size,
                log_stats=s.log_stats,
                log_histograms=s.log_histograms,
            )

        # Fall back to one of the default descriptions when the user didn't provide one.
        # `s.description or ""` already collapses falsy values into an empty string,
        # so an emptiness check is required here; the sentinel check is kept only for
        # the case where the "not defined" marker itself is truthy.
        description = s.description or ""
        if not description or is_not_defined(description):
            if s.is_output() and s.default_output_description:
                description = s.default_output_description
            elif not s.load_on_build and s.default_input_description:
                description = s.default_input_description
            else:
                description = s.default_description

            # NOTE(review): this checks the user validator only, so a validator created
            # from `choices` alone does not contribute to the description - confirm.
            if s.validator:
                description = _add_description(description, validator.description)
            # `description` is passed to `modify` explicitly below; it must not be
            # added to `update_kwargs` as well (duplicate keyword argument).

        # We need to keep track of this to get the order right (see Task class)
        ParameterDefinition._total_counter += 1

        # task outputs are never significant
        if s.kind == _ParameterKind.task_output:
            update_kwargs["significant"] = False

        updated = self.modify(
            value_type=value_type,
            value_type_defined=value_type,
            validator=validator,
            description=description,
            parameter_id=ParameterDefinition._total_counter,
            **update_kwargs
        )

        return updated.parameter
示例#2
0
    def partition(self, name=NOTHING, extension=NOTHING, config=NOTHING, **kwargs):
        """
        Create a target for a single partition located under this target's path.

        :param name: file name of the partition. if not provided - "part-%04d" % ID,
                     where ID is an internal counter that is advanced on every
                     auto-generated name
        :param extension: suffix appended to the name. if not provided -> the one
                          reported by ``config.get_ext()`` is used
        :param config: target config. if not provided -> ``self.config.as_file()``
        :param kwargs: forwarded to ``target``
        :return: result of ``target(...)`` for the partition
        """
        if is_not_defined(name):
            # auto-generated names are numbered sequentially
            partition_id = self._auto_partition_count
            name = "part-%04d" % partition_id
            self._auto_partition_count = partition_id + 1

        if is_not_defined(config):
            # only if it's a call not from file,folder - we set it as file
            config = self.config.as_file()

        if is_not_defined(extension):
            extension = config.get_ext()

        partition_name = name + extension if extension else name
        return target(self.path, partition_name, config=config, fs=self._fs, **kwargs)
示例#3
0
 def file(self, name=NOTHING, extension=NOTHING, config=NOTHING, **kwargs):
     """
     Create a target for a single file located under this target.

     Thin wrapper around ``partition``: the config is always converted
     with ``as_file()`` before it is passed on.

     :param name: file name, forwarded to ``partition`` as is
     :param extension: extension, forwarded to ``partition`` as is
     :param config: target config. if not provided -> ``self.config`` is used
     :param kwargs: forwarded to ``partition``
     :return: whatever ``partition`` returns for this file
     """
     if is_not_defined(config):
         config = self.config
     return self.partition(
         name=name, extension=extension, config=config.as_file(), **kwargs
     )
示例#4
0
    def initialize_required(self):
        """
        Collect everything this task requires, grouped into "user" and "system" sections.

        Input parameter values are traversed with ``_find_target`` and stored in the
        section selected by ``_section``. The "system" section is extended (only with
        non-empty values) by:

        * "band" - user outputs of a ``PipelineTask``
        * "pipelines" - child pipelines, keyed by their task_id
        * "required" - the result of the task's ``_requires()``

        :return: the collected structure converted with ``to_targets``
        :raises: ``pipeline_task_has_unassigned_outputs`` friendly error when a
                 pipeline task has a user output without a value
        """
        user_section = {}
        system_section = {}
        required = {"user": user_section, "system": system_section}

        # we take all parameters that are inputs (not outputs)
        # however Primitive parameters are inputs only if they are Target (deferred)
        for param, param_value in self.params.get_param_values(input_only=True):
            if param_value is None:
                continue

            found = traverse(
                param_value,
                convert_f=_find_target,
                filter_none=True,
                filter_empty=True,
            )
            if found:
                required[_section(param)][param.name] = found

        from dbnd import PipelineTask

        if isinstance(self.task, PipelineTask):
            band_outputs = {}
            user_outputs = self.params.get_param_values(
                output_only=True, user_only=True
            )
            for param, param_value in user_outputs:
                is_internal = param.name == "task_band" or isinstance(
                    param, FuncResultParameter
                )
                if is_internal:
                    continue

                if is_not_defined(param_value):
                    raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                        task=self.task, param=param
                    )
                band_outputs[param.name] = param_value

            if band_outputs:
                system_section["band"] = band_outputs

        # every child pipeline becomes an upstream of the task
        child_pipelines = {
            pipeline.task_id: pipeline for pipeline in self._get_all_child_pipelines()
        }
        if child_pipelines:
            system_section["pipelines"] = child_pipelines

        # the user may still use the _requires function - add that to dependencies
        custom_required = self.task._requires()
        if custom_required:
            system_section["required"] = custom_required

        return to_targets(required)
示例#5
0
    def initialize_outputs(self):
        """
        Build the outputs of the task and store them in ``self.task_outputs``.

        Every output parameter without a value gets one from ``p.build_output`` and
        the value is assigned back to the task. All outputs, except for
        ``FuncResultParameter`` ones, are bound to their target source and grouped
        into "user" and "system" sections.

        If the task returns something from ``_output()``, it replaces the whole "user"
        section; a warning is emitted when that drops declared user outputs.

        Use outputs! Override only if you are writing "base" class
        """
        task = self.task

        outputs = {"user": {}, "system": {}}

        for p, value in self.params.get_params_with_value(ParameterFilters.OUTPUTS):
            if is_not_defined(value):
                value = p.build_output(task=task)
                setattr(self.task, p.name, value)

            if isinstance(p, FuncResultParameter):
                continue

            value = traverse_and_set_target(value, p._target_source(self.task))
            outputs[_section(p)][p.name] = value

        custom_outputs = self.task._output()
        if custom_outputs:
            if outputs["user"]:
                warnings.warn(
                    "Task %s has custom outputs in _output() function, all other outputs will be removed: %s"
                    % (task, outputs["user"]),
                    stacklevel=2,
                )
            # custom outputs always win, the warning above is only relevant
            # when there are declared outputs that are being dropped
            outputs["user"] = custom_outputs

        # take ownership of all outputs and clean it, just in case
        # usually all outputs are assigned to task

        # just in case we have some "outputs" with Tasks
        outputs = to_targets(outputs)
        self.task_outputs = traverse_and_set_target(
            outputs, target_source=TargetSource(task_id=self.task_id)
        )
示例#6
0
    def initialize_band(self):
        """
        Call ``band()`` of the task and normalize the outputs it has changed.

        Steps:

        1. When airflow is enabled, ``band()`` is called inside the context
           returned by ``get_databand_op_catcher_dag()``.
        2. The current values of the output parameters are remembered
           ("task_band" and ``FuncResultParameter`` parameters are skipped).
        3. ``band()`` is called; its result is stored both on the task
           (``_task_band_result``) and on this object (``task_band_result``).
        4. For a ``PipelineTask`` every remembered non-system output must be
           defined after the call.
        5. Every remembered output whose value object was replaced is converted
           with ``to_targets`` and written back to the parameter value.

        Any exception is logged as a warning banner. It is re-raised as is when
        ``self.task.task_decorator`` is truthy; otherwise it is wrapped with the
        ``failed_to_call_band`` friendly error.
        """
        try:
            # context managers that have to be active while .band() is running
            band_context = []
            if is_airflow_enabled():
                from dbnd_airflow.dbnd_task_executor.airflow_operators_catcher import (
                    get_databand_op_catcher_dag, )

                band_context.append(get_databand_op_catcher_dag())

            # snapshot of (param_value, value before the band call) pairs
            original_param_values = []
            for param_value in self.task.task_params.get_param_values(
                    ParameterFilters.OUTPUTS):
                if param_value.name == "task_band" or isinstance(
                        param_value.parameter, FuncResultParameter):
                    continue
                original_param_values.append((param_value, param_value.value))

            with nested(*band_context):
                band = self.task.band()
                # this one would be normalized
                self.task._task_band_result = band
            self.task_band_result = band  # real value

            from dbnd import PipelineTask

            if isinstance(self.task, PipelineTask):
                # after .band has finished, all user outputs of the .band should be defined
                for param_value, _ in original_param_values:
                    # we want to validate only user facing parameters
                    # they should have assigned values by this moment,
                    # pipeline task can not have None outputs, after band call
                    if param_value.parameter.system:
                        continue
                    if is_not_defined(param_value.value):
                        raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                            task=self.task, param=param_value.parameter)

            # now let's normalize if user has changed outputs
            for param_value, original_value in original_param_values:
                # identity check: only values replaced with another object are processed
                if param_value.value is original_value:
                    continue

                try:
                    from dbnd._core.utils.task_utils import to_targets

                    normalized_value = to_targets(param_value.value)
                    param_value.update_param_value(normalized_value)
                except Exception as ex:
                    raise friendly_error.task_build.failed_to_assign_param_value_at_band(
                        ex, param_value.parameter, param_value.value,
                        self.task)

        except Exception as ex:
            # covers every failure above, including the friendly errors raised in this method
            logger.warning(
                self.visualiser.banner(
                    msg="Failed to run %s" % _band_call_str(self.task),
                    color="red",
                    exc_info=sys.exc_info(),
                ))

            if self.task.task_decorator:
                # just re-raise, we already have an error from the "run" function
                raise

            raise friendly_error.task_build.failed_to_call_band(ex, self.task)
示例#7
0
    def __init__(
        self,
        task_name,
        task_definition,
        task_params,
        task_signature_obj=None,
        task_version=None,
    ):
        """
        Create a tracking task and register it in the task instance cache.

        :param task_name: passed to the base class constructor
        :param task_definition: passed to the base class constructor and kept as
                                ``self.task_definition``
        :param task_params: passed to the base class constructor; outputs without a
                            value are updated in place with a tracking output
        :param task_signature_obj: signature of the task. if not provided -> a unique
                                   tracking signature is generated
        :param task_version: version of the task. if not provided -> inherited from
                             the current (parent) task, or the current utc timestamp
                             when there is no parent
        """
        task_signature_obj = (
            task_signature_obj or _generate_unique_tracking_signature()
        )

        super(TrackingTask, self).__init__(
            task_name=task_name,
            task_definition=task_definition,
            task_signature_obj=task_signature_obj,
            task_params=task_params,
        )

        self.task_definition = task_definition  # type: TaskDefinition
        # we don't have signature for outputs
        self.task_outputs_signature_obj = self.task_signature_obj
        self.ctrl = TrackingTaskCtrl(self)

        self.task_call_source = [
            self.dbnd_context.user_code_detector.find_user_side_frame(1)
        ]
        parent_task = try_get_current_task()
        if parent_task:
            parent_task.descendants.add_child(self.task_id)
            self.task_call_source.extend(parent_task.task_call_source)

            # inherit from parent if it has it
            self.task_version = task_version or parent_task.task_version
            self.task_target_date = parent_task.task_target_date
            self.task_env = parent_task.task_env
            # pass-through parent children scope params
            # task_children_scope_params will be used in case of any Task inside TrackedTask
            # for example tracked task creates Config objects
            self.task_children_scope_params = parent_task.task_children_scope_params
        else:
            # we need better definition of "what we use for tracking"
            self.task_version = task_version or utcnow().strftime("%Y%m%d_%H%M%S")
            self.task_target_date = utcnow().date()
            self.task_env = get_databand_context().env
            self.task_children_scope_params = {}

        self.task_outputs = dict()
        for parameter, value in self._params.get_params_with_value(
            ParameterFilters.OUTPUTS
        ):
            if is_not_defined(value):
                # rebind `value`, so the target that was just built (and not the
                # "not defined" marker) is the one that ends up in task_outputs
                value = self.build_tracking_output(parameter)
                task_params.update_param_value(parameter.name, value)

            if isinstance(parameter, FuncResultParameter):
                continue

            # This is used to keep backward compatibility for tracking luigi behaviour
            # This is not something we want to keep, at least not in this form
            value = traverse_and_set_target(value, parameter._target_source(self))
            self.task_outputs[parameter.name] = value

        self.ctrl._initialize_task()

        # so we can be found via task_id
        self.dbnd_context.task_instance_cache.register_task_instance(self)
0
    def _get_result_parameter(self):
        """
        Build the result parameter of the decorated function.

        Two sources are combined: the ``result`` value given to the decorator
        (``@task(result=...)``) and the return spec guessed from the function
        (``-> ...``).

        * ``result=None`` or an explicit ``-> None`` without ``result`` -> ``{}``
          is returned, there is no result value
        * neither ``result`` nor a return spec is defined -> the default result
          of the decorator spec is used
        * a list of results -> one parameter part per item, combined with
          ``_build_multiple_outputs_result``
        * otherwise -> a single result parameter part

        :return: ``{}`` when there is no result, otherwise the output of
                 ``build_parameter``
        :raises: ``dict_in_result_definition`` friendly error when ``result`` is a dict
        """
        context = "{}.{}".format(self.decorator_spec.name, RESULT_PARAM)
        return_spec = guess_func_return_type(self.decorator_spec)

        if RESULT_PARAM in self.decorator_kwargs:
            # @task(result=...)
            result_spec = self.decorator_kwargs[RESULT_PARAM]
            if isinstance(result_spec, dict):
                raise friendly_error.task_parameters.dict_in_result_definition(
                    result_spec
                )

            if result_spec is None:
                # @task(result=None): user explicitly doesn't want a result value
                return {}

            if isinstance(result_spec, six.string_types):
                # result = "output1,output2", both space and comma are supported
                names = result_spec.replace(",", " ").split()
                result_spec = names[0] if len(names) == 1 else names
            elif isinstance(result_spec, tuple):
                result_spec = list(result_spec)

            if is_not_defined(return_spec):
                # user didn't specify "-> ..." - so we don't have any "hints"
                return_spec = None
            elif return_spec is not None:
                # type hints from "-> ..." are used only on exact match to our params
                return_spec = self._validate_return_spec(result_spec, return_spec)
        else:
            # there is no @task(result=)
            if return_spec is None:
                # .. -> None: user explicitly doesn't want a result value
                return {}

            if is_not_defined(return_spec):
                # nothing is known - default parameter ( pickle in @task)
                return build_parameter(self.decorator_spec.default_result, context)

            if isinstance(return_spec, list):
                # names are taken from "-> ..."
                result_spec = [entry[0] for entry in return_spec]
            else:
                # just the default name
                result_spec = RESULT_PARAM

        if not isinstance(result_spec, list):
            # only one result
            single_result = self._get_result_parameter_part(
                p=result_spec, name_hint=RESULT_PARAM, value_type_hint=return_spec
            )
            return build_parameter(single_result, context)

        # list of results
        parts = []
        for index, part_spec in enumerate(result_spec):
            type_hint = None
            if return_spec:
                _, type_hint = return_spec[index]

            parts.append(
                self._get_result_parameter_part(
                    p=part_spec,
                    name_hint="result_%s" % index,
                    value_type_hint=type_hint,
                )
            )
        multiple_result = self._build_multiple_outputs_result(parts)
        return build_parameter(multiple_result, context)