예제 #1
class RunConfig(config.Config):
    """Databand's per run settings (e.g. execution date)"""

    _conf__task_family = "run"

    # on none generated at DatabandRun
    name = parameter.value(default=None, description="Specify run name")[str]

    description = parameter.value(default=None, description="Specify run description")[

    parallel = parameter(default=None)[bool]
    task_executor_type = parameter(
        description="Alternate executor type: "
        " local/airflow_inprocess/airflow_multiprocess_local/airflow_kubernetes,"
        "  see docs for more options",

    submit_driver = parameter(
        description="override env.submit_driver for specific environment"
    submit_tasks = parameter(
        description="override env.submit_tasks for specific environment"

    enable_airflow_kubernetes = parameter(
        description="Enable use of kubernetes executor for kubebernetes engine submission",

    execution_date = parameter(default=None, description="Override execution date")[

    # Execution specific
    id = parameter(default=None, description="The list of task ids to run")[List[str]]
    selected_tasks_regex = parameter(
        default=None, description="Run only specified tasks (regular expresion)"

    ignore_dependencies = parameter(
        description="The regex to filter specific task_ids"
    ignore_first_depends_on_past = parameter(
        description="The regex to filter specific task_ids"

    pool = parameter(default=None, description="Resource pool to use")[str]

    donot_pickle = parameter(
        description="Do not attempt to pickle the DAG object to send over "
        "to the workers, just tell the workers to run their version "
        "of the code."

    mark_success = parameter(
        description="Mark jobs as succeeded without running them"
    skip_completed = parameter(
        description="Mark jobs as succeeded without running them"
    fail_fast = parameter(
        description="Skip all remaining tasks if a task has failed"
    enable_prod = parameter(description="Enable production tasks").value(False)
    is_archived = parameter(description="Save this run in the archive").value(False)

    heartbeat_interval_s = parameter(
        description="How often a run should send a heartbeat to the server. Set -1 to disable"
    heartbeat_timeout_s = parameter(
        description="How old can a run's last heartbeat be before we consider it failed. Set -1 to disable"
    heartbeat_sender_log_to_file = parameter(
        description="create a separate log file for the heartbeat sender and don't log the run process stdout"
    open_web_tracker_in_browser = parameter(
        description="If True, open web tracker in browser during task run."

    enable_concurent_sqlite = parameter(
        description="Enable concurrent execution with sqlite db (use only for debug!)"

    interactive = parameter(
        description="When submitting driver to remote execution keep tracking of submitted process and wait for completion",

    skip_completed_on_run = parameter(default=True).help(
        "Should dbnd task check that task is completed and mark it as resued on task execution"

    validate_task_inputs = parameter(default=True).help(
        "Should dbnd task check that all input files exist"

    validate_task_outputs = parameter(default=True).help(
        "Should dbnd task check that all outputs exist after task has been executed"

    validate_task_outputs_on_build = parameter(default=False).help(
        "Should dbnd task check that there are no incomplete outputs before task executes"

    tracking_with_cache = parameter(default=False).help(
        "Should dbnd cache results during tracking"

    pipeline_band_only_check = parameter(default=False).help(
        "When checking if pipeline is completed, check only if the band file exist (skip the tasks)"

    task_complete_parallelism_level = parameter(default=1).help(
        "Number of threads to use when checking if tasks are already complete"

    dry = parameter(default=False).help(
        "Do not execute tasks, stop before sending them to the execution, and print their status"
예제 #2
파일: run.py 프로젝트: databand-ai/dbnd
class RunConfig(config.Config):
    """Databand's per run settings (e.g. execution date)"""

    _conf__task_family = "run"

    # on none generated at DatabandRun
    name = parameter.value(default=None, description="Specify run name")[str]

    description = parameter.value(default=None, description="Specify run description")[

    # Executor configuration
    parallel = parameter(default=None)[bool]
    task_executor_type = parameter(
        description="Alternate executor type: "
        " local/airflow_inprocess/airflow_multiprocess_local/airflow_kubernetes,"
        "  see docs for more options",

    enable_airflow_kubernetes = parameter(
        description="Enable use of kubernetes executor for kubebernetes engine submission",

    # Local/Remote control
    interactive = parameter(
        description="When submitting driver to remote execution keep tracking of submitted process and wait for completion",
    submit_driver = parameter(
        description="override env.submit_driver for specific environment"
    submit_tasks = parameter(
        description="override env.submit_tasks for specific environment"

    # What to do on run
    open_web_tracker_in_browser = parameter(
        description="If True, open web tracker in browser during task run."

    is_archived = parameter(description="Save this run in the archive").value(False)

    dry = parameter(default=False).help(
        "Do not execute tasks, stop before sending them to the execution, and print their status"

    run_result_json_path = parameter(default=None).help(
        "The path to save the task band of the run"

    debug_pydevd_pycharm_port = parameter(default=None).help(
        "Enable debugging with `pydevd_pycharm` by setting this to the port value expecting the debugger to connect.\n"
        "This will start a new `settrace` connecting to `localhost` on the requested port, "
        "right before starting the driver task_run."

    execution_date = parameter(default=None, description="Override execution date")[
    mark_success = parameter(
        description="Mark jobs as succeeded without running them"

    # Task Selectors (to schedule specific task from pipeline)
    id = parameter(default=None, description="The list of task ids to run")[List[str]]
    selected_tasks_regex = parameter(
        default=None, description="Run only specified tasks (regular expresion)"

    ignore_dependencies = parameter(
        description="The regex to filter specific task_ids"
    ignore_first_depends_on_past = parameter(
        description="The regex to filter specific task_ids"

    # Scheduler configuration

    skip_completed = parameter(
        description="Mark jobs as succeeded without running them"
    fail_fast = parameter(
        description="Skip all remaining tasks if a task has failed"
    enable_prod = parameter(description="Enable production tasks").value(False)

    skip_completed_on_run = parameter(default=True).help(
        "Should dbnd task check that task is completed and mark it as re-used on task execution"

    validate_task_inputs = parameter(default=True).help(
        "Should dbnd task check that all input files exist"

    validate_task_outputs = parameter(default=True).help(
        "Should dbnd task check that all outputs exist after task has been executed"

    validate_task_outputs_on_build = parameter(default=False).help(
        "Should dbnd task check that there are no incomplete outputs before task executes"

    pipeline_band_only_check = parameter(default=False).help(
        "When checking if pipeline is completed, check only if the band file exist (skip the tasks)"

    recheck_circle_dependencies = parameter(
        description="Re check circle dependencies on every task creation,"
        " use it if you need to find of circle in your graph "

    task_complete_parallelism_level = parameter(default=1).help(
        "Number of threads to use when checking if tasks are already complete"

    pool = parameter(default=None, description="Resource pool to use")[str]

    # Advanced Run settings (debug/workarounds)
    # run .pickle file
    always_save_pipeline = parameter(
        description="Boolean for always saving pipeline to pickle"
    disable_save_pipeline = parameter(
        description="Boolean for disabling pipeline pickling"
    donot_pickle = parameter(
        description="Do not attempt to pickle the DAG object to send over "
        "to the workers, just tell the workers to run their version "
        "of the code."
    pickle_handler = parameter(
        description="Defines a python pickle handler to be used to pickle the "
        "run's data",
    enable_concurent_sqlite = parameter(
        description="Enable concurrent execution with sqlite db (use only for debug!)"

    # HEARTBEAT (process that updates on driver status every `heartbeat_interval_s`
    heartbeat_interval_s = parameter(
        description="How often a run should send a heartbeat to the server. Set -1 to disable"
    heartbeat_timeout_s = parameter(
        description="How old can a run's last heartbeat be before we consider it failed. Set -1 to disable"
    heartbeat_sender_log_to_file = parameter(
        description="create a separate log file for the heartbeat sender and don't log the run process stdout"

    hearbeat_disable_plugins = parameter(
        default=False, description="disable dbnd plugins at heartbeat sub-process"
    # Task/Pipeline in task Execution
    task_run_at_execution_time_enabled = parameter(
        default=True, description="Allow tasks calls during another task execution"
    task_run_at_execution_time_in_memory_outputs = parameter(
        description="Store outputs for inline task at execution time in memory (do not use FileSystem)",
    target_cache_on_access = parameter(
        default=True, description="Cache targets values in memory during execution"
예제 #3
파일: task.py 프로젝트: lbtanh/dbnd
class Task(_BaseTask, _TaskParamContainer):
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    The key methods of a Task, which must be implemented in a subclass are:

    * :py:meth:`run` - the computation done by this task.

    Each :py:class:`~dbnd.parameter` of the Task should be declared as members:

    .. code:: python

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]

    In addition to any declared properties and methods, there are a few
    non-declared properties, which are created by the :py:class:`TaskMetaclass`

        This value can be overridden to set the namespace that will be used.
        (See :ref:`Task.namespaces_famlies_and_ids`)
        If it's not specified and you try to read this value anyway, it will return
        garbage. Please use :py:meth:`get_task_namespace` to read the namespace.

        Note that setting this value with ``@property`` will not work, because this
        is a class level value.

    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        description="Location of all internal outputs (e.g. metrics)",
    task_band = output.json(output_name="band")

    task_enabled = parameter.system(scope=ParameterScope.children)[bool]
    task_enabled_in_prod = parameter.system(

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        description="task version, directly affects task signature ",

    task_class_version = parameter.value(
        description="task code version, "
        "use while you want persistent change in your task version",

    task_env = parameter.value(description="task environment name",

    task_target_date = parameter(description="task data target date",

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs")[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = parameter.system(
        description="Store all task outputs in memory")[bool]
    task_is_dynamic = parameter.system(
        description="task was executed from within another task",

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        description="indicates if task can run dynamic databand tasks")[bool]

    task_retries = parameter.system(
        "Total number of attempts to run the task. So task_retries=3 -> task can fail 3 times before we give up"

    task_retry_delay = parameter.system(
        description="timedelta to wait before retrying a task. Example: 5s")[

    _dbnd_call_state = None  # type: TaskCallState

    def __init__(self, **kwargs):
        """Initialize the task and attach its controller.

        :param kwargs: keyword arguments forwarded unchanged to the parent
            initializer.
        """
        super(Task, self).__init__(**kwargs)
        # each instance gets its own TaskCtrl built around this task
        self.ctrl = TaskCtrl(self)

    def band(self):
        Please, do not override this function only in Pipeline/External tasks! we do all wiring work in Meta classes only
        Our implementation should never be coupled to code!

    def run(self):
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        pass  # default impl

    def task_outputs(self):
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        See :ref:`Task.task_outputs`
        return self.ctrl.relations.task_outputs_user

    def task_dag(self):
        # type: (...)->_TaskDagNode
        """Return the ``task_dag`` object held by this task's controller (``self.ctrl``)."""
        # NOTE(review): sibling accessors in this class are read as attributes;
        # this one may also be meant to be a ``@property`` - confirm.
        return self.ctrl.task_dag

    def _complete(self):
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        # we check only user side task outputs
        # all system tasks outputs are not important (if the exists or not)
        # user don't see them
        outputs = flatten(self.task_outputs)
        if len(outputs) == 0:
                "Task %r without outputs has no custom complete() method" %
            return False

        return all((o.exists() for o in outputs))

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        """Return the task run registered for this task in the current run.

        Exposed as a property because this class reads it as an attribute
        (``self.current_task_run.tracker``); on a plain method that expression
        raises ``AttributeError`` since a bound method has no ``tracker``.

        :return: the result of ``get_databand_run().get_task_run(self.task_id)``.
        """
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        The default output that this Task produces. Use outputs! Override only if you are writing "base" class
        return NOTHING

    def _requires(self):
        Override in "template" tasks which themselves are supposed to be

        Must return an iterable which among others contains the _requires() of
        the superclass.
        See :ref:`Task.requires`

    def _task_submit(self):
        Task submission logic, by default we just call -> _task_run() -> run()
        return self._task_run()

    def _task_run(self):
        # bring all relevant files
        with self._auto_load_save_params(auto_read=self._conf_auto_read_params,
            result = self.run()

        # publish all relevant files
        return result

    def set_upstream(self, task_or_task_list):

    def set_downstream(self, task_or_task_list):

    def __lshift__(self, other):
        """Support ``task << other`` by delegating to ``self.set_upstream(other)``.

        :param other: operand handed to ``set_upstream`` unchanged.
        :return: whatever ``set_upstream`` returns.
        """
        return self.set_upstream(other)

    def __rshift__(self, other):
        """Support ``task >> other`` by delegating to ``self.set_downstream(other)``.

        :param other: operand handed to ``set_downstream`` unchanged.
        :return: whatever ``set_downstream`` returns.
        """
        return self.set_downstream(other)

    def set_global_upstream(self, task_or_task_list):

    @property
    def metrics(self):
        """Return the tracker of the current task run.

        Exposed as a property: the logging helpers of this class use it as an
        attribute (``self.metrics.log_metric(...)``,
        ``self.metrics.log_artifact(...)``), which cannot work on a plain
        bound method.

        :return: ``self.current_task_run.tracker``.
        """
        # backward compatible code
        return self.current_task_run.tracker

    def log_dataframe(
        meta_conf = ValueMetaConf(
        self.metrics.log_dataframe(key, df, meta_conf=meta_conf)

    def log_metric(self, key, value, source=None):
        Logs the passed-in parameter under the current run, creating a run if necessary.
        :param key: Parameter name (string)
        :param value: Parameter value (string)
        return self.metrics.log_metric(key, value, source=source)

    def log_system_metric(self, key, value):
        """Shortcut for ``log_metric(key, value, source="system")``.

        :param key: metric name, passed through to ``log_metric``.
        :param value: metric value, passed through to ``log_metric``.
        :return: whatever ``log_metric`` returns.
        """
        return self.log_metric(key, value, source="system")

    def log_artifact(self, name, artifact):
        """Log a local file or directory as an artifact of the currently active run.

        :param name: passed through to ``self.metrics.log_artifact``.
        :param artifact: passed through to ``self.metrics.log_artifact``.
        :return: whatever ``self.metrics.log_artifact`` returns.
        """
        # relies on ``self.metrics`` being readable as an attribute
        return self.metrics.log_artifact(name, artifact)

    def get_template_vars(self):
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_meta.task_family,
            "task_name": self.task_meta.task_name,
            "task_signature": self.task_meta.task_signature,
            "task_id": self.task_meta.task_id,
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        Override this method to cleanup subprocesses when a task instance
        gets killed. Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.

    def _get_task_output_path_format(self, output_mode):
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self._conf__base_output_path_fmt or self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(

        return target(path, config=config)

    def get_root(self):
        """Return the ``root`` attribute of this task's environment (``self.task_env``)."""
        return self.task_env.root

    def _initialize(self):
        """Run the parent class's ``_initialize`` step; nothing is added here."""
        super(Task, self)._initialize()

    def _should_run(self):
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    def dbnd_run(self):
        # type: (...)-> DatabandRun
        Run task via Databand execution system
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        result = ctx.dbnd_run_task(self)
        return result
예제 #4
파일: task.py 프로젝트: databand-ai/dbnd
class Task(_TaskWithParams, _TaskCtrlMixin, _TaskParamContainer):
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    A ``run`` method must be present in a subclass

    Each ``parameter`` of the Task should be declared as members::

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]

    _conf_confirm_on_kill_msg = None  # get user confirmation on task kill if not empty
    _conf__require_run_dump_file = False

    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        description="Location of all internal outputs (e.g. metrics)",
    task_band = output.json(output_name="band", system=True)

    task_enabled = system_passthrough_param(default=True)[bool]
    task_enabled_in_prod = system_passthrough_param(default=True)[bool]
    validate_no_extra_params = ParamValidation.error

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        description="task version, directly affects task signature ",

    task_class_version = parameter.value(
        description="task code version, "
        "use while you want persistent change in your task version",

    task_env = parameter.value(
        description="task environment name",

    task_target_date = parameter(
        description="task data target date",

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs"
    )[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = system_passthrough_param(
        default=False, description="Store all task outputs in memory"

    task_output_path_format = system_passthrough_param(
        default=None, description="Format string used to generate task output paths"

    task_is_dynamic = system_passthrough_param(
        description="task was executed from within another task",

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True, description="indicates if task can run dynamic databand tasks"

    task_retries = parameter.system(
        description="Total number of attempts to run the task. So task_retries=3 -> task can fail 3 times before we give up",

    task_retry_delay = parameter.system(
        description="timedelta to wait before retrying a task. Example: 5s",

    task_essence = TaskEssence.ORCHESTRATION

    def __init__(self, **kwargs):
        """Initialize the task, reset its call state and attach its controller.

        :param kwargs: keyword arguments forwarded unchanged to the parent
            initializer.
        """
        super(Task, self).__init__(**kwargs)

        # used to communicate return value of "user function"
        self._dbnd_call_state = None  # type: Optional[TaskCallState]
        # each instance gets its own TaskCtrl built around this task
        self.ctrl = TaskCtrl(self)

    def band(self):
        Please, do not override this function only in Pipeline/External tasks!

        We do all wiring work in Meta classes only.
        Our implementation should never be coupled to code!

    def run(self):
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        pass  # default impl

    def task_outputs(self):
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        return self.ctrl.relations.task_outputs_user

    def task_dag(self):
        # type: (...)->_TaskDagNode
        """Return the ``task_dag`` object held by this task's controller (``self.ctrl``)."""
        # NOTE(review): sibling accessors in this class are read as attributes;
        # this one may also be meant to be a ``@property`` - confirm.
        return self.ctrl.task_dag

    def descendants(self):
        """Return the ``descendants`` object held by this task's controller (``self.ctrl``)."""
        # NOTE(review): may be meant to be a ``@property`` like the other
        # controller accessors - confirm.
        return self.ctrl.descendants

    def _complete(self):
        If the task has any outputs, return ``True`` if all outputs exist. Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        # we check only user side task outputs
        # all system tasks outputs are not important (if the exists or not)
        # user don't see them
        outputs = [
            o for o in flatten(self.task_outputs) if not o.config.overwrite_target
        if len(outputs) == 0:
            if not self.task_band:
                    "Task %r without outputs has no custom complete() and no task band!"
                    % self,
                return False
                return self.task_band.exists()

        incomplete_outputs = [str(o) for o in outputs if not o.exists()]

        num_of_incomplete_outputs = len(incomplete_outputs)

        if 0 < num_of_incomplete_outputs < len(outputs):
            complete_outputs = [str(o) for o in outputs if o.exists()]
            exc = incomplete_output_found_for_task(
                self.task_name, complete_outputs, incomplete_outputs

            if self.settings.run.validate_task_outputs_on_build:
                raise exc

        return num_of_incomplete_outputs == 0

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        """Return the task run registered for this task in the current run.

        Exposed as a property because ``tracker`` in this class reads it as an
        attribute (``self.current_task_run.tracker``); on a plain method that
        expression raises ``AttributeError`` since a bound method has no
        ``tracker``.

        :return: the result of ``get_databand_run().get_task_run(self.task_id)``.
        """
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        The default output that this Task produces.

        Use outputs! Override only if you are writing "base" class.
        return NOTHING

    def _requires(self):
        Override in "template" tasks which themselves are supposed to be subclassed.

        Must return an iterable which, among others, contains the _requires() of
        the superclass.

    def _task_submit(self):
        """Task submission logic, by default we just call -> ``_task_run()`` -> ``run()``.

        :return: whatever ``_task_run()`` returns.
        """
        return self._task_run()

    def _task_run(self):
        # bring all relevant files
        param_values = self.task_params.get_param_values()

        with auto_load_save_params(
            task=self, auto_read=self._conf_auto_read_params, param_values=param_values
            result = self.run()

        # publish all relevant files
        return result

    def tracker(self):
        """Return the ``tracker`` of this task's current task run."""
        # NOTE(review): ``metrics`` returns ``self.tracker`` without calling it,
        # so this is presumably meant to be a ``@property`` - confirm.
        return self.current_task_run.tracker

    def metrics(self):
        """Backward-compatible alias that returns ``self.tracker``."""
        # NOTE(review): ``self.tracker`` is not called here; if ``tracker`` is a
        # plain method this returns the bound method itself - confirm whether
        # both accessors are meant to be properties.
        # backward compatible code
        return self.tracker

    def get_template_vars(self):
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_family,
            "task_name": self.task_name,
            "task_signature": self.task_signature,
            "task_id": self.task_id,
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        Override this method to cleanup subprocesses when a task instance gets killed.

        Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.

    def _get_task_output_path_format(self, output_mode):
        Defines the format string used to generate all task outputs.

        For example:
        if self.task_output_path_format:
            # explicit input - first priority
            return self.task_output_path_format
        if self._conf__base_output_path_fmt:
            # from class definition
            return self._conf__base_output_path_fmt

        # default behaviour
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(

        return target(path, config=config)

    def get_root(self):
        """Return the ``root`` attribute of this task's environment (``self.task_env``)."""
        return self.task_env.root

    def _initialize(self):
        """Run the parent class's ``_initialize`` step; nothing is added here."""
        super(Task, self)._initialize()

    def _should_run(self):
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    def _save_param(self, parameter, original_value, current_value):
        # type: (ParameterDefinition, Any, Any) -> None
        # it's output! we are going to save it.
        # task run doesn't always exist
        task_run = try_get_current_task_run()
        access_status = DbndTargetOperationStatus.OK
            if isinstance(original_value, InMemoryTarget):
                parameter.value_type = get_value_type_of_obj(
                    current_value, parameter.value_type

            parameter.dump_to_target(original_value, current_value)
            # it's a workaround, we don't want to change parameter for outputs (dynamically)
            # however, we need proper value type to "dump" preview an other meta.
            # we will update it only for In memory targets only for now

        except Exception as ex:
            access_status = DbndTargetOperationStatus.NOK
            raise friendly_error.task_execution.failed_to_save_value_to_target(
                ex, self, parameter, original_value, current_value
            if task_run:
                except Exception as ex:
                    logger.warning("Failed to log target to tracking store. %s", ex)

    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """Run task via Databand execution system.

        Looks up the current databand context and hands this task to its
        ``dbnd_run_task`` method.

        :return: whatever ``dbnd_run_task(self)`` returns.
        """
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        return get_databand_context().dbnd_run_task(self)