Example #1
def wait(timeout):
    with new_dbnd_context(name="new_context") as dbnd_ctx:
        logger.info("Waiting {} seconds for tracker to become ready:".format(timeout))
        is_ready = wait_until(dbnd_ctx.tracking_store.is_ready, timeout)
        if not is_ready:
            logger.error("Tracker is not ready after {} seconds.".format(timeout))
            sys.exit(1)
        logger.info("Tracker is ready.")
Example #2
 def test_user_code_run(self):
     with new_dbnd_context(conf={
             "core": {
                 "user_init": "test_dbnd.test_task_context._user_code"
             }
     }):
         pass
     assert _user_code_run, "user code wasn't executed"
     logger.info("done")
Example #3
 def test_user_code_fail(self):
     with pytest.raises(Exception, match=r"USER_CODE_ERROR"):
         with new_dbnd_context(
                 conf={
                     "core": {
                         "user_init":
                         "test_dbnd.test_task_context._user_code_raises"
                     }
                 }):
             pass
Example #4
    def start(
        self,
        root_task_name,
        in_memory=True,
        run_uid=None,
        airflow_context=False,
        job_name=None,
    ):
        if try_get_databand_context():
            return

        if not airflow_context and not self._atexit_registered:
            atexit.register(self.stop)
            if is_airflow_enabled():
                from airflow.settings import dispose_orm

                atexit.unregister(dispose_orm)
        c = {
            "run": {
                "skip_completed": False
            },  # we don't want to "check", as the script uses task_version="now"
            "task": {
                "task_in_memory_outputs": in_memory
            },  # do not save any outputs
        }
        config.set_values(config_values=c, override=True, source="dbnd_start")
        context_kwargs = {"name": "airflow"} if airflow_context else {}
        # create databand context
        dc = self._enter_cm(
            new_dbnd_context(**context_kwargs))  # type: DatabandContext

        root_task = _build_inline_root_task(root_task_name,
                                            airflow_context=airflow_context)
        # create databand run
        dr = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                run_uid=run_uid,
                existing_run=False,
                job_name=job_name,
            ))  # type: DatabandRun

        if run_uid:
            root_task_run_uid = get_task_run_uid(run_uid, root_task_name)
        else:
            root_task_run_uid = None
        dr._init_without_run(root_task_run_uid=root_task_run_uid)

        self._start_taskrun(dr.driver_task_run)
        self._start_taskrun(dr.root_task_run)
        return dr
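
A hedged usage sketch for the start() entry point above, assuming it lives on a tracking-manager object (the manager name is hypothetical):

# a sketch: start an in-memory run for an ad-hoc script
dr = manager.start(root_task_name="my_script", in_memory=True, job_name="my_job")
if dr is not None:  # start() returns None early when a databand context already exists
    print(dr.root_task_run)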
Example #5
    def set_context(self, ti):
        """
        Airflow's log handler use this method to setup the context when running a TaskInstance(=ti).
        We use this method to setup the dbnd context and communicate information to
        the `<airflow_operator>_execute` task, that we create in `execute_tracking.py`.
        """
        # we set up only when we are not in our own orchestration dag
        if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
            return

        if config.getboolean("mlflow_tracking", "databand_tracking"):
            self.airflow_logger.warning(
                "dbnd can't track mlflow and airflow together; please disable the dbnd config "
                "`databand_tracking` in section `mlflow_tracking`")
            return

        # we are not tracking SubDagOperator
        if ti.operator == SubDagOperator.__name__:
            return

        task_key = calc_task_run_attempt_key_from_af_ti(ti)
        env_attempt_uid = os.environ.get(task_key)

        # This key is already set, which means we are in a --raw run
        if env_attempt_uid:
            # no need for further actions inside a --raw run
            return

        # communicate the task_run_attempt_uid to inner processes;
        # it will be used for the task_run of the `<airflow_operator>_execute` task
        self.task_run_attempt_uid = get_uuid()
        self.task_env_key = task_key
        os.environ[self.task_env_key] = str(self.task_run_attempt_uid)

        # airflow calculation for the relevant log_file
        log_relative_path = self.log_file_name_factory(ti, ti.try_number)
        self.log_file = os.path.join(self.airflow_base_log_dir,
                                     log_relative_path)

        # make sure we are not polluting the airflow logs
        get_dbnd_project_config().quiet_mode = True

        # tracking msg
        self.airflow_logger.info(
            "Tracked by Databand {version}".format(version=dbnd.__version__))

        # context with disabled logs
        self.dbnd_context_manage = new_dbnd_context(
            conf={"log": {
                "disabled": True
            }})
        self.dbnd_context = self.dbnd_context_manage.__enter__()
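
Because set_context enters the context manager by hand (__enter__ with no with-block), the handler presumably needs a matching teardown elsewhere; a minimal sketch of such a counterpart (the close name is an assumption):

    def close(self):
        # a sketch: balance the manual __enter__ performed in set_context
        if getattr(self, "dbnd_context_manage", None) is not None:
            self.dbnd_context_manage.__exit__(None, None, None)
            self.dbnd_context = None
            self.dbnd_context_manage = None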
Example #6
    def start(self, root_task_name, job_name=None):
        if self._run or self._active or try_get_databand_run():
            return

        airflow_context = try_get_airflow_context()
        set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

        # 1. create proper DatabandContext so we can create other objects
        dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

        if airflow_context:
            root_task_or_task_name = AirflowOperatorRuntimeTask.build_from_airflow_context(
                airflow_context
            )
            source = UpdateSource.airflow_tracking
            job_name = "{}.{}".format(airflow_context.dag_id, airflow_context.task_id)
        else:
            root_task_or_task_name = _build_inline_root_task(root_task_name)
            source = UpdateSource.dbnd

        # create databand run
        # this will create a databand run with driver and root tasks.
        self._run = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task_or_task_name,
                job_name=job_name,
                existing_run=False,
                source=source,
                af_context=airflow_context,
            )
        )  # type: DatabandRun

        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        self._run._init_without_run()

        self._start_taskrun(self._run.driver_task_run)
        self._start_taskrun(self._run.root_task_run)
        self._task_run = self._run.root_task_run
        return self._task_run
Example #7
    def test_user_config_inject(self):
        with new_dbnd_context(
                conf={
                    "core": {
                        "user_init":
                        "test_dbnd.test_task_context.inject_some_params"
                    },
                    "MyConfig22": {
                        "config_id": "1"
                    },
                }) as c:
            c.dbnd_run_task(task_or_task_name="user_func")

        logger.info("done")
Example #8
    def start(self, root_task_name=None, airflow_context=None):
        if self._run or self._active or try_get_databand_run():
            return

        # we probably should use only the airflow context via parameter.
        # also, there are mocks that cover only get_dbnd_project_config().airflow_context
        airflow_context = airflow_context or get_dbnd_project_config().airflow_context()
        set_tracking_config_overide(use_dbnd_log=True, airflow_context=airflow_context)

        dc = self._enter_cm(
            new_dbnd_context(name="inplace_tracking")
        )  # type: DatabandContext

        if airflow_context:
            root_task, job_name, source = build_run_time_airflow_task(airflow_context)
        else:
            root_task = _build_inline_root_task(root_task_name)
            job_name = root_task.task_name
            source = UpdateSource.dbnd

        self._run = run = self._enter_cm(
            new_databand_run(
                context=dc,
                job_name=job_name,
                existing_run=False,
                source=source,
                af_context=airflow_context,
            )
        )  # type: DatabandRun
        self._run.root_task = root_task

        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        root_task_run = run._build_and_add_task_run(root_task)
        root_task_run.is_root = True

        # No need to track the state because we track in init_run
        run.root_task_run.set_task_run_state(TaskRunState.RUNNING, track=False)
        run.tracker.init_run()

        self._enter_cm(run.root_task_run.runner.task_run_execution_context())
        self._task_run = run.root_task_run

        return self._task_run
Example #9
def databand_test_context(
    request, tmpdir, databand_context_kwargs, databand_config
):  # type: (...) -> DatabandContext

    test_config = {
        "run": {
            "name": _run_name_for_test_request(request),
            "heartbeat_interval_s": -1,
        },
        "local": {"root": str(tmpdir.join("local_root"))},
    }
    with config(test_config, source="databand_test_context"), new_dbnd_context(
        **databand_context_kwargs
    ) as t:
        yield t
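
databand_test_context is written as a pytest fixture: it takes request and tmpdir, layers test-only config on top of the context kwargs, and yields the live context. A hypothetical test consuming it:

def test_something_with_dbnd(databand_test_context):
    # a sketch: the fixture yields a DatabandContext for the test's duration
    assert databand_test_context is not None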
Example #10
    def start(self, root_task_name, job_name=None):
        if self._run:
            return
        if self._started or self._disabled:  # started or failed
            return

        try:
            if try_get_databand_run():
                return

            self._started = True

            # 1. create proper DatabandContext so we can create other objects
            set_tracking_config_overide(use_dbnd_log=True)
            # create databand context
            dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

            root_task = _build_inline_root_task(root_task_name)

            # create databand run
            self._run = self._enter_cm(
                new_databand_run(
                    context=dc,
                    task_or_task_name=root_task,
                    existing_run=False,
                    job_name=job_name,
                ))  # type: DatabandRun

            self._run._init_without_run()

            if not self._atexit_registered:
                atexit.register(self.stop)
            sys.excepthook = self.stop_on_exception

            self._start_taskrun(self._run.driver_task_run)
            self._start_taskrun(self._run.root_task_run)
            self._task_run = self._run.root_task_run
            return self._task_run
        except Exception:
            _handle_inline_error("inline-start")
            self._disabled = True
            return
        finally:
            self._started = True
Example #11
    def start(self, root_task_name=None, airflow_context=None):
        if self._run or self._active or try_get_databand_run():
            return

        airflow_context = airflow_context or try_get_airflow_context()
        set_tracking_config_overide(use_dbnd_log=True,
                                    airflow_context=airflow_context)

        dc = self._enter_cm(new_dbnd_context())  # type: DatabandContext

        if airflow_context:
            root_task, job_name, source = build_run_time_airflow_task(
                airflow_context)
        else:
            root_task = _build_inline_root_task(root_task_name)
            job_name = None
            source = UpdateSource.dbnd

        self._run = self._enter_cm(
            new_databand_run(
                context=dc,
                task_or_task_name=root_task,
                job_name=job_name,
                existing_run=False,
                source=source,
                af_context=airflow_context,
                send_heartbeat=False,
            ))  # type: DatabandRun

        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        self._run._init_without_run()
        self._start_taskrun(self._run.driver_task_run)
        self._start_taskrun(self._run.root_task_run)
        self._task_run = self._run.root_task_run

        return self._task_run
Example #12
def _list_tasks(ctx, module, search, is_config):
    from dbnd import Config
    from dbnd._core.context.databand_context import new_dbnd_context
    from dbnd._core.parameter.parameter_definition import _ParameterKind

    formatter = ctx.make_formatter()

    load_user_modules(config, modules=module)

    with new_dbnd_context():
        tasks = get_task_registry().list_dbnd_task_classes()

    for task_cls in tasks:
        td = task_cls.task_definition
        full_task_family = td.full_task_family
        task_family = td.task_family

        if not (task_family.startswith(search) or full_task_family.startswith(search)):
            continue

        if issubclass(task_cls, Config) != is_config:
            continue

        dl = []
        for param_name, param_obj in td.task_param_defs.items():
            if param_obj.system or param_obj.kind == _ParameterKind.task_output:
                continue
            if not is_config and param_name in COMMON_PARAMS:
                continue
            param_help = _help(param_obj.description)
            dl.append((param_name, param_help))

        if dl:
            with formatter.section(
                "{task_family} ({full_task_family})".format(
                    full_task_family=full_task_family, task_family=task_family
                )
            ):
                formatter.write_dl(dl)

    click.echo(formatter.getvalue().rstrip("\n"))
Example #13
def _get_task_run_mock(tra_uid):
    """
    We need better implementation for this,
    currently in use only for spark
    """
    try:
        from dbnd._core.task_run.task_run_tracker import TaskRunTracker

        task_run = TaskRunMock(tra_uid)
        from dbnd import config
        from dbnd._core.settings import CoreConfig

        with config({CoreConfig.tracker_raise_on_error: False},
                    source="on_demand_tracking"):
            with new_dbnd_context(name="fast_dbnd_context",
                                  autoload_modules=False) as fast_dbnd_ctx:
                trt = TaskRunTracker(task_run, fast_dbnd_ctx.tracking_store)
                task_run.tracker = trt
                return task_run
    except Exception:
        logger.info("Failed during dbnd inplace tracking init.", exc_info=True)
        return None
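
A hedged usage sketch for the mock above; since _get_task_run_mock swallows init errors and returns None, callers presumably have to null-check (the uid value is a placeholder):

# a sketch: obtain a lightweight task_run whose tracker writes to the tracking store
task_run = _get_task_run_mock("some-task-run-attempt-uid")
if task_run is None:
    # dbnd in-place tracking failed to initialize; continue without tracking
    pass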
Example #14
def run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    verbose,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project_name,
    name,
    description,
    run_driver,
    alternative_task_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
):
    """
    Run a task or a DAG

    To see tasks, use `dbnd show-tasks` (tab completion is available).
    """

    from dbnd._core.context.databand_context import new_dbnd_context, DatabandContext
    from dbnd._core.utils.structures import combine_mappings
    from dbnd import config

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project-name
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=filter_dict_remove_false_values(
            dict(
                verbose=verbose > 0,
                describe=describe,
                env=env,
                conf_file=conf_file,
                project_name=project_name,
            )
        ),
        run=filter_dict_remove_false_values(
            dict(
                name=name,
                parallel=parallel,
                description=description,
                is_archived=describe,
            )
        ),
    )

    if submit_driver is not None:
        main_switches["run"]["submit_driver"] = bool(submit_driver)
    if submit_tasks is not None:
        main_switches["run"]["submit_tasks"] = bool(submit_tasks)
    if disable_web_tracker:
        main_switches.setdefault("core", {})["tracker_api"] = "disabled"

    if task_version is not None:
        main_switches["task"] = {"task_version": task_version}

    cmd_line_config = parse_and_build_config_store(
        source="cli", config_values=main_switches
    )

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(_overrides, source="--set-override", override=True)
        )
    if interactive:
        cmd_line_config.update(
            _parse_cli([{"run.interactive": True}], source="--interactive")
        )
    if verbose > 1:
        cmd_line_config.update(
            _parse_cli([{"task_build.verbose": True}], source="-v -v")
        )

    if cmd_line_config:
        config.set_values(cmd_line_config, source="cmdline")
    if verbose:
        logger.info("CLI config: \n%s", pformat_config_store_as_table(cmd_line_config))

    # double-check on bootstrap, as we can run from all kinds of locations;
    # usually we should already be bootstrapped, as we run from the CLI.
    dbnd_bootstrap()
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name, scheduled_date=scheduled_date
        )

    with new_dbnd_context(
        name="run", module=module
    ) as context:  # type: DatabandContext
        task_registry = get_task_registry()

        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

        # modules are loaded, we can load the task
        task_cls = None
        if task_name:
            task_cls = task_registry.get_task_cls(task_name)
            if alternative_task_name:
                task_cls = build_dynamic_task(
                    original_cls=task_cls, new_cls_name=alternative_task_name
                )
                task_name = alternative_task_name

        # --set-root
        # now we can get its config; as it's not the main task, we can load its config after the configuration is loaded
        if task_cls is not None:
            if root_task_config:
                # adding root task to configuration
                config.set_values(
                    {task_cls.task_definition.task_config_section: root_task_config},
                    source="--set-root",
                )

        if is_help or not task_name:
            print_help(ctx, task_cls)
            return

        return context.dbnd_run_task(
            task_or_task_name=task_name,
            run_uid=run_driver,
            scheduled_run_info=scheduled_run_info,
        )
Example #15
def cmd_run(
    ctx,
    is_help,
    task,
    module,
    _sets,
    _sets_config,
    _sets_root,
    _overrides,
    _extend,
    verbose,
    print_task_band,
    describe,
    env,
    parallel,
    conf_file,
    task_version,
    project,
    name,
    description,
    run_driver,
    override_run_uid,
    alternative_task_name,
    job_name,
    scheduled_job_name,
    scheduled_date,
    interactive,
    submit_driver,
    submit_tasks,
    disable_web_tracker,
    open_web_tab,
    docker_build_tag,
):
    """
    Run a task or a DAG

    To see all available tasks, use `dbnd show-tasks` (tab completion is available).
    `dbnd show-configs` will print all available configs.
    """

    from dbnd import config
    from dbnd._core.context.databand_context import DatabandContext, new_dbnd_context
    from dbnd._core.utils.structures import combine_mappings

    task_registry = get_task_registry()

    # we need to do it before we look for the task cls
    load_user_modules(dbnd_config=config, modules=module)

    task_name = task
    # --verbose, --describe, --env, --parallel, --conf-file and --project
    # we filter out false flags since otherwise they will always override the config with their falseness
    main_switches = dict(
        databand=dict(
            verbose=verbose > 0,
            print_task_band=print_task_band,
            describe=describe,
            env=env,
            conf_file=conf_file,
            project=project,
        ),
        run=dict(
            name=name,
            parallel=parallel,
            interactive=interactive,
            description=description,
            is_archived=describe,
            open_web_tracker_in_browser=open_web_tab,
            submit_driver=_nullable_flag(submit_driver),
            submit_tasks=_nullable_flag(submit_tasks),
        ),
        kubernetes=dict(docker_build_tag=docker_build_tag),
        task=dict(task_version=task_version),
        task_build=dict(verbose=True if verbose > 1 else None),
        core=dict(tracker_api="disabled" if disable_web_tracker else None),
    )

    main_switches = cleanup_empty_switches(main_switches)

    _sets = list(_sets)
    _sets_config = list(_sets_config)
    _sets_root = list(_sets_root)

    root_task_config = {}
    for _set in _sets_root:
        root_task_config = combine_mappings(left=root_task_config, right=_set)

    # remove all "first level" config values, assume that they are for the main task
    # add them to _sets_root
    for _set in _sets:
        for k, v in list(_set.items()):
            # so json-like values won't be included
            if "." not in k and isinstance(v, six.string_types):
                root_task_config[k] = v
                del _set[k]

    cmd_line_config = parse_and_build_config_store(source="cli",
                                                   config_values=main_switches)
    # --set, --set-config
    if _sets:
        cmd_line_config.update(_parse_cli(_sets, source="--set"))
    if _sets_config:
        cmd_line_config.update(_parse_cli(_sets_config, source="--set-config"))
    if _extend:
        cmd_line_config.update(
            _parse_cli(_extend, source="--extend-config", extend=True))
    if _overrides:
        cmd_line_config.update(
            _parse_cli(
                _overrides,
                source="--set-override",
                priority=ConfigValuePriority.OVERRIDE,
            ))

    # --set-root
    if root_task_config:
        task_cls = task_registry.get_task_cls(task_name)
        task_section = task_cls.task_definition.task_config_section
        # adding root task to configuration
        cmd_line_config.update(
            parse_and_build_config_store(
                config_values={task_section: root_task_config},
                source="--set-root"))

    # UPDATE CURRENT CONFIG with CLI values
    if cmd_line_config:
        if verbose:
            logger.info("CLI config: \n%s",
                        pformat_config_store_as_table(cmd_line_config))
        config.set_values(cmd_line_config, source="cmdline")

    # double-check on bootstrap, as we can run from all kinds of locations;
    # usually we should already be bootstrapped, as we run from the CLI.
    dbnd_bootstrap()

    # initialize basic logging (until we get to the context logging)
    if not config.getboolean("log", "disabled"):
        configure_basic_logging(None)

    scheduled_run_info = None
    if scheduled_job_name:
        scheduled_run_info = ScheduledRunInfo(
            scheduled_job_name=scheduled_job_name,
            scheduled_date=scheduled_date)

    # update completer
    if config.getboolean("databand", "completer"):
        tasks = task_registry.list_dbnd_task_classes()
        completer.refresh(tasks)

    # bootstrap and modules are loaded, we can load the task
    task_cls = None
    if task_name:
        task_cls = task_registry.get_task_cls(task_name)

    if not task_name:
        print_help(ctx, None)
        return

    if is_help:
        print_help(ctx, task_cls)
        return

    with tracking_mode_context(tracking=False), new_dbnd_context(
            name="run") as context:  # type: DatabandContext
        if context.settings.system.describe:
            # we want to print describe without triggering a real run
            logger.info("Building main task '%s'", task_name)
            root_task = get_task_registry().build_dbnd_task(task_name)
            root_task.ctrl.describe_dag.describe_dag()
            # currently there is a bug in the click version we use with Python 2,
            # so we don't use the click.echo function
            # https://github.com/pallets/click/issues/564
            print("Task %s has been described!" % task_name)
            return root_task
        return context.dbnd_run_task(
            task_or_task_name=task_name,
            force_task_name=alternative_task_name,
            job_name=job_name or alternative_task_name or task_name,
            run_uid=run_driver or override_run_uid,
            existing_run=run_driver is not None,
            scheduled_run_info=scheduled_run_info,
            project=project,
        )
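
run and cmd_run read like click command callbacks (their decorators are not shown in these snippets). If they are registered as click commands, they could be exercised with click's CliRunner; a hypothetical sketch, where the run_cmd import path and the --set value are assumptions:

from click.testing import CliRunner
from dbnd_cli import run_cmd  # hypothetical: import path of the decorated command

# a sketch: invoke the run command for a task with one --set override
result = CliRunner().invoke(run_cmd, ["my_task", "--set", "my_task.x=1"])
print(result.output)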
Example #16
    def start(self,
              root_task_name=None,
              project_name=None,
              airflow_context=None):
        if self._run or self._active or try_get_databand_run():
            return

        # we probably should use only the airflow context via parameter.
        # also, there are mocks that cover only get_dbnd_project_config().airflow_context
        airflow_context = airflow_context or get_dbnd_project_config().airflow_context()
        if airflow_context:
            _set_dbnd_config_from_airflow_connections()

        _set_tracking_config_overide(airflow_context=airflow_context)
        dc = self._enter_cm(
            new_dbnd_context(name="inplace_tracking"))  # type: DatabandContext

        if not root_task_name:
            # extract the name of the script we are running (in the Airflow scenario it will be just "airflow")
            root_task_name = sys.argv[0].split(os.path.sep)[-1]

        if airflow_context:
            root_task, job_name, source, run_uid = build_run_time_airflow_task(
                airflow_context, root_task_name)
            try_number = airflow_context.try_number
        else:
            root_task = _build_inline_root_task(root_task_name)
            job_name = root_task_name
            source = UpdateSource.generic_tracking
            run_uid = None
            try_number = 1

        tracking_source = (
            None  # TODO_CORE build tracking_source -> typeof TrackingSourceSchema
        )
        self._run = run = self._enter_cm(
            new_databand_run(
                context=dc,
                job_name=job_name,
                run_uid=run_uid,
                existing_run=run_uid is not None,
                source=source,
                af_context=airflow_context,
                tracking_source=tracking_source,
                project_name=project_name,
            ))  # type: DatabandRun

        self._run.root_task = root_task

        self.update_run_from_airflow_context(airflow_context)

        if not self._atexit_registered:
            _set_process_exit_handler(self.stop)
            self._atexit_registered = True

        sys.excepthook = self.stop_on_exception
        self._active = True

        # now we send data to DB
        root_task_run = run._build_and_add_task_run(
            root_task, task_af_id=root_task.task_name, try_number=try_number)

        root_task_run.is_root = True

        run.tracker.init_run()
        run.root_task_run.set_task_run_state(TaskRunState.RUNNING)

        should_capture_log = TrackingConfig.from_databand_context().capture_tracking_log
        self._enter_cm(
            run.root_task_run.runner.task_run_execution_context(
                capture_log=should_capture_log, handle_sigterm=False))
        self._task_run = run.root_task_run

        return self._task_run
Example #17
    def test_foreign_context_should_not_fail(self):
        with new_dbnd_context():
            t = SimplestTask()
            t.dbnd_run()

        TTaskWithInput(t_input=t).dbnd_run()
Example #18
    def set_context(self, ti):
        """
        Airflow's log handler use this method to setup the context when running a TaskInstance(=ti).
        We use this method to setup the dbnd context and communicate information to
        the `<airflow_operator>_execute` task, that we create in `execute_tracking.py`.
        """
        # we set up only when we are not in our own orchestration dag
        if ti.dag_id.startswith(AD_HOC_DAG_PREFIX):
            return

        if not is_dag_eligable_for_tracking(ti.dag_id):
            return

        if config.getboolean("mlflow_tracking", "databand_tracking"):
            self.airflow_logger.warning(
                "dbnd can't track mlflow and airflow together; please disable the dbnd config "
                "`databand_tracking` in section `mlflow_tracking`")
            return

        # we are not tracking SubDagOperator
        if ti.operator is None or ti.operator == SubDagOperator.__name__:
            return

        # Airflow runs with two processes, `run` and `--raw run`,
        # but we want the handler to run only once (idempotency),
        # so we use an environment variable to sync those two processes
        task_key = calc_task_key_from_af_ti(ti)
        if os.environ.get(task_key, False):
            # This key is already set, which means we are in a `--raw run`
            return
        else:
            # We are in the outer `run`
            self.task_env_key = task_key
            # mark the environment with the current key
            environ_utils.set_on(task_key)
            from dbnd_airflow.tracking.dbnd_airflow_conf import (
                set_dbnd_config_from_airflow_connections, )

            # When tracking a `--raw run`, the main airflow process runs
            # for every task, which made some of the features run twice:
            # once in the `worker` process and once in the `main` one,
            # sometimes with different configurations.
            # It still runs twice, but now with the same configuration.
            set_dbnd_config_from_airflow_connections()

        self.task_run_attempt_uid = get_task_run_attempt_uid_from_af_ti(ti)

        # airflow calculation for the relevant log_file
        log_relative_path = self.log_file_name_factory(ti, ti.try_number)
        self.log_file = os.path.join(self.airflow_base_log_dir,
                                     log_relative_path)

        # make sure we are not polluting the airflow logs
        get_dbnd_project_config().quiet_mode = True

        # tracking msg
        self.airflow_logger.info("Databand Tracking Started {version}".format(
            version=dbnd.__version__))

        # context with disabled logs
        self.dbnd_context_manage = new_dbnd_context(
            conf={"log": {
                "disabled": True
            }})
        self.dbnd_context = self.dbnd_context_manage.__enter__()
Example #19
    def __init__(self, af_context):
        # type: (AirflowTaskContext) -> None
        self.run_uid = get_job_run_uid(
            dag_id=af_context.root_dag_id,
            execution_date=af_context.execution_date)
        self.dag_id = af_context.dag_id
        # this is the real operator uid; we need to connect our "tracked" task to it,
        # so the moment the monitor is on -> we can sync
        af_runtime_op_task_id = af_context.task_id
        self.af_operator_sync__task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, af_runtime_op_task_id)
        # 1. create proper DatabandContext so we can create other objects
        set_tracking_config_overide(
            use_dbnd_log=override_airflow_log_system_for_tracking())

        # create databand context
        with new_dbnd_context(name="airflow") as dc:  # type: DatabandContext

            # now create "operator" task for current task_id,
            # we can't actually run it, we even don't know when it's going to finish
            # current execution is inside the operator, this is the only thing we know
            # STATE AFTER INIT:
            # AirflowOperator__runtime ->  DAG__runtime
            task_target_date = pendulum.parse(af_context.execution_date,
                                              tz=pytz.UTC).date()
            # AIRFLOW OPERATOR RUNTIME

            af_runtime_op = AirflowOperatorRuntimeTask(
                task_family=task_name_for_runtime(af_runtime_op_task_id),
                dag_id=af_context.dag_id,
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
                task_version="%s:%s" %
                (af_runtime_op_task_id, af_context.execution_date),
            )

            # this is the real operator uid; we need to connect our "tracked" task to it,
            # so the moment the monitor is on -> we can sync
            af_db_op_task_run_uid = get_task_run_uid(self.run_uid,
                                                     af_context.dag_id,
                                                     af_runtime_op_task_id)
            af_runtime_op.task_meta.extra_parents_task_run_uids.add(
                af_db_op_task_run_uid)
            af_runtime_op.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
                af_context.dag_id)

            self.af_operator_runtime__task = af_runtime_op
            # AIRFLOW DAG RUNTIME
            self.af_dag_runtime__task = AirflowDagRuntimeTask(
                task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
                dag_id=af_context.root_dag_id,  # <- ROOT DAG!
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
            )
            _add_child(self.af_dag_runtime__task,
                       self.af_operator_runtime__task)

            # this will create a databand run with driver and root tasks.
            # we need the "root" task to be the same between different airflow task invocations;
            # since in dbnd we must have a single root task, we create a "dummy" task with the dag_id name

            # create databand run
            with new_databand_run(
                    context=dc,
                    task_or_task_name=self.af_dag_runtime__task,
                    run_uid=self.run_uid,
                    existing_run=False,
                    job_name=af_context.root_dag_id,
                    send_heartbeat=False,  # we don't send heartbeat in tracking
                    source=UpdateSource.airflow_tracking,
            ) as dr:
                self.dr = dr
                dr._init_without_run()
                self.airflow_operator__task_run = dr.get_task_run_by_id(
                    af_runtime_op.task_id)