Exemplo n.º 1
0
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        """Scalar and collection metrics logged via the tracker round-trip through the file store."""
        store_dir = target(str(tmpdir))

        run_stub = Mock()
        run_stub.meta_files = TaskRunMetaFiles(store_dir)
        file_store = FileTrackingStore()
        tracker = TaskRunTracker(task_run=run_stub, tracking_store=file_store)
        tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )

        # log a mix of scalar and collection values
        for name, value in (
            ("a", 1),
            ("a_string", "1"),
            ("a_list", [1, 3]),
            ("a_tuple", (1, 2)),
        ):
            tracker.log_metric(name, value)

        reader = TaskRunMetricsFileStoreReader(store_dir)
        user_metrics = reader.get_all_metrics_values(MetricSource.user)

        # numeric-looking strings come back as floats; collections as their repr
        assert user_metrics == {
            "a": 1.0,
            "a_list": "[1, 3]",
            "a_string": 1.0,
            "a_tuple": "(1, 2)",
        }
Exemplo n.º 2
0
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        """Histogram metrics produced by log_data() are readable from the histograms source.

        Fix: removed the `expected_preview` local — it was built but never used
        anywhere in this test, so it was pure dead code.
        """
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        tr_tracker.settings.tracking.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

        hist_metrics = TaskRunMetricsFileStoreReader(
            metrics_folder
        ).get_all_metrics_values(MetricSource.histograms)

        # std value varies in different py versions due to float precision fluctuation,
        # so compare with a tolerance instead of exact equality
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)
Exemplo n.º 3
0
 def test_task_run_url(self):
     """task_run_url() joins base url, job name, run uid and task-run uid with slashes."""
     base_url = "http://example.com"
     run_uid = "00000000-0000-0000-0000-000000000001"
     task_run_uid = "00000000-0000-0000-0000-000000000002"

     task_run_stub = Mock()
     task_run_stub.run.tracker.databand_url = base_url
     task_run_stub.run.job_name = "test_job"
     task_run_stub.run.run_uid = run_uid
     task_run_stub.task_run_uid = task_run_uid

     tracker = TaskRunTracker(task_run_stub, {})
     expected = "/".join([base_url, "app", "jobs", "test_job", run_uid, task_run_uid])
     assert tracker.task_run_url() == expected
Exemplo n.º 4
0
def try_get_or_create_task_run():
    # type: () -> Optional[TaskRun]
    """Return the active task run, creating an on-demand tracking one if possible.

    Lookup order:
      1. an already-active task run (try_get_current_task_run),
      2. a TaskRunMock driven by the DBND_TASK_RUN_ATTEMPT_UID env var,
      3. a task run derived from an ambient Airflow context,
      4. a fresh inplace run (dbnd_run_start).
    Returns None when none of these apply or tracking init fails.
    NOTE(review): the original type comment said TaskRunTracker, but every
    return path yields a task-run object (or None) — corrected here.
    """
    task_run = try_get_current_task_run()
    if task_run:
        return task_run

    try:
        from dbnd._core.task_run.task_run_tracker import TaskRunTracker
        from dbnd._core.configuration.environ_config import DBND_TASK_RUN_ATTEMPT_UID

        # an external process (e.g. spark submit) may have pre-allocated an attempt uid
        tra_uid = os.environ.get(DBND_TASK_RUN_ATTEMPT_UID)
        if tra_uid:
            task_run = TaskRunMock(tra_uid)
            from dbnd import config
            from dbnd._core.settings import CoreConfig

            # tracking is best-effort: never fail the user process on tracker errors
            with config({CoreConfig.tracker_raise_on_error: False},
                        source="ondemand_tracking"):
                tracking_store = CoreConfig().get_tracking_store()
                trt = TaskRunTracker(task_run, tracking_store)
                task_run.tracker = trt
                return task_run

        # let's check if we are in airflow env
        from dbnd._core.inplace_run.airflow_dag_inplace_tracking import (
            try_get_airflow_context, )

        airflow_context = try_get_airflow_context()
        if airflow_context:
            from dbnd._core.inplace_run.airflow_dag_inplace_tracking import (
                get_airflow_tracking_manager, )

            atm = get_airflow_tracking_manager(airflow_context)
            if atm:
                return atm.airflow_operator__task_run
        from dbnd._core.inplace_run.inplace_run_manager import is_inplace_run

        if is_inplace_run():
            return dbnd_run_start()

    except Exception:
        # tracking must never break the tracked process; log and give up
        logger.info("Failed during dbnd inplace tracking init.", exc_info=True)
        return None
Exemplo n.º 5
0
def _get_task_run_mock(tra_uid):
    """
    Build a mocked task run wired to a real tracking store for the given attempt uid.

    We need better implementation for this, currently in use only for spark.
    Returns None when tracking initialization fails for any reason.
    """
    try:
        from dbnd import config
        from dbnd._core.settings import CoreConfig
        from dbnd._core.task_run.task_run_tracker import TaskRunTracker

        mocked_run = TaskRunMock(tra_uid)
        # tracking is best-effort here: suppress tracker errors inside this scope
        conf_patch = {CoreConfig.tracker_raise_on_error: False}
        with config(conf_patch, source="on_demand_tracking"):
            store = CoreConfig().get_tracking_store()
            mocked_run.tracker = TaskRunTracker(mocked_run, store)
            return mocked_run
    except Exception:
        logger.info("Failed during dbnd inplace tracking init.", exc_info=True)
        return None
Exemplo n.º 6
0
    def __init__(
        self,
        task,
        run,
        task_af_id=None,
        try_number=1,
        is_dynamic=None,
        task_engine=None,
    ):
        # type: (Task, DatabandRun, str, int, bool, EngineConfig)-> None
        # actually this is used as Task uid
        """Bind `task` to `run`: set up identity uids, folders, attempt #1 and tracking."""

        self.task = task  # type: Task
        self.run = run  # type: DatabandRun
        self.task_engine = task_engine
        self.try_number = try_number
        # dynamic flag falls back to the task's own setting when not given explicitly
        self.is_dynamic = is_dynamic if is_dynamic is not None else task.task_is_dynamic
        self.is_system = task.task_is_system
        self.task_af_id = task_af_id or self.task.task_id

        if task.ctrl.force_task_run_uid:
            # caller pinned the uid; it may also be a TaskRunUidGen generator object
            self.task_run_uid = tr_uid = task.ctrl.force_task_run_uid
            if isinstance(tr_uid, TaskRunUidGen):
                self.task_run_uid = tr_uid.generate_task_run_uid(
                    run=run, task=task, task_af_id=self.task_af_id
                )
        else:
            self.task_run_uid = get_uuid()

        # used by all kind of submission controllers
        self.job_name = clean_job_name(self.task_af_id).lower()
        self.job_id = self.job_name + "_" + str(self.task_run_uid)[:8]

        # DNS-1123 subdomain name (k8s)
        self.job_id__dns1123 = clean_job_name_dns1123(
            "dbnd.{task_family}.{task_name}".format(
                task_family=self.task.task_meta.task_family,
                task_name=self.task.task_meta.task_name,
            ),
            postfix=".%s" % str(self.task_run_uid)[:8],
        )

        # custom per task engine , or just use one from global env
        dbnd_local_root = (
            self.task_engine.dbnd_local_root or self.run.env.dbnd_local_root
        )
        self.local_task_run_root = (
            dbnd_local_root.folder(run.run_folder_prefix)
            .folder("tasks")
            .folder(self.task.task_id)
        )

        # attempt state; init_attempt() fills folder/meta-files/log for attempt #1
        self._attempt_number = 1
        self.task_run_attempt_uid = get_uuid()
        self.attempt_folder = None
        self.meta_files = None
        self.log = None
        self.init_attempt()

        # TODO: inherit from parent task if disabled
        self.is_tracked = task._conf__tracked

        if self.is_tracked and self.run.is_tracked:
            tracking_store = self.run.context.tracking_store
        else:
            # untracked runs still get console output via a local store
            tracking_store = ConsoleStore()

        self.tracking_store = tracking_store
        self.tracker = TaskRunTracker(task_run=self, tracking_store=tracking_store)
        self.runner = TaskRunRunner(task_run=self)
        self.deploy = TaskSyncCtrl(task_run=self)
        self.task_tracker_url = self.tracker.task_run_url()
        self.external_resource_urls = dict()
        self.errors = []

        # scheduling/lifecycle flags, toggled by the run orchestrator
        self.is_root = False
        self.is_reused = False
        self.is_skipped = False
        # Task can be skipped as it's not required by any other task scheduled to run
        self.is_skipped_as_not_required = False

        self._airflow_context = None
        self._task_run_state = None

        self.start_time = None
        self.finished_time = None
Exemplo n.º 7
0
class TaskRun(object):
    """A single execution of a Task within a DatabandRun.

    Holds the run's identity (uids, job names), per-attempt folders and logs,
    lifecycle state, and the tracking objects that report progress.
    """

    def __init__(
        self,
        task,
        run,
        task_af_id=None,
        try_number=1,
        is_dynamic=None,
        task_engine=None,
    ):
        # type: (Task, DatabandRun, str, int, bool, EngineConfig)-> None
        # actually this is used as Task uid
        """Bind `task` to `run`: set up identity uids, folders, attempt #1 and tracking."""

        self.task = task  # type: Task
        self.run = run  # type: DatabandRun
        self.task_engine = task_engine
        self.try_number = try_number
        # dynamic flag falls back to the task's own setting when not given explicitly
        self.is_dynamic = is_dynamic if is_dynamic is not None else task.task_is_dynamic
        self.is_system = task.task_is_system
        self.task_af_id = task_af_id or self.task.task_id

        if task.ctrl.force_task_run_uid:
            # caller pinned the uid; it may also be a TaskRunUidGen generator object
            self.task_run_uid = tr_uid = task.ctrl.force_task_run_uid
            if isinstance(tr_uid, TaskRunUidGen):
                self.task_run_uid = tr_uid.generate_task_run_uid(
                    run=run, task=task, task_af_id=self.task_af_id
                )
        else:
            self.task_run_uid = get_uuid()

        # used by all kind of submission controllers
        self.job_name = clean_job_name(self.task_af_id).lower()
        self.job_id = self.job_name + "_" + str(self.task_run_uid)[:8]

        # DNS-1123 subdomain name (k8s)
        self.job_id__dns1123 = clean_job_name_dns1123(
            "dbnd.{task_family}.{task_name}".format(
                task_family=self.task.task_meta.task_family,
                task_name=self.task.task_meta.task_name,
            ),
            postfix=".%s" % str(self.task_run_uid)[:8],
        )

        # custom per task engine , or just use one from global env
        dbnd_local_root = (
            self.task_engine.dbnd_local_root or self.run.env.dbnd_local_root
        )
        self.local_task_run_root = (
            dbnd_local_root.folder(run.run_folder_prefix)
            .folder("tasks")
            .folder(self.task.task_id)
        )

        # attempt state; init_attempt() fills folder/meta-files/log for attempt #1
        self._attempt_number = 1
        self.task_run_attempt_uid = get_uuid()
        self.attempt_folder = None
        self.meta_files = None
        self.log = None
        self.init_attempt()

        # TODO: inherit from parent task if disabled
        self.is_tracked = task._conf__tracked

        if self.is_tracked and self.run.is_tracked:
            tracking_store = self.run.context.tracking_store
        else:
            # untracked runs still get console output via a local store
            tracking_store = ConsoleStore()

        self.tracking_store = tracking_store
        self.tracker = TaskRunTracker(task_run=self, tracking_store=tracking_store)
        self.runner = TaskRunRunner(task_run=self)
        self.deploy = TaskSyncCtrl(task_run=self)
        self.task_tracker_url = self.tracker.task_run_url()
        self.external_resource_urls = dict()
        self.errors = []

        # scheduling/lifecycle flags, toggled by the run orchestrator
        self.is_root = False
        self.is_reused = False
        self.is_skipped = False
        # Task can be skipped as it's not required by any other task scheduled to run
        self.is_skipped_as_not_required = False

        self._airflow_context = None
        self._task_run_state = None

        self.start_time = None
        self.finished_time = None

    @property
    def task_run_env(self):
        # run-level environment info shared by all task runs of this run
        return self.run.context.task_run_env

    def task_run_attempt_file(self, *path):
        """Return a target located under the current attempt folder."""
        return target(self.attempt_folder, *path)

    @property
    def last_error(self):
        # most recently recorded error, or None when no errors were appended
        return self.errors[-1] if self.errors else None

    def _get_log_files(self):
        """Return (local_path, remote_path) for the attempt logs; either may be None."""

        log_local = None
        if self.log.local_log_file:
            log_local = self.log.local_log_file.path

        log_remote = None
        if self.log.remote_log_file:
            log_remote = self.log.remote_log_file.path

        return log_local, log_remote

    @property
    def task_run_state(self):
        return self._task_run_state

    @task_run_state.setter
    def task_run_state(self, value):
        # state changes must go through set_task_run_state() so they can be tracked
        raise AttributeError("Please use explicit .set_task_run_state()")

    def set_task_run_state(self, state, track=True, error=None):
        # type: (TaskRunState, bool, Any) -> bool
        # Optional bool track param - will send tracker.set_task_run_state() by default
        """Transition to `state`; return False when `state` is falsy or unchanged."""
        if not state or self._task_run_state == state:
            return False

        if error:
            self.errors.append(error)
        if state == TaskRunState.RUNNING:
            # first transition into RUNNING stamps the start time
            self.start_time = utcnow()

        self._task_run_state = state
        if track:
            self.tracking_store.set_task_run_state(
                task_run=self, state=state, error=error
            )
        return True

    def set_task_reused(self):
        """Mark this run as successful by reuse and notify the tracking store."""
        self._task_run_state = TaskRunState.SUCCESS
        self.tracking_store.set_task_reused(task_run=self)

    def set_external_resource_urls(self, links_dict):
        """Attach external resource links (name -> url) and persist them via the tracker.

        Silently ignores a None dict or any dict containing an empty value.
        """
        # if value is None skip
        if links_dict is None:
            # TODO: Throw exception?
            return

        for link in links_dict:
            if not links_dict[link]:
                # Dict has empty fields
                # TODO: Throw exception?
                return

        self.external_resource_urls.update(links_dict)
        self.tracking_store.save_external_links(
            task_run=self, external_links_dict=links_dict
        )

    @property
    def attempt_number(self):
        return self._attempt_number

    @attempt_number.setter
    def attempt_number(self, value):
        if not value:
            raise DatabandRuntimeError("cannot set None as the attempt number")

        if value != self._attempt_number:
            # new attempt: regenerate attempt uid/folders and re-register with the tracker
            self._attempt_number = value
            self.init_attempt()
            self.run.tracker.tracking_store.add_task_runs(
                run=self.run, task_runs=[self]
            )

    @property
    def airflow_context(self):
        return self._airflow_context

    @airflow_context.setter
    def airflow_context(self, value):
        self._airflow_context = value
        if self._airflow_context:
            # keep attempt number in sync with the airflow task instance try_number
            self.attempt_number = self._airflow_context["ti"].try_number

    def init_attempt(self):
        """Allocate a fresh attempt uid, attempt folder, meta files and log manager."""
        self.task_run_attempt_uid = get_uuid()
        self.attempt_folder = self.task._meta_output.folder(
            "attempt_%s_%s" % (self.attempt_number, self.task_run_attempt_uid),
            extension=None,
        )
        self.meta_files = TaskRunMetaFiles(self.attempt_folder)
        self.log = TaskRunLogManager(task_run=self)

    def __repr__(self):
        return "TaskRun(%s, %s)" % (self.task.task_name, self.task_run_state)
Exemplo n.º 8
0
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        """Metrics and a dataframe logged via the tracker are readable back from the file store."""
        store_dir = target(str(tmpdir))

        run_stub = Mock()
        run_stub.meta_files = TaskRunMetaFiles(store_dir)
        store = FileTrackingStore()
        tracker = TaskRunTracker(task_run=run_stub, tracking_store=store)

        # log scalar/collection metrics plus a dataframe
        for name, value in (
            ("a", 1),
            ("a_string", "1"),
            ("a_list", [1, 3]),
            ("a_tuple", (1, 2)),
        ):
            tracker.log_metric(name, value)
        tracker.log_dataframe("df", pandas_data_frame)

        actual = TaskRunMetricsFileStoreReader(store_dir).get_all_metrics_values()
        print(actual)

        expected = {
            "a": 1.0,
            "a_list": "[1, 3]",
            "a_string": 1.0,
            "a_tuple": "(1, 2)",
            "df.preview": "Names  Births",
            "df.schema": "{",
            "df.shape": "[5, 2]",
            "df.shape_0_": 5.0,
            "df.shape_1_": 2.0,
        }
        assert actual == expected
Exemplo n.º 9
0
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        """Exact check of histogram/stats metrics emitted by log_data for a small dataframe."""
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        tr_tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        tr_tracker.log_data(
            "df", pandas_data_frame, meta_conf=ValueMetaConf.enabled(),
        )

        hist_metrics = TaskRunMetricsFileStoreReader(
            metrics_folder
        ).get_all_metrics_values(MetricSource.histograms)

        expected_preview = (
            "   Names  Births  Married\n"
            "     Bob     968     True\n"
            " Jessica     155    False\n"
            "    Mary      77     True\n"
            "    John     578    False\n"
            "     Mel     973     True"
        )

        # std value varies in different py versions due to float precision fluctuation
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)
        # remove fields that are not stable across environments before the exact
        # comparison (presumably ordering/tie-breaking dependent — TODO confirm)
        hist_metrics["df.histograms"].pop("Names")
        hist_metrics["df.histograms"].pop("Births")
        hist_metrics.pop("df.Married.top")
        hist_metrics.pop("df.Names.top")
        hist_metrics["df.stats"]["Names"].pop("top")
        hist_metrics["df.stats"]["Married"].pop("top")
        assert hist_metrics == {
            "df.Births.type": "int64",
            "df.Births.25%": 155.0,
            "df.Births.50%": 578.0,
            "df.Births.75%": 968.0,
            "df.Births.count": 5.0,
            "df.Births.distinct": 5,
            "df.Births.std": df_births_std,
            "df.Births.max": 973.0,
            "df.Births.mean": 550.2,
            "df.Births.min": 77.0,
            "df.Births.non-null": 5,
            "df.Births.null-count": 0,
            "df.Married.count": 5,
            "df.Married.distinct": 2,
            "df.Married.freq": 3,
            "df.Married.non-null": 5,
            "df.Married.null-count": 0,
            "df.Married.type": "bool",
            "df.Married.unique": 2,
            "df.Names.count": 5,
            "df.Names.distinct": 5,
            "df.Names.freq": 1,
            "df.Names.non-null": 5,
            "df.Names.null-count": 0,
            "df.Names.type": "object",
            "df.Names.unique": 5,
            "df.histograms": {"Married": [[3, 2], [True, False]],},
            "df.preview": expected_preview,
            "df.schema": {
                "columns": ["Names", "Births", "Married"],
                "dtypes": {"Births": "int64", "Names": "object", "Married": "bool"},
                "shape": [5, 3],
                "size": 15,
                "type": "DataFrame",
            },
            "df.shape": [5, 3],
            "df.shape0": 5,
            "df.shape1": 3,
            "df.stats": {
                "Births": {
                    "type": "int64",
                    "25%": 155.0,
                    "50%": 578.0,
                    "75%": 968.0,
                    "count": 5.0,
                    "distinct": 5,
                    "max": 973.0,
                    "mean": 550.2,
                    "min": 77.0,
                    "non-null": 5,
                    "null-count": 0,
                    "std": df_births_std,
                },
                "Married": {
                    "count": 5,
                    "distinct": 2,
                    "freq": 3,
                    "non-null": 5,
                    "null-count": 0,
                    "type": "bool",
                    "unique": 2,
                },
                "Names": {
                    "count": 5,
                    "distinct": 5,
                    "freq": 1,
                    "non-null": 5,
                    "null-count": 0,
                    "type": "object",
                    "unique": 5,
                },
            },
        }
Exemplo n.º 10
0
class TaskRun(object):
    """A single execution of a Task within a DatabandRun (newer variant).

    Holds identity uids and job names, per-attempt folders and logs, lifecycle
    state, and the tracking objects that report progress. Attempt numbers are
    updated via update_task_run_attempt() rather than a property setter.
    """

    def __init__(
        self,
        task,
        run,
        task_af_id=None,
        try_number=1,
        is_dynamic=None,
        task_engine=None,
    ):
        # type: (Task, DatabandRun, str, int, bool, EngineConfig)-> None
        # actually this is used as Task uid
        """Bind `task` to `run`: set up identity uids, folders, the first attempt and tracking."""

        self.task = task  # type: Task
        self.run = run  # type: DatabandRun
        self.task_engine = task_engine
        # dynamic flag falls back to the task's own setting when not given explicitly
        self.is_dynamic = is_dynamic if is_dynamic is not None else task.task_is_dynamic
        self.is_system = task.task_is_system
        self.task_af_id = task_af_id or self.task.task_id

        if task.ctrl.force_task_run_uid:
            # caller pinned the uid; it may also be a TaskRunUidGen generator object
            self.task_run_uid = tr_uid = task.ctrl.force_task_run_uid
            if isinstance(tr_uid, TaskRunUidGen):
                self.task_run_uid = tr_uid.generate_task_run_uid(
                    run=run, task=task, task_af_id=self.task_af_id
                )
        else:
            self.task_run_uid = get_uuid()

        # used by all kind of submission controllers
        self.job_name = clean_job_name(self.task_af_id).lower()
        self.job_id = self.job_name + "_" + str(self.task_run_uid)[:8]

        # custom per task engine , or just use one from global env
        dbnd_local_root = (
            self.task_engine.dbnd_local_root or self.run.env.dbnd_local_root
        )
        self.local_task_run_root = (
            dbnd_local_root.folder(run.run_folder_prefix)
            .folder("tasks")
            .folder(self.task.task_id)
        )

        # attempt state; init_new_task_run_attempt() fills uid/folders/log below
        self.attempt_number = try_number
        self.task_run_attempt_uid = None
        self.attempt_folder = None
        self.meta_files = None
        self.log = None
        self.init_new_task_run_attempt()

        # TODO: inherit from parent task if disabled
        self.is_tracked = task._conf__tracked

        if self.is_tracked and self.run.is_tracked:
            tracking_store = self.run.context.tracking_store
        else:
            # untracked runs fall back to a console-only, failure-tolerant store
            # NOTE(review): "max_retires" looks like a typo of "max_retries" in the
            # store factory's signature — confirm against get_tracking_store()
            tracking_store = get_tracking_store(
                self.run.context,
                tracking_store_names=["console"],
                api_channel_name=None,
                max_retires=1,
                tracker_raise_on_error=False,
                remove_failed_store=True,
            )

        self.tracking_store = tracking_store
        self.tracker = TaskRunTracker(task_run=self, tracking_store=tracking_store)
        self.runner = TaskRunRunner(task_run=self)
        self.deploy = TaskSyncCtrl(task_run=self)
        self.sync_local = TaskRunLocalSyncer(task_run=self)
        self.task_tracker_url = self.tracker.task_run_url()
        self.external_resource_urls = dict()
        self.errors = []

        # scheduling/lifecycle flags, toggled by the run orchestrator
        self.is_root = False
        self.is_reused = False
        self.is_skipped = False
        # Task can be skipped as it's not required by any other task scheduled to run
        self.is_skipped_as_not_required = False

        self.airflow_context = None
        self._task_run_state = None

        self.start_time = None
        self.finished_time = None

    def __getstate__(self):
        """Pickle support: drop the unpicklable airflow context from the state dict."""
        d = self.__dict__.copy()
        if "airflow_context" in d:
            # airflow context contains "auto generated code" that can not be pickled (Vars class)
            # we don't need to pickle it as we pickle DAGs separately
            d = self.__dict__.copy()
            del d["airflow_context"]
        return d

    @property
    def task_run_env(self):
        # run-level environment info shared by all task runs of this run
        return self.run.context.task_run_env

    def task_run_attempt_file(self, *path):
        """Return a target located under the current attempt folder."""
        return target(self.attempt_folder, *path)

    @property
    def last_error(self):
        # most recently recorded error, or None when no errors were appended
        return self.errors[-1] if self.errors else None

    def _get_log_files(self):
        """Return (local_path, remote_path) for the attempt logs; either may be None."""

        log_local = None
        if self.log.local_log_file:
            log_local = self.log.local_log_file.path

        log_remote = None
        if self.log.remote_log_file:
            log_remote = self.log.remote_log_file.path

        return log_local, log_remote

    @property
    def task_run_state(self):
        return self._task_run_state

    @task_run_state.setter
    def task_run_state(self, value):
        # state changes must go through set_task_run_state() so they can be tracked
        raise AttributeError("Please use explicit .set_task_run_state()")

    def set_task_run_state(self, state, track=True, error=None):
        # type: (TaskRunState, bool, Any) -> bool
        # Optional bool track param - will send tracker.set_task_run_state() by default
        """Transition to `state`; return False when `state` is falsy or unchanged."""
        if not state or self._task_run_state == state:
            return False

        if error:
            self.errors.append(error)
        if state == TaskRunState.RUNNING:
            # first transition into RUNNING stamps the start time
            self.start_time = utcnow()

        self._task_run_state = state
        if track:
            self.tracking_store.set_task_run_state(
                task_run=self, state=state, error=error
            )
        return True

    def set_task_reused(self):
        """Mark this run as successful by reuse and notify the tracking store."""
        self._task_run_state = TaskRunState.SUCCESS
        self.tracking_store.set_task_reused(task_run=self)

    def set_external_resource_urls(self, links_dict):
        """Attach external resource links (name -> url) and persist them via the tracker.

        Silently ignores a None dict or any dict containing an empty value.
        """
        # if value is None skip
        if links_dict is None:
            # TODO: Throw exception?
            return

        for link in links_dict:
            if not links_dict[link]:
                # Dict has empty fields
                # TODO: Throw exception?
                return

        self.external_resource_urls.update(links_dict)
        self.tracking_store.save_external_links(
            task_run=self, external_links_dict=links_dict
        )

    def update_task_run_attempt(self, attempt_number):
        """Switch to `attempt_number`, reinitializing attempt uid/folders if it changed."""
        if attempt_number is None:
            raise DatabandRuntimeError("cannot set None as the attempt number")

        if self.attempt_number != attempt_number:
            self.attempt_number = attempt_number
            self.init_new_task_run_attempt()

    def init_new_task_run_attempt(self):
        """Allocate attempt uid and (remote + local) attempt folders, meta files and logs."""
        # trying to find if we should use attempt_uid that been set from external process.
        # if so - the attempt_uid is uniquely for this task_run_attempt, and that why we pop.
        self.task_run_attempt_uid = get_task_run_attempt_uid_by_task_run(self)

        self.attempt_folder = self.task._meta_output.folder(
            "attempt_%s_%s" % (self.attempt_number, self.task_run_attempt_uid),
            extension=None,
        )
        self.attempt_folder_local = self.local_task_run_root.folder(
            "attempt_%s_%s" % (self.attempt_number, self.task_run_attempt_uid),
            extension=None,
        )
        # NOTE(review): attribute name has a typo ("attemp") — kept for compatibility
        self.attemp_folder_local_cache = self.attempt_folder_local.folder("cache")
        self.meta_files = TaskRunMetaFiles(self.attempt_folder)
        self.log = TaskRunLogManager(task_run=self)

    def __repr__(self):
        return "TaskRun(id=%s, af_id=%s)" % (self.task.task_id, self.task_af_id)