示例#1
0
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        """Log scalar, sequence and dataframe metrics and read them back from the file store."""
        metrics_folder = target(str(tmpdir))

        # A Mock stands in for the task run; only ``meta_files`` is set explicitly.
        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        tracker = TaskRunTracker(task_run=task_run, tracking_store=FileTrackingStore())

        logged_values = (
            ("a", 1),
            ("a_string", "1"),
            ("a_list", [1, 3]),
            ("a_tuple", (1, 2)),
        )
        for metric_name, metric_value in logged_values:
            tracker.log_metric(metric_name, metric_value)
        tracker.log_dataframe("df", pandas_data_frame)

        reader = TaskRunMetricsFileStoreReader(metrics_folder)
        actual = reader.get_all_metrics_values()

        print(actual)
        # Values as they are expected to come back from the reader.
        expected = {
            "a": 1.0,
            "a_string": 1.0,
            "a_list": "[1, 3]",
            "a_tuple": "(1, 2)",
            "df.shape": "[5, 2]",
            "df.shape_0_": 5.0,
            "df.shape_1_": 2.0,
            "df.schema": "{",
            "df.preview": "Names  Births",
        }
        assert actual == expected
示例#2
0
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        """Log user metrics and read back only the ``MetricSource.user`` values."""
        metrics_folder = target(str(tmpdir))

        # A Mock stands in for the task run; only ``meta_files`` is set explicitly.
        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        tracker = TaskRunTracker(task_run=task_run, tracking_store=FileTrackingStore())

        # Make the settings lookup return an enabled value-meta configuration.
        enabled_conf_lookup = Mock(return_value=ValueMetaConf.enabled())
        tracker.settings.features.get_value_meta_conf = enabled_conf_lookup

        logged_values = (
            ("a", 1),
            ("a_string", "1"),
            ("a_list", [1, 3]),
            ("a_tuple", (1, 2)),
        )
        for metric_name, metric_value in logged_values:
            tracker.log_metric(metric_name, metric_value)

        reader = TaskRunMetricsFileStoreReader(metrics_folder)
        user_metrics = reader.get_all_metrics_values(MetricSource.user)

        expected = {
            "a": 1.0,
            "a_string": 1.0,
            "a_list": "[1, 3]",
            "a_tuple": "(1, 2)",
        }
        assert user_metrics == expected
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        """Log a dataframe with value meta enabled and check the ``df.Births.std`` histogram metric."""
        metrics_folder = target(str(tmpdir))

        # A Mock stands in for the task run; only ``meta_files`` is set explicitly.
        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        tracker = TaskRunTracker(task_run=task_run, tracking_store=FileTrackingStore())

        # Make the settings lookup return an enabled value-meta configuration.
        enabled_conf_lookup = Mock(return_value=ValueMetaConf.enabled())
        tracker.settings.tracking.get_value_meta_conf = enabled_conf_lookup
        tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

        reader = TaskRunMetricsFileStoreReader(metrics_folder)
        hist_metrics = reader.get_all_metrics_values(MetricSource.histograms)

        # Not referenced by any assertion in this method as shown.
        expected_preview = "\n".join(
            [
                "   Names  Births  Married",
                "     Bob     968     True",
                " Jessica     155    False",
                "    Mary      77     True",
                "    John     578    False",
                "     Mel     973     True",
            ]
        )

        # std value varies in different py versions due to float precision fluctuation
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)
示例#4
0
 def init_attempt(self):
     """Set up a new attempt: fresh uid, attempt folder, meta files and log manager."""
     self.task_run_attempt_uid = get_uuid()

     # The folder name combines the attempt number with the freshly generated uid.
     folder_name = "attempt_%s_%s" % (
         self.attempt_number,
         self.task_run_attempt_uid,
     )
     meta_output = self.task._meta_output
     self.attempt_folder = meta_output.folder(folder_name, extension=None)

     self.meta_files = TaskRunMetaFiles(self.attempt_folder)
     self.log = TaskRunLogManager(task_run=self)
示例#5
0
    def init_new_task_run_attempt(self):
        """Set up a new attempt: uid, remote/local attempt folders, meta files and log manager."""
        # The uid comes from get_task_run_attempt_uid_by_task_run. Per the original
        # note, an uid set by an external process is used when there is one; it is
        # unique to this attempt, which is why the helper is said to pop it
        # (the helper itself is not visible here).
        self.task_run_attempt_uid = get_task_run_attempt_uid_by_task_run(self)

        # The same folder name is used under both the meta output and the local root.
        folder_name = "attempt_%s_%s" % (
            self.attempt_number,
            self.task_run_attempt_uid,
        )
        self.attempt_folder = self.task._meta_output.folder(
            folder_name, extension=None
        )
        self.attempt_folder_local = self.local_task_run_root.folder(
            folder_name, extension=None
        )

        local_cache = self.attempt_folder_local.folder("cache")
        self.attemp_folder_local_cache = local_cache

        self.meta_files = TaskRunMetaFiles(self.attempt_folder)
        self.log = TaskRunLogManager(task_run=self)
示例#6
0
 def __init__(self, attempt_folder, **kwargs):
     """Create a reader for ``attempt_folder``; extra keyword arguments go to the parent initializer."""
     super(TaskRunMetricsFileStoreReader, self).__init__(**kwargs)
     # All file locations are resolved through this TaskRunMetaFiles instance.
     meta_files = TaskRunMetaFiles(attempt_folder)
     self.meta = meta_files
示例#7
0
class TaskRunMetricsFileStoreReader(object):
    """Reads metrics, run info and artifacts stored for a task-run attempt.

    File locations are resolved through a ``TaskRunMetaFiles`` instance built
    for the given attempt folder.
    """

    def __init__(self, attempt_folder, **kwargs):
        """Create a reader for ``attempt_folder``; extra kwargs go to the parent initializer."""
        super(TaskRunMetricsFileStoreReader, self).__init__(**kwargs)
        self.meta = TaskRunMetaFiles(attempt_folder)

    def _get_all_metrics_names(self, source=None):
        """Return the base names of all partitions of the metrics folder for ``source``."""
        metrics_root = self.meta.get_metric_folder(source=source)
        return [os.path.basename(str(p)) for p in metrics_root.list_partitions()]

    def get_metric_history(self, key, source=None):
        """Return every recorded value of metric ``key`` as a list of ``Metric``.

        Each non-blank line of the metric file is parsed as ``TS VALUE``, with
        the value converted to ``float`` and the timestamp to ``datetime``.

        Raises:
            DatabandError: if the metric target does not exist.
            ValueError: if a line does not have two parts or is not numeric.
        """
        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandError("Metric '%s' not found" % key)

        history = []
        for raw_line in metric_target.readlines():
            line = raw_line.strip()
            if not line:
                # blank lines carry no data; unpacking them would fail below
                continue
            # split once on any whitespace run so repeated separators are tolerated
            ts, val = line.split(None, 1)
            history.append(
                Metric(key, float(val), datetime.fromtimestamp(int(ts)))
            )
        return history

    def get_all_metrics_values(self, source=None):
        """Return a ``{metric key: value}`` dict for every metric found for ``source``.

        Raises:
            DatabandError: wrapping any exception raised while reading a metric.
        """
        metrics = []
        for key in self._get_all_metrics_names(source=source):
            try:
                metrics.append(self.get_metric(key, source=source))
            except Exception as ex:
                raise DatabandError(
                    "Failed to read metrics for %s at %s" %
                    (key, self.meta.root),
                    nested_exceptions=ex,
                )
        return {m.key: m.value for m in metrics}

    def get_run_info(self):
        """Load the meta data YAML file and pass its content to ``RunInfoSchema().load``."""

        from dbnd.api.serialization.run import RunInfoSchema

        with self.meta.get_meta_data_file().open("r") as yaml_file:
            # NOTE(review): yaml.load without an explicit Loader can construct
            # arbitrary objects on older PyYAML and is rejected by newer
            # releases - consider yaml.safe_load if the file is plain data.
            return RunInfoSchema().load(**yaml.load(yaml_file))

    def get_metric(self, key, source=None):
        """Return a ``Metric`` built from the first line of the file for ``key``.

        The line must match ``_METRICS_RE``; its two groups are taken as the
        timestamp and the raw value, which is converted by ``_parse_metric``.

        Raises:
            DatabandRuntimeError: if the metric is missing, empty or malformed.
        """
        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandRuntimeError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        if not metric_data:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. No data found." % key)
        first_line = metric_data[0]

        metric_parsed = _METRICS_RE.match(first_line)
        if not metric_parsed:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. Expected format: 'TS VALUE', got='%s'"
                % (key, first_line))

        timestamp, val = metric_parsed.groups()

        return Metric(
            key=key,
            value=_parse_metric(val),
            timestamp=datetime.fromtimestamp(int(timestamp)),
        )

    def get_artifact(self, name):
        """Return an ``Artifact`` for the stored artifact ``name``.

        Raises:
            DatabandError: if the artifact target does not exist.
        """
        artifact_target = self.meta.get_artifact_target(name)
        if not artifact_target.exists():
            raise DatabandError("Artifact '%s' not found" % name)
        return Artifact(artifact_target.path)
示例#8
0
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        """Log a dataframe with value meta enabled and compare all histogram metrics read back."""
        metrics_folder = target(str(tmpdir))

        # A Mock stands in for the task run; only ``meta_files`` is set explicitly.
        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        tracker = TaskRunTracker(task_run=task_run, tracking_store=FileTrackingStore())

        # Make the settings lookup return an enabled value-meta configuration.
        enabled_conf_lookup = Mock(return_value=ValueMetaConf.enabled())
        tracker.settings.features.get_value_meta_conf = enabled_conf_lookup
        tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

        reader = TaskRunMetricsFileStoreReader(metrics_folder)
        hist_metrics = reader.get_all_metrics_values(MetricSource.histograms)

        expected_preview = "\n".join(
            [
                "   Names  Births  Married",
                "     Bob     968     True",
                " Jessica     155    False",
                "    Mary      77     True",
                "    John     578    False",
                "     Mel     973     True",
            ]
        )

        # std value varies in different py versions due to float precision fluctuation
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)

        # These entries are removed before the exact comparison below.
        for column in ("Names", "Births"):
            hist_metrics["df.histograms"].pop(column)
        for flat_key in ("df.Married.top", "df.Names.top"):
            hist_metrics.pop(flat_key)
        for column in ("Names", "Married"):
            hist_metrics["df.stats"][column].pop("top")

        expected_stats = {
            "Births": {
                "type": "int64",
                "count": 5.0,
                "distinct": 5,
                "non-null": 5,
                "null-count": 0,
                "min": 77.0,
                "25%": 155.0,
                "50%": 578.0,
                "75%": 968.0,
                "max": 973.0,
                "mean": 550.2,
                "std": df_births_std,
            },
            "Married": {
                "type": "bool",
                "count": 5,
                "distinct": 2,
                "unique": 2,
                "freq": 3,
                "non-null": 5,
                "null-count": 0,
            },
            "Names": {
                "type": "object",
                "count": 5,
                "distinct": 5,
                "unique": 5,
                "freq": 1,
                "non-null": 5,
                "null-count": 0,
            },
        }
        expected = {
            "df.histograms": {"Married": [[3, 2], [True, False]]},
            "df.preview": expected_preview,
            "df.schema": {
                "type": "DataFrame",
                "columns": ["Names", "Births", "Married"],
                "dtypes": {"Births": "int64", "Names": "object", "Married": "bool"},
                "shape": [5, 3],
                "size": 15,
            },
            "df.shape": [5, 3],
            "df.shape0": 5,
            "df.shape1": 3,
            "df.stats": expected_stats,
        }
        # Every per-column stat is also expected as a flat "df.<column>.<stat>" metric.
        for column, column_stats in expected_stats.items():
            for stat_name, stat_value in column_stats.items():
                expected["df.{}.{}".format(column, stat_name)] = stat_value

        assert hist_metrics == expected
示例#9
0
class TaskRunMetricsFileStoreReader(object):
    """Reads metrics, run info and artifacts stored for a task-run attempt.

    File locations are resolved through a ``TaskRunMetaFiles`` instance built
    for the given attempt folder.
    """

    def __init__(self, attempt_folder, **kwargs):
        """Create a reader for ``attempt_folder``; extra kwargs go to the parent initializer."""
        super(TaskRunMetricsFileStoreReader, self).__init__(**kwargs)
        self.meta = TaskRunMetaFiles(attempt_folder)

    def _get_all_metrics_names(self, source=None):
        """Return metric names found in the metrics folder for ``source``.

        A name is the partition's base name with a trailing ``.json`` removed;
        ``get_histogram_metrics`` appends that extension again when reading.
        """
        metrics_root = self.meta.get_metric_folder(source=source)
        suffix = ".json"
        names = []
        for partition in metrics_root.list_partitions():
            name = os.path.basename(str(partition))
            # strip the extension only at the end, so a ".json" inside the
            # name survives and the file can still be located afterwards
            if name.endswith(suffix):
                name = name[: -len(suffix)]
            names.append(name)
        return names

    def get_metric_history(self, key, source=None):
        """Return every recorded value of metric ``key`` as a list of ``Metric``.

        Each non-blank line of the metric file is parsed as ``TS VALUE``, with
        the value converted to ``float`` and the timestamp to ``datetime``.

        Raises:
            DatabandError: if the metric target does not exist.
            ValueError: if a line does not have two parts or is not numeric.
        """
        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandError("Metric '%s' not found" % key)

        history = []
        for raw_line in metric_target.readlines():
            line = raw_line.strip()
            if not line:
                # blank lines carry no data; unpacking them would fail below
                continue
            # split once on any whitespace run so repeated separators are tolerated
            ts, val = line.split(None, 1)
            history.append(
                Metric(key, float(val), datetime.fromtimestamp(int(ts)))
            )
        return history

    def get_all_metrics_values(self, source=None):
        """Return a ``{metric key: value}`` dict for every metric found for ``source``.

        Raises:
            DatabandError: wrapping any exception raised while reading a metric.
        """
        metrics = []
        for key in self._get_all_metrics_names(source=source):
            try:
                # extend() consumes the iterable here, so errors raised lazily
                # by the histogram generator are wrapped as well
                metrics.extend(self.get_metrics(key, source=source))
            except Exception as ex:
                raise DatabandError(
                    "Failed to read metrics for %s at %s" %
                    (key, self.meta.root),
                    nested_exceptions=ex,
                )
        return {m.key: m.value for m in metrics}

    def get_run_info(self):
        """Load the meta data YAML file and pass its content to ``RunInfoSchema().load``."""
        with self.meta.get_meta_data_file().open("r") as yaml_file:
            # NOTE(review): yaml.load without an explicit Loader can construct
            # arbitrary objects on older PyYAML and is rejected by newer
            # releases - consider yaml.safe_load if the file is plain data.
            return RunInfoSchema().load(**yaml.load(yaml_file))

    def get_metrics(self, key, source=None):
        # type: (str, MetricSource) -> Iterable[Metric]
        """Return the metrics stored under ``key`` for ``source``.

        For ``MetricSource.histograms`` the work is delegated to
        ``get_histogram_metrics``. Otherwise the first line of the metric file
        must match ``_METRICS_RE`` and a one-element list is returned.

        Raises:
            DatabandRuntimeError: if the metric is missing, empty or malformed.
        """
        if source == MetricSource.histograms:
            return self.get_histogram_metrics(key)

        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandRuntimeError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        if not metric_data:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. No data found." % key)
        first_line = metric_data[0]

        metric_parsed = _METRICS_RE.match(first_line)
        if not metric_parsed:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. Expected format: 'TS VALUE', got='%s'"
                % (key, first_line))

        timestamp, val = metric_parsed.groups()

        metric = Metric(
            key=key,
            value=_parse_metric(val),
            timestamp=datetime.fromtimestamp(int(timestamp)),
        )
        return [metric]

    def get_histogram_metrics(self, key):
        # type: (str) -> Iterable[Metric]
        """Yield the metrics stored in the ``<key>.json`` histogram file.

        The file is read as JSON with a ``timestamp`` and a ``metrics``
        mapping. Each entry yields a ``Metric`` keyed ``<key>.<name>``; dict
        and list values are passed as ``value_json``. The ``stats`` entry is
        also flattened into ``<key>.<column>.<stat>`` metrics and the
        ``shape`` entry into ``<key>.shape<dim>`` metrics.
        """
        metric_target = self.meta.get_metric_target(
            "{}.json".format(key), source=MetricSource.histograms)
        hist_metrics = json.load(metric_target)
        # all metrics of one file share the same timestamp; convert it once
        recorded_at = datetime.fromtimestamp(hist_metrics["timestamp"])
        metrics = hist_metrics["metrics"]
        for name, value in metrics.items():
            if not isinstance(value, (dict, list)):
                yield Metric(
                    key="{}.{}".format(key, name),
                    value=value,
                    timestamp=recorded_at,
                )
                continue

            yield Metric(
                key="{}.{}".format(key, name),
                value_json=value,
                timestamp=recorded_at,
            )
            if name == "stats":
                for column, stats in value.items():
                    for stat, val in stats.items():
                        yield Metric(
                            key="{}.{}.{}".format(key, column, stat),
                            value=val,
                            timestamp=recorded_at,
                        )
            elif name == "shape":
                for dim, val in enumerate(value):
                    yield Metric(
                        key="{}.shape{}".format(key, dim),
                        value=val,
                        timestamp=recorded_at,
                    )

    def get_artifact(self, name):
        """Return an ``Artifact`` for the stored artifact ``name``.

        Raises:
            DatabandError: if the artifact target does not exist.
        """
        artifact_target = self.meta.get_artifact_target(name)
        if not artifact_target.exists():
            raise DatabandError("Artifact '%s' not found" % name)
        return Artifact(artifact_target.path)