def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.log_metric("a", 1)
    tr_tracker.log_metric("a_string", "1")
    tr_tracker.log_metric("a_list", [1, 3])
    tr_tracker.log_metric("a_tuple", (1, 2))
    tr_tracker.log_dataframe("df", pandas_data_frame)

    actual = TaskRunMetricsFileStoreReader(metrics_folder).get_all_metrics_values()
    print(actual)
    # the reader only parses the first 'TS VALUE' line of each metric file,
    # so multi-line values (df.preview, df.schema) come back truncated
    assert actual == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
        "df.preview": "Names Births",
        "df.schema": "{",
        "df.shape": "[5, 2]",
        "df.shape_0_": 5.0,
        "df.shape_1_": 2.0,
    }
def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_metric("a", 1)
    tr_tracker.log_metric("a_string", "1")
    tr_tracker.log_metric("a_list", [1, 3])
    tr_tracker.log_metric("a_tuple", (1, 2))

    user_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.user)
    assert user_metrics == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
    }
def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.tracking.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

    hist_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.histograms)

    expected_preview = (
        " Names Births Married\n"
        " Bob 968 True\n"
        " Jessica 155 False\n"
        " Mary 77 True\n"
        " John 578 False\n"
        " Mel 973 True"
    )

    # the std value varies across Python versions due to float precision fluctuation
    df_births_std = hist_metrics["df.Births.std"]
    assert df_births_std == pytest.approx(428.4246)
def init_attempt(self):
    self.task_run_attempt_uid = get_uuid()
    self.attempt_folder = self.task._meta_output.folder(
        "attempt_%s_%s" % (self.attempt_number, self.task_run_attempt_uid),
        extension=None,
    )
    self.meta_files = TaskRunMetaFiles(self.attempt_folder)
    self.log = TaskRunLogManager(task_run=self)
def init_new_task_run_attempt(self):
    # Check whether an attempt_uid has been set by an external process. If so, that
    # attempt_uid belongs uniquely to this task_run_attempt, which is why we pop it.
    self.task_run_attempt_uid = get_task_run_attempt_uid_by_task_run(self)
    self.attempt_folder = self.task._meta_output.folder(
        "attempt_%s_%s" % (self.attempt_number, self.task_run_attempt_uid),
        extension=None,
    )
    self.attempt_folder_local = self.local_task_run_root.folder(
        "attempt_%s_%s" % (self.attempt_number, self.task_run_attempt_uid),
        extension=None,
    )
    self.attemp_folder_local_cache = self.attempt_folder_local.folder("cache")
    self.meta_files = TaskRunMetaFiles(self.attempt_folder)
    self.log = TaskRunLogManager(task_run=self)
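# Hypothetical illustration (not from the source): the attempt-folder naming scheme
# shared by init_attempt and init_new_task_run_attempt above; values are made up.
def _attempt_folder_name(attempt_number, task_run_attempt_uid):
    return "attempt_%s_%s" % (attempt_number, task_run_attempt_uid)

assert _attempt_folder_name(1, "ab12cd") == "attempt_1_ab12cd"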
class TaskRunMetricsFileStoreReader(object):
    def __init__(self, attempt_folder, **kwargs):
        super(TaskRunMetricsFileStoreReader, self).__init__(**kwargs)
        self.meta = TaskRunMetaFiles(attempt_folder)

    def _get_all_metrics_names(self, source=None):
        metrics_root = self.meta.get_metric_folder(source=source)
        all_files = [os.path.basename(str(p)) for p in metrics_root.list_partitions()]
        return all_files

    def get_metric_history(self, key, source=None):
        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        rsl = []
        for pair in metric_data:
            ts, val = pair.strip().split(" ")
            rsl.append(Metric(key, float(val), datetime.fromtimestamp(int(ts))))
        return rsl

    def get_all_metrics_values(self, source=None):
        metrics = []
        for key in self._get_all_metrics_names(source=source):
            try:
                metrics.append(self.get_metric(key, source=source))
            except Exception as ex:
                raise DatabandError(
                    "Failed to read metrics for %s at %s" % (key, self.meta.root),
                    nested_exceptions=ex,
                )
        return {m.key: m.value for m in metrics}

    def get_run_info(self):
        from dbnd.api.serialization.run import RunInfoSchema

        with self.meta.get_meta_data_file().open("r") as yaml_file:
            return RunInfoSchema().load(**yaml.load(yaml_file))

    def get_metric(self, key, source=None):
        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandRuntimeError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        if len(metric_data) == 0:
            raise DatabandRuntimeError("Metric '%s' is malformed. No data found." % key)
        # only the first line is parsed, so multi-line values are truncated to line one
        first_line = metric_data[0]
        metric_parsed = _METRICS_RE.match(first_line)
        if not metric_parsed:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. Expected format: 'TS VALUE', got='%s'"
                % (key, first_line)
            )
        timestamp, val = metric_parsed.groups()
        return Metric(
            key=key,
            value=_parse_metric(val),
            timestamp=datetime.fromtimestamp(int(timestamp)),
        )

    def get_artifact(self, name):
        artifact_target = self.meta.get_artifact_target(name)
        if not artifact_target.exists():
            raise DatabandError("Artifact '%s' not found" % name)
        return Artifact(artifact_target.path)
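# A minimal sketch of the metric-file format implied by get_metric() above: the
# first line holds 'TS VALUE' (epoch seconds, then the serialized value), which is
# also why multi-line values such as df.preview come back truncated to one line.
# _METRICS_RE is not defined in this excerpt; the pattern below is an assumption
# consistent with the error message and with datetime.fromtimestamp(int(timestamp)).
import re
from datetime import datetime

_METRICS_RE_SKETCH = re.compile(r"^(\d+) (.*)$")  # hypothetical stand-in for _METRICS_RE

first_line = "1589896800 550.2"
ts, raw_value = _METRICS_RE_SKETCH.match(first_line).groups()
print(datetime.fromtimestamp(int(ts)), raw_value)  # e.g. 2020-05-19 ... 550.2 (local time)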
def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

    hist_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.histograms)

    expected_preview = (
        " Names Births Married\n"
        " Bob 968 True\n"
        " Jessica 155 False\n"
        " Mary 77 True\n"
        " John 578 False\n"
        " Mel 973 True"
    )

    # the std value varies across Python versions due to float precision fluctuation
    df_births_std = hist_metrics["df.Births.std"]
    assert df_births_std == pytest.approx(428.4246)

    # remove entries that are not stable enough to assert on exactly
    hist_metrics["df.histograms"].pop("Names")
    hist_metrics["df.histograms"].pop("Births")
    hist_metrics.pop("df.Married.top")
    hist_metrics.pop("df.Names.top")
    hist_metrics["df.stats"]["Names"].pop("top")
    hist_metrics["df.stats"]["Married"].pop("top")

    assert hist_metrics == {
        "df.Births.type": "int64",
        "df.Births.25%": 155.0,
        "df.Births.50%": 578.0,
        "df.Births.75%": 968.0,
        "df.Births.count": 5.0,
        "df.Births.distinct": 5,
        "df.Births.std": df_births_std,
        "df.Births.max": 973.0,
        "df.Births.mean": 550.2,
        "df.Births.min": 77.0,
        "df.Births.non-null": 5,
        "df.Births.null-count": 0,
        "df.Married.count": 5,
        "df.Married.distinct": 2,
        "df.Married.freq": 3,
        "df.Married.non-null": 5,
        "df.Married.null-count": 0,
        "df.Married.type": "bool",
        "df.Married.unique": 2,
        "df.Names.count": 5,
        "df.Names.distinct": 5,
        "df.Names.freq": 1,
        "df.Names.non-null": 5,
        "df.Names.null-count": 0,
        "df.Names.type": "object",
        "df.Names.unique": 5,
        "df.histograms": {"Married": [[3, 2], [True, False]]},
        "df.preview": expected_preview,
        "df.schema": {
            "columns": ["Names", "Births", "Married"],
            "dtypes": {"Births": "int64", "Names": "object", "Married": "bool"},
            "shape": [5, 3],
            "size": 15,
            "type": "DataFrame",
        },
        "df.shape": [5, 3],
        "df.shape0": 5,
        "df.shape1": 3,
        "df.stats": {
            "Births": {
                "type": "int64",
                "25%": 155.0,
                "50%": 578.0,
                "75%": 968.0,
                "count": 5.0,
                "distinct": 5,
                "max": 973.0,
                "mean": 550.2,
                "min": 77.0,
                "non-null": 5,
                "null-count": 0,
                "std": df_births_std,
            },
            "Married": {
                "count": 5,
                "distinct": 2,
                "freq": 3,
                "non-null": 5,
                "null-count": 0,
                "type": "bool",
                "unique": 2,
            },
            "Names": {
                "count": 5,
                "distinct": 5,
                "freq": 1,
                "non-null": 5,
                "null-count": 0,
                "type": "object",
                "unique": 5,
            },
        },
    }
class TaskRunMetricsFileStoreReader(object):
    def __init__(self, attempt_folder, **kwargs):
        super(TaskRunMetricsFileStoreReader, self).__init__(**kwargs)
        self.meta = TaskRunMetaFiles(attempt_folder)

    def _get_all_metrics_names(self, source=None):
        metrics_root = self.meta.get_metric_folder(source=source)
        all_files = [os.path.basename(str(p)) for p in metrics_root.list_partitions()]
        # histogram metrics are stored as '<key>.json'; strip the extension
        return [re.sub(r"\.json\b", "", f) for f in all_files]

    def get_metric_history(self, key, source=None):
        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        rsl = []
        for pair in metric_data:
            ts, val = pair.strip().split(" ")
            rsl.append(Metric(key, float(val), datetime.fromtimestamp(int(ts))))
        return rsl

    def get_all_metrics_values(self, source=None):
        metrics = []
        for key in self._get_all_metrics_names(source=source):
            try:
                metrics.extend(self.get_metrics(key, source=source))
            except Exception as ex:
                raise DatabandError(
                    "Failed to read metrics for %s at %s" % (key, self.meta.root),
                    nested_exceptions=ex,
                )
        return {m.key: m.value for m in metrics}

    def get_run_info(self):
        with self.meta.get_meta_data_file().open("r") as yaml_file:
            return RunInfoSchema().load(**yaml.load(yaml_file))

    def get_metrics(self, key, source=None):
        # type: (str, MetricSource) -> Iterable[Metric]
        if source == MetricSource.histograms:
            return self.get_histogram_metrics(key)

        metric_target = self.meta.get_metric_target(key, source=source)
        if not metric_target.exists():
            raise DatabandRuntimeError("Metric '%s' not found" % key)
        metric_data = metric_target.readlines()
        if len(metric_data) == 0:
            raise DatabandRuntimeError("Metric '%s' is malformed. No data found." % key)
        first_line = metric_data[0]
        metric_parsed = _METRICS_RE.match(first_line)
        if not metric_parsed:
            raise DatabandRuntimeError(
                "Metric '%s' is malformed. Expected format: 'TS VALUE', got='%s'"
                % (key, first_line)
            )
        timestamp, val = metric_parsed.groups()
        metric = Metric(
            key=key,
            value=_parse_metric(val),
            timestamp=datetime.fromtimestamp(int(timestamp)),
        )
        return [metric]

    def get_histogram_metrics(self, key):
        # type: (str) -> Iterable[Metric]
        metric_target = self.meta.get_metric_target(
            "{}.json".format(key), source=MetricSource.histograms
        )
        hist_metrics = json.load(metric_target)
        timestamp = hist_metrics["timestamp"]
        metrics = hist_metrics["metrics"]
        for name, value in metrics.items():
            if not isinstance(value, (dict, list)):
                # scalar metric: report the value as-is
                yield Metric(
                    key="{}.{}".format(key, name),
                    value=value,
                    timestamp=datetime.fromtimestamp(timestamp),
                )
                continue

            yield Metric(
                key="{}.{}".format(key, name),
                value_json=value,
                timestamp=datetime.fromtimestamp(timestamp),
            )
            if name == "stats":
                # flatten per-column stats into '<key>.<column>.<stat>' metrics
                for column, stats in value.items():
                    for stat, val in stats.items():
                        yield Metric(
                            key="{}.{}.{}".format(key, column, stat),
                            value=val,
                            timestamp=datetime.fromtimestamp(timestamp),
                        )
            elif name == "shape":
                # flatten shape into '<key>.shape0' / '<key>.shape1' metrics
                for dim, val in enumerate(value):
                    yield Metric(
                        key="{}.shape{}".format(key, dim),
                        value=val,
                        timestamp=datetime.fromtimestamp(timestamp),
                    )

    def get_artifact(self, name):
        artifact_target = self.meta.get_artifact_target(name)
        if not artifact_target.exists():
            raise DatabandError("Artifact '%s' not found" % name)
        return Artifact(artifact_target.path)
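# Hypothetical usage sketch (not from the source): reading metrics back from an
# attempt folder with the reader above. `target` and MetricSource come from dbnd,
# as imported in the tests; the path is made up.
reader = TaskRunMetricsFileStoreReader(target("/tmp/dbnd/attempt_1_ab12cd"))
user_metrics = reader.get_all_metrics_values(MetricSource.user)
histogram_metrics = reader.get_all_metrics_values(MetricSource.histograms)
print(user_metrics.get("a"), histogram_metrics.get("df.shape"))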