def _get_artifact_run_info_map(store: metadata_store.MetadataStore,
                               artifact_ids: List[int]) -> Dict[int, _RunInfo]:
  """Returns a dictionary mapping artifact_id to its _RunInfo.

  Args:
    store: MetaDataStore object to connect to MLMD instance.
    artifact_ids: A list of artifact ids to load.

  Returns:
    A dictionary with artifact_id as the key and the artifact's _RunInfo
    (MyOrchestrator run_id, component name and start time) as the value.
  """
  # Get events of the artifacts.
  events = store.get_events_by_artifact_ids(artifact_ids)
  exec_to_artifact = {}
  for event in events:
    exec_to_artifact[event.execution_id] = event.artifact_id

  # Get executions of the artifacts.
  executions = store.get_executions_by_id(list(exec_to_artifact.keys()))
  artifact_to_run_info = {}
  for execution in executions:
    run_id = execution.properties[RUN_ID_KEY].string_value
    component = execution.properties[_COMPONENT_ID].string_value
    artifact_id = exec_to_artifact[execution.id]
    artifact_to_run_info[artifact_id] = _RunInfo(
        run_id=run_id,
        component_name=component,
        started_at=execution.create_time_since_epoch)

  return artifact_to_run_info


def get_statisticsgen_dir_list(
    store: metadata_store.MetadataStore) -> List[str]:
  """Obtains a list of statisticsgen_dir from the store."""
  stats_artifacts = store.get_artifacts_by_type(_STATS)
  stat_dirs_list = [artifact.uri for artifact in stats_artifacts]
  return stat_dirs_list


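# --- Illustrative usage sketch (not part of the original module) ---
# The helpers in this module assume an already-constructed MLMD
# `MetadataStore`. A minimal way to build one against a local SQLite-backed
# MLMD database and list the StatisticsGen output directories could look
# like the function below; the database path is a placeholder, and the local
# import keeps the sketch self-contained.
def _example_list_statisticsgen_dirs():
  from ml_metadata.proto import metadata_store_pb2

  connection_config = metadata_store_pb2.ConnectionConfig()
  # Placeholder path to an existing MLMD SQLite database.
  connection_config.sqlite.filename_uri = '/tmp/mlmd/metadata.sqlite'
  connection_config.sqlite.connection_mode = (
      metadata_store_pb2.SqliteMetadataSourceConfig.READWRITE_OPENCREATE)
  store = metadata_store.MetadataStore(connection_config)
  return get_statisticsgen_dir_list(store)

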
def get_model_dir_map(
    store: metadata_store.MetadataStore) -> Dict[str, List[str]]:
  """Obtains a map of run_id to a list of model_dirs from the store."""
  evaluator_execs = store.get_executions_by_type(_EVALUATOR)

  def _go_up_2_levels(eval_model_dirs):
    model_dir_set = set()
    for eval_model_dir in eval_model_dirs:
      model_dir_set.add(os.sep.join(eval_model_dir.split(os.sep)[:-2]))
    return list(model_dir_set)

  def _eval_execs_to_model_dir_map(eval_execs):
    model_dir_map = {}
    for eval_exec in eval_execs:
      run_id = eval_exec.properties[_RUN_ID].string_value
      pipeline_root = eval_exec.properties[_PIPELINE_ROOT].string_value
      eval_component_id = eval_exec.properties[_COMPONENT_ID].string_value
      eval_config_path = os.path.join(pipeline_root, eval_component_id,
                                      'evaluation', str(eval_exec.id),
                                      'eval_config.json')
      with tf.io.gfile.GFile(eval_config_path, 'r') as f:
        eval_config = json.load(f)
      model_dir_map[run_id] = _go_up_2_levels(
          eval_config['modelLocations'].values())
    return model_dir_map

  return _eval_execs_to_model_dir_map(evaluator_execs)


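# --- Illustrative sketch (not part of the original module) ---
# Demonstrates the path manipulation performed by `_go_up_2_levels` inside
# `get_model_dir_map`: stripping the last two path components of an
# Evaluator model location recovers the model directory. The concrete
# POSIX-style path layout below is assumed for illustration only.
def _example_go_up_2_levels():
  eval_model_dir = '/pipeline_root/Trainer/model/42/serving_model_dir/export'
  model_dir = os.sep.join(eval_model_dir.split(os.sep)[:-2])
  return model_dir  # '/pipeline_root/Trainer/model/42'

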
def _get_kaggle_results(store: metadata_store.MetadataStore) -> _Result:
  """Returns the kaggle score details from the KagglePublisher component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result object with properties containing kaggle results.
  """
  results = {}
  property_names = set()
  kaggle_artifacts = store.get_artifacts_by_type(_KAGGLE_RESULT)
  for artifact in kaggle_artifacts:
    submit_info = {}
    for key, val in artifact.custom_properties.items():
      if key not in _DEFAULT_CUSTOM_PROPERTIES:
        name = _KAGGLE + '_' + key
        submit_info[name] = _parse_value(val)
    property_names = property_names.union(submit_info.keys())
    results[artifact.id] = submit_info

  artifact_to_run_info = _get_artifact_run_info_map(store, list(results.keys()))

  properties = {}
  for artifact_id, submit_info in results.items():
    run_info = artifact_to_run_info[artifact_id]
    result_key = run_info.run_id + run_info.component_name.replace(
        _KAGGLE_PUBLISHER_PREFIX, '')
    properties[result_key] = submit_info

  property_names = property_names.difference(
      {_NAME, _PRODUCER_COMPONENT, _STATE, *_DEFAULT_COLUMNS})
  return _Result(properties=properties, property_names=sorted(property_names))


def _get_hparams(store: metadata_store.MetadataStore) -> _Result:
  """Returns the hparams of the EstimatorTrainer component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result object with properties containing hparams.
  """
  results = {}
  hparam_names = set()
  trainer_execs = store.get_executions_by_type(_TRAINER)
  for ex in trainer_execs:
    run_id = ex.properties[RUN_ID_KEY].string_value
    hparams = _parse_hparams(ex.properties[_HPARAMS].string_value)
    hparam_names.update(hparams.keys())
    hparams[RUN_ID_KEY] = run_id
    trainer_id = ex.properties[_COMPONENT_ID].string_value.replace(
        _TRAINER_PREFIX, '')
    result_key = run_id + trainer_id
    hparams[BENCHMARK_KEY] = trainer_id[1:]  # Remove the '.' prefix.
    # BeamDagRunner uses an ISO-format timestamp as the run_id; see the
    # following for details:
    # http://google3/third_party/py/tfx/orchestration/beam/beam_dag_runner.py
    try:
      hparams[STARTED_AT] = datetime.datetime.fromtimestamp(int(run_id))
    except ValueError:
      hparams[STARTED_AT] = run_id
    results[result_key] = hparams

  return _Result(properties=results, property_names=sorted(hparam_names))


def _get_benchmark_results(store: metadata_store.MetadataStore) -> _Result:
  """Returns the benchmark results of the BenchmarkResultPublisher component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result object with properties containing benchmark results.
  """
  metrics = {}
  property_names = set()
  publisher_artifacts = store.get_artifacts_by_type(
      br.BenchmarkResult.TYPE_NAME)
  for artifact in publisher_artifacts:
    evals = {}
    for key, val in artifact.custom_properties.items():
      evals[key] = _parse_value(val)
      # Change for the IR world.
      if key == 'name':
        new_id = _parse_value(val).split(':')
        if len(new_id) > 2:
          evals[RUN_ID_KEY] = new_id[1]
    property_names = property_names.union(evals.keys())
    metrics[artifact.id] = evals

  artifact_to_run_info = _get_artifact_run_info_map(store, list(metrics.keys()))

  properties = {}
  for artifact_id, evals in metrics.items():
    run_info = artifact_to_run_info[artifact_id]
    started_at = run_info.started_at // 1000
    evals[STARTED_AT] = datetime.datetime.fromtimestamp(started_at)
    if RUN_ID_KEY not in metrics[artifact_id]:
      # Non-IR based runner.
      continue
    run_id = metrics[artifact_id][RUN_ID_KEY]
    result_key = run_id + '.' + evals[br.BenchmarkResult.BENCHMARK_NAME_KEY]
    if result_key in properties:
      properties[result_key].update(evals)
    else:
      properties[result_key] = {**evals}

  property_names = property_names.difference(
      {_NAME, _PRODUCER_COMPONENT, _STATE, *_DEFAULT_COLUMNS, _IS_IR_KEY})
  return _Result(properties=properties, property_names=sorted(property_names))


def _get_benchmark_results(store: metadata_store.MetadataStore) -> _Result:
  """Returns the benchmark results of the BenchmarkResultPublisher component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result object with properties containing benchmark results.
  """
  metrics = {}
  property_names = set()
  publisher_artifacts = store.get_artifacts_by_type(_BENCHMARK_RESULT)
  for artifact in publisher_artifacts:
    evals = {}
    for key, val in artifact.custom_properties.items():
      evals[key] = _parse_value(val)
    property_names = property_names.union(evals.keys())
    metrics[artifact.id] = evals

  artifact_to_run_info = _get_artifact_run_info_map(store, list(metrics.keys()))

  properties = {}
  for artifact_id, evals in metrics.items():
    run_info = artifact_to_run_info[artifact_id]
    evals[RUN_ID_KEY] = run_info.run_id
    # BeamDagRunner uses an ISO-format timestamp as the run_id; see the
    # following for details:
    # http://google3/third_party/py/tfx/orchestration/beam/beam_dag_runner.py
    try:
      evals[STARTED_AT] = datetime.datetime.fromtimestamp(
          int(run_info.run_id))
    except ValueError:
      evals[STARTED_AT] = run_info.run_id
    result_key = run_info.run_id + '.' + evals[BENCHMARK_KEY]
    properties[result_key] = evals

  property_names = property_names.difference(
      {_NAME, _PRODUCER_COMPONENT, _STATE, *_DEFAULT_COLUMNS})
  return _Result(properties=properties, property_names=sorted(property_names))
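

# --- Illustrative sketch (not part of the original module) ---
# A `_Result`, as constructed above, carries a `properties` dict keyed by
# '<run_id>.<benchmark_name>' and a sorted `property_names` list. One
# possible way to inspect it is to load the properties into a pandas
# DataFrame; pandas is an assumption here, not a dependency this module
# declares.
def _example_benchmark_results_to_dataframe(
    store: metadata_store.MetadataStore):
  import pandas as pd

  result = _get_benchmark_results(store)
  # Rows are result keys, columns are the collected metric/property names.
  return pd.DataFrame.from_dict(result.properties, orient='index')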