def test_create_and_validate_inference_load_config_from_saved_models(
        self, use_recent):
    project_dir = self.get_temp_dir()
    saved_models_dir = os.path.join(project_dir,
                                    ProjectDirs.TRAINER.saved_models)
    maybe_mkdir(saved_models_dir)
    saved_models_tag_dirs = [
        os.path.join(saved_models_dir, sd)
        for sd in ['first', 'second', 'third']
    ]
    for d in saved_models_tag_dirs:
        maybe_mkdir(d)

    if use_recent:
        saved_model_path = None
        saved_model_path_must = saved_models_tag_dirs[-1]
    else:
        saved_model_path = "first"
        saved_model_path_must = saved_models_tag_dirs[0]

    config = cconfigs.create_and_validate_inference_load_config(
        project_dir=project_dir, saved_model_path=saved_model_path)
    self.assertIsInstance(config, cconfigs.InferenceLoadConfig)
    self.assertEqual(config.saved_model, saved_model_path_must)
    self.assertIsNone(config.checkpoint)
    self.assertIsNone(config.meta_graph)

    with self.assertRaises(ValueError):
        _ = cconfigs.create_and_validate_inference_load_config(
            project_dir=project_dir,
            saved_model_path="path_without_file",
            checkpoint_path="path_without_file")

    with self.assertRaises(FileNotFoundError):
        _ = cconfigs.create_and_validate_inference_load_config(
            project_dir=project_dir,
            saved_model_path="path_without_file")

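# A minimal, hedged sketch of the resolution logic the test above exercises.
# It is reconstructed from the assertions only; the real cconfigs
# implementation may differ, and the "training/saved_models" subpath merely
# stands in for ProjectDirs.TRAINER.saved_models. The underscore-prefixed
# names below are hypothetical, not part of the library.
import os
from collections import namedtuple

_SketchInferenceLoadConfig = namedtuple(
    "_SketchInferenceLoadConfig", ["saved_model", "checkpoint", "meta_graph"])


def _sketch_create_inference_load_config(project_dir, saved_model_path=None,
                                         checkpoint_path=None):
    if saved_model_path and checkpoint_path:
        raise ValueError(
            "provide saved_model_path or checkpoint_path, not both")
    saved_models_dir = os.path.join(project_dir, "training", "saved_models")
    if saved_model_path is None:
        # fall back to the most recent tag dir; timestamp-like names
        # sort lexicographically
        saved_model_path = sorted(os.listdir(saved_models_dir))[-1]
    saved_model = os.path.join(saved_models_dir, saved_model_path)
    if not os.path.isdir(saved_model):
        raise FileNotFoundError(saved_model)
    return _SketchInferenceLoadConfig(
        saved_model=saved_model, checkpoint=None, meta_graph=None)
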
def _create_project_dirs_with_run_subfolders(project_dir,
                                             subdir,
                                             project_dirs_structure,
                                             run_name: Optional[str] = None,
                                             continue_last=False):
    logger = logging.getLogger(__name__)
    io_utils.maybe_mkdir(project_dir)
    lock = project_utils.ProjectLock(project_dir)
    lock.lock_or_wait()
    if continue_last:
        allowed_content_for_run = _get_allowed_content_for_run_to_continue(
            project_dirs_structure)
    else:
        allowed_content_for_run = ["configs"]
    run_subfolder = _select_run_subfolder(
        project_dir, subdir, run_name=run_name,
        allowed_content_for_run=allowed_content_for_run)
    logger.info("Use %s run", run_subfolder)
    project_structure = _update_project_structure_with_run_dir(
        run_subfolder, project_dirs_structure)
    kpi_dirs = _create_project_directories(project_dir, project_structure)
    dir_with_runs = os.path.join(project_dir, subdir)
    _add_symlink_for_last_run(os.path.join(dir_with_runs, run_subfolder))
    lock.release()
    return kpi_dirs

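# _add_symlink_for_last_run is not shown here; a plausible minimal sketch of
# the "point a stable link at the newest run" pattern it presumably
# implements. The behaviour and the "last_run" link name are assumptions.
import os


def _sketch_add_symlink_for_last_run(run_dir: str,
                                     link_name: str = "last_run"):
    link_path = os.path.join(os.path.dirname(run_dir), link_name)
    if os.path.islink(link_path):
        # repoint the link at the newest run
        os.remove(link_path)
    os.symlink(os.path.basename(run_dir), link_path)
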
def save(self, name: str, values):
    io_utils.maybe_mkdir(self.save_target)
    save_fname = os.path.join(self.save_target, name + ".json")
    values_filtered, values_filtered_out = kpi_utils.filter_kpi_values(
        values)
    if values_filtered_out:
        logging.info("Following KPI keys will not be stored to json: %s",
                     list(values_filtered_out.keys()))
    with open(save_fname, 'w') as file:
        json.dump(values_filtered, file, indent=2, sort_keys=True)

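# kpi_utils.filter_kpi_values is not shown here; judging from the log message
# above, it splits json-serializable values from the rest. A hedged sketch of
# that assumed behaviour (the underscore-prefixed name is hypothetical):
import json


def _sketch_filter_kpi_values(values: dict):
    filtered, filtered_out = {}, {}
    for each_key, each_value in values.items():
        try:
            json.dumps(each_value)
            filtered[each_key] = each_value
        except TypeError:
            filtered_out[each_key] = each_value
    return filtered, filtered_out
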
def cache(self, values):
    io_utils.maybe_mkdir(self.cache_target)
    cache_fname = self._get_cache_fname()
    inputs_flatten = nest_utils.flatten_nested_struct(values)
    if os.path.exists(cache_fname):
        logger = logging.getLogger(__name__)
        logger.warning("Cache with name %s already exists!", cache_fname)
        return
    with open(cache_fname, "w") as file:
        json.dump(inputs_flatten, file, default=lambda x: x.tolist())

def begin(self):
    """
    Add graph to summary writer and create the directory for summaries
    if it does not exist

    Overridden from :obj:`tf.train.SummarySaverHook`. See its documentation
    for more information
    """
    self._summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES)
    super().begin()
    io_utils.maybe_mkdir(self._output_dir)
    graph = self._get_graph_fn()
    self._summary_writer.add_graph(graph)

def restore(self):
    io_utils.maybe_mkdir(self.cache_target)
    cache_fname = self._get_cache_fname()
    if not os.path.exists(cache_fname):
        return None
    with open(cache_fname, "r") as file:
        restored = json.load(file)
    for each_key in restored:
        if isinstance(restored[each_key], list):
            restored[each_key] = np.array(restored[each_key])
    restored_unflatten = nest_utils.unflatten_dict_to_nested(restored)
    logger = logging.getLogger(__name__)
    logger.debug("restoring KPI values from %s", cache_fname)
    return restored_unflatten

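# Self-contained illustration of the json round trip used by cache() and
# restore(): numpy arrays are dumped via .tolist() and recovered as
# np.array. The "//" key separator mimics the flatten/unflatten helpers
# from nest_utils and is an assumption; the real separator may differ.
import json

import numpy as np

_values_flat = {"kpi//accuracy": np.array([0.9, 0.8]), "kpi//step": 100}
_encoded = json.dumps(_values_flat, default=lambda x: x.tolist())
_decoded = {k: np.array(v) if isinstance(v, list) else v
            for k, v in json.loads(_encoded).items()}
assert np.array_equal(_decoded["kpi//accuracy"], _values_flat["kpi//accuracy"])
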
def set_save_name(self, save_name: Optional[str] = None):
    """
    Set the full save target by joining prefix, save name and suffix
    together with save_target

    Parameters
    ----------
    save_name
        save name
    """
    logger = logging.getLogger(__name__)
    if isinstance(save_name, bytes):
        save_name = save_name.decode()
    if self.remove_save_ext and save_name:
        save_name = os.path.splitext(save_name)[0]
    if save_name:
        save_name = os.path.join(*file_utils.get_basename_with_depth(
            save_name, self.save_name_depth))
    if not save_name:
        save_name = self.get_save_name_from_iteration_info()
    additional_names = [self.save_prefix, save_name, self.save_suffix]
    additional_names_concat = "-".join(
        [each_name for each_name in additional_names
         if each_name is not None])
    self._save_name = self.target_separator.join(
        [self.log_dir, additional_names_concat])
    if save_name and os.path.sep in save_name:
        root_directory = os.path.split(self._save_name)[0]
        try:
            os.makedirs(root_directory)
            logger.info("Directory %s was created by %s",
                        root_directory, self.name)
        except FileExistsError:
            logger.debug("Directory %s needed by %s already exists",
                         root_directory, self.name)

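# Worked example of the name assembly above with hypothetical values,
# assuming target_separator="/" and remove_save_ext=True, and assuming
# get_basename_with_depth keeps the last save_name_depth path components:
#   log_dir="/log", save_prefix="epoch3",
#   save_name=b"imgs/img_001.png", save_suffix=None
# -> decoded from bytes, stripped of ".png", truncated to "imgs/img_001";
#    None entries are skipped by the "-".join:
#   _save_name == "/log/epoch3-imgs/img_001"
# and, since save_name still contains os.path.sep, the directory
# "/log/epoch3-imgs" is created for it.
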
def _get_cache_fname(cache_dir: str, file_name: str):
    """
    Create a name for the cache file with a unique temp suffix

    Parameters
    ----------
    cache_dir
        directory for cache
    file_name
        file name; a unique temp suffix will be added to it

    Returns
    -------
    cache_fname
        file name for cache
    """
    if cache_dir is None:
        return None
    io_utils.maybe_mkdir(cache_dir)
    # mkstemp creates the file to guarantee a unique name; close the
    # descriptor and remove the file so that only the name is kept
    file_descriptor, cache_fname = tempfile.mkstemp(
        prefix=file_name + "-", dir=cache_dir)
    os.close(file_descriptor)
    os.remove(cache_fname)
    logger = logging.getLogger(__name__)
    logger.info("Use cache file %s", cache_fname)
    return cache_fname

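# Hypothetical usage: every call yields a fresh, collision-free path inside
# cache_dir (the random suffix is chosen by mkstemp), e.g.:
#   fname_a = _get_cache_fname("/tmp/cache", "kpi")  # .../kpi-a1b2c3
#   fname_b = _get_cache_fname("/tmp/cache", "kpi")  # .../kpi-x7y8z9
#   assert fname_a != fname_b
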
def _get_and_maybe_create_summary_dir_for_mode(summary_dir: str,
                                               mode: str) -> str:
    summary_dir = os.path.join(summary_dir, mode)
    io_utils.maybe_mkdir(summary_dir)
    return summary_dir

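# io_utils.maybe_mkdir is used throughout this module but not shown; the
# assumed semantics are "create the directory, parents included, unless it
# already exists". A minimal sketch under that assumption:
import os


def _sketch_maybe_mkdir(directory: str):
    os.makedirs(directory, exist_ok=True)
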
def create_trainer_project_dirs(project_dir: str,
                                continue_training: bool = False
                                ) -> _TrainerDirs:
    """
    Create project directories for training if needed and check if a
    training project already exists

    Creates the following directories under project_dir:

    - training:
        - artifacts - artifacts of the training, e.g. dna, file_lists etc.
        - callbacks/{train/eval} - callbacks can write here
        - summaries/{train/eval} - tensorflow summaries
        - saved_models - saved models in timestamp subfolders and
          inference_graph.meta together with input_output_names.json
        - checkpoints - checkpoints like meta graph and the weights
    - (optional, if no mlflow URI was specified) mlruns - mlflow
      tracking uri

    Parameters
    ----------
    project_dir
        path to project directory
    continue_training
        if the training should be continued in the same project folder;
        has no effect if no project directories exist; otherwise, if set
        to False, no training will be started

    Returns
    -------
    training_dirs
        trainer directories

    Raises
    ------
    FileExistsError
        if this run was already used and exists inside the
        nucleus7_project.json file under the runs key
    FileExistsError
        if project_dir has content other than the expected training
        project structure
    """
    lock = project_utils.ProjectLock(project_dir)
    lock.lock_or_wait()
    io_utils.maybe_mkdir(os.path.join(project_dir, _TRAINING_DIR))
    try:
        _validate_training_project(project_dir, continue_training)
    # pylint: disable=invalid-name
    # it is common practice to name exceptions e
    except Exception as e:
        lock.release()
        raise e
    training_dirs = _create_project_directories(project_dir,
                                                ProjectDirs.TRAINER)
    for each_mode in ["train", "eval"]:
        io_utils.maybe_mkdir(os.path.join(training_dirs.summaries,
                                          each_mode))
        io_utils.maybe_mkdir(os.path.join(training_dirs.callbacks,
                                          each_mode))
    project_utils.collect_and_add_project_meta_info(project_dir)
    project_utils.add_runtype_to_project_meta_info(project_dir)
    lock.release()
    return training_dirs

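# Hedged usage sketch: for a fresh project, the call below would produce the
# layout described in the docstring (attribute names on the returned
# _TrainerDirs beyond summaries/callbacks are assumptions):
#   dirs = create_trainer_project_dirs("/tmp/project")
#   /tmp/project/training/artifacts
#   /tmp/project/training/callbacks/{train,eval}
#   /tmp/project/training/summaries/{train,eval}
#   /tmp/project/training/saved_models
#   /tmp/project/training/checkpoints
#   /tmp/project/mlruns          (only if no mlflow URI was specified)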