def prepare_resume(self):
    """Tries to resume the experiment by using the defined resume path or PytorchExperiment."""
    checkpoint_file = ""
    base_dir = ""

    reset_epochs = self._resume_reset_epochs

    if self._resume_path is not None:
        if isinstance(self._resume_path, str):
            if self._resume_path.endswith(".pth.tar"):
                # A direct path to a checkpoint file was given
                checkpoint_file = self._resume_path
                base_dir = os.path.dirname(os.path.dirname(checkpoint_file))
            elif self._resume_path.endswith("checkpoint") or self._resume_path.endswith("checkpoint/"):
                # A checkpoint directory was given, take its latest checkpoint
                checkpoint_file = get_last_file(self._resume_path)
                base_dir = os.path.dirname(os.path.dirname(checkpoint_file))
            elif "checkpoint" in os.listdir(self._resume_path) and "config" in os.listdir(self._resume_path):
                # An experiment folder (with "checkpoint" and "config" subfolders) was given
                checkpoint_file = get_last_file(self._resume_path)
                base_dir = self._resume_path
            else:
                # No valid experiment folder: fall back to searching all subfolders
                warnings.warn("You have not selected a valid experiment folder, will search all sub folders",
                              UserWarning)
                if self.elog is not None:
                    self.elog.text_logger.log_to("You have not selected a valid experiment folder, will search all "
                                                 "sub folders", "warnings")
                checkpoint_file = get_last_file(self._resume_path)
                base_dir = os.path.dirname(os.path.dirname(checkpoint_file))

    if base_dir:
        if not self._ignore_resume_config:
            # Restore the config of the resumed experiment
            load_config = Config()
            load_config.load(os.path.join(base_dir, "config/config.json"))

            self._config_raw = load_config
            self.config = Config.init_objects(self._config_raw)
            self.print("Loaded existing config from:", base_dir)

        if self.n_epochs is None:
            self.n_epochs = self._config_raw.get("n_epochs")

    if checkpoint_file:
        # Load the checkpoint and copy it into the new experiment's checkpoint dir
        self.load_checkpoint(name="", path=checkpoint_file, save_types=self._resume_save_types)
        self._resume_path = checkpoint_file
        shutil.copyfile(checkpoint_file, os.path.join(self.elog.checkpoint_dir, "0_checkpoint.pth.tar"))
        self.print("Loaded existing checkpoint from:", checkpoint_file)

        self._resume_reset_epochs = reset_epochs
        if self._resume_reset_epochs:
            self._epoch_idx = 0
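# Usage sketch (hedged, not part of the class): prepare_resume() is driven by the
# `resume` argument of the constructor and the resulting `self._resume_path`.
# Assuming a hypothetical PytorchExperiment subclass `MyExperiment` and that the
# base Experiment class exposes run(), resuming could look like the snippet below.
# The resume target may be an experiment folder (containing "config" and
# "checkpoint"), a "checkpoint" directory, or a single "*.pth.tar" file, matching
# the branches above:
#
#     exp = MyExperiment(config={"n_epochs": 10},
#                        base_dir="./experiments",
#                        resume="./experiments/20190101-120000_my_exp")
#     exp.run()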
def __init__(self,
             config=None,
             name=None,
             n_epochs=None,
             seed=None,
             base_dir=None,
             globs=None,
             resume=None,
             ignore_resume_config=False,
             resume_save_types=("model", "optimizer", "simple", "th_vars", "results"),
             resume_reset_epochs=True,
             parse_sys_argv=False,
             parse_config_sys_argv=True,
             checkpoint_to_cpu=True,
             safe_checkpoint_every_epoch=1,
             use_visdomlogger=True,
             visdomlogger_kwargs=None,
             visdomlogger_c_freq=1,
             use_explogger=True,
             explogger_kwargs=None,
             explogger_c_freq=100,
             use_telegrammessagelogger=False,
             telegrammessagelogger_kwargs=None,
             telegrammessagelogger_c_freq=1000,
             append_rnd_to_name=False):

    # super(PytorchExperiment, self).__init__()
    Experiment.__init__(self)

    if parse_sys_argv:
        config_path, resume_path = get_vars_from_sys_argv()
        if config_path:
            config = config_path
        if resume_path:
            resume = resume_path

    self._config_raw = None
    if isinstance(config, str):
        self._config_raw = Config(file_=config, update_from_argv=parse_config_sys_argv)
    elif isinstance(config, Config):
        self._config_raw = Config(config=config, update_from_argv=parse_config_sys_argv)
    elif isinstance(config, dict):
        self._config_raw = Config(config=config, update_from_argv=parse_config_sys_argv)
    else:
        self._config_raw = Config(update_from_argv=parse_config_sys_argv)

    self.n_epochs = n_epochs
    if 'n_epochs' in self._config_raw:
        self.n_epochs = self._config_raw["n_epochs"]
    if self.n_epochs is None:
        self.n_epochs = 0

    self._seed = seed
    if 'seed' in self._config_raw:
        self._seed = self._config_raw.seed
    if self._seed is None:
        random_data = os.urandom(4)
        seed = int.from_bytes(random_data, byteorder="big")
        self._config_raw.seed = seed
        self._seed = seed

    self.exp_name = name
    if 'name' in self._config_raw:
        self.exp_name = self._config_raw["name"]

    if append_rnd_to_name:
        rnd_str = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(5))
        self.exp_name += "_" + rnd_str

    if 'base_dir' in self._config_raw:
        base_dir = self._config_raw["base_dir"]

    self._checkpoint_to_cpu = checkpoint_to_cpu
    self._safe_checkpoint_every_epoch = safe_checkpoint_every_epoch

    self.results = dict()

    # Init loggers
    logger_list = []

    self.vlog = None
    if use_visdomlogger:
        if visdomlogger_kwargs is None:
            visdomlogger_kwargs = {}
        self.vlog = PytorchVisdomLogger(name=self.exp_name, **visdomlogger_kwargs)
        if visdomlogger_c_freq is not None and visdomlogger_c_freq > 0:
            logger_list.append((self.vlog, visdomlogger_c_freq))

    self.elog = None
    if use_explogger:
        if explogger_kwargs is None:
            explogger_kwargs = {}
        self.elog = PytorchExperimentLogger(base_dir=base_dir,
                                            experiment_name=self.exp_name,
                                            **explogger_kwargs)
        if explogger_c_freq is not None and explogger_c_freq > 0:
            logger_list.append((self.elog, explogger_c_freq))
        # Set results log dict to the right path
        self.results = ResultLogDict("results-log.json", base_dir=self.elog.result_dir)

    self.tlog = None
    if use_telegrammessagelogger:
        if telegrammessagelogger_kwargs is None:
            telegrammessagelogger_kwargs = {}
        self.tlog = TelegramMessageLogger(**telegrammessagelogger_kwargs, exp_name=self.exp_name)
        if telegrammessagelogger_c_freq is not None and telegrammessagelogger_c_freq > 0:
            logger_list.append((self.tlog, telegrammessagelogger_c_freq))

    self.clog = CombinedLogger(*logger_list)

    set_seed(self._seed)

    # Do the resume stuff
    self._resume_path = None
    self._resume_save_types = resume_save_types
    self._ignore_resume_config = ignore_resume_config
    self._resume_reset_epochs = resume_reset_epochs

    if resume is not None:
        if isinstance(resume, str):
            if resume == "last":
                self._resume_path = os.path.join(base_dir, sorted(os.listdir(base_dir))[-1])
            else:
                self._resume_path = resume
        elif isinstance(resume, PytorchExperiment):
            self._resume_path = resume.elog.base_dir

    if self._resume_path is not None and not self._ignore_resume_config:
        self._config_raw.update(Config(file_=os.path.join(self._resume_path, "config", "config.json")),
                                ignore=list(map(lambda x: re.sub("^-+", "", x), sys.argv)))

    # self.elog.save_config(self.config, "config_pre")
    if globs is not None:
        zip_name = os.path.join(self.elog.save_dir, "sources.zip")
        SourcePacker.zip_sources(globs, zip_name)

    # Init objects in config
    self.config = Config.init_objects(self._config_raw)

    atexit.register(self.at_exit_func)
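# Usage sketch (hedged, not part of the class): with this constructor a config
# dict supplies the experiment settings and the individual use_* flags toggle
# the loggers. `MyExperiment` is a hypothetical PytorchExperiment subclass and
# run() is assumed to come from the base Experiment class:
#
#     exp = MyExperiment(config={"n_epochs": 5, "batch_size": 32},
#                        name="baseline",
#                        base_dir="./experiments",
#                        use_visdomlogger=False,          # skip the Visdom logger
#                        use_telegrammessagelogger=False)
#     exp.run()
#
# Note that values present in `config` (e.g. "n_epochs", "seed", "name",
# "base_dir") override the corresponding keyword arguments, as the code above shows.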
def __init__(self,
             config=None,
             name=None,
             n_epochs=None,
             seed=None,
             base_dir=None,
             globs=None,
             resume=None,
             ignore_resume_config=False,
             resume_save_types=("model", "optimizer", "simple", "th_vars", "results"),
             resume_reset_epochs=True,
             parse_sys_argv=False,
             checkpoint_to_cpu=True,
             save_checkpoint_every_epoch=1,
             explogger_kwargs=None,
             explogger_freq=1,
             loggers=None,
             append_rnd_to_name=False,
             default_save_types=("model", "optimizer", "simple", "th_vars", "results")):

    # super(PytorchExperiment, self).__init__()
    Experiment.__init__(self)

    # Check for command line inputs for config_path and resume_path,
    # which are prioritized over config and resume!
    config_path_from_argv = None
    if parse_sys_argv:
        config_path_from_argv, resume_path_from_argv = get_vars_from_sys_argv()
        if resume_path_from_argv:
            resume = resume_path_from_argv

    # Construct _config_raw
    if config_path_from_argv is None:
        self._config_raw = self._config_raw_from_input(config, name, n_epochs, seed, append_rnd_to_name)
    else:
        self._config_raw = Config(file_=config_path_from_argv)
    update_from_sys_argv(self._config_raw)

    # Set a few experiment attributes
    self.n_epochs = self._config_raw["n_epochs"]
    self._seed = self._config_raw["seed"]
    set_seed(self._seed)
    self.exp_name = self._config_raw["name"]
    self._checkpoint_to_cpu = checkpoint_to_cpu
    self._save_checkpoint_every_epoch = save_checkpoint_every_epoch
    self._default_save_types = default_save_types
    self.results = dict()

    # Get base_dir from _config_raw or store it there
    if base_dir is not None:
        self._config_raw["base_dir"] = base_dir
    base_dir = self._config_raw.get("base_dir")

    # Construct experiment logger (automatically activated if base_dir is given)
    self.loggers = {}
    logger_list = []

    if base_dir is not None:
        if explogger_kwargs is None:
            explogger_kwargs = {}
        self.elog = PytorchExperimentLogger(base_dir=base_dir,
                                            exp_name=self.exp_name,
                                            **explogger_kwargs)
        if explogger_freq is not None and explogger_freq > 0:
            logger_list.append((self.elog, explogger_freq))
        # Set results log dict to the right path
        self.results = ResultLogDict("results-log.json", base_dir=self.elog.result_dir)
    else:
        self.elog = None

    # Construct other loggers
    if loggers is not None:
        for logger_name, logger_cfg in loggers.items():
            _logger, log_freq = self._make_logger(logger_name, logger_cfg)
            self.loggers[logger_name] = _logger
            if log_freq is not None and log_freq > 0:
                logger_list.append((_logger, log_freq))

    self.clog = CombinedLogger(*logger_list)

    # Set resume attributes and update _config_raw;
    # actual resuming is done automatically after setup in _setup_internal
    self._resume_path = None
    self._resume_save_types = resume_save_types
    self._ignore_resume_config = ignore_resume_config
    self._resume_reset_epochs = resume_reset_epochs

    if resume is not None:
        if isinstance(resume, str):
            if resume == "last":
                if base_dir is None:
                    raise ValueError("resume='last' requires base_dir.")
                self._resume_path = os.path.join(base_dir, sorted(os.listdir(base_dir))[-1])
            else:
                self._resume_path = resume
        elif isinstance(resume, PytorchExperiment):
            self._resume_path = resume.elog.base_dir

    if self._resume_path is not None and not self._ignore_resume_config:
        self._config_raw.update(Config(file_=os.path.join(self._resume_path, "config", "config.json")),
                                ignore=list(map(lambda x: re.sub("^-+", "", x), sys.argv)))

    # Save everything we need to reproduce the experiment
    if globs is not None and self.elog is not None:
        zip_name = os.path.join(self.elog.save_dir, "sources.zip")
        SourcePacker.zip_sources(globs, zip_name)

    # Init objects in config
    self.config = Config.init_objects(self._config_raw)

    atexit.register(self.at_exit_func)
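# Usage sketch (hedged, not part of the class): this newer constructor activates
# the experiment logger whenever `base_dir` is set and supports resume="last" to
# pick up the most recent run in that directory. Additional loggers go into the
# `loggers` dict, whose value format is defined by _make_logger() (not shown
# here). `MyExperiment` is a hypothetical PytorchExperiment subclass:
#
#     exp = MyExperiment(config={"n_epochs": 5, "name": "baseline"},
#                        base_dir="./experiments",
#                        resume="last")
#     exp.run()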