def start(self, *,
          run_uuid: Optional[str] = None,
          checkpoint: Optional[int] = None):
    """Start the experiment run and return a watcher for it.

    Args:
        run_uuid: UUID of a previous run to resume from. When ``None`` the
            run starts at global step ``0``.
        checkpoint: Checkpoint to load when ``run_uuid`` is given. Falls back
            to ``-1`` when omitted (presumably "latest" -- confirm against
            ``__start_from_checkpoint``).

    Returns:
        An ``ExperimentWatcher`` wrapping this experiment.

    Raises:
        SystemExit: With exit code 1 on rank 0 when ``check_repo_dirty`` is
            set and the run reports uncommitted changes.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm against the original layout.
    if run_uuid is None:
        global_step = 0
    else:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)

    self.run.start_step = global_step

    self._start_tracker()
    tracker().set_start_global_step(global_step)

    # Only rank 0 prints the run info and enforces the clean-repo check.
    if self.distributed_rank == 0:
        self.__print_info()
        if self.check_repo_dirty and self.run.is_dirty:
            logger.log([
                ("[FAIL]", Text.danger),
                " Cannot trial an experiment with uncommitted changes."
            ])
            # Raise SystemExit directly: the ``exit`` builtin is injected by
            # the ``site`` module for interactive use and may be missing
            # (e.g. ``python -S`` or frozen interpreters).
            raise SystemExit(1)

    if not self.is_evaluate:
        if self.distributed_rank == 0:
            # Imported at call time, as in the original code.
            from labml.internal.computer.configs import computer_singleton

            computer_singleton().add_project(lab_singleton().path)
            self.run.save_info()

        # Called for every rank, between the two rank-0-only sections.
        self._save_pid()

        if self.distributed_rank == 0:
            if self.configs_processor is not None:
                self.configs_processor.add_saver(
                    FileConfigsSaver(self.run.configs_path))

            if self.web_api is not None:
                self.web_api.start(self.run)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.web_api.get_configs_saver())
                    self.web_api.set_dynamic_handler(
                        ExperimentDynamicUpdateHandler(
                            self.configs_processor))

            if self.wandb is not None:
                self.wandb.init(self.run.name, self.run.run_path)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.wandb.get_configs_saver())

            tracker().save_indicators(self.run.indicators_path)

    self.is_started = True
    return ExperimentWatcher(self)
def start(self, *,
          run_uuid: Optional[str] = None,
          checkpoint: Optional[int] = None):
    """Start the experiment run and return a watcher for it.

    Args:
        run_uuid: UUID of a previous run to resume from. When ``None`` the
            run starts at global step ``0``.
        checkpoint: Checkpoint to load when ``run_uuid`` is given. Falls back
            to ``-1`` when omitted (presumably "latest" -- confirm against
            ``__start_from_checkpoint``).

    Returns:
        An ``ExperimentWatcher`` wrapping this experiment.

    Raises:
        SystemExit: With exit code 1 on rank 0 when ``check_repo_dirty`` is
            set and the run reports uncommitted changes.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm against the original layout.
    if run_uuid is None:
        global_step = 0
    else:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)

    self.run.start_step = global_step

    self._start_tracker()
    tracker().set_start_global_step(global_step)

    # Only rank 0 prints the run info and enforces the clean-repo check.
    if self.distributed_rank == 0:
        self.__print_info()
        if self.check_repo_dirty and self.run.is_dirty:
            logger.log([
                ("[FAIL]", Text.danger),
                " Cannot trial an experiment with uncommitted changes."
            ])
            # Raise SystemExit directly: the ``exit`` builtin is injected by
            # the ``site`` module for interactive use and may be missing
            # (e.g. ``python -S`` or frozen interpreters).
            raise SystemExit(1)

    if not self.is_evaluate:
        if self.distributed_rank == 0:
            self.run.save_info()

        # Called for every rank, between the two rank-0-only sections.
        self._save_pid()

        if self.distributed_rank == 0:
            if self.configs_processor is not None:
                self.configs_processor.add_saver(
                    FileConfigsSaver(self.run.configs_path))

            if self.web_api is not None:
                self.web_api.set_info(run_uuid=self.run.uuid,
                                      name=self.run.name,
                                      comment=self.run.comment)
                self.web_api.start()
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.web_api.get_configs_saver())

            tracker().save_indicators(self.run.indicators_path)

            # PERF: hyper-parameters are deliberately not written through
            # ``tracker().write_h_parameters(...)`` here. Writing to
            # TensorBoard takes about 4 seconds, and it won't work when
            # configs are updated live.

    self.is_started = True
    return ExperimentWatcher(self)
def start(self, *,
          run_uuid: Optional[str] = None,
          checkpoint: Optional[int] = None):
    """Start the experiment run and return a watcher for it.

    Args:
        run_uuid: UUID of a previous run to resume from. When ``None`` the
            run starts at global step ``0``.
        checkpoint: Checkpoint to load when ``run_uuid`` is given. Falls back
            to ``-1`` when omitted (presumably "latest" -- confirm against
            ``__start_from_checkpoint``).

    Returns:
        An ``ExperimentWatcher`` wrapping this experiment.

    Raises:
        SystemExit: With exit code 1 when ``check_repo_dirty`` is set and
            the run reports uncommitted changes.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm against the original layout.
    if run_uuid is None:
        global_step = 0
    else:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)

    self.run.start_step = global_step
    logger_internal().set_start_global_step(global_step)

    self.__print_info()
    if self.check_repo_dirty and self.run.is_dirty:
        logger.log([
            ("[FAIL]", Text.danger),
            " Cannot trial an experiment with uncommitted changes."
        ])
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module for interactive use and may be missing
        # (e.g. ``python -S`` or frozen interpreters).
        raise SystemExit(1)

    # Statement order is kept as in the original: configs are printed before
    # the run info is saved, and written to disk only afterwards.
    if self.configs_processor is not None:
        self.configs_processor.print()

    self.run.save_info()

    if self.configs_processor is not None:
        self.configs_processor.save(self.run.configs_path)

    if self.web_api is not None:
        self.web_api.set_info(run_uuid=self.run.uuid,
                              name=self.name,
                              comment=self.run.comment)
        if self.configs_processor is not None:
            self.web_api.set_configs(self.configs_processor.to_json())
        # ``start()`` may return ``None``; the URL is logged only when
        # one is returned.
        url = self.web_api.start()
        if url is not None:
            logger.log([('Monitor experiment at ', Text.meta),
                        (url, Text.highlight)])

    logger_internal().save_indicators(self.run.indicators_path)

    # Truthiness test kept as in the original (not ``is not None``).
    if self.configs_processor:
        logger_internal().write_h_parameters(
            self.configs_processor.get_hyperparams())

    return ExperimentWatcher(self)