예제 #1
0
파일: __init__.py 프로젝트: bdtmnk/labml
    def start(self,
              *,
              run_uuid: Optional[str] = None,
              checkpoint: Optional[int] = None):
        """Start the experiment and return a watcher for it.

        The global step is either restored from a previous run (when
        ``run_uuid`` is given) or set to ``0``. The tracker is then started
        with that step. On the rank-0 process the run information is printed
        and, unless the experiment is in evaluation mode, the run metadata,
        config savers, web API, Weights & Biases integration and indicators
        are set up. Every rank saves its PID when not evaluating.

        Args:
            run_uuid: UUID of an earlier run to load a checkpoint from.
                ``None`` starts from step ``0``.
            checkpoint: Checkpoint to load from ``run_uuid``. When omitted,
                ``-1`` is passed to the checkpoint loader (presumably
                "latest" -- confirm against ``__start_from_checkpoint``).

        Returns:
            An ``ExperimentWatcher`` wrapping this experiment.

        Raises:
            SystemExit: With status ``1`` on rank 0 when ``check_repo_dirty``
                is set and the run reports uncommitted changes.
        """
        if run_uuid is None:
            global_step = 0
        else:
            if checkpoint is None:
                checkpoint = -1
            global_step = self.__start_from_checkpoint(run_uuid, checkpoint)

        self.run.start_step = global_step

        self._start_tracker()
        tracker().set_start_global_step(global_step)

        if self.distributed_rank == 0:
            self.__print_info()
            if self.check_repo_dirty and self.run.is_dirty:
                logger.log([
                    ("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."
                ])
                # Raise SystemExit directly: the bare ``exit`` helper is
                # injected by the ``site`` module and is not guaranteed to
                # exist (e.g. ``python -S`` or frozen applications).
                raise SystemExit(1)

        if not self.is_evaluate:
            if self.distributed_rank == 0:
                # Imported locally, as in the original code (possibly to
                # avoid an import cycle -- TODO confirm).
                from labml.internal.computer.configs import computer_singleton
                computer_singleton().add_project(lab_singleton().path)

                self.run.save_info()

            # Every rank records its PID, not only rank 0.
            self._save_pid()

            if self.distributed_rank == 0:
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        FileConfigsSaver(self.run.configs_path))

                if self.web_api is not None:
                    self.web_api.start(self.run)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.web_api.get_configs_saver())
                        self.web_api.set_dynamic_handler(
                            ExperimentDynamicUpdateHandler(
                                self.configs_processor))

                if self.wandb is not None:
                    self.wandb.init(self.run.name, self.run.run_path)
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.wandb.get_configs_saver())

                tracker().save_indicators(self.run.indicators_path)

        self.is_started = True
        return ExperimentWatcher(self)
예제 #2
0
    def start(self,
              *,
              run_uuid: Optional[str] = None,
              checkpoint: Optional[int] = None):
        """Start the experiment and return a watcher for it.

        The global step is either restored from a previous run (when
        ``run_uuid`` is given) or set to ``0``. The tracker is then started
        with that step. On the rank-0 process the run information is printed
        and, unless the experiment is in evaluation mode, the run metadata,
        config savers, web API and indicators are set up. Every rank saves
        its PID when not evaluating.

        Args:
            run_uuid: UUID of an earlier run to load a checkpoint from.
                ``None`` starts from step ``0``.
            checkpoint: Checkpoint to load from ``run_uuid``. When omitted,
                ``-1`` is passed to the checkpoint loader (presumably
                "latest" -- confirm against ``__start_from_checkpoint``).

        Returns:
            An ``ExperimentWatcher`` wrapping this experiment.

        Raises:
            SystemExit: With status ``1`` on rank 0 when ``check_repo_dirty``
                is set and the run reports uncommitted changes.
        """
        if run_uuid is None:
            global_step = 0
        else:
            if checkpoint is None:
                checkpoint = -1
            global_step = self.__start_from_checkpoint(run_uuid, checkpoint)

        self.run.start_step = global_step

        self._start_tracker()
        tracker().set_start_global_step(global_step)

        if self.distributed_rank == 0:
            self.__print_info()
            if self.check_repo_dirty and self.run.is_dirty:
                logger.log([
                    ("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."
                ])
                # Raise SystemExit directly: the bare ``exit`` helper is
                # injected by the ``site`` module and is not guaranteed to
                # exist (e.g. ``python -S`` or frozen applications).
                raise SystemExit(1)

        if not self.is_evaluate:
            if self.distributed_rank == 0:
                self.run.save_info()

            # Every rank records its PID, not only rank 0.
            self._save_pid()

            if self.distributed_rank == 0:
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        FileConfigsSaver(self.run.configs_path))

                if self.web_api is not None:
                    self.web_api.set_info(run_uuid=self.run.uuid,
                                          name=self.run.name,
                                          comment=self.run.comment)
                    self.web_api.start()
                    if self.configs_processor is not None:
                        self.configs_processor.add_saver(
                            self.web_api.get_configs_saver())

                tracker().save_indicators(self.run.indicators_path)

                # NOTE: Hyper-parameters are deliberately not written to
                # TensorBoard here (tracker().write_h_parameters with
                # configs_processor.get_hyperparams()): the write takes
                # about 4 seconds and does not work when configs are
                # updated live.

        self.is_started = True
        return ExperimentWatcher(self)