Example No. 1
    def __init__(self, model, resume, config, logger_path):
        self.config = config

        self.device, device_ids = self._prepare_device(config.device)
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.epochs = config.trainer.epochs
        self.save_freq = config.trainer.save_freq
        self.verbosity = config.trainer.verbosity

        self.checkpoint_dir = config.trainer.checkpoint_dir
        mkdir_dir(self.checkpoint_dir)
        self.train_logger = Logger(logger_path)

        self.monitor = config.trainer.monitor
        self.monitor_mode = config.trainer.monitor_mode
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        self.writer = WriterTensorboardX(config.trainer.checkpoint_dir,
                                         self.train_logger,
                                         config.visualization.tensorboardX)
        if resume:
            self._resume_checkpoint(resume)
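Note: unlike the later examples, which index a plain dict (config['trainer']['epochs']), this constructor reads the configuration through attribute access (config.trainer.epochs). Below is a minimal sketch of such a config object built with types.SimpleNamespace; the field names are taken from the constructor above, while the conversion helper and all values are illustrative assumptions, not part of the original example.

from types import SimpleNamespace

def dict_to_namespace(d):
    """Recursively turn nested dicts into attribute-accessible namespaces."""
    if isinstance(d, dict):
        return SimpleNamespace(**{k: dict_to_namespace(v) for k, v in d.items()})
    return d

config = dict_to_namespace({
    'device': 1,  # number of GPUs requested via _prepare_device
    'trainer': {'epochs': 50, 'save_freq': 5, 'verbosity': 2,
                'checkpoint_dir': 'saved/experiment',
                'monitor': 'val_loss', 'monitor_mode': 'min'},
    'visualization': {'tensorboardX': True},
})

print(config.trainer.epochs)              # -> 50
print(config.visualization.tensorboardX)  # -> True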
Example No. 2
    def __init__(self, model, loss, optimizer, resume, config):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.optimizer = optimizer

        self.steps = config['trainer']['steps']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.start_step = 0

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
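A minimal sketch of the nested dict this constructor expects; the key names come from the lookups above, while the values are purely illustrative.

config = {
    'name': 'my_experiment',
    'n_gpu': 1,
    'trainer': {
        'steps': 100000,      # total training steps
        'save_freq': 1000,    # checkpoint frequency (in steps)
        'verbosity': 2,
        'save_dir': 'saved/',
    },
    'visualization': {
        'log_dir': 'saved/runs',
        'tensorboardX': True,  # toggles the WriterTensorboardX instance
    },
}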
Example No. 3
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)  # create a logger named after this class

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:  # wrap in DataParallel when more than one GPU is available
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)
        #self.device = torch.device('cuda:0' if config['n_gpu'] > 0 else 'cpu')
        #self.model = model.to(self.device)
        #if config['n_gpu'] > 1:
        #self.model = torch.nn.DataParallel(model)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max',
                                     'off']  # monitor_mode must be one of these three values; otherwise raise
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf  # initialize the best monitored value
        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')  # current timestamp
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'],
                                           start_time)  # checkpoint directory (named by experiment and timestamp)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'],
                                  config['name'], start_time)  # visualization output path
        self.writer = WriterTensorboardX(
            writer_dir, self.logger,
            config['visualization']['tensorboardX'])  # the third argument toggles visualization on/off

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)  # create checkpoint_dir if it does not exist
        config_save_path = os.path.join(self.checkpoint_dir,
                                        'config.json')  # path
        with open(config_save_path, 'w') as handle:  # save the config to this path
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 4
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')
        self.validation_every = cfg_trainer['validation_every']

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'],
                                  start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         cfg_trainer['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
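In this and several later examples, monitor is a single string that combines the mode and the metric name, which str.split() separates into mnt_mode and mnt_metric. A quick illustration of that convention (the metric name is an assumption):

monitor = 'min val_loss'               # as it might appear under config['trainer']['monitor']
mnt_mode, mnt_metric = monitor.split()
assert mnt_mode in ['min', 'max']
print(mnt_mode, mnt_metric)            # -> min val_loss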
Example No. 5
    def __init__(self,
                 modelD,
                 modelG,
                 loss1,
                 loss2,
                 metrics,
                 optimizerD,
                 optimizerG,
                 resume,
                 config,
                 train_logger=None):
        # base class for GANs: takes both a discriminator (D) and a generator (G) model
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.modelD = modelD.to(self.device)
        self.modelG = modelG.to(self.device)
        if len(device_ids) > 1:
            self.modelD = torch.nn.DataParallel(modelD, device_ids=device_ids)
            self.modelG = torch.nn.DataParallel(modelG, device_ids=device_ids)

        self.loss1 = loss1
        self.loss2 = loss2
        self.metrics = metrics
        self.optimizerD = optimizerD
        self.optimizerG = optimizerG

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'],
                                  config['name'], start_time)
        self.writer = WriterTensorboardX(
            writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 6
    def __init__(
        self, model, losses, metrics, optimizer_g,
        optimizer_d_s, optimizer_d_t,
        resume, config,
        train_logger=None,
        pretrained_path=None,
    ):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)

        self.losses = losses
        self.metrics = metrics
        self.optimizer_g = optimizer_g
        self.optimizer_d_s = optimizer_d_s
        self.optimizer_d_t = optimizer_d_t

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        # Set pretrained_load_strict to False to load the model without strict state-dict key matching.
        # Useful when the pretrained model was trained without GAN components but GAN training is enabled this time.
        self.pretrained_load_strict = config['trainer']['pretrained_load_strict']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
        elif pretrained_path is not None:
            self._load_pretrained(pretrained_path)

        # put model into DataParallel module only after the checkpoint is loaded
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)
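The _load_pretrained helper referenced above is not shown. The standalone sketch below is one plausible shape for it, assuming the checkpoint stores its weights under a 'state_dict' key and that pretrained_load_strict is forwarded to load_state_dict; the function name and signature are illustrative, not the repository's API.

import torch

def load_pretrained(model, pretrained_path, device, strict=False):
    # Non-strict loading tolerates missing/unexpected keys, e.g. GAN discriminator
    # heads that did not exist in the pretrained (non-GAN) checkpoint.
    checkpoint = torch.load(pretrained_path, map_location=device)
    state_dict = checkpoint.get('state_dict', checkpoint)
    model.load_state_dict(state_dict, strict=strict)
    return model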
Example No. 7
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['train']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_p']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], start_time,
                                           'checkpoints')
        self.log_dir = os.path.join(cfg_trainer['save_dir'], start_time,
                                    'logs')

        self.writer = WriterTensorboardX(self.log_dir, self.logger,
                                         cfg_trainer['tbX'])

        # Save configuration file into checkpoint directory:
        mkdir_p(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')

        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 8
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.with_cuda = config['cuda'] and torch.cuda.is_available()
        if config['cuda'] and not torch.cuda.is_available():
            self.logger.warning(
                'Warning: There\'s no GPU available on this machine, '
                'training will be performed on CPU.')
        self.device = torch.device(
            'cuda:' + str(config['gpu']) if self.with_cuda else 'cpu')
        self.model = model.to(self.device)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'],
                                  config['name'], start_time)
        self.writer = WriterTensorboardX(
            writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 9
    def __init__(
        self, model, loss, metrics, optimizer, resume, config, train_logger=None
    ):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config["n_gpu"])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config["trainer"]
        self.epochs = cfg_trainer["epochs"]
        self.save_period = cfg_trainer["save_period"]
        self.verbosity = cfg_trainer["verbosity"]
        self.monitor = cfg_trainer.get("monitor", "off")

        # configuration to monitor model performance and save best
        if self.monitor == "off":
            self.mnt_mode = "off"
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ["min", "max"]

            self.mnt_best = math.inf if self.mnt_mode == "min" else -math.inf
            self.early_stop = cfg_trainer.get("early_stop", math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime("%m%d_%H%M%S")
        self.checkpoint_dir = os.path.join(
            cfg_trainer["save_dir"], config["name"], start_time
        )
        # setup visualization writer instance
        writer_dir = os.path.join(cfg_trainer["log_dir"], config["name"], start_time)
        self.writer = WriterTensorboardX(
            writer_dir, self.logger, cfg_trainer["tensorboardX"]
        )

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, "config.json")
        with open(config_save_path, "w") as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 10
    def __init__(self, model, loss, metrics, optimizer, resume, config):

        self.config = config
        self.logger = config.get_logger(
            'trainer', config['trainer']['verbosity'])

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            print("Using DataParallel for loss")
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer

        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf

            # Since early stopping was not available in earlier versions of the codebase
            # we disable it completely (this allows previous experiments to be reproduced
            # without modifying configs)
            # self.early_stop = cfg_trainer.get('early_stop', math.inf)
            self.early_stop = math.inf

        self.start_epoch = 1
        self.latest_log = None

        # setup directory for checkpoint saving
        # if resume:
        #     start_time = os.path.split(os.path.split(resume)[0])[1]
        # else:
        #     start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = config.save_dir

        # setup visualization writer instance
        # writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
        self.writer = WriterTensorboardX(
            config.log_dir, self.logger, cfg_trainer['tensorboardX'])

        if resume:
            self._resume_checkpoint(resume)
Example No. 11
    def __init__(self, game, nnet, args):
        self.game = game
        self.args = args
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game,
                                        self.args)  # the competitor network
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = [
        ]  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

        self.elo = 0  # elo score of the current model

        self.logger = logging.getLogger(self.__class__.__name__)
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        # setup visualization writer instance
        writer_dir = os.path.join(self.args.log_dir, self.args.name,
                                  start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         self.args.tensorboardX)
Example No. 12
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.args = args
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game,
                                        self.args)  # the competitor network
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = [
        ]  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

        self.elo = 0  # elo score of the current model

        self.logger = logging.getLogger(self.__class__.__name__)
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        # setup visualization writer instance
        writer_dir = os.path.join(self.args.log_dir, self.args.name,
                                  start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         self.args.tensorboardX)

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.
        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.
        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(
                board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
                        for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in tqdm(range(1, self.args.numIters + 1), desc='Iteration'):
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([],
                                               maxlen=self.args.maxlenOfQueue)

                for eps in tqdm(range(self.args.numEps), desc='mcts.Episode'):
                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =",
                      len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples, self.writer)
            self.writer.set_step(i - 1, "learning")
            nmcts = MCTS(self.game, self.nnet, self.args)

            print("PITTING AGAINST METRIC COMPONENTS")
            for metric_opponent in self.args.metric_opponents:
                arena = Arena(
                    lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                    metric_opponent(self.game).play, self.game)
                nwins, owins, draws = arena.playGames(
                    self.args.metricArenaCompare)
                print('%s WINS : %d / %d ; DRAWS : %d' %
                      (metric_opponent.__name__, nwins, owins, draws))
                if nwins + owins == 0: win_prct = 0
                else: win_prct = float(nwins) / (nwins + owins)
                self.writer.add_scalar(
                    '{}_win'.format(metric_opponent.__name__), win_prct)
                # Reset nmcts
                nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)
            if nwins + pwins == 0: win_prct = 0
            else: win_prct = float(nwins) / (nwins + pwins)
            self.writer.add_scalar('self_win', win_prct)

            # Calculate elo score for self play
            results = [-x for x in arena.get_results()]  # flip sign so results reflect the new network's perspective
            nelo, pelo = elo(self.elo, self.elo, results)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                  (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.elo = pelo
                self.nnet.load_checkpoint(folder=self.args.checkpoint,
                                          filename='temp.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                self.elo = nelo
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='best.pth.tar')

            self.writer.add_scalar('self_elo', self.elo)

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(
            folder,
            self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)
        f.closed

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0],
                                 self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            f.closed
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
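The elo helper called in learn() is not shown above. The sketch below is a standard Elo update consistent with how it is invoked (elo(self.elo, self.elo, results)); it assumes each entry of results is +1 for a new-network win, -1 for a loss, and 0 for a draw, and the K-factor of 32 is an assumption.

def elo(rating_new, rating_prev, results, k=32):
    """Sequentially update two Elo ratings from a list of game results."""
    for r in results:
        # Expected score of the new network against the previous one.
        expected_new = 1.0 / (1.0 + 10 ** ((rating_prev - rating_new) / 400.0))
        score_new = {1: 1.0, 0: 0.5, -1: 0.0}[r]  # actual score of the new network
        rating_new += k * (score_new - expected_new)
        rating_prev += k * ((1.0 - score_new) - (1.0 - expected_new))
    return rating_new, rating_prev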
Example No. 13
    def __init__(
        self,
        model: Module,
        loss: Callable,
        loss_args: dict,
        metrics: List[Callable],
        metric_args: List[dict],
        optimizer: Optimizer,
        config: dict,
        resume: Optional[str] = None,
        train_logger: Optional[Logger] = None
    ) -> None:
        self.config: dict = config
        self.logger: logging.Logger = logging.getLogger(self.__class__.__name__)

        # Setup GPU device if available.
        self.device: str
        device_ids: List[int]
        self.device, device_ids = self._prepare_device(config["n_gpu"])

        # Move model into device(s).
        self.model: Module = model.to(self.device)
        if len(device_ids) > 1:
            self.model: Module = DataParallel(model, device_ids = device_ids)

        self.loss: Callable = loss
        self.loss_args: dict = loss_args
        self.metrics: List[Callable] = metrics
        self.metric_args: List[dict] = metric_args
        self.optimizer: Optimizer = optimizer
        self.train_logger: Optional[Logger] = train_logger

        cfg_trainer: dict = config["trainer"]
        self.epochs: int = cfg_trainer["epochs"]
        self.save_period: int = cfg_trainer["save_period"]
        self.verbosity: int = cfg_trainer["verbosity"]
        self.monitor: str = cfg_trainer.get("monitor", "off")

        self.mnt_mode: str
        self.mnt_best: float

        # Configuration to monitor model performance and save the best result.
        if self.monitor == "off":
            self.mnt_mode = "off"
            self.mnt_best = 0

        else:
            self.mnt_mode: str
            self.mnt_metric: str
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ["min", "max"]

            self.mnt_best = math.inf if self.mnt_mode == "min" \
                else -math.inf
            self.early_stop: float = cfg_trainer.get("early_stop", math.inf)

        self.start_epoch = 1

        # Setup directory for saving checkpoints.
        start_time: str = datetime.datetime.now().strftime("%m%d_%H%M%S")
        self.checkpoint_dir: str = os.path.join(
            cfg_trainer["save_dir"], config["name"], start_time
        )

        # Setup visualization writer instance.
        writer_dir: str = os.path.join(
            cfg_trainer["log_dir"], config["name"], start_time
        )
        self.writer: WriterTensorboardX = WriterTensorboardX(
            writer_dir, self.logger, cfg_trainer["tensorboardX"]
        )

        # Save configuration file into checkpoint directory.
        ensure_dir(self.checkpoint_dir)
        config_save_path: str = os.path.join(
            self.checkpoint_dir, "config.json"
        )
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent = 4, sort_keys = False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 14
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1
        # UTC Time to Beijing Time, +8 hours
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        # start_time = start_time[:5] + str((int(start_time[5:7]) + 8) % 24) + start_time[7:]

        # setup directory for checkpoint saving
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'],
                                  start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         cfg_trainer['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')

        # logging to file
        fh = logging.handlers.RotatingFileHandler(
            Path(config['trainer']['save_dir']) / config['name'] / start_time /
            'main.log', 'w+', 20 * 1024 * 1024, 5)
        formatter = logging.Formatter(
            '%(asctime)s %(levelname)5s - %(name)s '
            '[%(filename)s line %(lineno)d] - %(message)s',
            datefmt='%m-%d %H:%M:%S')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

        # print(self.model)
        # self.logger.info(self.model)
        print('saving weight/log/config to {}'.format(start_time))

        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            if len(resume.split('/')) == 1:  # bare checkpoint name, no path separators
                self._resume_checkpoint_outer(resume)
            else:
                self._resume_checkpoint(resume)
Example No. 15
    def __init__(self, resume, config, train_logger=None):
        self.config = config
        self.logger = logging.getLogger(
            self.__class__.__name__
        )  # used for displaying logging and warning info

        # setup GPU device if available, move model into configured device
        self.device, self.device_ids = self._prepare_device(
            self.config['n_gpu'])

        # dataloader
        self.train_set, self.val_set = get_INBreast_dataloader(self.config)

        # class weights to compensate for class imbalance
        self.class_weight = [
            torch.FloatTensor([1, 20]),
            torch.FloatTensor([1, 4])
        ]
        self.label_weight = [2, 1]
        self.output_ch = len(self.class_weight)

        # build model architecture
        self.model = module_arch.AttU_Net(img_ch=1, output_ch=self.output_ch)
        # print(self.model)

        # run the model on multiple GPUs in parallel
        self.model = model_parallel(self.model, self.device, self.device_ids)

        # build optimizer, learning rate scheduler.
        self.optimizer, self.scheduler = build_optimizer(
            self.model, self.config)

        self.train_logger = train_logger  # used for saving logging info

        trainer_config = self.config['trainer']
        self.max_epochs = trainer_config['max_epochs']
        self.save_period = trainer_config['save_period']
        self.val_period = trainer_config['val_period']
        self.verbosity = trainer_config['verbosity']
        self.monitor = trainer_config.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = trainer_config.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        if self.save_period != 0:
            self.checkpoint_dir = os.path.join(trainer_config['save_dir'],
                                               self.config['name'], start_time)
            ensure_dir(self.checkpoint_dir)
        # setup visualization writer instance
        writer_dir = os.path.join(trainer_config['log_dir'],
                                  self.config['name'], start_time)
        ensure_dir(writer_dir)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         trainer_config['tensorboardX'])

        # Save configuration file into logging directory:
        self._save_config(writer_dir)

        if resume:
            self._resume_checkpoint(resume)
Example No. 16
    def __init__(self, resume, config, train_logger=None):
        self.config = config
        self.logger = logging.getLogger(
            self.__class__.__name__
        )  # used for displaying logging and warning info

        # setup GPU device if available, move model into configured device
        self.device, self.device_ids = self._prepare_device(
            self.config['n_gpu'])

        # dataloader
        self.train_set, self.val_set = get_UCLA_dataset(self.config)

        # class weights to compensate for class imbalance
        self.class_weight = torch.FloatTensor([1, 1])

        # build model architecture
        self.model = module_arch.AttU_Net_Classification(img_ch=1)
        # loading pretrained model
        pretrain_path = '../deployment/checkpoint/segmentation_model.pth'
        checkpoint = torch.load(pretrain_path)
        self.model.load_state_dict(checkpoint['model_state_dict'],
                                   strict=False)

        # freeze all parameters
        for param in self.model.parameters():
            param.requires_grad = False

        # unfreeze classification part
        for param in self.model.classification.parameters():
            param.requires_grad = True
        for param in self.model.fc.parameters():
            param.requires_grad = True

        # run the model on multiple GPUs in parallel
        self.model = model_parallel(self.model, self.device, self.device_ids)

        # build optimizer, learning rate scheduler.
        self.optimizer, self.scheduler = build_optimizer(
            self.model, self.config)

        self.train_logger = train_logger  # used for saving logging info

        trainer_config = self.config['trainer']
        self.max_epochs = trainer_config['max_epochs']
        self.save_period = trainer_config['save_period']
        self.val_period = trainer_config['val_period']
        self.verbosity = trainer_config['verbosity']
        self.monitor = trainer_config.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = trainer_config.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        if self.save_period != 0:
            self.checkpoint_dir = os.path.join(trainer_config['save_dir'],
                                               self.config['name'], start_time)
            ensure_dir(self.checkpoint_dir)
        # setup visualization writer instance
        writer_dir = os.path.join(trainer_config['log_dir'],
                                  self.config['name'], start_time)
        ensure_dir(writer_dir)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         trainer_config['tensorboardX'])

        # Save configuration file into logging directory:
        self._save_config(writer_dir)

        if resume:
            self._resume_checkpoint(resume)
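With everything frozen except the classification head, the optimizer normally receives only the parameters that still require gradients. A minimal sketch of that pattern follows; the build_optimizer helper used above presumably does something similar, and the optimizer type and learning rate here are illustrative.

import torch

def build_finetune_optimizer(model, lr=1e-4):
    # Only parameters left with requires_grad=True (the classification layers here)
    # are updated; the frozen backbone is excluded from the optimizer entirely.
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.Adam(trainable, lr=lr)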
Example No. 17
    def __init__(self,
                 models,
                 optimizers,
                 loss,
                 metrics,
                 resume,
                 config,
                 train_logger=None):
        """
        class initialization
        :param models: dictionary containing the generator, local discriminator, and global discriminator models
        :param optimizers: dictionary containing the generator optimizer and the discriminator optimizers
        :param loss: dictionary containing the loss objectives
        :param metrics: metrics other than the loss to display during training
        :param resume: resume checkpoint
        :param config: config file
        :param train_logger: logger
        """
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, self.device_ids = self._prepare_device(config['n_gpu'])
        self.generator = models["generator"].to(self.device)
        self.local_discriminator = models["local_discriminator"].to(
            self.device)

        # paralleling the models if multiple GPUs
        if len(self.device_ids) > 1:
            self.generator = torch.nn.DataParallel(self.generator,
                                                   device_ids=self.device_ids)
            self.local_discriminator = torch.nn.DataParallel(
                self.local_discriminator, device_ids=self.device_ids)

        self.loss = loss
        self.metrics = metrics
        self.train_logger = train_logger
        self.generator_optimizer = optimizers["generator"]
        self.local_discriminator_optimizer = optimizers["local_discriminator"]

        # read training settings
        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'],
                                  start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         cfg_trainer['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        self.config["checkpoint_dir"] = self.checkpoint_dir
        config_save_path = os.path.join(self.checkpoint_dir,
                                        'line_gan_local_config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Example No. 18
class BaseTrainer:
    """
	Base class for all trainers
	"""
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None,
                 data_loader=None):
        self.config = config
        self.data_loader = data_loader  # optional; used to add the model graph to TensorBoard

        # Setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        # Setup logger
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s %(message)s",
                            handlers=[
                                logging.FileHandler(
                                    os.path.join(self.checkpoint_dir,
                                                 "train.log")),
                                logging.StreamHandler(),
                            ])
        self.logger = logging.getLogger(self.__class__.__name__)

        # Setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup visualization writer instance
        writer_train_dir = os.path.join(config['visualization']['log_dir'],
                                        config['name'], start_time, "train")
        writer_valid_dir = os.path.join(config['visualization']['log_dir'],
                                        config['name'], start_time, "valid")
        self.writer_train = WriterTensorboardX(
            writer_train_dir, self.logger,
            config['visualization']['tensorboardX'])
        self.writer_valid = WriterTensorboardX(
            writer_valid_dir, self.logger,
            config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        # Resume
        if resume:
            self._resume_checkpoint(resume)

    def _prepare_device(self, n_gpu_use):
        """ 
		setup GPU device if available, move model into configured device
		"""
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            self.logger.warning(
                "Warning: There\'s no GPU available on this machine, training will be performed on CPU."
            )
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = "Warning: The number of GPU\'s configured to use is {}, but only {} are available on this machine.".format(
                n_gpu_use, n_gpu)
            self.logger.warning(msg)
            n_gpu_use = n_gpu
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        list_ids = list(range(n_gpu_use))
        return device, list_ids

    def addGraph_Tensorboard(self):
        if self.data_loader is None:
            return
        data, _ = next(iter(self.data_loader))
        data = data.to(self.device)
        self.writer_train.addGraph(self.model, data)

    def train(self):
        # first, add the model graph to tensorboardX if a data loader was provided
        self.addGraph_Tensorboard()

        for epoch in range(self.start_epoch, self.epochs + 1):
            self.logger.info(
                "\n----------------------------------------------------------------"
            )
            self.logger.info("[EPOCH %d]" % (epoch))
            start_time = time()
            result = self._train_epoch(epoch)
            finish_time = time()
            self.logger.info("Finish at {}, Runtime: {:.3f} [s]".format(
                datetime.datetime.now(), finish_time - start_time))

            # save logged informations into log dict
            log = {}
            for key, value in result.items():
                if key == 'train_metrics':
                    log.update({
                        'train_' + mtr.__name__: value[i]
                        for i, mtr in enumerate(self.metrics)
                    })
                elif key == 'valid_metrics':
                    log.update({
                        'valid_' + mtr.__name__: value[i]
                        for i, mtr in enumerate(self.metrics)
                    })
                else:
                    log[key] = value

            # print logged informations to the screen
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    for key, value in sorted(list(log.items())):
                        self.logger.info('{:25s}: {}'.format(str(key), value))

            # evaluate model performance according to configured metric, save best checkpoint as model_best
            best = False
            if self.monitor_mode != 'off':
                try:
                    if  (self.monitor_mode == 'min' and log[self.monitor] < self.monitor_best) or\
                     (self.monitor_mode == 'max' and log[self.monitor] > self.monitor_best):
                        self.logger.info(
                            "Monitor improved from %f to %f" %
                            (self.monitor_best, log[self.monitor]))
                        self.monitor_best = log[self.monitor]
                        best = True
                except KeyError:
                    if epoch == 1:
                        msg = "Warning: Can\'t recognize metric named '{}' ".format(self.monitor)\
                         + "for performance monitoring. model_best checkpoint won\'t be updated."
                        self.logger.warning(msg)

            # Save checkpoint
            self._save_checkpoint(epoch, save_best=best)

    def _train_epoch(self, epoch):
        """
		Training logic for an epoch

		:param epoch: Current epoch number
		"""
        raise NotImplementedError

    def _save_checkpoint(self, epoch, save_best=False):
        """
		Saving checkpoints

		:param epoch: current epoch number
		:param log: logging information of the epoch
		:param save_best: if True, rename the saved checkpoint to 'model_best.pth'
		"""
        # Construct savedict
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'epoch': epoch,
            'logger': self.train_logger,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'monitor_best': self.monitor_best,
            'config': self.config
        }

        # Save checkpoint for each epoch
        if self.save_freq is not None:  # set save_freq to None to avoid using excessive disk space with large models
            if epoch % self.save_freq == 0:
                filename = os.path.join(self.checkpoint_dir,
                                        'epoch{}.pth'.format(epoch))
                torch.save(state, filename)
                self.logger.info("Saving checkpoint at {}".format(filename))

        # Save the best checkpoint
        if save_best:
            best_path = os.path.join(self.checkpoint_dir, 'model_best.pth')
            torch.save(state, best_path)
            self.logger.info("Saving current best at {}".format(best_path))
        else:
            self.logger.info("Monitor is not improved from %f" %
                             (self.monitor_best))

    def _resume_checkpoint(self, resume_path):
        """
		Resume from saved checkpoints

		:param resume_path: Checkpoint path to be resumed
		"""
        self.logger.info("Loading checkpoint: {}".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_epoch = checkpoint['epoch'] + 1
        self.monitor_best = checkpoint['monitor_best']

        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            self.logger.warning('Warning: Architecture configuration given in config file is different from that of checkpoint. ' + \
                 'This may yield an exception while state_dict is being loaded.')
        self.model.load_state_dict(checkpoint['state_dict'], strict=True)

        # # load optimizer state from checkpoint only when optimizer type is not changed.
        # if checkpoint['config']['optimizer']['type'] != self.config['optimizer']['type']:
        # 	self.logger.warning('Warning: Optimizer type given in config file is different from that of checkpoint. ' + \
        # 						'Optimizer parameters not being resumed.')
        # else:
        # 	self.optimizer.load_state_dict(checkpoint['optimizer'])

        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(
            resume_path, self.start_epoch - 1))
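The base class above leaves `_train_epoch` abstract. As a hedged illustration only (this is not code from any of the listed repositories), a subclass might implement it roughly as follows, assuming it also receives a `data_loader` and that the base class exposes `self.device`, `self.loss`, `self.metrics`, `self.optimizer` and the `self.writer` used elsewhere on this page; the `BaseTrainer` name follows Exemplo n.º 21 and the `MyTrainer`/`data_loader` names are invented:

class MyTrainer(BaseTrainer):
    """Hypothetical subclass; MyTrainer and data_loader are illustrative only."""

    def __init__(self, model, loss, metrics, optimizer, resume, config, data_loader, train_logger=None):
        super().__init__(model, loss, metrics, optimizer, resume, config, train_logger)
        self.data_loader = data_loader  # assumed to yield (data, target) batches

    def _train_epoch(self, epoch):
        self.model.train()
        total_loss = 0.0
        total_metrics = [0.0] * len(self.metrics)
        for batch_idx, (data, target) in enumerate(self.data_loader):
            data, target = data.to(self.device), target.to(self.device)

            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss(output, target)
            loss.backward()
            self.optimizer.step()

            # per-step logging through the tensorboardX wrapper used throughout this page
            self.writer.set_step((epoch - 1) * len(self.data_loader) + batch_idx)
            self.writer.add_scalar('loss', loss.item())

            total_loss += loss.item()
            for i, metric in enumerate(self.metrics):
                total_metrics[i] += metric(output, target)

        n_batches = len(self.data_loader)
        # the 'metrics' key matches the log-building loop seen in Exemplo n.º 20 and 21
        return {'loss': total_loss / n_batches,
                'metrics': [m / n_batches for m in total_metrics]}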
Exemplo n.º 19
0
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config

        # Setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        # Setup logger
        '''
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s %(message)s",
            handlers=[
                logging.FileHandler(os.path.join(self.checkpoint_dir, "train.log")),
            ])
        self.logger = logging.getLogger(self.__class__.__name__)
        '''
        fh = logging.FileHandler(os.path.join(self.checkpoint_dir,
                                              "train.log"))
        fh.setLevel(logging.INFO)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.INFO)
        self.logger.addHandler(fh)

        # Setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup visualization writer instance
        writer_train_dir = os.path.join(config['visualization']['log_dir'],
                                        config['name'], start_time, "train")
        writer_valid_dir = os.path.join(config['visualization']['log_dir'],
                                        config['name'], start_time, "valid")
        self.writer_train = WriterTensorboardX(
            writer_train_dir, self.logger,
            config['visualization']['tensorboardX'])
        self.writer_valid = WriterTensorboardX(
            writer_valid_dir, self.logger,
            config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        # Resume
        if resume:
            self._resume_checkpoint(resume)
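Every snippet on this page constructs the writer as `WriterTensorboardX(log_dir, logger, enabled)` and then calls `set_step` and `add_scalar` on it (see Exemplo n.º 21 and n.º 24). Its real implementation is not shown here; the sketch below is only a hypothetical, minimal wrapper consistent with those call sites, delegating to `tensorboardX.SummaryWriter` and degrading to a no-op when disabled:

import os

from tensorboardX import SummaryWriter


class WriterTensorboardX:
    """Hypothetical minimal sketch; the actual class in these projects may differ."""

    def __init__(self, log_dir, logger, enabled):
        self.writer = None
        if enabled and log_dir is not None:
            os.makedirs(log_dir, exist_ok=True)
            self.writer = SummaryWriter(log_dir)
        elif enabled and logger is not None:
            logger.warning("TensorboardX logging requested but no log_dir given; writer disabled.")
        self.step = 0
        self.mode = ''
        self.n_scalars = 0

    def set_step(self, step, mode='train'):
        # called once per batch/epoch to tag subsequent scalars (see Exemplo n.º 24)
        self.step = step
        self.mode = mode

    def add_scalar(self, tag, value):
        if self.writer is not None:
            self.writer.add_scalar('{}/{}'.format(self.mode, tag), value, self.step)
            self.n_scalars += 1

    def __len__(self):
        # Exemplo n.º 21 checks `len(self.writer) > 0` before printing the tensorboard hint
        return self.n_scalars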
Exemplo n.º 20
0
    def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving and the visualization writer instance
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)
        writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)


    def _prepare_device(self, n_gpu_use):
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            self.logger.warning("Warning: There's no GPU available on this machine, training will be performed on CPU.")
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            self.logger.warning("Warning: The number of GPUs configured to use is {}, but only {} are available on this machine.".format(n_gpu_use, n_gpu))
            n_gpu_use = n_gpu
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        list_ids = list(range(n_gpu_use))
        return device, list_ids

    def train(self):
        """
        Full training logic
        """
        not_improved_count = 0
        for epoch in range(self.start_epoch, self.epochs + 1):
            result = self._train_epoch(epoch)

            # save logged informations into log dict
            log = {'epoch': epoch}
            for key, value in result.items():
                if key == 'metrics':
                    log.update({mtr.__name__: value[i] for i, mtr in enumerate(self.metrics)})
                elif key == 'val_metrics':
                    log.update({'val_' + mtr.__name__: value[i] for i, mtr in enumerate(self.metrics)})
                else:
                    log[key] = value

            # print logged informations to the screen
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    for key, value in log.items():
                        self.logger.info('    {:15s}: {}'.format(str(key), value))

            # evaluate model performance according to configured metric, save best checkpoint as model_best
            best = False
            if self.mnt_mode != 'off':
                try:
                    # check whether model performance improved or not, according to specified metric (mnt_metric)
                    improved = (self.mnt_mode == 'min' and log[self.mnt_metric] < self.mnt_best) or \
                               (self.mnt_mode == 'max' and log[self.mnt_metric] > self.mnt_best)
                except KeyError:
                    self.logger.warning("Warning: Metric '{}' is not found. Model performance monitoring is disabled.".format(self.mnt_metric))
                    self.mnt_mode = 'off'
                    improved = False
                    not_improved_count = 0

                if improved:
                    self.mnt_best = log[self.mnt_metric]
                    not_improved_count = 0
                    best = True
                else:
                    not_improved_count += 1

                if not_improved_count > self.early_stop:
                    self.logger.info("Validation performance didn't improve for {} epochs. Training stops.".format(self.early_stop))
                    break

            if epoch % self.save_period == 0:
                self._save_checkpoint(epoch, save_best=best)

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _save_checkpoint(self, epoch, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param save_best: if True, also save a copy of the checkpoint as 'model_best.pth'
        """
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'epoch': epoch,
            'logger': self.train_logger,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),  # required by _resume_checkpoint below
            'monitor_best': self.mnt_best,
            'config': self.config
        }

        filename = os.path.join(self.checkpoint_dir, 'checkpoint-epoch{}.pth'.format(epoch))
        torch.save(state, filename)
        self.logger.info("Saving checkpoint: {} ...".format(filename))
        if save_best:
            best_path = os.path.join(self.checkpoint_dir, 'model_best.pth')
            torch.save(state, best_path)
            self.logger.info("Saving current best: {} ...".format('model_best.pth'))

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_epoch = checkpoint['epoch'] + 1
        self.mnt_best = checkpoint['monitor_best']

        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            self.logger.warning('Warning: Architecture configuration given in config file is different from that of checkpoint. '
                                'This may yield an exception while state_dict is being loaded.')
        self.model.load_state_dict(checkpoint['state_dict'])

        # load optimizer state from checkpoint only when optimizer type is not changed.
        if checkpoint['config']['optimizer']['type'] != self.config['optimizer']['type']:
            self.logger.warning('Warning: Optimizer type given in config file is different from that of checkpoint. '
                                'Optimizer parameters not being resumed.')
        else:
            self.optimizer.load_state_dict(checkpoint['optimizer'])

        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(resume_path, self.start_epoch))
Exemplo n.º 21
0
class BaseTrainer:
    """
    Base class for all trainers
    """
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.model = model.to(self.device)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['train']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_p']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], start_time,
                                           'checkpoints')
        self.log_dir = os.path.join(cfg_trainer['save_dir'], start_time,
                                    'logs')

        self.writer = WriterTensorboardX(self.log_dir, self.logger,
                                         cfg_trainer['tbX'])

        # Save configuration file into checkpoint directory:
        mkdir_p(self.checkpoint_dir)

        if self.config.get('cfg', None) is not None:
            cfg_save_path = os.path.join(self.checkpoint_dir, 'model.cfg')
            with open(cfg_save_path, 'w') as fw:
                fw.write(open(self.config['cfg']).read())
            self.config['cfg'] = cfg_save_path

        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)

    def train(self):
        """
        Full training logic
        """
        best_df = None
        not_improved_count = 0

        #f = open(os.path.join(self.log_dir, 'lr.txt'), 'w')

        for epoch in range(self.start_epoch, self.epochs + 1):

            # _train_epoch returns dict with train metrics ("metrics"), validation
            # metrics ("val_metrics") and other key,value pairs. Store/update them in log.
            result = self._train_epoch(epoch)

            # save logged informations into log dict
            log = {'epoch': epoch}
            for key, value in result.items():
                if key == 'metrics':
                    log.update({
                        mtr.__name__: value[i]
                        for i, mtr in enumerate(self.metrics)
                    })
                elif key == 'val_metrics':
                    log.update({
                        'val_' + mtr.__name__: value[i]
                        for i, mtr in enumerate(self.metrics)
                    })
                else:
                    log[key] = value

            c_lr = self.optimizer.param_groups[0]['lr']

            # build a one-column DataFrame of the logged values (also reused for the
            # 'best' summary below), then print logged informations to the screen
            df = pd.DataFrame.from_dict([log]).T
            df.columns = ['']
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    #self.logger.info('Epoch: {}'.format(epoch))
                    self.logger.info('{}'.format(df.loc[df.index != 'epoch']))
                    self.logger.info('lr_0: {}'.format(c_lr))

            #f.write('%.5f\t%.5f\t%.5f\n'%(c_lr, result['loss'], result['metrics'][0]))
            #f.flush()
            self.writer.add_scalar('lr', c_lr)

            # evaluate model performance according to configured metric, save best checkpoint as model_best
            best = False
            if self.mnt_mode != 'off':
                try:
                    # check whether model performance improved or not, according to specified metric(mnt_metric)
                    improved = (self.mnt_mode == 'min' and log[self.mnt_metric] < self.mnt_best) or \
                               (self.mnt_mode == 'max' and log[self.mnt_metric] > self.mnt_best)
                except KeyError:
                    self.logger.warning(
                        "Warning: Metric '{}' is not found. Model performance monitoring is disabled."
                        .format(self.mnt_metric))
                    self.mnt_mode = 'off'
                    improved = False
                    not_improved_count = 0

                if improved:
                    self.mnt_best = log[self.mnt_metric]
                    not_improved_count = 0
                    best = True
                    best_df = df
                else:
                    not_improved_count += 1

                if not_improved_count > self.early_stop:
                    self.logger.info(
                        "Validation performance didn\'t improve for {} epochs. Training stops."
                        .format(self.early_stop))
                    self.logger.info('Final:\n{}'.format(
                        best_df.loc[best_df.index != 'epoch']))
                    break

            if len(self.writer) > 0:
                self.logger.info(
                    '\nRun TensorboardX:\ntensorboard --logdir={}\n'.format(
                        self.log_dir))

            if epoch % self.save_period == 0:
                self._save_checkpoint(epoch, save_best=best)
                #self.logger.info('\n\n\tTensorboardX Path: {}\n'.format(self.log_dir))

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _save_checkpoint(self, epoch, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param log: logging information of the epoch
        :param save_best: if True, rename the saved checkpoint to 'model_best.pth'
        """
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'epoch': epoch,
            'logger': self.train_logger,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'monitor_best': self.mnt_best,
            'config': self.config,
            'classes': self.model.classes
        }

        filename = os.path.join(self.checkpoint_dir, 'checkpoint-current.pth')
        #filename = os.path.join(self.checkpoint_dir, 'checkpoint-epoch{}.pth'.format(epoch))
        torch.save(state, filename)
        self.logger.info("Saving checkpoint: {} ...".format(filename))
        if save_best:
            best_path = os.path.join(self.checkpoint_dir, 'model_best.pth')
            torch.save(state, best_path)
            self.logger.info(
                "Saving current best: {} ...".format('model_best.pth'))
            self.logger.info("[IMPROVED]")

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_epoch = checkpoint['epoch'] + 1
        self.mnt_best = checkpoint['monitor_best']

        # load architecture params from checkpoint.
        self.model.load_state_dict(checkpoint['state_dict'])

        # load optimizer state from checkpoint (this variant resumes it unconditionally).
        self.optimizer.load_state_dict(checkpoint['optimizer'])

        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(
            resume_path, self.start_epoch))
Exemplo n.º 22
0
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['trainer']
        cfg_model = config['arch']['args']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1
        self.not_improved_count = 0

        ss = str(config['trainer']['label_portion'])
        logvar_trainable = int(cfg_model['logvar_trainable'])
        pow_exp = int(cfg_model['pow_exp'])
        latent_dim = cfg_model['latent_dim']
        is_pitch_condition = int(cfg_model['is_pitch_condition'])
        is_pitch_discriminate = int(cfg_model['is_pitch_discriminate'])
        model_dir = '-'.join([
            config['data_loader']['args']['data_dir'].split('/')[-1],
            'ss_%s' % ss,
            'latent_%d' % latent_dim,
            'pow_%d' % pow_exp,
            'lvl_%d' % logvar_trainable,
            'pc_%d' % is_pitch_condition,
            'pd_%d' % is_pitch_discriminate, self.mnt_metric
        ])

        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'],
                                           config['name'], model_dir)
        # setup visualization writer instance
        writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'],
                                  model_dir)
        self.writer = WriterTensorboardX(writer_dir, self.logger,
                                         cfg_trainer['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
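Exemplo n.º 22 encodes its hyper-parameters into the checkpoint directory name. Purely for illustration (all values invented), a config with data_dir 'data/nsynth', label_portion 0.1, latent_dim 64, pow_exp 2, logvar_trainable 1, is_pitch_condition 1, is_pitch_discriminate 0 and monitored metric 'val_loss' would produce:

# resulting value of model_dir for the hypothetical settings above
model_dir = 'nsynth-ss_0.1-latent_64-pow_2-lvl_1-pc_1-pd_0-val_loss'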
Exemplo n.º 23
0
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 optimizer,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        n_gpu_use = config['n_gpu']
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            self.logger.warning(
                "Warning: There\'s no GPU available on this machine, training will be performed on CPU."
            )
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = "Warning: The number of GPU\'s configured to use is {}, but only {} are available on this machine.".format(
                n_gpu_use, n_gpu)
            self.logger.warning(msg)
            n_gpu_use = n_gpu
        self.device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        self.model = model.to(self.device)
        if n_gpu_use > 1:
            self.model = torch.nn.DataParallel(model,
                                               device_ids=list(
                                                   range(n_gpu_use)))

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer

        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']

        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'],
                                  config['name'], start_time)
        self.writer = WriterTensorboardX(
            writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)
Exemplo n.º 24
0
    def train(self, examples, writer=None):
        """
        examples: list of examples, each example is of form (board, pi, v)
        writer: optional tensorboardX writer
        """
        optimizer = self.args.optimizer(self.nnet.parameters(), lr=self.args.lr, **self.args.optimizer_kwargs)
        scheduler = self.args.lr_scheduler(optimizer, **self.args.lr_scheduler_kwargs)

        # If no writer was given, fall back to a disabled (no-op) writer
        if writer is None:
            writer = WriterTensorboardX(None, None, False)

        epoch_bar = tqdm(desc="Training Epoch", total=self.args.epochs)
        for epoch in range(self.args.epochs):
            self.nnet.train()
            scheduler.step()

            pi_losses = AverageMeter()
            v_losses = AverageMeter()
            total_losses = AverageMeter()

            num_batches = int(len(examples)/self.args.batch_size)
            bar = tqdm(desc='Batch', total=num_batches)
            batch_idx = 0

            while batch_idx < num_batches:
                writer.set_step((self.train_iteration * self.args.epochs * num_batches) + (epoch * num_batches) + batch_idx)

                sample_ids = np.random.randint(len(examples), size=self.args.batch_size)
                boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                # predict
                if self.args.cuda:
                    boards, target_pis, target_vs = boards.contiguous().cuda(), target_pis.contiguous().cuda(), target_vs.contiguous().cuda()

                # compute output
                out_pi, out_v = self.nnet(boards)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                pi_losses.update(l_pi.item(), boards.size(0))
                v_losses.update(l_v.item(), boards.size(0))
                total_losses.update(total_loss.item(), boards.size(0))

                # record loss
                writer.add_scalar('pi_loss', l_pi.item())
                writer.add_scalar('v_loss', l_v.item())
                writer.add_scalar('loss', total_loss.item())

                # compute gradient and do SGD step
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

                # advance to the next batch
                batch_idx += 1

                # plot progress
                bar.set_postfix(
                    lpi=l_pi.item(),
                    lv=l_v.item(),
                    loss=total_loss.item()
                )

                bar.update()
            bar.close()

            writer.set_step((self.train_iteration * self.args.epochs) + epoch, 'train_epoch')
            writer.add_scalar('epoch_pi_loss', pi_losses.avg)
            writer.add_scalar('epoch_v_loss', v_losses.avg)
            writer.add_scalar('epoch_loss', total_losses.avg)

            epoch_bar.set_postfix(
                avg_lpi=pi_losses.avg,
                avg_lv=v_losses.avg,
                avg_l=total_losses.avg
            )
            epoch_bar.update()

        epoch_bar.close()
        self.train_iteration += 1
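Exemplo n.º 24 relies on an `AverageMeter` helper that is not shown on this page. The sketch below is a minimal, assumed version consistent with the calls above (`update(value, n)` and reading `.avg`); the project's own implementation may differ:

class AverageMeter:
    """Running sum/average of a scalar (hypothetical minimal version)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `val` is the per-batch value and `n` the batch size, as used in Exemplo n.º 24
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count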