Example #1
import random
from distutils.util import strtobool

import mlflow
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as T
from efficientnet_pytorch import EfficientNet
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm

# Project-local helpers (module paths assumed -- adjust to this repository's layout):
# from optuna_mlflow import OptunaMlFlow
# from evaluator import Evaluator

class Trainer(OptunaMlFlow):
    def __init__(self, args, search_space):
        super().__init__(args, search_space)
        self.args = args
        self.best_pred = 0.0

        # Seed the random module for reproducibility
        random.seed(self.args.seed)

        # Seed numpy for reproducibility
        np.random.seed(self.args.seed)

        # Seed PyTorch and make cuDNN deterministic for reproducibility
        torch.manual_seed(self.args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # GradScaler performs the loss scaling needed for mixed-precision (AMP) training
        self.scaler = GradScaler()

        self.evaluator = Evaluator(self.args.nclass)

    def save_checkpoint(self, state, filename='checkpoint.pth.tar'):
        """Saves checkpoint to disk"""
        torch.save(state, filename)  # pass _use_new_zipfile_serialization=False to support PyTorch < 1.6 loaders
        mlflow.log_artifact(filename)

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
#            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            # Mixed-precision (AMP) forward pass: autocast runs eligible ops in fp16
            with autocast():
                output = self.model(image)
                loss = self.criterion(output, target)
            # Scale the loss before backward to avoid fp16 gradient underflow
            self.scaler.scale(loss).backward()
            # needed for horovod+amp
#            self.optimizer.synchronize()
#            with self.optimizer.skip_synchronize():
            # step() unscales gradients before applying them; update() adjusts the scale factor
            self.scaler.step(self.optimizer)
            self.scaler.update()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))

            mlflow.log_metric('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Cap the number of batches per epoch (useful for quick hyperparameter trials)
            if i > self.args.proc_batch_count:
                break

        mlflow.log_metric('train/total_loss_epoch', train_loss, epoch)
#        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            self.save_checkpoint(self.model.state_dict())

    def validating(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with autocast():
                with torch.no_grad():
                    output = self.model(image)
            output = output.float()  # cast the autocast (fp16) output back to fp32 before the loss
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
#            pred = output.data.cpu().detach().numpy()
            pred = torch.argmax(output.data, dim=1).cpu().detach().numpy()
            target = target.cpu().detach().numpy()
#            target = nn.functional.one_hot(target, num_classes=10).cpu().detach().numpy().astype(np.float32)

            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        
            # Cap the number of validation batches (mirrors the training cap)
            if i > self.args.proc_batch_count:
                break

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        """
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        """

        print('Validation:')
#        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        
        new_pred = Acc
        if new_pred > self.best_pred:
            self.best_pred = new_pred
            self.save_checkpoint(self.model.state_dict())

#            print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))

        mlflow.log_metric('val/total_loss_epoch', test_loss, epoch)
        mlflow.log_metric('val/best_Acc', self.best_pred, epoch)

        mlflow.log_metric('val/Acc', Acc, epoch)
        """
        mlflow.log_metric('val/Acc_class', Acc_class, epoch)
        """

        return self.best_pred

    # Evaluate a single combination of hyperparameters here
    def trial_process(self,
                      trial,
                      optimizer,
                      learning_rate,
                      horizontal_flip,
                      horizontal_shift_ratio,
                      vertical_shift_ratio,
                      random_erasing):
        self.best_pred = 0.0
        self.start_run(trial)

        # Log per-trial information to mlflow
        self.log_trial(trial)

        self.model = EfficientNet.from_pretrained(self.args.backbone)

        # Unfreeze model weights
        for param in self.model.parameters():
            param.requires_grad = True

        # Replace the classifier head to match the number of target classes
        num_ftrs = self.model._fc.in_features
        self.model._fc = nn.Linear(num_ftrs, self.args.nclass)

        if self.args.smry_viz:
            # One-off model summary and graph rendering, then exit
            from torchinfo import summary
            from torchviz import make_dot
            dummy_image = torch.zeros((2, 3, 32, 32))
            dummy_output = self.model(dummy_image)
            make_dot(dummy_output, params=dict(self.model.named_parameters())).render("torchviz", format="png")
            summary(self.model, (1, 3, 32, 32))
            import sys
            sys.exit()

        if self.args.cuda:
            self.model = self.model.to('cuda')

        if optimizer == 'SGD':
            self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)
        elif optimizer == 'Adam':
            self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

        pipeline = [
            T.ToTensor(),
            T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]

        if strtobool(horizontal_flip) == 1:
            pipeline.append(T.RandomHorizontalFlip(p=0.5))

        pipeline.append(T.RandomAffine(0, translate=(horizontal_shift_ratio, vertical_shift_ratio)))

        if strtobool(random_erasing) == 1:
            pipeline.append(T.RandomErasing())
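
        # Note: RandomErasing operates on tensors, and with torchvision >= 0.8 the
        # flip/affine transforms do too, so they can safely follow ToTensor() here.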

        transform = T.Compose(pipeline)
                
        train_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                 download=True, transform=transform)
        self.train_loader = torch.utils.data.DataLoader(train_set, batch_size=self.args.batch_size,
                                                        shuffle=True, num_workers=self.args.workers)

        val_set = torchvision.datasets.CIFAR10(root='./data', train=False,
                                               download=True, transform=transform)
        self.val_loader = torch.utils.data.DataLoader(val_set, batch_size=self.args.batch_size,
                                                      shuffle=False, num_workers=self.args.workers)

        self.criterion = nn.CrossEntropyLoss()

        best_score = 0.0  # guard: stays 0.0 if validation never runs (e.g. --no_val)
        for epoch in range(self.args.start_epoch, self.args.epochs):
            self.training(epoch)
            if not self.args.no_val and epoch % self.args.eval_interval == (self.args.eval_interval - 1):
                best_score = self.validating(epoch)

        self.end_run()

        # Optuna minimizes the objective, so return 1.0 - best accuracy
        return 1.0 - best_score

    # Objective function for random and TPE search
    def objective_no_grid(self, trial):
        '''
        # Categorical parameter
        optimizer = trial.suggest_categorical('optimizer', self.args.optimizer)

        # Int parameter
        num_layers = trial.suggest_int('num_layers', self.args.num_layers[0], self.args.num_layers[1])

        # Uniform parameter
        dropout_rate = trial.suggest_uniform('dropout_rate', self.args.dropout_rate[0], self.args.dropout_rate[1])

        # Loguniform parameter
        learning_rate = trial.suggest_loguniform('learning_rate', self.args.learning_rate[0], self.args.learning_rate[1])

        # Discrete-uniform parameter
        drop_path_rate = trial.suggest_discrete_uniform('drop_path_rate', self.args.drop_path_rate[0], self.args.drop_path_rate[1], self.args.drop_path_rate[2])
        '''

        # Int parameter
#        num_layers = trial.suggest_int('num_layers', self.args.num_layers[0], self.args.num_layers[1])

        optimizer = trial.suggest_categorical('optimizer', self.args.optimizer)

        learning_rate = trial.suggest_loguniform('learning_rate', self.args.learning_rate[0], self.args.learning_rate[1])

        horizontal_flip = trial.suggest_categorical('horizontal_flip', self.args.horizontal_flip)

        horizontal_shift_ratio = trial.suggest_uniform('horizontal_shift_ratio', self.args.horizontal_shift_ratio[0], self.args.horizontal_shift_ratio[1])

        vertical_shift_ratio = trial.suggest_uniform('vertical_shift_ratio', self.args.vertical_shift_ratio[0], self.args.vertical_shift_ratio[1])

        random_erasing = trial.suggest_categorical('random_erasing', self.args.random_erasing)

        # Evaluate this combination of hyperparameters
        result = self.trial_process(trial,
                                    optimizer,
                                    learning_rate,
                                    horizontal_flip,
                                    horizontal_shift_ratio,
                                    vertical_shift_ratio,
                                    random_erasing)

        return result

    # Objective function for fixed parameters and grid search
    def objective_grid(self, trial):
        '''
        As a rule, parameters are specified with trial.suggest_categorical().
        '''

        optimizer = trial.suggest_categorical('optimizer', self.args.optimizer)

        learning_rate = trial.suggest_categorical('learning_rate', self.args.learning_rate)

        horizontal_flip = trial.suggest_categorical('horizontal_flip', self.args.horizontal_flip)

        horizontal_shift_ratio = trial.suggest_categorical('horizontal_shift_ratio', self.args.horizontal_shift_ratio)

        vertical_shift_ratio = trial.suggest_categorical('vertical_shift_ratio', self.args.vertical_shift_ratio)

        random_erasing = trial.suggest_categorical('random_erasing', self.args.random_erasing)

        # Evaluate this combination of hyperparameters
        result = self.trial_process(trial,
                                    optimizer,
                                    learning_rate,
                                    horizontal_flip,
                                    horizontal_shift_ratio,
                                    vertical_shift_ratio,
                                    random_erasing)

        return result
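
# A minimal usage sketch (assumed entry point; `args` and `search_space`
# construction are project-specific and not shown in this example):
#
#   import optuna
#   trainer = Trainer(args, search_space)
#   study = optuna.create_study(direction='minimize')  # objectives return 1.0 - best accuracy
#   study.optimize(trainer.objective_no_grid, n_trials=20)
#   print(study.best_params)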

Example #2
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Project-local helpers (module paths assumed -- adjust to this repository's layout):
# from saver import Saver
# from summaries import TensorboardSummary
# from dataset import DLDataset
# from deeplab import Deeplab
# from evaluator import Evaluator


class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(args.logdir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        dltrain = DLDataset('trainval', "./data/pascal_voc_seg/tfrecord/")
        dlval = DLDataset('val', "./data/pascal_voc_seg/tfrecord/")
        # dltrain = DLDataset('trainval', "./data/pascal_voc_seg/VOCdevkit/VOC2012/")
        # dlval = DLDataset('val', "./data/pascal_voc_seg/VOCdevkit/VOC2012/")
        self.train_loader = DataLoader(dltrain,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.workers,
                                       pin_memory=True)
        self.val_loader = DataLoader(dlval,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True)

        # Define network
        model = Deeplab()

        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Define Criterion
        # whether to use class balanced weights
        self.criterion = nn.CrossEntropyLoss(ignore_index=255).cuda()
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(21)
        # Define lr scheduler
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer)

        # Using cuda
        # if args.cuda:
        #     self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # DataParallel is commented out above, so load directly into the model;
            # restore the self.model.module branch if DataParallel is re-enabled
            self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, (image, target) in enumerate(tbar):
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()

            self.optimizer.zero_grad()

            output = self.model(image)
            loss = self.criterion(output, target.long())
            loss.backward()
            self.optimizer.step()

            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            # if i % (num_img_tr // 10) == 0:
            if i % 10 == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        # ReduceLROnPlateau lowers the LR when the monitored loss stops improving
        self.scheduler.step(train_loss)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    # DataParallel is disabled, so take state_dict directly from the model
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, (image, target) in enumerate(tbar):

            if self.args.cuda:
                image, target = image.cuda(), target.cuda()

            with torch.no_grad():
                output = self.model(image)

            loss = self.criterion(output, target.long())
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
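
# A minimal usage sketch (assumed entry point; `args` construction is
# project-specific and not shown in this example):
#
#   trainer = Trainer(args)
#   for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
#       trainer.training(epoch)
#       if not trainer.args.no_val:
#           trainer.validation(epoch)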