def __init__(self, model, device, config, folder):
        """Prepare the Fitter: working directory, bookkeeping state, optimizer,
        warm-up scheduler and loss criterion.

        Args:
            model:  network to train.
            device: torch device the criterion (and presumably the model) lives on.
            config: experiment config; must provide ``lr``, ``SchedulerClass``
                    and ``scheduler_params``.
            folder: sub-folder name appended to the working directory.
        """
        self.config = config
        self.epoch = 0

        # Set up the working directory.
        self.base_dir = f'./model/seresnext_512/{folder}'
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)

        self.log_path = f'{self.base_dir}/log.txt'
        self.best_score = 0
        self.best_loss = 10**5
        self.best_ap = 0

        self.model = model
        self.device = device
        self.best_true = np.array([])
        self.best_pred = np.array([])

        # Split parameters into decayed / non-decayed groups: biases and
        # LayerNorm weights conventionally get no weight decay.
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        # Bug fix: the grouped parameters above were built but never handed to
        # the optimizer (plain self.model.parameters() was passed instead), so
        # the selective weight decay had no effect.
        self.optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config.lr)
        self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
        # NOTE(review): no after_scheduler is chained into the warm-up, so
        # self.scheduler must be stepped separately once warm-up ends --
        # confirm against the training loop.
        self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1, total_epoch=6)

        self.criterion = LabelSmoothing().to(self.device)
        self.log(f'Fitter prepared. Device is {self.device}')
예제 #2
0
def train2(net, train_loader, test_loader):
    """Linear-evaluation training: freeze ``net``, train a BYOL classification
    head on top of it for 100 epochs, and report test accuracy every 10 epochs.

    Args:
        net: frozen backbone (its parameters get ``requires_grad = False``).
        train_loader: yields (data, target) batches for head training.
        test_loader: yields (images, labels) batches for accuracy evaluation.
    """
    loss_fn = nn.CrossEntropyLoss()
    net2 = BYOL_Classification(net, 10)

    net2.eval()
    net2.cuda()
    # Freeze the backbone; only the classification head remains trainable.
    for pq in net.parameters():
        pq.requires_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  net2.parameters()),
                           lr=1e-3)
    from warmup_scheduler import GradualWarmupScheduler
    # 20 warm-up epochs, then cosine annealing over the remaining epochs.
    scheduler = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                             T_max=80))

    train_start = time.time()
    for epoch in range(1, 100 + 1):

        train_loss = 0
        net2.train()

        for idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            data = data.cuda()
            target = target.cuda()
            # net2 returns a tuple; index 1 holds the classification logits.
            data = net2(data)[1]
            loss = loss_fn(data, target)

            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        train_loss /= (idx + 1)
        scheduler.step()

        if epoch % 10 == 0:
            # Bug fix: the original called net.eval() (the frozen backbone),
            # leaving net2's classification head in train mode while measuring
            # accuracy.
            net2.eval()
            total = 0.0
            correct = 0.0
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for test_data in test_loader:
                    images, labels = test_data
                    images = images.cuda()
                    labels = labels.cuda()
                    outputs = net2(images)[1]
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

            print("Epoch\t", epoch, "\tTest accuracy\t", correct / total * 100)

    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
예제 #3
0
def train(args, config, loader, device):
    """Fully config-driven train/validation loop.

    Model, loss, optimizer and (optionally) LR scheduler classes are resolved
    by name from ``config``; after each epoch the model is checkpointed and
    metrics are appended to the run log.
    """
    logging.info('Start training...')
    # The model class is looked up by name on the project-level `net` module.
    model = getattr(net, config.model.name)(**config.model.args,
                                            **config.embedder)
    model = model.to(device)

    criterion = getattr(nn, config.loss.name)(**config.loss.args).to(device)
    optimizer = getattr(torch.optim,
                        config.optimizer.name)(model.parameters(),
                                               **config.optimizer.args)
    if hasattr(config, 'lr_scheduler'):
        if hasattr(config.lr_scheduler, 'name'):
            scheduler = getattr(torch.optim.lr_scheduler,
                                config.lr_scheduler.name)(
                                    optimizer, **config.lr_scheduler.args)
        else:
            scheduler = None
        # Optional warm-up phase wrapping the base scheduler (which may be None).
        if hasattr(config.lr_scheduler, 'warm_up'):
            scheduler_warm_up = GradualWarmupScheduler(
                optimizer,
                multiplier=config.lr_scheduler.warm_up.multiplier,
                total_epoch=config.lr_scheduler.warm_up.epoch,
                after_scheduler=scheduler)

    loss = Box({'train': 0.0, 'val': 0.0})
    metrics = Box({'train': [Accuracy()], 'val': [Accuracy()]})

    for epoch in range(config.train.n_epoch):
        # NOTE(review): the scheduler is stepped *before* the epoch runs;
        # current PyTorch convention steps after optimizer.step() -- confirm
        # this ordering is intentional.
        if hasattr(config, 'lr_scheduler'):
            if hasattr(config.lr_scheduler, 'warm_up'):
                scheduler_warm_up.step()
            else:
                scheduler.step()

        loss.train, metrics.train = run_epoch(
            model,
            optimizer,
            criterion,
            loader.train,
            train=True,
            metrics=metrics.train,
            max_norm=config.max_norm if hasattr(config, 'max_norm') else -1)
        loss.val, metrics.val = run_epoch(model,
                                          optimizer,
                                          criterion,
                                          loader.val,
                                          train=False,
                                          metrics=metrics.val)

        # Checkpoint every epoch and log both phases' metrics.
        saved_path = os.path.join(args.model_folder, 'checkpoints',
                                  f'epoch_{epoch}.pt')
        save_model(saved_path, epoch, model, optimizer)
        log_metrics(epoch, args.model_folder, loss, metrics)
예제 #4
0
def train(net, loader):
    """Self-supervised pre-training loop: 100 epochs of a two-view objective
    (``net(view1, view2)`` returns the loss directly), optimized with LARS-SGD
    behind a 20-epoch warm-up feeding into cosine annealing.
    """
    optimizer = SGD_with_lars(net.parameters(),
                              lr=0.1,
                              momentum=0.9,
                              weight_decay=1e-6)

    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                             T_max=180))

    train_start = time.time()

    for epoch in range(1, 100 + 1):
        # (removed leftover debug print('hi'))
        train_loss = 0
        net.train()
        epoch_start = time.time()
        for idx, (data, target) in enumerate(loader):
            optimizer.zero_grad()

            # Two augmented views of the same batch; the model computes the loss.
            dat1 = data[0].cuda()
            dat2 = data[1].cuda()
            loss = net(dat1, dat2)

            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        train_loss /= (idx + 1)
        scheduler.step()

        epoch_time = time.time() - epoch_start
        print(
            "Epoch\t",
            epoch,
            "\tLoss\t",
            train_loss,
            "\tTime\t",
            epoch_time,
        )

    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
예제 #5
0
def get_scheduler(args, optimizer):
    """Build an LR scheduler from parsed CLI arguments.

    Supported ``args.scheduler`` values: ``"warmup"`` (cosine annealing behind
    a gradual warm-up), ``"multistep"`` and ``"cosine"``.

    Raises:
        ValueError: for an unrecognized scheduler name (previously an unknown
            name fell through to ``return scheduler`` and raised a confusing
            NameError instead).
    """
    args = vars(args)

    if args['scheduler'] == "warmup":
        print(f'Using warmup scheduler with cosine annealing')
        print(
            f"warmup epochs : {args['warmup_epochs']} | total epochs {args['epochs']}"
        )
        print(f"lr_start : {args['lr']} ---> lr_end : {args['lr_end']}")

        scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args['epochs'], eta_min=args['lr_end'])
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=1,
                                           total_epoch=args['warmup_epochs'],
                                           after_scheduler=scheduler_cosine)

    elif args['scheduler'] == "multistep":
        print(
            f"Using multistep scheduler with gamma = {args['gamma']} and milestones = {args['milestones']}"
        )

        scheduler = MultiStepLR(optimizer,
                                milestones=args['milestones'],
                                gamma=args['gamma'])

    elif args['scheduler'] == "cosine":
        print(f"Using cosine annealing from {args['lr']} to {args['lr_end']}")
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args['epochs'], eta_min=args['lr_end'])

    else:
        # Fail loudly on an unknown name instead of NameError-ing below.
        raise ValueError(f"unknown scheduler: {args['scheduler']}")

    return scheduler
예제 #6
0
def get_scheduler(optimizer, args):
    """Create the base LR scheduler named by ``args.lr_scheduler`` and, when
    ``args.warmup_epoch > 0``, wrap it in a gradual warm-up phase.
    """
    name = args.lr_scheduler
    if name == 'CosineAnnealingLR':
        print('Use cosine scheduler')
        base = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    elif name == 'StepLR':
        print('Use step scheduler, step size: {}, gamma: {}'.format(
            args.step_size, args.gamma))
        base = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.step_size, gamma=args.gamma)
    elif name == 'MultiStepLR':
        print('Use MultiStepLR scheduler, milestones: {}, gamma: {}'.format(
            args.milestones, args.gamma))
        base = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=args.milestones, gamma=args.gamma)
    else:
        raise NotImplementedError

    # No warm-up requested: return the base schedule unchanged.
    if args.warmup_epoch <= 0:
        return base
    print('Use warmup scheduler')
    return GradualWarmupScheduler(optimizer,
                                  multiplier=args.warmup_multiplier,
                                  total_epoch=args.warmup_epoch,
                                  after_scheduler=base)
예제 #7
0
    def configure_optimizers(self):
        """Build the optimizer and LR scheduler selected by ``self.hparams``.

        Stores both on ``self`` (``self.opt`` / ``self.sched``) and returns
        only the optimizer, so the scheduler is presumably stepped manually
        elsewhere -- confirm against the training loop.
        """
        if self.hparams['opt'] == 'sgd':
            opt = torch.optim.SGD(self.parameters(),
                                  lr=self.hparams.lr,
                                  momentum=0.9,
                                  weight_decay=5e-4)
        elif self.hparams['opt'] == 'adam':
            opt = torch.optim.Adam(self.parameters(),
                                   lr=self.hparams.lr,
                                   weight_decay=5e-4)

        # Bug fix: the original compared against 'cyclic:' (stray trailing
        # colon), which made the CyclicLR branch unreachable.
        if self.hparams['sched'] == 'cyclic':
            scheduler = CyclicLR(optimizer=opt,
                                 base_lr=self.hparams.lr / 500,
                                 max_lr=self.hparams.lr / 10)
        elif self.hparams['sched'] == 'cosine_annealing_warm_restarts':
            scheduler = CosineAnnealingWarmRestarts(
                optimizer=opt,
                T_0=2000,
                eta_min=self.hparams.lr / 1000.0,
                T_mult=1,
            )
        elif self.hparams['sched'] == 'exp':
            scheduler_steplr = ExponentialLR(opt, gamma=0.95)
            scheduler = GradualWarmupScheduler(
                opt,
                multiplier=1,
                total_epoch=5,
                after_scheduler=scheduler_steplr)
        # NOTE(review): an unrecognized 'opt' or 'sched' value leaves opt /
        # scheduler unbound and raises UnboundLocalError below -- consider
        # explicit validation.

        self.sched = scheduler
        self.opt = opt
        return opt
예제 #8
0
    def configure_optimizers(self):
        """Lightning hook: Adam plus a cosine-annealing schedule, with a
        gradual warm-up wrapper, all configured from ``self.cfg['train']``.

        Returns:
            ([optimizer], [scheduler]) in the format Lightning expects.
        """
        self.optim = Adam(
            self.parameters(),
            lr=self.cfg["train"]["lr"],
            weight_decay=self.cfg["train"]["l2"],
        )
        self.sched = CosineAnnealingLR(self.optim,
                                       T_max=self.cfg["train"]["lr_restart"])
        # NOTE(review): self.warmup wraps self.sched but is never returned
        # below, so the warm-up phase appears unused -- confirm whether
        # [self.warmup] was intended in the return value.
        self.warmup = GradualWarmupScheduler(
            self.optim,
            multiplier=1,
            total_epoch=self.cfg["train"]["warmup"],
            after_scheduler=self.sched,
        )

        return [self.optim], [self.sched]
예제 #9
0
def get_optimizer_and_scheduler(args, model):
    """Build the optimizer/scheduler pair appropriate for ``args.dataset``.

    QM9 gets Adam with weight decay plus an exponentially decaying LR behind a
    one-epoch warm-up; every other dataset gets plain Adam and no scheduler.

    Args:
        args: program arguments (uses ``dataset``, ``lr`` and, for QM9, ``wd``).
        model: the model whose parameters will be optimized.

    Returns:
        (optimizer, scheduler) -- ``scheduler`` is ``None`` outside QM9.
    """
    if args.dataset != 'QM9':
        return optim.Adam(model.parameters(), lr=args.lr), None

    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.wd,
                           amsgrad=False)
    exp_decay = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                       gamma=0.9961697)
    warmup = GradualWarmupScheduler(optimizer,
                                    multiplier=1.0,
                                    total_epoch=1,
                                    after_scheduler=exp_decay)
    return optimizer, warmup
예제 #10
0
    def __init__(self, model, device, config):
        """Prepare the Fitter: working directory, optimizer, scheduler,
        optional gradual warm-up, and optional apex mixed precision -- all
        driven by ``config``.
        """
        self.config = config
        self.epoch = 0

        self.base_dir = f'./{config.folder}'
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)

        self.log_path = f'{self.base_dir}/log.txt'
        self.best_summary_loss = 10**5

        self.model = model
        self.device = device

        # Biases and LayerNorm weights conventionally get no weight decay.
        # NOTE(review): these groups are built but never passed to the
        # optimizer below (plain self.model.parameters() is used) -- confirm
        # whether the selective weight decay was meant to be applied.
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        self.optimizer = config.OptimizerClass(self.model.parameters(),
                                               lr=config.lr)
        self.scheduler = config.SchedulerClass(self.optimizer,
                                               **config.scheduler_params)
        # Optional 5-epoch gradual warm-up wrapping the base scheduler.
        if self.config.warmup:
            self.warmup_scheduler = GradualWarmupScheduler(
                self.optimizer,
                multiplier=1,
                total_epoch=5,
                after_scheduler=self.scheduler)

        self.log(f'Fitter prepared. Device is {self.device}')

        # NVIDIA apex mixed-precision initialization (O1 = conservative mixed).
        if self.config.apex:
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')
예제 #11
0
 def configure_optimizers(self):
     """Lightning hook: RAdam plus cosine annealing, entered through a
     one-epoch warm-up that scales the LR up by a factor of 10.
     """
     self.optimizer = RAdam(self.parameters(), lr=self.cfg.train.lr, weight_decay=2e-5)
     warmup_epo = 1
     warmup_factor = 10
     # The cosine phase covers the epochs remaining after warm-up.
     scheduler_cos = CosineAnnealingLR(self.optimizer, T_max=self.cfg.train.epoch - warmup_epo, eta_min=0)
     self.scheduler = GradualWarmupScheduler(self.optimizer, multiplier=warmup_factor,
                                             total_epoch=warmup_epo, after_scheduler=scheduler_cos)
     return [self.optimizer], [self.scheduler]
    def configure_optimizers(self):
        """Adam started at ``init_lr / warmup_factor``, warmed back up to
        ``init_lr`` over ``warmup_epo`` epochs, then cosine-annealed.

        NOTE(review): ``model``, ``init_lr``, ``warmup_factor``, ``n_epochs``
        and ``warmup_epo`` are free names resolved at module scope -- ``model``
        in particular looks like it should be ``self`` here; confirm.
        """
        optimizer = optim.Adam(model.parameters(), lr=init_lr / warmup_factor)

        scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs - warmup_epo)
        scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, total_epoch=warmup_epo,
                                           after_scheduler=scheduler_cosine)

        return [optimizer], [scheduler]
예제 #13
0
 def make_scheduler(optimizer, stage):
     """Instantiate the scheduler named by ``stage['scheduler']`` with
     ``stage['scheduler_params']``; names without an explicit branch are
     looked up on ``torch.optim.lr_scheduler``.
     """
     name = stage['scheduler']
     params = stage['scheduler_params']
     if name == 'OneCycleLR':
         return OneCycleLR(optimizer=optimizer, **params)
     if name == 'GradualWarmupScheduler':
         return GradualWarmupScheduler(optimizer=optimizer, **params)
     scheduler_cls = getattr(torch.optim.lr_scheduler, name)
     return scheduler_cls(optimizer=optimizer, **params)
예제 #14
0
def build_model(config, device, train=True):
    """Assemble network, loss and (for training) optimizer plus warm-up
    scheduler from ``config``.

    Returns:
        (net, loss_fn) when ``train`` is False, otherwise
        (net, loss_fn, optimizer, scheduler).

    Raises:
        ValueError: when any of the config selections is unrecognized.
    """
    # --- network ---
    model_name = config['model']
    if model_name == 'default':
        net = model.Resnet50()
    elif model_name == 'fused':
        net = model_fused.Resnet50()
    elif model_name == 'quant':
        net = model_quant.Resnet50()
    elif model_name == 'tf':
        net = model_tf.Resnet50()
    elif model_name == 'tf_fused':
        net = model_tf_fused.Resnet50()
    else:
        raise ValueError('cannot load model, check config file')

    # --- loss ---
    if config['loss'] != 'cross_entropy':
        raise ValueError('cannot load loss, check config file')
    loss_fn = nn.CrossEntropyLoss()

    net = net.to(device)
    loss_fn = loss_fn.to(device)

    if not train:
        return net, loss_fn

    # --- optimizer (only parameters that still require grad) ---
    trainable = filter(lambda p: p.requires_grad, net.parameters())
    if config['optimizer'] == 'sgd':
        optimizer = torch.optim.SGD(trainable,
                                    lr=config['learning_rate'],
                                    momentum=0.9,
                                    weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(trainable,
                                     lr=config['learning_rate'],
                                     weight_decay=config['weight_decay'])
    else:
        raise ValueError('cannot load optimizer, check config file')

    # --- base LR schedule, then wrap it with gradual warm-up ---
    if config['scheduler'] == 'cosine':
        base_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config['t_max'])
    elif config['scheduler'] == 'step':
        base_schedule = torch.optim.lr_scheduler.StepLR(
            optimizer,
            step_size=config['lr_decay_every'],
            gamma=config["lr_decay"])
    else:
        raise ValueError('cannot load scheduler, check config file')
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=config['lr_multiplier'],
                                       total_epoch=config['lr_epoch'],
                                       after_scheduler=base_schedule)

    return net, loss_fn, optimizer, scheduler
예제 #15
0
def get_scheduler(optimizer, name):
    """Return the LR scheduler registered under ``name``.

    "cosine": cosine annealing with warm restarts (T_0=100, T_mult=2).
    "gradual_warmup": an 800-epoch warm-up feeding into exponential decay.

    Raises:
        ValueError: for any other name.
    """
    if name == "cosine":
        return CosineAnnealingWarmRestarts(optimizer, 100, 2)
    if name == "gradual_warmup":
        exp_decay = ExponentialLR(optimizer, gamma=0.996)
        return GradualWarmupScheduler(optimizer,
                                      multiplier=1,
                                      total_epoch=800,
                                      after_scheduler=exp_decay)
    raise ValueError("incorrect scheduler name: %s" % name)
예제 #16
0
    def _setup_model(self):
        """Build the two-headed model (main + auxiliary), losses, SGD
        optimizer and a cosine LR schedule wrapped in a gradual warm-up,
        reading all hyperparameters from ``self.model_kwargs``.
        """
        num_classes = 2
        num_aux_classes = self.train_dataloader.dataset.num_auxiliary_classes
        freeze_backbone = self.model_kwargs.get('freeze_backbone', False)
        self.model_kwargs['num_aux_classes'] = num_aux_classes
        self.model = Model(num_main_classes=num_classes,
                           num_aux_classes=num_aux_classes,
                           freeze_backbone=freeze_backbone)
        if self.model_kwargs.get('aux_labels_type', None) == "imagenet":
            # Initialize auxiliary head to imagenet fc
            self.model.auxiliary_head.weight = self.model.backbone.fc.weight
            self.model.auxiliary_head.bias = self.model.backbone.fc.bias
        if self.use_cuda:
            self.model = self.model.cuda()
        self.model = nn.DataParallel(self.model)
        self.main_loss = nn.CrossEntropyLoss()
        self.auxiliary_loss = nn.CrossEntropyLoss()
        self.start_epoch = 0
        self.end_epoch = self.model_kwargs.get('epochs_to_run', 1)
        self.current_epoch = 0
        self.global_train_batch_idx = 0
        self.global_val_batch_idx = 0

        lr = float(self.model_kwargs.get('initial_lr', 0.01))
        endlr = float(self.model_kwargs.get('endlr', 0.0))
        optim_params = dict(
            lr=lr,
            momentum=float(self.model_kwargs.get('momentum', 0.9)),
            weight_decay=float(self.model_kwargs.get('weight_decay', 0.0001)),
        )
        self.optimizer = optim.SGD(self.model.parameters(), **optim_params)
        max_epochs = int(self.model_kwargs.get('max_epochs', 90))
        warmup_epochs = int(self.model_kwargs.get('warmup_epochs', 0))
        # The cosine phase covers only the epochs remaining after warm-up.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                               max_epochs -
                                                               warmup_epochs,
                                                               eta_min=endlr)
        # Bug fix: GradualWarmupScheduler takes `total_epoch=`, not
        # `warmup_epochs=` (the original keyword would raise a TypeError with
        # the standard warmup_scheduler package) -- confirm a custom fork with
        # the other signature was not intended.
        self.optimizer_scheduler = GradualWarmupScheduler(
            optimizer=self.optimizer,
            multiplier=1.0,
            total_epoch=warmup_epochs,
            after_scheduler=scheduler)
예제 #17
0
def create_lr_scheduler(
        conf_lrs: Config, epochs: int, optimizer: Optimizer,
        steps_per_epoch: Optional[int]) -> Tuple[Optional[_LRScheduler], bool]:
    """Create the LR scheduler described by ``conf_lrs``.

    Returns:
        (scheduler, epoch_or_step): the scheduler (possibly wrapped in a
        gradual warm-up, possibly None) and a flag that is True when it should
        be stepped per epoch, False when per step.
    """
    # epoch_or_step - apply every epoch or every step
    scheduler, epoch_or_step = None, True

    if conf_lrs is not None:
        lr_scheduler_type = conf_lrs['type']  # TODO: default should be none?
        if lr_scheduler_type == 'cosine':
            # adjust max epochs for warmup
            # TODO: shouldn't we be increasing epochs or schedule lr only after warmup?
            if conf_lrs.get('warmup', None):
                epochs -= conf_lrs['warmup']['epochs']
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=epochs, eta_min=conf_lrs['min_lr'])
        elif lr_scheduler_type == 'resnet':
            scheduler = _adjust_learning_rate_resnet(optimizer, epochs)
        elif lr_scheduler_type == 'pyramid':
            scheduler = _adjust_learning_rate_pyramid(optimizer, epochs,
                                                      get_optim_lr(optimizer))
        elif lr_scheduler_type == 'step':
            decay_period = conf_lrs['decay_period']
            gamma = conf_lrs['gamma']
            scheduler = lr_scheduler.StepLR(optimizer,
                                            decay_period,
                                            gamma=gamma)
        elif lr_scheduler_type == 'one_cycle':
            assert steps_per_epoch is not None
            ensure_pytorch_ver('1.3.0',
                               'LR scheduler OneCycleLR is not available.')
            max_lr = conf_lrs['max_lr']
            # OneCycleLR is stepped per batch, not per epoch.
            epoch_or_step = False
            scheduler = lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=max_lr,
                epochs=epochs,
                steps_per_epoch=steps_per_epoch,
            )  # TODO: other params
        elif not lr_scheduler_type:
            scheduler = None  # TODO: check support for this or use StepLR
        else:
            # Typo fix: the message previously read 'lr_schduler'.
            raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

        # select warmup for LR schedule
        # NOTE(review): warm-up also wraps OneCycleLR, which is per-step --
        # confirm GradualWarmupScheduler handles that combination correctly.
        if conf_lrs.get('warmup', None):
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=conf_lrs['warmup']['multiplier'],
                total_epoch=conf_lrs['warmup']['epochs'],
                after_scheduler=scheduler)

    return scheduler, epoch_or_step
def get_scheduler(optimizer, opt):
    """Return a learning rate scheduler

    Parameters:
        optimizer          -- the optimizer of the network
        opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.
                              opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine | warmup

    For 'linear', we keep the same learning rate for the first <opt.niter> epochs
    and linearly decay the rate to zero over the next <opt.niter_decay> epochs.
    For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers.
    See https://pytorch.org/docs/stable/optim.html for more details.

    Raises:
        NotImplementedError: for an unknown opt.lr_policy.
    """
    if opt.lr_policy == 'linear':

        def lambda_rule(epoch):
            # 1.0 for the first opt.niter epochs, then linear decay to zero
            # over the following opt.niter_decay epochs.
            lr_l = 1.0 - max(0, epoch + opt.epoch_count -
                             opt.niter) / float(opt.niter_decay + 1)
            return lr_l

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=opt.lr_decay_iters,
                                        gamma=0.1)
    elif opt.lr_policy == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                   mode='min',
                                                   factor=0.2,
                                                   threshold=0.01,
                                                   patience=5)
    elif opt.lr_policy == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=opt.niter,
                                                   eta_min=0)
    elif opt.lr_policy == 'warmup':
        scheduler_cosine = lr_scheduler.CosineAnnealingLR(optimizer,
                                                          T_max=opt.niter,
                                                          eta_min=0)
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=10,
                                           total_epoch=10,
                                           after_scheduler=scheduler_cosine)
    else:
        # Bug fix: this used to `return NotImplementedError(...)` -- the
        # exception was built (with printf-style args that were never
        # interpolated) and handed back instead of raised.
        raise NotImplementedError(
            'learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler
예제 #19
0
    def configure_optimizers(self):
        """Lightning hook: resolve optimizer/scheduler classes from the config
        and wrap the scheduler in a hard-coded one-epoch, x10 warm-up.

        Returns:
            [optimizer] alone when no scheduler class is configured,
            otherwise ([optimizer], [scheduler]).
        """
        optimizer_cls, scheduler_cls = get_optimizer(self.cfg)

        conf_optim = self.cfg.Optimizer
        optimizer = optimizer_cls(self.parameters(), **conf_optim.optimizer.params)
        if scheduler_cls is None:
            return [optimizer]
        else:
            scheduler_default = scheduler_cls(
                optimizer, **conf_optim.lr_scheduler.params
            )
            # Warm-up settings are hard-coded: LR scaled up 10x over 1 epoch.
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=10,
                total_epoch=1,
                after_scheduler=scheduler_default,
            )
        return [optimizer], [scheduler]
예제 #20
0
def fit(model, data, optimizer, scheduler, loss, augmentation, parameters):
    """Train ``model`` on ``data`` using config-dict-described components.

    Args:
        model: the network to train (switched to train mode).
        data: iterable of (inputs, labels) batches.
        optimizer: dict with a 'name' key (a class in ``torch.optim``) plus
            that class's keyword arguments.
        scheduler: dict with a 'name' key (a class in
            ``torch.optim.lr_scheduler``), its keyword arguments, and an
            optional 'warmup' fraction of the total epochs.
        loss: dict with a 'name' key (a class in ``torch.nn``) plus kwargs.
        augmentation: unused here; kept for interface compatibility.
        parameters: dict with 'epoch' = number of epochs to run.

    Returns:
        The trained model.
    """
    model = model.train()

    # Bug fix: the config dicts carry their own 'name' (and 'warmup') keys,
    # which must not be forwarded as constructor kwargs.
    def _kwargs(cfg, extra=()):
        return {k: v for k, v in cfg.items() if k != 'name' and k not in extra}

    optimizer_fun = getattr(optim, optimizer['name'])(model.parameters(),
                                                      **_kwargs(optimizer))
    # Bug fix: LR schedulers live in optim.lr_scheduler (not optim) and must
    # wrap the optimizer *instance*, not the config dict.
    scheduler_fun = getattr(optim.lr_scheduler,
                            scheduler['name'])(optimizer_fun,
                                               **_kwargs(scheduler,
                                                         ('warmup',)))
    loss_fun = getattr(nn, loss['name'])(**_kwargs(loss))

    if scheduler.get("warmup", None) is not None:
        # Start at 1% of the configured LR and warm up (x100) over the first
        # fraction of the run, then hand off to the base schedule.
        nb_epoch_warmup = int(parameters['epoch'] * scheduler["warmup"])
        optimizer_fun.defaults['lr'] *= 0.01
        scheduler_fun = GradualWarmupScheduler(optimizer_fun,
                                               multiplier=100,
                                               total_epoch=nb_epoch_warmup,
                                               after_scheduler=scheduler_fun)

    for ep in range(
            parameters['epoch']):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, d in enumerate(data):
            # get the inputs; d is a list of [inputs, labels]
            inputs, labels = d

            # zero the parameter gradients (bug fix: use the optimizer
            # instance, not the config dict)
            optimizer_fun.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss_measure = loss_fun(outputs, labels)
            loss_measure.backward()
            optimizer_fun.step()

            # print statistics
            running_loss += loss_measure.item()
            if i % 10 == 0:  # print every 10 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (ep + 1, i + 1, running_loss / 10))
                running_loss = 0.0
        # Bug fix: step the LR schedule instance (not the config dict) once
        # per epoch instead of once per batch.
        scheduler_fun.step()
    # Bug fix: moved out of the epoch loop so it prints exactly once.
    print('Finished Training')
    return model
예제 #21
0
파일: utils.py 프로젝트: apchenstu/mvsnerf
def get_scheduler(hparams, optimizer):
    """Base LR schedule ('steplr' or 'cosine') per ``hparams.lr_scheduler``,
    optionally wrapped in a warm-up phase (skipped for radam/ranger, which
    handle warm-up internally).
    """
    eps = 1e-8
    name = hparams.lr_scheduler
    if name == 'steplr':
        schedule = MultiStepLR(optimizer,
                               milestones=hparams.decay_step,
                               gamma=hparams.decay_gamma)
    elif name == 'cosine':
        schedule = CosineAnnealingLR(optimizer,
                                     T_max=hparams.num_epochs,
                                     eta_min=eps)
    else:
        raise ValueError('scheduler not recognized!')

    needs_warmup = (hparams.warmup_epochs > 0
                    and hparams.optimizer not in ['radam', 'ranger'])
    if needs_warmup:
        schedule = GradualWarmupScheduler(
            optimizer,
            multiplier=hparams.warmup_multiplier,
            total_epoch=hparams.warmup_epochs,
            after_scheduler=schedule)
    return schedule
예제 #22
0
def main():
    """Evaluate a BERT masked-LM on the noun-blank dataset.

    Passing ``c`` as the first CLI argument loads the locally fine-tuned
    checkpoint from PATH instead of the stock ``bert-base-uncased`` weights.
    """
    lr = 0.00001
    PATH = '/home/ruoyaow/imageqa-qgen/evaluation'

    if len(sys.argv) > 1 and sys.argv[1] == 'c':
        pretrained = PATH
    else:
        pretrained = 'bert-base-uncased'

    model = BertForMaskedLM.from_pretrained(pretrained,
                                            output_hidden_states=True,
                                            output_attentions=False)
    if GPU:
        model = model.cuda()

    # Word dictionary consumed by the eval() helper below.
    with open('nouns_unbalance.pkl', 'rb') as f:
        word_dict = pickle.load(f)

    max_epoch = 10
    batch_size = 32

    # NOTE(review): the optimizer and schedulers below are built but never
    # stepped in this function -- only eval() is called; confirm whether a
    # training loop was removed or lives elsewhere.
    optimizer = AdamW(model.parameters(), lr=lr)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, max_epoch)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=5,
                                              total_epoch=max_epoch,
                                              after_scheduler=scheduler_cosine)

    train_data = 'noun_blank_unbalance.txt'
    evaluation, trainld, testld = loadData(train_data, batch_size)

    # eval() here is the project's evaluation helper (it shadows the builtin).
    eval(testld, model, tokenizer, word_dict)
    eval(evaluation, model, tokenizer, word_dict)
예제 #23
0
    dataset_valid,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_valid),
    num_workers=num_workers)

# --- script-level model / loss / optimizer setup ---
model = enetv2(enet_type, out_dim=out_dim)
model = model.to(device)

criterion = LabelSmoothingLoss(out_dim, smoothing=0.1)
# criterion = nn.CrossEntropyLoss()
# criterion = MyBCELoss()
# Start at a reduced LR; the warm-up scales it back up to init_lr over
# warmup_epo epochs before handing off to cosine annealing.
optimizer = optim.Adam(model.parameters(), lr=init_lr / warmup_factor)
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, n_epochs - warmup_epo)
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=warmup_factor,
                                   total_epoch=warmup_epo,
                                   after_scheduler=scheduler_cosine)

# optimizer = Radam.Over9000(model.parameters(), lr = init_lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

# NVIDIA apex mixed precision plus multi-GPU data parallelism.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = torch.nn.DataParallel(model,
                              device_ids=list(range(len(gpus.split(",")))))

qwk_max = 0.
for epoch in range(1, n_epochs + 1):
    printOut(time.ctime(), 'Epoch:', epoch)
    # Deprecated calling convention: stepping with an explicit epoch index.
    scheduler.step(epoch - 1)

    train_loss = train_epoch(train_loader, optimizer)
예제 #24
0
파일: val.py 프로젝트: linan2017/URFC-top4
def training(train_data_list, val_data_list, test_files, fold):
    """Train / validate / test one fold of the multi-modal FF3DNet model.

    Args:
        train_data_list: samples for the training split (fed to MultiModalDataset).
        val_data_list: unused here — validation data is re-read from val_csv below.
        test_files: samples for the test split.
        fold: fold index, used in checkpoint paths and prediction file names.
    """

    # Per-fold weights directory plus the shared best-models directory.
    os.makedirs(os.path.join(config.weights, config.model_name) + os.sep +
                str(fold),
                exist_ok=True)
    os.makedirs(config.best_models, exist_ok=True)
    ### ---------- get model ------------------------------------------
    model = FF3DNet(drop=0.5)
    ### ---------- set lr, opt, loss ------------------------------------------
    # Two parameter groups: the image encoder trains at 3x the base LR,
    # everything else at config.lr.
    img_params = list(map(id, model.img_encoder.parameters()))
    rest_params = filter(lambda p: id(p) not in img_params, model.parameters())
    params = [
        {
            'params': rest_params,
            'lr': config.lr
        },
        {
            'params': model.img_encoder.parameters(),
            'lr': config.lr * 3
        },
    ]
    optimizer = torch.optim.SGD(params, momentum=0.9, weight_decay=1e-4)
    # Cosine annealing over the last (epochs - 5) epochs, preceded by a
    # 5-epoch warmup that ramps the LR up by a factor of 10.
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=config.epochs - 5,
                                               eta_min=config.lr / 100)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=10,
                                              total_epoch=5,
                                              after_scheduler=scheduler)

    criterion = nn.CrossEntropyLoss().to(device)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    # Metric triples are [accuracy, loss, f1]; loss starts at +inf so any
    # real value improves on it.
    best_results = [0, np.inf, 0]
    val_metrics = [0, np.inf, 0]
    ### ---------- load dataset ------------------------------------------
    train_gen = MultiModalDataset(train_data_list,
                                  config.train_data,
                                  config.train_vis,
                                  mode="train")
    train_loader = DataLoader(train_gen,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=4)

    # val_data=getfiles("val")
    # val_data.sort()
    # NOTE(review): validation data comes from this hard-coded CSV, not from
    # the val_data_list argument — confirm this is intentional.
    val_csv = "/root/userfolder/linan/C/preliminary/val.csv"
    val_data = pd.read_csv(val_csv)
    val_gen = MultiModalDataset(val_data,
                                config.train_data,
                                config.train_vis,
                                augument=False,
                                mode="val")
    val_loader = DataLoader(val_gen,
                            512,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=4)
    test_gen = MultiModalDataset(test_files,
                                 config.test_data,
                                 config.test_vis,
                                 augument=False,
                                 mode="test")
    test_loader = DataLoader(test_gen,
                             512,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=4)

    # --- train, val, test -------------------------
    resume = False

    start = timer()
    print("multi fold val")
    #___________________________________________________________________________________________________________________
    # Debug/eval-only section: score the best-loss and best-acc checkpoints of
    # folds 1-3 on the validation loader, then abort (see the 0 / 0 below), so
    # the training/resume code further down never runs while this is active.
    for index in [1, 2, 3]:
        print(index)
        checkpoint_loss = torch.load(
            'checkpoints/best_models/0626_debug_fold_' + str(index) +
            '_model_best_loss.pth.tar')
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(val_loader, model, fold, checkpoint_loss, 'best_loss', False,
             index)

        checkpoint_acc = torch.load(
            'checkpoints/best_models/0626_debug_fold_' + str(index) +
            '_model_best_acc.pth.tar')
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(val_loader, model, fold, checkpoint_acc, 'best_acc', False, index)
        #test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc], 'ensemble', True)
    # Intentional ZeroDivisionError: hard stop after the multi-fold evaluation.
    0 / 0
    #___________________________________________________________________________________________________________________

    if resume:
        # Evaluate previously saved best-loss / best-acc checkpoints on the
        # test split and their ensemble, instead of training.
        checkpoint_loss = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_loss.pth.tar'
        )
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_acc.pth.tar'
        )
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold,
                               [checkpoint_loss, checkpoint_acc], 'ensemble',
                               True)
    else:
        ### ---------- train loop ----------------
        # NOTE(review): training starts at epoch 4 — presumably to resume an
        # interrupted schedule; confirm before reusing.
        for epoch in range(4, config.epochs):
            # Step the warmup wrapper once per epoch; the metrics argument is
            # only meaningful for a ReduceLROnPlateau-style after_scheduler —
            # with the cosine schedule here it presumably has no effect (verify
            # against the installed warmup_scheduler version).
            scheduler_warmup.step(metrics=val_metrics[0])
            for param_group in optimizer.param_groups:
                log.write(str(param_group['lr']) + '\n')
            train_metrics = train(train_loader, model, criterion, optimizer,
                                  epoch, val_metrics, best_results, start)
            # val_metrics_tta = evaluate(val_loader_tta,model,criterion,epoch,train_metrics,best_results,start)
            val_metrics = evaluate(val_loader, model, criterion, epoch,
                                   train_metrics, best_results, start)
            # Track best accuracy (higher), best loss (lower), best f1 (higher).
            is_best_acc = val_metrics[0] > best_results[0]
            best_results[0] = max(val_metrics[0], best_results[0])
            is_best_loss = val_metrics[1] < best_results[1]
            best_results[1] = min(val_metrics[1], best_results[1])
            is_best_f1 = val_metrics[2] > best_results[2]
            best_results[2] = max(val_metrics[2], best_results[2])
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "model_name": config.model_name,
                    "state_dict": model.state_dict(),
                    "best_acc": best_results[0],
                    "best_loss": best_results[1],
                    "optimizer": optimizer.state_dict(),
                    "fold": fold,
                    "best_f1": best_results[2],
                }, is_best_acc, is_best_loss, is_best_f1, fold)
            print('\r', end='', flush=True)
            print(val_metrics[0], val_metrics[1], val_metrics[2], "val")
            log.write(
                '%s  %5.1f %6.1f      |   %0.3f   %0.3f   %0.3f     |  %0.3f   %0.3f    %0.3f    |   %s  %s  %s | %s' % ( \
                    "best", epoch, epoch,
                    train_metrics[0], train_metrics[1], train_metrics[2],
                    val_metrics[0], val_metrics[1], val_metrics[2],
                    str(best_results[0])[:8], str(best_results[1])[:8], str(best_results[2])[:8],
                    time_to_str((timer() - start), 'min'))
                )
            log.write("\n")
            time.sleep(0.01)
        # log.write("\n----------------------------------------------- [START %s] %s\n\n" % (
        # datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 51))
        # log.write(
        #     '                           |------------ Train -------|----------- Valid ---------|----------Best Results---|------------|\n')
        # log.write(
        #     'mode     iter     epoch    |    acc  loss  f1_macro   |    acc  loss  f1_macro    |    acc  loss  f1_macro       | time       |\n')
        # log.write(
        #     '-------------------------------------------------------------------------------------------------------------------------|\n')

        ### ---------- per fold ensemble best loss ckpt and best acc ckpt
        checkpoint_loss = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar' %
            (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_acc.pth.tar' %
            (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold,
                               [checkpoint_loss, checkpoint_acc], 'ensemble',
                               not config.k_fold)

    ### ----------- last kfold ensemble all before k ensemble ckpts
    # When running k-fold CV, the final fold averages the per-fold ensemble
    # predictions into a single cross-validated prediction file.
    if config.k_fold and fold == config.num_kf:
        mean_npy = np.zeros([10000, 9])
        for i in range(1, config.num_kf + 1):
            checkpoint = torch.load(
                'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar' %
                (config.model_name, str(i)))
            loss_pred = np.load('preds_9/%s/%s_val_fold%s_%s.npy' %
                                (checkpoint["model_name"],
                                 checkpoint["model_name"], str(i), 'ensemble'))
            mean_npy += loss_pred
        mean_npy = mean_npy / config.num_kf
        np.save(
            'preds_9/%s/%s_val_fold%s_%s.npy' %
            (checkpoint["model_name"], checkpoint["model_name"], 'cv',
             'ensemble'), mean_npy)
        gen_txt(mean_npy, checkpoint, 'cv', 'ensemble')
예제 #25
0
elif args.opt == "sgd":
    optimizer = optim.SGD(net.parameters(), lr=args.lr)
if not args.cos:
    from torch.optim import lr_scheduler

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               'min',
                                               patience=3,
                                               verbose=True,
                                               min_lr=1e-3 * 1e-5,
                                               factor=0.1)
else:
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, args.n_epochs - 1)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=10,
                                       total_epoch=1,
                                       after_scheduler=scheduler_cosine)


##### Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
예제 #26
0
import torch

from warmup_scheduler import GradualWarmupScheduler


if __name__ == '__main__':
    # Smoke test: warm a dummy parameter's LR up by a factor of 8 over
    # 10 epochs and print the LR observed at each epoch.
    dummy = torch.zeros(10)
    sgd = torch.optim.SGD([dummy], lr=0.01)
    warmup = GradualWarmupScheduler(sgd, multiplier=8, total_epoch=10)

    epoch = 1
    while epoch < 20:
        warmup.step(epoch)
        print(epoch, sgd.param_groups[0]['lr'])
        epoch += 1
예제 #27
0
def train(name, df, VAL_FOLD=0, resume=False):
    """Train the SMP_SRM_UPP classifier on one fold and evaluate on the test split.

    Args:
        name: Human-readable run name; combined with a timestamp to form the
            wandb run name and checkpoint/weight file names.
        df: DataFrame with all samples and fold assignments (consumed by DATASET).
        VAL_FOLD: Fold index held out for validation.
        resume: When True, restore model/optimizer/scheduler state from a
            hard-coded checkpoint and continue from the saved epoch.

    Returns:
        The metrics dict produced by test() on the held-out test split.
    """
    dt_string = datetime.now().strftime("%d|%m_%H|%M|%S")
    print("Starting -->", dt_string)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)
    run = f"{name}_[{dt_string}]"

    wandb.init(project="imanip", config=config_defaults, name=run)
    config = wandb.config

    model = SMP_SRM_UPP(classifier_only=True)

    print("Parameters : ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Snapshot the model/dataset source files alongside the run.
    wandb.save('segmentation/smp_srm.py')
    wandb.save('dataset.py')

    train_imgaug, train_geo_aug = get_train_transforms()
    transforms_normalize = get_transforms_normalize()

    #region ########################-- CREATE DATASET and DATALOADER --########################
    train_dataset = DATASET(dataframe=df,
                            mode="train",
                            val_fold=VAL_FOLD,
                            test_fold=TEST_FOLD,
                            transforms_normalize=transforms_normalize,
                            imgaug_augment=train_imgaug,
                            geo_augment=train_geo_aug)
    train_loader = DataLoader(train_dataset,
                              batch_size=config.train_batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)

    valid_dataset = DATASET(
        dataframe=df,
        mode="val",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.valid_batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)

    test_dataset = DATASET(
        dataframe=df,
        mode="test",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    test_loader = DataLoader(test_dataset,
                             batch_size=config.valid_batch_size,
                             shuffle=True,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=False)
    #endregion ######################################################################################

    optimizer = get_optimizer(model, config.optimizer, config.learning_rate,
                              config.weight_decay)
    # BUG FIX: the original called CosineAnnealingLR(optimizer, T_0=35,
    # T_mult=2), but T_0/T_mult are CosineAnnealingWarmRestarts parameters —
    # CosineAnnealingLR only accepts T_max and raised TypeError here.
    after_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=35, T_mult=2)
    # Linear warmup for config.warmup epochs, then hand over to the cosine
    # warm-restart schedule.
    scheduler = GradualWarmupScheduler(optimizer=optimizer,
                                       multiplier=1,
                                       total_epoch=config.warmup + 1,
                                       after_scheduler=after_scheduler)

    criterion = nn.BCEWithLogitsLoss()
    es = EarlyStopping(patience=200, mode="min")

    model = nn.DataParallel(model).to(device)

    start_epoch = 0
    if resume:
        checkpoint = torch.load(
            'checkpoint/(using pretrain)COMBO_ALL_FULL_[09|04_12|46|35].pt')
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("-----------> Resuming <------------")

    for epoch in range(start_epoch, config.epochs):
        print(f"Epoch = {epoch}/{config.epochs-1}")
        print("------------------")

        train_metrics = train_epoch(model, train_loader, optimizer, scheduler,
                                    criterion, epoch)
        valid_metrics = valid_epoch(model, valid_loader, criterion, epoch)

        # NOTE(review): valid_loss is passed positionally; GradualWarmupScheduler
        # treats its first step() argument as the epoch unless the after_scheduler
        # is ReduceLROnPlateau — confirm this drives the schedule as intended.
        scheduler.step(valid_metrics['valid_loss'])

        print(
            f"TRAIN_ACC = {train_metrics['train_acc_05']}, TRAIN_LOSS = {train_metrics['train_loss']}"
        )
        print(
            f"VALID_ACC = {valid_metrics['valid_acc_05']}, VALID_LOSS = {valid_metrics['valid_loss']}"
        )
        print("Optimizer LR", optimizer.param_groups[0]['lr'])
        print("Scheduler LR", scheduler.get_lr()[0])
        wandb.log({
            'optim_lr': optimizer.param_groups[0]['lr'],
            'schedule_lr': scheduler.get_lr()[0]
        })

        # Early stopping tracks validation loss and saves the best weights.
        es(
            valid_metrics["valid_loss"],
            model,
            model_path=os.path.join(OUTPUT_DIR, f"{run}.h5"),
        )
        if es.early_stop:
            print("Early stopping")
            break

        # Full resume checkpoint (model + optimizer + scheduler), overwritten
        # every epoch.
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, os.path.join('checkpoint', f"{run}.pt"))

    # Reload the best early-stopping weights (if any were saved) before testing.
    if os.path.exists(os.path.join(OUTPUT_DIR, f"{run}.h5")):
        print(
            model.load_state_dict(
                torch.load(os.path.join(OUTPUT_DIR, f"{run}.h5"))))
        print("LOADED FOR TEST")

    test_metrics = test(model, test_loader, criterion)
    wandb.save(os.path.join(OUTPUT_DIR, f"{run}.h5"))

    return test_metrics
예제 #28
0
# Demo: MultiStepLR (decay x0.1 at milestone 4), optionally wrapped in a
# 5-epoch gradual warmup that multiplies the base LR by 5; plots the LR
# observed at each epoch.
base_lr = 0.001
optim = torch.optim.SGD([v], lr=base_lr)
# A scheduler built with an explicit last_epoch expects 'initial_lr' on the
# param group, so set it up front.
optim.param_groups[0]['initial_lr'] = base_lr

last_epoch = -1
scheduler = lr_scheduler.MultiStepLR(optim,
                                     milestones=[4],
                                     gamma=0.1,
                                     last_epoch=-1)

warmup = True
if warmup:
    scheduler = GradualWarmupScheduler(optim,
                                       multiplier=5,
                                       total_epoch=5,
                                       after_scheduler=scheduler)

history = []
epoch = last_epoch + 1
while epoch < 30:
    current = optim.param_groups[0]['lr']
    print(epoch, current)
    history.append(current)

    scheduler.step()
    epoch += 1

plt.plot(history)
plt.show()
예제 #29
0
def train(args, train_dataset, model):
    """Run the full training loop for the pitch-prediction transformer.

    Args:
        args: Namespace of hyperparameters and paths (batch sizes, epochs,
            warmup percent, fp16 flags, logging/save intervals, output dirs).
        train_dataset: dataset exposing a .pitch label array; wrapped in a
            DataLoader here.
        model: the model to train; optionally amp-initialized / DataParallel-wrapped.

    Returns:
        Tuple of (global_step, average training loss per step).
    """
    tb_writer = SummaryWriter(args.tb_writer_dir)
    result_writer = ResultWriter(args.eval_results_dir)

    if args.weighted_sampling == 1:
        # The three pitch types are unevenly distributed, so sample them at
        # equal rates.  (In practice this did not help, so weighted sampling
        # ended up unused.)
        ball_type, counts = np.unique(train_dataset.pitch, return_counts=True)
        count_dict = dict(zip(ball_type, counts))
        weights = [1.0 / count_dict[p] for p in train_dataset.pitch]
        sampler = WeightedRandomSampler(weights,
                                        len(train_dataset),
                                        replacement=True)
        logger.info("Do Weighted Sampling")
    else:
        sampler = RandomSampler(train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.train_batch_size,
                                  sampler=sampler)

    # Total optimization steps: either an explicit cap (which also rescales
    # the epoch count) or epochs * steps-per-epoch.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = len(train_dataloader) * args.num_train_epochs

    args.warmup_step = int(args.warmup_percent * t_total)

    # Prepare optimizer and schedule (linear warmup and decay)
    # Biases and layernorm weights are excluded from weight decay.
    no_decay = [
        "bias",
        "layernorm.weight",
    ]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = optim.Adam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           eps=args.adam_epsilon)
    # Cosine annealing over all steps, preceded by a gradual warmup when
    # warmup_step > 0 (multiplier 1 and warmup length passed positionally).
    if args.warmup_step != 0:
        scheduler_cosine = CosineAnnealingLR(optimizer, t_total)
        scheduler = GradualWarmupScheduler(optimizer,
                                           1,
                                           args.warmup_step,
                                           after_scheduler=scheduler_cosine)
    else:
        scheduler = CosineAnnealingLR(optimizer, t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # The model's scores are log_softmax-ed below, so NLLLoss here is
    # equivalent to cross-entropy on the raw scores.
    loss_fct = torch.nn.NLLLoss()

    # Train!
    logger.info("***** Running Baseball Transformer *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Warmup Steps = %d", args.warmup_step)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info("  Total train batch size = %d", args.train_batch_size)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    best_step = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss, logging_val_loss = 0.0, 0.0, 0.0

    best_pitch_micro_f1, best_pitch_macro_f1, = 0, 0
    best_loss = 1e10
    best_pitch_macro_f1 = 0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            # Unpack the batch and move every tensor to the target device.
            (
                pitcher,
                batter,
                state,
                pitch,
                label,
                pitch_memory,
                label_memory,
                memory_mask,
            ) = list(map(lambda x: x.to(args.device), batch))
            model.train()
            pitching_score, memories = model(
                pitcher,
                batter,
                state,
                pitch_memory,
                label_memory,
                memory_mask,
            )

            pitching_score = pitching_score.log_softmax(dim=-1)
            loss = loss_fct(pitching_score, pitch)

            if args.n_gpu > 1:
                # DataParallel returns one loss per GPU; reduce to a scalar.
                loss = loss.mean()

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()

            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                if args.evaluate_during_training:
                    results, f1_results, f1_log, cm = evaluate(
                        args, args.eval_data_file, model)
                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    print_result(output_eval_file, results, f1_log, cm)

                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value,
                                             global_step)
                    logging_val_loss = results["loss"]

                tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                     args.logging_steps, global_step)
                logging_loss = tr_loss

                # Select the best model by macro-F1 instead of loss (there is
                # a trade-off between the two metrics).
                # if best_loss > results["loss"]:
                # NOTE(review): `results` only exists when
                # args.evaluate_during_training is set — otherwise the next
                # line raises NameError; confirm the flag is always on here.
                if best_pitch_macro_f1 < results["pitch_macro_f1"]:
                    best_pitch_micro_f1 = results["pitch_micro_f1"]
                    best_pitch_macro_f1 = results["pitch_macro_f1"]
                    best_loss = results["loss"]
                    results["best_step"] = best_step = global_step

                    output_dir = os.path.join(args.output_dir, "best_model/")
                    os.makedirs(output_dir, exist_ok=True)
                    torch.save(model.state_dict(),
                               os.path.join(output_dir, "pytorch_model.bin"))
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving best model to %s", output_dir)

                    result_path = os.path.join(output_dir, "best_results.txt")
                    print_result(result_path,
                                 results,
                                 f1_log,
                                 cm,
                                 off_logger=True)

                    results.update(dict(f1_results))
                    result_writer.update(args, **results)

                logger.info("  best pitch micro f1 : %s", best_pitch_micro_f1)
                logger.info("  best pitch macro f1 : %s", best_pitch_macro_f1)
                logger.info("  best loss : %s", best_loss)
                logger.info("  best step : %s", best_step)

            if args.save_steps > 0 and global_step % args.save_steps == 0:
                checkpoint_prefix = "checkpoint"
                # Save model checkpoint
                output_dir = os.path.join(
                    args.output_dir, "{}-{}".format(checkpoint_prefix,
                                                    global_step))
                os.makedirs(output_dir, exist_ok=True)
                torch.save(model.state_dict(),
                           os.path.join(output_dir, "pytorch_model.bin"))
                torch.save(args, os.path.join(output_dir, "training_args.bin"))
                logger.info("Saving model checkpoint to %s", output_dir)

                # Drop older checkpoints beyond the configured retention limit.
                rotate_checkpoints(args, checkpoint_prefix)

                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s",
                            output_dir)

    tb_writer.close()

    return global_step, tr_loss / global_step
예제 #30
0
# Move the restoration model to the GPU; multiple GPUs are only announced
# here (presumably DataParallel wrapping happens elsewhere — confirm).
model_restoration.cuda()

device_ids = [i for i in range(torch.cuda.device_count())]
if torch.cuda.device_count() > 1:
  print("\n\nLet's use", torch.cuda.device_count(), "GPUs!\n\n")


new_lr = opt.OPTIM.LR_INITIAL

optimizer = optim.Adam(model_restoration.parameters(), lr=new_lr, betas=(0.9, 0.999),eps=1e-8, weight_decay=1e-8)

######### Scheduler ###########
# Cosine annealing over the post-warmup epochs, preceded by a 3-epoch gradual
# warmup (with multiplier=1 the warmup_scheduler package ramps the LR up to
# the base value); the immediate step() primes the schedule.
if warmup:
    warmup_epochs = 3
    scheduler_cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, opt.OPTIM.NUM_EPOCHS-warmup_epochs, eta_min=1e-6)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=warmup_epochs, after_scheduler=scheduler_cosine)
    scheduler.step()

######### Resume ###########
if opt.TRAINING.RESUME:
    path_chk_rest    = utils.get_last_path(model_dir, '_latest.pth')
    utils.load_checkpoint(model_restoration,path_chk_rest)
    start_epoch = utils.load_start_epoch(path_chk_rest) + 1
    utils.load_optim(optimizer, path_chk_rest)

    # Replay scheduler steps up to the resumed epoch so the LR matches where
    # training left off.
    for i in range(1, start_epoch):
        scheduler.step()
    new_lr = scheduler.get_lr()[0]
    print('------------------------------------------------------------------------------')
    print("==> Resuming Training with learning rate:", new_lr)
    print('------------------------------------------------------------------------------')