Example #1
def training_loop_lars(model,
                       criterion,
                       train_loader,
                       valid_loader,
                       epochs,
                       device,
                       print_every=1):
    '''
    Function defining the entire training loop
    '''

    # set objects for storing metrics
    train_losses = []
    valid_losses = []
    train_accuracy = []
    valid_accuracy = []

    # Train model
    for epoch in range(0, epochs):

        # training: the optimizer is re-created every epoch to implement the
        # lr schedule (linear warmup for 10 epochs, constant until epoch 15,
        # then exponential decay); note this also resets the momentum buffers
        if epoch < 10:
            optimizer = LARS(model.parameters(),
                             lr=0.1 * (epoch + 1) / 10,
                             momentum=0.9)
        elif epoch < 15:
            optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
        else:
            optimizer = LARS(model.parameters(),
                             lr=0.1 * (0.95**(epoch - 15)),
                             momentum=0.9)
        model, optimizer, train_loss = train(train_loader, model, criterion,
                                             optimizer, device)
        train_losses.append(train_loss)

        # validation
        with torch.no_grad():
            model, valid_loss = validate(valid_loader, model, criterion,
                                         device)
            valid_losses.append(valid_loss)

        if epoch % print_every == (print_every - 1):
            train_acc = get_accuracy(model, train_loader, device=device)
            valid_acc = get_accuracy(model, valid_loader, device=device)
            train_accuracy.append(float(train_acc))
            valid_accuracy.append(float(valid_acc))
            print(f'{datetime.now().time().replace(microsecond=0)} --- '
                  f'Epoch: {epoch}\t'
                  f'Train loss: {train_loss:.4f}\t'
                  f'Valid loss: {valid_loss:.4f}\t'
                  f'Train accuracy: {100 * train_acc:.2f}\t'
                  f'Valid accuracy: {100 * valid_acc:.2f}\n')

    #plot_losses(train_losses, valid_losses)

    return model, [train_losses, valid_losses, train_accuracy, valid_accuracy]
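A note on Example #1: it rebuilds the LARS optimizer every epoch just to change the learning rate, which also throws away the momentum buffers. A minimal sketch of the same warmup / constant / decay schedule driven by torch.optim.lr_scheduler.LambdaLR instead, assuming the LARS class above subclasses torch.optim.Optimizer:

import torch

def warmup_then_decay(epoch):
    # multiplier applied to the base lr of 0.1, mirroring the branches above
    if epoch < 10:
        return (epoch + 1) / 10        # linear warmup
    if epoch < 15:
        return 1.0                     # constant phase
    return 0.95 ** (epoch - 15)        # exponential decay

optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_then_decay)
# call scheduler.step() once per epoch after the training pass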
Example #2
File: utils.py  Project: wangxin0716/ssl_
def load_model(args):
    model = SimCLR(backbone=args.backbone,
                   projection_dim=args.projection_dim,
                   pretrained=args.pretrained,
                   normalize=args.normalize)

    if args.inference:
        model.load_state_dict(
            torch.load("SimCLR_{}_epoch90.pth".format(args.backbone)))

    model = model.to(args.device)

    scheduler = None
    if args.optimizer == "Adam":
        optimizer = Adam(model.parameters(), lr=3e-4)  # TODO: LARS
    elif args.optimizer == "LARS":
        # optimized using LARS with linear learning rate scaling
        # (i.e. LearningRate = 0.3 × BatchSize/256) and weight decay of 10^-6.
        learning_rate = 0.3 * args.batch_size / 256
        optimizer = LARS(
            model.parameters(),
            lr=learning_rate,
            weight_decay=args.weight_decay,
            exclude_from_weight_decay=["batch_normalization", "bias"],
        )

        # "decay the learning rate with the cosine decay schedule without restarts"
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               args.epochs,
                                                               eta_min=0,
                                                               last_epoch=-1)
    else:
        raise NotImplementedError

    return model, optimizer, scheduler
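As a quick sanity check of the linear scaling rule quoted in the comment above, the arithmetic for a few batch sizes (illustrative only):

for batch_size in (256, 1024, 4096):
    print(batch_size, 0.3 * batch_size / 256)   # -> 0.3, 1.2, 4.8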
Example #3
def build_byol_optimizer(hparams: AttrDict, model: nn.Module) -> Optimizer:
    """
    Build optimizer for BYOL self-supervised network, including backbone.
    """
    regular_parameters = []
    excluded_parameters = []
    for name, parameter in model.named_parameters():
        if parameter.requires_grad is False:
            continue
        if any(x in name for x in [".bn", ".bias"]):
            excluded_parameters.append(parameter)
        else:
            regular_parameters.append(parameter)
    param_groups = [
        {
            "params": regular_parameters,
            "use_lars": True
        },
        {
            "params": excluded_parameters,
            "use_lars": False,
            "weight_decay": 0,
        },
    ]
    return LARS(
        param_groups,
        lr=hparams.self_supervised.learning_rate.base,
        eta=hparams.self_supervised.lars_eta,
        momentum=hparams.self_supervised.momentum,
        weight_decay=hparams.self_supervised.weight_decay,
    )
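The parameter groups above switch LARS adaptation and weight decay off for bias and BatchNorm tensors. A minimal sketch of the per-layer trust ratio from the LARS paper (illustrative constants, not the exact optimizer used here) shows why: the ratio is proportional to the weight norm, which is tiny for biases and BN parameters.

import torch

def lars_trust_ratio(param, grad, eta=1e-3, weight_decay=1e-6, eps=1e-9):
    # local lr multiplier: eta * ||w|| / (||g|| + weight_decay * ||w||)
    w_norm = torch.norm(param)
    g_norm = torch.norm(grad)
    if w_norm == 0 or g_norm == 0:
        return 1.0
    return float(eta * w_norm / (g_norm + weight_decay * w_norm + eps))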
    def configure_optimizers(self):
        # keep only the parameters that require gradients (i.e. the weights being trained)
        params = filter(lambda p: p.requires_grad, self.model.parameters())

        if self.hparams.optimizer == "SGD":
            self.optimizer = torch.optim.SGD(params,
                                             self.hparams.lr,
                                             momentum=self.hparams.momentum,
                                             weight_decay=self.hparams.wd)
        elif self.hparams.optimizer == "LARS":
            self.optimizer = LARS(params,
                                  lr=self.hparams.lr,
                                  momentum=self.hparams.momentum,
                                  weight_decay=self.hparams.wd,
                                  max_epoch=self.hparams.epochs)

        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer,
                                                             self.hparams.lr,
                                                             epochs=self.hparams.epochs,
                                                             steps_per_epoch=1,
                                                             pct_start=self.hparams.pct_start)
        sched_dict = {'scheduler': self.scheduler}

        return [self.optimizer], [sched_dict]
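If configure_optimizers above is a PyTorch Lightning hook (an assumption; the enclosing class is not shown), the OneCycleLR built with steps_per_epoch=1 expects exactly one step per epoch, which can be stated explicitly in the scheduler dict:

sched_dict = {'scheduler': self.scheduler, 'interval': 'epoch', 'frequency': 1}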
Example #5
def do_training(args):
    trainloader, testloader = build_dataset(
        args.dataset,
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    model = build_model(args.arch, num_classes=num_classes(args.dataset))
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    num_chunks = max(1, args.batch_size // args.max_samples_per_gpu)

    optimizer = LARS(params=model.parameters(),
                     lr=args.lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay,
                     eta=args.eta,
                     max_epoch=args.epochs)

    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        train_loss, train_acc = train(trainloader,
                                      model,
                                      criterion,
                                      optimizer,
                                      epoch,
                                      args.cuda,
                                      num_chunks=num_chunks)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   args.cuda)
        track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f '
                    '| test loss %.3f | test acc %.3f' %
                    (epoch, train_loss, train_acc, test_loss, test_acc))
        # Save model
        model_fname = os.path.join(track.trial_dir(),
                                   "model{}.ckpt".format(epoch))
        torch.save(model, model_fname)
        if test_acc > best_acc:
            best_acc = test_acc
            best_fname = os.path.join(track.trial_dir(), "best.ckpt")
            track.debug("New best score! Saving model")
            torch.save(model, best_fname)
class ApexDistributeModel(object):
    def __init__(self, model, criterion, optimizer, args, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU id must be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # synchronization batch normal
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex
        self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
                                                         opt_level=self.opt_level)
        # apex parallel; only one DistributedDataParallel wrapper should be active,
        # so the duplicate torch DDP wrap below is left commented out
        self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # self.model = apex.parallel.DistributedDataParallel(self.model)
        # self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[self.gpu])
        return self.model, self.criterion, self.optimizer

    def lars(self):
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """
        you must run it after the 'convert' function.
        :param epoch:
        :param train_loader:
        :return:
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        self.model.train()

        train_log_file = 'train_log'

        end = time.time()
        epoch_batch = epoch * (len(train_loader.dataset) / self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            if train_index != -1 and i < train_index:
                continue
            adjust_learning_rate_epoch_poly(self.optimizer, self.args, burn_in=Darknet53.burn_in, power=Darknet53.power,
                                            batch_num=i + epoch_batch, max_batches=max_batches)
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)

            # compute output
            output = self.model(inputs)
            loss = self.criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            # loss.backward()
            time1 = time.time()
            with apex.amp.scale_loss(loss, self.optimizer) as scale_loss:
                scale_loss.backward()
            time2 = time.time()
            self.optimizer.step()
            time3 = time.time()
            print("step cost time: {}, backward cost time: {}".format(time3-time2, time2-time1))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       epoch, i, len(train_loader), batch_time=batch_time,
                       data_time=data_time, loss=losses, top1=top1, top5=top5))

            if i % 1000 == 0 and i:
                save_checkpoint({
                    'epoch': epoch,
                    'index': i,
                    'arch': "Darknet_53",
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                }, False, filename='darknet_53_init_55_pytorch_train_tmp.pth.tar')

        with open(train_log_file, 'a+') as log_file:
            log_file.write('Epoch: {0}, Loss: {loss.avg:.4f}, Top1: {top1.avg:.3f}, Top5: {top5.avg:.3f}\n'.format(
                epoch, loss=losses, top1=top1, top5=top5))
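The lars() method above follows the wrapper pattern: an already-built optimizer is handed to LARS, which layers the per-layer learning-rate scaling on top. A hedged usage sketch, assuming a LARS class with such a wrapping constructor:

base_optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                                 momentum=0.9, weight_decay=1e-4)
optimizer = LARS(base_optimizer)   # adds layer-wise lr adaptation on top of SGD
# the training step is unchanged:
# optimizer.zero_grad(); loss.backward(); optimizer.step()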
def main(lr=0.1):
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # Model
    print('==> Building model..')
    # net = VGG('VGG19')
    # net = ResNet18()
    # net = PreActResNet18()
    # net = GoogLeNet()
    # net = DenseNet121()
    # net = ResNeXt29_2x64d()
    # net = MobileNet()
    # net = MobileNetV2()
    # net = DPN92()
    # net = ShuffleNetG2()
    # net = SENet18()
    # net = ShuffleNetV2(1)
    # net = EfficientNetB0()
    # net = RegNetX_200MF()
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'sgdwm':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        from radam import RAdam
        optimizer = RAdam(net.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lars':  #no tensorboardX
        from lars import LARS
        optimizer = LARS(net.parameters(),
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lamb':
        from lamb import Lamb
        optimizer = Lamb(net.parameters(),
                         lr=args.lr,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'novograd':
        from novograd import NovoGrad
        optimizer = NovoGrad(net.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
    # lr_scheduler = LambdaLR(optimizer,lrs)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)

        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            if args.optimizer.lower() == 'sgd':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'sgdwm':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adam':
                optimizer = optim.Adam(net.parameters(),
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'rmsprop':
                optimizer = optim.RMSprop(net.parameters(),
                                          lr=args.lr,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adagrad':
                optimizer = optim.Adagrad(net.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'radam':
                from radam import RAdam

                optimizer = RAdam(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'lars':  # no tensorboardX
                optimizer = LARS(net.parameters(),
                                 lr=args.lr,
                                 momentum=args.momentum,
                                 weight_decay=args.weight_decay,
                                 dampening=args.damping)
            elif args.optimizer.lower() == 'lamb':
                optimizer = Lamb(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'novograd':
                optimizer = NovoGrad(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
            else:
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
        train(epoch)
        test(epoch)
    file = open(args.optimizer + str(lr) + 'log.json', 'w+')
    json.dump([train_acc, valid_acc], file)
    return best_acc
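The milestone handling in main() reloads the best checkpoint and rebuilds the optimizer with lr scaled by 0.1 at each decay epoch. A sketch of the scheduler-based alternative referenced by the commented-out lines, assuming args.lr_decay is the list of milestone epochs:

lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                    milestones=args.lr_decay,
                                                    gamma=0.1)
# call lr_scheduler.step() once per epoch instead of rebuilding the optimizer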
Example #8
                              lr=args.base_lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(),
                              lr=args.base_lr,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(model.parameters(),
                      lr=args.base_lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    from lars import LARS
    optimizer = LARS(model.parameters(),
                     lr=args.base_lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(model.parameters(),
                     lr=args.base_lr,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(model.parameters(),
                         lr=args.base_lr,
                         weight_decay=args.weight_decay)
    lr_scheduler = [
        optim.lr_scheduler.CosineAnnealingLR(optimizer, 3 * len(train_loader),
                                             1e-4)
    ]
Example #9
epochs = 400
stop_at_epoch = 100
batch_size = 64
image_size = (92, 92)

train_loader, mem_loader, test_loader = get_train_mem_test_dataloaders(
    batch_size=batch_size)
train_transform, test_transform = gpu_transformer(image_size)

loss_ls = []
acc_ls = []

model = BYOL().to(device)

optimizer = LARS(model.named_modules(),
                 lr=lr,
                 momentum=momentum,
                 weight_decay=weight_decay)

scheduler = LR_Scheduler(optimizer,
                         warmup_epochs,
                         warmup_lr * batch_size / 8,
                         epochs,
                         lr * batch_size / 8,
                         final_lr * batch_size / 8,
                         len(train_loader),
                         constant_predictor_lr=True)

min_loss = np.inf
accuracy = 0

# start training
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower()=='adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer  = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer,lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
train_acc = []
valid_acc = []
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower()=='adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer  = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'dyna':
    from dyna import Dyna
    optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
# lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
# lr_scheduler = LambdaLR(optimizer,lrs)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
Example #12
    def lars(self):
        self.optimizer = LARS(self.optimizer)
Example #13
class ApexDistributeModel(object):
    def __init__(self, model, criterion, optimizer, config, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.config = config
        self.sync_bn = self.config['sync_bn']
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU id must be an int."

        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # synchronization batch normal
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex
        self.model, self.optimizer = apex.amp.initialize(
            self.model, self.optimizer, opt_level=self.opt_level)
        # apex parallel
        self.model = apex.parallel.DistributedDataParallel(
            self.model, delay_allreduce=True)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader):
        """
        you must run it after the 'convert' function.
        :param epoch:
        :param train_loader:
        :return:
        """
        self.model.train()
        print("Epoch is {}".format(epoch))
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        start_time = time.time()
        while inputs is not None:
            step += 1
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            output = self.model(inputs)
            loss = self.criterion(output, target)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            self.optimizer.step()
            inputs, target = next(train_iter, (None, None))
            if step % 10 == 0:
                end_time = time.time()
                print("Step is {}, cost time: {}, loss: {}, acc1: {}, acc5:{}".
                      format(step, (end_time - start_time), loss.item(),
                             acc1.item(), acc5.item()))
                start_time = time.time()
Example #14
class ApexDistributeModel(object):
    def __init__(self, model, criterion, optimizer, args, gpu=None):
        super(ApexDistributeModel, self).__init__()
        self.model = model
        self.args = args
        self.sync_bn = self.args.sync_bn
        self.gpu = gpu
        if self.gpu is not None:
            assert isinstance(self.gpu, int), "GPU id must be an int."
        self.criterion = criterion
        self.optimizer = optimizer
        self.opt_level = None

    def convert(self, opt_level='O0'):
        self.opt_level = opt_level
        if self.sync_bn:
            # synchronization batch normal
            self.model = apex.parallel.convert_syncbn_model(self.model)
        # assign specific gpu
        self.model = self.model.cuda(self.gpu)
        # self.criterion = self.criterion.cuda(self.gpu)
        # init model and optimizer by apex
        # self.model, self.optimizer = apex.amp.initialize(self.model, self.optimizer,
        #                                                  opt_level=self.opt_level)
        # apex parallel
        # self.model = apex.parallel.DistributedDataParallel(self.model, delay_allreduce=True)
        # self.model = apex.parallel.DistributedDataParallel(self.model)

        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[self.gpu],
                                                         bucket_cap_mb=10)
        return self.model, self.criterion, self.optimizer

    def lars(self):
        print("Enable LARS Optimizer Algorithm")
        self.optimizer = LARS(self.optimizer)

    def train(self, epoch, train_loader, max_batches, train_index):
        """
        you must run it after the 'convert' function.
        :param epoch:
        :param train_loader:
        :return:
        """
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        self.model.train()

        train_log_file = 'train_log'

        end = time.time()
        epoch_batch = epoch * (len(train_loader.dataset) /
                               self.args.batch_size)
        for i, (inputs, target) in enumerate(train_loader):
            data_time.update(time.time() - end)
            print("data iteration cost time: {}ms".format(data_time.val *
                                                          1000))
            # print(target)
            gpu_1 = time.time()
            inputs = inputs.cuda(self.gpu, non_blocking=True)
            target = target.cuda(self.gpu, non_blocking=True)
            gpu_2 = time.time()
            print("convert datasets to gpu cost time: {}ms".format(
                (gpu_2 - gpu_1) * 1000))

            inference_1 = time.time()
            output = self.model(inputs)
            inference_2 = time.time()
            print("inference time cost: {}ms".format(
                (inference_2 - inference_1) * 1000))

            loss_1 = time.time()
            loss = self.criterion(output, target)
            loss_2 = time.time()
            print("loss cost time: {}ms".format((loss_2 - loss_1) * 1000))

            zero_1 = time.time()
            self.optimizer.zero_grad()
            zero_2 = time.time()
            print("zero cost time: {}ms".format((zero_2 - zero_1) * 1000))

            backward_1 = time.time()
            loss.backward()
            backward_2 = time.time()
            print("backward cost time: {}ms".format(
                (backward_2 - backward_1) * 1000))

            step_1 = time.time()
            self.optimizer.step()
            step_2 = time.time()
            print("step cost time: {}ms".format((step_2 - step_1) * 1000))

            batch_time.update(time.time() - end)
            print("total cost time is: {}s".format(batch_time.val))
            # item_1 = time.time()
            # print("loss is {}".format(loss.item()))
            # item_2 = time.time()
            # print("loss item cost: {}s".format(item_2 - item_1))
            print("==================================")

            end = time.time()
# we are going to try to find the sum(n_relevant_orders)
# important predictors. first compute all X possible products
X_aug = poly_order_of_cols(X, orders, include_cross_terms)

print('shape of X_aug ', X_aug.shape)

B = linear_regression(X_aug, Y[:, 0], True)
r_sq = lin_reg_coefficient_of_determination(X_aug, Y[:, 0], B)
print('r_squared for brute force fit ', r_sq)
lin_reg_statistics(X_aug, Y[:, 0], B, False)
print('full linreg coefs')
print(B)

print('beginning LARS test')
lars_obj = LARS(X_aug, Y[:, 0], 0)
lars_obj.train()

print('beginning LASSO test')
lars_obj = LARS(X_aug, Y[:, 0], 1)
lars_obj.train()

# Cross Validation
# cvn = 10
# print('performing cross validation CV(' + str(cvn) + ') for linear regression')
# cvn_lr = n_cross_validation(X_aug, Y, linear_regression_train_test, None, cvn)
# print('linear regression cvn mse is ', cvn_lr)

# Now we will compute cvn for LASSO with a number of lambdas
# if m == 1:
if False:
 def lars(self):
     print("Enable LARS Optimizer Algorithm")
     self.optimizer = LARS(self.optimizer)
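The LARS in the regression snippet above is least-angle regression (a statistics routine), not the layer-wise optimizer used elsewhere on this page. For comparison, a sketch of the same two fits with scikit-learn, assuming X_aug and Y are NumPy arrays and an illustrative alpha:

from sklearn.linear_model import Lars, LassoLars

lars_fit = Lars().fit(X_aug, Y[:, 0])                 # least-angle regression
lasso_fit = LassoLars(alpha=0.1).fit(X_aug, Y[:, 0])  # LASSO solved along the LARS path
print(lars_fit.coef_)
print(lasso_fit.coef_)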
Example #17
if len(sys.argv) == 1:
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
elif sys.argv[1] == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=0.01)
elif sys.argv[1] == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.9)
elif sys.argv[1] == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=0.01)
elif sys.argv[1] == 'radam':
    optimizer = RAdam(model.parameters())
elif sys.argv[1] == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9)
elif sys.argv[1] == 'lamb':
    optimizer = Lamb(model.parameters())
elif sys.argv[1] == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=0.01, weight_decay=0.001)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     3 * len(train_loader),
                                                     1e-4)

    def train(train_loader, model, criterion, optimizer, scheduler, device):
        '''
        Function for the training step of the training loop
        '''

        model.train()
        running_loss = 0
Example #18
def main():
    args = parse_args()

    if args.name is None:
        args.name = '%s_WideResNet%s-%s_%d' % (args.dataset, args.depth,
                                               args.width, args.batch_size)
    if args.linear_scaling:
        args.name += '_wLS'
    if args.lars:
        args.name += '_wLARS'

    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    criterion = nn.CrossEntropyLoss().cuda()

    cudnn.benchmark = True

    # data loading code
    if args.dataset == 'cifar10':
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

        train_set = datasets.CIFAR10(root='~/data',
                                     train=True,
                                     download=True,
                                     transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=8)

        test_set = datasets.CIFAR10(root='~/data',
                                    train=False,
                                    download=True,
                                    transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=128,
                                                  shuffle=False,
                                                  num_workers=8)

        num_classes = 10

    elif args.dataset == 'cifar100':
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

        train_set = datasets.CIFAR100(root='~/data',
                                      train=True,
                                      download=True,
                                      transform=transform_train)
        train_loader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=128,
                                                   shuffle=True,
                                                   num_workers=8)

        test_set = datasets.CIFAR100(root='~/data',
                                     train=False,
                                     download=True,
                                     transform=transform_test)
        test_loader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=128,
                                                  shuffle=False,
                                                  num_workers=8)

        num_classes = 100

    # create model
    model = WideResNet(args.depth, args.width, num_classes=num_classes)
    model = model.cuda()

    if args.lars:
        optimizer = LARS(filter(lambda p: p.requires_grad, model.parameters()),
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)

    if args.linear_scaling:
        scheduler = WarmupMultiStepLR(
            optimizer,
            milestones=[int(e) for e in args.milestones.split(',')],
            target_lr=args.lr * args.batch_size / base_batch_size,
            gamma=args.gamma)
    else:
        scheduler = lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[int(e) for e in args.milestones.split(',')],
            gamma=args.gamma)

    log = pd.DataFrame(
        index=[],
        columns=['epoch', 'lr', 'loss', 'acc', 'val_loss', 'val_acc'])

    best_acc = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch + 1, args.epochs))

        scheduler.step()

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer,
                          epoch)
        # evaluate on validation set
        val_log = validate(args, test_loader, model, criterion)

        print('loss %.4f - acc %.4f - val_loss %.4f - val_acc %.4f' %
              (train_log['loss'], train_log['acc'], val_log['loss'],
               val_log['acc']))

        tmp = pd.Series(
            [
                epoch,
                scheduler.get_lr()[0],
                train_log['loss'],
                train_log['acc'],
                val_log['loss'],
                val_log['acc'],
            ],
            index=['epoch', 'lr', 'loss', 'acc', 'val_loss', 'val_acc'])

        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        if val_log['acc'] > best_acc:
            torch.save(model.state_dict(), 'models/%s/model.pth' % args.name)
            best_acc = val_log['acc']
            print("=> saved best model")
Example #19
args = parser.parse_args()

if args.optimizer.lower() == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'sgdwm':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'radam':
    optimizer = RAdam(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    optimizer = LARS(model.parameters(), lr=args.lr, momentum=0.9)
elif args.optimizer.lower() == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.lr)
elif args.optimizer.lower() == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=args.lr, weight_decay=0.0001)
else:
    optimizer = optim.SGD(model.parameters(), lr=0.01)

optname = args.optimizer if len(sys.argv) >= 2 else 'sgd'

# log = open(optname+'log.txt','w+')

log = None

criterion = nn.CrossEntropyLoss()
Example #20
def main(is_distributed, rank, ip, sync_bn):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=ip,
                                             world_size=world_size,
                                             rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    print("Connect")
    # set hyper parameters
    batch_size = 128
    lr = 0.01  # base on batch size 256
    momentum = 0.9
    weight_decay = 0.0001
    epoch = 100

    # recompute lr
    lr = lr * world_size

    # create model
    model = AlexNet(10)

    # synchronization batch normal
    if sync_bn:
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(),
                                lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0')
    optimizer = LARS(optimizer)

    if is_distributed:
        # for distribute training
        model = nn.parallel.DistributedDataParallel(model)
        # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set,
                              batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=4,
                              pin_memory=True,
                              sampler=train_sampler,
                              collate_fn=collate_fn)

    for epoch in range(100):
        # for distribute
        if is_distributed:
            train_sampler.set_epoch(epoch)

        model.train()
        # train_iter = iter(train_loader)
        # inputs, target = next(train_iter)
        prefetcher = DataPrefetcher(train_loader)
        inputs, target = prefetcher.next()

        step = 0
        print("Epoch is {}".format(epoch))
        while inputs is not None:
            step += 1
            print("Step is {}".format(step))

            time_model_1 = time.time()
            output = model(inputs)
            time_model_2 = time.time()
            print("model time: {}".format(time_model_2 - time_model_1))
            time_loss_1 = time.time()
            loss = criterion(output, target.cuda(non_blocking=True))
            time_loss_2 = time.time()
            print("loss time: {}".format(time_loss_2 - time_loss_1))
            optimizer.zero_grad()
            time_back_1 = time.time()
            # loss.backward()
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            time_back_2 = time.time()
            print("back time: {}".format(time_back_2 - time_back_1))
            optimizer.step()
            # if step % 10 == 0:
            #     print("loss is : {}", loss.item())
            # inputs, target = next(train_iter, (None, None))
            inputs, target = prefetcher.next()