Example #1
    def __init__(self,
                 model: Module,
                 train_loader: DataLoader,
                 test_loader: DataLoader,
                 device=DEFAULT_DEVICE,
                 lr=DEFAULT_LR,
                 momentum=DEFAULT_MOMENTUM,
                 epochs=DEFAULT_EPOCHS,
                 batch_size=DEFAULT_BATCH_SIZE,
                 parallelism=DEFAULT_PARALLELISM,
                 milestones=MILESTONES,
                 gamma=0.2,
                 warm_phases=WARM_PHASES,
                 criterion=loss.CrossEntropyLoss()):
        print("initialize trainer")
        # parameter pre-processing
        self.test_loader = test_loader

        if torch.cuda.device_count() > 1 and parallelism:
            print(f"using {torch.cuda.device_count()} GPUs")
            self.model = nn.DataParallel(model)
        else:
            self.model = model
        self.model.to(device)

        optimizer = optim.SGD(
            # only optimize parameters that still require gradients
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=lr,
            momentum=momentum,
            weight_decay=5e-4)

        train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=milestones,
                                                         gamma=gamma)

        # warm phases
        self.warm_phases = warm_phases
        # warmup learning rate
        self.warmup_scheduler = WarmUpLR(optimizer,
                                         len(train_loader) * self.warm_phases)

        self.hp = HyperParameter(scheduler=train_scheduler,
                                 optimizer=optimizer,
                                 criterion=criterion,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 device=device)

        self.train_loader = train_loader
        print("initialize finished")
        print(f"hyper parameter: {self.hp}")
Example #2
    cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN,
                                               settings.CIFAR100_TRAIN_STD,
                                               num_workers=4,
                                               batch_size=args.b,
                                               shuffle=True)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES,
        gamma=0.2)  #learning rate decay
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    if args.resume:
        recent_folder = most_recent_folder(os.path.join(
            settings.CHECKPOINT_PATH, args.net),
                                           fmt=settings.DATE_FORMAT)
        if not recent_folder:
            raise Exception('no recent folder was found')

        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                       recent_folder)

    else:
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                       settings.TIME_NOW)
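Example #2 stops before the training loop itself. The sketch below shows the epoch loop such a setup typically drives, mirroring the pattern in the later examples: WarmUpLR is stepped once per batch during the first args.warm epochs, and the MultiStepLR schedule takes over per epoch afterwards. It is illustrative, not code from this example.

    for epoch in range(1, settings.EPOCH + 1):
        if epoch > args.warm:
            train_scheduler.step()

        net.train()
        for images, labels in cifar100_training_loader:
            images, labels = images.cuda(), labels.cuda()
            optimizer.zero_grad()
            loss = loss_function(net(images), labels)
            loss.backward()
            optimizer.step()
            # warm up the learning rate batch by batch during the warm epochs
            if epoch <= args.warm:
                warmup_scheduler.step()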
Example #3
    checkpoints_path = os.path.join(conf.CHECKPOINTS_PATH, args.model,
                                    datetime.now().isoformat())
    if not os.path.exists(checkpoints_path):
        os.makedirs(checkpoints_path)
    checkpoints_path = os.path.join(checkpoints_path,
                                    '{model}-{epoch}-{type}.pth')

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=conf.LEARNING_RATE,
                          momentum=conf.MOMENTUM,
                          weight_decay=conf.WEIGHT_DECAY)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=conf.MILESTONES, gamma=conf.GAMMA)
    iter_per_epoch = len(train_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * conf.WARM_EPOCH)

    best_acc = 0.0
    for epoch in range(1, conf.EPOCH):
        if epoch > conf.WARM_EPOCH:
            train_scheduler.step(epoch)

        train(model, epoch, train_loader, loss_function, optimizer,
              warmup_scheduler, args.gpu)
        acc = eval(model, epoch, val_loader, loss_function, args.gpu)

        if best_acc < acc:
            torch.save(
                model.state_dict(),
                checkpoints_path.format(model=args.model,
                                        epoch=epoch,
                                        type='best'))
            best_acc = acc
Example #4
    cell_training_loader = get_training_dataloader(path=trainpath,
                                                   mean=cell_train_mean,
                                                   std=cell_train_std,
                                                   num_workers=4,
                                                   batch_size=args.b,
                                                   shuffle=True)
    cell_train_test_loader = get_test_dataloader(path=trainpath,
                                                 mean=cell_train_mean,
                                                 std=cell_train_std,
                                                 num_workers=4,
                                                 batch_size=args.b,
                                                 shuffle=True)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES, gamma=0.2)
    iter_per_epoch = len(cell_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, 10000 * args.warm)
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, '3_4',
                                   settings.TIME_NOW)
    # create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path,
                                   '{net}-{epoch}-{type}-{accuracy}.pth')

    best_acc = 0.7

    for epoch in range(1, settings.EPOCH + 1):
        if epoch > args.warm:  # =1
            train_scheduler.step(epoch)
        train(epoch)
        acc_train = eval_training(epoch)
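The train and eval_training helpers called above are not shown in this example. Below is a sketch of what eval_training usually computes (average loss and top-1 accuracy on the test loader), modeled on the evaluation code in the later examples; the loader and loss names follow this snippet, but the body itself is an assumption.

def eval_training(epoch):
    net.eval()
    test_loss, correct = 0.0, 0.0
    with torch.no_grad():
        for images, labels in cell_train_test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = net(images)
            test_loss += loss_function(outputs, labels).item()
            correct += (outputs.argmax(dim=1) == labels).float().sum().item()
    acc = correct / len(cell_train_test_loader.dataset)
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
        test_loss / len(cell_train_test_loader.dataset), acc))
    return acc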
Example #5
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    print('==> Preparing dataset %s' % args.dataset)

    if args.dataset == 'cifar100':
        training_loader = get_training_dataloader(settings.CIFAR100_TRAIN_MEAN,
                                                  settings.CIFAR100_TRAIN_STD,
                                                  num_workers=4,
                                                  batch_size=args.train_batch,
                                                  shuffle=True)

        test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN,
                                          settings.CIFAR100_TRAIN_STD,
                                          num_workers=4,
                                          batch_size=args.test_batch,
                                          shuffle=False)
        num_classes = 100
    else:
        training_loader = get_training_dataloader_10(
            settings.CIFAR10_TRAIN_MEAN,
            settings.CIFAR10_TRAIN_STD,
            num_workers=4,
            batch_size=args.train_batch,
            shuffle=True)

        test_loader = get_test_dataloader_10(settings.CIFAR10_TRAIN_MEAN,
                                             settings.CIFAR10_TRAIN_STD,
                                             num_workers=4,
                                             batch_size=args.test_batch,
                                             shuffle=False)
        num_classes = 10
    #data preprocessing:
    print("==> creating model '{}'".format(args.arch))

    model = get_network(args, num_classes=num_classes)
    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))
    criterion1 = am_softmax.AMSoftmax()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    title = 'cifar-10-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
            'Valid Acc.'
        ])

    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.schedule, gamma=0.2)  #learning rate decay
    iter_per_epoch = len(training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    for epoch in range(start_epoch, args.epochs):
        if epoch > args.warm:
            train_scheduler.step(epoch)
        train_loss, train_acc = train(training_loader, model, warmup_scheduler,
                                      criterion, criterion1, optimizer, epoch,
                                      use_cuda)
        test_loss, test_acc = eval_training(test_loader, model, criterion,
                                            epoch, use_cuda)

        logger.append([
            optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc,
            test_acc
        ])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.checkpoint)

    logger.close()
    # logger.plot()
    # savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)
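save_checkpoint above is assumed rather than shown. Here is one plausible implementation given the call site; the default filenames and the shutil copy are assumptions, not taken from this example.

import shutil


def save_checkpoint(state, is_best, checkpoint='checkpoint',
                    filename='checkpoint.pth.tar'):
    # always write the latest state, and keep a separate copy of the best one
    filepath = os.path.join(checkpoint, filename)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))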
Example #6
                                               settings.CIFAR100_TRAIN_STD,
                                               num_workers=args.w,
                                               batch_size=args.b,
                                               shuffle=args.s)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES,
        gamma=0.2)  # learning rate decay
    iter_per_epoch = len(cifar100_train_loader)
    total_iters = iter_per_epoch * args.warm
    warmup_scheduler = WarmUpLR(optimizer, total_iters)
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                   settings.TIME_NOW)

    # create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, settings.EPOCH):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train(epoch)
        acc = eval(epoch)
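Example #6 builds the templated checkpoint path but ends before saving. The sketch below shows how the '{net}-{epoch}-{type}.pth' template is typically filled inside the epoch loop, mirroring the save logic of Example #10; it is not part of this example.

        # save the best weights once past the last LR milestone, plus periodic snapshots
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(),
                       checkpoint_path.format(net=args.net, epoch=epoch,
                                              type='best'))
            best_acc = acc
        elif not epoch % settings.SAVE_EPOCH:
            torch.save(net.state_dict(),
                       checkpoint_path.format(net=args.net, epoch=epoch,
                                              type='regular'))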
Example #7
def main():
    global args, best_prec1
    args = parser.parse_args()
    print(args)
    check_rootfolders()

    categories, train_list, val_list, root_path, prefix = datasets_video.return_dataset(
        args.dataset, args.root_path)
    num_class = len(categories)

    global store_name
    store_name = '_'.join([
        args.type, args.dataset, args.arch,
        'segment%d' % args.num_segments, args.store_name
    ])
    print(('storing name: ' + store_name))

    if args.dataset == 'somethingv1' or args.dataset == 'somethingv2':
        # label transformation for left/right categories
        # please refer to the labels.json file in somethingv2 for details.
        target_transforms = {
            86: 87,
            87: 86,
            93: 94,
            94: 93,
            166: 167,
            167: 166
        }
    else:
        target_transforms = None

    if args.conv_config in conv_configs:
        conv_config = conv_configs[args.conv_config]
        conv_index = None  # conv_indexs[args.conv_config]
    else:
        conv_config = None

    model = TemporalModel(num_class,
                          args.num_segments,
                          model=args.type,
                          backbone=args.arch,
                          alpha=args.alpha,
                          beta=args.beta,
                          dropout=args.dropout,
                          target_transforms=target_transforms,
                          search=args.search,
                          op_code=args.op_code,
                          conv_config=conv_config)

    crop_size = model.crop_size
    scale_size = model.scale_size
    input_mean = model.input_mean
    input_std = model.input_std
    policies = get_optim_policies(model)
    train_augmentation = model.get_augmentation()

    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model).cuda()

    if args.prune:
        prune(model, args.prune_model_path, './conv_config.txt')
        # prune_select(model, args.prune_model_path, './conv_config.txt')
        exit(0)

    if args.resume:
        if os.path.isfile(args.resume):
            print(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)

            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            # model.module.load_state_dict(checkpoint['state_dict'])
            model.load_state_dict(checkpoint['state_dict'])
            # print(("=> loaded checkpoint '{}' (epoch {})"
            # 	  	.format(args.evaluate, checkpoint['epoch'])))
        else:
            print(("=> no checkpoint found at '{}'".format(args.resume)))

    if args.finetune:
        if os.path.isfile(args.finetune):
            print(("=> loading checkpoint '{}'".format(args.finetune)))
            checkpoint = torch.load(args.finetune)
            from I3D import load_state_dict_supernet
            model = load_state_dict_supernet(model, checkpoint['state_dict'],
                                             conv_index)
            #args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            #model.module.load_state_dict(checkpoint['state_dict'])
            print(("=> loaded checkpoint '{}' (epoch {})".format(
                args.evaluate, checkpoint['epoch'])))
        else:
            print(("=> no checkpoint found at '{}'".format(args.finetune)))
            exit(0)

    cudnn.benchmark = True

    # Data loading code
    normalize = GroupNormalize(input_mean, input_std)

    train_loader = torch.utils.data.DataLoader(VideoDataSet(
        root_path,
        train_list,
        num_segments=args.num_segments,
        image_tmpl=prefix,
        transform=torchvision.transforms.Compose([
            train_augmentation,
            Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])),
            ToTorchFormatTensor(
                div=(args.arch not in ['BNInception', 'InceptionV3'])),
            normalize,
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               drop_last=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(VideoDataSet(
        root_path,
        val_list,
        num_segments=args.num_segments,
        image_tmpl=prefix,
        random_shift=False,
        transform=torchvision.transforms.Compose([
            GroupScale(int(scale_size)),
            GroupCenterCrop(crop_size),
            Stack(roll=(args.arch in ['BNInception', 'InceptionV3'])),
            ToTorchFormatTensor(
                div=(args.arch not in ['BNInception', 'InceptionV3'])),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for group in policies:
        print(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
            group['name'], len(group['params']), group['lr_mult'],
            group['decay_mult'])))

    optimizer = torch.optim.SGD(policies,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.evaluate:
        prec1 = validate(val_loader, model, criterion, 0)
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        exit()

        return

    log_training = open(
        os.path.join(args.checkpoint_dir, 'log', '%s.csv' % store_name), 'w')
    warmup_scheduler = WarmUpLR(optimizer, len(train_loader) * args.warm)
    for epoch in range(args.start_epoch, args.epochs):
        # adjust learning rate
        if epoch > args.warm:
            adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log_training,
              warmup_scheduler, args)

        # evaluate on validation set
        if ((epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1):
            if 'dropout' not in args.op_code:
                prec1 = validate(val_loader, model, criterion,
                                 (epoch + 1) * len(train_loader), log_training)
            else:
                prec1 = 0.0

            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best)
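adjust_learning_rate is not defined in this example. Below is a sketch of the usual step-decay variant that respects the per-group lr_mult / decay_mult policies printed above; the 10x decay factor and the exact formula are assumptions.

import numpy as np


def adjust_learning_rate(optimizer, epoch, lr_steps):
    # decay the base LR by 10x for each milestone already passed, then apply
    # each parameter group's lr_mult / decay_mult from get_optim_policies
    decay = 0.1 ** sum(epoch >= np.array(lr_steps))
    lr = args.lr * decay
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr * param_group['lr_mult']
        param_group['weight_decay'] = args.weight_decay * param_group['decay_mult']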
Example #8
    def f(self, x, return_acc=False):  # x: [target layer index, alpha]
        if x.size == 1:
            x = np.append(x, 0.32)
        x = x.reshape(1, 2)
        target = int(x[:, 0])
        print("Start run ", target)
        start_time = default_timer()

        self.net = resnet50(60).cuda()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if device == 'cuda':
            self.net = torch.nn.DataParallel(self.net)
            cudnn.benchmark = True

        self.net.load_state_dict(torch.load(checkpoint), True)
        if self.inc_index == 1:
            self.net.module.fc = nn.Linear(512 * 4, 30).cuda()
        else:
            self.net.module.fc = nn.Linear(512 * 4, 10).cuda()

        self.net.train()

        cur_wc = 0
        count = 0
        for m in self.net.modules():
            if target == count:
                break
            elif isinstance(m, nn.Conv2d):
                for param in m.parameters():
                    cur_wc += param.numel()
                    param.requires_grad = False
            elif isinstance(m, nn.BatchNorm2d):
                for param in m.parameters():
                    param.requires_grad = False
                count += 1

        BASE_DATA_ROOT = '/home/bbboming/HDD/Paper/datasets_object/ICIFAR100_60_30_10/BASE/'
        DATA_ROOT = '/home/bbboming/HDD/Paper/datasets_object/ICIFAR100_60_30_10/INC%d/' % self.inc_index
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(settings.CIFAR100_TRAIN_MEAN,
                                 settings.CIFAR100_TRAIN_STD),
        ])
        trainset = datasets.ImageFolder(os.path.join(DATA_ROOT, 'train'),
                                        train_transform)
        cifar100_training_loader = torch.utils.data.DataLoader(
            trainset,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=4,
            shuffle=self.shuffle)

        test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(settings.CIFAR100_TRAIN_MEAN,
                                 settings.CIFAR100_TRAIN_STD),
        ])

        testset = datasets.ImageFolder(os.path.join(DATA_ROOT, 'test'),
                                       test_transform)
        cifar100_test_loader = torch.utils.data.DataLoader(
            testset,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=4,
            shuffle=False)

        base_testset = datasets.ImageFolder(
            os.path.join(BASE_DATA_ROOT, 'test'), test_transform)
        cifar100_base_test_loader = torch.utils.data.DataLoader(
            base_testset,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=4,
            shuffle=False)

        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.SGD(self.net.parameters(),
                              lr=self.lr,
                              momentum=0.9,
                              weight_decay=5e-4)
        train_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min')
        iter_per_epoch = len(cifar100_training_loader)
        warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * self.warm)
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH,
                                       'resnet50_inc%d' % self.inc_index,
                                       settings.TIME_NOW)

        #create checkpoint folder to save model
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        checkpoint_path = os.path.join(checkpoint_path,
                                       '{net}-{target}-{type}.pth')

        best_acc = 0.0
        best_base_acc = 0.0
        best_inc_acc = 0.0
        for epoch in range(1, settings.EPOCH):
            self.net.train()
            # train(epoch)
            for batch_index, (images,
                              labels) in enumerate(cifar100_training_loader):
                images = Variable(images)
                labels = Variable(labels)

                labels = labels.cuda()
                images = images.cuda()

                optimizer.zero_grad()
                outputs = self.net(images)
                loss = loss_function(outputs, labels)
                loss.backward()
                optimizer.step()

                if epoch <= self.warm:
                    warmup_scheduler.step()
                n_iter = (epoch -
                          1) * len(cifar100_training_loader) + batch_index + 1

            #print('[Target {target}] [Training Epoch: {epoch}/{total_epoch}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
            #    loss.item(),
            #    optimizer.param_groups[0]['lr'],
            #    target=target,
            #    epoch=epoch,
            #    total_epoch=settings.EPOCH
            #))

            #Evaluation Accuracy
            self.net.eval()
            self.basenet.eval()

            test_loss = 0.0  # cost function error
            correct = 0.0

            #INC Testset
            for (images, labels) in cifar100_test_loader:
                images = Variable(images)
                labels = Variable(labels)
                images = images.cuda()
                labels = labels.cuda()

                soft_layer = nn.Softmax(dim=1).cuda()

                base_outputs = self.basenet(images)
                outputs = self.net(images)

                loss = loss_function(outputs, labels)
                test_loss += loss.item()

                soft_base = soft_layer(base_outputs)
                soft_inc = soft_layer(outputs)
                softmax = torch.cat([soft_base, soft_inc], dim=1)
                labels_all = labels + 60
                _, preds = softmax.max(1)
                correct += preds.eq(labels_all).sum()

            #Base Testset
            correct_base = 0.0
            for (images, labels) in cifar100_base_test_loader:
                images = Variable(images)
                labels = Variable(labels)
                images = images.cuda()
                labels = labels.cuda()

                soft_layer = nn.Softmax(dim=1).cuda()

                base_outputs = self.basenet(images)
                outputs = self.net(images)

                soft_base = soft_layer(base_outputs)
                soft_inc = soft_layer(outputs)
                softmax = torch.cat([soft_base, soft_inc], dim=1)
                labels_all = labels
                _, preds = softmax.max(1)
                correct_base += preds.eq(labels_all).sum()

            avg_loss = test_loss / len(cifar100_test_loader.dataset)
            base_acc = correct_base.float() / len(
                cifar100_base_test_loader.dataset)
            inc_acc = correct.float() / len(cifar100_test_loader.dataset)
            acc = (correct.float() + correct_base.float()) / (
                len(cifar100_test_loader.dataset) +
                len(cifar100_base_test_loader.dataset))

            print(
                'Test set: Average loss: {:.4f}, Accuracy: {:.4f} (BaseAcc {:.4f} IncAcc {:.4f})'
                .format(avg_loss, acc, base_acc, inc_acc))

            train_scheduler.step(avg_loss)

            #start to save best performance model after learning rate decay to 0.01
            if epoch > 10 and best_acc < acc:
                torch.save(
                    self.net.state_dict(),
                    checkpoint_path.format(target=target,
                                           net='resnet50',
                                           type='best'))
                best_acc = acc
                best_inc_acc = inc_acc
                best_base_acc = base_acc

        # share_ratio = target / self.count
        best_dict[str(target)] = best_acc.detach().cpu().item()

        memory_efficiency = cur_wc / self.total_wc
        obj_acc = best_acc.detach().cpu().item()
        alpha = x[:, 1].item()
        threshold = 0.02
        target_mem_eff = 0.70
        #Objective Function
        obj_f = np.abs((self.max_acc - obj_acc) - threshold)
        print_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " x= {x}, alpha= {alpha} Memory_Efficiency= {memory_efficiency}, combined_classification_acc= {best_acc}, obj_acc= {obj_acc}, OBJ_F= {obj_f}" \
                        .format(x=target, alpha=alpha,best_acc=best_acc, obj_acc=obj_acc, memory_efficiency=memory_efficiency, obj_f=obj_f)
        with open("history.log", "a") as f_hist:
            f_hist.write(print_str + "\n")
        print(print_str)

        if self.min_acc != 0:
            csv.write("%d, %d, %f, %f, %f, %f, %f\n" %
                      (self.iteration, target, obj_acc, threshold, obj_f,
                       self.min_acc, self.max_acc))
            self.iteration += 1

        end_time = default_timer()
        print("operation time: ", (end_time - start_time))

        if return_acc:
            return (best_acc.detach().cpu().item())
        return (obj_f)
Example #9
def main(args):
    print(f'started: {args}')
    torch.cuda.set_device(args.gpu)
    cifar100_training_loader = get_training_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=args.b,
        shuffle=True)

    cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN,
                                               settings.CIFAR100_TRAIN_STD,
                                               num_workers=4,
                                               batch_size=args.b,
                                               shuffle=True)

    net = resnet18(with_permute_adain=(args.padain > 0), p_adain=args.padain)
    net = net.cuda()
    print(net)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES,
        gamma=0.2)  # learning rate decay
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, 'cifar100',
                                   args.net, str(args.padain),
                                   settings.TIME_NOW)

    # use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)
    writer = SummaryWriter(
        log_dir=os.path.join(settings.LOG_DIR, args.net + '_padain' +
                             str(args.padain), settings.TIME_NOW))
    input_tensor = torch.Tensor(1, 3, 32, 32).cuda()
    writer.add_graph(net, input_tensor)

    # create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, settings.EPOCH):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train(net, cifar100_training_loader, warmup_scheduler, optimizer,
              loss_function, writer, epoch, args.warm, args.b)
        acc = eval_training(net, cifar100_test_loader, loss_function, writer,
                            epoch)

        # start to save best performance model after learning rate decay to 0.01
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(),
                       checkpoint_path.format(net=args.net, type='best'))
            best_acc = acc
            continue

        if not epoch % settings.SAVE_EPOCH:
            torch.save(net.state_dict(),
                       checkpoint_path.format(net=args.net, type='other'))

    writer.close()
Example #10
def train_variant(conv, fcl, args):

    net, arch_name = construct_vgg_variant(conv_variant=conv,
                                           fcl_variant=fcl,
                                           batch_norm=True,
                                           progress=True,
                                           pretrained=False)
    args.net = arch_name
    if args.gpu:  #use_gpu
        net = net.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES,
        gamma=0.2)  # learning rate decay
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    if args.resume:
        recent_folder = most_recent_folder(os.path.join(
            settings.CHECKPOINT_PATH, args.net),
                                           fmt=settings.DATE_FORMAT)
        if not recent_folder:
            raise Exception('no recent folder was found')

        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                       recent_folder)

    else:
        checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                       settings.TIME_NOW)

    #use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)

    # tensorboard can't overwrite old values,
    # so create a fresh log directory for every run
    writer = SummaryWriter(
        log_dir=os.path.join(settings.LOG_DIR, args.net, settings.TIME_NOW))
    if args.gpu:
        input_tensor = torch.Tensor(1, 3, 32, 32).cuda()
    else:
        input_tensor = torch.Tensor(1, 3, 32, 32)
    writer.add_graph(net, input_tensor)

    #create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

    best_acc = 0.0
    if args.resume:
        best_weights = best_acc_weights(
            os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
        if best_weights:
            weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                        recent_folder, best_weights)
            print('found best acc weights file:{}'.format(weights_path))
            print('load best training file to test acc...')
            net.load_state_dict(torch.load(weights_path))
            best_acc = eval_training(tb=False)
            print('best acc is {:0.2f}'.format(best_acc))

        recent_weights_file = most_recent_weights(
            os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
        if not recent_weights_file:
            raise Exception('no recent weights file was found')
        weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net,
                                    recent_folder, recent_weights_file)
        print('loading weights file {} to resume training.....'.format(
            weights_path))
        net.load_state_dict(torch.load(weights_path))

        resume_epoch = last_epoch(
            os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))

    train_params = {
        'net': net,
        'warmup_scheduler': warmup_scheduler,
        'loss_function': loss_function,
        'optimizer': optimizer,
        'writer': writer
    }
    for epoch in range(1, settings.EPOCH):
        # for epoch in [1]:# range(1, 2):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        if args.resume:
            if epoch <= resume_epoch:
                continue

        train(epoch=epoch, **train_params)
        acc = eval_training(epoch=epoch, **train_params)

        #start to save best performance model after learning rate decay to 0.01
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(
                net.state_dict(),
                checkpoint_path.format(net=args.net, epoch=epoch, type='best'))
            best_acc = acc
            continue

        if not epoch % settings.SAVE_EPOCH:
            torch.save(
                net.state_dict(),
                checkpoint_path.format(net=args.net,
                                       epoch=epoch,
                                       type='regular'))

    writer.close()
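The most_recent_folder helper used for resuming in Examples #2 and #10 is not shown on this page. One plausible implementation, assuming each run is stored in a folder named with settings.DATE_FORMAT; the body is an assumption.

import datetime


def most_recent_folder(net_weights, fmt):
    # pick the newest non-empty run folder whose name parses with fmt
    if not os.path.exists(net_weights):
        return ''
    folders = [f for f in os.listdir(net_weights)
               if os.path.isdir(os.path.join(net_weights, f))
               and os.listdir(os.path.join(net_weights, f))]
    if not folders:
        return ''
    return sorted(folders, key=lambda f: datetime.datetime.strptime(f, fmt))[-1]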
Example #11
class Trainer:
    def __init__(self,
                 model: Module,
                 train_loader: DataLoader,
                 test_loader: DataLoader,
                 device=DEFAULT_DEVICE,
                 lr=DEFAULT_LR,
                 momentum=DEFAULT_MOMENTUM,
                 epochs=DEFAULT_EPOCHS,
                 batch_size=DEFAULT_BATCH_SIZE,
                 parallelism=DEFAULT_PARALLELISM,
                 milestones=MILESTONES,
                 gamma=0.2,
                 warm_phases=WARM_PHASES,
                 criterion=loss.CrossEntropyLoss()):
        print("initialize trainer")
        # parameter pre-processing
        self.test_loader = test_loader

        if torch.cuda.device_count() > 1 and parallelism:
            print(f"using {torch.cuda.device_count()} GPUs")
            self.model = nn.DataParallel(model)
        else:
            self.model = model
        self.model.to(device)

        optimizer = optim.SGD(
            # only optimize parameters that still require gradients
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=lr,
            momentum=momentum,
            weight_decay=5e-4)

        train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=milestones,
                                                         gamma=gamma)

        # warm phases
        self.warm_phases = warm_phases
        # warmup learning rate
        self.warmup_scheduler = WarmUpLR(optimizer,
                                         len(train_loader) * self.warm_phases)

        self.hp = HyperParameter(scheduler=train_scheduler,
                                 optimizer=optimizer,
                                 criterion=criterion,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 device=device)

        self.train_loader = train_loader
        print("initialize finished")
        print(f"hyper parameter: {self.hp}")

    def train(self,
              save_path,
              attack=False,
              attacker=None,
              params: Dict = None):
        self._init_attacker(attack, attacker, params)

        batch_number = len(self.train_loader)
        # get current learning rate
        now_lr = self.hp.optimizer.state_dict().get("param_groups")[0].get(
            "lr")
        # record best accuracy
        best_acc = 0

        for ep in range(1, self.hp.epochs + 1):

            training_acc, running_loss = 0, .0
            start_time = time.process_time()

            for index, data in enumerate(self.train_loader):
                inputs, labels = data[0].to(self.hp.device), data[1].to(
                    self.hp.device)

                self.hp.optimizer.zero_grad()
                if attack:
                    # compute the adversarial examples first, since this zeroes the gradients
                    adv_inputs = self.attacker.calc_perturbation(
                        inputs, labels)
                    # zero the grad
                    self.hp.optimizer.zero_grad()
                    outputs = self.model(inputs)
                    adv_outputs = self.model(adv_inputs)
                    _loss = self.hp.criterion(outputs,
                                              labels) + self.hp.criterion(
                                                  adv_outputs, labels)
                else:
                    outputs = self.model(inputs)
                    _loss = self.hp.criterion(outputs, labels)

                _loss.backward()
                self.hp.optimizer.step()

                outputs: torch.Tensor
                training_acc += (outputs.argmax(
                    dim=1) == labels).float().mean().item()

                # warm up learning rate
                if ep <= self.warm_phases:
                    self.warmup_scheduler.step()

                # detect learning rate change
                new_lr = self.hp.optimizer.state_dict().get(
                    "param_groups")[0].get("lr")
                if new_lr != now_lr:
                    now_lr = new_lr
                    print(f"learning rate changes to {now_lr:.6f}")

                running_loss += _loss.item()

                if index % batch_number == batch_number - 1:
                    end_time = time.process_time()

                    acc = self.test(self.model,
                                    test_loader=self.test_loader,
                                    device=self.hp.device)
                    print(
                        f"epoch: {ep}   loss: {(running_loss / batch_number):.6f}   train accuracy: {training_acc / batch_number}   "
                        f"test accuracy: {acc}   time: {end_time - start_time:.2f}s"
                    )

                    if best_acc < acc:
                        best_acc = acc
                        self._save_best_model(save_path, ep, acc)

            # change learning rate by step
            self.hp.scheduler.step(ep)
        torch.save(self.model.state_dict(), f"{save_path}-latest")
        print("finished training")
        print(f"best accuracy on test set: {best_acc}")

    @staticmethod
    def test(model: Module, test_loader, device, debug=False):

        correct = 0
        with torch.no_grad():
            for data in test_loader:
                inputs, labels = data[0].to(device), data[1].to(device)
                _, y_hats = model(inputs).max(1)
                match = (y_hats == labels)
                correct += len(match.nonzero())

        if debug:
            print(f"Testing: {len(test_loader.dataset)}")
            print(f"correct: {correct}")
            print(f"accuracy: {100*correct/len(test_loader.dataset):.3f}%")

        return correct / len(test_loader.dataset)

    def _init_attacker(self, attack, attacker, params):
        self.attack = attack
        if attack:
            print(f"robustness training with {attacker.__name__}")
            self.attacker = attacker(self.model, **params)
            self.attacker.print_params()
        else:
            print("normal training")

    def _save_best_model(self, save_path, current_epochs, accuracy):
        """save best model with current info"""
        info = {
            "current_epochs": current_epochs,
            "total_epochs": self.hp.epochs,
            "accuracy": accuracy
        }
        if self.attack:
            info.update({
                "attack": self.attack,
                "attacker": type(self.attacker).__name__,
                "epsilons": self.attacker.epsilon,
            })
        with open(os.path.join(os.path.dirname(save_path), "info.json"),
                  "w",
                  encoding="utf8") as f:
            json.dump(info, f)
        torch.save(self.model.state_dict(), f"{save_path}-best")

    @staticmethod
    def train_tl(origin_model_path,
                 save_path,
                 train_loader,
                 test_loader,
                 device,
                 choice="resnet50"):
        print(f"transform learning on model: {origin_model_path}")
        model = TLResNet.create_model(choice)
        model.load_model(origin_model_path)
        trainer = Trainer(model=model,
                          train_loader=train_loader,
                          test_loader=test_loader,
                          device=device)
        trainer.train(save_path)
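A hypothetical usage sketch for the Trainer class above; the model, loaders, save path, and attacker class are placeholders, not part of the example.

# plain training with default hyperparameters
trainer = Trainer(model=model,
                  train_loader=train_loader,
                  test_loader=test_loader)
trainer.train(save_path="./checkpoints/model")

# adversarial training: attacker is any class taking (model, **params) and
# exposing calc_perturbation(), print_params(), and epsilon (hypothetical FGSM wrapper)
# trainer.train(save_path="./checkpoints/model-adv",
#               attack=True, attacker=FGSM, params={"epsilon": 0.03})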