Example #1
def main():

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    data_folder = os.path.join(args.data_folder, 'train')
    val_folder = os.path.join(args.data_folder, 'val')

    crop_padding = 32
    image_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL' and args.dataset == 'imagenet':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    # elif args.aug == 'NULL' and args.dataset == 'cifar':
    #     train_transform = transforms.Compose([
    #         transforms.RandomResizedCrop(size=32, scale=(0.2, 1.)),
    #         transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
    #         transforms.RandomGrayscale(p=0.2),
    #         transforms.RandomHorizontalFlip(p=0.5),
    #         transforms.ToTensor(),
    #         transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    #     ])
    #
    #     test_transform = transforms.Compose([
    #         transforms.ToTensor(),
    #         transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    #     ])
    elif args.aug == 'simple' and args.dataset == 'imagenet':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            get_color_distortion(1.0),
            transforms.ToTensor(),
            normalize,
        ])

        # TODO: Currently follow CMC
        test_transform = transforms.Compose([
            transforms.Resize(image_size + crop_padding),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'simple' and args.dataset == 'cifar':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(size=32),
            transforms.RandomHorizontalFlip(p=0.5),
            get_color_distortion(0.5),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

        test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

    else:
        raise NotImplementedError('augmentation not supported: {}'.format(args.aug))

    # Get Datasets
    if args.dataset == "imagenet":
        train_dataset = ImageFolderInstance(data_folder,
                                            transform=train_transform,
                                            two_crop=args.moco)
        print(len(train_dataset))
        train_sampler = None
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.num_workers,
            pin_memory=True,
            sampler=train_sampler)

        test_dataset = datasets.ImageFolder(val_folder,
                                            transform=test_transform)

        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=256,
                                                  shuffle=False,
                                                  num_workers=args.num_workers,
                                                  pin_memory=True)

    elif args.dataset == 'cifar':
        # cifar-10 dataset
        if args.contrastive_model == 'simclr':
            train_dataset = CIFAR10Instance_double(root='./data',
                                                   train=True,
                                                   download=True,
                                                   transform=train_transform,
                                                   double=True)
        else:
            train_dataset = CIFAR10Instance(root='./data',
                                            train=True,
                                            download=True,
                                            transform=train_transform)
        train_sampler = None
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.num_workers,
            pin_memory=True,
            sampler=train_sampler,
            drop_last=True)

        test_dataset = CIFAR10Instance(root='./data',
                                       train=False,
                                       download=True,
                                       transform=test_transform)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=100,
                                                  shuffle=False,
                                                  num_workers=args.num_workers)

    # create model and optimizer
    n_data = len(train_dataset)

    if args.model == 'resnet50':
        model = InsResNet50()
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50()
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50(width=2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50(width=4)
    elif args.model == 'resnet50_cifar':
        model = InsResNet50_cifar()
        if args.contrastive_model == 'moco':
            model_ema = InsResNet50_cifar()
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # copy weights from `model' to `model_ema'
    if args.contrastive_model == 'moco':
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    if args.contrastive_model == 'moco':
        contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t,
                              args.softmax).cuda(args.gpu)
    elif args.contrastive_model == 'simclr':
        contrast = None
    else:
        contrast = MemoryInsDis(128, n_data, args.nce_k, args.nce_t,
                                args.nce_m, args.softmax).cuda(args.gpu)

    if args.softmax:
        criterion = NCESoftmaxLoss()
    elif args.contrastive_model == 'simclr':
        criterion = BatchCriterion(1, args.nce_t, args.batch_size)
    else:
        criterion = NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)

    model = model.cuda()
    if args.contrastive_model == 'moco':
        model_ema = model_ema.cuda()

    # Exclude BN and bias if needed
    weight_decay = args.weight_decay
    if weight_decay and args.filter_weight_decay:
        parameters = add_weight_decay(model, weight_decay,
                                      args.filter_weight_decay)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    optimizer = torch.optim.SGD(parameters,
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=weight_decay)
    cudnn.benchmark = True

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
        if args.contrastive_model == 'moco':
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema,
                                                      optimizer_ema,
                                                      opt_level=args.opt_level)

    if args.LARS:
        optimizer = LARS(optimizer=optimizer, eps=1e-8, trust_coef=0.001)

    # optionally resume from a checkpoint
    args.start_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if contrast:
                contrast.load_state_dict(checkpoint['contrast'])
            if args.contrastive_model == 'moco':
                model_ema.load_state_dict(checkpoint['model_ema'])

            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])

            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        print("==> training...")

        time1 = time.time()
        if args.contrastive_model == 'moco':
            loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                    contrast, criterion, optimizer, args)
        elif args.contrastive_model == 'simclr':
            print("Train using simclr")
            loss, prob = train_simclr(epoch, train_loader, model, criterion,
                                      optimizer, args)
        else:
            print("Train using InsDis")
            loss, prob = train_ins(epoch, train_loader, model, contrast,
                                   criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('ins_loss', loss, epoch)
        logger.log_value('ins_prob', prob, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        test_epoch = 2
        if epoch % test_epoch == 0:
            model.eval()

            if args.contrastive_model == 'moco':
                model_ema.eval()

            print('----------Evaluation---------')
            start = time.time()

            acc = None
            if args.dataset == 'cifar':
                acc = kNN(epoch,
                          model,
                          train_loader,
                          test_loader,
                          200,
                          args.nce_t,
                          n_data,
                          low_dim=128,
                          memory_bank=None)

            print("Evaluation Time: '{}'s".format(time.time() - start))
            if acc is not None:
                # writer.add_scalar('nn_acc', acc, epoch)
                logger.log_value('Test accuracy', acc, epoch)
                # print('accuracy: {}% \t (best acc: {}%)'.format(acc, best_acc))
                print('[Epoch]: {}'.format(epoch))
                print('accuracy: {}%'.format(acc))
            # test_log_file.flush()

        # save the current model every epoch, plus a numbered snapshot every save_freq epochs
        print('==> Saving...')
        state = {
            'opt': args,
            'model': model.state_dict(),
            # 'contrast': contrast.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
        }
        if args.contrastive_model == 'moco':
            state['model_ema'] = model_ema.state_dict()
        if args.amp:
            state['amp'] = amp.state_dict()
        save_file = os.path.join(args.model_folder, 'current.pth')
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()
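
# The script above leans on two small helpers that are not shown here:
# get_color_distortion (the SimCLR color-distortion pipeline) and
# moment_update (the MoCo momentum copy). Minimal sketches, assuming the
# standard SimCLR/CMC definitions rather than this repository's exact code:

import torch
from torchvision import transforms

def get_color_distortion(s=1.0):
    # s is the strength of the color distortion (SimCLR, Appendix A)
    color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)
    rnd_color_jitter = transforms.RandomApply([color_jitter], p=0.8)
    rnd_gray = transforms.RandomGrayscale(p=0.2)
    return transforms.Compose([rnd_color_jitter, rnd_gray])

def moment_update(model, model_ema, m):
    # model_ema = m * model_ema + (1 - m) * model; m=0 copies the weights
    for p, p_ema in zip(model.parameters(), model_ema.parameters()):
        p_ema.data.mul_(m).add_(p.detach().data, alpha=1 - m)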
Example #2
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    best_acc1 = 0

    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # set the model
    model, classifier, criterion = set_model(args, ngpus_per_node)

    # set optimizer
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        if args.distributed:
            train_sampler.set_epoch(epoch)

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, classifier,
                                      criterion, optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_loss = validate(val_loader, model, classifier,
                                       criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_name = '{}_layer{}.pth'.format(args.model, args.layer)
                save_name = os.path.join(args.save_folder, save_name)
                print('saving model!')
                torch.save(state, save_name)

        # regular save
        if not args.multiprocessing_distributed or \
                (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            if epoch % args.save_freq == 0:
                print('==> Saving...')
                state = {
                    'epoch': epoch,
                    'classifier': classifier.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                save_file = os.path.join(
                    args.save_folder,
                    'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
                torch.save(state, save_file)

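
# adjust_learning_rate is assumed to be the usual step decay from the CMC
# code base; a minimal sketch under that assumption:

import numpy as np

def adjust_learning_rate(epoch, args, optimizer):
    # multiply the base lr by lr_decay_rate once per passed decay epoch
    steps = np.sum(epoch > np.asarray(args.lr_decay_epochs))
    if steps > 0:
        new_lr = args.learning_rate * (args.lr_decay_rate ** steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr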
Example #3
def main():

    # parse the args
    args = parse_option()

    # set the loader
    # change to this loader if testing on cifar as a baseline
    train_loader, n_data = get_train_loader(args)

    # set the model
    model, contrast, criterion_ab, criterion_l = set_model(args, n_data)

    # set the optimizer
    optimizer = set_optimizer(args, model)

    # set mixed precision
    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        l_loss, l_prob, ab_loss, ab_prob = train(epoch, train_loader, model,
                                                 contrast, criterion_l,
                                                 criterion_ab, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('l_loss', l_loss, epoch)
        logger.log_value('l_prob', l_prob, epoch)
        logger.log_value('ab_loss', ab_loss, epoch)
        logger.log_value('ab_prob', ab_prob, epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            if args.amp:
                state['amp'] = amp.state_dict()
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state

        torch.cuda.empty_cache()
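
# set_model and set_optimizer are factored out of this snippet; set_optimizer
# is presumably plain SGD over the full model, e.g. (a sketch, not the
# repository's verbatim code):

import torch

def set_optimizer(args, model):
    # SGD with the hyper-parameters parsed by parse_option()
    return torch.optim.SGD(model.parameters(),
                           lr=args.learning_rate,
                           momentum=args.momentum,
                           weight_decay=args.weight_decay)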
Example #4
def main():

    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_folder = os.path.join(args.data_folder, 'train')
    val_folder = os.path.join(args.data_folder, 'val')

    image_size = 224
    crop_padding = 32
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        raise NotImplementedError('augmentation not supported: {}'.format(args.aug))

    train_dataset = datasets.ImageFolder(train_folder, train_transform)
    val_dataset = datasets.ImageFolder(
        val_folder,
        transforms.Compose([
            transforms.Resize(image_size + crop_padding),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize,
        ]))

    print(len(train_dataset))
    train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    # create model and optimizer
    if args.model == 'resnet50':
        model = InsResNet50()
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 1)
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        classifier = LinearClassifierResNet(args.layer, args.n_label, 'avg', 4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    print('==> loading pre-trained model')
    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['model'])
    print("==> loaded checkpoint '{}' (epoch {})".format(
        args.model_path, ckpt['epoch']))
    print('==> done')

    model = model.cuda()
    classifier = classifier.cuda()

    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)

    if not args.adam:
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.learning_rate,
                                     betas=(args.beta1, args.beta2),
                                     weight_decay=args.weight_decay,
                                     eps=1e-8)

    model.eval()
    cudnn.benchmark = True

    # set mixed precision training
    # if args.amp:
    #     model = amp.initialize(model, opt_level=args.opt_level)
    #     classifier, optimizer = amp.initialize(classifier, optimizer, opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            best_acc1 = best_acc1.cuda()
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if 'opt' in checkpoint.keys():
                # resume optimization hyper-parameters
                print('=> resume hyper parameters')
                if 'bn' in vars(checkpoint['opt']):
                    print('using bn: ', checkpoint['opt'].bn)
                if 'adam' in vars(checkpoint['opt']):
                    print('using adam: ', checkpoint['opt'].adam)
                if 'cosine' in vars(checkpoint['opt']):
                    print('using cosine: ', checkpoint['opt'].cosine)
                args.learning_rate = checkpoint['opt'].learning_rate
                # args.lr_decay_epochs = checkpoint['opt'].lr_decay_epochs
                args.lr_decay_rate = checkpoint['opt'].lr_decay_rate
                args.momentum = checkpoint['opt'].momentum
                args.weight_decay = checkpoint['opt'].weight_decay
                args.beta1 = checkpoint['opt'].beta1
                args.beta2 = checkpoint['opt'].beta2
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # set cosine annealing scheduler
    if args.cosine:

        # last_epoch = args.start_epoch - 2
        # eta_min = args.learning_rate * (args.lr_decay_rate ** 3) * 0.1
        # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min, last_epoch)

        eta_min = args.learning_rate * (args.lr_decay_rate**3) * 0.1
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min, -1)
        # dummy loop to catch up with current epoch
        for i in range(1, args.start_epoch):
            scheduler.step()

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        if args.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)

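
# train and validate report top-1/top-5 accuracy; the standard helper (as in
# the PyTorch ImageNet example) looks like this -- a sketch, since it is not
# shown in this snippet:

import torch

def accuracy(output, target, topk=(1,)):
    # output: (batch, n_classes) logits; target: (batch,) class indices
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res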
Example #5
def main_worker(gpu, ngpus_per_node, opt):
    global best_acc, total_time
    opt.gpu = int(gpu)
    opt.gpu_id = int(gpu)

    if opt.gpu is not None:
        print("Use GPU: {} for training".format(opt.gpu))

    if opt.multiprocessing_distributed:
        # Only one node now.
        opt.rank = gpu
        dist_backend = 'nccl'
        dist.init_process_group(backend=dist_backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)
        opt.batch_size = int(opt.batch_size / ngpus_per_node)
        opt.num_workers = int(
            (opt.num_workers + ngpus_per_node - 1) / ngpus_per_node)

    if opt.deterministic:
        torch.manual_seed(12345)
        cudnn.deterministic = True
        cudnn.benchmark = False
        numpy.random.seed(12345)

    class_num_map = {
        'cifar100': 100,
        'imagenet': 1000,
        'imagenette': 10,
    }
    if opt.dataset not in class_num_map:
        raise NotImplementedError(opt.dataset)
    n_cls = class_num_map[opt.dataset]

    # model
    model_t = load_teacher(opt.path_t, n_cls, opt.gpu, opt)
    module_args = {'num_classes': n_cls}
    model_s = model_dict[opt.model_s](**module_args)

    if opt.dataset == 'cifar100':
        data = torch.randn(2, 3, 32, 32)
    else:
        # imagenet and imagenette both use 224x224 inputs here
        data = torch.randn(2, 3, 224, 224)

    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
        regress_s = ConvReg(feat_s[opt.hint_layer].shape,
                            feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'semckd':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]

        criterion_kd = SemCKDLoss()
        self_attention = SelfA(
            len(feat_s) - 2,
            len(feat_t) - 2, opt.batch_size, s_n, t_n)
        module_list.append(self_attention)
        trainable_list.append(self_attention)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = 50000
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'irg':
        criterion_kd = IRGLoss()
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'hkd':
        criterion_kd = HKDLoss(init_weight=opt.hkd_initial_weight,
                               decay=opt.hkd_decay)
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)])
        # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    module_list.append(model_t)

    if torch.cuda.is_available():
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if opt.multiprocessing_distributed:
            if opt.gpu is not None:
                torch.cuda.set_device(opt.gpu)
                module_list.cuda(opt.gpu)
                distributed_modules = []
                for module in module_list:
                    DDP = torch.nn.parallel.DistributedDataParallel
                    distributed_modules.append(
                        DDP(module, device_ids=[opt.gpu]))
                module_list = distributed_modules
                criterion_list.cuda(opt.gpu)
            else:
                print('multiprocessing_distributed must be used with a '
                      'specified gpu id')
        else:
            criterion_list.cuda()
            module_list.cuda()
        if not opt.deterministic:
            cudnn.benchmark = True

    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill in ['crd']:
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                k=opt.nce_k,
                mode=opt.mode)
        else:
            train_loader, val_loader = get_cifar100_dataloaders(
                batch_size=opt.batch_size, num_workers=opt.num_workers)
    elif opt.dataset in imagenet_list:
        if opt.dali is None:
            train_loader, val_loader, train_sampler = get_imagenet_dataloader(
                dataset=opt.dataset,
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                multiprocessing_distributed=opt.multiprocessing_distributed)
        else:
            train_loader, val_loader = get_dali_data_loader(opt)
    else:
        raise NotImplementedError(opt.dataset)

    if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
        logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    if not opt.skip_validation:
        # validate teacher accuracy
        teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)

        if opt.dali is not None:
            val_loader.reset()

        if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
            print('teacher accuracy: ', teacher_acc)
    else:
        print('Skipping teacher validation.')

    # routine
    for epoch in range(1, opt.epochs + 1):
        torch.cuda.empty_cache()
        if opt.multiprocessing_distributed:
            if opt.dali is None:
                train_sampler.set_epoch(epoch)

        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc_top5, train_loss, data_time = train(
            epoch, train_loader, module_list, criterion_list, optimizer, opt)
        time2 = time.time()

        if opt.multiprocessing_distributed:
            metrics = torch.tensor(
                [train_acc, train_acc_top5, train_loss,
                 data_time]).cuda(opt.gpu, non_blocking=True)
            reduced = reduce_tensor(
                metrics, opt.world_size if 'world_size' in opt else 1)
            train_acc, train_acc_top5, train_loss, data_time = reduced.tolist()

        if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
            print(
                ' * Epoch {}, GPU {}, Acc@1 {:.3f}, Acc@5 {:.3f}, Time {:.2f}, Data {:.2f}'
                .format(epoch, opt.gpu, train_acc, train_acc_top5,
                        time2 - time1, data_time))

            logger.log_value('train_acc', train_acc, epoch)
            logger.log_value('train_loss', train_loss, epoch)

        print('GPU %d validating' % (opt.gpu))
        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)

        if opt.dali is not None:
            train_loader.reset()
            val_loader.reset()

        if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
            print(' ** Acc@1 {:.3f}, Acc@5 {:.3f}'.format(
                test_acc, test_acc_top5))

            logger.log_value('test_acc', test_acc, epoch)
            logger.log_value('test_loss', test_loss, epoch)
            logger.log_value('test_acc_top5', test_acc_top5, epoch)

            # save the best model
            if test_acc > best_acc:
                best_acc = test_acc
                state = {
                    'epoch': epoch,
                    'model': model_s.state_dict(),
                    'best_acc': best_acc,
                }
                if opt.distill == 'semckd':
                    state['attention'] = trainable_list[-1].state_dict()
                save_file = os.path.join(opt.save_folder,
                                         '{}_best.pth'.format(opt.model_s))

                test_metrics = {
                    'test_loss': test_loss,
                    'test_acc': test_acc,
                    'test_acc_top5': test_acc_top5,
                    'epoch': epoch
                }

                save_dict_to_json(
                    test_metrics,
                    os.path.join(opt.save_folder, "test_best_metrics.json"))
                print('saving the best model!')
                torch.save(state, save_file)

    if not opt.multiprocessing_distributed or opt.rank % ngpus_per_node == 0:
        # This best accuracy is only for printing purposes.
        print('best accuracy:', best_acc)

        # save parameters
        save_state = {k: v for k, v in opt._get_kwargs()}
        # total number of parameters, in millions
        num_params = (sum(p.numel() for p in model_s.parameters()) / 1000000.0)
        save_state['Total params'] = num_params
        save_state['Total time'] = (time.time() - total_time) / 3600.0
        params_json_path = os.path.join(opt.save_folder, "parameters.json")
        save_dict_to_json(save_state, params_json_path)
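
# Of the distillation criteria above, the vanilla DistillKL is the simplest;
# assuming the usual Hinton-style definition (temperature-scaled KL, rescaled
# by T**2), a minimal sketch:

import torch.nn as nn
import torch.nn.functional as F

class DistillKL(nn.Module):
    def __init__(self, T):
        super(DistillKL, self).__init__()
        self.T = T  # softmax temperature

    def forward(self, y_s, y_t):
        # y_s: student logits, y_t: teacher logits
        p_s = F.log_softmax(y_s / self.T, dim=1)
        p_t = F.softmax(y_t / self.T, dim=1)
        # T**2 keeps gradient magnitudes comparable across temperatures
        return F.kl_div(p_s, p_t, reduction='batchmean') * (self.T ** 2)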
Example #6
def main():

    opt = parse_option()

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(ImageNet(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        train_loader = DataLoader(TieredImageNet(args=opt,
                                                 partition=train_partition,
                                                 transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']

        train_loader = DataLoader(CIFAR100(args=opt,
                                           partition=train_partition,
                                           transform=train_trans),
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model
    if not opt.load_latest:
        model = create_model(opt.model, n_cls, opt.dataset)
    else:
        latest_file = os.path.join(opt.save_folder, 'latest.pth')
        model = load_teacher(latest_file, n_cls, opt.dataset)

    # optimizer
    if opt.adam:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=opt.learning_rate,
                                     weight_decay=0.0005)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=opt.learning_rate,
                              momentum=opt.momentum,
                              weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        if opt.n_gpu > 1:
            model = nn.DataParallel(model)
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)

    # routine: supervised pre-training
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, criterion,
                                      optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
            state = {
                'epoch': epoch,
                'model': (model.state_dict()
                          if opt.n_gpu <= 1 else model.module.state_dict()),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # refresh the symlink; os.symlink fails if the target already exists
            latest_file = os.path.join(opt.save_folder, 'latest.pth')
            if os.path.lexists(latest_file):
                os.remove(latest_file)
            os.symlink(save_file, latest_file)

    # save the last model
    state = {
        'opt': opt,
        'model': (model.state_dict()
                  if opt.n_gpu <= 1 else model.module.state_dict()),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
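
# create_model is presumably a registry lookup in the same spirit as the
# model_dict used in Example #5; a sketch under that assumption (model_dict
# maps an architecture name to its constructor):

def create_model(name, n_cls, dataset='miniImageNet'):
    if name not in model_dict:
        raise NotImplementedError('model not supported: {}'.format(name))
    return model_dict[name](num_classes=n_cls)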
Example #7
def main():

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    data_folder = os.path.join(args.data_folder, 'train')

    image_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = transforms.Normalize(mean=mean, std=std)

    if args.aug == 'NULL':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    elif args.aug == 'CJ':
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_size, scale=(args.crop, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        raise NotImplementedError('augmentation not supported: {}'.format(args.aug))

    train_dataset = ImageFolderInstance(data_folder,
                                        transform=train_transform,
                                        two_crop=args.moco)
    print(len(train_dataset))
    train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    # create model and optimizer
    n_data = len(train_dataset)

    if args.model == 'resnet50':
        model = InsResNet50()
        if args.moco:
            model_ema = InsResNet50()
    elif args.model == 'resnet50x2':
        model = InsResNet50(width=2)
        if args.moco:
            model_ema = InsResNet50(width=2)
    elif args.model == 'resnet50x4':
        model = InsResNet50(width=4)
        if args.moco:
            model_ema = InsResNet50(width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.model))

    # copy weights from `model' to `model_ema'
    if args.moco:
        moment_update(model, model_ema, 0)

    # set the contrast memory and criterion
    if args.moco:
        contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t,
                              args.softmax).cuda(args.gpu)
    else:
        contrast = MemoryInsDis(128, n_data, args.nce_k, args.nce_t,
                                args.nce_m, args.softmax).cuda(args.gpu)

    criterion = NCESoftmaxLoss() if args.softmax else NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)

    model = model.cuda()
    if args.moco:
        model_ema = model_ema.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
        if args.moco:
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema,
                                                      optimizer_ema,
                                                      opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.moco:
                model_ema.load_state_dict(checkpoint['model_ema'])

            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])

            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        if args.moco:
            loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                    contrast, criterion, optimizer, args)
        else:
            loss, prob = train_ins(epoch, train_loader, model, contrast,
                                   criterion, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('ins_loss', loss, epoch)
        logger.log_value('ins_prob', prob, epoch)
        logger.log_value('learning_rate', optimizer.param_groups[0]['lr'],
                         epoch)

        # saving the model
        print('==> Saving...')
        state = {
            'opt': args,
            'model': model.state_dict(),
            'contrast': contrast.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
        }
        if args.moco:
            state['model_ema'] = model_ema.state_dict()
        if args.amp:
            state['amp'] = amp.state_dict()
        save_file = os.path.join(args.model_folder, 'current.pth')
        torch.save(state, save_file)
        if epoch % args.save_freq == 0:
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
        # help release GPU memory
        del state
        torch.cuda.empty_cache()
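
A note on the helpers used above: adjust_learning_rate is not shown in these
examples. A minimal step-decay sketch, assuming args carries learning_rate,
lr_decay_epochs and lr_decay_rate (hypothetical names mirroring CMC-style
training scripts):

import numpy as np

def adjust_learning_rate(epoch, args, optimizer):
    # decay the LR by lr_decay_rate at each milestone in lr_decay_epochs
    steps = int(np.sum(epoch > np.asarray(args.lr_decay_epochs)))
    if steps > 0:
        new_lr = args.learning_rate * (args.lr_decay_rate ** steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr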
Exemplo n.º 8
 def __init__(self, log_dir):
     """Create a summary writer logging to log_dir."""
     # self.writer = tf.summary.FileWriter(log_dir)
     self.logger = tb_logger.Logger(logdir=log_dir, flush_secs=2)
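
(For context: the tb_logger.Logger object exposes log_value(name, value, step),
as used throughout these examples, e.g. self.logger.log_value('loss', 0.42, step).)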
Exemplo n.º 9
def main():
    global best_acc1
    best_acc1 = 0

    args = parse_option()

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # set the data loader
    train_loader, val_loader, train_sampler = get_train_val_loader(args)

    # set the model
    model, classifier, criterion = set_model(args)

    # set optimizer
    optimizer = set_optimizer(args, classifier)

    cudnn.benchmark = True

    # optionally resume linear classifier
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            classifier.load_state_dict(checkpoint['classifier'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may come from a checkpoint trained on a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_acc5, train_loss = train(epoch, train_loader, model,
                                                  classifier, criterion,
                                                  optimizer, args)
        time2 = time.time()
        print('train epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
        test_acc, test_acc5, test_loss = validate(val_loader, model,
                                                  classifier, criterion, args)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc5', test_acc5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc1:
            best_acc1 = test_acc
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            save_name = '{}_layer{}.pth'.format(args.model, args.layer)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving best model!')
            torch.save(state, save_name)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'epoch': epoch,
                'classifier': classifier.state_dict(),
                'best_acc1': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_name = 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch)
            save_name = os.path.join(args.save_folder, save_name)
            print('saving regular model!')
            torch.save(state, save_name)

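
The train/validate helpers referenced above return top-1/top-5 accuracies; a
sketch of the standard PyTorch accuracy helper they presumably rely on (not
shown in the excerpt):

def accuracy(output, target, topk=(1,)):
    """Compute precision@k for the given values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res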
Exemplo n.º 10
def train(gpu=None):
    logs = {
        'train': tensorboard_logger.Logger(tb_path + "/train"),
        'prs': tensorboard_logger.Logger(tb_path + "/prs"),
        'spr': tensorboard_logger.Logger(tb_path + "/sp"),
        'r2': tensorboard_logger.Logger(tb_path + "/r2"),
    }

    db = Dataset(training_path, testing_path, post_map_path, feature_path, aux_path, attr_path, settings['min_images'])
    print('Training Attributes:', db.attr_names)

    model = neural_net(num_attributes=len(db.attr_inds), aux_size=len(db.aux_list))

    if resume_train is None:
        start_train = 0
    else:
        epochs_str = [el.split('_')[-1].split('.')[0] for el in glob.glob('log/' + resume_train + "/*.dat")]
        if 'model' in epochs_str:
            epochs_str.remove('model')
        last_epoch = np.max([int(el) for el in epochs_str])
        # last_epoch = np.max([int(el.split('_')[-1][0]) for el in glob.glob('log/' + resume_train + "/*.dat")])
        resume_path = 'log/' + resume_train + "/vgg_model_ep_" + str(last_epoch) + ".dat"
        start_train = last_epoch + 1
        if gpu is not None:
            model.load_state_dict(torch.load(resume_path, map_location='cuda:' + str(gpu)))
        else:
            model.load_state_dict(torch.load(resume_path, map_location=lambda storage, loc: storage))

    # Initializing PyTorch Dataloader
    dataloader = DataLoader(db, batch_size=settings['batch_size'], shuffle=True, num_workers=4)

    mr_loss = MarginRankingLoss(margin=0.3).to(gpu)

    optimizer = optim.Adadelta(model.parameters(), lr=settings['lr'], weight_decay=1e-5)

    model = model.to(gpu)

    step = 0
    for epoch in range(start_train, settings['num_epochs']):
        print('Epoch', epoch)
        pbar = tqdm(total=db.__len__())
        for i_batch, sample_batched in enumerate(dataloader):
            optimizer.zero_grad()
            image_1 = sample_batched['image_1'].type(torch.FloatTensor)
            image_2 = sample_batched['image_2'].type(torch.FloatTensor)

            aux_1 = sample_batched['label_1'].type(torch.FloatTensor).to(gpu)
            aux_2 = sample_batched['label_2'].type(torch.FloatTensor).to(gpu)

            # nn.MarginRankingLoss expects targets in {+1, -1}, so map the
            # boolean comparison accordingly
            gt = (aux_1 > aux_2).type(torch.FloatTensor) * 2 - 1

            reg_loss_1 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            reg_loss_2 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            ranking_loss = torch.zeros(image_1.shape[0], dtype=torch.float32)

            if gpu is not None:
                image_1 = image_1.to(gpu)
                image_2 = image_2.to(gpu)
                aux_1 = aux_1.to(gpu)
                aux_2 = aux_2.to(gpu)
                gt = gt.to(gpu)
                reg_loss_1 = reg_loss_1.to(gpu)
                reg_loss_2 = reg_loss_2.to(gpu)
                ranking_loss = ranking_loss.to(gpu)

            out_1 = model(image_1)
            out_2 = model(image_2)

            for i in range(len(db.attr_inds)):  # avg over attributes
                ranking_loss += mr_loss(out_1[i], out_2[i], gt[:, i])
            ranking_loss = ranking_loss / len(db.attr_inds)

            if fixed_std:
                p = [torch.distributions.normal.Normal(aux_1[:, i], 0.1) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], 0.1) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)
            else:
                p = [torch.distributions.normal.Normal(aux_1[:, i], model.aux_stds[sample_batched['aux_1'], i]) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], model.aux_stds[sample_batched['aux_2'], i]) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)

            ranking_loss = ranking_loss.mean()  # avg over batch
            reg_loss = reg_loss_1.mean() + reg_loss_2.mean()  # avg over batch

            loss = reg_loss + ranking_loss

            step += 1
            logs['train'].log_value('loss', loss.item(), step)

            loss.backward()
            optimizer.step()

            _loss = loss.item()
            pbar.update(image_1.shape[0])

        pbar.close()

        if epoch % 50 == 0:
            model.eval()
            test(model, db, gpu, logs=logs, step=step)
            model.train()
            persist_model(model, experiment_folder + '/vgg_model_ep_' + str(epoch) + '.dat')

    # Performing final evaluation
    model.eval()
    test(model, db, gpu)
    persist_model(model, model_path)
    return
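
As a reminder of the convention fixed above: nn.MarginRankingLoss(margin)(x1, x2, y)
computes mean(max(0, -y * (x1 - x2) + margin)) and expects y in {+1, -1}. A tiny
sanity check:

import torch
import torch.nn as nn

loss = nn.MarginRankingLoss(margin=0.3)
x1 = torch.tensor([1.0])
x2 = torch.tensor([0.0])
print(loss(x1, x2, torch.tensor([1.0])))   # 0.0  (correctly ranked by more than the margin)
print(loss(x1, x2, torch.tensor([-1.0])))  # 1.3  (mis-ranked: |x1 - x2| + margin)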
Exemplo n.º 11
    train_data = PlacesRoom(train=True)
    test_data = PlacesRoom(train=False)

    train_dataloader = DataLoader(train_data,
                                  batch_size=256,
                                  num_workers=6,
                                  shuffle=True,
                                  drop_last=True)
    val_dataloader = DataLoader(test_data,
                                batch_size=256,
                                num_workers=6,
                                shuffle=True,
                                drop_last=False)

    criterion = nn.CrossEntropyLoss().cuda()
    logger = tb_logger.Logger(logdir="linear_finetune", flush_secs=2)

    for epoch in range(60):
        # adjust_learning_rate(epoch, FLAGS, optimizer)

        print("==> training...")

        train_acc, train_acc5, train_loss = train(epoch, train_dataloader,
                                                  model, classifier, criterion,
                                                  optimizer)

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_acc5', train_acc5, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        print("==> testing...")
Exemplo n.º 12
def main():
    global args, best_prec1, use_cuda
    args = parser.parse_args()

    # Data loading code
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

    if args.augment:
        transform_train = transforms.Compose([
            transforms.ToTensor(),
            transforms.Lambda(lambda x: F.pad(x.unsqueeze(0), (4, 4, 4, 4),
                                              mode='reflect').squeeze()),
            transforms.ToPILImage(),
            transforms.RandomCrop(32),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])
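        # note: the pad-via-Lambda detour above is equivalent to using
        # transforms.RandomCrop(32, padding=4, padding_mode='reflect'),
        # which torchvision supports directly (assuming a recent torchvision)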
    else:
        transform_train = transforms.Compose([transforms.ToTensor()])
    transform_test = transforms.Compose([transforms.ToTensor()])

    kwargs = {'num_workers': 0, 'pin_memory': True}
    assert (args.dataset == 'cifar10' or args.dataset == 'cifar100'
            or args.dataset == 'svhn')

    if args.dataset == 'cifar10':
        nclasses = 10
        dataset_train = NCIFAR10('./data',
                                 train=True,
                                 transform=transform_train,
                                 normalize_transform=normalize)
        dataset_test = NCIFAR10('./data',
                                train=False,
                                transform=transform_test,
                                normalize_transform=normalize)
    else:
        raise RuntimeError('no other data set implementations available')
    '''
    elif args.dataset == 'cifar100':
        nclasses = 100
        dataset_train = NCIFAR100('./data', train=True, transform=transform_train,
                                  normalize_transform=normalize, download=True)
        dataset_test = NCIFAR100('./data', train=False, transform=transform_test,
                                 normalize_transform=normalize, download=True)
    elif args.dataset == 'svhn':
        nclasses = 10
        dataset_train = NSVHN('./data', split='train', transform=transform_train,
                              normalize_transform=normalize)
        dataset_test = NSVHN('./data', split='test', transform=transform_test,
                             normalize_transform=normalize)
    '''

    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    val_loader = torch.utils.data.DataLoader(dataset_test,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             **kwargs)

    # --------------------------------------------------------------------------------
    # create model
    if args.arch == 'resnet':
        experiment_dir = 'experiments/'
        output_dir = experiment_dir + 'resnet-cifar/'

        rnargs = {
            'use_pp1': args.pushpull,
            'pp_block1': args.pp_block1,
            'pp_all': args.pp_all,
            'train_alpha': args.train_alpha,
            'size_lpf': args.lpf_size
        }

        if args.layers == 20:
            model = resnet20(**rnargs)
        elif args.layers == 32:
            model = resnet32(**rnargs)
        elif args.layers == 44:
            model = resnet44(**rnargs)
        elif args.layers == 56:
            model = resnet56(**rnargs)
    elif args.arch == 'densenet':
        experiment_dir = 'experiments/'
        output_dir = experiment_dir + 'densenet-cifar/'
        rnargs = {
            'use_pp1': args.pushpull,
            'pp_block1': args.pp_block1,
            'num_classes': nclasses,
            'small_inputs': True,
            'efficient': args.efficient,
            'compression': args.reduce,
            'drop_rate': args.droprate,
            # 'scale_pp': args.scale_pp,
            # 'alpha_pp': args.alpha_pp
        }

        if args.layers == 40:
            model = densenet40_12(**rnargs)
        elif args.layers == 100:
            if args.growth == 12:
                model = densenet100_12(**rnargs)
            elif args.growth == 24:
                model = densenet100_24(**rnargs)
    else:
        raise RuntimeError('chosen architecture not implemented (yet)...')

    logger = None
    if args.tensorboard:
        ustr = datetime.datetime.now().strftime(
            "%y-%m-%d_%H-%M_") + uuid.uuid4().hex[:3]
        logger = tensorboard_logger.Logger(experiment_dir + "tensorboard/" +
                                           args.name + '/' + ustr)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    # --------------------------------------------------------------------------------

    use_cuda = torch.cuda.is_available()

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # model = torch.nn.DataParallel(model).cuda()
    if use_cuda:
        model = model.cuda()

    # optionally resume from a checkpoint
    epoch = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']

            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    if use_cuda:
        criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                nesterov=args.nesterov,
                                weight_decay=args.weight_decay)

    lr_milestones = json.loads(args.milestones)
    if args.extra_epochs > 0:
        lr_milestones = list(
            set(lr_milestones + json.loads(args.extra_milestones)))

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=lr_milestones,
                                                     gamma=0.1)
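    # advance the scheduler to the resumed epoch; on a fresh run epoch is None,
    # which performs a single initial step (pre-1.1 scheduler API)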
    scheduler.step(epoch)

    directory = output_dir + "%s/" % args.name
    if not os.path.exists(directory):
        os.makedirs(directory)

    for epoch in range(args.start_epoch, args.epochs + args.extra_epochs):
        fileout = open(output_dir + args.name + '/log.txt', "a+")
        # adjust_learning_rate(logger, optimizer, epoch + 1, args.epochs)
        scheduler.step()
        print('lr(', epoch, '): ', scheduler.get_lr())

        # train for one epoch
        train(logger, train_loader, model, criterion, optimizer, epoch,
              fileout)

        # evaluate on validation set
        prec1 = validate(logger, val_loader, model, criterion, epoch, fileout)
        fileout.close()

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, output_dir)

    print('Best accuracy: ', best_prec1)
    fileout = open(output_dir + args.name + '/log.txt', "a+")
    fileout.write('Best accuracy: {}\n'.format(best_prec1))
    fileout.close()
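
save_checkpoint is not shown in this excerpt; it most likely follows the
conventional PyTorch-example pattern, sketched here under that assumption:

import os
import shutil
import torch

def save_checkpoint(state, is_best, output_dir, filename='checkpoint.pth.tar'):
    # persist the latest state, and additionally copy it when it is the best
    path = os.path.join(output_dir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(output_dir, 'model_best.pth.tar'))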
Exemplo n.º 13
 def _setup_tensorboard_logger(self):
     if self.checkpoint_path:
         self._tensorboard_logger = tensorboard_logger.Logger(
             self.checkpoint_path, flush_secs=5)
     else:
         self._tensorboard_logger = None
Exemplo n.º 14
def main():

    args = parse_option()
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    # create model and optimizer
    # == dataset config==
    """
    CUDA_VISIBLE_DEVICES=0,1 python train_temporal_dis.py \
     --batch_size 16 --num_workers 8 --nce_k 3569 --softmax --moco
    """

    # args.dataset = 'hmdb51'
    # args.train_list = '../datasets/lists/hmdb51/hmdb51_rgb_train_split_1.txt'
    # args.val_list = '../datasets/lists/hmdb51/hmdb51_rgb_val_split_1.txt'
    """
    CUDA_VISIBLE_DEVICES=1 python train_temporal_dis.py \
     --batch_size 16 --num_workers 8 --nce_k 9536 --softmax --moco
    """
    # args.print_freq = 100
    # args.dataset = 'ucf101'
    # args.train_list = '../datasets/lists/ucf101/ucf101_rgb_train_split_1.txt'
    # args.val_list = '../datasets/lists/ucf101/ucf101_rgb_val_split_1.txt'

    # args.print_freq = 1000
    # args.dataset = 'kinetics'
    # args.train_list = '../datasets/lists/kinetics-400/ssd_kinetics_video_trainlist.txt'
    # args.val_list = '../datasets/lists/kinetics-400/ssd_kinetics_video_vallist.txt'

    args.dropout = 0.5
    args.clips = 1
    args.data_length = 16
    args.stride = 4
    args.spatial_size = 224
    args.root = ""
    args.mode = 'rgb'
    args.eval_indict = 'loss'
    args.pt_loss = 'TemporalDis'
    args.workers = 4
    # args.arch = 'i3d' # 'r2p1d'
    num_class, data_length, image_tmpl = data_config(args)
    train_transforms, test_transforms, eval_transforms = augmentation_config(
        args)
    train_loader, val_loader, eval_loader, train_samples, val_samples, eval_samples = data_loader_init(
        args, data_length, image_tmpl, train_transforms, test_transforms,
        eval_transforms)

    n_data = len(train_loader)
    if args.arch == 'i3d':
        model = I3D(num_classes=101,
                    modality=args.mode,
                    dropout_prob=args.dropout,
                    with_classifier=False)
        model_ema = I3D(num_classes=101,
                        modality=args.mode,
                        dropout_prob=args.dropout,
                        with_classifier=False)
    elif args.arch == 'r2p1d':
        model = R2Plus1DNet((1, 1, 1, 1),
                            num_classes=num_class,
                            with_classifier=False)
        model_ema = R2Plus1DNet((1, 1, 1, 1),
                                num_classes=num_class,
                                with_classifier=False)
    elif args.arch == 'r3d':
        from model.r3d import resnet18
        model = resnet18(num_classes=num_class, with_classifier=False)
        model_ema = resnet18(num_classes=num_class, with_classifier=False)
    else:
        raise NotImplementedError('architecture not supported: {}'.format(args.arch))
    model = torch.nn.DataParallel(model)
    model_ema = torch.nn.DataParallel(model_ema)
    # random initialization
    model.apply(weights_init)
    model_ema.apply(weights_init)
    # copy weights from `model' to `model_ema'
    moment_update(model, model_ema, 0)
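    # moment_update presumably performs, per parameter pair,
    #   p_ema = m * p_ema + (1 - m) * p_model,
    # so m=0 above is a hard copy, while the training loop later calls it
    # with an m close to 1 for the EMA update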
    contrast = MemoryMoCo(128, n_data, args.nce_k, args.nce_t,
                          args.softmax).cuda(args.gpu)
    # contrast2 = MemoryMoCo(128, n_data, args.nce_k, args.nce_t, args.softmax).cuda(args.gpu)
    criterion = NCESoftmaxLoss() if args.softmax else NCECriterion(n_data)
    criterion = criterion.cuda(args.gpu)
    cls_criterion = nn.CrossEntropyLoss().cuda()

    model = model.cuda()
    if args.moco:
        model_ema = model_ema.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.benchmark = True

    if args.amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
        if args.moco:
            optimizer_ema = torch.optim.SGD(model_ema.parameters(),
                                            lr=0,
                                            momentum=0,
                                            weight_decay=0)
            model_ema, optimizer_ema = amp.initialize(model_ema,
                                                      optimizer_ema,
                                                      opt_level=args.opt_level)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            # checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            if args.moco:
                model_ema.load_state_dict(checkpoint['model_ema'])

            if args.amp and checkpoint['opt'].amp:
                print('==> resuming amp state_dict')
                amp.load_state_dict(checkpoint['amp'])

            print("=> loaded successfully '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
    logger2 = tb_logger.Logger(logdir=args.tb_folder2, flush_secs=2)

    # ==== our data augmentation method ====
    pos_aug = GenPositive()
    neg_aug = GenNegative()

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        loss, prob = train_moco(epoch, train_loader, model, model_ema,
                                contrast, criterion, optimizer, args, pos_aug,
                                neg_aug)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))
        saving(logger, loss, epoch, optimizer, args, model, contrast, prob,
               model_ema, 'TemporalDis')
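
NCESoftmaxLoss is not defined in the excerpt; a minimal sketch of the usual
CMC/MoCo-style implementation it presumably matches (the positive sits at
index 0 of the (1 + K) logits produced by MemoryMoCo):

import torch
import torch.nn as nn

class NCESoftmaxLoss(nn.Module):
    """Softmax cross-entropy with the positive fixed at index 0."""

    def __init__(self):
        super(NCESoftmaxLoss, self).__init__()
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        bsz = x.shape[0]
        x = x.squeeze()
        label = torch.zeros([bsz], dtype=torch.long, device=x.device)
        return self.criterion(x, label)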
Exemplo n.º 15
 def add_tensorboard_output(self, file_name):
     import tensorboard_logger
     self._use_tensorboard = True
     self.tensorboard_logger = tensorboard_logger.Logger(file_name)
Exemplo n.º 16
def main():
    best_acc = 0

    opt = parse_option()

    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader
    train_partition = 'trainval' if opt.use_trainval else 'train'
    if opt.dataset == 'miniImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            train_set = ImageNet(args=opt,
                                 partition=train_partition,
                                 transform=train_trans,
                                 is_sample=True,
                                 k=opt.nce_k)
        else:
            train_set = ImageNet(args=opt,
                                 partition=train_partition,
                                 transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(ImageNet(args=opt,
                                         partition='val',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaImageNet(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaImageNet(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            n_cls = 64
    elif opt.dataset == 'tieredImageNet':
        train_trans, test_trans = transforms_options[opt.transform]
        if opt.distill in ['contrast']:
            train_set = TieredImageNet(args=opt,
                                       partition=train_partition,
                                       transform=train_trans,
                                       is_sample=True,
                                       k=opt.nce_k)
        else:
            train_set = TieredImageNet(args=opt,
                                       partition=train_partition,
                                       transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(TieredImageNet(args=opt,
                                               partition='train_phase_val',
                                               transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='test',
            train_transform=train_trans,
            test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaTieredImageNet(
            args=opt,
            partition='val',
            train_transform=train_trans,
            test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 448
        else:
            n_cls = 351
    elif opt.dataset == 'CIFAR-FS' or opt.dataset == 'FC100':
        train_trans, test_trans = transforms_options['D']
        if opt.distill in ['contrast']:
            train_set = CIFAR100(args=opt,
                                 partition=train_partition,
                                 transform=train_trans,
                                 is_sample=True,
                                 k=opt.nce_k)
        else:
            train_set = CIFAR100(args=opt,
                                 partition=train_partition,
                                 transform=train_trans)
        n_data = len(train_set)
        train_loader = DataLoader(train_set,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(CIFAR100(args=opt,
                                         partition='train',
                                         transform=test_trans),
                                batch_size=opt.batch_size // 2,
                                shuffle=False,
                                drop_last=False,
                                num_workers=opt.num_workers // 2)
        meta_testloader = DataLoader(MetaCIFAR100(args=opt,
                                                  partition='test',
                                                  train_transform=train_trans,
                                                  test_transform=test_trans),
                                     batch_size=opt.test_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     num_workers=opt.num_workers)
        meta_valloader = DataLoader(MetaCIFAR100(args=opt,
                                                 partition='val',
                                                 train_transform=train_trans,
                                                 test_transform=test_trans),
                                    batch_size=opt.test_batch_size,
                                    shuffle=False,
                                    drop_last=False,
                                    num_workers=opt.num_workers)
        if opt.use_trainval:
            n_cls = 80
        else:
            if opt.dataset == 'CIFAR-FS':
                n_cls = 64
            elif opt.dataset == 'FC100':
                n_cls = 60
            else:
                raise NotImplementedError('dataset not supported: {}'.format(
                    opt.dataset))
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model_t = load_teacher(opt.path_t, n_cls, opt.dataset)
    model_s = create_model(opt.model_s, n_cls, opt.dataset)

    data = torch.randn(2, 3, 84, 84)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'contrast':
        criterion_kd = NCELoss(opt, n_data)
        embed_s = Embed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = Embed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)

    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # set cosine annealing scheduler
    if opt.cosine:
        eta_min = opt.learning_rate * (opt.lr_decay_rate**3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, opt.epochs, eta_min, -1)
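        # note: scheduler.step() is invoked at the top of each epoch below,
        # matching the pre-1.1 PyTorch convention; on >=1.1 it should follow
        # the optimizer updates to avoid skipping the first LR value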

    # routine: supervised model distillation
    for epoch in range(1, opt.epochs + 1):

        if opt.cosine:
            scheduler.step()
        else:
            adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, module_list,
                                      criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # save the last model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
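
DistillKL is not shown here; a hedged sketch of the usual RepDistiller-style
implementation (temperature-scaled KL divergence between student and teacher
logits):

import torch.nn as nn
import torch.nn.functional as F

class DistillKL(nn.Module):
    """KL divergence for distillation, with temperature T."""

    def __init__(self, T):
        super(DistillKL, self).__init__()
        self.T = T

    def forward(self, y_s, y_t):
        p_s = F.log_softmax(y_s / self.T, dim=1)
        p_t = F.softmax(y_t / self.T, dim=1)
        # the T**2 factor keeps gradient magnitudes comparable across temperatures
        return F.kl_div(p_s, p_t, reduction='batchmean') * (self.T ** 2)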
Exemplo n.º 17
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1  # stliu: best accuracy
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    # stliu: add resnet_ttt
    if args.arch == 'resnet_ttt':
        model = moco.builder.MoCo(ResNetCifar,
                                  args.moco_dim,
                                  args.moco_k,
                                  args.moco_m,
                                  args.moco_t,
                                  args.mlp,
                                  width=args.width,
                                  norm=args.norm)
        _, ext, head, ssh = build_model(
            args, model.encoder_q
        )  # stliu: ext, head and ssh share same paras as encoder_q
        # stliu: SVM with model_val on single GPU
        norm_layer = get_norm(args.norm)
        model_val = ResNetCifar(num_classes=args.moco_dim,
                                width=args.width,
                                norm_layer=norm_layer)
    else:
        model = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                                  args.moco_k, args.moco_m, args.moco_t,
                                  args.mlp)
    # print(model) # stliu: comment this

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model_val.cuda(args.gpu)  # stliu: for SVM
            ssh = ssh.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            # stliu: add broadcast_buffers=False to use normal BN
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.gpu],
                broadcast_buffers=False,
                find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh,
                device_ids=[args.gpu],
                broadcast_buffers=False,
                find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh, device_ids=[args.gpu])
        else:
            model.cuda()
            model_val.cuda()  # stliu: for SVM
            ssh = ssh.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(
                model, broadcast_buffers=False, find_unused_parameters=True)
            ssh = torch.nn.parallel.DistributedDataParallel(
                ssh, broadcast_buffers=False, find_unused_parameters=True)
            # model = torch.nn.parallel.DistributedDataParallel(model)
            # ssh = torch.nn.parallel.DistributedDataParallel(ssh)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        model_val = model_val.cuda(args.gpu)  # stliu: for SVM
        ssh = ssh.cuda(args.gpu)
        # comment out the following raise to debug on a single GPU
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    parameters = list(model.parameters()) + list(head.parameters())
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            head.load_state_dict(checkpoint['head'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # stliu: I design it as a function
    train_loader, train_sampler, memory_loader, test_loader, teset = get_loader(
        args)

    if args.val:
        state_dict = model.state_dict()
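        # strip the DDP/MoCo prefixes: copy each 'module.encoder_q.*' weight
        # (except the projection fc) to a bare backbone key, then drop the
        # original key so model_val can load the query encoder alone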
        for k in list(state_dict.keys()):
            if k.startswith('module.encoder_q'
                            ) and not k.startswith('module.encoder_q.fc'):
                state_dict[k[len("module.encoder_q."):]] = state_dict[k]
            del state_dict[k]
        model_val.load_state_dict(state_dict, strict=False)
        flag_liblinear = '-s 2 -q -n ' + str(args.workers)
        if args.ttt:
            test_acc_svm = ttt_test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh,
                                    teset, head)
        else:
            test_acc_svm = test(memory_loader, model, model_val, test_loader,
                                flag_liblinear, args, ssh)
        print('#### result ####\n' + args.val + ':', test_acc_svm,
              '\n################')
    else:
        # stliu: tensorboard
        logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

        for epoch in range(args.start_epoch,
                           args.epochs + 1):  # stliu: to save the last one
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            loss = train(train_loader, model, criterion, optimizer, epoch,
                         args, ssh)

            # stliu: tensorboard logger
            logger.log_value('loss', loss, epoch)

            if not args.multiprocessing_distributed or (
                    args.multiprocessing_distributed
                    and args.rank % ngpus_per_node == 0):
                if (epoch % args.save_freq == 0 and epoch != 0
                    ) or epoch == args.epochs:  # stliu: ignore the first model
                    print('==> Saving...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=False,
                        filename=args.model_folder +
                        '/checkpoint_{:04d}.pth.tar'.format(epoch))
            # stliu: test with SVM
            if (epoch + 1) % args.svm_freq == 0:
                state_dict = model.state_dict()
                for k in list(state_dict.keys()):
                    if k.startswith('module.encoder_q') and not k.startswith(
                            'module.encoder_q.fc'):
                        state_dict[
                            k[len("module.encoder_q."):]] = state_dict[k]
                    del state_dict[k]
                model_val.load_state_dict(state_dict, strict=False)
                flag_liblinear = '-s 2 -q -n ' + str(args.workers)
                test_acc_svm = test(memory_loader, model, model_val,
                                    test_loader, flag_liblinear, args, ssh)

                # stliu: save the best model
                is_best = test_acc_svm > best_acc1
                best_acc1 = max(test_acc_svm, best_acc1)
                if is_best:
                    print('==> Saving the Best...')
                    save_checkpoint(
                        {
                            'epoch': epoch + 1,
                            'arch': args.arch,
                            'state_dict': model.state_dict(),
                            'head': head.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        },
                        is_best=True,
                        filename=args.model_folder + '/best.pth.tar')
        print('The Best SVM Accuracy:', best_acc1)
Exemplo n.º 18
def main():
    best_acc = 0

    opt = parse_option()

    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill in ['crd']:
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(batch_size=opt.batch_size,
                                                                               num_workers=opt.num_workers,
                                                                               k=opt.nce_k,
                                                                               mode=opt.mode)
        else:
            train_loader, val_loader, n_data = get_cifar100_dataloaders(batch_size=opt.batch_size,
                                                                        num_workers=opt.num_workers,
                                                                        n_test=100,
                                                                        is_instance=True)
        n_cls = 100
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model_t = load_teacher(opt.path_t, n_cls)
    model_s = model_dict[opt.model_s](num_classes=n_cls)

    data = torch.randn(2, 3, 32, 32)
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
        regress_s = ConvReg(feat_s[opt.hint_layer].shape, feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = n_data
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'nst':
        criterion_kd = NSTLoss()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'kdsvd':
        criterion_kd = KDSVD()
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)]
        )
        # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    elif opt.distill == 'abound':
        s_shapes = [f.shape for f in feat_s[1:-1]]
        t_shapes = [f.shape for f in feat_t[1:-1]]
        connector = Connector(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(connector)
        init_trainable_list.append(model_s.get_feat_modules())
        criterion_kd = ABLoss(len(feat_s[1:-1]))
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader, logger, opt)
        # classification
        module_list.append(connector)
    elif opt.distill == 'factor':
        s_shape = feat_s[-2].shape
        t_shape = feat_t[-2].shape
        paraphraser = Paraphraser(t_shape)
        translator = Translator(s_shape, t_shape)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(paraphraser)
        criterion_init = nn.MSELoss()
        init(model_s, model_t, init_trainable_list, criterion_init, train_loader, logger, opt)
        # classification
        criterion_kd = FactorTransfer()
        module_list.append(translator)
        module_list.append(paraphraser)
        trainable_list.append(translator)
    elif opt.distill == 'fsp':
        s_shapes = [s.shape for s in feat_s[:-1]]
        t_shapes = [t.shape for t in feat_t[:-1]]
        criterion_kd = FSP(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(model_s.get_feat_modules())
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader, logger, opt)
        # classification training
        pass
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)

    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # embed the watermark into teacher model
    wm_loader = None
    if opt.watermark == "usenix":
        # paper: https://www.usenix.org/system/files/conference/usenixsecurity18/sec18-adi.pdf

        # Define an optimizer for the teacher
        trainable_list_t = nn.ModuleList([model_t])
        optimizer_t = optim.SGD(trainable_list_t.parameters(),
                                lr=opt.learning_rate,
                                momentum=opt.momentum,
                                weight_decay=opt.weight_decay)

        wm_loader = get_usenixwm_dataloader()

        print("## Train data + Watermark Val Acc")
        teacher_acc, _, _ = validate(val_loader, model_t, nn.CrossEntropyLoss(), opt)

        print("==> embedding USENIX watermark...")

        max_epochs = 250  # Cutoff val
        for epoch in range(1, max_epochs + 1):
            set_learning_rate(2e-4, optimizer_t)
            top1, top5 = train_vanilla(epoch, wm_loader, model_t, nn.CrossEntropyLoss(), optimizer_t, opt)
            if top1 >= 97:
                break

        print("## Watermark val")
        teacher_acc, _, _ = validate(wm_loader, model_t, nn.CrossEntropyLoss(), opt)
        print("==> done")

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # If teacher and student models match, copy over weights for initialization
    if (opt.init_strat == "noise") and (type(model_t) == type(model_s)):
        print("==> Copying teachers weights to student with a weight of {}".format(opt.init_inv_corr))
        model_s.load_state_dict(model_t.state_dict())

        with torch.no_grad():
            for param in model_s.parameters():
                if torch.cuda.is_available():
                    noise = (torch.randn(param.size()) * opt.init_inv_corr).cuda()
                    param.add_(noise)
                else:
                    param.add_(torch.randn(param.size()) * opt.init_inv_corr)

        student_acc, _, _ = validate(val_loader, model_s, criterion_cls, opt)
        print('student accuracy: ', student_acc)
        print("==> done")

    # routine
    for epoch in range(1, opt.epochs + 1):

        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, module_list, criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s, criterion_cls, opt)

        if wm_loader is not None:
            print("==> wm retention")
            wm_top1, wm_top5, wm_loss = validate(wm_loader, model_s, criterion_cls, opt)
            logger.log_value('wm_ret', wm_top1, epoch)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)

        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'best_acc': best_acc,
            }
            save_file = os.path.join(opt.save_folder, '{}_best.pth'.format(opt.model_s))
            print('saving the best model!')
            torch.save(state, save_file)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'accuracy': test_acc,
            }
            save_file = os.path.join(opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # This best accuracy is only for printing purposes.
    # The results reported in the paper/README are from the last epoch.
    print('best accuracy:', best_acc)

    # save model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
Example No. 19
def main():

    # parse the args
    args = parse_option()

    # set the loader
    train_loader, n_data = get_train_loader(args)

    # set the model
    model, contrast, criterion_ab, criterion_l = set_model(args, n_data)

    # set the optimizer
    optimizer = set_optimizer(args, model)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):

        adjust_learning_rate(epoch, args, optimizer)
        print("==> training...")

        time1 = time.time()
        l_loss, l_prob, ab_loss, ab_prob = train(epoch, train_loader, model,
                                                 contrast, criterion_l,
                                                 criterion_ab, optimizer, args)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('l_loss', l_loss, epoch)
        logger.log_value('l_prob', l_prob, epoch)
        logger.log_value('ab_loss', ab_loss, epoch)
        logger.log_value('ab_prob', ab_prob, epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'contrast': contrast.state_dict(),
                'epoch': epoch,
            }
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

Example No. 20
def main():
    best_acc = 0

    opt = parse_option()

    # dataloader
    if opt.dataset == 'cifar100':
        train_loader, val_loader = get_cifar100_dataloaders(
            batch_size=opt.batch_size, num_workers=opt.num_workers)
        n_cls = 100
    else:
        raise NotImplementedError(opt.dataset)

    # model
    # model = model_dict[opt.model](num_classes=n_cls)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = model_dict[opt.model](num_classes=n_cls,
                                  grad_proj=True,
                                  device=device)

    # optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # routine
    for epoch in range(1, opt.epochs + 1):

        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss = train(epoch, train_loader, model, criterion,
                                      optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     '{}_best.pth'.format(opt.model))
            print('saving the best model!')
            torch.save(state, save_file)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model.state_dict(),
                'accuracy': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # This best accuracy is only for printing purposes.
    # The results reported in the paper/README are from the last epoch.
    print('best accuracy:', best_acc)

    # save model
    state = {
        'opt': opt,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
Example No. 21
def main():
    best_acc = 0

    opt = parse_option()
    print('method: %s' % 'Ours')
    print('model: %s' % opt.model)
    print('architecture: %s' % opt.arch)
    print('weight_decay %f' % opt.weight_decay)
    print('batch_size: %d' % opt.batch_size)
    print('r %f' % opt.gamma)
    #print('a %f' %opt.alpha)
    print('e %f' % opt.eta)
    print('KD_T %f' % opt.kd_T)

    # dataloader
    if opt.dataset == 'cifar100':
        n_cls = 100
    else:
        raise NotImplementedError(opt.dataset)

    # model
    model = model_dict[opt.model](num_classes=n_cls)

    # optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    criterion = nn.CrossEntropyLoss()

    criterion_cls = nn.CrossEntropyLoss()

    criterion_soft = Softmax_T(opt.kd_T)

    criterion_kl = KL(opt.kd_T)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_soft)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kl)  # other knowledge distillation loss

    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # tensorboard
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # routine
    #targets=get_traintarget()

    train_logits = torch.zeros((50000, 100))
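    # one 100-dim logit vector per CIFAR-100 training image (50,000 in total),
    # filled in during training and re-used as soft labels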

    train_loader, val_loader = get_cifar100_dataloaders(
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        is_instance=False,
        is_shuffle=True,
        is_soft=True,
        train_softlabels=train_logits)

    for epoch in range(1, opt.epochs + 1):

        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
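        # first opt.eta epochs: plain cross-entropy training while the logits
        # are recorded (train1); afterwards: also train against the recorded
        # soft labels (train2)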
        if epoch <= opt.eta:
            train_acc, train_loss, train_logits = train1(
                epoch, train_loader, model, criterion, optimizer, opt,
                train_logits)
        else:
            train_acc, train_loss, train_logits = train2(
                epoch, train_loader, model, criterion_list, optimizer, opt,
                train_logits)

        train_logits = train_logits.detach().cpu()

        if epoch >= opt.eta and epoch % opt.eta == 0:
            print('label update')
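            # rebuild the dataloaders so the freshly detached logits are served
            # as soft labels from the next epoch on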
            train_loader, val_loader = get_cifar100_dataloaders(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                is_instance=False,
                is_shuffle=True,
                is_soft=True,
                train_softlabels=train_logits)

        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model,
                                                      criterion, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)
        logger.log_value('test_loss', test_loss, epoch)

        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_file = os.path.join(opt.save_folder,
                                     '{}_best.pth'.format(opt.model))
            print('saving the best model!')
            torch.save(state, save_file)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model.state_dict(),
                'accuracy': test_acc,
                'optimizer': optimizer.state_dict(),
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # This best accuracy is only for printing purposes.
    # The results reported in the paper/README are from the last epoch.
    print('best accuracy:', best_acc)

    # save model
    state = {
        'opt': opt,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model))
    torch.save(state, save_file)
Example No. 22
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
        
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass
        
    if args.dist_url == "env://" and args.rank == -1:
        args.rank = int(os.environ["RANK"])
    if args.multiprocessing_distributed:
        # For multiprocessing distributed training, rank needs to be the
        # global rank among all the processes
        args.rank = args.rank * ngpus_per_node + gpu
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                            world_size=args.world_size, rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'resnet50':
        model = Model(resnet50, args, width=1)
    elif args.arch == 'resnet50x2':
        model = Model(resnet50, args, width=2)
    elif args.arch == 'resnet50x4':
        model = Model(resnet50, args, width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.arch))
            
    
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        # When using a single GPU per process and per
        # DistributedDataParallel, we need to divide the batch size
        # ourselves based on the total number of GPUs we have
        args.batch_size = int(args.batch_size / ngpus_per_node)
        args.batch_size_u = int(args.batch_size_u / ngpus_per_node)
        args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) #find_unused_parameters=True
    else:
        model.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criteria_x = nn.CrossEntropyLoss().cuda(args.gpu)    
    
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=True
                               )

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    
    print("=> preparing dataset")
    # Data loading code         
    
    transform_strong = transforms.Compose([
            transforms.RandomResizedCrop(96, scale=(0.2, 1.)),                
            transforms.RandomApply([
                transforms.ColorJitter(0.4, 0.4, 0.4, 0.1) 
            ], p=0.8),
            transforms.RandomGrayscale(p=0.2),                         
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
        ])
    transform_weak = transforms.Compose([
            transforms.RandomResizedCrop(96, scale=(0.2, 1.)),                                     
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
        ])    
    transform_eval = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
        ])
    
    three_crops_transform = loader.ThreeCropsTransform(transform_weak, transform_strong, transform_strong)
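    # each unlabeled image yields three views: one weakly and two strongly
    # augmented crops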

    unlabeled_dataset = CustomDataset(args.data, 'unlabeled', transform=three_crops_transform)

    labeled_dataset = CustomDataset(args.data, 'train', transform=transform_weak)

    #labeled_sampler = torch.utils.data.distributed.DistributedSampler(labeled_dataset)
    labeled_loader = torch.utils.data.DataLoader(
        labeled_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)   
 
    #unlabeled_sampler = torch.utils.data.distributed.DistributedSampler(unlabeled_dataset)
    unlabeled_loader = torch.utils.data.DataLoader(
        unlabeled_dataset, batch_size=int(args.batch_size_u), shuffle=True,
        num_workers=args.workers, pin_memory=True)
    
    val_loader = torch.utils.data.DataLoader(CustomDataset('/content/dataset', 'val', transform_eval),
        batch_size=64, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    
    # create loggers
    if args.gpu == 0:
        tb_logger = tensorboard_logger.Logger(logdir=os.path.join(args.exp_dir,'tensorboard'), flush_secs=2)
        logger = setup_default_logging(args)
        logger.info(dict(args._get_kwargs()))        
    else:
        tb_logger = None
        logger = None

    for epoch in range(args.start_epoch, args.epochs):
        if epoch == 0:
            args.m = 0.99  # larger update in first epoch
        else:
            args.m = args.moco_m
        # args.lr=0.01
        adjust_learning_rate(optimizer, epoch, args)

        train(labeled_loader, unlabeled_loader, model, criteria_x, optimizer, epoch, args, logger, tb_logger)

        # evaluate on validation set
        acc1 = validate(val_loader, model, args, logger, tb_logger, epoch)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'args': args,
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer' : optimizer.state_dict()
            },filename='{}/checkpoint_{:04d}.pt'.format(args.exp_dir,epoch))
    
    # evaluate ema model
    acc1 = validate(val_loader, model, args, logger, tb_logger, -1)
Example No. 23
def TrainModels(learners,
                train_loader,
                val_loader,
                train_settings,
                out_dir,
                cuda_device_id=0,
                batch_use_prob=1.0,
                print_log=True,
                log_dir=''):
    logger = None
    if log_dir != '':
        logger = tensorboard_logger.Logger(log_dir, flush_secs=5)

    train_log = []
    min_validation_losses = [float('inf') for _ in learners]
    min_validation_loss = float('inf')
    num_inputs = len(learners[0].net.InputNames())
    num_labels = len(learners[0].net.LabelNames())
    for epoch in range(train_settings.epochs):
        running_losses = [0.0 for _ in learners]
        train_examples_per_net = [0 for _ in learners]
        for learner in learners:
            learner.optimizer.zero_grad()

        epoch_start_time = time.time()
        for training_batch in train_loader:
            input_vars, label_vars = DataBatchToVariables(
                training_batch[:-1], num_inputs, num_labels, cuda_device_id)
            example_indices = torch.squeeze(training_batch[-1], dim=1)

            for net_idx, learner in enumerate(learners):
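                # bagging-style: each learner trains on a random subset of the
                # batches, controlled by batch_use_prob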
                if random.uniform(0.0, 1.0) < batch_use_prob:
                    # Read the weights.
                    example_weights = MakeWeightsVariable(
                        learner.weighter, example_indices, cuda_device_id)

                    # forward + backward + optimize
                    outputs = learner.net(input_vars)
                    loss_value_per_example = train_settings.loss(
                        outputs, label_vars)

                    # TODO track unweighted losses separately.
                    loss_value = torch.mean(
                        torch.mul(loss_value_per_example, example_weights))
                    loss_value.backward()
                    learner.optimizer.step()
                    learner.optimizer.zero_grad()

                    # Update weights using loss_value_per_example.
                    learner.weighter.RegisterLosses(
                        example_indices,
                        loss_value_per_example.cpu().data.numpy())

                    # Accumulate statistics
                    batch_size = input_vars[0].size()[0]
                    train_examples_per_net[net_idx] += batch_size
                    running_losses[net_idx] += loss_value.item() * batch_size
        epoch_end_time = time.time()

        epoch_duration = epoch_end_time - epoch_start_time
        examples_per_sec = sum(train_examples_per_net) / epoch_duration
        avg_losses = AverageLosses(running_losses, train_examples_per_net)
        avg_loss = sum(running_losses) / sum(train_examples_per_net)

        validation_total_losses = [0.0 for _ in learners]
        validation_examples = [0 for _ in learners]

        for learner in learners:
            learner.weighter.Step()
            learner.net.eval()

        for val_batch in val_loader:
            input_vars, label_vars = DataBatchToVariables(
                val_batch[:-1], num_inputs, num_labels, cuda_device_id)

            for net_idx, learner in enumerate(learners):
                outputs = learner.net(input_vars)
                loss_value_per_example = train_settings.loss(
                    outputs, label_vars)
                loss_value = torch.mean(loss_value_per_example)

                batch_size = input_vars[0].size()[0]
                validation_examples[net_idx] += batch_size
                validation_total_losses[net_idx] += loss_value.item() * batch_size

        for learner in learners:
            learner.net.train()

        validation_avg_losses = AverageLosses(validation_total_losses,
                                              validation_examples)
        validation_avg_loss = (sum(validation_total_losses) /
                               sum(validation_examples))

        epoch_metrics = {
            TRAIN_LOSS: avg_loss,
            VAL_LOSS: validation_avg_loss,
            EPOCH_DURATION_SEC: epoch_duration,
            EXAMPLES_PER_SEC: examples_per_sec
        }
        train_log.append(epoch_metrics)

        val_improved_marker = ''
        if validation_avg_loss < min_validation_loss:
            val_improved_marker = ' ***'
            min_validation_loss = validation_avg_loss
        elif (validation_avg_loss * 0.9) < min_validation_loss:
            # Highlight epochs with validation loss almost as good as the current
            # best loss.
            val_improved_marker = ' *'

        for net_idx, learner in enumerate(learners):
            if learner.lr_scheduler is not None:
                learner.lr_scheduler.step(validation_avg_losses[net_idx])
            if validation_avg_losses[net_idx] < min_validation_losses[net_idx]:
                learner.net.cpu()
                torch.save(
                    learner.net.state_dict(),
                    io_helpers.ModelFileName(out_dir, net_idx,
                                             io_helpers.BEST))
                learner.net.cuda(cuda_device_id)
                min_validation_losses[net_idx] = validation_avg_losses[net_idx]

        # Maybe print metrics to screen.
        if print_log:
            print('Epoch %d;  %s%s' %
                  (epoch, TrainLogEventToString(epoch_metrics),
                   val_improved_marker))
        # Maybe log metrics to tensorboard.
        if log_dir != '':
            logger.log_value('train_loss', avg_loss, epoch)
            logger.log_value('val_loss', validation_avg_loss, epoch)

    for net_idx, learner in enumerate(learners):
        learner.net.cpu()
        torch.save(learner.net.state_dict(),
                   io_helpers.ModelFileName(out_dir, net_idx, io_helpers.LAST))
        learner.net.cuda(cuda_device_id)

    return train_log
Example No. 24
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch == 'resnet50':
        model = MoPro(resnet50, args, width=1)
    elif args.arch == 'resnet50x2':
        model = MoPro(resnet50, args, width=2)
    elif args.arch == 'resnet50x4':
        model = MoPro(resnet50, args, width=4)
    else:
        raise NotImplementedError('model not supported {}'.format(args.arch))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True

    # Data loading code
    loader = dataloader.webvision_dataloader(batch_size=args.batch_size,
                                             num_workers=args.workers,
                                             root_dir=args.data,
                                             num_class=args.num_class,
                                             distributed=args.distributed)
    train_loader, test_loader, _ = loader.run()

    if args.gpu == 0:
        logger = tb_logger.Logger(logdir=os.path.join(args.exp_dir,
                                                      'tensorboard'),
                                  flush_secs=2)
    else:
        logger = None

    for epoch in range(args.start_epoch, args.epochs):

        if args.distributed:
            loader.train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        train(train_loader, model, criterion, optimizer, epoch, args, logger)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='{}/checkpoint_{:04d}.pth.tar'.format(
                    args.exp_dir, epoch))
        test(model, test_loader, args, epoch, logger)
Example No. 25
model = model.cuda()
''' 
vocab, vec = torchwordemb.load_word2vec_bin("../dataset/GoogleNews-vectors-negative300.bin")
text_field.vocab.set_vectors(vocab, vec, EMBEDDING_DIM)
'''
text_field.vocab.load_vectors('glove.6B.300d')

model.word_embeddings.weight.data = text_field.vocab.vectors.cuda()
model.word_embeddings.weight.requires_grad = False

loss_function = nn.NLLLoss()
update_parameter = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(update_parameter, lr=1e-3)

os.system('rm -rf ' + RESULT_PATH)
train_logger = tensorboard_logger.Logger(RESULT_PATH + "/summaries/train/")
dev_logger = tensorboard_logger.Logger(RESULT_PATH + '/summaries/dev/')

no_up = 0
start_time = time.time()
for i in range(EPOCH):
    print('epoch: %d start!' % i)
    train_epoch(model, train_iter, loss_function, optimizer, text_field,
                label_field, i, train_logger)
    print('now best dev acc:', best_dev_acc)
    dev_acc = evaluate(model, dev_iter, loss_function, i, dev_logger, 'dev')
    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        os.system('rm ' + RESULT_PATH +
                  '/best_models/mr_best_model_minibatch_acc_*.model')
        os.system('mkdir ' + RESULT_PATH + '/best_models/')
Example No. 26
def main():
    best_acc = 0

    opt = parse_option()

    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill in ['crd']:
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                k=opt.nce_k,
                mode=opt.mode)
        else:
            train_loader, val_loader, n_data = get_cifar100_dataloaders(
                batch_size=opt.batch_size,
                num_workers=opt.num_workers,
                is_instance=True)
        n_cls = 100
    else:
        raise NotImplementedError(opt.dataset)

    # model
    arch_t = opt.model_t
    arch_s = opt.model_s
    model_t = load_teacher(opt.path_t, n_cls)
    model_s = model_dict[opt.model_s](num_classes=n_cls)

    data = torch.randn(2, 3, 32, 32)
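    # dummy batch, used only to probe the intermediate feature shapes of both
    # networks for the distillation modules below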
    model_t.eval()
    model_s.eval()
    feat_t, _ = model_t(data, is_feat=True)
    feat_s, _ = model_s(data, is_feat=True)

    # print(arch_t)
    # for ft in feat_t:
    #     print(ft.shape)
    # print(arch_s)
    # for fs in feat_s:
    #     print(fs.shape)
    # return

    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()
        regress_s = ConvReg(feat_s[opt.hint_layer].shape,
                            feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = n_data
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'nst':
        criterion_kd = NSTLoss()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'irg':
        criterion_kd = IRGLoss()
    elif opt.distill == 'hkd':
        criterion_kd = HKDLoss(init_weight=opt.hkd_initial_weight,
                               decay=opt.hkd_decay)
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'kdsvd':
        criterion_kd = KDSVD()
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)])
        # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    elif opt.distill == 'abound':
        s_shapes = [f.shape for f in feat_s[1:-1]]
        t_shapes = [f.shape for f in feat_t[1:-1]]
        connector = Connector(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(connector)
        init_trainable_list.append(model_s.get_feat_modules())
        criterion_kd = ABLoss(len(feat_s[1:-1]))
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader,
             logger, opt)
        # classification
        module_list.append(connector)
    elif opt.distill == 'factor':
        s_shape = feat_s[-2].shape
        t_shape = feat_t[-2].shape
        paraphraser = Paraphraser(t_shape)
        translator = Translator(s_shape, t_shape)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(paraphraser)
        criterion_init = nn.MSELoss()
        init(model_s, model_t, init_trainable_list, criterion_init,
             train_loader, logger, opt)
        # classification
        criterion_kd = FactorTransfer()
        module_list.append(translator)
        module_list.append(paraphraser)
        trainable_list.append(translator)
    elif opt.distill == 'fsp':
        s_shapes = [s.shape for s in feat_s[:-1]]
        t_shapes = [t.shape for t in feat_t[:-1]]
        criterion_kd = FSP(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(model_s.get_feat_modules())
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader,
             logger, opt)
        # classification training
        pass
    elif opt.distill == 'l2tww':
        pairs = []
        for pair in opt.pairs.split(','):
            s_idx, t_idx = pair.split('-')
            pairs.append((int(s_idx), int(t_idx)))

        criterion_kd = L2TWW(opt.model_t, opt.model_s, pairs)

        wnet = WeightNetwork(opt.model_t, pairs)
        lwnet = LossWeightNetwork(opt.model_t, pairs, opt.loss_weight_type,
                                  opt.loss_weight_init)
        trainable_list.append(wnet)
        trainable_list.append(lwnet)
        module_list.append(wnet)
        module_list.append(lwnet)

    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)  # classification loss
    criterion_list.append(
        criterion_div)  # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)  # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)

    if torch.cuda.is_available():
        module_list.cuda()
        criterion_list.cuda()
        cudnn.benchmark = True

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # routine
    for epoch in range(1, opt.epochs + 1):

        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")

        time1 = time.time()
        # train_loss, train_acc = 0, 0
        train_acc, train_loss = train(epoch, train_loader, module_list,
                                      criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s,
                                                      criterion_cls, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)

        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'best_acc': best_acc,
            }
            save_file = os.path.join(opt.save_folder,
                                     '{}_best.pth'.format(opt.model_s))
            print('saving the best model!')
            torch.save(state, save_file)

        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'accuracy': test_acc,
            }
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

    # This best accuracy is only for printing purposes.
    # The results reported in the paper/README are from the last epoch.
    print('best accuracy:', best_acc)

    # save model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder,
                             '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)
Example No. 27
def main():
    if not torch.cuda.is_available():
        raise RuntimeError('Only GPU mode is supported')
    # parse the args
    args = parse_option()
    print(vars(args))

    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    print('[Warning] The training modalities are RGB and [{}]'.format(
        args.modality))

    # Data
    train_transforms = transforms.Compose([
        transforms.Resize((128, 171)),  # smaller edge to 128
        transforms.RandomCrop(112),
        transforms.ToTensor()
    ])
    if args.dataset == 'ucf101':
        trainset = UCF101Dataset('./data/ucf101/',
                                 transforms_=train_transforms)
    else:
        trainset = HMDB51Dataset('./data/hmdb51/',
                                 transforms_=train_transforms)

    train_loader = DataLoader(trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=True,
                              drop_last=True)

    n_data = len(trainset)

    # set the model
    model, contrast, criterion_1, criterion_2 = set_model(args, n_data)

    # set the optimizer
    optimizer = set_optimizer(args, model)

    # optionally resume from a checkpoint
    args.start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            contrast.load_state_dict(checkpoint['contrast'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # tensorboard
    logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)

    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[45, 90, 125, 160],
                                         gamma=0.2)
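    # multiply the learning rate by 0.2 at epochs 45, 90, 125 and 160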
    # routine
    for epoch in range(args.start_epoch, args.epochs + 1):
        time1 = time.time()
        view1_loss, view1_prob, view2_loss, view2_prob = train(
            epoch, train_loader, model, contrast, criterion_1, criterion_2,
            optimizer, args)
        time2 = time.time()
        print('\nepoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        logger.log_value('view1_loss', view1_loss, epoch)
        logger.log_value('view1_prob', view1_prob, epoch)
        logger.log_value('view2_loss', view2_loss, epoch)
        logger.log_value('view2_prob', view2_prob, epoch)

        # save model
        if epoch % args.save_freq == 0:
            print('==> Saving...')
            state = {
                'opt': args,
                'model': model.state_dict(),
                'contrast': contrast.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            save_file = os.path.join(
                args.model_folder,
                'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)
            # help release GPU memory
            del state

        torch.cuda.empty_cache()
        scheduler.step()

    print(args.model_name)
Example No. 28
def main():
    global args
    global tensorboard_logger
    use_cuda = torch.cuda.is_available()
    initialize_environment(random_seed=cfg.RNG_SEED, use_cuda=use_cuda)

    assert 0 <= args.feat_id <= 3
    assert 0 <= args.corpora_id <= 6
    feat_name = feat_dict[args.feat_id]
    corpora_name = data_dict[args.corpora_id]
    datadir = os.path.join('data', corpora_name)
    nepoch = args.nepoch
    step = args.step_epoch
    dropout = args.dropout
    n_layers = cfg.N_LAYERS
    input_dim = input_feat_size_dict[args.feat_id]
    hidden_dims = cfg.HIDDEN_DIMS
    if args.feat_id == 0 or args.feat_id == 1:
        feat_func_dict = {'ln': ln, 'n': norm}
    elif args.feat_id == 2:
        feat_func_dict = {'ln': ln, 'n': norm, 'i': lambda x: x}
    elif args.feat_id == 3:
        feat_func_dict = {'i': lambda x: x}

    for feat_func_name, feat_func in feat_func_dict.items():
        print(corpora_name, feat_name, feat_func_name)
        outputdir = os.path.join('data', corpora_name,
                                 feat_name + '_' + feat_func_name)
        # logging information
        loggin_dir = os.path.join(outputdir, 'runs', 'pretraining')
        if not os.path.exists(loggin_dir):
            os.makedirs(loggin_dir)
        tensorboard_logger = TF_LOGGER.Logger(
            os.path.join(loggin_dir, '%s' % (args.id)))
        # tensorboard_logger.configure(os.path.join(loggin_dir, '%s' % (args.id)))

        trainset = EncodedTextDataset(root=datadir,
                                      train=True,
                                      feat_name=feat_name,
                                      feat_func=feat_func)
        testset = EncodedTextDataset(root=datadir,
                                     train=False,
                                     feat_name=feat_name,
                                     feat_func=feat_func)
        kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=args.batchsize,
                                                  shuffle=True,
                                                  **kwargs)
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=100,
                                                 shuffle=True,
                                                 **kwargs)

        pretrain(
            outputdir, {
                'nlayers': n_layers,
                'dropout': dropout,
                'reluslope': 0.0,
                'nepoch': nepoch,
                'lrate': [args.lr],
                'wdecay': [0.0],
                'step': step,
                'input_dim': input_dim,
                'hidden_dims': hidden_dims
            }, use_cuda, trainloader, testloader)
Example No. 29
 def initialize_tensorboard(self):
     outputdir = get_output_dir(self.root_dir)
     loggin_dir = os.path.join(outputdir, 'runs', 'clustering')
     if not os.path.exists(loggin_dir):
         os.makedirs(loggin_dir)
     self.logger_tensorboard = tensorboard_logger.Logger(os.path.join(loggin_dir, '{}'.format(self.id)))
Example No. 30
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
        
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass
        
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # freeze all layers but the last fc
    for name, param in model.named_parameters():
        if name not in ['fc.weight', 'fc.bias']:
            param.requires_grad = False
    # init the fc layer
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()
    
    if args.gpu == 0:
        logger = tb_logger.Logger(logdir=args.tb_folder, flush_secs=2)
    else:
        logger = None
        
    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, logger, epoch)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer' : optimizer.state_dict(),
            })
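            # after the first epoch, verify that the frozen backbone weights
            # still match the pre-trained checkpoint (only the fc head may change)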
            if epoch == args.start_epoch:
                sanity_check(model.state_dict(), args.pretrained)