def __init__(self, inputs, *args, df=None, modalities=None, **kwargs):
    """
    Paired MNIST/SVHN multimodal dataset with SimCLR-style augmentations.

    :param inputs: a dict {mod: [np.arr]}
    :param *args, **kwargs: arguments to give to ArrayDataset
    :param df: a pandas DataFrame with {index_mnist, index_svhn, digit} in df.columns
    :param modalities: must be {'mnist', 'svhn'}
    """
    # The parent class must not concatenate the per-modality datasets;
    # pairing is handled here through `df`.
    kwargs["concat_datasets"] = False
    super().__init__(None, *args, **kwargs)
    self.inputs = inputs
    self.modalities = modalities
    assert self.outputs is None, "Unknown output"
    assert set(self.modalities) == {
        'mnist', 'svhn'
    }, "Missing modalities: {}".format(self.modalities)
    if self.patch_size is not None or self.features_to_add is not None:
        raise NotImplementedError(
            "Not yet implemented for multimodal dataset.")
    # Keep only the per-modality pairing indices plus the shared label column.
    self.df = df[["index_%s" % m for m in modalities] +
                 ['digit']].values.copy()
    # Cumulative array lengths per modality — presumably used to map a flat
    # sample index to (array, offset); confirm against __getitem__.
    self.cumulative_sizes = {
        m: np.cumsum([len(inp) for inp in self.inputs[m]])
        for m in modalities
    }
    # Transfos to sample from in the SimCLR framework
    s = 1
    color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)
    self.compose_transforms = {
        "svhn":
        transforms.Compose([
            # transforms.RandomApply([lambda x: add_blur(x, sigma=(0.2, 1))], p=0.5),
            lambda x: np.swapaxes(x, 0, 2),  # channel-last
            transforms.ToPILImage(mode='RGB'),
            transforms.RandomResizedCrop(size=32, scale=(0.5, 1)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomApply([color_jitter], p=0.8),
            transforms.RandomGrayscale(p=0.2),
            transforms.ToTensor(),
            transforms.Normalize((.5, .5, .5), (.5, .5, .5))
        ]),
        "mnist":
        transforms.Compose([
            # transforms.RandomApply([lambda x: add_blur(x, sigma=(0.2, 1))], p=0.5),
            transforms.ToPILImage(mode='L'),
            transforms.RandomResizedCrop(size=28, scale=(0.5, 1)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(
                (0.1307, ), (0.3081, ))  # mean, std of MNIST training set
        ])
    }
    # Deterministic fall-back transforms: normalization only, no augmentation.
    self.minimum_tfs = {
        'svhn':
        transforms.Compose([
            lambda x: np.swapaxes(x, 0, 2),
            transforms.ToTensor(),
            transforms.Normalize((.5, .5, .5), (.5, .5, .5))
        ]),
        'mnist':
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])
    }
# 在这步中初始化模型 model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True) # 打印我们刚刚实例化的模型 print(model_ft) # 数据扩充和训练规范化 # 只需验证标准化 data_transforms = { 'train': transforms.Compose([ transforms.RandomResizedCrop(input_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), 'val': transforms.Compose([ transforms.Resize(input_size), transforms.CenterCrop(input_size), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), } print("Initializing Datasets and Dataloaders...") # 创建训练和验证数据集
def main():
    """Train or evaluate a (possibly pruned) VGG-11 on ImageNet.

    Reads configuration from the module-level ``parser``; updates the
    globals ``args`` and ``best_prec1``. Supports distributed training,
    resuming from a checkpoint and evaluation-only runs.
    """
    global args, best_prec1
    args = parser.parse_args()
    print(args)
    args.distributed = args.world_size > 1

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    model = vgg11()
    if args.scratch:
        # "Scratch" training of a pruned architecture: rebuild the model from
        # the pruned config, then scale the epoch budget by the FLOPs ratio so
        # the smaller network receives a comparable compute budget.
        checkpoint = torch.load(args.scratch)
        model = vgg11(pretrained=False, config=checkpoint['cfg'])
        model_ref = vgg11()
        flops_std = count_model_param_flops(model_ref, 224)
        flops_small = count_model_param_flops(model, 224)
        args.epochs = int(90 * (flops_std / flops_small))

    # LR is stepped three times over the run (see adjust_learning_rate).
    step_size = int(args.epochs / 3)

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            # AlexNet/VGG: parallelize only the conv features; the large
            # classifier stays on a single device.
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    # Shuffle only when no distributed sampler handles the partitioning.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Re-seed the sampler so every epoch sees a new shard ordering.
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, step_size)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args.s)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save)
parser.add_argument('--l2loss', type=float, default=1.0)
args = parser.parse_args()

image_dir = args.dataset_dir

# Deterministic resize + ImageNet normalization (no augmentation).
data_transform = transforms.Compose([
    transforms.Resize((args.img_h, args.img_w)),
    #transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),  # range [0, 255] -> [0.0,1.0]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Augmented pipeline: flip, color jitter, rotation and a proportional random
# crop applied before the final resize.
data_transform_resize = transforms.Compose([
    #transforms.Resize((args.img_bi_h, args.img_bi_w)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2,
                           hue=0.1),
    transforms.RandomRotation(10),
    #transforms.RandomCrop(size=(384,128)),
    my_transforms.RandomCrop(range=(0.70, 0.95)),
    transforms.Resize((args.img_bi_h, args.img_bi_w)),
    transforms.ToTensor(),  # range [0, 255] -> [0.0,1.0]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Second augmented pipeline (definition continues past this chunk).
data_transform_resize2 = transforms.Compose([
    #transforms.Resize((args.img_tri_h, args.img_tri_w)),
    transforms.RandomHorizontalFlip(p=0.5),
def transforms_imagenet_train(
        img_size=224,
        scale=(0.08, 1.0),
        color_jitter=0.4,
        auto_augment=None,
        interpolation='random',
        use_prefetcher=False,
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
        re_prob=0.,
        re_mode='const',
        re_count=1,
        re_num_splits=0,
        separate=False,
):
    """Build the ImageNet training transform pipeline.

    The pipeline has three stages: a primary crop/flip stage, a secondary
    augmentation stage (AutoAugment/RandAugment/AugMix or color jitter),
    and a final tensor-conversion/normalization stage (plus optional
    RandomErasing).

    If separate==True, the transforms are returned as a tuple of 3 separate
    transforms for use in a mixing dataset that passes
    * all data through the first (primary) transform, called the 'clean' data
    * a portion of the data through the secondary transform
    * normalizes and converts the branches above with the third, final transform
    """
    # Stage 1: geometric augmentation shared by all samples.
    primary = [
        RandomResizedCropAndInterpolation(
            img_size, scale=scale, interpolation=interpolation),
        transforms.RandomHorizontalFlip(),
    ]

    # Stage 2: either an auto-augment policy or plain color jitter.
    secondary = []
    if auto_augment:
        assert isinstance(auto_augment, str)
        min_size = min(img_size) if isinstance(img_size, tuple) else img_size
        aa_kwargs = dict(
            translate_const=int(min_size * 0.45),
            img_mean=tuple(min(255, round(255 * c)) for c in mean),
        )
        if interpolation and interpolation != 'random':
            aa_kwargs['interpolation'] = _pil_interp(interpolation)
        if auto_augment.startswith('rand'):
            secondary.append(rand_augment_transform(auto_augment, aa_kwargs))
        elif auto_augment.startswith('augmix'):
            aa_kwargs['translate_pct'] = 0.3
            secondary.append(
                augment_and_mix_transform(auto_augment, aa_kwargs))
        else:
            secondary.append(auto_augment_transform(auto_augment, aa_kwargs))
    elif color_jitter is not None:
        # color jitter is enabled when not using AA
        if isinstance(color_jitter, (list, tuple)):
            # 3-tuple/list specifies brightness/contrast/saturation;
            # a 4th entry additionally augments hue
            assert len(color_jitter) in (3, 4)
        else:
            # a scalar is duplicated for brightness/contrast/saturation, no hue
            color_jitter = (float(color_jitter),) * 3
        secondary.append(transforms.ColorJitter(*color_jitter))

    # Stage 3: tensor conversion / normalization (or raw numpy for the
    # prefetcher), plus optional random erasing.
    final = []
    if use_prefetcher:
        # prefetcher and collate will handle tensor conversion and norm
        final.append(ToNumpy())
    else:
        final.append(transforms.ToTensor())
        final.append(
            transforms.Normalize(mean=torch.tensor(mean),
                                 std=torch.tensor(std)))
    if re_prob > 0.:
        final.append(
            RandomErasing(re_prob,
                          mode=re_mode,
                          max_count=re_count,
                          num_splits=re_num_splits,
                          device='cpu'))

    if separate:
        return (transforms.Compose(primary),
                transforms.Compose(secondary),
                transforms.Compose(final))
    return transforms.Compose(primary + secondary + final)
def main():
    """Entry point: train a small CNN on a Tiny-ImageNet-style folder dataset
    with Adadelta and a per-epoch StepLR schedule."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    # NOTE(review): --momentum is only used by the commented-out SGD
    # optimizer below.
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--data', default='./tiny-imagenet-200',
                        help='path to dataset')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    # NOTE(review): shuffle=True is not set, so training batches keep the
    # ImageFolder class ordering — confirm this is intentional.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        pin_memory=True)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    #optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, val_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    """Train/evaluate ResNet-34 on CIFAR-10 with TensorBoard logging.

    Uses the module-level ``args``, ``p``, ``x`` and ``device``. Supports
    resuming from a checkpoint, warm-starting layer3 from a baseline
    checkpoint when ``args.scratch`` is false, and evaluation-only runs.
    """
    #if not os.path.exists(args.save_dir):
    #os.makedirs(args.save_dir)
    best_acc = 0  # best validation set accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print("scratch::::::::::", p)
    print('==> Preparing data..')
    path = 'runs_c10_res34_3248_/' + p + x
    print("saving logs to:", path)
    writer = SummaryWriter(path)

    # Standard CIFAR-10 augmentation + per-channel normalization.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                             std=[0.2023, 0.1994, 0.2010]),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                             std=[0.2023, 0.1994, 0.2010]),
    ])
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.workers)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers)

    print("BUILDING MODEL...")
    # defining the model as inbuilt model
    model = ResNet34()
    model = model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)
        cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['acc']
            # BUG FIX: this message previously printed args.evaluate instead
            # of the checkpoint path that was actually loaded.
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # if args.scratch is false, then load the baseline model
    # (== False kept deliberately: args.scratch's type is set by the parser
    # and may not be a strict bool)
    if args.scratch == False:
        ckpt = torch.load('/data/chercheurs/garg191/checkpoint/baseline_c10_res34_3248/checkpoint_final.pth')
        print("loaded checkpoint\n")
        # Remap baseline layer3 blocks into this model's layout:
        # block 0 -> o1, block 5 -> o2, blocks 1..4 shift down by one.
        keep_prefixes = ('module.layer1', 'module.layer2', 'module.layer4',
                         'module.conv', 'module.bn')
        remap = {
            'module.layer3.0.': 'module.o1.',
            'module.layer3.1.': 'module.layer3.0.',
            'module.layer3.2.': 'module.layer3.1.',
            'module.layer3.3.': 'module.layer3.2.',
            'module.layer3.4.': 'module.layer3.3.',
            'module.layer3.5.': 'module.o2.',
        }
        new_dict_to_load = {}
        for k, v in ckpt['state_dict'].items():
            if k.startswith(keep_prefixes):
                new_dict_to_load[k] = v
            for old, new in remap.items():
                if k.startswith(old):
                    new_dict_to_load[k.replace(old, new)] = v
        model.load_state_dict(new_dict_to_load, strict=False)

    if args.evaluate:
        if os.path.isfile(args.test_checkpoint):
            print("=> loading test checkpoint")
            checkpoint = torch.load(args.test_checkpoint)
            model.load_state_dict(checkpoint['state_dict'])
        acc, test_loss = validate(testloader, model, criterion)
        print("Test accuracy attained: {}, Test loss: {} ".format(
            acc, test_loss))
        return

    test_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if epoch % 82 == 0 or epoch % 123 == 0:
            adjust_learning_rate(optimizer, epoch)
        print("Epoch: ", epoch)

        # train for one epoch
        train(trainloader, model, criterion, optimizer, epoch)

        # evaluate on validation set (and on the training set for logging)
        test_accuracy, test_loss = validate(testloader, model, criterion, 0)
        train_accuracy, train_loss = validate(trainloader, model, criterion, 1)
        info = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy
        }
        for tag, value in info.items():
            writer.add_scalar(tag, value, epoch + 1)

        # remember best prec@1 and save checkpoint
        is_best = test_accuracy > best_acc
        best_acc = max(test_accuracy, best_acc)
        if is_best:
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.module.state_dict(),
                'acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, 'checkpoint_best.pth')
        if epoch % 50 == 0 and epoch != 0:
            # Periodic snapshot every 50 epochs.
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.module.state_dict(),
                'acc': test_accuracy,
                'optimizer': optimizer.state_dict(),
            }, 'checkpoint_{}.pth'.format(epoch))

    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.module.state_dict(),
        'acc': test_accuracy,
        'optimizer': optimizer.state_dict(),
    }, 'checkpoint_final.pth')
def main():
    """Round-based unsupervised embedding training on ImageNet
    (instance discrimination with a lemniscate memory bank and
    reliable-neighbour search).

    Uses the module-level ``parser``; updates globals ``args`` and
    ``best_prec1``.
    """
    global args, best_prec1
    args = parser.parse_args()
    print(args)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](low_dim=args.low_dim)

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        # AlexNet/VGG: parallelize only the convolutional features.
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolderInstance(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_labels = torch.tensor(train_dataset.targets).long().cuda()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=None)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolderInstance(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # define lemniscate and loss function (criterion)
    ndata = train_dataset.__len__()
    lemniscate = LinearAverage(args.low_dim, ndata, args.nce_t,
                               args.nce_m).cuda()
    rlb = ReliableSearch(ndata, args.low_dim, args.threshold_1,
                         args.threshold_2, args.batch_size).cuda()
    criterion = ReliableCrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            # NOTE(review): start_epoch is reset to 0 rather than restored
            # from the checkpoint — presumably each round restarts its epoch
            # count; confirm this is intentional.
            args.start_epoch = 0
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            lemniscate = checkpoint['lemniscate']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        kNN(0, model, lemniscate, train_loader, val_loader, 200, args.nce_t)
        return

    for rnd in range(args.start_round, args.rounds):
        if rnd > 0:
            # Refresh the memory bank and reliable-neighbour statistics
            # before every round after the first.
            memory = recompute_memory(model, lemniscate, train_loader,
                                      val_loader, args.batch_size,
                                      args.workers)
            num_reliable_1, consistency_1, num_reliable_2, consistency_2 = rlb.update(
                memory, train_labels)
            print(
                'Round [%02d/%02d]\tReliable1: %.12f\tReliable2: %.12f\tConsistency1: %.12f\tConsistency2: %.12f'
                % (rnd, args.rounds, num_reliable_1, num_reliable_2,
                   consistency_1, consistency_2))

        for epoch in range(args.start_epoch, args.epochs):
            adjust_learning_rate(optimizer, epoch)

            # train for one epoch
            train(train_loader, model, lemniscate, rlb, criterion, optimizer,
                  epoch)

            # evaluate on validation set
            prec1 = NN(epoch, model, lemniscate, train_loader, val_loader)

            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'lemniscate': lemniscate,
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                    #}, is_best, filename='ckpts/%02d-%04d-checkpoint.pth.tar'%(rnd+1, epoch + 1))
                }, is_best)
            # Also keep one rolling checkpoint per round.
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'lemniscate': lemniscate,
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='ckpts/%02d-checkpoint.pth.tar' % (rnd + 1))

        # evaluate KNN after last epoch
        top1, top5 = kNN(0, model, lemniscate, train_loader, val_loader, 200,
                         args.nce_t)
        print('Round [%02d/%02d]\tTop1: %.2f\tTop5: %.2f' %
              (rnd + 1, args.rounds, top1, top5))
def main():
    """Build a transfer-learning classifier on a pre-trained VGG/DenseNet.

    Parses CLI arguments, validates the data/save directories, loads the
    category map, builds the training dataloader, freezes the pre-trained
    feature extractor and replaces its classifier with a freshly
    initialized feed-forward head.
    """
    parser = train_args.get_args()
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__ + ' by ' + __author__)
    cli_args = parser.parse_args()

    # directory
    #First check
    if not os.path.isdir(cli_args.data_directory):
        print(f'Data directory {cli_args.data_directory} not found.')
        exit(1)

    # Then save directory
    if not os.path.isdir(cli_args.save_dir):
        print(f'Directory {cli_args.save_dir} does not exist. Creating...')
        os.makedirs(cli_args.save_dir)

    with open(cli_args.categories_json, 'r') as f:
        cat_to_name = json.load(f)

    # One output unit per category.
    output_size = len(cat_to_name)

    # ImageNet channel statistics expected by the pre-trained backbones.
    expected_means = [0.485, 0.456, 0.406]
    expected_std = [0.229, 0.224, 0.225]
    max_image_size = 224
    batch_size = 32

    #train_transform
    tr_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.25),
        transforms.RandomRotation(25),
        transforms.RandomGrayscale(p=0.02),
        transforms.RandomResizedCrop(max_image_size),
        transforms.ToTensor(),
        transforms.Normalize(expected_means, expected_std)
    ])

    #train_dataset
    tr_dataset = datasets.ImageFolder(cli_args.data_directory,
                                      transform=tr_transform)

    #tr_dataloader
    tr_dataloader = torch.utils.data.DataLoader(tr_dataset,
                                                batch_size=batch_size,
                                                shuffle=True)

    # model
    if not cli_args.arch.startswith("vgg") and not cli_args.arch.startswith("densenet"):
        print("Only supporting VGG and DenseNet")
        exit(1)

    print(f"Using a pre-trained {cli_args.arch} network.")
    my_model = models.__dict__[cli_args.arch](pretrained=True)

    # Classifier input width per DenseNet variant.
    densenet_input = {
        'densenet121': 1024,
        'densenet169': 1664,
        'densenet161': 2208,
        'densenet201': 1920
    }

    input_size = 0
    if cli_args.arch.startswith("vgg"):
        input_size = my_model.classifier[0].in_features
    if cli_args.arch.startswith("densenet"):
        input_size = densenet_input[cli_args.arch]

    # Freeze the feature extractor; only the new classifier head will train.
    for param in my_model.parameters():
        param.requires_grad = False

    od = OrderedDict()
    # BUG FIX: previously input_size was insert()ed into cli_args.hidden_units
    # in place, so the log message below counted the input layer as a hidden
    # layer. Build the size list on a copy instead.
    hidden_sizes = [input_size] + list(cli_args.hidden_units)

    print(f"Building a {len(cli_args.hidden_units)} hidden layer classifier with inputs {cli_args.hidden_units}")

    for i in range(len(hidden_sizes) - 1):
        od['fc' + str(i + 1)] = nn.Linear(hidden_sizes[i], hidden_sizes[i + 1])
        od['relu' + str(i + 1)] = nn.ReLU()
        od['dropout' + str(i + 1)] = nn.Dropout(p=0.15)

    # BUG FIX: use hidden_sizes[-1] rather than the loop variable, so an
    # empty --hidden_units list no longer raises NameError on `i`.
    od['output'] = nn.Linear(hidden_sizes[-1], output_size)
    od['softmax'] = nn.LogSoftmax(dim=1)

    classifier = nn.Sequential(od)

    # Replace the classifier
    my_model.classifier = classifier
    my_model.zero_grad()
def main():
    """Fine-tune a pre-trained torchvision model on a custom ImageFolder
    dataset, with optional checkpoint resume and evaluation-only mode.

    Uses the module-level ``parser``; updates globals ``args`` and
    ``best_acc``.
    """
    # basic setup
    global args, best_acc
    args = parser.parse_args()
    data_path = args.data
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(random.randint(1, 10000))
    pin = True
    best_acc = 0.0
    if args.cuda:
        print("GPU Mode")
        torch.cuda.manual_seed(random.randint(1, 10000))
    else:
        pin = False
        print("CPU Mode")

    # Replace the final FC layer so the backbone classifies num_classes.
    model = models.__dict__[args.arch](pretrained=True)
    model.fc = nn.Linear(512, args.num_classes)
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # BUG FIX: the validation pipeline previously applied
    # RandomHorizontalFlip, a random augmentation, which made the reported
    # validation accuracy noisy; evaluation must be deterministic.
    # (transforms.Scale is a deprecated alias of Resize, kept for
    # compatibility with the installed torchvision version.)
    val_data = torch.utils.data.DataLoader(
        ImageFolder(
            data_path, False,
            transforms.Compose([
                transforms.Scale(400),
                transforms.CenterCrop(400),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin,
    )
    print("Complete Validation Data loading(%s)" % len(val_data))

    if args.evaluate:
        validate(val_data, model)
        return

    image_data = torch.utils.data.DataLoader(
        ImageFolder(
            data_path, True,
            transforms.Compose([
                transforms.Scale(400),
                transforms.CenterCrop(400),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin,
    )
    print("Complete Data loading(%s)" % len(image_data))

    criterion = nn.CrossEntropyLoss().cuda()
    # Only optimize parameters that still require gradients.
    params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adagrad(params,
                                    lr=args.lr,
                                    weight_decay=args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(image_data, model, criterion, optimizer, epoch)

        # evaluate on validation set
        acc = validate(val_data, model)

        # remember best prec@1 and save checkpoint
        is_best = acc > best_acc
        best_acc = max(acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
            }, is_best, args.arch + '_' + 'type.pth.tar')
def main_worker(args):
    """Single-process ImageNet training worker.

    :param args: parsed command-line arguments
    Updates the globals ``best_acc1`` and ``save_folder``; saves a
    checkpoint whenever validation accuracy improves.
    """
    global best_acc1, save_folder

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    print(model)
    model.to(device)
    # DistributedDataParallel will divide and allocate batch_size to all
    # available GPUs if device_ids are not set
    # model = torch.nn.parallel.DistributedDataParallel(model)
    # torch.save(model.state_dict(), save_folder+'/'+ str(args.arch) + '_' + str(0)+'.pth')

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    # optionally resume from a checkpoint (weights only; optimizer state
    # and epoch counter are not restored here)
    if args.resume:
        print('continue training ...')
        model.load_state_dict(torch.load(args.resume))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[args.epochs//3, args.epochs//3*2], gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[60, 100], gamma=0.1)

    # LR decay milestones: the classic 30/60 schedule for 90-epoch runs,
    # otherwise at 1/2 and 3/4 of the total epochs.
    if args.epochs == 90:
        lr_epoch = [30, 60]
    else:
        lr_epoch = [args.epochs // 2, args.epochs // 4 * 3]
    lr = args.lr

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    print("total training epochs: %d " % (args.epochs))
    print("lr step epoch: ", lr_epoch)
    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, lr_epoch, lr, gamma=0.1)
        # acc1 = validate(val_loader, model, criterion, args)
        # exit()

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, lr, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if is_best:
            # Checkpoint only on improvement; filename embeds epoch and accuracy.
            torch.save(
                model.state_dict(),
                save_folder + '/' + str(args.arch) + '_' + str(epoch + 1) +
                '_' + str(acc1.item()) + '.pth')
    # NOTE(review): this dump concludes a `with` block opened before this
    # chunk; `samples` and `file_train_images` are defined there.
    pickle.dump(samples, file_train_images)

# Deduplicate the collected class ids and persist them.
class_ids = list(set(class_ids))
with open(conf.pickle_class_labels, 'wb') as file_train_images:
    pickle.dump(class_ids, file_train_images)

data_transforms = {
    'train':
    trans.Compose([
        # trans.RandomHorizontalFlip(),
        # trans.ColorJitter(brightness=0.125, contrast=0.125, saturation=0.125),
        # trans.ToTensor(),
        # trans.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        trans.RandomHorizontalFlip(),
        trans.ToTensor(),
        trans.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ]),
    'val':
    trans.Compose([
        trans.ToTensor(),
        trans.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}


class AilabFaceDataset(Dataset):

    # Dataset over the pickled face data; definition continues past this chunk.
    def __init__(self, split):
        conf = get_config()
        data = None
def get_transforms(train=False):
    """Return the torchvision transform pipeline.

    Training adds a 50% horizontal flip before tensor conversion;
    evaluation only converts the image to a tensor.
    """
    steps = [transforms.RandomHorizontalFlip(0.5)] if train else []
    steps.append(transforms.ToTensor())
    return transforms.Compose(steps)
def __init__(self, inputs, *args, df=None, modalities=None,
             self_supervision=None, **kwargs):
    """
    Paired colorful-MNIST/SVHN multimodal dataset with configurable
    SimCLR-style augmentations.

    :param inputs: a dict {mod: [np.arr]}
    :param *args, **kwargs: arguments to give to ArrayDataset
    :param df: a pandas DataFrame with {index_colorful_mnist, index_svhn,
        mnist_digit} in df.columns
    :param modalities: must be {'colorful_mnist', 'svhn'}
    :param self_supervision: optional iterable of augmentation names among
        {"crop", "flip", "jittering", "gray"}; unknown names are ignored
        with a warning
    """
    # The parent class must not concatenate the per-modality datasets;
    # pairing is handled here through `df`.
    kwargs["concat_datasets"] = False
    ArrayDataset.__init__(self, None, *args, **kwargs)
    self.inputs = inputs
    self.modalities = modalities
    assert self.outputs is None, "Unknown output"
    assert set(self.modalities) == {
        'colorful_mnist', 'svhn'
    }, "Missing modalities: {}".format(self.modalities)
    if self.patch_size is not None or self.features_to_add is not None:
        raise NotImplementedError(
            "Not yet implemented for multimodal dataset.")
    # Keep only the per-modality pairing indices plus the shared label column.
    self.df = df[["index_%s" % m for m in modalities] +
                 ['mnist_digit']].values.copy()
    # Cumulative array lengths per modality — presumably used to map a flat
    # sample index to (array, offset); confirm against __getitem__.
    self.cumulative_sizes = {
        m: np.cumsum([len(inp) for inp in self.inputs[m]])
        for m in modalities
    }
    # Transfos to sample from in the SimCLR framework
    s = 1
    color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)
    # Min-max scale to uint8 and move channels last for PIL conversion.
    normalization = lambda x: (255.0 * (x - x.min()) / (x.max() - x.min())
                               ).astype(np.uint8).swapaxes(0, 2)
    # Pipelines are built as lists first so the requested augmentations can
    # be appended conditionally, then wrapped in Compose at the end.
    self.compose_transforms = {
        "svhn": [
            lambda x: np.swapaxes(x, 0, 2),
            transforms.ToPILImage(mode='RGB')
        ],
        "colorful_mnist": [normalization,
                           transforms.ToPILImage(mode='RGB')]
    }
    # Deterministic fall-back transforms: normalization only, no augmentation.
    self.minimum_tfs = {
        'svhn':
        transforms.Compose([
            lambda x: np.swapaxes(x, 0, 2),
            transforms.ToTensor(),
            transforms.Normalize((.5, .5, .5), (.5, .5, .5))
        ]),
        'colorful_mnist':
        transforms.Compose([
            normalization,
            transforms.ToTensor(),
            transforms.Normalize((.5, .5, .5), (.5, .5, .5))
        ])
    }
    self_supervision = set(self_supervision or [])
    if "crop" in self_supervision:
        self.compose_transforms["svhn"].append(
            transforms.RandomResizedCrop(size=32, scale=(0.5, 1)))
        self.compose_transforms["colorful_mnist"].append(
            transforms.RandomResizedCrop(size=96, scale=(0.2, 1)))
        self_supervision -= {"crop"}
    if "flip" in self_supervision:
        self.compose_transforms["svhn"].append(
            transforms.RandomHorizontalFlip())
        self.compose_transforms["colorful_mnist"].append(
            transforms.RandomHorizontalFlip())
        self_supervision -= {"flip"}
    if "jittering" in self_supervision:
        self.compose_transforms["svhn"].append(
            transforms.RandomApply([color_jitter], p=0.8))
        self.compose_transforms["colorful_mnist"].append(
            transforms.RandomApply([color_jitter], p=0.8))
        self_supervision -= {"jittering"}
    if "gray" in self_supervision:
        self.compose_transforms["svhn"].append(
            transforms.RandomGrayscale(p=0.2))
        self.compose_transforms["colorful_mnist"].append(
            transforms.RandomGrayscale(p=0.2))
        self_supervision -= {"gray"}
    # Final tensor conversion + normalization, then freeze into Compose.
    self.compose_transforms["svhn"].extend([
        transforms.ToTensor(),
        transforms.Normalize((.5, .5, .5), (.5, .5, .5))
    ])
    self.compose_transforms["colorful_mnist"].extend([
        transforms.ToTensor(),
        transforms.Normalize((.5, .5, .5), (.5, .5, .5))
    ])
    self.compose_transforms["svhn"] = transforms.Compose(
        self.compose_transforms["svhn"])
    self.compose_transforms["colorful_mnist"] = transforms.Compose(
        self.compose_transforms["colorful_mnist"])
    # Anything left in self_supervision was not recognized above.
    if len(self_supervision) > 0:
        logger.warning("Unknown transformations {}. Ignored.".format(
            self_supervision))
    logger.info(
        "SimCLR Transformations:\n\t (svhn) {}\n\t (colorful MNIST) {}".
        format(self.compose_transforms["svhn"],
               self.compose_transforms["colorful_mnist"]))
    # (concludes a helper whose definition starts before this chunk)
    return x


import sys

workspace_dir = sys.argv[1]
print("Reading data")
# readfile(path, has_label) — presumably loads images (and labels when
# has_label=True); verify against its definition above.
train_x, train_y = readfile(os.path.join(workspace_dir, "training"), True)
print("Size of training data = {}".format(len(train_x)))
val_x, val_y = readfile(os.path.join(workspace_dir, "validation"), True)
print("Size of validation data = {}".format(len(val_x)))
test_x = readfile(os.path.join(workspace_dir, "testing"), False)
print("Size of Testing data = {}".format(len(test_x)))

# Training-time augmentation.
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),  # randomly flip the image horizontally
    transforms.RandomRotation(15),  # randomly rotate the image
    transforms.ToTensor(
    ),  # convert to Tensor and normalize values to [0, 1] (data normalization)
])
# No data augmentation is needed at test time
test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])


class ImgDataset(Dataset):

    # Wraps image arrays (and optional labels); definition continues past
    # this chunk.
    def __init__(self, x, y=None, transform=None):
        self.x = x
        # label is required to be a LongTensor
def create_cifar_experiment(num_targets: int, num_reps: int, target_dir: str,
                            sleep: float = 0.0):
    """Build the convolutional-autoencoder tuning experiment on CIFAR-10.

    :param num_targets: number of hyperparameter configurations (targets) to tune
    :param num_reps: number of independent repetitions of the experiment
    :param target_dir: directory where results are written
    :param sleep: artificial per-step delay, forwarded to every strategy
    :return: a configured ``Experiment`` instance
    """
    # Converting data to torch.FloatTensor (with augmentation; note the same
    # augmenting transform is applied to the validation split as well).
    transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomRotation(degrees=45),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])

    # Download the training and test datasets
    train_data = torchvision.datasets.CIFAR10(root='data', train=True,
                                              download=True,
                                              transform=transform)
    val_data = torchvision.datasets.CIFAR10(root='data', train=False,
                                            download=True,
                                            transform=transform)

    # Prepare data loaders
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=32,
                                               num_workers=0, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=32,
                                             num_workers=0, shuffle=True)

    parameter_dict = {
        "lr": [0.0005, 0.001, 0.005, 0.01],
        "num_filters": [4, 6, 8, 10, 12]
    }
    # BUG FIX: the grid was sliced to num_targets twice; once suffices.
    grid = list(ParameterGrid(parameter_dict))[:num_targets]

    iterations = 1
    baseline_iterations = [1, 3, 8]
    burn_in_phase_length = 3
    m_max = 10000

    def _fresh_algorithms():
        # Each strategy gets its own independently initialized models.
        return [
            ConvolutionalAEAlg(num_channels=3,
                               num_filters=params["num_filters"],
                               learning_rate=params["lr"])
            for params in grid
        ]

    # BUG FIX: the `sleep` parameter was accepted but every strategy was
    # constructed with a hard-coded sleep=0.0; it is now forwarded.
    # (The dead counter `j` was also removed.)
    strategies = []
    for it in baseline_iterations:
        strategies.append(
            Baseline("Baseline (round robin, m={})".format(it),
                     algorithms=_fresh_algorithms(),
                     iterations=it,
                     burn_in_phase_length=burn_in_phase_length,
                     sleep=sleep))
    strategies.append(
        AnygradSelectAll("Anygrad (no target selection)",
                         algorithms=_fresh_algorithms(),
                         iterations=iterations,
                         burn_in_phase_length=burn_in_phase_length,
                         sleep=sleep))
    strategies.append(
        AnygradOnlySelection("Anygrad (m={})".format(150),
                             algorithms=_fresh_algorithms(),
                             iterations=3,
                             burn_in_phase_length=burn_in_phase_length,
                             sleep=sleep))
    strategies.append(
        Anygrad("Anygrad (full)",
                algorithms=_fresh_algorithms(),
                iterations=iterations,
                burn_in_phase_length=burn_in_phase_length,
                sleep=sleep))

    return Experiment(name="Convolutional on Cifar",
                      strategies=strategies,
                      train_data=[train_loader],
                      val_data=[val_loader],
                      targets=list(range(num_targets)),
                      num_reps=num_reps,
                      parallel=False,
                      target_dir=target_dir,
                      m_max=m_max)
def run():
    """Train an IDE re-identification model on Market-1501 and periodically
    evaluate CMC / mAP on the query and gallery splits (original + flipped)."""
    batch_size = 32

    # interpolation=3 == PIL.Image.BICUBIC
    train_transform = transforms.Compose([
        transforms.Resize(144, interpolation=3),
        transforms.RandomCrop((256, 128)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    test_transform = transforms.Compose([
        transforms.Resize((288, 144), interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    # Same as test_transform but with a deterministic horizontal flip; scores
    # from flipped and unflipped passes are averaged at evaluation time.
    test_flip_transform = transforms.Compose([
        transforms.Resize((288, 144), interpolation=3),
        functional.hflip,
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = Market1501(root + '/bounding_box_train',
                               transform=train_transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True)
    query_dataset = Market1501(root + '/query', transform=test_transform)
    query_flip_dataset = Market1501(root + '/query',
                                    transform=test_flip_transform)
    query_loader = DataLoader(query_dataset, batch_size=batch_size,
                              shuffle=False)
    query_flip_loader = DataLoader(query_flip_dataset, batch_size=batch_size,
                                   shuffle=False)
    test_dataset = Market1501(root + '/bounding_box_test',
                              transform=test_transform)
    test_flip_dataset = Market1501(root + '/bounding_box_test',
                                   transform=test_flip_transform)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=False)
    test_flip_loader = DataLoader(test_flip_dataset, batch_size=batch_size,
                                  shuffle=False)

    ide = IDE(num_classes=len(train_dataset.unique_ids)).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    # Backbone gets a 10x smaller LR than the freshly initialized classifier.
    params = [
        {
            'params': ide.backbone.parameters(),
            'lr': 0.01
        },
        {
            'params': ide.classifier.parameters(),
            'lr': 0.1
        },
    ]
    optimizer = optim.SGD(params, momentum=0.9, weight_decay=5e-4,
                          nesterov=True)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    epochs = 50
    for epoch in range(epochs):
        ide.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = ide(inputs)
            # outputs[1] are the classification logits; outputs[0] features.
            loss = criterion(outputs[1], labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            print('%d/%d - %d/%d - loss: %f' %
                  (epoch, epochs, i, len(train_loader), loss.item()))
        print('epoch: %d/%d - loss: %f' %
              (epoch, epochs, running_loss / len(train_loader)))
        # BUG FIX: scheduler.step() was called at the top of each epoch,
        # before any optimizer.step(); since PyTorch 1.1 the scheduler must
        # step after the epoch's optimization or the LR schedule is shifted.
        scheduler.step()

        if epoch % 10 == 9:
            ide.eval()
            query = np.concatenate([
                ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                for inputs, _ in query_loader
            ])
            query_flip = np.concatenate([
                ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                for inputs, _ in query_flip_loader
            ])
            test = np.concatenate([
                ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                for inputs, _ in test_loader
            ])
            test_flip = np.concatenate([
                ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                for inputs, _ in test_flip_loader
            ])

            # dist = cdist((query + query_flip) / 2., (test + test_flip) / 2.)
            # Average flipped/unflipped embeddings, L2-normalize, then
            # compute pairwise distances query -> gallery.
            dist = cdist(normalize(query + query_flip),
                         normalize(test + test_flip))
            r = cmc(dist,
                    query_dataset.ids,
                    test_dataset.ids,
                    query_dataset.cameras,
                    test_dataset.cameras,
                    separate_camera_set=False,
                    single_gallery_shot=False,
                    first_match_break=True)
            m_ap = mean_ap(dist, query_dataset.ids, test_dataset.ids,
                           query_dataset.cameras, test_dataset.cameras)
            print('epoch[%d]: mAP=%f, r@1=%f, r@3=%f, r@5=%f, r@10=%f' %
                  (epoch + 1, m_ap, r[0], r[2], r[4], r[9]))
def main(config):
    """Train/evaluate a NIMA-style aesthetic scorer on the AVA dataset.

    Relies on module-level globals: ``lrs`` (metric logger) and, in the test
    branch, CUDA events ``start``/``end`` — presumably created by the caller;
    TODO confirm.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # transforms.Scale is the legacy name of transforms.Resize (kept as-is
    # for compatibility with the torchvision version this file targets).
    train_transform = transforms.Compose([
        transforms.Scale(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()])
    val_transform = transforms.Compose([
        transforms.Scale(256),
        transforms.RandomCrop(224),
        transforms.ToTensor()])
    test_transform = transforms.Compose([
        transforms.ToTensor()])

    trainset = AVADataset(csv_file=config.train_csv_file,
                          root_dir=config.train_img_path,
                          transform=train_transform)
    valset = AVADataset(csv_file=config.val_csv_file,
                        root_dir=config.val_img_path,
                        transform=val_transform)
    train_loader = torch.utils.data.DataLoader(
        trainset, batch_size=config.train_batch_size,
        shuffle=True, num_workers=config.num_workers)
    val_loader = torch.utils.data.DataLoader(
        valset, batch_size=config.val_batch_size,
        shuffle=False, num_workers=config.num_workers)

    base_model = models.vgg16(pretrained=True)
    # base_model = models.resnet18(pretrained=True)
    # base_model = models.inception_v3(pretrained=True)
    model = NIMA(base_model)
    # model = NIMA()
    if config.warm_start:
        model.load_state_dict(torch.load(
            os.path.join(config.ckpt_path,
                         'epoch-%d.pkl' % config.warm_start_epoch)))
        print('Successfully loaded model epoch-%d.pkl' %
              config.warm_start_epoch)

    if config.multi_gpu:
        model.features = torch.nn.DataParallel(model.features,
                                               device_ids=config.gpu_ids)
        model = model.to(device)
    else:
        model = model.to(device)

    conv_base_lr = config.conv_base_lr
    dense_lr = config.dense_lr
    momentum = 0.6  # NOTE(review): original code used 0.6 but *reported* 0.9
    optimizer = optim.SGD([
        {'params': model.features.parameters(), 'lr': conv_base_lr},
        {'params': model.classifier.parameters(), 'lr': dense_lr}],
        momentum=momentum
    )
    criterion = torch.nn.L1Loss()

    # send hyperparams
    # BUG FIX: the logged momentum was hard-coded to 0.9 while the optimizer
    # actually uses 0.6; report the value in use.
    lrs.send({
        'title': 'EMD Loss',
        'train_batch_size': config.train_batch_size,
        'val_batch_size': config.val_batch_size,
        'optimizer': 'SGD',
        'conv_base_lr': config.conv_base_lr,
        'dense_lr': config.dense_lr,
        'momentum': momentum
    })

    param_num = sum(int(np.prod(param.shape)) for param in model.parameters())
    print('Trainable params: %.2f million' % (param_num / 1e6))

    if config.train:
        # for early stopping
        count = 0
        init_val_loss = float('inf')
        train_losses = []
        val_losses = []
        for epoch in range(config.warm_start_epoch, config.epochs):
            lrs.send('epoch', epoch)
            batch_losses = []
            for i, data in enumerate(train_loader):
                images = data['image'].to(device)
                labels = data['annotations'].to(device).float()
                outputs = model(images)
                outputs = outputs.view(-1, 1, 1)
                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                # loss = emd_loss(labels, outputs)
                batch_losses.append(loss.item())
                loss.backward()
                optimizer.step()
                lrs.send('train_emd_loss', loss.item())

            avg_loss = sum(batch_losses) / (len(trainset) //
                                            config.train_batch_size + 1)
            train_losses.append(avg_loss)
            print('Epoch %d averaged training EMD loss: %.4f' %
                  (epoch + 1, avg_loss))

            # exponetial learning rate decay
            if (epoch + 1) % 10 == 0:
                conv_base_lr = conv_base_lr * config.lr_decay_rate ** (
                    (epoch + 1) / config.lr_decay_freq)
                dense_lr = dense_lr * config.lr_decay_rate ** (
                    (epoch + 1) / config.lr_decay_freq)
                optimizer = optim.SGD([
                    {'params': model.features.parameters(),
                     'lr': conv_base_lr},
                    {'params': model.classifier.parameters(),
                     'lr': dense_lr}],
                    momentum=momentum
                )
                # send decay hyperparams
                # BUG FIX: previously re-sent the *initial* config LRs; send
                # the decayed values actually installed in the optimizer.
                lrs.send({
                    'lr_decay_rate': config.lr_decay_rate,
                    'lr_decay_freq': config.lr_decay_freq,
                    'conv_base_lr': conv_base_lr,
                    'dense_lr': dense_lr
                })

            # do validation after each epoch
            batch_val_losses = []
            for data in val_loader:
                images = data['image'].to(device)
                labels = data['annotations'].to(device).float()
                with torch.no_grad():
                    outputs = model(images)
                val_outputs = outputs.view(-1, 1, 1)
                val_loss = criterion(val_outputs, labels)
                # val_loss = emd_loss(labels, outputs)
                batch_val_losses.append(val_loss.item())
            avg_val_loss = sum(batch_val_losses) / (
                len(valset) // config.val_batch_size + 1)
            val_losses.append(avg_val_loss)
            lrs.send('val_emd_loss', avg_val_loss)
            print('Epoch %d completed. Averaged MSE loss on val set: %.4f. '
                  'Inital val loss : %.4f.' %
                  (epoch + 1, avg_val_loss, init_val_loss))

            # Use early stopping to monitor training
            if avg_val_loss < init_val_loss:
                init_val_loss = avg_val_loss
                # save model weights if val loss decreases
                print('Saving model...')
                torch.save(model.state_dict(),
                           os.path.join(config.ckpt_path,
                                        'epoch-%d.pkl' % (epoch + 1)))
                print('Done.\n')
                # reset count
                count = 0
            elif avg_val_loss >= init_val_loss:
                count += 1
                if count == config.early_stopping_patience:
                    print('Val EMD loss has not decreased in %d epochs. '
                          'Training terminated.' %
                          config.early_stopping_patience)
                    # NOTE(review): the break below was intentionally
                    # commented out upstream, so the message is printed but
                    # training actually continues — confirm intent.
                    # break

        print('Training completed.')

        if config.save_fig:
            # plot train and val loss
            epochs = range(1, epoch + 2)
            plt.plot(epochs, train_losses, 'b-', label='train loss')
            plt.plot(epochs, val_losses, 'g-', label='val loss')
            plt.title('EMD loss')
            plt.legend()
            plt.savefig('./loss.png')

    if config.test:
        start.record()
        print('Testing')
        # compute mean score
        test_transform = test_transform  # val_transform
        testset = AVADataset(csv_file=config.test_csv_file,
                             root_dir=config.test_img_path,
                             transform=val_transform)
        test_loader = torch.utils.data.DataLoader(
            testset, batch_size=config.test_batch_size,
            shuffle=False, num_workers=config.num_workers)

        # NOTE(review): array size 45 is hard-coded to the test-set length
        # and batch_size is assumed to be 1 (output.view(1, 1)) — confirm.
        mean_preds = np.zeros(45)
        mean_labels = np.zeros(45)
        # std_preds = []
        count = 0
        for data in test_loader:
            im_id = data['img_id']
            image = data['image'].to(device)
            labels = data['annotations'].to(device).float()
            output = model(image)
            output = output.view(1, 1)
            bpred = output.to(torch.device("cpu"))
            cpred = bpred.data.numpy()
            blabel = labels.to(torch.device("cpu"))
            clabel = blabel.data.numpy()
            mean_preds[count] = cpred
            mean_labels[count] = clabel
            print(im_id, mean_preds[count])
            count = count + 1
            # std_preds.append(predicted_std)
        # Do what you want with predicted and std...
        end.record()
def model_setup(self, config):
    """
    Tons of parameters!

    This should be called at the beginning of each repetition with a dict
    containing all the parameters required to setup the trial: it reads the
    config, seeds torch, builds the CIFAR-style transforms and dataloaders,
    constructs the network/optimizer/scheduler, and initializes the
    early-stopping state.
    """
    # Get trial parameters
    seed = config.get("seed", random.randint(0, 10000))
    self.data_dir = os.path.expanduser(config.get("data_dir", "data"))
    self.model_filename = config.get("model_filename", "model.pth")
    self.iterations = config["iterations"]

    # Training / testing parameters
    batch_size = config["batch_size"]
    first_epoch_batch_size = config.get("first_epoch_batch_size", batch_size)
    self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
    self.batches_in_first_epoch = config.get("batches_in_first_epoch",
                                             self.batches_in_epoch)
    self.test_batch_size = config["test_batch_size"]
    self.test_batches_in_epoch = config.get("test_batches_in_epoch",
                                            sys.maxsize)
    self.noise_values = config.get("noise_values", [0.0, 0.1])
    self.loss_function = nn.functional.cross_entropy

    self.learning_rate = config["learning_rate"]
    self.momentum = config.get("momentum", 0.5)
    self.weight_decay = config.get("weight_decay", 0.0005)
    self.learning_rate_gamma = config.get("learning_rate_gamma", 0.9)
    self.last_noise_results = None
    self.lr_step_schedule = config.get("lr_step_schedule", None)
    self.early_stopping = config.get("early_stopping", None)

    # Network parameters
    network_type = config.get("network_type", "vgg")
    in_channels, self.h, self.w = config["input_shape"]
    self.boost_strength = config["boost_strength"]
    self.boost_strength_factor = config["boost_strength_factor"]
    self.k_inference_factor = config["k_inference_factor"]

    # CNN parameters - these are lists, one for each CNN layer
    self.cnn_percent_on = config["cnn_percent_on"]
    self.cnn_kernel_sizes = config.get("cnn_kernel_size",
                                       [3] * len(self.cnn_percent_on))
    self.cnn_out_channels = config.get("cnn_out_channels",
                                       [32] * len(self.cnn_percent_on))
    self.cnn_weight_sparsity = config.get("cnn_weight_sparsity",
                                          [1.0] * len(self.cnn_percent_on))
    self.in_channels = [in_channels] + self.cnn_out_channels
    self.block_sizes = config.get("block_sizes",
                                  [1] * len(self.cnn_percent_on))
    self.use_max_pooling = config.get("use_max_pooling", False)

    # Linear parameters
    self.linear_weight_sparsity = config["weight_sparsity"]
    self.linear_n = config["linear_n"]
    self.linear_percent_on = config["linear_percent_on"]
    # A scalar linear_n means a single linear layer; normalize to lists.
    if isinstance(self.linear_n, int):
        self.linear_n = [self.linear_n]
        self.linear_percent_on = [self.linear_percent_on]
        self.linear_weight_sparsity = [self.linear_weight_sparsity]
    # BUG FIX: output_size was assigned twice with the identical expression;
    # keep a single assignment (custom output size allows model reuse).
    self.output_size = config.get("output_size", 10)
    self.optimizer_alg = config.get("optimizer", "SGD")

    # Setup devices, model, and dataloaders
    print("setup: Torch device count=", torch.cuda.device_count())
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        print("setup: Using cuda")
        self.device = torch.device("cuda")
        torch.cuda.manual_seed(seed)
    else:
        print("setup: Using cpu")
        self.device = torch.device("cpu")

    self.transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        # can store stats in database or dynamically obtain
        transforms.Normalize(
            (0.50707516, 0.48654887, 0.44091784),
            (0.26733429, 0.25643846, 0.27615047),
        ),
    ])
    self.transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            (0.50707516, 0.48654887, 0.44091784),
            (0.26733429, 0.25643846, 0.27615047),
        ),
    ])

    # added custom dataset to reuse model (e.g. CIFAR10 / CIFAR100)
    self.dataset = config.get("dataset", "CIFAR10")
    train_dataset = getattr(datasets, self.dataset)(
        self.data_dir, train=True, transform=self.transform_train)
    self.train_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=True)
    # First epoch may use a different (usually smaller) batch size.
    self.first_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=first_epoch_batch_size, shuffle=True)
    self.test_loaders = create_test_loaders(self.dataset, self.noise_values,
                                            self.test_batch_size,
                                            self.data_dir)

    if network_type == "vgg":
        self._create_vgg_model()

    self.optimizer = self._create_optimizer(self.model, self.optimizer_alg)
    self.lr_scheduler = self._create_learning_rate_scheduler(self.optimizer)

    # adding track of losses for early stopping
    # self.mean_losses = deque(maxlen=max(3,int(self.iterations/10)))
    self.mean_losses = deque(maxlen=self.iterations)
    self.bad_epochs = 0
    self.grace_period = max(1, int(self.iterations / 5))
    self.patience = 3
def main():
    """Fine-tune / analyze a multilabel COCO object classifier, tracking the
    confusion between a (first, second, third) object triplet per epoch.

    Uses module-level globals ``global_epoch_confusion`` (per-epoch metrics)
    and ``glob_bn_count`` / ``glob_bn_total`` (BN-layer replacement state).
    """
    global global_epoch_confusion
    global glob_bn_count
    global glob_bn_total

    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, default='log',
                        help='path for saving trained models and log info')
    parser.add_argument('--ann_dir', type=str,
                        default='/media/data/dataset/coco/annotations',
                        help='path for annotation json file')
    parser.add_argument('--image_dir', default='/media/data/dataset/coco')
    parser.add_argument('--resume', default=1, type=int,
                        help='whether to resume from log_dir if existent')
    parser.add_argument('--finetune', default=0, type=int)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--start_epoch', type=int, default=1)
    # batch size should be smaller if use text
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--crop_size', type=int, default=224)
    parser.add_argument('--image_size', type=int, default=256)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--lam', default=0.5, type=float,
                        help='hyperparameter lambda')
    parser.add_argument('--first', default="person", type=str,
                        help='first object index')
    parser.add_argument('--second', default="clock", type=str,
                        help='second object index')
    parser.add_argument('--third', default="bus", type=str,
                        help='third object index')
    parser.add_argument('--pretrained', default='/set/your/model/path',
                        type=str, metavar='PATH')
    parser.add_argument('--debug', help='Check model accuracy',
                        action='store_true')
    parser.add_argument('--ratio', default=0.5, type=float,
                        help='target ratio for batchnorm layers')
    parser.add_argument('--replace', help='replace bn layer ',
                        action='store_true')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if os.path.exists(args.log_dir) and not args.resume:
        print('Path {} exists! and not resuming'.format(args.log_dir))
        return
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    # save all parameters for training
    with open(os.path.join(args.log_dir, "arguments.log"), "a") as f:
        f.write(str(args) + '\n')

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # Image preprocessing (transforms.Scale is the legacy alias of Resize)
    train_transform = transforms.Compose([
        transforms.Scale(args.image_size),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize])
    val_transform = transforms.Compose([
        transforms.Scale(args.image_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        normalize])

    # Data samplers.
    train_data = CocoObject(ann_dir=args.ann_dir, image_dir=args.image_dir,
                            split='train', transform=train_transform)
    first_data = CocoObject(ann_dir=args.ann_dir, image_dir=args.image_dir,
                            split='train', transform=train_transform,
                            filter=args.first)
    second_data = CocoObject(ann_dir=args.ann_dir, image_dir=args.image_dir,
                             split='train', transform=train_transform,
                             filter=args.second)
    third_data = CocoObject(ann_dir=args.ann_dir, image_dir=args.image_dir,
                            split='train', transform=train_transform,
                            filter=args.third)
    val_data = CocoObject(ann_dir=args.ann_dir, image_dir=args.image_dir,
                          split='val', transform=val_transform)

    # Data loaders / batch assemblers.
    # BUG FIX: batch_size was args.batch_size/3 (true division -> float);
    # DataLoader requires an int, so use floor division.
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True, num_workers=1,
                                               pin_memory=True)
    first_loader = torch.utils.data.DataLoader(
        first_data, batch_size=args.batch_size // 3, shuffle=True,
        num_workers=0, pin_memory=False)
    second_loader = torch.utils.data.DataLoader(
        second_data, batch_size=args.batch_size // 3, shuffle=True,
        num_workers=0, pin_memory=False)
    third_loader = torch.utils.data.DataLoader(
        third_data, batch_size=args.batch_size // 3, shuffle=True,
        num_workers=0, pin_memory=False)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False, num_workers=0,
                                             pin_memory=True)

    # Build the models
    model = MultilabelObject(args, 80).cuda()
    # BUG FIX: was size_average=True, reduction='None' — 'None' is not a
    # valid reduction string and the deprecated size_average=True overrode
    # it to 'mean' anyway; state reduction='mean' explicitly.
    criterion = nn.BCEWithLogitsLoss(
        weight=torch.FloatTensor(train_data.getObjectWeights()),
        reduction='mean').cuda()

    def trainable_params():
        # Yield only parameters that require gradients.
        for param in model.parameters():
            if param.requires_grad:
                yield param

    optimizer = torch.optim.Adam(trainable_params(), args.learning_rate,
                                 weight_decay=1e-5)

    best_performance = 0
    if os.path.isfile(args.pretrained):
        train_F = open(os.path.join(args.log_dir, 'train.csv'), 'w')
        val_F = open(os.path.join(args.log_dir, 'val.csv'), 'w')
        score_F = open(os.path.join(args.log_dir, 'score.csv'), 'w')
        print("=> loading checkpoint '{}'".format(args.pretrained))
        checkpoint = torch.load(args.pretrained)
        args.start_epoch = checkpoint['epoch']
        best_performance = checkpoint['best_performance']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
    else:
        # A valid pretrained checkpoint is mandatory for this script.
        exit()

    if args.replace:
        # Replace a ratio of BN layers; counting and replacement share the
        # module-level glob_bn_* counters.
        model.to('cpu')
        glob_bn_total = 0
        glob_bn_count = 0
        count_bn_layer(model)
        print("total bn layer: " + str(glob_bn_total))
        glob_bn_count = 0
        replace_bn(model, args.ratio)
        print(model)
        model = model.cuda()

    for epoch in range(args.start_epoch, args.num_epochs + 1):
        global_epoch_confusion.append({})
        adjust_learning_rate(optimizer, epoch)
        train(args, epoch, model, criterion, train_loader, optimizer,
              train_F, score_F, train_data, first_loader, second_loader,
              third_loader)
        current_performance = get_confusion(args, epoch, model, criterion,
                                            val_loader, optimizer, val_F,
                                            score_F, val_data)
        is_best = current_performance > best_performance
        best_performance = max(current_performance, best_performance)
        model_state = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_performance': best_performance}
        save_checkpoint(args, model_state, is_best,
                        os.path.join(args.log_dir, 'checkpoint.pth.tar'))

        confusion_matrix = global_epoch_confusion[-1]["confusion"]
        first_second = compute_confusion(confusion_matrix, args.first,
                                         args.second)
        first_third = compute_confusion(confusion_matrix, args.first,
                                        args.third)
        print(str((args.first, args.second, args.third)) + " triplet: " +
              str(compute_bias(confusion_matrix, args.first, args.second,
                               args.third)))
        print(str((args.first, args.second)) + ": " + str(first_second))
        print(str((args.first, args.third)) + ": " + str(first_third))
        # os.system('python plot.py {} &'.format(args.log_dir))

    train_F.close()
    val_F.close()
    score_F.close()
    np.save(os.path.join(args.log_dir, 'global_epoch_confusion.npy'),
            global_epoch_confusion)
    glob_bn_total = 0
    glob_bn_count = 0
def main():
    """Train a CIFAR-10 classifier, optionally under distributed data
    parallelism, checkpointing the best top-1 accuracy each epoch."""
    global best_acc1, start_epoch

    # Model, LR (scaled by batch size), optimizer, loss, and schedule.
    model = get_model(config.get_string('arch'))
    model.cuda()
    scaled_lr = scale_lr(
        config.get_float('optimizer.lr'),
        config.get_int('dataloader.batch_size')
    )
    optimizer = optim.SGD(
        model.parameters(),
        lr=scaled_lr,
        momentum=config.get_float('optimizer.momentum'),
        weight_decay=config.get_float('optimizer.weight_decay'),
        nesterov=config.get_bool('optimizer.nesterov')
    )
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, config.get_list('scheduler.milestones')
    )
    if tpp.distributed:
        model = DistributedDataParallel(model, device_ids=[tpp.local_rank])

    # Input pipelines: augmented train split, plain validation split.
    normalize = T.Normalize(
        config.get_list('dataset.mean'), config.get_list('dataset.std')
    )
    # UT.RandomCrop(32, padding=4),
    # UT.RandomHorizontalFlip(),
    augmentations = [
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalize
    ]
    train_transform = T.Compose(augmentations)
    val_transform = T.Compose([T.ToTensor(), normalize])

    train_set = CIFAR10(
        config.get_string('dataset.root'), train=True,
        transform=train_transform, download=True
    )
    val_set = CIFAR10(
        config.get_string('dataset.root'), train=False,
        transform=val_transform, download=False
    )

    # Distributed runs shard both splits with samplers; otherwise shuffle
    # the train loader directly.
    if tpp.distributed:
        train_sampler = DistributedSampler(train_set)
        val_sampler = DistributedSampler(val_set)
    else:
        train_sampler = None
        val_sampler = None

    batch_size = config.get_int('dataloader.batch_size')
    workers = config.get_int('dataloader.num_workers')
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=(train_sampler is None),
        num_workers=workers,
        sampler=train_sampler
    )
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=workers,
        sampler=val_sampler
    )

    # for epoch in range(start_epoch, 1):
    for epoch in range(start_epoch, config.get_int('strategy.num_epochs')):
        if tpp.distributed:
            # Reshuffle the shards differently every epoch.
            train_sampler.set_epoch(epoch)

        train(model, train_loader, criterion, optimizer, epoch)
        acc1 = validate(model, val_loader, criterion, epoch)
        scheduler.step()
        writer.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)

        # Track the best accuracy seen so far and checkpoint every epoch.
        is_best = acc1 > best_acc1
        if is_best:
            best_acc1 = acc1
        state = {
            'epoch': epoch + 1,
            'arch': config.get_string('arch'),
            'state_dict': (model.module.state_dict()
                           if tpp.distributed else model.state_dict()),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }
        save_checkpoint(state, is_best=is_best, folder=experiment_path)
def __getitem__(self, index): img_path = self.imgs[index] label = 1 if 'dog' in img_path.split('/')[-1] else 0 # 狗的label设为1,猫的设为0 data = Image.open(img_path) data = self.transform(data) return data, label def __len__(self): return len(self.imgs) # 对数据集训练集的处理,其实可以直接放到 DogCat 类里面去 transform_train = transforms.Compose([ transforms.Resize((256, 256)), # 先调整图片大小至256x256 transforms.RandomCrop((224, 224)), # 再随机裁剪到224x224 transforms.RandomHorizontalFlip(), # 随机的图像水平翻转,通俗讲就是图像的左右对调 transforms.ToTensor(), # Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.2225)) # 归一化,数值是用ImageNet给出的数值 ]) # 对数据集验证集的处理 transform_val = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ]) # 生成训练集和验证集 trainset = CustomData('/raid/bruce/datasets/dogs_cats/train', transform=transform_train) valset = CustomData('/raid/bruce/datasets/dogs_cats/train', transform=transform_val, train=False, val=True) # 将训练集和验证集放到 DataLoader 中去,shuffle 进行打乱顺序(在多个 epoch 的情况下)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker for linear evaluation of a MoCo-pretrained encoder
    on CIFAR-10/100: loads the pretrained backbone, freezes everything but
    the final fc layer, and trains/validates that classifier.
    """
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # freeze all layers but the last fc
    for name, param in model.named_parameters():
        if name not in ['fc.weight', 'fc.bias']:
            param.requires_grad = False

    # monkey patch fix for cifar100: resize fc to 100 classes
    if args.data == 'cifar100':
        model.fc = nn.Linear(model.fc.weight.size(1), 100)
    # init the fc layer (the only trainable part)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and \
                        not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # Only the (re-initialized) fc weights may be missing.
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # Only the master rank writes TensorBoard logs.
    # NOTE(review): in single-process runs args.rank may still be its default
    # (-1), leaving writer=None — confirm against the argument parser.
    if args.rank == 0:
        writer = SummaryWriter(logdir=args.save_dir)
    else:
        writer = None

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code: resolve the dataset directory, falling back to a
    # sibling directory when the shared data path is absent.
    if args.data == 'cifar10':
        if os.path.exists(f'{data_path}/cifar10'):
            traindir = os.path.join(f'{data_path}/cifar10')
        else:
            traindir = '../cifar10'
    elif args.data == 'cifar100':
        if os.path.exists(f'{data_path}/cifar100'):
            traindir = os.path.join(f'{data_path}/cifar100')
        else:
            traindir = '../cifar100'
    valdir = traindir

    # Per-dataset class and normalization statistics.
    if args.data == 'cifar10':
        dataset_cls = datasets.CIFAR10
        normalize = transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                                         std=(0.2023, 0.1994, 0.2010))
    elif args.data == 'cifar100':
        dataset_cls = datasets.CIFAR100
        normalize = transforms.Normalize(mean=(0.5071, 0.4867, 0.4408),
                                         std=(0.2675, 0.2565, 0.2761))

    train_dataset = dataset_cls(
        traindir,
        train=True,
        transform=transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        dataset_cls(valdir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # Evaluation-only mode: validate once and return.
    if args.evaluate:
        validate(val_loader, model, criterion, args, writer)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, writer, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Only the master of each node checkpoints; every 10 epochs + last.
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            if epoch % 10 == 0 or epoch == args.epochs - 1:
                save_checkpoint({
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, filename='{}/checkpoint.pt'.format(args.save_dir))
            if epoch == args.start_epoch:
                # Verify the frozen backbone was not modified by training.
                sanity_check(model.state_dict(), args.pretrained)
def __init__(self, config, setname, path_prefix=None, data_length=1000000):
    """Build one split of the miniImageNet episodic dataset.

    :param config: dict providing 'data.horizon' and 'data.hazard' entries.
    :param setname: one of 'train', 'val', 'test'; selects both the CSV split
        file and the superclass dictionary.
    :param path_prefix: optional prefix prepended to the default
        'data/miniImageNet/' root.
    :param data_length: nominal dataset length reported to the sampler.
    :raises ValueError: if ``setname`` is not a recognized split.
    """
    ROOT_PATH = 'data/miniImageNet/'
    if path_prefix is not None:
        ROOT_PATH = path_prefix + ROOT_PATH
    self.horizon = config['data.horizon']
    self.switch_prob = config['data.hazard']
    self.data_length = data_length
    if setname == 'train':
        self.superclass_dict = imagenet_train_superclasses
    elif setname == 'val':
        self.superclass_dict = imagenet_val_superclasses
    elif setname == 'test':
        self.superclass_dict = imagenet_test_superclasses
    else:
        # Was a bare `raise ValueError`; give callers a useful message.
        raise ValueError(
            "setname must be 'train', 'val' or 'test', got {!r}".format(setname))
    csv_path = osp.join(ROOT_PATH, setname + '.csv')
    # Fix: the original `open(csv_path, 'r').readlines()` leaked the file
    # handle; a context manager guarantees it is closed.
    with open(csv_path, 'r') as csv_file:
        lines = [x.strip() for x in csv_file.readlines()][1:]  # [1:] skips header

    data = []
    label = []
    lb = -1
    self.wnids = []
    self.class_probs = [0.2, 0.2, 0.2, 0.2, 0.2]
    for line in lines:
        name, wnid = line.split(',')
        path = osp.join(ROOT_PATH, 'images', name)
        # Rows are grouped by wnid; each newly seen wnid opens the next
        # integer label index.
        if wnid not in self.wnids:
            self.wnids.append(wnid)
            lb += 1
        data.append(path)
        label.append(lb)
    self.data = data
    self.label = label

    if setname == 'train' or setname == 'val':
        # Augmented pipeline for train/val episodes.
        self.transform = transforms.Compose([
            transforms.Resize(84),
            transforms.CenterCrop(84),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.ColorJitter(brightness=0.4, contrast=0.4,
                                   saturation=0.4, hue=0.4),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    elif setname == 'test':
        # Deterministic pipeline for evaluation.
        self.transform = transforms.Compose([
            transforms.Resize(84),
            transforms.CenterCrop(84),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: set up ResNet-50 + ISDA training on ImageNet and run it.

    :param gpu: GPU index assigned to this process (None means no single-device pin).
    :param ngpus_per_node: GPUs on this node; used to derive the global rank and
        to split batch size / workers in single-GPU-per-process DDP mode.
    :param args: parsed CLI namespace (distributed flags, paths, hyperparameters).
    """
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    # (previous generic selection kept for reference, as a comment instead of a
    # no-op string literal)
    # if 'res' in args.model:
    #     model = eval('networks.resnet.' + args.model)()
    # elif 'dense' in args.model:
    #     model = eval('networks.densenet.' + args.model)()
    # else:
    #     print('Please select the model from resnet{18, 34, 50, 101, 152} / '
    #           'resnext{50_32x4d, 101_32x8d} / densenet{121, 169, 201, 265}')
    checkpoint_dir = os.path.join(args.train_url, 'resnet50-19c8e357.pth')
    print('checkpoint_dir:', checkpoint_dir)
    print('=========> Load Checkpoint from checkpoint_dir')
    checkpoint = torch.load(checkpoint_dir)
    model = networks_imagenet.resnet.resnet50()
    model.load_state_dict(checkpoint)

    if args.pre_train:
        # Optionally replace the ImageNet weights with a custom pre-trained model.
        pre_train_model = torch.load(args.pre_train)
        model.load_state_dict(pre_train_model)

    feature_num = model.feature_num
    print('Number of final features: {}'.format(int(model.feature_num)))
    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = model.cuda()
        # model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    criterion_isda = ISDALoss(feature_num, 1000).cuda()  # 1000 ImageNet classes
    criterion_ce = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    print('=========> Load Dataset')
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    # traindir = os.path.join(args.data_url, 'train')
    # valdir = os.path.join(args.data_url, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Preprocessing
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    if args.evaluate:
        validate(val_loader, model, criterion_ce, args)
        return

    print('=========> Load Dataset Ended')
    print('=========> Start Training')
    # Fix: '\l' was an invalid escape sequence (SyntaxWarning on modern
    # Python); '\\lambda' prints the identical text.
    print('\\lambda_0: {}'.format(args.lambda_0))
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        print('\\lambda_now: {}'.format(args.lambda_0 * (epoch / args.epochs)))

        # train for one epoch
        print('epoch:', epoch)
        train(train_loader, model, criterion_isda, optimizer, epoch, args)

        print('====> Save Average and Covariance to train_url...')
        # Fix: np.float was a deprecated alias of the builtin float and was
        # removed in NumPy 1.24; use float directly.
        var = np.array(criterion_isda.estimator.CoVariance.cpu(), dtype=float)
        print('var.shape:', var.shape)
        for i in range(var.shape[0]):
            csv_name = os.path.join(args.train_url, 'Covariance/',
                                    '{0}_cov_imagenet.csv'.format(i))
            # Context manager so each CSV is closed even if a write fails
            # (was explicit open()/close()).
            with open(csv_name, 'w') as f:
                for j in range(var.shape[1]):
                    f.write(str(var[i][j]) + '\n')
            # np.savetxt('{0}/{1}_cov_imagenet.csv'.format(record_path, i), var[i], delimeter=' ')

        # evaluate on validation set
        # acc1 = validate(val_loader, model, criterion_ce, args)
        # remember best acc@1 and save checkpoint
        # NOTE(review): the original ended here with an opening ''' of a
        # commented-out checkpoint-saving section that is truncated in this
        # chunk of the source.
def teacher_train(teacher, args):
    """Fine-tune (or prune) a deep copy of ``teacher`` on the teacher dataset,
    then evaluate it on the triggered (poisoned) and clean test splits, logging
    both results to <output_dir>/test.tsv.

    :param teacher: pre-trained model; left untouched, a copy becomes the student.
    :param args: namespace with dataset paths, poisoning options and the
        ``teacher_method`` string that selects the fine-tuning machine.
    :return: the fine-tuned student model.
    """
    # Pin every RNG source so few-shot sampling / poisoning is reproducible.
    seed = 98
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # Used to make sure we sample the same image for few-shot scenarios
    seed = 98

    # NOTE(review): eval() on args.teacher_dataset executes arbitrary code if
    # the CLI value is untrusted; a name->class dispatch dict would be safer.
    # Training split: poisoned fraction controlled by args.argportion.
    train_set = eval(args.teacher_dataset)(
        args.teacher_datapath, True, [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ], args.shot, seed, preload=False, portion=args.argportion,
        fixed_pic=args.fixed_pic, is_poison=args.is_poison)
    # print(len(train_set))
    # print(train_set.chosen)
    # input()
    # Test split is fully poisoned (portion=1) to measure the targeted attack.
    test_set = eval(args.teacher_dataset)(
        args.teacher_datapath, False, [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ],
        # target attack
        args.shot, seed, preload=False, portion=1, fixed_pic=args.fixed_pic,
        four_corner=args.four_corner, is_poison=args.is_poison)
    # Clean split (portion=0) measures accuracy on unpoisoned data.
    clean_set = eval(args.teacher_dataset)(args.teacher_datapath, False, [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ], args.shot, seed, preload=False, portion=0, fixed_pic=args.fixed_pic,
        is_poison=args.is_poison)
    # print("trigger py file",args.argportion)
    # for j in range(20):
    #     iii = random.randint(0, len(train_set))
    #     originphoto = train_set[iii][0]
    #     # originphoto = originphoto.numpy() * normalize.std + normalize.mean
    #     numpyphoto = np.transpose(originphoto.numpy(), (1, 2, 0))
    #     # numpyphoto = numpyphoto * normalize.std + normalize.mean
    #     plt.imshow(numpyphoto)
    #     plt.show()
    #     print(iii, train_set[iii][1], "teacher")
    # input()
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=8,
                                               pin_memory=False)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=8,
                                              pin_memory=False)
    clean_loader = torch.utils.data.DataLoader(clean_set,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=8,
                                               pin_memory=False)
    # input()

    # The student starts as an exact copy of the teacher on GPU.
    student = copy.deepcopy(teacher).cuda()

    # Select the fine-tuning / pruning machine from args.teacher_method.
    if True:
        if args.teacher_method == "weight":
            finetune_machine = WeightPruner(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "taylor_filter":
            finetune_machine = TaylorFilterPruner(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "snip":
            finetune_machine = SNIPPruner(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "perlayer_weight":
            finetune_machine = PerlayerWeightPruner(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "dataset_grad":
            finetune_machine = DatasetGrad(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "dataset_grad_optim":
            finetune_machine = DatasetGradOptim(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "global_dataset_grad_optim":
            finetune_machine = GlobalDatasetGradOptim(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "global_dataset_grad_optim_3kiter":
            finetune_machine = GlobalDatasetGradOptimIter(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "global_datasetgrad_mul_mag":
            finetune_machine = GlobalDatasetGradOptimMulMag(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "global_datasetgrad_div_mag":
            finetune_machine = GlobalDatasetGradOptimDivMag(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "global_datasetgrad_div_mag_3kiter":
            finetune_machine = GlobalDatasetGradOptimDivMagIter(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "inv_grad_optim":
            finetune_machine = InvGradOptim(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "inv_grad_plane":
            finetune_machine = InvGradPlane(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "inv_grad_avg":
            finetune_machine = InvGradAvg(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "forward_backward_grad":
            finetune_machine = ForwardBackwardGrad(
                args, student, teacher, train_loader, test_loader,
            )
        elif args.teacher_method == "backdoor_finetune":
            # Backdoor fine-tuning additionally prunes the student's weights
            # before handing it to the attack fine-tuner.
            student = weight_prune(
                student, args.backdoor_update_ratio,
            )
            finetune_machine = AttackFinetuner(
                args, student, teacher, train_loader, test_loader,
            )
        else:
            # Default: plain fine-tuning.
            finetune_machine = Finetuner(args, student, teacher, train_loader,
                                         test_loader, "ONE")

    finetune_machine.train()
    # finetune_machine.adv_eval()
    # if args.teacher_method is not None:
    #     finetune_machine.final_check_param_num()

    # start testing (more testing, more cases)
    # First: triggered test set (targeted-attack success).
    finetune_machine.test_loader = test_loader
    test_top1, test_ce_loss, test_feat_loss, test_weight_loss, test_feat_layer_loss = finetune_machine.test(
    )
    test_path = osp.join(args.output_dir, "test.tsv")
    with open(test_path, 'a') as af:
        af.write(
            'Teacher! Start testing: trigger dataset(target attack):\n')
        columns = ['time', 'Acc', 'celoss', 'featloss', 'l2sp']
        af.write('\t'.join(columns) + '\n')
        # asctime slice keeps "Mon DD HH:MM:SS" (drops weekday and year).
        localtime = time.asctime(time.localtime(time.time()))[4:-6]
        test_cols = [
            localtime,
            round(test_top1, 2),
            round(test_ce_loss, 2),
            round(test_feat_loss, 2),
            round(test_weight_loss, 2),
        ]
        af.write('\t'.join([str(c) for c in test_cols]) + '\n')

    # Second: clean test set (benign accuracy).
    finetune_machine.test_loader = clean_loader
    test_top2, clean_test_ce_loss, clean_test_feat_loss, clean_test_weight_loss, clean_test_feat_layer_loss = finetune_machine.test(
    )
    test_path = osp.join(args.output_dir, "test.tsv")
    with open(test_path, 'a') as af:
        af.write('Teacher! Start testing: clean dataset:\n')
        columns = ['time', 'Acc', 'celoss', 'featloss', 'l2sp']
        af.write('\t'.join(columns) + '\n')
        localtime = time.asctime(time.localtime(time.time()))[4:-6]
        test_cols = [
            localtime,
            round(test_top2, 2),
            round(clean_test_ce_loss, 2),
            round(clean_test_feat_loss, 2),
            round(clean_test_weight_loss, 2),
        ]
        af.write('\t'.join([str(c) for c in test_cols]) + '\n')

    return student
def __init__(self, root, add_labeled=0, advanced_transforms=True, remove_classes=False,
             expand_labeled=0, expand_unlabeled=0, unlabeled_subset_ratio=1,
             oversampling=True, stratified=False, merged=True,
             unlabeled_augmentations=False, seed=9999, k_medoids=False,
             k_medoids_model=None, k_medoids_n_clusters=10, start_labeled=300):
    """Configure the ISIC skin-lesion dataset wrapper for active/semi-supervised learning.

    :param root: dataset root; expects images under <root>/isic/train and <root>/isic/test.
    :param add_labeled: number of samples moved into the labeled pool per acquisition step.
    :param advanced_transforms: if True, use augmentation for training; otherwise plain resize.
    :param remove_classes: whether to drop the classes listed in classes_to_remove.
    :param expand_labeled: oversampling expansion factor for the labeled pool.
    :param expand_unlabeled: oversampling expansion factor for the unlabeled pool.
    :param unlabeled_subset_ratio: fraction of the unlabeled pool to consider.
    :param oversampling: enable class oversampling.
    :param stratified: use stratified splitting.
    :param merged: class-merging flag (see NOTE below on merged_classes).
    :param unlabeled_augmentations: apply training augmentations to unlabeled data.
    :param seed: RNG seed for splits.
    :param k_medoids: use k-medoids to pick the initial labeled pool.
    :param k_medoids_model: feature model for k-medoids.
    :param k_medoids_n_clusters: number of k-medoids clusters.
    :param start_labeled: size of the initial labeled pool.
    """
    self.root = root
    self.train_path = os.path.join(self.root, "isic", "train")
    self.test_path = os.path.join(self.root, "isic", "test")
    # Channel statistics of the ISIC training images.
    self.isic_mean = (0.6679, 0.5297, 0.5246)
    self.isic_std = (0.1338, 0.1470, 0.1577)
    self.input_size = 128
    self.crop_size = 128
    self.expand_labeled = expand_labeled
    self.expand_unlabeled = expand_unlabeled
    self.oversampling = oversampling
    self.stratified = stratified
    self.merged = merged
    self.merge_classes = []

    if advanced_transforms:
        # Heavy augmentation for supervised training.
        self.transform_train = transforms.Compose([
            transforms.RandomCrop(self.crop_size),
            transforms.RandomAffine(degrees=90, translate=(0.2, 0.2)),
            transforms.Resize(size=self.input_size),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.ToTensor(),
            transforms.RandomErasing(scale=(0.02, 0.2), ratio=(0.3, 0.9)),
        ])
        self.transform_test = transforms.Compose([
            transforms.Resize(size=self.input_size),
            transforms.ToTensor(),
        ])
    else:
        # Minimal pipeline: resize + tensor conversion for both splits.
        self.transform_train = transforms.Compose([
            transforms.Resize(size=self.input_size),
            transforms.ToTensor(),
        ])
        self.transform_test = transforms.Compose([
            transforms.Resize(size=self.input_size),
            transforms.ToTensor(),
        ])
    # Autoencoder training reuses the augmented pipeline.
    self.transform_autoencoder = transforms.Compose([
        transforms.RandomCrop(self.crop_size),
        transforms.RandomAffine(degrees=90, translate=(0.2, 0.2)),
        transforms.Resize(size=self.input_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.RandomErasing(scale=(0.02, 0.2), ratio=(0.3, 0.9)),
    ])
    self.transform_simclr = TransformsSimCLR(size=self.input_size)
    self.transform_fixmatch = TransformFix(crop_size=self.crop_size,
                                           input_size=self.input_size)
    # NOTE(review): the original expression was `0 if self.merged else 0` —
    # both branches are 0, so merging never reduced the class count. The dead
    # conditional is simplified to the constant (behavior unchanged); confirm
    # whether a nonzero value was intended for the merged case.
    self.merged_classes = 0
    self.num_classes = 8 - self.merged_classes  # ISIC has 8 diagnosis classes
    self.add_labeled = add_labeled
    self.unlabeled_subset_ratio = unlabeled_subset_ratio
    self.unlabeled_subset_num = None
    self.remove_classes = remove_classes
    self.unlabeled_augmentations = unlabeled_augmentations
    self.labeled_class_samples = None
    self.classes_to_remove = [2, 3, 4, 5, 6, 7]
    self.seed = seed
    self.labeled_amount = self.num_classes
    self.k_medoids = k_medoids
    self.k_medoids_model = k_medoids_model
    self.k_medoids_n_clusters = k_medoids_n_clusters
    self.start_labeled = start_labeled
def main():
    """Train a DeepMCDD model (DenseNet or ResNet backbone) on the chosen
    in-distribution dataset, then report ID accuracy and OOD-detection metrics
    after every epoch, saving the final weights to ``model_path``.

    :raises ValueError: if args.dataset or args.net_type is not supported
        (previously an unknown value caused a NameError further down).
    """
    # set the path to pre-trained model and output
    outdir = os.path.join(args.outdir, args.net_type + '_' + args.dataset)
    pretrained_path = os.path.join('./pretrained/',
                                   args.net_type + '_' + args.dataset + '.pth')
    model_path = os.path.join(args.modeldir,
                              args.net_type + '_' + args.dataset + '.pth')

    # Idiom fix: `== False` comparisons replaced with `not`.
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    if not os.path.isdir(args.modeldir):
        os.mkdir(args.modeldir)

    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.cuda.set_device(args.gpu)

    # Each ID dataset is paired with its standard OOD benchmark list.
    if args.dataset == 'svhn':
        num_classes = 10
        ood_list = ['cifar10', 'imagenet_crop', 'lsun_crop']
    elif args.dataset == 'cifar10':
        num_classes = 10
        ood_list = ['svhn', 'imagenet_crop', 'lsun_crop']
    elif args.dataset == 'cifar100':
        num_classes = 100
        ood_list = ['svhn', 'imagenet_crop', 'lsun_crop']
    else:
        raise ValueError('Unsupported dataset: {}'.format(args.dataset))

    if args.net_type == 'densenet':
        model = models.DenseNet_DeepMCDD(num_classes=num_classes)
        if args.pretrained:
            model.load_fe_weights(
                torch.load(pretrained_path, map_location="cuda:" + str(args.gpu)))
        # DenseNet convention: per-channel statistics expressed on the 0-255 scale.
        in_transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((125.3 / 255, 123.0 / 255, 113.9 / 255),
                                 (63.0 / 255, 62.1 / 255.0, 66.7 / 255.0)),
        ])
        in_transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((125.3 / 255, 123.0 / 255, 113.9 / 255),
                                 (63.0 / 255, 62.1 / 255.0, 66.7 / 255.0)),
        ])
    elif args.net_type == 'resnet':
        model = models.ResNet_DeepMCDD(num_classes=num_classes)
        if args.pretrained:
            model.load_fe_weights(
                torch.load(pretrained_path, map_location="cuda:" + str(args.gpu)))
        in_transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        in_transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
    else:
        raise ValueError('Unsupported net_type: {}'.format(args.net_type))

    model.cuda()

    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=args.momentum, weight_decay=1e-4,
                                nesterov=True)
    # Step the LR down at 50% and 75% of training.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(args.num_epochs * 0.5), int(args.num_epochs * 0.75)],
        gamma=0.1)
    train_loader, test_id_loader = get_id_image_data(args.dataset,
                                                     args.batch_size,
                                                     in_transform_train,
                                                     in_transform_test,
                                                     args.datadir)
    ce_loss = torch.nn.CrossEntropyLoss()

    for epoch in range(args.num_epochs):
        model.train()
        total_loss = 0.0
        # (unused enumerate index removed)
        for images, labels in train_loader:
            images, labels = images.cuda(), labels.cuda()
            dists = model(images)  # per-class distances to class centers
            scores = -dists + model.alphas
            # One-hot mask selecting each sample's own-class distance.
            label_mask = torch.zeros(labels.size(0), num_classes).cuda().scatter_(
                1, labels.unsqueeze(dim=1), 1)
            # Pull samples toward their class center; push via cross-entropy
            # over the distance-derived scores.
            pull_loss = torch.mean(
                torch.sum(torch.mul(label_mask, dists), dim=1))
            push_loss = ce_loss(scores, labels)
            loss = args.reg_lambda * pull_loss + push_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()

        model.eval()
        with torch.no_grad():
            # (1) evaluate ID classification
            correct, total = 0, 0
            for images, labels in test_id_loader:
                images, labels = images.cuda(), labels.cuda()
                scores = -model(images) + model.alphas
                _, predicted = torch.max(scores, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            idacc = 100 * correct / total

            # (2) evaluate OOD detection against each benchmark dataset
            ood_results_list = []
            compute_confscores(model, test_id_loader, outdir, True)
            for ood in ood_list:
                test_ood_loader = get_ood_image_data(ood, args.batch_size,
                                                     in_transform_test,
                                                     args.datadir)
                compute_confscores(model, test_ood_loader, outdir, False)
                ood_results_list.append(compute_metrics(outdir))

        print('== Epoch [{}/{}], Loss {} =='.format(epoch + 1,
                                                    args.num_epochs,
                                                    total_loss))
        print('ID Accuracy on "{idset:s}" test images : {val:6.2f}\n'.format(
            idset=args.dataset, val=idacc))
        for ood_idx, ood_results in enumerate(ood_results_list):
            print('OOD accuracy on "{oodset:s}" test samples :'.format(
                oodset=ood_list[ood_idx]))
            print_ood_results(ood_results)

    torch.save(model.state_dict(), model_path)
# Mirror every split's images into DATA_DIR/<split>/<class_name>/.
for split_name, split_images in dataset_data:
    for source_path in split_images:
        shutil.copy(source_path, f'{DATA_DIR}/{split_name}/{class_name}/')

"""We have some class imbalance, but it is not that bad. We'll ignore it. We'll apply some image augmentation techniques to artificially increase the size of our training dataset: """

# ImageNet channel statistics used by the pretrained backbone.
mean_nums = [0.485, 0.456, 0.406]
std_nums = [0.229, 0.224, 0.225]


def _eval_pipeline():
    # Deterministic resize + center-crop pipeline; val and test each get
    # their own instance.
    return T.Compose([
        T.Resize(size=256),
        T.CenterCrop(size=224),
        T.ToTensor(),
        T.Normalize(mean_nums, std_nums)
    ])


# Only the training split is augmented (random crop/rotation/flip).
transforms = {
    'train': T.Compose([
        T.RandomResizedCrop(size=256),
        T.RandomRotation(degrees=15),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean_nums, std_nums)
    ]),
    'val': _eval_pipeline(),
    'test': _eval_pipeline(),
}
def train_imagenet():
    """Train the configured model on ImageNet (or synthetic data) across XLA devices.

    Builds train/test loaders (SampleGenerators of zero tensors when
    FLAGS.fake_data), runs FLAGS.num_epochs of train + eval through
    ParallelLoader, and returns the best test accuracy observed.

    :return: maximum test accuracy (percent) across all epochs.
    """
    print('==> Preparing data..')
    img_dim = get_model_property('img_dim')
    if FLAGS.fake_data:
        train_dataset_len = 1200000  # Roughly the size of Imagenet dataset.
        train_loader = xu.SampleGenerator(
            data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim),
                  torch.zeros(FLAGS.batch_size, dtype=torch.int64)),
            sample_count=train_dataset_len // FLAGS.batch_size //
            xm.xrt_world_size())
        test_loader = xu.SampleGenerator(
            data=(torch.zeros(FLAGS.test_set_batch_size, 3, img_dim, img_dim),
                  torch.zeros(FLAGS.test_set_batch_size, dtype=torch.int64)),
            # Fix: sample_count previously divided by FLAGS.batch_size even
            # though each generated batch holds test_set_batch_size samples,
            # so the fake test epoch had the wrong number of steps.
            sample_count=50000 // FLAGS.test_set_batch_size //
            xm.xrt_world_size())
    else:
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        train_dataset = torchvision.datasets.ImageFolder(
            os.path.join(FLAGS.datadir, 'train'),
            transforms.Compose([
                transforms.RandomResizedCrop(img_dim),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))
        train_dataset_len = len(train_dataset.imgs)
        resize_dim = max(img_dim, 256)
        test_dataset = torchvision.datasets.ImageFolder(
            os.path.join(FLAGS.datadir, 'val'),
            # Matches Torchvision's eval transforms except Torchvision uses size
            # 256 resize for all models both here and in the train loader. Their
            # version crashes during training on 299x299 images, e.g. inception.
            transforms.Compose([
                transforms.Resize(resize_dim),
                transforms.CenterCrop(img_dim),
                transforms.ToTensor(),
                normalize,
            ]))

        train_sampler, test_sampler = None, None
        if xm.xrt_world_size() > 1:
            # Shard both datasets across the XLA world.
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=True)
            test_sampler = torch.utils.data.distributed.DistributedSampler(
                test_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=False)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=FLAGS.batch_size,
            sampler=train_sampler,
            drop_last=FLAGS.drop_last,
            shuffle=False if train_sampler else True,
            num_workers=FLAGS.num_workers)
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=FLAGS.test_set_batch_size,
            sampler=test_sampler,
            drop_last=FLAGS.drop_last,
            shuffle=False,
            num_workers=FLAGS.num_workers)

    torch.manual_seed(42)

    device = xm.xla_device()
    model = get_model_property('model_fn')().to(device)
    # Only the master ordinal writes TensorBoard summaries.
    writer = None
    if xm.is_master_ordinal():
        writer = test_utils.get_summary_writer(FLAGS.logdir)
    optimizer = optim.SGD(
        model.parameters(),
        lr=FLAGS.lr,
        momentum=FLAGS.momentum,
        weight_decay=1e-4)
    num_training_steps_per_epoch = train_dataset_len // (
        FLAGS.batch_size * xm.xrt_world_size())
    lr_scheduler = schedulers.wrap_optimizer_with_scheduler(
        optimizer,
        scheduler_type=getattr(FLAGS, 'lr_scheduler_type', None),
        scheduler_divisor=getattr(FLAGS, 'lr_scheduler_divisor', None),
        scheduler_divide_every_n_epochs=getattr(
            FLAGS, 'lr_scheduler_divide_every_n_epochs', None),
        num_steps_per_epoch=num_training_steps_per_epoch,
        summary_writer=writer)
    loss_fn = nn.CrossEntropyLoss()

    def train_loop_fn(loader, epoch):
        # One training epoch over this device's shard.
        tracker = xm.RateTracker()
        model.train()
        for step, (data, target) in enumerate(loader):
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            # xm.optimizer_step also performs the cross-replica gradient reduce.
            xm.optimizer_step(optimizer)
            tracker.add(FLAGS.batch_size)
            if lr_scheduler:
                lr_scheduler.step()
            if step % FLAGS.log_steps == 0:
                xm.add_step_closure(
                    _train_update, args=(device, step, loss, tracker, epoch))

    def test_loop_fn(loader, epoch):
        # Evaluate top-1 accuracy on this device's shard, then mesh-reduce.
        total_samples, correct = 0, 0
        model.eval()
        for step, (data, target) in enumerate(loader):
            output = model(data)
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum()
            total_samples += data.size()[0]
            if step % FLAGS.log_steps == 0:
                xm.add_step_closure(
                    test_utils.print_test_update, args=(device, None, epoch, step))
        accuracy = 100.0 * correct.item() / total_samples
        accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean)
        return accuracy

    accuracy, max_accuracy = 0.0, 0.0
    for epoch in range(1, FLAGS.num_epochs + 1):
        xm.master_print('Epoch {} train begin {}'.format(epoch, test_utils.now()))
        para_loader = pl.ParallelLoader(train_loader, [device])
        train_loop_fn(para_loader.per_device_loader(device), epoch)
        xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now()))
        para_loader = pl.ParallelLoader(test_loader, [device])
        accuracy = test_loop_fn(para_loader.per_device_loader(device), epoch)
        xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format(
            epoch, test_utils.now(), accuracy))
        max_accuracy = max(accuracy, max_accuracy)
        test_utils.write_to_summary(
            writer,
            epoch,
            dict_to_write={'Accuracy/test': accuracy},
            write_xla_metrics=True)
        if FLAGS.metrics_debug:
            xm.master_print(met.metrics_report())

    test_utils.close_summary_writer(writer)
    xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy))
    return max_accuracy