def train(embed_file, train_file, test_file, n_epoch=20, batch_size=70, model=1):
    # Load files
    print("init preprocessor with %s" % embed_file)
    processor = MsrpCorpusPreprocessor(embed_file)
    print("load MSRParaphraseCorpus [train] from %s" % train_file)
    X_train_raw, y_train = load_msrp_corpus(train_file)
    print("load MSRParaphraseCorpus [test] from %s" % test_file)
    X_test_raw, y_test = load_msrp_corpus(test_file)
    print('')
    print("initialize ...")
    print('--------------------------------')
    print('# Minibatch-size: %d' % batch_size)
    print('# epoch: %d' % n_epoch)
    print('--------------------------------')

    # Preprocess data
    X_train = processor.fit_transform(X_train_raw)
    X_test = processor.transform(X_test_raw)

    # Set up a neural network to train
    if model == 2:
        print("use BCNN model")
        model = BClassifier(BCNN(
            channels=3,
            filter_width=3,
            embeddings=processor.embeddings,
        ))
    else:
        print("use BiCNN model")
        model = BiClassifier(BiCNN(
            channels=[3, 5],
            filter_width=[6, 14],
            embeddings=processor.embeddings,
            k_top=4,
            beta=2,
            pool_size=[(10, 10), (10, 10), (6, 6), (2, 2)]
        ))

    # Set up an optimizer
    optimizer = optimizers.AdaGrad(lr=0.01)
    optimizer.setup(model)

    # Initialize dataset iterators
    train_iter = iterators.SerialIterator(
        datasets.TupleDataset(X_train, y_train),
        batch_size, repeat=True, shuffle=True)
    test_iter = iterators.SerialIterator(
        datasets.TupleDataset(X_test, y_test),
        batch_size, repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer)
    trainer = training.Trainer(updater, (n_epoch, 'epoch'), out='logs')

    # Set extensions
    trainer.extend(E.Evaluator(test_iter, model))
    trainer.extend(E.dump_graph('main/loss'))
    trainer.extend(E.snapshot(), trigger=(n_epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy']))
    trainer.extend(E.ProgressBar(update_interval=2))

    # Run the training
    print("run trainer ...")
    print('')
    trainer.run()
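# Example invocation (a minimal sketch; the file names below are
# hypothetical placeholders, not paths from this repo):
if __name__ == '__main__':
    train(embed_file='embeddings.bin',
          train_file='msr_paraphrase_train.txt',
          test_file='msr_paraphrase_test.txt',
          n_epoch=20, batch_size=70, model=2)  # model=2 selects BCNN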
def main():
    args = arg_parse()
    print_args(args)

    print("==> Creating dataloader...")
    data_dir = args.data_path
    test_list1 = './list/image/test.txt'
    test_loader1 = get_test_set(data_dir, test_list1, args)
    test_list2 = './list/video/large/test.txt'
    test_loader2 = get_test_set_v(data_dir, test_list2, args)
    test_list3 = './list/audio/test.txt'
    test_loader3 = get_test_set(data_dir, test_list3, args)
    test_list4 = './list/text/test.txt'
    test_loader4 = get_text_set(data_dir, test_list4, args, 'test')
    data_set1 = CubTextDataset('dataset', 'list/text/test.txt', 'test')
    test_loader5 = DataLoader(dataset=data_set1, batch_size=1, shuffle=False)

    out_feature_dir1 = os.path.join(args.feature, 'image')
    out_feature_dir2 = os.path.join(args.feature, 'video')
    out_feature_dir3 = os.path.join(args.feature, 'audio')
    out_feature_dir4 = os.path.join(args.feature, 'text')
    mkdir(out_feature_dir1)
    mkdir(out_feature_dir2)
    mkdir(out_feature_dir3)
    mkdir(out_feature_dir4)

    print("==> Loading the network ...")
    model = resnet50(num_classes=200)
    model = model.cuda()
    '''
    if args.gpu is not None:
        model = nn.DataParallel(model, device_ids=range(args.gpu))
        model = model.cuda()
        cudnn.benchmark = True
    '''
    if args.snapshot:
        if os.path.isfile(args.snapshot):
            print("==> loading checkpoint '{}'".format(args.snapshot))
            checkpoint = torch.load(args.snapshot)
            model.load_state_dict(checkpoint)
            print("==> loaded checkpoint '{}'".format(args.snapshot))
        else:
            print("==> no checkpoint found at '{}'".format(args.snapshot))
            exit()

    model_audio = VGG16BN(n_classes=200, pretrained=False).cuda()
    if args.snapshotaudio:
        if os.path.isfile(args.snapshotaudio):
            print("==> loading checkpoint '{}'".format(args.snapshotaudio))
            checkpoint = torch.load(args.snapshotaudio)
            model_audio.load_state_dict(checkpoint)
            print("==> loaded checkpoint '{}'".format(args.snapshotaudio))
        else:
            print("==> no checkpoint found at '{}'".format(args.snapshotaudio))
            exit()

    model_img = BCNN_img(n_classes=200, pretrained=False).cuda()
    if args.gpu is not None:
        # model = torch.nn.DataParallel(model, device_ids=range(args.gpu))
        model_img = nn.DataParallel(model_img, device_ids=[0])
        # model = model.cuda()
        cudnn.benchmark = True
    if args.snapshotimg:  # os.path.isfile(args.snapshot):
        print("==> loading checkpoint '{}'".format(args.snapshotimg))
        checkpoint = torch.load(args.snapshotimg)
        model_dict = model_img.module.state_dict()
        restore_param = {k: v for k, v in checkpoint.items() if k in model_dict}
        model_dict.update(restore_param)
        model_img.module.load_state_dict(model_dict)
        print("==> loaded checkpoint '{}'".format(args.snapshotimg))
    else:
        print("==> no checkpoint found at '{}'".format(args.snapshotimg))
    model_img.eval()

    model_rnn = LSTMClassifier().cuda()
    if True:  # if os.path.isfile(args.snapshot): 'snapshot' is the path to the latest checkpoint
        print("==> loading checkpoint '{}'".format(
            './pretrained/rnnmodel_word2vec_39.375.pkl'))
        checkpoint = torch.load(
            './pretrained/rnnmodel_word2vec_39.375.pkl')  # load the model
        model_dict = model_rnn.state_dict()
        restore_param = {k: v for k, v in checkpoint.items() if k in model_dict}
        model_dict.update(restore_param)
        model_rnn.load_state_dict(model_dict)
        print("==> loaded checkpoint '{}'".format(
            './pretrained/rnnmodel_word2vec_39.375.pkl'))
    else:
        print("==> no checkpoint found at '{}'".format(
            './pretrained/rnnmodel_word2vec_39.375.pkl'))

    model_video = BCNN(n_classes=200, pretrained=False).cuda()
    # if args.gpu is not None:
    #     # model = torch.nn.DataParallel(model, device_ids=range(args.gpu))
    #     model_video = nn.DataParallel(model_video, device_ids=[0])
    #     # model = model.cuda()
    #     cudnn.benchmark = True
    #
    # if args.snapshotvideo:  # os.path.isfile(args.snapshot):
    #     print("==> loading checkpoint '{}'".format(args.snapshotvideo))
    #     checkpoint = torch.load(args.snapshotvideo)
    #     model_dict = model_video.module.state_dict()
    #     restore_param = {k: v for k, v in checkpoint.items() if k in model_dict}
    #     model_dict.update(restore_param)
    #     model_video.module.load_state_dict(model_dict)
    #     print("==> loaded checkpoint '{}'".format(args.snapshotvideo))
    # else:
    #     print("==> no checkpoint found at '{}'".format(args.snapshotvideo))

    model.eval()
    # model_video.eval()
    # model_rnn.eval()

    # print("Text Acc:")
    # text_acc = validate(test_loader4, model_rnn, args, True)
    # print("image Acc:")
    # image_acc = validate(test_loader1, model, args, False)
    # print("V Acc:")
    # video_acc = validate_v(test_loader2, model, model_video, args, False)
    # print("A Acc:")
    # text_acc = validate(test_loader3, model, args, False)
    # model = model.module

    print("Text Features ...")
    txt = extra_t(model, model_rnn, test_loader4, test_loader5,
                  out_feature_dir4, args, flag='t')
    print("Image Features ...")
    img = extra_i(model, model_img, test_loader1, out_feature_dir1, args, flag='i')
    # img = os.path.join(args.feature, 'image') + '/features_te.txt'
    print("Video Features ...")
    vid = extra(model, test_loader2, out_feature_dir2, args, flag='v')
    print("Audio Features ...")
    aud = extra_i(model, model_audio, test_loader3, out_feature_dir3, args, flag='a')
    # aud = os.path.join(args.feature, 'audio') + '/features_te.txt'
    # print("Text Features ...")
    # txt = extra_t(model, model_rnn, test_loader4, test_loader5, out_feature_dir4, args, flag='t')
    # txt = os.path.join(args.feature, 'text') + '/features_te.txt'

    compute_mAP(img, vid, aud, txt)
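# The mkdir() helper used above is defined elsewhere in the repo; a minimal
# sketch of what it presumably does (an assumption, not the repo's code):
import os

def mkdir(path):
    """Create the directory if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)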
from PIL import ImageFile
# Workaround for "IOError: image file is truncated" when PIL loads
# partially written images.
ImageFile.LOAD_TRUNCATED_IMAGES = True

torch.manual_seed(0)
torch.cuda.manual_seed(0)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--model', type=str, required=True)
    args = parser.parse_args()
    data_dir = args.data
    model_path = args.model

    net = BCNN(pretrained=False)
    if torch.cuda.device_count() >= 1:
        net = torch.nn.DataParallel(net).cuda()
        print('cuda device : ', torch.cuda.device_count())
    else:
        raise EnvironmentError(
            'This is designed to run on GPU but no GPU is found')
    net.load_state_dict(torch.load(model_path))

    test_transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize(size=448),
        torchvision.transforms.CenterCrop(size=448),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                         std=(0.229, 0.224, 0.225))
    ])
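    # The script is cut off after the transform; a minimal sketch of the
    # evaluation loop that would typically follow, assuming the cub200.CUB200
    # dataset class used elsewhere in this repo (names here are assumptions):
    test_data = cub200.CUB200(root=data_dir, train=False,
                              download=True, transform=test_transform)
    test_loader = torch.utils.data.DataLoader(
        test_data, batch_size=16, shuffle=False,
        num_workers=4, pin_memory=True)
    net.eval()
    num_correct, num_total = 0, 0
    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.cuda(), y.cuda()
            _, prediction = torch.max(net(X), 1)
            num_total += y.size(0)
            num_correct += (prediction == y).sum().item()
    print('Test accuracy: %.2f%%' % (100.0 * num_correct / num_total))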
class BCNNTrainer(object):
    """Manager class to train a bilinear CNN.

    Attributes:
        _options: Hyperparameters.
        _path: Useful paths.
        _net: Bilinear CNN.
        _criterion: Cross-entropy loss.
        _solver: SGD with momentum.
        _scheduler: Reduce the learning rate by a factor of 0.1 on plateau.
        _train_loader: Training data.
        _test_loader: Testing data.
    """

    def __init__(self, options, path, ckpt_basename='vgg_16'):
        """Prepare the network, criterion, solver, and data.

        Args:
            options, dict: Hyperparameters.
        """
        print('Prepare the network and data.')
        self._options = options
        self._path = path
        self.ckpt_basename = ckpt_basename

        # Network.
        self._net = BCNN(freeze_features=True)
        # self._net = torch.nn.DataParallel(self._net)
        self._net.features = torch.nn.DataParallel(self._net.features)
        self._net.cuda()
        if 'ckpt_path' in self._path:
            if os.path.exists(self._path['ckpt_path']):
                print('Continue from', self._path['ckpt_path'])
                self._net.load_state_dict(torch.load(self._path['ckpt_path']))
            else:
                print('Ckpt {} not found!'.format(self._path['ckpt_path']))
        print(self._net)

        # Criterion.
        self._criterion = torch.nn.CrossEntropyLoss().cuda()

        # Solver.
        self._solver = torch.optim.SGD(
            self._net.fc.parameters(),
            lr=self._options['base_lr'],
            momentum=0.9,
            weight_decay=self._options['weight_decay'])
        if self._options['lr_scheduler'] == 'reduce_on_plateau':
            self._scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self._solver, mode='max', factor=0.1, patience=5,
                verbose=True, threshold=1e-4, min_lr=1e-6)
        elif self._options['lr_scheduler'] == 'fixed':
            self._scheduler = torch.optim.lr_scheduler.LambdaLR(
                self._solver, lambda epoch: 1.0)
        else:
            raise ValueError('Unknown scheduler:',
                             self._options['lr_scheduler'])

        # ImageNet normalization.
        normalize = torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        train_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),  # Let smaller edge match
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.RandomCrop(size=448),
            torchvision.transforms.ToTensor(),
            normalize
        ])
        test_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),
            torchvision.transforms.CenterCrop(size=448),
            torchvision.transforms.ToTensor(),
            normalize
        ])
        train_data = cub200.CUB200(root=self._path['cub200'], train=True,
                                   download=True, transform=train_transforms)
        test_data = cub200.CUB200(root=self._path['cub200'], train=False,
                                  download=True, transform=test_transforms)
        self._train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=self._options['batch_size'],
            shuffle=True, num_workers=4, pin_memory=True)
        self._test_loader = torch.utils.data.DataLoader(
            test_data, batch_size=16, shuffle=False,
            num_workers=4, pin_memory=True)

    def train(self):
        """Train the network."""
        print('Training.')
        self._net.train()
        best_acc = 0.0
        best_epoch = None
        for epoch in range(self._options['epochs']):
            epoch_loss = []
            num_correct = 0
            num_total = 0
            for batch_idx, (X, y) in enumerate(self._train_loader):
                # Data.
                X = X.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)
                # Clear the existing gradients.
                self._solver.zero_grad()
                # Forward pass.
                score = self._net(X)
                loss = self._criterion(score, y)
                epoch_loss.append(loss.data.item())
                # Prediction.
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                num_correct += torch.sum(prediction == y.data).item()
                # Backward pass.
                loss.backward()
                self._solver.step()

                sys.stdout.write('\r')
                sys.stdout.write(
                    '| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                    % (epoch, self._options['epochs'], batch_idx + 1,
                       len(self._train_loader), loss.data.item(),
                       (100. * num_correct) / num_total))
                sys.stdout.flush()

            train_acc = 100 * num_correct / num_total
            test_acc = self._accuracy(self._test_loader)
            print('\nEpoch\tTrain loss\tTrain acc\tTest acc')
            print('%d\t%4.3f\t\t%4.2f%%\t\t%4.2f%%' %
                  (epoch + 1, np.mean(epoch_loss), train_acc, test_acc))
            self._scheduler.step(test_acc)
            if test_acc > best_acc:
                best_acc = test_acc
                best_epoch = epoch + 1
                print('*', end='')
                # Save model onto disk.
                save_path = os.path.join(
                    self._path['model'],
                    '{}_epoch_{}.pth'.format(self.ckpt_basename, epoch + 1))
                save_path_best = os.path.join(
                    self._path['model'],
                    '{}_epoch_best.pth'.format(self.ckpt_basename))
                torch.save(self._net.state_dict(), save_path)
                shutil.copy(save_path, save_path_best)
        print('Best at epoch %d, test accuracy %f' % (best_epoch, best_acc))

    def _accuracy(self, data_loader):
        """Compute the train/test accuracy.

        Args:
            data_loader: Train/Test DataLoader.

        Returns:
            Train/Test accuracy in percentage.
        """
        self._net.eval()
        num_correct = 0
        num_total = 0
        for X, y in data_loader:
            # Data.
            X = X.cuda(non_blocking=True)
            y = y.cuda(non_blocking=True)
            with torch.no_grad():
                # Prediction.
                score = self._net(X)
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                num_correct += torch.sum(prediction == y.data).item()
        self._net.train()  # Restore training mode for the next epoch.
        return 100 * num_correct / num_total

    def getStat(self):
        """Get the mean and std value for a certain dataset."""
        print('Compute mean and variance for training data.')
        train_data = cub200.CUB200(
            root=self._path['cub200'], train=True,
            transform=torchvision.transforms.ToTensor(), download=True)
        train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=1, shuffle=False,
            num_workers=4, pin_memory=True)
        mean = torch.zeros(3)
        std = torch.zeros(3)
        for X, _ in tqdm(train_loader):
            for d in range(3):
                mean[d] += X[:, d, :, :].mean()
                std[d] += X[:, d, :, :].std()
        mean.div_(len(train_data))
        std.div_(len(train_data))
        print(mean)
        print(std)
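# Example usage (a minimal sketch; the option values and paths below are
# illustrative assumptions, but the dictionary keys match those the class
# reads above):
if __name__ == '__main__':
    options = {'base_lr': 1.0, 'weight_decay': 1e-8, 'batch_size': 64,
               'epochs': 55, 'lr_scheduler': 'reduce_on_plateau'}
    path = {'cub200': './data/cub200', 'model': './checkpoints'}
    trainer = BCNNTrainer(options, path, ckpt_basename='vgg_16')
    trainer.train()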
class BCNNManager(object):
    """Manager class to train a bilinear CNN.

    Attributes:
        _options: Hyperparameters.
        _path: Useful paths.
        _net: Bilinear CNN.
        _criterion: Cross-entropy loss.
        _solver: SGD with momentum.
        _scheduler: Reduce the learning rate by a factor of 0.1 on plateau.
        _train_loader: Training data.
        _test_loader: Testing data.
    """

    def __init__(self, options, path):
        """Prepare the network, criterion, solver, and data.

        Args:
            options, dict: Hyperparameters.
        """
        print('Prepare the network and data.')
        self._options = options
        self._path = path

        # Network.
        if self._options['dataset'] == 'cub200':
            num_classes = 200
        elif self._options['dataset'] == 'aircraft':
            num_classes = 100
        else:
            raise NotImplementedError(
                "Dataset " + self._options['dataset'] + " is not implemented.")
        self._net = BCNN(num_classes=num_classes,
                         pretrained=options['target'] == 'fc')
        # Load the model from disk.
        if options['target'] == 'all':
            self._net.load_state_dict(torch.load(self._path['model']))
        self._net = torch.nn.parallel.DistributedDataParallel(
            self._net.cuda(),
            device_ids=[self._options['local_rank']],
            output_device=self._options['local_rank'])
        if dist.get_rank() == 0:
            print(self._net)

        # Criterion.
        self._criterion = torch.nn.CrossEntropyLoss().cuda()

        # Solver.
        self._solver = torch.optim.SGD(
            self._net.module.trainable_params,
            lr=self._options['base_lr'] * dist.get_world_size(),
            momentum=0.9,
            weight_decay=self._options['weight_decay'])
        self._scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self._solver, mode='max', factor=0.1, patience=3,
            verbose=True, threshold=1e-4)

        train_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),  # Let smaller edge match
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.RandomCrop(size=448),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                             std=(0.229, 0.224, 0.225))
        ])
        test_transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=448),
            torchvision.transforms.CenterCrop(size=448),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                             std=(0.229, 0.224, 0.225))
        ])
        if self._options['dataset'] == 'cub200':
            train_data = cub200.CUB200(root=self._path['dataset'], train=True,
                                       download=True, transform=train_transforms)
            test_data = cub200.CUB200(root=self._path['dataset'], train=False,
                                      download=True, transform=test_transforms)
        elif self._options['dataset'] == 'aircraft':
            train_data = aircraft.Aircraft(root=self._path['dataset'], train=True,
                                           download=True, transform=train_transforms)
            test_data = aircraft.Aircraft(root=self._path['dataset'], train=False,
                                          download=True, transform=test_transforms)
        else:
            raise NotImplementedError(
                "Dataset " + self._options['dataset'] + " is not implemented.")

        # Partition the dataset among workers using DistributedSampler.
        train_sampler = distributed.DistributedSampler(
            train_data, num_replicas=dist.get_world_size(), rank=dist.get_rank())
        test_sampler = distributed.DistributedSampler(
            test_data, num_replicas=dist.get_world_size(), rank=dist.get_rank())
        self._train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=self._options['batch_size'], shuffle=False,
            num_workers=4, pin_memory=True, sampler=train_sampler)
        self._test_loader = torch.utils.data.DataLoader(
            test_data, batch_size=self._options['batch_size'], shuffle=False,
            num_workers=4, pin_memory=True, sampler=test_sampler)

    def train(self):
        """Train the network."""
        best_acc = 0.0
        best_epoch = None
        if dist.get_rank() == 0:
            print('Training.')
            print('Epoch\tTrain loss\tTrain acc\tTest acc\tTrain time')
        for t in range(self._options['epochs']):
            t0 = time.time()
            self._train_loader.sampler.set_epoch(t)
            epoch_loss = []
            num_correct = 0
            num_total = 0
            for X, y in self._train_loader:
                # Data.
                X = X.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)
                # Clear the existing gradients.
                self._solver.zero_grad()
                # Forward pass.
                score = self._net(X)
                loss = self._criterion(score, y)
                epoch_loss.append(loss.item())
                # Prediction.
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                num_correct += torch.sum(prediction == y.data).item()
                # Backward pass.
                loss.backward()
                self._solver.step()
            test_acc = self._accuracy(self._test_loader)
            self._scheduler.step(test_acc)
            if dist.get_rank() == 0:
                train_acc = 100 * num_correct / num_total
                if test_acc > best_acc:
                    best_acc = test_acc
                    best_epoch = t + 1
                    print('*', end='')
                    # Save model onto disk.
                    torch.save(
                        self._net.module.state_dict(),
                        os.path.join(self._path['model_dir'],
                                     'vgg_16_epoch_%d.pth' % (t + 1)))
                print('%d\t%4.3f\t\t%4.2f%%\t\t%4.2f%%\t\t%4.2fs' %
                      (t + 1, sum(epoch_loss) / len(epoch_loss),
                       train_acc, test_acc, time.time() - t0))
        if dist.get_rank() == 0:
            print('Best at epoch %d, test accuracy %f' % (best_epoch, best_acc))

    def _accuracy(self, data_loader):
        """Compute the train/test accuracy.

        Args:
            data_loader: Train/Test DataLoader.

        Returns:
            Train/Test accuracy in percentage.
        """
        self._net.train(False)
        num_correct = 0
        num_total = 0
        with torch.no_grad():
            for X, y in data_loader:
                # Data.
                X = X.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)
                # Prediction.
                score = self._net(X)
                _, prediction = torch.max(score.data, 1)
                num_total += y.size(0)
                num_correct += torch.sum(prediction == y.data).item()
        self._net.train(True)  # Set the model back to training phase.
        # Aggregate the counts across all workers.
        num_total = torch.tensor(num_total).cuda()
        num_correct = torch.tensor(num_correct).cuda()
        dist.all_reduce(num_total, op=dist.ReduceOp.SUM)
        dist.all_reduce(num_correct, op=dist.ReduceOp.SUM)
        return 100 * num_correct.item() / num_total.item()

    def getStat(self):
        """Get the mean and std value for a certain dataset."""
        print('Compute mean and variance for training data.')
        train_data = cub200.CUB200(
            root=self._path['dataset'], train=True,
            transform=torchvision.transforms.ToTensor(), download=True)
        train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=1, shuffle=False,
            num_workers=4, pin_memory=True)
        mean = torch.zeros(3)
        std = torch.zeros(3)
        for X, _ in train_loader:
            for d in range(3):
                mean[d] += X[:, d, :, :].mean()
                std[d] += X[:, d, :, :].std()
        mean.div_(len(train_data))
        std.div_(len(train_data))
        print(mean)
        print(std)
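# Example of wiring the manager up for distributed training (a minimal
# sketch; the option values, paths, and argument parsing are illustrative
# assumptions — only the dictionary keys come from the class above).
# Launch with e.g. `python -m torch.distributed.launch --nproc_per_node=4 train.py`:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')
    options = {'dataset': 'cub200', 'target': 'fc', 'base_lr': 1.0,
               'weight_decay': 1e-8, 'batch_size': 16, 'epochs': 55,
               'local_rank': args.local_rank}
    path = {'dataset': './data/cub200', 'model': './checkpoints/fc.pth',
            'model_dir': './checkpoints'}
    BCNNManager(options, path).train()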
def main():
    # step = args.step
    print('===> About training in a two-step process! ===')
    print('------\n'
          'drop rate: [{}]\t'
          '\n------'.format(drop_rate))

    # Step 1: only train the fc layer.
    if step == 1:
        print('===> Step 1 ...')
        bnn = BCNN(pretrained=True, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.module.fc.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    # Step 2: train the whole network.
    elif step == 2:
        print('===> Step 2 ...')
        bnn = BCNN(pretrained=False, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    else:
        raise AssertionError('Wrong step argument')

    correcter = self_correcter.Correcter(num_train_images, num_classes,
                                         queue_size)
    loadmodel = 'checkpoint.pth'

    # Check whether this is resume mode.
    print('-----------------------------------------------------------------------------')
    if resume:
        assert os.path.isfile(loadmodel), 'please make sure checkpoint.pth exists'
        print('---> loading checkpoint.pth <---')
        checkpoint = torch.load(loadmodel)
        assert step == checkpoint['step'], \
            'step in checkpoint does not match step in argument'
        start_epoch = checkpoint['epoch']
        best_accuracy = checkpoint['best_accuracy']
        best_epoch = checkpoint['best_epoch']
        bnn.load_state_dict(checkpoint['bnn_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        correcter.all_predictions = checkpoint['all_predictions']
        correcter.softmax_record = checkpoint['softmax_record']
        correcter.update_counters = checkpoint['update_counters']
    else:
        if step == 2:
            print('---> step2 checkpoint loaded <---')
            bnn.load_state_dict(
                torch.load('model/bnn_step1_vgg16_best_epoch.pth'))
        else:
            print('---> no checkpoint loaded <---')
        start_epoch = 0
        best_accuracy = 0.0
        best_epoch = None
    print('-----------------------------------------------------------------------------')

    with open(logfile, "a") as f:
        f.write('------ Step: {} ...\n'.format(step))
        f.write('------\n'
                'drop rate: [{}]\tqueue_size: [{}]\t'
                'warm_up: [{}]\tinit_lr: [{}]\t'
                '\n'.format(drop_rate, queue_size, warm_up, learning_rate))

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=4, verbose=True,
        threshold=learning_rate * 1e-3)

    for epoch in range(start_epoch, num_epochs):
        epoch_start_time = time.time()
        bnn.train()
        warm = epoch < warm_up
        if not warm:
            correcter.separate_clean_and_unclean_keys(drop_rate)
            print("Number of clean samples:", len(correcter.clean_key))
        train_acc, train_total = train(train_loader, epoch, bnn, optimizer,
                                       warm, correcter=correcter)
        test_acc = evaluate(test_loader, bnn)
        if not warm:
            scheduler.step(test_acc)
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_epoch = epoch + 1
            torch.save(bnn.state_dict(),
                       'model/bnn_step{}_vgg16_best_epoch.pth'.format(step))
        epoch_end_time = time.time()

        print("all_predictions", len(correcter.all_predictions[0]))
        print("update_counters", correcter.update_counters[0])
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'bnn_state_dict': bnn.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_epoch': best_epoch,
                'best_accuracy': best_accuracy,
                'step': step,
                'all_predictions': correcter.all_predictions,
                'softmax_record': correcter.softmax_record,
                'update_counters': correcter.update_counters
            },
            filename=loadmodel)
        print('------\n'
              'Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t'
              'Test Accuracy: [{:6.2f}]\t'
              'Epoch Runtime: [{:6.2f}]\t'
              '\n------'.format(
                  epoch + 1, num_epochs, train_acc, test_acc,
                  epoch_end_time - epoch_start_time))
        with open(logfile, "a") as f:
            output = ('Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t'
                      'Test Accuracy: [{:6.2f}]\t'
                      'Epoch Runtime: [{:7.2f}]\tTrain_total[{:06d}]\t'
                      'clean_key[{:06d}]'.format(
                          epoch + 1, num_epochs, train_acc, test_acc,
                          epoch_end_time - epoch_start_time, train_total,
                          len(correcter.clean_key)))
            f.write(output + "\n")

    print('******\n'
          'Best Accuracy 1: [{0:6.2f}], at Epoch [{1:03d}] '
          '\n******'.format(best_accuracy, best_epoch))
    with open(logfile, "a") as f:
        output = ('******\n'
                  'Best Accuracy 1: [{0:6.2f}], at Epoch [{1:03d}]; '
                  '\n******'.format(best_accuracy, best_epoch))
        f.write(output + "\n")
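# save_checkpoint() is defined elsewhere in the repo; a minimal sketch of
# what it presumably does, given how it is called above (an assumption, not
# the repo's implementation):
def save_checkpoint(state, filename='checkpoint.pth'):
    """Serialize the training state dict so that resume mode can restore it."""
    torch.save(state, filename)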
def main():
    # step = args.step
    print('===> About training in a two-step process! ===')
    print('------\n'
          'drop rate: [{}]\tT_k: [{}]\t'
          'start epoch: [{}]\t'
          '\n------'.format(drop_rate, T_k, start))

    # Step 1: only train the fc layer.
    if step == 1:
        print('===> Step 1 ...')
        bnn = BCNN(pretrained=True, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.module.fc.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    # Step 2: train the whole network.
    elif step == 2:
        print('===> Step 2 ...')
        bnn = BCNN(pretrained=False, n_classes=num_classes)
        bnn = nn.DataParallel(bnn).cuda()
        optimizer = optim.Adam(bnn.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_decay)
    else:
        raise AssertionError('Wrong step argument')

    loadmodel = 'checkpoint.pth'

    # Check whether this is resume mode.
    print('-----------------------------------------------------------------------------')
    if resume:
        assert os.path.isfile(loadmodel), 'please make sure checkpoint.pth exists'
        print('---> loading checkpoint.pth <---')
        checkpoint = torch.load(loadmodel)
        assert step == checkpoint['step'], \
            'step in checkpoint does not match step in argument'
        start_epoch = checkpoint['epoch']
        best_accuracy = checkpoint['best_accuracy']
        best_epoch = checkpoint['best_epoch']
        bnn.load_state_dict(checkpoint['bnn_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        Cross_entropy = checkpoint['Cross_entropy']
        logits_softmax = checkpoint['logits_softmax']
    else:
        if step == 2:
            print('---> step2 checkpoint loaded <---')
            bnn.load_state_dict(
                torch.load('model/bnn_step1_vgg16_best_epoch.pth'))
        else:
            print('---> no checkpoint loaded <---')
        Cross_entropy = []
        logits_softmax = []
        start_epoch = 0
        best_accuracy = 0.0
        best_epoch = None
    print('-----------------------------------------------------------------------------')

    with open(logfile, "a") as f:
        f.write('------ Step: {} ...\n'.format(step))
        f.write('------\n'
                'drop rate: [{}]\tT_k: [{}]\t'
                'start epoch: [{}]\t'
                '\n------'.format(drop_rate, T_k, start))

    for epoch in range(start_epoch, num_epochs):
        epoch_start_time = time.time()
        bnn.train()
        adjust_learning_rate(optimizer, epoch)

        # train() returns 'Cross_entropy', which is used when saving checkpoints.
        train_acc, logits_softmax, Cross_entropy = train(
            train_loader, epoch, bnn, optimizer, logits_softmax, Cross_entropy)

        # Dump the output: cross entropy, image path, image label, and image id.
        # To inspect the selection result, uncomment the code below.
        # if len(Cross_entropy) > 0:
        #     pickle.dump(Cross_entropy,
        #                 open(cross_entropy_savapath +
        #                      'crossentropy_epoch{}_step{}.pkl'.format(epoch + 1, step),
        #                      'wb'))

        test_acc = evaluate(test_loader, bnn)
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_epoch = epoch + 1
            torch.save(bnn.state_dict(),
                       'model/bnn_step{}_vgg16_best_epoch.pth'.format(step))
        epoch_end_time = time.time()

        # Save checkpoint.
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'bnn_state_dict': bnn.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_epoch': best_epoch,
                'best_accuracy': best_accuracy,
                'step': step,
                'Cross_entropy': Cross_entropy,
                'logits_softmax': logits_softmax
            },
            filename=loadmodel)
        print('------\n'
              'Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t'
              'Test Accuracy: [{:6.2f}]\t'
              'Epoch Runtime: [{:6.2f}]\t'
              '\n------'.format(
                  epoch + 1, num_epochs, train_acc, test_acc,
                  epoch_end_time - epoch_start_time))
        with open(logfile, "a") as f:
            output = ('Epoch: [{:03d}/{:03d}]\tTrain Accuracy: [{:6.2f}]\t'
                      'Test Accuracy: [{:6.2f}]\t'
                      'Epoch Runtime: [{:6.2f}]\t'.format(
                          epoch + 1, num_epochs, train_acc, test_acc,
                          epoch_end_time - epoch_start_time))
            f.write(output + "\n")

    print('******\n'
          'Best Accuracy 1: [{0:6.2f}], at Epoch [{1:03d}] '
          '\n******'.format(best_accuracy, best_epoch))
    with open(logfile, "a") as f:
        output = ('******\n'
                  'Best Accuracy 1: [{0:6.2f}], at Epoch [{1:03d}]; '
                  '\n******'.format(best_accuracy, best_epoch))
        f.write(output + "\n")
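# adjust_learning_rate() is referenced above but not shown; a minimal sketch
# of a common schedule it might implement, using the `learning_rate`, `start`,
# and `num_epochs` globals from this script (an assumption, not the repo's
# implementation):
def adjust_learning_rate(optimizer, epoch):
    """Decay the learning rate linearly to zero after the `start` epoch."""
    lr = learning_rate
    if epoch >= start:
        lr = learning_rate * (num_epochs - epoch) / float(num_epochs - start)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr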