if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ckpt_path', type=str, default='work_dir/bert_ner.pt')
    parser.add_argument('--input_path', type=str, default='work_dir/selected_paras.json')
    parser.add_argument('--output_path', type=str, default='work_dir/entities.json')
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    eval_dataset = EvalDataset(args.input_path, debug=False)
    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                collate_fn=pad)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model to the selected device instead of calling .cuda()
    # unconditionally, so the script also runs on CPU-only machines.
    model = Net(top_rnns=False, vocab_size=len(VOCAB), device=device,
                finetuning=True).to(device)
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(args.ckpt_path, map_location=device))
    model.eval()

    eval_para(model, eval_iter, eval_dataset.sent_id, args.output_path)
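
# Example invocation (the script name is hypothetical; the paths shown are
# simply the argparse defaults above):
#   python eval_ner.py --ckpt_path work_dir/bert_ner.pt \
#       --input_path work_dir/selected_paras.json \
#       --output_path work_dir/entities.json --batch_size 32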
        # --- inside the per-batch training loop ---
        optimizer.zero_grad()
        confusion_matrix = net(captions,
                               {'face': face, 'audio': audio,
                                'visual': video, 'motion': flow},
                               ind, True)
        loss = max_margin(confusion_matrix)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # .data[0] was removed in PyTorch 0.4

        if (i_batch + 1) % n_display == 0:
            # Python 2 print statements converted to Python 3 calls
            print('Epoch %d, Epoch status: %.2f, Training loss: %.4f'
                  % (epoch + 1,
                     args.batch_size * float(i_batch) / dataset_size,
                     running_loss / n_display))
            running_loss = 0.0

    # --- after the batch loop: per-epoch retrieval evaluation ---
    print('evaluating epoch %d ...' % (epoch + 1))
    net.eval()
    if args.MSRVTT:
        retrieval_samples = dataset.getRetrievalSamples()
        # Variable(..., volatile=True) is deprecated; move plain tensors to
        # the GPU and run the retrieval forward pass under torch.no_grad().
        video = retrieval_samples['video'].cuda()
        captions = retrieval_samples['text'].cuda()
        audio = retrieval_samples['audio'].cuda()
        flow = retrieval_samples['flow'].cuda()
        face = retrieval_samples['face'].cuda()
        face_ind = retrieval_samples['face_ind']
        ind = {'face': face_ind,
               'visual': np.ones(len(face_ind)),
               'motion': np.ones(len(face_ind))}
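
# The max_margin criterion used above is not defined in this excerpt. Below
# is a minimal sketch of a bidirectional max-margin ranking loss over the
# caption/video similarity ("confusion") matrix; this is an assumption about
# its behavior, not the repository's actual implementation.
def max_margin(confusion_matrix, margin=0.2):
    # confusion_matrix[i, j] scores caption i against video j; the diagonal
    # holds the matching (positive) pairs.
    n = confusion_matrix.size(0)
    diag = confusion_matrix.diag().view(n, 1)
    cost_c2v = (margin + confusion_matrix - diag).clamp(min=0)      # caption -> video
    cost_v2c = (margin + confusion_matrix - diag.t()).clamp(min=0)  # video -> caption
    off_diag = 1 - torch.eye(n, device=confusion_matrix.device)     # ignore positives
    return ((cost_c2v + cost_v2c) * off_diag).sum() / n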
    eta_min=0.005)  # tail of the LR-scheduler construction (excerpt begins mid-call)

# Training
n_epoch = int(config["Train"]["epoch"])
save_every_epoch = int(config["Train"]["SaveEveryEpoch"])
if save_every_epoch == -1:
    save_every_epoch = n_epoch + 1
dst_dir = os.path.join(ABS_DIR, config["Train"]["RecordDestination"])

for t in range(n_epoch):
    result = []
    for phase in ['train', 'val']:
        if phase == 'train':
            net.train()
        else:
            net.eval()
        # keep track of training and validation loss
        running_loss = 0.0
        running_batch = 0
        # np.float was removed in NumPy 1.24; use the builtin float instead
        running_confusion = np.zeros(shape=(n_class, n_class), dtype=float)
        for data, target in data_loader[phase]:
            data, target = data.to(device), target.to(device)
            with torch.set_grad_enabled(phase == 'train'):
                # feed the input
                preds = net(data)
                # calculate the loss
                loss = loss_func(preds, target)
                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()   # assumed completion of the truncated
                    optimizer.step()  # update step (standard backward/step)
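
# running_confusion is initialized above, but the excerpt ends before it is
# updated. A minimal sketch of the usual accumulation step (an assumption,
# not code from this file):
def update_confusion(confusion, preds, target):
    # Add one count per sample at (true class, predicted class).
    pred_labels = preds.argmax(dim=1)
    for t, p in zip(target.view(-1), pred_labels.view(-1)):
        confusion[int(t), int(p)] += 1
    return confusion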
class FSRCNNTrainer(object):
    def __init__(self, config, training_loader, testing_loader):
        super(FSRCNNTrainer, self).__init__()
        self.CUDA = torch.cuda.is_available()
        self.device = torch.device('cuda' if self.CUDA else 'cpu')
        self.model = None
        self.lr = config.lr
        self.nEpochs = config.nEpochs
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.seed = config.seed
        self.upscale_factor = config.upscale_factor
        self.training_loader = training_loader
        self.testing_loader = testing_loader
        configure('logs', flush_secs=5)

    def build_model(self):
        # seed the RNGs before creating the model so weight_init is reproducible
        torch.manual_seed(self.seed)
        if self.CUDA:
            torch.cuda.manual_seed(self.seed)  # random seed
            cudnn.benchmark = True
        self.model = Net(num_channels=1, upscale_factor=self.upscale_factor).to(self.device)
        self.model.weight_init(mean=0.0, std=0.2)
        self.criterion = torch.nn.MSELoss()
        if self.CUDA:
            self.criterion.cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=[50, 75, 100], gamma=0.5)  # lr decay

    def save_model(self):
        model_out_path = "./checkpoints/model_path.pth"
        torch.save(self.model, model_out_path)
        print("Checkpoint saved to {}".format(model_out_path))

    def train(self, epoch):
        self.model.train()
        train_loss = 0
        for batch_num, (data, target) in enumerate(self.training_loader):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            loss = self.criterion(self.model(data), target)
            train_loss += loss.item()
            loss.backward()
            self.optimizer.step()
            progress_bar(batch_num, len(self.training_loader),
                         'Loss: %.4f' % (train_loss / (batch_num + 1)))
        print("    Average Loss: {:.4f}".format(train_loss / len(self.training_loader)))
        log_value('train_loss', train_loss / (batch_num + 1), epoch)

    def test(self, epoch):
        self.model.eval()
        avg_psnr = 0
        with torch.no_grad():
            for batch_num, (data, target) in enumerate(self.testing_loader):
                data, target = data.to(self.device), target.to(self.device)
                prediction = self.model(data)
                mse = self.criterion(prediction, target)
                psnr = 10 * log10(1 / mse.item())
                avg_psnr += psnr
                progress_bar(batch_num, len(self.testing_loader),
                             'PSNR: %.4f' % (avg_psnr / (batch_num + 1)))
        print("    Average PSNR: {:.4f} dB".format(avg_psnr / len(self.testing_loader)))
        log_value('PSNR', avg_psnr / (batch_num + 1), epoch)

    def run(self):
        self.build_model()
        for epoch in range(1, self.nEpochs + 1):
            print("\n===> Epoch {} starts:".format(epoch))
            self.train(epoch)
            self.test(epoch)
            self.scheduler.step()  # passing the epoch index to step() is deprecated
            if epoch == self.nEpochs:
                self.save_model()
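
# A minimal usage sketch. The config namespace is an assumption: any object
# with the attributes read in __init__ (lr, nEpochs, seed, upscale_factor)
# works, and training_loader/testing_loader are the repository's paired
# LR/HR DataLoaders, built elsewhere.
if __name__ == '__main__':
    from types import SimpleNamespace
    config = SimpleNamespace(lr=1e-3, nEpochs=100, seed=123, upscale_factor=4)
    trainer = FSRCNNTrainer(config, training_loader, testing_loader)
    trainer.run()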
class Solver(object):
    def __init__(self, config, source_loader, source_val_loader, target_loader, target_val_loader):
        self.source_loader = source_loader
        self.source_val_loader = source_val_loader
        self.target_loader = target_loader
        self.target_val_loader = target_val_loader
        self.net = None
        self.net_optimizer = None
        self.net_d = None
        self.net_optimizer_d = None
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.train_iters = config.train_iters
        self.pretrain_iters = config.pretrain_iters
        self.batch_size = config.batch_size
        self.lr = config.lr
        self.lr_d = config.lr_d
        self.alpha_s = config.alpha_s
        self.alpha_t = config.alpha_t
        self.beta_c = config.beta_c
        self.beta_sep = config.beta_sep
        self.beta_p = config.beta_p
        self.log_step = config.log_step
        self.model_path = config.model_path
        self.num_classes = config.num_classes
        self.build_model()

    def build_model(self):
        """Builds a generator and a discriminator."""
        self.net = Net()
        self.net_d = Net_D()
        net_params = list(self.net.parameters())
        net_d_params = list(self.net_d.parameters())
        self.net_optimizer = optim.Adam(net_params, self.lr, [self.beta1, self.beta2])
        self.net_optimizer_d = optim.Adam(net_d_params, self.lr_d, [self.beta1, self.beta2])
        if torch.cuda.is_available():
            self.net.cuda()
            self.net_d.cuda()

    def to_var(self, x):
        """Moves a tensor to the GPU if one is available."""
        # torch.autograd.Variable is deprecated; plain tensors are returned.
        if torch.cuda.is_available():
            x = x.cuda()
        return x

    def to_data(self, x):
        """Converts a tensor to a numpy array."""
        if torch.cuda.is_available():
            x = x.cpu()
        return x.detach().numpy()

    def reset_grad(self):
        """Zeros the gradient buffers."""
        self.net_optimizer.zero_grad()
        self.net_optimizer_d.zero_grad()

    def separability_loss(self, labels, latents, imbalance_parameter=1):
        criteria = nn.CosineEmbeddingLoss()
        loss_up = 0
        one_cuda = torch.ones(1).cuda()
        mean = torch.mean(latents, dim=0).cuda().view(1, -1)
        loss_down = 0
        for i in range(self.num_classes):
            indexes = labels.eq(i)
            mean_i = torch.mean(latents[indexes], dim=0).view(1, -1)
            if not torch.isnan(mean_i.norm()):  # skip classes absent from the batch
                for latent in latents[indexes]:
                    loss_up += criteria(latent.view(1, -1), mean_i, one_cuda)
                loss_down += criteria(mean, mean_i, one_cuda)
        loss = (loss_up / loss_down) * imbalance_parameter
        return loss

    def initialisation(self):
        self.net.apply(xavier_weights_init)
        self.net_d.apply(xavier_weights_init)
        source_iter = iter(self.source_loader)
        target_iter = iter(self.target_loader)
        source_val_iter = iter(self.source_val_loader)
        target_val_iter = iter(self.target_val_loader)
        source_per_epoch = len(source_iter)
        target_per_epoch = len(target_iter)
        targetval_per_epoch = len(target_val_iter)
        print(source_per_epoch, target_per_epoch, targetval_per_epoch)
        criterion = nn.CrossEntropyLoss()
        f_labels = torch.LongTensor(128)
        f_labels[...] = 10
        t_labels = torch.LongTensor(128)
        t_labels[...] = 1
        # pretrain
        log_pre = 50
        source_iter = iter(self.source_loader)
        source_val_iter = iter(self.source_val_loader)
        target_iter = iter(self.target_loader)
        return criterion, source_per_epoch, target_per_epoch, target_iter, source_iter, log_pre, source_val_iter

    def train(self):
        (criterion, source_per_epoch, target_per_epoch, target_iter,
         source_iter, log_pre, source_val_iter) = self.initialisation()
        pre_train = not os.path.exists(os.path.join(self.model_path, 'pre_train.pth'))
        print("Pretrain:\n*********")
        if pre_train:
            for step in range(self.pretrain_iters + 1):
                # ============ Initialization ============ #
                # refresh the iterators once an epoch has been consumed
                if (step + 1) % source_per_epoch == 0:
                    source_iter = iter(self.source_loader)
                if (step + 1) % target_per_epoch == 0:
                    target_iter = iter(self.target_loader)
                # load the data; .next() is Python 2 only, use next()
                source, s_labels = next(source_iter)
                target, t_labels = next(target_iter)
                target_rgb = target
                target, t_labels = self.to_var(target_rgb), self.to_var(t_labels).long().squeeze()
                source, s_labels = self.to_var(source), self.to_var(s_labels).long().squeeze()

                # ============ Training ============ #
                self.reset_grad()
                # forward
                latent, c = self.net(source)
                # loss
                loss_source_class = criterion(c, s_labels)
                # one step
                loss_source_class.backward()
                self.net_optimizer.step()
                self.reset_grad()

                # ============ Validation ============ #
                if (step + 1) % log_pre == 0:
                    _, c_source = self.net(source)
                    _, c_target = self.net(target)
                    print("[%d/%d] classification loss: %.4f" % (
                        step + 1, self.pretrain_iters, loss_source_class.item()))
                    print("source accuracy %.4f; target accuracy %.4f" % (
                        accuracy(s_labels, c_source), accuracy(t_labels, c_target)))
            self.save_model()
        else:
            self.load_model()

        # ============ Initialization ============ #
        source_iter = iter(self.source_loader)
        target_iter = iter(self.target_loader)
        source_val_iter = iter(self.source_val_loader)
        maxacc = 0.0
        maximum_acc = 0.0
        max_iter = 0
        net_params = list(self.net.parameters())
        net_d_params = list(self.net_d.parameters())
        self.net_optimizer = optim.Adam(net_params, self.lr, [self.beta1, self.beta2])
        self.net_optimizer_d = optim.Adam(net_d_params, self.lr_d, [self.beta1, self.beta2])
        print("Second:\n******")
        # self.validate_source()
        self.validate_target()
        for step in range(self.train_iters):
            # ============ Initialization ============ #
            # refresh
            if (step + 1) % target_per_epoch == 0:
                target_iter = iter(self.target_loader)
            if (step + 1) % source_per_epoch == 0:
                source_iter = iter(self.source_loader)
                source_val_iter = iter(self.source_val_loader)
            # load the data
            source, s_labels = next(source_iter)
            source, s_labels = self.to_var(source), self.to_var(s_labels).long().squeeze()  # must squeeze
            # source_val, s_val_labels = next(source_val_iter)
            # source_val, s_val_labels = self.to_var(source_val), self.to_var(s_val_labels).long().squeeze()
            target, t_labels = next(target_iter)
            target_rgb = target
            target, t_labels = self.to_var(target_rgb), self.to_var(t_labels).long().squeeze()

            # ============ train D ============ #
            self.reset_grad()
            latent_source, c = self.net(source)
            d = self.net_d(latent_source)
            loss_d_s1 = F.binary_cross_entropy(d, torch.ones_like(d, dtype=torch.float32))
            loss_d_s0 = F.binary_cross_entropy(d, torch.zeros_like(d, dtype=torch.float32))
            loss_c_source = criterion(c, s_labels)
            latent_target, c = self.net(target)
            d = self.net_d(latent_target)
            loss_d_t0 = F.binary_cross_entropy(d, torch.zeros_like(d, dtype=torch.float32))
            loss_p = loss_d_s0
            loss_d = loss_d_s1 + loss_d_t0

            # ============ train pseudo labeling ============ #
            chosen_target, pseudo_labels, indexes, imbalance_parameter = pseudo_labeling(target, c)
            if chosen_target is not None:
                loss_c_target = criterion(c[indexes], pseudo_labels)
                latent_target = latent_target[indexes]
                # ============ class loss ============ #
                loss_sep = self.separability_loss(
                    torch.cat((s_labels, pseudo_labels)),
                    torch.cat((latent_source, latent_target)),
                    imbalance_parameter=imbalance_parameter)
            else:
                loss_c_target = 0
                loss_sep = 0
            loss = self.beta_c * (self.alpha_s * loss_c_source + self.alpha_t * loss_c_target) + \
                   self.beta_p * loss_p + \
                   self.beta_sep * loss_sep
            loss.backward(retain_graph=True)
            self.net_optimizer.step()
            loss_d.backward()
            self.net_optimizer_d.step()
            self.reset_grad()

            # ============ Validation ============ #
            if (step + 1) % self.log_step == 0:
                print("max accuracy:", colored(maximum_acc, "green"), "iteration", max_iter)
                _, c_source = self.net(source)
                _, c_target = self.net(target)
                print("source accuracy (svhn_train) %.4f; target accuracy %.4f" % (
                    accuracy(s_labels, c_source), accuracy(t_labels, c_target)))
                self.validate_source()
                acc = self.validate_target()
                if acc > maximum_acc:
                    maximum_acc = acc
                    max_iter = step
                if acc > maxacc:
                    maxacc = acc
                    torch.save(self.net, "./model_c_" + str(step) + '_' + str(acc) + ".pth")
                    torch.save(self.net_d, "./model_d_" + str(step) + '_' + str(acc) + ".pth")
                self.reset_grad()

        # ============ Save the model ============ #
        torch.save(self.net, "./model_c_final.pth")
        torch.save(self.net_d, "./model_d_final.pth")

    def validate_target(self):
        class_correct = [0] * self.num_classes
        class_total = [0.] * self.num_classes
        classes = [str(i) for i in range(self.num_classes)]
        self.net.eval()  # prep model for evaluation
        for data, target in self.target_val_loader:
            # forward pass: compute predicted outputs by passing inputs to the model
            data, target = self.to_var(data), self.to_var(target).long().squeeze()
            data = data.cuda()
            target = target.cuda()
            latent, output = self.net(data)
            _, pred = torch.max(output, 1)
            correct = np.squeeze(pred.eq(target.data.view_as(pred)))
            # calculate test accuracy for each object class
            for i in range(len(target.data)):
                label = target.data[i]
                class_correct[label] += correct[i].item()
                class_total[label] += 1
        for i in range(self.num_classes):
            if class_total[i] > 0:
                print('Test Accuracy (mnist-test) of %5s: %2d%% (%2d/%2d)' % (
                    str(i), 100 * class_correct[i] / class_total[i],
                    class_correct[i], class_total[i]))
            else:
                print('Test Accuracy (mnist-test) of %5s: N/A (no training examples)' % classes[i])
        print("\nTest Accuracy (mnist-test) (Overall): ", end="")
        print(colored('%2d%% ' % (100. * np.sum(class_correct) / np.sum(class_total)), "red"), end="")
        print("(", end="")
        print(colored(str(int(np.sum(class_correct))), "red"), end=" ")
        print('/%2d)' % np.sum(class_total))
        self.net.train()
        return 100. * np.sum(class_correct) / np.sum(class_total)

    def validate_source(self):
        class_correct = [0] * self.num_classes
        class_total = [0.] * self.num_classes
        classes = [str(i) for i in range(self.num_classes)]
        self.net.eval()  # prep model for evaluation
        for data, target in self.source_val_loader:
            # forward pass: compute predicted outputs by passing inputs to the model
            data, target = self.to_var(data), self.to_var(target).long().squeeze()
            data = data.cuda()
            target = target.cuda()
            latent, output = self.net(data)
            _, pred = torch.max(output, 1)
            correct = np.squeeze(pred.eq(target.data.view_as(pred)))
            # calculate test accuracy for each object class
            for i in range(len(target.data)):
                label = target.data[i]
                class_correct[label] += correct[i].item()
                class_total[label] += 1
        for i in range(self.num_classes):
            if class_total[i] > 0:
                print('Test Accuracy (svhn-test) of %5s: %2d%% (%2d/%2d)' % (
                    str(i), 100 * class_correct[i] / class_total[i],
                    class_correct[i], class_total[i]))
            else:
                print('Test Accuracy (svhn-test) of %5s: N/A (no training examples)' % classes[i])
        print("\nTest Accuracy (svhn-test) (Overall): ", end="")
        print(colored('%2d%% ' % (100. * np.sum(class_correct) / np.sum(class_total)), "red"), end="")
        print("(", end="")
        print(colored(str(int(np.sum(class_correct))), "red"), end=" ")
        print('/%2d)' % np.sum(class_total))
        self.net.train()
        return 100. * np.sum(class_correct) / np.sum(class_total)

    def save_model(self):
        torch.save(self.net, os.path.join(self.model_path, 'pre_train.pth'))

    def load_model(self):
        self.net = torch.load(os.path.join(self.model_path, 'pre_train.pth'))
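
# pseudo_labeling is called in train() but not defined in this file. Below is
# a minimal sketch of a confidence-threshold version with the same return
# signature (chosen samples, labels, boolean index mask, imbalance weight);
# the threshold value and the weighting rule are assumptions, not the
# repository's actual rule.
def pseudo_labeling(target, logits, threshold=0.9):
    probs = F.softmax(logits, dim=1)
    confidence, labels = probs.max(dim=1)
    indexes = confidence > threshold
    if indexes.sum().item() == 0:
        return None, None, None, 1.0
    # weight the pseudo-label losses by the fraction of samples kept
    imbalance_parameter = indexes.float().mean().item()
    return target[indexes], labels[indexes], indexes, imbalance_parameter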
class Trainer:
    def __init__(self, cfg):
        self.cfg = cfg
        self.init_env()
        self.init_device()
        self.init_data()
        self.init_model()
        self.init_optimizer()

    def init_env(self):
        self.exp_dir = Path(
            self.cfg.train_log_root).expanduser().joinpath(self.cfg.exp_id)
        self.exp_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir = self.exp_dir.joinpath(self.cfg.log_subdir)
        self.tb_dir = self.exp_dir.joinpath(self.cfg.tb_subdir)
        self.ckpt_dir = self.exp_dir.joinpath(self.cfg.ckpt_subdir)
        self.logger = get_logger(__name__, self.log_dir)
        self.tb = SummaryWriter(self.tb_dir)
        torch.manual_seed(self.cfg.seed)
        self.epoch = 0
        self.acc = 0.
        self.logger.info('Train log location: {}'.format(self.exp_dir))

    def init_device(self):
        self.use_cuda = not self.cfg.no_cuda and torch.cuda.is_available()
        if self.use_cuda:
            self.device = torch.device('cuda')
            self.logger.info('Using gpu')
        else:
            self.device = torch.device('cpu')
            self.logger.info('Using cpu')

    def init_data(self):
        self.logger.info('Initializing data loader...')
        kwargs = {'num_workers': 1, 'pin_memory': True} if self.use_cuda else {}
        self.train_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                self.cfg.data_root, train=True, download=True,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))])),
            batch_size=self.cfg.batch_size, shuffle=True, **kwargs)
        self.logger.info('Train loader has been initialized.')
        self.val_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                self.cfg.data_root, train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))])),
            batch_size=self.cfg.val_batch_size, shuffle=True, **kwargs)
        self.logger.info('Test loader has been initialized.')

    def init_model(self):
        self.model = Net()
        data, target = next(iter(self.train_loader))
        self.tb.add_graph(self.model, data)
        self.model = self.model.to(self.device)
        self.logger.info('Model has been initialized.')

    def init_optimizer(self):
        cfg_optim = self.cfg.optim
        optim_func = getattr(optim, cfg_optim.type)
        self.optimizer = optim_func(
            self.model.parameters(), **dict(cfg_optim.args))
        self.logger.info('Optimizer has been initialized.')

    def train(self):
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            self.optimizer.step()
            self.train_loss = loss.item()
            if batch_idx % self.cfg.log_interval == 0:
                self.logger.info(
                    '{:2d}, {}/{} loss: {:.6f}, test acc: {:.2f}%'.format(
                        self.epoch, batch_idx * len(data),
                        len(self.train_loader.dataset),
                        loss.item(), self.acc))
                total_iter = self.epoch * len(self.train_loader) + batch_idx
                self.tb.add_scalar('train/loss', loss.item(), total_iter)

    def test(self):
        self.model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in self.val_loader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(self.val_loader.dataset)
        self.acc = 100. * correct / len(self.val_loader.dataset)
        self.logger.info(
            '{:2d}, test loss: {:.4f}, test acc: {}/{} ({:.2f}%)'.format(
                self.epoch, test_loss, correct,
                len(self.val_loader.dataset), self.acc))
        self.tb.add_scalar('test/acc', self.acc, self.epoch)
        self.tb.add_scalar('test/loss', test_loss, self.epoch)

    def load(self, for_resuming_training=True, label='latest'):
        ckpt_path = self.ckpt_dir.joinpath('{}.pt'.format(label))
        if ckpt_path.is_file():
            self.logger.info('Loading model from {}'.format(ckpt_path))
            ckpt = torch.load(ckpt_path, map_location=self.device)
            self.model.load_state_dict(ckpt['model_state_dict'])
            if for_resuming_training:
                self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
                self.epoch = ckpt['epoch'] + 1
                self.acc = ckpt['acc']
            self.logger.info('Model of epoch {} loaded.'.format(ckpt['epoch']))
        else:
            self.logger.info('No checkpoint found.')

    def save(self, label='latest'):
        self.logger.info('Saving model...')
        self.ckpt_dir.mkdir(exist_ok=True, parents=True)
        ckpt_path = self.ckpt_dir.joinpath('{}.pt'.format(label))
        torch.save({
            'epoch': self.epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'train_loss': self.train_loss,
            'acc': self.acc
        }, ckpt_path)
        self.logger.info('Model saved to {}.'.format(ckpt_path))

    def start(self):
        self.load(for_resuming_training=True)
        if self.epoch > 0:
            self.logger.info('Training resumes from epoch {}'.format(self.epoch))
        try:
            for self.epoch in range(self.epoch, self.cfg.epochs):
                self.train()
                self.test()
            self.logger.info('Training is finished.')
        except KeyboardInterrupt:
            self.logger.warning('Keyboard Interrupted.')
        except Exception as e:
            self.logger.exception(repr(e))
        finally:
            if self.epoch > 0:
                self.save()
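
# A minimal entry-point sketch. The cfg namespace is an assumption: it only
# needs the attributes read above (train_log_root, exp_id, log_subdir,
# tb_subdir, ckpt_subdir, seed, no_cuda, data_root, batch_size,
# val_batch_size, optim, log_interval, epochs); the values shown are
# illustrative defaults.
if __name__ == '__main__':
    from types import SimpleNamespace
    cfg = SimpleNamespace(
        train_log_root='~/train_logs', exp_id='mnist_baseline',
        log_subdir='log', tb_subdir='tb', ckpt_subdir='ckpt',
        seed=1, no_cuda=False, data_root='./data',
        batch_size=64, val_batch_size=1000,
        optim=SimpleNamespace(type='Adam', args={'lr': 1e-3}),
        log_interval=100, epochs=10)
    Trainer(cfg).start()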
def train(args):
    check_paths(args)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    try:
        with open(param_path, 'r') as tc:
            trainingParams = json.load(tc)
        ngf = int(trainingParams.get('ngf', args.ngf))
        epochs = int(trainingParams.get('epochs', args.epochs))
        batch_size = int(trainingParams.get('batch_size', args.batch_size))
        log_interval = int(trainingParams.get('log_interval', args.log_interval))
        learning_rate = float(trainingParams.get('learning_rate', args.learning_rate))
        cuda = int(trainingParams.get('cuda', args.cuda))
        if cuda:
            logger.info("Using CUDA")
            torch.cuda.manual_seed(args.seed)
            kwargs = {'num_workers': 8, 'pin_memory': True}
            logger.info("Using kwargs: \n" + str(kwargs))
        else:
            kwargs = {}

        transform = transforms.Compose([
            transforms.Resize(args.image_size),  # transforms.Scale was removed
            transforms.CenterCrop(args.image_size),
            transforms.ToTensor(),
            transforms.Lambda(lambda x: x.mul(255))
        ])
        train_dataset = datasets.ImageFolder(args.dataset, transform)
        # use the batch size resolved from trainingParams, not args directly
        train_loader = DataLoader(train_dataset, batch_size=batch_size, **kwargs)

        style_model = Net(ngf=ngf)
        print(style_model)
        optimizer = Adam(style_model.parameters(), learning_rate)
        mse_loss = torch.nn.MSELoss()

        vgg = Vgg16()
        utils.mod_utils.init_vgg16(args.vgg_model_dir)
        vgg.load_state_dict(
            torch.load(os.path.join(args.vgg_model_dir, "vgg16.weight")))

        if cuda:
            style_model.cuda()
            vgg.cuda()

        style_loader = StyleLoader(args.style_folder, args.style_size)

        for e in range(epochs):
            style_model.train()
            agg_content_loss = 0.
            agg_style_loss = 0.
            count = 0
            for batch_id, (x, _) in enumerate(train_loader):
                n_batch = len(x)
                count += n_batch
                optimizer.zero_grad()
                x = preprocess_batch(x)  # Variable() wrappers are no longer needed
                if cuda:
                    x = x.cuda()  # was `x.cuda()`, whose result was discarded

                style_v = style_loader.get(batch_id)
                style_model.setTarget(style_v)

                style_v = utils.img_utils.subtract_imagenet_mean_batch(style_v)
                features_style = vgg(style_v)
                gram_style = [utils.img_utils.gram_matrix(y) for y in features_style]

                y = style_model(x)
                xc = x.detach().clone()  # content target; volatile=True is deprecated
                y = utils.img_utils.subtract_imagenet_mean_batch(y)
                xc = utils.img_utils.subtract_imagenet_mean_batch(xc)
                features_y = vgg(y)
                with torch.no_grad():
                    features_xc = vgg(xc)
                f_xc_c = features_xc[1].detach()

                content_loss = args.content_weight * mse_loss(features_y[1], f_xc_c)

                style_loss = 0.
                for m in range(len(features_y)):
                    gram_y = utils.img_utils.gram_matrix(features_y[m])
                    gram_s = gram_style[m].detach().repeat(args.batch_size, 1, 1, 1)
                    style_loss += args.style_weight * mse_loss(
                        gram_y, gram_s[:n_batch, :, :])

                total_loss = content_loss + style_loss
                total_loss.backward()
                optimizer.step()

                agg_content_loss += content_loss.item()  # .data[0] was removed
                agg_style_loss += style_loss.item()

                if (batch_id + 1) % log_interval == 0:
                    msg = "{}\tEpoch {}:\t[{}/{}]\tcontent: {:.6f}\tstyle: {:.6f}\ttotal: {:.6f}".format(
                        time.ctime(), e + 1, count, len(train_dataset),
                        agg_content_loss / (batch_id + 1),
                        agg_style_loss / (batch_id + 1),
                        (agg_content_loss + agg_style_loss) / (batch_id + 1))
                    print(msg)

                if (batch_id + 1) % (20 * log_interval) == 0:
                    # save model
                    style_model.eval()
                    style_model.cpu()
                    save_model_filename = "Epoch_" + str(e) + "_" + \
                        "iters_" + str(count) + "_" + \
                        str(time.ctime()).replace(' ', '_') + "_" + \
                        str(args.content_weight) + "_" + \
                        str(args.style_weight) + ".model"
                    save_model_path = os.path.join(temp_save_model_dir, save_model_filename)
                    torch.save(style_model.state_dict(), save_model_path)
                    style_model.train()
                    if cuda:
                        style_model.cuda()
                    logger.info("Checkpoint, trained model saved at " + str(save_model_path))

        # save the final model
        style_model.eval()
        style_model.cpu()
        save_final_model_path = os.path.join(model_path, final_model_filename)
        torch.save(style_model.state_dict(), save_final_model_path)
        logger.info("Done, trained model saved at " + save_final_model_path)

        # Write out the success file
        with open(os.path.join(output_path, 'success'), 'w') as s:
            s.write('Done')
    except Exception as e:
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            trc = traceback.format_exc()
            logger.info('Exception during training: ' + str(e) + '\n' + trc)
            s.write('Exception during training: ' + str(e) + '\n' + trc)
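
# utils.img_utils.gram_matrix is used above but defined elsewhere. For
# reference, the standard Gram-matrix computation for style features, shown
# as a self-contained sketch (batched, normalized by layer size):
def gram_matrix(y):
    # y: (batch, channels, height, width) feature maps
    b, ch, h, w = y.size()
    features = y.view(b, ch, h * w)
    gram = features.bmm(features.transpose(1, 2))  # (b, ch, ch)
    return gram / (ch * h * w)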