def data_gen(V, batch, nbatches):
    """Generate random batches for a src-tgt copy task.

    Args:
        V: vocabulary size; token ids are drawn uniformly from [1, V).
        batch: number of sequences per batch.
        nbatches: number of batches to yield.

    Yields:
        Batch objects whose src and tgt wrap the same (batch, 10) tensor,
        with token id 1 forced into position 0 as the start symbol.
    """
    for _ in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        data[:, 0] = 1  # force the start-of-sequence token
        # BUG FIX: the original called data.cuda() and discarded the result,
        # so the tensor silently stayed on the CPU. Assign it back, and only
        # move it when a GPU is actually available.
        if torch.cuda.is_available():
            data = data.cuda()
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)
def fit(epoch, model, data_loader, mode, is_cuda, optim):
    """Run one epoch of training ('train') or validation ('val').

    Args:
        epoch: epoch number, used only for logging.
        model: module mapping data -> log-probabilities (nll_loss input).
        data_loader: iterable of batches with .text / .label; must expose
            a .dataset whose len() is the total number of examples.
        mode: 'train' (backprop + optimizer step) or 'val' (forward only).
        is_cuda: move model and tensors to the GPU when True.
        optim: optimizer; only used in 'train' mode.

    Returns:
        (average_loss, accuracy) over the whole dataset.
    """
    if mode == 'train':
        model = model.train()
    if mode == 'val':
        model = model.eval()
    if is_cuda:
        # Move the model once up front; the original re-moved it on every
        # batch, which is redundant work (cuda() is idempotent but not free).
        model = model.cuda()
    running_loss = 0
    running_correct = 0
    for batch_idx, data_batch in enumerate(data_loader):
        data, label = data_batch.text, data_batch.label
        if is_cuda:
            data, label = data.cuda(), label.cuda()
        if mode == 'train':
            optim.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, label)
        if mode == 'train':
            loss.backward()
            optim.step()
        # NOTE: this sums per-batch *mean* losses but divides by the dataset
        # size below, so the reported loss is scaled down by the batch size;
        # preserved as-is so numbers stay comparable with earlier runs.
        running_loss += loss.item()
        pre = output.max(dim=1, keepdim=True)[1]  # predicted class indices
        equal = pre.eq(label.view_as(pre)).cpu().sum()
        running_correct += equal.item()
    average_loss = running_loss / len(data_loader.dataset)
    accuracy = running_correct / len(data_loader.dataset)
    print("mode:%s, epoch:%d, loss:%f, acc:%f" % (mode, epoch, average_loss, accuracy))
    return average_loss, accuracy
def evaluate(data_source):
    """Evaluate the module-level VAE classifier on `data_source`.

    Iterates batches of (text, label), runs the model with one-hot labels,
    and reports average loss (BCE + KLD), average KLD, and label accuracy.

    Returns:
        The label accuracy over all batches.

    NOTE(review): relies on module-level globals: model, args, device_id,
    loss_function, get_accuracy.
    """
    model.eval()
    total_loss = 0
    total_kld = 0
    count = 0
    truth_res = []  # ground-truth labels accumulated over all batches
    pred_res = []   # predicted labels accumulated over all batches
    for batch in data_source:
        data, label = batch.text, batch.label
        data, label = data.cuda(device_id), label.cuda(device_id)
        # Shift labels in place; assumes they arrive offset by 2 -- TODO confirm
        # against the torchtext field that produced them.
        label.data.sub_(2)
        truth_res += list(label.data)
        # Batch size can vary between batches; keep args/model in sync.
        args.batch_size = data.size(1)
        model.decoder.bsz = args.batch_size
        seq_len = data.size(0) - 1  # number of predicted positions
        # Flattened next-token targets: every token except the first.
        out_ix = data[1:, :].contiguous().view(-1)
        row = range(args.batch_size)
        # One-hot encode the binary labels.
        label_2 = Variable(torch.zeros(args.batch_size, 2).cuda(device_id),
                           requires_grad=False)
        label_2[row, label] = 1
        # Teacher forcing: feed all tokens but the last.
        recon_batch, mu, logvar, fake_label = model(data[:-1, :], label_2)
        BCE, KLD = loss_function(recon_batch, out_ix, mu, logvar)
        loss = BCE + KLD
        _, pred_label = torch.max(fake_label, 1)
        pred_res += list(pred_label.data)
        total_loss += loss.data.item()
        total_kld += KLD.data.item()
        count += 1
    avg = total_loss / count
    avg_kld = total_kld / count
    acc = get_accuracy(truth_res, pred_res)
    print(' acc :%g avg_loss:%g kld:%g' % (acc, avg, avg_kld))
    return acc
def evaluate_att(model, data_iter, loss_function):
    """Evaluate the attention LM on `data_iter`.

    For each batch the sequence is unrolled one step at a time, feeding the
    attention hidden state and a growing history of previous hidden states
    back into the model.

    Returns:
        The mean per-token loss across batches.
    """
    model.eval()
    loss_meter = meter.AverageValueMeter()
    loss_meter.reset()
    for batch in tqdm.tqdm(data_iter):
        loss = 0
        data = batch.text
        batch_size = data.shape[1]
        att_hidden = Variable(t.zeros(batch_size, 150))  # (batch_size, hidden_dim)
        # History of per-step hidden states the attention attends over;
        # starts with a single all-zero entry.
        pre_hiddens = Variable(t.zeros(batch_size, 1, 150))
        if opt.use_gpu:
            data = data.cuda()
            att_hidden = att_hidden.cuda()
            pre_hiddens = pre_hiddens.cuda()
        # Input/target are the sequence shifted by one (CharRNN style).
        input_, target_ = Variable(data[:-1, :]), Variable(data[1:, :])
        max_len = input_.size(0)
        model.batch_size = batch_size
        hidden = model.init_hidden()
        for ii in range(max_len):
            input = input_[ii]  # (batch_size,)
            target = target_[ii]
            output, att_hidden, pre_hidden, hidden, alpha = model(
                input, att_hidden, pre_hiddens, hidden)
            # Detach so the history does not keep old graphs alive.
            pre_hidden = pre_hidden.detach()
            pre_hiddens = t.cat((pre_hiddens, pre_hidden), 1)
            loss += loss_function(output, target)
        loss_meter.add(loss.item() / max_len)  # average loss per time step
    return loss_meter.value()[0]
def evaluate(data_source):
    """Evaluate label accuracy of the module-level autoencoder classifier.

    Unlike the VAE variant, no reconstruction loss is computed here; only
    the predicted labels are compared against ground truth.

    Returns:
        The label accuracy over all batches.
    """
    model.eval()
    total_loss = 0  # NOTE(review): never updated below; kept for parity
    total_kld = 0   # NOTE(review): likewise unused
    count = 0
    truth_res = []  # ground-truth labels
    pred_res = []   # predicted labels
    for batch in data_source:
        data, label = batch.text, batch.label
        data, label = data.cuda(device_id), label.cuda(device_id)
        # Shift labels in place; assumes they arrive offset by 2 -- TODO confirm.
        label.data.sub_(2)
        truth_res += list(label.data)
        # Keep every sub-module's batch size in sync with this batch.
        args.batch_size = data.size(1)
        model.decoder.bsz = args.batch_size
        model.encoder.bsz = data.size(1)
        model.label.bsz = data.size(1)
        out_ix = data[1:, :].contiguous().view(-1)  # NOTE(review): unused here
        row = range(args.batch_size)
        label_2 = Variable(torch.zeros(args.batch_size, 2).cuda(device_id),
                           requires_grad=False)
        label_2[row, label] = 1  # one-hot labels; NOTE(review): unused below
        recon_batch, z, fake_label = model(data[:-1, :])
        _, pred_label = torch.max(fake_label, 1)
        pred_res += list(pred_label.data)
        count += 1
    acc = get_accuracy(truth_res, pred_res)
    print(' acc :%g ' % (acc))
    return acc
def train_step(self, optimizer, start_time):
    """Run one pass over self.train_iterator and update the model.

    Batches whose leading dimension differs from self.batch_size are
    skipped. Progress (accuracy/loss) is printed every self.log_interval
    batches.

    Returns:
        The optimizer, so callers can keep its state across steps.
    """
    accuracies = torch.zeros(self.log_interval)
    total_loss = 0
    for i, batch in enumerate(self.train_iterator):
        # CLEARING HISTORY
        # BUG FIX: the original wrote `optimizer.zero_grad` without calling
        # it, so gradients silently accumulated across every batch.
        optimizer.zero_grad()
        # GETTING TENSORS
        data, targets = batch.text, batch.label.view(-1)
        targets = targets - 1  # shift labels down by one (original: "from zero to one")
        data, lengths = data[0], data[1]
        # CONVERTING TO CUDA IF NEEDED
        if self.cuda:
            data = data.cuda()
            targets = targets.cuda()
            lengths = lengths.cuda()
        if data.size(0) == self.batch_size:
            # GETTING PREDICTIONS
            output, h, A = self.model(data, lengths=lengths)
            predictions = output.view(-1, self.num_classes)
            # GET ACCURACY
            preds = torch.max(predictions, dim=1)[1]
            # BUG FIX: `torch.sum(...)[0].data[0]` indexes a 0-dim tensor,
            # which fails on PyTorch >= 0.4; use .item() instead.
            pct_correct = float((targets == preds).sum().item()) / predictions.size(0)
            accuracies[i % self.log_interval] = pct_correct
            if self.weight_saving:
                # SAVING ATTENTION WEIGHTS
                self.save_weights(i, data, A, h, preds, targets, 'train')
            # CALCULATING AND PROPAGATING LOSS
            loss = self.objective(predictions, targets)
            loss.backward()
            if self.clip is not None:
                # clip_grad_norm_ is the current (non-deprecated) spelling,
                # consistent with the other training loops in this codebase.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            if self.optim in ['adam', 'SGD']:
                optimizer.step()
            elif self.optim == 'vanilla_grad':
                # Manual SGD on every parameter that requires a gradient.
                parameters = filter(lambda p: p.requires_grad, self.model.parameters())
                for p in parameters:
                    p.data.add_(-self.lr, p.grad.data)
            # Accumulate a Python float (avoids 0-dim tensor indexing below).
            total_loss += loss.item()
            if i % self.log_interval == 0 and i != 0:
                # Mean over non-zero entries only (zeros mark unused slots).
                current_accuracy = float(torch.sum(accuracies)) / float(torch.nonzero(accuracies).size(0))
                current_loss = total_loss / self.log_interval
                total_loss = 0
                elapsed = time() - start_time
                accuracies = torch.zeros(self.log_interval)
                print('At time: {elapsed} accuracy is {current_accuracy} and loss is {loss}'\
                    .format(elapsed=elapsed, current_accuracy = current_accuracy, loss = current_loss))
    return optimizer
def train():
    """One epoch of semi-supervised VAE training (module-level state).

    Pairs every unsupervised batch with a (cycled) labelled batch: the
    labelled batch contributes reconstruction + KLD + label loss, the
    unlabelled one reconstruction + KLD only.

    NOTE(review): relies on module-level globals: model, args, device_id,
    optimizer, lr, alpha, loss_function, loss_label, unsup_data, train_data,
    epoch, get_accuracy.
    """
    model.train()
    total_loss = 0
    start_time = time.time()
    model.decoder.bsz = args.batch_size
    truth_res = []
    pred_res = []
    count = 0.0
    # Cycle the (smaller) labelled set so each unlabelled batch has a partner.
    iterator = zip(unsup_data, itertools.cycle(train_data))
    for (unbatch, lbatch) in iterator:
        data, label = lbatch.text, lbatch.label
        undata = unbatch.text
        undata = undata.cuda(device_id)
        data, label = data.cuda(device_id), label.cuda(device_id)
        # NOTE(review): .volatile is a no-op / removed in PyTorch >= 0.4.
        data.volatile = False
        label.volatile = False
        # Shift labels in place; assumes they arrive offset by 2 -- TODO confirm.
        label.data.sub_(2)
        truth_res += list(label.data)
        args.batch_size = data.size(1)
        model.decoder.bsz = args.batch_size
        seq_len = data.size(0) - 1
        # Flattened next-token targets for both streams.
        out_ix = data[1:, :].contiguous().view(-1)
        unout_ix = undata[1:, :].contiguous().view(-1)
        row = range(args.batch_size)
        label_2 = Variable(torch.zeros(args.batch_size, 2).cuda(device_id),
                           requires_grad=False)
        label_2[row, label] = 1  # one-hot labels
        model.zero_grad()
        # Supervised stream: reconstruction + KLD + label loss.
        recon_batch, mu, logvar, fake_label = model(data[:-1, :], label_2)
        BCE, KLD = loss_function(recon_batch, out_ix, mu, logvar)
        label_loss = loss_label(fake_label, label_2)
        loss = label_loss + BCE + KLD
        # Unsupervised stream: reconstruction + KLD only.
        model.decoder.bsz = undata.size(1)
        recon_batch, mu, logvar, _ = model(undata[:-1, :])
        unBCE, unKLD = loss_function(recon_batch, unout_ix, mu, logvar)
        loss += unBCE + unKLD
        if args.model == "bvae":
            # SG-MCMC noise term scaled per token -- presumably SGLD; verify.
            noise_loss = model.noise_loss(lr, alpha)
            noise_loss /= args.bptt * len(train_data)
            loss += noise_loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        count += 1
        total_loss += loss.data
        _, pred_label = torch.max(torch.exp(fake_label), 1)
        pred_res += list(pred_label.data)
        if count % args.log_interval == 0 and count > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | lr {:5.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | kld {:5.9f}'.format(
                      epoch, lr, elapsed * 1000 / args.log_interval, cur_loss,
                      KLD.data.item()))
            total_loss = 0
            start_time = time.time()
    print('epoch: %d done!\n acc:%g' % (epoch, get_accuracy(truth_res, pred_res)))
def evaluate(model, data_iter, loss_function):
    """Compute the mean per-batch loss of `model` over `data_iter`.

    Each batch's .text is a (seq_len, batch) tensor; the model is fed
    tokens [0, L-1) and scored against tokens [1, L).
    """
    model.eval()
    avg_meter = meter.AverageValueMeter()
    avg_meter.reset()
    for batch in tqdm.tqdm(data_iter):
        text = batch.text
        if opt.use_gpu:
            text = text.cuda()
        # Shift by one position: predict token t+1 from tokens <= t.
        inputs = Variable(text[:-1, :])
        targets = Variable(text[1:, :])
        logits, _ = model(inputs)
        batch_loss = loss_function(logits, targets.view(-1))
        avg_meter.add(batch_loss.item())
    return avg_meter.value()[0]
def eval_epoch(model, data_iter, criterion):
    """Evaluate `model` on `data_iter` and return its per-token perplexity.

    Args:
        model: LM mapping a (batch, seq) tensor to flattened predictions.
        data_iter: iterable of (data, target) pairs with a .reset() method.
        criterion: loss summed/averaged per token (fed to math.exp below).

    Returns:
        exp(total loss / total tokens) -- the perplexity.
    """
    total_loss = 0.
    total_words = 0.
    # BUG FIX: `Variable(..., volatile=True)` was removed in PyTorch 0.4;
    # torch.no_grad() is the supported way to disable autograd in eval.
    with torch.no_grad():
        for (data, target) in data_iter:
            if opt.cuda:
                data, target = data.cuda(), target.cuda()
            target = target.contiguous().view(-1)
            # Call the module itself (not .forward) so hooks still fire.
            pred = model(data)
            loss = criterion(pred, target)
            total_loss += loss.data.item()
            total_words += data.size(0) * data.size(1)
    data_iter.reset()  # rewind the iterator so it can be reused next epoch
    return math.exp(total_loss / total_words)
def train_discriminator(model, generators, data_iter, criterion, optimizer):
    """Train the discriminator to separate real text from generator samples.

    Class 0 is real data; class i+1 is the i-th generator's samples.

    Returns:
        (average loss per sentence, accuracy); both end up as tensors on
        the active device because total_sents is wrapped in torch.tensor.
    """
    total_loss = 0.
    total_sents = 0.
    total_correct = 0.
    if not opt.server:
        # Progress bar only when running interactively.
        data_iter = tqdm(data_iter, mininterval=2,
                         desc=' - Discriminator Training', leave=False)
    for real_data in data_iter:
        fake_data = []
        fake_label = []
        for i in range(len(generators)):
            # Sample and detach so no gradient flows into the generators.
            fake_data.append(generators[i].sample(
                BATCH_SIZE, g_sequence_len).detach().cpu())
            fake_label.append(
                torch.zeros(BATCH_SIZE, dtype=torch.int64) + i + 1)
        fake_data = torch.cat(fake_data)
        fake_label = torch.cat(fake_label)
        data = torch.cat([real_data.text, fake_data])
        target = torch.cat([
            torch.zeros(real_data.text.shape[0], dtype=torch.int64),
            fake_label
        ])
        # Shuffle real and fake examples together.
        shuffle_index = torch.randperm(real_data.text.shape[0] +
                                       BATCH_SIZE * len(generators))
        data = data[shuffle_index]
        target = target[shuffle_index]
        if opt.cuda:
            data, target = data.cuda(), target.cuda()
        target = target.contiguous().view(-1)
        pred = model.forward(data)
        # NOTE(review): accuracy is computed before the 3-D reshape below;
        # if pred is (batch, seq, C), this argmax runs over the seq axis
        # instead of the class axis -- verify against the model's output shape.
        total_correct += torch.sum((target == torch.max(pred, axis=1)[1]))
        if len(pred.shape) > 2:
            # Flatten (batch, seq, C) into (batch*seq, C) for the loss.
            pred = torch.reshape(pred, (pred.shape[0] * pred.shape[1], -1))
        loss = criterion(pred, target)
        total_loss += loss.data.item()
        total_sents += data.shape[0]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    total_sents = torch.tensor(total_sents)
    if opt.cuda:
        total_sents = total_sents.cuda()
    return total_loss / total_sents, total_correct / total_sents
def data_gen(num_words=11, batch_size=16, num_batches=100, length=10, pad_index=0, sos_index=1):
    """Yield random src-tgt copy-task batches.

    Token ids are drawn uniformly from [1, num_words) with the SOS token
    written into column 0; the source drops that first column while the
    target keeps the full sequence.
    """
    for _ in range(num_batches):
        tokens = np.random.randint(1, num_words, size=(batch_size, length))
        batch_data = torch.from_numpy(tokens)
        batch_data[:, 0] = sos_index
        if USE_CUDA:
            batch_data = batch_data.cuda()
        source = batch_data[:, 1:]
        target = batch_data
        source_lengths = [length - 1] * batch_size
        target_lengths = [length] * batch_size
        yield Batch((source, source_lengths), (target, target_lengths), pad_index=pad_index)
def evaluate_twin(model, data_iter, loss_function):
    """Return the mean per-batch loss of the twin LSTM over `data_iter`."""
    model.eval()
    avg_meter = meter.AverageValueMeter()
    avg_meter.reset()
    for batch in tqdm.tqdm(data_iter):
        text = batch.text
        # The hidden state is sized per batch, so sync the model first.
        model.batch_size = text.size(1)
        h0 = model.init_hidden()
        if opt.use_gpu:
            text = text.cuda()
        # Shift input/target by one position (CharRNN style).
        inputs = Variable(text[:-1, :])
        targets = Variable(text[1:, :])
        result = model.work(inputs, h0)
        batch_loss = loss_function(result[0], targets.view(-1))
        avg_meter.add(batch_loss.item())
    return avg_meter.value()[0]
def eval(model, iterator, optimizer, crit):
    """Evaluate `model` over `iterator`; returns (mean loss, mean accuracy).

    `optimizer` stays in the signature for backward compatibility but is no
    longer stepped: the original called optimizer.step() during evaluation,
    which applied a parameter update from whatever stale gradients were left
    over from training -- a correctness bug.
    """
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    hidden = model.init_hidden(BATCH_SIZE)
    with t.no_grad():  # no gradients are needed during evaluation
        for batch in iter(iterator):
            data, target = batch.text, batch.label
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()
            data.t_()  # transpose in place -- assumes model wants batch-first
            output, hidden = model(data, hidden)
            loss = crit(output, target)
            acc = binary_accuracy(output, t.unsqueeze(target, -1))
            # NOTE(review): batch-size weighting here vs. division by the
            # number of batches below is inconsistent; preserved as-is.
            epoch_loss += loss.item() * len(batch)
            epoch_acc += acc * len(batch)
    model.train()  # restore training mode for the caller
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def train(model, inputs, labels, optimizer, crit):
    """One training epoch; returns the mean per-batch loss.

    NOTE(review): the `inputs` and `labels` parameters are never used --
    the loop iterates a module-level `iterator` instead; confirm whether
    the signature or the loop body is what was intended.
    """
    epoch_loss, epoch_acc = 0., 0.
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    # NOTE(review): `hidden` is carried across batches without detaching;
    # backpropagating through it repeatedly may fail or leak graphs -- verify.
    for batch in iter(iterator):
        data, target = batch.text, batch.label
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        data.t_()  # transpose in place -- assumes model wants batch-first
        output, hidden = model(data, hidden)
        # Target unsqueezed to match output's trailing dimension.
        loss = crit(output, t.unsqueeze(target, -1))
        acc = binary_accuracy(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc
    # NOTE(review): epoch_acc is accumulated but never returned.
    return epoch_loss / len(iterator)
def evaluate(self):
    """Evaluate on self.test_iterator; stores accuracy in self.eval_accuracy.

    Batches whose leading dimension differs from self.batch_size are skipped.
    """
    self.model.eval()
    i = 0
    accuracies = torch.zeros(len(self.test_iterator))  # per-batch accuracy
    total_loss = 0
    for i, batch in enumerate(self.test_iterator):
        # GETTING TENSORS
        data, targets = batch.text, batch.label.view(-1)
        data, lengths = data[0], data[1]
        targets = targets - 1  # shift labels down by one -- assumes 1-based labels; verify
        # CONVERTING TO CUDA IF NEEDED
        if self.cuda:
            data = data.cuda()
            targets = targets.cuda()
            lengths = lengths.cuda()
        if data.size(0) == self.batch_size:
            # GETTING PREDICTIONS
            output, h, A = self.model(data, lengths = lengths)
            predictions = output.view(-1, self.num_classes)
            # GET ACCURACY
            preds = torch.max(predictions, dim = 1)[1]
            # NOTE(review): `torch.sum(...)[0].data[0]` only works on very
            # old PyTorch; 0-dim tensors cannot be indexed on >= 0.4.
            pct_correct = float(torch.sum(targets == preds)[0].data[0]/predictions.size(0))
            accuracies[i] = pct_correct
            if self.weight_saving:
                # SAVING ATTENTION WEIGHTS
                self.save_weights(i, data, A, h, preds, targets, "test")
            # CALCULATING LOSS
            loss = self.objective(predictions, targets)
            total_loss += loss.data
    # NOTE(review): dividing by nonzero(accuracies) excludes batches with
    # exactly 0% accuracy from the denominator -- likely unintended.
    self.eval_accuracy = float(torch.sum(accuracies)) / float(torch.nonzero(accuracies).size(0))
    print('Done Evaluating: Achieved accuracy of {}'
          .format(self.eval_accuracy))
def evaluate(model, data):
    """Return the average per-token loss of `model` over the `data` iterator."""
    model.eval()
    running_loss = 0.
    token_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for batch in iter(data):
            text, target = batch.text, batch.target
            if USE_CUDA:
                text, target = text.cuda(), target.cuda()
            # Detach the hidden state from the previous batch's graph.
            hidden = repackage_hidden(hidden)
            output, hidden = model(text, hidden)
            step_loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            n_tokens = text.numel()  # product of the batch tensor's dimensions
            token_count += n_tokens
            running_loss += step_loss.item() * n_tokens
    model.train()
    return running_loss / token_count
def train_generator(model, data_iter, criterion, optimizer):
    """Run one MLE training pass over the generator; returns its perplexity."""
    loss_sum = 0.
    word_sum = 0.
    if not opt.server:
        # Wrap with a progress bar only when running interactively.
        data_iter = tqdm(data_iter, mininterval=2,
                         desc=' - Generator Training', leave=False)
    for batch in data_iter:
        text = batch.text
        # Teacher forcing: input is tokens [0, L-1), target is tokens [1, L).
        data = text[:, :-1]
        target = text[:, 1:]
        if opt.cuda:
            data = data.cuda()
            target = target.cuda()
        target = target.contiguous().view(-1)
        pred = model.forward(data)
        if len(pred.shape) > 2:
            # Flatten (batch, seq, vocab) into (batch*seq, vocab) for the loss.
            pred = torch.reshape(pred, (pred.shape[0] * pred.shape[1], -1))
        loss = criterion(pred, target)
        loss_sum += loss.data.item()
        word_sum += data.size(0) * data.size(1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return math.exp(loss_sum / word_sum)
def evaluate(data_source):
    """Evaluate the classifier and also collect the raw label scores.

    Handles both the plain autoencoder models ('bae'/'baeg'), which take no
    label input, and the VAE models, which are fed one-hot labels.

    Returns:
        (stacked label score tensor, list of ground-truth labels).
    """
    model.eval()
    total_loss = 0  # NOTE(review): never updated below
    total_kld = 0   # NOTE(review): never updated below
    count = 0
    truth_res = []  # ground-truth labels
    pred_res = []   # predicted labels
    pred = []       # per-batch raw label score tensors
    for batch in data_source:
        data, label = batch.text, batch.label
        data, label = data.cuda(device_id), label.cuda(device_id)
        # Shift labels in place; assumes they arrive offset by 2 -- TODO confirm.
        label.data.sub_(2)
        truth_res += list(label.data)
        args.batch_size = data.size(1)
        model.decoder.bsz = args.batch_size
        seq_len = data.size(0) - 1
        out_ix = data[1:, :].contiguous().view(-1)
        row = range(args.batch_size)
        label_2 = Variable(torch.zeros(args.batch_size, 2).cuda(device_id),
                           requires_grad=False)
        label_2[row, label] = 1  # one-hot labels for the VAE branch
        if args.model == 'bae' or args.model == 'baeg':
            model.encoder.bsz = data.size(1)
            model.label.bsz = data.size(1)
            recon_batch, z, fake_label = model(data[:-1, :])
        else:
            recon_batch, mu, logvar, fake_label = model(data[:-1, :], label_2)
        _, pred_label = torch.max(fake_label, 1)
        pred_res += list(pred_label.data)
        pred.append(fake_label)
        count += 1
    pred = torch.cat(pred, 0)
    acc = get_accuracy(truth_res, pred_res)
    print(' acc :%g' % (acc))
    return pred, truth_res
def train(model, iterator, optimizer, crit):
    """One training epoch over `iterator`; returns the mean per-batch loss."""
    running_loss, running_acc = 0., 0.
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    for minibatch in iter(iterator):
        text, labels = minibatch.text, minibatch.label
        if USE_CUDA:
            text, labels = text.cuda(), labels.cuda()
        text.t_()  # transpose in place to batch-first
        print("data=", text.shape)
        output, hidden = model(text, hidden)
        print("output=", output.shape, "target=", labels.shape)
        batch_loss = crit(t.squeeze(output), labels)
        #acc = binary_accuracy(output, target)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        print(batch_loss.item())
        #epoch_acc += acc
    return running_loss / len(iterator)
def evaluate(self):
    """Run the model over the validation iterator and return its perplexity.

    Rebuilds self.valid_iterator from self.valid_sentences, averages the
    objective over the batches actually processed (honoring
    self.few_batches as an early-exit cap), and prints the result.
    """
    print('Begin Evaluating...')
    self.model.eval()
    hidden = self.model.init_hidden(self.batch_size)
    total_loss = 0.0
    n_batches = 0
    self.valid_iterator = self.get_iterator(self.valid_sentences)
    for i, batch in enumerate(self.valid_iterator):
        hidden = self.repackage_hidden(hidden)  # detach from the previous graph
        data, targets = batch.text, batch.target.view(-1)
        if self.cuda:
            data = data.cuda()
            targets = targets.cuda()
        output, hidden = self.model(data, hidden)
        if self.objective_function == 'crossentropy':
            output = output.view(-1, self.ntokens)
        else:
            output = output.view(output.size(0) * output.size(1), \
                output.size(2))
        loss = self.objective(output, targets)
        # BUG FIX: accumulate a Python float; the original summed 0-dim
        # tensors and then indexed total_loss[0], which fails on
        # PyTorch >= 0.4.
        total_loss += loss.item()
        n_batches += 1
        if self.few_batches is not None:
            if i >= self.few_batches:
                break
    # BUG FIX: the original divided by the last enumerate index `i`
    # (batch count minus one; ZeroDivisionError for a single batch),
    # overstating the loss. Divide by the number of batches processed.
    avg_loss = total_loss / n_batches
    perplexity = math.exp(avg_loss)
    print(
        'Done Evaluating: Achieved loss of {} and perplexity of {}'.format(
            avg_loss, perplexity))
    return perplexity
# Language-model training loop (module-level script).
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
#Choose your favorite Adam's optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
val_losses = []  # NOTE(review): declared but never appended to in this chunk
for epoch in range(NUM_EPOCHS):
    model.train()
    it = iter(train_iter)
    # Fresh hidden state at the start of each epoch.
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        model.zero_grad()
        output, hidden = model(data, hidden)
        # Detach hidden so gradients do not flow across batch boundaries.
        hidden = repackage_hidden(hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()
        # apply gradient clipping to prevent the exploding gradient problem in RNN
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)  #TODO
        optimizer.step()
        if i % 1000 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())
def train_twin(**kwargs):
    """Train the twin (forward/backward) LSTM language model.

    Keyword arguments override fields on the module-level `opt` config.
    Each batch is read both forward and time-reversed; besides the two
    cross-entropy losses, a "twin" loss pulls the forward hidden states
    towards the (detached) backward hidden states at the same positions.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)  # override config values
    vis = Visualizer(env=opt.env)  # visdom environment for live loss plots
    # Data and vocabulary.
    train_iter, valid_iter, test_iter, field = load_data()
    word2ix = field.vocab.stoi
    ix2word = field.vocab.itos
    # Model: vocab -> 300-dim embeddings -> 150-dim LSTM (twin variant).
    model = lstm_twin(len(word2ix), 300, 150)
    best_model = model
    best_valid_loss = float("inf")
    optimizer = t.optim.Adam(model.parameters(), lr=opt.lr)
    scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1e-5)
    # CrossEntropyLoss averages over characters (a small number); with
    # size_average=False it would be roughly seq_len times larger, i.e.
    # per-sentence scale.
    criterion = nn.CrossEntropyLoss()
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path))
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()
    count = 0
    for epoch in range(opt.epoch):
        model.train()
        logging.info("这是第{0}次epoch".format(count + 1))
        cnt = 0
        b_fwd_loss, b_bwd_loss, b_twin_loss, b_all_loss = 0., 0., 0., 0.
        for batch in tqdm.tqdm(train_iter):
            data = batch.text
            seq_len = data.size(0)
            # Build a reversed index (tensors do not support negative strides).
            idx = np.arange(seq_len)[::-1].tolist()
            idx = t.LongTensor(idx)
            # BUG FIX: the original did `Variable(idx).cuda()` unconditionally,
            # which crashes on CPU-only machines even though every other
            # tensor move in this function is guarded by opt.use_gpu.
            idx = Variable(idx)
            if opt.use_gpu:
                idx = idx.cuda()
            model.batch_size = data.size(1)
            hidden1 = model.init_hidden()
            hidden2 = model.init_hidden()
            if opt.use_gpu:
                data = data.cuda()
            optimizer.zero_grad()
            # Input and target offset by one token (CharRNN style).
            f_input, f_target = Variable(data[:-1, :]), Variable(data[1:, :])
            bx = data.index_select(0, idx)  # the same batch, time-reversed
            b_input, b_target = Variable(bx[:-1, :]), Variable(bx[1:, :])
            f_out, b_out, f_h, b_h = model(f_input, b_input, hidden1, hidden2)
            f_loss = criterion(f_out, f_target.view(-1))
            b_loss = criterion(b_out, b_target.view(-1))
            # Re-reverse the backward hidden states so they align with the
            # forward direction, then drop the <sos> position.
            b_h_inv = b_h.index_select(0, idx[1:])
            b_h_inv = b_h_inv[1:]  # drop <sos>
            b_h_inv = b_h_inv.detach()  # twin loss trains only the forward net
            f_h = f_h[:-1]  # drop <eos>
            twin_loss = ((f_h - b_h_inv)**2).mean()
            twin_loss *= 1.5  # twin-loss weight
            all_loss = f_loss + b_loss + twin_loss
            all_loss.backward()
            # NOTE: clip_grad_norm is the deprecated spelling; kept for the
            # torch version this project pins -- verify before upgrading.
            t.nn.utils.clip_grad_norm(model.parameters(), 5.)
            optimizer.step()
            # Accumulate running sums for plotting.
            b_all_loss += all_loss.item()
            b_fwd_loss += f_loss.item()
            b_bwd_loss += b_loss.item()
            b_twin_loss += twin_loss.item()
            # Visualize running means every plot_every batches.
            if (1 + cnt) % opt.plot_every == 0:
                vis.plot('all_loss', b_all_loss / opt.plot_every)
                vis.plot('twin_loss', b_twin_loss / opt.plot_every)
                vis.plot('loss', b_fwd_loss / opt.plot_every)
                b_fwd_loss, b_bwd_loss, b_twin_loss, b_all_loss = 0., 0., 0., 0.
            cnt += 1
        count += 1
        valid_loss = evaluate_twin(model, valid_iter, criterion)
        scheduler.step(valid_loss)
        logging.info("第%d次验证集的loss为: %f" % (count, valid_loss))
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model
            t.save(best_model.state_dict(),
                   '%s%s_%d.pth' % (opt.model_prefix, opt.model, count))
        test_loss = evaluate_twin(best_model, test_iter, criterion)
        logging.info("测试集的loss为: %f" % test_loss)
        # Halve the learning rate at fixed epochs.
        if epoch in [5, 10, 15]:
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                lr *= 0.5
                param_group['lr'] = lr
def train_attention(**kwargs):
    """Train the attention LSTM LM; kwargs override fields on `opt`.

    The sequence is unrolled one step at a time, feeding the attention
    hidden state and a growing history of (detached) hidden states back
    into the model; validation drives an LR-on-plateau scheduler and the
    best model by validation loss is checkpointed.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)  # setattr(object, name, value): override config
    vis = Visualizer(env=opt.env)  # visdom environment for live loss plots
    logging.info("============attention的训练过程================")
    # Data and vocabulary.
    train_iter, valid_iter, test_iter, field = load_data()
    word2ix = field.vocab.stoi
    ix2word = field.vocab.itos
    # Model: vocab -> 300-dim embeddings -> 150-dim attention LSTM.
    model = lstm_att(len(word2ix), 300, 150)
    best_model = model
    best_valid_loss = float("inf")
    # lambda1 = lambda epoch: epoch // 5
    # lambda2 = lambda epoch: 0.95 ** epoch
    optimizer = t.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=1e-6)
    scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    criterion = nn.NLLLoss()
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path))
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()
    loss_meter = meter.AverageValueMeter()
    count = 0
    for epoch in range(opt.epoch):
        model.train()
        loss_meter.reset()
        logging.info("这是第{0}次epoch".format(count + 1))
        cnt = 0
        # NOTE(review): use_teacher_forcing is computed but never used below
        # (the related decoder_input code is commented out) -- verify intent.
        use_teacher_forcing = True if random.random(
        ) < teacher_forcing_ratio else False
        for batch in tqdm.tqdm(train_iter):
            loss = 0
            data = batch.text
            batch_size = data.shape[1]
            att_hidden = Variable(
                t.zeros(batch_size, 150), requires_grad=False)  # (batch_size, hidden_dim)
            # History of per-step hidden states the attention attends over;
            # starts with a single all-zero entry.
            pre_hiddens = Variable(t.zeros(batch_size, 1, 150),
                                   requires_grad=False)
            if opt.use_gpu:
                data = data.cuda()
                att_hidden = att_hidden.cuda()
                pre_hiddens = pre_hiddens.cuda()
            optimizer.zero_grad()
            # Input and target offset by one token (CharRNN style).
            input_, target_ = Variable(data[:-1, :]), Variable(data[1:, :])
            max_len = input_.size(0)
            model.batch_size = batch_size
            hidden = model.init_hidden()
            for ii in range(max_len):
                input = input_[ii]  # (batch_size,)
                target = target_[ii]
                output, att_hidden, pre_hidden, hidden, alpha = model(
                    input, att_hidden, pre_hiddens, hidden)
                # logging.info("第%d次: %s" % (ii, alpha))
                # Detach so the history does not keep old graphs alive.
                pre_hidden = pre_hidden.detach()
                pre_hiddens = t.cat((pre_hiddens, pre_hidden), 1)
                # topv, topi = decoder_output.topk(1)
                # decoder_input = topi.squeeze().detach()  # detach from history as input
                loss += criterion(output, target)
            loss.backward()
            # Gradient clipping.
            t.nn.utils.clip_grad_norm(model.parameters(), 5.)
            optimizer.step()
            loss_meter.add(loss.item() / max_len)  # average loss per time step
            # Visualize the running mean every plot_every batches.
            if (1 + cnt) % opt.plot_every == 0:
                vis.plot('loss', loss_meter.value()[0])
                # logging.info("训练第%d次batch_plot的loss为: %f" % ((cnt+1)/opt.plot_every, loss_meter.value()[0]))
            cnt += 1
        count += 1
        valid_loss = evaluate_att(model, valid_iter, criterion)
        scheduler.step(valid_loss)
        logging.info("======第%d次验证集的loss为: %f=====" % (count, valid_loss))
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model
            t.save(best_model.state_dict(),
                   '%s%s_%d.pth' % (opt.model_prefix, opt.model, count))
        test_loss = evaluate_att(best_model, test_iter, criterion)
        logging.info("------测试集的loss为: %f" % test_loss)
        # Halve the learning rate at fixed epochs.
        if epoch in [5, 10, 15]:
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                lr *= 0.5
                param_group['lr'] = lr
def train():
    """One epoch of semi-supervised training with SG-MCMC latent sampling.

    Pairs unlabelled batches with cycled labelled batches. The inner loop
    runs J refinement steps: step 0 obtains fresh latents (z) from the
    encoder and wraps them as trainable samples with their own optimizers;
    later steps reuse those samples. Even steps update the model weights,
    odd steps update the z samples -- presumably an alternating SG-MCMC
    scheme; verify against the paper this implements.

    NOTE(review): relies on many module-level globals: model, args,
    device_id, optimizer, z_opt, J, burnin, lr, alpha, loss_function,
    loss_label, z_prior_loss, z_noise_loss, en_loss, unsup_data,
    train_data, epoch, get_accuracy.
    """
    model.train()
    total_loss = 0
    start_time = time.time()
    model.decoder.bsz = args.batch_size
    truth_res = []
    pred_res = []
    count = 0.0
    # Cycle the (smaller) labelled set so each unlabelled batch has a partner.
    iterator = zip(unsup_data, itertools.cycle(train_data))
    for (unbatch, lbatch) in iterator:
        data, label = lbatch.text, lbatch.label
        undata = unbatch.text
        undata = undata.cuda(device_id)
        data, label = data.cuda(device_id), label.cuda(device_id)
        # NOTE(review): .volatile is a no-op / removed in PyTorch >= 0.4.
        data.volatile = False
        label.volatile = False
        # Shift labels in place; assumes they arrive offset by 2 -- TODO confirm.
        label.data.sub_(2)
        truth_res += list(label.data)
        # Mean sequence length of the two streams, used to scale noise terms.
        args.bptt = (data.size(0) + undata.size(0)) / 2
        # Flattened next-token targets for both streams.
        out_ix = data[1:, :].contiguous().view(-1)
        unout_ix = undata[1:, :].contiguous().view(-1)
        row = range(data.size(1))
        label_2 = Variable(torch.zeros(data.size(1), 2).cuda(device_id),
                           requires_grad=False)
        label_2[row, label] = 1  # one-hot labels
        model.zero_grad()
        for j in range(J):
            if j == 0:
                # First step: fresh forward passes to obtain the latents.
                model.zero_grad()
                model.decoder.bsz = data.size(1)
                model.encoder.bsz = data.size(1)
                model.label.bsz = data.size(1)
                recon_batch, z, fake_label = model(data[:-1, :])
                model.decoder.bsz = undata.size(1)
                model.encoder.bsz = undata.size(1)
                model.label.bsz = undata.size(1)
                unrecon_batch, unz, _ = model(undata[:-1, :])
                # Wrap the latents as leaf variables with their own optimizers.
                z_sample = Variable(z.data, requires_grad=True)
                z_optimizer = z_opt(z_sample)
                z_optimizer.zero_grad()
                unz_sample = Variable(unz.data, requires_grad=True)
                unz_optimizer = z_opt(unz_sample)
                unz_optimizer.zero_grad()
            else:
                # Later steps: decode from the current z samples directly.
                model.zero_grad()
                emb = model.embed(data[:-1, :])
                model.decoder.bsz = data.size(1)
                model.label.bsz = data.size(1)
                fake_label = model.label(emb, z_sample)
                recon_batch = model.decoder(emb, z_sample)
                model.decoder.bsz = undata.size(1)
                model.label.bsz = undata.size(1)
                unemb = model.embed(undata[:-1, :])
                unrecon_batch = model.decoder(unemb, unz_sample)
            BCE = loss_function(recon_batch, out_ix)
            unBCE = loss_function(unrecon_batch, unout_ix)
            label_loss = loss_label(fake_label, label_2)
            # Per-token scaled noise/prior terms for model weights and latents.
            noise_loss = model.noise_loss(lr, alpha)
            noise_loss /= args.bptt * len(train_data)
            prior_loss_z = z_prior_loss(z_sample)
            noise_loss_z = z_noise_loss(z_sample)
            prior_loss_z /= args.bptt * len(train_data)
            noise_loss_z /= args.bptt * len(train_data)
            unprior_loss_z = z_prior_loss(unz_sample)
            unnoise_loss_z = z_noise_loss(unz_sample)
            unprior_loss_z /= args.bptt * len(train_data)
            unnoise_loss_z /= args.bptt * len(train_data)
            loss = BCE + unBCE + label_loss + noise_loss + prior_loss_z + noise_loss_z + unprior_loss_z + unnoise_loss_z
            if j > burnin + 1:
                # After burn-in, add energy terms tying samples to encoder output.
                loss_en = en_loss(z_sample, z)
                unloss_en = en_loss(unz_sample, unz)
                loss += loss_en + unloss_en
            if j % 2 == 0:
                # Even steps: update model weights.
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
                optimizer.step()
            else:
                # Odd steps: update the latent samples instead.
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
                z_optimizer.step()
                unz_optimizer.step()
        count += 1
        total_loss += label_loss.data + BCE.data + unBCE.data
        _, pred_label = torch.max(torch.exp(fake_label), 1)
        pred_res += list(pred_label.data)
        if count % args.log_interval == 0 and count > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | lr {:5.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} '.format(epoch, lr,
                                         elapsed * 1000 / args.log_interval,
                                         cur_loss))
            total_loss = 0
            start_time = time.time()
    print('epoch: %d done!\n acc:%g' % (epoch, get_accuracy(truth_res, pred_res)))
def train(**kwargs):
    """Train the character-level LSTM LM; kwargs override fields on `opt`.

    Each epoch runs CharRNN-style next-token training, validates, and
    checkpoints whenever validation loss improves.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)  # setattr(object, name, value): override config
    vis = Visualizer(env=opt.env)  # visdom environment for live loss plots
    # Data and vocabulary.
    train_iter, valid_iter, test_iter, field = load_data()
    word2ix = field.vocab.stoi
    ix2word = field.vocab.itos
    # np.savez('data/word2ix.npz', word2ix = word2ix,ix2word = ix2word)
    # Model: vocab -> 300-dim embeddings -> 150-dim LSTM.
    model = lstm(len(word2ix), 300, 150)
    best_model = model
    best_valid_loss = float("inf")
    optimizer = t.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=1e-6)
    criterion = nn.CrossEntropyLoss()
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path))
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()
    loss_meter = meter.AverageValueMeter()
    count = 0
    for epoch in range(opt.epoch):
        model.train()
        loss_meter.reset()
        logging.info("这是第{0}次epoch".format(count + 1))
        cnt = 0
        # tqdm wraps the iterator with a progress bar (it/s = batches/sec).
        for batch in tqdm.tqdm(
                train_iter
        ):
            data = batch.text
            if opt.use_gpu:
                data = data.cuda()
            optimizer.zero_grad()
            # Input and target offset by one token (CharRNN style).
            input_, target = Variable(data[:-1, :]), Variable(data[1:, :])
            output, _ = model(input_)
            loss = criterion(output, target.view(-1))
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.item())
            # Visualize the running mean every plot_every batches.
            if (1 + cnt) % opt.plot_every == 0:
                vis.plot('loss', loss_meter.value()[0])
            cnt += 1
        count += 1
        valid_loss = evaluate(model, valid_iter, criterion)
        logging.info("第%d次验证集的loss为: %f" % (count, valid_loss))
        if valid_loss < best_valid_loss:
            # Remove the previous checkpoint before saving the new best model.
            os.system('rm ' + opt.model_prefix + opt.model + '.pth')
            best_valid_loss = valid_loss
            best_model = model
            t.save(best_model.state_dict(),
                   '%s%s.pth' % (opt.model_prefix, opt.model))
        test_loss = evaluate(best_model, test_iter, criterion)
        logging.info("测试集的loss为: %f" % test_loss)
def train_step(self, optimizer, model, start_time):
    """Run one pass over self.train_iterator, updating `model`'s parameters.

    Supports two update rules: 'adam' (optimizer.step + zero_grad) and
    'vanilla_grad' (manual SGD over requires_grad parameters). Honors
    self.time_limit and self.few_batches as early-exit conditions.

    Returns:
        The optimizer, so its state can be carried to the next step.
    """
    print('Completing Train Step...')
    hidden = self.model.init_hidden(self.batch_size)
    total_loss = 0
    for i, batch in enumerate(self.train_iterator):
        # Resume support: only check the clock once we pass the batch index
        # reached by a previous (interrupted) run.
        if i >= self.current_batch:
            elapsed = time() - start_time
            self.current_batch = i
            if self.time_limit is not None:
                if elapsed > self.time_limit:
                    print('REACHED TIME LIMIT!')
                    self.save_checkpoint()
                    break
        hidden = self.repackage_hidden(hidden)  # detach from previous graph
        data, targets = batch.text, batch.target.view(-1)
        if self.cuda:
            data = data.cuda()
            targets = targets.cuda()
        output, hidden = model(data, hidden)
        if self.objective_function == 'crossentropy':
            output = output.view(-1, self.ntokens)
        else:
            output = output.view(output.size(0) * output.size(1), \
                output.size(2))
        loss = self.objective(output, targets)
        loss.backward()
        if self.clip is not None:
            # NOTE(review): clip_grad_norm is the deprecated spelling of
            # clip_grad_norm_ -- verify the pinned torch version still has it.
            torch.nn.utils.clip_grad_norm(self.model.parameters(),\
                self.clip)
        # NOTE(review): loss.data[0] only works on pre-0.4 PyTorch; 0-dim
        # tensors cannot be indexed on newer versions.
        total_loss += loss.data[0]
        if self.optim == 'adam':
            optimizer.step()
            optimizer.zero_grad()
        elif self.optim == 'vanilla_grad':
            # NOTE(review): gradients are never zeroed on this path, so they
            # accumulate across batches -- confirm this is intended.
            parameters = filter(lambda p: p.requires_grad,\
                self.model.parameters())
            for p in parameters:
                p.data.add_(-self.lr, p.grad.data)
        if self.few_batches is not None:
            if i >= self.few_batches:
                break
        if ((i + 1) % self.log_interval) == 0:
            self.current_loss.append(total_loss / self.log_interval)
            total_loss = 0
            print('At time: {time} and batch: {i}, loss is {loss}'
                  ' and perplexity is {ppl}'.format(
                      i=i + 1, time=elapsed, loss=self.current_loss[-1],
                      ppl=math.exp(self.current_loss[-1])))
    print('Finished Train Step')
    self.current_batch = 0
    return optimizer
def Bleu(**kwargs):
    """Greedy-decode the test set with a trained ``lstm``/``lstm_twin`` model
    and report corpus BLEU-1..4 against the references.

    Keyword arguments are copied onto the global ``opt`` config object.
    Token id 3 is treated as the end-of-sequence marker.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)
    print('Loading model from {}'.format(opt.model_path))

    # Vocabulary: either a cached npz dump or rebuilt from the dataset.
    if os.path.exists(opt.pickle_path):
        data = np.load(opt.pickle_path)
        word2ix, ix2word = data['word2ix'].item(), data['ix2word']
    else:
        train_iter, valid_iter, test_iter, field = load_data()
        word2ix = field.vocab.stoi
        ix2word = field.vocab.itos

    # Model selection + checkpoint load (storages mapped onto the CPU).
    if opt.model == 'lstm':
        model = lstm(len(word2ix), 300, 150)
    elif opt.model == 'lstm_twin':
        model = lstm_twin(len(word2ix), 300, 150)
    map_location = lambda s, l: s
    state_dict = t.load(opt.model_path, map_location=map_location)
    model.load_state_dict(state_dict)
    if opt.use_gpu:
        model.cuda()
    print("加载完毕")
    # model.eval()

    hypothesis = []
    references = []
    cnt = 0
    for batch in tqdm.tqdm(test_iter):
        cnt += 1
        data = batch.text
        if opt.model == 'lstm_twin':
            model.batch_size = data.size(1)
            hidden = model.init_hidden()
        if opt.use_gpu:
            data = data.cuda()
        input_, target = Variable(data[:-1, :]), Variable(data[1:, :])

        # References: one gold token sequence per column, cut at <eos> (id 3).
        tmp = target.transpose(0, 1).cpu().numpy()
        print('===========输入==========')
        for row in tmp:
            ref = list(row)
            for ix in ref:
                print(ix2word[ix], end='')
            print('')
            # BUGFIX: the unguarded ``.index(3)`` raised ValueError on rows
            # with no <eos> token.
            if 3 in ref:
                ref = ref[:ref.index(3) + 1]
            references.append([ref])
        print('===========输出==========')

        if opt.model == 'lstm':
            output, _ = model(input_)
            output = output.view(data.size(0) - 1, data.size(1), -1)
        elif opt.model == 'lstm_twin':
            output = model.work(input_, hidden)
            output = output[0].view(data.size(0) - 1, data.size(1), -1)

        # Greedy decoding: arg-max token at every step, batch-major.
        top = output.topk(1, dim=2)[1].squeeze().transpose(0, 1)
        top = top.cpu().numpy()
        for row in top:
            hyp = list(row)
            for ix in hyp:
                print(ix2word[ix], end='')
            print('')
            eos_pos = hyp.index(3) if 3 in hyp else None
            # BUGFIX: the original tested ``if (pos):`` which skipped the
            # truncation when <eos> sat at index 0 (truthiness vs None).
            if eos_pos is not None:
                hyp = hyp[:eos_pos + 1]
            hypothesis.append(hyp)

    bleu1 = corpus_bleu(references, hypothesis, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, hypothesis,
                        weights=(1. / 2., 1. / 2., 0, 0))
    bleu3 = corpus_bleu(references, hypothesis,
                        weights=(1. / 3., 1. / 3., 1. / 3., 0))
    bleu4 = corpus_bleu(references, hypothesis)
    print("bleu1: ", bleu1, "bleu2: ", bleu2, "bleu3: ", bleu3, "bleu4: ", bleu4)
def Bleu_att(**kwargs):
    """Greedy-decode the test set with the attention LSTM (``lstm_att``) and
    report corpus BLEU-1..4 against the references.

    Keyword arguments are copied onto the global ``opt`` config object.
    Token id 3 is treated as the end-of-sequence marker.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)
    print('Loading model from {}'.format(opt.model_path))

    # Vocabulary: either a cached npz dump or rebuilt from the dataset.
    if os.path.exists(opt.pickle_path):
        data = np.load(opt.pickle_path)
        # NOTE(review): the sibling Bleu() loads ix2word WITHOUT .item();
        # only one of the two can match how the npz was written — confirm
        # against the dump code.
        word2ix, ix2word = data['word2ix'].item(), data['ix2word'].item()
    else:
        train_iter, valid_iter, test_iter, field = load_data()
        word2ix = field.vocab.stoi
        ix2word = field.vocab.itos

    # Checkpoint load (storages mapped onto the CPU).
    model = lstm_att(len(word2ix), 300, 150)
    map_location = lambda s, l: s
    state_dict = t.load(opt.model_path, map_location=map_location)
    model.load_state_dict(state_dict)
    if opt.use_gpu:
        model.cuda()
    print("加载完毕")
    # model.eval()

    hypothesis = []
    references = []
    for batch in tqdm.tqdm(test_iter):
        data = batch.text
        batch_size = data.shape[1]
        att_hidden = Variable(t.zeros(batch_size, 150))  # (batch_size, hidden_dim)
        pre_hiddens = Variable(t.zeros(batch_size, 1, 150))
        if opt.use_gpu:
            data = data.cuda()
            att_hidden = att_hidden.cuda()
            pre_hiddens = pre_hiddens.cuda()
        input_, target_ = Variable(data[:-1, :]), Variable(data[1:, :])

        # References: one gold token sequence per column, cut at <eos> (id 3).
        tmp = target_.transpose(0, 1).cpu().numpy()
        for row in tmp:
            ref = list(row)
            # BUGFIX: the unguarded ``.index(3)`` raised ValueError on rows
            # with no <eos> token.
            if 3 in ref:
                ref = ref[:ref.index(3) + 1]
            references.append([ref])

        max_len = input_.size(0)
        model.batch_size = batch_size
        hidden = model.init_hidden()
        hy = None
        # Step the attention decoder one token at a time, feeding back the
        # growing history of previous hidden states.
        for step in range(max_len):
            step_input = input_[step]  # (batch_size,)
            output, att_hidden, pre_hidden, hidden, alpha = model(
                step_input, att_hidden, pre_hiddens, hidden)
            pre_hidden = pre_hidden.detach()
            pre_hiddens = t.cat((pre_hiddens, pre_hidden), 1)
            tmp = output.topk(1, dim=1)[1].cpu().numpy()
            if step == 0:
                hy = tmp.copy()
            else:
                hy = np.append(hy, tmp, axis=1)

        for row in hy:
            hyp = list(row)
            eos_pos = hyp.index(3) if 3 in hyp else None
            # BUGFIX: the original tested ``if (pos):`` which skipped the
            # truncation when <eos> sat at index 0 (truthiness vs None).
            if eos_pos is not None:
                hyp = hyp[:eos_pos + 1]
            hypothesis.append(hyp)

    bleu1 = corpus_bleu(references, hypothesis, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, hypothesis,
                        weights=(1. / 2., 1. / 2., 0, 0))
    bleu3 = corpus_bleu(references, hypothesis,
                        weights=(1. / 3., 1. / 3., 1. / 3., 0))
    bleu4 = corpus_bleu(references, hypothesis)
    print("bleu1: ", bleu1, "bleu2: ", bleu2, "bleu3: ", bleu3, "bleu4: ", bleu4)
def train_attention_twin(**kwargs):
    """Train the attention LSTM with a twin (time-reversed) network
    regulariser.

    The total loss is the mean forward NLL plus the backward LM NLL plus a
    scaled MSE "twin" term tying forward hidden states to the (detached)
    reversed backward states.  Keyword arguments are copied onto the global
    ``opt`` config object.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)
    vis = Visualizer(env=opt.env)  # visdom environment for live plots
    logging.info("============attention_twin================")

    # Data: torchtext iterators plus the field that holds the vocabulary.
    train_iter, valid_iter, test_iter, field = load_data()
    word2ix = field.vocab.stoi
    ix2word = field.vocab.itos

    model = lstm_att_twin(len(word2ix), 300, 150)
    # NOTE(review): ``best_model`` is an alias of ``model`` (both names point
    # at the same object that keeps training), so the test evaluation below
    # always sees the current weights; the best ones live in the saved
    # per-epoch checkpoints.
    best_model = model
    best_valid_loss = float("inf")
    optimizer = t.optim.Adam(model.parameters(), lr=opt.lr, weight_decay=1e-6)
    scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    criterion = nn.NLLLoss()
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path))
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    loss_meter = meter.AverageValueMeter()
    count = 0
    for epoch in range(opt.epoch):
        print('this is {0}'.format(count + 1))
        model.train()
        loss_meter.reset()
        logging.info("this is the {0}th epoch".format(count + 1))
        cnt = 0
        for batch in tqdm.tqdm(train_iter):
            fwd_loss = 0
            data = batch.text
            batch_size = data.shape[1]
            seq_len = data.shape[0]
            # Index that reverses the time axis (for the backward network).
            idx = Variable(t.LongTensor(np.arange(seq_len)[::-1].tolist()))
            att_hidden = Variable(t.zeros(batch_size, 150),
                                  requires_grad=False)  # (batch_size, hidden_dim)
            pre_hiddens = Variable(t.zeros(batch_size, 1, 150),
                                   requires_grad=False)
            f_h = None
            if opt.use_gpu:
                data = data.cuda()
                att_hidden = att_hidden.cuda()
                pre_hiddens = pre_hiddens.cuda()
                # BUGFIX: ``idx`` was moved to the GPU unconditionally,
                # crashing CPU-only runs; keep it behind the same guard as
                # every other transfer.
                idx = idx.cuda()
            optimizer.zero_grad()
            # Shift input and target by one step (CharRNN-style LM training).
            input_, target_ = Variable(data[:-1, :]), Variable(data[1:, :])
            bx = data.index_select(0, idx)  # time-reversed copy of the batch
            b_input, b_target = Variable(bx[:-1, :]), Variable(bx[1:, :])

            max_len = input_.size(0)
            model.batch_size = batch_size
            hidden = model.init_hidden()
            bwd_hidden = model.init_hidden()
            # Forward pass, one step at a time, feeding back the growing
            # history of attention hidden states.
            for ii in range(max_len):
                step_input = input_[ii]  # (batch_size,)
                target = target_[ii]
                output, att_hidden, pre_hidden, hidden, alpha = model(
                    step_input, att_hidden, pre_hiddens, hidden)
                pre_hidden = pre_hidden.detach()
                pre_hiddens = t.cat((pre_hiddens, pre_hidden), 1)
                if ii == 0:
                    f_h = att_hidden.unsqueeze(0)
                else:
                    f_h = t.cat((f_h, att_hidden.unsqueeze(0)), 0)
                fwd_loss += criterion(output, target)
            fwd_loss = fwd_loss / max_len

            # Backward (time-reversed) language model pass.
            b_out, b_h = model.bwd_forward(b_input, bwd_hidden)
            b_loss = criterion(b_out, b_target.view(-1))
            seq_len, batch_size, _ = f_h.size()

            # Twin loss: match projected forward states to the detached,
            # re-reversed backward states; <sos>/<eos> positions trimmed.
            f_h = model.fwd_affine(f_h)
            b_h_inv = b_h.index_select(0, idx[1:])
            b_h_inv = b_h_inv[1:]  # drop <sos>
            b_h_inv = b_h_inv.detach()
            f_h = f_h[:-1]  # drop <eos>
            twin_loss = ((f_h - b_h_inv) ** 2).mean()
            twin_loss *= 1.5

            all_loss = b_loss + fwd_loss + twin_loss
            all_loss.backward()
            # BUGFIX: clip_grad_norm is deprecated (removed in modern
            # PyTorch); use the in-place variant.
            t.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            loss_meter.add(all_loss.item())
            if (1 + cnt) % opt.plot_every == 0:
                # vis.plot('loss', loss_meter.value()[0])
                logging.info(
                    "train the %dth batch_plot's loss is: %f" %
                    ((cnt + 1) / opt.plot_every, loss_meter.value()[0]))
            cnt += 1
        count += 1

        valid_loss = evaluate_att(model, valid_iter, criterion)
        scheduler.step(valid_loss)  # plateau scheduler may lower the LR
        logging.info("======the %dth validation's loss is: %f=====" %
                     (count, valid_loss))
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model
            t.save(best_model.state_dict(),
                   '%s%s_%d.pth' % (opt.model_prefix, opt.model, count))
        test_loss = evaluate_att(best_model, test_iter, criterion)
        logging.info("------test's loss为: %f" % test_loss)
        # Halve the learning rate at fixed epochs (on top of the plateau
        # scheduler).
        if epoch in [5, 10, 15]:
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
                lr *= 0.5
                param_group['lr'] = lr