def step(self, norm_type=2):
    """Run an update, optionally clipping the gradients first."""
    if self.max_norm is not None:
        clip_grad_norm(self.params, self.max_norm, norm_type=norm_type)
    self.optim.step()
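# A minimal sketch (not taken from the snippets in this section): in newer
# PyTorch releases (>= 0.4) the deprecated torch.nn.utils.clip_grad_norm used
# throughout this section is replaced by the in-place variant clip_grad_norm_,
# which clips gradients in place and returns the total norm before clipping.
# The wrapper name below is illustrative only.
from torch.nn.utils import clip_grad_norm_


def clipped_step(optimizer, parameters, max_norm, norm_type=2):
    # Clip gradients in place, then apply the optimizer update.
    total_norm = clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
    optimizer.step()
    return total_norm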
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm(model.parameters(), args.clip_gradient)
            if total_norm > args.clip_gradient:
                print("clipping gradient: {} with coef {}".format(
                    total_norm, args.clip_gradient / total_norm))

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                   'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       epoch, i, len(train_loader), batch_time=batch_time,
                       data_time=data_time, loss=losses, top1=top1, top5=top5,
                       lr=optimizer.param_groups[-1]['lr'])))
def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        self._set_rate(
            self.original_lr *
            (self.model_size ** (-0.5) *
             min(self._step ** (-0.5),
                 self._step * self.warmup_steps ** (-1.5))))

    if self.max_grad_norm:
        clip_grad_norm(self.params, self.max_grad_norm)
    self.optimizer.step()
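# A minimal sketch of the "noam" schedule used above, written as a standalone
# helper (an assumption for illustration, not part of the original optimizer
# class): the rate warms up linearly for `warmup_steps` steps, then decays with
# the inverse square root of the step count.
def noam_lr(step, original_lr, model_size, warmup_steps):
    # lr = base * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    return original_lr * (model_size ** -0.5) * min(
        step ** -0.5, step * warmup_steps ** -1.5)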
def train(e, model, optimizer, train_iter, vocab_size, grad_clip, DE, EN):
    model.train()
    total_loss = 0
    pad = EN.vocab.stoi['<pad>']
    for b, batch in enumerate(train_iter):
        src, len_src = batch.src
        trg, len_trg = batch.trg
        src, trg = src.cuda(), trg.cuda()
        optimizer.zero_grad()
        output = model(src, trg)
        loss = F.cross_entropy(output[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()
        total_loss += loss.data[0]

        if b % 100 == 0 and b != 0:
            total_loss = total_loss / 100
            print("[%d][loss:%5.2f][pp:%5.2f]" %
                  (b, total_loss, math.exp(total_loss)))
            total_loss = 0
log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
loss_policy_v = -log_prob_actions_v.mean()

loss_policy_v.backward(retain_graph=True)
grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                        for p in net.parameters()
                        if p.grad is not None])

prob_v = F.softmax(logits_v)
entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
entropy_loss_v = -ENTROPY_BETA * entropy_v
loss_v = loss_policy_v + entropy_loss_v
loss_v.backward()
nn_utils.clip_grad_norm(net.parameters(), GRAD_L2_CLIP)
optimizer.step()

loss_v += loss_policy_v

# calc KL-div
new_logits_v = net(states_v)
new_prob_v = F.softmax(new_logits_v)
kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
writer.add_scalar("kl", kl_div_v.data.cpu().numpy()[0], step_idx)

writer.add_scalar("baseline", baseline, step_idx)
writer.add_scalar("entropy", entropy_v.data.cpu().numpy()[0], step_idx)
writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
writer.add_scalar("batch_scales_std", scale_std, step_idx)
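# A small illustrative sketch (names assumed, not part of the snippet above) of
# the KL term tracked above: for old policy probabilities p and new
# probabilities q, KL(p || q) = sum_a p(a) * log(p(a) / q(a)), which is exactly
# what -((q / p).log() * p).sum(dim=1) computes per sample.
import torch


def kl_old_new(old_prob, new_prob, eps=1e-8):
    # Both inputs hold probability distributions over actions, shape [batch, n_actions].
    return (old_prob * ((old_prob + eps) / (new_prob + eps)).log()).sum(dim=1).mean()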
def train(lr, net, epoch, train_loader, valid_loader, transform, hyperparameters, batch_size): # register hypercurve agent = Agent(port=5001) hyperparameters['criteria'] = 'train loss' train_loss = agent.register(hyperparameters, 'loss') hyperparameters['criteria'] = 'valid loss' valid_loss = agent.register(hyperparameters, 'loss') hyperparameters['criteria'] = 'valid bleu' valid_bleu = agent.register(hyperparameters, 'bleu') hyperparameters['criteria'] = 'train bleu' train_bleu = agent.register(hyperparameters, 'bleu') hyperparameters['criteria'] = 'scheduled sampling probability' hyper_ssprob = agent.register(hyperparameters, 'probability') if torch.cuda.is_available(): net.cuda() optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr) net.train() best_score = -1 global_steps = 0 best_valid_loss = 10000 for iepoch in range(epoch): new_epoch = False batchid = 0 for (_, data) in enumerate(train_loader, 0): entext = data['entext'] enlen = data['enlen'] zhlabel = data['zhlabel'] zhgtruth = data['zhgtruth'] zhlen = data['zhlen'] ssprob = max(math.exp(-(global_steps - 100000) / 500000), 0.8) print('scheduled sampling pro: ', ssprob) logits, predic = net(entext, zhgtruth, enlen, ssprob, True) loss = net.get_loss(logits, zhlabel) optimizer.zero_grad() loss.backward() utils.clip_grad_norm(net.parameters(), 5) optimizer.step() batchid += 1 global_steps += 1 print(global_steps, iepoch, batchid, sum(loss.data.cpu().numpy())) agent.append(train_loss, global_steps, sum(loss.data.cpu().numpy())) agent.append(hyper_ssprob, global_steps, ssprob) if batchid % 50 == 0: net.eval() logits, predic = net(entext, zhgtruth, enlen, ssprob, True) tmppre = [0 for i in range(len(entext))] tmplabel = [0 for i in range(len(entext))] for i in range(len(entext)): tmppre[i] = transform.clip(predic[i], language='zh') tmplabel[i] = zhlabel[i][:zhlen[i]] tmpscore = bleuscore.score(tmppre, tmplabel) for i in range(25): ans_ = transform.i2t(tmplabel[i], language='zh') pre_ = transform.i2t(tmppre[i], language='zh') print(ans_) print(pre_) print('-------------------\n') agent.append(train_bleu, global_steps, tmpscore) del logits, predic if batchid % 400 == 0: print('\n------------------------\n') net.eval() all_pre = [] all_lable = [] all_len = [] all_loss = 0 bats = 0 for (_, data) in enumerate(valid_loader, 0): entext = data['entext'] enlen = data['enlen'] zhlabel = data['zhlabel'] zhgtruth = data['zhgtruth'] zhlen = data['zhlen'] logits, predic = net(entext, zhgtruth, enlen, 0, False) loss = net.get_loss(logits, zhlabel) all_pre.extend(predic) all_lable.extend(zhlabel) all_len.extend(zhlen) all_loss += sum(loss.data.cpu().numpy()) del loss, logits, predic bats += 1 for i in range(len(all_pre)): all_pre[i] = transform.clip(all_pre[i], language='zh') all_lable[i] = all_lable[i][:all_len[i]] score = bleuscore.score(all_pre, all_lable) for i in range(0, 600, 6): ans_ = transform.i2t(all_lable[i], language='zh') pre_ = transform.i2t(all_pre[i], language='zh') print(ans_) print(pre_) print('-------------------\n') all_loss /= bats print(global_steps, iepoch, batchid, all_loss, score, '\n********************\n') agent.append(valid_loss, global_steps, all_loss) agent.append(valid_bleu, global_steps, score) if best_valid_loss > all_loss or best_score < score: best_valid_loss = all_loss bestscore = score torch.save( net.state_dict(), model_dir + "ssprob-{:3f}-loss-{:3f}-steps-{:d}-model.pkl".format( ssprob, all_loss, global_steps)) del all_lable, all_len, all_loss, all_pre net.train()
def train(hp_dict, args, data_dir, save_path): use_chars = hp_dict['char_dim'] > 0 # load data dp = preprocessing() data = dp.preprocess(data_dir, no_training_set=False, use_chars=use_chars) # build minibatch loader train_batch_loader = mini_batch_loader(data.training, BATCH_SIZE, sample_rate=1.0, len_bin=hp_dict['use_bin']) valid_batch_loader = mini_batch_loader(data.validation, BATCH_SIZE, shuffle=False, len_bin=hp_dict['use_bin']) test_batch_loader = mini_batch_loader(data.test, BATCH_SIZE, shuffle=False, len_bin=hp_dict['use_bin']) logging.info("loading word2vec file ...") embed_init, embed_dim = \ load_word2vec_embeddings(data.dictionary[0], hp_dict['embed_file'],EMBED_SIZE) logging.info("embedding dim: {}".format(embed_dim)) logging.info("initialize model ...") model = GA_reader(hp_dict['nhidden'], data.vocab_size, embed_dim, embed_init, hp_dict['train_emb'], use_chars, hp_dict['char_nhidden'], data.n_chars, hp_dict['char_dim'], hp_dict['nlayers'], hp_dict['gating_fn'], hp_dict['use_feat'], hp_dict['dropout']) if USE_CUDA: model.cuda() logging.info("Running on cuda: {}".format(USE_CUDA)) # training phase opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE) shutil.copyfile('config.py', os.path.join(save_path, 'config.py')) # # load existing best model if os.path.isfile(os.path.join(save_path, 'best_model.pkl')): print('loading previously best model') model.load_state_dict( torch.load(os.path.join(save_path, 'best_model.pkl'))) # load existing train_model elif os.path.isfile(os.path.join(save_path, 'init_model.pkl')): print('loading init model') model.load_state_dict( torch.load(os.path.join(save_path, 'init_model.pkl'))) logging.info('-' * 50) logging.info("Start training ...") best_valid_acc = best_test_acc = 0 for epoch in range(NUM_EPOCHS): new_max = False if epoch >= 2: for param_group in opt.param_groups: param_group['lr'] /= 2 model.train() acc = loss = n_examples = it = 0 start = time.time() for dw, dw_m,qw,qw_m,dt,qt,tt,tm, \ answear, candidate, candi_m, cloze_pos, fnames in train_batch_loader: n_examples += dw.shape[0] feat = feat_fuc(dw, qw) #-------train-------# dw, dw_m,qw,qw_m,dt,qt,tt,tm, answear, candidate, candi_m, cloze_pos,feat=to_vars(\ [dw, dw_m,qw,qw_m,dt,qt,tt,tm, answear, candidate, candi_m, cloze_pos,feat], use_cuda=USE_CUDA) loss_, acc_ = model(dw, dw_m, qw, qw_m, dt, qt, tt, tm, answear, candidate, candi_m, cloze_pos, feat) # tensor.float size 1 #print(acc_.cpu().data.numpy()) loss += loss_.cpu().data.numpy()[0] # numpy [1] acc += acc_.cpu().data.numpy()[0] it += 1 opt.zero_grad() loss_.backward() clip_grad_norm(parameters=filter(lambda p: p.requires_grad, model.parameters()), max_norm=GRAD_CLIP) opt.step() if it % print_every == 0 \ or it % len(train_batch_loader) == 0: spend = (time.time() - start) / 60 statement = "Epoch: {}, it: {} (max: {}), "\ .format(epoch, it, len(train_batch_loader)) statement += "loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\ .format(loss / print_every, acc / n_examples, spend) logging.info(statement) del acc, loss, n_examples acc = loss = n_examples = 0 start = time.time() # save every print torch.save(model.state_dict(), os.path.join(save_path, 'init_model.pkl')) # torch.save(model,os.path.join(save_path,'init_model.pkl')) #-------valid-------# if it % eval_every == 0: start = time.time() model.eval() test_loss, test_acc = evaluate(model, valid_batch_loader, USE_CUDA) spend = (time.time() - start) / 60 statement = "Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\ .format(test_loss, 
test_acc, spend) logging.info(statement) if best_valid_acc < test_acc: best_valid_acc = test_acc new_max = True # store best valid model torch.save(model.state_dict(), os.path.join(save_path, 'best_model.pkl')) #torch.save(model,os.path.join(save_path,'best_model.pkl')) logging.info("Best valid acc: {:.3f}".format(best_valid_acc)) model.train() start = time.time() #-------test-------# start = time.time() model.eval() test_loss, test_acc = evaluate(model, test_batch_loader, USE_CUDA) spend = (time.time() - start) / 60 logging.info("Test loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\ .format(test_loss, test_acc, spend)) if best_test_acc < test_acc: best_test_acc = test_acc logging.info("Best test acc: {:.3f}".format(best_test_acc))
def main(): parser = argparse.ArgumentParser( description='PyTorch PennTreeBank RNN/LSTM Language Model') parser.add_argument('--data', type=str, default='../data/', help='location of the data corpus') parser.add_argument('--presaved', action='store_true', help='use presaved data') parser.add_argument('--glovedata', type=str, default='../data/', help='location of the pretrained glove embeddings') parser.add_argument('--din', type=int, default=30, help='length of LSTM') parser.add_argument('--demb', type=int, default=100, help='size of word embeddings') parser.add_argument('--dhid', type=int, default=100, help='humber of hidden units per layer') parser.add_argument('--dout', type=int, default=2, help='number of output classes') parser.add_argument('--nlayers', type=int, default=1, help='number of layers') parser.add_argument('--lr', type=float, default=0.001, help='initial learning rate') parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping') parser.add_argument('--embinit', type=str, default='random', help='embedding weight initialization type') parser.add_argument('--decinit', type=str, default='random', help='decoder weight initialization type') parser.add_argument('--hidinit', type=str, default='random', help='recurrent hidden weight initialization type') parser.add_argument('--dropout', type=float, default=0.0, help='dropout applied to layers (0 = no dropout)') parser.add_argument('--reweight', action='store_true', help='reweight loss function') parser.add_argument('--epochs', type=int, default=50, help='upper epoch limit') parser.add_argument('--clean', action='store_true', help='clean text') parser.add_argument('--rm_stops', action='store_true', help='remove stop words') parser.add_argument('--batchsize', type=int, default=2000, metavar='N', help='batch size') parser.add_argument('--seed', type=int, default=3, help='random seed') parser.add_argument('--vocabsize', type=int, default=200000, help='random seed') parser.add_argument('--optimizer', action='store_true', help='use ADAM optimizer') parser.add_argument('--cuda', action='store_true', help='use CUDA') parser.add_argument('--loginterval', type=int, default=20, metavar='N', help='report interval') parser.add_argument('--save', type=str, default='', help='path to save the final model') args = parser.parse_args() pipe = None corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe) train_data = pd.read_csv('../data/train_data_shuffle.csv') valid_data = pd.read_csv('../data/val_data_shuffle.csv') train_data = train_data.fillna(' ') valid_data = valid_data.fillna(' ') if args.reweight: print('Downsampling') #downsample pos_valid = valid_data[valid_data['is_duplicate'] == 1] neg_valid = valid_data[valid_data['is_duplicate'] == 0] p = 0.19 pl = len(pos_valid) tl = len(pos_valid) + len(neg_valid) val = int(pl - (pl - p * tl) / ((1 - p))) pos_valid = pos_valid.iloc[:int(val)] valid_data = pd.concat([pos_valid, neg_valid]) print('Splitting Train') q1 = list(train_data['question1'].map(str)) q2 = list(train_data['question2'].map(str)) y = list(train_data['is_duplicate']) print('Splitting Valid') q1_val = list(valid_data['question1'].map(str)) q2_val = list(valid_data['question2'].map(str)) y_val = list(valid_data['is_duplicate']) train_feat = pd.read_csv('../data/train_features_all_norm.csv') val_feat = train_feat.iloc[valid_data['id']].values train_feat = train_feat.iloc[train_data['id']].values print('Splitting Data') if args.clean: print('Cleaning Data') stops = None if args.rm_stops: stops = stops = 
set(stopwords.words("english")) q1 = [split_text(x, stops) for x in q1] q2 = [split_text(x, stops) for x in q2] q1_val = [split_text(x, stops) for x in q1_val] q2_val = [split_text(x, stops) for x in q2_val] else: q1 = [x.lower().split() for x in q1] q2 = [x.lower().split() for x in q2] q1_val = [x.lower().split() for x in q1_val] q2_val = [x.lower().split() for x in q2_val] print('Downsample Weight: ', np.mean(y_val)) corpus.gen_vocab(q1 + q2 + q2_val + q1_val) n_feat = train_feat.shape[1] d_in = args.din feat_max = int(np.max([n_feat, d_in])) X = torch.Tensor(len(train_data), 1, 3, feat_max) X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long() X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long() X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat)) y = torch.from_numpy(np.array(y)).long() X_val = torch.Tensor(len(valid_data), 1, 3, feat_max) X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long() X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long() X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat)) y_val = torch.from_numpy(np.array(y_val)).long() if args.cuda: X, y = X.cuda(), y.cuda() X_val, y_val = X_val.cuda(), y_val.cuda() print('Generating Data Loaders') #X.size len(train_data),1,2,fix_length train_dataset = TensorDataset(X, y) train_loader = DataLoader(train_dataset, batch_size=args.batchsize, shuffle=True) valid_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=args.batchsize, shuffle=False) num_train = len(X) del X, y, X_val, y_val, train_feat, val_feat, q1, q2, q1_val, q2_val ntokens = len(corpus) glove_embeddings = None if args.embinit == 'glove': assert args.demb in (50, 100, 200, 300) glove_embeddings = get_glove_embeddings(args.glovedata, corpus.dictionary.word2idx, ntokens, args.demb) model = LSTMModelMLPFeat(args.din, args.dhid, args.nlayers, args.dout, args.demb, n_feat, args.vocabsize, args.dropout, args.embinit, args.hidinit, args.decinit, glove_embeddings, args.cuda) if args.cuda: model.cuda() if args.reweight: w_tensor = torch.Tensor([1.309028344, 0.472001959]) if args.cuda: w_tensor = w_tensor.cuda() criterion = nn.NLLLoss(weight=w_tensor) else: criterion = nn.NLLLoss() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) model_config = '\t'.join([ str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din, args.demb, args.dhid, args.embinit, args.decinit, args.hidinit, args.dropout, args.optimizer, args.reweight, args.lr, args.vocabsize, args.batchsize, args.clean, args.rm_stops) ]) print( 'Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | Reweight | LR | VocabSize | batchsize | Clean | Stops' ) print(model_config) # best_val_acc = 0.78 best_ll = 0.3 for epoch in range(args.epochs): model.train() total_cost = 0 start_time = time.time() cur_loss = 0 for ind, (qs, duplicate) in enumerate(train_loader): model.zero_grad() pred = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(), qs[:, 0, 2, :n_feat]) if args.cuda: pred = pred.cuda() duplicate = duplicate.cuda() duplicate = Variable(duplicate) loss = criterion(pred, duplicate) loss.backward() clip_grad_norm(model.parameters(), args.clip) if optimizer: optimizer.step() else: for p in model.parameters(): p.data.add_(-args.lr, p.grad.data) total_cost += loss.data[0] cur_loss += loss.data[0] if ind % args.loginterval == 0 and ind > 0: cur_loss = loss.data[0] / args.loginterval elapsed 
= time.time() - start_time print( '| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | ' 'Loss {:.6f}'.format(epoch, ind, num_train // args.batchsize, elapsed * 1000.0 / args.loginterval, cur_loss)) start_time = time.time() cur_loss = 0 model.eval() train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat) val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat) # if args.save and (val_acc > best_val_acc): if args.save and (val_ll < best_ll): with open(args.save + '_corpus.pkl', 'wb') as corp_f: pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL) torch.save(model.cpu(), args.save) torch.save(model.cpu().state_dict(), args.save + ".state_dict") with open(args.save + ".state_dict.config", "w") as f: f.write(model_config) best_ll = val_ll if args.cuda: model.cuda() print( 'Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}' .format(epoch, total_cost, train_acc, val_acc, train_ll, val_ll)) print('-' * 89) del train_loader print('Reloading Best Model') model = torch.load(args.save) model.cuda() model.eval() print('RELOADING VALID') valid_data = pd.read_csv('../data/val_data_shuffle.csv') valid_data = valid_data.fillna(' ') q1_val = list(valid_data['question1'].map(str)) q2_val = list(valid_data['question2'].map(str)) y_val = list(valid_data['is_duplicate']) train_feat = pd.read_csv('../data/train_features_all_norm.csv') val_feat = train_feat.iloc[valid_data['id']].values if args.clean: print('Cleaning Data') stops = None if args.rm_stops: stops = stops = set(stopwords.words("english")) q1_val = [split_text(x, stops) for x in q1_val] q2_val = [split_text(x, stops) for x in q2_val] else: q1_val = [x.lower().split() for x in q1_val] q2_val = [x.lower().split() for x in q2_val] X_val = torch.Tensor(len(valid_data), 1, 3, feat_max) X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long() X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long() X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat)) y_val = torch.from_numpy(np.array(y_val)).long() if args.cuda: X_val, y_val = X_val.cuda(), y_val.cuda() valid_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=args.batchsize, shuffle=False) del X_val, y_val, train_feat, val_feat, q1_val, q2_val, valid_data print('PREDICTING VALID') pred_list = [] for ind, (qs, _) in enumerate(valid_loader): out = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(), qs[:, 0, 2, :n_feat]) pred_list += list(out.exp()[:, 1].data.cpu().numpy()) with open('../predictions/' + args.save + '_val.pkl', 'wb') as f: pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL) if args.reweight: print('LOADING TEST DATA') test_data = pd.read_csv('../data/test.csv') test_data = test_data.fillna(' ') q1 = list(test_data['question1'].map(str)) q2 = list(test_data['question2'].map(str)) q1 = [x.lower().split() for x in q1] q2 = [x.lower().split() for x in q2] print('LOADING TEST FEATURES') test_feat = pd.read_csv('../data/test_features_all_norm.csv').values n_feat = test_feat.shape[1] d_in = args.din feat_max = int(np.max([n_feat, d_in])) X = torch.Tensor(len(test_data), 1, 3, feat_max) X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long() X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long() X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(test_feat)) y = torch.LongTensor(len(test_data)).zero_() if args.cuda: X = X.cuda() y = y.cuda() test_loader = 
DataLoader(TensorDataset(X, y), batch_size=500, shuffle=False) print('PREDICTING') pred_list = [] for ind, (qs, _) in enumerate(test_loader): out = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(), qs[:, 0, 2, :n_feat]) pred_list += list(out.exp()[:, 1].data.cpu().numpy()) with open('../predictions/' + args.save + '.pkl', 'wb') as f: pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)
def main(): cmd = argparse.ArgumentParser('Key-Value by H&Q') cmd.add_argument('--data_path', help='', default='../data/') cmd.add_argument('--hidden_size', help='', type=int, default=200) cmd.add_argument('--embed_size', help='', type=int, default=200) cmd.add_argument('--batch_size', help='', type=int, default=32) cmd.add_argument('--lr', help='', type=float, default=0.001) cmd.add_argument('--lr_decay', help='', type=float, default=1.0) cmd.add_argument('--max_epoch', help='', type=int, default=200) cmd.add_argument('--seed', help='', type=int, default=1234) cmd.add_argument('--dropout', help='', type=float, default=0.8) cmd.add_argument('--bleu_path', help='', default='../bleu/') cmd.add_argument('--grad_clip', help='', type=float, default=10) cmd.add_argument('--parallel_suffix', help='', type=str, default='123') cmd.add_argument('--model_save_path', help='', type=str, default='../model') cmd.add_argument('--l2', help='', type=float, default=0.000005) cmd.add_argument('--key_flag', help='', type=str, default='True') args = cmd.parse_args(sys.argv[2:]) print(args) # 存储参数配置 json.dump( vars(args), codecs.open(os.path.join(args.model_save_path, 'config.json'), 'w', encoding='utf-8')) random.seed(args.seed) torch.manual_seed(args.seed) # 数据预处理: 把标点和词分开,没有标点的统一加上. train_dialogs, valid_dialogs, test_dialogs = data_preprocess( args.data_path) # 提取keys, triples, entities keys, triples, entities, value_to_abstract_keys = key_extraction( train_dialogs, args.data_path) # 生成词典,先将key变成下划线形式加入词典,再将对话加入词典 lang = Lang() lang, underlined_keys = generate_dict(keys, train_dialogs, lang, value_to_abstract_keys) logging.info('dict generated! dict size:{0}'.format(lang.word_size)) # 存储词典 with codecs.open(os.path.join(args.model_save_path, 'dict'), 'w', encoding='utf-8') as fs: res_dict = [] for word, idx in lang.word2idx.items(): temp = word + '\t' + str(idx) res_dict.append(temp) res_dict = '\n'.join(res_dict) fs.write(res_dict) # 生成训练数据instances train_instances = generate_instances(keys, train_dialogs, triples, value_to_abstract_keys) valid_instances = generate_instances(keys, valid_dialogs, triples, value_to_abstract_keys) test_instances = generate_instances(keys, test_dialogs, triples, value_to_abstract_keys) # valid_instances = test_instances = train_instances #logging.info('instances sample: {0}'.format(train_instances)) # Word2idx train_instances_idx = sentence_to_idx(lang, train_instances) # [([],[]),()] valid_instances_idx = sentence_to_idx(lang, valid_instances) test_instances_idx = sentence_to_idx(lang, test_instances) # valid_instances_idx = test_instances_idx = train_instances_idx # keys2idx keys_idx = key_to_idx(lang, underlined_keys) train_instances_size = len(train_instances_idx) valid_instances_size = len(valid_instances_idx) test_instances_size = len(test_instances_idx) logging.info('trainging size:{0} valid size:{1} test size:{2}'.format(train_instances_size, valid_instances_size, \ test_instances_size)) encoder = Encoder(args.embed_size, args.hidden_size, args.dropout, lang) decoder = AttnDecoder(args.embed_size, args.hidden_size, args.dropout, lang, args.key_flag) encoderdecoder = EncoderDecoder(args.embed_size, args.hidden_size, args.dropout, lang) encoder = encoder.cuda() if use_cuda else encoder decoder = decoder.cuda() if use_cuda else decoder encoderdecoder = encoderdecoder.cuda() if use_cuda else encoderdecoder encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.lr, weight_decay=args.l2) decoder_optimizer = optim.Adam(decoder.parameters(), lr=args.lr, 
weight_decay=args.l2) encoderdecoder_optimizer = optim.Adam(decoder.parameters(), lr=args.lr, weight_decay=args.l2) # train best_valid_bleu_score, best_test_bleu_score = 0, 0 best_valid_f, best_test_f = 0, 0 order = list(range(len(train_instances_idx))) for i in range(args.max_epoch): logging.info( '--------------------Round {0}---------------------'.format(i)) #random.shuffle(order) start_id = 0 count = 0 total_loss = 0 for start_id in range(0, train_instances_size, args.batch_size): end_id = start_id + args.batch_size if start_id + args.batch_size < train_instances_size else train_instances_size batch_size = end_id - start_id batch_to_be_generated = [ train_instances_idx[ids] for ids in order[start_id:end_id] ] batch_gold = [ train_instances[ids] for ids in order[start_id:end_id] ] # 对于train来说没有用 batch_input, batch_output, _, sentence_lens = generate_batch( batch_to_be_generated, batch_gold, batch_size, lang.word2idx['pad']) # train encoder.train() decoder.train() encoderdecoder.train() encoder.zero_grad() decoder.zero_grad() encoderdecoder.zero_grad() loss = encoderdecoder.forward(batch_input, batch_output, sentence_lens, keys_idx, \ encoder, decoder, lang.word2idx['pad'], args.embed_size) loss.backward() clip_grad_norm(encoder.parameters(), args.grad_clip) clip_grad_norm(decoder.parameters(), args.grad_clip) clip_grad_norm(encoderdecoder.parameters(), args.grad_clip) encoder_optimizer.step() decoder_optimizer.step() encoderdecoder_optimizer.step() total_loss += loss.data count += 1 # if (count % 100 == 0): # logging.info('average loss: {0}'.format(total_loss*1.0/count)) valid_bleu_score, valid_f = evaluate(keys_idx, encoder, decoder, encoderdecoder, valid_instances_idx, valid_instances, lang, \ args.batch_size, args.embed_size, args.hidden_size, args.bleu_path, args.parallel_suffix) # if (valid_bleu_score > best_valid_bleu_score): # test_bleu_score, test_f = evaluate(keys_idx, encoder, decoder, encoderdecoder, test_instances_idx, test_instances, lang, \ # args.batch_size, args.embed_size, args.hidden_size, args.bleu_path, args.parallel_suffix) # best_test_bleu_score = max(best_test_bleu_score, test_bleu_score) # # logging.info('New Record! test bleu score now: {0} best test bleu score ever: {1}'.format(\ # test_bleu_score, best_test_bleu_score)) if (valid_f > best_valid_f): torch.save( encoder.state_dict(), os.path.join(args.model_save_path, 'encoder' + args.parallel_suffix)) torch.save( decoder.state_dict(), os.path.join(args.model_save_path, 'decoder' + args.parallel_suffix)) torch.save( encoderdecoder.state_dict(), os.path.join(args.model_save_path, 'encoderdecoder' + args.parallel_suffix)) test_bleu_score, test_f = evaluate(keys_idx, encoder, decoder, encoderdecoder, test_instances_idx, test_instances, lang, \ args.batch_size, args.embed_size, args.hidden_size, args.bleu_path, args.parallel_suffix) best_test_f = max(best_test_f, test_f) best_test_bleu_score = max(best_test_bleu_score, test_bleu_score) logging.info('New Record! test F now: {0} best test F ever: {1} test bleu score now: {2} best test bleu score ever: {3}'.format(\ test_f, best_test_f, test_bleu_score, best_test_bleu_score)) best_valid_f = max(best_valid_f, valid_f) best_valid_bleu_score = max(best_valid_bleu_score, valid_bleu_score) logging.info('valid F: {0} best valid F ever: {1}'.format( valid_f, best_valid_f)) logging.info( 'valid bleu score: {0} best valid bleu score ever: {1}'.format( valid_bleu_score, best_valid_bleu_score)) logging.info('Trianing complete! 
best valid bleu score: {0} best test bleu score: {1} best valid F: {2} best test F: {3}'\ .format(best_valid_bleu_score, best_test_bleu_score, best_valid_f, best_test_f)) logging.info('suffix is {0}'.format(args.parallel_suffix)) print(args)
def train(self):
    info = {}

    if self.T % self.args.nec_update != 0:
        return info

    for _ in range(self.args.iters):
        # TODO: Use a named tuple for experience replay
        batch = self.replay.Sample(self.args.batch_size)
        columns = list(zip(*batch))

        states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
        actions = columns[1]
        targets = Variable(torch.FloatTensor(columns[2]))

        # Embed the states, then look each (action, key) pair up in its DND
        keys = self.embedding(states).cpu()
        model_predictions = torch.cat(
            [self.dnds[action].lookup(key.unsqueeze(0))
             for action, key in zip(actions, keys)])

        td_error = model_predictions - targets
        info["TD_Error"] = td_error.mean().data[0]

        l2_loss = (td_error).pow(2).mean()
        info["Loss"] = l2_loss.data[0]

        # Update
        self.optimizer.zero_grad()
        l2_loss.backward()

        # Taken from pytorch clip_grad_norm
        # Remove once the pip version is up to date with source
        gradient_norm = clip_grad_norm(self.embedding.parameters(), self.args.clip_value)
        if gradient_norm is not None:
            info["Norm"] = gradient_norm

        self.optimizer.step()

        if "States" in info:
            states_trained = info["States"]
            info["States"] = states_trained + columns[0]
        else:
            info["States"] = columns[0]

    return info
def forward(data_loader, model, criterion, epoch=0, training=True, optimizer=None): if args.gpus and len(args.gpus) > 1: model = torch.nn.DataParallel(model, args.gpus) batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() end = time.time() if training: optimizer.zero_grad() for i, (inputs, target) in enumerate(data_loader): # measure data loading time data_time.update(time.time() - end) if args.gpus is not None: target = target.cuda(async=True) input_var = Variable(inputs.type(args.type), volatile=not training) target_var = Variable(target) # compute output if not training: output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5)) losses.update(loss.data[0], input_var.size(0)) top1.update(prec1[0], input_var.size(0)) top5.update(prec5[0], input_var.size(0)) else: is_updating = ((i + 1) % args.batch_multiplier == 0) or (i + 1 == len(data_loader)) mini_inputs = input_var.chunk(args.batch_size // args.mini_batch_size) mini_targets = target_var.chunk(args.batch_size // args.mini_batch_size) # get the coefficent to scale noise eq_batch_num = (len(data_loader) + args.batch_multiplier - 1) // args.batch_multiplier if args.smoothing_type == 'constant': noise_coef = 1. elif args.smoothing_type == 'anneal': noise_coef = 1.0 / ( (1 + epoch * eq_batch_num + i // args.batch_multiplier)** args.anneal_index) noise_coef = noise_coef**0.5 elif args.smoothing_type == 'tanh': noise_coef = np.tanh( args.tanh_scale * ((float)(epoch * eq_batch_num + i // args.batch_multiplier) / (float)(args.epochs * eq_batch_num) - .5)) noise_coef = (noise_coef + 1.) / 2.0 else: raise ValueError('Unknown smoothing-type') if i % args.print_freq == 0: logging.info( '{phase} - Epoch: [{0}][{1}/{2}]\t' 'Noise Coefficient: {noise_coef:.4f}\t'.format( epoch, i, len(data_loader), phase='TRAINING' if training else 'EVALUATING', noise_coef=noise_coef)) for k, mini_input_var in enumerate(mini_inputs): noises = {} # randomly change current model @ each mini-mini-batch if args.sharpness_smoothing: for key, p in model.named_parameters(): if hasattr(model, 'quiet_parameters') and ( key in model.quiet_parameters): continue if args.adapt_type == 'weight': noise = ( torch.cuda.FloatTensor(p.size()).uniform_() * 2. - 1.) * args.sharpness_smoothing * torch.abs( p.data) * noise_coef elif args.adapt_type == 'filter': noise = ( torch.cuda.FloatTensor(p.size()).uniform_() * 2. - 1.) noise_shape = noise.shape noise_norms = noise.view( [noise_shape[0], -1]).norm(p=2, dim=1) + 1.0e-6 p_norms = p.view([noise_shape[0], -1]).norm(p=2, dim=1) for shape_idx in range(1, len(noise_shape)): noise_norms = noise_norms.unsqueeze(-1) p_norms = p_norms.unsqueeze(-1) noise = noise / noise_norms * p_norms.data #for idx in range(0, noise.shape[0]): # if 1 == len(noise.shape): # if np.abs(np.linalg.norm(noise[idx]))>1.0e-6: # noise[idx] = noise[idx] / np.linalg.norm(noise[idx]) * np.linalg.norm(p.data[idx]) # else: # if np.abs(noise[idx].norm())>1.0e-6: # noise[idx] = noise[idx] / noise[idx].norm() * p.data[idx].norm() noise = noise * args.sharpness_smoothing * noise_coef elif args.adapt_type == 'none': noise = ( torch.cuda.FloatTensor(p.size()).uniform_() * 2. - 1.) 
* args.sharpness_smoothing * noise_coef else: raise ValueError('Unkown --adapt-type') noises[key] = noise p.data.add_(noise) mini_target_var = mini_targets[k] output = model(mini_input_var) loss = criterion(output, mini_target_var) prec1, prec5 = accuracy(output.data, mini_target_var.data, topk=(1, 5)) losses.update(loss.data[0], mini_input_var.size(0)) top1.update(prec1[0], mini_input_var.size(0)) top5.update(prec5[0], mini_input_var.size(0)) # compute gradient and do SGD step loss.backward() # denoise @ each mini-mini-batch. if args.sharpness_smoothing: for key, p in model.named_parameters(): if key in noises: p.data.sub_(noises[key]) if is_updating: n_batches = args.batch_multiplier if (i + 1) == len(data_loader): n_batches = (i % args.batch_multiplier) + 1 for p in model.parameters(): p.grad.data.div_(len(mini_inputs) * n_batches) clip_grad_norm(model.parameters(), 5.) optimizer.step() optimizer.zero_grad() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch, i, len(data_loader), phase='TRAINING' if training else 'EVALUATING', batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5)) return {'loss': losses.avg, 'prec1': top1.avg, 'prec5': top5.avg}
def fit_model(model, loss_op, optim_op, train_gen, val_gen, epochs,
              checkpoint_path, patience):
    """ Analog to Keras fit_generator function.

    # Arguments:
        model: Model to be finetuned.
        loss_op: loss operation (BCEWithLogitsLoss or CrossEntropy for e.g.)
        optim_op: optimization operation (Adam e.g.)
        train_gen: Training data iterator (DataLoader)
        val_gen: Validation data iterator (DataLoader)
        epochs: Number of epochs.
        checkpoint_path: Filepath where weights will be checkpointed to
            during training. This file will be rewritten by the function.
        patience: Patience for callback methods.
        verbose: Verbosity flag.

    # Returns:
        Accuracy of the trained model, ONLY if 'evaluate' is set.
    """
    # Save original checkpoint
    torch.save(model.state_dict(), checkpoint_path)

    model.eval()
    best_loss = np.mean([loss_op(model(Variable(xv)).squeeze(),
                                 Variable(yv.float()).squeeze()).data.cpu().numpy()[0]
                         for xv, yv in val_gen])
    print("original val loss", best_loss)

    epoch_without_impr = 0
    for epoch in range(epochs):
        for i, data in enumerate(train_gen):
            X_train, y_train = data
            X_train = Variable(X_train, requires_grad=False)
            y_train = Variable(y_train, requires_grad=False)
            model.train()
            optim_op.zero_grad()
            output = model(X_train)
            loss = loss_op(output, y_train.float())
            loss.backward()
            clip_grad_norm(model.parameters(), 1)
            optim_op.step()

            acc = evaluate_using_acc(model, [(X_train.data, y_train.data)])
            print("== Epoch", epoch, "step", i,
                  "train loss", loss.data.cpu().numpy()[0], "train acc", acc)

        model.eval()
        acc = evaluate_using_acc(model, val_gen)
        print("val acc", acc)

        val_loss = np.mean([loss_op(model(Variable(xv)).squeeze(),
                                    Variable(yv.float()).squeeze()).data.cpu().numpy()[0]
                            for xv, yv in val_gen])
        print("val loss", val_loss)

        if best_loss is not None and val_loss >= best_loss:
            epoch_without_impr += 1
            print('No improvement over previous best loss: ', best_loss)

        # Save checkpoint
        if best_loss is None or val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print('Saving model at', checkpoint_path)

        # Early stopping
        if epoch_without_impr >= patience:
            break
def train(train_iter, dev_iter, test_iter, model, args):
    if args.cuda:
        model.cuda()

    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-8)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=)
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    if args.Adam is True:
        print("Adam Training......")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.init_weight_decay)
    elif args.SGD is True:
        print("SGD Training.......")
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    weight_decay=args.init_weight_decay,
                                    momentum=args.momentum_value)
    elif args.Adadelta is True:
        print("Adadelta Training.......")
        optimizer = torch.optim.Adadelta(model.parameters(), lr=args.lr,
                                         weight_decay=args.init_weight_decay)

    # lambda1 = lambda epoch: epoch // 30
    # lambda2 = lambda epoch: 0.99 ** epoch
    # print("lambda1 {} lambda2 {} ".format(lambda1, lambda2))
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    steps = 0
    epoch_step = 0
    model_count = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        print("\n## Epoch {} of {} ##\n".format(epoch, args.epochs))
        # scheduler.step()
        # print("now lr is {} \n".format(scheduler.get_lr()))
        print("now lr is {} \n".format(optimizer.param_groups[0].get("lr")))
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature.data.t_(), target.data.sub_(1)  # batch first, index align
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()

            optimizer.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            if args.init_clip_max_norm is not None:
                utils.clip_grad_norm(model.parameters(), max_norm=args.init_clip_max_norm)
            optimizer.step()

            steps += 1
            if steps % args.log_interval == 0:
                train_size = len(train_iter.dataset)
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = float(corrects) / batch.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        steps, train_size, loss.data[0], accuracy, corrects, batch.batch_size))

        # eval and test after every epoch
        eval(dev_iter, model, args, scheduler)
        if not os.path.isdir(args.save_dir):
            os.makedirs(args.save_dir)
        save_prefix = os.path.join(args.save_dir, 'snapshot')
        save_path = '{}_steps{}.pt'.format(save_prefix, steps)
        torch.save(model, save_path)
        print("\n", save_path, end=" ")
        test_model = torch.load(save_path)
        model_count += 1
        test_eval(test_iter, test_model, save_path, args, model_count)
    return model_count
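# A minimal, self-contained sketch (names illustrative; in the snippet above the
# actual scheduler call happens inside eval()) of how a ReduceLROnPlateau
# scheduler like the one created there is normally driven: it is stepped once
# per epoch with the monitored validation loss and lowers the learning rate
# when that loss stops improving.
from torch.optim import lr_scheduler


def make_plateau_scheduler(optimizer, patience=3, factor=0.5):
    # mode='min' means the monitored quantity (validation loss) should decrease.
    return lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                          patience=patience, factor=factor)


# Usage inside a training loop, with val_loss computed by an evaluation pass:
#   scheduler = make_plateau_scheduler(optimizer)
#   scheduler.step(val_loss)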
def train(): global_steps = 0 best_em = -1 best_f1 = -1 for iepoch in range(epochs): batch = 0 for tdata in train_loader: passage_tokens = tdata['passage_tokens'] passage_len = tdata['passage_len'] char_start_end = tdata['char_start_end'] question_tokens = tdata['question_tokens'] question_len = tdata['question_len'] ground_truth = tdata['ground_truth'] answer_tokens = tdata['answer_tokens'] answer_len = tdata['answer_len'] boundary = tdata['boundary'] passage_str = tdata['passage_str'] question_str = tdata['question_str'] answer_str = tdata['answer_str'] key = tdata['key'] classification_labels, answer_index = prepare_classify(question_tokens, answer_tokens) fw_res = net(passage=passage_tokens, question=question_tokens, answer=answer_tokens, answer_index=answer_index, decoder_inputs=ground_truth, is_classification = True, is_generation = True, is_teacher_forcing = True) match_logits = fw_res['match_logits'] match_predictions = fw_res['match_predictions'] generation_logits = fw_res['generation_logits'] generation_predictions = fw_res['generation_predictions'] classification_logits = fw_res['classification_logits'] classification_predictions = fw_res['classification_predictions'] print (classification_predictions) print (classification_labels.numpy()) print (sum(classification_labels.numpy() == classification_predictions) / len(classification_predictions)) loss_return = net.get_loss(match_logits=match_logits, match_labels=boundary, generation_logits=generation_logits, generation_labels=question_tokens, classification_logits=classification_logits, classification_labels= classification_labels, is_match = False, is_generation = False, is_classification = True, lambda_m = lambda_m, lambda_g = lambda_g, lambda_c = lambda_c) match_loss = loss_return['match_loss'] loss = loss_return['loss'] generation_loss = loss_return['generation_loss'] classification_loss = loss_return['classification_loss'] optimizer.zero_grad() loss.backward() utils.clip_grad_norm(net.parameters(), 5) optimizer.step() print (global_steps, iepoch, batch, 'match loss: ', match_loss, 'generation loss: ', generation_loss,'classification loss: ', classification_loss ) agent.append(train_match_loss, global_steps, match_loss) agent.append(train_generation_loss, global_steps, generation_loss) agent.append(train_classification_loss, global_steps, classification_loss) agent.append(train_loss, global_steps, sum(loss.cpu().data.numpy())) batch += 1 global_steps += 1 del fw_res, match_logits, match_predictions, loss, match_loss, generation_loss, loss_return ''' if global_steps % 10 == 0: match_loss, loss = check(net, tdata) net.train() if global_steps % 20 == 0: dev_loss, em, f1 = valid() agent.append(valid_match_loss, global_steps, dev_loss) agent.append(valid_match_em, global_steps, em) agent.append(valid_match_f1, global_steps, f1) print (global_steps, iepoch, batch, dev_loss, em, f1) ''' ''' if em > best_em and f1 > best_f1: save_model(net, dev_loss, em, f1, global_steps) ''' '''
def train(self):
    if self.T - self.target_sync_T > self.args.target:
        self.sync_target_network()
        self.target_sync_T = self.T

    info = {}

    for _ in range(self.args.iters):
        self.dqn.eval()

        # TODO: Use a named tuple for experience replay
        n_step_sample = 1
        if np.random.random() < self.args.n_step_mixing:
            n_step_sample = self.args.n_step
        batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma)
        columns = list(zip(*batch))

        states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
        actions = Variable(torch.LongTensor(columns[1]))
        terminal_states = Variable(torch.FloatTensor(columns[5]))
        rewards = Variable(torch.FloatTensor(columns[2]))
        # Have to clip rewards for DQN
        rewards = torch.clamp(rewards, -1, 1)
        steps = Variable(torch.FloatTensor(columns[4]))
        new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

        target_dqn_qvals = self.target_dqn(new_states).cpu()
        # Make a new variable with those values so that these are treated as constants
        target_dqn_qvals_data = Variable(target_dqn_qvals.data)

        q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
        inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
        q_value_targets = q_value_targets * torch.pow(inter, steps)
        if self.args.double:
            # Double Q Learning
            new_states_qvals = self.dqn(new_states).cpu()
            new_states_qvals_data = Variable(new_states_qvals.data)
            q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1])
        else:
            q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0]
        q_value_targets = q_value_targets + rewards

        self.dqn.train()
        if self.args.gpu:
            actions = actions.cuda()
            q_value_targets = q_value_targets.cuda()
        model_predictions = self.dqn(states).gather(1, actions.view(-1, 1))

        td_error = model_predictions - q_value_targets
        info["TD_Error"] = td_error.mean().data[0]

        # Update the priorities
        if not self.args.density_priority:
            self.replay.Update_Indices(indices, td_error.cpu().data.numpy(),
                                       no_pseudo_in_priority=self.args.count_td_priority)

        # If using prioritised we need to weight the td_error
        if self.args.prioritized and self.args.prioritized_is:
            weights_tensor = torch.from_numpy(is_weights).float()
            weights_tensor = Variable(weights_tensor)
            if self.args.gpu:
                weights_tensor = weights_tensor.cuda()
            td_error = td_error * weights_tensor

        l2_loss = (td_error).pow(2).mean()
        info["Loss"] = l2_loss.data[0]

        # Update
        self.optimizer.zero_grad()
        l2_loss.backward()

        # Taken from pytorch clip_grad_norm
        # Remove once the pip version is up to date with source
        gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
        if gradient_norm is not None:
            info["Norm"] = gradient_norm

        self.optimizer.step()

        if "States" in info:
            states_trained = info["States"]
            info["States"] = states_trained + columns[0]
        else:
            info["States"] = columns[0]

    # Pad out the states to be of size batch_size
    if len(info["States"]) < self.args.batch_size:
        old_states = info["States"]
        new_states = old_states[0] * (self.args.batch_size - len(old_states))
        info["States"] = new_states

    return info
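# A compact sketch (illustrative names, not part of the agent above) of the
# Double DQN target computed in that snippet: the online network picks the
# argmax action for the next state, and the target network evaluates it.
import torch


def double_dqn_target(rewards, gamma_pow, q_online_next, q_target_next, terminal):
    # rewards: [B]; gamma_pow: [B] (gamma ** n for n-step returns);
    # q_*_next: [B, n_actions]; terminal: [B] with 1 for terminal transitions.
    best_actions = q_online_next.max(1)[1].unsqueeze(1)           # argmax_a Q_online(s', a)
    bootstrap = q_target_next.gather(1, best_actions).squeeze(1)  # Q_target(s', argmax)
    return rewards + (1 - terminal) * gamma_pow * bootstrap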
def train(train_loader, model, act_criterion, comp_criterion, regression_criterion, optimizer, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() act_losses = AverageMeter() comp_losses = AverageMeter() reg_losses = AverageMeter() act_accuracies = AverageMeter() fg_accuracies = AverageMeter() bg_accuracies = AverageMeter() # switch to train mode model.train() end = time.time() optimizer.zero_grad() ohem_num = train_loader.dataset.fg_per_video comp_group_size = train_loader.dataset.fg_per_video + train_loader.dataset.incomplete_per_video for i, (out_frames, out_prop_len, out_prop_scaling, out_prop_type, out_prop_labels, out_prop_reg_targets, out_stage_split) \ in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) input_var = torch.autograd.Variable(out_frames) scaling_var = torch.autograd.Variable(out_prop_scaling) target_var = torch.autograd.Variable(out_prop_labels) reg_target_var = torch.autograd.Variable(out_prop_reg_targets) prop_type_var = torch.autograd.Variable(out_prop_type) # compute output activity_out, activity_target, \ completeness_out, completeness_target, \ regression_out, regression_labels, regression_target = model(input_var, scaling_var, target_var, reg_target_var, prop_type_var) act_loss = act_criterion(activity_out, activity_target) comp_loss = comp_criterion(completeness_out, completeness_target, ohem_num, comp_group_size) reg_loss = regression_criterion(regression_out, regression_labels, regression_target) loss = act_loss + comp_loss * args.comp_loss_weight + reg_loss * args.reg_loss_weight reg_losses.update(reg_loss.data[0], out_frames.size(0)) # measure mAP and record loss losses.update(loss.data[0], out_frames.size(0)) act_losses.update(act_loss.data[0], out_frames.size(0)) comp_losses.update(comp_loss.data[0], out_frames.size(0)) act_acc = accuracy(activity_out, activity_target) act_accuracies.update(act_acc[0].data[0], activity_out.size(0)) fg_acc = accuracy(activity_out.view(-1, 2, activity_out.size(1))[:, 0, :].contiguous(), activity_target.view(-1, 2)[:, 0].contiguous()) bg_acc = accuracy(activity_out.view(-1, 2, activity_out.size(1))[:, 1, :].contiguous(), activity_target.view(-1, 2)[:, 1].contiguous()) fg_accuracies.update(fg_acc[0].data[0], activity_out.size(0) // 2) bg_accuracies.update(bg_acc[0].data[0], activity_out.size(0) // 2) # compute gradient and do SGD step loss.backward() if i % args.iter_size == 0: # scale down gradients when iter size is functioning if args.iter_size != 1: for g in optimizer.param_groups: for p in g['params']: p.grad /= args.iter_size if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: print("clipping gradient: {} with coef {}".format(total_norm, args.clip_gradient / total_norm)) else: total_norm = 0 optimizer.step() optimizer.zero_grad() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Act. Loss {act_losses.val:.3f} ({act_losses.avg: .3f}) \t' 'Comp. Loss {comp_losses.val:.3f} ({comp_losses.avg: .3f}) ' .format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, act_losses=act_losses, comp_losses=comp_losses, lr=optimizer.param_groups[0]['lr'], ) + '\tReg. 
Loss {reg_loss.val:.3f} ({reg_loss.avg:.3f})'.format( reg_loss=reg_losses) + '\n Act. FG {fg_acc.val:.02f} ({fg_acc.avg:.02f}) Act. BG {bg_acc.avg:.02f} ({bg_acc.avg:.02f})' .format(act_acc=act_accuracies, fg_acc=fg_accuracies, bg_acc=bg_accuracies) )
def train(lr, net, epoch, train_loader, valid_loader, transform, hyperparameters, batch_size): # register hypercurve agent = Agent(port=5005) hyperparameters['criteria'] = 'train loss' train_loss = agent.register(hyperparameters, 'loss') hyperparameters['criteria'] = 'valid loss' valid_loss = agent.register(hyperparameters, 'loss') hyperparameters['criteria'] = 'valid bleu' valid_bleu = agent.register(hyperparameters, 'bleu') hyperparameters['criteria'] = 'train bleu' train_bleu = agent.register(hyperparameters, 'bleu') hyperparameters['criteria'] = 'teacher_forcing_ratio' hyper_tfr = agent.register(hyperparameters, 'ratio') hyperparameters['criteria'] = 'teacher_forcing_loss' valid_tf_loss = agent.register(hyperparameters, 'loss') if torch.cuda.is_available(): net.cuda() optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr = lr) net.train() best_score = -1 global_steps = 578800 best_valid_loss = 10000 for iepoch in range(epoch): batchid = 0 for (_, tdata) in enumerate(train_loader, 0): entext = tdata['entext'] enlen = tdata['enlen'] zhlabel = tdata['zhlabel'] zhgtruth = tdata['zhgtruth'] zhlen = tdata['zhlen'] enstr = tdata['enstr'] zhstr = tdata['zhstr'] teacher_forcing_ratio = 1 print ('teacher_forcing_ratio: ', teacher_forcing_ratio) decoder_outputs, ret_dict = net(entext, zhgtruth,True, teacher_forcing_ratio) loss = net.get_loss(decoder_outputs, zhlabel) optimizer.zero_grad() loss.backward() utils.clip_grad_norm(net.parameters(), 5) optimizer.step() batchid += 1 global_steps += 1 print (global_steps, iepoch, batchid, max(enlen), sum(loss.data.cpu().numpy())) agent.append(train_loss, global_steps, sum(loss.data.cpu().numpy())) agent.append(hyper_tfr, global_steps, teacher_forcing_ratio) if global_steps % 50 == 0: net.eval() decoder_outputs, ret_dict = net(entext, zhgtruth, True, teacher_forcing_ratio) length = ret_dict['length'] prediction = [0 for i in range(len(length))] tmppre = [_.squeeze().cpu().data.tolist() for _ in ret_dict['sequence']] tmppre = np.array(tmppre).transpose(1, 0) for i in range(len(tmppre)): prediction[i] = tmppre[i][:length[i]] prediction[i] = transform.i2t(prediction[i], language = 'zh') prediction[i] = re.sub(r'nuk#', '', prediction[i]) prediction[i] = re.sub(r'eos#', '', prediction[i]) tmpscore = bleuscore.score(prediction, zhstr) for i in range(5): print (prediction[i]) print (zhstr[i]) print ('-------------------\n') del decoder_outputs, ret_dict agent.append(train_bleu, global_steps, tmpscore) net.train() if global_steps % 200 == 0: print ('\n------------------------\n') net.eval() all_pre = [] all_label = [] all_loss = 0 all_en = [] bats = 0 teacher_forcing_loss = 0 for (_, vdata) in enumerate(valid_loader, 0): entext = vdata['entext'] enlen = vdata['enlen'] zhlabel = vdata['zhlabel'] zhgtruth = vdata['zhgtruth'] zhlen = vdata['zhlen'] enstr = vdata['enstr'] zhstr = vdata['zhstr'] decoder_outputs, ret_dict = net(entext, None, True, 0) length = ret_dict['length'] prediction = [0 for i in range(len(length))] tmppre = [_.squeeze().cpu().data.tolist() for _ in ret_dict['sequence']] tmppre = np.array(tmppre).transpose(1, 0) for i in range(len(tmppre)): prediction[i] = tmppre[i][:length[i]] prediction[i] = transform.i2t(prediction[i], language = 'zh') prediction[i] = re.sub(r'nuk#', '', prediction[i]) prediction[i] = re.sub(r'eos#', '', prediction[i]) loss = net.get_loss(decoder_outputs, zhlabel) all_pre.extend(prediction) all_label.extend(zhstr) all_en.extend(enstr) all_loss += sum(loss.data.cpu().numpy()) del loss, 
decoder_outputs, ret_dict # teacher forcing loss, to judge if overfit decoder_outputs, _ = net(entext, zhgtruth, True, 1) loss = net.get_loss(decoder_outputs, zhlabel) teacher_forcing_loss += sum(loss.data.cpu().numpy()) bats += 1 score = bleuscore.score(all_pre, all_label) for i in range(0, 400): print (all_en[i]) print (all_pre[i]) print (all_label[i]) print ('-------------------\n') all_loss /= bats teacher_forcing_loss /= bats print (global_steps, iepoch, batchid, all_loss, teacher_forcing_loss, score, '\n********************\n') agent.append(valid_loss, global_steps, all_loss) agent.append(valid_bleu, global_steps, score) agent.append(valid_tf_loss, global_steps, teacher_forcing_loss) if best_valid_loss > all_loss: best_valid_loss = all_loss #bestscore = score _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(teacher_forcing_ratio, all_loss, score, global_steps) torch.save(net.state_dict(), _) elif global_steps % 400 == 0: _ = model_dir + "ratio-{:3f}-loss-{:3f}-score-{:3f}-steps-{:d}-model.pkl".format(teacher_forcing_ratio, all_loss, score, global_steps) torch.save(net.state_dict(), _) del all_label, all_loss, all_pre net.train()
def train(args, word_idx, train_data, valid_data, dev_data, writer=None): # build minibatch loader train_batch_loader = minibatch_loader(train_data, args.batch_size, sample=1.0, punc=args.punc) valid_batch_loader = minibatch_loader(valid_data, args.batch_size, shuffle=False, punc=args.punc) dev_batch_loader = minibatch_loader(dev_data, args.batch_size, shuffle=False, punc=args.punc) # training phase if args.restore_model != None: logging.info("restore from previous training...") _, embed_dim = load_word2vec_embeddings(word_idx, args.embed_file, args.embed_dim, False) model = AttSum(args.n_layers, args.vocab_size, args.drop_out, args.gru_size, None, embed_dim, args.train_emb) checkpoint = torch.load(args.restore_model + '.chkpt') opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=args.init_learning_rate) model.load_state_dict(checkpoint) ''' model.load_state_dict(checkpoint['state_dict']) opt.load_state_dict(checkpoint['optimizer']) ''' else: embed_init, embed_dim = load_word2vec_embeddings( word_idx, args.embed_file, args.embed_dim, True) logging.info("embedding dim: {}".format(embed_dim)) logging.info("initialize model ...") model = AttSum(args.n_layers, args.vocab_size, args.drop_out, args.gru_size, embed_init, embed_dim, args.train_emb) opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=args.init_learning_rate) if USE_CUDA: model.cuda() logging.info("Running on cuda: {}".format(USE_CUDA)) logging.info('-' * 50) logging.info("Start training ...") best_valid_acc = best_dev_acc = 0 for epoch in range(args.epoch): ''' if epoch >= 2: for param_group in opt.param_groups: param_group['lr'] /= 2 ''' model.train() train_acc = acc = train_loss = loss = n_examples = train_examples = it = 0 start = time.time() for docs, anss, docs_mask, \ cand_position in train_batch_loader: train_examples += docs.shape[0] n_examples += docs.shape[0] docs, anss, docs_mask, \ cand_position = to_vars([docs, anss, docs_mask, \ cand_position], use_cuda=USE_CUDA) opt.zero_grad() loss_, acc_ = model(docs, anss, docs_mask, cand_position) train_loss += loss_.cpu().data.numpy()[0] loss += loss_.cpu().data.numpy()[0] train_acc += acc_.cpu().data.numpy()[0] acc += acc_.cpu().data.numpy()[0] it += 1 loss_.backward() clip_grad_norm(parameters=filter(lambda p: p.requires_grad, model.parameters()), max_norm=args.grad_clip) opt.step() if (it % args.print_every == 0): # on training spend = (time.time() - start) / 60 statement = "it: {} (max: {}), "\ .format(it, len(train_batch_loader)) statement += "train loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\ .format(loss / float(args.print_every), acc / n_examples, spend) logging.info(statement) # on valid model.eval() start = time.time() valid_loss, valid_acc = evaluate(model, valid_batch_loader, USE_CUDA, False) spend = (time.time() - start) / 60 logging.info( "Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format( valid_loss, valid_acc, spend)) if best_valid_acc < valid_acc: best_valid_acc = valid_acc logging.info( "Best valid acc: {:.3f}".format(best_valid_acc)) # on lambada dev start = time.time() dev_loss, dev_acc = evaluate(model, dev_batch_loader, USE_CUDA, True) spend = (time.time() - start) / 60 logging.info( "dev loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format( dev_loss, dev_acc, spend)) if best_dev_acc < dev_acc: best_dev_acc = dev_acc logging.info("Best dev acc: {:.3f}".format(best_dev_acc)) if args.save_mode == 'best': model_name = args.save_model + '.chkpt' torch.save(model.state_dict(), 
model_name) logging.info( ' - [Info] The checkpoint file has been updated [best].' ) if writer != None: it_w = it / args.print_every writer.add_scalar('data/train_loss', loss / float(args.print_every), it_w) writer.add_scalar('data/train_acc', acc / n_examples, it_w) writer.add_scalar('data/valid_loss', valid_loss, it_w) writer.add_scalar('data/valid_acc', valid_acc, it_w) writer.add_scalar('data/dev_loss', dev_loss, it_w) writer.add_scalar('data/dev_acc', dev_acc, it_w) model.train() start = time.time() acc = loss = n_examples = 0 logging.info( "End: train loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format( train_loss / len(train_batch_loader), train_acc / train_examples, spend)) # on valid start = time.time() model.eval() valid_loss, valid_acc = evaluate(model, valid_batch_loader, USE_CUDA, False) spend = (time.time() - start) / 60 logging.info( "End: Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format( valid_loss, valid_acc, spend)) if best_valid_acc < valid_acc: best_valid_acc = valid_acc logging.info("Best valid acc: {:.3f}".format(best_valid_acc)) # on lambada dev start = time.time() dev_loss, dev_acc = evaluate(model, dev_batch_loader, USE_CUDA, True) spend = (time.time() - start) / 60 logging.info( "End: dev loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)".format( dev_loss, dev_acc, spend)) if best_dev_acc < dev_acc: best_dev_acc = dev_acc logging.info("Best dev acc: {:.3f}".format(best_dev_acc)) #save checkpoint checkpoint = { 'state_dict': model.state_dict(), 'optimizer': opt.state_dict() } if args.save_model: if args.save_mode == 'series': model_name = args.save_model + '.chkpt' torch.save(model.state_dict(), model_name) #torch.save(checkpoint, model_name) logging.info( ' - [Info] The checkpoint file has been updated [series].') elif args.save_mode == 'all': model_name = args.save_model + '_accu_{accu:3.3f}.chkpt'.format( accu=100 * valid_acc) torch.save(checkpoint, model_name) logging.info( ' - [Info] The checkpoint file has been updated [all].') '''
def forward(data_loader, model, criterion, epoch=0, training=True, optimizer=None, U=None, V=None): if args.gpus and len(args.gpus) > 1: model = torch.nn.DataParallel(model, args.gpus) batch_time = AverageMeter() pruning_time = AverageMeter() select_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() end = time.time() masks = [torch.zeros(w.size()).cuda() for w in list(model.parameters())] for i, (inputs, target) in enumerate(data_loader): # measure data loading time data_time.update(time.time() - end) if args.gpus is not None: target = target.cuda(async=True) input_var = Variable(inputs.type(args.type), volatile=not training) target_var = Variable(target) # compute output if not training: output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5)) losses.update(loss.data[0], input_var.size(0)) top1.update(prec1[0], input_var.size(0)) top5.update(prec5[0], input_var.size(0)) else: mini_inputs = input_var.chunk(args.batch_size // args.mini_batch_size) mini_targets = target_var.chunk(args.batch_size // args.mini_batch_size) #TODO for debug shoul be delete if(0 == i): print('number of ghost batch is ', len(mini_inputs)) optimizer.zero_grad() # fjr simulate distributed senario acc_grad = [] if args.use_residue_acc: if torch.cuda.is_available(): acc_grad = [torch.zeros(w.size()).cuda() for w in list(model.parameters())] else: print("gpu is not avaiable for acc_grad allocation") for k, mini_input_var in enumerate(mini_inputs): mini_target_var = mini_targets[k] output = model(mini_input_var) loss = criterion(output, mini_target_var) prec1, prec5 = accuracy(output.data, mini_target_var.data, topk=(1, 5)) losses.update(loss.data[0], mini_input_var.size(0)) top1.update(prec1[0], mini_input_var.size(0)) top5.update(prec5[0], mini_input_var.size(0)) # compute gradient and do SGD step # fjr if args.use_residue_acc: # clear grad before accumulating optimizer.zero_grad() loss.backward() if args.use_residue_acc: if args.use_pruning: clip_grad_norm(model.parameters(), 5. 
* (len(mini_inputs) ** -0.5)) idx = 0 for u, v, p in zip(U[k], V[k], model.parameters()): prune_begin = time.time() if args.use_pruning: # TODO how to set rho (momentum) g = p.grad.data / len(mini_inputs) g += p.data * args.weight_decay / len(mini_inputs) if args.use_nesterov: u = args.momentum * (u + g) v = v + u + g else: u = args.momentum * u + g v = v + u select_begin = time.time() if args.use_sync and i % args.sync_interval == 0: masks[idx] = 1; else: if args.use_warmup: # print("iter", i, "node ", k, " pruning layer ", idx) if (epoch == 0): masks[idx] = select_top_k(v, 1 - 0.75, masks[idx]) elif (epoch == 1): masks[idx] = select_top_k(v, 1 - 0.9375, masks[idx]) elif (epoch == 2): masks[idx] = select_top_k(v, 1 - 0.984375, masks[idx]) elif (epoch == 3): masks[idx] = select_top_k(v, 1 - 0.996, masks[idx]) else: masks[idx] = select_top_k(v, 1 - 0.999, masks[idx]) else: masks[idx] = select_top_k(v, 1 - 0.999, masks[idx]) select_time.update(time.time() - select_begin) p.grad.data = v * masks[idx] v = v * (1 - masks[idx]) u = u * (1 - masks[idx]) acc_grad[idx] += p.grad.data U[k][idx] = u #new_residue V[k][idx] = v else: acc_grad[idx] += p.grad.data / len(mini_inputs) pruning_time.update(time.time() - prune_begin) idx = idx + 1 if args.use_residue_acc: # Master idx = 0 for g, p in zip(acc_grad, model.parameters()): # print("accumulated sparsity is", check_sparsity(g)) if args.use_pruning: # TODO 1. use pytorch sgd optimizer to calculate mom and weight_decay, set mom and wd # used with pruning p.grad.data = g else: # TODO 2. implement weight_decay and momentum by myself, set mom=0 and wd = 0 # used with baseline g += p.data * args.weight_decay V[k][idx] = args.momentum * V[k][idx] + g p.grad.data = V[k][idx] # clip_grad_norm(model.parameters(), 5.) idx = idx+1 else: for p in model.parameters(): p.grad.data.div_(len(mini_inputs)) #print("original grad norm before clip", p.grad.data.norm()) clip_grad_norm(model.parameters(), 5.) #print("original grad norm after clip", p.grad.data.norm()) optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Prune {pruning_time.val:.9f} ({pruning_time.avg:.3f})\t' 'Select {select_time.val:.9f} ({select_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch, i, len(data_loader), phase='TRAINING' if training else 'EVALUATING', batch_time=batch_time, data_time=data_time, pruning_time = pruning_time, select_time = select_time, loss=losses, top1=top1, top5=top5)) return {'loss': losses.avg, 'prec1': top1.avg, 'prec5': top5.avg, 'U' : U, 'V' : V}
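The forward() above combines momentum correction (the U/V buffers) with top-k gradient selection and residual feedback, so only the largest-magnitude gradient entries are applied each iteration while the remainder accumulates locally. Below is a minimal sketch of just the selection-with-residual idea; `topk_mask` and `sparsify_with_residual` are hypothetical helpers, not the repository's `select_top_k`, and the momentum correction and warm-up sparsity schedule are omitted.

import torch

def topk_mask(t, ratio):
    """0/1 mask keeping the `ratio` fraction of entries of t with the largest magnitude."""
    k = max(1, int(t.numel() * ratio))
    threshold = t.abs().view(-1).topk(k)[0][-1]
    return (t.abs() >= threshold).float()

def sparsify_with_residual(grad, residual, ratio=0.001):
    """One step of residual-accumulating top-k sparsification for a single tensor."""
    accumulated = residual + grad              # fold the previous residual into the new gradient
    mask = topk_mask(accumulated, ratio)       # keep only the largest-magnitude entries
    sparse_update = accumulated * mask         # what is actually applied / communicated
    new_residual = accumulated * (1 - mask)    # everything else carries over to the next step
    return sparse_update, new_residual

# toy usage
g, r = torch.randn(10000), torch.zeros(10000)
update, r = sparsify_with_residual(g, r, ratio=0.01)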
def train( train_loader, model, act_criterion, comp_criterion, regression_criterion, optimizer, epoch, ): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() act_losses = AverageMeter() comp_losses = AverageMeter() reg_losses = AverageMeter() act_accuracies = AverageMeter() fg_accuracies = AverageMeter() bg_accuracies = AverageMeter() # switch to train mode model.train() end = time.time() optimizer.zero_grad() ohem_num = train_loader.dataset.fg_per_video comp_group_size = ( train_loader.dataset.fg_per_video + train_loader.dataset.incomplete_per_video ) for ( i, ( out_frames, out_prop_len, out_prop_scaling, out_prop_type, out_prop_labels, out_prop_reg_targets, out_stage_split, ), ) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) input_var = torch.autograd.Variable(out_frames) scaling_var = torch.autograd.Variable(out_prop_scaling) target_var = torch.autograd.Variable(out_prop_labels) reg_target_var = torch.autograd.Variable(out_prop_reg_targets) prop_type_var = torch.autograd.Variable(out_prop_type) # compute output activity_out, activity_target, completeness_out, completeness_target, regression_out, regression_labels, regression_target = model( input_var, scaling_var, target_var, reg_target_var, prop_type_var ) act_loss = act_criterion(activity_out, activity_target) comp_loss = comp_criterion( completeness_out, completeness_target, ohem_num, comp_group_size ) reg_loss = regression_criterion( regression_out, regression_labels, regression_target ) loss = ( act_loss + comp_loss * args.comp_loss_weight + reg_loss * args.reg_loss_weight ) reg_losses.update(reg_loss.data[0], out_frames.size(0)) # measure mAP and record loss losses.update(loss.data[0], out_frames.size(0)) act_losses.update(act_loss.data[0], out_frames.size(0)) comp_losses.update(comp_loss.data[0], out_frames.size(0)) act_acc = accuracy(activity_out, activity_target) act_accuracies.update(act_acc[0].data[0], activity_out.size(0)) fg_acc = accuracy( activity_out.view(-1, 2, activity_out.size(1))[:, 0, :].contiguous(), activity_target.view(-1, 2)[:, 0].contiguous(), ) bg_acc = accuracy( activity_out.view(-1, 2, activity_out.size(1))[:, 1, :].contiguous(), activity_target.view(-1, 2)[:, 1].contiguous(), ) fg_accuracies.update(fg_acc[0].data[0], activity_out.size(0) // 2) bg_accuracies.update(bg_acc[0].data[0], activity_out.size(0) // 2) # compute gradient and do SGD step loss.backward() if i % args.iter_size == 0: # scale down gradients when iter size is functioning if args.iter_size != 1: for g in optimizer.param_groups: for p in g["params"]: p.grad /= args.iter_size if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: print( ( "clipping gradient: {} with coef {}".format( total_norm, args.clip_gradient / total_norm ) ) ) else: total_norm = 0 optimizer.step() optimizer.zero_grad() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print( ( "Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t" "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" "Data {data_time.val:.3f} ({data_time.avg:.3f})\t" "Loss {loss.val:.4f} ({loss.avg:.4f})\t" "Act. Loss {act_losses.val:.3f} ({act_losses.avg: .3f}) \t" "Comp. 
Loss {comp_losses.val:.3f} ({comp_losses.avg: .3f}) ".format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, act_losses=act_losses, comp_losses=comp_losses, lr=optimizer.param_groups[0]["lr"], ) + "\tReg. Loss {reg_loss.val:.3f} ({reg_loss.avg:.3f})".format( reg_loss=reg_losses ) + "\n Act. FG {fg_acc.val:.02f} ({fg_acc.avg:.02f}) Act. BG {bg_acc.avg:.02f} ({bg_acc.avg:.02f})".format( act_acc=act_accuracies, fg_acc=fg_accuracies, bg_acc=bg_accuracies, ) ) )
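The loop above accumulates gradients over `iter_size` iterations, divides them so the update matches a single pass over the combined batch, clips, and only then steps the optimizer. A minimal, self-contained sketch of that accumulation pattern, assuming a generic model/criterion/optimizer and using the current clip_grad_norm_ name rather than the older clip_grad_norm alias found in these snippets:

import torch
from torch.nn.utils import clip_grad_norm_

def train_with_accumulation(loader, model, criterion, optimizer, iter_size=4, clip=20.0):
    """Accumulate gradients over iter_size mini-batches before a single clipped update."""
    model.train()
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(loader, 1):
        loss = criterion(model(inputs), targets)
        loss.backward()                                # gradients keep accumulating
        if i % iter_size == 0:
            for group in optimizer.param_groups:       # rescale to match one large batch
                for p in group["params"]:
                    if p.grad is not None:
                        p.grad.div_(iter_size)
            clip_grad_norm_(model.parameters(), clip)  # returns the pre-clip total norm
            optimizer.step()
            optimizer.zero_grad()

# toy usage
model = torch.nn.Linear(8, 2)
data = [(torch.randn(16, 8), torch.randint(0, 2, (16,))) for _ in range(8)]
train_with_accumulation(data, model, torch.nn.functional.cross_entropy,
                        torch.optim.SGD(model.parameters(), lr=0.1))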
def train(args: Dict[str, str]): # LJ: source corpus and target corpus train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') # LJ: the validation set (source and target) dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') # LJ: the training and validation sentences pairs train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) # LJ: the configurations train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] # LJ: read the vocabulary vocab = pickle.load(open(args['--vocab'], 'rb')) # LJ: set up the loss function (ignore to <pad>) nll_loss = nn.NLLLoss(ignore_index=0) # LJ: build the model model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), local_att=bool(args['--local']), conv=bool(args['--conv']), vocab=vocab, loss=nll_loss) bound = float(args['--uniform-init']) for p in model.parameters(): torch.nn.init.uniform_(p.data, a=-bound, b=bound) src_embed_fn = args['--src_ebed_fn'] tgt_embed_fn = args['--tgt_ebed_fn'] #print(src_embed_fn) #print(tgt_embed_fn) if not src_embed_fn == "None": src_vectors = np.load(src_embed_fn)['embedding'] model.src_embed.weight.data = torch.from_numpy( src_vectors).float().cuda() if not tgt_embed_fn == "None": tgt_vectors = np.load(tgt_embed_fn)['embedding'] model.tgt_embed.weight.data = torch.from_numpy( tgt_vectors).float().cuda() # LJ: the learning rate lr = float(args['--lr']) # LJ: setting some initial losses, etc. num_trial = 0 train_iter = patience = cum_loss = report_loss = cumulative_tgt_words = report_tgt_words = 0 cumulative_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') # LJ: setup the optimizer # optimizer = optim.Adam(list(model.encoder.parameters())+list(model.decoder.parameters()), lr=lr) optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=lr) while True: # start the epoch epoch += 1 # LJ: ok, we yield the sentences in a shuffle manner. for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True): model.set_model_to_train() train_iter += 1 # LJ: current batch size batch_size = len(src_sents) # (batch_size) # LJ: train on the mini-batch and get the loss, backpropagation # loss = -model(src_sents, tgt_sents) optimizer.zero_grad() loss, num_words = model(src_sents, tgt_sents) loss.backward() clip_grad_norm( list(model.encoder.parameters()) + list(model.decoder.parameters()), clip_grad) optimizer.step() # add the loss to cumlinative loss report_loss += loss.detach().cpu().numpy() * num_words cum_loss += loss.detach().cpu().numpy() * num_words # LJ: how many targets words are there in all target sentences in current batch tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` # LJ: all cumulative words report_tgt_words += tgt_words_num_to_predict cumulative_tgt_words += tgt_words_num_to_predict # LJ: all number of instances handled report_examples += batch_size cumulative_examples += batch_size # LJ: print out the training loss if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. 
examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp( report_loss / report_tgt_words), cumulative_examples, report_tgt_words / ( time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # the following code performs validation on dev set, and controls the learning schedule # if the dev score is better than the last check point, then the current model is saved. # otherwise, we allow for that performance degeneration for up to `--patience` times; # if the dev score does not increase after `--patience` iterations, we reload the previously # saved best model (and the state of the optimizer), halve the learning rate and continue # training. This repeats for up to `--max-num-trial` times. if train_iter % valid_niter == 0: model.set_model_to_eval() print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cumulative_examples, np.exp(cum_loss / cumulative_tgt_words), cumulative_examples), file=sys.stderr) cum_loss = cumulative_examples = cumulative_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu # LJ: the validation is implemented in a seperate function cum_loss, dev_ppl = model.evaluate_ppl( dev_data, batch_size=128) # dev batch size can be a bit larger # valid_metric = -dev_ppl valid_metric = -cum_loss print('validation: iter %d, dev. ppl %f, val cum loss: %f' % (train_iter, dev_ppl, cum_loss), file=sys.stderr) # LJ: a new better model is found. is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # You may also save the optimizer's state, adjust the training weight, since we found there are too # much iterations without improvements. elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay learning rate, and restore from previously best checkpoint lr = lr * float(args['--lr-decay']) optimizer = optim.Adam(filter( lambda x: x.requires_grad, model.parameters()), lr=lr) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model # model.load(model_save_path) model = utils.load_model_by_state_dict( model, model_save_path) print('restore parameters of the optimizers', file=sys.stderr) # You may also need to load the state of the optimizer saved before # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) exit(0)
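The comments in the validation block above spell out the schedule: save on improvement, tolerate up to --patience bad validations, then reload the best checkpoint, decay the learning rate, and stop after --max-num-trial reloads. A condensed sketch of that control flow follows; `validation_step` and its `state` dict are hypothetical names, and the learning rate is updated in place here instead of rebuilding the Adam optimizer as the snippet does.

import torch

def validation_step(valid_metric, state, model, optimizer, save_path):
    """One round of the save / patience / LR-decay schedule described above.

    `state` holds: best, patience, num_trial, lr, max_patience, max_trial, decay.
    Returns False when training should stop early.
    """
    if state["best"] is None or valid_metric > state["best"]:
        state["best"], state["patience"] = valid_metric, 0
        torch.save(model.state_dict(), save_path)        # keep the best weights so far
        return True
    state["patience"] += 1
    if state["patience"] < state["max_patience"]:
        return True
    state["num_trial"] += 1
    if state["num_trial"] == state["max_trial"]:
        return False                                     # early stop
    state["lr"] *= state["decay"]                        # decay the learning rate
    model.load_state_dict(torch.load(save_path))         # restore the best checkpoint
    for group in optimizer.param_groups:
        group["lr"] = state["lr"]
    state["patience"] = 0                                # reset patience after the reload
    return True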
def train(train_loader, model, criterion, optimizer, epoch, log): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() if args.no_partialbn: model.module.partialBN(False) else: model.module.partialBN(True) # switch to train mode model.train() end = time.time() for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) target = target.cuda(async=True) input_var = torch.autograd.Variable(input) target_var = torch.autograd.Variable(target) # compute output output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) losses.update(loss.data[0], input.size(0)) top1.update(prec1[0], input.size(0)) top5.update(prec5[0], input.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) # if total_norm > args.clip_gradient: # print("clipping gradient: {} with coef {}".format(total_norm, args.clip_gradient / total_norm)) optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5, lr=optimizer.param_groups[-1]['lr'])) print(output) log.write(output + '\n') log.flush()
objective_loss = F.cross_entropy(input=logits_squeezed, target=Variable(torch_labels)) if objective_loss.data[0] > 5 and epoch > 10: # interested in phrases that have a large loss (i.e. are incorrectly classified) print(' '.join(tree.get_words())) loss_history.append(objective_loss.data[0]) if step % 20 == 0 and step > 0: print("step %3d, last loss %0.3f, mean loss (%d steps) %0.3f" % (step, objective_loss.data[0], average_over, np.mean(loss_history[-average_over:]))) optimizer.zero_grad() if np.isnan(objective_loss.data[0]): print("objective_loss was not a number") sys.exit(1) else: objective_loss.backward() clip_grad_norm(model.parameters(), 5, norm_type=2.) #temp_grad += model.fcl._parameters['weight'].grad.data # # Update weights using gradient descent; w1.data and w2.data are Tensors, # # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are # # Tensors. # loss.backward() # w1.data -= learning_rate * w1.grad.data # w2.data -= learning_rate * w2.grad.data optimizer.step() print("total root predicted correctly = ", total_root_prediction/ float(train_size)) print("total node (including root) predicted correctly = ", total_summed_accuracy / float(train_size)) total_dev_loss = 0. dev_correct_at_root = 0. dev_correct_all = 0. for step, dev_example in enumerate(dev_data):
def train(): logging.info('Loading vocab,train and val dataset.Wait a second,please') embed = torch.Tensor(np.load(args.embedding)['embedding']) with open(args.word2id) as f: word2id = json.load(f) vocab = utils.Vocab(embed, word2id) with open(args.train_dir) as f: examples = [json.loads(line) for line in f] train_dataset = utils.Dataset(examples) with open(args.val_dir) as f: examples = [json.loads(line) for line in f] val_dataset = utils.Dataset(examples) # update args args.embed_num = embed.size(0) args.embed_dim = embed.size(1) args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')] # build model net = getattr(models, args.model)(args, embed) if use_gpu: net.cuda() # load dataset train_iter = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True) val_iter = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False) # loss function criterion = nn.BCELoss() # model info print(net) params = sum(p.numel() for p in list(net.parameters())) / 1e6 print('#Params: %.1fM' % (params)) min_loss = float('inf') optimizer = torch.optim.Adam(net.parameters(), lr=args.lr) net.train() t1 = time() for epoch in range(1, args.epochs + 1): for i, batch in enumerate(train_iter): features, targets, _, doc_lens = vocab.make_features(batch) features, targets = Variable(features), Variable(targets.float()) if use_gpu: features = features.cuda() targets = targets.cuda() probs = net(features, doc_lens) loss = criterion(probs, targets) optimizer.zero_grad() loss.backward() clip_grad_norm(net.parameters(), args.max_norm) optimizer.step() if args.debug: print('Batch ID:%d Loss:%f' % (i, loss.data[0])) continue if i % args.report_every == 0: cur_loss = eval(net, vocab, val_iter, criterion) if cur_loss < min_loss: min_loss = cur_loss best_path = net.save() logging.info('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f' % (epoch, min_loss, cur_loss)) t2 = time() logging.info('Total Cost:%f h' % ((t2 - t1) / 3600))
def train(self): if self.T - self.target_sync_T > self.args.target: self.sync_target_network() self.target_sync_T = self.T info = {} for _ in range(self.args.iters): self.dqn.eval() batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, self.args.n_step, self.args.gamma) columns = list(zip(*batch)) states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3)) actions = Variable(torch.LongTensor(columns[1])) terminal_states = Variable(torch.FloatTensor(columns[5])) rewards = Variable(torch.FloatTensor(columns[2])) # Have to clip rewards for DQN rewards = torch.clamp(rewards, -1, 1) steps = Variable(torch.FloatTensor(columns[4])) new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3)) target_dqn_qvals = self.target_dqn(new_states).cpu() # Make a new variable with those values so that these are treated as constants target_dqn_qvals_data = Variable(target_dqn_qvals.data) q_value_gammas = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states) inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma) # print(steps) q_value_gammas = q_value_gammas * torch.pow(inter, steps) values = torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms) values = Variable(values) values = values.view(1, 1, self.args.atoms) values = values.expand(self.args.batch_size, self.args.actions, self.args.atoms) # print(values) q_value_gammas = q_value_gammas.view(self.args.batch_size, 1, 1) q_value_gammas = q_value_gammas.expand(self.args.batch_size, self.args.actions, self.args.atoms) # print(q_value_gammas) gamma_values = q_value_gammas * values # print(gamma_values) rewards = rewards.view(self.args.batch_size, 1, 1) rewards = rewards.expand(self.args.batch_size, self.args.actions, self.args.atoms) # print(rewards) operator_q_values = rewards + gamma_values # print(operator_q_values) clipped_operator_q_values = torch.clamp(operator_q_values, self.args.v_min, self.args.v_max) delta_z = (self.args.v_max - self.args.v_min) / (self.args.atoms - 1) # Using the notation from the categorical paper b_j = (clipped_operator_q_values - self.args.v_min) / delta_z # print(b_j) lower_bounds = torch.floor(b_j) upper_bounds = torch.ceil(b_j) # Work out the max action atom_values = Variable(torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms)) atom_values = atom_values.view(1, 1, self.args.atoms) atom_values = atom_values.expand(self.args.batch_size, self.args.actions, self.args.atoms) # Sum over the atoms dimension target_expected_qvalues = torch.sum(target_dqn_qvals_data * atom_values, dim=2) # Get the maximum actions index across the batch size max_actions = target_expected_qvalues.max(dim=1)[1].view(-1) # Project back onto the original support for the max actions q_value_distribution_targets = torch.zeros(self.args.batch_size, self.args.atoms) # Distributions for the max actions # print(target_dqn_qvals_data, max_actions) q_value_max_actions_distribs = target_dqn_qvals_data.index_select(dim=1, index=max_actions)[:,0,:] # print(q_value_max_actions_distribs) # Lower_bounds_actions lower_bounds_actions = lower_bounds.index_select(dim=1, index=max_actions)[:,0,:] upper_bounds_actions = upper_bounds.index_select(dim=1, index=max_actions)[:,0,:] b_j_actions = b_j.index_select(dim=1, index=max_actions)[:,0,:] lower_bound_values_to_add = q_value_max_actions_distribs * (upper_bounds_actions - b_j_actions) upper_bound_values_to_add = q_value_max_actions_distribs * (b_j_actions - lower_bounds_actions) # 
print(lower_bounds_actions) # print(lower_bound_values_to_add) # Naive looping for b in range(self.args.batch_size): for l, pj in zip(lower_bounds_actions.data.type(torch.LongTensor)[b], lower_bound_values_to_add[b].data): q_value_distribution_targets[b][l] += pj for u, pj in zip(upper_bounds_actions.data.type(torch.LongTensor)[b], upper_bound_values_to_add[b].data): q_value_distribution_targets[b][u] += pj self.dqn.train() if self.args.gpu: actions = actions.cuda() # q_value_targets = q_value_targets.cuda() q_value_distribution_targets = q_value_distribution_targets.cuda() model_predictions = self.dqn(states).index_select(1, actions.view(-1))[:,0,:] q_value_distribution_targets = Variable(q_value_distribution_targets) # print(q_value_distribution_targets) # print(model_predictions) # Cross entropy loss ce_loss = -torch.sum(q_value_distribution_targets * torch.log(model_predictions), dim=1) ce_batch_loss = ce_loss.mean() info = {} self.log("DQN/X_Entropy_Loss", ce_batch_loss.data[0], step=self.T) # Update self.optimizer.zero_grad() ce_batch_loss.backward() # Taken from pytorch clip_grad_norm # Remove once the pip version it up to date with source gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value) if gradient_norm is not None: info["Norm"] = gradient_norm self.optimizer.step() if "States" in info: states_trained = info["States"] info["States"] = states_trained + columns[0] else: info["States"] = columns[0] # Pad out the states to be of size batch_size if len(info["States"]) < self.args.batch_size: old_states = info["States"] new_states = old_states[0] * (self.args.batch_size - len(old_states)) info["States"] = new_states return info
def train(model_str, embeddings, train_iter, val_iter=None, context_size=None, early_stopping=False, save=False, save_path=None, model_params={}, opt_params={}, train_params={}, cuda=CUDA_DEFAULT, reshuffle_train=False, TEXT=None): # Initialize model and other variables train_iter_, val_iter_, model, criterion, optimizer, scheduler = _train_initialize_variables(model_str, embeddings, model_params, train_iter, val_iter, opt_params, cuda) # First validation round before any training if val_iter_ is not None: model.eval() print("Model initialized") val_loss = predict(model, val_iter_, context_size=context_size, save_loss=False, expt_name="dummy_expt", cuda=cuda) model.train() if scheduler is not None: scheduler.step(val_loss) print("All set. Actual Training begins") for epoch in range(train_params.get('n_ep', 30)): # Monitoring loss total_loss = 0 count = 0 # if using NNLM, reshuffle sentences if model_str == 'NNLM' and reshuffle_train: train_iter_, _, _ = rebuild_iterators(TEXT, batch_size=int(model_params['batch_size'])) # Initialize hidden layer and memory(for LSTM). Converting to variable later. if model_str in recur_models: model.hidden = model.init_hidden() # Actual training loop. for x_train, y_train in data_generator(train_iter_, model_str, context_size=context_size, cuda=cuda): optimizer.zero_grad() if model_str in recur_models: output, model_hidden = model(x_train) if model.model_str == 'LSTM': model.hidden = model.hidden[0].detach(), model.hidden[1].detach() # to break the computational graph epxlictly (backprop through `bptt_steps` steps only) else: model.hidden = model.hidden.detach() # to break the computational graph epxlictly (backprop through `bptt_steps` steps only) else: output = model(x_train) # Dimension matching to cut it right for loss function. if model_str in recur_models: batch_size, sent_length = y_train.size(0), y_train.size(1) loss = criterion(output.view(batch_size, -1, sent_length), y_train) else: loss = criterion(output, y_train) # backprop loss.backward() # Clip gradients to prevent exploding gradients in RNN/LSTM/GRU if model_str in recur_models: clip_grad_norm(model.parameters(), model_params.get("clip_grad_norm", 5)) optimizer.step() # monitoring count += x_train.size(0) if model.model_str == 'NNLM2' else x_train.size(0) * x_train.size(1) # in that case there are batch_size x bbp_length classifications per batch total_loss += t.sum(loss.data) # .data to break so that you dont keep references # monitoring avg_loss = total_loss / count print("Average loss after %d epochs is %.4f" % (epoch, avg_loss)) if val_iter_ is not None: model.eval() former_val_loss = val_loss * 1. val_loss = predict(model, val_iter_, context_size=context_size, save_loss=False, expt_name="dummy_expt", cuda=cuda) if scheduler is not None: scheduler.step(val_loss) if val_loss > former_val_loss: if early_stopping: break else: if save: assert save_path is not None # weights save_model(model, save_path + '.pytorch') # params with open(save_path + '.params.json', 'w') as fp: json.dump(model.params, fp) # loss with open(save_path + '.losses.txt', 'w') as fp: fp.write('val: ' + str(val_loss)) fp.write('train: ' + str(avg_loss)) model.train() return model
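The recurrent branch above detaches the hidden state after each batch so backpropagation only reaches through the current chunk (truncated BPTT). A minimal sketch of that detach pattern with a plain nn.LSTM, independent of the model classes and helpers used in the snippet and written against the current clip_grad_norm_ API:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=16, hidden_size=32, batch_first=True)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(lstm.parameters(), lr=0.1)

hidden = None
for chunk in range(5):                              # pretend each iteration is one bptt-length chunk
    x = torch.randn(4, 10, 16)                      # (batch, seq_len, features)
    target = torch.randn(4, 10, 32)
    out, hidden = lstm(x, hidden)
    loss = criterion(out, target)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(lstm.parameters(), 5.0)
    optimizer.step()
    # detach so the next backward() stops at this chunk boundary
    hidden = tuple(h.detach() for h in hidden)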
def step(self): """Compute gradients norm.""" if self.max_grad_norm: clip_grad_norm(self.params, self.max_grad_norm) self.optimizer.step()
def main(): args_parser = argparse.ArgumentParser( description='Tuning with stack pointer parser') args_parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU', 'FastLSTM'], help='architecture of rnn', required=True) args_parser.add_argument('--num_epochs', type=int, default=200, help='Number of training epochs') args_parser.add_argument('--batch_size', type=int, default=64, help='Number of sentences in each batch') args_parser.add_argument('--hidden_size', type=int, default=256, help='Number of hidden units in RNN') args_parser.add_argument('--arc_space', type=int, default=128, help='Dimension of tag space') args_parser.add_argument('--type_space', type=int, default=128, help='Dimension of tag space') args_parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN') args_parser.add_argument('--num_filters', type=int, default=50, help='Number of filters in CNN') args_parser.add_argument('--pos_dim', type=int, default=50, help='Dimension of POS embeddings') args_parser.add_argument('--char_dim', type=int, default=50, help='Dimension of Character embeddings') args_parser.add_argument('--opt', choices=['adam', 'sgd', 'adadelta'], help='optimization algorithm') args_parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate') args_parser.add_argument('--decay_rate', type=float, default=0.5, help='Decay rate of learning rate') args_parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping') args_parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization') args_parser.add_argument('--coverage', type=float, default=0.0, help='weight for coverage loss') args_parser.add_argument('--p_rnn', nargs=2, type=float, required=True, help='dropout rate for RNN') args_parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input embeddings') args_parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer') args_parser.add_argument( '--prior_order', choices=['inside_out', 'left2right', 'deep_first', 'shallow_first'], help='prior order of children.', required=True) args_parser.add_argument('--schedule', type=int, help='schedule for learning rate decay') args_parser.add_argument( '--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK') args_parser.add_argument('--punctuation', nargs='+', type=str, help='List of punctuations') args_parser.add_argument('--beam', type=int, default=1, help='Beam size for decoding') args_parser.add_argument('--word_embedding', choices=['glove', 'senna', 'sskip', 'polyglot'], help='Embedding for words', required=True) args_parser.add_argument('--word_path', help='path for word embedding dict') args_parser.add_argument('--char_embedding', choices=['random', 'polyglot'], help='Embedding for characters', required=True) args_parser.add_argument('--char_path', help='path for character embedding dict') args_parser.add_argument( '--train') # "data/POS-penn/wsj/split1/wsj1.train.original" args_parser.add_argument( '--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" args_parser.add_argument( '--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args_parser.add_argument('--model_path', help='path for saving model file.', required=True) args_parser.add_argument('--model_name', help='name for saving model file.', required=True) args = args_parser.parse_args() logger = get_logger("PtrParser") mode = args.mode train_path = args.train dev_path = args.dev test_path = args.test model_path = 
args.model_path model_name = args.model_name num_epochs = args.num_epochs batch_size = args.batch_size hidden_size = args.hidden_size arc_space = args.arc_space type_space = args.type_space num_layers = args.num_layers num_filters = args.num_filters learning_rate = args.learning_rate opt = args.opt momentum = 0.9 betas = (0.9, 0.9) rho = 0.9 eps = 1e-6 decay_rate = args.decay_rate clip = args.clip gamma = args.gamma cov = args.coverage schedule = args.schedule p_rnn = tuple(args.p_rnn) p_in = args.p_in p_out = args.p_out unk_replace = args.unk_replace prior_order = args.prior_order beam = args.beam punctuation = args.punctuation word_embedding = args.word_embedding word_path = args.word_path char_embedding = args.char_embedding char_path = args.char_path pos_dim = args.pos_dim word_dict, word_dim = utils.load_embedding_dict(word_embedding, word_path) char_dict = None char_dim = args.char_dim if char_embedding != 'random': char_dict, char_dim = utils.load_embedding_dict( char_embedding, char_path) logger.info("Creating Alphabets") alphabet_path = os.path.join(model_path, 'alphabets/') model_name = os.path.join(model_path, model_name) word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_stacked_data.create_alphabets( alphabet_path, train_path, data_paths=[dev_path, test_path], max_vocabulary_size=50000, embedd_dict=word_dict) num_words = word_alphabet.size() num_chars = char_alphabet.size() num_pos = pos_alphabet.size() num_types = type_alphabet.size() logger.info("Word Alphabet Size: %d" % num_words) logger.info("Character Alphabet Size: %d" % num_chars) logger.info("POS Alphabet Size: %d" % num_pos) logger.info("Type Alphabet Size: %d" % num_types) logger.info("Reading Data") use_gpu = torch.cuda.is_available() data_train = conllx_stacked_data.read_stacked_data_to_variable( train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet, use_gpu=use_gpu, prior_order=prior_order) num_data = sum(data_train[1]) data_dev = conllx_stacked_data.read_stacked_data_to_variable( dev_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet, use_gpu=use_gpu, volatile=True, prior_order=prior_order) data_test = conllx_stacked_data.read_stacked_data_to_variable( test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet, use_gpu=use_gpu, volatile=True, prior_order=prior_order) punct_set = None if punctuation is not None: punct_set = set(punctuation) logger.info("punctuations(%d): %s" % (len(punct_set), ' '.join(punct_set))) def construct_word_embedding_table(): scale = np.sqrt(3.0 / word_dim) table = np.empty([word_alphabet.size(), word_dim], dtype=np.float32) table[conllx_stacked_data.UNK_ID, :] = np.random.uniform( -scale, scale, [1, word_dim]).astype(np.float32) oov = 0 for word, index in word_alphabet.items(): if word in word_dict: embedding = word_dict[word] elif word.lower() in word_dict: embedding = word_dict[word.lower()] else: embedding = np.random.uniform(-scale, scale, [1, word_dim]).astype(np.float32) oov += 1 table[index, :] = embedding print('word OOV: %d' % oov) return torch.from_numpy(table) def construct_char_embedding_table(): if char_dict is None: return None scale = np.sqrt(3.0 / char_dim) table = np.empty([num_chars, char_dim], dtype=np.float32) table[conllx_stacked_data.UNK_ID, :] = np.random.uniform( -scale, scale, [1, char_dim]).astype(np.float32) oov = 0 for char, index, in char_alphabet.items(): if char in char_dict: embedding = char_dict[char] else: embedding = np.random.uniform(-scale, scale, [1, char_dim]).astype(np.float32) oov += 1 
table[index, :] = embedding print('character OOV: %d' % oov) return torch.from_numpy(table) word_table = construct_word_embedding_table() char_table = construct_char_embedding_table() window = 3 network = StackPtrNet(word_dim, num_words, char_dim, num_chars, pos_dim, num_pos, num_filters, window, mode, hidden_size, num_layers, num_types, arc_space, type_space, embedd_word=word_table, embedd_char=char_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, biaffine=True, prior_order=prior_order) if use_gpu: network.cuda() pred_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet) gold_writer = CoNLLXWriter(word_alphabet, char_alphabet, pos_alphabet, type_alphabet) def generate_optimizer(opt, lr, params): if opt == 'adam': return Adam(params, lr=lr, betas=betas, weight_decay=gamma, eps=eps) elif opt == 'sgd': return SGD(params, lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) elif opt == 'adadelta': return Adadelta(params, lr=lr, rho=rho, weight_decay=gamma, eps=eps) else: raise ValueError('Unknown optimization algorithm: %s' % opt) lr = learning_rate optim = generate_optimizer(opt, lr, network.parameters()) opt_info = 'opt: %s, ' % opt if opt == 'adam': opt_info += 'betas=%s, eps=%.1e' % (betas, eps) elif opt == 'sgd': opt_info += 'momentum=%.2f' % momentum elif opt == 'adadelta': opt_info += 'rho=%.2f, eps=%.1e' % (rho, eps) logger.info("Embedding dim: word=%d, char=%d, pos=%d" % (word_dim, char_dim, pos_dim)) logger.info( "Network: %s, num_layer=%d, hidden=%d, filter=%d, arc_space=%d, type_space=%d" % (mode, num_layers, hidden_size, num_filters, arc_space, type_space)) logger.info( "train: cov: %.1f, (#data: %d, batch: %d, clip: %.2f, dropout(in, out, rnn): (%.2f, %.2f, %s), unk_repl: %.2f)" % (cov, num_data, batch_size, clip, p_in, p_out, p_rnn, unk_replace)) logger.info('prior order: %s, beam: %d' % (prior_order, beam)) logger.info(opt_info) num_batches = num_data / batch_size + 1 dev_ucorrect = 0.0 dev_lcorrect = 0.0 dev_ucomlpete_match = 0.0 dev_lcomplete_match = 0.0 dev_ucorrect_nopunc = 0.0 dev_lcorrect_nopunc = 0.0 dev_ucomlpete_match_nopunc = 0.0 dev_lcomplete_match_nopunc = 0.0 dev_root_correct = 0.0 best_epoch = 0 test_ucorrect = 0.0 test_lcorrect = 0.0 test_ucomlpete_match = 0.0 test_lcomplete_match = 0.0 test_ucorrect_nopunc = 0.0 test_lcorrect_nopunc = 0.0 test_ucomlpete_match_nopunc = 0.0 test_lcomplete_match_nopunc = 0.0 test_root_correct = 0.0 test_total = 0 test_total_nopunc = 0 test_total_inst = 0 test_total_root = 0 patient = 0 for epoch in range(1, num_epochs + 1): print( 'Epoch %d (%s, optim: %s, learning rate=%.6f, decay rate=%.2f (schedule=%d, patient=%d)): ' % (epoch, mode, opt, lr, decay_rate, schedule, patient)) train_err_arc_leaf = 0. train_err_arc_non_leaf = 0. train_err_type_leaf = 0. train_err_type_non_leaf = 0. train_err_cov = 0. train_total_leaf = 0. train_total_non_leaf = 0. 
start_time = time.time() num_back = 0 network.train() for batch in range(1, num_batches + 1): input_encoder, input_decoder = conllx_stacked_data.get_batch_stacked_variable( data_train, batch_size, unk_replace=unk_replace) word, char, pos, heads, types, masks_e, lengths_e = input_encoder stacked_heads, children, stacked_types, masks_d, lengths_d = input_decoder optim.zero_grad() loss_arc_leaf, loss_arc_non_leaf, \ loss_type_leaf, loss_type_non_leaf, \ loss_cov, num_leaf, num_non_leaf = network.loss(word, char, pos, stacked_heads, children, stacked_types, mask_e=masks_e, length_e=lengths_e, mask_d=masks_d, length_d=lengths_d) loss_arc = loss_arc_leaf + loss_arc_non_leaf loss_type = loss_type_leaf + loss_type_non_leaf loss = loss_arc + loss_type + cov * loss_cov loss.backward() clip_grad_norm(network.parameters(), clip) optim.step() num_leaf = num_leaf.data[0] num_non_leaf = num_non_leaf.data[0] train_err_arc_leaf += loss_arc_leaf.data[0] * num_leaf train_err_arc_non_leaf += loss_arc_non_leaf.data[0] * num_non_leaf train_err_type_leaf += loss_type_leaf.data[0] * num_leaf train_err_type_non_leaf += loss_type_non_leaf.data[0] * num_non_leaf train_err_cov += loss_cov.data[0] * (num_leaf + num_non_leaf) train_total_leaf += num_leaf train_total_non_leaf += num_non_leaf time_ave = (time.time() - start_time) / batch time_left = (num_batches - batch) * time_ave # update log if batch % 10 == 0: sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) err_arc_leaf = train_err_arc_leaf / train_total_leaf err_arc_non_leaf = train_err_arc_non_leaf / train_total_non_leaf err_arc = err_arc_leaf + err_arc_non_leaf err_type_leaf = train_err_type_leaf / train_total_leaf err_type_non_leaf = train_err_type_non_leaf / train_total_non_leaf err_type = err_type_leaf + err_type_non_leaf err_cov = train_err_cov / (train_total_leaf + train_total_non_leaf) err = err_arc + err_type + cov * err_cov log_info = 'train: %d/%d loss (leaf, non_leaf): %.4f, arc: %.4f (%.4f, %.4f), type: %.4f (%.4f, %.4f), coverage: %.4f, time left (estimated): %.2fs' % ( batch, num_batches, err, err_arc, err_arc_leaf, err_arc_non_leaf, err_type, err_type_leaf, err_type_non_leaf, err_cov, time_left) sys.stdout.write(log_info) sys.stdout.flush() num_back = len(log_info) sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) err_arc_leaf = train_err_arc_leaf / train_total_leaf err_arc_non_leaf = train_err_arc_non_leaf / train_total_non_leaf err_arc = err_arc_leaf + err_arc_non_leaf err_type_leaf = train_err_type_leaf / train_total_leaf err_type_non_leaf = train_err_type_non_leaf / train_total_non_leaf err_type = err_type_leaf + err_type_non_leaf err_cov = train_err_cov / (train_total_leaf + train_total_non_leaf) err = err_arc + err_type + cov * err_cov print( 'train: %d loss (leaf, non_leaf): %.4f, arc: %.4f (%.4f, %.4f), type: %.4f (%.4f, %.4f), coverage: %.4f, time: %.2fs' % (num_batches, err, err_arc, err_arc_leaf, err_arc_non_leaf, err_type, err_type_leaf, err_type_non_leaf, err_cov, time.time() - start_time)) # evaluate performance on dev data network.eval() pred_filename = 'tmp/%spred_dev%d' % (str(uid), epoch) pred_writer.start(pred_filename) gold_filename = 'tmp/%sgold_dev%d' % (str(uid), epoch) gold_writer.start(gold_filename) dev_ucorr = 0.0 dev_lcorr = 0.0 dev_total = 0 dev_ucomlpete = 0.0 dev_lcomplete = 0.0 dev_ucorr_nopunc = 0.0 dev_lcorr_nopunc = 0.0 dev_total_nopunc = 0 dev_ucomlpete_nopunc = 0.0 dev_lcomplete_nopunc = 0.0 dev_root_corr = 0.0 
dev_total_root = 0.0 dev_total_inst = 0.0 for batch in conllx_stacked_data.iterate_batch_stacked_variable( data_dev, batch_size): input_encoder, _ = batch word, char, pos, heads, types, masks, lengths = input_encoder heads_pred, types_pred, _, _ = network.decode( word, char, pos, mask=masks, length=lengths, beam=beam, leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS) word = word.data.cpu().numpy() pos = pos.data.cpu().numpy() lengths = lengths.cpu().numpy() heads = heads.data.cpu().numpy() types = types.data.cpu().numpy() pred_writer.write(word, pos, heads_pred, types_pred, lengths, symbolic_root=True) gold_writer.write(word, pos, heads, types, lengths, symbolic_root=True) stats, stats_nopunc, stats_root, num_inst = parser.eval( word, pos, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths, punct_set=punct_set, symbolic_root=True) ucorr, lcorr, total, ucm, lcm = stats ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc corr_root, total_root = stats_root dev_ucorr += ucorr dev_lcorr += lcorr dev_total += total dev_ucomlpete += ucm dev_lcomplete += lcm dev_ucorr_nopunc += ucorr_nopunc dev_lcorr_nopunc += lcorr_nopunc dev_total_nopunc += total_nopunc dev_ucomlpete_nopunc += ucm_nopunc dev_lcomplete_nopunc += lcm_nopunc dev_root_corr += corr_root dev_total_root += total_root dev_total_inst += num_inst pred_writer.close() gold_writer.close() print( 'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total, dev_lcorr * 100 / dev_total, dev_ucomlpete * 100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst)) print( 'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc, dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc * 100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 / dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst)) print('Root: corr: %d, total: %d, acc: %.2f%%' % (dev_root_corr, dev_total_root, dev_root_corr * 100 / dev_total_root)) if dev_ucorrect_nopunc <= dev_ucorr_nopunc: dev_ucorrect_nopunc = dev_ucorr_nopunc dev_lcorrect_nopunc = dev_lcorr_nopunc dev_ucomlpete_match_nopunc = dev_ucomlpete_nopunc dev_lcomplete_match_nopunc = dev_lcomplete_nopunc dev_ucorrect = dev_ucorr dev_lcorrect = dev_lcorr dev_ucomlpete_match = dev_ucomlpete dev_lcomplete_match = dev_lcomplete dev_root_correct = dev_root_corr best_epoch = epoch patient = 0 torch.save(network, model_name) pred_filename = 'tmp/%spred_test%d' % (str(uid), epoch) pred_writer.start(pred_filename) gold_filename = 'tmp/%sgold_test%d' % (str(uid), epoch) gold_writer.start(gold_filename) test_ucorrect = 0.0 test_lcorrect = 0.0 test_ucomlpete_match = 0.0 test_lcomplete_match = 0.0 test_total = 0 test_ucorrect_nopunc = 0.0 test_lcorrect_nopunc = 0.0 test_ucomlpete_match_nopunc = 0.0 test_lcomplete_match_nopunc = 0.0 test_total_nopunc = 0 test_total_inst = 0 test_root_correct = 0.0 test_total_root = 0 for batch in conllx_stacked_data.iterate_batch_stacked_variable( data_test, batch_size): input_encoder, _ = batch word, char, pos, heads, types, masks, lengths = input_encoder heads_pred, types_pred, _, _ = network.decode( word, char, pos, mask=masks, length=lengths, beam=beam, leading_symbolic=conllx_stacked_data.NUM_SYMBOLIC_TAGS) word = word.data.cpu().numpy() pos = pos.data.cpu().numpy() lengths = lengths.cpu().numpy() heads = heads.data.cpu().numpy() types = 
types.data.cpu().numpy() pred_writer.write(word, pos, heads_pred, types_pred, lengths, symbolic_root=True) gold_writer.write(word, pos, heads, types, lengths, symbolic_root=True) stats, stats_nopunc, stats_root, num_inst = parser.eval( word, pos, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths, punct_set=punct_set, symbolic_root=True) ucorr, lcorr, total, ucm, lcm = stats ucorr_nopunc, lcorr_nopunc, total_nopunc, ucm_nopunc, lcm_nopunc = stats_nopunc corr_root, total_root = stats_root test_ucorrect += ucorr test_lcorrect += lcorr test_total += total test_ucomlpete_match += ucm test_lcomplete_match += lcm test_ucorrect_nopunc += ucorr_nopunc test_lcorrect_nopunc += lcorr_nopunc test_total_nopunc += total_nopunc test_ucomlpete_match_nopunc += ucm_nopunc test_lcomplete_match_nopunc += lcm_nopunc test_root_correct += corr_root test_total_root += total_root test_total_inst += num_inst pred_writer.close() gold_writer.close() else: if patient < schedule: patient += 1 else: network = torch.load(model_name) lr = lr * decay_rate optim = generate_optimizer(opt, lr, network.parameters()) patient = 0 print( '----------------------------------------------------------------------------------------------------------------------------' ) print( 'best dev W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)' % (dev_ucorrect, dev_lcorrect, dev_total, dev_ucorrect * 100 / dev_total, dev_lcorrect * 100 / dev_total, dev_ucomlpete_match * 100 / dev_total_inst, dev_lcomplete_match * 100 / dev_total_inst, best_epoch)) print( 'best dev Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)' % (dev_ucorrect_nopunc, dev_lcorrect_nopunc, dev_total_nopunc, dev_ucorrect_nopunc * 100 / dev_total_nopunc, dev_lcorrect_nopunc * 100 / dev_total_nopunc, dev_ucomlpete_match_nopunc * 100 / dev_total_inst, dev_lcomplete_match_nopunc * 100 / dev_total_inst, best_epoch)) print('best dev Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' % (dev_root_correct, dev_total_root, dev_root_correct * 100 / dev_total_root, best_epoch)) print( '----------------------------------------------------------------------------------------------------------------------------' ) print( 'best test W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)' % (test_ucorrect, test_lcorrect, test_total, test_ucorrect * 100 / test_total, test_lcorrect * 100 / test_total, test_ucomlpete_match * 100 / test_total_inst, test_lcomplete_match * 100 / test_total_inst, best_epoch)) print( 'best test Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%% (epoch: %d)' % (test_ucorrect_nopunc, test_lcorrect_nopunc, test_total_nopunc, test_ucorrect_nopunc * 100 / test_total_nopunc, test_lcorrect_nopunc * 100 / test_total_nopunc, test_ucomlpete_match_nopunc * 100 / test_total_inst, test_lcomplete_match_nopunc * 100 / test_total_inst, best_epoch)) print('best test Root: corr: %d, total: %d, acc: %.2f%% (epoch: %d)' % (test_root_correct, test_total_root, test_root_correct * 100 / test_total_root, best_epoch)) print( '============================================================================================================================' )
def train(self): if self.T - self.target_sync_T > self.args.target: self.sync_target_network() self.target_sync_T = self.T info = {} for _ in range(self.args.iters): self.dqn.eval() # TODO: Use a named tuple for experience replay n_step_sample = self.args.n_step batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma) columns = list(zip(*batch)) states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3)) actions = Variable(torch.LongTensor(columns[1])) terminal_states = Variable(torch.FloatTensor(columns[5])) rewards = Variable(torch.FloatTensor(columns[2])) # Have to clip rewards for DQN rewards = torch.clamp(rewards, -1, 1) steps = Variable(torch.FloatTensor(columns[4])) new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3)) target_dqn_qvals = self.target_dqn(new_states).cpu() # Make a new variable with those values so that these are treated as constants target_dqn_qvals_data = Variable(target_dqn_qvals.data) q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states) inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma) # print(steps) q_value_targets = q_value_targets * torch.pow(inter, steps) if self.args.double: # Double Q Learning new_states_qvals = self.dqn(new_states).cpu() new_states_qvals_data = Variable(new_states_qvals.data) q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1]) else: q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0] q_value_targets = q_value_targets + rewards self.dqn.train() one_hot_actions = torch.zeros(self.args.batch_size, self.args.actions) for i in range(self.args.batch_size): one_hot_actions[i][actions[i].data] = 1 if self.args.gpu: actions = actions.cuda() one_hot_actions = one_hot_actions.cuda() q_value_targets = q_value_targets.cuda() new_states = new_states.cuda() model_predictions_q_vals, model_predictions_state = self.dqn(states, Variable(one_hot_actions)) model_predictions = model_predictions_q_vals.gather(1, actions.view(-1, 1)) # info = {} td_error = model_predictions - q_value_targets info["TD_Error"] = td_error.mean().data[0] # Update the priorities if not self.args.density_priority: self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority) # If using prioritised we need to weight the td_error if self.args.prioritized and self.args.prioritized_is: # print(td_error) weights_tensor = torch.from_numpy(is_weights).float() weights_tensor = Variable(weights_tensor) if self.args.gpu: weights_tensor = weights_tensor.cuda() # print(weights_tensor) td_error = td_error * weights_tensor # Model 1 step state transition error # Save them every x steps if self.T % self.args.model_save_image == 0: os.makedirs("{}/transition_model/{}".format(self.args.log_path, self.T)) for ii, image, action, next_state, current_state in zip(range(self.args.batch_size), model_predictions_state.cpu().data, actions.data, new_states.cpu().data, states.cpu().data): image = image.numpy()[0] image = np.clip(image, 0, 1) # print(next_state) next_state = next_state.numpy()[0] current_state = current_state.numpy()[0] black_bars = np.zeros_like(next_state[:1, :]) # print(black_bars.shape) joined_image = np.concatenate((current_state, black_bars, image, black_bars, next_state), axis=0) joined_image = np.transpose(joined_image) self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 
1, action), joined_image * 255) # self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), image * 255) # self.log_image("{}/transition_model/{}/{}_____Correct".format(self.args.log_path, self.T, ii + 1), next_state * 255) # print(model_predictions_state) # Cross Entropy Loss # TODO # Regresssion loss state_error = model_predictions_state - new_states # state_error_val = state_error.mean().data[0] info["State_Error"] = state_error.mean().data[0] self.log("DQN/State_Loss", state_error.mean().data[0], step=self.T) self.log("DQN/State_Loss_Squared", state_error.pow(2).mean().data[0], step=self.T) self.log("DQN/State_Loss_Max", state_error.abs().max().data[0], step=self.T) # self.log("DQN/Action_Matrix_Norm", self.dqn.action_matrix.weight.norm().cpu().data[0], step=self.T) combined_loss = (1 - self.args.model_loss) * td_error.pow(2).mean() + (self.args.model_loss) * state_error.pow(2).mean() l2_loss = combined_loss # l2_loss = (combined_loss).pow(2).mean() info["Loss"] = l2_loss.data[0] # Update self.optimizer.zero_grad() l2_loss.backward() # Taken from pytorch clip_grad_norm # Remove once the pip version it up to date with source gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value) if gradient_norm is not None: info["Norm"] = gradient_norm self.optimizer.step() if "States" in info: states_trained = info["States"] info["States"] = states_trained + columns[0] else: info["States"] = columns[0] # Pad out the states to be of size batch_size if len(info["States"]) < self.args.batch_size: old_states = info["States"] new_states = old_states[0] * (self.args.batch_size - len(old_states)) info["States"] = new_states return info
def step(self, closure=None): """Gradient clipping aware step().""" if self.gclip is not None and self.gclip > 0: clip_grad_norm(self.params, self.gclip) self.optim.step(closure)
def train(train_loader, model, criterion, optimizer, epoch, batch_logger): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() acc = AverageMeter() if args.no_partialbn: model.module.partialBN(False) else: model.module.partialBN(True) # switch to train mode model.train() end = time.time() for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) target = target.cuda(async=True) input_var = torch.autograd.Variable(input) target_var = torch.autograd.Variable(target) # compute output output = model(input_var) loss = criterion(output, target_var) # measure accuracy and record loss prec = calculate_accuracy(output.data, target) losses.update(loss.item(), input.size(0)) acc.update(prec, input.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: print("clipping gradient: {} with coef {}".format( total_norm, args.clip_gradient / total_norm)) optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() batch_logger.log({ 'epoch': epoch, 'batch': i + 1, 'loss': losses.val, 'acc': acc.val, 'lr': optimizer.param_groups[-1]['lr'] }) if i % args.print_freq == 0: print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec {acc.val:.3f} ({acc.avg:.3f})'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, acc=acc, lr=optimizer.param_groups[-1]['lr'])))
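Several of these loops compare the value returned by clip_grad_norm against the threshold in order to log a rescaling coefficient: the call returns the total gradient norm measured before clipping, and when it exceeds max_norm every gradient is scaled by max_norm / total_norm. A tiny demonstration, using the current underscore-suffixed clip_grad_norm_ rather than the legacy alias in the snippets:

import torch
from torch.nn.utils import clip_grad_norm_

w = torch.nn.Parameter(torch.ones(10))
(100.0 * w).sum().backward()                 # gradient of 100 per element -> large norm

max_norm = 20.0
total_norm = clip_grad_norm_([w], max_norm)  # returns the norm *before* clipping
if total_norm > max_norm:
    print("clipping gradient: {:.3f} with coef {:.4f}".format(
        float(total_norm), max_norm / float(total_norm)))
print("post-clip norm: {:.3f}".format(float(w.grad.norm())))   # now <= max_norm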
def model_train(self, epoch_offset=0): create_dir(MODEL_SAVE_PATH) loss_for_regression = MSELoss() img_coors_json = read_json_file(BBOX_XYWH_JSON_PATH) optimizer = RMSprop(self.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM) # optimizer = Adam(self.parameters(), lr=LEARNING_RATE) # optimizer = SGD(self.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM) scheduler = StepLR(optimizer, step_size=SCHEDULER_STEP, gamma=SCHEDULER_GAMMA) for epoch in range(EPOCHS): epoch_loss = 0.0 scheduler.step(epoch) LOGGER.debug('Epoch: %s, Current Learning Rate: %s', str(epoch + epoch_offset), str(scheduler.get_lr())) for image, coors in img_coors_json.items(): path_of_image = NORMALISED_IMAGES_PATH + image path_of_image = path_of_image.replace('%', '_') img = cv2.imread(path_of_image) img = torch.tensor(img).float().permute(2, 0, 1).unsqueeze(0) img = img.to(self.device) predicted_width, predicted_height, predicted_midpoint = self.forward( img) #all are scaled mp_x = coors[0][0] mp_y = coors[0][1] mp = torch.cat((torch.tensor([[mp_x]]).to( self.device), torch.tensor([[mp_y]]).to(self.device)), dim=1).float() w = coors[0][2] h = coors[0][3] loss1 = loss_for_regression( predicted_height, torch.tensor([[h]]).float().to(self.device)) loss2 = loss_for_regression( predicted_width, torch.tensor([[w]]).float().to(self.device)) loss3 = loss_for_regression(predicted_midpoint, mp.to(self.device)) loss = loss1 + loss2 + loss3 / 2 optimizer.zero_grad() loss.backward() clip_grad_norm(self.parameters(), 0.5) optimizer.step() epoch_loss = epoch_loss + loss.item() if epoch % 5 == 0: print('epoch: ' + str(epoch) + ' ' + 'loss: ' + str(epoch_loss)) if epoch % EPOCH_SAVE_INTERVAL == 0: print('saving') torch.save( self.state_dict(), MODEL_SAVE_PATH + 'model_epc_' + str(epoch + epoch_offset) + '.pt') torch.save( self.state_dict(), MODEL_SAVE_PATH + 'model_epc_' + str(epoch + epoch_offset) + '.pt')
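One detail worth flagging in the loss combination above: because division binds tighter than addition, loss1 + loss2 + loss3 / 2 halves only loss3. If the intent was to halve (or average) the whole sum, the expression needs parentheses; the intended weighting is not stated, so this is only a suggestion:

# Halve the combined regression loss rather than only loss3 (only if that was the intent).
loss = (loss1 + loss2 + loss3) / 2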
logits_v = net(states_v)
log_prob_v = F.log_softmax(logits_v)
log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
loss_policy_v = -log_prob_actions_v.mean()

loss_policy_v.backward(retain_graph=True)
grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                        for p in net.parameters()
                        if p.grad is not None])

prob_v = F.softmax(logits_v)
entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
entropy_loss_v = -ENTROPY_BETA * entropy_v
loss_v = loss_policy_v + entropy_loss_v
loss_v.backward()
nn_utils.clip_grad_norm(net.parameters(), GRAD_L2_CLIP)
optimizer.step()
loss_v += loss_policy_v

# calc KL-div
new_logits_v = net(states_v)
new_prob_v = F.softmax(new_logits_v)
kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
writer.add_scalar("kl", kl_div_v.data.cpu().numpy()[0], step_idx)

writer.add_scalar("baseline", baseline, step_idx)
writer.add_scalar("entropy", entropy_v.data.cpu().numpy()[0], step_idx)
writer.add_scalar("batch_scales", np.mean(batch_scales), step_idx)
writer.add_scalar("batch_scales_std", scale_std, step_idx)
writer.add_scalar("loss_entropy", entropy_loss_v.data.cpu().numpy()[0], step_idx)
writer.add_scalar("loss_policy", loss_policy_v.data.cpu().numpy()[0], step_idx)
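clip_grad_norm returns the global gradient norm measured before clipping, so the clipping call in a loop like this can double as instrumentation. A small sketch reusing the names from the snippet above (the "grad_l2_norm" tag is just illustrative):

# Capture the pre-clipping global norm and log it with the other scalars.
grad_norm = nn_utils.clip_grad_norm(net.parameters(), GRAD_L2_CLIP)
writer.add_scalar("grad_l2_norm", grad_norm, step_idx)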
def train(train_data_loader, net, criterion, optimizer, scheduler, epoch): net.train() # loss counters batch_time = AverageMeter() losses = AverageMeter() loc_losses = AverageMeter() cls_losses = AverageMeter() #### shuffle #### xxxx = copy.deepcopy(train_data_loader.dataset.ids) np.random.shuffle(arr) iii = 0 for arr_i in arr: key = keys[arr_i] rang = my_dict[key] xxxx[iii:(iii + rang[1] - rang[0])] = xxx[rang[0]:rang[1]] iii += rang[1] - rang[0] train_data_loader.dataset.ids = copy.deepcopy(xxxx) # create batch iterator batch_iterator = None iter_count = 0 t0 = time.perf_counter() for iteration in range(len(train_data_loader)): if not batch_iterator: batch_iterator = iter(train_data_loader) # load train data images, targets, img_indexs = next(batch_iterator) if args.cuda: images = Variable(images.cuda()) targets = [ Variable(anno.cuda(), volatile=True) for anno in targets ] else: images = Variable(images) targets = [Variable(anno, volatile=True) for anno in targets] # forward out = net(images, img_indexs) # backprop optimizer.zero_grad() loss_l, loss_c = criterion(out, targets) loss = loss_l + loss_c loss.backward() if args.clip_gradient is not None: total_norm = clip_grad_norm(net.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: print("clipping gradient: {} with coef {}".format( total_norm, args.clip_gradient / total_norm)) optimizer.step() if scheduler is not None: scheduler.step() loc_loss = loss_l.data[0] conf_loss = loss_c.data[0] # print('Loss data type ',type(loc_loss)) loc_losses.update(loc_loss) cls_losses.update(conf_loss) losses.update((loc_loss + conf_loss) / 2.0) if iteration % args.print_step == 0 and iteration > 0: torch.cuda.synchronize() t1 = time.perf_counter() batch_time.update(t1 - t0) print_line = 'Epoch {:02d}/{:02d} Iteration {:06d}/{:06d} loc-loss {:.3f}({:.3f}) cls-loss {:.3f}({:.3f}) ' \ 'average-loss {:.3f}({:.3f}) Timer {:0.3f}({:0.3f}) lr {:0.5f}'.format( epoch, args.epochs, iteration, len(train_data_loader), loc_losses.val, loc_losses.avg, cls_losses.val, cls_losses.avg, losses.val, losses.avg, batch_time.val, batch_time.avg, args.lr) torch.cuda.synchronize() t0 = time.perf_counter() log_file.write(print_line + '\n') print(print_line) iter_count += 1 if iter_count % args.loss_reset_step == 0 and iter_count > 0: loc_losses.reset() cls_losses.reset() losses.reset() batch_time.reset() print('Reset accumulators of ', args.snapshot_pref, ' at', iter_count * args.print_step) iter_count = 0
def train(model_str, train_iter, val_iter=None, source_embedding=None, target_embedding=None, early_stopping=False, save=False, save_path=None, model_params={}, opt_params={}, train_params={}, cuda=CUDA_DEFAULT): # Initialize model and other variables model, criterion, optimizer, scheduler = _train_initialize_variables(model_str, model_params, opt_params, cuda, source_embedding, target_embedding) val_loss = 1e6 best_val_loss = 1e6 if scheduler is not None: assert val_iter is not None scheduler.step(val_loss) print("All set. Actual Training begins") for epoch in range(train_params.get('n_ep', 30)): # Monitoring loss total_loss = 0 count = 0 # Actual training loop. for batch in train_iter: # Get data source = batch.src.transpose(0, 1) # batch first target = batch.trg.transpose(0, 1) if cuda: source = source.cuda() target = target.cuda() # Initialize hidden layer and memory if model.model_str == 'LSTM': # for LSTMA it is done in the forward because the init of the dec needs the last h of the enc model.hidden_enc = model.init_hidden('enc', source.size(0)) model.hidden_dec = model.init_hidden('dec', source.size(0)) # zero gradients optimizer.zero_grad() model.zero_grad() # predict output = model(source, target) # Dimension matching to cut it right for loss function. batch_size, sent_length = target.size(0), target.size(1)-1 loss = criterion(output.view(batch_size, -1, sent_length), target[:, 1:]) # remove the first element of target (it is the SOS token) # Compute gradients, clip, and backprop loss.backward() clip_grad_norm(model.parameters(), model_params.get("clip_gradients", 5.)) optimizer.step() # monitoring count += t.sum((target.data[:, 1:] != PAD_TOKEN).long()) # in that case there are batch_size x bbp_length classifications per batch, minus the pad tokens total_loss += t.sum(loss.data) # .data so that you dont keep references # monitoring avg_loss = total_loss / count print("Average loss after %d epochs is %.4f" % (epoch, avg_loss)) if val_iter is not None: model.eval() former_val_loss = val_loss * 1. val_loss = predict(model, val_iter, cuda=cuda) if scheduler is not None: scheduler.step(val_loss) if val_loss > former_val_loss: if early_stopping: break else: if save and best_val_loss > val_loss: # save only the best best_val_loss = val_loss * 1. assert save_path is not None # weights save_model(model, save_path + '.pytorch') # params with open(save_path + '.params.json', 'w') as fp: json.dump(model.params, fp) # loss with open(save_path + '.losses.txt', 'w') as fp: fp.write('val: ' + str(val_loss)) fp.write('train: ' + str(avg_loss)) model.train() return model
def step(self):
    # Clip the gradient norm (if configured), then apply the update.
    if self.max_grad_norm:
        clip_grad_norm(self.params, self.max_grad_norm)
    self.optimizer.step()
def train(train_loader, model, act_criterion, comp_criterion, regression_criterion, optimizer, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() act_losses = AverageMeter() comp_losses = AverageMeter() reg_losses = AverageMeter() act_accuracies = AverageMeter() fg_accuracies = AverageMeter() bg_accuracies = AverageMeter() # switch to train mode model.train() end = time.time() optimizer.zero_grad() ohem_num = train_loader.dataset.fg_per_video comp_group_size = train_loader.dataset.fg_per_video + train_loader.dataset.incomplete_per_video for i, (prop_fts, prop_type, prop_labels, prop_reg_targets) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) batch_size = prop_fts[0].size(0) activity_out, activity_target, activity_prop_type, \ completeness_out, completeness_target, \ regression_out, regression_labels, regression_target = model((prop_fts[0], prop_fts[1]), prop_labels, prop_reg_targets, prop_type) act_loss = act_criterion(activity_out, activity_target) comp_loss = comp_criterion(completeness_out, completeness_target, ohem_num, comp_group_size) reg_loss = regression_criterion(regression_out, regression_labels, regression_target) loss = act_loss + comp_loss * args.comp_loss_weight + reg_loss * args.reg_loss_weight losses.update(loss.item(), batch_size) act_losses.update(act_loss.item(), batch_size) comp_losses.update(comp_loss.item(), batch_size) reg_losses.update(reg_loss.item(), batch_size) act_acc = accuracy(activity_out, activity_target) act_accuracies.update(act_acc[0].item(), activity_out.size(0)) fg_indexer = (activity_prop_type == 0).nonzero().squeeze() bg_indexer = (activity_prop_type == 2).nonzero().squeeze() fg_acc = accuracy(activity_out[fg_indexer, :], activity_target[fg_indexer]) fg_accuracies.update(fg_acc[0].item(), len(fg_indexer)) if len(bg_indexer) > 0: bg_acc = accuracy(activity_out[bg_indexer, :], activity_target[bg_indexer]) bg_accuracies.update(bg_acc[0].item(), len(bg_indexer)) loss.backward() if i % args.iter_size == 0: # scale down gradients when iter size is functioning if args.iter_size != 1: for g in optimizer.param_groups: for p in g['params']: p.grad /= args.iter_size if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: logger.info("clipping gradient: {} with coef {}".format( total_norm, args.clip_gradient / total_norm)) else: total_norm = 0 optimizer.step() optimizer.zero_grad() batch_time.update(time.time() - end) end = time.time() writer.add_scalar('data/loss', losses.val, epoch * len(train_loader) + i + 1) writer.add_scalar('data/Reg_loss', reg_losses.val, epoch * len(train_loader) + i + 1) writer.add_scalar('data/Act_loss', act_losses.val, epoch * len(train_loader) + i + 1) writer.add_scalar('data/comp_loss', comp_losses.val, epoch * len(train_loader) + i + 1) if i % args.print_freq == 0: logger.info( 'Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Act. Loss {act_losses.val:.3f} ({act_losses.avg: .3f}) \t' 'Comp. Loss {comp_losses.val:.3f} ({comp_losses.avg: .3f}) '. format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, act_losses=act_losses, comp_losses=comp_losses, lr=optimizer.param_groups[0]['lr'], ) + '\tReg. Loss {reg_loss.val:.3f} ({reg_loss.avg:.3f})'.format( reg_loss=reg_losses) + '\n Act. 
FG {fg_acc.val:.02f} ({fg_acc.avg:.02f}) Act. BG {bg_acc.val:.02f} ({bg_acc.avg:.02f})'.format(act_acc=act_accuracies, fg_acc=fg_accuracies, bg_acc=bg_accuracies))
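The example above accumulates gradients over args.iter_size mini-batches, rescales them, clips, and only then steps the optimizer. A stripped-down, hypothetical version of that accumulate / rescale / clip / step pattern (all names are placeholders, not the author's API):

# Hypothetical helper illustrating the gradient-accumulation pattern above.
from torch.nn.utils import clip_grad_norm


def accumulate_and_step(model, criterion, optimizer, loader, iter_size, max_norm):
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(loader):
        loss = criterion(model(inputs), targets)
        loss.backward()                      # gradients accumulate across mini-batches
        if (i + 1) % iter_size == 0:
            for group in optimizer.param_groups:
                for p in group['params']:
                    if p.grad is not None:
                        p.grad /= iter_size  # average the accumulated gradients
            clip_grad_norm(model.parameters(), max_norm)
            optimizer.step()
            optimizer.zero_grad()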
def train(train_loader, model, criterion, optimizer, epoch): batch_time = AverageMeter() data_time = AverageMeter() cls_losses = AverageMeter() res_losses = AverageMeter() losses = AverageMeter() model.train() end = time.time() optimizer.zero_grad() for i, (feats, labels, start_offsets, end_offsets) in enumerate(train_loader): data_time.update(time.time() - end) input_feats = torch.autograd.Variable(feats).cuda() input_labels = torch.autograd.Variable(labels).cuda() start_offsets = torch.autograd.Variable(start_offsets).cuda().float() end_offsets = torch.autograd.Variable(end_offsets).cuda().float() pred_labels = model(input_feats) cls_loss = criterion[0](pred_labels[:, :2], input_labels) res_loss = criterion[1](pred_labels[:, 2:], input_labels.float(), start_offsets, end_offsets) cls_losses.update(cls_loss.cpu().item(), feats.size(0)) res_losses.update(res_loss.cpu().item(), torch.sum(labels)) loss = cls_loss + args.lambda_reg * res_loss losses.update(loss.cpu().item(), feats.size(0)) # compute gradient and do SGD step loss.backward() if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) if total_norm > args.clip_gradient: print('Clipping gradient: {} with coef {}'.format( total_norm, args.clip_gradient / total_norm)) optimizer.step() optimizer.zero_grad() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print( 'Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\n' 'Classification Loss {cls_loss.val:.4f} ({cls_loss.avg:.4f})\t' 'Regression Loss {res_loss.val:.4f} ({res_loss.avg:.4f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\n'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, cls_loss=cls_losses, res_loss=res_losses, lr=optimizer.param_groups[0]['lr']))
# Load batch
progress, notFinished, batch_input, batch_metadata, batch_labels = get_batch_data(args.batch_size, pick_data)
if not notFinished:
    break

# zero the parameter gradients
optimizer.zero_grad()

# Run neural net + Calculate Loss
outputs = net(Variable(batch_input), Variable(batch_metadata)).cuda()
loss = criterion(outputs, Variable(batch_labels))

# Backprop
loss.backward()
nnutils.clip_grad_norm(net.parameters(), 1.0)
optimizer.step()

# Update progress bar
pb.animate(progress)

batch_counter += 1
sum_counter += 1
sum += loss.data[0]
if sum_counter == 80:
    log_file.write('\n' + str(batch_counter) + ',' + str(sum / sum_counter))
    log_file.flush()
    sum = 0
    sum_counter = 0
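The fragment above relies on an nnutils alias and a Variable wrapper that are not shown; presumably they come from imports along these lines (an assumption, since the surrounding file is not included):

# Assumed imports for the aliases used in the fragment above (not shown there).
import torch.nn.utils as nnutils
from torch.autograd import Variable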
def train(train_iter, dev_iter, test_iter, model, args):
    if args.cuda:
        model.cuda()
        torch.cuda.manual_seed(hyperparams.seed_num)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-8)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=)
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    if args.Adam is True:
        print("Adam Training......")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    elif args.SGD is True:
        print("SGD Training.......")
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay,
                                    momentum=args.momentum_value)
    elif args.Adadelta is True:
        print("Adadelta Training.......")
        optimizer = torch.optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)

    # lambda1 = lambda epoch: epoch // 30
    # lambda2 = lambda epoch: 0.99 ** epoch
    # print("lambda1 {} lambda2 {} ".format(lambda1, lambda2))
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    steps = 0
    epoch_step = 0
    model_count = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        print("\n## Epoch {} of {} total epochs! ##\n".format(epoch, args.epochs))
        # scheduler.step()
        # print("now lr is {} \n".format(scheduler.get_lr()))
        print("now lr is {} \n".format(optimizer.param_groups[0].get("lr")))
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature.data.t_(), target.data.sub_(1)  # batch first, index align
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()

            optimizer.zero_grad()
            model.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            if args.init_clip_max_norm is not None:
                utils.clip_grad_norm(model.parameters(), max_norm=args.init_clip_max_norm)
            optimizer.step()

            steps += 1
            if steps % args.log_interval == 0:
                train_size = len(train_iter.dataset)
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = float(corrects) / batch.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        steps, train_size, loss.data[0], accuracy, corrects, batch.batch_size))
            if steps % args.test_interval == 0:
                eval(dev_iter, model, args)
            if steps % args.save_interval == 0:
                if not os.path.isdir(args.save_dir):
                    os.makedirs(args.save_dir)
                save_prefix = os.path.join(args.save_dir, 'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                torch.save(model, save_path)
                print("\n", save_path, end=" ")
                test_model = torch.load(save_path)
                model_count += 1
                test_eval(test_iter, test_model, save_path, args, model_count)
    return model_count
def main(): parser = argparse.ArgumentParser( description='Tuning with Multitask bi-directional RNN-CNN-CRF') parser.add_argument('--config', help='Config file (Python file format)', default="config_multitask.py") parser.add_argument('--grid', help='Grid Search Options', default="{}") args = parser.parse_args() logger = get_logger("Multi-Task") use_gpu = torch.cuda.is_available() # Config Tensorboard Writer log_writer = SummaryWriter() # Load from config file spec = importlib.util.spec_from_file_location("config", args.config) config_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(config_module) config = config_module.entries # Load options from grid search options = eval(args.grid) for k, v in options.items(): if isinstance(v, six.string_types): cmd = "%s = \"%s\"" % (k, v) else: cmd = "%s = %s" % (k, v) log_writer.add_scalar(k, v, 1) exec(cmd) # Load embedding dict embedding = config.embedding.embedding_type embedding_path = config.embedding.embedding_dict embedd_dict, embedd_dim = utils.load_embedding_dict( embedding, embedding_path) # Collect data path data_dir = config.data.data_dir data_names = config.data.data_names train_paths = [ os.path.join(data_dir, data_name, "train.tsv") for data_name in data_names ] dev_paths = [ os.path.join(data_dir, data_name, "devel.tsv") for data_name in data_names ] test_paths = [ os.path.join(data_dir, data_name, "test.tsv") for data_name in data_names ] # Create alphabets logger.info("Creating Alphabets") if not os.path.exists('tmp'): os.mkdir('tmp') word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, ner_alphabet_task, label_reflect = \ bionlp_data.create_alphabets(os.path.join(Path(data_dir).abspath(), "alphabets", "_".join(data_names)), train_paths, data_paths=dev_paths + test_paths, use_cache=True, embedd_dict=embedd_dict, max_vocabulary_size=50000) logger.info("Word Alphabet Size: %d" % word_alphabet.size()) logger.info("Character Alphabet Size: %d" % char_alphabet.size()) logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size()) logger.info("NER Alphabet Size: %d" % ner_alphabet.size()) logger.info( "NER Alphabet Size per Task: %s", str([task_alphabet.size() for task_alphabet in ner_alphabet_task])) #task_reflects = torch.LongTensor(reverse_reflect(label_reflect, ner_alphabet.size())) #if use_gpu: # task_reflects = task_reflects.cuda() if embedding == 'elmo': logger.info("Loading ELMo Embedder") ee = ElmoEmbedder(options_file=config.embedding.elmo_option, weight_file=config.embedding.elmo_weight, cuda_device=config.embedding.elmo_cuda) else: ee = None logger.info("Reading Data") # Prepare dataset data_trains = [ bionlp_data.read_data_to_variable(train_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet_task[task_id], use_gpu=use_gpu, elmo_ee=ee) for task_id, train_path in enumerate(train_paths) ] num_data = [sum(data_train[1]) for data_train in data_trains] num_labels = ner_alphabet.size() num_labels_task = [task_item.size() for task_item in ner_alphabet_task] data_devs = [ bionlp_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet_task[task_id], use_gpu=use_gpu, volatile=True, elmo_ee=ee) for task_id, dev_path in enumerate(dev_paths) ] data_tests = [ bionlp_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet_task[task_id], use_gpu=use_gpu, volatile=True, elmo_ee=ee) for task_id, test_path in 
enumerate(test_paths) ] writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) def construct_word_embedding_table(): scale = np.sqrt(3.0 / embedd_dim) table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32) table[bionlp_data.UNK_ID, :] = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov = 0 for word, index in word_alphabet.items(): if not embedd_dict == None and word in embedd_dict: embedding = embedd_dict[word] elif not embedd_dict == None and word.lower() in embedd_dict: embedding = embedd_dict[word.lower()] else: embedding = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov += 1 table[index, :] = embedding print('oov: %d' % oov) return torch.from_numpy(table) word_table = construct_word_embedding_table() logger.info("constructing network...") # Construct network window = 3 num_layers = 1 mode = config.rnn.mode hidden_size = config.rnn.hidden_size char_dim = config.rnn.char_dim num_filters = config.rnn.num_filters tag_space = config.rnn.tag_space bigram = config.rnn.bigram attention_mode = config.rnn.attention if config.rnn.dropout == 'std': network = MultiTaskBiRecurrentCRF( len(data_trains), embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), num_filters, window, mode, hidden_size, num_layers, num_labels, num_labels_task=num_labels_task, tag_space=tag_space, embedd_word=word_table, p_in=config.rnn.p, p_rnn=config.rnn.p, bigram=bigram, elmo=(embedding == 'elmo'), attention_mode=attention_mode, adv_loss_coef=config.multitask.adv_loss_coef, diff_loss_coef=config.multitask.diff_loss_coef, char_level_rnn=config.rnn.char_level_rnn) else: raise NotImplementedError if use_gpu: network.cuda() # Prepare training unk_replace = config.embedding.unk_replace num_epochs = config.training.num_epochs batch_size = config.training.batch_size lr = config.training.learning_rate momentum = config.training.momentum alpha = config.training.alpha lr_decay = config.training.lr_decay schedule = config.training.schedule gamma = config.training.gamma # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) optim = RMSprop(network.parameters(), lr=lr, alpha=alpha, momentum=momentum, weight_decay=gamma) logger.info( "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s" % (mode, num_layers, hidden_size, num_filters, tag_space, 'bigram' if bigram else 'unigram')) logger.info( "training: l2: %f, (#training data: %s, batch: %d, dropout: %.2f, unk replace: %.2f)" % (gamma, num_data, batch_size, config.rnn.p, unk_replace)) num_batches = [x // batch_size + 1 for x in num_data] dev_f1 = [0.0 for x in num_data] dev_acc = [0.0 for x in num_data] dev_precision = [0.0 for x in num_data] dev_recall = [0.0 for x in num_data] test_f1 = [0.0 for x in num_data] test_acc = [0.0 for x in num_data] test_precision = [0.0 for x in num_data] test_recall = [0.0 for x in num_data] best_epoch = [0 for x in num_data] # Training procedure for epoch in range(1, num_epochs + 1): print( 'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (epoch, mode, config.rnn.dropout, lr, lr_decay, schedule)) train_err = 0. train_total = 0. 
# Gradient decent on training data start_time = time.time() num_back = 0 network.train() batch_count = 0 for batch in range(1, 2 * num_batches[0] + 1): r = random.random() task_id = 0 if r <= 0.5 else random.randint(1, len(num_data) - 1) #if batch > num_batches[task_id]: # batch = batch % num_batches[task_id] + 1 batch_count += 1 word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable( data_trains[task_id], batch_size, unk_replace=unk_replace) optim.zero_grad() loss, task_loss, adv_loss, diff_loss = network.loss( task_id, word, char, labels, mask=masks, elmo_word=elmo_embedding) #log_writer.add_scalars( # 'train_loss_task' + str(task_id), # {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss}, # (epoch - 1) * (num_batches[task_id] + 1) + batch #) #log_writer.add_scalars( # 'train_loss_overview', # {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss}, # (epoch - 1) * (sum(num_batches) + 1) + batch_count #) loss.backward() clip_grad_norm(network.parameters(), 5.0) optim.step() num_inst = word.size(0) train_err += loss.data[0] * num_inst train_total += num_inst time_ave = (time.time() - start_time) / batch time_left = (2 * num_batches[0] - batch) * time_ave # update log if batch % 100 == 0: sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % ( batch, 2 * num_batches[0], train_err / train_total, time_left) sys.stdout.write(log_info) sys.stdout.flush() num_back = len(log_info) sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) print('train: %d loss: %.4f, time: %.2fs' % (2 * num_batches[0], train_err / train_total, time.time() - start_time)) # Evaluate performance on dev data network.eval() for task_id in range(len(num_batches)): tmp_filename = 'tmp/%s_dev%d%d' % (str(uid), epoch, task_id) writer.start(tmp_filename) for batch in bionlp_data.iterate_batch_variable( data_devs[task_id], batch_size): word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch preds, _ = network.decode( task_id, word, char, target=labels, mask=masks, leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS, elmo_word=elmo_embedding) writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(), preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy()) writer.close() acc, precision, recall, f1 = evaluate(tmp_filename) log_writer.add_scalars( 'dev_task' + str(task_id), { 'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1 }, epoch) print( 'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) if dev_f1[task_id] < f1: dev_f1[task_id] = f1 dev_acc[task_id] = acc dev_precision[task_id] = precision dev_recall[task_id] = recall best_epoch[task_id] = epoch # Evaluate on test data when better performance detected tmp_filename = 'tmp/%s_test%d%d' % (str(uid), epoch, task_id) writer.start(tmp_filename) for batch in bionlp_data.iterate_batch_variable( data_tests[task_id], batch_size): word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch preds, _ = network.decode( task_id, word, char, target=labels, mask=masks, leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS, elmo_word=elmo_embedding) writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(), preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy()) 
writer.close() test_acc[task_id], test_precision[task_id], test_recall[ task_id], test_f1[task_id] = evaluate(tmp_filename) log_writer.add_scalars( 'test_task' + str(task_id), { 'accuracy': test_acc[task_id], 'precision': test_precision[task_id], 'recall': test_recall[task_id], 'f1': test_f1[task_id] }, epoch) print( "================================================================================" ) print("dataset: %s" % data_names[task_id]) print( "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (dev_acc[task_id], dev_precision[task_id], dev_recall[task_id], dev_f1[task_id], best_epoch[task_id])) print( "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (test_acc[task_id], test_precision[task_id], test_recall[task_id], test_f1[task_id], best_epoch[task_id])) print( "================================================================================\n" ) if epoch % schedule == 0: # lr = learning_rate / (1.0 + epoch * lr_decay) # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) lr = lr * lr_decay optim.param_groups[0]['lr'] = lr # writer.export_scalars_to_json("./all_scalars.json") writer.close()
def fit_model(model, loss_op, optim_op, train_gen, val_gen, epochs, checkpoint_path, patience): """ Analog to Keras fit_generator function. # Arguments: model: Model to be finetuned. loss_op: loss operation (BCEWithLogitsLoss or CrossEntropy for e.g.) optim_op: optimization operation (Adam e.g.) train_gen: Training data iterator (DataLoader) val_gen: Validation data iterator (DataLoader) epochs: Number of epochs. checkpoint_path: Filepath where weights will be checkpointed to during training. This file will be rewritten by the function. patience: Patience for callback methods. verbose: Verbosity flag. # Returns: Accuracy of the trained model, ONLY if 'evaluate' is set. """ # Save original checkpoint torch.save(model.state_dict(), checkpoint_path) model.eval() best_loss = np.mean([calc_loss(loss_op, model(Variable(xv)), Variable(yv)).data.cpu().numpy()[0] for xv, yv in val_gen]) print("original val loss", best_loss) epoch_without_impr = 0 for epoch in range(epochs): for i, data in enumerate(train_gen): X_train, y_train = data X_train = Variable(X_train, requires_grad=False) y_train = Variable(y_train, requires_grad=False) model.train() optim_op.zero_grad() output = model(X_train) loss = calc_loss(loss_op, output, y_train) loss.backward() clip_grad_norm(model.parameters(), 1) optim_op.step() acc = evaluate_using_acc(model, [(X_train.data, y_train.data)]) print("== Epoch", epoch, "step", i, "train loss", loss.data.cpu().numpy()[0], "train acc", acc) model.eval() acc = evaluate_using_acc(model, val_gen) print("val acc", acc) val_loss = np.mean([calc_loss(loss_op, model(Variable(xv)), Variable(yv)).data.cpu().numpy()[0] for xv, yv in val_gen]) print("val loss", val_loss) if best_loss is not None and val_loss >= best_loss: epoch_without_impr += 1 print('No improvement over previous best loss: ', best_loss) # Save checkpoint if best_loss is None or val_loss < best_loss: best_loss = val_loss torch.save(model.state_dict(), checkpoint_path) print('Saving model at', checkpoint_path) # Early stopping if epoch_without_impr >= patience: break
def main(): parser = argparse.ArgumentParser( description='PyTorch PennTreeBank RNN/LSTM Language Model') parser.add_argument('--data', type=str, default='../data/', help='location of the data corpus') parser.add_argument('--presaved', action='store_true', help='use presaved data') parser.add_argument('--glovedata', type=str, default='../data/glove.6B', help='location of the pretrained glove embeddings') parser.add_argument('--din', type=int, default=30, help='length of LSTM') parser.add_argument('--demb', type=int, default=100, help='size of word embeddings') parser.add_argument('--dhid', type=int, default=100, help='humber of hidden units per layer') parser.add_argument('--dout', type=int, default=2, help='number of output classes') parser.add_argument('--nlayers', type=int, default=1, help='number of layers') parser.add_argument('--lr', type=float, default=0.001, help='initial learning rate') parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping') parser.add_argument('--embinit', type=str, default='random', help='embedding weight initialization type') parser.add_argument('--decinit', type=str, default='random', help='decoder weight initialization type') parser.add_argument('--hidinit', type=str, default='random', help='recurrent hidden weight initialization type') parser.add_argument('--dropout', type=float, default=0.0, help='dropout applied to layers (0 = no dropout)') parser.add_argument('--reweight', action='store_true', help='reweight loss function') parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit') parser.add_argument('--batchsize', type=int, default=20, metavar='N', help='batch size') parser.add_argument('--seed', type=int, default=3, help='random seed') parser.add_argument('--vocabsize', type=int, default=200000, help='random seed') parser.add_argument('--optimizer', action='store_true', help='use ADAM optimizer') parser.add_argument('--pipeline', action='store_true', help='use pipeline file') parser.add_argument('--psw', type=int, default=1, help='remove stop words') parser.add_argument('--ppunc', action='store_true', help='remove punctuation') parser.add_argument('--pntok', action='store_true', help='use number tokens') parser.add_argument('--pkq', action='store_true', help='keep question words') parser.add_argument('--stem', action='store_true', help='use stemmer') parser.add_argument('--lemma', action='store_true', help='use lemmatizer') parser.add_argument('--freezeemb', action='store_false', help='freezes embeddings') parser.add_argument('--cuda', action='store_true', help='use CUDA') parser.add_argument('--loginterval', type=int, default=100, metavar='N', help='report interval') parser.add_argument('--save', type=str, default='', help='path to save the final model') args = parser.parse_args() pipe = None if args.pipeline: stemmer, lemmatizer = None, None if args.stem: stemmer = SnowballStemmer('english') elif args.lemma: lemmatizer = WordNetLemmatizer() if not args.presaved: pipe = functools.partial(pipeline, rm_stop_words=args.psw, rm_punc=args.ppunc, number_token=args.pntok, keep_questions=args.pkq, stemmer=stemmer, lemmatizer=lemmatizer) corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe) X, y, X_val, y_val = load_data(args.data, corpus, args.din, train_split=0.9) else: print('Loading Presaved Data') X = torch.load(args.data + 'train_x.t') y = torch.load(args.data + 'train_y.t') X_val = torch.load(args.data + 'val_x.t') y_val = torch.load(args.data + 'val_y.t') with open(args.data + 'corpus.pkl', 'rb') 
as f: corpus = pkl.load(f) if args.cuda: X, y = X.cuda(), y.cuda() X_val, y_val = X_val.cuda(), y_val.cuda() print('Generating Data Loaders') #X.size len(train_data),1,2,fix_length train_dataset = TensorDataset(X, y) train_loader = DataLoader(train_dataset, batch_size=args.batchsize, shuffle=True) valid_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=args.batchsize, shuffle=False) ntokens = len(corpus) glove_embeddings = None if args.embinit == 'glove': assert args.demb in (50, 100, 200, 300) glove_embeddings = get_glove_embeddings(args.glovedata, corpus.dictionary.word2idx, ntokens, args.demb) model = LSTMModelMLP(args.din, args.dhid, args.nlayers, args.dout, args.demb, args.vocabsize, args.dropout, args.embinit, args.hidinit, args.decinit, glove_embeddings, args.cuda) if args.cuda: model.cuda() if args.reweight: w_tensor = torch.Tensor([1.309028344, 0.472001959]) if args.cuda: w_tensor = w_tensor.cuda() criterion = nn.NLLLoss(weight=w_tensor) else: criterion = nn.NLLLoss() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) model_config = '\t'.join([ str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din, args.demb, args.dhid, args.embinit, args.decinit, args.hidinit, args.dropout, args.optimizer, args.reweight, args.lr, args.vocabsize, args.pipeline, args.psw, args.ppunc, args.pntok, args.pkq, args.stem, args.lemma) ]) print( 'Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | Reweight | LR | VocabSize | pipeline | stop | punc | ntoken | keep_ques | stem | lemma' ) print(model_config) # best_val_acc = 0.78 best_ll = 0.48 for epoch in range(args.epochs): model.train() total_cost = 0 start_time = time.time() cur_loss = 0 for ind, (qs, duplicate) in enumerate(train_loader): model.zero_grad() pred = model(qs[:, 0, 0, :], qs[:, 0, 1, :]) if args.cuda: pred = pred.cuda() duplicate = duplicate.cuda() duplicate = Variable(duplicate) loss = criterion(pred, duplicate) loss.backward() clip_grad_norm(model.parameters(), args.clip) if optimizer: optimizer.step() else: for p in model.parameters(): p.data.add_(-args.lr, p.grad.data) total_cost += loss.data[0] cur_loss += loss.data[0] if ind % args.loginterval == 0 and ind > 0: cur_loss = loss.data[0] / args.loginterval elapsed = time.time() - start_time print( '| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | ' 'Loss {:.6f}'.format(epoch, ind, len(X) // args.batchsize, elapsed * 1000.0 / args.loginterval, cur_loss)) start_time = time.time() cur_loss = 0 model.eval() train_acc, train_ll = evaluate(model, train_loader, args.cuda) val_acc, val_ll = evaluate(model, valid_loader, args.cuda) # if args.save and (val_acc > best_val_acc): if args.save and (val_ll < best_ll): with open(args.save + '_corpus.pkl', 'wb') as corp_f: pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL) torch.save(model.cpu(), args.save) torch.save(model.cpu().state_dict(), args.save + ".state_dict") with open(args.save + ".state_dict.config", "w") as f: f.write(model_config) best_ll = val_ll if args.cuda: model.cuda() print( 'Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}' .format(epoch, total_cost, train_acc, val_acc, train_ll, val_ll)) print('-' * 89)
# add summary to logger logger.scalar_summary('square loss', square_loss.data[0], step) # Next train Generator on Criterion from Discriminator real_labels = Variable(label.fill_(1)) g_loss = 0 if not use_adv_model else label_loss(model_adv.forward(outputs), real_labels) loss = square_loss + g_loss losses.append(loss.data[0]) if use_adv_model: logger.scalar_summary('Generator Loss', g_loss.data[0], step) # Clip gradient norms optimizer.zero_grad() loss.backward() logger.scalar_summary('Composite Loss', loss.data[0], step) clip_grad_norm(model.parameters(), 1.0) optimizer.step() step += args.batch print('Composite loss: ', np.array(losses).mean()) if epoch % args.snapshot == 0: # snapshot model and optimizer snapshot = { 'epoch': epoch + 1, 'model': model.state_dict(), 'adv_model': None if not use_adv_model else model_adv.state_dict(), 'optimizer': optimizer.state_dict(), 'optimizer_adv': None if not use_adv_model else optimizer_adv.state_dict() } torch.save(snapshot, os.path.join(exp_path, 'best.pth'))
def train(train_loader, model, criterion, optimizer, optimizer_cent, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() if args.no_partialbn: model.module.partialBN(False) else: model.module.partialBN(True) # switch to train mode model.train() end = time.time() #centers = model.centers #print(center) for i, (input, target) in enumerate(train_loader): # print('##### i:', i) # measure data loading time data_time.update(time.time() - end) target = target.cuda(async=True) # print('input size ====>', input.size()) # print('input size', input.size()) # input = input.view(-1,3,224,224) # print('input size ====>', input.size()) input_var = torch.autograd.Variable(input) target_var = torch.autograd.Variable(target) # compute output output = model(input_var) #print(output.shape) #feature = output(0) #center_loss = compute_center_loss(feature, model.centers, target_var) #print("tar shape {}".format(target_var.shape)) #print("ourput shape {}".format(output.shape)) #print("feature shape {}".format(feature.shape)) loss = criterion(output, target_var) #print(loss) #print(output.shape) #sys.exit() #loss_cent = criterion_cent(feature, output[target_var) #print(loss_cent) #loss_cent = 0.03 * center_loss #loss = loss_cent + loss # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) losses.update(loss.data.item(), input.size(0)) top1.update(prec1.item(), input.size(0)) top5.update(prec5.item(), input.size(0)) # compute gradient and do SGD step optimizer.zero_grad() #optimizer_cent.zero_grad() loss.backward(retain_graph=True) if args.clip_gradient is not None: total_norm = clip_grad_norm(model.parameters(), args.clip_gradient) #print("total_norm: {}".format(total_norm)) if total_norm > args.clip_gradient: print("clipping gradient: {} with coef {}".format( total_norm, args.clip_gradient / total_norm)) optimizer.step() #for param in criterion_cent.parameters(): # param.grad.data *= (1./0.1) #center_deltas =get_center_delta(feature, centers, target, alpha=0.5) #model.centers = centers - center_deltas # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5, lr=optimizer.param_groups[-1]['lr'])))
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))

    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)

        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        # Backward and optimize
        model.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // seq_length
        if step % 100 == 0:
            print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                  .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))

# Test the model
with torch.no_grad():
    with open('sample.txt', 'w') as f:
        # Set initial hidden and cell states
        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))

        # Select one word id randomly
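The language-model loop above calls a detach helper to cut the hidden and cell states off from the previous computation graph, so backpropagation is truncated at each window. The helper is not defined in the fragment; a common minimal version (an assumption about what it looks like) is:

# Assumed helper (not shown above): detach LSTM states from the graph so
# gradients stop at the current truncated-BPTT window.
def detach(states):
    return [state.detach() for state in states]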
def main(args): CUDA = False folder_name = 'RL_' + args.name + '_' + args.task + '_' + args.architecture folder_path = os.path.join('./', folder_name) create_folder(folder_name) datasets = [IntegersLargerThanAverage(10000, i, 10) for i in range(4, 5)] critic = MovingAverageBaseline(0.9) if args.architecture == 'set': policy = BernoulliPolicy(IntegerSubsetNet()) elif args.architecture == 'null': policy = BernoulliPolicy(IntegerSubsetNet(null_model=True)) else: raise ValueError('Unknown architecture. Must be set or null!') optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3, eps=1e-2) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5000, gamma=0.99) if torch.cuda.is_available() and args.gpu != '': policy.cuda() CUDA = True print('Using GPU') environment = RLWrapper(datasets, 64, use_cuda=CUDA) data = environment.reset() rewards_list = [] for n in range(args.n_episodes): # run for epochs actions, log_prob_actions = policy(data) #policy_p = F.sigmoid(policy.fn_approximator(data)) log_prob_actions = log_prob_actions.sum(1) baseline = critic(data).view(-1, 1) if n % 100 == 0: y_target = torch.FloatTensor( environment.current_dataset.supervised_objective( data.data.int())) data, reward, _, info = environment.step(actions) advantage = reward - baseline critic.update_baseline(None, reward) loss = gradients.calculate_policy_gradient_terms( log_prob_actions, advantage) loss = loss.mean( ) # mean is fine since there is only really "one action"? optimizer.zero_grad() loss.backward() clip_grad_norm(policy.fn_approximator.parameters(), 40) optimizer.step() scheduler.step() rewards_list.append(reward.mean()) if n % 100 == 0: set_acc, elem_acc = set_accuracy(y_target, actions.data) print('{}: loss {:3g}, episode_reward {:3g}, set acc: {},' ' elem_acc: {}, set_size {}, entropy {}'.format( n, loss.cpu().data[0], reward.mean(), set_acc, elem_acc, environment.current_dataset.set_size, (-log_prob_actions * log_prob_actions.exp()).sum().data[0])) print('reward distribution: {}'.format( Counter(reward.numpy().ravel().tolist()))) # now put this into "supervised" mode datasets = [ (i, torch.utils.data.DataLoader(IntegerSubsetsSupervised(256, i, 10, target='mean', seed=5), batch_size=256)) for i in range(4, 10) ] set_sizes = [] mse = [] set_accs = [] elem_accs = [] torch.save(policy, os.path.join(folder_path, 'model-gpu.pyt')) criterion = torch.nn.BCELoss() for set_size, dataset in datasets: for i, (x, y) in enumerate(dataset): # prepare the data if CUDA: x = x.cuda() y = y.cuda() x, y = Variable(x, volatile=True), Variable(y, volatile=True).float() # run it through the network y_hat, _ = policy(x) y_hat = y_hat.view_as(y) # calculate the loss loss = criterion(y_hat, y) if CUDA: loss = loss.cpu() set_sizes.append(set_size) mse.append(loss.data[0]) set_acc, elem_acc = set_accuracy(y.squeeze(), y_hat.squeeze()) set_accs.append(set_acc.data[0]) elem_accs.append(elem_acc.data[0]) print(set_sizes) print(mse) print(set_accs) print(torch.FloatTensor(set_accs).mean()) policy.cpu() torch.save( { 'set_sizes': set_sizes, 'rewards_list': rewards_list, 'mse': mse, 'set_acc': set_accs, 'elem_accs': elem_accs, 'mean_acc': torch.FloatTensor(set_accs).mean() }, os.path.join(folder_path, 'results.json')) torch.save(policy, os.path.join(folder_path, 'model.pyt'))