def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    _, src_dev, _, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
    _, trg_dev, _, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    trg_vocab_size = len(trg_vocab)
    print(trg_vocab.itos[4])  # sanity check: inspect one vocabulary entry

    original_model = torch.load(open(options.original_model_file, 'rb'))
    nmt = NMT(original_model)  # TODO: add more arguments as necessary
    nmt.eval()

    if use_cuda:
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()
    # optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)

    total_loss = 0
    num_sents = 0
    for i, batch_i in enumerate(utils.rand.srange(len(batched_dev_src))):
        print("{0}/{1}".format(i, len(batched_dev_src)))
        dev_src_batch = Variable(batched_dev_src[batch_i])  # of size (src_seq_len, batch_size)
        dev_trg_batch = Variable(batched_dev_trg[batch_i])  # of size (trg_seq_len, batch_size)
        dev_src_mask = Variable(batched_dev_src_mask[batch_i])
        dev_trg_mask = Variable(batched_dev_trg_mask[batch_i])
        if use_cuda:
            dev_src_batch = dev_src_batch.cuda()
            dev_trg_batch = dev_trg_batch.cuda()
            dev_src_mask = dev_src_mask.cuda()
            dev_trg_mask = dev_trg_mask.cuda()

        num_sents += 1
        sys_out_batch = nmt(dev_src_batch, dev_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size)  # TODO: add more arguments as necessary

        # Flatten targets, keep only non-padding positions, and score them.
        dev_trg_mask = dev_trg_mask.view(-1)
        dev_trg_batch = dev_trg_batch.view(-1)
        dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
        dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
        sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
        sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)

        loss = criterion(sys_out_batch, dev_trg_batch)
        # _, max = torch.max(sys_out_batch, dim=1)
        # print(sys_out_batch[dev_trg_batch])
        # print(max, dev_trg_batch)
        total_loss += loss
        # break

    print(total_loss, num_sents)
    print(total_loss / num_sents)
    print(torch.exp(total_loss / num_sents))  # corpus perplexity
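# --- Hedged illustration (not part of the original scripts) --------------------
# A minimal, self-contained sketch of the masked NLL / perplexity computation the
# evaluation loop above performs, written against the current PyTorch tensor API
# rather than the legacy Variable API used in these scripts. All shapes and the
# padding index are made-up values for illustration only.
def masked_perplexity_example():
    import torch
    import torch.nn.functional as F

    trg_seq_len, batch_size, vocab_size, pad_idx = 5, 2, 8, 1  # assumed toy sizes
    log_probs = F.log_softmax(torch.randn(trg_seq_len, batch_size, vocab_size), dim=-1)
    targets = torch.randint(0, vocab_size, (trg_seq_len, batch_size))
    mask = targets.ne(pad_idx)                      # True at non-padding positions

    flat_log_probs = log_probs.view(-1, vocab_size)[mask.view(-1)]
    flat_targets = targets.view(-1)[mask.view(-1)]
    nll = F.nll_loss(flat_log_probs, flat_targets)  # mean NLL over kept tokens
    return torch.exp(nll)                           # perplexity, as printed above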
def sample(args):
    train_data_src = read_corpus(args.src_file, source='src')
    train_data_tgt = read_corpus(args.tgt_file, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.eval()
    model = model.cuda()

    # sampling
    print('begin sampling')
    train_iter = cum_samples = 0
    for src_sents, tgt_sents in data_iter(train_data, batch_size=1):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=5, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
def main(options):
    _, _, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    _, _, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    nmt = NMT(src_vocab_size, trg_vocab_size)
    nmt = torch.load(open(options.modelname, 'rb'))  # the saved model replaces the freshly built one
    nmt.eval()

    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    # open in binary mode so the UTF-8 encoded bytes can be written under both Python 2 and 3
    with open('data/output_tanay.txt', 'wb') as f_write:
        for i in range(len(src_test)):
            src = to_var(torch.unsqueeze(src_test[i], 1), volatile=True)
            trg = to_var(torch.unsqueeze(trg_test[i], 1), volatile=True)
            results = nmt(src, trg)

            s = ""
            for ix in results:
                idx = np.argmax(ix.data.cpu().numpy())
                if idx == 2:  # if <s>, don't write it
                    continue
                if idx == 3:  # if </s>, end the loop
                    break
                s += trg_vocab.itos[idx] + " "
            # print(s.encode('utf-8'))
            # if len(s): s += '\n'
            s += '\n'
            f_write.write(s.encode('utf-8'))
def beam(args):
    # load model params
    print('load model from [%s]' % args.model_bin, file=sys.stderr)
    params = torch.load(args.model_bin, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    opt = params['args']
    state_dict = params['state_dict']

    # build model
    model = NMT(opt, vocab)
    model.load_state_dict(state_dict)
    model.train()
    # model.eval()
    model = model.cuda()

    # loss function
    loss_fn = torch.nn.NLLLoss()

    # beam search
    print('begin beam searching')
    src_sent = ['we', 'have', 'told', 'that', '.']
    hyps = model.beam(src_sent)
    print('src_sent:', ' '.join(src_sent))
    for ids, hyp, dist in hyps:
        print('tgt_sent:', ' '.join(hyp))
        print('tgt_ids :', end=' ')
        for id in ids:
            print(id, end=', ')
        print()
        print('out_dist:', dist)

        var_ids = torch.autograd.Variable(torch.LongTensor(ids[1:]), requires_grad=False)
        loss = loss_fn(dist, var_ids)
        print('NLL loss =', loss)
        loss.backward()
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    trg_vocab_size = len(trg_vocab)
    src_vocab_size = len(src_vocab)
    word_emb_size = 300
    hidden_size = 1024

    nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size,
              src_vocab, trg_vocab, attn_model="general", use_cuda=True)

    if use_cuda:
        nmt.cuda()
        if options.distributed:
            nmt = torch.nn.DataParallel(nmt)
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()

    # Configure optimization
    lr = options.learning_rate
    optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), lr)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))

        # Set training mode
        nmt.train()
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            train_src_batch = Variable(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i])  # of size (trg_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            sys_out_batch = nmt(train_src_batch, train_trg_batch, True)
            del train_src_batch

            # Keep only non-padding target positions before computing the loss.
            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)

            loss = criterion(sys_out_batch, train_trg_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))

            optimizer.zero_grad()
            loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0)
            optimizer.step()

        # validation -- this is a crude estimation because there might be some paddings at the end
        dev_loss = 0.0
        # Set validation mode
        nmt.eval()
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()

            sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False)

            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)

            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break

        torch.save(nmt,
                   open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
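# --- Hedged illustration (not part of the original scripts) --------------------
# The commented-out early-stopping check above compares the epoch-over-epoch
# improvement of the average dev loss against options.estop. A minimal sketch of
# that rule on plain floats (function name and example values are assumptions,
# not the original API):
def should_stop_early(last_dev_avg_loss, dev_avg_loss, estop_threshold):
    """Stop when the dev loss improved by less than the threshold since the last epoch."""
    return (last_dev_avg_loss - dev_avg_loss) < estop_threshold

# e.g. should_stop_early(4.31, 4.30, 0.05) -> True, so training would halt.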
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    # if options.gpuid:
    #     cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    print("preprocessing batched data...")

    # Keep only batches whose source and target sequences are at most 35 tokens long.
    processed_src = list()
    processed_trg = list()
    processed_src_mask = list()
    processed_trg_mask = list()
    for batch_i in range(len(batched_train_src)):
        if batched_train_src[batch_i].size(0) <= 35 and batched_train_trg[batch_i].size(0) <= 35:
            processed_src.append(batched_train_src[batch_i])
            processed_trg.append(batched_train_trg[batch_i])
            processed_src_mask.append(batched_train_src_mask[batch_i])
            processed_trg_mask.append(batched_train_trg_mask[batch_i])
    batched_train_src = processed_src
    batched_train_trg = processed_trg
    batched_train_src_mask = processed_src_mask
    batched_train_trg_mask = processed_trg_mask

    processed_src = list()
    processed_trg = list()
    processed_src_mask = list()
    processed_trg_mask = list()
    for batch_i in range(len(batched_dev_src)):
        if batched_dev_src[batch_i].size(0) <= 35 and batched_dev_trg[batch_i].size(0) <= 35:
            processed_src.append(batched_dev_src[batch_i])
            processed_trg.append(batched_dev_trg[batch_i])
            processed_src_mask.append(batched_dev_src_mask[batch_i])
            processed_trg_mask.append(batched_dev_trg_mask[batch_i])
    batched_dev_src = processed_src
    batched_dev_trg = processed_trg
    batched_dev_src_mask = processed_src_mask
    batched_dev_trg_mask = processed_trg_mask
    del processed_src, processed_trg, processed_trg_mask, processed_src_mask

    trg_vocab_size = len(trg_vocab)
    src_vocab_size = len(src_vocab)
    word_emb_size = 50
    hidden_size = 1024

    nmt = NMT(src_vocab_size, trg_vocab_size, word_emb_size, hidden_size,
              src_vocab, trg_vocab, attn_model="general", use_cuda=True)
    discriminator = Discriminator(src_vocab_size, trg_vocab_size, word_emb_size,
                                  src_vocab, trg_vocab, use_cuda=True)

    if use_cuda:
        # nmt = torch.nn.DataParallel(nmt, device_ids=options.gpuid).cuda()
        nmt.cuda()
        # discriminator = torch.nn.DataParallel(discriminator, device_ids=options.gpuid).cuda()
        discriminator.cuda()
    else:
        nmt.cpu()
        discriminator.cpu()

    criterion_g = torch.nn.NLLLoss().cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda()

    # Configure optimization
    optimizer_g = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)
    optimizer_d = eval("torch.optim." + options.optimizer)(discriminator.parameters(), options.learning_rate)

    # main training loop
    f1 = open("train_loss", "a")
    f2 = open("dev_loss", "a")
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range
        train_loss_g = 0.0
        train_loss_d = 0.0
        train_loss_g_nll = 0.0
        train_loss_g_ce = 0.0
        train_loss_nll_batch_num = 0
        train_loss_ce_batch_num = 0
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            if i == 1500:
                break
            # if i == 5:
            #     break
            train_src_batch = Variable(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i])  # of size (trg_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            # train discriminator
            sys_out_batch = nmt(train_src_batch, train_trg_batch, True).detach()
            _, predict_batch = sys_out_batch.topk(1)
            del _
            predict_batch = predict_batch.squeeze(2)
            real_dis_label_out = discriminator(train_src_batch, train_trg_batch, True)
            fake_dis_label_out = discriminator(train_src_batch, predict_batch, True)

            optimizer_d.zero_grad()
            loss_d_real = criterion(
                real_dis_label_out,
                Variable(torch.ones(options.batch_size * len(options.gpuid)).long()).cuda())
            loss_d_real.backward()
            loss_d_fake = criterion(
                fake_dis_label_out,
                Variable(torch.zeros(options.batch_size * len(options.gpuid)).long()).cuda())
            # loss_d_fake.backward(retain_graph=True)
            loss_d_fake.backward()
            loss_d = loss_d_fake.data[0] + loss_d_real.data[0]
            del loss_d_fake, loss_d_real
            logging.debug("D loss at batch {0}: {1}".format(i, loss_d))
            f1.write("D train loss at batch {0}: {1}\n".format(i, loss_d))
            optimizer_d.step()

            if use_cuda:
                sys_out_batch = sys_out_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
            else:
                sys_out_batch = sys_out_batch.cpu()
                train_trg_batch = train_trg_batch.cpu()

            # train nmt (generator)
            sys_out_batch = nmt(train_src_batch, train_trg_batch, True)
            _, predict_batch = sys_out_batch.topk(1)
            predict_batch = predict_batch.squeeze(2)
            fake_dis_label_out = discriminator(train_src_batch, predict_batch, True)

            if random.random() > 0.5:
                # teacher-forced NLL loss against the reference target
                train_trg_mask = train_trg_mask.view(-1)
                train_trg_batch = train_trg_batch.view(-1)
                train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
                train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
                sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
                sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)
                loss_g = criterion_g(sys_out_batch, train_trg_batch)
                train_loss_g_nll += loss_g
                train_loss_nll_batch_num += 1
                f1.write("G train NLL loss at batch {0}: {1}\n".format(i, loss_g.data[0]))
            else:
                # adversarial loss: push the discriminator to label the model output as real
                loss_g = criterion(
                    fake_dis_label_out,
                    Variable(torch.ones(options.batch_size * len(options.gpuid)).long()).cuda())
                train_loss_g_ce += loss_g
                train_loss_ce_batch_num += 1
                f1.write("G train CE loss at batch {0}: {1}\n".format(i, loss_g.data[0]))

            logging.debug("G loss at batch {0}: {1}".format(i, loss_g.data[0]))
            optimizer_g.zero_grad()
            loss_g.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm(nmt.parameters(), 5.0)
            optimizer_g.step()

            train_loss_d += loss_d

        train_avg_loss_g_nll = train_loss_g_nll / train_loss_nll_batch_num
        train_avg_loss_g_ce = train_loss_g_ce / train_loss_ce_batch_num
        train_avg_loss_d = train_loss_d / len(train_src_batch)
        logging.info("G TRAIN Average NLL loss value per instance is {0} at the end of epoch {1}".format(train_avg_loss_g_nll, epoch_i))
        logging.info("G TRAIN Average CE loss value per instance is {0} at the end of epoch {1}".format(train_avg_loss_g_ce, epoch_i))
        logging.info("D TRAIN Average loss value per instance is {0} at the end of epoch {1}".format(train_avg_loss_d, epoch_i))

        # validation -- this is a crude estimation because there might be some paddings at the end
        # dev_loss_g_nll = 0.0
        # dev_loss_g_ce = 0.0
        # dev_loss_d = 0.0
        # for batch_i in range(len(batched_dev_src)):
        #     dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
        #     dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
        #     dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
        #     dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
        #     if use_cuda:
        #         dev_src_batch = dev_src_batch.cuda()
        #         dev_trg_batch = dev_trg_batch.cuda()
        #         dev_src_mask = dev_src_mask.cuda()
        #         dev_trg_mask = dev_trg_mask.cuda()
        #     sys_out_batch = nmt(dev_src_batch, dev_trg_batch, False).detach()
        #     _, predict_batch = sys_out_batch.topk(1)
        #     predict_batch = predict_batch.squeeze(2)
        #     real_dis_label_out = discriminator(dev_src_batch, dev_trg_batch, True).detach()
        #     fake_dis_label_out = discriminator(dev_src_batch, predict_batch, True).detach()
        #     if use_cuda > 0:
        #         sys_out_batch = sys_out_batch.cuda()
        #         dev_trg_batch = dev_trg_batch.cuda()
        #     else:
        #         sys_out_batch = sys_out_batch.cpu()
        #         dev_trg_batch = dev_trg_batch.cpu()
        #     dev_trg_mask = dev_trg_mask.view(-1)
        #     dev_trg_batch = dev_trg_batch.view(-1)
        #     dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
        #     dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
        #     sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
        #     sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)
        #     loss_g_nll = criterion_g(sys_out_batch, dev_trg_batch)
        #     loss_g_ce = criterion(fake_dis_label_out, Variable(torch.ones(options.batch_size * len(options.gpuid)).long(), volatile=True).cuda())
        #     loss_d = criterion(real_dis_label_out, Variable(torch.ones(options.batch_size * len(options.gpuid)).long(), volatile=True).cuda()) + criterion(fake_dis_label_out, Variable(torch.zeros(options.batch_size * len(options.gpuid)).long(), volatile=True).cuda())
        #     logging.debug("G dev NLL loss at batch {0}: {1}".format(batch_i, loss_g_nll.data[0]))
        #     logging.debug("G dev CE loss at batch {0}: {1}".format(batch_i, loss_g_ce.data[0]))
        #     f2.write("G dev NLL loss at batch {0}: {1}\n".format(batch_i, loss_g_nll.data[0]))
        #     f2.write("G dev CE loss at batch {0}: {1}\n".format(batch_i, loss_g_ce.data[0]))
        #     logging.debug("D dev loss at batch {0}: {1}".format(batch_i, loss_d.data[0]))
        #     f2.write("D dev loss at batch {0}: {1}\n".format(batch_i, loss_d.data[0]))
        #     dev_loss_g_nll += loss_g_nll
        #     dev_loss_g_ce += loss_g_ce
        #     dev_loss_d += loss_d
        # dev_avg_loss_g_nll = dev_loss_g_nll / len(batched_dev_src)
        # dev_avg_loss_g_ce = dev_loss_g_ce / len(batched_dev_src)
        # dev_avg_loss_d = dev_loss_d / len(batched_dev_src)
        # logging.info("G DEV Average NLL loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_g_nll.cpu().data[0], epoch_i))
        # logging.info("G DEV Average CE loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_g_ce.cpu().data[0], epoch_i))
        # logging.info("D DEV Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss_d.data[0], epoch_i))
        # # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        # #     logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        # #     break

        torch.save(nmt,
                   open("nmt.nll_{0:.2f}.epoch_{1}".format(train_avg_loss_g_nll.cpu().data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        # train_avg_loss_d is already a plain float here, so it is formatted directly
        torch.save(discriminator,
                   open("discriminator.nll_{0:.2f}.epoch_{1}".format(train_avg_loss_d, epoch_i), 'wb'),
                   pickle_module=dill)

    f1.close()
    f2.close()
def main(options):
    use_cuda = (len(options.gpuid) >= 1)
    if options.gpuid:
        cuda.set_device(options.gpuid[0])

    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    """src_train = get_lm_input(src_train)
    src_dev = get_lm_input(src_dev)
    src_test = get_lm_input(src_test)
    trg_train = get_lm_output(trg_train)
    trg_dev = get_lm_output(trg_dev)
    trg_test = get_lm_output(trg_test)"""

    batched_train_src, batched_train_src_mask, sort_index = tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    (src_vocab_size, trg_vocab_size) = len(src_vocab), len(trg_vocab)

    nmt = NMT((src_vocab_size, trg_vocab_size), use_cuda)  # TODO: add more arguments as necessary
    # nmt = init_model((src_vocab_size, trg_vocab_size), use_cuda)
    if use_cuda:
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(rand.srange(len(batched_train_src))):
            # if random.random() > 0.5:
            if False:  # i > 1500:  # TODO REMOVE THIS !!!!!!!!!!!!!!!!!!!!
                # model.save("intermediate_ds_new", nmt)
                break
            if i % 200 == 0:
                model.save("intermediate_ds", nmt)

            train_src_batch = Variable(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
            train_trg_batch = Variable(batched_train_trg[batch_i])  # of size (trg_seq_len, batch_size)
            train_src_mask = Variable(batched_train_src_mask[batch_i])
            train_trg_mask = Variable(batched_train_trg_mask[batch_i])
            if use_cuda:
                train_src_batch = train_src_batch.cuda()
                train_trg_batch = train_trg_batch.cuda()
                train_src_mask = train_src_mask.cuda()
                train_trg_mask = train_trg_mask.cuda()

            sys_out_batch, translated_sentence_wd_index = nmt(
                train_src_batch, train_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size)  # TODO: add more arguments as necessary

            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)

            loss = criterion(sys_out_batch, train_trg_batch)
            logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some paddings at the end
        dev_loss = 0.0
        for batch_i in range(len(batched_dev_src)):
            # if random.random() > 0.5:
            if False:  # i > 1500:  # TODO REMOVE THIS !!!!!!!!!!!!!!!!!!!!
                # model.save("intermediate_ds_new", nmt)
                break

            dev_src_batch = Variable(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = Variable(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = Variable(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = Variable(batched_dev_trg_mask[batch_i], volatile=True)
            if use_cuda:
                dev_src_batch = dev_src_batch.cuda()
                dev_trg_batch = dev_trg_batch.cuda()
                dev_src_mask = dev_src_mask.cuda()
                dev_trg_mask = dev_trg_mask.cuda()

            sys_out_batch, translated_sentence_wd_index = nmt(
                dev_src_batch)  # (trg_seq_len, batch_size, trg_vocab_size)  # TODO: add more arguments as necessary

            actual_trg_max_len = dev_trg_mask.data.shape[0]
            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)

            # TODO Remove this!!!!!!
            # Pad or truncate the prediction so it lines up with the reference length.
            predicted_trg_max_len = sys_out_batch.data.shape[0]
            if actual_trg_max_len > predicted_trg_max_len:
                sys_out_batch = torch.cat(
                    (sys_out_batch,
                     torch.ones((actual_trg_max_len - predicted_trg_max_len, options.batch_size, trg_vocab_size))))
            else:
                sys_out_batch = sys_out_batch[0:actual_trg_max_len]
            # TODO Remove this ^^^ !!!!!!

            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)

            loss = criterion(sys_out_batch, dev_trg_batch)
            logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss
            # if True: break

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i))

        if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
            logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(
                epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
            break

        torch.save(nmt,
                   open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss

    import datetime
    current_dt_tm = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    model.save("%s_%s" % (options.model_file, current_dt_tm), nmt)
def main(options):
    src_train, src_dev, src_test, src_vocab = torch.load(
        open(options.data_file + "." + options.src_lang, 'rb'))
    trg_train, trg_dev, trg_test, trg_vocab = torch.load(
        open(options.data_file + "." + options.trg_lang, 'rb'))

    batched_train_src, batched_train_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_train, options.batch_size, src_vocab.stoi["<blank>"])
    batched_train_trg, batched_train_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_train, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)
    batched_dev_src, batched_dev_src_mask, sort_index = utils.tensor.advanced_batchize(
        src_dev, options.batch_size, src_vocab.stoi["<blank>"])
    batched_dev_trg, batched_dev_trg_mask = utils.tensor.advanced_batchize_no_sort(
        trg_dev, options.batch_size, trg_vocab.stoi["<blank>"], sort_index)

    src_vocab_size = len(src_vocab)
    trg_vocab_size = len(trg_vocab)

    nmt = NMT(src_vocab_size, trg_vocab_size)  # TODO: add more arguments as necessary
    if torch.cuda.is_available():
        nmt.cuda()
    else:
        nmt.cpu()

    criterion = torch.nn.NLLLoss()
    optimizer = eval("torch.optim." + options.optimizer)(nmt.parameters(), options.learning_rate)

    # main training loop
    last_dev_avg_loss = float("inf")
    for epoch_i in range(options.epochs):
        logging.info("At {0}-th epoch.".format(epoch_i))
        # srange generates a lazy sequence of shuffled range
        for i, batch_i in enumerate(utils.rand.srange(len(batched_train_src))):
            train_src_batch = to_var(batched_train_src[batch_i])  # of size (src_seq_len, batch_size)
            train_trg_batch = to_var(batched_train_trg[batch_i])  # of size (trg_seq_len, batch_size)
            train_src_mask = to_var(batched_train_src_mask[batch_i])
            train_trg_mask = to_var(batched_train_trg_mask[batch_i])

            sys_out_batch = nmt(train_src_batch, train_trg_batch, training=True)  # (trg_seq_len, batch_size, trg_vocab_size)  # TODO: add more arguments as necessary

            train_trg_mask = train_trg_mask.view(-1)
            train_trg_batch = train_trg_batch.view(-1)
            train_trg_batch = train_trg_batch.masked_select(train_trg_mask)
            train_trg_mask = train_trg_mask.unsqueeze(1).expand(len(train_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(train_trg_mask).view(-1, trg_vocab_size)

            loss = criterion(sys_out_batch, train_trg_batch)
            # logging.debug("loss at batch {0}: {1}".format(i, loss.data[0]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation -- this is a crude estimation because there might be some paddings at the end
        dev_loss = 0.0
        for batch_i in range(len(batched_dev_src)):
            dev_src_batch = to_var(batched_dev_src[batch_i], volatile=True)
            dev_trg_batch = to_var(batched_dev_trg[batch_i], volatile=True)
            dev_src_mask = to_var(batched_dev_src_mask[batch_i], volatile=True)
            dev_trg_mask = to_var(batched_dev_trg_mask[batch_i], volatile=True)

            sys_out_batch = nmt(dev_src_batch, dev_trg_batch)  # (trg_seq_len, batch_size, trg_vocab_size)  # TODO: add more arguments as necessary

            dev_trg_mask = dev_trg_mask.view(-1)
            dev_trg_batch = dev_trg_batch.view(-1)
            dev_trg_batch = dev_trg_batch.masked_select(dev_trg_mask)
            dev_trg_mask = dev_trg_mask.unsqueeze(1).expand(len(dev_trg_mask), trg_vocab_size)
            sys_out_batch = sys_out_batch.view(-1, trg_vocab_size)
            sys_out_batch = sys_out_batch.masked_select(dev_trg_mask).view(-1, trg_vocab_size)

            loss = criterion(sys_out_batch, dev_trg_batch)
            # logging.debug("dev loss at batch {0}: {1}".format(batch_i, loss.data[0]))
            dev_loss += loss

        dev_avg_loss = dev_loss / len(batched_dev_src)
        logging.info("Average loss value per instance is {0} at the end of epoch {1}".format(dev_avg_loss.data[0], epoch_i))

        # if (last_dev_avg_loss - dev_avg_loss).data[0] < options.estop:
        #     logging.info("Early stopping triggered with threshold {0} (previous dev loss: {1}, current: {2})".format(epoch_i, last_dev_avg_loss.data[0], dev_avg_loss.data[0]))
        #     break

        torch.save(nmt,
                   open(options.model_file + ".nll_{0:.2f}.epoch_{1}".format(dev_avg_loss.data[0], epoch_i), 'wb'),
                   pickle_module=dill)
        last_dev_avg_loss = dev_avg_loss
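# --- Hedged usage sketch (not part of the original scripts) --------------------
# Re-loading one of the checkpoints written by the training loops above. The file
# name is a placeholder; torch.save was called with pickle_module=dill, so the
# same module is passed here. Whether .cuda() is wanted depends on the machine.
def load_checkpoint_example(checkpoint_path="model.nll_4.32.epoch_3"):
    import dill
    import torch

    with open(checkpoint_path, 'rb') as f:
        nmt = torch.load(f, pickle_module=dill)
    nmt.eval()  # switch to evaluation mode before decoding or scoring
    return nmt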