def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--test_query_file', '-i', type=str, required=True)
    argparser.add_argument('--load_path', '-p', type=str, required=True)
    # TODO: load epoch -> load best model
    argparser.add_argument('--load_epoch', '-e', type=int, required=True)
    argparser.add_argument('--output_file', '-o', type=str)
    argparser.add_argument('--dec_algorithm', '-algo', type=str, default='greedy')
    new_args = argparser.parse_args()

    # reload the arguments saved at training time
    arg_file = os.path.join(new_args.load_path, 'args.pkl')
    if not os.path.exists(arg_file):
        raise RuntimeError('No default arguments file to load')
    with open(arg_file, 'rb') as f:
        args = pickle.load(f)

    use_cuda = args.use_cuda

    vocab, rev_vocab = load_vocab(args.vocab_file, max_vocab=args.max_vocab_size)
    vocab_size = len(vocab)

    word_embeddings = nn.Embedding(vocab_size, args.emb_dim, padding_idx=SYM_PAD)
    E = EncoderRNN(vocab_size, args.emb_dim, args.hidden_dim, args.n_layers,
                   bidirectional=True, variable_lengths=True)
    G = Generator(vocab_size, args.response_max_len, args.emb_dim,
                  2 * args.hidden_dim, args.n_layers)

    if use_cuda:
        word_embeddings.cuda()
        E.cuda()
        G.cuda()

    reload_model(new_args.load_path, new_args.load_epoch, word_embeddings, E, G)
    predict(new_args.test_query_file, args.response_max_len, vocab, rev_vocab,
            word_embeddings, E, G, new_args.output_file)
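
# The script above defaults to --dec_algorithm 'greedy'. As a point of reference, the
# snippet below is a minimal, standalone sketch of greedy decoding over per-step scores;
# it does not use this repo's Generator API, and `fake_step_logits`, SYM_EOS_ID and
# MAX_LEN are stand-ins introduced only for illustration.
import torch

SYM_EOS_ID = 2   # assumed id of the EOS symbol, for illustration only
MAX_LEN = 15     # small cap on response length for the sketch

def greedy_decode(step_fn, max_len=MAX_LEN, eos_id=SYM_EOS_ID):
    """Pick the argmax token at every step and stop at EOS."""
    tokens = []
    for _ in range(max_len):
        logits = step_fn(tokens)            # [vocab_size] scores for the next token
        next_id = int(torch.argmax(logits))
        if next_id == eos_id:
            break
        tokens.append(next_id)
    return tokens

# dummy decoder step: random scores over a toy vocabulary of 10 symbols
fake_step_logits = lambda prefix: torch.randn(10)
print(greedy_decode(fake_step_logits))
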
def trainDemo(lang, dataSet, nlVocab, codeVocab, train_variables):
    print("Training...")
    encoder1 = EncoderRNN(codeVocab.n_words, setting.HIDDDEN_SIAZE)
    attn_decoder1 = AttnDecoderRNN(setting.HIDDDEN_SIAZE, nlVocab.n_words, 1,
                                   dropout_p=0.1)

    if setting.USE_CUDA:
        encoder1 = encoder1.cuda()
        attn_decoder1 = attn_decoder1.cuda()

    trainIters(lang, dataSet, train_variables, encoder1, attn_decoder1, 2000000,
               print_every=5000)
# hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1)

TRAIN = False
if "-t" in sys.argv:
    TRAIN = True

TRAIN_ITER = 7500
if len(sys.argv) == 3:
    TRAIN_ITER = int(sys.argv[2])

if use_cuda:
    encoder1 = encoder1.cuda()
    attn_decoder1 = attn_decoder1.cuda()

if os.path.exists("encoder.pt") and os.path.exists("decoder.pt") and not TRAIN:
    print("Found saved models")
    encoder_state = torch.load('encoder.pt')
    decoder_state = torch.load('decoder.pt')
    encoder1.load_state_dict(encoder_state)
    attn_decoder1.load_state_dict(decoder_state)
else:
    trainIters(encoder1, attn_decoder1, TRAIN_ITER, print_every=50)
    torch.save(encoder1.state_dict(), "encoder.pt")
    torch.save(attn_decoder1.state_dict(), "decoder.pt")

######################################################################
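
# CPU-only note (assumption: encoder.pt/decoder.pt were saved from a CUDA run): pass a
# map_location to torch.load so CUDA storages are remapped to the CPU, e.g.
#   encoder_state = torch.load('encoder.pt', map_location=lambda storage, loc: storage)
#   decoder_state = torch.load('decoder.pt', map_location=lambda storage, loc: storage)
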
def adversarial():
    # use the root logger
    logger = logging.getLogger("lan2720")

    argparser = argparse.ArgumentParser(add_help=False)
    argparser.add_argument('--load_path', '-p', type=str, required=True)
    # TODO: load best
    argparser.add_argument('--load_epoch', '-e', type=int, required=True)
    argparser.add_argument('--filter_num', type=int, required=True)
    argparser.add_argument('--filter_sizes', type=str, required=True)
    argparser.add_argument('--training_ratio', type=int, default=2)
    argparser.add_argument('--g_learning_rate', '-glr', type=float, default=0.001)
    argparser.add_argument('--d_learning_rate', '-dlr', type=float, default=0.001)
    argparser.add_argument('--batch_size', '-b', type=int, default=168)

    # new arguments used in adversarial training
    new_args = argparser.parse_args()

    # load default arguments
    default_arg_file = os.path.join(new_args.load_path, 'args.pkl')
    if not os.path.exists(default_arg_file):
        raise RuntimeError('No default argument file in %s' % new_args.load_path)
    else:
        with open(default_arg_file, 'rb') as f:
            args = pickle.load(f)

    args.mode = 'adversarial'
    #args.d_learning_rate = 0.0001
    args.print_every = 1
    args.g_learning_rate = new_args.g_learning_rate
    args.d_learning_rate = new_args.d_learning_rate
    args.batch_size = new_args.batch_size

    # add new arguments
    args.load_path = new_args.load_path
    args.load_epoch = new_args.load_epoch
    args.filter_num = new_args.filter_num
    args.filter_sizes = new_args.filter_sizes
    args.training_ratio = new_args.training_ratio

    # set up the output directory
    exp_dirname = os.path.join(args.exp_dir, args.mode,
                               time.strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(exp_dirname)

    # set up the logger
    tqdm_logging.config(logger, os.path.join(exp_dirname, 'adversarial.log'),
                        mode='w', silent=False, debug=True)

    # load vocabulary
    vocab, rev_vocab = load_vocab(args.vocab_file, max_vocab=args.max_vocab_size)
    vocab_size = len(vocab)

    word_embeddings = nn.Embedding(vocab_size, args.emb_dim, padding_idx=SYM_PAD)
    E = EncoderRNN(vocab_size, args.emb_dim, args.hidden_dim, args.n_layers,
                   args.dropout_rate, bidirectional=True, variable_lengths=True)
    G = Generator(vocab_size, args.response_max_len, args.emb_dim,
                  2 * args.hidden_dim, args.n_layers, dropout_p=args.dropout_rate)
    D = Discriminator(args.emb_dim, args.filter_num, eval(args.filter_sizes))

    if args.use_cuda:
        word_embeddings.cuda()
        E.cuda()
        G.cuda()
        D.cuda()

    # define optimizer
    opt_G = torch.optim.Adam(G.rnn.parameters(), lr=args.g_learning_rate)
    opt_D = torch.optim.Adam(D.parameters(), lr=args.d_learning_rate)

    logger.info('----------------------------------')
    logger.info('Adversarially train a neural conversation model')
    logger.info('----------------------------------')
    logger.info('Args:')
    logger.info(str(args))
    logger.info('Vocabulary from ' + args.vocab_file)
    logger.info('vocabulary size: %d' % vocab_size)
    logger.info('Loading text data from ' + args.train_query_file + ' and ' +
                args.train_response_file)

    reload_model(args.load_path, args.load_epoch, word_embeddings, E, G)
    # start_epoch = args.resume_epoch + 1
    #else:
    #    start_epoch = 0

    # dump args
    with open(os.path.join(exp_dirname, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    # TODO: num_epoch is old one
    for e in range(args.num_epoch):
        train_data_generator = batcher(args.batch_size, args.train_query_file,
                                       args.train_response_file)
        logger.info("Epoch: %d/%d" % (e, args.num_epoch))
        step = 0
        cur_time = time.time()
        while True:
            try:
                post_sentences, response_sentences = next(train_data_generator)
            except StopIteration:
                # save model
                save_model(exp_dirname, e, word_embeddings, E, G, D)
                ## evaluation
                #eval(args.valid_query_file, args.valid_response_file, args.batch_size,
                #     word_embeddings, E, G, loss_func, args.use_cuda, vocab, args.response_max_len)
                break

            # prepare data
            post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
            response_ids = [sentence2id(sent, vocab) for sent in response_sentences]
            posts_var, posts_length = padding_inputs(post_ids, None)
            responses_var, responses_length = padding_inputs(response_ids,
                                                             args.response_max_len)

            # sort by post length
            posts_length, perms_idx = posts_length.sort(0, descending=True)
            posts_var = posts_var[perms_idx]
            responses_var = responses_var[perms_idx]
            responses_length = responses_length[perms_idx]

            if args.use_cuda:
                posts_var = posts_var.cuda()
                responses_var = responses_var.cuda()

            embedded_post = word_embeddings(posts_var)
            real_responses = word_embeddings(responses_var)

            # forward
            _, dec_init_state = E(embedded_post, input_lengths=posts_length.numpy())
            fake_responses = G(dec_init_state, word_embeddings)  # [B, T, emb_size]

            prob_real = D(embedded_post, real_responses)
            prob_fake = D(embedded_post, fake_responses)

            # loss
            D_loss = - torch.mean(torch.log(prob_real) + torch.log(1. - prob_fake))
            G_loss = torch.mean(torch.log(1. - prob_fake))

            if step % args.training_ratio == 0:
                opt_D.zero_grad()
                D_loss.backward(retain_graph=True)
                opt_D.step()

            opt_G.zero_grad()
            G_loss.backward()
            opt_G.step()

            if step % args.print_every == 0:
                logger.info('Step %5d: D accuracy=%.2f (0.5 for D to converge) '
                            'D score=%.2f (-1.38 for G to converge) (%.1f iters/sec)' % (
                                step, prob_real.cpu().data.numpy().mean(),
                                -D_loss.cpu().data.numpy()[0],
                                args.print_every / (time.time() - cur_time)))
                cur_time = time.time()

            step = step + 1
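
# Sanity check for the loss formulas above (standalone, dummy probabilities only):
# when the discriminator cannot tell real from fake, prob_real == prob_fake == 0.5,
# so D_loss = -(log 0.5 + log 0.5) ≈ 1.386 and the logged "D score" (-D_loss) ≈ -1.39,
# which is the -1.38 convergence target mentioned in the log message.
import torch

prob_real = torch.full((4, 1), 0.5)   # dummy D outputs on real (post, response) pairs
prob_fake = torch.full((4, 1), 0.5)   # dummy D outputs on generated pairs

D_loss = -torch.mean(torch.log(prob_real) + torch.log(1. - prob_fake))
G_loss = torch.mean(torch.log(1. - prob_fake))
print('D score: %.2f, G loss: %.2f' % (-D_loss.item(), G_loss.item()))  # ~ -1.39, -0.69
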
else:
    train_embedding = Embedding(filename=args.glove_filename,
                                embedding_size=embedding_size).load_embedding(train_dataset.src_vocab)
    target_embedding = Embedding(filename=args.glove_filename,
                                 embedding_size=embedding_size).load_embedding(train_dataset.tgt_vocab)
    encoder.embedding.weight.data.copy_(train_embedding)
    decoder.embedding.weight.data.copy_(target_embedding)
    if opts.fixed_embeddings:
        encoder.embedding.weight.requires_grad = False
        decoder.embedding.weight.requires_grad = False
    else:
        decoder.embedding.weight.requires_grad = True
    print("emb end")

if LOAD_CHECKPOINT:
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])

if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

FINE_TUNE = True
if FINE_TUNE:
    encoder.embedding.weight.requires_grad = True

print('=' * 100)
print('Model log:\n')
print(encoder)
print(decoder)
print('- Encoder input embedding requires_grad={}'.format(
    encoder.embedding.weight.requires_grad))
print('- Decoder input embedding requires_grad={}'.format(
    decoder.embedding.weight.requires_grad))
print('- Decoder output embedding requires_grad={}'.format(
    decoder.W_s.weight.requires_grad))
print('=' * 100 + '\n')

# Initialize optimizers (we can experiment with different learning rates)
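# Minimal sketch of that optimizer setup (an assumption, not this repo's actual code): build
# the optimizer over trainable parameters only, since frozen embeddings (requires_grad=False)
# would make older torch.optim constructors raise "optimizing a parameter that doesn't
# require gradients". The learning rate here is a placeholder.
import itertools
import torch

trainable_params = [p for p in itertools.chain(encoder.parameters(), decoder.parameters())
                    if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-3)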
class BiLSTMModel(nn.Module):
    def __init__(self):
        super(BiLSTMModel, self).__init__()
        self.base_rnn = None
        self.wd = None

    def init_model(self, wd, hidden_size, e_layers, d_layers, base_rnn,
                   pretrained_embeddings=None, dropout_p=0.1):
        self.base_rnn = base_rnn
        self.wd = wd
        self.dropout_p = dropout_p

        if pretrained_embeddings is True:
            print("Loading GloVe Embeddings ...")
            pretrained_embeddings = load_glove_embeddings(wd.word2index, hidden_size)

        self.encoder = EncoderRNN(wd.n_words, hidden_size, n_layers=e_layers,
                                  base_rnn=base_rnn,
                                  pretrained_embeddings=pretrained_embeddings)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(int(hidden_size * 8), int(hidden_size)),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_p),
            torch.nn.Linear(int(hidden_size), 3),
            torch.nn.Softmax(dim=1))

        self.parameter_list = [self.encoder.parameters(), self.mlp.parameters()]

        if USE_CUDA:
            self.encoder = self.encoder.cuda()
            self.mlp = self.mlp.cuda()

        return self

    def forward(self, batch, inference=False):
        # Convert batch from numpy to torch
        if inference is True:
            text_batch, text_lengths, hyp_batch, hyp_lengths = batch
        else:
            text_batch, text_lengths, hyp_batch, hyp_lengths, labels = batch
        batch_size = text_batch.size(1)

        # Pass the input batch through the encoder
        text_enc_fwd_outputs, text_enc_bkwd_outputs, text_encoder_hidden = self.encoder(
            text_batch, text_lengths)
        hyp_enc_fwd_outputs, hyp_enc_bkwd_outputs, hyp_encoder_hidden = self.encoder(
            hyp_batch, hyp_lengths)

        last_text_enc_fwd = text_enc_fwd_outputs[-1, :, :]
        last_text_enc_bkwd = text_enc_bkwd_outputs[0, :, :]
        last_text_enc = torch.cat((last_text_enc_fwd, last_text_enc_bkwd), dim=1)

        last_hyp_enc_fwd = hyp_enc_fwd_outputs[-1, :, :]
        last_hyp_enc_bkwd = hyp_enc_bkwd_outputs[0, :, :]
        last_hyp_enc = torch.cat((last_hyp_enc_fwd, last_hyp_enc_bkwd), dim=1)

        mult_feature, diff_feature = last_text_enc * last_hyp_enc, torch.abs(
            last_text_enc - last_hyp_enc)
        features = torch.cat(
            [last_text_enc, last_hyp_enc, mult_feature, diff_feature], dim=1)

        outputs = self.mlp(features)  # B x 3
        return outputs

    def get_loss_for_batch(self, batch):
        labels = batch[-1]
        outputs = self(batch)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        return loss

    def torch_batch_from_numpy_batch(self, batch):
        batch = list(batch)
        # tuple indices of variables that need to be converted
        variable_indices = [0, 2, 4]
        for i in variable_indices:
            var = Variable(torch.from_numpy(batch[i]))
            if USE_CUDA:
                var = var.cuda()
            batch[i] = var
        return batch

    # Trains on a single batch
    def train_batch(self, batch, tl_mode=False):
        self.train()
        batch = self.torch_batch_from_numpy_batch(batch)
        loss = self.get_loss_for_batch(batch)
        loss.backward()
        return loss.item()  # loss.data[0]

    def validate(self, batch):
        self.eval()
        batch = self.torch_batch_from_numpy_batch(batch)
        return self.get_loss_for_batch(batch).item()  # .data[0]

    def score(self, data):
        batch_size = 1
        batches = nli_batches(batch_size, data)
        total_correct = 0
        for batch in tqdm(batches):
            batch = self.torch_batch_from_numpy_batch(batch)
            num_correct = self._acc_for_batch(batch)
            total_correct += num_correct
        acc = total_correct / (len(batches) * batch_size)
        return acc

    def _acc_for_batch(self, batch):
        '''
        :param batch:
        :return: The number of correct predictions in a batch
        '''
        self.eval()
        outputs = self(batch)
        predictions = outputs.max(1)[1]
        labels = batch[-1]
        num_error = torch.nonzero(labels - predictions)
        num_correct = labels.size(0) - num_error.size(0)
        return num_correct

    def export_state(self, dir, label, epoch=-1):
        print("Saving models.")
        cwd = os.getcwd() + '/'
        enc_out = dir + ENC_1_FILE
        mlp_out = dir + MLP_FILE
        i2w_out = dir + I2W_FILE
        w2i_out = dir + W2I_FILE
        w2c_out = dir + W2C_FILE
        inf_out = dir + INF_FILE

        torch.save(self.encoder.state_dict(), enc_out)
        torch.save(self.mlp.state_dict(), mlp_out)

        i2w = open(i2w_out, 'wb')
        pickle.dump(self.wd.index2word, i2w)
        i2w.close()

        w2i = open(w2i_out, 'wb')
        pickle.dump(self.wd.word2index, w2i)
        w2i.close()

        w2c = open(w2c_out, 'wb')
        pickle.dump(self.wd.word2count, w2c)
        w2c.close()

        info = open(inf_out, 'w')
        using_lstm = 1 if self.base_rnn == nn.LSTM else 0
        info.write(
            str(self.encoder.hidden_size) + "\n" + str(self.encoder.n_layers) +
            "\n" + str(self.wd.n_words) + "\n" + str(using_lstm))
        if epoch > 0:
            info.write("\n" + str(epoch))
        info.close()

        files = [enc_out, mlp_out, i2w_out, w2i_out, w2c_out, inf_out]

        print("Bundling models")
        tf = tarfile.open(cwd + dir + label, mode='w')
        for file in files:
            tf.add(file)
        tf.close()

        for file in files:
            os.remove(file)

        print("Finished saving models.")

    def import_state(self, model_file, active_dir=TEMP_DIR, load_epoch=False):
        print("Loading models.")
        cwd = os.getcwd() + '/'
        tf = tarfile.open(model_file)

        # extract directly to current model directory
        for member in tf.getmembers():
            if member.isreg():
                member.name = os.path.basename(member.name)
                tf.extract(member, path=active_dir)

        info = open(active_dir + INF_FILE, 'r')
        lns = info.readlines()
        hidden_size, e_layers, n_words, using_lstm = [int(i) for i in lns[:4]]
        if load_epoch:
            epoch = int(lns[-1])

        i2w = open(cwd + TEMP_DIR + I2W_FILE, 'rb')
        w2i = open(cwd + TEMP_DIR + W2I_FILE, 'rb')
        w2c = open(cwd + TEMP_DIR + W2C_FILE, 'rb')
        i2w_dict = pickle.load(i2w)
        w2i_dict = pickle.load(w2i)
        w2c_dict = pickle.load(w2c)
        wd = WordDict(dicts=[w2i_dict, i2w_dict, w2c_dict, n_words])
        w2i.close()
        i2w.close()
        w2c.close()

        self.base_rnn = nn.LSTM if using_lstm == 1 else nn.GRU
        self.wd = wd
        self.encoder = EncoderRNN(wd.n_words, hidden_size, n_layers=e_layers,
                                  base_rnn=self.base_rnn)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(int(hidden_size * 8), int(hidden_size)),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(int(hidden_size), 3),
            torch.nn.Softmax(dim=1))

        if not USE_CUDA:
            self.encoder.load_state_dict(
                torch.load(cwd + TEMP_DIR + ENC_1_FILE,
                           map_location=lambda storage, loc: storage))
            self.mlp.load_state_dict(
                torch.load(cwd + TEMP_DIR + MLP_FILE,
                           map_location=lambda storage, loc: storage))
        else:
            self.encoder.load_state_dict(torch.load(cwd + TEMP_DIR + ENC_1_FILE))
            self.mlp.load_state_dict(torch.load(cwd + TEMP_DIR + MLP_FILE))
            self.encoder = self.encoder.cuda()
            self.mlp = self.mlp.cuda()

        self.encoder.eval()
        self.mlp.eval()

        self.parameter_list = [self.encoder.parameters(), self.mlp.parameters()]

        tf.close()
        print("Loaded models.")

        if load_epoch:
            return self, epoch
        else:
            return self

    def torch_batch_from_numpy_batch_without_label(self, batch):
        batch = list(batch)
        variable_indices = [0, 2]
        for i in variable_indices:
            var = Variable(torch.from_numpy(batch[i]))
            if USE_CUDA:
                var = var.cuda()
            batch[i] = var
        return batch

    def predict(self, data):
        batch_size = 1
        batches = nli_batches_without_label(batch_size, data)
        predictions = []
        for batch in tqdm(batches):
            batch = self.torch_batch_from_numpy_batch_without_label(batch)
            outputs = self(batch, inference=True)
            pred = outputs.max(1)[1]
            predictions.append(pred)
        return torch.cat(predictions)

    def add_new_vocabulary(self, genre):
        old_vocab_size = self.wd.n_words
        print("Previous vocabulary size: " + str(old_vocab_size))

        train_set = nli_preprocessor.get_multinli_text_hyp_labels(genre=genre)
        # nli_preprocessor.get_multinli_training_set(max_lines=args.max_lines)
        matched_val_set = nli_preprocessor.get_multinli_matched_val_set()  # genre_val_set(genre)

        unmerged_sentences = []
        for data in [train_set, matched_val_set]:
            unmerged_sentences.extend([data["text"], data["hyp"]])
        all_sentences = list(itertools.chain.from_iterable(unmerged_sentences))

        for line in all_sentences:
            self.wd.add_sentence(line)
        print("New vocabulary size: " + str(self.wd.n_words))

        print("Extending the Embedding layer with new vocabulary...")
        num_new_words = self.wd.n_words - old_vocab_size
        self.encoder.extend_embedding_layer(self.wd.word2index, num_new_words)
        self.new_vocab_size = num_new_words

    def freeze_source_params(self):
        # Freeze RNN parameters and all M_k/M_v parameters except those of target_4,
        # then print whatever is still trainable.
        for name, param in self.named_parameters():
            if "rnn" in name:
                param.requires_grad = False
            if ("M_k" in name or "M_v" in name) and "target_4" not in name:
                param.requires_grad = False
        for name, param in self.named_parameters():
            if param.requires_grad is True:
                print(name)
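
# Standalone shape check for the feature construction used in BiLSTMModel.forward() above:
# each sentence encoding is the concatenation of the last forward and first backward states
# (2 * hidden_size per sentence), and the classifier input [u, v, u*v, |u-v|] is therefore
# [B, 8 * hidden_size], which matches nn.Linear(int(hidden_size * 8), int(hidden_size)).
import torch

batch_size, hidden_size = 4, 64
u = torch.randn(batch_size, 2 * hidden_size)   # dummy premise ("text") encoding
v = torch.randn(batch_size, 2 * hidden_size)   # dummy hypothesis encoding

features = torch.cat([u, v, u * v, torch.abs(u - v)], dim=1)
print(features.shape)  # torch.Size([4, 512]) == [batch_size, 8 * hidden_size]
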
def pretrain():
    # Parse command line arguments
    argparser = argparse.ArgumentParser()

    # train
    argparser.add_argument('--mode', '-m', choices=('pretrain', 'adversarial', 'inference'),
                           type=str, required=True)
    argparser.add_argument('--batch_size', '-b', type=int, default=168)
    argparser.add_argument('--num_epoch', '-e', type=int, default=10)
    argparser.add_argument('--print_every', type=int, default=100)
    argparser.add_argument('--use_cuda', default=True)
    argparser.add_argument('--g_learning_rate', '-glr', type=float, default=0.001)
    argparser.add_argument('--d_learning_rate', '-dlr', type=float, default=0.001)

    # resume
    argparser.add_argument('--resume', action='store_true', dest='resume')
    argparser.add_argument('--resume_dir', type=str)
    argparser.add_argument('--resume_epoch', type=int)

    # save
    argparser.add_argument('--exp_dir', type=str, required=True)

    # model
    argparser.add_argument('--emb_dim', type=int, default=128)
    argparser.add_argument('--hidden_dim', type=int, default=256)
    argparser.add_argument('--dropout_rate', '-drop', type=float, default=0.5)
    argparser.add_argument('--n_layers', type=int, default=1)
    argparser.add_argument('--response_max_len', type=int, default=15)

    # data
    argparser.add_argument('--train_query_file', '-tqf', type=str, required=True)
    argparser.add_argument('--train_response_file', '-trf', type=str, required=True)
    argparser.add_argument('--valid_query_file', '-vqf', type=str, required=True)
    argparser.add_argument('--valid_response_file', '-vrf', type=str, required=True)
    argparser.add_argument('--vocab_file', '-vf', type=str, default='')
    argparser.add_argument('--max_vocab_size', '-mv', type=int, default=100000)

    args = argparser.parse_args()

    # set up the output directory
    exp_dirname = os.path.join(args.exp_dir, args.mode,
                               time.strftime("%Y-%m-%d-%H-%M-%S"))
    os.makedirs(exp_dirname)

    # set up the logger
    tqdm_logging.config(logger, os.path.join(exp_dirname, 'train.log'),
                        mode='w', silent=False, debug=True)

    if not args.vocab_file:
        logger.info("no vocabulary file")
        build_vocab(args.train_query_file, args.train_response_file, seperated=True)
        sys.exit()
    else:
        vocab, rev_vocab = load_vocab(args.vocab_file, max_vocab=args.max_vocab_size)
    vocab_size = len(vocab)

    word_embeddings = nn.Embedding(vocab_size, args.emb_dim, padding_idx=SYM_PAD)
    E = EncoderRNN(vocab_size, args.emb_dim, args.hidden_dim, args.n_layers,
                   args.dropout_rate, bidirectional=True, variable_lengths=True)
    G = Generator(vocab_size, args.response_max_len, args.emb_dim,
                  2 * args.hidden_dim, args.n_layers, dropout_p=args.dropout_rate)

    if args.use_cuda:
        word_embeddings.cuda()
        E.cuda()
        G.cuda()

    loss_func = nn.NLLLoss(size_average=False)
    params = list(word_embeddings.parameters()) + list(E.parameters()) + list(G.parameters())
    opt = torch.optim.Adam(params, lr=args.g_learning_rate)

    logger.info('----------------------------------')
    logger.info('Pre-train a neural conversation model')
    logger.info('----------------------------------')
    logger.info('Args:')
    logger.info(str(args))
    logger.info('Vocabulary from ' + args.vocab_file)
    logger.info('vocabulary size: %d' % vocab_size)
    logger.info('Loading text data from ' + args.train_query_file + ' and ' +
                args.train_response_file)

    # resume training from another experiment
    if args.resume:
        assert args.resume_epoch >= 0, 'If resume training, please assign resume_epoch'
        reload_model(args.resume_dir, args.resume_epoch, word_embeddings, E, G)
        start_epoch = args.resume_epoch + 1
    else:
        start_epoch = 0

    # dump args
    with open(os.path.join(exp_dirname, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)
    for e in range(start_epoch, args.num_epoch):
        logger.info('---------------------training--------------------------')
        train_data_generator = batcher(args.batch_size, args.train_query_file,
                                       args.train_response_file)
        logger.info("Epoch: %d/%d" % (e, args.num_epoch))
        step = 0
        total_loss = 0.0
        total_valid_char = []
        cur_time = time.time()
        while True:
            try:
                post_sentences, response_sentences = next(train_data_generator)
            except StopIteration:
                # save model
                save_model(exp_dirname, e, word_embeddings, E, G)
                # evaluation
                eval(args.valid_query_file, args.valid_response_file, args.batch_size,
                     word_embeddings, E, G, loss_func, args.use_cuda, vocab,
                     args.response_max_len)
                break

            post_ids = [sentence2id(sent, vocab) for sent in post_sentences]
            response_ids = [sentence2id(sent, vocab) for sent in response_sentences]
            posts_var, posts_length = padding_inputs(post_ids, None)
            responses_var, responses_length = padding_inputs(response_ids,
                                                             args.response_max_len)

            # sort by post length
            posts_length, perms_idx = posts_length.sort(0, descending=True)
            posts_var = posts_var[perms_idx]
            responses_var = responses_var[perms_idx]
            responses_length = responses_length[perms_idx]

            # append EOS to the end of each sentence
            references_var = torch.cat([
                responses_var,
                Variable(torch.zeros(responses_var.size(0), 1).long(),
                         requires_grad=False)
            ], dim=1)
            for idx, length in enumerate(responses_length):
                references_var[idx, length] = SYM_EOS

            # show case
            #for p, r, ref in zip(posts_var.data.numpy()[:10], responses_var.data.numpy()[:10], references_var.data.numpy()[:10]):
            #    print ''.join(id2sentence(p, rev_vocab))
            #    print ''.join(id2sentence(r, rev_vocab))
            #    print ''.join(id2sentence(ref, rev_vocab))
            #    print

            if args.use_cuda:
                posts_var = posts_var.cuda()
                responses_var = responses_var.cuda()
                references_var = references_var.cuda()

            embedded_post = word_embeddings(posts_var)
            embedded_response = word_embeddings(responses_var)

            _, dec_init_state = E(embedded_post, input_lengths=posts_length.numpy())
            log_softmax_outputs = G.supervise(
                embedded_response, dec_init_state, word_embeddings)  # [B, T, vocab_size]

            outputs = log_softmax_outputs.view(-1, vocab_size)
            mask_pos = mask(references_var).view(-1).unsqueeze(-1)
            masked_output = outputs * (mask_pos.expand_as(outputs))
            loss = loss_func(masked_output, references_var.view(-1)) / (posts_var.size(0))

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss * (posts_var.size(0))
            total_valid_char.append(mask_pos)

            if step % args.print_every == 0:
                total_loss_val = total_loss.cpu().data.numpy()[0]
                total_valid_char_val = torch.sum(
                    torch.cat(total_valid_char, dim=1)).cpu().data.numpy()[0]
                logger.info('Step %5d: (per word) training perplexity %.2f (%.1f iters/sec)' % (
                    step, math.exp(total_loss_val / total_valid_char_val),
                    args.print_every / (time.time() - cur_time)))
                total_loss = 0.0
                total_valid_char = []
                total_case_num = 0
                cur_time = time.time()

            step = step + 1
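
# Standalone sketch of the masked-NLL / per-word perplexity bookkeeping used in the loop
# above: sum the negative log-likelihood over non-padding positions only, then
# perplexity = exp(total_nll / num_valid_tokens). This uses reduction='none' instead of the
# mask-multiplication trick, but the quantity is the same in spirit; all values are dummies.
import math
import torch
import torch.nn.functional as F

vocab_size, pad_id = 10, 0
logits = torch.randn(6, vocab_size)                       # dummy scores for 6 flattened positions
targets = torch.tensor([4, 2, 7, pad_id, pad_id, 3])

log_probs = F.log_softmax(logits, dim=-1)
nll = F.nll_loss(log_probs, targets, reduction='none')    # per-position NLL
valid = (targets != pad_id).float()                       # 1 for real tokens, 0 for padding

total_nll = (nll * valid).sum()
perplexity = math.exp(total_nll.item() / valid.sum().item())
print('per-word perplexity: %.2f' % perplexity)
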
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))
            evaluateRandomly(encoder, decoder, 1)

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)


lang, pairs = prepareData("filtered_dataset.jsonl")
filtered_embeddings = filter_embedding(lang, "glove.6B.100d.txt")
print(random.choice(pairs))

hidden_size = 256
encoder1 = EncoderRNN(lang.n_words, hidden_size, filtered_embeddings)
attn_decoder1 = AttnDecoderRNN(hidden_size, lang.n_words, dropout_p=0.1,
                               embeddings=filtered_embeddings)
print("parameters ", get_n_params(encoder1) + get_n_params(attn_decoder1))

encoder1.cuda()
attn_decoder1.cuda()

trainIters(encoder1, attn_decoder1, 7, print_every=100)
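
# Minimal sketch (an assumption, not this repo's filter_embedding/load_glove_embeddings) of
# building an embedding matrix restricted to a known vocabulary from a plain-text GloVe file,
# where each line is "<word> <v_1> ... <v_dim>". Words missing from GloVe keep their random
# initialization. The usage line assumes the Lang object exposes a word2index dict.
import numpy as np
import torch

def build_filtered_embeddings(word2index, glove_path, dim=100):
    weights = np.random.normal(scale=0.1, size=(len(word2index), dim)).astype('float32')
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vec = parts[0], parts[1:]
            if word in word2index and len(vec) == dim:
                weights[word2index[word]] = np.asarray(vec, dtype='float32')
    return torch.from_numpy(weights)   # can be passed to nn.Embedding.from_pretrained(...)

# usage sketch: embeddings = build_filtered_embeddings(lang.word2index, "glove.6B.100d.txt")
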