def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of source sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of source sentences, from the test set.
    @param beam_size (int): beam_size (# of candidates to hold at every step)
    @param max_decoding_time_step (int): maximum sentence length that beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): beam_size hypotheses for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []  # list of candidate translations for all sentences
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size,
                                             max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)  # add all candidates for this sentence

    if was_training:
        model.train(was_training)

    return hypotheses

def beam_search(model: NMT, test_iter, beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for batches of src-language sentences.
    @param model (NMT): NMT Model
    @param test_iter: iterator over test batches; each batch carries the padded source
        sentences and their lengths (the field names below assume a torchtext-style
        batch with .src and .trg attributes)
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for _, data in enumerate(test_iter):
            (src_sents, src_lengths), _ = data.src, data.trg
            example_hyps = model.beam_search(src_sents, src_lengths,
                                             beam_size=beam_size,
                                             max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training:
        model.train(was_training)

    return hypotheses

def beam_search(model: NMT, test_iterator: BucketIterator, beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_iterator (BucketIterator): iterator over batches of source sentences from the test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        # for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
        for i, batch in enumerate(test_iterator):
            src_sents, src_sents_lens = batch.src
            src_sents = src_sents.permute(1, 0)  # (seq_len, batch) -> (batch, seq_len)
            # decode one sentence of the batch at a time
            for j in range(len(src_sents_lens)):
                src_sent = src_sents[j]
                example_hyps = model.beam_search(src_sent, src_sents_lens[j],
                                                 beam_size=beam_size,
                                                 max_decoding_time_step=max_decoding_time_step)
                hypotheses.append(example_hyps)

    if was_training:
        model.train(was_training)

    return hypotheses

def beam_search2(model1: NMT, model2: DPPNMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int, test_data_tgt) -> None:
    """ Run beam search with both models and print the cases where the DPP model wins.
    @param model1 (NMT): baseline NMT model
    @param model2 (DPPNMT): DPP-reranked NMT model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @param test_data_tgt: gold-standard target sentences, with <s>...</s> markers
    """
    model1.eval()
    model2.eval()

    i = 0
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            hyp1 = model1.beam_search(src_sent, beam_size=beam_size,
                                      max_decoding_time_step=max_decoding_time_step)
            hyp2 = model2.beam_search(src_sent, beam_size=beam_size,
                                      max_decoding_time_step=max_decoding_time_step)

            ref = test_data_tgt[i][1:-1]  # strip <s> and </s>
            # print(ref, hyp1[0].value)
            bleu_topk = sentence_bleu(ref, hyp1[0].value)
            bleu_dpp = sentence_bleu(ref, hyp2[0].value)
            # print(bleu_topk, bleu_dpp)
            if bleu_dpp > bleu_topk:
                print(i)
                print(" ".join(hyp1[0].value))
                print(" ".join(hyp2[0].value))
                print(" ".join(ref))
            i += 1

def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size,
                                             max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training:
        model.train(was_training)

    return hypotheses

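# Usage sketch for the beam_search above (not from the original repo): the
# paths 'model.bin' and 'test.es' are hypothetical; NMT.load and read_corpus
# are the repo helpers used by the decode()/test() snippets later in this file.
def _example_run_beam_search():
    test_data_src = read_corpus('test.es', source='src')  # List[List[str]] of tokens
    model = NMT.load('model.bin')
    hypotheses = beam_search(model, test_data_src,
                             beam_size=5, max_decoding_time_step=70)
    # each entry holds beam_size Hypothesis tuples, best first
    return [' '.join(hyps[0].value) for hyps in hypotheses]
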
def load(model_path: str):
    """ Load the model from a file.
    @param model_path (str): path to model
    """
    params = torch.load(model_path, map_location=lambda storage, loc: storage)
    args = params['args']
    nmt_model = NMT(vocab=params['vocab'], **args)
    nmt_model.load_state_dict(params['state_dict'])
    model = DPPNMT(nmt_model=nmt_model)

    return model

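# Usage sketch (an assumption, not repo code): wrap a trained NMT checkpoint in
# the DPP reranker and decode one tokenized sentence. 'model.bin' is a
# hypothetical path; the beam_search signature mirrors the NMT snippets above.
def _example_load_and_decode():
    model = load('model.bin')
    model.eval()
    with torch.no_grad():
        hyps = model.beam_search(['hola', 'mundo'],
                                 beam_size=5, max_decoding_time_step=70)
    return ' '.join(hyps[0].value)  # best hypothesis as a string
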
def sample(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        opt = params['args']
        state_dict = params['state_dict']

        model = NMT(opt, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    print('begin sampling')

    check_every = 10
    train_iter = cum_samples = 0
    train_time = time.time()
    for src_sents, tgt_sents in data_iter(train_data, batch_size=args.batch_size):
        train_iter += 1
        samples = model.sample(src_sents, sample_size=args.sample_size, to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        if train_iter % check_every == 0:
            elapsed = time.time() - train_time
            print('sampling speed: %d/s' % (cum_samples / elapsed))
            cum_samples = 0
            train_time = time.time()

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)

def test(args):
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = list(zip(test_data_src, test_data_tgt))

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    hypotheses = decode(model, test_data)
    top_hypotheses = [hyps[0] for hyps in hypotheses]

    bleu_score = get_bleu([tgt for src, tgt in test_data], top_hypotheses)
    word_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses, 'word_acc')
    sent_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses, 'sent_acc')
    print('Corpus Level BLEU: %f, word level acc: %f, sentence level acc: %f' %
          (bleu_score, word_acc, sent_acc), file=sys.stderr)

    if args.save_to_file:
        print('save decoding results to %s' % args.save_to_file)
        with open(args.save_to_file, 'w') as f:
            for hyps in hypotheses:
                f.write(' '.join(hyps[0][1:-1]) + '\n')

        if args.save_nbest:
            nbest_file = args.save_to_file + '.nbest'
            print('save nbest decoding results to %s' % nbest_file)
            with open(nbest_file, 'w') as f:
                for src_sent, tgt_sent, hyps in zip(test_data_src, test_data_tgt, hypotheses):
                    print('Source: %s' % ' '.join(src_sent), file=f)
                    print('Target: %s' % ' '.join(tgt_sent), file=f)
                    print('Hypotheses:', file=f)
                    for i, hyp in enumerate(hyps, 1):
                        print('[%d] %s' % (i, ' '.join(hyp)), file=f)
                    print('*' * 30, file=f)

def test(args):
    data_path = args['--data_path']
    model_path = args['--model_path']
    beam_size = int(args['--beam_size'])
    max_len = int(args['--max_len'])

    vocab = Vocab.load()
    source = load_corpus(data_path + '/test.es', 'es', limit=100)
    reference_tgt = load_corpus(data_path + '/test.en', 'en', limit=100)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = NMT.load(model_path, vocab)
    model.to(device)
    model.eval()

    translate_tgt = []
    with torch.no_grad():
        for src in tqdm(source, desc='translate '):
            src, _ = vocab.src.to_tensor([src], 'src')
            tgt = model.translate(src.to(device), beam_size, max_len)
            translate_tgt.append(tgt)
    translate_tgt = vocab.tgt.to_sentences(translate_tgt)

    # strip <s> and </s> markers before scoring
    if reference_tgt[0][0] == '<s>':
        reference_tgt = [sent[1:-1] for sent in reference_tgt]
    bleu_score = nltk.translate.bleu_score.corpus_bleu(
        [[refer] for refer in reference_tgt], translate_tgt)
    print("corpus bleu score on test data is %.2f" % (bleu_score * 100))

    # write translated sentences to file
    with open(data_path + '/result.txt', 'w') as f:
        detokenizer = MosesDetokenizer('en')
        for sent in translate_tgt:
            sent = detokenizer(sent)
            f.write(sent + '\n')
        detokenizer.close()

def test(args):
    print("load model from {}".format(args["MODEL_PATH"]), file=sys.stderr)
    model = NMT.load(args["MODEL_PATH"])
    if args["--cuda"]:
        model = model.to(torch.device("cuda:0"))

    binary = int(args["--num-classes"]) == 2
    test_data = load_test_data(binary=binary)
    batch_size = int(args["--batch-size"])

    cum_correct = 0
    cum_score = 0
    with torch.no_grad():
        for sentences, sentiments in batch_iter(test_data, batch_size):
            correct = model.compute_accuracy(sentences, sentiments) * len(sentences)
            cum_correct += correct
            score = -model(sentences, sentiments).sum()
            cum_score += score

    print("test dataset size: %d" % len(test_data))
    print("accuracy: %f" % (cum_correct / len(test_data)))
    print("loss: %f" % (cum_score / len(test_data)))

def multi_parameter_tuning(args):
    lrs = [1e-2, 1e-3, 5e-3, 1e-4, 5e-4]
    hidden_sizes = [128, 256, 512]
    lr_decays = [0.9, 0.7, 0.5]
    iter = 0
    valid_metric = {}  # dev perplexity for each trained model

    dev_data_src = read_corpus(args['dev_source'], source='src')
    dev_data_tgt = read_corpus(args['dev_target'], source='tgt')
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    for i in lrs:
        for j in hidden_sizes:
            for k in lr_decays:
                print('run %d =================================================' % iter)
                arg_test = args
                arg_test['lr'], arg_test['hidden_size'], arg_test['lr_decay'] = i, j, k
                arg_test['save_to'] = 'model_' + 'lr_' + str(i) + 'hd_size_' + str(j) + 'lr_dys_' + str(k) + '.bin'
                run.train(arg_test)

                model = NMT.load(arg_test['save_to'])
                dev_ppl = run.evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                valid_metric[arg_test['save_to']] = dev_ppl
                print(arg_test['save_to'],
                      ' validation: iter %d, dev. ppl %f' % (iter, dev_ppl),
                      file=sys.stderr)
                iter += 1

    model = min(valid_metric, key=valid_metric.get)
    print('best_model is %s ,ppl is %f' % (model, valid_metric[model]))

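# Design note (a sketch, not repo code): the same sweep as multi_parameter_tuning
# written with itertools.product, which keeps the hyperparameter grid in one
# place; run.train, NMT.load, and run.evaluate_ppl are the helpers used above,
# and dev_data is the list of (src, tgt) pairs built there.
def _example_grid_search(args, dev_data):
    import itertools

    valid_metric = {}
    grid = itertools.product([1e-2, 1e-3, 5e-3, 1e-4, 5e-4],  # lr
                             [128, 256, 512],                  # hidden_size
                             [0.9, 0.7, 0.5])                  # lr_decay
    for lr, hidden_size, lr_decay in grid:
        args['lr'], args['hidden_size'], args['lr_decay'] = lr, hidden_size, lr_decay
        args['save_to'] = 'model_lr_%shd_size_%slr_dys_%s.bin' % (lr, hidden_size, lr_decay)
        run.train(args)
        model = NMT.load(args['save_to'])
        valid_metric[args['save_to']] = run.evaluate_ppl(model, dev_data, batch_size=128)

    return min(valid_metric, key=valid_metric.get)  # checkpoint with lowest dev ppl
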
def test():
    print(f"load test sentences from [{config.test_path_src}], [{config.test_path_tar}]",
          file=sys.stderr)
    test_data = Data(config.test_path_src, config.test_path_tar)
    test_data_loader = DataLoader(dataset=test_data,
                                  batch_size=config.test_batch_size,
                                  shuffle=True,
                                  collate_fn=utils.get_batch)

    model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_attention/result/02.08_window35_6_8.810715463205241_checkpoint.pth"
    print(f"load model from {model_path}", file=sys.stderr)
    model = NMT.load(model_path)
    if config.cuda:
        model = model.to(torch.device("cuda:0"))
        # model = model.cuda()
        # model = nn.parallel.DistributedDataParallel(model)

    predict, test_data_tar = beam_search(model, test_data, test_data_loader, 15, config.max_tar_length)

    # map target and predicted token ids back to words
    for i in range(len(test_data_tar)):
        for j in range(len(test_data_tar[i])):
            test_data_tar[i][j] = model.text.tar.id2word[test_data_tar[i][j]]
    for i in range(len(predict)):
        for j in range(len(predict[i])):
            predict[i][j] = model.text.tar.id2word[predict[i][j]]

    best_predict = []
    for i in tqdm(range(len(test_data_tar)), desc="find best predict"):
        best_predict.append(predict[i][compare_bleu(predict[i], test_data_tar[i])])

    bleu = corpus_bleu([[ref[1:-1]] for ref in test_data_tar], [pre for pre in predict])
    print(f"BLEU is {bleu*100}", file=sys.stderr)

def main():
    """ Main func.
    """
    # args = '1d'

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    # assert(torch.__version__ == "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    train_data_src = read_corpus('/Users/Pannu/Desktop/Python/AI/NLP/CS224N-2019-master/Assignment/a4/sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = read_corpus('/Users/Pannu/Desktop/Python/AI/NLP/CS224N-2019-master/Assignment/a4/sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    # grab the first shuffled batch
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
        break

    vocab = Vocab.load('/Users/Pannu/Desktop/Python/AI/NLP/CS224N-2019-master/Assignment/a4/sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

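# compute_corpus_level_bleu_score is called above but not defined in this
# section; a minimal sketch using NLTK's corpus_bleu, assuming references may
# carry <s>...</s> markers and each hypothesis is a Hypothesis with a .value
# token list (matching the decode() snippets here).
def _example_corpus_level_bleu(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    from nltk.translate.bleu_score import corpus_bleu

    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]  # strip sentence markers
    return corpus_bleu([[ref] for ref in references],
                       [hyp.value for hyp in hypotheses])
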
def test():
    print(f"load test sentences from [{config.test_path_src}], [{config.test_path_tar}]",
          file=sys.stderr)
    # test_data_src, test_data_tar = utils.read_corpus(config.test_path)
    test_data = Data(config.test_path_src, config.test_path_tar)
    test_data_loader = DataLoader(dataset=test_data,
                                  batch_size=config.test_batch_size,
                                  shuffle=True,
                                  collate_fn=utils.get_batch)

    model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_transformer/small/result/02.10_145_1.037565227213504_checkpoint.pth"
    model = NMT.load(model_path)
    if config.cuda:
        model = model.to(torch.device("cuda:0"))

    predict, test_data_tar = beam_search(model, test_data, test_data_loader, 15, config.max_tar_length)

    # map target and predicted token ids back to words
    for i in range(len(test_data_tar)):
        for j in range(len(test_data_tar[i])):
            test_data_tar[i][j] = model.text.tar.id2word[test_data_tar[i][j]]
    for i in range(len(predict)):
        for j in range(len(predict[i])):
            predict[i][j] = model.text.tar.id2word[predict[i][j]]

    bleu = corpus_bleu([[tar[1:-1]] for tar in test_data_tar], [pre for pre in predict])
    print(f"Corpus BLEU: {bleu * 100}", file=sys.stderr)

def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

    if args['--plot-attention']:
        plt.rcParams['font.family'] = ['sans-serif']
        plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'Arial', 'sans-serif']
        from matplotlib.font_manager import _rebuild
        _rebuild()

        output_dir = os.path.dirname(args['OUTPUT_FILE'])
        for idx, (src_sent, hyps) in tqdm(enumerate(zip(test_data_src, hypotheses)),
                                          desc='Plot attention', file=sys.stdout):
            top_hyp = hyps[0]
            hyp_sent = top_hyp.value
            hyp_att = top_hyp.attention
            filename = output_dir + '/att_%d.jpg' % idx
            plot_attention(hyp_att, src_sent, hyp_sent, filename)

def main():
    """ Main func.
    """
    # args = docopt(__doc__)
    args = {
        '1d': False,
        '1e': False,
        '1f': True,
        'overwrite_output_for_sanity_check': False
    }
    # print(args)

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    assert (torch.__version__ >= "1.0.0"), \
        "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    # grab the first shuffled batch
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
        break

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(embed_size=EMBED_SIZE,
                hidden_size=HIDDEN_SIZE,
                dropout_rate=DROPOUT_RATE,
                vocab=vocab)

    if args['1d']:
        question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1e']:
        question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1f']:
        question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['overwrite_output_for_sanity_check']:
        generate_outputs(model, src_sents, tgt_sents, vocab)
    else:
        raise RuntimeError('invalid run mode')

def main():
    """ Main func.
    """
    args = docopt(__doc__)

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    assert (torch.__version__ >= "1.0.0"), \
        "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

    char_vocab = DummyVocab()

    # Initialize CharDecoder
    decoder = CharDecoder(
        hidden_size=HIDDEN_SIZE,
        char_embedding_size=EMBED_SIZE,
        target_vocab=char_vocab)

    if args['1a']:
        question_1a_sanity_check()
    elif args['1b']:
        question_1b_sanity_check()
    elif args['1c']:
        question_1c_sanity_check()
    elif args['1d']:
        question_1d_sanity_check()
    elif args['1e']:
        question_1e_sanity_check()
    elif args['1f']:
        question_1f_sanity_check(model)
    elif args['2a']:
        question_2a_sanity_check(decoder, char_vocab)
    elif args['2b']:
        question_2b_sanity_check(decoder, char_vocab)
    elif args['2c']:
        question_2c_sanity_check(decoder)
    elif args['2c2']:
        question_2c2_sanity_check(decoder)
    elif args['2d']:
        question_2d_sanity_check(decoder)
    else:
        raise RuntimeError('invalid run mode')

def init_training(args):
    from functools import partial
    import pickle

    # vocab files were pickled under Python 2; force latin1 decoding on load
    pickle.load = partial(pickle.load, encoding="latin1")
    pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1")
    # model = torch.load(model_file, map_location=lambda storage, loc: storage, pickle_module=pickle)
    vocab = torch.load(args.vocab, map_location=lambda storage, loc: storage, pickle_module=pickle)

    model = NMT(args, vocab)
    model.train()

    if args.uniform_init:
        print('uniformly initialize parameters [-%f, +%f]' % (args.uniform_init, args.uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-args.uniform_init, args.uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0  # do not count <pad> tokens in the loss
    nll_loss = nn.NLLLoss(weight=vocab_mask, reduction='sum')
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask, reduction='sum')

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()
        nll_loss = nll_loss.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    return vocab, model, optimizer, nll_loss, cross_entropy_loss

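# Usage sketch (an assumption, not repo code): one optimization step with the
# objects init_training returns. src_sents/tgt_sents are token-list batches as
# produced by the data_iter helper used elsewhere in this file; args.clip_grad
# is a hypothetical attribute mirroring the repo's other clip_grad options.
def _example_train_step(args, src_sents, tgt_sents):
    vocab, model, optimizer, nll_loss, cross_entropy_loss = init_training(args)

    optimizer.zero_grad()
    # assumes model(src, tgt) returns per-example losses, as in the test() snippet above
    loss = model(src_sents, tgt_sents).sum() / len(src_sents)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
    optimizer.step()
    return loss.item()
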
def main():
    """ Main func.
    """
    # args = docopt(__doc__)
    char_emb_size = 50

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    assert (torch.__version__ >= "1.0.0"), \
        "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # cnn layer
    cnn = CNN(char_emb_size, EMBED_SIZE, 5)

    # Create NMT Model
    model = NMT(word_embed_size=EMBED_SIZE,
                hidden_size=HIDDEN_SIZE,
                dropout_rate=DROPOUT_RATE,
                vocab=vocab)

    # highway layer
    hiway_layer = Highway(EMBED_SIZE)

    char_vocab = DummyVocab()

    # Initialize CharDecoder
    decoder = CharDecoder(hidden_size=HIDDEN_SIZE,
                          char_embedding_size=EMBED_SIZE,
                          target_vocab=char_vocab)

    # docopt dispatch disabled; only the 2c check runs
    # if args['1e']:
    #     question_1e_sanity_check()
    # elif args['1f']:
    #     question_1f_sanity_check(hiway_layer)
    # elif args['1g']:
    #     question_1g_sanity_check(cnn)
    # elif args['1h']:
    #     question_1h_sanity_check(model)
    # elif args['2a']:
    #     question_2a_sanity_check(decoder, char_vocab)
    # elif args['2b']:
    #     question_2b_sanity_check(decoder)
    # elif args['2c']:
    question_2c_sanity_check(decoder)

def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If gold-standard sentences are given, the function also computes the average
    character accuracy (CA), the first-candidate hit rate (HRF), and the top-k
    candidate hit rate (kHRF).
    @param args (Dict): args from cmd line
    """
    if args['SENTENCE']:
        ps = PinyinSplit()
        test_data_src = [ps.split(args['SENTENCE'])]
    if args['TEST_SOURCE_FILE']:
        print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
        test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    beam_size = int(args['--beam-size'])
    hypotheses = beam_search(model, test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]  # best candidate for each sentence
        avg_ca, hrf = evaluate_ca_hrf(test_data_tgt, top_hypotheses)
        khrf = evaluate_khrf(test_data_tgt, hypotheses)
        # bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        # print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)
        print('avg_ca: {}'.format(avg_ca), file=sys.stderr)
        print('hrf: {}'.format(hrf), file=sys.stderr)
        print('{}hrf: {}'.format(beam_size, khrf), file=sys.stderr)

    if args['OUTPUT_FILE']:
        with open(args['OUTPUT_FILE'], 'w') as f:
            for src_sent, hyps in zip(test_data_src, hypotheses):
                top_hyp = hyps[0]
                hyp_sent = ''.join(top_hyp.value)
                f.write(hyp_sent + '\n')

    if args['SENTENCE']:
        print('source sentence: {}'.format(args['SENTENCE']))
        for i in range(len(hypotheses[0])):
            result = ''.join(hypotheses[0][i].value)
            print('top_{}_hypotheses_{}: {}'.format(beam_size, i + 1, result))

def main():
    """ Main func
    """
    args = docopt(__doc__)

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    assert (torch.__version__ == "1.1.0"), \
        "Please update your installation of PyTorch. You have {} and you should have version 1.1.0".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(embed_size=EMBED_SIZE,
                hidden_size=HIDDEN_SIZE,
                dropout_rate=DROPOUT_RATE,
                vocab=vocab)

    char_vocab = DummyVocab()

    # Initialize CharDecoder
    decoder = CharDecoder(hidden_size=HIDDEN_SIZE,
                          char_embedding_size=EMBED_SIZE,
                          target_vocab=char_vocab)

    # initialize highway
    highway_model = Highway(embed_size_word=EMBED_SIZE_WORD, dropout_rate=DROPOUT_RATE)

    # initialize cnn
    cnn_model = CNN(EMBED_SIZE, MAX_WORD_LEN, EMBED_SIZE_WORD, 5)

    if args['hw']:
        highway_sanity_check(highway_model)
    elif args['generate_data']:
        generate_highway_data()
    elif args['gen_cnn_data']:
        generate_cnn_data()
    elif args['cnn']:
        cnn_sanity_check(cnn_model)
    else:
        raise RuntimeError('invalid run mode')

def question_1i_sanity_check():
    """ Sanity check for nmt_model.py
        basic shape check
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1i: NMT")
    print("-" * 80)
    src_vocab_entry = VocabEntry()
    tgt_vocab_entry = VocabEntry()
    dummy_vocab = Vocab(src_vocab_entry, tgt_vocab_entry)
    word_embed_size = 5
    hidden_size = 10
    nmt = NMT(word_embed_size, hidden_size, dummy_vocab)
    # each sentence must be a list of words, not a single string
    source = [["Hello", "my", "friend"], ["How", "are", "you"]]
    target = [["Bonjour", "mon", "ami"], ["Comment", "vas", "tu"]]
    output = nmt.forward(source, target)
    print(output)
    # output_expected_size = [sentence_length, BATCH_SIZE, EMBED_SIZE]
    # assert(list(output.size()) == output_expected_size), "output shape is incorrect: it should be:\n {} but is:\n{}".format(output_expected_size, list(output.size()))
    print("Sanity Check Passed for Question 1i: NMT!")
    print("-" * 80)

def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

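# The train() above breaks off after the device is chosen; a minimal sketch of
# the loop that typically follows (assumptions: batch_iter yields (src_sents,
# tgt_sents) batches and model(...) returns per-example losses, as in the other
# snippets in this file; the learning rate is a placeholder for float(args['--lr'])).
def _example_train_loop(model, train_data, device, train_batch_size, clip_grad):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # placeholder lr

    for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
        optimizer.zero_grad()
        example_losses = model(src_sents, tgt_sents)
        loss = example_losses.sum() / len(src_sents)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
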
def decode(test_src_path, test_tgt_path=None, model_path='model.bin', beam_size=5,
           max_decoding=70, device='cpu', output_path='output.txt'):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    Params:
        test_src_path (str): Path to the test source file
        test_tgt_path (str): Path to the test target file (optional). Default=None
        model_path (str): Path to the model file generated after training. Default='model.bin'
        beam_size (int): beam size (# of hypotheses to hold for a translation at every step)
        max_decoding (int): maximum sentence length that beam search can produce. Default=70
        device (str): device to perform the calc on. Default='cpu'
        output_path (str): Path for the output file to write the results of the translation. Default='output.txt'
    """
    print(f'load test source sentences from [{test_src_path}]', file=sys.stderr)
    test_data_src = read_corpus(test_src_path, corpus_type='src')
    if test_tgt_path is not None:
        print(f'load test target sentences from [{test_tgt_path}]', file=sys.stderr)
        test_data_tgt = read_corpus(test_tgt_path, corpus_type='tgt')

    print(f'load model from {model_path}', file=sys.stderr)
    model = NMT.load(model_path)
    model = model.to(torch.device(device))

    hypotheses = beam_search(model, test_data_src, beam_size=beam_size,
                             max_decoding_time_step=max_decoding)

    if test_tgt_path is not None:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(output_path, 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    :param NMT model: NMT Model
    :param List[List[str]] test_data_src: List of sentences (words) in source language, from test set
    :param int beam_size: beam_size (number of hypotheses to keep for a translation at every step)
    :param int max_decoding_time_step: maximum sentence length that beam search can produce
    :returns List[List[Hypothesis]] hypotheses: List of Hypothesis translations for every source sentence
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size,
                                             max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training:
        model.train(was_training)

    return hypotheses

def __init__(self, speakers, embed_size, hidden_size, dropout_rate, vocab,
             no_char_decoder, lr, clip_grad, lr_decay):
    # Need: NMT Model for each speaker (ex: translate from "person speaking to Michael" to "Michael")
    # Model for determining who speaks after who
    super(NLG, self).__init__()

    self.NMT_speakers = []
    self.NMT_models = []
    self.NMT_optimizers = []
    self.clip_grad = clip_grad
    self.lrs = []
    self.lr_decay = lr_decay

    # find a way to not have to hard-code speakers?
    for speaker in speakers:
        model = NMT(embed_size=embed_size,
                    hidden_size=hidden_size,
                    dropout_rate=dropout_rate,
                    vocab=vocab,
                    no_char_decoder=no_char_decoder)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        self.NMT_speakers.append(speaker.replace("/", "-").replace(" ", "-"))
        self.NMT_models.append(model)
        self.NMT_optimizers.append(optimizer)
        self.lrs.append(lr)

def setUp(cls):
    cls.char_vocab = DummyVocab()

    # Initialize CharDecoder
    cls.decoder = CharDecoder(
        hidden_size=HIDDEN_SIZE,
        char_embedding_size=EMBED_SIZE,
        target_vocab=cls.char_vocab)

    cls.vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    cls.model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=cls.vocab)

def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    model_dir = './saved_model/' + args['--exp-name']
    model_save_path = os.path.join(model_dir, args['--save-to'])
    # model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])
    print('loading model from path: ' + model_save_path)
    model = NMT.load(model_save_path,
                     no_char_decoder=args['--no-char-decoder'],
                     with_contex=args['--with-contex'],
                     contex_LSTM=args['--contex-LSTM'],
                     multi_encoder=args['--multi-encoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            detokenizer = TreebankWordDetokenizer()
            detokenizer.DOUBLE_DASHES = (re.compile(r'--'), r'--')  # keep '--' as-is
            hyp_sent = detokenizer.detokenize(top_hyp.value)
            # hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model1 = NMT.load(args['MODEL_PATH'])
    model2 = DPPNMT.load(args['MODEL_PATH'])

    if args['INDEX']:
        index = int(args['INDEX'])
        beam_search2(model1, model2,
                     [test_data_src[index]],
                     5,   # beam size, hard-coded instead of int(args['--beam-size'])
                     70,  # max decoding time step, instead of int(args['--max-decoding-time-step'])
                     [test_data_tgt[index]])
    else:
        beam_search2(model1, model2,
                     test_data_src,
                     5,   # beam size, hard-coded instead of int(args['--beam-size'])
                     70,  # max decoding time step, instead of int(args['--max-decoding-time-step'])
                     test_data_tgt)
