Example #1
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ 对源句子列表使用beam search去构建假设.
    @param model (NMT): NMT 模型
    @param test_data_src (List[List[str]]): 源句子列表, 测试集中的.
    @param beam_size (int): beam_size (每一步的候选数)
    @param max_decoding_time_step (int): Beam search 能产生的最大句子长度
    @returns hypotheses (List[List[Hypothesis]]): 每个源句子的beam_size个假设.
    """
    was_training = model.training
    model.eval()

    hypotheses = []  # list of candidate translations for every sentence
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)  # collect all candidates for this sentence

    if was_training: model.train(was_training)

    return hypotheses
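
Note: these snippets follow the CS224N-style NMT convention in which
model.beam_search returns a list of small Hypothesis records (hyp.value is the
decoded word list, hyps[0] the top-scoring beam). A minimal sketch of that
assumed definition:

from collections import namedtuple

# Assumed shape of the Hypothesis objects used throughout these examples:
# `value` is the decoded target sentence as a list of words, `score` its beam
# (log-probability) score.
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])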
Example #2
def beam_search(model: NMT, test_iter, beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_iter: iterator over source-language batches, from the test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for data in test_iter:
            # Field names below are an assumption: torchtext-style batches expose
            # (tensor, lengths) pairs per field when built with include_lengths=True.
            (src_sents, src_lengths), (_, _) = data.src, data.trg
            example_hyps = model.beam_search(
                src_sents,
                src_lengths,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
Example #3
def beam_search(model: NMT, test_iterator: BucketIterator, beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_iterator (BucketIterator): iterator over source-language batches, from the test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        # for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
        for i, batch in enumerate(test_iterator):
            src_sents, src_sents_lens = batch.src
            src_sents = src_sents.permute(1, 0)
            for j in range(len(src_sents_lens)):
                src_sent = src_sents[j]
                example_hyps = model.beam_search(
                    src_sent,
                    src_sents_lens[j],
                    beam_size=beam_size,
                    max_decoding_time_step=max_decoding_time_step)
                hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
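
The permute(1, 0) above flips a torchtext-style (seq_len, batch) tensor into
(batch, seq_len) so that each row is one sentence, matching the per-sentence
decoding loop. A tiny illustration:

import torch

x = torch.arange(6).reshape(3, 2)        # seq_len=3, batch=2
assert x.permute(1, 0).shape == (2, 3)   # one sentence per row after the flip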
Example #4
def beam_search2(model1: NMT, model2: DPPNMT, test_data_src: List[List[str]],
                 beam_size: int, max_decoding_time_step: int,
                 test_data_tgt) -> None:
    """ Run beam search with a baseline and a DPP model, and print the sentences
    where the DPP model's top hypothesis beats the baseline's on sentence-level BLEU.
    @param model1 (NMT): baseline NMT model
    @param model2 (DPPNMT): DPP-reranked NMT model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that beam search can produce
    @param test_data_tgt: gold-standard target sentences aligned with test_data_src
    """
    model1.eval()
    model2.eval()

    i = 0
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            hyp1 = model1.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)
            hyp2 = model2.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)
            ref = test_data_tgt[i][1:-1]  # strip <s> / </s> markers
            #print(ref, hyp1[0].value)
            bleu_topk = sentence_bleu([ref], hyp1[0].value)  # NLTK wants a list of references
            bleu_dpp = sentence_bleu([ref], hyp2[0].value)
            #print(bleu_topk, bleu_dpp)
            if bleu_dpp > bleu_topk:
                print(i)
                print(" ".join(hyp1[0].value))
                print(" ".join(hyp2[0].value))
                print(" ".join(ref))
            i += 1
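
A pitfall the calls above must avoid: NLTK's sentence_bleu expects a *list of
reference token lists*, so passing a bare token list silently scores each word
as its own reference. A quick check with an exact match:

from nltk.translate.bleu_score import sentence_bleu

ref = ['the', 'cat', 'sat', 'on', 'the', 'mat']
hyp = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(sentence_bleu([ref], hyp))  # 1.0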
Example #5
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int,
                max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
Example #6
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        nmt_model = NMT(vocab=params['vocab'], **args)
        nmt_model.load_state_dict(params['state_dict'])
        model = DPPNMT(nmt_model=nmt_model)

        return model
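
The loader implies a checkpoint dict with 'args', 'vocab' and 'state_dict' keys.
A sketch of the matching save side (hyperparameter values are illustrative, not
the project's actual settings):

import torch

def save(nmt_model, vocab, model_path: str):
    # Persist exactly the three keys that load() reads back.
    params = {
        'args': dict(embed_size=256, hidden_size=256, dropout_rate=0.2),
        'vocab': vocab,
        'state_dict': nmt_model.state_dict(),
    }
    torch.save(params, model_path)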
Example #7
def sample(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tgt, source='tgt')
    train_data = zip(train_data_src, train_data_tgt)

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model,
                            map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        opt = params['args']
        state_dict = params['state_dict']

        model = NMT(opt, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    print('begin sampling')

    check_every = 10
    train_iter = cum_samples = 0
    train_time = time.time()
    for src_sents, tgt_sents in data_iter(train_data,
                                          batch_size=args.batch_size):
        train_iter += 1
        samples = model.sample(src_sents,
                               sample_size=args.sample_size,
                               to_word=True)
        cum_samples += sum(len(sample) for sample in samples)

        if train_iter % check_every == 0:
            elapsed = time.time() - train_time
            print('sampling speed: %d/s' % (cum_samples / elapsed))
            cum_samples = 0
            train_time = time.time()

        for i, tgt_sent in enumerate(tgt_sents):
            print('*' * 80)
            print('target:' + ' '.join(tgt_sent))
            tgt_samples = samples[i]
            print('samples:')
            for sid, sample in enumerate(tgt_samples, 1):
                print('[%d] %s' % (sid, ' '.join(sample[1:-1])))
            print('*' * 80)
Example #8
def test(args):
    test_data_src = read_corpus(args.test_src, source='src')
    test_data_tgt = read_corpus(args.test_tgt, source='tgt')
    test_data = list(zip(test_data_src, test_data_tgt))

    if args.load_model:
        print('load model from [%s]' % args.load_model)
        params = torch.load(args.load_model,
                            map_location=lambda storage, loc: storage)
        vocab = params['vocab']
        saved_args = params['args']
        state_dict = params['state_dict']

        model = NMT(saved_args, vocab)
        model.load_state_dict(state_dict)
    else:
        vocab = torch.load(args.vocab)
        model = NMT(args, vocab)

    model.eval()

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()

    hypotheses = decode(model, test_data)
    top_hypotheses = [hyps[0] for hyps in hypotheses]

    bleu_score = get_bleu([tgt for src, tgt in test_data], top_hypotheses)
    word_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses,
                       'word_acc')
    sent_acc = get_acc([tgt for src, tgt in test_data], top_hypotheses,
                       'sent_acc')
    print('Corpus Level BLEU: %f, word level acc: %f, sentence level acc: %f' %
          (bleu_score, word_acc, sent_acc),
          file=sys.stderr)

    if args.save_to_file:
        print('save decoding results to %s' % args.save_to_file)
        with open(args.save_to_file, 'w') as f:
            for hyps in hypotheses:
                f.write(' '.join(hyps[0][1:-1]) + '\n')

        if args.save_nbest:
            nbest_file = args.save_to_file + '.nbest'
            print('save nbest decoding results to %s' % nbest_file)
            with open(nbest_file, 'w') as f:
                for src_sent, tgt_sent, hyps in zip(test_data_src,
                                                    test_data_tgt, hypotheses):
                    print('Source: %s' % ' '.join(src_sent), file=f)
                    print('Target: %s' % ' '.join(tgt_sent), file=f)
                    print('Hypotheses:', file=f)
                    for i, hyp in enumerate(hyps, 1):
                        print('[%d] %s' % (i, ' '.join(hyp)), file=f)
                    print('*' * 30, file=f)
Example #9
def test(args):
    data_path = args['--data_path']
    model_path = args['--model_path']
    beam_size = int(args['--beam_size'])
    max_len = int(args['--max_len'])
    
    vocab = Vocab.load()
    source = load_corpus(data_path+'/test.es', 'es', limit=100)
    reference_tgt = load_corpus(data_path+'/test.en', 'en', limit=100)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = NMT.load(model_path, vocab)
    model.to(device)
    model.eval()
    
    translate_tgt = []
    with torch.no_grad():
        for src in tqdm(source, desc='translate '):
            src, _ = vocab.src.to_tensor([src], 'src')
            tgt = model.translate(src.to(device), beam_size, max_len)
            translate_tgt.append(tgt)
            
    translate_tgt = vocab.tgt.to_sentences(translate_tgt)
    if reference_tgt[0][0] == '<s>':
        reference_tgt = [sent[1:-1] for sent in reference_tgt]
    bleu_score = nltk.translate.bleu_score.corpus_bleu([[refer] for refer in reference_tgt], translate_tgt)
    print("corpus bleu score on test data is %.2f" % (bleu_score*100))
    
    # write translate sentences to file
    with open(data_path+'/result.txt', 'w') as f:
        detokenizer = MosesDetokenizer('en')
        for sent in translate_tgt:
            sent = detokenizer(sent)
            f.write(sent+'\n')
        detokenizer.close()
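
As used above, NLTK's corpus_bleu takes one list of references per hypothesis,
hence the [[refer] for refer in reference_tgt] nesting. A minimal demonstration:

from nltk.translate.bleu_score import corpus_bleu

refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]  # one reference list per hypothesis
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_bleu(refs, hyps))  # 1.0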
Example #10
def test(args):
    print("load model from {}".format(args["MODEL_PATH"]), file=sys.stderr)
    model = NMT.load(args["MODEL_PATH"])

    if args["--cuda"]:
        model = model.to(torch.device("cuda:0"))

    binary = int(args["--num-classes"]) == 2
    test_data = load_test_data(binary=binary)
    batch_size = int(args["--batch-size"])

    cum_correct = 0
    cum_score = 0

    with torch.no_grad():
        for sentences, sentiments in batch_iter(test_data, batch_size):
            correct = model.compute_accuracy(sentences,
                                             sentiments) * len(sentences)
            cum_correct += correct
            score = -model(sentences, sentiments).sum()
            cum_score += score

    print("test dataset size: %d" % len(test_data))
    print("accuracy: %f" % (cum_correct / len(test_data)))
    print("loss: %f" % (cum_score / len(test_data)))
Example #11
def multi_parameter_tuning(args):
    lrs = [1e-2, 1e-3, 5e-3, 1e-4, 5e-4]
    hidden_sizes = [128, 256, 512]
    lr_decays = [0.9, 0.7, 0.5]
    iter = 0
    valid_metric = {}  # stores the dev ppl of every trained model
    dev_data_src = read_corpus(args['dev_source'], source='src')
    dev_data_tgt = read_corpus(args['dev_target'], source='tgt')
    dev_data = list(zip(dev_data_src, dev_data_tgt))
    for i in lrs:
        for j in hidden_sizes:
            for k in lr_decays:
                print(
                    'Test run #%d =================================================' %
                    iter)
                arg_test = args
                arg_test['lr'], arg_test['hidden_size'], arg_test[
                    'lr_decay'] = i, j, k
                arg_test['save_to'] = 'model_' + 'lr_' + str(
                    i) + 'hd_size_' + str(j) + 'lr_dys_' + str(k) + '.bin'
                run.train(arg_test)
                model = NMT.load(arg_test['save_to'])
                dev_ppl = run.evaluate_ppl(
                    model, dev_data,
                    batch_size=128)  # dev batch size can be a bit larger
                valid_metric[arg_test['save_to']] = dev_ppl
                print(arg_test['save_to'],
                      '  validation: iter %d, dev. ppl %f' % (iter, dev_ppl),
                      file=sys.stderr)
                iter += 1
    model = min(valid_metric, key=valid_metric.get)
    print('best_model is %s ,ppl is %f' % (model, valid_metric[model]))
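
The triple loop is a 5 x 3 x 3 = 45-run grid search. An equivalent, flatter
enumeration of the same grid (a sketch):

from itertools import product

lrs = [1e-2, 1e-3, 5e-3, 1e-4, 5e-4]
hidden_sizes = [128, 256, 512]
lr_decays = [0.9, 0.7, 0.5]

for run_id, (lr, hd, dy) in enumerate(product(lrs, hidden_sizes, lr_decays)):
    save_to = 'model_lr_%shd_size_%slr_dys_%s.bin' % (lr, hd, dy)
    print(run_id, save_to)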
Example #12
def test():
    print(
        f"load test sentences from [{config.test_path_src}], [{config.test_path_tar}]",
        file=sys.stderr)
    test_data = Data(config.test_path_src, config.test_path_tar)
    test_data_loader = DataLoader(dataset=test_data,
                                  batch_size=config.test_batch_size,
                                  shuffle=True,
                                  collate_fn=utils.get_batch)
    model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_attention/result/02.08_window35_6_8.810715463205241_checkpoint.pth"
    print(f"load model from {model_path}", file=sys.stderr)
    model = NMT.load(model_path)
    if (config.cuda):
        model = model.to(torch.device("cuda:0"))
        #model = model.cuda()
        #model = nn.parallel.DistributedDataParallel(model)
    predict, test_data_tar = beam_search(model, test_data, test_data_loader,
                                         15, config.max_tar_length)
    for i in range(len(test_data_tar)):
        for j in range(len(test_data_tar[i])):
            test_data_tar[i][j] = model.text.tar.id2word[test_data_tar[i][j]]
    for i in range(len(predict)):
        for j in range(len(predict[i])):
            predict[i][j] = model.text.tar.id2word[predict[i][j]]
    best_predict = []
    for i in tqdm(range(len(test_data_tar)), desc="find best predict"):
        best_predict.append(predict[i][compare_bleu(predict[i],
                                                    test_data_tar[i])])
    bleu = corpus_bleu([[ref[1:-1]] for ref in test_data_tar],
                       [pre for pre in predict])
    print(f"BLEU is {bleu*100}", file=sys.stderr)
Example #13
def main():
    """ Main func.
    """
    # args = '1d'

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    # assert(torch.__version__ == "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    train_data_src = read_corpus('/Users/Pannu/Desktop/Python/AI/NLP/CS224N-2019-master/Assignment/a4/sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = read_corpus('/Users/Pannu/Desktop/Python/AI/NLP/CS224N-2019-master/Assignment/a4/sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
        src_sents = src_sents
        tgt_sents = tgt_sents
        break
    vocab = Vocab.load('/Users/Pannu/Desktop/Python/AI/NLP/CS224N-2019-master/Assignment/a4/sanity_check_en_es_data/vocab_sanity_check.json') 

    # Create NMT Model
    model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)
Example #14
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
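
Several decode() variants here rely on a compute_corpus_level_bleu_score helper.
A sketch of it following the CS224N a4 convention (strip <s>/</s> from the
references, score each hypothesis's word list):

from nltk.translate.bleu_score import corpus_bleu

def compute_corpus_level_bleu_score(references, hypotheses):
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]
    return corpus_bleu([[ref] for ref in references],
                       [hyp.value for hyp in hypotheses])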
Example #15
def test():
    print(
        f"load test sentences from [{config.test_path_src}], [{config.test_path_tar}]",
        file=sys.stderr)
    #test_data_src, test_data_tar = utils.read_corpus(config.test_path)
    test_data = Data(config.test_path_src, config.test_path_tar)
    test_data_loader = DataLoader(dataset=test_data,
                                  batch_size=config.test_batch_size,
                                  shuffle=True,
                                  collate_fn=utils.get_batch)
    model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_transformer/small/result/02.10_145_1.037565227213504_checkpoint.pth"
    model = NMT.load(model_path)
    if (config.cuda):
        model = model.to(torch.device("cuda:0"))
    predict, test_data_tar = beam_search(model, test_data, test_data_loader,
                                         15, config.max_tar_length)
    for i in range(len(test_data_tar)):
        for j in range(len(test_data_tar[i])):
            test_data_tar[i][j] = model.text.tar.id2word[test_data_tar[i][j]]
    for i in range(len(predict)):
        for j in range(len(predict[i])):
            predict[i][j] = model.text.tar.id2word[predict[i][j]]
    bleu = corpus_bleu([[tar[1:-1]] for tar in test_data_tar],
                       [pre for pre in predict])
    print(f"Corpus BLEU: {bleu * 100}", file=sys.stderr)
Example #16
File: run.py  Project: qxj/cs224n-2019
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda"))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

    if args['--plot-attention']:
        plt.rcParams['font.family'] = ['sans-serif']
        plt.rcParams['font.sans-serif'] = [
            'Arial Unicode MS', 'Arial', 'sans-serif'
        ]
        from matplotlib.font_manager import _rebuild  # private API; removed in newer matplotlib
        _rebuild()
        output_dir = os.path.dirname(args['OUTPUT_FILE'])
        for idx, (src_sent,
                  hyps) in tqdm(enumerate(zip(test_data_src, hypotheses)),
                                desc='Plot attention',
                                file=sys.stdout):
            top_hyp = hyps[0]
            hyp_sent = top_hyp.value
            hyp_att = top_hyp.attention
            filename = output_dir + '/att_%d.jpg' % idx
            plot_attention(hyp_att, src_sent, hyp_sent, filename)
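
plot_attention is project-specific; a hypothetical minimal version might render
the attention matrix as a heatmap (the weights are assumed (tgt_len, src_len)):

import matplotlib.pyplot as plt

def plot_attention(attention, src_words, tgt_words, filename):
    fig, ax = plt.subplots()
    ax.imshow(attention, cmap='viridis')  # attention assumed (tgt_len, src_len)
    ax.set_xticks(range(len(src_words)))
    ax.set_xticklabels(src_words, rotation=90)
    ax.set_yticks(range(len(tgt_words)))
    ax.set_yticklabels(tgt_words)
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)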
Example #17
def main():
    """ Main func.
    """
    # args = docopt(__doc__)
    args = {
        '1d': False,
        '1e': False,
        '1f': True,
        'overwrite_output_for_sanity_check': False
    }

    # print(args)
    # Check Python & PyTorch Versions
    assert (sys.version_info >=
            (3,
             5)), "Please update your installation of Python to version >= 3.5"
    assert (
        torch.__version__ >= "1.0.0"
    ), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format(
        torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    # Load training data & vocabulary
    train_data_src = read_corpus(
        './sanity_check_en_es_data/train_sanity_check.es', 'src')
    train_data_tgt = read_corpus(
        './sanity_check_en_es_data/train_sanity_check.en', 'tgt')
    train_data = list(zip(train_data_src, train_data_tgt))

    for src_sents, tgt_sents in batch_iter(train_data,
                                           batch_size=BATCH_SIZE,
                                           shuffle=True):
        src_sents = src_sents
        tgt_sents = tgt_sents
        break
    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(embed_size=EMBED_SIZE,
                hidden_size=HIDDEN_SIZE,
                dropout_rate=DROPOUT_RATE,
                vocab=vocab)

    if args['1d']:
        question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1e']:
        question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['1f']:
        question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
    elif args['overwrite_output_for_sanity_check']:
        generate_outputs(model, src_sents, tgt_sents, vocab)
    else:
        raise RuntimeError('invalid run mode')
Example #18
def main():
    """ Main func.
    """
    args = docopt(__doc__)

    # Check Python & PyTorch Versions
    assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5"
    assert (
            torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format(
        torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

    char_vocab = DummyVocab()

    # Initialize CharDecoder
    decoder = CharDecoder(
        hidden_size=HIDDEN_SIZE,
        char_embedding_size=EMBED_SIZE,
        target_vocab=char_vocab)

    if args['1a']:
        question_1a_sanity_check()
    elif args['1b']:
        question_1b_sanity_check()
    elif args['1c']:
        question_1c_sanity_check()
    elif args['1d']:
        question_1d_sanity_check()
    elif args['1e']:
        question_1e_sanity_check()
    elif args['1f']:
        question_1f_sanity_check(model)
    elif args['2a']:
        question_2a_sanity_check(decoder, char_vocab)
    elif args['2b']:
        question_2b_sanity_check(decoder, char_vocab)
    elif args['2c']:
        question_2c_sanity_check(decoder)
    elif args['2c2']:
        question_2c2_sanity_check(decoder)
    elif args['2d']:
        question_2d_sanity_check(decoder)
    else:
        raise RuntimeError('invalid run mode')
Example #19
def init_training(args):
    from functools import partial
    import pickle
    pickle.load = partial(pickle.load, encoding="latin1")
    pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1")
    # model = torch.load(model_file, map_location=lambda storage, loc: storage, pickle_module=pickle)
    vocab = torch.load(args.vocab,
                       map_location=lambda storage, loc: storage,
                       pickle_module=pickle)

    model = NMT(args, vocab)
    model.train()

    if args.uniform_init:
        print('uniformly initialize parameters [-%f, +%f]' %
              (args.uniform_init, args.uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-args.uniform_init, args.uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0
    nll_loss = nn.NLLLoss(weight=vocab_mask, reduction='sum')
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask,
                                             reduction='sum')

    if args.cuda:
        # model = nn.DataParallel(model).cuda()
        model = model.cuda()
        nll_loss = nll_loss.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    return vocab, model, optimizer, nll_loss, cross_entropy_loss
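
The vocab_mask built above zeroes the <pad> class weight, so padded positions
add nothing to the summed loss. A tiny demonstration (pad assumed at index 0):

import torch
import torch.nn as nn

weight = torch.ones(4)
weight[0] = 0  # <pad>
loss_fn = nn.NLLLoss(weight=weight, reduction='sum')
log_probs = torch.log_softmax(torch.randn(3, 4), dim=-1)
targets = torch.tensor([0, 2, 3])  # the <pad> target contributes zero loss
print(loss_fn(log_probs, targets))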
Example #20
def main():
    """ Main func.
    """
    # args = docopt(__doc__)

    char_emb_size = 50

    # Check Python & PyTorch Versions
    assert (sys.version_info >=
            (3,
             5)), "Please update your installation of Python to version >= 3.5"
    assert (
        torch.__version__ >= "1.0.0"
    ), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format(
        torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # cnn layer
    cnn = CNN(char_emb_size, EMBED_SIZE, 5)

    # Create NMT Model
    model = NMT(word_embed_size=EMBED_SIZE,
                hidden_size=HIDDEN_SIZE,
                dropout_rate=DROPOUT_RATE,
                vocab=vocab)

    # highway layer
    highway_layer = Highway(EMBED_SIZE)

    char_vocab = DummyVocab()

    # Initialize CharDecoder
    decoder = CharDecoder(hidden_size=HIDDEN_SIZE,
                          char_embedding_size=EMBED_SIZE,
                          target_vocab=char_vocab)

    # if args['1e']:
    #     question_1e_sanity_check()
    # elif args['1f']:
    #     question_1f_sanity_check(highway_layer)
    # elif args['1g']:
    #     question_1g_sanity_check(cnn)
    # elif args['1h']:
    #     question_1h_sanity_check(model)
    # elif args['2a']:
    #     question_2a_sanity_check(decoder, char_vocab)
    # elif args['2b']:
    #     question_2b_sanity_check(decoder)
    # elif args['2c']:
    question_2c_sanity_check(decoder)
Example #21
def decode(args: Dict[str, str]):
    """ 在测试集上执行解码操作, 保存最高得分的解码结果.
        如果给定标准句子,函数还会计算平均字符准确率CA,第一个候选句子命中率HRF,前k个候选句子命中率kHRF
    @param args (Dict): 命令行参数
    """
    if args['SENTENCE']:
        ps = PinyinSplit()
        test_data_src = [ps.split(args['SENTENCE'])]
    if args['TEST_SOURCE_FILE']:
        print("load test source sentences from [{}]".format(
            args['TEST_SOURCE_FILE']),
              file=sys.stderr)
        test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')

    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    beam_size = int(args['--beam-size'])
    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]  # top candidate for each sentence
        avg_ca, hrf = evaluate_ca_hrf(test_data_tgt, top_hypotheses)
        khrf = evaluate_khrf(test_data_tgt, hypotheses)
        # bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses) # 打分
        # print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)
        print('avg_ca: {}'.format(avg_ca), file=sys.stderr)
        print('hrf: {}'.format(hrf), file=sys.stderr)
        print('{}hrf: {}'.format(beam_size, khrf), file=sys.stderr)

    if args['OUTPUT_FILE']:
        with open(args['OUTPUT_FILE'], 'w') as f:
            for src_sent, hyps in zip(test_data_src, hypotheses):
                top_hyp = hyps[0]
                hyp_sent = ''.join(top_hyp.value)
                f.write(hyp_sent + '\n')

    if args['SENTENCE']:
        print('source sentence: {}'.format(args['SENTENCE']))
        for i in range(len(hypotheses[0])):
            result = ''.join(hypotheses[0][i].value)
            print('top_{}_hypotheses_{}: {}'.format(beam_size, i + 1, result))
Example #22
def main():
    """ Main func.
    """
    args = docopt(__doc__)

    # Check Python & PyTorch Versions
    assert (sys.version_info >=
            (3,
             5)), "Please update your installation of Python to version >= 3.5"
    assert (
        torch.__version__ == "1.1.0"
    ), "Please update your installation of PyTorch. You have {} and you should have version 1.1.0".format(
        torch.__version__)

    # Seed the Random Number Generators
    seed = 1234
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed * 13 // 7)

    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    model = NMT(embed_size=EMBED_SIZE,
                hidden_size=HIDDEN_SIZE,
                dropout_rate=DROPOUT_RATE,
                vocab=vocab)

    char_vocab = DummyVocab()

    # Initialize CharDecoder
    decoder = CharDecoder(hidden_size=HIDDEN_SIZE,
                          char_embedding_size=EMBED_SIZE,
                          target_vocab=char_vocab)

    # initialize highway
    highway_model = Highway(embed_size_word=EMBED_SIZE_WORD,
                            dropout_rate=DROPOUT_RATE)

    # initialize cnn
    cnn_model = CNN(EMBED_SIZE, MAX_WORD_LEN, EMBED_SIZE_WORD, 5)

    if args['hw']:
        highway_sanity_check(highway_model)
    elif args['generate_data']:
        generate_highway_data()
    elif args['gen_cnn_data']:
        generate_cnn_data()
    elif args['cnn']:
        cnn_sanity_check(cnn_model)
    else:
        raise RuntimeError('invalid run mode')
Example #23
def question_1i_sanity_check():
    """ Sanity check for nmt_model.py
        basic shape check
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1i: NMT")
    print("-" * 80)
    src_vocab_entry = VocabEntry()
    tgt_vocab_entry = VocabEntry()
    dummy_vocab = Vocab(src_vocab_entry, tgt_vocab_entry)
    word_embed_size = 5
    hidden_size = 10

    nmt = NMT(word_embed_size, hidden_size, dummy_vocab)
    source = [["Hello", "my", "friend"], ["How", "are", "you"]]
    target = [["Bonjour", "mon", "ami"], ["Comment", "vas", "tu"]]
    output = nmt.forward(source, target)

    print(output)
    #output_expected_size = [sentence_length, BATCH_SIZE, EMBED_SIZE]
    #assert(list(output.size()) == output_expected_size), "output shape is incorrect: it should be:\n {} but is:\n{}".format(output_expected_size, list(output.size()))
    print("Sanity Check Passed for Question 1i: NMT!")
    print("-" * 80)
Example #24
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
Example #25
def decode(test_src_path,
           test_tgt_path=None,
           model_path='model.bin',
           beam_size=5,
           max_decoding=70,
           device='cpu',
           output_path='output.txt'):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
        If the target gold-standard sentences are given, the function also computes
        corpus-level BLEU score.
        Params:
            test_src_path (str): Path to the test source file
            test_tgt_path (str): Path to the test target file (optional). Default=None
            model_path (str): Path to the model file generated after training. Default='model.bin'
            beam_size (int): beam size (# of hypotheses to hold for a translation at every step)
            max_decoding (int): maximum sentence length that Beam search can produce. Default=70
            device (str): device to perform the calc on. Default = 'cpu'
            output_path (str): Path for the output file to write the results of the translation. Default='output.txt'
    """

    print(f'load test source sentences from [{test_src_path}]',
          file=sys.stderr)
    test_data_src = read_corpus(test_src_path, corpus_type='src')

    if test_tgt_path is not None:
        print(f'load test target sentences from [{test_tgt_path}]',
              file=sys.stderr)
        test_data_tgt = read_corpus(test_tgt_path, corpus_type='tgt')

    print(f'load model from {model_path}', file=sys.stderr)
    model = NMT.load(model_path)
    model = model.to(torch.device(device))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding)

    if test_tgt_path is not None:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(output_path, 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
Example #26
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int)\
        -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    :param NMT model: NMT Model
    :param List[List[str]] test_data_src: List of sentences (words) in source language, from test set
    :param int beam_size: beam_size (number of hypotheses to keep for a translation at every step)
    :param int max_decoding_time_step: maximum sentence length that beam search can produce
    :returns List[List[Hypothesis]] hypotheses: List of Hypothesis translations for every source sentence
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(
                src_sent,
                beam_size=beam_size,
                max_decoding_time_step=max_decoding_time_step)
            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses
Example #27
    def __init__(self, speakers, embed_size, hidden_size, dropout_rate, vocab,
                 no_char_decoder, lr, clip_grad, lr_decay):
        # Need: NMT Model for each speaker (ex: translate from "person speaking to Michael" to "Michael")
        # Model for determining who speaks after who
        super(NLG, self).__init__()
        self.NMT_speakers = []
        self.NMT_models = []
        self.NMT_optimizers = []
        self.clip_grad = clip_grad
        self.lrs = []
        self.lr_decay = lr_decay
        # find a way to not have to hard-code speakers?
        for speaker in speakers:
            model = NMT(embed_size=embed_size,
                        hidden_size=hidden_size,
                        dropout_rate=dropout_rate,
                        vocab=vocab,
                        no_char_decoder=no_char_decoder)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            self.NMT_speakers.append(
                speaker.replace("/", "-").replace(" ", "-"))
            self.NMT_models.append(model)
            self.NMT_optimizers.append(optimizer)
            self.lrs.append(lr)
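
One caveat: submodules kept in a plain Python list are invisible to nn.Module,
so .to(device) and state_dict() would skip the per-speaker models. nn.ModuleList
registers them properly; a sketch of the safer pattern:

import torch.nn as nn

class NLG(nn.Module):
    def __init__(self):
        super().__init__()
        self.NMT_models = nn.ModuleList()  # registered, unlike a plain []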
Example #28
    def setUp(cls):
        cls.char_vocab = DummyVocab()

        # Initialize CharDecoder (char_vocab must exist before it is referenced)
        cls.decoder = CharDecoder(
            hidden_size=HIDDEN_SIZE,
            char_embedding_size=EMBED_SIZE,
            target_vocab=cls.char_vocab)
        cls.vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

        # Create NMT Model
        cls.model = NMT(
            embed_size=EMBED_SIZE,
            hidden_size=HIDDEN_SIZE,
            dropout_rate=DROPOUT_RATE,
            vocab=cls.vocab
        )
Example #29
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)

    model_dir = './saved_model/' + args['--exp-name']
    model_save_path = os.path.join(model_dir, args['--save-to'])

    #model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])
    print('loading model from path: ' + model_save_path)
    model = NMT.load(model_save_path, no_char_decoder=args['--no-char-decoder'], 
                with_contex=args['--with-contex'], contex_LSTM=args['--contex-LSTM'], 
                multi_encoder=args['--multi-encoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            detokenizer = TreebankWordDetokenizer()
            detokenizer.DOUBLE_DASHES = (re.compile(r'--'), r'--')
            hyp_sent = detokenizer.detokenize(top_hyp.value)
            # hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
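
For reference, the Treebank detokenizer used above rejoins Penn-Treebank-style
tokens into natural text:

from nltk.tokenize.treebank import TreebankWordDetokenizer

detok = TreebankWordDetokenizer()
print(detok.detokenize(['I', 'do', "n't", 'know', '.']))  # I don't know.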
Example #30
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model1 = NMT.load(args['MODEL_PATH'])
    model2 = DPPNMT.load(args['MODEL_PATH'])

    if args['INDEX']:
        index = int(args['INDEX'])
        beam_search2(
            model1,
            model2,
            [test_data_src[index]],
            5,
            70,
            [test_data_tgt[index]],
        )
    else:
        beam_search2(
            model1,
            model2,
            test_data_src,
            #int(args['--beam-size']),
            5,
            #int(args['--max-decoding-time-step']),
            70,
            test_data_tgt,
        )