Пример #1
0
def main():
    '''Translate the pickled test split, write predictions and report BLEU.

    Command-line options:
        -model        path to the model weight file (required)
        -data_pkl     pickle holding the vocabularies and the test examples
                      (required)
        -output       text file receiving one decoded sentence per line
        -beam_size    beam width passed to the Translator
        -max_seq_len  maximum decoded length passed to the Translator
        -no_cuda      force CPU even when CUDA is available

    Side effects: writes ``opt.output``, writes a per-sentence score report
    next to it (``<output stem>_scores.csv``) and prints the corpus BLEU.
    '''
    import os  # local import: only needed to build the report file name

    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model',
                        required=True,
                        help='Path to model weight file')
    parser.add_argument('-data_pkl',
                        required=True,
                        help='Pickle file with both instances and vocabulary.')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5)
    parser.add_argument('-max_seq_len', type=int, default=100)
    parser.add_argument('-no_cuda', action='store_true')

    # TODO: Translate bpe encoded files
    # TODO: Batch translation

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point -data_pkl at files you trust.
    with open(opt.data_pkl, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']
    opt.src_pad_idx = SRC.vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = TRG.vocab.stoi[Constants.PAD_WORD]
    opt.trg_bos_idx = TRG.vocab.stoi[Constants.BOS_WORD]
    opt.trg_eos_idx = TRG.vocab.stoi[Constants.EOS_WORD]

    test_loader = Dataset(examples=data['test'],
                          fields={
                              'src': SRC,
                              'trg': TRG
                          })

    # Honour -no_cuda, and fall back to CPU when CUDA is not available.
    use_cuda = opt.cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    translator = Translator(model=load_model(opt, device),
                            beam_size=opt.beam_size,
                            max_seq_len=opt.max_seq_len,
                            src_pad_idx=opt.src_pad_idx,
                            trg_pad_idx=opt.trg_pad_idx,
                            trg_bos_idx=opt.trg_bos_idx,
                            trg_eos_idx=opt.trg_eos_idx).to(device)

    # Each side is mapped through its OWN vocabulary; out-of-vocabulary words
    # become that side's unknown token.
    src_unk_idx = SRC.vocab.stoi[SRC.unk_token]
    trg_unk_idx = TRG.vocab.stoi[TRG.unk_token]

    def decode(itos, indices):
        '''Join the tokens for *indices* and drop the BOS/EOS markers.'''
        line = ' '.join(itos[idx] for idx in indices)
        return line.replace(Constants.BOS_WORD,
                            '').replace(Constants.EOS_WORD, '')

    preds = []
    trgs = []
    srcs = []
    scores = []
    with open(opt.output, 'w') as f:
        for example in tqdm(test_loader,
                            mininterval=2,
                            desc='  - (Test)',
                            leave=False):
            src_seq = [
                SRC.vocab.stoi.get(word, src_unk_idx) for word in example.src
            ]
            srcs.append(decode(SRC.vocab.itos, src_seq))

            pred_seq = translator.translate_sentence(
                torch.LongTensor([src_seq]).to(device))
            pred_line = decode(TRG.vocab.itos, pred_seq)
            pred_tokens = pred_line.split()
            preds.append(pred_tokens)

            trg_seq = [
                TRG.vocab.stoi.get(word, trg_unk_idx) for word in example.trg
            ]
            trg_tokens = decode(TRG.vocab.itos, trg_seq).strip().split()
            trgs.append(trg_tokens)

            f.write(pred_line.strip() + '\n')
            # Sentence-level BLEU: one candidate against one reference.
            scores.append(bleu_score([pred_tokens], [[trg_tokens]]))

    # Corpus-level BLEU expects a list of reference lists per candidate.
    references = [[trg_tokens] for trg_tokens in trgs]
    print(references[:1])
    print(preds[:1])
    b_score = bleu_score(preds, references)
    print(f'BLEU score = {b_score * 100:.2f}')

    # splitext keeps directory parts and inner dots intact
    # (e.g. './out/pred.txt' -> './out/pred_scores.csv').
    report_name = os.path.splitext(opt.output)[0] + '_scores.csv'
    pd.DataFrame({
        'src_sent': srcs,
        'pred_sent': [' '.join(tokens) for tokens in preds],
        'trg_sent': [' '.join(tokens) for tokens in trgs],
        'metric': ['bleu_score'] * len(preds),
        'score': scores
    }).to_csv(report_name)
    print('[Info] Finished.')
Пример #2
0
def main():
    '''Translate the pickled test split and write predictions with targets.

    Command-line options:
        -model        path to the model weight file (required)
        -data_pkl     pickle holding the vocabularies and the index-encoded
                      test data (required)
        -output       text file receiving, per example, a "Predicted:" line,
                      a "Ground truth:" line and a blank line
        -beam_size    beam width passed to the Translator
        -max_seq_len  maximum decoded length passed to the Translator
        -no_cuda      force CPU even when CUDA is available
    '''
    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model', required=True,
                        help='Path to model weight file')
    parser.add_argument('-data_pkl', required=True,
                        help='Pickle file with both instances and vocabulary.')
    parser.add_argument('-output', default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5)
    parser.add_argument('-max_seq_len', type=int, default=100)
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point -data_pkl at files you trust.
    with open(opt.data_pkl, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    src_vocab, trg_vocab = data['vocab']['src'], data['vocab']['trg']
    opt.src_pad_idx = src_vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = trg_vocab.stoi[Constants.PAD_WORD]
    opt.trg_bos_idx = trg_vocab.stoi[Constants.BOS_WORD]
    opt.trg_eos_idx = trg_vocab.stoi[Constants.EOS_WORD]

    test_inputs = torch.tensor(data['test']['src'])
    test_outputs = torch.tensor(data['test']['trg'])
    test_data = TensorDataset(test_inputs, test_outputs)
    # batch_size=1: the translator is fed one sentence at a time.
    test_data_loader = DataLoader(test_data,
                                  sampler=SequentialSampler(test_data),
                                  batch_size=1)

    # Fall back to CPU when CUDA was requested but is not available.
    use_cuda = opt.cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    translator = Translator(
        model=load_model(opt, device),
        beam_size=opt.beam_size,
        max_seq_len=opt.max_seq_len,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_bos_idx=opt.trg_bos_idx,
        trg_eos_idx=opt.trg_eos_idx).to(device)

    with open(opt.output, 'w') as f:
        for src_batch, trg_batch in tqdm(test_data_loader, mininterval=2,
                                         desc='  - (Test)', leave=False):
            # The INPUT must live on the model's device; the decoded result
            # is only used for vocabulary lookups on the host.
            pred_seq = translator.translate_sentence(src_batch.to(device))
            # NOTE(review): translate_sentence presumably returns a list of
            # token indices; a tensor result is flattened to plain ints too.
            if torch.is_tensor(pred_seq):
                pred_seq = pred_seq.reshape(-1).tolist()
            pred_line = ' '.join(trg_vocab.itos[idx] for idx in pred_seq)
            pred_line = (pred_line.replace(Constants.BOS_WORD, '')
                         .replace(Constants.EOS_WORD, '').strip())
            pred_line = 'Predicted: ' + pred_line

            # Drop the batch dimension added by the DataLoader so that each
            # element is a single token index rather than a whole row.
            trg_seq = trg_batch.reshape(-1).tolist()
            trg_line = ' '.join(trg_vocab.itos[idx] for idx in trg_seq)
            trg_line = (trg_line.replace(Constants.BOS_WORD, '')
                        .replace(Constants.EOS_WORD, '')
                        .replace(Constants.PAD_WORD, '').strip())
            trg_line = 'Ground truth: ' + trg_line

            f.write('\n'.join([pred_line, trg_line]) + '\n\n')

    print('[Info] Finished.')
Пример #3
0
def main():
    '''Translate the pickled test split and write one prediction per line.

    Command-line options:
        -model        path to the model weight file (required)
        -data_pkl     pickle holding the vocabularies and the test examples
                      (required)
        -output       text file receiving one decoded sentence per line
        -beam_size    beam width passed to the Translator
        -max_seq_len  maximum decoded length passed to the Translator
        -no_cuda      force CPU even when CUDA is available
    '''
    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model',
                        required=True,
                        help='Path to model weight file')
    parser.add_argument('-data_pkl',
                        required=True,
                        help='Pickle file with both instances and vocabulary.')
    parser.add_argument(
        '-output',
        default='pred.txt',
        help=
        """Path to output the predictions (each line will be the decoded sequence"""
    )
    parser.add_argument('-beam_size', type=int, default=5)
    parser.add_argument('-max_seq_len', type=int, default=100)
    parser.add_argument('-no_cuda', action='store_true')

    # TODO: Translate bpe encoded files
    # TODO: Batch translation

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point -data_pkl at files you trust.
    with open(opt.data_pkl, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']
    opt.src_pad_idx = SRC.vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = TRG.vocab.stoi[Constants.PAD_WORD]
    opt.trg_bos_idx = TRG.vocab.stoi[Constants.BOS_WORD]
    opt.trg_eos_idx = TRG.vocab.stoi[Constants.EOS_WORD]

    test_loader = Dataset(examples=data['test'],
                          fields={
                              'src': SRC,
                              'trg': TRG
                          })

    # Fall back to CPU when CUDA was requested but is not available.
    use_cuda = opt.cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    translator = Translator(model=load_model(opt, device),
                            beam_size=opt.beam_size,
                            max_seq_len=opt.max_seq_len,
                            src_pad_idx=opt.src_pad_idx,
                            trg_pad_idx=opt.trg_pad_idx,
                            trg_bos_idx=opt.trg_bos_idx,
                            trg_eos_idx=opt.trg_eos_idx).to(device)

    # Source words missing from the vocabulary map to the unknown token.
    unk_idx = SRC.vocab.stoi[SRC.unk_token]
    with open(opt.output, 'w') as f:
        for example in tqdm(test_loader,
                            mininterval=2,
                            desc='  - (Test)',
                            leave=False):
            src_seq = [
                SRC.vocab.stoi.get(word, unk_idx) for word in example.src
            ]
            # The translator is fed a batch holding this single sentence.
            pred_seq = translator.translate_sentence(
                torch.LongTensor([src_seq]).to(device))
            pred_line = ' '.join(TRG.vocab.itos[idx] for idx in pred_seq)
            # Drop the sentence boundary markers from the decoded text.
            pred_line = pred_line.replace(Constants.BOS_WORD,
                                          '').replace(Constants.EOS_WORD, '')
            f.write(pred_line.strip() + '\n')

    print('[Info] Finished.')
Пример #4
0
def main():
    '''Translate the lines of a text file and write one prediction per line.

    Command-line options:
        -model        path to the model weight file (required)
        -data_pkl     pickle holding the source/target vocabulary fields
                      (required)
        -input        text file with one source sentence per line
        -output       text file receiving one decoded sentence per line
        -beam_size    beam width passed to the Translator
        -max_seq_len  maximum decoded length passed to the Translator
        -no_cuda      force CPU even when CUDA is available
    '''
    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-model',
                        required=True,
                        help='Path to model weight file')
    parser.add_argument('-data_pkl',
                        required=True,
                        help='Pickle file with both instances and vocabulary.')
    parser.add_argument('-input', default='translate_src.txt')
    parser.add_argument('-output',
                        default='pred.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    parser.add_argument('-beam_size', type=int, default=5)
    parser.add_argument('-max_seq_len', type=int, default=100)
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point -data_pkl at files you trust.
    with open(opt.data_pkl, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']
    opt.src_pad_idx = SRC.vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = TRG.vocab.stoi[Constants.PAD_WORD]
    opt.trg_bos_idx = TRG.vocab.stoi[Constants.BOS_WORD]
    opt.trg_eos_idx = TRG.vocab.stoi[Constants.EOS_WORD]

    with open(opt.input, 'r') as f:
        translate_src = list(f)

    fields = [('src', SRC), ('trg', TRG)]

    # No reference translations exist for raw input, so every source line is
    # also passed as its own 'trg' value to satisfy the two-field examples.
    examples = [
        Example.fromlist(pair, fields)
        for pair in zip(translate_src, translate_src)
    ]
    data_loader = Dataset(examples=examples,
                          fields={
                              'src': SRC,
                              'trg': TRG
                          })

    # Fall back to CPU when CUDA was requested but is not available.
    use_cuda = opt.cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    translator = Translator(model=load_model(opt, device),
                            beam_size=opt.beam_size,
                            max_seq_len=opt.max_seq_len,
                            src_pad_idx=opt.src_pad_idx,
                            trg_pad_idx=opt.trg_pad_idx,
                            trg_bos_idx=opt.trg_bos_idx,
                            trg_eos_idx=opt.trg_eos_idx).to(device)

    # Source words missing from the vocabulary map to the unknown token.
    unk_idx = SRC.vocab.stoi[SRC.unk_token]
    with open(opt.output, 'w') as f:
        for example in tqdm(data_loader,
                            mininterval=2,
                            desc='  - (Test)',
                            leave=False):
            src_seq = [
                SRC.vocab.stoi.get(word, unk_idx) for word in example.src
            ]
            # The translator is fed a batch holding this single sentence.
            pred_seq = translator.translate_sentence(
                torch.LongTensor([src_seq]).to(device))
            pred_line = ' '.join(TRG.vocab.itos[idx] for idx in pred_seq)
            # Drop the sentence boundary markers from the decoded text.
            pred_line = pred_line.replace(Constants.BOS_WORD,
                                          '').replace(Constants.EOS_WORD, '')

            f.write(pred_line.strip() + '\n')

    print('[Info] Finished.')