Example #1
def create_tokenizer(tokenizer_type, model_path, vocab_path):
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type in ['hf_bytebpe', 'hf_wordpiece', 'hf_bpe']:
        if huggingface.is_new_version_model_file(model_path):
            return tokenizers.create('hf_tokenizer',
                                     model_path=model_path,
                                     vocab=vocab_path)
        elif tokenizer_type == 'hf_bytebpe':
            return tokenizers.create(tokenizer_type,
                                     merges_file=model_path,
                                     vocab_file=vocab_path)
        elif tokenizer_type == 'hf_wordpiece':
            return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
        elif tokenizer_type == 'hf_bpe':
            return tokenizers.create(tokenizer_type,
                                     merges_file=model_path,
                                     vocab_file=vocab_path)
    else:
        raise NotImplementedError
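A minimal usage sketch for the factory above, assuming a SentencePiece model and vocabulary were trained beforehand (both file names are placeholders):

# Hypothetical artifact paths from an earlier training run.
spm_tokenizer = create_tokenizer('spm', 'ende-32k.model', 'ende-32k.vocab')
print(len(spm_tokenizer.vocab))  # the created tokenizer carries its Vocab, as used in Example #7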
Example #2
def create_tokenizer(tokenizer_type, model_path, vocab_path):
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type,
                                 codec_path=model_path,
                                 vocab_path=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type == 'hf_bytebpe':
        return tokenizers.create(tokenizer_type,
                                 merges_file=model_path,
                                 vocab_file=vocab_path)
    elif tokenizer_type == 'hf_wordpiece':
        return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
    elif tokenizer_type == 'hf_bpe':
        return tokenizers.create(tokenizer_type,
                                 merges_file=model_path,
                                 vocab_file=vocab_path)
    else:
        raise NotImplementedError
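Compared with Example #1, this variant constructs the subword_nmt tokenizer with codec_path/vocab_path keyword arguments (matching the call in Example #5) and always builds the Hugging Face tokenizers from the merges_file/vocab_file pair, without the is_new_version_model_file check; the two snippets presumably reflect different versions of the same helper.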
Example #3
def get_base_tokenizer(method, lang):
    """The base tokenization method

    Parameters
    ----------
    method

    lang

    Returns
    -------

    """
    if method == 'moses':
        return tokenizers.create('moses', lang)
    elif method == 'whitespace':
        return tokenizers.create('whitespace')
    elif method == 'no':
        return None
    else:
        raise NotImplementedError
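A short sketch of how this helper might be called; the language code is a placeholder and only matters for the Moses branch:

# 'de' is an assumed language code; it is forwarded to the Moses tokenizer.
moses_tok = get_base_tokenizer('moses', 'de')
# The whitespace branch ignores lang, and 'no' returns None.
ws_tok = get_base_tokenizer('whitespace', 'de')
no_tok = get_base_tokenizer('no', 'de')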
Example #4
def test_subword_algorithms_ende(model):
    dir_path = os.path.join(_CURR_DIR, 'learn_apply_subword_ende_results')
    os.makedirs(dir_path, exist_ok=True)
    dir_path = os.path.realpath(dir_path)
    parser = learn_subword.get_parser()
    apply_parser = apply_subword.get_parser()
    corpus_path_pair = [
        os.path.join(_CURR_DIR, 'data', 'wmt19-test-de-en.de'),
        os.path.join(_CURR_DIR, 'data', 'wmt19-test-de-en.en')
    ]
    args = parser.parse_args(
        ['--corpus'] + corpus_path_pair +
        ['--model', model, '--vocab-size', '5000', '--save-dir', dir_path])
    # Train the tokenizer
    learn_subword.main(args)
    if model in ['yttm', 'spm', 'subword_nmt']:
        model_key = model
    else:
        model_key = 'hf_tokenizer'
    tokenizer = tokenizers.create(model_key,
                                  model_path=os.path.join(
                                      dir_path, '{}.model'.format(model)),
                                  vocab=os.path.join(dir_path,
                                                     '{}.vocab'.format(model)))
    args = apply_parser.parse_args(['--corpus'] + [corpus_path_pair[0]] + [
        '--model', model, '--model-path',
        os.path.join(dir_path, '{}.model'.format(model)), '--vocab-path',
        os.path.join(dir_path, '{}.vocab'.format(model)), '--save-path',
        os.path.join(dir_path, 'wmt19-test-de-en.de.{}'.format(model))
    ])
    apply_subword.main(args)
    args = apply_parser.parse_args(['--corpus'] + [corpus_path_pair[1]] + [
        '--model', model, '--model-path',
        os.path.join(dir_path, '{}.model'.format(model)), '--vocab-path',
        os.path.join(dir_path, '{}.vocab'.format(model)), '--save-path',
        os.path.join(dir_path, 'wmt19-test-de-en.en.{}'.format(model))
    ])
    apply_subword.main(args)

    # Decode back with the trained tokenizer
    for prefix_fname in [
            'wmt19-test-de-en.de.{}'.format(model),
            'wmt19-test-de-en.en.{}'.format(model)
    ]:
        with open(os.path.join(dir_path, '{}.decode'.format(prefix_fname)),
                  'w',
                  encoding='utf-8') as out_f:
            with open(os.path.join(dir_path, '{}'.format(prefix_fname)),
                      'r',
                      encoding='utf-8') as in_f:
                for line in in_f:
                    out_f.write(tokenizer.decode(line.split()) + '\n')
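The model argument is presumably injected by a pytest parametrization along these lines (the decorator itself is an assumption; only the test body appears in the excerpt):

import pytest

@pytest.mark.parametrize('model', ['yttm', 'spm', 'subword_nmt',
                                   'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'])
def test_subword_algorithms_ende(model):
    ...  # body as shown above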
Example #5
def main(args):
    start = time.time()
    if args.model == 'spm':
        tokenizer_model = tokenizers.create('spm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path)
    elif args.model == 'subword_nmt':
        tokenizer_model = tokenizers.create('subword_nmt',
                                            codec_path=args.model_path,
                                            vocab_path=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout)
    elif args.model == 'yttm':
        args.bpe_dropout = 0.0 if not args.bpe_dropout else args.bpe_dropout
        tokenizer_model = tokenizers.create('yttm',
                                            model_path=args.model_path,
                                            bpe_dropout=args.bpe_dropout,
                                            n_threads=1)
    elif args.model == 'hf_bytebpe':
        tokenizer_model = tokenizers.create('hf_bytebpe',
                                            merges_file=args.model_path,
                                            vocab_file=args.vocab_path,
                                            dropout=args.bpe_dropout,
                                            lowercase=args.lowercase)
    elif args.model == 'hf_wordpiece':
        tokenizer_model = tokenizers.create('hf_wordpiece',
                                            vocab_file=args.vocab_path,
                                            lowercase=args.lowercase,
                                            strip_accents=args.strip_accents)
    elif args.model == 'hf_bpe':
        tokenizer_model = tokenizers.create('hf_bpe',
                                            merges_file=args.model_path,
                                            vocab_file=args.vocab_path,
                                            dropout=args.bpe_dropout,
                                            lowercase=args.lowercase)
    else:
        raise NotImplementedError
    print('Applying {} to {}'.format(tokenizer_model.__class__.__name__,
                                     ', '.join(args.corpus)))
    output_type = {'subword': str, 'id': int}[args.output_type]
    applyer = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
    with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
        with Pool(args.num_process) as pool:
            sentence_count = token_count = unk_count = 0
            for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
                enumerate(pool.imap(applyer.process_chunk, applyer.chunk_iter())):
                fo.write('\n'.join(tokenized_sentences))
                fo.write('\n')
                sentence_count += sentence_num
                token_count += token_num
                unk_count += unk_num
                if (i + 1) % 100 == 0:
                    print('Chunk {}, #Lines processed: {}'
                          .format(i + 1, sentence_count))
    end = time.time()
    print('Done, #Lines processed {}, Avg tokens per sentence {:.1f}, '
          'Unknown rate {:.1f}%, Time spent {}'
          .format(sentence_count, token_count / sentence_count,
                  unk_count * 100 / token_count, end - start))
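Example #4 drives this entry point through apply_subword.get_parser(); a programmatic invocation would look roughly like the following (all file names are placeholders):

# Mirroring Example #4: build args with the module's own parser, then call main().
parser = apply_subword.get_parser()
args = parser.parse_args([
    '--corpus', 'corpus.de',        # placeholder input corpus
    '--model', 'spm',
    '--model-path', 'spm.model',    # placeholder trained subword model
    '--vocab-path', 'spm.vocab',
    '--save-path', 'corpus.de.spm',
])
main(args)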
Example #6
def test_subword_custom_token(model):
    parser = learn_subword.get_parser()
    corpus_path = os.path.join(_CURR_DIR, 'data', 'wmt19-test-zh-en.zh.jieba')
    with tempfile.TemporaryDirectory() as tempdir:
        dir_path = tempdir
        arguments = ['--corpus'] + [corpus_path] + \
                    ['--model', model, '--vocab-size', '5000',
                     '--save-dir', dir_path,
                     '--disable-bos', '--disable-eos',
                     '--custom-special-tokens',
                     'cls_token=<cls>', 'sep_token=<sep>']
        args = parser.parse_args(arguments)
        # Train the tokenizer
        learn_subword.main(args)
        if model in ['yttm', 'spm', 'subword_nmt']:
            model_key = model
        else:
            model_key = 'hf_tokenizer'
        tokenizer = tokenizers.create(
            model_key,
            model_path=os.path.join(dir_path, '{}.model'.format(model)),
            vocab=os.path.join(dir_path, '{}.vocab'.format(model)))
        assert tokenizer.vocab.sep_token == '<sep>'
        assert tokenizer.vocab.cls_token == '<cls>'
Example #7
def evaluate(args):
    ctx_l = [mx.cpu()] if args.gpus is None or args.gpus == '' else [
        mx.gpu(int(x)) for x in args.gpus.split(',')
    ]
    src_normalizer = MosesNormalizer(args.src_lang)
    tgt_normalizer = MosesNormalizer(args.tgt_lang)
    base_src_tokenizer = tokenizers.create('moses', args.src_lang)
    base_tgt_tokenizer = tokenizers.create('moses', args.tgt_lang)

    src_tokenizer = create_tokenizer(args.src_tokenizer,
                                     args.src_subword_model_path,
                                     args.src_vocab_path)
    tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
                                     args.tgt_subword_model_path,
                                     args.tgt_vocab_path)
    src_vocab = src_tokenizer.vocab
    tgt_vocab = tgt_tokenizer.vocab
    if args.cfg.endswith('.yml'):
        cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
    else:
        cfg = TransformerModel.get_cfg(args.cfg)
    cfg.defrost()
    cfg.MODEL.src_vocab_size = len(src_vocab)
    cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
    if args.fp16:
        cfg.MODEL.dtype = 'float16'
    cfg.freeze()
    model = TransformerModel.from_cfg(cfg)
    model.hybridize()
    model.load_parameters(args.param_path, ctx=ctx_l)
    inference_model = TransformerNMTInference(model=model)
    inference_model.hybridize()
    # Construct the BeamSearchSampler
    if args.stochastic:
        scorer = BeamSearchScorer(alpha=0.0,
                                  K=0.0,
                                  temperature=1.0,
                                  from_logits=False)
    else:
        scorer = BeamSearchScorer(alpha=args.lp_alpha,
                                  K=args.lp_k,
                                  from_logits=False)
    beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
                                            decoder=inference_model,
                                            vocab_size=len(tgt_vocab),
                                            eos_id=tgt_vocab.eos_id,
                                            scorer=scorer,
                                            stochastic=args.stochastic,
                                            max_length_a=args.max_length_a,
                                            max_length_b=args.max_length_b)

    logging.info(beam_search_sampler)
    all_src_token_ids, all_src_lines = process_corpus(
        args.src_corpus,
        sentence_normalizer=src_normalizer,
        base_tokenizer=base_src_tokenizer,
        bpe_tokenizer=src_tokenizer,
        add_bos=False,
        add_eos=True)
    if args.tgt_corpus is not None:
        all_tgt_token_ids, all_tgt_lines = process_corpus(
            args.tgt_corpus,
            sentence_normalizer=tgt_normalizer,
            base_tokenizer=base_tgt_tokenizer,
            bpe_tokenizer=tgt_tokenizer,
            add_bos=True,
            add_eos=True)
    else:  # inference only: populate placeholder tgt tokens
        all_tgt_token_ids = all_tgt_lines = [
            [] for i in range(len(all_src_token_ids))
        ]
    test_dataloader = gluon.data.DataLoader(list(
        zip(all_src_token_ids, [len(ele) for ele in all_src_token_ids],
            all_tgt_token_ids, [len(ele) for ele in all_tgt_token_ids])),
                                            batch_size=32,
                                            batchify_fn=Tuple(
                                                Pad(), Stack(), Pad(),
                                                Stack()),
                                            shuffle=False)

    ctx = ctx_l[0]
    pred_sentences = []
    start_eval_time = time.time()
    # evaluate
    if not args.inference:
        avg_nll_loss = 0
        ntokens = 0
        for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length)\
                in enumerate(test_dataloader):
            src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
            src_valid_length = mx.np.array(src_valid_length,
                                           ctx=ctx,
                                           dtype=np.int32)
            tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
            tgt_valid_length = mx.np.array(tgt_valid_length,
                                           ctx=ctx,
                                           dtype=np.int32)
            tgt_pred = model(src_token_ids, src_valid_length,
                             tgt_token_ids[:, :-1], tgt_valid_length - 1)
            pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
            nll = -mx.npx.pick(pred_logits, tgt_token_ids[:, 1:])
            avg_nll_loss += mx.npx.sequence_mask(
                nll,
                sequence_length=tgt_valid_length - 1,
                use_sequence_length=True,
                axis=1).sum().asnumpy()
            ntokens += int((tgt_valid_length - 1).sum().asnumpy())
            init_input = mx.np.array(
                [tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])],
                ctx=ctx)
            states = inference_model.init_states(src_token_ids,
                                                 src_valid_length)
            samples, scores, valid_length = beam_search_sampler(
                init_input, states, src_valid_length)
            for j in range(samples.shape[0]):
                pred_tok_ids = samples[j, 0, :valid_length[
                    j, 0].asnumpy()].asnumpy().tolist()
                bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                pred_sentence = base_tgt_tokenizer.decode(
                    bpe_decode_line.split(' '))
                pred_sentences.append(pred_sentence)
                print(pred_sentence)
            print('Processed {}/{}'.format(len(pred_sentences),
                                           len(all_tgt_lines)))
        end_eval_time = time.time()
        avg_nll_loss = avg_nll_loss / ntokens

        with open(os.path.join(args.save_dir, 'gt_sentences.txt'),
                  'w',
                  encoding='utf-8') as of:
            of.write('\n'.join(all_tgt_lines))
            of.write('\n')
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'),
                  'w',
                  encoding='utf-8') as of:
            of.write('\n'.join(pred_sentences))
            of.write('\n')

        sacrebleu_out = sacrebleu.corpus_bleu(sys_stream=pred_sentences,
                                              ref_streams=[all_tgt_lines])
        logging.info('Time Spent: {}, #Sent={}, SacreBLEU={} '
                     '({:2.1f} {:2.1f} {:2.1f} {:2.1f}) '
                     '(BP={:.3f}, ratio={:.3f}, syslen={}, reflen={}), '
                     'Avg NLL={}, Perplexity={}'.format(
                         end_eval_time - start_eval_time, len(all_tgt_lines),
                         sacrebleu_out.score, *sacrebleu_out.precisions,
                         sacrebleu_out.bp,
                         sacrebleu_out.sys_len / sacrebleu_out.ref_len,
                         sacrebleu_out.sys_len, sacrebleu_out.ref_len,
                         avg_nll_loss, np.exp(avg_nll_loss)))
    # inference only
    else:
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'),
                  'w',
                  encoding='utf-8') as of:
            processed_sentences = 0
            for src_token_ids, src_valid_length, _, _ in tqdm(test_dataloader):
                src_token_ids = mx.np.array(src_token_ids,
                                            ctx=ctx,
                                            dtype=np.int32)
                src_valid_length = mx.np.array(src_valid_length,
                                               ctx=ctx,
                                               dtype=np.int32)
                init_input = mx.np.array(
                    [tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])],
                    ctx=ctx)
                states = inference_model.init_states(src_token_ids,
                                                     src_valid_length)
                samples, scores, valid_length = beam_search_sampler(
                    init_input, states, src_valid_length)
                for j in range(samples.shape[0]):
                    pred_tok_ids = samples[j, 0, :valid_length[
                        j, 0].asnumpy()].asnumpy().tolist()
                    bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                    pred_sentence = base_tgt_tokenizer.decode(
                        bpe_decode_line.split(' '))
                    pred_sentences.append(pred_sentence)
                of.write('\n'.join(pred_sentences))
                of.write('\n')
                processed_sentences += len(pred_sentences)
                pred_sentences = []
        end_eval_time = time.time()
        logging.info('Time Spent: {}, Inferred sentences: {}'.format(
            end_eval_time - start_eval_time, processed_sentences))
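The attribute accesses above imply an args namespace roughly like the one below; all concrete values are placeholders, since the corresponding argument parser is not part of the excerpt:

from argparse import Namespace

# Field names mirror the args.* accesses in evaluate(); values are illustrative only.
args = Namespace(
    gpus='0', fp16=False, inference=False, stochastic=False,
    src_lang='de', tgt_lang='en',
    src_tokenizer='spm', tgt_tokenizer='spm',
    src_subword_model_path='spm.de.model', src_vocab_path='spm.de.vocab',
    tgt_subword_model_path='spm.en.model', tgt_vocab_path='spm.en.vocab',
    cfg='transformer_base.yml', param_path='average.params',
    lp_alpha=0.6, lp_k=5, beam_size=4,
    max_length_a=1, max_length_b=50,
    src_corpus='test.de', tgt_corpus='test.en', save_dir='eval_out')
evaluate(args)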