import time
from multiprocessing import Pool

# NOTE: these import paths assume the GluonNLP layout this script ships with.
from gluonnlp.data import Vocab, tokenizers
from gluonnlp.data.tokenizers import huggingface
from gluonnlp.data.tokenizers.huggingface import is_new_version_model_file


def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Instantiate a tokenizer from the registry given its type name."""
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type, model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type, model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type in ['hf_bytebpe', 'hf_wordpiece', 'hf_bpe']:
        # Newer HuggingFace tokenizers serialize everything into a single
        # model file; otherwise fall back to the legacy merges/vocab pair.
        if huggingface.is_new_version_model_file(model_path):
            return tokenizers.create('hf_tokenizer', model_path=model_path,
                                     vocab=vocab_path)
        elif tokenizer_type == 'hf_bytebpe':
            return tokenizers.create(tokenizer_type, merges_file=model_path,
                                     vocab_file=vocab_path)
        elif tokenizer_type == 'hf_wordpiece':
            return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
        elif tokenizer_type == 'hf_bpe':
            return tokenizers.create(tokenizer_type, merges_file=model_path,
                                     vocab_file=vocab_path)
    else:
        raise NotImplementedError
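# Usage sketch for create_tokenizer (a minimal example; the paths are
# hypothetical, and the `encode` call assumes the registry's tokenizers
# expose that method):
#
#   tokenizer = create_tokenizer('spm',
#                                model_path='subword.model',
#                                vocab_path='subword.vocab')
#   tokens = tokenizer.encode('Hello world', output_type=str)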
def main(args):
    """Tokenize the input corpora in parallel and write the result to disk."""
    start = time.time()
    if args.model == 'spm':
        assert args.model_path is not None, \
            'Must specify --model_path when using the "spm" model.'
        tokenizer_model = tokenizers.create('spm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path)
    elif args.model == 'subword_nmt':
        assert args.model_path is not None, \
            'Must specify --model_path when using the "subword_nmt" model.'
        assert args.vocab_path is not None, \
            'Must specify --vocab_path when using the "subword_nmt" model.'
        tokenizer_model = tokenizers.create('subword_nmt',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout)
    elif args.model == 'yttm':
        assert args.model_path is not None, \
            'Must specify --model_path when using the "yttm" model.'
        args.bpe_dropout = 0.0 if not args.bpe_dropout else args.bpe_dropout
        tokenizer_model = tokenizers.create('yttm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout,
                                            n_threads=1)
    elif args.model in ('hf_bytebpe', 'hf_bpe', 'hf_wordpiece'):
        if is_new_version_model_file(args.model_path):
            assert args.model_path is not None, \
                'Must specify --model_path when using the "{}" model.'.format(args.model)
            assert args.vocab_path is not None, \
                'Must specify --vocab_path when using the "{}" model.'.format(args.model)
            tokenizer_model = tokenizers.create('hf_tokenizer',
                                                model_path=args.model_path,
                                                vocab=args.vocab_path)
        else:
            if args.model == 'hf_bytebpe':
                tokenizer_model = tokenizers.create(
                    'hf_bytebpe',
                    merges_file=args.model_path,
                    vocab_file=args.vocab_path,
                    dropout=args.bpe_dropout,
                    lowercase=args.lowercase)
            elif args.model == 'hf_wordpiece':
                tokenizer_model = tokenizers.create(
                    'hf_wordpiece',
                    vocab_file=args.vocab_path,
                    lowercase=args.lowercase,
                    strip_accents=args.strip_accents)
            elif args.model == 'hf_bpe':
                tokenizer_model = tokenizers.create(
                    'hf_bpe',
                    merges_file=args.model_path,
                    vocab_file=args.vocab_path,
                    dropout=args.bpe_dropout,
                    lowercase=args.lowercase)
    else:
        raise NotImplementedError
    print('Applying "{}" to "{}" and saving to "{}"'.format(
        tokenizer_model.__class__.__name__,
        ', '.join(args.corpus), args.save_path))
    output_type = {'subword': str, 'id': int}[args.output_type]
    # ParallelCorpusApplyer (defined elsewhere in this script) splits the
    # corpora into chunks that worker processes tokenize independently.
    applyer = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
    with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
        with Pool(args.num_process) as pool:
            sentence_count = token_count = unk_count = 0
            for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
                    enumerate(pool.imap(applyer.process_chunk,
                                        applyer.chunk_iter())):
                fo.write('\n'.join(tokenized_sentences))
                fo.write('\n')
                sentence_count += sentence_num
                token_count += token_num
                unk_count += unk_num
                if (i + 1) % 100 == 0:
                    print('Chunk {}, #Lines processed: {}'.format(
                        i + 1, sentence_count))
    end = time.time()
    print('Done, #Lines processed {}, Avg tokens per sentence {:.1f}, '
          'Unknown rate {:.1f}%, Time spent {:.2f}s'.format(
              sentence_count, token_count / sentence_count,
              unk_count * 100 / token_count, end - start))
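# Minimal entry-point sketch. This is an assumption, not the original script's
# parser: the flags and defaults below are reconstructed from the attributes
# that main() reads, and the real script may define more options.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Apply a learned subword model to one or more corpora.')
    parser.add_argument('--corpus', nargs='+', required=True,
                        help='Input corpus file(s), one sentence per line.')
    parser.add_argument('--model', required=True,
                        choices=['spm', 'subword_nmt', 'yttm',
                                 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'],
                        help='Type of the learned subword model.')
    parser.add_argument('--model_path', default=None,
                        help='Path to the subword model / merges file.')
    parser.add_argument('--vocab_path', default=None,
                        help='Path to the vocabulary file.')
    parser.add_argument('--save_path', required=True,
                        help='Where to write the tokenized output.')
    parser.add_argument('--output_type', choices=['subword', 'id'],
                        default='subword',
                        help='Emit subword strings or integer ids.')
    parser.add_argument('--bpe_dropout', type=float, default=0.0,
                        help='BPE dropout rate, if the model supports it.')
    parser.add_argument('--lowercase', action='store_true',
                        help='Lowercase the input (HuggingFace tokenizers).')
    parser.add_argument('--strip_accents', action='store_true',
                        help='Strip accents (hf_wordpiece only).')
    parser.add_argument('--num_process', type=int, default=4,
                        help='Number of worker processes.')
    main(parser.parse_args())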