def train_tokenizer(data_file_paths, vocab_size):
    # Train a WordPiece tokenizer from scratch on the given text files.
    t = BertWordPieceTokenizer()
    t.train(
        files=data_file_paths,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        limit_alphabet=1000,
    )
    return t
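A minimal usage sketch for the helper above, assuming the tokenizers package is installed; the corpus path, output directory, and vocab_size value are illustrative placeholders.

import os
from tokenizers import BertWordPieceTokenizer

trained = train_tokenizer(["corpus.txt"], vocab_size=30_000)
os.makedirs("tokenizer_out", exist_ok=True)
# save_model writes the learned vocabulary as vocab.txt into the directory
trained.save_model("tokenizer_out")
print(trained.encode("hello world").tokens)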
def load_WordPiece(self):
    tokenizer = BertWordPieceTokenizer(
        vocab=os.path.join(self.config['tokenizer_path'], 'tokenizer-vocab.txt'),
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
        wordpieces_prefix='##',
    )
    return tokenizer
def train_custom_tokenizer(files: List[str], tokenizer_file: str,
                           **kwargs) -> BertWordPieceTokenizer:
    """Train and save a tokenizer that uses a custom PreTokenizer."""
    tokenizer = BertWordPieceTokenizer(
        handle_chinese_chars=False,  # for Japanese
        strip_accents=False,         # for Japanese
    )
    tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())

    # Learn the subword vocabulary from the given corpus files.
    tokenizer.train(files, **kwargs)

    # Save a tokenizer JSON that holds the vocab plus the preprocessing parameters.
    # NOTE: a custom PreTokenizer written in Python cannot be serialized, so a
    # Rust-based PreTokenizer is injected as a placeholder before saving.
    # The JSON therefore records the dummy PreTokenizer, and the custom
    # PreTokenizer must be re-attached after loading.
    tokenizer._tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.save(tokenizer_file)

    # (Optional) the vocab is also saved as a plain .txt file, named from the
    # prefix `filename`, in case external processing needs it.
    filename = "wordpiece"
    model_files = tokenizer._tokenizer.model.save(
        str(pathlib.Path(tokenizer_file).parent), filename)

    return tokenizer
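As the note above says, the saved JSON records the dummy BertPreTokenizer, so a loading helper has to re-attach the custom one. A minimal sketch, assuming MecabPreTokenizer is the same class used above; load_custom_tokenizer is a hypothetical helper name.

from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import PreTokenizer

def load_custom_tokenizer(tokenizer_file: str) -> Tokenizer:
    # Load the serialized tokenizer, then swap the dummy Rust PreTokenizer
    # recorded in the JSON back for the Python MecabPreTokenizer.
    tokenizer = Tokenizer.from_file(tokenizer_file)
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
    return tokenizer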
def train_bert_tokenizer(sentences: List[str], serialize_path: str,
                         vocab_size: int = 6000) -> BertTokenizer:
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )
    tokenizer.train_from_iterator(
        sentences,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=500,
        wordpieces_prefix="##",
    )
    # Save the files -- first write out the vocab, then use BertTokenizer's save_pretrained
    tokenizer.save_model(serialize_path)
    bert_tokenizer = BertTokenizer.from_pretrained(serialize_path + os.sep + "vocab.txt")
    bert_tokenizer.save_pretrained(serialize_path)
    os.rename(serialize_path + os.sep + "tokenizer_config.json",
              serialize_path + os.sep + "config.json")
    return bert_tokenizer
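An illustrative call of the function above; the sentences and output directory are placeholders, not real training data.

import os

os.makedirs("bert_tok", exist_ok=True)
sentences = ["a first training sentence", "and a second one"]
bert_tok = train_bert_tokenizer(sentences, "bert_tok", vocab_size=6000)
# The returned object is a transformers BertTokenizer built on the new vocab.
print(bert_tok.tokenize("a first sentence"))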
def training_WordPiece(self):
    tokenizer = BertWordPieceTokenizer(
        vocab=None,
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
        wordpieces_prefix='##',
    )
    tokenizer.train(
        [
            os.path.join(self.corpus_dir_path, file_path)
            for file_path in os.listdir(self.corpus_dir_path)
            if 'mecab' in file_path
        ],
        limit_alphabet=self.config['limit_alphabet'],
        vocab_size=self.config['vocab_size'],
        special_tokens=self.get_special_tokens(),
    )
    print('training WordPiece is finished!')
    tokenizer.save_model(self.config['tokenizer_path'], prefix='tokenizer')
    print('tokenizer is saved in {}'.format(
        os.path.join(self.config['tokenizer_path'], 'tokenizer-vocab.txt')))
def test_encode_formats(self, bert_files):
    with pytest.deprecated_call():
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

    # Encode
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

    # Encode batch
    result_single = [
        ["[CLS]", "my", "name", "is", "john", "[SEP]"],
        ["[CLS]", "my", "name", "is", "georges", "[SEP]"],
    ]
    result_pair = [
        ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"],
        ["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
    ]

    def format(encodings):
        return [e.tokens for e in encodings]

    def test_single(input, is_pretokenized=False):
        output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
        assert format(output) == result_single

    def test_pair(input, is_pretokenized=False):
        output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
        assert format(output) == result_pair

    # Classic inputs

    # Lists
    test_single(["My name is John", "My name is Georges"])
    test_pair([("my name is john", "pair"), ("my name is georges", "pair")])
    test_pair([["my name is john", "pair"], ["my name is georges", "pair"]])

    # Tuples
    test_single(("My name is John", "My name is Georges"))
    test_pair((("My name is John", "pair"), ("My name is Georges", "pair")))

    # Numpy
    test_single(np.array(["My name is John", "My name is Georges"]))
    test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
    test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))

    # PreTokenized inputs

    # Lists
    test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
    test_pair(
        [
            (["My", "name", "is", "John"], ["pair"]),
            (["My", "name", "is", "Georges"], ["pair"]),
        ],
        True,
    )
    test_pair(
        [
            [["My", "name", "is", "John"], ["pair"]],
            [["My", "name", "is", "Georges"], ["pair"]],
        ],
        True,
    )

    # Tuples
    test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
    test_pair(
        (
            (("My", "name", "is", "John"), ("pair",)),
            (("My", "name", "is", "Georges"), ("pair",)),
        ),
        True,
    )
    test_pair(
        (
            (["My", "name", "is", "John"], ["pair"]),
            (["My", "name", "is", "Georges"], ["pair"]),
        ),
        True,
    )

    # Numpy
    test_single(
        np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
        True,
    )
    test_single(
        np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
        True,
    )
    test_pair(
        np.array(
            [
                [["My", "name", "is", "John"], ["pair"]],
                [["My", "name", "is", "Georges"], ["pair"]],
            ],
            dtype=object,
        ),
        True,
    )
    test_pair(
        np.array(
            (
                (("My", "name", "is", "John"), ("pair",)),
                (("My", "name", "is", "Georges"), ("pair",)),
            ),
            dtype=object,
        ),
        True,
    )

    # Mal formed
    with pytest.raises(TypeError, match="TextInputSequence must be str"):
        tokenizer.encode([["my", "name"]])
    with pytest.raises(TypeError, match="TextInputSequence must be str"):
        tokenizer.encode("My name is john", [["pair"]])
    with pytest.raises(TypeError, match="TextInputSequence must be str"):
        tokenizer.encode("my name is john", ["pair"])

    with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", is_pretokenized=True)
    with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
    with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
def test_encode_formats(self, bert_files):
    tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

    # Well formed
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
    assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

    output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
    assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]

    output = tokenizer.encode_batch([("my name is john", "pair"),
                                     ("my name is john", "pair")])
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
    assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

    output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]

    # Mal formed
    with pytest.raises(ValueError, match="InputSequence must be str"):
        tokenizer.encode([["my", "name"]])
        tokenizer.encode("My name is john", [["pair"]])
        tokenizer.encode("my name is john", ["pair"])

    with pytest.raises(ValueError, match="InputSequence must be Union[List[str]"):
        tokenizer.encode("My name is john", is_pretokenized=True)
        tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
        tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
from tokenizers.implementations import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

tokenizer = BertWordPieceTokenizer(vocab_file="./FaBerto/vocab.txt")
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("مرکزگرایی یکی از عوامل اصلی خداناباوری است.").tokens)
print(tokenizer.encode("این یک تست است.").ids)

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./FaBerto", max_len=512)
model = RobertaForMaskedLM(config=config)
import glob

import tokenizers
from tokenizers.implementations import BertWordPieceTokenizer
from transformers import BertTokenizer

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

files = glob.glob("./corpus_for_tokenization/*.txt")
tokenizer.train(
    files,
    vocab_size=50000,
    min_frequency=3,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=15000,
    wordpieces_prefix="##",
)

tokenizer.save_model("./")
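A sketch of loading the saved vocabulary back, assuming the vocab.txt produced by save_model("./") above; the sample sentence is a placeholder, and the BertTokenizerFast line is one possible way to reuse the vocab on the transformers side.

from tokenizers.implementations import BertWordPieceTokenizer
from transformers import BertTokenizerFast

# Rebuild the fast tokenizer directly from the saved vocab file.
loaded = BertWordPieceTokenizer("./vocab.txt", lowercase=True)
print(loaded.encode("a quick check").tokens)

# Or wrap the same vocab in a transformers tokenizer for model training.
hf_tokenizer = BertTokenizerFast("./vocab.txt", do_lower_case=True)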