from tokenizers import Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.pre_tokenizers import PreTokenizer, Sequence, Whitespace


def load_janome_tokenizer(tokenizer_path) -> Tokenizer:
    # Custom Python components cannot be serialized into the tokenizer
    # JSON, so they are re-attached after loading from disk.
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = Sequence([
        Whitespace(),
        PreTokenizer.custom(JanomePreTokenizer()),
    ])
    tokenizer.decoder = Decoder.custom(JanomeDecoder())
    return tokenizer
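
# JanomePreTokenizer and JanomeDecoder are not shown in the snippet above.
# A minimal sketch of what they might look like, following the custom
# pre-tokenizer/decoder protocol from the tokenizers docs; the janome
# wakati-mode usage and the offset bookkeeping below are assumptions,
# not the original implementation:
from typing import List

from janome.tokenizer import Tokenizer as JanomeWordTokenizer
from tokenizers import NormalizedString, PreTokenizedString


class JanomePreTokenizer:
    def __init__(self):
        # wakati=True makes janome yield plain surface strings
        self.janome = JanomeWordTokenizer(wakati=True)

    def janome_split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        text = str(normalized)
        splits, offset = [], 0
        for word in self.janome.tokenize(text):
            start = text.find(word, offset)
            stop = start + len(word)
            splits.append(normalized[start:stop])
            offset = stop
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        # split() applies janome_split to every piece produced so far
        pretok.split(self.janome_split)


class JanomeDecoder:
    def decode(self, tokens: List[str]) -> str:
        # Japanese is written without spaces, so just concatenate
        return "".join(tokens)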
import pickle

from tokenizers.pre_tokenizers import PreTokenizer, Punctuation, Sequence, WhitespaceSplit


def test_bert_like(self):
    pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation()])
    assert isinstance(Sequence([]), PreTokenizer)
    assert isinstance(Sequence([]), Sequence)
    assert isinstance(pickle.loads(pickle.dumps(pre_tokenizer)), Sequence)

    # Note the run of spaces between the sentences: the expected offsets
    # below ("How" at 16) only line up if the input keeps them.
    result = pre_tokenizer.pre_tokenize_str("Hey friend!     How are you?!?")
    assert result == [
        ("Hey", (0, 3)),
        ("friend", (4, 10)),
        ("!", (10, 11)),
        ("How", (16, 19)),
        ("are", (20, 23)),
        ("you", (24, 27)),
        ("?", (27, 28)),
        ("!", (28, 29)),
        ("?", (29, 30)),
    ]
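
# Why WhitespaceSplit + Punctuation is "bert-like": WhitespaceSplit only
# splits on whitespace and leaves punctuation attached, so Punctuation is
# needed as a second pass. A quick illustration (the expected outputs in
# the comments are not from the original snippet):
from tokenizers.pre_tokenizers import Punctuation, Sequence, WhitespaceSplit

print(WhitespaceSplit().pre_tokenize_str("Hey friend!"))
# [('Hey', (0, 3)), ('friend!', (4, 11))]
print(Sequence([WhitespaceSplit(), Punctuation()]).pre_tokenize_str("Hey friend!"))
# [('Hey', (0, 3)), ('friend', (4, 10)), ('!', (10, 11))]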
def test_instantiate(self):
    assert Sequence([]) is not None
    assert isinstance(Sequence([]), PreTokenizer)
    assert isinstance(Sequence([]), Sequence)
    dumped = pickle.dumps(Sequence([]))
    assert isinstance(pickle.loads(dumped), Sequence)
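
# A short example of what Sequence does outside the test suite: it runs
# the listed pre-tokenizers in order, each one further splitting the
# pieces left by the previous one (the sample string is made up):
from tokenizers.pre_tokenizers import Digits, Sequence, Whitespace

pre = Sequence([Whitespace(), Digits(individual_digits=True)])
print(pre.pre_tokenize_str("abc123 def"))
# [('abc', (0, 3)), ('1', (3, 4)), ('2', (4, 5)), ('3', (5, 6)), ('def', (7, 10))]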
import os

import pandas as pd

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data: deduplicated item names serve as the training corpus
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    # the snippet is truncated here in the source; the remaining arguments
    # and the train/save calls presumably mirror the WordPiece block above
)
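
# A quick sanity check once training has run (the sample item name is
# hypothetical, not taken from the data_fusion dataset):
from tokenizers import Tokenizer

tok = Tokenizer.from_file('data/tokenizers/wordpiece_70k.json')
encoding = tok.encode("coca-cola 0.5l")
print(encoding.tokens)  # subword pieces, lowercased, with digits split off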