Пример #1
0
def load_janome_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = Sequence([
        Whitespace(),
        PreTokenizer.custom(JanomePreTokenizer()),
    ])
    tokenizer.decoder = Decoder.custom(JanomeDecoder())
    return tokenizer
Пример #2
0
    def test_bert_like(self):
        pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation()])
        assert isinstance(Sequence([]), PreTokenizer)
        assert isinstance(Sequence([]), Sequence)
        assert isinstance(pickle.loads(pickle.dumps(pre_tokenizer)), Sequence)

        result = pre_tokenizer.pre_tokenize_str("Hey friend!     How are you?!?")
        assert result == [
            ("Hey", (0, 3)),
            ("friend", (4, 10)),
            ("!", (10, 11)),
            ("How", (16, 19)),
            ("are", (20, 23)),
            ("you", (24, 27)),
            ("?", (27, 28)),
            ("!", (28, 29)),
            ("?", (29, 30)),
        ]
Пример #3
0
 def test_instantiate(self):
     assert Sequence([]) is not None
     assert isinstance(Sequence([]), PreTokenizer)
     assert isinstance(Sequence([]), Sequence)
     dumped = pickle.dumps(Sequence([]))
     assert isinstance(pickle.loads(dumped), Sequence)
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],