from tokenizers.pre_tokenizers import Digits


def test_can_modify(self):
    pretok = Digits(individual_digits=False)
    assert pretok.individual_digits is False

    # The flag can be modified in place after construction
    pretok.individual_digits = True
    assert pretok.individual_digits is True
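
To make the flag's effect concrete, here is a minimal sketch using the library's pre_tokenize_str helper (the sample string and the offset tuples in the comments are illustrative, not from the original test):

from tokenizers.pre_tokenizers import Digits

# individual_digits=False keeps runs of digits together
print(Digits(individual_digits=False).pre_tokenize_str("CL100"))
# roughly [('CL', (0, 2)), ('100', (2, 5))]

# individual_digits=True splits every digit into its own piece
print(Digits(individual_digits=True).pre_tokenize_str("CL100"))
# roughly [('CL', (0, 2)), ('1', (2, 3)), ('0', (3, 4)), ('0', (4, 5))]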
Example #2
import os
from pathlib import Path

import tokenizers
from tokenizers.pre_tokenizers import Whitespace, Digits


def __init__(self, path):
    self.path = path
    text_paths = [
        str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")
    ]
    savedpath = "./dataset/tok_model/MALBERT-vocab.txt"
    if os.path.exists(savedpath):
        # Reuse the vocabulary saved by an earlier training run
        self.tokenizer = tokenizers.BertWordPieceTokenizer(savedpath)
    else:
        # Train a fresh WordPiece vocabulary on the corpus and save it
        self.tokenizer = tokenizers.BertWordPieceTokenizer()
        self.tokenizer.train(
            files=text_paths,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            vocab_size=14200)
        self.tokenizer.save_model("./dataset/tok_model", "MALBERT")
    self.tokenizer.enable_truncation(max_length=512)
    self.pretokenizer = tokenizers.pre_tokenizers.Sequence(
        [Whitespace(), Digits(individual_digits=True)])
    self.vocab = self.tokenizer.get_vocab()
    self.mask_index = self.vocab.get("[MASK]")
    self.pad_index = self.vocab.get("[PAD]")
    self.eos_index = self.vocab.get("[SEP]")
    self.sos_index = self.vocab.get("[CLS]")
    self.unk_index = self.vocab.get("[UNK]")
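
A hypothetical usage sketch for an object built by this __init__ (the instance name ds and the sample strings are assumptions, not part of the original class):

# Hypothetical: `ds` is an instance of the dataset class above
encoding = ds.tokenizer.encode("room 42 is free")
print(encoding.tokens)  # WordPiece pieces wrapped in [CLS] ... [SEP]
print(encoding.ids)     # integer ids consistent with ds.vocab
print(ds.pretokenizer.pre_tokenize_str("room 42"))  # digits split one by one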
Example #3
import pickle

from tokenizers.pre_tokenizers import Digits, PreTokenizer


def test_instantiate(self):
    assert Digits() is not None
    assert isinstance(Digits(), PreTokenizer)
    assert isinstance(Digits(), Digits)
    assert isinstance(Digits(True), Digits)
    assert isinstance(Digits(False), Digits)
    assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
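The pickle round-trip in the last assertion should also preserve the pre-tokenizer's configuration; a small sketch of that check (this extra assertion is an assumption, not part of the original test):

import pickle
from tokenizers.pre_tokenizers import Digits

restored = pickle.loads(pickle.dumps(Digits(individual_digits=True)))
assert isinstance(restored, Digits)
assert restored.individual_digits is True  # configuration survives the round-trip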
Example #4
import os

import pandas as pd

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))
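
As a quick sanity check (hypothetical, not part of the training script), the saved file can be reloaded with Tokenizer.from_file and applied to one of the item names:

# Reload the saved WordPiece tokenizer and encode a single item name
reloaded = Tokenizer.from_file(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))
enc = reloaded.encode(item_names[0])
print(enc.tokens)  # lowercased WordPiece pieces with digit runs split off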

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],