def test_can_modify(self):
    pretok = Digits(individual_digits=False)
    assert pretok.individual_digits == False

    # Modify these
    pretok.individual_digits = True
    assert pretok.individual_digits == True
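For context, a minimal sketch of what the toggled flag actually changes, using pre_tokenize_str (the standard tokenizers API for inspecting a pre-tokenizer's output); the outputs sketched in the comments are what the library is expected to produce:

    from tokenizers.pre_tokenizers import Digits

    # Keep each run of digits together as one piece...
    print(Digits(individual_digits=False).pre_tokenize_str("call 911"))
    # [('call ', (0, 5)), ('911', (5, 8))]

    # ...or split every digit into its own piece.
    print(Digits(individual_digits=True).pre_tokenize_str("call 911"))
    # [('call ', (0, 5)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8))]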
import os
from pathlib import Path

import tokenizers
from tokenizers.pre_tokenizers import Whitespace, Digits


def __init__(self, path):
    self.path = path
    text_paths = [str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")]
    savedpath = "./dataset/tok_model/MALBERT-vocab.txt"
    if os.path.exists(savedpath):
        # Reuse the vocabulary saved by a previous run.
        self.tokenizer = tokenizers.BertWordPieceTokenizer(savedpath)
    else:
        # Train a fresh WordPiece vocabulary on the corpus and save it.
        self.tokenizer = tokenizers.BertWordPieceTokenizer()
        self.tokenizer.train(
            files=text_paths,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            vocab_size=14200)
        self.tokenizer.save_model("./dataset/tok_model", "MALBERT")
    self.tokenizer.enable_truncation(max_length=512)
    # Stored separately; note it is not attached to self.tokenizer here.
    self.pretokenizer = tokenizers.pre_tokenizers.Sequence(
        [Whitespace(), Digits(individual_digits=True)])
    # Cache the ids of the special tokens for masking/padding later.
    self.vocab = self.tokenizer.get_vocab()
    self.mask_index = self.vocab.get("[MASK]")
    self.pad_index = self.vocab.get("[PAD]")
    self.eos_index = self.vocab.get("[SEP]")
    self.sos_index = self.vocab.get("[CLS]")
    self.unk_index = self.vocab.get("[UNK]")
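A hypothetical usage sketch for the wrapper above. The enclosing class name is not shown in the source, so `Corpus` is assumed here, and the example tokens are illustrative:

    corpus = Corpus("./dataset/corpus/")  # hypothetical class name

    # BertWordPieceTokenizer.encode returns an Encoding with [CLS]/[SEP]
    # added and sequences truncated to the 512-token limit set above.
    enc = corpus.tokenizer.encode("room 404 was not found")
    print(enc.tokens)  # e.g. ['[CLS]', 'room', '40', '##4', ..., '[SEP]']
    print(corpus.mask_index, corpus.pad_index, corpus.unk_index)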
def test_instantiate(self):
    assert Digits() is not None
    assert isinstance(Digits(), PreTokenizer)
    assert isinstance(Digits(), Digits)
    assert isinstance(Digits(True), Digits)
    assert isinstance(Digits(False), Digits)
    assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
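The last assertion only checks the type after a pickle round-trip; a quick self-contained sketch, assuming nothing beyond the standard library pickle module, that the configuration is expected to survive as well:

    import pickle

    from tokenizers.pre_tokenizers import Digits

    restored = pickle.loads(pickle.dumps(Digits(individual_digits=True)))
    assert isinstance(restored, Digits)
    # Assumed: the flag is part of the serialized state and round-trips.
    assert restored.individual_digits == True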
import os

import pandas as pd

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    # The source snippet cuts off here; the rest is completed by analogy
    # with the WordPiece block above (vocab size and file name assumed).
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'bpe_70k.json'))
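To sanity-check the saved artifacts, the JSON files can be reloaded with Tokenizer.from_file (the standard tokenizers API); a brief sketch:

    from tokenizers import Tokenizer

    loaded = Tokenizer.from_file(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))
    # The Lowercase normalizer and Whitespace/Digits pre-tokenizers are
    # stored in the JSON, so the loaded tokenizer behaves like the trained one.
    print(loaded.encode("iPhone 12 Pro 256GB").tokens)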