from tokenizers import ByteLevelBPETokenizer


def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a simple byte-level BPE tokenizer and save its vocab/merges files."""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    # "PAD" is trained in as the first special token, so it receives id 0,
    # matching enable_padding's default pad_id of 0.
    GPTToken.enable_padding(pad_token="PAD")
    GPTToken.train(
        [input_file_path],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=["PAD"],
    )
    GPTToken.save_model(output_path)
    return None
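A minimal usage sketch for Tok_Train, assuming a hypothetical plain-text corpus at data/corpus.txt and an output directory tok_out/ that is created first:

import os

# Hypothetical paths: data/corpus.txt is the training corpus; tok_out/ is the
# directory that save_model() will write vocab.json and merges.txt into.
os.makedirs("tok_out", exist_ok=True)
Tok_Train("data/corpus.txt", vocab_size=30000, output_path="tok_out")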
def __init__(self, max_tokens=512):
    ## RoBERTa uses a byte-level BPE tokenizer similar to GPT's
    t = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")
    # Wrap every encoding in <s> ... </s> (CLS/SEP-style) via a BERT post-processor.
    t._tokenizer.post_processor = BertProcessing(
        ("</s>", t.token_to_id("</s>")),
        ("<s>", t.token_to_id("<s>")),
    )
    t.enable_truncation(max_tokens)
    # Pad out to max_tokens; pad_token is passed explicitly so the token string
    # matches the pad_id used.
    t.enable_padding(length=max_tokens, pad_token="<pad>", pad_id=t.token_to_id("<pad>"))
    self.tokenizer = t
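With the post-processor, truncation, and padding configured this way, every encoding is framed by <s>/</s> and padded out to max_tokens. A quick check, assuming wrapper is an instance of the (not shown) class that owns this __init__:

enc = wrapper.tokenizer.encode("a short review")
print(len(enc.ids))           # == 512 (max_tokens) because of enable_padding
print(enc.tokens[0])          # '<s>', prepended by BertProcessing
print("<pad>" in enc.tokens)  # True for inputs shorter than max_tokens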
def inference():
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing

    ''' initialize tokenizer with saved model files '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )

    ''' optional step: preprocess the strings
        Ex: add <s> and </s> as BOS and EOS tokens to the string,
            pad the string to some max length and truncate it to some max length '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)

    ''' tokenize/encode a single string '''
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)

    ''' tokenize/encode a batch of strings '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
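The checkpoint files loaded in inference() follow the {prefix}-vocab.json / {prefix}-merges.txt naming that save_model() uses when given a prefix. A training sketch that would produce them, assuming a hypothetical corpus file data/corpus.txt and special tokens matching the ones used above:

import os
from tokenizers import ByteLevelBPETokenizer

os.makedirs("./tok_checkpoints", exist_ok=True)
trainer_tok = ByteLevelBPETokenizer()
trainer_tok.train(
    ["data/corpus.txt"],                                   # hypothetical corpus
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
# writes ./tok_checkpoints/tokenizer_model-vocab.json and ...-merges.txt
trainer_tok.save_model("./tok_checkpoints", "tokenizer_model")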
from tokenizers import ByteLevelBPETokenizer


class ByteBPETokenizer:
    def __init__(self, vocab_json, merge_txt, max_length=750):
        self.tokenizer = ByteLevelBPETokenizer(vocab_json, merge_txt)
        self.tokenizer.enable_truncation(max_length=max_length)
        # enable_padding takes `length=`, not `max_length=`
        self.tokenizer.enable_padding(length=max_length)
        self.tokenizer.add_special_tokens(["[PAD]", "[CLS]"])
        # self.tokenizer.post_processor = RobertaProcessing(("</s>", 2), ("<s>", 1))
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def encode(self, review):
        # clean_sentence is a text-cleaning helper defined elsewhere in the project.
        review = clean_sentence(review)
        encoded = self.tokenizer.encode(review.lower())
        # pp_encoded = self.tokenizer.post_process(encoded)
        return encoded

    def tokenize2Index(self, review, should_stem=False):
        encoded = self.encode(review)
        return encoded.ids

    def trainBPE(self, paths, vocab_size=30000, min_frequency=10,
                 special_tokens=["[PAD]", "[CLS]"]):
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(files=paths, vocab_size=vocab_size,
                        min_frequency=min_frequency, special_tokens=special_tokens)
        # save_model writes yelp_bpe/yelp-bpe-vocab.json and yelp_bpe/yelp-bpe-merges.txt
        tokenizer.save_model("yelp_bpe/", "yelp-bpe")
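A usage sketch for ByteBPETokenizer, assuming the vocab/merges files from an earlier trainBPE() run exist under yelp_bpe/ and that clean_sentence is importable from the surrounding project:

bpe = ByteBPETokenizer("yelp_bpe/yelp-bpe-vocab.json",
                       "yelp_bpe/yelp-bpe-merges.txt")

ids = bpe.tokenize2Index("The tacos were great, the service not so much...")
print(len(ids))     # 750: every review is truncated/padded to max_length
print(ids[:10])     # first ten token ids of the cleaned, lower-cased review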