def inference():
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing

    '''
    initialize tokenizer with saved model files
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )

    '''
    optional step: post-process the strings, e.g.
        - add <s> and </s> as BOS and EOS tokens to the string
        - pad the string to some max length
        - truncate the string to some max length
    '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)

    '''
    tokenize/encode a single string
    '''
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)

    '''
    tokenize/encode a batch of strings
    '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
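The two checkpoint files loaded above have to be produced by a training step first. Below is a minimal sketch of that step; the corpus path train.txt, the vocab size, and the ./tok_checkpoints directory are assumptions for illustration. Note that the special tokens list must include <s>, </s>, and <pad>, otherwise the token_to_id and get_vocab lookups in inference() will fail.

import os
from tokenizers import ByteLevelBPETokenizer

def train_tokenizer():
    # Train a byte-level BPE tokenizer on a plain-text corpus.
    # "train.txt" and "./tok_checkpoints" are hypothetical paths.
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        ["train.txt"],
        vocab_size=5000,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
        show_progress=False,
    )
    os.makedirs("./tok_checkpoints", exist_ok=True)
    # Writes tokenizer_model-vocab.json and tokenizer_model-merges.txt,
    # the two files that inference() loads.
    tokenizer.save_model("./tok_checkpoints", "tokenizer_model")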
def __init__(self, path, vocab_size=-1, use_bpe=False, tokenizer_data=""):
    self.dictionary = Dictionary()
    if use_bpe:
        assert os.path.exists(path), "Path does not exist: " + path
        print("-------------------------------------------------------------")
        # Train a byte-level BPE tokenizer, either on a separate
        # tokenizer_data corpus or on the dataset's own training split.
        tokenizer = ByteLevelBPETokenizer()
        if len(tokenizer_data) != 0:
            print("Training tokenizer on: " +
                  os.path.join(tokenizer_data, 'train.txt'))
            tokenizer.train([os.path.join(tokenizer_data, 'train.txt')],
                            vocab_size=vocab_size,
                            show_progress=False)
        else:
            print("Training tokenizer on: " + os.path.join(path, 'train.txt'))
            tokenizer.train(
                [
                    os.path.join(path, 'train.txt')
                    # os.path.join(path, 'valid.txt'),
                    # os.path.join(path, 'test.txt')
                ],
                vocab_size=vocab_size,
                show_progress=False)
        print("-------------------------------------------------------------")
        print("Encoding dataset at: " + path)

        # Encode each split into a flat 1-D LongTensor of token ids and
        # record the average number of characters per BPE token.
        with open(os.path.join(path, 'train.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            enc = tokenizer.encode(text)
            ids = torch.LongTensor(len(enc.ids))
            for index, token_id in enumerate(enc.ids):
                ids[index] = token_id
            self.train = ids
            self.dictionary.avg_characters_per_token['train'] = (
                len(text) / len(enc.ids))

        with open(os.path.join(path, 'valid.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            enc = tokenizer.encode(text)
            ids = torch.LongTensor(len(enc.ids))
            for index, token_id in enumerate(enc.ids):
                ids[index] = token_id
            self.valid = ids
            self.dictionary.avg_characters_per_token['valid'] = (
                len(text) / len(enc.ids))

        with open(os.path.join(path, 'test.txt'), 'r', encoding='utf-8') as f:
            text = f.read()
            enc = tokenizer.encode(text)
            ids = torch.LongTensor(len(enc.ids))
            for index, token_id in enumerate(enc.ids):
                ids[index] = token_id
            self.test = ids
            self.dictionary.avg_characters_per_token['test'] = (
                len(text) / len(enc.ids))

        print("-------------------------------------------------------------")

        # Expose the BPE vocabulary through the Dictionary interface.
        self.dictionary.word2idx = tokenizer.get_vocab()
        self.dictionary.idx2word = [
            tokenizer.id_to_token(x)
            for x in range(tokenizer.get_vocab_size())
        ]
        self.dictionary.total = tokenizer.get_vocab_size()
    else:
        # Fall back to the word-level tokenizer when BPE is disabled.
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
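For context, a hypothetical call site for this constructor, assuming it belongs to a Corpus class in the style of the usual PyTorch word-language-model corpus readers; the class name and the data directory are made up for illustration, while the attributes accessed (train, dictionary.total, dictionary.avg_characters_per_token) come from the code above.

# Hypothetical usage: 'Corpus' and the path are assumptions.
corpus = Corpus('data/wikitext-2', vocab_size=5000, use_bpe=True)

# Each split is a flat LongTensor of BPE ids, ready for batching.
print(corpus.train.size(), corpus.valid.size(), corpus.test.size())
print("vocab size:", corpus.dictionary.total)
print("chars/token (train):",
      corpus.dictionary.avg_characters_per_token['train'])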