import torch
from tokenizers import ByteLevelBPETokenizer

# LMModel is assumed to be the project's PyTorch Lightning language model,
# defined elsewhere.


def inference(checkpoint_path, hyperparameters_path, tokenizer_path, merges_path,
              input='In 1691 Moscow established ', generated_length=64,
              random_selection=True):
    # Initialize tokenizer from the pretrained vocab and merges files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )

    # Initialize model from the checkpoint
    model = LMModel.load_from_checkpoint(
        checkpoint_path=checkpoint_path, hparams_file=hyperparameters_path
    )

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(
            torch.tensor(encoded_sample, dtype=torch.long), dim=0
        )
        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]
        if not random_selection:
            # Pick the highest-probability token from the distribution
            prediction = torch.argmax(output, dim=2).squeeze(dim=0).tolist()[-1]
        else:
            # Sample a token according to the (sharpened) probabilities
            prediction = torch.multinomial(
                torch.softmax(last_word, 0) ** 10, 1
            )[0].item()
        # Add prediction to the sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    return decoded_output, output_tokens, attn
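# Usage sketch for the function above; all paths are hypothetical stand-ins
# for the artifacts produced by a training run.
text, tokens, attn = inference(
    checkpoint_path='checkpoints/last.ckpt',
    hyperparameters_path='checkpoints/hparams.yaml',
    tokenizer_path='tokenizer/vocab.json',
    merges_path='tokenizer/merges.txt',
    generated_length=32,
)
print(text)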
from typing import List

# BPEHelper, Opt, TShared, PathManager, logging, and warn_once come from
# ParlAI's internals; this class is a ParlAI dictionary helper.


class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter the vocabulary when using --dict-tokenizer '
                'bytelevelbpe (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Tokenize a text string into a list of BPE tokens.

        :param text: text to tokenize

        :return tokens: list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode a list of tokens into a text string.

        :param tokens: list of tokens
        :param token_ids: list of token ids
        :param delimiter: string delimiter for tokens

        :return text: decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF
        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.add_special_tokens(dict_agent, special_tokens)

        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the Hugging Face word frequency table,
            # so just set every frequency to 1 instead.
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name: directory to save.
        :param file_name: file to save.
        """
        self.tokenizer.save_model(dir_name, file_name)
import os
from typing import List

# Earlier variant of the ParlAI helper above; BPEHelper, Opt, and TShared
# again come from ParlAI's internals.


class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )
        if self.lower:
            raise ValueError(
                'Only use --dict-lower false with --dict-tokenizer bytelevelbpe'
            )
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter the vocabulary when using --dict-tokenizer '
                'bytelevelbpe (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )
        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Tokenize a text string into a list of BPE tokens.

        :param text: text to tokenize

        :return tokens: list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode a list of tokens into a text string.

        :param tokens: list of tokens
        :param token_ids: list of token ids
        :param delimiter: string delimiter for tokens

        :return text: decoded text
        """
        text = self.tokenizer.decode(token_ids)
        return text

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        # Skip the trailing ids added for the 4 special tokens above.
        for i in range(self.tokenizer.get_vocab_size() - 4):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the Hugging Face word frequency table,
            # so just set every frequency to 1 instead.
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name: directory to save.
        :param file_name: file to save.
        """
        # Pre-0.8 tokenizers API; newer releases call this save_model().
        self.tokenizer.save(dir_name, file_name)
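# Both variants require a pretrained vocab/merges pair on disk. A sketch of
# producing one with the `tokenizers` library (corpus path and dict prefix
# are hypothetical):
from tokenizers import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer(add_prefix_space=True)
tok.train(['corpus/train.txt'], vocab_size=8000)
# Writes mydict-vocab.json and mydict-merges.txt, matching the
# {dict_file}-vocab.json / {dict_file}-merges.txt pattern probed in __init__.
tok.save_model('.', 'mydict')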
def __init__(self, path, vocab_size=-1, use_bpe=False, tokenizer_data=""):
    self.dictionary = Dictionary()
    if use_bpe:
        assert os.path.exists(path), "Path does not exist: " + path
        print("-" * 61)
        tokenizer = ByteLevelBPETokenizer()
        # Train the tokenizer on the dedicated tokenizer corpus if one is
        # given, otherwise on the dataset's own training split.
        train_file = os.path.join(tokenizer_data or path, 'train.txt')
        print("Training tokenizer on: " + train_file)
        tokenizer.train([train_file], vocab_size=vocab_size, show_progress=False)
        print("-" * 61)

        print("Encoding dataset at: " + path)
        for split in ('train', 'valid', 'test'):
            with open(os.path.join(path, f'{split}.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
            enc = tokenizer.encode(text)
            setattr(self, split, torch.tensor(enc.ids, dtype=torch.long))
            self.dictionary.avg_characters_per_token[split] = (
                len(text) / len(enc.ids)
            )
        print("-" * 61)

        self.dictionary.word2idx = tokenizer.get_vocab()
        self.dictionary.idx2word = [
            tokenizer.id_to_token(x) for x in range(tokenizer.get_vocab_size())
        ]
        self.dictionary.total = tokenizer.get_vocab_size()
    else:
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))
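# Usage sketch, assuming this __init__ belongs to a corpus class (the name
# `Corpus` and the directory layout with train/valid/test .txt files are
# assumptions, not shown above).
corpus = Corpus('data/wikitext-2', vocab_size=10000, use_bpe=True)
print(corpus.train.shape)  # 1-D LongTensor of BPE token ids
print(corpus.dictionary.avg_characters_per_token['train'])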
import json
import os
import shutil
import typing

import torch
import torch.nn as nn
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

# `special_tokens` is assumed to be a module-level list defined elsewhere.


class HuggingfaceTokenizerBPE(nn.Module):
    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. If we want a
        # number larger than that, we will also have to change the BERT
        # configuration.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'
        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']
        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(os.path.join(dataset_info_path,
                           f'tokenizer_{config_name}_{vocab_size}-vocab.json'))
        # Loading is only properly implemented starting from version 0.8.
        # However, it makes the system use a lot of CPU for no reason (it is
        # much slower). Maybe it will be fixed in the future.
        if not os.path.isfile(
                os.path.join(dataset_info_path,
                             f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()
            self.tokenizer = ByteLevelBPETokenizer()
            # Join everything into a single file. This should NOT be
            # necessary, but training does not work properly with many files.
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(
                        text_files,
                        desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
            text_files = '/tmp/text_files.txt'
            self.tokenizer.train(text_files, vocab_size=vocab_size,
                                 special_tokens=special_tokens)
            self.tokenizer.save(dataset_info_path,
                                f'tokenizer_{config_name}_{vocab_size}')
        # No "else": always load, for consistency.
        vocab_file = os.path.join(
            dataset_info_path, f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(
            dataset_info_path, f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                               merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)
        self.index_special_tokens = {
            tok: self.tokenizer.encode(tok).ids[0] for tok in special_tokens
        }

    @property
    def device(self):
        # Assumes a `_float_tensor` buffer is registered elsewhere so the
        # module can report the device it was moved to.
        return self._float_tensor.device

    def encode(self, sentence: str):
        output = self.tokenizer.encode(sentence)
        token_ids = output.ids
        tokens = output.tokens
        return torch.tensor(token_ids), tokens

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        tokens = list(tokens.cpu().numpy())
        sentences = self.tokenizer.decode(tokens)
        return sentences

    def id_to_token(self, token_id):
        # Accept both plain ints and 0-dim tensors.
        return self.tokenizer.id_to_token(int(token_id))

    def token_to_id(self, token):
        assert type(token) == str
        return self.tokenizer.token_to_id(token)

    def __len__(self):
        return self.tokenizer.get_vocab_size()

    # This is simply for PyCharm to find the correct reference to the methods
    # of the class.
    def __call__(self, *input, **kwargs) -> typing.Any:
        return super().__call__(*input, **kwargs)
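# Round-trip sketch: assumes `special_tokens` and config/data/my_config.json
# exist as the constructor expects (both project-specific), and that
# `text_files` is a callable returning corpus paths, per `text_files()` above.
tok = HuggingfaceTokenizerBPE(lambda: ['corpus/a.txt', 'corpus/b.txt'],
                              dataset_info_path='info',
                              config_data='my_config')
ids, tokens = tok.encode('Byte-level BPE round trip')
assert tok.decode(ids) == 'Byte-level BPE round trip'  # byte-level BPE is lossless
print(len(tok), tokens)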