def test_train_from_iterator(self):
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = CharBPETokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)

    output = tokenizer.encode("A sentence")
    assert output.tokens == ["A</w>", "sentence</w>"]
def test_basic_encode(self, openai_files):
    tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
    output = tokenizer.encode("My name is John", "pair")

    assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
    assert output.tokens == [
        "<unk>",
        "y</w>",
        "name</w>",
        "is</w>",
        "<unk>",
        "o",
        "hn</w>",
        "pair</w>",
    ]
    assert output.offsets == [
        (0, 1),
        (1, 2),
        (3, 7),
        (8, 10),
        (11, 12),
        (12, 13),
        (13, 15),
        (0, 4),
    ]
    assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
import os
import tempfile

class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return
        tmp_file = tempfile.NamedTemporaryFile()
        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]
        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # NOTE: the special tokens are not skipped here even when
        # skip_special_tokens=True; filtering them manually, e.g.
        # [token for token in tokens if token > 3], works around this.
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
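A minimal usage sketch for the wrapper above. The corpus strings and cache directory are illustrative, and the NUL_token, PAD_token, BOS_token, and UNK_token constants referenced by build() are assumed to be defined elsewhere in the module:

tok = HuggingFaceTokenizer(cache_dir='./tok_cache', max_length=32, vocab_size=400)
tok.build(["a first sentence", "another sentence"])  # trains once, then caches vocab/merges
ids = tok.encode("Another sentence")  # lowercased and truncated to max_length
print(tok.decode(ids))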
def test_lowercase(self, openai_files):
    tokenizer = CharBPETokenizer(
        openai_files["vocab"], openai_files["merges"], lowercase=True
    )
    output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)

    assert output.ids == [547, 1362, 544, 2476, 2688]
    assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
    assert output.type_ids == [0, 0, 0, 0, 1]
def test():
    """Test the trained tokenizer"""
    tokenizer = CharBPETokenizer(
        './thyme-tokenizer-vocab.json',
        './thyme-tokenizer-merges.txt')

    vocab = tokenizer.get_vocab()
    print('vocab size:', len(vocab))

    encoded = tokenizer.encode('patient dr. who diagnosed with brain abc')
    encoded.pad(15)

    print('encoded:', encoded.ids)
    print('decoded:', tokenizer.decode(encoded.ids))
    print(encoded.tokens)
    print(encoded.attention_mask)
class SubwordEncoder:
    """Subword tokenization"""

    def __init__(self, path='subword/'):
        """
        Args:
            path: str, a path to the vocab and merges files.
        """
        # Load vocab
        self.subword_tokenizer = CharBPETokenizer(
            vocab_file=path + "/bpe-vocab.json",
            merges_file=path + "/bpe-merges.txt")
        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()

    def get_vocab_size(self):
        return self.subword_tokenizer.get_vocab_size()

    def _encode_subwords(self, sentence, with_eos):
        """
        Args:
            sentence: str, text to be encoded.
            with_eos: bool, end with the <EOS> token.
        Returns:
            tokens: list, encoded sequence.
        """
        tokens = self.subword_tokenizer.encode(sentence).ids
        if with_eos:
            tokens += [2]  # 2 is the id of the <EOS> token
        return tokens

    def _id_to_subword(self):
        id2subword = {}
        for i in range(self.get_vocab_size()):
            id2subword[i] = self.subword_tokenizer.id_to_token(i)
        return id2subword

    def _subword_to_id(self):
        subword2id = {}
        for i in range(self.get_vocab_size()):
            subword2id[self.subword_tokenizer.id_to_token(i)] = i
        return subword2id
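A short usage sketch for SubwordEncoder, assuming bpe-vocab.json and bpe-merges.txt already exist under subword/ from an earlier training run:

encoder = SubwordEncoder(path='subword/')
ids = encoder.encode('a sample sentence', with_eos=True)  # ends with id 2, the <EOS> token
tokens = [encoder.id_to_token[i] for i in ids]  # map ids back to subword strings
print(ids, tokens)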
def to_lstm_inputs(texts, max_len=None):
    """Padded at the beginning rather than at the end"""
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    seqs = [tokenizer.encode(text).ids for text in texts]

    if max_len is None:
        # set max_len to the length of the longest sequence
        max_len = max(len(id_seq) for id_seq in seqs)

    ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, seq in enumerate(seqs):
        if len(seq) > max_len:
            seq = seq[:max_len]
        ids[i, -len(seq):] = torch.tensor(seq)

    return ids
def to_token_id_sequences(texts, max_len=None):
    """Matrix of token ids"""
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    seqs = [tokenizer.encode(text).ids for text in texts]

    if max_len is None:
        # set max_len to the length of the longest sequence
        max_len = max(len(id_seq) for id_seq in seqs)

    ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, seq in enumerate(seqs):
        if len(seq) > max_len:
            seq = seq[:max_len]
        ids[i, :len(seq)] = torch.tensor(seq)

    return ids
def to_transformer_inputs(texts, max_len=None):
    """Matrix of token ids and a square attention mask for each sample"""
    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    seqs = [tokenizer.encode(text).ids for text in texts]

    if max_len is None:
        # set max_len to the length of the longest sequence
        max_len = max(len(id_seq) for id_seq in seqs)

    ids = torch.zeros(len(seqs), max_len, dtype=torch.long)
    mask = torch.zeros(len(seqs), max_len, max_len, dtype=torch.long)
    for i, seq in enumerate(seqs):
        if len(seq) > max_len:
            seq = seq[:max_len]
        ids[i, :len(seq)] = torch.tensor(seq)
        mask[i, :len(seq), :len(seq)] = 1

    return ids, mask
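The three builders above differ only in how they pad and mask. A quick comparison on hypothetical inputs, assuming the thyme tokenizer files referenced above are present:

texts = ['patient diagnosed with brain abc', 'patient']
lstm_ids = to_lstm_inputs(texts)        # zeros on the left, tokens right-aligned
plain_ids = to_token_id_sequences(texts)  # zeros on the right, tokens left-aligned
tr_ids, tr_mask = to_transformer_inputs(texts)  # right-padded ids plus a square
                                                # 0/1 attention mask per sample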
class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(
                text_list,
                vocab_size=vocab_size,
                special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
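A round-trip sketch for the BPETokenizer wrapper above. The corpus file and save paths are illustrative, and PAD, BOS, and EOS are the module's special-token constants:

tkz = BPETokenizer(['corpus.txt'], vocab_size=8000)
tkz.save('./bpe', 'demo')  # writes demo-vocab.json and demo-merges.txt
tkz2 = BPETokenizer.load('./bpe/demo-vocab.json', './bpe/demo-merges.txt')
print(tkz2.decode(tkz2.encode('hello world')))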
from tokenizers import CharBPETokenizer

if __name__ == "__main__":
    # Initialize a tokenizer
    tokenizer = CharBPETokenizer()

    # Then train it!
    tokenizer.train(
        [
            "data\\train.txt",
            "D:/数据/wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw",
            "D:/数据/webtext2019zh/web_text_raw.txt"
        ],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<CLS>', '<SEP>'])

    # Now, let's use it:
    encoded = tokenizer.encode("I can feel the magic, can you?")

    # And finally save it somewhere
    tokenizer.save("./", "bpe.tokenizer.json")
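Assuming the old two-argument save(directory, name) used above, which writes {name}-vocab.json and {name}-merges.txt, the trained tokenizer can later be restored from that pair (paths follow from the call above):

reloaded = CharBPETokenizer(
    './bpe.tokenizer.json-vocab.json',
    './bpe.tokenizer.json-merges.txt',
)
print(reloaded.encode('I can feel the magic, can you?').tokens)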
class EngGerNewstest(Dataset):
    """
    The newstest 2014 dataset used for testing
    """
    def __init__(self, data_folder, rank=0, val_set=False, world_size=1,
                 seed=0, eng_to_ger=True, vocab_size=37000, MASK="<MASK>",
                 START="<START>", STOP="<STOP>", exp_name="",
                 max_context=None, batch_size=128, val_size=30000, **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `newstest2014.en`
            and a `newstest2014.de` file.
        eng_to_ger: bool
            if true, the x values are returned as english ids and the y
            values are german ids. If false, then vice versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "newstest2014.en")
        self.de_path = os.path.join(data_folder, "newstest2014.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0:
            print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0:
            print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0:
            print("Making english idxs")
        self.en_max_len = 0
        self.en_idxs = []
        self.en_lens = []
        with open(self.en_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.en_tokenizer.encode(l)
                    ids = ([self.en_start_idx] + list(output.ids)
                           + [self.en_stop_idx])
                    self.en_idxs.append(ids)
                    self.en_lens.append(len(ids))
                    if len(ids) > self.en_max_len:
                        self.en_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.en_mask_idx for i in range(self.en_max_len)]
        l = 0
        if rank == 0:
            print("Padding english idxs")
        for i in tqdm(range(len(self.en_idxs))):
            diff = self.en_max_len - len(self.en_idxs[i])
            self.en_idxs[i] = self.en_idxs[i] + mask[:diff]

        # Get German sentence lists
        if rank == 0:
            print("Making german idxs")
        self.de_max_len = 0
        self.de_idxs = []
        self.de_lens = []
        with open(self.de_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.de_tokenizer.encode(l)
                    ids = ([self.de_start_idx] + list(output.ids)
                           + [self.de_stop_idx])
                    self.de_idxs.append(ids)
                    self.de_lens.append(len(ids))
                    if len(ids) > self.de_max_len:
                        self.de_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.de_mask_idx for i in range(self.de_max_len)]
        if rank == 0:
            print("Padding german idxs")
        for i in tqdm(range(len(self.de_idxs))):
            diff = self.de_max_len - len(self.de_idxs[i])
            self.de_idxs[i] = self.de_idxs[i] + mask[:diff]

        if rank == 0:
            print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len
            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len
            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    #def __getitem__(self, i, l=None):
    #    if l is None:
    #        l = self.X_lens[int(i)]
    #    idxs = np.zeros(1)
    #    margin = 5
    #    while idxs.sum() < 25 and margin < 400:
    #        min_l = l - margin
    #        max_l = l + margin
    #        idxs = (self.X_lens > min_l) & (self.X_lens < max_l)
    #        margin += 5
    #    max_l = min(np.max(self.X_lens[idxs]), self.max_context)
    #    if max_l < 50: batch_size = self.batch_size
    #    elif max_l < 70: batch_size = self.batch_size//2
    #    elif max_l < 100: batch_size = self.batch_size//4
    #    elif max_l < 120: batch_size = self.batch_size//8
    #    elif max_l < 140: batch_size = self.batch_size//16
    #    elif max_l < 160: batch_size = self.batch_size//32
    #    else: batch_size = self.batch_size//64
    #    batch_size = max(16, batch_size)
    #    perm = np.random.permutation(idxs.sum())[:batch_size]
    #    max_l = np.max(self.X_lens[idxs][perm])
    #    x = np.asarray(self.X[idxs][perm, :max_l])
    #    max_l = np.max(self.Y_lens[idxs][perm])
    #    y = np.asarray(self.Y[idxs][perm, :max_l])
    #    return torch.LongTensor(x), torch.LongTensor(y)

    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.Y[idx])

    def get_largest_batch(self, size_num):
        # NOTE: this relies on the batched two-argument __getitem__ that is
        # commented out above; with the current single-argument __getitem__
        # the extra `l` argument will raise a TypeError.
        l = 10
        if size_num == 1:
            l = 25
        elif size_num == 2:
            l = 400
        elif size_num == 3:
            l = 130
        elif size_num == 4:
            l = 75
        elif size_num == 5:
            l = 44
        elif size_num == 6:
            l = 94
        elif size_num == 7:
            l = 200
        elif size_num == 8:
            l = 300
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        Converts an array of token ids to a sentence.

        idxs: LongTensor (N,)
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        Converts an array of token ids to a sentence.

        idxs: LongTensor (N,)
        """
        return self.Y_tokenizer.decode(idxs)
class EngGerDataset(Dataset):
    """
    Can be english to german or german to english.
    """
    def __init__(self, data_folder, rank=0, val_set=False, world_size=1,
                 seed=0, eng_to_ger=True, vocab_size=37000, MASK="<MASK>",
                 START="<START>", STOP="<STOP>", exp_name="",
                 max_context=None, batch_size=128, val_size=30000, **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `train.en` and a
            `train.de` file.
        eng_to_ger: bool
            if true, the x values are returned as english ids and the y
            values are german ids. If false, then vice versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "train.en")
        self.de_path = os.path.join(data_folder, "train.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0:
            print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0:
            print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0:
            print("Making english idxs")
        if os.path.exists(self.en_arr_path):
            if rank == 0:
                print("loading from bcolz", self.en_arr_path)
            self.en_idxs = bcolz.carray(rootdir=self.en_arr_path)
            self.en_lens = bcolz.carray(rootdir=self.en_lens_path)
            self.en_max_len = self.en_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.en_idxs = self.en_idxs[:1000]
                self.en_lens = self.en_lens[:1000]
            if self.world_size > 1:
                with temp_seed(seed - rank):
                    sample_perm = np.random.permutation(len(self.en_idxs))
                if not self.val_set:
                    n_samps = (len(self.en_idxs) - self.val_size)
                    n_samps = n_samps // self.world_size
                    indices = sample_perm[rank * n_samps:(rank + 1) * n_samps]
                else:
                    indices = sample_perm[-self.val_size:]
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                    starttime = time.time()
                    self.en_idxs = self.en_idxs[indices]
                    self.en_lens = self.en_lens[indices]
                    if rank == 0:
                        print("duration:", time.time() - starttime)
                except Exception:
                    temp_idxs = []
                    temp_lens = []
                    if rank == 0:
                        print("Collecting data")
                        rnge = tqdm(indices)
                    else:
                        rnge = indices
                    for i in rnge:
                        temp_idxs.append(self.en_idxs[i])
                        temp_lens.append(self.en_lens[i])
                    self.en_idxs = np.asarray(temp_idxs)
                    self.en_lens = np.asarray(temp_lens)
                    if rank == 0:
                        print("duration:", time.time() - starttime)
        elif world_size == 1:
            self.en_max_len = 0
            self.en_idxs = []
            self.en_lens = []
            with open(self.en_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.en_tokenizer.encode(l)
                        ids = ([self.en_start_idx] + list(output.ids)
                               + [self.en_stop_idx])
                        self.en_idxs.append(ids)
                        self.en_lens.append(len(ids))
                        if len(ids) > self.en_max_len:
                            self.en_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.en_mask_idx for i in range(self.en_max_len)]
            l = 0
            if rank == 0:
                print("Padding english idxs")
            for i in tqdm(range(len(self.en_idxs))):
                diff = self.en_max_len - len(self.en_idxs[i])
                self.en_idxs[i] = self.en_idxs[i] + mask[:diff]
            if rank == 0:
                print("Saving to bcolz")
            self.en_idxs = bcolz.carray(self.en_idxs,
                                        rootdir=self.en_arr_path,
                                        dtype="int32")
            self.en_idxs.flush()
            self.en_lens = bcolz.carray(self.en_lens,
                                        rootdir=self.en_lens_path,
                                        dtype="int32")
            self.en_lens.flush()
        else:
            print("Make dataset without using multi-processing!!")
            assert False
        if self.en_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.en_max_len,
                      "to", self.max_context)
            self.en_max_len = self.max_context

        # Get German sentence lists
        if rank == 0:
            print("Making german idxs")
        if os.path.exists(self.de_arr_path):
            if rank == 0:
                print("loading from bcolz", self.de_arr_path)
            self.de_idxs = bcolz.carray(rootdir=self.de_arr_path)
            self.de_lens = bcolz.carray(rootdir=self.de_lens_path)
            self.de_max_len = self.de_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.de_idxs = self.de_idxs[:1000]
                self.de_lens = self.de_lens[:1000]
            if self.world_size > 1:
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                    starttime = time.time()
                    self.de_idxs = self.de_idxs[indices]
                    self.de_lens = self.de_lens[indices]
                    if rank == 0:
                        print("duration:", time.time() - starttime)
                except Exception:
                    temp_idxs = []
                    temp_lens = []
                    try:
                        if rank == 0:
                            print("Collecting data")
                        for i in rnge:
                            temp_idxs.append(self.de_idxs[i])
                            temp_lens.append(self.de_lens[i])
                    except Exception as e:
                        print("Likely error caused by bcolz existing "
                              "for en but not de data")
                        print(e)
                        assert False
                    self.de_idxs = np.asarray(temp_idxs)
                    self.de_lens = np.asarray(temp_lens)
                    if rank == 0:
                        print("duration:", time.time() - starttime)
        else:
            self.de_max_len = 0
            self.de_idxs = []
            self.de_lens = []
            with open(self.de_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.de_tokenizer.encode(l)
                        ids = ([self.de_start_idx] + list(output.ids)
                               + [self.de_stop_idx])
                        self.de_idxs.append(ids)
                        self.de_lens.append(len(ids))
                        if len(ids) > self.de_max_len:
                            self.de_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.de_mask_idx for i in range(self.de_max_len)]
            if rank == 0:
                print("Padding german idxs")
            for i in tqdm(range(len(self.de_idxs))):
                diff = self.de_max_len - len(self.de_idxs[i])
                self.de_idxs[i] = self.de_idxs[i] + mask[:diff]
            if rank == 0:
                print("Saving to bcolz")
            self.de_idxs = bcolz.carray(self.de_idxs,
                                        rootdir=self.de_arr_path,
                                        dtype="int32")
            self.de_idxs.flush()
            self.de_lens = bcolz.carray(self.de_lens,
                                        rootdir=self.de_lens_path,
                                        dtype="int32")
            self.de_lens.flush()
        if self.de_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.de_max_len,
                      "to", self.max_context)
            self.de_max_len = self.max_context

        if rank == 0:
            print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len
            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len
            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    def __getitem__(self, i, l=None):
        if l is None:
            l = self.X_lens[int(i)]
        idxs = np.zeros(1)
        margin = 5
        while idxs.sum() < 25 and margin < 400:
            min_l = l - margin
            max_l = l + margin
            idxs = (self.X_lens > min_l) & (self.X_lens < max_l)
            margin += 5
        max_l = min(np.max(self.X_lens[idxs]), self.max_context)
        if max_l < 50:
            batch_size = self.batch_size
        elif max_l < 70:
            batch_size = self.batch_size // 2
        elif max_l < 100:
            batch_size = self.batch_size // 4
        elif max_l < 120:
            batch_size = self.batch_size // 8
        elif max_l < 140:
            batch_size = self.batch_size // 16
        elif max_l < 160:
            batch_size = self.batch_size // 32
        else:
            batch_size = self.batch_size // 64
        batch_size = max(16, batch_size)
        perm = np.random.permutation(idxs.sum())[:batch_size]
        max_l = np.max(self.X_lens[idxs][perm])
        x = np.asarray(self.X[idxs][perm, :max_l])
        max_l = np.max(self.Y_lens[idxs][perm])
        y = np.asarray(self.Y[idxs][perm, :max_l])
        return torch.LongTensor(x), torch.LongTensor(y)

    def get_largest_batch(self, size_num):
        l = 10
        if size_num == 1:
            l = 25
        elif size_num == 2:
            l = 400
        elif size_num == 3:
            l = 130
        elif size_num == 4:
            l = 75
        elif size_num == 5:
            l = 44
        elif size_num == 6:
            l = 94
        elif size_num == 7:
            l = 200
        elif size_num == 8:
            l = 300
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        Converts an array of token ids to a sentence.

        idxs: LongTensor (N,)
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        Converts an array of token ids to a sentence.

        idxs: LongTensor (N,)
        """
        return self.Y_tokenizer.decode(idxs)
print(tokenizer_code.get_vocab())
print(tokenizer_doc.get_vocab())

# use the trained tokenizer_code to encode and write to output_file
file_dir = "data/ncs_preprocessed_data/train-CoDesc/"
src_file_name = "code.original_subtoken"
tgt_file_name = "code.bpe"
with open(file_dir + "/" + tgt_file_name, "w") as output_file, \
        open(file_dir + "/" + src_file_name, 'r') as file:
    for line in file:
        output = tokenizer_code.encode(line)
        line = ' '.join(output.tokens).replace("</w>", "")
        output_file.write(line)
        output_file.write("\n")

# use the trained tokenizer_doc to encode and write to output_file
file_dir = "data/ncs_preprocessed_data/train-CoDesc/"
src_file_name = "javadoc.original"
tgt_file_name = "javadoc.bpe"
with open(file_dir + "/" + tgt_file_name, "w") as output_file, \
        open(file_dir + "/" + src_file_name, 'r') as file:
    for line in file:
        output = tokenizer_doc.encode(line)
        line = ' '.join(output.tokens).replace("</w>", "")
        output_file.write(line)
        output_file.write("\n")
def torchtext_iterators(args):
    """Builds torchtext iterators from the files."""
    logger = logging.getLogger('logger')
    logger.info('Starting to load data and create iterators.')

    # Tokenizer.
    if args['model_name'] == 'roberta':
        tokenizer = lambda x: [x]
    elif args['subword']:
        tokenizer = 'subword'
    elif args['bpe']:
        bpe_tokenizer = CharBPETokenizer('log/bpe-trained-vocab.json',
                                         'log/bpe-trained-merges.txt')
        tokenizer = lambda x: bpe_tokenizer.encode(x).tokens
    else:
        tokenizer = None

    # `sequential=False` means the label is not tokenized.
    label = data.Field(batch_first=True, sequential=False)
    text = data.Field(batch_first=True, lower=True, tokenize=tokenizer)
    fields = [('text', text), ('label', label)]

    train = data.TabularDataset(args['train_path'], 'tsv', fields,
                                skip_header=True)
    valid = data.TabularDataset(args['valid_path'], 'tsv', fields,
                                skip_header=True)
    test = data.TabularDataset(args['test_path'], 'tsv', [('text', text)],
                               skip_header=True)

    text.build_vocab(train, min_freq=args['min_freq'])
    label.build_vocab(train)

    train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train, valid, test), batch_size=args['batch_size'], repeat=False,
        device=torch.device(args['device']), sort=False,
        sort_within_batch=False)

    if not args['no_pretrained_vectors']:
        if not args['load_vectors_manually']:
            logger.info('Starting to load vectors from Glove.')
            text.vocab.load_vectors(vectors=GloVe(name='6B'))
        else:
            logger.info('Starting to manually load vectors from FastText.')
            vector_map, stoi = load_vectors(args['fasttext_path'], text.vocab,
                                            torch.device(args['device']))
            average_embed = get_average_embedding(vector_map)
            text.vocab.set_vectors(stoi, vector_map, 300,
                                   unk_init=lambda x: average_embed.clone())
            text.vocab.vectors[
                text.vocab.stoi['<unk>']] = average_embed.clone()

    logger.info('Built train vocabulary of {} words'.format(len(text.vocab)))
    return train_iter, valid_iter, test_iter, text, label
tokenizer = CharBPETokenizer()

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    suffix='</w>',
    limit_alphabet=args.limit_alphabet,
)

# Save the files
tokenizer.save(args.out, args.name)

# Restore the model from the learned vocab/merges
tokenizer = CharBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
)

# Test encoding
logger.info(
    'Tokens and their ids from CharBPETokenizer with GFP protein sequence: \n'
    ' MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(encoded.tokens)
logger.info(encoded.ids)
logger.info('done!')
    special_tokens=[
        "<blank>",
        "<bos>",
        "<unk>",
    ],
)

# os.makedirs('./BPE-1000', exist_ok=True)
tokenizer.save('./BPE-1000', '')
tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                             './BPE-1000/-merges.txt')

# with open('.test.pkl', 'w') as f:
#     pickle.dump(tokenizer, f)

tokenizer = HuggingFaceTokenizer()
print(
    tokenizer.encode(
        'might have a solution it might take a long time nobody'))
print(
    tokenizer.decode(
        tokenizer.encode(
            'might have a solution it might take a long time nobody'),
    ))

# transforms = torchaudio.transforms.MFCC(n_mfcc=40)
# concat = ConcatFeature()
# waveform = transforms(data)
# print(waveform.shape)
# waveform = concat(waveform)
# print(waveform[:, -1])
def test_decoding(self, openai_files):
    tokenizer = CharBPETokenizer(
        openai_files["vocab"], openai_files["merges"], lowercase=True
    )
    decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
    assert decoded == "my name is john"
# coding: utf-8
from tokenizers import CharBPETokenizer

# Initialize a tokenizer
merges = "./saved_tokenizer/wiki_sunyang/merges.txt"
vocab = "./saved_tokenizer/wiki_sunyang/vocab.json"
tokenizer = CharBPETokenizer(vocab, merges)

# And then encode:
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal"
    " in swimming."
)
print(encoded.ids)
print(encoded.tokens)
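Decoding the ids should recover the input text, since CharBPE's decoder rejoins subwords at the </w> word boundaries; a quick check:

print(tokenizer.decode(encoded.ids))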