class SubwordEncoder: "Subword tokenization" def __init__(self, path='subword/'): """ Args: path: str, a path to vocab file. """ # Load vocab self.subword_tokenizer = CharBPETokenizer(vocab_file=path+"/bpe-vocab.json", merges_file=path+"/bpe-merges.txt") self.encode = self._encode_subwords self.id_to_token = self._id_to_subword() self.token_to_id = self._subword_to_id() def get_vocab_size(self): return self.subword_tokenizer.get_vocab_size() def _encode_subwords(self, sentence, with_eos): """ Args: sentence: str, texts to be encoded. with_eos: end with <EOS> token. Returns: tokens: list, encoded sequence. """ tokens = self.subword_tokenizer.encode(sentence).ids if with_eos: tokens += [2] # 2 is the id of <EOS> token return tokens def _id_to_subword(self): id2subword = {} for i in range(self.get_vocab_size()): id2subword[i] = self.subword_tokenizer.id_to_token(i) return id2subword def _subword_to_id(self): subword2id = {} for i in range(self.get_vocab_size()): subword2id[self.subword_tokenizer.id_to_token(i)] = i return subword2id
class BPETokenizer:
    """Thin wrapper around ``CharBPETokenizer`` for training and (de)coding.

    Can be trained from text files or restored from saved vocab/merges
    files via :meth:`load`.
    """

    def __init__(self, text_list, vocab_size, lazy=False):
        """
        Args:
            text_list: training files passed to ``CharBPETokenizer.train``
                (ignored when ``lazy`` is True).
            vocab_size: target vocabulary size for training.
            lazy: when True, skip training; :meth:`load` attaches a
                tokenizer afterwards.
        """
        if lazy:
            # Deferred construction: ``load`` fills this in later.
            self.tokenizer = None
            return
        self.tokenizer = CharBPETokenizer()
        self.tokenizer.train(text_list, vocab_size=vocab_size,
                             special_tokens=[PAD, BOS, EOS, "<unk>"])
        self.tokenizer.add_special_tokens([PAD, BOS, EOS])

    def tokens_to_ids(self, tokens):
        """Map each token string to its vocabulary id."""
        return list(map(self.tokenizer.token_to_id, tokens))

    def ids_to_tokens(self, ids):
        """Map each vocabulary id back to its token string."""
        return list(map(self.tokenizer.id_to_token, ids))

    def encode(self, text):
        """Encode ``text`` and return the list of token ids."""
        return self.tokenizer.encode(text).ids

    def decode(self, ids, skip_special=True):
        """Decode ids back to text, optionally dropping special tokens."""
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        """Persist the tokenizer's vocab/merges under ``path``."""
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        """Restore a tokenizer from saved ``vocab`` and ``merges`` files."""
        inst = cls(None, None, lazy=True)
        inst.tokenizer = CharBPETokenizer(vocab, merges)
        inst.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return inst

    def __len__(self):
        """Vocabulary size of the underlying tokenizer."""
        return self.tokenizer.get_vocab_size()