Example No. 1
import os

from tokenizers import CharBPETokenizer


class SubwordEncoder:
    """Subword tokenization wrapper around a pre-trained CharBPETokenizer."""

    def __init__(self, path='subword/'):
        """
        Args:
            path: str, directory containing the BPE vocab and merges files.
        """

        # Load the trained vocab and merge rules
        self.subword_tokenizer = CharBPETokenizer(
            vocab_file=os.path.join(path, "bpe-vocab.json"),
            merges_file=os.path.join(path, "bpe-merges.txt"),
        )

        # Public alias and id <-> token lookup tables
        self.encode = self._encode_subwords
        self.id_to_token = self._id_to_subword()
        self.token_to_id = self._subword_to_id()

    def get_vocab_size(self):
        return self.subword_tokenizer.get_vocab_size()   

    def _encode_subwords(self, sentence, with_eos):
        """
        Args:
            sentence: str, text to be encoded.
            with_eos: bool, whether to append the <EOS> token id.
        Returns:
            tokens: list, encoded id sequence.
        """
        tokens = self.subword_tokenizer.encode(sentence).ids
        if with_eos:
            tokens.append(2)  # 2 is the id of the <EOS> token in this vocab
        return tokens

    def _id_to_subword(self):
        # Build the id -> subword lookup table for the whole vocab.
        return {i: self.subword_tokenizer.id_to_token(i)
                for i in range(self.get_vocab_size())}

    def _subword_to_id(self):
        # Build the inverse subword -> id lookup table.
        return {self.subword_tokenizer.id_to_token(i): i
                for i in range(self.get_vocab_size())}
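
A minimal usage sketch, assuming a subword/ directory that holds the bpe-vocab.json and bpe-merges.txt files the constructor expects (the sample sentence is illustrative only):

encoder = SubwordEncoder(path='subword/')
ids = encoder.encode('machine translation', with_eos=True)  # trailing id 2 = <EOS>
print(encoder.get_vocab_size())
print([encoder.id_to_token[i] for i in ids])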
Example No. 2
from tokenizers import CharBPETokenizer

# Special-token strings; the real values are defined elsewhere in the
# original project, so the ones below are assumed placeholders.
PAD, BOS, EOS = "<pad>", "<bos>", "<eos>"


class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            # tokenizers' train() expects a list of training-file paths
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list,
                                 vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            # Deferred setup; load() fills in the tokenizer later
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        return self.tokenizer.encode(text).ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        # Note: this two-argument save() matches older tokenizers releases;
        # newer versions expect save(path) or save_model(directory) instead.
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        # Rebuild the tokenizer from saved vocab/merges files without retraining
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
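
A minimal round-trip sketch, assuming a plain-text training file corpus.txt (a hypothetical path) and the placeholder PAD/BOS/EOS strings defined above:

bpe = BPETokenizer(['corpus.txt'], vocab_size=8000)
ids = bpe.encode('hello world')
print(bpe.ids_to_tokens(ids))
print(bpe.decode(ids))  # recovers the input text, modulo normalization
print(len(bpe))         # trained vocab size, including special tokens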