Example #1
    def _tokenize(self, text, never_split=None, **kwargs):
        if self.do_preprocessing:
            if self.do_lower_case:
                text = text.lower()
            # Run the external text_processor's document preprocessing.
            text = str(" ".join(text_processor.pre_process_doc(text)))
            # Keep only Latin letters (including accented ones), the tag and
            # emoticon characters </>!?♥♡, whitespace, and emoji code points.
            text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ',
                          text)
            # Collapse runs of whitespace into a single space.
            text = re.sub(r'\s+', ' ', text)
            # Squeeze characters repeated three or more times down to two.
            text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
            # Strip leading and trailing whitespace.
            text = re.sub(r'^\s', '', text)
            text = re.sub(r'\s$', '', text)

        # Default to the raw text when no tokenizer branch is enabled.
        split_tokens = [text]
        if self.do_wordpiece_tokenize:
            wordpiece_tokenizer = WordpieceTokenizer(self.vocab,
                                                     self.unk_token)
            split_tokens = wordpiece_tokenizer.tokenize(text)

        elif self.do_char_tokenize:
            tokenizer = CharacterTokenizer(self.vocab, self.unk_token)
            split_tokens = tokenizer.tokenize(text)

        elif self.do_basic_tokenize:
            # Fall back to the underlying BERT tokenizer.
            split_tokens = self.base_bert_tok.tokenize(text)

        return split_tokens
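For reference, the WordpieceTokenizer used in the first branch above greedily splits each whitespace-separated word into vocabulary subwords. A minimal standalone sketch, assuming a recent transformers release where the helpers live under transformers.models.bert.tokenization_bert; the vocabulary path and the splits shown in the comments are assumptions, since they depend on the vocabulary:

from transformers.models.bert.tokenization_bert import (WordpieceTokenizer,
                                                        load_vocab)

# "vocab.txt" is an assumed one-wordpiece-per-line vocabulary file.
vocab = load_vocab("vocab.txt")
wp = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

# A word missing from the vocabulary is split into known pieces, e.g.
# "unaffordable" -> ["una", "##ffo", "##rda", "##ble"] (vocabulary-dependent);
# a word with no usable pieces comes back as ["[UNK]"].
print(wp.tokenize("unaffordable"))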
Example #2
class MecabBertTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text; MeCab tokenization + WordPiece"""

    def __init__(self, vocab_file, do_lower_case=False,
                 do_basic_tokenize=True, do_wordpiece_tokenize=True,
                 mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with MeCab before wordpiece.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                       mecab_dict_path=mecab_dict_path)

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                          unk_token=self.unk_token)

    def _tokenize(self, text):
        if self.do_basic_tokenize:
            tokens = self.basic_tokenizer.tokenize(text,
                                                   never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_wordpiece_tokenize:
            split_tokens = [sub_token for token in tokens
                            for sub_token in self.wordpiece_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens
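A short usage sketch for MecabBertTokenizer, assuming a Japanese WordPiece vocabulary file, a working MeCab install with the MecabBasicTokenizer from the surrounding module, and the older transformers API this example is written against; the file path and the sample sentence are assumptions:

# Illustrative only: "vocab.txt" and the sentence are placeholders, and the
# exact pieces depend on the vocabulary and the MeCab dictionary in use.
tokenizer = MecabBertTokenizer(vocab_file="vocab.txt", mecab_dict_path=None)

text = "今日はいい天気です。"
tokens = tokenizer.tokenize(text)              # MeCab words -> WordPiece pieces
ids = tokenizer.convert_tokens_to_ids(tokens)  # ids for the model input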
Example #3
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config=None):
        super(SubwordTokenizer, self).__init__(name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config if config is not None else {}
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
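A rough wiring sketch for the class above; everything except SubwordTokenizer itself and the "vocab_path"/"unk_token" config keys is an assumption, since the word-level tokenizer and the DataHandler/CachePath plumbing come from the surrounding project:

# Hypothetical wiring: `word_tok` stands in for a project word tokenizer
# exposing tokenize() and a cache_name attribute, as SubwordTokenizer expects.
word_tok = SomeWordTokenizer()                      # assumed helper class
subword_tok = SubwordTokenizer(
    "wordpiece", word_tok,
    config={"vocab_path": "vocab.txt", "unk_token": "[UNK]"})

# "Hello World" -> ["Hello", "World"] -> e.g. ["He", "##llo", "Wo", "##rld"]
pieces = subword_tok._wordpiece("Hello World", unit="text")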
Example #4
def make_alignment(tokenizer: transformers.WordpieceTokenizer,
                   tokens: List[str]) -> Tuple[List[str], List[List[int]]]:
    """ Build the alignment between tokens and their subtokens. It is
    useful for interpreting results or understanding the model's reasoning. """
    i = 0
    sub_tokens = []
    alignment = []
    for token in tokens:

        indices = []
        word_pieces = tokenizer.tokenize(token)
        for sub_token in word_pieces:
            indices.append(i)
            sub_tokens.append(sub_token)
            i += 1

        alignment.append(indices)
    return sub_tokens, alignment
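To make the alignment concrete, a small usage sketch, assuming a recent transformers release and a WordPiece vocabulary file; the path and the illustrated splits are assumptions:

from transformers.models.bert.tokenization_bert import (WordpieceTokenizer,
                                                        load_vocab)

# "vocab.txt" is an assumed one-wordpiece-per-line vocabulary file.
wp = WordpieceTokenizer(vocab=load_vocab("vocab.txt"), unk_token="[UNK]")

tokens = ["unaffordable", "housing"]
sub_tokens, alignment = make_alignment(wp, tokens)
# With a typical BERT vocabulary this might give, for example:
#   sub_tokens -> ["una", "##ffo", "##rda", "##ble", "housing"]
#   alignment  -> [[0, 1, 2, 3], [4]]   # subtoken indices per original token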
Example #5
class WordPieceVocab(object):
    """Wraps a WordPiece vocabulary: runs WordPiece tokenization and token/id conversion."""
    def __init__(self,
                 vocab_path,
                 do_lower_case=True,
                 max_len=None,
                 freq_path=None):
        """Constructs a WordPieceVocab.

        Args:
          vocab_path: Path to a JSON file mapping wordpiece tokens to indices.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                   the effective maximum length is always the minimum of this
                   value (if specified) and the underlying BERT model's
                   sequence length.
          freq_path: Optional path to a JSON file mapping tokens to frequencies.
        """
        with open(vocab_path, 'r') as f:
            self.token_to_idx = json.load(f, object_pairs_hook=OrderedDict)
        self.idx_to_token = OrderedDict([
            (idx, tok) for tok, idx in self.token_to_idx.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx,
                                                      unk_token='[UNK]')
        self.max_len = max_len if max_len is not None else int(1e12)

        if freq_path is not None:
            with open(freq_path, 'r') as f:
                self.token_to_freq = json.load(f, object_pairs_hook=OrderedDict)

    def tokenize(self, text):
        split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def detokenize(self, tokens):
        text = ' '.join(tokens)
        return text.replace(' ##', '')

    def to_input_tensor(self, sents: List[List[str]],
                        device) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Convert lists of tokens into a tensor, with the necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (token lists)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns (sents_var, mask_var): tensors of shape (batch_size, max_sentence_length)
        """
        sents = [self.convert_tokens_to_idx(sent) for sent in sents]
        sents, mask = self.pad_sentences(sents)
        sents_var = torch.tensor(sents, dtype=torch.long, device=device)
        mask_var = torch.tensor(mask, dtype=torch.long, device=device)
        return sents_var, mask_var

    def from_output_tensor(self, batch_output):
        """ Places batch output on cpu and converts it to tokens ignoring -1's and padding.
        args:
            batch_output    (tensor)   (batch_size, max_len)
        """
        place_on_cpu(batch_output)
        sents = []
        for output in batch_output:
            sent = []
            for idx in output:
                idx = idx.item()
                if idx == -1:
                    continue

                token = self.idx_to_token[idx]

                if token == "[PAD]":
                    continue

                sent.append(token)
            sents.append(sent)
        return sents

    def pad_sentences(self, sents):
        """
        Pad each id sequence to the batch maximum length and build a padding mask.
        args:
            sents   (list(list(int))): sentences already converted to token ids
        """
        sents_padded = []
        mask_padded = []

        max_len = max(map(len, sents))
        for sent in sents:
            sents_padded.append(sent[:] + [self.token_to_idx['[PAD]']] *
                                (max_len - len(sent)))

        mask = [[int(token != self.token_to_idx['[PAD]']) for token in sent]
                for sent in sents_padded]

        return sents_padded, mask

    def wrap_sentence(self, sent):
        """ Wrap sentences with start and stop tokens.
        args:
            sent (list[str]])
        """
        sent = ['[CLS]'] + sent + ['[SEP]']

        return sent

    def unwrap_sentence(self, tokens):
        new_tokens = [
            token for token in tokens if token != '[CLS]' and token != '[SEP]'
        ]
        return new_tokens

    def convert_tokens_to_idx(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.token_to_idx[token])
        if len(ids) > self.max_len:
            logging.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_idxs_to_token(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.idx_to_token[i])
        return tokens

    def get_tokens_in_range(self, tokens, text, start, end):
        """
        Get the indices of all tokens that fall within the range (start, end) of the original string.
        """
        token_idxs = []
        find_start = 0

        for idx, token in enumerate(tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue

            if token.startswith("##"):
                # strip the WordPiece '##' continuation prefix
                token = token[2:]

            token_start = text.find(token, find_start)
            token_end = token_start + len(token)
            find_start = token_end

            if ((token_start >= start and token_start < end)
                    or (token_end >= start and token_end < end)):
                token_idxs.append(idx)
        return token_idxs

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.token_to_idx)
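Finally, a brief usage sketch for WordPieceVocab, assuming a JSON vocabulary that contains the [PAD], [CLS], [SEP], and [UNK] entries as well as the sample words; the file name and the sentence are assumptions, and place_on_cpu in from_output_tensor comes from the surrounding project:

import torch

# "vocab.json" is an assumed path to a {token: index} JSON file.
vocab = WordPieceVocab("vocab.json")

sent = vocab.wrap_sentence(vocab.tokenize("the cat sat"))
# e.g. ['[CLS]', 'the', 'cat', 'sat', '[SEP]'] (splits depend on the vocabulary)

ids, mask = vocab.to_input_tensor([sent], device="cpu")
print(ids.shape, mask.shape)    # both (1, len(sent))

print(vocab.detokenize(vocab.unwrap_sentence(sent)))    # "the cat sat"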