def _tokenize(self, text, never_split=None, **kwargs):
    if self.do_preprocessing:
        if self.do_lower_case:
            text = text.lower()
        text = str(" ".join(text_processor.pre_process_doc(text)))
        # Keep letters, a few punctuation marks, hearts and emoji; replace everything else with spaces.
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        # Collapse whitespace, squeeze characters repeated 3+ times down to 2,
        # and strip leading/trailing whitespace.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)

    split_tokens = [text]
    if self.do_wordpiece_tokenize:
        wordpiece_tokenizer = WordpieceTokenizer(self.vocab, self.unk_token)
        split_tokens = wordpiece_tokenizer.tokenize(text)
    elif self.do_char_tokenize:
        tokenizer = CharacterTokenizer(self.vocab, self.unk_token)
        split_tokens = tokenizer.tokenize(text)
    elif self.do_basic_tokenize:
        # Plain BERT tokenization.
        split_tokens = self.base_bert_tok.tokenize(text)
    return split_tokens
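# A minimal standalone sketch of the regex clean-up applied above, so the effect of
# each pattern is visible without the class / text_processor wiring (the sample
# string is an illustration, not from the source; strip() stands in for the two
# edge-whitespace substitutions).
import re

sample = "Sooooo goood!!! I <3 this"
sample = sample.lower()
sample = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', sample)  # drops the digit '3'
sample = re.sub(r'\s+', ' ', sample)             # collapse runs of whitespace
sample = re.sub(r'(\w)\1{2,}', r'\1\1', sample)  # "sooooo" -> "soo", "goood" -> "good"
sample = sample.strip()
print(sample)  # -> 'soo good!!! i < this'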
class MecabBertTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text: MeCab tokenization + WordPiece."""

    def __init__(self, vocab_file, do_lower_case=False,
                 do_basic_tokenize=True, do_wordpiece_tokenize=True,
                 mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]',
                 **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with MeCab before wordpiece.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)
        # Take special tokens into account when computing the usable sequence lengths.
        self.max_len_single_sentence = self.max_len - 2
        self.max_len_sentences_pair = self.max_len - 3

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = MecabBasicTokenizer(
                do_lower_case=do_lower_case, mecab_dict_path=mecab_dict_path)
        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)

    def _tokenize(self, text):
        if self.do_basic_tokenize:
            tokens = self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_wordpiece_tokenize:
            split_tokens = [sub_token for token in tokens
                            for sub_token in self.wordpiece_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens
        return split_tokens
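# A minimal usage sketch for MecabBertTokenizer, assuming a Japanese wordpiece vocab
# file and an installed MeCab dictionary; 'vocab.txt' and the sample sentence are
# placeholders, not values from the source.
tokenizer = MecabBertTokenizer(
    vocab_file="vocab.txt",      # one wordpiece per line
    mecab_dict_path=None,        # None falls back to the default MeCab dictionary
    do_lower_case=False)
print(tokenizer.tokenize("日本語のテキストを解析する"))
# MeCab first segments the sentence into words, then each word is split into
# wordpieces; the exact pieces depend on the vocabulary file.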
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer, self).__init__(
            name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            # Lazily load the wordpiece vocab the first time it is needed.
            vocab_path = self.data_handler.read(
                self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []
        if unit == "word":
            # `text` is already a single word; split it into wordpieces directly.
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)
        return tokens
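# A minimal sketch of the word -> sub-word split performed by _wordpiece, using a tiny
# in-memory vocab instead of DataHandler/CachePath; the toy vocab, the lowercasing,
# and the import path (transformers >= 4.x) are assumptions.
import collections
from transformers.models.bert.tokenization_bert import WordpieceTokenizer

toy_vocab = collections.OrderedDict(
    (tok, i) for i, tok in enumerate(["[UNK]", "he", "##llo", "wo", "##rld"]))
wp = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")

words = "Hello World".lower().split()  # stand-in for word_tokenizer.tokenize
print([piece for word in words for piece in wp.tokenize(word)])
# -> ['he', '##llo', 'wo', '##rld']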
def make_alignment(tokenizer: transformers.WordpieceTokenizer,
                   tokens: List[str]) -> Tuple[List[str], List[List[int]]]:
    """
    Make the alignment between tokens and the subtokens.

    It is useful to interpret results or to understand the model reasoning.
    """
    i = 0
    sub_tokens = []
    alignment = []

    for token in tokens:
        indices = []
        word_pieces = tokenizer.tokenize(token)
        for sub_token in word_pieces:
            indices.append(i)
            sub_tokens.append(sub_token)
            i += 1
        alignment.append(indices)

    return sub_tokens, alignment
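# A minimal usage sketch for make_alignment; the model name is an assumption and the
# call downloads the bert-base-uncased vocab. The slow transformers BertTokenizer
# exposes its WordpieceTokenizer as the .wordpiece_tokenizer attribute.
from transformers import BertTokenizer

bert = BertTokenizer.from_pretrained("bert-base-uncased")
sub_tokens, alignment = make_alignment(bert.wordpiece_tokenizer,
                                       ["unaffable", "weather"])
print(sub_tokens)  # e.g. ['un', '##aff', '##able', 'weather'] (exact pieces depend on the vocab)
print(alignment)   # e.g. [[0, 1, 2], [3]] -> sub-token positions belonging to each input token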
class WordPieceVocab(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece."""

    def __init__(self, vocab_path, do_lower_case=True, max_len=None, freq_path=None):
        """Constructs a WordPieceVocab.

        Args:
            vocab_path: Path to a JSON vocabulary file mapping tokens to indices.
            max_len: An artificial maximum length to truncate tokenized sequences to;
                the effective maximum length is always the minimum of this value
                (if specified) and the underlying BERT model's sequence length.
        """
        self.token_to_idx = json.load(open(vocab_path, 'r'),
                                      object_pairs_hook=OrderedDict)
        self.idx_to_token = OrderedDict(
            [(idx, tok) for tok, idx in self.token_to_idx.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx)
        self.max_len = max_len if max_len is not None else int(1e12)

        if freq_path is not None:
            self.token_to_freq = json.load(open(freq_path, 'r'),
                                           object_pairs_hook=OrderedDict)

    def tokenize(self, text):
        split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def detokenize(self, tokens):
        text = ' '.join(tokens)
        return text.replace(' ##', '')

    def to_input_tensor(self, sents: List[List[str]], device) -> torch.Tensor:
        """ Convert a list of token lists into a tensor with necessary padding for shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU
        @returns sents_var, mask_var: tensors of shape (batch_size, max_sentence_length)
        """
        sents = [self.convert_tokens_to_idx(sent) for sent in sents]
        sents, mask = self.pad_sentences(sents)
        sents_var = torch.tensor(sents, dtype=torch.long, device=device)
        mask_var = torch.tensor(mask, dtype=torch.long, device=device)
        return sents_var, mask_var

    def from_output_tensor(self, batch_output):
        """ Places batch output on cpu and converts it to tokens, ignoring -1's and padding.

        args:
            batch_output (tensor): (batch_size, max_len)
        """
        place_on_cpu(batch_output)
        sents = []
        for output in batch_output:
            sent = []
            for idx in output:
                idx = idx.item()
                if idx == -1:
                    continue
                token = self.idx_to_token[idx]
                if token == "[PAD]":
                    continue
                sent.append(token)
            sents.append(sent)
        return sents

    def pad_sentences(self, sents):
        """
        args:
            sents (list(list(int))): sentences already converted to token ids
        """
        sents_padded = []
        max_len = max(map(len, sents))
        for sent in sents:
            sents_padded.append(
                sent[:] + [self.token_to_idx['[PAD]']] * (max_len - len(sent)))
        mask = [[int(token != self.token_to_idx['[PAD]']) for token in sent]
                for sent in sents_padded]
        return sents_padded, mask

    def wrap_sentence(self, sent):
        """ Wrap a sentence with start and stop tokens.

        args:
            sent (list[str])
        """
        sent = ['[CLS]'] + sent + ['[SEP]']
        return sent

    def unwrap_sentence(self, tokens):
        new_tokens = [token for token in tokens
                      if token != '[CLS]' and token != '[SEP]']
        return new_tokens

    def convert_tokens_to_idx(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.token_to_idx[token])
        if len(ids) > self.max_len:
            logging.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_idxs_to_token(self, ids):
        """Converts a sequence of ids into wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.idx_to_token[i])
        return tokens

    def get_tokens_in_range(self, tokens, text, start, end):
        """ Get all of the tokens in the range (start, end) of the original string. """
        token_idxs = []
        find_start = 0
        for idx, token in enumerate(tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue
            if token.startswith("##"):
                # Remove the wordpiece continuation marker before searching the text.
                token = token[2:]
            token_start = text.find(token, find_start)
            token_end = token_start + len(token)
            find_start = token_end

            if ((token_start >= start and token_start < end)
                    or (token_end >= start and token_end < end)):
                token_idxs.append(idx)
        return token_idxs

    def __len__(self):
        """ Compute the number of tokens in the vocab.

        @returns len (int): number of tokens in the vocab
        """
        return len(self.token_to_idx)