def test_basic_tokenizer_no_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=False)
    self.assertListEqual(
        tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"],
    )
def test_basic_tokenizer_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=True)
    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["hello", "!", "how", "are", "you", "?"],
    )
    self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
def __init__(self, vocab, do_lower_case=True, unk_token='[UNK]',
             never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
    super(BaseTokenizer, self).__init__(vocab, do_lower_case, unk_token)
    self.never_split = list(never_split)
    self._tokenizer = BasicTokenizer(do_lower_case, self.never_split)
def finish_deserializing(self):
    self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    if self.do_basic_tokenize:
        self.basic_tokenizer = BasicTokenizer(
            do_lower_case=self.do_lower_case,
            never_split=self.never_split,
            tokenize_chinese_chars=self.tokenize_chinese_chars,
        )
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
class SerializableBertTokenizer(pytt.BertTokenizer, SerializationMixin):
    serialization_fields = list(BASE_CLASS_FIELDS) + [
        "vocab",
        "do_basic_tokenize",
        "do_lower_case",
        "never_split",
        "tokenize_chinese_chars",
    ]

    @classmethod
    def blank(cls):
        self = cls.__new__(cls)
        for field in self.serialization_fields:
            setattr(self, field, None)
        self.ids_to_tokens = None
        self.basic_tokenizer = None
        self.wordpiece_tokenizer = None
        return self

    def prepare_for_serialization(self):
        if self.basic_tokenizer is not None:
            self.do_lower_case = self.basic_tokenizer.do_lower_case
            self.never_split = self.basic_tokenizer.never_split
            self.tokenize_chinese_chars = self.basic_tokenizer.tokenize_chinese_chars

    def finish_deserializing(self):
        self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        if self.do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=self.do_lower_case,
                never_split=self.never_split,
                tokenize_chinese_chars=self.tokenize_chinese_chars,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def clean_token(self, text):
        if self.do_basic_tokenize:
            text = self.basic_tokenizer._clean_text(text)
        text = text.strip()
        return clean_accents(text)

    def clean_wp_token(self, token):
        return token.replace("##", "", 1).strip()

    def add_special_tokens(self, tokens):
        return [self.cls_token] + tokens + [self.sep_token]
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to
    # the span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
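# A minimal usage sketch (added for illustration, not part of the original file),
# assuming `get_final_text` above is importable together with its dependencies
# (`collections`, `BasicTokenizer`, and a module-level `logger`). It replays the
# example from the comments: the normalized prediction "steve smith" is projected
# back onto "Steve Smith's", recovering the original casing without the
# trailing "'s".
final = get_final_text("steve smith", "Steve Smith's", do_lower_case=True)
assert final == "Steve Smith"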
class SerializableBertTokenizer(pytt.BertTokenizer, SerializationMixin):
    serialization_fields = list(BASE_CLASS_FIELDS) + [
        "vocab",
        "do_basic_tokenize",
        "do_lower_case",
        "never_split",
        "tokenize_chinese_chars",
    ]

    @classmethod
    def blank(cls):
        self = cls.__new__(cls)
        for field in self.serialization_fields:
            setattr(self, field, None)
        self.ids_to_tokens = None
        self.basic_tokenizer = None
        self.wordpiece_tokenizer = None
        return self

    def prepare_for_serialization(self):
        if self.basic_tokenizer is not None:
            self.do_lower_case = self.basic_tokenizer.do_lower_case
            self.never_split = self.basic_tokenizer.never_split
            self.tokenize_chinese_chars = self.basic_tokenizer.tokenize_chinese_chars

    def finish_deserializing(self):
        self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        if self.do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=self.do_lower_case,
                never_split=self.never_split,
                tokenize_chinese_chars=self.tokenize_chinese_chars,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

    def clean_token(self, text):
        if self.do_basic_tokenize:
            text = self.basic_tokenizer._clean_text(text)
        text = text.strip()
        return clean_accents(text)

    def clean_wp_token(self, token):
        return token.replace("##", "", 1).strip()

    def add_special_tokens(self, segments):
        output = []
        for segment in segments:
            output.extend(segment)
            if segment:
                output.append(self.sep_token)
        if output:
            # If we otherwise would have an empty output, don't add cls
            output.insert(0, self.cls_token)
        return output

    def fix_alignment(self, segments):
        """Turn a nested segment alignment into an alignment for the whole
        input, by offsetting and accounting for special tokens."""
        offset = 0
        output = []
        for segment in segments:
            if segment:
                offset += 1
            seen = set()
            for idx_group in segment:
                output.append([idx + offset for idx in idx_group])
                seen.update({idx for idx in idx_group})
            offset += len(seen)
        return output
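# A hedged, illustrative sketch (not from the original source): exercising the
# segment logic above on a minimal stand-in object, since constructing a full
# SerializableBertTokenizer requires a vocabulary. The stand-in only provides the
# `cls_token`/`sep_token` attributes that `add_special_tokens` reads; the methods
# are called unbound so no real tokenizer state is needed.
class _FakeTokenizer:
    cls_token = "[CLS]"
    sep_token = "[SEP]"

fake = _FakeTokenizer()
wordpieces = [["hel", "##lo"], ["wor", "##ld"]]
tokens = SerializableBertTokenizer.add_special_tokens(fake, wordpieces)
# -> ["[CLS]", "hel", "##lo", "[SEP]", "wor", "##ld", "[SEP]"]

# Each segment's alignment groups are shifted past the inserted special tokens:
align = SerializableBertTokenizer.fix_alignment(fake, [[[0, 1]], [[0, 1]]])
# -> [[1, 2], [4, 5]]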
def test_chinese(self):
    tokenizer = BasicTokenizer()
    self.assertListEqual(
        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
        [u"ah", u"\u535A", u"\u63A8", u"zz"],
    )
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
             never_split_chars=None, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]",
             cls_token="[CLS]", mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
    """Constructs a BertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
        **do_lower_case**: (`optional`) boolean (default True)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to do basic tokenization before wordpiece.
        **never_split**: (`optional`) list of string
            List of tokens which will never be split during tokenization.
            Only has an effect when do_basic_tokenize=True
        **tokenize_chinese_chars**: (`optional`) boolean (default True)
            Whether to tokenize Chinese characters.
            This should likely be deactivated for Japanese, see:
            https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
    """
    # Forward the actual arguments rather than re-passing the hard-coded
    # defaults, so user-provided values are not silently ignored by the base
    # class.
    super(BertTokenizer, self).__init__(vocab_file,
                                        do_lower_case=do_lower_case,
                                        do_basic_tokenize=do_basic_tokenize,
                                        never_split=never_split,
                                        never_split_chars=never_split_chars,
                                        unk_token=unk_token,
                                        sep_token=sep_token,
                                        pad_token=pad_token,
                                        cls_token=cls_token,
                                        mask_token=mask_token,
                                        tokenize_chinese_chars=tokenize_chinese_chars,
                                        **kwargs)
    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                              never_split=never_split,
                                              never_split_chars=never_split_chars,
                                              tokenize_chinese_chars=tokenize_chinese_chars)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
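# A small, hedged usage sketch (hypothetical, not part of the original source):
# writing a toy one-wordpiece-per-line vocabulary file and constructing the
# tokenizer from it. This assumes the surrounding module exposes BertTokenizer,
# load_vocab, BasicTokenizer, and WordpieceTokenizer as defined above.
import tempfile

vocab_tokens = ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "hello", "world", "##ing"]
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as fh:
    fh.write("\n".join(vocab_tokens))
    vocab_path = fh.name

tokenizer = BertTokenizer(vocab_path, do_lower_case=True)
print(tokenizer.tokenize("Hello world"))  # expected: ["hello", "world"]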