def test_basic_tokenizer_lower_strip_accents_default(self):
    tokenizer = BasicTokenizer(do_lower_case=True)

    self.assertListEqual(
        tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "),
        ["hallo", "!", "how", "are", "you", "?"],
    )
    self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=True)

    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["hello", "!", "how", "are", "you", "?"],
    )
    self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
class BertPreTokenizer(Tokenizer):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """

    default_never_split = ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    def __init__(self, do_lower_case: bool = True, never_split: Optional[List[str]] = None) -> None:
        if never_split is None:
            never_split = self.default_never_split
        else:
            never_split = never_split + self.default_never_split
        self.basic_tokenizer = BertTokenizer(do_lower_case, never_split)
        self.basic_tokenizer._run_split_on_punc = self._run_split_on_punc
        self.never_split = never_split

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(text)]

    # HACK: Monkeypatch for huggingface's broken BasicTokenizer.
    # TODO(Mark): Remove this once https://github.com/huggingface/transformers/pull/2557
    # is merged.
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if never_split is None:
            never_split = self.never_split
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]
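A minimal, self-contained sketch of what the monkeypatched `_run_split_on_punc` fixes: without the `never_split` check, special tokens such as `[UNK]` are broken apart on their brackets. The helper below is a simplified, hypothetical stand-in (the real `_is_punctuation` also checks Unicode "P" categories), not the actual BERT implementation.

import string

def _is_punctuation(char):
    # Simplified stand-in for the BERT helper.
    return char in string.punctuation

def split_on_punc(text, never_split=()):
    # Same control flow as _run_split_on_punc above, written iteratively.
    if text in never_split:
        return [text]
    out, word = [], []
    for ch in text:
        if _is_punctuation(ch):
            if word:
                out.append("".join(word))
                word = []
            out.append(ch)
        else:
            word.append(ch)
    if word:
        out.append("".join(word))
    return out

print(split_on_punc("[UNK]"))                          # ['[', 'UNK', ']']  (the bug)
print(split_on_punc("[UNK]", never_split=["[UNK]"]))   # ['[UNK]']          (the fix)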
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

    # Some unicode characters cause errors, so replace them with ' ' or ''.
    orig_text = orig_text.replace(u'\xa0', u' ')
    orig_text = orig_text.replace('', '')
    orig_text = orig_text.replace(u'\u200e', u'')
    pred_text = pred_text.replace(u'\xa0', u' ')
    pred_text = pred_text.replace('', '')
    pred_text = pred_text.replace(u'\u200e', u'')

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    # In some cases, if `pred_text` contains [UNK] or '##', the lookup below fails
    # and this function would simply return `orig_text`. To avoid that, we
    # pre-process `pred_text` based on `tok_text`.
    correct_text = word_correction(pred_text, tok_text)
    if correct_text == -1:
        if verbose_logging:
            logger.info("Failed to correct: '%s' using '%s'" % (pred_text, tok_text))
        return orig_text
    pred_text = correct_text

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, tok_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
            logger.info("Length error: '%s' vs '%s'", orig_text, tok_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
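A small, self-contained walk-through of the alignment heuristic described in the comments above, using the "Steve Smith" example. The values are illustrative; `tok_text` is what `BasicTokenizer(do_lower_case=True)` produces for this input.

import collections

def _strip_spaces(text):
    # Map each character of the space-stripped string back to its index in the original.
    ns_chars, ns_to_s_map = [], collections.OrderedDict()
    for i, c in enumerate(text):
        if c == " ":
            continue
        ns_to_s_map[len(ns_chars)] = i
        ns_chars.append(c)
    return "".join(ns_chars), ns_to_s_map

orig_text = "Steve Smith's"
tok_text = "steve smith ' s"   # BasicTokenizer(do_lower_case=True) output, joined with spaces
pred_text = "steve smith"

start = tok_text.find(pred_text)              # 0
end = start + len(pred_text) - 1              # 10, last character of pred_text in tok_text
orig_ns, orig_map = _strip_spaces(orig_text)  # "SteveSmith's"
tok_ns, tok_map = _strip_spaces(tok_text)     # "stevesmith's"
assert len(orig_ns) == len(tok_ns)            # prerequisite for the heuristic

tok_s_to_ns = {s: ns for ns, s in tok_map.items()}   # spaced index -> space-stripped index
orig_start = orig_map[tok_s_to_ns[start]]
orig_end = orig_map[tok_s_to_ns[end]]
print(orig_text[orig_start:orig_end + 1])     # Steve Smith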
import json
import sys
from collections import OrderedDict

from transformers.tokenization_bert import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=True)

data1 = json.load(open(sys.argv[1]), object_pairs_hook=OrderedDict)
data2 = json.load(open(sys.argv[2]))

for key, value in data1.items():
    value = tokenizer.tokenize(value)
    if len(value) > 30 and data2[key] != "" and data2[key] != "empty":
        data1[key] = data2[key]

json.dump(data1, open(sys.argv[3], "w"), ensure_ascii=False, indent=4)
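A hedged sketch of how the merge script above is meant to be used: it reads a main prediction file and a fallback prediction file, and replaces any main answer longer than 30 BasicTokenizer tokens with the fallback answer (unless the fallback is empty). The script name and file names below are made up for illustration, and `str.split` stands in for the tokenizer to keep the sketch dependency-free.

# Hypothetical invocation (argument order: main predictions, fallback predictions, output):
#   python merge_predictions.py preds_main.json preds_fallback.json preds_merged.json

data1 = {"q1": " ".join(["token"] * 40)}    # main prediction, over the 30-token limit
data2 = {"q1": "a short fallback answer"}   # fallback prediction
if len(data1["q1"].split()) > 30 and data2["q1"] not in ("", "empty"):
    data1["q1"] = data2["q1"]
print(data1)                                # {'q1': 'a short fallback answer'}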
def test_basic_tokenizer_no_lower(self):
    tokenizer = BasicTokenizer(do_lower_case=False)

    self.assertListEqual(
        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"],
    )
def test_chinese(self):
    tokenizer = BasicTokenizer()

    self.assertListEqual(
        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
        [u"ah", u"\u535A", u"\u63A8", u"zz"],
    )
def test_basic_tokenizer_respects_never_split_tokens(self):
    tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

    self.assertListEqual(
        tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"),
        ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"],
    )
def test_basic_tokenizer_no_lower_strip_accents_true(self):
    tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

    self.assertListEqual(
        tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "),
        ["HaLLo", "!", "how", "Are", "yoU", "?"],
    )
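A short sketch summarizing how the lower-casing and accent-stripping tests above interact: by default, BasicTokenizer strips accents only when it also lower-cases, and `strip_accents=True` forces stripping even without lower-casing. The import path assumes transformers 4.x (older releases exposed it via `transformers.tokenization_bert`); the middle case below is the expected default behavior rather than one covered by a test above.

from transformers.models.bert.tokenization_bert import BasicTokenizer

text = " \tHäLLo!how \n Are yoU? "

# Default: accents are stripped because do_lower_case=True.
print(BasicTokenizer(do_lower_case=True).tokenize(text))
# ['hallo', '!', 'how', 'are', 'you', '?']

# No lower casing: accents are kept by default (expected behavior).
print(BasicTokenizer(do_lower_case=False).tokenize(text))
# ['HäLLo', '!', 'how', 'Are', 'yoU', '?']

# ...unless strip_accents=True is passed explicitly.
print(BasicTokenizer(do_lower_case=False, strip_accents=True).tokenize(text))
# ['HaLLo', '!', 'how', 'Are', 'yoU', '?']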
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for idx, c in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = idx
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, ns_to_s_map

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        return orig_text

    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
    return output_text
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
class WordEmbedding(nn.Module):
    def __init__(self, config):
        super(WordEmbedding, self).__init__()
        self.config = config
        self.tokenizer = BasicTokenizer(do_lower_case=True)

        # standard deviation of initialization
        init_std = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.initializer_range

        self.words = []
        self.word2idx = {}
        self.embeddings = []
        with open(config.MODEL.LANGUAGE_BACKBONE.EMBEDDING_PATH, 'r') as fin:
            for row in fin:
                row_tk = row.split()
                self.words.append(row_tk[0])
                self.word2idx[row_tk[0]] = len(self.words) - 1
                self.embeddings.append([float(num) for num in row_tk[1:]])

        self.embeddings = torch.tensor(
            np.asarray(self.embeddings, dtype=np.float32)).cuda()
        self.embeddings = nn.Parameter(self.embeddings)
        self.out_channels = self.embeddings.shape[-1]
        if self.config.MODEL.LANGUAGE_BACKBONE.FREEZE:
            self.embeddings.requires_grad = False

        self.words.extend(['[OOV]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'])
        self.oov_idx = len(self.words) - 5
        self.pad_idx = len(self.words) - 4
        self.cls_idx = len(self.words) - 3
        self.sep_idx = len(self.words) - 2
        self.mask_idx = len(self.words) - 1
        self.special_tokens = set([
            self.oov_idx, self.pad_idx, self.cls_idx, self.sep_idx, self.mask_idx
        ])
        self.special_embeddings = nn.Parameter(
            torch.zeros(5, self.out_channels).cuda())
        self.special_embeddings.data.normal_(mean=0.0, std=init_std)
        self.aug_embeddings = torch.cat(
            [self.embeddings, self.special_embeddings], dim=0)

        head_config = self.config.MODEL.MMSS_HEAD.TRANSFORMER
        self.mlm = head_config.MASKED_LANGUAGE_MODELING
        self.mlm_prob = head_config.MASKED_LANGUAGE_MODELING_PROB
        self.mlm_prob_mask = head_config.MASKED_LANGUAGE_MODELING_PROB_MASK
        self.mlm_prob_noise = head_config.MASKED_LANGUAGE_MODELING_PROB_NOISE
        self.mlm_during_validation = head_config.MASKED_LANGUAGE_MODELING_VALIDATION

        self.add_position_embedding = config.MODEL.LANGUAGE_BACKBONE.ADD_POSITION_EMBEDDING
        if self.add_position_embedding:
            # maximum length of a sentence
            m = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.max_position_embeddings
            self.position_embedding = nn.Parameter(
                torch.zeros(m, self.out_channels))
            self.position_embedding.data.normal_(mean=0.0, std=init_std)

    def forward(self, text_list):
        tokenized_batch = {
            'input_ids': [],
            'attention_mask': [],
            'encoded_tokens': [],
            'input_embeddings': [],
            'special_tokens_mask': [],
        }
        for i in range(len(text_list)):
            tokens = self.tokenizer.tokenize(text_list[i])
            ids = [self.word2idx.get(t, self.oov_idx) for t in tokens]
            ids = [self.cls_idx] + ids + [self.sep_idx]
            tokenized_batch['input_ids'].append(ids)

        max_len = max([len(i) for i in tokenized_batch['input_ids']])
        for i in range(len(text_list)):
            ids = tokenized_batch['input_ids'][i]
            l = len(ids)
            ids.extend([self.pad_idx] * (max_len - l))

        if self.mlm:
            tokenized_batch['target_ids'] = deepcopy(
                tokenized_batch['input_ids'])
            tokenized_batch['mlm_mask'] = []
            for i, item in enumerate(tokenized_batch['input_ids']):
                mlm_mask = []
                for j in range(len(item)):
                    if (item[j] in self.special_tokens or
                            not (self.training or self.mlm_during_validation)):
                        mlm_mask.append(0)
                        continue
                    prob = np.random.rand()
                    if prob < self.mlm_prob:
                        mlm_mask.append(1)
                        prob /= self.mlm_prob
                        if prob < self.mlm_prob_mask:
                            item[j] = self.mask_idx
                        elif prob < self.mlm_prob_mask + self.mlm_prob_noise:
                            # assuming special tokens are at the end of the words list
                            item[j] = np.random.randint(
                                len(self.words) - len(self.special_tokens))
                    else:
                        mlm_mask.append(0)
                tokenized_batch['mlm_mask'].append(mlm_mask)

        for i in range(len(text_list)):
            ids = np.asarray(tokenized_batch['input_ids'][i])
            tokenized_batch['attention_mask'].append(
                (ids != self.pad_idx).astype(np.int64))
            enc = self.aug_embeddings[ids]
            tokenized_batch['input_embeddings'].append(enc)
            if self.add_position_embedding:
                enc = enc + self.position_embedding[:max_len]
            tokenized_batch['encoded_tokens'].append(enc)
            sp_mask = []
            for tk in ids:
                if tk in self.special_tokens:
                    sp_mask.append(1)
                else:
                    sp_mask.append(0)
            tokenized_batch['special_tokens_mask'].append(sp_mask)

        tokenized_batch['input_embeddings'] = torch.stack(
            tokenized_batch['input_embeddings'], dim=0)
        tokenized_batch['encoded_tokens'] = torch.stack(
            tokenized_batch['encoded_tokens'], dim=0)
        tokenized_batch['input_ids'] = torch.tensor(
            tokenized_batch['input_ids']).cuda()
        tokenized_batch['attention_mask'] = torch.tensor(
            tokenized_batch['attention_mask']).cuda()
        tokenized_batch['special_tokens_mask'] = torch.tensor(
            tokenized_batch['special_tokens_mask']).cuda()
        if self.mlm:
            tokenized_batch['mlm_mask'] = torch.tensor(
                tokenized_batch['mlm_mask']).cuda()
            tokenized_batch['target_ids'] = torch.tensor(
                tokenized_batch['target_ids']).cuda()
        return tokenized_batch
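A minimal standalone sketch (a hypothetical helper, not part of the class above) of the masked-language-modeling corruption applied in `forward()`: each non-special token is selected with probability `mlm_prob`; a selected token is replaced by `[MASK]` with probability `mlm_prob_mask`, by a random ordinary word with probability `mlm_prob_noise`, and otherwise kept unchanged. With the values 0.15, 0.8 and 0.1 this is the familiar BERT 15% / 80-10-10 scheme.

import numpy as np

def corrupt_ids(ids, special_tokens, mask_idx, vocab_size,
                mlm_prob=0.15, mlm_prob_mask=0.8, mlm_prob_noise=0.1):
    # Returns the corrupted id sequence and a 0/1 mask marking which positions
    # were selected as MLM targets.
    ids = list(ids)
    mlm_mask = []
    for j, tok in enumerate(ids):
        if tok in special_tokens:
            mlm_mask.append(0)
            continue
        prob = np.random.rand()
        if prob < mlm_prob:
            mlm_mask.append(1)
            prob /= mlm_prob          # re-use the draw as a fresh uniform in [0, 1)
            if prob < mlm_prob_mask:
                ids[j] = mask_idx
            elif prob < mlm_prob_mask + mlm_prob_noise:
                ids[j] = np.random.randint(vocab_size)  # random non-special word id
            # otherwise: keep the original token but still predict it
        else:
            mlm_mask.append(0)
    return ids, mlm_mask

# Example usage with made-up ids: 0 = [CLS], 1 = [SEP], 2 = [MASK], vocabulary of 100 words.
corrupted, mask = corrupt_ids([0, 17, 42, 8, 1], special_tokens={0, 1, 2},
                              mask_idx=2, vocab_size=100)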