def test_basic_tokenizer_lower_strip_accents_default(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
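A quick contrast sketch, assuming a transformers-style BasicTokenizer that supports the strip_accents flag used in a later example: with do_lower_case=True the default behaviour also strips accents (hence "hallo" above), but passing strip_accents=False preserves them.

tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
# Accents survive lowercasing when stripping is explicitly disabled.
assert tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  ") == ["hällo", "!", "how", "are", "you", "?"]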
Example No. 3
class BertPreTokenizer(Tokenizer):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """

    default_never_split = ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    def __init__(self,
                 do_lower_case: bool = True,
                 never_split: Optional[List[str]] = None) -> None:

        if never_split is None:
            never_split = self.default_never_split
        else:
            never_split = never_split + self.default_never_split

        self.basic_tokenizer = BasicTokenizer(do_lower_case, never_split)
        self.basic_tokenizer._run_split_on_punc = self._run_split_on_punc
        self.never_split = never_split

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(text)]

    # HACK: Monkeypatch for huggingface's broken BasicTokenizer.
    # TODO(Mark): Remove this once https://github.com/huggingface/transformers/pull/2557
    # is merged.
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if never_split is None:
            never_split = self.never_split
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]
Example No. 4
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

    # Some Unicode characters cause errors, so replace them with ' ' or ''.
    orig_text = orig_text.replace(u'\xa0', u' ')
    orig_text = orig_text.replace('', '')
    orig_text = orig_text.replace(u'\u200e', u'')

    pred_text = pred_text.replace(u'\xa0', u' ')
    pred_text = pred_text.replace('', '')
    pred_text = pred_text.replace(u'\u200e', u'')

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(tokenizer.tokenize(orig_text))

    # In some cases pred_text contains [UNK] or '##' pieces; the alignment then fails
    # and this function would simply return orig_text. To avoid that, we pre-process
    # pred_text based on tok_text.
    correct_text = word_correction(pred_text, tok_text)
    if correct_text == -1:
        if verbose_logging:
            logger.info("Fail to correction: '%s' using '%s'" %
                        (pred_text, tok_text))
        return orig_text

    pred_text = correct_text

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, tok_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
            logger.info("Lenth error: '%s' vs '%s'", orig_text, tok_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
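To make the character alignment above concrete, here is a small standalone sketch (not part of the original function) that rebuilds the non-space index maps for the "Steve Smith" example from the comments:

import collections

def strip_spaces(text):
    # same logic as _strip_spaces above
    ns_chars, ns_to_s_map = [], collections.OrderedDict()
    for i, c in enumerate(text):
        if c == " ":
            continue
        ns_to_s_map[len(ns_chars)] = i
        ns_chars.append(c)
    return "".join(ns_chars), ns_to_s_map

orig_ns, orig_map = strip_spaces("Steve Smith's")    # "SteveSmith's"
tok_ns, tok_map = strip_spaces("steve smith ' s")    # "stevesmith's"
# Both strings have 12 non-space characters, so position k in one aligns with
# position k in the other. The prediction "steve smith" ends at tok_text index 10,
# which is non-space index 9, which orig_map sends to index 10 of the original
# span, so the projected answer is orig_text[0:11] == "Steve Smith".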
Example No. 5
# Usage: python <this_script>.py <primary_predictions.json> <fallback_predictions.json> <output.json>
import json
import sys
from collections import OrderedDict

from transformers.tokenization_bert import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=True)

# Load both prediction files; preserve the key order of the primary file.
data1 = json.load(open(sys.argv[1]), object_pairs_hook=OrderedDict)
data2 = json.load(open(sys.argv[2]))

# If a primary answer is longer than 30 basic tokens and the fallback file
# has a non-empty answer for the same key, use the fallback answer instead.
for key, value in data1.items():
    value = tokenizer.tokenize(value)
    if len(value) > 30 and data2[key] != "" and data2[key] != "empty":
        data1[key] = data2[key]

json.dump(data1, open(sys.argv[3], "w"), ensure_ascii=False, indent=4)
    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"])
    def test_chinese(self):
        tokenizer = BasicTokenizer()

        self.assertListEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                             [u"ah", u"\u535A", u"\u63A8", u"zz"])
Example No. 8
    def test_basic_tokenizer_respects_never_split_tokens(self):
        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"),
            ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"])
Example No. 9
    def test_basic_tokenizer_no_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

        self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "),
                             ["HaLLo", "!", "how", "Are", "yoU", "?"])
Example No. 10
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for idx, c in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = idx
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, ns_to_s_map

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        return orig_text

    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
    return output_text
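A quick worked call for the "Steve Smith" case described in the comments, assuming collections and a transformers-style BasicTokenizer are imported:

# tok_text becomes "steve smith ' s"; the prediction "steve smith" is found at
# characters 0..10 and the alignment maps it back onto the original span.
print(get_final_text("steve smith", "Steve Smith's", do_lower_case=True))  # Steve Smith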
Example No. 11
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Example No. 12
class WordEmbedding(nn.Module):
    def __init__(self, config):
        super(WordEmbedding, self).__init__()
        self.config = config
        self.tokenizer = BasicTokenizer(do_lower_case=True)
        # standard deviation of initialization
        init_std = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.initializer_range

        self.words = []
        self.word2idx = {}
        self.embeddings = []
        with open(config.MODEL.LANGUAGE_BACKBONE.EMBEDDING_PATH, 'r') as fin:
            for row in fin:
                row_tk = row.split()
                self.words.append(row_tk[0])
                self.word2idx[row_tk[0]] = len(self.words) - 1
                self.embeddings.append([float(num) for num in row_tk[1:]])

        self.embeddings = torch.tensor(
            np.asarray(self.embeddings, dtype=np.float32)).cuda()
        self.embeddings = nn.Parameter(self.embeddings)
        self.out_channels = self.embeddings.shape[-1]
        if self.config.MODEL.LANGUAGE_BACKBONE.FREEZE:
            self.embeddings.requires_grad = False

        self.words.extend(['[OOV]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'])
        self.oov_idx = len(self.words) - 5
        self.pad_idx = len(self.words) - 4
        self.cls_idx = len(self.words) - 3
        self.sep_idx = len(self.words) - 2
        self.mask_idx = len(self.words) - 1
        self.special_tokens = set([
            self.oov_idx, self.pad_idx, self.cls_idx, self.sep_idx,
            self.mask_idx
        ])
        self.special_embeddings = nn.Parameter(
            torch.zeros(5, self.out_channels).cuda())
        self.special_embeddings.data.normal_(mean=0.0, std=init_std)
        self.aug_embeddings = torch.cat(
            [self.embeddings, self.special_embeddings], dim=0)
        head_config = self.config.MODEL.MMSS_HEAD.TRANSFORMER
        self.mlm = head_config.MASKED_LANGUAGE_MODELING
        self.mlm_prob = head_config.MASKED_LANGUAGE_MODELING_PROB
        self.mlm_prob_mask = head_config.MASKED_LANGUAGE_MODELING_PROB_MASK
        self.mlm_prob_noise = head_config.MASKED_LANGUAGE_MODELING_PROB_NOISE
        self.mlm_during_validation = head_config.MASKED_LANGUAGE_MODELING_VALIDATION
        self.add_position_embedding = config.MODEL.LANGUAGE_BACKBONE.ADD_POSITION_EMBEDDING
        if self.add_position_embedding:
            # maximum length of a sentence
            m = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.max_position_embeddings
            self.position_embedding = nn.Parameter(
                torch.zeros(m, self.out_channels))
            self.position_embedding.data.normal_(mean=0.0, std=init_std)

    def forward(self, text_list):
        tokenized_batch = {
            'input_ids': [],
            'attention_mask': [],
            'encoded_tokens': [],
            'input_embeddings': [],
            'special_tokens_mask': [],
        }
        for i in range(len(text_list)):
            tokens = self.tokenizer.tokenize(text_list[i])
            ids = [self.word2idx.get(t, self.oov_idx) for t in tokens]
            ids = [self.cls_idx] + ids + [self.sep_idx]
            tokenized_batch['input_ids'].append(ids)

        max_len = max([len(i) for i in tokenized_batch['input_ids']])
        for i in range(len(text_list)):
            ids = tokenized_batch['input_ids'][i]
            length = len(ids)
            ids.extend([self.pad_idx] * (max_len - length))

        if self.mlm:
            tokenized_batch['target_ids'] = deepcopy(
                tokenized_batch['input_ids'])
            tokenized_batch['mlm_mask'] = []
            for i, item in enumerate(tokenized_batch['input_ids']):
                mlm_mask = []
                for j in range(len(item)):
                    if (item[j] in self.special_tokens or
                            not (self.training or self.mlm_during_validation)):
                        mlm_mask.append(0)
                        continue
                    prob = np.random.rand()
                    if prob < self.mlm_prob:
                        mlm_mask.append(1)
                        prob /= self.mlm_prob
                        if prob < self.mlm_prob_mask:
                            item[j] = self.mask_idx
                        elif prob < self.mlm_prob_mask + self.mlm_prob_noise:
                            # assuming special tokens are at the end of the words list
                            item[j] = np.random.randint(
                                len(self.words) - len(self.special_tokens))
                    else:
                        mlm_mask.append(0)
                tokenized_batch['mlm_mask'].append(mlm_mask)

        for i in range(len(text_list)):
            ids = np.asarray(tokenized_batch['input_ids'][i])
            tokenized_batch['attention_mask'].append(
                (ids != self.pad_idx).astype(np.int64))
            enc = self.aug_embeddings[ids]
            tokenized_batch['input_embeddings'].append(enc)
            if self.add_position_embedding:
                enc = enc + self.position_embedding[:max_len]
            tokenized_batch['encoded_tokens'].append(enc)
            sp_mask = []
            for tk in ids:
                if tk in self.special_tokens:
                    sp_mask.append(1)
                else:
                    sp_mask.append(0)
            tokenized_batch['special_tokens_mask'].append(sp_mask)

        tokenized_batch['input_embeddings'] = torch.stack(
            tokenized_batch['input_embeddings'], dim=0)
        tokenized_batch['encoded_tokens'] = torch.stack(
            tokenized_batch['encoded_tokens'], dim=0)
        tokenized_batch['input_ids'] = torch.tensor(
            tokenized_batch['input_ids']).cuda()
        tokenized_batch['attention_mask'] = torch.tensor(
            tokenized_batch['attention_mask']).cuda()
        tokenized_batch['special_tokens_mask'] = torch.tensor(
            tokenized_batch['special_tokens_mask']).cuda()
        if self.mlm:
            tokenized_batch['mlm_mask'] = torch.tensor(
                tokenized_batch['mlm_mask']).cuda()
            tokenized_batch['target_ids'] = torch.tensor(
                tokenized_batch['target_ids']).cuda()
        return tokenized_batch
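The masking branch in forward() follows a BERT-style corruption rule: a token is selected for prediction with probability mlm_prob; of the selected tokens, a prob_mask fraction become the mask token, a prob_noise fraction become a random word, and the rest are kept unchanged. A standalone sketch of that rule (argument names and example probabilities here are illustrative, not taken from the config):

import numpy as np

def corrupt_token(token_id, mask_token_id, vocab_size,
                  mlm_prob=0.15, prob_mask=0.8, prob_noise=0.1):
    """Return (possibly corrupted token id, 1 if selected for MLM else 0)."""
    prob = np.random.rand()
    if prob >= mlm_prob:
        return token_id, 0                       # not selected for prediction
    prob /= mlm_prob                             # renormalize to [0, 1)
    if prob < prob_mask:
        return mask_token_id, 1                  # replace with the [MASK] id
    if prob < prob_mask + prob_noise:
        return np.random.randint(vocab_size), 1  # replace with a random word id
    return token_id, 1                           # keep the token but still predict it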