Example #1
 def __init__(self, vocab_file, do_lower_case=True, split_on_punc=True):
   self._whitespace_tokenizer = tokenization.BasicTokenizer(
       do_lower_case=False, split_on_punc=False)
   self._punctuation_tokenizer = tokenization.BasicTokenizer(
       do_lower_case=False, split_on_punc=split_on_punc)
   self._full_tokenizer = tokenization.FullTokenizer(
       vocab_file, do_lower_case=do_lower_case, split_on_punc=split_on_punc)
   self._vocab = list(self._full_tokenizer.vocab.keys())
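A minimal sketch of how the two BasicTokenizer configurations above differ on the same input. It assumes the same `tokenization` module used throughout these examples is importable (one whose BasicTokenizer accepts split_on_punc, as in Example #1 and Example #7); the expected outputs in the comments follow from the tests in Examples #5 and #7.

import tokenization  # assumed import; the examples reference the module simply as `tokenization`

text = u" \tHeLLo!how  \n Are yoU?  "

# Whitespace-only tokenization: no lower casing, no punctuation splitting.
whitespace_only = tokenization.BasicTokenizer(do_lower_case=False, split_on_punc=False)
print(whitespace_only.tokenize(text))  # ["HeLLo!how", "Are", "yoU?"]

# Punctuation splitting, still case-preserving.
punct_split = tokenization.BasicTokenizer(do_lower_case=False, split_on_punc=True)
print(punct_split.tokenize(text))      # ["HeLLo", "!", "how", "Are", "yoU", "?"]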
Example #2
 def __init__(self, vocab_file, do_lower_case=True, vocab_override=None):
     super().__init__()
     self.vocab_file = vocab_file
     self.do_lower_case = do_lower_case
     if vocab_override is None:
         self.vocab = tokenization.load_vocab(vocab_file)
     else:
         self.vocab = vocab_override
     self.inv_vocab = {v: k for k, v in self.vocab.items()}
     self.basic_tokenizer = tokenization.BasicTokenizer(
         do_lower_case=do_lower_case)
     self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
         vocab=self.vocab)
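A hypothetical usage sketch for the wrapper above. Only the __init__ is shown, so the class name `BertTokenizerWrapper` is assumed purely for illustration; the point is that passing vocab_override skips tokenization.load_vocab, so no vocab file is needed on disk (useful in tests).

# Hypothetical class name: the enclosing class is not shown in Example #2.
vocab_override = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "he": 3, "##llo": 4}
tok = BertTokenizerWrapper(vocab_file=None, vocab_override=vocab_override)
# With vocab_override set, tokenization.load_vocab(vocab_file) is never called.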
Example #3
 def __init__(self, vocab_file):
     self._basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
     self._wp_tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=True)
Example #4
def get_final_text(pred_text, orig_text, do_lower_case, verbose=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.

    def _strip_spaces(text):
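        # Return `text` with all spaces removed, plus a map from each index in
        # the stripped string back to its index in the original string.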
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose:
            logging.info("Unable to find text: '%s' in '%s'", pred_text,
                         orig_text)
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose:
            logging.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose:
            logging.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose:
            logging.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
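A small usage sketch of get_final_text() that reproduces the "Steve Smith" case walked through in the comments above, assuming the modules the function relies on (collections, six, logging, and the same `tokenization` module as the other examples) are importable.

# Worked example from the comments: project the normalized prediction
# "steve smith" back onto the original text "Steve Smith's".
print(get_final_text("steve smith", "Steve Smith's", do_lower_case=True))
# -> Steve Smith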
Example #5
    def test_basic_tokenizer_no_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

        self.assertAllEqual(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
                            ["HeLLo", "!", "how", "Are", "yoU", "?"])
Example #6
    def test_basic_tokenizer_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

        self.assertAllEqual(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
                            ["hello", "!", "how", "are", "you", "?"])
        self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
Example #7
    def test_chinese(self):
        tokenizer = tokenization.BasicTokenizer()

        self.assertAllEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                            [u"ah", u"\u535A", u"\u63A8", u"zz"])

    def test_basic_tokenizer_no_split_on_punc(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True,
                                                split_on_punc=False)

        self.assertAllEqual(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
                            ["hello!how", "are", "you?"])