Exemplo n.º 1
0
    def tokenize_with_offsets(self, text_input):
        """Performs basic word tokenization for BERT, reporting offsets.

        Args:
          text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8
            strings.

        Returns:
          The result of `regex_split_ops.regex_split_with_offsets` on the
          normalized input: the tokens split from `text_input` along with
          their start and end offsets.
        """
        if self._lower_case:
            # Case-fold, then NFD-decompose and drop nonspacing marks
            # (\p{Mn}) to strip accents from the lowered text.
            text_input = case_fold_utf8(text_input)
            text_input = normalize_utf8(text_input, "NFD")
            text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
        elif self._normalization_form is not None:
            # Not lower-casing: apply the configured UTF-8 normalization.
            text_input = normalize_utf8(text_input,
                                        self._normalization_form)

        # Replace control (Cc) and format (Cf) characters with spaces.
        text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}",
                                              " ")

        return regex_split_ops.regex_split_with_offsets(
            text_input, _DELIM_REGEX_PATTERN, self._keep_delim_regex_pattern,
            "BertBasicTokenizer")
Exemplo n.º 2
0
 def test_lowercase_empty_string(self):
     """Case folding maps the empty string to the empty byte string."""
     result = normalize_ops.case_fold_utf8([""])
     self.assertAllEqual([b""], result)
Exemplo n.º 3
0
 def test_lowercase_one_string(self):
     """Mixed-case ASCII is folded to lowercase; surrounding spaces kept."""
     source = [
         " TExt to loWERcase! ",
     ]
     want = [
         b" text to lowercase! ",
     ]
     self.assertAllEqual(want, normalize_ops.case_fold_utf8(source))
Exemplo n.º 4
0
 def test_lowercase_text(self):
     """Folding covers punctuation, non-Latin, accented, and ß/ς cases."""
     # Each pair is (input, expected case-folded output).
     cases = [
         ("Punctuation and digits: -*/+$#%@%$123456789#^$*%&",
          "punctuation and digits: -*/+$#%@%$123456789#^$*%&"),
         ("Non-latin UTF8 chars: ΘͽʦȺЩ",
          "non-latin utf8 chars: θͽʦⱥщ"),
         ("Accented chars: ĎÔPQRŔSŠoóôpqrŕsštťuúvwxyý",
          "accented chars: ďôpqrŕsšoóôpqrŕsštťuúvwxyý"),
         ("Non-UTF8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)",
          "non-utf8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)"),
         ("Folded: ßς", "folded: ssσ"),
         ("", ""),
     ]
     txt = [source for source, _ in cases]
     expected = [folded for _, folded in cases]
     self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt))
Exemplo n.º 5
0
  def tokenize(self, text_input):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.
    Returns:
      A `RaggedTensor` of tokenized strings from text_input.
    """
    # lowercase and strip accents (if option is set)
    if self._lower_case:
      text_input = case_fold_utf8(text_input)
      # NFD decomposition followed by removal of nonspacing marks (\p{Mn})
      # strips accents from the case-folded text.
      text_input = normalize_utf8(text_input, "NFD")
      text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    else:
      # utf8 normalization
      if self._normalization_form is not None:
        text_input = normalize_utf8(text_input, self._normalization_form)

    # strip out control characters
    text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

    # For chinese and emoji characters, tokenize by unicode codepoints
    unicode_tokenizer = UnicodeScriptTokenizer(
        keep_whitespace=self._keep_whitespace)
    script_tokenized = unicode_tokenizer.tokenize(text_input)

    # Per-token condition: whether a token must be split further into
    # individual codepoints (criteria live in self._should_split).
    split_cond = self._should_split(script_tokenized)

    unicode_char_split = ragged_string_ops.unicode_split(
        script_tokenized, "UTF-8")
    # NOTE: x/y keywords are deliberately passed out of positional order.
    # Where split_cond is True, `where` selects x (the per-codepoint
    # split); where False, it selects y (the original token, expanded to
    # match the split's extra dimension).
    unicode_split_tokens = array_ops.where(
        array_ops.squeeze(split_cond),
        y=array_ops.expand_dims(script_tokenized.values, axis=1),
        x=unicode_char_split.values)
    final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
    # Collapse the extra per-token split dimension back into the token axis
    # so the result has one ragged token dimension per input string.
    return final_tokens.merge_dims(-2, -1)
Exemplo n.º 6
0
 def test_lowercase_one_string_ragged(self):
     """Case folding is applied elementwise across a ragged input."""
     ragged_input = ragged_factory_ops.constant(
         [[" TExt ", "to", " loWERcase! "], [" TExt to loWERcase! "]])
     want = [[b" text ", b"to", b" lowercase! "],
             [b" text to lowercase! "]]
     self.assertRaggedEqual(want, normalize_ops.case_fold_utf8(ragged_input))
Exemplo n.º 7
0
 def lower_case(self, text_input):
     """Lower-cases `text_input` and strips accents.

     Case-folds, NFD-decomposes, then removes nonspacing marks (\\p{Mn}).
     """
     folded = case_fold_utf8(text_input)
     decomposed = normalize_utf8(folded, "NFD")
     return string_ops.regex_replace(decomposed, r"\p{Mn}", "")