Example #1
    def test_is_control(self):
        # U+0005 (ENQUIRY) is a Cc control character.
        self.assertTrue(tokenization._is_control(u"\u0005"))

        # Letters and whitespace characters are not treated as control characters.
        self.assertFalse(tokenization._is_control(u"A"))
        self.assertFalse(tokenization._is_control(u" "))
        self.assertFalse(tokenization._is_control(u"\t"))
        self.assertFalse(tokenization._is_control(u"\r"))
Example #2
def customize_tokenizer(text, do_lower_case=False):
    """Splits text so that Chinese characters, punctuation, whitespace and
    control characters each become their own whitespace-delimited token."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    text = tokenization.convert_to_unicode(text)
    for c in text:
        # Pad special characters with spaces so the final split() isolates them.
        if (tokenizer._is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
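A short usage sketch, assuming BERT's tokenization module is importable; the sample string and the resulting split are illustrative:

tokens = customize_tokenizer(u"BERT模型, hello!", do_lower_case=True)
# Chinese characters and punctuation become separate tokens, Latin words stay whole:
# ["bert", "模", "型", ",", "hello", "!"]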
Example #3
def get_dirty_text_ind(text):
    """Returns the indices of invalid characters in text (NUL, U+FFFD, control
    characters, or characters that decompose into several code points)."""

    # Normalize each character separately; combining sequences become
    # multi-code-point strings after NFD decomposition.
    text = [unicodedata.normalize("NFD", t) for t in text]
    output = []
    for char_ind, char in enumerate(text):
        if len(char) > 1:
            output.append(char_ind)
            continue
        cp = ord(char)
        if cp == 0 or cp == 0xfffd or _is_control(char):
            output.append(char_ind)

    return output
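A brief illustrative call, assuming the module-level _is_control helper used above is in scope; the sample string is hypothetical:

sample = u"ok\x00caf\u00e9\ufffd"
print(get_dirty_text_ind(sample))
# -> [2, 6, 7]: the NUL byte, the accented "e" (two code points after NFD),
#    and the U+FFFD replacement character are flagged as dirty.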
Example #4
    def _is_control(self, char):
        return bert_tokenization._is_control(char)  # pylint: disable=protected-access