示例#1
0
    def test_is_punctuation(self):
        """Spot-check tokenization._is_punctuation on punctuation and non-punctuation chars."""
        for ch in (u"-", u"$", u"`", u"."):
            self.assertTrue(tokenization._is_punctuation(ch))

        for ch in (u"A", u" "):
            self.assertFalse(tokenization._is_punctuation(ch))
示例#2
0
def customize_tokenizer(text, do_lower_case=False):
    """Tokenize *text*, isolating CJK characters and punctuation as single tokens.

    Each character classified as Chinese, punctuation, whitespace, or a
    control character (per the `tokenization` helpers) is padded with spaces
    so that the final ``split()`` yields it as its own token; all other
    characters are kept as-is and grouped by whitespace.

    Args:
        text: Input text (bytes or unicode; converted via
            ``tokenization.convert_to_unicode``).
        do_lower_case: If True, lowercase the spaced text before splitting.

    Returns:
        A list of token strings.
    """
    # BasicTokenizer is instantiated only for its _is_chinese_char helper.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
    text = tokenization.convert_to_unicode(text)
    # Collect pieces in a list and join once: repeated `+=` on a str is
    # quadratic in the worst case (the in-place optimization is
    # CPython-specific and not guaranteed).
    pieces = []
    for char in text:
        if (tokenizer._is_chinese_char(ord(char))
                or tokenization._is_punctuation(char)
                or tokenization._is_whitespace(char)
                or tokenization._is_control(char)):
            pieces.append(" " + char + " ")
        else:
            pieces.append(char)
    spaced = "".join(pieces)
    if do_lower_case:
        spaced = spaced.lower()
    return spaced.split()
示例#3
0
 def _is_punctuation(self, char):
     """Return whether *char* is punctuation, delegating to bert_tokenization."""
     is_punct = bert_tokenization._is_punctuation(char)  # pylint: disable=protected-access
     return is_punct