def test_is_control(self):
    """_is_control flags true control chars but not printable or whitespace ones."""
    cases = (
        (u"\u0005", True),   # ENQ — a genuine control character
        (u"A", False),
        (u" ", False),
        (u"\t", False),
        (u"\r", False),
    )
    for ch, expected in cases:
        if expected:
            self.assertTrue(_is_control(ch))
        else:
            self.assertFalse(_is_control(ch))
def _clean_text(self, text) -> Tuple[List[str], List[str]]:
    """Tokenize *text*, replacing invalid/control/whitespace chars with " ".

    Compared with transformers.tokenization_bert.BertTokenizer._clean_text,
    control characters are also emitted as a space so the cleaned output has
    the same total character length as the input, and the raw (original)
    text is preserved in a parallel list.

    :param text: input string
    :return: (cleaned_tokens, raw_tokens) — both lists have the same total
        character length; where cleaned has " ", raw holds the original char
    """
    output = []
    raw_rsv_output = []
    token = ''
    for char in text:
        cp = ord(char)
        if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(
                char):
            # Flush the token accumulated so far into both streams.
            if len(token) > 0:
                output.append(token)
                raw_rsv_output.append(token)
                token = ''
            output.append(" ")
            # BUG FIX: keep the original character in the raw stream.
            # Without this, raw_rsv_output is shorter than output and the
            # length-equality assert below fires on any text containing
            # whitespace or control characters — contradicting the stated
            # goal of keeping output length consistent with the input.
            raw_rsv_output.append(char)
        else:
            token += char
    # Flush a trailing token that was not terminated by whitespace/control.
    if len(token) > 0:
        output.append(token)
        raw_rsv_output.append(token)
    # Invariant promised by the docstring: both streams cover the same
    # number of input characters.
    assert sum(len(tok) for tok in output) == sum(
        len(tok) for tok in raw_rsv_output)
    return output, raw_rsv_output
def _clean_token(self, token: RawRsvSimpleToken) -> List[RawRsvSimpleToken]:
    """Performs invalid character removal and whitespace cleanup on text.

    Invalid (NUL / U+FFFD), control, and whitespace characters each become a
    single-character " " token; runs of ordinary characters are merged into
    one token.  ``raw_text`` accumulates the *raw* characters so the original
    text is recoverable, and ``pos_ids`` tracks each character's position.

    :param token: iterable of (char, idx, raw_char) triples
    :return: list of cleaned RawRsvSimpleToken pieces
    """
    output = []
    is_start = True  # True when the next ordinary char must open a new token
    for char, idx, raw_char in token:
        cp = ord(char)
        if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(char):
            # BUG FIX: store the raw (unnormalized) character — raw_char was
            # unpacked but never used, so raw_text lost the original text.
            output.append(RawRsvSimpleToken(text=" ", pos_ids=[idx], raw_text=raw_char))
            is_start = True
        else:
            if is_start:
                output.append(RawRsvSimpleToken())
                is_start = False
            output[-1].text += char
            output[-1].pos_ids.append(idx)
            # BUG FIX: accumulate raw_char (not char) into raw_text so the
            # raw stream truly reserves the original characters.
            output[-1].raw_text += raw_char
    return output