예제 #1
0
 def _clean_text(self, text) -> Tuple[List[str], List[str]]:
     """
     Unlike transformers.tokenization_bert.BertTokenizer._clean_text, control
     characters are also emitted as spaces, so the cleaned output keeps the
     same total character length as the input. The raw text is returned
     alongside the cleaned text.
     :param text: input string to clean
     :return: (cleaned_text, raw_text, ) — two parallel lists of string
         chunks whose total character counts are equal.
     """
     output = []
     raw_rsv_output = []
     token = ''
     for char in text:
         cp = ord(char)
         if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(
                 char):
             # Flush the token accumulated so far, then emit a space into the
             # cleaned stream and the ORIGINAL char into the raw stream so
             # both streams stay the same total length (this mirrors
             # _clean_token, which records the replaced char in raw_text;
             # the original code appended nothing to raw_rsv_output here,
             # which made the length assertion below fail on any whitespace).
             if len(token) > 0:
                 output.append(token)
                 raw_rsv_output.append(token)
             token = ''
             output.append(" ")
             raw_rsv_output.append(char)
         else:
             token += char
     # Flush the trailing token once, AFTER the loop. The original code ran
     # this flush inside the loop body, appending partial tokens on every
     # ordinary character and duplicating content in both outputs.
     if len(token) > 0:
         output.append(token)
         raw_rsv_output.append(token)
     assert sum(len(token) for token in output) == sum(
         len(token) for token in raw_rsv_output)
     return output, raw_rsv_output
    def test_is_whitespace(self):
        """_is_whitespace accepts ASCII whitespace and NBSP, rejects letters/punctuation."""
        for ch in (u" ", u"\t", u"\r", u"\n", u"\u00A0"):
            self.assertTrue(_is_whitespace(ch))

        for ch in (u"A", u"-"):
            self.assertFalse(_is_whitespace(ch))
예제 #3
0
def strip_whitespace(tokens: List[RawRsvSimpleToken]) -> List[RawRsvSimpleToken]:
    """Discard whitespace characters and re-chunk the remaining ones.

    Iterates every (char, pos_idx, raw_char) triple of every input token;
    whitespace characters are dropped and act only as separators, while each
    maximal run of non-whitespace characters is merged into one fresh
    RawRsvSimpleToken (text, pos_ids and raw_text accumulated in step).
    """
    result: List[RawRsvSimpleToken] = []
    need_new_token = True
    for tok in tokens:
        for ch, pos, raw in tok:
            if _is_whitespace(ch):
                # A separator: the next visible char starts a new token.
                need_new_token = True
                continue
            if need_new_token:
                result.append(RawRsvSimpleToken())
                need_new_token = False
            current = result[-1]
            current.text += ch
            current.pos_ids.append(pos)
            current.raw_text += raw
    return result
예제 #4
0
 def process_inner_text(inner_text: List[str]) -> List[str]:
     """Split the first entry of *inner_text* into whitespace chars and word chunks.

     Each whitespace character becomes its own element of the result; each
     maximal run of non-whitespace characters is merged into a single element.
     NOTE(review): only inner_text[0] is processed — presumably callers always
     pass a single-element list; confirm against call sites.
     """
     chunks: List[str] = []
     start_new = True
     for ch in inner_text[0]:
         if _is_whitespace(ch):
             # Whitespace is kept, one element per character.
             chunks.append(ch)
             start_new = True
         elif start_new:
             chunks.append(ch)
             start_new = False
         else:
             chunks[-1] += ch
     return chunks
예제 #5
0
 def _clean_token(self, token: RawRsvSimpleToken) -> List[RawRsvSimpleToken]:
     """Performs invalid character removal and whitespace cleanup on text.

     Each NUL/replacement/control/whitespace character is emitted as a
     standalone single-space token whose raw_text keeps the raw character it
     replaced; each maximal run of ordinary characters is merged into one
     token accumulating text, pos_ids and raw_text in step.
     :param token: token to clean; iterating it yields (char, idx, raw_char).
     :return: list of cleaned tokens whose concatenated raw_text preserves
         the raw input.
     """
     output = []
     is_start = True
     for char, idx, raw_char in token:
         cp = ord(char)
         if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(char):
             # Bug fix: store the unpacked raw_char (the original code stored
             # `char` both here and in the accumulation branch below, leaving
             # raw_char unused and losing the raw text this field exists for).
             output.append(RawRsvSimpleToken(text=" ", pos_ids=[idx], raw_text=raw_char))
             is_start = True
         else:
             if is_start:
                 output.append(RawRsvSimpleToken())
                 is_start = False
             output[-1].text += char
             output[-1].pos_ids.append(idx)
             output[-1].raw_text += raw_char
     return output