def test_is_punctuation(self):
    """Check `_is_punctuation` against sample punctuation and non-punctuation characters."""
    # Characters that must be reported as punctuation.
    for symbol in (u"-", u"$", u"`", u"."):
        self.assertTrue(_is_punctuation(symbol))

    # A letter and a space must not be reported as punctuation.
    for other in (u"A", u" "):
        self.assertFalse(_is_punctuation(other))
 def _run_split_on_punc(self, token: RawRsvSimpleToken, never_split=None) -> List[RawRsvSimpleToken]:
     """Split ``token`` into sub-tokens at every punctuation character.

     Iterating ``token`` is expected to yield ``(char, pos_idx, raw_char)``
     triples. Each character for which ``_is_punctuation`` is true becomes
     its own sub-token; consecutive non-punctuation characters are
     accumulated into a single sub-token.

     Args:
         token: The token to split.
         never_split: Optional collection of texts that must not be split.
             When ``token.text`` is in it, the token is returned unchanged.
             ``None`` (the default) disables this check.

     Returns:
         A list of sub-tokens in their original order, or ``[token]`` when
         the token's text is listed in ``never_split``.
     """
     # The default is None, and `x in None` raises TypeError, so only test
     # membership when a collection was actually supplied.
     if never_split is not None and token.text in never_split:
         return [token]
     output = []
     is_start = True
     for char, pos_idx, raw_char in token:
         if _is_punctuation(char):
             # A punctuation character always forms a sub-token of its own.
             output.append(RawRsvSimpleToken(text=char, pos_ids=[pos_idx], raw_text=raw_char))
             is_start = True
         else:
             if is_start:
                 # NOTE(review): assumes RawRsvSimpleToken() starts with empty
                 # text, pos_ids and raw_text - confirm against its definition.
                 output.append(RawRsvSimpleToken())
                 is_start = False
             output[-1].text += char
             output[-1].pos_ids.append(pos_idx)
             output[-1].raw_text += raw_char
     return output
# Example #3
# 0
    def _run_split_on_punc(self, text, never_split=None):
        """Break ``text`` into punctuation and non-punctuation pieces.

        Every character for which ``_is_punctuation`` is true becomes a
        piece of its own; each run of other characters is joined into one
        piece. When ``never_split`` is ``None`` the instance's
        ``self.never_split`` is used instead; if ``text`` is found in that
        collection it is returned whole as a single-element list.
        """
        protected = self.never_split if never_split is None else never_split
        if protected is not None and text in protected:
            return [text]

        pieces = []
        # True while the last entry of `pieces` is a run that can still grow.
        run_open = False
        for symbol in text:
            if _is_punctuation(symbol):
                pieces.append([symbol])
                run_open = False
                continue
            if not run_open:
                pieces.append([])
                run_open = True
            pieces[-1].append(symbol)

        return ["".join(piece) for piece in pieces]