def test_is_punctuation(self):
    """Check which characters `_is_punctuation` accepts and rejects."""
    # Symbols that must be classified as punctuation.
    for symbol in (u"-", u"$", u"`", u"."):
        self.assertTrue(_is_punctuation(symbol))

    # A letter and a plain space must not be classified as punctuation.
    for other in (u"A", u" "):
        self.assertFalse(_is_punctuation(other))
def _run_split_on_punc(self, token: RawRsvSimpleToken, never_split=None) -> List[RawRsvSimpleToken]:
    """Split a token at punctuation characters.

    Args:
        token: Token to split. Its ``text`` attribute is checked against
            ``never_split`` and iterating it yields
            ``(char, pos_idx, raw_char)`` triples.
        never_split: Optional collection of token texts that must be
            returned unsplit. ``None`` (the default) protects nothing.

    Returns:
        A list of tokens. Each punctuation character becomes its own
        token; each run of consecutive non-punctuation characters is
        accumulated into one token. If ``token.text`` is in
        ``never_split`` the result is ``[token]``.
    """
    # The default is None, and a bare ``in`` test against None raises
    # TypeError, so only consult the collection when one was supplied.
    if never_split is not None and token.text in never_split:
        return [token]

    output = []
    is_start = True  # True when the next non-punctuation char opens a new token
    for char, pos_idx, raw_char in token:
        if _is_punctuation(char):
            # Punctuation always stands alone and ends the current run.
            output.append(RawRsvSimpleToken(text=char, pos_ids=[pos_idx], raw_text=raw_char))
            is_start = True
            continue

        if is_start:
            # NOTE(review): assumes RawRsvSimpleToken() starts with empty
            # text/raw_text and its own fresh pos_ids list — confirm.
            output.append(RawRsvSimpleToken())
            is_start = False
        current = output[-1]
        current.text += char
        current.pos_ids.append(pos_idx)
        current.raw_text += raw_char
    return output
def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" if never_split is None: never_split = self.never_split if never_split is not None and text in never_split: return [text] chars = list(text) i = 0 start_new_word = True output = [] while i < len(chars): char = chars[i] if _is_punctuation(char): output.append([char]) start_new_word = True else: if start_new_word: output.append([]) start_new_word = False output[-1].append(char) i += 1 return ["".join(x) for x in output]