def cut_sentences_by_rule(text: str, sentence_delimiters: str = "。!?;"):
    """Split ``text`` into sentence units using single-character delimiters.

    The text is first split on ``"\\n"`` into paragraphs. Within a paragraph,
    characters are accumulated until one of ``sentence_delimiters`` is seen;
    the delimiter is kept as the last character of the emitted sentence.
    Characters left over at the end of a paragraph (no closing delimiter)
    are emitted as a final sentence of that paragraph.

    Args:
        text: The text to segment.
        sentence_delimiters: Each character of this string ends a sentence.

    Returns:
        A list of ``SyntacticUnit`` objects in reading order. Each is built
        with ``text`` (the sentence string), ``token`` (the number of units
        emitted before it, i.e. its position in the returned list),
        ``index`` (its position inside its paragraph, starting at 0) and
        ``paragraph`` (the paragraph counter, starting at 0). Paragraphs
        that contain no characters are skipped and do not advance the
        paragraph counter.
    """
    delimiters = set(sentence_delimiters)
    results: List[SyntacticUnit] = []
    paragraph = 0

    for paragraph_text in text.split("\n"):
        # Sentence state is per paragraph: nothing carries over a newline.
        index = 0
        buffer = []

        for char in paragraph_text:
            buffer.append(char)
            if char in delimiters:
                results.append(
                    SyntacticUnit(
                        text="".join(buffer),
                        token=len(results),
                        index=index,
                        paragraph=paragraph,
                    )
                )
                buffer = []
                index += 1

        if buffer:
            # Trailing text that was not closed by a delimiter.
            results.append(
                SyntacticUnit(
                    text="".join(buffer),
                    token=len(results),
                    index=index,
                    paragraph=paragraph,
                )
            )
        elif index == 0:
            # Nothing was emitted for this paragraph: keep the paragraph
            # counter dense by not advancing it.
            continue

        paragraph += 1

    return results
def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Pair each original unit with its filtered form, skipping emptied ones.

    Args:
        original_units: Sequence of unit texts.
        filtered_units: Sequence indexed in parallel with ``original_units``;
            entry ``i`` becomes the token of unit ``i``. A position whose
            entry equals ``''`` is dropped from the result. It is indexed by
            position, so it must be at least as long as ``original_units``.
        tags: Optional sequence indexed in parallel with ``original_units``.
            When truthy, ``tags[i][1]`` is passed as the tag of unit ``i``;
            otherwise every tag is ``None``.

    Returns:
        A list of objects created as ``SyntacticUnit(text, token, tag)``,
        each with its ``index`` attribute set to the unit's position in
        ``original_units`` (positions of dropped units are not reused).
    """
    units = []
    for i, text in enumerate(original_units):
        token = filtered_units[i]
        # Only the exact empty string marks a unit as filtered out; other
        # falsy tokens are kept.
        if token == '':
            continue
        tag = tags[i][1] if tags else None
        sentence = SyntacticUnit(text, token, tag)
        sentence.index = i
        units.append(sentence)
    return units
def insert_unit(
    target_list: List[SyntacticUnit],
    raw: List[str],
    tokens: List[str],
    pidx: int,
    sidx: int,
) -> None:
    """Build one ``SyntacticUnit`` and append it to ``target_list`` in place.

    Args:
        target_list: List that receives the new unit.
        raw: String pieces concatenated with no separator to form the
            unit's ``text``.
        tokens: Strings joined with a single space to form the unit's
            ``token``.
        pidx: Value passed as the unit's ``paragraph``.
        sidx: Value passed as the unit's ``index``.
    """
    unit = SyntacticUnit(
        text="".join(raw),
        token=" ".join(tokens),
        index=sidx,
        paragraph=pidx,
    )
    target_list.append(unit)