def fix_punct(conllu_string):
    """Run the ``FixPunct`` block over a CoNLL-U string and return the result.

    Args:
        conllu_string: Text that ``Document.from_conllu_string`` can load.

    Returns:
        The document serialized with ``to_conllu_string`` after
        ``FixPunct.process_document`` has been applied to it.
    """
    document = Document()
    document.from_conllu_string(conllu_string)
    FixPunct().process_document(document)
    return document.to_conllu_string()
def fix_punct(conllu_string):
    """Run the ``FixPunct`` block over a CoNLL-U string and return the result.

    Args:
        conllu_string: Text that ``Document.from_conllu_string`` can load.

    Returns:
        The document serialized with ``to_conllu_string`` after
        ``FixPunct.process_document`` has been applied, with every
        purely numeric ``# sent_id = N`` comment line removed.
    """
    document = Document()
    document.from_conllu_string(conllu_string)
    FixPunct().process_document(document)
    serialized = document.to_conllu_string()
    # Strip the numeric sent_id comment lines that udapi writes on output.
    # NOTE(review): a numeric sent_id already present in the input is
    # removed as well, since the pattern cannot tell the two apart.
    return re.sub(r'# sent_id = [0-9]+\n', r'', serialized)
def fix_punct(conllu_string):
    """Run ``FixPunct`` over a CoNLL-U string, shielding possessive apostrophes.

    Args:
        conllu_string: Text that ``Document.from_conllu_string`` can load.

    Returns:
        The processed document as a CoNLL-U string, with the apostrophe
        placeholder turned back into ``'`` and every purely numeric
        ``# sent_id = N`` comment line removed.
    """
    # Protect the possessive apostrophe from being treated as punctuation:
    # an apostrophe field followed by one more field and then the fields
    # PART and POS is swapped for a placeholder entity before processing.
    masked = re.sub(
        r"\t'\t([^\t\n]+\tPART\tPOS)",
        r'\t&udapi_apos;\t\1',
        conllu_string,
        flags=re.MULTILINE,
    )

    document = Document()
    document.from_conllu_string(masked)
    FixPunct().process_document(document)

    # Undo the masking on the serialized output.
    restored = document.to_conllu_string().replace('&udapi_apos;', "'")

    # Strip the numeric sent_id comment lines that udapi writes on output.
    return re.sub(r'# sent_id = [0-9]+\n', r'', restored)
def fix_punct(conllu_string):
    """Run ``FixPunct`` over a CoNLL-U string, shielding non-punctuation quotes.

    Two kinds of token are masked with placeholder entities before the
    document is processed and unmasked afterwards:

    * a possessive apostrophe: an apostrophe field followed by one more
      field and then the fields PART and POS;
    * a double-quote FORM whose DEPREL is not ``punct``.

    Args:
        conllu_string: Text that ``UdapiDocument.from_conllu_string`` can load.

    Returns:
        The processed document as a CoNLL-U string, with both placeholders
        restored and every purely numeric ``# sent_id = N`` comment line
        removed.
    """
    # Protect the possessive apostrophe from being treated as punctuation.
    # The pattern has no ^/$ anchors, so no MULTILINE flag is needed.
    conllu_string = re.sub(
        r"\t'\t([^\t\n]+\tPART\tPOS)",
        r'\t&udapi_apos;\t\1',
        conllu_string,
    )

    # Protect double quotes that are not attached as punctuation.
    # CoNLL-U columns are ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL...
    # The match is anchored at the start of the line so that only the FORM
    # column (directly after ID) can be masked. An unanchored pattern also
    # matches a '"' in the LEMMA column of an ordinary punct quote, because
    # the DEPREL test then lands on the first character of DEPS.
    # The lookahead skips LEMMA..HEAD (five fields) and rejects a DEPREL
    # starting with "punct"; testing only for a leading "p" would also
    # leave quotes with deprel "parataxis" unprotected.
    conllu_string = re.sub(
        r'^([^\t\n]+)\t"\t(?=(?:[^\t\n]+\t){5}(?!punct)[^\t\n])',
        r'\1\t&udapi_quot;\t',
        conllu_string,
        flags=re.MULTILINE,
    )

    doc = UdapiDocument()
    doc.from_conllu_string(conllu_string)
    FixPunct().process_document(doc)
    output_string = doc.to_conllu_string()

    # Undo the masking on the serialized output.
    output_string = output_string.replace('&udapi_apos;', "'")
    output_string = output_string.replace('&udapi_quot;', '"')

    # Strip the numeric sent_id comment lines that udapi writes on output.
    return re.sub(r'# sent_id = [0-9]+\n', r'', output_string)