Пример #1
0
def fix_punct(conllu_string):
	doc = Document()
	doc.from_conllu_string(conllu_string)
	fixpunct_block = FixPunct()
	fixpunct_block.process_document(doc)
	output_string = doc.to_conllu_string()
	return output_string
Пример #2
0
def fix_punct(conllu_string):
	doc = Document()
	doc.from_conllu_string(conllu_string)
	fixpunct_block = FixPunct()
	fixpunct_block.process_document(doc)
	output_string = doc.to_conllu_string()
	output_string = re.sub(r'# sent_id = [0-9]+\n',r'',output_string)  # remove udapi sent_id
	return output_string
Пример #3
0
def fix_punct(conllu_string):
	# Protect possessive apostrophe from being treated as punctuation
	conllu_string = re.sub(r"\t'\t([^\t\n]+\tPART\tPOS)", r'\t&udapi_apos;\t\1', conllu_string, flags=re.MULTILINE)  # remove udapi sent_id
	doc = Document()
	doc.from_conllu_string(conllu_string)
	fixpunct_block = FixPunct()
	fixpunct_block.process_document(doc)
	output_string = doc.to_conllu_string()
	output_string = output_string.replace('&udapi_apos;',"'")
	output_string = re.sub(r'# sent_id = [0-9]+\n',r'',output_string)  # remove udapi sent_id
	return output_string
Пример #4
0
def fix_punct(conllu_string):
    conllu_string = re.sub(r"\t'\t([^\t\n]+\tPART\tPOS)",
                           r'\t&udapi_apos;\t\1',
                           conllu_string,
                           flags=re.MULTILINE)
    conllu_string = re.sub(
        r'\t"\t([^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^p])',
        r'\t&udapi_quot;\t\1',
        conllu_string,
        flags=re.MULTILINE)
    doc = UdapiDocument()
    doc.from_conllu_string(conllu_string)
    fixpunct_block = FixPunct()
    fixpunct_block.process_document(doc)
    output_string = doc.to_conllu_string()
    output_string = output_string.replace('&udapi_apos;',
                                          "'").replace('&udapi_quot;', '"')
    output_string = re.sub(r'# sent_id = [0-9]+\n', r'',
                           output_string)  # remove udapi sent_id
    return output_string