示例#1
0
def fix_punct(conllu_string):
	"""Run the FixPunct block over a CoNLL-U string and return the result.

	Args:
		conllu_string: CoNLL-U text to load into a fresh ``Document``.

	Returns:
		The document serialized back to a CoNLL-U string after
		``FixPunct.process_document`` has been applied to it.
	"""
	document = Document()
	document.from_conllu_string(conllu_string)
	# The block works on the document in place; its return value is unused.
	FixPunct().process_document(document)
	return document.to_conllu_string()
示例#2
0
def fix_punct(conllu_string):
	"""Run the FixPunct block over a CoNLL-U string and drop numeric sent_id lines.

	Args:
		conllu_string: CoNLL-U text to load into a fresh ``Document``.

	Returns:
		The re-serialized CoNLL-U string with every
		``# sent_id = <digits>`` line removed.
	"""
	document = Document()
	document.from_conllu_string(conllu_string)
	# The block works on the document in place; its return value is unused.
	FixPunct().process_document(document)
	# Strip the numeric sent_id comments (attributed to udapi by the original
	# author). The pattern is not anchored, so any occurrence of the text is cut.
	return re.sub(r'# sent_id = [0-9]+\n', r'', document.to_conllu_string())
示例#3
0
def fix_punct(conllu_string):
	"""Run FixPunct over a CoNLL-U string while shielding possessive apostrophes.

	Args:
		conllu_string: CoNLL-U text to process.

	Returns:
		The processed CoNLL-U string, with the apostrophe placeholder turned
		back into ``'`` and every ``# sent_id = <digits>`` line removed.
	"""
	# Protect the possessive apostrophe from being treated as punctuation:
	# a lone "'" field followed by one more field and then "PART", "POS"
	# is swapped for a placeholder before FixPunct sees it.
	shielded = re.sub(
		r"\t'\t([^\t\n]+\tPART\tPOS)",
		r'\t&udapi_apos;\t\1',
		conllu_string,
		flags=re.MULTILINE,
	)
	document = Document()
	document.from_conllu_string(shielded)
	# The block works on the document in place; its return value is unused.
	FixPunct().process_document(document)
	# Restore the apostrophe; every occurrence of the placeholder is replaced.
	restored = document.to_conllu_string().replace('&udapi_apos;', "'")
	# Strip the numeric sent_id comments (attributed to udapi by the original author).
	return re.sub(r'# sent_id = [0-9]+\n', r'', restored)
示例#4
0
    def __init__(self, lang='unk', non_mwt_langs='ar en ja ko zh', **kwargs):
        """Create the Google2ud block instance.

        Args:
            lang: language code that selects which optional helper blocks
                are instantiated.
            non_mwt_langs: space-separated language codes for which
                ``ComplyWithText`` is created with ``prefer_mwt=False``.
            **kwargs: forwarded unchanged to the parent constructor;
                see ``Convert1to2`` for all the args.
        """
        super().__init__(**kwargs)
        self.lang = lang

        # Only these languages get a language-specific AddMwt block;
        # for every other language the attribute stays None.
        addmwt_classes = {
            'de': de_AddMwt,
            'es': es_AddMwt,
            'fr': fr_AddMwt,
            'pt': pt_AddMwt,
        }
        addmwt_class = addmwt_classes.get(lang)
        self._addmwt_block = addmwt_class() if addmwt_class is not None else None

        # NOTE: the attribute is spelled "_fixrigheaded_block" (sic); the name
        # is kept as-is because code outside this method may refer to it.
        needs_rightheaded_fix = lang in {'ar', 'de', 'en', 'fr', 'hi', 'ru', 'th', 'tr', 'zh'}
        self._fixrigheaded_block = FixRightheaded() if needs_rightheaded_fix else None

        # Normalize the attachment of punctuation for all languages.
        self._fixpunct_block = FixPunct()

        self._fixchain_block = FixChain() if lang in {'pt', 'ru'} else None

        # UD_English v2.0 still uses "do n't" with SpaceAfter=No,
        # instead of annotating it as a multiword token.
        # In several other languages it is also common
        # that syntactic words are not separated with a space without being an MWT.
        prefer_mwt = lang not in non_mwt_langs.split()
        self._comply_block = ComplyWithText(prefer_mwt=prefer_mwt)
示例#5
0
def fix_punct(conllu_string):
    """Run FixPunct over a CoNLL-U string while shielding selected quote marks.

    Args:
        conllu_string: CoNLL-U text to process.

    Returns:
        The processed CoNLL-U string, with both placeholders turned back
        into ``'`` and ``"`` and every ``# sent_id = <digits>`` line removed.
    """
    # A lone "'" field followed by one more field and then "PART", "POS"
    # is hidden behind a placeholder (presumably the possessive apostrophe,
    # so that it is not handled as punctuation -- confirm).
    apostrophe_pattern = r"\t'\t([^\t\n]+\tPART\tPOS)"
    shielded = re.sub(apostrophe_pattern,
                      r'\t&udapi_apos;\t\1',
                      conllu_string,
                      flags=re.MULTILINE)

    # A lone '"' field followed by five more fields and then a character
    # other than "p" is hidden as well (presumably a double quote whose
    # DEPREL does not start with "p", i.e. is not "punct" -- confirm).
    quote_pattern = (
        r'\t"\t([^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^p])')
    shielded = re.sub(quote_pattern,
                      r'\t&udapi_quot;\t\1',
                      shielded,
                      flags=re.MULTILINE)

    document = UdapiDocument()
    document.from_conllu_string(shielded)
    # The block works on the document in place; its return value is unused.
    FixPunct().process_document(document)

    # Restore the original characters; every placeholder occurrence is replaced.
    restored = document.to_conllu_string()
    restored = restored.replace('&udapi_apos;', "'")
    restored = restored.replace('&udapi_quot;', '"')

    # Strip the numeric sent_id comments (attributed to udapi by the original author).
    return re.sub(r'# sent_id = [0-9]+\n', r'', restored)