def fix_punct(conllu_string):
    """Apply the ``FixPunct`` block to CoNLL-U text and return the result.

    Args:
        conllu_string: CoNLL-U content, loaded through
            ``Document.from_conllu_string``.

    Returns:
        The processed document serialized by ``Document.to_conllu_string``.
    """
    document = Document()
    document.from_conllu_string(conllu_string)
    # The block processes the document through process_document(); only the
    # serialized form is handed back to the caller.
    FixPunct().process_document(document)
    return document.to_conllu_string()
def fix_punct(conllu_string):
    """Apply the ``FixPunct`` block to CoNLL-U text and return the result.

    Args:
        conllu_string: CoNLL-U content, loaded through
            ``Document.from_conllu_string``.

    Returns:
        The processed document serialized back to CoNLL-U, with every
        ``# sent_id = <digits>`` line removed.
    """
    document = Document()
    document.from_conllu_string(conllu_string)
    FixPunct().process_document(document)
    serialized = document.to_conllu_string()
    # Strip the purely numeric sent_id comment lines (the udapi sent_id)
    # so they do not appear in the returned text.
    return re.sub(r'# sent_id = [0-9]+\n', r'', serialized)
def fix_punct(conllu_string):
    """Apply the ``FixPunct`` block to CoNLL-U text, sparing possessive apostrophes.

    Args:
        conllu_string: CoNLL-U content, loaded through
            ``Document.from_conllu_string``.

    Returns:
        The processed document serialized back to CoNLL-U, with the
        apostrophes restored and every ``# sent_id = <digits>`` line removed.
    """
    # Hide possessive apostrophes behind a placeholder so they are not treated
    # as punctuation. The pattern matches a lone "'" field followed, one field
    # later, by PART and POS (in CoNLL-U column order: FORM, LEMMA, UPOS, XPOS).
    protected = re.sub(
        r"\t'\t([^\t\n]+\tPART\tPOS)",
        r'\t&udapi_apos;\t\1',
        conllu_string,
        flags=re.MULTILINE,
    )

    document = Document()
    document.from_conllu_string(protected)
    FixPunct().process_document(document)

    # Put the real apostrophes back in place of the placeholder.
    restored = document.to_conllu_string().replace('&udapi_apos;', "'")
    # Strip the purely numeric sent_id comment lines (the udapi sent_id).
    return re.sub(r'# sent_id = [0-9]+\n', r'', restored)
def __init__(self, lang='unk', non_mwt_langs='ar en ja ko zh', **kwargs):
    """Initialize the Google2ud block and the helper blocks it delegates to.

    Args:
        lang: Language code; decides which language-specific helper blocks
            are instantiated.
        non_mwt_langs: Space-separated language codes for which
            ``ComplyWithText`` is created with ``prefer_mwt=False``.
        **kwargs: Passed unchanged to the parent constructor
            (see ``Convert1to2`` for the accepted arguments).
    """
    super().__init__(**kwargs)
    self.lang = lang

    # Only these languages have a dedicated AddMwt block; others get None.
    addmwt_by_lang = {
        'de': de_AddMwt,
        'es': es_AddMwt,
        'fr': fr_AddMwt,
        'pt': pt_AddMwt,
    }
    addmwt_cls = addmwt_by_lang.get(lang)
    self._addmwt_block = addmwt_cls() if addmwt_cls is not None else None

    rightheaded_langs = {'ar', 'de', 'en', 'fr', 'hi', 'ru', 'th', 'tr', 'zh'}
    self._fixrigheaded_block = (
        FixRightheaded() if lang in rightheaded_langs else None
    )

    # Punctuation attachment is normalized for every language.
    self._fixpunct_block = FixPunct()

    self._fixchain_block = FixChain() if lang in {'pt', 'ru'} else None

    # UD_English v2.0 still writes "do n't" with SpaceAfter=No rather than
    # as a multiword token, and several other languages likewise have
    # syntactic words joined without a space that are not MWTs. For the
    # languages listed in non_mwt_langs, MWTs are therefore not preferred.
    wants_mwt = lang not in non_mwt_langs.split()
    self._comply_block = ComplyWithText(prefer_mwt=wants_mwt)
def fix_punct(conllu_string):
    """Apply the ``FixPunct`` block to CoNLL-U text, sparing selected quote marks.

    Some apostrophes and double quotes are swapped for placeholders before
    processing and swapped back afterwards.

    Args:
        conllu_string: CoNLL-U content, loaded through
            ``UdapiDocument.from_conllu_string``.

    Returns:
        The processed document serialized back to CoNLL-U, with both
        placeholders restored and every ``# sent_id = <digits>`` line removed.
    """
    # A lone "'" field followed, one field later, by PART and POS (in CoNLL-U
    # column order: FORM, LEMMA, UPOS, XPOS) is replaced by a placeholder.
    masked = re.sub(
        r"\t'\t([^\t\n]+\tPART\tPOS)",
        r'\t&udapi_apos;\t\1',
        conllu_string,
        flags=re.MULTILINE,
    )
    # A lone '"' field is replaced when the sixth field after it does not start
    # with "p". In CoNLL-U column order that field is DEPREL, so this
    # presumably targets quotes not attached as punct -- TODO confirm.
    masked = re.sub(
        r'\t"\t([^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^p])',
        r'\t&udapi_quot;\t\1',
        masked,
        flags=re.MULTILINE,
    )

    document = UdapiDocument()
    document.from_conllu_string(masked)
    FixPunct().process_document(document)

    # Undo the masking in the same order it is undone originally.
    restored = document.to_conllu_string()
    restored = restored.replace('&udapi_apos;', "'")
    restored = restored.replace('&udapi_quot;', '"')
    # Strip the purely numeric sent_id comment lines (the udapi sent_id).
    return re.sub(r'# sent_id = [0-9]+\n', r'', restored)