Python FixPunct示例

编程语言: Python

命名空间/包名称: udapi.block.ud.fixpunct

类/类型: FixPunct

hotexamples.com的示例: 5

Python FixPunct - 共找到5个示例。以下是从开源项目中提取的、评价最高的 udapi.block.ud.fixpunct.FixPunct 真实 Python 代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

FixPunct(5)

process_document(4)

示例#1

显示文件

文件： stanford2ud.py 项目： scarydemon2/gum

def fix_punct(conllu_string):
	"""Apply the ``FixPunct`` block to CoNLL-U text and return the result.

	The text is loaded into a fresh ``Document``, processed in place by a
	new ``FixPunct`` instance, and serialized back to a CoNLL-U string.

	Args:
		conllu_string: the CoNLL-U text to process.

	Returns:
		The document serialized with ``to_conllu_string()`` after processing.
	"""
	document = Document()
	document.from_conllu_string(conllu_string)
	# The block modifies the document in place; nothing is returned from it.
	FixPunct().process_document(document)
	return document.to_conllu_string()

示例#2

显示文件

def fix_punct(conllu_string):
	"""Apply ``FixPunct`` to CoNLL-U text and strip numeric sent_id lines.

	Args:
		conllu_string: the CoNLL-U text to process.

	Returns:
		The processed document as a CoNLL-U string, with every
		``# sent_id = <digits>`` line (including its newline) removed.
	"""
	document = Document()
	document.from_conllu_string(conllu_string)
	FixPunct().process_document(document)
	serialized = document.to_conllu_string()
	# Drop the numeric "# sent_id = N" lines; the original author attributes
	# them to udapi's serialization.
	return re.sub(r'# sent_id = [0-9]+\n', r'', serialized)

示例#3

显示文件

def fix_punct(conllu_string):
	"""Apply ``FixPunct`` to CoNLL-U text while shielding possessive apostrophes.

	Steps:
	  1. Replace a ``'`` column that is followed by one more column and then
	     ``PART``/``POS`` with the placeholder ``&udapi_apos;`` so it is not
	     treated as punctuation.
	  2. Load the text into a ``Document`` and run ``FixPunct`` on it.
	  3. Serialize, turn the placeholder back into ``'``, and remove every
	     ``# sent_id = <digits>`` line.

	Args:
		conllu_string: the CoNLL-U text to process.

	Returns:
		The processed CoNLL-U string.
	"""
	# Protect possessive apostrophe from being treated as punctuation.
	masked_input = re.sub(
		r"\t'\t([^\t\n]+\tPART\tPOS)",
		r'\t&udapi_apos;\t\1',
		conllu_string,
		flags=re.MULTILINE,
	)

	document = Document()
	document.from_conllu_string(masked_input)
	FixPunct().process_document(document)

	# Undo the masking applied before parsing.
	restored = document.to_conllu_string().replace('&udapi_apos;', "'")
	# Remove the numeric sent_id comment lines (attributed to udapi by the
	# original author).
	return re.sub(r'# sent_id = [0-9]+\n', r'', restored)

示例#4

显示文件

文件： google2ud.py 项目： dan-zeman/udapi-python

    def __init__(self, lang='unk', non_mwt_langs='ar en ja ko zh', **kwargs):
        """Create the Google2ud block instance.

        See ``Convert1to2`` for all the args.

        Args:
            lang: language code; selects which optional helper blocks are
                instantiated (the others are left as ``None``).
            non_mwt_langs: space-separated language codes for which
                ``ComplyWithText`` is created with ``prefer_mwt=False``.
            **kwargs: passed through unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.lang = lang

        # Language-specific AddMwt block; languages without one get None.
        addmwt_by_lang = {
            'de': de_AddMwt,
            'es': es_AddMwt,
            'fr': fr_AddMwt,
            'pt': pt_AddMwt,
        }
        addmwt_class = addmwt_by_lang.get(lang)
        self._addmwt_block = None if addmwt_class is None else addmwt_class()

        # NOTE: the attribute name is spelled "_fixrigheaded_block" (sic);
        # it is kept as-is because code outside this method may read it.
        rightheaded_langs = {'ar', 'de', 'en', 'fr', 'hi', 'ru', 'th', 'tr', 'zh'}
        self._fixrigheaded_block = (
            FixRightheaded() if lang in rightheaded_langs else None
        )

        # Normalize the attachment of punctuation for all languages.
        self._fixpunct_block = FixPunct()

        self._fixchain_block = FixChain() if lang in {'pt', 'ru'} else None

        # UD_English v2.0 still uses "do n't" with SpaceAfter=No,
        # instead of annotating it as a multiword token.
        # In several other languages it is also common
        # that syntactic words are not separated with a space without being an MWT.
        prefer_mwt = lang not in non_mwt_langs.split()
        self._comply_block = ComplyWithText(prefer_mwt=prefer_mwt)

示例#5

显示文件

文件： dep_parser.py 项目： gucorpling/amalgum

def fix_punct(conllu_string):
    """Apply ``FixPunct`` to CoNLL-U text while masking selected quote tokens.

    Two kinds of columns are swapped for placeholders before parsing and
    restored afterwards:

    * ``'`` followed by one more column and then ``PART``/``POS`` becomes
      ``&udapi_apos;``;
    * ``"`` whose sixth following column does not start with ``p`` becomes
      ``&udapi_quot;`` (presumably the DEPREL column, i.e. quotes that are
      not attached as ``punct`` -- confirm against the input layout).

    After processing, every ``# sent_id = <digits>`` line is removed.

    Args:
        conllu_string: the CoNLL-U text to process.

    Returns:
        The processed CoNLL-U string.
    """
    apostrophe_pattern = r"\t'\t([^\t\n]+\tPART\tPOS)"
    quote_pattern = (
        r'\t"\t([^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^p])'
    )

    # Mask the tokens that FixPunct should not touch.
    masked = re.sub(apostrophe_pattern, r'\t&udapi_apos;\t\1',
                    conllu_string, flags=re.MULTILINE)
    masked = re.sub(quote_pattern, r'\t&udapi_quot;\t\1',
                    masked, flags=re.MULTILINE)

    document = UdapiDocument()
    document.from_conllu_string(masked)
    FixPunct().process_document(document)

    # Restore the original characters in place of the placeholders.
    restored = document.to_conllu_string()
    restored = restored.replace('&udapi_apos;', "'")
    restored = restored.replace('&udapi_quot;', '"')

    # Remove the numeric sent_id comment lines (attributed to udapi by the
    # original author).
    return re.sub(r'# sent_id = [0-9]+\n', r'', restored)