def format_lexc_segments(wordmap):
    """Format lexc entry lines for the segmentation output format.

    Marks the escaped stub with a {STUB} boundary tag on the analysis side
    and emits one entry per continuation paradigm, joined by newlines.
    """
    escaped_stub = lexc_escape(wordmap['stub'])
    wordmap['analysis'] = escaped_stub + '{STUB}'
    entries = ["%s:%s\t%s\t;" % (wordmap['analysis'], escaped_stub, new_para)
               for new_para in wordmap['new_paras']]
    return "\n".join(entries)
def format_continuation_lexc_omor(anals, surf, cont, format):
    """Format one lexc continuation-class entry in the omor tag style."""
    omorstring = ''
    if 'DIGITS_' in cont and not ('BACK' in cont or 'FRONT' in cont):
        # digit stems repeat the surface in the analysis; close the open
        # WORD_ID bracket unless this is the lemma-initial entry
        omorstring = lexc_escape(surf)
        if anals and anals != 'LEMMA-START':
            omorstring += ']'
    # Collapse derivation tags to the corresponding participle tags so the
    # participles get full inflection:
    #   DRV=NUT/TU -> PCP=NUT, DRV=VA/TAVA -> PCP=VA,
    #   DRV=MA -> PCP=AGENT, DRV=MATON -> PCP=NEG
    derivation_to_participle = {
        'Dnut': 'Vact|Cnut',
        'Dtu': 'Vpss|Cnut',
        'Dva': 'Vact|Cva',
        'Dtava': 'Vpss|Cva',
        'Dma': 'Cma',
        'Dmaton': 'Cmaton',
    }
    if anals in derivation_to_participle:
        anals = derivation_to_participle[anals]
    elif any(pcp in anals for pcp in ('Cnut', 'Cva', 'Cma', 'Cmaton')) and \
            (anals.endswith('Npl') or anals.endswith('Nsg')):
        anals += '|Xnom'
    for tag in anals.split('|'):
        omorstring += format_tag_omor(tag, format)
    surf = lexc_escape(surf)
    return "%s:%s\t%s ;\n" % (omorstring, surf, cont)
def format_continuation_lexc_ftb3(anals, surf, cont):
    """Format one lexc continuation-class entry in FTB3 tag style."""
    ftbstring = format_analysis_lexc_ftb3(anals)
    if 'COMPOUND' in cont:
        # XXX: there was += before
        # compound continuations show the boundary-stripped surface instead
        # of the tag string
        ftbstring = surf.replace(morph_boundary, '').replace(deriv_boundary, '')
    elif 'NUM_' in cont and ('BACK' in cont or 'FRONT' in cont and not ('C**T' in cont or 'POSS' in cont)):
        # NOTE(review): precedence parses this as
        #   'BACK' in cont or ('FRONT' in cont and not (...))
        # so the C**T/POSS exclusion never applies to BACK classes —
        # confirm whether ('BACK' or 'FRONT') and not (...) was intended.
        ftbstring += surf.replace(morph_boundary, '').replace(deriv_boundary, '')
    elif 'DIGITS_' in cont and not ('BACK' in cont or 'FRONT' in cont):
        # digit stems prepend the escaped surface to the analysis
        ftbstring = lexc_escape(surf) + ftbstring
    surf = lexc_escape(surf)
    return "%s:%s\t%s ;\n" %(ftbstring, surf, cont)
def format_tag_apertium(stuff):
    """Map an internal analysis tag to its apertium/monodix lexc form.

    Returns '' for an empty tag, passthrough symbols unchanged, '+'-prefixed
    tags escaped with a trailing %>, and everything else wrapped in %<...%>.
    Unknown tags are reported via fail_formatting_missing_for and yield ''.
    """
    if len(stuff) == 0:
        return ""
    if stuff not in stuff2monodix:
        fail_formatting_missing_for(stuff, "apertium")
        return ""
    monodix_tag = stuff2monodix[stuff]
    if monodix_tag in ['+', '-', '#', '0', '']:
        return monodix_tag
    if monodix_tag.startswith('+'):
        return lexc_escape(monodix_tag) + '%>'
    return '%<' + lexc_escape(monodix_tag) + '%>'
def format_lexc_ftb3(wordmap, format):
    '''
    format string for canonical ftb3 format for morphological analysis

    Builds the analysis string from the bracketstub plus FTB3 tags for the
    word class, subcategories, proper-noun and symbol features, then emits
    one lexc entry per continuation paradigm (with an extra padded variant
    for dash/parenthesis lemmas). Exits the process on an unknown POS.
    '''
    if wordmap['stub'] == ' ':
        # do not include normal white space for now
        return ""
    wordmap['stub'] = lexc_escape(wordmap['stub'])
    wordmap['analysis'] = "%s" % (lexc_escape(wordmap['bracketstub'].replace(word_boundary, '#') + '←<Del>'))
    if (wordmap['pos'] == 'ACRONYM' and (len(wordmap['stub']) == 1 and not wordmap['stub'].isalpha())) or wordmap['stub'] == '§§':
        wordmap['analysis'] += format_tag_ftb3('PUNCTUATION')
    elif wordmap['pos'] in ['NOUN', 'VERB', 'ADJECTIVE', 'PRONOUN', 'NUMERAL', 'ACRONYM', 'PUNCTUATION']:
        wordmap['analysis'] += format_tag_ftb3(wordmap['pos'])
    elif wordmap['pos'] == 'CONJUNCTIONVERB':
        # fused negation verbs split into conjunction + negative tags
        if wordmap['lemma'] == 'eikä':
            wordmap['lemma'] = 'ei'
            wordmap['analysis'] = format_tag_ftb3('COORDINATING') + \
                format_tag_ftb3('Nneg')
        else:
            wordmap['analysis'] = format_tag_ftb3('ADVERBIAL') + \
                format_tag_ftb3('Nneg')
    elif wordmap['particle']:
        for pclass in wordmap['particle'].split('|'):
            wordmap['analysis'] += format_tag_ftb3(pclass)
    else:
        print("not in FTB3 known poses or particle!\n", wordmap)
        exit(1)
    if wordmap['subcat']:
        if 'PERSONAL' in wordmap['subcat']:
            wordmap['subcat'] = 'PERSONAL'
        for subcat in wordmap['subcat'].split('|'):
            wordmap['analysis'] += format_tag_ftb3(subcat)
    if wordmap['is_proper']:
        wordmap['analysis'] += format_tag_ftb3('PROPER')
    if wordmap['symbol']:
        for subcat in wordmap['symbol'].split('|'):
            wordmap['analysis'] += format_tag_ftb3(subcat)
    # BUG FIX: str.replace returns a new string; the previous code discarded
    # the result, so en/em dash lemmas never received their specific tags.
    if wordmap['lemma'] == '–':
        wordmap['analysis'] = wordmap['analysis'].replace('Dash', 'EnDash')
    if wordmap['lemma'] == '—':
        wordmap['analysis'] = wordmap['analysis'].replace('Dash', 'EmDash')
    lex_stub = wordmap['stub']
    retvals = []
    for new_para in wordmap['new_paras']:
        retvals += ["%s:%s\t%s\t;" % (wordmap['analysis'], lex_stub, new_para)]
        if wordmap['lemma'] in ['-', '–', '—', '(']:
            # extra space-padded variant for dash- and parenthesis-like lemmas
            retvals += ["%s%% %%>%%>%%>:%s\t%s\t;" % (wordmap['analysis'], lex_stub, new_para)]
    return "\n".join(retvals)
def format_lexc_apertium(wordmap):
    """Format lexc entry lines in the apertium (monodix) tag style.

    Mutates wordmap in place (analysis, stub, possibly lemma) and returns
    one entry line per continuation paradigm as a single string.
    """
    wordmap['analysis'] = lexc_escape(wordmap['lemma'])
    # word boundaries become '+' in monodix lemmas; weak boundaries vanish
    wordmap['analysis'] = wordmap['analysis'].replace(word_boundary, '+').replace(weak_boundary, '')
    if wordmap['is_suffix']:
        wordmap['analysis'] = "+" + wordmap['analysis']
    elif wordmap['is_prefix']:
        wordmap['analysis'] += "+"
    if wordmap['pos'] == 'NOUN':
        if wordmap['is_proper']:
            wordmap['analysis'] += '%<np%>'
            for pc in wordmap['proper_noun_class'].split(','):
                wordmap['analysis'] += format_tag_apertium(pc)
        else:
            wordmap['analysis'] += '%<n%>'
    elif wordmap['pos'] == 'VERB':
        # verbs with an argument frame get a <..._arg> tag instead of plain vb
        if wordmap['argument']:
            wordmap['analysis'] += format_tag_apertium(wordmap['argument'] + '_arg')
        else:
            wordmap['analysis'] += format_tag_apertium(wordmap['pos'])
    elif wordmap['pos'] == 'CONJUNCTIONVERB':
        # fused negation verbs are split into conjunction + negation lexemes
        if wordmap['lemma'] == 'eikä':
            wordmap['lemma'] = 'ei'
            wordmap['analysis'] = 'ja' + \
                format_tag_apertium('COORDINATING') + \
                '+ei' + \
                format_tag_apertium('Nneg')
        else:
            wordmap['analysis'] = wordmap['lemma'][:-2] +\
                format_tag_apertium('ADVERBIAL') + \
                '+' + wordmap['lemma'][-2:] + \
                format_tag_apertium('Nneg')
    elif wordmap['particle']:
        for pclass in wordmap['particle'].split('|'):
            wordmap['analysis'] += format_tag_apertium(pclass)
    else:
        wordmap['analysis'] += format_tag_apertium(wordmap['pos'])
    if wordmap['subcat']:
        for subcat in wordmap['subcat'].split('|'):
            wordmap['analysis'] += format_tag_apertium(subcat)
    if wordmap['symbol']:
        for subcat in wordmap['symbol'].split('|'):
            wordmap['analysis'] += format_tag_apertium(subcat)
    retvals = ""
    # stub side: word boundaries become optional hyphens before escaping
    wordmap['stub'] = wordmap['stub'].replace(word_boundary, optional_hyphen)
    wordmap['stub'] = lexc_escape(wordmap['stub'])
    for new_para in wordmap['new_paras']:
        retvals += "%s:%s\t%s\t;\n" %(wordmap['analysis'], wordmap['stub'], new_para)
    return retvals
def format_continuation_lexc_google(anals, surf, cont):
    """Format one lexc continuation-class entry in google universal pos style."""
    analysis = format_analysis_lexc_google(anals)
    if 'COMPOUND' in cont:
        # compound continuations use the boundary-stripped surface as analysis
        analysis = surf.replace(morph_boundary, '').replace(deriv_boundary, '')
    if surf != '0':
        surf = lexc_escape(surf)
    return "%s:%s\t%s ;\n" % (analysis, surf, cont)
def format_lexc_google(wordmap):
    '''
    format string for canonical google universal pos format for
    morphological analysis
    '''
    if wordmap['stub'] == ' ':
        # do not include normal white space for now
        return ""
    wordmap['stub'] = lexc_escape(wordmap['stub'])
    prefix = "%s" % (lexc_escape(wordmap['bracketstub'].replace(word_boundary, '#') + '←<Del>'))
    tags = [format_tag_google(wordmap['pos'])]
    if wordmap['particle']:
        tags.extend(format_tag_google(pclass)
                    for pclass in wordmap['particle'].split('|'))
    if wordmap['subcat']:
        tags.extend(format_tag_google(subcat)
                    for subcat in wordmap['subcat'].split('|'))
    if wordmap['is_proper']:
        tags.append(format_tag_google('PROPER'))
    wordmap['analysis'] = prefix + ''.join(tags)
    lex_stub = wordmap['stub']
    return "\n".join("%s:%s\t%s\t;" % (wordmap['analysis'], lex_stub, new_para)
                     for new_para in wordmap['new_paras'])
def format_multichars_lexc_apertium():
    """Emit lexc multichar-symbol declarations for the apertium tag set.

    Skips passthrough symbols ('', '+', '-', '#', '0') and composite
    entries containing '><'.
    """
    lines = ["!! Apertium standard tags:"]
    for mcs in apertium_multichars:
        if '><' not in mcs and mcs not in ['', '+', '-', '#', '0']:
            lines.append('%<' + lexc_escape(mcs) + '%>')
    return "\n".join(lines) + "\n"
def format_continuation_lexc_apertium(anals, surf, cont):
    """Format one lexc continuation-class entry in apertium tag style."""
    escaped_surf = lexc_escape(surf)
    analysis = format_analysis_lexc_apertium(anals)
    # digit continuations repeat the escaped surface on the analysis side,
    # except for the harmony-specific (BACK/FRONT) classes
    if 'DIGITS_' in cont and not ('BACK' in cont or 'FRONT' in cont):
        analysis = escaped_surf + analysis
    return "%s:%s\t%s ;\n" % (analysis, escaped_surf, cont)
def format_lexc_omor(wordmap, format):
    '''
    format string for canonical omor format for morphological analysis

    Builds a [WORD_ID=...] analysis string with omor tags selected by the
    feature flags in *format* (+propers, +semantics, +ktnkav, +newparas),
    mutating wordmap in place, and returns one lexc entry per continuation
    paradigm joined by newlines.
    '''
    if wordmap['stub'] == ' ':
        # do not include normal white space for now
        return ""
    wordmap['stub'] = lexc_escape(wordmap['stub'])
    wordmap['analysis'] = "[WORD_ID=%s]" %(lexc_escape(wordmap['lemma']))
    wordmap['particle'] = wordmap['particle'].replace('QUALIFIER', 'ADJECTIVE')
    # adjective-like particles carry their class tag instead of a POS tag
    if wordmap['pos'] != 'PARTICLE' or not wordmap['particle'].startswith('AD'):
        wordmap['analysis'] += format_tag_omor(wordmap['pos'], format)
    if wordmap['is_suffix']:
        wordmap['analysis'] += format_tag_omor('SUFFIX', format)
    if wordmap['is_prefix']:
        wordmap['analysis'] += format_tag_omor('PREFIX', format)
    if wordmap['pos'] == 'ADJECTIVE':
        # adjectives default to positive comparison
        wordmap['analysis'] += format_tag_omor('Cpos', format)
    if wordmap['particle']:
        for pclass in wordmap['particle'].split('|'):
            wordmap['analysis'] += format_tag_omor(pclass, format)
    if wordmap['symbol']:
        for subcat in wordmap['symbol'].split('|'):
            wordmap['analysis'] += format_tag_omor(subcat, format)
    if wordmap['subcat']:
        for subcat in wordmap['subcat'].split('|'):
            wordmap['analysis'] += format_tag_omor(subcat, format)
    if wordmap['is_proper']:
        if '+propers' in format and wordmap['proper_noun_class']:
            for prop in wordmap['proper_noun_class'].split(','):
                wordmap['analysis'] += format_tag_omor(prop, format)
        else:
            wordmap['analysis'] += format_tag_omor('PROPER', format)
    if '+semantics' in format and wordmap['sem']:
        for sem in wordmap['sem'].split(','):
            wordmap['analysis'] += format_tag_omor(sem, format)
    if wordmap['style']:
        wordmap['analysis'] += format_tag_omor(wordmap['style'], format)
    if '+ktnkav' in format and wordmap['pos'] != 'ACRONYM':
        # Kotus inflection class / gradation pattern tags
        tag = "[KTN=%s]" %(lexc_escape(wordmap['kotus_tn']))
        if tag in ktnkav_multichars:
            wordmap['analysis'] += tag
            # NOTE(review): KAV is emitted only when the KTN tag was
            # accepted — confirm this nesting against upstream history.
            if wordmap['kotus_av']:
                wordmap['analysis'] += "[KAV=%(kotus_av)s]" %(wordmap)
    elif '+newparas' in format:
        for new_para in wordmap['new_paras']:
            wordmap['analysis'] += "[NEWPARA=%s]" %(new_para)
    # match WORD_ID= with epsilon, then stub and lemma might match
    lex_stub = '0' + wordmap['stub']
    retvals = []
    for new_para in wordmap['new_paras']:
        retvals += ["%s:%s\t%s\t;" %(wordmap['analysis'], lex_stub, new_para)]
    return "\n".join(retvals)
def format_continuation_lexc_segments(anals, surf, cont):
    """Format one lexc continuation-class entry for the segmentation format.

    The analysis side restores word boundaries from optional hyphens; the
    surface side keeps the escaped string untouched.
    """
    escaped = lexc_escape(surf)
    analysis_side = escaped.replace(optional_hyphen, word_boundary)
    return "%s:%s\t%s ; \n" % (analysis_side, escaped, cont)