class EnglishUralicNLPMorphologicalRealizer(LanguageSpecificMorphologicalRealizer):
    def __init__(self):
        super().__init__("en")

        self.case_map: Dict[str, str] = {"genitive": "GEN"}

    def realize(self, slot: Slot) -> str:
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to English".format(slot.value))

        case = self.case_map.get(case.lower(), case.upper())
        log.debug("Normalized case {} to {}".format(slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "eng")
        log.debug("Identified {} possible analyses".format(len(possible_analyses)))
        if len(possible_analyses) == 0:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute".format(slot.value)
            )
            return slot.value

        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(analysis, slot.value))

        analysis = "{}+{}".format(analysis, case)
        log.debug("Modified analysis to {}".format(analysis))

        modified_value = uralicApi.generate(analysis, "eng")[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
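The realize() method above boils down to an analyze/retag/generate round trip against uralicNLP's "eng" transducer. A minimal standalone sketch of that flow, assuming the English models have been downloaded and using an illustrative word and case tag:

from uralicNLP import uralicApi

analyses = uralicApi.analyze("dog", "eng")       # list of (analysis, weight) pairs
if analyses:
    analysis = analyses[0][0]                    # first analysis string; exact tags depend on the model
    query = "{}+{}".format(analysis, "GEN")      # append the normalized case tag
    forms = uralicApi.generate(query, "eng")     # may be empty if the tag combination is invalid
    realized = forms[0][0] if forms else "dog"   # fall back to the original value, as realize() does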
async def generate(ctx, arg):
    response = []
    lines = uralicApi.generate(arg, "fin")
    if len(lines) > 0:
        for line in lines:
            response.append(line[0])
        response = '\n'.join(response)
        await ctx.send(response)
    else:
        await ctx.send('generation not possible')
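For context, a sketch of how a command like this could be registered with discord.py; the prefix, intents, and token handling are illustrative assumptions, not part of the original bot:

import discord
from discord.ext import commands
from uralicNLP import uralicApi

bot = commands.Bot(command_prefix="!", intents=discord.Intents.default())

@bot.command()
async def generate(ctx, arg):
    # e.g. invoked as: !generate koira+N+Sg+Gen
    lines = uralicApi.generate(arg, "fin")
    if len(lines) > 0:
        await ctx.send("\n".join(line[0] for line in lines))
    else:
        await ctx.send('generation not possible')

# bot.run(TOKEN)  # TOKEN supplied by the bot owner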
def create_first_verse(verb_candidates, probability_distribution, lemmas, verse):
    '''
    Picks a verb from the candidates and then creates and outputs the first verse of the stanza
    '''
    draw = choice(verb_candidates, 1, p=probability_distribution)
    verb = uralicApi.generate(draw[0] + "+V+Act+Ind+Prs+Sg3", "fin")[0][0]
    verse = " ".join(verse)
    # print('verse: ' + verse)
    # verse += " " + verb
    verse = verse.replace(
        verse.split(" ")[0], verse.split(" ")[0] + " " + verb + " ")
    return fix_syllables(verse, False)
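The choice() call above presumably comes from numpy.random; a minimal sketch of the same draw-and-inflect step with made-up candidates and probabilities:

from numpy.random import choice
from uralicNLP import uralicApi

verb_candidates = ["juosta", "laulaa"]        # illustrative verb lemmas
probability_distribution = [0.5, 0.5]         # illustrative probabilities summing to 1

draw = choice(verb_candidates, 1, p=probability_distribution)
verb = uralicApi.generate(draw[0] + "+V+Act+Ind+Prs+Sg3", "fin")[0][0]  # e.g. "juoksee"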
class FinnishUralicNLPMorphologicalRealizer(LanguageSpecificMorphologicalRealizer):
    def __init__(self):
        super().__init__("fi")

        self.case_map: Dict[str, str] = {
            "ssa": "Ine",
            "ssä": "Ine",
            "inessive": "Ine",
            "genitive": "Gen"
        }

    def realize(self, slot: Slot) -> str:
        case: Optional[str] = slot.attributes.get("case")
        if case is None:
            return slot.value

        log.debug("Realizing {} to Finnish".format(slot.value))

        case = self.case_map.get(case.lower(), case.capitalize())
        log.debug("Normalized case {} to {}".format(slot.attributes.get("case"), case))

        possible_analyses = uralicApi.analyze(slot.value, "fin")
        log.debug("Identified {} possible analyses".format(len(possible_analyses)))
        if len(possible_analyses) == 0:
            log.warning(
                "No valid morphological analysis for {}, unable to realize despite case attribute".format(slot.value))
            return slot.value

        analysis = possible_analyses[0][0]
        log.debug("Picked {} as the morphological analysis of {}".format(analysis, slot.value))

        # We only want to replace the last occurrence of "Nom", as otherwise all parts of compound words, rather than
        # only the last, get transformed to genitive. This is simply wrong for, e.g., "tyvipari". Simply doing a global
        # replacement results in *"tyvenparin", rather than "tyviparin". Unfortunately, Python lacks a replace() which
        # starts from the right, so we need to identify the correct instance of "Nom" with rfind() and then manually
        # fiddle with slices.
        gen_start_idx = analysis.rfind("Nom")
        analysis = analysis[:gen_start_idx] + "Gen" + analysis[gen_start_idx + 4:]  # 4 = 1 + len("Nom")
        log.debug("Modified analysis to {}".format(analysis))

        modified_value = uralicApi.generate(analysis, "fin")[0][0]
        log.debug("Realized value is {}".format(modified_value))

        return modified_value
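The comment above describes replacing only the right-most "Nom" tag; a standalone sketch of that trick with a hypothetical helper (the compound analysis string is a made-up illustration, not verbatim transducer output):

def replace_last(analysis: str, old: str, new: str) -> str:
    # Replace only the right-most occurrence of `old`, mirroring the rfind()/slice logic above.
    idx = analysis.rfind(old)
    if idx == -1:
        return analysis
    return analysis[:idx] + new + analysis[idx + len(old):]

analysis = "tyvi+N+Sg+Nom#pari+N+Sg+Nom"      # illustrative compound analysis
print(analysis.replace("Nom", "Gen"))          # global replace: every compound part turns genitive
print(replace_last(analysis, "Nom", "Gen"))    # only the last part turns genitive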
def test_generate(self):
    result = uralicApi.generate("äkkipikainen+A+Sg+Gen", "fin", force_local=True)
    self.assertEqual(result[0][0], 'äkkipikaisen')
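The force_local=True flag keeps generation on locally installed models, so a test like this assumes the Finnish models have been downloaded beforehand; a sketch of that one-time setup:

from uralicNLP import uralicApi

uralicApi.download("fin")   # one-time download of the Finnish analyser/generator models

result = uralicApi.generate("äkkipikainen+A+Sg+Gen", "fin", force_local=True)
assert result[0][0] == 'äkkipikaisen'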
def dictionary_entry(grouped_relation):
    '''
    :param grouped_relation: Lexeme ID with all Relation objects linked to it
    :return: A LaTeX string to represent the relation as an entry in the dictionary
    '''
    lexeme_from_id, relations = grouped_relation
    relations = list(relations)
    lexeme_from = relations[0].lexeme_from

    dictionary_entry_text = []

    entry_content = (
        lexeme_from.lexeme,
        lexeme_from.pos,
        lexeme_from.specification,
    )
    entry_content = tuple([tex_escape(c) for c in entry_content])
    dictionary_entry_text.append("\\entry{%s}{%s}{%s}" % entry_content)

    inflection_table = {
        'V': [
            'V+Ind+Prs+ConNeg', 'V+Ind+Prs+Sg3', 'V+Ind+Prt+Sg1',
            'V+Ind+Prt+Sg3'
        ],
        'N': ['N+Sg+Loc', 'N+Sg+Ill', 'N+Pl+Gen'],
        'A': ['A+Attr'],
        'Prop': [
            'N+Prop+Sg+Loc', 'N+Prop+Sem/Mal+Sg+Loc', 'N+Prop+Sem/Fem+Sg+Loc',
            'N+Prop+Sem/Plc+Sg+Loc'
        ]
    }

    contelex_inflexType_cases = {
        'V': {
            r'(YD,1)': ['V+Ind+Prs+Sg1', 'V+Ind+Prs+Sg3'],
            r'(AD,1)': ['V+Ind+Prs+Sg1', 'V+Ind+Prt+Sg1'],
            r'(ED,1)': ['V+Ind+Prs+Sg1', 'V+Ind+Prt+Sg1', 'V+Ind+Prt+Sg3'],
            r'(,[2-4])': ['V+Ind+Prs+ConNeg']
        },
        'N': {
            r'(Q[^,\n]*)(,1)': ['N+Sg+Loc', 'N+Sg+Ill', 'N+Pl+Gen'],
            r'(_[^Q,\n]*)(,1)': ['N+Sg+Loc', 'N+Sg+Ill'],
            r'(,3|[^D],2|ID,2)': ['N+Sg+Gen', 'N+Sg+Ill'],
            r'(,4|[YAE]D,2)': ['N+Sg+Loc', 'N+Sg+Ill'],
        }
    }

    translation_lemma_map = {
        'V': '+Inf',
        'N': '+N+Sg+Nom',
        'A': '+Sg+Nom',
        'Adv': ''
    }

    relations = list(
        sorted(relations,
               key=lambda r: (
                   r.relationmetadata_set.all().count() != 0,
                   r.lexeme_to.lexeme_lang,
               )))

    for r in relations:
        translation = r.lexeme_to
        translation_text = translation.lexeme

        pos = '' if translation.pos == lexeme_from.pos else translation.pos

        if translation.pos in translation_lemma_map:
            result = uralicApi.generate(
                translation.lexeme + '+' +
                ('Hom{}+'.format(translation.homoId) if translation.homoId > 0 else '') +
                translation.pos + translation_lemma_map[translation.pos],
                translation.language,
                dictionary_forms=True)
            if result:
                translation_text = result[0][0]

        # LaTeX escape the content
        inflections = []
        MP_forms = translation.miniparadigm_set.all()
        existing_MP_forms = defaultdict(list)
        for form in MP_forms:
            existing_MP_forms[form.msd].append(form.wordform)

        if translation.id not in nti_ids:  # ignore certain translations
            # custom transducer
            generated_MP_forms = defaultdict(list)
            if synthetiser:
                try:
                    queries, _ = _inflector.__generator_queries__(
                        translation.lexeme, translation.pos)
                    for i in range(len(queries)):
                        MP_form = '+'.join(queries[i].split('+')[1:])
                        try:
                            generated_MP_forms[MP_form].append(
                                synthetiser.lookup(queries[i])[0][0].split("@")[0])
                        except:
                            raise
                except:  # POS is empty or no queries
                    pass
            else:  # default (uralicNLP)
                generated_MP_forms = _inflector.generate_uralicNLP(
                    translation.language,
                    translation.lexeme,
                    translation.pos,
                    dictionary_forms=True)

            if translation.pos in inflection_table:
                inflection_forms = inflection_table[translation.pos]  # default inflections

                # specific inflections based on contlex
                if translation.contlex and translation.pos in contelex_inflexType_cases:
                    for re_pattern, _inflections in contelex_inflexType_cases[
                            translation.pos].items():
                        if re.search(
                                re_pattern,
                                "{},{}".format(translation.contlex,
                                               translation.inflexType_str())):
                            inflection_forms = _inflections
                            break

                for inflection_form in inflection_forms:
                    generated_form = None
                    if inflection_form in existing_MP_forms:
                        generated_form = existing_MP_forms[inflection_form]
                    elif inflection_form in generated_MP_forms:
                        generated_form = generated_MP_forms[inflection_form]

                    if generated_form:
                        if inflection_form == 'A+Attr':
                            generated_form = [
                                "#{}".format(gf) for gf in generated_form
                            ]
                        elif inflection_form == 'V+Ind+Prs+ConNeg':
                            generated_form[0] = "ij {}".format(generated_form[0])
                        inflections.extend(generated_form)

            if not inflections and translation.pos == 'N' and re.match(
                    r'[A-Z](.+)', translation.lexeme):
                for inflection_form in inflection_table['Prop']:
                    generated_results = uralicApi.generate(
                        "{}+{}".format(translation.lexeme, inflection_form),
                        translation.language)
                    generated_form = [
                        gr[0].split('@')[0] for gr in generated_results
                    ]
                    if generated_form:
                        inflections.extend(generated_form)
                        break

        source_specification = r.relationmetadata_set.values_list('text', flat=True) \
            .filter(type=SPECIFICATION, language=lexeme_from.language) \
            .order_by('text').all()
        target_specification = r.relationmetadata_set.values_list('text', flat=True) \
            .filter(type=SPECIFICATION, language=translation.language) \
            .order_by('text').all()
        source_example = r.relationexample_set.values_list('text', flat=True) \
            .filter(language=lexeme_from.language).order_by('text').all()
        target_example = r.relationexample_set.values_list('text', flat=True) \
            .filter(language=translation.language).order_by('text').all()

        content = (translation_text, translation.specification, pos,
                   ", ".join(inflections), ", ".join(source_specification),
                   ", ".join(target_specification), ", ".join(source_example),
                   ", ".join(target_example), "")
        content = tuple([tex_escape(c) for c in content])

        dictionary_entry_text.append(
            "\\translation{%s}{%s}{%s}{%s}{%s}{%s}{%s}{%s}{%s}" % content)

    return "\n".join(dictionary_entry_text)
def create_verb_probabilities(usr_input):
    '''
    Uses the first input noun to find verbs that are semantically similar.
    Outputs verb candidates and their probability distribution.
    '''
    lemmas = tokenize_and_lemmatize(usr_input)
    input_posses = get_pos_template(lemmas)
    # print("Input POSes: " + input_posses + "\n")

    # If both input words are nouns. Other alternatives are not implemented.
    if input_posses == 'NN':
        lemma_dict = {'subject': lemmas[0], 'object': lemmas[1]}
        verse = []

        # Loop through both lemmas and inflect them depending on their syntactic role
        for lemma in lemmas:
            # print_some_input_info(lemma)  # FOR DEBUGGING
            for analysis in uralicApi.analyze(lemma, "fin"):
                ms_desc = analysis[0].lstrip(analysis[0].split('+')[0])
                # print("Analysis of the lemma: " + lemma + ms_desc + "\n")  # FOR DEBUGGING
                if ms_desc[1] == 'N':
                    if lemma == lemma_dict['subject']:
                        generated = uralicApi.generate(lemma + "+N+Sg+Nom", "fin")
                    if lemma == lemma_dict['object']:
                        generated = uralicApi.generate(lemma + "+N+Sg+Gen", "fin")
                    if len(generated) > 0:
                        verse.append(generated[0][0])
                    else:
                        print("Try with other words.")

            # If the lemma is the subject, choose a verb using its word relations.
            # There is probably a better alternative for this.
            if lemma == lemma_dict['subject']:
                word = semfi.get_word(lemma, "N", "fin")
                while True:
                    try:
                        relations = semfi.get_by_relation(word, "dobj", "fin", sort=True)
                        break
                    except Exception as e:
                        print("At least one of the input words was not recognized, "
                              "try with other words.\n\n" + str(e))
                        exit()

                verbs_and_probs = []
                for relation in relations:
                    try:
                        if relation['word2']['pos'] == 'V':
                            inflected_form = uralicApi.generate(
                                relation['word2']['word'] + "+V+Act+Ind+Prs+Sg3", "fin")[0][0]
                            first_syllable = finmeter.hyphenate(inflected_form).split("-")[0]
                            if count_syllables(inflected_form) == 2 and not finmeter.is_short_syllable(first_syllable):
                                verbs_and_probs.append(
                                    (relation['word2']['word'],
                                     relation['word2']['frequency']))
                    except:
                        pass

                # Sort the verbs by frequency (descending order) and drop the top 5 % most
                # frequent as well as the least frequent half
                verbs_and_probs = sorted(
                    verbs_and_probs, key=lambda x: x[-1], reverse=True
                )[round((len(verbs_and_probs) / 100) * 5):round((len(verbs_and_probs) / 100) * 50)]
                if len(verbs_and_probs) == 0:
                    print("Try with other words.")
                    exit()
                else:
                    # Normalize the probabilities and choose the verb randomly
                    verb_candidates, probability_distribution = map(list, zip(*verbs_and_probs))
                    probability_distribution = np.array(
                        np.array(probability_distribution) / sum(probability_distribution))

        return verb_candidates, probability_distribution, lemmas, lemma_dict, verse
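The trimming and normalisation at the end of create_verb_probabilities() is easiest to see with concrete numbers; a sketch with made-up (verb, frequency) pairs:

import numpy as np

verbs_and_probs = [("syödä", 120), ("juoda", 80), ("nähdä", 60), ("ostaa", 20)]  # illustrative data

# Sort by frequency, then drop the top 5 % most frequent and everything below the median.
verbs_and_probs = sorted(verbs_and_probs, key=lambda x: x[-1], reverse=True)
lo = round(len(verbs_and_probs) / 100 * 5)
hi = round(len(verbs_and_probs) / 100 * 50)
verbs_and_probs = verbs_and_probs[lo:hi]

# Normalise the remaining frequencies into a probability distribution.
verb_candidates, frequencies = map(list, zip(*verbs_and_probs))
probability_distribution = np.array(frequencies) / sum(frequencies)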
else: pos = "A" if "CASE" not in args: args["CASE"] = "NOM" else: args["CASE"] = args["CASE"].upper() if "DEGREE" in args: degree = "+" + args["DEGREE"] possessive = "" if "POSS" in args: possessive = "+" + args["POSS"] #omorfi_query = "[WORD_ID="+word+"][POS="+pos+"][NUM="+args["NUM"]+"][CASE="+args["CASE"]+"]" omorfi_query = word + "+" + pos + degree + "+" + args["NUM"].title( ) + "+" + args["CASE"].title() + possessive + c**t word_form = _filter_generated(uralicApi.generate(omorfi_query, "fin"), word) if len(word_form) == 0: #Generation failed! if pos == "N": return inflect(beginning + "|" + word, "N+Prop", args) else: return beginning + backup_inflect(word, pos, args) else: return beginning + word_form[0][0] def _filter_generated(res, lemma): if len(res) < 2: return res for r in res: