def populate_wordform_definitions(self, wf, senses):
    """Create definitions for *wf* from *senses*, then auto-translate them
    onto generated inflections of *wf*.

    Translation only happens when ``self.translate_wordforms`` is enabled
    AND the wordform carries the information needed to inflect it: an
    analysis (or, with MORPHODICT_ENABLE_FST_LEMMA_SUPPORT, an fst_lemma)
    plus a paradigm.  Definitions are created unconditionally either way.
    """
    should_do_translation = self.translate_wordforms
    if should_do_translation:
        # Which fields count as "enough to inflect" depends on whether the
        # site uses a separate FST lemma form for generation.
        has_analysis_and_paradigm = (
            (wf.analysis and wf.paradigm)
            if not settings.MORPHODICT_ENABLE_FST_LEMMA_SUPPORT
            else (wf.fst_lemma and wf.paradigm)
        )
        if not has_analysis_and_paradigm:
            should_do_translation = False

    # Always create the (definition, sources) pairs for the lemma itself,
    # even when translation of inflected forms is skipped.
    definitions_and_sources = self.create_definitions(wf, senses)

    if not should_do_translation:
        return

    # The text fed to the generator as the lemma part of each analysis.
    lemma_text = (
        wf.text
        if not settings.MORPHODICT_ENABLE_FST_LEMMA_SUPPORT
        else wf.fst_lemma
    )
    # Enumerate every (prefix tags, suffix tags) slot in the paradigm and
    # generate the concrete surface forms for each resulting analysis.
    for (
        prefix_tags,
        suffix_tags,
    ) in self.paradigm_manager.all_analysis_template_tags(wf.paradigm):
        analysis = RichAnalysis((prefix_tags, lemma_text, suffix_tags))
        for generated in strict_generator().lookup(analysis.smushed()):
            # Skip re-instantiating lemma
            if analysis == wf.analysis:
                continue
            inflected_wordform = Wordform(
                # For now, leaving paradigm and linguist_info empty;
                # code can get that info from the lemma instead.
                text=generated,
                raw_analysis=analysis.tuple,
                lemma=wf,
                is_lemma=False,
            )
            for d, sources in definitions_and_sources:
                translation = translate_single_definition(
                    inflected_wordform, d.text, self.translation_stats
                )
                # The translator may decline to translate; skip silently.
                if translation is None:
                    continue
                # Unsaved wordforms (no primary key yet) are queued for a
                # later bulk save; the buffer is assumed to deduplicate —
                # TODO confirm wordform_buffer semantics.
                is_inflected_wordform_unsaved = inflected_wordform.id is None
                if is_inflected_wordform_unsaved:
                    self.wordform_buffer.add(inflected_wordform)
                # Mark auto-translated sources with a robot emoji so users
                # can tell them apart from human-written definitions.
                self._add_definition(
                    inflected_wordform,
                    translation,
                    ("🤖" + source for source in sources),
                    auto_translation_source=d,
                )
def test_fst_generation():
    """The strict generator produces the expected surface form for a full
    analysis with preverbs."""
    analysis = "PV/ta+PV/pe+kîwêmakan+V+II+Ind+4Sg"
    generated_forms = set(strict_generator().lookup(analysis))
    assert "ta-pê-kîwêmakaniyiw" in generated_forms
def fetch_results(search_run: core.SearchRun):
    """Populate *search_run* with dictionary results for its query.

    Three strategies, in order:
      1. keyword matches in the target and source languages;
      2. exact database matches on relaxed FST analyses of the query;
      3. for analyses with no database entry, synthesize a wordform from
         the generated normative form and attach it to its lemma.
    """
    fetch_results_from_target_language_keywords(search_run)
    fetch_results_from_source_language_keywords(search_run)

    # Use the spelling relaxation to try to decipher the query
    # e.g., "atchakosuk" becomes "acâhkos+N+A+Pl" --
    # thus, we can match "acâhkos" in the dictionary!
    fst_analyses = set(rich_analyze_relaxed(search_run.internal_query))

    db_matches = list(
        Wordform.objects.filter(raw_analysis__in=[a.tuple for a in fst_analyses])
    )

    for wf in db_matches:
        search_run.add_result(
            Result(
                wf,
                source_language_match=wf.text,
                query_wordform_edit_distance=get_modified_distance(
                    wf.text, search_run.internal_query
                ),
            )
        )
        # An exact match here means we’re done with this analysis.
        fst_analyses.discard(wf.analysis)

    # fst_analyses has now been thinned by the `discard()` calls above;
    # remaining items are analyses which are not in the database,
    # although their lemmas should be.
    for analysis in fst_analyses:
        # When the user query is outside of paradigm tables
        # e.g. mad preverb and reduplication: ê-mâh-misi-nâh-nôcihikocik
        # e.g. Initial change: nêpât: {'IC+nipâw+V+AI+Cnj+3Sg'}
        normatized_form_for_analysis = strict_generator().lookup(analysis.smushed())
        if len(normatized_form_for_analysis) == 0:
            # Analyzable but not generatable: log it so the FST mismatch
            # can be investigated, and move on to the next analysis.
            logger.error(
                "Cannot generate normative form for analysis: %s (query: %s)",
                analysis,
                search_run.internal_query,
            )
            continue

        # If there are multiple forms for this analysis, use the one that is
        # closest to what the user typed.
        normatized_user_query = min(
            normatized_form_for_analysis,
            key=lambda f: get_modified_distance(f, search_run.internal_query),
        )

        possible_lemma_wordforms = best_lemma_matches(
            analysis, Wordform.objects.filter(text=analysis.lemma, is_lemma=True)
        )

        for lemma_wordform in possible_lemma_wordforms:
            # Unsaved, in-memory wordform standing in for the inflection
            # that the database does not contain.
            synthetic_wordform = Wordform(
                text=normatized_user_query,
                raw_analysis=analysis.tuple,
                lemma=lemma_wordform,
            )
            search_run.add_result(
                Result(
                    synthetic_wordform,
                    analyzable_inflection_match=True,
                    query_wordform_edit_distance=get_modified_distance(
                        search_run.internal_query,
                        normatized_user_query,
                    ),
                )
            )
def test_generate_non_word():
    """A nonsense analysis should generate no surface forms at all."""
    results = strict_generator().lookup("pîpîpôpô+Ipc")
    assert list(results) == []
def test_generate(analysis, wordform):
    """
    Simple test of generating wordforms.
    """
    generated = list(strict_generator().lookup(analysis))
    assert wordform in generated