Example #1
def test_logs_error_on_analyzable_result_without_generated_string(caplog):
    """
    Ensures searching does not crash when given an analyzable result with no normative
    form. An error should be logged instead.

    It used to raise: ValueError: min() arg is an empty sequence

    See: https://github.com/UAlbertaALTLab/morphodict/issues/693
    """
    with caplog.at_level(logging.ERROR):
        search(query="bod").presentation_results()

    errors = [log for log in caplog.records if log.levelname == "ERROR"]
    assert len(errors) >= 1
    assert any("bod" in log.message for log in errors)
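The crash in issue 693 came from calling min() on an empty sequence of candidate wordforms. A minimal sketch of the guarded pattern this test expects, where the function and variable names are illustrative and not the actual morphodict internals:

import logging

logger = logging.getLogger(__name__)


def best_candidate_distance(distances, query):
    """Illustrative guard: min() on an empty sequence raises ValueError."""
    if not distances:
        # Log instead of crashing, as the test above expects
        logger.error("analyzable result for %r has no generated wordform", query)
        return None
    return min(distances)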
Example #2
def test_search_for_stored_non_lemma():
    """
    A "stored non-lemma" is a wordform in the database that is NOT a lemma.
    """
    # "S/he would tell us stories."
    lemma_str = "âcimêw"
    query = "ê-kî-âcimikoyâhk"
    search_results = search(query=query).presentation_results()

    assert len(search_results) >= 1

    exact_matches = [
        result for result in search_results if result.wordform.text == query
    ]
    assert len(exact_matches) >= 1

    # Let's look at that search result in more detail
    result = exact_matches[0]
    assert not result.is_lemma
    assert result.lemma_wordform.text == lemma_str
    # todo: tags are not implemented
    # assert not result.preverbs
    # assert not result.reduplication_tags
    # assert not result.initial_change_tags
    assert len(result.lemma_wordform.definitions.all()) >= 1
    assert all(
        len(dfn.source_ids) >= 1 for dfn in result.lemma_wordform.definitions.all()
    )
Example #3
def test_lemma_and_syncretic_form_ranking(lemma):
    """
    Tests that the lemma is always shown first, even when a search yields
    one or more forms that are syncretic with the lemma; that is, ensure THIS
    doesn't happen:

        sheep [Plural]
        form of sheep [Singular]

        (no definition found for sheep [Plural])

        sheep [Singular]
        1. a fluffy mammal that appears in dreams

    Note: this test is likely to be **FLAKY** if the implementation is buggy
    and uses a **non-stable** sort or comparison.
    """

    results = search(query=lemma).presentation_results()
    assert len(results) >= 2
    search_results = [res for res in results if res.lemma_wordform.text == lemma]
    assert len(search_results) >= 2
    assert any(res.is_lemma for res in search_results)
    first_result = search_results[0]
    assert first_result.is_lemma, f"unexpected first result: {first_result}"
Example #4
    def handle(self, *args, **options) -> None:
        samples = load_sample_definition(options["csv_file"])
        if options["shuffle"]:
            random.shuffle(samples)
        if options["max"] is not None:
            samples = samples[: options["max"]]

        with output_file(options["output_file"]) as out:
            # Only display progress bar if output is redirected
            if not out.isatty():
                samples = tqdm(samples)

            for entry in samples:
                query = entry["Query"]

                results = search(
                    query=f"verbose:1 {options['prefix_queries_with']} {query}"
                ).sorted_results()
                prefetch_related_objects(
                    [r.wordform for r in results], "definitions__citations"
                )
                for i, r in enumerate(results):
                    ret = r.features()
                    ret["query"] = query
                    ret["wordform_text"] = r.wordform.text
                    ret["lemma_wordform_text"] = r.wordform.lemma.text
                    ret["definitions"] = [
                        [d.text, ", ".join(c.abbrv for c in d.citations.all())]
                        for d in r.wordform.definitions.all()
                        if d.auto_translation_source_id is None
                    ]
                    ret["webapp_sort_rank"] = i + 1
                    print(json.dumps(ret, ensure_ascii=False), file=out)
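The helpers load_sample_definition and output_file are not shown in this excerpt. A minimal sketch consistent with how handle() uses them, assuming the CSV has a "Query" column and that "-" selects standard output:

import csv
import sys
from contextlib import contextmanager


def load_sample_definition(csv_path):
    # handle() reads entry["Query"], so each row needs at least a "Query" column.
    with open(csv_path, newline="", encoding="UTF-8") as f:
        return list(csv.DictReader(f))


@contextmanager
def output_file(path):
    # Assumption: "-" means stdout; anything else is treated as a file path.
    if path == "-":
        yield sys.stdout
    else:
        with open(path, "w", encoding="UTF-8") as f:
            yield f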
Example #5
def test_search_for_pronoun() -> None:
    """
    Search for the common pronoun "ôma". Make sure that the unaccented query "oma"
    returns at least one result whose text is "ôma".
    """

    search_results = search(query="oma").presentation_results()
    assert any(r.wordform.text == "ôma" for r in search_results)
Example #6
def test_search_text_with_ambiguous_word_classes():
    """
    All word classes should be searched when the query is ambiguous
    """
    # pipon can be viewed as a Verb as well as a Noun
    results = search(query="pipon").presentation_results()
    assert {
        r.lemma_wordform.paradigm for r in results if r.wordform.text == "pipon"
    } == {"NI", "VII"}
Example #7
def test_search_for_english() -> None:
    """
    Search for a word that is definitely in English.
    """

    # This should match "âcimowin" and related words:
    search_results = search(query="story").presentation_results()

    assert any(r.wordform.text == "âcimowin" for r in search_results)
Example #8
def test_search_serialization_json_parsable(query):
    """
    Test that serialized presentation results are JSON-serializable
    """
    results = search(query=query).serialized_presentation_results()
    try:
        json.dumps(results)
    except Exception as e:  # json.dumps raises TypeError/ValueError on bad input
        # pytest.fail() itself raises, so no re-raise is needed afterwards
        pytest.fail(f"serialized_presentation_results() output is not JSON-serializable: {e}")
Example #9
def test_search_words_with_preverbs():
    """
    preverbs should be extracted and present in SearchResult instances
    """
    results = search(query="nitawi-nipâw").presentation_results()
    assert len(results) == 1
    search_result = results.pop()

    assert len(search_result.preverbs) == 1
    assert search_result.preverbs[0]["text"] == "nitawi-"
    assert search_result.lexical_info[0]["type"] == "Preverb"
Example #10
def test_search_words_with_reduplication():
    """
    reduplication should be extracted and present in SearchResult instances
    """
    results = search(query="nanipâw").presentation_results()
    assert len(results) == 1
    search_result = results.pop()

    assert len(search_result.lexical_info) == 1
    assert search_result.lexical_info[0]["entry"]["text"] == "na-"
    assert search_result.lexical_info[0]["type"] == "Reduplication"
Example #11
def test_search_results_order(query: str, top_result: str, later_result: str):
    """
    Ensure that some search results appear before others.
    """
    results = search(query=query).presentation_results()

    top_result_pos = position_in_results(top_result, results)
    later_result_pos = position_in_results(later_result, results)
    assert (
        top_result_pos < later_result_pos
    ), f"{top_result} did not come before {later_result}"
Example #12
def test_search_words_with_initial_change():
    """
    initial change should be extracted and present in SearchResult instances
    """
    results = search(query="nêpat").presentation_results()
    assert len(results) == 1
    search_result = results.pop()

    assert len(search_result.lexical_info) == 1
    assert search_result.lexical_info[0]["entry"]["text"] == " "
    assert search_result.lexical_info[0]["type"] == "Initial Change"
Example #13
def test_search_space_characters_in_matched_term(term):
    """
    The search should find results with spaces in them.
    See: https://github.com/UAlbertaALTLab/morphodict/issues/147
    """

    # Ensure the word is in the database to begin with...
    word = Wordform.objects.get(text=term)
    assert word is not None

    # Now try searching for it:
    cree_results = search(query=term).presentation_results()
    assert len(cree_results) > 0
Example #14
def test_cvd_exclusive_only_uses_cvd_for_ranking(db):
    search_run = search(query="dance cvd:2")
    results = search_run.sorted_results()
    assert len(results) > 2

    def is_sorted_by_cvd(results: list[Result]):
        for r1, r2 in zip(results, results[1:]):
            if (r1.cosine_vector_distance is not None
                    and r2.cosine_vector_distance is not None
                    and r1.cosine_vector_distance > r2.cosine_vector_distance):
                raise Exception(
                    f"Item {r1} comes first but has a bigger cvd than {r2}")
        return True

    assert is_sorted_by_cvd(results)
Example #15
def test_compare_simple_vs_affix_search() -> None:
    """
    Searches can include affixes or not; by default, they do.

    The only difference is that affix search should return at least as many results
    (and usually more).
    """

    # The prefix should be a complete wordform, as well as a valid prefix of the lemma
    # AA: Is this true? As one can search with an incomplete wordform and get results, e.g. 'wâpamê'

    prefix = "wâpam"
    lemma = "wâpamêw"
    assert lemma.startswith(prefix)

    simple_results = search(query=prefix, include_affixes=False).presentation_results()
    general_results = search(query=prefix).presentation_results()

    assert len(simple_results) <= len(general_results)

    assert results_contains_wordform(prefix, simple_results)
    assert not results_contains_wordform(lemma, simple_results)

    assert results_contains_wordform(prefix, general_results)
    assert results_contains_wordform(lemma, general_results)
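results_contains_wordform() is likewise not defined in this excerpt; a one-line sketch consistent with how it is used here:

def results_contains_wordform(wordform_text, search_results):
    # True if any result's wordform text equals the given text.
    return any(result.wordform.text == wordform_text for result in search_results)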
Example #16
def test_query_exact_wordform_in_database(lemma: Wordform):
    """
    Sanity check: querying a lemma by its EXACT text returns that lemma.
    """

    query = lemma.text
    results = search(query=query).presentation_results()

    exact_match = False
    matched_lemma_count = 0
    for r in results:
        if r.wordform.id == lemma.id:
            exact_match = True
        matched_lemma_count += 1

    assert matched_lemma_count >= 1, f"Could not find {query!r} in the database"
    assert exact_match, f"No exact matches for {query!r} in {results}"
Example #17
def test_when_linguistic_breakdown_absent():
    # pê- is a preverb.
    # It is not analyzable by the normative FST, so it should not have a linguistic
    # breakdown. However, in the latest version of the descriptive FST, pê- can be
    # analyzed as a fragment, receiving the +Err/Frag tag; we currently ignore the
    # +Err/Frag cases.

    query = "pe-"
    search_results = search(query=query).presentation_results()

    # Since the introduction of cosine vector distance, `pe` is in the news vectors,
    # so we now get additional results for this search.
    assert len(search_results) >= 1

    result = search_results[0]
    assert result.wordform.text == "pê-"
    assert result.wordform.analysis is None
    assert result.friendly_linguistic_breakdown_head == []
    assert (
        result.serialize()["lemma_wordform"]["inflectional_category_plain_english"]
        == "like: pê-"
    )
Example #18
def test_search_for_exact_lemma(lemma: Wordform):
    """
    Check that we get a search result that matches the exact query.
    """

    assert lemma.is_lemma
    _, fst_lemma, _ = lemma.analysis
    assert all(c == c.lower() for c in fst_lemma)
    assume(lemma.text == fst_lemma)

    query = lemma.text
    search_results = search(query=query).presentation_results()

    exact_matches = {
        result
        for result in search_results
        if result.is_lemma and result.lemma_wordform == lemma
    }
    assert len(exact_matches) == 1

    # Let's look at that search result in more detail
    exact_match = exact_matches.pop()
    assert exact_match.source_language_match == lemma.text
    assert not exact_match.preverbs
Example #19
def test_avoids_cvd_search_if_query_looks_like_cree(query: str) -> None:
    """
    Queries that look like Cree should not even **TOUCH** CVD; the CVD step should
    be skipped entirely.
    """
    assert search(query=query).verbose_messages[0].startswith("Skipping CVD")
Example #20
def test_lemma_ranking_most_frequent_word():
    # The English query "sees" should match many Cree words, but wâpahtam should
    # show up first because it has the highest frequency.
    results = search(query="sees").presentation_results()
    assert results[0].wordform.text == "wâpahtam"
Example #21
def test_extra_pieces_searchable(db, search_term):
    search_results = search(query=search_term).presentation_results()
    assert any(r.wordform.text == "kwâskwêpayihôs" for r in search_results)