예제 #1
0
def test_import_niska(shared_datadir):
    """Check that the lemma "niska" is not stored as-is after import.

    Regression test for
    https://github.com/UAlbertaALTLab/cree-intelligent-dictionary/issues/213:
    searching with "niskak" didn't yield an inflection of "niska" because
    "niska" wasn't disambiguated and was marked as_is.
    """
    fixture_dir = shared_datadir / "crkeng-niska"
    migrate_and_import(fixture_dir)

    # .get() raises unless exactly one lemma with this text exists.
    niska_lemma = Wordform.objects.get(text="niska", is_lemma=True)
    assert not niska_lemma.as_is
예제 #2
0
def test_import_pipon_of_different_word_classes(shared_datadir):
    """Check that the two "pipon" entries import as separate, resolved lemmas.

    The Cree word pipon has two entries in the test xml: one has word class
    VII and the other has word class NI.
    """
    migrate_and_import(shared_datadir / "crkeng-pipon-of-different-word-classes")

    pipon_lemmas = Wordform.objects.filter(text="pipon", is_lemma=True)

    # https://github.com/UAlbertaALTLab/cree-intelligent-dictionary/issues/190
    # Issue description: search results for some inflected forms of pipon
    #   were not showing up.
    # Cause: pipon lemmas were wrongly marked as "as-is" in the database even
    #   though the xml provided enough resolution on the word classes
    #   (VII and NI).

    # todo: let `migrate_and_import` report success/ambiguity/no-analysis
    #   counts so that further importer tests are easier to construct,
    #   e.g. here we would only need to assert `success == 2`.
    resolved_lemma_count = pipon_lemmas.filter(as_is=False).count()
    assert resolved_lemma_count == 2

    # https://github.com/UAlbertaALTLab/cree-intelligent-dictionary/issues/412
    # Issue description: the verb entry and the noun entry ended up with the
    #   same 3 definitions, while in the source the verb has 2 definitions:
    #       It is winter.
    #       it is winter; is one year
    #   and the noun has 1 definition:
    #       year, winter
    # They were wrongly merged.
    noun_lemma = pipon_lemmas.get(pos="N")
    assert noun_lemma.definitions.count() == 1

    verb_lemma = pipon_lemmas.get(pos="V")
    assert verb_lemma.definitions.count() == 2
예제 #3
0
def test_import_lemma_with_multiple_spellings(shared_datadir):
    """Check that "pisiniw" is imported as a non-lemma wordform of "pisin"."""
    fixture_dir = shared_datadir / "crkeng-small-lemma-with-multiple-spelling"
    migrate_and_import(fixture_dir)

    # .get() raises unless exactly one "pisin" lemma exists.
    lemma = Wordform.objects.get(text="pisin", is_lemma=True)

    # The other spelling must point back at that lemma.
    other_spelling = Wordform.objects.filter(
        text="pisiniw", is_lemma=False, lemma__id=lemma.id
    )
    assert other_spelling.exists()
예제 #4
0
def test_import_nice_xml(shared_datadir):
    """Check that every expanded inflection exists as a Wordform after import.

    The analysis "yôwamêw+V+TA+Ind+3Sg+4Sg/PlO" is expanded with
    `expand_inflections`, and each resulting inflection text must match at
    least one Wordform row.
    """
    migrate_and_import(shared_datadir / "crkeng-small-nice-0")

    expanded = expand_inflections(
        ["yôwamêw+V+TA+Ind+3Sg+4Sg/PlO"], multi_processing=1, verbose=False
    )
    for analysis_and_inflections in expanded.values():
        # Only the inflections are checked; the analysis itself is unused.
        for _analysis, inflections in analysis_and_inflections:
            for inflection in inflections:
                # exists() asks the database for a yes/no answer instead of
                # loading every matching row just to take its length.
                assert Wordform.objects.filter(
                    text=inflection
                ).exists(), f"no Wordform imported for inflection {inflection!r}"
예제 #5
0
def test_import_xml_crkeng_small_duplicate_l_pos_ic(shared_datadir):
    """Show how the importer handles xml entries with duplicate (l, pos, ic).

    This is a rare case. Two Wordform objects are created, each carrying the
    pooled translations from the two entries.
    """
    fixture_dir = shared_datadir / "crkeng-small-duplicate-l-pos-ic"
    migrate_and_import(fixture_dir)

    lemmas = Wordform.objects.filter(text="asawâpiwin", is_lemma=True)
    assert lemmas.count() == 2

    # Every lemma must carry the same pooled set of 4 definitions.
    definition_counts = {len(lemma.definitions.all()) for lemma in lemmas}
    assert definition_counts == {4}
예제 #6
0
def test_import_calgary(shared_datadir):
    """
    Check that English keywords starting with "calgar" map only to the lemma
    "otôskwanihk".

    See: https://github.com/UAlbertaALTLab/cree-intelligent-dictionary/issues/353
    """

    # NOTE(review): the comment below refers to a different word and issue and
    # looks copy-pasted from another test; confirm whether it applies here.
    # https://github.com/UAlbertaALTLab/cree-intelligent-dictionary/issues/213
    # searching with maskwak didn't yield an inflection of "maskwa" as "maskwa" wasn't disambiguated and was marked as_is
    migrate_and_import(shared_datadir / "crkeng-calgary")

    # Evaluate the queryset once; both assertions use the same rows.
    keyword_matches = list(EnglishKeyword.objects.filter(text__startswith="calgar"))
    assert len(keyword_matches) >= 1

    lemma_texts = {match.lemma.text for match in keyword_matches}
    assert {"otôskwanihk"} == lemma_texts
예제 #7
0
def test_import_xml_common_analysis_definition_merge(shared_datadir):
    """
    test purpose: sometimes two entries in the xml produce the same analysis.
    Their definitions shouldn't be merged.

    NOTE(review): another test in this listing has the same function name; if
    both live in one module, only the later definition is collected. Confirm.
    """
    fixture_dir = shared_datadir / "crkeng-small-common-analysis-different-ic"
    migrate_and_import(fixture_dir)

    pisin = Wordform.objects.get(text="pisin")
    assert pisin.definitions.count() == 1

    # Note: this test no longer works because pisin and pisiniw no longer
    # produce the same analysis.

    # We need to find two entries in the xml that produce the same analysis. See
    # DatabaseManager_tests/data/crkeng-small-common-analysis-different-ic/crkeng.xml

    pisiniw = Wordform.objects.get(text="pisiniw")
    assert pisiniw.definitions.count() == 2
예제 #8
0
def test_import_xml_common_analysis_definition_merge(shared_datadir):
    """Check that exactly one "nipa" wordform has a "Kill" definition, and it is a verb."""
    fixture_dir = shared_datadir / "crkeng-common-analysis-definition-merge"
    migrate_and_import(fixture_dir)

    # One list entry per matching definition, so a wordform with several
    # "Kill" definitions would appear several times and fail the length check.
    kill_him_inflections = [
        wordform
        for wordform in Wordform.objects.filter(text="nipa")
        for definition in wordform.definitions.all()
        if "Kill" in definition.text
    ]

    assert len(kill_him_inflections) == 1
    (kill_him_inflection,) = kill_him_inflections
    assert kill_him_inflection.pos == "V"
예제 #9
0
def test_import_xml_crkeng_small_common_xml_l_different_ic(shared_datadir):
    """Show how the importer handles xml entries with the same l but different ic.

    Such entries are identified as belonging to different lemmas, so two
    "pisiw" lemmas are expected after import.
    """
    migrate_and_import(shared_datadir / "crkeng-small-common-xml-l-different-ic")

    # count() lets the database do the counting instead of loading every row
    # just to take the length of the result.
    assert Wordform.objects.filter(text="pisiw", is_lemma=True).count() == 2
예제 #10
0
def test_import_xml_fst_no_analysis(shared_datadir):
    """Check that the "crkeng-small-fst-can-not-analyze" fixture yields one as-is lemma.

    After import exactly one Wordform must exist: "miwapisin", flagged both
    `as_is` and `is_lemma`.
    """
    migrate_and_import(shared_datadir / "crkeng-small-fst-can-not-analyze")

    # count() avoids loading the whole table just to measure its size.
    assert Wordform.objects.count() == 1

    # Fetch the row once and check both flags on the same object.
    wordform = Wordform.objects.get(text="miwapisin")
    assert wordform.as_is is True
    assert wordform.is_lemma is True