예제 #1
0
def test_superfluous_columns2(single_import_parameters, caplog):
    """Check that an unexpected column is only logged when it may be ignored.

    The header carries a ``superfluous`` column and the import is run with
    ``ignore_superfluous=True``; the test expects a log message naming that
    column rather than an exception.
    """
    dataset, _target, _excel, _concept_name = single_import_parameters
    header = [
        "Form",
        "Segments",
        "English",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "variants",
        "orthographic",
        "superfluous",
    ]
    data_row = ["test form", "t e s t", "Concept"]
    sheet = MockSingleExcelSheet([header, data_row])
    entries_to_concepts = {"Concept": "c_id_1"}
    expected_log = r"Excel sheet MockSingleExcelSheet contained unexpected columns {'superfluous'}. These columns will be ignored"

    # The message is looked for in the records captured at INFO level.
    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=entries_to_concepts,
            concept_column="English",
            ignore_superfluous=True,
        )

    assert re.search(expected_log, caplog.text)
예제 #2
0
def test_source_context(minimal_parser_with_dialect):
    """Verify how the context of a source reference is stored.

    The context of a source (page number and the like) follows the source
    number after a colon in the cell, as in ``{1:p. 34 }``. After parsing, it
    is expected in square brackets right behind the source ID in the Source
    column, without leading or trailing whitespace.
    """
    parser = minimal_parser_with_dialect
    cells = [
        ["", "L1"],
        ["C1", "<L1C1>{1:p. 34 }"],
    ]
    parser.parse_cells(MockSingleExcelSheet(cells))

    parsed_forms = list(parser.db.retrieve("FormTable"))
    expected_form = {
        "ID": "l1_c1",
        "Language_ID": "l1",
        "Parameter_ID": ["c1"],
        "Form": "L1C1",
        "Value": "<L1C1>{1:p. 34 }",
        "Source": {"l1_s1[p. 34]"},
    }
    assert len(parsed_forms) == 1
    assert parsed_forms[0] == expected_form
예제 #3
0
def test_import_error_missing_parameter_column(single_import_parameters):
    """Expect an AssertionError when the concept column cannot be found.

    The error message is expected to name the concept column that was
    requested from the sheet.
    """
    dataset, _target, _excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "variants",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
    ]
    sheet = MockSingleExcelSheet([header, []])

    expected_error = f"Could not find concept column {concept_name} in .*"
    with pytest.raises(AssertionError, match=expected_error):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )
예제 #4
0
def test_form_association_id_after_normalization(minimal_parser_with_dialect):
    """Check that canonically equivalent forms are merged into one form.

    The same language lists a composed ñ for concept C1 and a decomposed ñ
    for concept C2. Both normalize to the same NFC string, so the test
    expects a single form for that language, linked to both concepts.
    """
    f1 = "\xf1"  # Composed form of ñ
    f2 = "n\u0303"  # Decomposed form of ñ
    # Sanity check of the premise: both spellings are canonically equivalent.
    assert unicodedata.normalize("NFC", f1) == unicodedata.normalize("NFC", f2)
    lexicon_wb = MockSingleExcelSheet(
        [
            ["", "L1", "L2"],
            ["C1", f"<{f1}>{{1}}", "<L2C1>{1}"],
            ["C2", f"<{f2}>{{1}}", "<L2C2>{1}"],
        ]
    )
    minimal_parser_with_dialect.parse_cells(lexicon_wb)

    # Materialize the result: it is iterated twice below, and a one-shot
    # iterator would leave the second assertion checking an empty sequence.
    complete_forms = list(minimal_parser_with_dialect.db.retrieve("FormTable"))
    forms = [(f["Language_ID"], f["Form"]) for f in complete_forms]

    assert (
        forms.count(("l1", "n\u0303")) + forms.count(("l1", "\xf1")) == 1
    ), """Only one variant, either the composed or the decomposed version, should
          persist. (It should be the NFC one, but that is not a
          guarantee of the code, just an implementation detail.)"""

    assert ["c1", "c2"] in [
        f["Parameter_ID"] for f in complete_forms
    ], "Accordingly, there should be one form both C1 and C2 are linked to."
예제 #5
0
def test_superfluous_columns1(single_import_parameters):
    """Expect a ValueError for an unexpected column that is not ignored.

    The error message should name the ``superfluous`` column and mention the
    ``--ignore-superfluous-excel-columns`` option.
    """
    dataset, _target, _excel, _concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "orthographic",
        "superfluous",
    ]
    sheet = MockSingleExcelSheet([header, []])

    expected_error = (
        ".* Excel sheet MockSingleExcelSheet contained unexpected columns {'superfluous'}.*"
        ".* use --ignore-superfluous-excel-columns .*"
    )
    with pytest.raises(ValueError, match=expected_error):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column="English",
        )
예제 #6
0
def test_missing_concept(single_import_parameters, caplog):
    """Expect an AssertionError naming a concept column absent from the header.

    Superfluous columns are ignored in this run, so the error under test is
    the one about the requested concept column.
    """
    dataset, _target, _excel, _concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "orthographic",
        "superfluous",
    ]
    sheet = MockSingleExcelSheet([header, []])

    # This name does not occur in the header above.
    concept_column_name = "Concept_Column_Name"
    expected_error = f".*{concept_column_name}.*"
    with pytest.raises(AssertionError, match=expected_error):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column=concept_column_name,
            ignore_superfluous=True,
        )
예제 #7
0
def test_missing_columns2(single_import_parameters):
    """Expect a ValueError about unexpected columns when run with ``ignore_missing``.

    The message must list both unexpected column names and mention the
    ``--ignore-superfluous-excel-columns`` option.
    """
    dataset, _target, _excel, _concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "undescribed",
        "superfluous2",
    ]
    sheet = MockSingleExcelSheet([header, []])

    expected_error = ".*sheet MockSingleExcelSheet.*unexpected col.*"
    with pytest.raises(ValueError, match=expected_error) as raised:
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column="English",
            ignore_missing=True,
        )

    message = raised.value.args[0]
    for fragment in (
        "undescribed",
        "superfluous2",
        "--ignore-superfluous-excel-columns",
    ):
        assert fragment in message
예제 #8
0
def test_missing_columns1(single_import_parameters):
    """Expect a ValueError that reports ``orthographic`` as a missing column.

    The message should also mention the ``--ignore-missing-excel-columns``
    option.
    """
    dataset, _target, _excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
    ]
    sheet = MockSingleExcelSheet([header, []])

    expected_error = ".*sheet MockSingleExcelSheet.*missing col.*{[^a-z]*orthographic[^a-z]*}.*--ignore-missing-excel-columns"
    with pytest.raises(ValueError, match=expected_error):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )
예제 #9
0
def test_import_report_skipped(single_import_parameters):
    """Check that the import report counts the single row as skipped.

    The only data row uses the concept entry ``FAKE``; the expected report
    for the sheet ``new_language`` has one skipped row and nothing else.
    """
    dataset, _target, _excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]
    data_row = [
        "FAKE",
        "form",
        "phonemic",
        "orthographic",
        "f o r m",
        "-",
        "None",
        "source[10]",
        "phonetic",
        "",
    ]
    mocksheet = MockSingleExcelSheet([header, data_row])
    mocksheet.title = "new_language"

    report = read_single_excel_sheet(
        dataset=dataset,
        sheet=mocksheet,
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )
    expected_report = ImportLanguageReport(
        is_new_language=True,
        new=0,
        existing=0,
        skipped=1,
        concepts=0,
    )
    assert report == {"new_language": expected_report}
예제 #10
0
def test_new_concept_association(single_import_parameters, caplog):
    """Check the log message for a new concept association.

    The sheet ``ache`` contains one row for the concept ``two``; the test
    expects a log line saying that ``two`` was added to ``ache_one``.
    """
    dataset, _target, _excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]
    # "two" is an existing concept, but the association to this form is new.
    # All eight columns after the form are left empty.
    data_row = ["two", "e.ta.'kɾã"] + [""] * 8
    mocksheet = MockSingleExcelSheet([header, data_row])
    mocksheet.title = "ache"

    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=mocksheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )

    # The new concept association must be mentioned in the log.
    expected_log = r"two.*added to.*ache_one"
    assert re.search(expected_log, caplog.text)
예제 #11
0
def test_all_ipa_symbols(minimal_parser_with_dialect, bipa):
    """Check that every key of ``bipa.sounds`` survives the import as a form.

    One row per sound is imported for the single language ``L1``. The forms
    found afterwards are compared with the sounds after NFC normalization of
    both sides.
    """
    rows = [["", "L1"]]
    rows.extend([s, f"<{s:}>{{bipa}}"] for s in bipa.sounds.keys())
    minimal_parser_with_dialect.parse_cells(MockSingleExcelSheet(rows))

    imported = {
        f["Form"] for f in minimal_parser_with_dialect.db.retrieve("FormTable")
    }
    expected_symbols = {
        unicodedata.normalize("NFC", f) for f in bipa.sounds.keys()
    }
    imported_symbols = {unicodedata.normalize("NFC", f) for f in imported}

    assert (
        expected_symbols == imported_symbols
    ), "Some IPA symbols got lost under import"
예제 #12
0
def test_concept_not_found(single_import_parameters, caplog):
    """Check that a row with the concept entry ``FAKE`` is reported in the log.

    The expected message is "Concept FAKE was not found".
    """
    dataset, _target, _excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]
    data_row = [
        "FAKE",
        "form",
        "phonemic",
        "orthographic",
        "f o r m",
        "-",
        "None",
        "source[10]",
        "phonetic",
        "",
    ]
    mocksheet = MockSingleExcelSheet([header, data_row])
    mocksheet.title = "new_language"

    read_single_excel_sheet(
        dataset=dataset,
        sheet=mocksheet,
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )

    expected_log = r"Concept FAKE was not found"
    assert re.search(expected_log, caplog.text)
예제 #13
0
def test_form_association(minimal_parser_with_dialect):
    """Check the forms created from a two-language, two-concept matrix.

    Each of the four cells is expected to yield one form, with language,
    concept and source IDs derived from the headers, in row-by-row order.
    """
    parser = minimal_parser_with_dialect
    cells = [
        ["", "L1", "L2"],
        ["C1", "<L1C1>{1}", "<L2C1>{1}"],
        ["C2", "<L1C2>{1}", "<L2C2>{1}"],
    ]
    parser.parse_cells(MockSingleExcelSheet(cells))

    expected_forms = [
        {
            "ID": "l1_c1",
            "Language_ID": "l1",
            "Parameter_ID": ["c1"],
            "Form": "L1C1",
            "Value": "<L1C1>{1}",
            "Source": {"l1_s1"},
        },
        {
            "ID": "l2_c1",
            "Language_ID": "l2",
            "Parameter_ID": ["c1"],
            "Form": "L2C1",
            "Value": "<L2C1>{1}",
            "Source": {"l2_s1"},
        },
        {
            "ID": "l1_c2",
            "Language_ID": "l1",
            "Parameter_ID": ["c2"],
            "Form": "L1C2",
            "Value": "<L1C2>{1}",
            "Source": {"l1_s1"},
        },
        {
            "ID": "l2_c2",
            "Language_ID": "l2",
            "Parameter_ID": ["c2"],
            "Form": "L2C2",
            "Value": "<L2C2>{1}",
            "Source": {"l2_s1"},
        },
    ]
    actual_forms = list(parser.db.retrieve("FormTable"))
    assert actual_forms == expected_forms
예제 #14
0
def test_concept_separator(single_import_parameters, caplog):
    """Check the log hint emitted when forms are matched on their concept.

    The FormTable's concept reference column is passed as the only
    ``match_form`` entry. The test expects a log message about matching on
    the concept that points to ``lexedata.report.list_homophones``.
    """
    dataset, _target, _excel, _concept_name = single_import_parameters
    concept_reference = dataset["FormTable", "parameterReference"].name
    match_form = [concept_reference]
    concepts = {}

    header = [
        "English",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "orthographic",
        "variants",
    ]
    data_row = [
        "three",
        "form",
        "f o r m",
        "auto-generated",
        "",
        "source[10]",
        "phonetic",
        "phonemic",
        "orthographic",
        "",
    ]
    sheet = MockSingleExcelSheet([header, data_row])

    # The message is looked for in the records captured at INFO level.
    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            match_form=match_form,
            concept_column="English",
        )

    expected_log = r"[mM]atch.*concept.*lexedata\.report\.list_homophones"
    assert re.search(expected_log, caplog.text)
예제 #15
0
def test_excel_messy_row(caplog):
    """Check the cognate import of a row that has no cognate set name.

    The third sheet row has an empty cognate set cell but still lists form
    F3. The test expects F3 to end up in cognate set S1 and a log message
    about row 3 having no cognate set.
    """
    # A wordlist with forms F1 and F3 in language L1 and F2 in language L2,
    # all for concept C, and empty cognate tables.
    form_rows = [
        {"ID": "F1", "Language_ID": "L1", "Form": "f1", "Parameter_ID": "C"},
        {"ID": "F2", "Language_ID": "L2", "Form": "f1", "Parameter_ID": "C"},
        {"ID": "F3", "Language_ID": "L1", "Form": "f1", "Parameter_ID": "C"},
    ]
    language_rows = [
        {"ID": "L1", "Name": "L1"},
        {"ID": "L2", "Name": "L2"},
    ]
    dataset = util.fs.new_wordlist(
        FormTable=form_rows,
        LanguageTable=language_rows,
        ParameterTable=[{"ID": "C"}],
        CognateTable=[],
        CognatesetTable=[],
    )
    # TODO: Ensure FormTable does not need a value
    dataset.add_columns("FormTable", "value")
    dataset["FormTable", "value"].required = False
    # The cognate set table gets the extra columns Status and comment.
    for extra_column in ("Status", "comment"):
        dataset.add_columns("CognatesetTable", extra_column)

    # The last row is the messy one: no cognate set name and one cell fewer.
    messy_sheet = MockSingleExcelSheet([
        ["CogSet", "Status", "L1", "L2"],
        ["S1", "valid", "F1", "F2"],
        ["", "invalid", "F3"],
    ])
    # Give each form cell a hyperlink built from its own value.
    for row, column in ((2, 3), (3, 3), (2, 4)):
        messy_sheet.cell(row, column).hyperlink = "/{:}".format(
            messy_sheet.cell(row, column).value)

    # Run the cognate import on the messy sheet.
    with caplog.at_level(logging.INFO):
        import_cognates_from_excel(messy_sheet, dataset)

    # Form F3 must have been attached to cognate set S1.
    judgements = [(j["Form_ID"], j["Cognateset_ID"])
                  for j in dataset["CognateTable"]]
    assert ("F3", "S1") in judgements

    # The messy row must be mentioned in the log.
    expected_log = "[Rr]ow 3 .* no cognate ?set .*'Status': 'invalid'"
    assert re.search(expected_log, caplog.text)
0
def test_add_concept_to_existing_form(single_import_parameters):
    """Check the report when an already imported form gets another concept.

    The same form is imported twice into the sheet ``new_language``, first
    for concept ``one`` and then for concept ``two``. The second import is
    expected to report one new concept association and no new form.
    """
    dataset, _target, _excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    # Map every concept name in the dataset to its concept ID.
    concepts = {}
    for concept in dataset["ParameterTable"]:
        concepts[concept[name_column]] = concept[id_column]

    def sheet_for(concept_entry):
        # Build a fresh one-row sheet; only the concept entry varies.
        built = MockSingleExcelSheet([
            [
                "English",
                "Form",
                "phonemic",
                "orthographic",
                "Segments",
                "procedural_comment",
                "Comment",
                "Source",
                "phonetic",
                "variants",
            ],
            [
                concept_entry,
                "form",
                "phonemic",
                "orthographic",
                "f o r m",
                "-",
                "None",
                "source[10]",
                "phonetic",
                "",
            ],
        ])
        built.title = "new_language"
        return built

    # First import: a single form in a new language.
    mocksheet = sheet_for("one")
    read_single_excel_sheet(
        dataset=dataset,
        sheet=mocksheet,
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )

    # Second import: form and language exist by now, but the concept is new
    # for this form.
    mocksheet = sheet_for("two")
    report = read_single_excel_sheet(
        dataset=dataset,
        sheet=mocksheet,
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )
    expected_report = ImportLanguageReport(
        # TODO: Actually, this isn't a new language. The difference between
        # adding forms for a language that is not in the LanguageTable yet,
        # but already has forms in the FormTable, and adding something
        # completely new, is washed out by read_single_language. The
        # interpretation of "Does this language still need to be added to
        # the LanguageTable?" for is_new_language is consistent.
        is_new_language=True,
        new=0,
        existing=0,
        skipped=0,
        concepts=1,
    )
    assert report == {"new_language": expected_report}
예제 #17
0
def test_no_concept_separator(single_import_parameters, caplog):
    """Check the import when the concept reference column has no separator.

    The separator of the FormTable's parameterReference column is removed
    and the metadata written back. The same form is then imported twice
    into the sheet ``new_language``, for the concepts ``one`` and ``three``.
    Both imports are expected to report one new form and no new concept
    association, and the log is expected to mention polysemous forms, the
    separator in the metadata, and ``lexedata.report.list_homophones``.
    """
    dataset, target, excel, concept_name = single_import_parameters
    dataset["FormTable", "parameterReference"].separator = None
    dataset.write_metadata()

    c_c_id = dataset["ParameterTable", "id"].name
    c_c_name = dataset["ParameterTable", "name"].name
    concepts = {c[c_c_name]: c[c_c_id] for c in dataset["ParameterTable"]}

    def build_sheet(concept_entry):
        # Build a fresh one-row sheet; only the concept entry varies.
        sheet = MockSingleExcelSheet([
            [
                "English",
                "Form",
                "phonemic",
                "orthographic",
                "Segments",
                "procedural_comment",
                "Comment",
                "Source",
                "phonetic",
                "variants",
            ],
            [
                concept_entry,
                "form",
                "phonemic",
                "orthographic",
                "f o r m",
                "-",
                "None",
                "source[10]",
                "phonetic",
                "",
            ],
        ])
        sheet.title = "new_language"
        return sheet

    # Both imports below are expected to produce this same report.
    one_new_form = {
        "new_language":
        ImportLanguageReport(is_new_language=True,
                             new=1,
                             existing=0,
                             skipped=0,
                             concepts=0)
    }

    # Import this single form in a new language
    assert read_single_excel_sheet(
        dataset=dataset,
        sheet=build_sheet("one"),
        entries_to_concepts=concepts,
        concept_column=concept_name,
    ) == one_new_form

    # Import it again, with a new concept; the concept must come in as a
    # new form.
    assert read_single_excel_sheet(
        dataset=dataset,
        sheet=build_sheet("three"),
        entries_to_concepts=concepts,
        concept_column=concept_name,
    ) == one_new_form

    # Test messages mention the solutions
    assert re.search(
        r"not.* polysemous forms.*separator.*FormTable.*parameterReference.*json.*lexedata\.report\.list_homophones",
        caplog.text,
    )
예제 #18
0
def test_normalize_header():
    """Check that ``normalize_header`` turns header cells into column names.

    For the header cells ``Language ID`` and ``Gloss (eng)`` the expected
    result is ``["Language_ID", "Gloss_eng"]``.
    """
    sheet = MockSingleExcelSheet([["Language ID", "Gloss (eng)"]])
    rows = list(sheet.iter_rows())
    # Without this guard the loop below would pass vacuously if the sheet
    # yielded no rows at all.
    assert rows, "The mock sheet did not yield any header row"
    for row in rows:
        assert normalize_header(row) == ["Language_ID", "Gloss_eng"]