def test_superfluous_columns2(single_import_parameters, caplog):
    """An unexpected column is ignored with an INFO message when ignore_superfluous=True."""
    dataset, target, excel, concept_name = single_import_parameters
    concepts = {"Concept": "c_id_1"}
    sheet = MockSingleExcelSheet([
        [
            "Form",
            "Segments",
            "English",
            "procedural_comment",
            "Comment",
            "Source",
            "phonetic",
            "phonemic",
            "variants",
            "orthographic",
            "superfluous",  # extra column not described by the dataset
        ],
        ["test form", "t e s t", "Concept"],
    ])
    # With ignore_superfluous=True the extra 'superfluous' column must not
    # raise; instead an INFO message reports that it will be ignored.
    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column="English",
            ignore_superfluous=True,
        )
    assert re.search(
        r"Excel sheet MockSingleExcelSheet contained unexpected columns {'superfluous'}. These columns will be ignored",
        caplog.text,
    )
def test_source_context(minimal_parser_with_dialect):
    """The ‘context’ of a source ends up in square brackets, whitespace-stripped.

    The context of a source reference – e.g. its page number – must be
    appended to the source ID in square brackets, with leading and
    trailing whitespace removed.
    """
    sheet = MockSingleExcelSheet(
        [
            ["", "L1"],
            ["C1", "<L1C1>{1:p. 34 }"],
        ]
    )
    minimal_parser_with_dialect.parse_cells(sheet)

    forms = list(minimal_parser_with_dialect.db.retrieve("FormTable"))
    assert len(forms) == 1
    expected_form = {
        "Language_ID": "l1",
        "Value": "<L1C1>{1:p. 34 }",
        "Form": "L1C1",
        "Source": {"l1_s1[p. 34]"},
        "ID": "l1_c1",
        "Parameter_ID": ["c1"],
    }
    assert forms[0] == expected_form
def test_import_error_missing_parameter_column(single_import_parameters):
    """Importing a sheet whose header lacks the concept column must fail loudly."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    # Header deliberately omits the concept column.
    header = [
        "variants",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
    ]
    sheet = MockSingleExcelSheet([header, []])
    with pytest.raises(
        AssertionError,
        match=f"Could not find concept column {concept_name} in .*",
    ):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )
def test_form_association_id_after_normalization(minimal_parser_with_dialect):
    """Composed and decomposed spellings of the same form must merge into one."""
    composed = "\xf1"  # Composed form of ñ
    decomposed = "n\u0303"  # Decomposed form of ñ
    # Sanity check: both spellings normalize to the same NFC string.
    assert unicodedata.normalize("NFC", composed) == unicodedata.normalize(
        "NFC", decomposed
    )
    sheet = MockSingleExcelSheet(
        [
            ["", "L1", "L2"],
            ["C1", f"<{composed}>{{1}}", "<L2C1>{1}"],
            ["C2", f"<{decomposed}>{{1}}", "<L2C2>{1}"],
        ]
    )
    minimal_parser_with_dialect.parse_cells(sheet)

    complete_forms = minimal_parser_with_dialect.db.retrieve("FormTable")
    forms = [(form["Language_ID"], form["Form"]) for form in complete_forms]
    n_variants = forms.count(("l1", "n\u0303")) + forms.count(("l1", "\xf1"))
    assert (
        n_variants == 1
    ), """Only one variant, either the composed or the decomposed version, should persist. (It should be the NFC one, but that is not a guarantee of the code, just an implementation detail.)"""
    parameter_ids = [form["Parameter_ID"] for form in complete_forms]
    assert [
        "c1",
        "c2",
    ] in parameter_ids, "Accordingly, there should be one form both C1 and C2 are linked to."
def test_superfluous_columns1(single_import_parameters):
    """Without ignore_superfluous, an unexpected column must raise a ValueError."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "orthographic",
        "superfluous",  # extra column not described by the dataset
    ]
    sheet = MockSingleExcelSheet([header, []])
    # The error message must both name the offending column and point the
    # user to the command-line flag that would let them proceed.
    with pytest.raises(
        ValueError,
        match=".* Excel sheet MockSingleExcelSheet contained unexpected columns {'superfluous'}.*"
        ".* use --ignore-superfluous-excel-columns .*",
    ):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column="English",
        )
def test_missing_concept(single_import_parameters, caplog):
    """A concept column name absent from the header must raise an AssertionError."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "orthographic",
        "superfluous",
    ]
    sheet = MockSingleExcelSheet([header, []])
    concept_column_name = "Concept_Column_Name"
    # AssertionError on concept column not in excel header; the message
    # must mention the missing column name.
    with pytest.raises(AssertionError, match=f".*{concept_column_name}.*"):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column=concept_column_name,
            ignore_superfluous=True,
        )
def test_missing_columns2(single_import_parameters):
    """ignore_missing must not silence errors about *unexpected* columns."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "undescribed",  # not part of the dataset description
        "superfluous2",  # not part of the dataset description
    ]
    sheet = MockSingleExcelSheet([header, []])
    with pytest.raises(
        ValueError, match=".*sheet MockSingleExcelSheet.*unexpected col.*"
    ) as ex_info:
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column="English",
            ignore_missing=True,
        )
    # Both offending columns and the remedy flag must appear in the message.
    message = ex_info.value.args[0]
    assert "undescribed" in message
    assert "superfluous2" in message
    assert "--ignore-superfluous-excel-columns" in message
def test_missing_columns1(single_import_parameters):
    """A header lacking an expected column must raise a ValueError naming it."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    # 'orthographic' is deliberately left out of the header.
    header = [
        "variants",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
    ]
    sheet = MockSingleExcelSheet([header, []])
    # The error must name the missing column and point to the flag that
    # would allow the import to proceed regardless.
    with pytest.raises(
        ValueError,
        match=".*sheet MockSingleExcelSheet.*missing col.*{[^a-z]*orthographic[^a-z]*}.*--ignore-missing-excel-columns",
    ):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )
def test_import_report_skipped(single_import_parameters):
    """A row whose concept is unknown is skipped and counted in the report."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]
    # "FAKE" is not a known concept, so this row cannot be imported.
    row = [
        "FAKE",
        "form",
        "phonemic",
        "orthographic",
        "f o r m",
        "-",
        "None",
        "source[10]",
        "phonetic",
        "",
    ]
    mocksheet = MockSingleExcelSheet([header, row])
    mocksheet.title = "new_language"
    report = read_single_excel_sheet(
        dataset=dataset,
        sheet=mocksheet,
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )
    assert report == {
        "new_language": ImportLanguageReport(
            is_new_language=True,
            new=0,
            existing=0,
            skipped=1,
            concepts=0,
        )
    }
def test_new_concept_association(single_import_parameters, caplog):
    """Linking an existing concept to an existing form is logged as an addition."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }
    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]
    # "two" is an existing concept, but its link to this form is new.
    row = ["two", "e.ta.'kɾã", "", "", "", "", "", "", "", ""]
    mocksheet = MockSingleExcelSheet([header, row])
    mocksheet.title = "ache"
    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=mocksheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )
    # Test new concept association
    assert re.search(
        r"two.*added to.*ache_one",
        caplog.text,
    )
def test_all_ipa_symbols(minimal_parser_with_dialect, bipa):
    """Every sound of the BIPA inventory must survive the import unchanged."""
    rows = [["", "L1"]] + [
        [sound, f"<{sound}>{{bipa}}"] for sound in bipa.sounds.keys()
    ]
    minimal_parser_with_dialect.parse_cells(MockSingleExcelSheet(rows))

    imported = {
        form["Form"]
        for form in minimal_parser_with_dialect.db.retrieve("FormTable")
    }
    # Compare under NFC, since the importer may normalize.
    expected = {unicodedata.normalize("NFC", s) for s in bipa.sounds.keys()}
    actual = {unicodedata.normalize("NFC", f) for f in imported}
    assert expected == actual, "Some IPA symbols got lost under import"
def test_concept_not_found(single_import_parameters, caplog):
    """An unknown concept in a data row is reported in the log.

    Fix: the original test asserted on ``caplog.text`` without raising the
    capture level, so it silently depended on the library's logger emitting
    the message at a level captured by default. Every sibling test wraps
    the call in ``caplog.at_level(logging.INFO)``; do the same here so the
    message is captured regardless of the configured logger level.
    """
    dataset, target, excel, concept_name = single_import_parameters
    c_c_id = dataset["ParameterTable", "id"].name
    c_c_name = dataset["ParameterTable", "name"].name
    concepts = {c[c_c_name]: c[c_c_id] for c in dataset["ParameterTable"]}
    mocksheet = MockSingleExcelSheet([
        [
            "English",
            "Form",
            "phonemic",
            "orthographic",
            "Segments",
            "procedural_comment",
            "Comment",
            "Source",
            "phonetic",
            "variants",
        ],
        [
            "FAKE",  # not a known concept
            "form",
            "phonemic",
            "orthographic",
            "f o r m",
            "-",
            "None",
            "source[10]",
            "phonetic",
            "",
        ],
    ])
    mocksheet.title = "new_language"
    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=mocksheet,
            entries_to_concepts=concepts,
            concept_column=concept_name,
        )
    assert re.search(r"Concept FAKE was not found", caplog.text)
def test_form_association(minimal_parser_with_dialect):
    """Each cell of the lexicon grid becomes one form with the right links."""
    sheet = MockSingleExcelSheet(
        [
            ["", "L1", "L2"],
            ["C1", "<L1C1>{1}", "<L2C1>{1}"],
            ["C2", "<L1C2>{1}", "<L2C2>{1}"],
        ]
    )
    minimal_parser_with_dialect.parse_cells(sheet)

    # Expected rows, concept-major then language-major, derived from the
    # grid above: value "<XY>{1}", form "XY", lowercase IDs and sources.
    expected = [
        {
            "Language_ID": language.lower(),
            "Value": f"<{language}{concept}>{{1}}",
            "Form": f"{language}{concept}",
            "Source": {f"{language.lower()}_s1"},
            "ID": f"{language.lower()}_{concept.lower()}",
            "Parameter_ID": [concept.lower()],
        }
        for concept in ("C1", "C2")
        for language in ("L1", "L2")
    ]
    assert list(minimal_parser_with_dialect.db.retrieve("FormTable")) == expected
def test_concept_separator(single_import_parameters, caplog):
    """Matching forms by concept logs a hint towards the homophones report."""
    dataset, target, excel, concept_name = single_import_parameters
    c_f_concept = dataset["FormTable", "parameterReference"].name
    match_form = [c_f_concept]
    concepts = dict()
    header = [
        "English",
        "Form",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "phonemic",
        "orthographic",
        "variants",
    ]
    row = [
        "three",
        "form",
        "f o r m",
        "auto-generated",
        "",
        "source[10]",
        "phonetic",
        "phonemic",
        "orthographic",
        "",
    ]
    sheet = MockSingleExcelSheet([header, row])
    # ValueError on missing column
    with caplog.at_level(logging.INFO):
        read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            entries_to_concepts=concepts,
            match_form=match_form,
            concept_column="English",
        )
    assert re.search(
        r"[mM]atch.*concept.*lexedata\.report\.list_homophones", caplog.text
    )
def test_excel_messy_row(caplog):
    """A cognate row without a cognateset ID is reported but its forms still land.

    Builds a minimal wordlist, then imports a cognate sheet in which the
    second data row has an empty 'CogSet' cell. The form in that row (F3)
    should still be attached to the preceding cognateset (S1), and a log
    message should flag the messy row.
    """
    # Build a dataset with forms F1, F2, F3 in languages L1, L2 and
    # CognateTable columns ID and Status
    dataset = util.fs.new_wordlist(
        FormTable=[
            {
                "ID": "F1",
                "Language_ID": "L1",
                "Form": "f1",
                "Parameter_ID": "C"
            },
            {
                "ID": "F2",
                "Language_ID": "L2",
                "Form": "f1",
                "Parameter_ID": "C"
            },
            {
                "ID": "F3",
                "Language_ID": "L1",
                "Form": "f1",
                "Parameter_ID": "C"
            },
        ],
        LanguageTable=[{
            "ID": "L1",
            "Name": "L1"
        }, {
            "ID": "L2",
            "Name": "L2"
        }],
        ParameterTable=[{
            "ID": "C"
        }],
        CognateTable=[],
        CognatesetTable=[],
    )
    # TODO: Ensure FormTable does not need a value
    dataset.add_columns("FormTable", "value")
    dataset["FormTable", "value"].required = False
    dataset.add_columns("CognatesetTable", "Status")
    dataset.add_columns("CognatesetTable", "comment")

    # Construct a sheet with a messy cognateset header
    messy_sheet = MockSingleExcelSheet([
        [
            "CogSet",
            "Status",
            "L1",
            "L2",
        ],
        [
            "S1",
            "valid",
            "F1",
            "F2",
        ],
        [
            "",  # messy: no cognateset ID on this row
            "invalid",
            "F3",
        ],
    ])
    # The importer recognizes form cells by their hyperlinks; attach one to
    # each form cell (row, column are 1-based: F1, F3 in column 3, F2 in 4).
    for cell in [(2, 3), (3, 3), (2, 4)]:
        messy_sheet.cell(*cell).hyperlink = "/{:}".format(
            messy_sheet.cell(*cell).value)

    # Cognate-import this dataset
    with caplog.at_level(logging.INFO):
        import_cognates_from_excel(
            messy_sheet,
            dataset,
        )

    # Check that cognateset S1 contains form F3
    assert ("F3", "S1") in [(j["Form_ID"], j["Cognateset_ID"])
                            for j in dataset["CognateTable"]]

    # Check for warning in caplog
    assert re.search("[Rr]ow 3 .* no cognate ?set .*'Status': 'invalid'",
                     caplog.text)
def test_add_concept_to_existing_form(single_import_parameters):
    """Re-importing an identical form under a new concept only adds the link."""
    dataset, target, excel, concept_name = single_import_parameters
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }

    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]

    def sheet_for(concept):
        # Identical form row; only the concept cell differs between imports.
        mock = MockSingleExcelSheet([
            header,
            [
                concept,
                "form",
                "phonemic",
                "orthographic",
                "f o r m",
                "-",
                "None",
                "source[10]",
                "phonetic",
                "",
            ],
        ])
        mock.title = "new_language"
        return mock

    # Import this single form in a new language
    read_single_excel_sheet(
        dataset=dataset,
        sheet=sheet_for("one"),
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )

    # Import it again, now both form and language should be existing, but the form has a new concept
    report = read_single_excel_sheet(
        dataset=dataset,
        sheet=sheet_for("two"),
        entries_to_concepts=concepts,
        concept_column=concept_name,
    )
    assert report == {
        "new_language": ImportLanguageReport(
            # TODO: Actually, this isn't a new language. The difference between
            # adding forms for a language that is not in the LanguageTable yet,
            # but already has forms in the FormTable, and adding something
            # completely new, is washed out by read_single_language. The
            # interpretation of “Does this language still need to be added to
            # the LanguageTable?” for is_new_language is consistent.
            is_new_language=True,
            new=0,
            existing=0,
            skipped=0,
            concepts=1,
        )
    }
def test_no_concept_separator(single_import_parameters, caplog):
    """Without a parameterReference separator, a new concept means a new form.

    When the FormTable does not allow polysemous forms (no separator on
    parameterReference), re-importing the same form under another concept
    must create a second form, and the log must explain how to enable
    polysemy and how to find the resulting homophones.
    """
    dataset, target, excel, concept_name = single_import_parameters
    dataset["FormTable", "parameterReference"].separator = None
    dataset.write_metadata()
    id_column = dataset["ParameterTable", "id"].name
    name_column = dataset["ParameterTable", "name"].name
    concepts = {
        row[name_column]: row[id_column] for row in dataset["ParameterTable"]
    }

    header = [
        "English",
        "Form",
        "phonemic",
        "orthographic",
        "Segments",
        "procedural_comment",
        "Comment",
        "Source",
        "phonetic",
        "variants",
    ]

    def sheet_for(concept):
        # Identical form row; only the concept cell differs between imports.
        mock = MockSingleExcelSheet([
            header,
            [
                concept,
                "form",
                "phonemic",
                "orthographic",
                "f o r m",
                "-",
                "None",
                "source[10]",
                "phonetic",
                "",
            ],
        ])
        mock.title = "new_language"
        return mock

    one_new_form = {
        "new_language": ImportLanguageReport(
            is_new_language=True, new=1, existing=0, skipped=0, concepts=0
        )
    }

    # Import this single form in a new language
    assert read_single_excel_sheet(
        dataset=dataset,
        sheet=sheet_for("one"),
        entries_to_concepts=concepts,
        concept_column=concept_name,
    ) == one_new_form

    # Import it again, with a new concept; it must be added as a new form.
    assert read_single_excel_sheet(
        dataset=dataset,
        sheet=sheet_for("three"),
        entries_to_concepts=concepts,
        concept_column=concept_name,
    ) == one_new_form

    # Test messages mention the solutions
    print(caplog.text)
    assert re.search(
        r"not.* polysemous forms.*separator.*FormTable.*parameterReference.*json.*lexedata\.report\.list_homophones",
        caplog.text,
    )
def test_normalize_header():
    """Header cells are normalized: spaces become underscores, parens dropped."""
    sheet = MockSingleExcelSheet([["Language ID", "Gloss (eng)"]])
    normalized = [normalize_header(row) for row in sheet.iter_rows()]
    assert normalized == [["Language_ID", "Gloss_eng"]]