def check_na_form_has_no_alternative(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    logger: cli.logging.Logger = cli.logger,
):
    """Check that NA forms (form = "-") have no alternative forms.

    A form marked "-" claims the language has no word for the concept, so no
    other form may exist for the same language and parameter reference.
    """
    valid = True
    c_f_id = dataset["FormTable", "id"].name
    c_f_form = dataset["FormTable", "form"].name
    c_f_concept = dataset["FormTable", "parameterReference"].name
    c_f_language = dataset["FormTable", "languageReference"].name

    forms_by_concepts: t.Dict[types.Parameter_ID, t.Set[types.Form_ID]] = t.DefaultDict(
        set
    )
    for f in dataset["FormTable"]:
        for c in util.ensure_list(f[c_f_concept]):
            forms_by_concepts[c].add(f[c_f_id])

    forms_to_languages = t.DefaultDict(set)
    for f in dataset["FormTable"]:
        forms_to_languages[f[c_f_language]].add(f[c_f_id])

    na_forms = [f for f in dataset["FormTable"] if f[c_f_form] == "-"]
    for form in na_forms:
        for c in util.ensure_list(form[c_f_concept]):
            if forms_by_concepts[c].intersection(
                forms_to_languages[form[c_f_language]]
            ) != {form[c_f_id]}:
                log_or_raise(
                    message=f"Non-empty forms exist for the NA form {form[c_f_id]} "
                    "with identical parameter and language reference.",
                    log=logger,
                )
                valid = False
    return valid
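# A minimal usage sketch (assumption: the metadata path "Wordlist-metadata.json"
# is hypothetical; the check only needs a FormTable with language and parameter
# references):

def example_check_na_forms():
    import pycldf

    # Load a CLDF wordlist from its metadata file (hypothetical path).
    dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
    # False means some NA form ("-") coexists with another form for the same
    # language and concept; details are logged per offending form.
    if not check_na_form_has_no_alternative(dataset):
        cli.logger.error("Dataset has NA forms with non-NA alternatives.")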
def test_toexcel_filtered(cldf_wordlist, working_and_nonworking_bibfile, caplog):
    dataset, url = working_and_nonworking_bibfile(cldf_wordlist)
    E = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(url),
    )
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(), key=lambda x: x["name"]
    )
    judgements = [
        {"formReference": f["id"], "cognatesetReference": parameter}
        for f in forms.values()
        for parameter in util.ensure_list(f["parameterReference"])
    ]
    # Restrict the export to a single concept, so the sheet contains at most a
    # header row and one concept row.
    parameters = [
        c
        for n, c in util.cache_table(dataset, "ParameterTable").items()
        if n == "Woman"
    ]
    with caplog.at_level(logging.WARNING):
        E.create_excel(
            rows=parameters, judgements=judgements, forms=forms, languages=languages
        )
    assert len(list(E.ws.iter_rows())) in {0, 2}
def test_cell_comments_export():
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json"
    )
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")

    E = MatrixExcelWriter(dataset, database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(), key=lambda x: x["name"]
    )
    judgements = [
        {"formReference": f["id"], "cognatesetReference": parameter}
        for f in forms.values()
        for parameter in util.ensure_list(f["parameterReference"])
    ]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(
        rows=parameters, judgements=judgements, forms=forms, languages=languages
    )

    # Exhaust the iterator so `col` ends up bound to the last column.
    for col in E.ws.iter_cols():
        pass
    assert (
        col[-1].comment and col[-1].comment.content
    ), "Last row of last column should contain a form, with a comment attached to it."
    assert (
        col[-1].comment.content == "A Comment!"
    ), "Comment should match the comment from the form table"
def test_toexcel_runs(cldf_wordlist, working_and_nonworking_bibfile):
    dataset, filename = working_and_nonworking_bibfile(cldf_wordlist)
    E = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(filename),
    )
    forms = util.cache_table(dataset)
    languages = sorted(
        util.cache_table(dataset, "LanguageTable").values(), key=lambda x: x["name"]
    )
    judgements = [
        {"formReference": f["id"], "cognatesetReference": parameter}
        for f in forms.values()
        for parameter in util.ensure_list(f["parameterReference"])
    ]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(
        rows=parameters, judgements=judgements, forms=forms, languages=languages
    )
    _, out_filename = tempfile.mkstemp(".xlsx", "cognates")
    E.wb.save(filename=out_filename)
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID], t.AbstractSet[types.Parameter_ID]
    ] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the central
    concepts of a cognateset, as given by the #parameterReference column of the
    CognatesetTable. A central concept not included in the primary_concepts is
    ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central
    concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True

    NOTE: This function cannot guarantee that every cognateset has at least one
    relevant concept; there may be cognatesets without any! A cognateset with 0
    relevant concepts will always be included, because 0 is at least half of 0.
""" heuristic = (heuristic if heuristic is not None else (AbsenceHeuristic.CENTRALCONCEPT if ("CognatesetTable", "parameterReference") in dataset else AbsenceHeuristic.HALFPRIMARYCONCEPTS)) relevant_concepts: t.MutableMapping[ types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set) if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS: c_f = dataset["CognateTable", "formReference"].name c_s = dataset["CognateTable", "cognatesetReference"].name concepts = util.cache_table( dataset, "FormTable", {"concepts": dataset["FormTable", "parameterReference"].name}, ) for j in dataset["CognateTable"]: form = concepts[j[c_f]] for concept in util.ensure_list(form["concepts"]): relevant_concepts[j[c_s]].add(concept) elif heuristic is AbsenceHeuristic.CENTRALCONCEPT: c_cognateset_concept = dataset["CognatesetTable", "parameterReference"].name c_id = dataset["CognatesetTable", "id"].name for c in dataset["CognatesetTable"]: for concept in util.ensure_list(c[c_cognateset_concept]): if concept not in primary_concepts: logger.warning( f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored." ) else: relevant_concepts[c[c_id]].add(concept) else: raise TypeError( f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic." ) return relevant_concepts
def coverage_report(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    min_percentage: float = 0.0,
    with_concept: t.Iterable[types.Parameter_ID] = set(),
    missing: Missing = Missing.KNOWN,
    only_coded: bool = True,
) -> t.List[t.List[str]]:
    coded: t.Container[types.Form_ID]
    if only_coded:
        try:
            c_j_form = dataset["CognateTable", "formReference"].name
        except KeyError:
            cli.Exit.NO_COGNATETABLE(
                message="You requested that I only count cognate coded forms, "
                "but you have no CognateTable containing judgements."
            )
        coded = {judgement[c_j_form] for judgement in dataset["CognateTable"]}
    else:
        coded = types.WorldSet()

    languages: t.Dict[types.Language_ID, str] = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        c_l_name = dataset["LanguageTable", "name"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language[c_l_name]
    except KeyError:
        pass

    concepts: t.DefaultDict[
        types.Language_ID, t.Counter[types.Parameter_ID]
    ] = t.DefaultDict(t.Counter)
    c_f_id = dataset["FormTable", "id"].name
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], form[c_language])
        if form[c_f_id] not in coded:
            continue
        if missing == Missing.IGNORE and (not form[c_form] or form[c_form] == "-"):
            continue
        if missing == Missing.KNOWN and not form[c_form]:
            continue
        c: types.Parameter_ID
        for c in util.ensure_list(form[c_concept]):
            concepts[form[c_language]][c] += 1

    # Load primary concepts and the total number of concepts.
    primary_concepts: t.Container[types.Parameter_ID]
    try:
        c_c_id = dataset["ParameterTable", "id"].name
        primary_concepts = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
        total_number_concepts = len(primary_concepts)
    except KeyError:
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts "
            "couldn't be loaded. Loading all concepts."
        )
        primary_concepts = types.WorldSet()
        try:
            total_number_concepts = len(list(dataset["ParameterTable"]))
        except KeyError:
            total_number_concepts = len(
                set.union(*(set(cs) for cs in concepts.values()))
            )

    data_languages = []
    for language, name in languages.items():
        conceptlist = concepts[language]
        try:
            synonyms = sum(conceptlist.values()) / len(conceptlist)
        except ZeroDivisionError:
            synonyms = float("nan")
        # Percentage of all concepts covered by this language
        conceptlist_percentage = len(conceptlist) / total_number_concepts
        if conceptlist_percentage * 100 < min_percentage:
            continue
        if not all(c in conceptlist for c in with_concept):
            continue
        # Count primary concepts
        primary_count = 0
        for c in conceptlist:
            if c in primary_concepts:
                primary_count += 1
        data_languages.append(
            [
                language,
                name,
                primary_count,
                conceptlist_percentage,
                synonyms,
            ]
        )
    return data_languages
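# A minimal usage sketch (assumptions: the metadata path is hypothetical and the
# dataset has a CognateTable, since only_coded defaults to True). Each returned
# row is [language ID, language name, primary-concept count, coverage fraction,
# average synonym count]:

def example_coverage_report():
    import pycldf

    dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
    for language, name, primary, coverage, synonyms in coverage_report(
        dataset, min_percentage=50.0, missing=Missing.KNOWN
    ):
        print(f"{name} ({language}): {coverage:.0%} coverage, {synonyms:.2f} synonyms")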
def forms_to_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns
        if c.separator
    }

    # Prepare the header for the TSV output. The first column must be named ID
    # and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())
    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select forms and cognates given the restriction of languages and
    # concepts, and cognatesets respectively.
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
            ensure_list(form["parameterReference"])
        ):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue
                form[c] = d.join(str(e) for e in form[c])
            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments "
                    "using `lexedata.edit.add_segments`.",
                    form["id"],
                )
            # 2. No tabs or newlines in entries
            for c, v in form.items():
                if type(v) == str:
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or "
                            "'\\!n', which I will introduce for escaping tabs and "
                            "newlines for edictor. These characters will not "
                            "survive the back-import."
                        )
                    form[c] = form[c].replace("\t", "\\!t").replace("\n", "\\!n")
            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger
    )

    judgements_about_form: t.Mapping[
        types.Form_ID, t.Tuple[t.List[str], t.List[int]]
    ] = {
        id: ([f"({s})" for s in form["segments"]], [])
        for id, form in forms.items()
    }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
            j["cognatesetReference"]
        ):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(
                        segment_slices=j["segmentSlice"], enforce_ordered=False
                    )
                )
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"]))
                )
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"]))
                )
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[j["formReference"]]
            segment_start, segment_end = min(segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
    return forms, judgements_about_form, cognateset_cache
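# A minimal usage sketch (assumptions: the metadata path is hypothetical, the
# dataset has a CognatesetTable, and all languages and cognate sets are kept by
# passing WorldSet instances as filters):

def example_forms_to_tsv():
    import pycldf

    dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
    c_id = dataset["ParameterTable", "id"].name
    all_concepts = {row[c_id] for row in dataset["ParameterTable"]}
    forms, judgements_about_form, cognateset_numbers = forms_to_tsv(
        dataset,
        languages=types.WorldSet(),
        concepts=all_concepts,
        cognatesets=types.WorldSet(),
    )
    # forms: filtered, normalized form rows keyed by form ID
    # judgements_about_form: (global alignment, cognateset list) per form ID
    # cognateset_numbers: 1-based integer ID per cognateset, as Edictor expects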
def create_singletons(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of segments
    that is not in any cognate set yet (True) or just for every form where no
    segment is in any cognate set (False).
    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run "
            "`lexedata.edit.add_status_column` in default mode or with "
            "table-names CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.CogSet({"id": id, "name": id})
            for id in {j[c_j_cogset] for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form] for j in all_judgements}
        )
    for form, slice in forms_and_segments:
        # Find a fresh singleton cognateset ID of the shape X_<form>_<n>.
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name
                )
        except KeyError:
            pass
        judgement = types.Judgement({})
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(slice),
            c_j_alignment: [forms[form]["segments"][i] for i in slice],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements
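# A minimal usage sketch (assumptions: the metadata path and status label are
# illustrative; the enlarged tables are written back with pycldf's
# Dataset.write):

def example_create_singletons():
    import pycldf

    dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
    cognatesets, judgements = create_singletons(
        dataset, status="automatic singleton", by_segment=False
    )
    dataset.write(
        CognatesetTable=list(cognatesets), CognateTable=list(judgements)
    )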
    type=str,
    default="https://example.org/lexicon/{:}",
    help="A template string for URLs pointing to individual forms. For example, "
    "to point to lexibank, you would use https://lexibank.clld.org/values/{:}. "
    "(default: https://example.org/lexicon/{:})",
)
args = parser.parse_args()
logger = cli.setup_logging(args)

dataset = pycldf.Wordlist.from_metadata(args.metadata)

E = MatrixExcelWriter(
    dataset,
    database_url=args.url_template,
    logger=logger,
)
forms = util.cache_table(dataset)
languages = sorted(
    util.cache_table(dataset, "LanguageTable").values(), key=lambda x: x["name"]
)
judgements = [
    {"formReference": f["id"], "cognatesetReference": parameter}
    for f in forms.values()
    for parameter in util.ensure_list(f["parameterReference"])
]
parameters = util.cache_table(dataset, "ParameterTable").values()
E.create_excel(
    rows=parameters, judgements=judgements, forms=forms, languages=languages
)
E.wb.save(filename=args.excel)