def __init__(
    self,
    option_strings,
    dest,
    nargs="+",
    default=types.WorldSet(),
    help=None,
    autohelp=True,
    metavar=None,
    **kwargs,
):
    if nargs != "+":
        if (
            len(option_strings) == 1
            and nargs == "*"
            and not option_strings[0].startswith("-")
        ):
            # Mandatory argument, can be not given as default.
            pass
        else:
            raise ValueError(
                "Optional ListOrFromFile makes sense only with variable argument count ('+')"
            )
    if metavar is None:
        metavar = option_strings[0].upper()
        if option_strings[0].endswith("s"):
            metavar = metavar[:-1]
        if option_strings[0].startswith("--"):
            metavar = metavar[2:]
    if autohelp:
        help = (help or "") + (
            f" Instead of a list of individual {metavar}s on the command line,"
            f" this argument also accepts the path to a single {metavar}S.CSV file"
            " (with header row), containing the relevant IDs in the first column."
        )
        if type(default) == types.WorldSet:
            help += f" (default: All {metavar.lower()}s in the dataset)"
        help = help.strip()
    super().__init__(
        option_strings,
        dest,
        nargs=nargs,
        default=default,
        help=help,
        metavar=metavar,
        **kwargs,
    )
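
# A minimal usage sketch, assuming the class defining the __init__ above is
# the argparse action called "ListOrFromFile" named in its ValueError message
# (the class itself is not shown in this section). Registering the action
# triggers the metavar and help logic: "--languages" becomes metavar
# "LANGUAGE", and the help text advertises the LANGUAGES.CSV alternative.
def _example_list_or_from_file_action() -> None:
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--languages",
        action=ListOrFromFile,  # assumed class name, see lead-in comment
        help="Languages to include.",
    )
    # With no --languages given, the value defaults to types.WorldSet(),
    # i.e. "all languages in the dataset".
    parser.parse_args([])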
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID], t.AbstractSet[types.Parameter_ID]
    ] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the central
    concept of a cognateset, as given by the #parameterReference column of the
    CognatesetTable. A central concept not included in the primary_concepts is
    ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central
    concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True

    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without any! A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    """
    heuristic = (
        heuristic
        if heuristic is not None
        else (
            AbsenceHeuristic.CENTRALCONCEPT
            if ("CognatesetTable", "parameterReference") in dataset
            else AbsenceHeuristic.HALFPRIMARYCONCEPTS
        )
    )

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]
    ] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable", "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} "
                        "was not part of your list of primary concepts to be "
                        "included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
def root_meaning_code(
    dataset: t.Mapping[
        types.Language_ID, t.Mapping[types.Parameter_ID, t.Set[types.Cognateset_ID]]
    ],
    core_concepts: t.Set[types.Parameter_ID] = types.WorldSet(),
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
) -> t.Tuple[
    t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
    t.Mapping[types.Parameter_ID, t.Mapping[types.Cognateset_ID, int]],
]:
    """Create a root-meaning coding from cognate codes in a dataset.

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every meaning which roots are used to
    represent that meaning in each language.

    Return the alignment, and the list of slices belonging to each meaning.

    The default ascertainment is a single absence ('0'): The configuration
    where a form is absent from all languages is never observed, but always
    possible, so we add this entry for the purposes of ascertainment
    correction.

    Examples
    ========

    >>> alignment, concepts = root_meaning_code({"Language": {"Meaning": {"Cognateset 1"}}})
    >>> alignment
    {'Language': ['0', '1']}

    >>> alignment, concepts = root_meaning_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}})
    >>> sorted(concepts)
    ['m1', 'm2']
    >>> sorted(concepts["m1"])
    ['c1', 'c2']
    >>> {language: sequence[concepts["m1"]["c1"]] for language, sequence in alignment.items()}
    {'l1': '1', 'l2': '0'}
    >>> {language: sequence[concepts["m2"]["c3"]] for language, sequence in alignment.items()}
    {'l1': '?', 'l2': '1'}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?', '?'), ('0', '1', '0', '1', '1')]

    """
    roots: t.Dict[types.Parameter_ID, t.Set[types.Cognateset_ID]] = {}
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if core_concepts is None or concept in core_concepts:
                roots.setdefault(concept, set()).update(cognatesets)

    blocks = {}
    sorted_roots: t.Dict[types.Parameter_ID, t.List[types.Cognateset_ID]] = {}
    c = len(ascertainment)
    for concept in sorted(roots):
        possible_roots = sorted(roots[concept])
        sorted_roots[concept] = possible_roots
        blocks[concept] = {root: r for r, root in enumerate(possible_roots, c)}
        c += len(possible_roots)

    alignment: t.Dict[types.Language_ID, t.List[Literal["0", "1", "?"]]] = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for concept, possible_roots in sorted_roots.items():
            entries = lexicon.get(concept)
            if entries is None:
                alignment[language].extend(["?" for _ in possible_roots])
            else:
                concept_sequence: t.List[Literal["0", "1", "?"]] = [
                    "1" if k in entries else "0" for k in possible_roots
                ]
                alignment[language].extend(concept_sequence)
    return alignment, blocks
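
# A small sketch of the return structure: the second return value maps each
# concept to {root: column index}, where indices start after the
# ascertainment prefix (a single '0' by default), matching the doctests above.
def _example_root_meaning_code_blocks() -> None:
    alignment, blocks = root_meaning_code(
        {"l1": {"m1": {"c1"}}, "l2": {"m1": {"c2"}}}
    )
    # Column 0 is the ascertainment '0'; the roots of "m1" occupy columns 1-2.
    assert blocks == {"m1": {"c1": 1, "c2": 2}}
    assert alignment == {"l1": ["0", "1", "0"], "l2": ["0", "0", "1"]}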
def coverage_report(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    min_percentage: float = 0.0,
    with_concept: t.Iterable[types.Parameter_ID] = set(),
    missing: Missing = Missing.KNOWN,
    only_coded: bool = True,
) -> t.List[t.List[str]]:
    coded: t.Container[types.Form_ID]
    if only_coded:
        try:
            c_j_form = dataset["CognateTable", "formReference"].name
        except KeyError:
            cli.Exit.NO_COGNATETABLE(
                message="You requested that I only count cognate coded forms, "
                "but you have no CognateTable containing judgements."
            )
        coded = {judgement[c_j_form] for judgement in dataset["CognateTable"]}
    else:
        coded = types.WorldSet()

    languages: t.Dict[types.Language_ID, str] = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        c_l_name = dataset["LanguageTable", "name"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language[c_l_name]
    except KeyError:
        pass

    concepts: t.DefaultDict[
        types.Language_ID, t.Counter[types.Parameter_ID]
    ] = t.DefaultDict(t.Counter)
    c_f_id = dataset["FormTable", "id"].name
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], form[c_language])
        if form[c_f_id] not in coded:
            continue
        if missing == Missing.IGNORE and (not form[c_form] or form[c_form] == "-"):
            continue
        if missing == Missing.KNOWN and not form[c_form]:
            continue
        c: types.Parameter_ID
        for c in util.ensure_list(form[c_concept]):
            concepts[form[c_language]][c] += 1

    # Load the primary concepts and the total number of concepts.
    primary_concepts: t.Container[types.Parameter_ID]
    try:
        c_c_id = dataset["ParameterTable", "id"].name
        primary_concepts = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
        total_number_concepts = len(primary_concepts)
    except KeyError:
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts "
            "couldn't be loaded. Loading all concepts."
        )
        primary_concepts = types.WorldSet()
        try:
            total_number_concepts = len(list(dataset["ParameterTable"]))
        except KeyError:
            total_number_concepts = len(
                set.union(*(set(cs) for cs in concepts.values()))
            )

    data_languages = []
    for language, name in languages.items():
        conceptlist = concepts[language]
        try:
            synonyms = sum(conceptlist.values()) / len(conceptlist)
        except ZeroDivisionError:
            synonyms = float("nan")
        # Percentage of all concepts covered by this language.
        conceptlist_percentage = len(conceptlist) / total_number_concepts
        if conceptlist_percentage * 100 < min_percentage:
            continue

        if not all(c in conceptlist for c in with_concept):
            continue

        # Count primary concepts.
        primary_count = sum(1 for c in conceptlist if c in primary_concepts)

        data_languages.append(
            [
                language,
                name,
                primary_count,
                conceptlist_percentage,
                synonyms,
            ]
        )
    return data_languages
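
# A hedged usage sketch: run the report on a small in-memory wordlist built
# with the same util.fs.new_wordlist helper the doctests elsewhere use.
# only_coded=False avoids the CognateTable requirement; without a "Primary"
# column in the ParameterTable, all concepts count as primary (with a
# warning), as implemented above.
def _example_coverage_report() -> None:
    ds = util.fs.new_wordlist(
        FormTable=[
            {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
            {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "y"},
        ]
    )
    for language, name, primary, coverage, synonyms in coverage_report(
        ds, only_coded=False
    ):
        print(language, name, primary, f"{coverage:.0%}", synonyms)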
def test_segment_to_cognateset_no_slices(caplog):
    ds = new_wordlist(
        FormTable=[
            {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f"]},
            {"ID": "f2", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f"]},
            {"ID": "f3", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f", "i"]},
            {"ID": "f4", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f", "i"]},
        ],
        CognateTable=[],
    )
    ds.remove_columns("CognateTable", "Segment_Slice", "Alignment")
    ds.write(
        CognateTable=[
            {"ID": "j1", "Form_ID": "f1", "Cognateset_ID": "s1"},
            {"ID": "j2", "Form_ID": "f3", "Cognateset_ID": "s1"},
            {"ID": "j3", "Form_ID": "f4", "Cognateset_ID": "s1"},
            {"ID": "j4", "Form_ID": "f4", "Cognateset_ID": "s2"},
        ],
    )
    with caplog.at_level(logging.WARNING):
        segments = segment_to_cognateset(ds, types.WorldSet())
    assert segments == {
        "f1": [{"s1"}],
        "f2": [set()],
        "f3": [{"s1"}, {"s1"}],
        "f4": [{"s1", "s2"}, {"s1", "s2"}],
    }
def test_segment_to_cognateset(caplog):
    ds = new_wordlist(
        FormTable=[
            {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f"]},
            {"ID": "f2", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f"]},
            {"ID": "f3", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["f", "i"]},
            {"ID": "f4", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "f", "Segments": ["t", "e", "s", "t"]},
        ],
        CognateTable=[
            {"ID": "j1", "Form_ID": "f1", "Cognateset_ID": "s1",
             "Segment_Slice": "1"},
            {"ID": "j2", "Form_ID": "f3", "Cognateset_ID": "s1",
             "Segment_Slice": "2"},
            {"ID": "j3", "Form_ID": "f4", "Cognateset_ID": "s1",
             "Segment_Slice": ["2:3"]},
            {"ID": "j4", "Form_ID": "f4", "Cognateset_ID": "s2",
             "Segment_Slice": "2"},
        ],
    )
    with caplog.at_level(logging.WARNING):
        segments = segment_to_cognateset(ds, types.WorldSet())
    assert segments == {
        "f1": [{"s1"}],
        "f2": [set()],
        "f3": [set(), {"s1"}],
        "f4": [set(), {"s1", "s2"}, {"s1"}, set()],
    }
def create_singletons(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Create singleton cognate judgements for forms that don't have cognate judgements.

    Depending on by_segment, singletons are created for every range of
    segments that is not in any cognate set yet (True) or just for every form
    where no segment is in any cognate set (False).
    """
    forms = util.cache_table(dataset)
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    if not dataset.get(("CognatesetTable", "Status_Column")):
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run "
            "`lexedata.edit.add_status_column` in default mode or with "
            "table-names CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        c_s_id = "id"
        c_s_name = "name"
        all_cognatesets = {
            id: types.Judgement({"id": id, "name": id})
            for id in {j[c_j_cogset] for j in dataset["CognateTable"]}
        }
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        forms_and_segments = uncoded_forms(
            forms.values(), {j[c_j_form] for j in all_judgements}
        )
    for form, segment_indices in forms_and_segments:
        i = 1
        singleton_id = f"X_{form}_{i:d}"
        while singleton_id in all_cognatesets:
            i += 1
            singleton_id = f"X_{form}_{i:d}"
        all_cognatesets[singleton_id] = types.CogSet({})
        properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            for column in dataset["CognatesetTable"].tableSchema.columns:
                all_cognatesets[singleton_id][column.name] = properties.get(
                    column.name
                )
        except KeyError:
            pass
        judgement = types.Judgement({})
        properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(segment_indices),
            c_j_alignment: [forms[form]["segments"][i] for i in segment_indices],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = properties.get(column.name)
        all_judgements.append(judgement)
    return all_cognatesets.values(), all_judgements
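
# A hedged end-to-end sketch: generate singletons for a dataset on disk and
# write both tables back. The metadata path is hypothetical, and loading via
# pycldf.Wordlist.from_metadata assumes the types.Wordlist expected above is
# compatible with a plain pycldf wordlist; the status string is arbitrary.
def _example_create_singletons() -> None:
    import pycldf

    dataset = pycldf.Wordlist.from_metadata(
        "Wordlist-metadata.json"  # hypothetical path
    )
    all_cognatesets, all_judgements = create_singletons(
        dataset, status="automatic singleton", by_segment=False
    )
    dataset.write(
        CognatesetTable=list(all_cognatesets),
        CognateTable=list(all_judgements),
    )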
def segment_to_cognateset(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    cognatesets: t.Container[types.Cognateset_ID],
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Form_ID, t.List[t.Set[types.Cognateset_ID]]]:
    # Required fields
    c_cognate_cognateset = dataset.column_names.cognates.cognatesetReference
    c_cognate_id = dataset.column_names.cognates.id
    c_cognate_form = dataset.column_names.cognates.formReference
    c_cognate_slice = dataset.column_names.cognates.segmentSlice

    forms = util.cache_table(dataset)
    cognateset_cache: t.Container[types.Cognateset_ID]
    if "CognatesetTable" in dataset:
        c_s_id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[c_s_id]
            for cognateset in dataset["CognatesetTable"]
            if cognatesets is None or cognateset["ID"] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = types.WorldSet()
        else:
            cognateset_cache = cognatesets

    which_segment_belongs_to_which_cognateset: t.Mapping[
        types.Form_ID, t.List[t.Set[types.Cognateset_ID]]
    ] = {
        f: [set() for _ in form["segments"]]
        for f, form in forms.items()
        if form["form"] and form["form"].strip() and form["form"].strip() != "-"
    }
    for j in dataset["CognateTable"]:
        if (
            j[c_cognate_form] in forms
            and j[c_cognate_cognateset] in cognateset_cache
        ):
            form = forms[j[c_cognate_form]]
            if j[c_cognate_form] not in which_segment_belongs_to_which_cognateset:
                continue
            if j.get(c_cognate_slice):
                try:
                    segments_judged = list(parse_segment_slices(j[c_cognate_slice]))
                except ValueError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice "
                        f"{','.join(j[c_cognate_slice])} has start after end."
                    )
                    continue
            else:
                segments_judged = list(range(len(form["segments"])))
            old_s = None
            for s in segments_judged:
                if old_s is not None and old_s + 1 != s:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment {s + 1} follows "
                        f"segment {old_s + 1}, so the morpheme is non-contiguous."
                    )
                try:
                    cognatesets_at_segment = which_segment_belongs_to_which_cognateset[
                        j[c_cognate_form]
                    ][s]
                except IndexError:
                    logger.warning(
                        f"In judgement {j[c_cognate_id]}, segment slice "
                        f"{','.join(j[c_cognate_slice])} points outside the valid "
                        f"range 1:{len(form['segments'])}."
                    )
                    continue
                cognatesets_at_segment.add(j[c_cognate_cognateset])
                # Remember the previous index so the contiguity check above
                # can fire; without this update, old_s would stay None.
                old_s = s
    return which_segment_belongs_to_which_cognateset
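
# A small sketch of the cognateset filter, using the util.fs.new_wordlist
# helper seen in the tests and doctests above: when the dataset has no
# CognatesetTable, the ``cognatesets`` container is used directly, so
# judgements for other cognatesets leave the per-segment sets untouched.
def _example_segment_to_cognateset_filtered() -> None:
    ds = util.fs.new_wordlist(
        FormTable=[
            {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1",
             "Form": "fi", "Segments": ["f", "i"]},
        ],
        CognateTable=[
            {"ID": "j1", "Form_ID": "f1", "Cognateset_ID": "s1",
             "Segment_Slice": "1:2"},
            {"ID": "j2", "Form_ID": "f1", "Cognateset_ID": "s2",
             "Segment_Slice": "1"},
        ],
    )
    segments = segment_to_cognateset(ds, cognatesets={"s1"})
    # The judgement for s2 is skipped because s2 is not in the filter.
    assert segments == {"f1": [{"s1"}, {"s1"}]}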