def check_na_form_has_no_alternative(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
):
    """Check that NA forms ("-") have no alternative form.

    For every form whose #form value is "-" (the convention for “concept not
    available in this language”), verify that no other form shares both its
    parameter and its language reference. Every violation is reported through
    ``log_or_raise``.

    Returns True if no violation was found, False otherwise.
    """
    col_id = dataset["FormTable", "id"].name
    col_form = dataset["FormTable", "form"].name
    col_concept = dataset["FormTable", "parameterReference"].name
    col_language = dataset["FormTable", "languageReference"].name

    # Index all form IDs by concept and by language, and remember the NA
    # forms, in a single pass over the FormTable.
    ids_by_concept: t.Dict[types.Parameter_ID,
                           t.Set[types.Form_ID]] = t.DefaultDict(set)
    ids_by_language = t.DefaultDict(set)
    na_forms = []
    for row in dataset["FormTable"]:
        for concept in util.ensure_list(row[col_concept]):
            ids_by_concept[concept].add(row[col_id])
        ids_by_language[row[col_language]].add(row[col_id])
        if row[col_form] == "-":
            na_forms.append(row)

    ok = True
    for form in na_forms:
        for concept in util.ensure_list(form[col_concept]):
            # All forms that share this NA form's concept *and* language:
            # the NA form itself must be the only one.
            same_slot = ids_by_concept[concept] & ids_by_language[
                form[col_language]]
            if same_slot != {form[col_id]}:
                log_or_raise(
                    message=
                    f"Non empty forms exist for the NA form {form[col_id]} with identical parameter and language reference",
                    log=logger,
                )
                ok = False
    return ok
def multistate_code(
    dataset: t.Mapping[types.Language_ID, t.Mapping[types.Parameter_ID,
                                                    t.Set[types.Cognateset_ID]]],
) -> t.Tuple[t.Mapping[types.Language_ID, t.Sequence[t.Set[int]]],
             t.Sequence[int]]:
    """Create a multistate root-meaning coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a
    multistate alignment from it that lists for every meaning which roots are
    used to represent that meaning in each language.

    Also return the number of roots for each concept.

    Examples
    ========

    >>> alignment, lengths = multistate_code({"Language": {"Meaning": {"Cognateset 1"}}})
    >>> alignment == {'Language': [{0}]}
    True
    >>> lengths == [1]
    True

    >>> alignment, statecounts = multistate_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}})
    >>> alignment["l1"][1]
    set()
    >>> alignment["l2"][1] == {0, 1}
    True
    >>> statecounts
    [2, 2]

    """
    # Gather, for every concept, all cognate sets attested in any language.
    cognatesets_by_concept: t.Dict[types.Parameter_ID,
                                   t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for lexicon in dataset.values():
        for concept, cognatesets in lexicon.items():
            cognatesets_by_concept[concept].update(cognatesets)

    # Fix a deterministic order: concepts sorted by ID, cognate sets sorted
    # within each concept. The index of a cognate set in its concept's list
    # is its multistate value.
    ordered: t.Mapping[types.Parameter_ID,
                       t.Sequence[types.Cognateset_ID]] = {
                           concept: sorted(cogsets)
                           for concept, cogsets in sorted(
                               cognatesets_by_concept.items())
                       }
    statecounts: t.List[int] = [len(cogsets) for cogsets in ordered.values()]

    coding: t.MutableMapping[types.Language_ID,
                             t.List[t.Set[int]]] = t.DefaultDict(list)
    for language, lexicon in dataset.items():
        for concept, possible in ordered.items():
            # One cell per concept; empty set when the language has no entry.
            cell: t.Set[int] = set()
            for cognateset in lexicon.get(concept) or ():
                cell.add(possible.index(cognateset))
            coding[language].append(cell)
    return coding, statecounts
def list_homophones(dataset: pycldf.Dataset,
                    out: io.TextIOBase,
                    logger: cli.logging.Logger = cli.logger) -> None:
    """Write a report of all homophone groups in the dataset to `out`.

    Forms are grouped by language and transcription. Each group with more
    than one meaning is written out together with its CLICS classification:
    "Connected" if its concepts form a connected subgraph of the CLICS
    network, "Unconnected" otherwise, "Unknown" if fewer than two concepts
    could be mapped to CLICS nodes.
    """
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning(
            "Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    try:
        c_concepticon = dataset["ParameterTable",
                                "concepticonReference"].name
    except KeyError:
        # BUG FIX: error message used to misspell “ParameterTable”.
        cli.Exit.INVALID_DATASET(
            "This script requires a column concepticonReference in ParameterTable. "
            "Please run add_concepticon.py")
    # Map concept IDs to their Concepticon references.
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    # language → transcription → {(concept, …, form id)}: tuples carry all
    # concepts of the form, with the form ID as last element.
    homophones: t.DefaultDict[str, t.DefaultDict[str, t.Set[t.Tuple[
        str, str]]]] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        # Skip NA forms and forms without transcription.
        if form[f_form] == "-" or form[f_form] is None:
            continue
        if isinstance(form[f_concept], list):
            homophones[form[f_lang]][form[f_form]].add(
                tuple(form[f_concept]) + (form[f_id], ))
        else:
            homophones[form[f_lang]][form[f_form]].add(
                (form[f_concept], form[f_id]))

    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            # BUG FIX: the previous `for concept, _ in meanings` raised
            # ValueError for multi-concept forms, whose tuples have more than
            # two elements. All elements but the last are concepts.
            clics_nodes = {
                concepticon.get(concept)
                for meaning in meanings for concept in meaning[:-1]
            }
            if None in clics_nodes:
                x = " (but at least one concept not found):"
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)
def collect_forms_by_row(
    self,
    judgements: t.Iterable[types.Judgement],
    rows: t.Iterable[types.Row_ID],
) -> t.Mapping[types.Cognateset_ID, t.Mapping[
        types.Form_ID, t.Sequence[types.Judgement]]]:
    "Collect forms by row object (ie. concept or cognate set)"
    # NOTE(review): the `rows` parameter is never read in this body — confirm
    # whether filtering the judgements by `rows` was intended here.
    # Group judgements first by their cognate set, then by the judged form.
    all_forms: t.MutableMapping[types.Cognateset_ID, t.Mapping[
        types.Form_ID, t.List[types.Judgement]]] = t.DefaultDict(
            lambda: t.DefaultDict(list))
    for judgement in judgements:
        all_forms[judgement["cognatesetReference"]][
            judgement["formReference"]].append(judgement)
    return all_forms
def read_structure_dataset(
    dataset: pycldf.StructureDataset, logger: cli.logging.Logger = cli.logger
) -> t.MutableMapping[types.Language_ID, t.MutableMapping[types.Parameter_ID,
                                                          t.Set]]:
    """Read a CLDF StructureDataset into a {language: {feature: codes}} map.

    The code of each value row is taken from the #codeReference column when
    the dataset has one, falling back to the raw #value column otherwise.
    Rows whose code is empty are skipped.
    """
    columns = dataset.column_names
    code_source = columns.values.codeReference or columns.values.value
    result: t.MutableMapping[types.Language_ID, t.MutableMapping[
        types.Parameter_ID, t.Set]] = t.DefaultDict(lambda: t.DefaultDict(set))
    for value_row in dataset["ValueTable"]:
        code = value_row[code_source]
        if code:
            language = value_row[columns.values.languageReference]
            feature = value_row[columns.values.parameterReference]
            result[language][feature].add(code)
    return result
def test_no_defaultdict_instantiation(self):
    # typing.DefaultDict may not be instantiated, neither bare nor
    # subscripted. The constructors are wrapped in lambdas so that the
    # subscription itself is also evaluated inside assertRaises, exactly as
    # in the original spelled-out statements.
    for instantiate in (
            lambda: typing.DefaultDict(),
            lambda: typing.DefaultDict[KT, VT](),
            lambda: typing.DefaultDict[str, int](),
    ):
        with self.assertRaises(TypeError):
            instantiate()
def list_homophones(dataset: pycldf.Dataset) -> None:
    """Print every homophone group in the dataset with its CLICS status.

    Forms are grouped by language and transcription; each group with more
    than one (concept, form ID) pair is printed, classified by whether its
    concepts form a connected subgraph of the CLICS network ("Connected" /
    "Unconnected"), or "Unknown" when fewer than two concepts map to CLICS.
    """
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning(
            "Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    # Map concept IDs to their Concepticon references.
    c_id = dataset["ParameterTable", "id"].name
    c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    concepticon = {
        row[c_id]: row[c_concepticon]
        for row in dataset["ParameterTable"]
    }

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    # language → transcription → {(concept, form id)}
    homophones: t.DefaultDict[str, t.DefaultDict[str, t.Set[t.Tuple[
        str, str]]]] = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"]:
        homophones[row[f_lang]][row[f_form]].add((row[f_concept], row[f_id]))

    for lang, by_transcription in homophones.items():
        for transcription, meanings in by_transcription.items():
            if len(meanings) < 2:
                continue
            clics_nodes = [concepticon.get(c) for c, _ in meanings]
            if None in clics_nodes:
                clics_nodes = [node for node in clics_nodes if node]
                x = "(but at least one concept not found)"
            else:
                x = ""
            if len(clics_nodes) <= 1:
                print("Unknown:", lang, transcription, meanings)
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                print("Connected:", x, lang, transcription, meanings)
            else:
                print("Unconnected:", x, lang, transcription, meanings)
def coverage_report_concepts(dataset: pycldf.Dataset, ):
    """Count for each primary concept how many languages attest it.

    Returns a list of [concept ID, number of languages] pairs, restricted to
    the dataset's primary concepts (concepts with a truthy 'Primary' column
    in the ParameterTable; if that column is missing, all concepts count as
    primary).
    """
    # TODO: This assumes the existence of a ParameterTable. The script should
    # still work if none exists. TODO: In addition, we decided to not formalize
    # primary concepts, so this should instead depend on a command line
    # argument, either supplementing or replacing --with-concepts.
    c_c_id = dataset["ParameterTable", "id"].name
    try:
        # Load primary concepts if possible. A set, because it is only ever
        # used for membership tests.
        primary_concepts = {
            c[c_c_id]
            for c in dataset["ParameterTable"] if c["Primary"]
        }
    except KeyError:
        # BUG FIX: warning used to misspell “ParameterTable”.
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts couldn't be loaded. "
            "Loading all concepts.")
        primary_concepts = {c[c_c_id] for c in dataset["ParameterTable"]}

    # get the foreign keys pointing to the required tables
    foreign_key_parameter = ""
    for foreign_key in dataset["FormTable"].tableSchema.foreignKeys:
        if foreign_key.reference.resource == dataset["ParameterTable"].url:
            foreign_key_parameter = foreign_key.columnReference[0]

    foreign_key_language = ""
    for foreign_key in dataset["FormTable"].tableSchema.foreignKeys:
        if foreign_key.reference.resource == dataset["LanguageTable"].url:
            foreign_key_language = foreign_key.columnReference[0]

    multiple_concepts = bool(dataset["FormTable",
                                     "parameterReference"].separator)
    c_concept = foreign_key_parameter
    c_language = foreign_key_language

    # for each concept count the languages
    concepts_to_languages: t.DefaultDict[str, t.List[str]] = t.DefaultDict(list)
    for form in dataset["FormTable"]:
        language = form[c_language]
        # Unify the single- and multi-concept cases into one loop.
        concepts = form[c_concept] if multiple_concepts else [form[c_concept]]
        for concept in concepts:
            # BUG FIX: `concept in primary_concepts` must be tested *before*
            # indexing the defaultdict. The old order created empty entries
            # for every non-primary concept, which then showed up as
            # zero-count rows in the report.
            if (concept in primary_concepts
                    and language not in concepts_to_languages[concept]):
                concepts_to_languages[concept].append(language)

    return [[concept, len(set(languages))]
            for concept, languages in concepts_to_languages.items()]
def check_no_separator_in_ids(dataset: pycldf.Dataset,
                              logger: cli.logging.Logger = cli.logger) -> bool:
    """Check that IDs do not contain the separators used to reference them.

    For every single-column foreign key whose referencing column has a
    separator, the values of the referenced ID column must not contain that
    separator – otherwise splitting the referencing column would mangle the
    IDs. Violations are reported through ``log_or_raise``.

    Returns True if no forbidden separator was found in any referenced ID.

    (BUG FIX: the `logger` parameter used to be annotated with the *instance*
    ``cli.logger`` instead of the type ``cli.logging.Logger``.)
    """
    valid = True
    # Check that reference columns that have a separator don't contain the separator inside a string value
    # referenced table → referenced column → separator → [(referencing table, referencing column)]
    forbidden_separators: t.MutableMapping[str, t.MutableMapping[
        str, t.MutableMapping[str, t.List[t.Tuple[str, str]]]]] = t.DefaultDict(
            lambda: t.DefaultDict(lambda: t.DefaultDict(list)))
    for table in dataset.tables:
        for foreign_key in table.tableSchema.foreignKeys:
            try:
                (referencing_column, ) = foreign_key.columnReference
                (referenced_column, ) = foreign_key.reference.columnReference
            except ValueError:
                # Multi-column foreign key. We *could* check that there's not a
                # reference column hidden in there, but we don't.
                continue
            separator = table.get_column(referencing_column).separator
            if separator is None:
                continue
            forbidden_separators[
                foreign_key.reference.resource.__str__()][referenced_column][
                    separator].append((table.url.string, referencing_column))

    for table, targets in forbidden_separators.items():
        for r, row in enumerate(dataset[table], 1):
            for target_column, separators_forbidden_here in targets.items():
                for separator, forbidden_by in separators_forbidden_here.items(
                ):
                    if separator in row[target_column]:
                        log_or_raise(
                            f"In table {table}, row {r} column {target_column} contains {separator}, which is also the separator of {forbidden_by}.",
                            log=logger,
                        )
                        valid = False
    return valid
def hex_ecoregions(
        ecoregions: numpy.array,
        transform: rasterio.Affine) -> t.Dict[h3.H3Index, t.Counter[int]]:
    """Aggregate a raster of ecoregion codes into per-hexagon area counters.

    For every raster cell, the cell's (lon, lat) is computed through the
    affine `transform`, mapped to an H3 hexagon at RESOLUTION, and the
    cell's area is added to that hexagon's counter under the cell's
    ecoregion code.
    """
    areas: t.Dict[h3.H3Index, t.Counter[int]] = t.DefaultDict(t.Counter)
    for row_index, row in enumerate(ecoregions):
        # Cell area shrinks with the cosine of the latitude; computed once
        # per row from the row's first column.
        (_, row_lat) = transform * (0, row_index)
        cell_area = numpy.cos(row_lat * numpy.pi / 180) * SQUARE_OF_15_ARCSEC
        for col_index, ecoregion in enumerate(row):
            (lon, lat) = transform * (col_index, row_index)
            hexagon: h3.H3Index = h3.geo_to_h3(lat, lon, RESOLUTION)
            # int(): `ecoregion` is a numpy scalar, which sqlalchemy does
            # not understand as int.
            areas[hexagon][int(ecoregion)] += cell_area
    return areas
def count_segments(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Container[types.Language_ID],
):
    """Count, per language, how often each segment occurs in the FormTable.

    Only forms whose #languageReference is in `languages` are counted.
    Exits with cli.Exit.NO_SEGMENTS when the dataset has no #segments column.
    """
    c_f_language = dataset["FormTable", "languageReference"].name
    try:
        c_f_segments = dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Segment invertories report requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )
    # language → Counter of segments. (Annotation fixed: MutableMapping takes
    # a key type and a value type.)
    counter: t.MutableMapping[types.Language_ID,
                              t.Counter[str]] = t.DefaultDict(t.Counter)
    for form in cli.tq(
            dataset["FormTable"],
            total=dataset["FormTable"].common_props.get("dc:extent"),
            task="Reading all forms",
    ):
        if form[c_f_language] in languages:
            counter[form[c_f_language]].update(form[c_f_segments])
    return counter
def connected_concepts(
    dataset: pycldf.Wordlist,
) -> t.Mapping[CognatesetID, t.Counter[ConceptID]]:
    """For each cognate set in the dataset, check which concepts it is connected to.

    Returns a mapping from cognate set ID to a Counter of the concept IDs of
    all forms judged to belong to that cognate set.
    """
    concepts_by_form = load_concepts_by_form(dataset)
    cognatesets_to_concepts: t.DefaultDict[
        CognatesetID, t.Sequence[ConceptID]] = t.DefaultDict(list)
    # Check whether cognate judgements live in the FormTable …
    c_cognateset = dataset.column_names.forms.cognatesetReference
    c_form = dataset.column_names.forms.id
    table = dataset["FormTable"]
    # … or in a separate CognateTable
    if c_cognateset is None:
        c_cognateset = dataset.column_names.cognates.cognatesetReference
        c_form = dataset.column_names.cognates.formReference
        table = dataset["CognateTable"]
    if c_cognateset is None:
        raise ValueError(
            f"Dataset {dataset:} had no cognatesetReference column in a CognateTable"
            " or a FormTable and is thus not compatible with this script.")

    for judgement in cli.tq(
            table,
            task="Link cognatesets to concepts",
            total=table.common_props.get("dc:extent"),
    ):
        cognatesets_to_concepts[judgement[c_cognateset]].extend(
            concepts_by_form[judgement[c_form]])
    return {
        cogset: collections.Counter(concepts)
        for cogset, concepts in cognatesets_to_concepts.items()
    }
def edictor_to_cldf(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    new_cogsets: t.Mapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ],
    affected_forms: t.Set[types.Form_ID],
    source: t.List[str] = [],
):
    """Write cognate judgements edited in Edictor back to the CLDF dataset.

    Judgements of forms in `affected_forms` are replaced by the judgements in
    `new_cogsets` (cognate set → list of (form, segment indices, alignment)
    triples); judgements of all other forms pass through unchanged. Edictor
    cognate sets are matched to existing cognate set IDs via
    `match_cognatesets`; unmatched sets get an ID joined from their form IDs.
    The resulting judgement list is sorted by ID and written to the
    CognateTable.

    NOTE: `source` defaults to a shared mutable list; it is only read here,
    never mutated.
    """
    ref_cogsets: t.MutableMapping[
        types.Cognateset_ID, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = t.DefaultDict(list)
    cognate: t.List[types.Judgement] = []
    judgements_lookup: t.MutableMapping[
        types.Form_ID, t.MutableMapping[types.Cognateset_ID, types.Judgement]
    ] = t.DefaultDict(dict)
    # Partition the existing judgements: those on affected forms may be
    # superseded below, all others are kept as-is.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in affected_forms:
            ref_cogsets[j["cognatesetReference"]].append(
                (j["formReference"], j["segmentSlice"], j["alignment"])
            )
            judgements_lookup[j["formReference"]][j["cognatesetReference"]] = j
        else:
            cognate.append(j)
    matches = match_cognatesets(new_cogsets, ref_cogsets)

    for cognateset, judgements in new_cogsets.items():
        cognateset = matches[cognateset]
        if cognateset is None:
            # No existing cognate set matched; derive a fresh ID from the
            # member forms.
            cognateset = "_".join(f for f, _, _ in judgements)
        # (Renamed the loop variable `slice`, which shadowed the builtin.)
        for form, segment_indices, alignment in judgements:
            was: types.Judgement = judgements_lookup.get(form, {}).get(cognateset)
            if was:
                # Re-use the pre-existing judgement row, keeping its ID and
                # any other columns, and only update slice and alignment.
                was["segmentSlice"] = util.indices_to_segment_slice(segment_indices)
                was["alignment"] = alignment
                cognate.append(was)
                continue
            # (BUG FIX: removed a stray no-op `judgements_lookup` expression
            # statement that used to sit here.)
            cognate.append(
                types.Judgement(
                    {
                        "id": f"{form}-{cognateset}",
                        "formReference": form,
                        "cognatesetReference": cognateset,
                        "alignment": alignment,
                        "segmentSlice": util.indices_to_segment_slice(segment_indices),
                        "source": source,
                        # TODO: Any more parameters? Status update?
                    }
                )
            )
    cognate.sort(key=lambda j: j["id"])
    # Translate CLDF property names back to the dataset's actual column names.
    m = {
        util.cldf_property(c.propertyUrl) or c.name: c.name
        for c in dataset["CognateTable"].tableSchema.columns
    }
    dataset["CognateTable"].write(
        [{m[k]: v for k, v in j.items() if k in m} for j in cognate]
    )
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    """Collect and normalize forms and cognate judgements for Edictor export.

    Restricts the export to the given languages, concepts and cognatesets.
    Returns a tuple (forms, judgements_about_form, cognateset_cache):
    `forms` maps form IDs to normalized form rows, `judgements_about_form`
    maps form IDs to a pair (global alignment, cognateset slots), and
    `cognateset_cache` maps cognateset IDs to 1-based integers.
    """
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    # Separators of all list-valued FormTable columns, keyed by CLDF property
    # name (falling back to the column name).
    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # prepare the header for the tsv output
    # the first column must be named ID and contain 1-based integer IDs
    # set header for tsv
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # select forms and cognates given restriction of languages and concepts, cognatesets respectively
    forms = {}
    for f, form in util.cache_table(dataset).items():
        # Skip NA forms (no transcription, or "-").
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    # Keep the full concept list in a helper column; Edictor
                    # itself only sees the first concept.
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue
                form[c] = d.join(str(e) for e in form[c])
            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )
            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if type(v) == str:
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = form[c].replace("\t", "\\!t").replace(
                        "\n", "\\!n")
            forms[f] = form

    # Map cognateset IDs to 1-based integers, either from the
    # CognatesetTable's row order or from the order of `cognatesets`.
    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            # No restriction: number cognatesets on first encounter.
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    # Start from every segment in parentheses (= not yet claimed by any
    # cognate judgement) and an empty list of cognateset slots.
    judgements_about_form: t.Mapping[types.Form_ID,
                                     t.Tuple[t.List[str], t.List[int]]] = {
                                         id:
                                         ([f"({s})"
                                           for s in form["segments"]], [])
                                         for id, form in forms.items()
                                     }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                # No alignment given: fall back to the form's segments.
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                # No segmentSlice column at all: assume the whole form.
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
args = parser.parse_args()

if args.metadata_or_forms.name == "forms.csv":
    dataset = pycldf.Wordlist.from_data(args.metadata_or_forms)
else:
    # BUG FIX: this branch used to read `args.metadata`, an attribute the
    # parser does not define; the argument is `metadata_or_forms` (see the
    # condition above).
    dataset = pycldf.Wordlist.from_metadata(args.metadata_or_forms)

# Collect all languages with metadata, if a LanguageTable exists.
languages = {}
try:
    c_l_id = dataset["LanguageTable", "id"].name
    for language in dataset["LanguageTable"]:
        languages[language[c_l_id]] = language
except KeyError:
    pass

# language → Counter of concepts attested in that language.
concepts: t.DefaultDict[str, t.Counter[str]] = t.DefaultDict(t.Counter)
multiple_concepts = bool(dataset["FormTable", "parameterReference"].separator)
c_concept = dataset["FormTable", "parameterReference"].name
c_language = dataset["FormTable", "languageReference"].name
c_form = dataset["FormTable", "form"].name
for form in dataset["FormTable"]:
    # Make sure every language that has forms is known, even without
    # LanguageTable metadata.
    languages.setdefault(form[c_language], {})
    if form[c_form] == "?" and args.missing:
        continue
    if multiple_concepts:
        for c in form[c_concept]:
            concepts[form[c_language]][c] += 1
    else:
        concepts[form[c_language]][form[c_concept]] += 1
dataset.column_names.forms.parameterReference) multi = bool(concepts.separator) concepts_by_form: t.Dict[t.Hashable, t.List[t.Optional[t.Hashable]]] = {} for form in dataset['FormTable']: if multi: concepts_by_form[form[dataset.column_names.forms.id]] = [ concept_to_concepticon.get(c) for c in form[concepts.name] ] else: concepts_by_form[form[dataset.column_names.forms.id]] = [ concept_to_concepticon.get(form[concepts.name]) ] concepts_by_cogset: t.DefaultDict[ t.Hashable, t.Counter[t.Optional[t.Hashable]]] = t.DefaultDict(t.Counter) for row in table: cognateset = row[c_cognateset] form = row[c_form] concepts_by_cogset[cognateset].update(concepts_by_form[form]) import networkx clics = networkx.parse_gml( (Path(__file__).parent / '../../../network-3-families.gml').open()) r = {} for cognateset, concepts in concepts_by_cogset.items(): centrality = networkx.algorithms.centrality.betweenness_centrality( clics.subgraph([c for c in concepts if c])) r[cognateset] = max(centrality, key=centrality.get) write_back = []
def check_cognate_table(dataset: pycldf.Wordlist,
                        logger=cli.logger,
                        strict_concatenative=False) -> bool:
    """Check that the CognateTable makes sense.

    The cognate table MUST have an indication of forms, in a #formReference
    column, and cognate sets, in a #cognatesetReference column. It SHOULD have
    segment slices (#segmentSlice) and alignments (#alignment).

     - The segment slice must be a valid (1-based, inclusive) slice into the segments of the form
     - The alignment must match the segment slice applied to the segments of the form
     - The length of the alignment must match the lengths of other alignments of that cognate set
     - NA forms (Including "" for “source reports form as unknown” must not be in cognatesets)

    If checking for strictly concatenative morphology, also check that the
    segment slice is a contiguous, non-overlapping section of the form.

    Having no cognates is a valid choice for a dataset, so this function
    returns True if no CognateTable was found.

    """
    # First, load all forms that are referenced in the CognateTable

    try:
        cognatetable = dataset["CognateTable"]
    except KeyError:
        # Having no cognates is a valid choice for a dataset.
        return True

    try:
        c_form = dataset["CognateTable", "formReference"].name
    except KeyError:
        log_or_raise("CognateTable does not have a #formReference column.")
        # All further checks don't make sense, return early.
        return False

    try:
        c_cognateset = dataset["CognateTable",
                               "cognatesetReference"].name
    except KeyError:
        log_or_raise(
            "CognateTable does not have a #cognatesetReference column.")
        # All further checks don't make sense, return early.
        return False

    # The CLDF specifications state that foreign key references take precedence
    # over the implicit semantics of a `#xxxReference` column pointing to an
    # `#id` column, so we need to find forms by the stated foreign key
    # relationship.
    for foreign_key in cognatetable.tableSchema.foreignKeys:
        if foreign_key.columnReference == [c_form]:
            referenced_table = str(foreign_key.reference.resource)
            # A multi-column column reference for a single-column foreign key
            # makes no sense, so use tuple unpacking to extract the only
            # element from that list.
            (referenced_column, ) = foreign_key.reference.columnReference
            if (not dataset[referenced_table].common_props["dc:conformsTo"]
                    == "http://cldf.clld.org/v1.0/terms.rdf#FormTable"):
                # NOTE(review): this violation is reported but the function
                # continues and may still return True — confirm intentional.
                log_or_raise(
                    "CognateTable #formReference does not reference a FormTable.",
                )
            break
    else:
        log_or_raise("CognateTable #formReference must be a foreign key.")
        # All further checks don't make sense, return early.
        return False

    try:
        c_sslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        logger.info("CognateTable does not have a #segmentSlice column.")
        c_sslice = None

    try:
        c_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        logger.info("CognateTable does not have an #alignment column.")
        c_alignment = None

    if c_sslice is None and c_alignment is None:
        # No additional data concerning the associations between forms and
        # cognate sets. That's sad, but valid.
        # All further checks don't make sense, return early.
        return True

    # A form counts as "given" when its transcription is non-empty and not
    # the NA marker "-". Without a #form column, every row counts as given.
    try:
        c_f_form = dataset[referenced_table, "form"].name

        def form_given(row):
            return row[c_f_form] and row[c_f_form].strip() != "-"

    except KeyError:
        if dataset[referenced_table] == dataset["FormTable"]:
            log_or_raise("FormTable does not have a #form column.")

        def form_given(row):
            return True

    # Check whether each row is valid.
    all_judgements_okay = True
    forms = cache_table(
        dataset,
        columns={"segments": dataset[referenced_table, "segments"].name},
        table=referenced_table,
        index_column=referenced_column,
        filter=form_given,
    )
    missing_forms = cache_table(
        dataset,
        columns={},
        table=referenced_table,
        index_column=referenced_column,
        filter=lambda row: not form_given(row),
    )
    # cognateset → set of alignment lengths seen so far (should stay at 1).
    cognateset_alignment_lengths: t.DefaultDict[
        t.Any, t.Set[int]] = t.DefaultDict(set)

    for f, j, judgement in dataset["CognateTable"].iterdicts(
            with_metadata=True):
        try:
            form_segments = forms[judgement[c_form]]["segments"]
        except KeyError:
            if judgement[c_form] in missing_forms:
                # NOTE(review): reported without setting
                # all_judgements_okay = False — confirm intentional.
                log_or_raise(
                    "In {}, row {}: NA form {} was judged to be in cognate set."
                    .format(f, j, judgement[c_form]), )
            # The case of a missing foreign key in general is already handled
            # by the basic CLDF validator.
            continue

        if c_sslice is not None:
            if not judgement[c_sslice]:
                # NOTE(review): reported without setting
                # all_judgements_okay = False — confirm intentional.
                log_or_raise("In {}, row {}: Empty segment slice".format(f, j))
                continue
            try:
                included_segments = list(
                    parse_segment_slices(judgement[c_sslice]))
                if (max(included_segments) >= len(form_segments)
                        or min(included_segments) < 0):
                    log_or_raise(
                        "In {}, row {}: Segment slice {} is invalid for segments {}"
                        .format(
                            f,
                            j,
                            judgement[c_sslice],
                            form_segments,
                        ), )
                    all_judgements_okay = False
                    continue
                if strict_concatenative:
                    # Check that the judged segments are consecutive.
                    s1 = included_segments[0]
                    for s2 in included_segments[1:]:
                        if s2 != s1 + 1:
                            # NOTE(review): reported without setting
                            # all_judgements_okay = False — confirm
                            # intentional.
                            log_or_raise(
                                "In {}, row {}: Segment slice {} has non-consecutive elements {}, {}"
                                .format(
                                    f,
                                    j,
                                    judgement[c_sslice],
                                    s1,
                                    s2,
                                ))
                        s1 = s2
            except ValueError:
                log_or_raise(
                    "In {}, row {}: Segment slice {} is invalid".format(
                        f,
                        j,
                        judgement[c_sslice],
                    ))
                all_judgements_okay = False
                continue
        else:
            # No segment slice column: the judgement covers the whole form.
            included_segments = list(range(len(form_segments)))

        if c_alignment:
            # Length of alignment should match length of every other alignment in this cognate set.
            lengths = cognateset_alignment_lengths[judgement[c_cognateset]]
            alignment_length = len(judgement[c_alignment])
            if lengths and alignment_length not in lengths:
                log_or_raise(
                    "In {}, row {}: Alignment has length {}, other alignments of cognateset {} have length(s) {}"
                    .format(f, j, alignment_length, judgement[c_cognateset],
                            lengths), )
                all_judgements_okay = False
            elif not lengths:
                lengths.add(alignment_length)

            # Alignment when gaps are removed should match segments. TODO:
            # Should we permit other gap characters? Where do we know them
            # from? TODO: To be more robust when segments are separated into
            # morphemes, not individual segments, compare alignment and
            # segments space-separated.
            without_gaps = " ".join(
                [c or "" for c in judgement[c_alignment] if c != "-"])
            actual_segments = " ".join(form_segments[i]
                                       for i in included_segments)
            if without_gaps.strip() != actual_segments.strip():
                if unicodedata.normalize(
                        "NFKC", without_gaps.strip()) == unicodedata.normalize(
                            "NFKC", actual_segments.strip()):
                    comment = " This is down to encoding differences: Their normalized unicode representations are the same. I suggest you run `lexedata.edit.normalize_unicode`."
                else:
                    comment = ""
                log_or_raise(
                    "In {}, row {}: Referenced segments in form resolve to {}, while alignment contains segments {}.{}"
                    .format(f, j, actual_segments, without_gaps, comment), )
                all_judgements_okay = False

    return all_judgements_okay
# TODO: Options given on the command line should have preference over defaults, # no matter whether they are given in terms of names ("Parameter_ID") or # property URLs ("parameterReference") default_mergers: t.Mapping[str, Merger] = t.DefaultDict( lambda: default, { "form": must_be_equal, "Form": must_be_equal, "languageReference": must_be_equal, "Language_ID": must_be_equal, "source": union, "Source": union, "parameterReference": union, "Parameter_ID": union, "variants": union, "comment": concatenate, "Comment": concatenate, "value": concatenate, "Value": concatenate, "status": constant_factory("MERGED: Review necessary"), "orthographic": transcription("<{}>"), "phonemic": transcription("/{}/"), "phonetic": transcription("[{}]"), "segments": must_be_equal, "Segments": must_be_equal, }, ) def merge_group( forms: t.Sequence[types.Form],
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID,
                      t.MutableMapping[types.Parameter_ID, t.Set]]:
    """Read cognate codes from the dataset, keyed by language and concept.

    Cognate codes are taken from `code_column` in the FormTable if given,
    otherwise from a #cognatesetReference in the FormTable, and failing
    that, from the dataset's CognateTable. Returns a mapping
    {language ID: {parameter ID: set of cognate codes}}.
    """
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. Consider running lexedata.edit.add_cognate_table to create an explicit cognate table."
            )
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in CognateTable (I mean, they should be there!), we
            # store them keyed with formReference.
            if (col_map.cognates and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                # Follow the foreign key to find which FormTable column the
                # formReference actually points at.
                (foreign_key, ) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column, ) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {
                        "form": form_reference,
                        "code": code_column
                    },
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:

        def all_parameters(parameter):
            return list(parameter)

    else:

        def all_parameters(parameter):
            return [parameter]

    # Pre-seed the languages from the LanguageTable if there is one, so that
    # languages without forms also appear in the output.
    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        (langref_target, ) = [
            key for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference ==
            [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))
    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that the form is unknown), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue
        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            # NA form: the concept is not available in this language, so any
            # cognate judgement on it is dropped.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available in language %s”), but it was judged to be in cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in language %s, but cognatesets %s are allocated to that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[
                row[form_table_column]]
    return data
def root_presence_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
    relevant_concepts: t.Mapping[types.Cognateset_ID,
                                 t.Iterable[types.Parameter_ID]],
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
             t.Mapping[types.Cognateset_ID, int], ]:
    """Derive a binary root presence/absence alignment from cognate codes.

    Given cognate judgements as a mapping {Language ID: {Concept ID:
    {Cognateset ID}}}, produce for every language one '0'/'1'/'?' character
    per root (cognateset), stating whether that root is attested in the
    language. Also return the mapping from each cognateset to its character
    position in the alignment.

    >>> alignment, roots = root_presence_code(
    ...     {"Language": {"Meaning": {"Cognateset 1"}}},
    ...     relevant_concepts={"Cognateset 1": ["Meaning"]})
    >>> alignment
    {'Language': ['0', '1']}
    >>> roots
    {'Cognateset 1': 1}

    Every sequence starts with the ascertainment prefix (by default a single
    '0'): the configuration where a root is absent from all languages is
    never observed but always possible, so that entry is added for the
    purposes of ascertainment correction.

    A root counts as present when it is attested for any concept. Because a
    word list is never a complete description of a language's lexicon,
    absence is inferred heuristically: an unattested root is coded '0' when
    at least half of its relevant concepts are attested (necessarily
    expressed by other roots); otherwise the root's state is unknown and
    coded '?'. (A single central concept being attested or unknown is a
    special case of this general rule.)

    >>> alignment, roots = root_presence_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}},
    ...     relevant_concepts={"c1": ["m1"], "c2": ["m1"], "c3": ["m2"]})
    >>> sorted(roots)
    ['c1', 'c2', 'c3']
    >>> sorted_roots = sorted(roots.items())
    >>> {language: [sequence[k[1]] for k in sorted_roots]
    ...  for language, sequence in alignment.items()}
    {'l1': ['1', '0', '?'], 'l2': ['1', '1', '1']}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?'), ('0', '1', '1', '1')]
    """
    # Collect every root attested anywhere in each language, irrespective of
    # which concept it expresses.
    attested_roots: t.MutableMapping[
        types.Language_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if not cognatesets:
                # An attested form without any cognate judgement points at an
                # inconsistent input lexicon; warn but continue.
                logger.warning(
                    f"The root presence coder script got a language ({language}) with an improper lexicon: There is a form associated with Concept {concept}, but no cognate sets are associated with it."
                )
            attested_roots[language].update(cognatesets)

    # Character order is the sorted cognateset IDs from relevant_concepts.
    ordered_roots: t.Sequence[types.Cognateset_ID] = sorted(
        set(relevant_concepts))

    alignment = {}
    roots = {}
    for language, lexicon in dataset.items():
        sequence: t.List[Literal["0", "1", "?"]] = list(ascertainment)
        for root in ordered_roots:
            # Character index of this root; identical on every language pass.
            roots[root] = len(sequence)
            if root in attested_roots[language]:
                sequence.append("1")
            else:
                central = list(relevant_concepts[root])
                expressed = sum(
                    1 for concept in central if lexicon.get(concept))
                # At least half of the relevant concepts are expressed (by
                # other roots): infer absence. Otherwise: unknown.
                sequence.append("0" if 2 * expressed >= len(central) else "?")
        alignment[language] = sequence
    return alignment, roots
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the central
    concept of a cognateset, as given by the #parameterReference column of the
    CognatesetTable. A central concept not included in the primary_concepts is
    ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: This function cannot guarantee that every cognateset has at least one
    relevant concept, there may be cognatesets without! A cognateset with 0
    relevant concepts will always be included, because 0 is at least half of 0.

    """
    # Default heuristic: use the central concepts if the CognatesetTable links
    # to concepts at all, otherwise fall back to the half-of-primary-concepts
    # rule derived from the CognateTable.
    heuristic = (heuristic if heuristic is not None else
                 (AbsenceHeuristic.CENTRALCONCEPT if
                  ("CognatesetTable", "parameterReference") in dataset else
                  AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        # Every concept of every form judged into a cognateset is relevant.
        # NOTE(review): unlike the CENTRALCONCEPT branch below, this branch
        # does not restrict concepts to primary_concepts — confirm whether
        # callers filter by primary concepts downstream.
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        # Cache form -> concept(s) lookups so each judgement resolves in O(1).
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)
    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        # Only the cognateset's own central concept(s) are relevant, and only
        # those that are part of the primary concept list.
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)
    else:
        # Reachable when a caller passes a value that is not a member of the
        # AbsenceHeuristic enum.
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
        )

    return relevant_concepts
all_mergers, default, first, format_mergers, must_be_equal, parse_homophones_report, parse_merge_override, ) # TODO: Options given on the command line should have preference over defaults, # no matter whether they are given in terms of names ("Parameter_ID") or # property URLs ("parameterReference") default_mergers: t.Mapping[str, Merger] = t.DefaultDict( lambda: default, { "Name": first, "parameterReference": first, }, ) def merge_group( cogsets: t.Sequence[types.CogSet], target: types.CogSet, mergers: t.Mapping[str, Merger], dataset: types.Wordlist[types.Language_ID, types.Form_ID, types.Parameter_ID, types.Cognate_ID, types.Cognateset_ID, ], logger: cli.logging.Logger = cli.logger, ) -> types.CogSet: """Merge one group of cognate sets
def coverage_report(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    min_percentage: float = 0.0,
    with_concept: t.Iterable[types.Parameter_ID] = set(),
    missing: Missing = Missing.KNOWN,
    only_coded: bool = True,
) -> t.List[t.List[str]]:
    """Report, per language, how well the dataset covers the concept list.

    For every language in the dataset return a row
    [language ID, language name, number of primary concepts attested,
    fraction of all concepts attested, mean number of synonyms per attested
    concept]. Languages covering fewer than min_percentage percent of the
    concepts, or missing any concept from with_concept, are excluded.

    Parameters
    ==========
    dataset: The CLDF wordlist to survey.
    min_percentage: Minimum concept coverage (in percent) a language must
        reach to be included in the report.
    with_concept: Concepts that every reported language must attest.
    missing: How to treat missing forms — Missing.KNOWN counts explicit
        "not available" markers ('-') as coverage, Missing.IGNORE skips them.
    only_coded: If True, count only forms that appear in the CognateTable.

    Raises
    ======
    SystemExit (via cli.Exit.NO_COGNATETABLE) when only_coded is requested
    but the dataset has no CognateTable.

    NOTE: despite the declared return type, the inner lists mix strings and
    numbers (counts and ratios); callers format them for display.
    """
    coded: t.Container[types.Form_ID]
    if only_coded:
        try:
            c_j_form = dataset["CognateTable", "formReference"].name
        except KeyError:
            # Exits the program; c_j_form cannot be unbound below.
            cli.Exit.NO_COGNATETABLE(
                message=
                "You requested that I only count cognate coded forms, but you have no CognateTable containing judgements."
            )
        coded = {judgement[c_j_form] for judgement in dataset["CognateTable"]}
    else:
        coded = types.WorldSet()

    # Map language IDs to display names where a LanguageTable exists;
    # languages only seen in the FormTable fall back to their ID below.
    languages: t.Dict[types.Language_ID, str] = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        c_l_name = dataset["LanguageTable", "name"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language[c_l_name]
    except KeyError:
        pass

    # Count, per language, how many forms express each concept.
    concepts: t.DefaultDict[types.Language_ID,
                            t.Counter[types.Parameter_ID]] = t.DefaultDict(
                                t.Counter)
    c_f_id = dataset["FormTable", "id"].name
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], form[c_language])
        if form[c_f_id] not in coded:
            continue
        # '-' marks "concept not available in this language"; an empty form
        # marks "form unknown".
        if missing == Missing.IGNORE and (not form[c_form]
                                          or form[c_form] == "-"):
            continue
        if missing == Missing.KNOWN and not form[c_form]:
            continue
        c: types.Parameter_ID
        for c in util.ensure_list(form[c_concept]):
            concepts[form[c_language]][c] += 1

    # Load primary concepts and the total number of concepts.
    primary_concepts: t.Container[types.Parameter_ID]
    try:
        c_c_id = dataset["ParameterTable", "id"].name
        primary_concepts = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
        total_number_concepts = len(primary_concepts)
    except KeyError:
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts couldn't be loaded. "
            "Loading all concepts.")
        primary_concepts = types.WorldSet()
        try:
            total_number_concepts = len(list(dataset["ParameterTable"]))
        except KeyError:
            # FIX: bound set().union() tolerates an empty concepts mapping,
            # where unbound set.union(*()) would raise TypeError.
            total_number_concepts = len(
                set().union(*(set(cs) for cs in concepts.values())))

    data_languages = []
    for language, name in languages.items():
        conceptlist = concepts[language]
        try:
            synonyms = sum(conceptlist.values()) / len(conceptlist)
        except ZeroDivisionError:
            synonyms = float("nan")
        # Percentage of all concepts covered by this language.
        try:
            conceptlist_percentage = len(conceptlist) / total_number_concepts
        except ZeroDivisionError:
            # FIX: a dataset with no known concepts no longer crashes; the
            # NaN comparison below is False, so the language is kept.
            conceptlist_percentage = float("nan")
        if conceptlist_percentage * 100 < min_percentage:
            continue

        if not all(c in conceptlist for c in with_concept):
            continue

        # Count how many of the attested concepts are primary.
        primary_count = 0
        for c in conceptlist:
            if c in primary_concepts:
                primary_count += 1

        data_languages.append([
            language,
            name,
            primary_count,
            conceptlist_percentage,
            synonyms,
        ])
    return data_languages