def source_from_source_string(
    self,
    source_string: str,
    language_id: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> str:
    """Parse a string referencing a language-specific source"""
    context: t.Optional[str]
    if ":" in source_string:
        source_part, context = source_string.split(":", maxsplit=1)
        if not context.endswith("}"):
            logger.warning(
                f"In source {source_string}: Closing bracket '}}' is missing, "
                "split into source and page/context may be wrong"
            )
        source_string = source_part + "}"
        context = context[:-1].strip()
        context = context.replace(":", "").replace(",", "")
    else:
        context = None

    if source_string.startswith("{") and source_string.endswith("}"):
        source_string = source_string[1:-1]
    if language_id is None:
        source_id = string_to_id(source_string)
    else:
        source_id = string_to_id(f"{language_id:}_s{source_string:}")

    source_id = source_id.replace(":", "").replace(",", "")

    if context:
        return f"{source_id}[{context}]"
    else:
        return source_id
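# Usage sketch (hypothetical inputs; the exact IDs depend on string_to_id,
# which lower-cases and strips characters that are not valid in identifiers):
#
#   self.source_from_source_string("{Meier2012: 45-48}", "lang1")
#   # → roughly "lang1_smeier2012[45-48]"
#   self.source_from_source_string("{Meier2012}", None)
#   # → roughly "meier2012"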
def rename(
    ds,
    old_values_to_new_values,
    logger: cli.logging.Logger,
    status_update: t.Optional[str],
):
    concepts = ds["ParameterTable"]

    for table in ds.tables:
        if table == concepts:
            continue
        _, component = table.common_props["dc:conformsTo"].split("#")
        try:
            c_concept = ds[component, "parameterReference"]
            columns = {c_concept.name}
        except KeyError:
            columns = set()
        for reference in table.tableSchema.foreignKeys:
            if reference.reference.resource.string == concepts.url.string:
                (column,) = reference.columnReference
                columns.add(column)
        if columns:
            logger.info(f"Changing columns {columns:} in {component:}…")
            ds.write(
                **{
                    component: [
                        substitute_many(
                            r,
                            columns,
                            old_values_to_new_values,
                            status_update=status_update,
                        )
                        for r in table
                    ]
                }
            )
def on_form_not_found(
    self,
    form: t.Dict[str, t.Any],
    cell_identifier: t.Optional[str] = None,
    language_id: t.Optional[str] = None,
    logger: cli.logging.Logger = cli.logger,
) -> bool:
    """Should I add a missing object? No, but inform the user.

    Send a warning (ObjectNotFoundWarning) reporting the missing object and
    cell.

    Returns
    =======
    False: The object should not be added.

    """
    rep = form.get("cldf_id", repr(form))
    logger.warning(
        f"Unable to find form {rep} in cell {cell_identifier} in the dataset. "
        f"This cognate judgement was skipped. "
        f"Please make sure that the form is present in forms.csv or in the file "
        f"used for the Wordlist import.")
    # Do a fuzzy search
    for row in self.db.find_db_candidates(
            form, self.check_for_match, edit_dist_threshold=4):
        logger.info(f"Did you mean {row} ?")
    return False
def header_from_cognate_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    logger: cli.logging.Logger = cli.logger,
):
    row_header = []
    separators = []
    for (header,) in ws.iter_cols(
            min_row=1,
            max_row=1,
            max_col=len(dataset["CognatesetTable"].tableSchema.columns),
    ):
        column_name = header.value
        if column_name is None:
            column_name = dataset["CognatesetTable", "id"].name
        elif column_name == "CogSet":
            column_name = dataset["CognatesetTable", "id"].name
        try:
            column_name = dataset["CognatesetTable", column_name].name
        except KeyError:
            break
        row_header.append(column_name)
        separators.append(dataset["CognatesetTable", column_name].separator)
        if column_name == dataset["CognatesetTable", "comment"].name:
            logger.warning(
                f"Your cognates table has a separate ‘{header.value}’ column for "
                "comments, but `lexedata.importer.cognates` expects to extract "
                "comments from the cell comments of the cognateset metadata "
                f"columns, not from a separate column. Your ‘{header.value}’ "
                "column will be ignored."
            )
    return row_header, separators
def list_homophones(dataset: pycldf.Dataset,
                    out: io.TextIOBase,
                    logger: cli.logging.Logger = cli.logger) -> None:
    clics = load_clics()
    # warn if clics cannot be loaded
    if not clics:
        logger.warning("Clics could not be loaded. Using an empty graph instead")
        clics = nx.Graph()

    c_id = dataset["ParameterTable", "id"].name
    try:
        c_concepticon = dataset["ParameterTable", "concepticonReference"].name
    except KeyError:
        cli.Exit.INVALID_DATASET(
            "This script requires a column concepticonReference in ParameterTable. "
            "Please run add_concepticon.py")
    concepticon = {}
    for concept in dataset["ParameterTable"]:
        concepticon[concept[c_id]] = concept[c_concepticon]

    f_id = dataset["FormTable", "id"].name
    f_lang = dataset["FormTable", "languageReference"].name
    f_concept = dataset["FormTable", "parameterReference"].name
    f_form = dataset["FormTable", "form"].name

    homophones: t.DefaultDict[
        str, t.DefaultDict[str, t.Set[t.Tuple[str, str]]]
    ] = t.DefaultDict(lambda: t.DefaultDict(set))

    for form in dataset["FormTable"]:
        if form[f_form] == "-" or form[f_form] is None:
            continue
        if isinstance(form[f_concept], list):
            homophones[form[f_lang]][form[f_form]].add(
                tuple(form[f_concept]) + (form[f_id],))
        else:
            homophones[form[f_lang]][form[f_form]].add(
                (form[f_concept], form[f_id]))

    for lang, forms in homophones.items():
        for form, meanings in forms.items():
            if len(meanings) == 1:
                continue
            clics_nodes = {concepticon.get(concept) for concept, _ in meanings}
            if None in clics_nodes:
                x = " (but at least one concept not found):"
            else:
                x = ":"
            clics_nodes -= {None}
            if len(clics_nodes) <= 1:
                x = "Unknown" + x
            elif nx.is_connected(clics.subgraph(clics_nodes)):
                x = "Connected" + x
            else:
                x = "Unconnected" + x
            line = f"{lang}, '{form}': {x}\n"
            for ele in sorted(meanings):
                line += f"\t {ele[-1]} ({', '.join(ele[0:-1])})\n"
            out.write(line)
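# Usage sketch (the metadata path is an assumption; any pycldf wordlist whose
# ParameterTable carries a #concepticonReference column should work):
#
#   import sys
#   list_homophones(pycldf.Dataset.from_metadata("Wordlist-metadata.json"),
#                   sys.stdout)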
def merge_group(
    forms: t.Sequence[types.Form],
    target: types.Form,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID,
                            types.Form_ID,
                            types.Parameter_ID,
                            types.Cognate_ID,
                            types.Cognateset_ID,
                            ],
    logger: cli.logging.Logger = cli.logger,
) -> types.Form:
    """Merge one group of homophones

    >>> merge_group(
    ...     [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...     {"Parameter_ID": [1, 1]}, {"Parameter_ID": union}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 2]}

    The target is assumed to be already included in the forms.

    >>> merge_group(
    ...     [{"Parameter_ID": [1, 1]}, {"Parameter_ID": [2]}],
    ...     {"Parameter_ID": [1, 1]}, {"Parameter_ID": concatenate}, util.fs.new_wordlist())
    {'Parameter_ID': [1, 1, 2]}

    """
    c_f_id = dataset["FormTable", "id"].name
    for column in target:
        if column == c_f_id:
            continue
        try:
            reference_name = (
                util.cldf_property(dataset["FormTable", column].propertyUrl)
                or column)
            merger = mergers.get(column, mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([form[column] for form in forms], target)
            except AssertionError:
                # We cannot deal with this group, but others may be fine.
                merger_name = merger.__name__
                logger.error(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[form[column] for form in forms]}")
                raise Skip
            except TypeError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging forms: {[f[c_f_id] for f in forms]} with target: {target[c_f_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(forms[0])}. \n"
                    f"Given input: {[form[column] for form in forms]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(f"Column {column} is not in FormTable.")
    return target
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("Clics could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(
                concepts, concept_to_concepticon, clics)
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParameterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)

    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")

    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")

    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
            dataset["CognatesetTable"],
            task="Write cognatesets with central concepts to dataset",
            total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
def check_id_format(dataset: pycldf.Dataset,
                    logger: cli.logging.Logger = cli.logger):
    correct = True
    for table in dataset.tables:
        # Every table SHOULD have an ID column
        try:
            id_column = dataset[table, "id"]
        except KeyError:
            log_or_raise(f"Table {table.url} has no identifier column.", logger)
            correct = False
            continue

        # All IDs SHOULD be [a-zA-Z0-9_-]+
        datatype = id_column.datatype
        if datatype.base == "string":
            if not datatype.format:
                correct = False
                log_or_raise(
                    f"Table {table.url} has an unconstrained ID column {id_column.name}. "
                    "Consider setting its format to [a-zA-Z0-9_-]+ and/or running "
                    "`lexedata.edit.simplify_ids`.",
                    logger,
                )
            else:
                if datatype.format not in {
                        "[a-zA-Z0-9_\\-]+",
                        "[a-zA-Z0-9_-]+",
                        "[a-zA-Z0-9\\-_]+",
                        "[a-z0-9_]+",
                }:
                    log_or_raise(
                        f"Table {table.url} has a string ID column {id_column.name} "
                        f"with format {datatype.format}. I am too dumb to check "
                        "whether that's a subset of [a-zA-Z0-9_-]+ (which is fine) "
                        "or not (in which case maybe change it).",
                        logger,
                    )
        elif datatype.base == "integer":
            logger.info(
                "Table %s has integer ID column %s. This is okay, I hope I will not mess it up.",
                table.url,
                id_column.name,
            )

        # IDs should be primary keys and primary keys IDs (not an official
        # part of the CLDF specs)
        if table.tableSchema.primaryKey != [id_column.name]:
            log_or_raise(
                f"Table {table.url} has ID column {id_column.name}, but primary key {table.tableSchema.primaryKey}",
                logger,
            )
            correct = False

    return correct
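# The formats accepted above all boil down to the recommended ID shape
# [a-zA-Z0-9_-]+. A quick stdlib-only sanity check of that pattern:
#
#   >>> import re
#   >>> bool(re.fullmatch("[a-zA-Z0-9_-]+", "form_1-a"))
#   True
#   >>> bool(re.fullmatch("[a-zA-Z0-9_-]+", "form 1"))
#   False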
def merge_group(
    cogsets: t.Sequence[types.CogSet],
    target: types.CogSet,
    mergers: t.Mapping[str, Merger],
    dataset: types.Wordlist[types.Language_ID,
                            types.Form_ID,
                            types.Parameter_ID,
                            types.Cognate_ID,
                            types.Cognateset_ID,
                            ],
    logger: cli.logging.Logger = cli.logger,
) -> types.CogSet:
    """Merge one group of cognate sets

    The target is assumed to be already included in the cognate sets.

    """
    c_s_id = dataset["CognatesetTable", "id"].name
    for column in target:
        if column == c_s_id:
            continue
        try:
            reference_name = (
                util.cldf_property(dataset["CognatesetTable", column].propertyUrl)
                or column)
            merger = mergers.get(column, mergers.get(reference_name, must_be_equal))
            try:
                merge_result = merger([cogset[column] for cogset in cogsets], target)
            except AssertionError:
                merger_name = merger.__name__
                # We cannot deal with this group, but others may be fine.
                logger.error(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} on column: {column}\n"
                    f"The merge function {merger_name} requires the input data to be equal. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
                raise Skip
            except NotImplementedError:
                merger_name = merger.__name__
                # Other groups will have the same issue.
                cli.Exit.INVALID_INPUT(
                    f"Merging cognate sets: {[f[c_s_id] for f in cogsets]} with target: {target[c_s_id]} \n"
                    f"The merge function {merger_name} is not implemented for type {type(cogsets[0])}. \n"
                    f"Given input: {[cogset[column] for cogset in cogsets]}")
            target[column] = merge_result
        except KeyError:
            cli.Exit.INVALID_COLUMN_NAME(f"Column {column} is not in CognatesetTable.")
    return target
def clean_forms(
    table: t.Iterable[R],
    form_column_name="form",
    variants_column_name="variants",
    split_at=[",", ";"],
    split_at_and_keep=["~"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Split all forms that contain separators into form+variants.

    >>> for row in clean_forms([
    ...     {'F': 'a ~ æ', 'V': []},
    ...     {'F': 'bə-, be-', 'V': ['b-']}],
    ...     "F", "V"):
    ...   print(row)
    {'F': 'a', 'V': ['~æ']}
    {'F': 'bə-', 'V': ['b-', 'be-']}

    """
    for r, row in enumerate(table):
        forms = [("", row[form_column_name])]
        for separator in split_at:
            forms = [
                ("", form.strip())
                for _, chunk in forms
                for form in chunk.split(separator)
            ]
        for separator in split_at_and_keep:
            forms = [
                (first_separator if f == 0 else separator, form.strip())
                for first_separator, chunk in forms
                for f, form in enumerate(chunk.split(separator))
            ]
        if len(forms) > 1:
            logger.info(
                "Line %d: Split form '%s' into %d elements.",
                r,
                row[form_column_name],
                len(forms),
            )
            if forms[0][0]:
                logger.warning(
                    "First element was marked as variant using %s, ignoring the marker",
                    forms[0][0],
                )
            row[form_column_name] = forms[0][1]
            row[variants_column_name].extend(
                [f"{separator}{form}" for separator, form in forms[1:]])
        yield row
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # add Status_Column if not existing and status update given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id"
            or original == dataset["ParameterTable", "id"].name
        ), (f"Replacing an entire column is only meaningful when you change the "
            f"#id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable.")

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new]
            for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(set(mapping.values())), (
            "Would collapse some concepts that were distinct before! "
            "Add '--smush' if that is intended.")
        # dataset["ParameterTable"].tableSchema.columns["c_id"]
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]
        c_id = dataset["ParameterTable", "id"].name
        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(ParameterTable=[
            substitute_many(r, [c_id], {original: replacement}, status_update=None)
            for r in concepts
        ])
        rename(dataset, {original: replacement}, logger, status_update=status_update)
def separate(
    self,
    values: str,
    context: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[str]:
    """Separate different form descriptions in one string.

    Separate forms separated by comma or semicolon, unless the comma or
    semicolon occurs within a set of matching component delimiters (e.g.
    brackets).

    If the brackets don't match, the whole remainder string is passed on,
    so that the form parser can try to recover as much as possible or throw
    an exception.
    """
    raw_split = re.split(self.separation_pattern, values)
    if len(raw_split) <= 1:
        for form in raw_split:
            yield form
        return

    while len(raw_split) > 1:
        if check_brackets(raw_split[0], self.bracket_pairs):
            form = raw_split.pop(0).strip()
            if form:
                yield form
            raw_split.pop(0)
        else:
            raw_split[:2] = ["".join(raw_split[:2])]

    if not check_brackets(raw_split[0], self.bracket_pairs):
        logger.warning(
            f"{context:}In values {values:}: "
            "Encountered mismatched closing delimiters. Please check that the "
            "separation of the cell into multiple entries, for different forms, was correct."
        )

    form = raw_split.pop(0).strip()
    if form:
        yield form
    assert not raw_split
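# Behaviour sketch (hypothetical parser instance; assumes the default
# separation pattern r"([;,])" and "()" among the bracket pairs): a comma
# inside matching brackets does not split the cell.
#
#   list(parser.separate("ta, tʰa"))         # → ["ta", "tʰa"]
#   list(parser.separate("ta (1, 2), tʰa"))  # → ["ta (1, 2)", "tʰa"]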
def import_cognates_from_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    extractor: re.Pattern = re.compile("/(?P<ID>[^/]*)/?$"),
    logger: cli.logging.Logger = cli.logger,
) -> None:
    logger.info("Loading sheet…")
    logger.info(
        f"Importing cognate sets from sheet {ws.title}, into {dataset.tablegroup._fname}…"
    )
    row_header, _ = header_from_cognate_excel(ws, dataset, logger=logger)
    excel_parser_cognate = CognateEditParser(
        dataset,
        top=2,
        # When the dataset has cognateset comments, that column is not a
        # header column, so this value is one higher than the actual number
        # of header columns, which already corrects for the 1-based Excel
        # indices. When there is no comment column, we need to compensate
        # for the 1-based Excel indices here.
        cellparser=cell_parsers.CellParserHyperlink(dataset, extractor=extractor),
        row_header=row_header,
        check_for_language_match=[dataset["LanguageTable", "name"].name],
        check_for_match=[dataset["FormTable", "id"].name],
        check_for_row_match=[dataset["CognatesetTable", "id"].name],
    )
    excel_parser_cognate.db.cache_dataset()
    excel_parser_cognate.db.drop_from_cache("CognatesetTable")
    excel_parser_cognate.db.drop_from_cache("CognateTable")
    logger.info("Parsing cognate Excel…")
    excel_parser_cognate.parse_cells(ws, status_update=None)
    excel_parser_cognate.db.write_dataset_from_cache(
        ["CognateTable", "CognatesetTable"])
def cache_dataset(self, logger: cli.logging.Logger = cli.logger):
    logger.info("Caching dataset into memory…")
    for table in self.dataset.tables:
        # Use [-1] so that tables without a dc:conformsTo fall back to
        # their URL instead of raising an IndexError.
        table_type = (
            table.common_props.get("dc:conformsTo", "").rsplit("#", 1)[-1]
            or table.url)
        (id,) = table.tableSchema.primaryKey
        # Extent may be wrong, but it's usually at least roughly correct
        # and a better indication of the table size than none at all.
        try:
            self.cache[table_type] = {
                row[id]: row
                for row in cli.tq(
                    table,
                    task="Cache the dataset",
                    total=table.common_props.get("dc:extent"),
                )
            }
        except FileNotFoundError:
            self.cache[table_type] = {}

    for source in self.dataset.sources:
        self.source_ids.add(source.id)
def filter(
    table: t.Iterable[R],
    column: str,
    filter: re.Pattern,
    invert: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Return all rows matching a filter

    Match the filter regular expression and return all rows in the table
    where the filter matches the column. (Or all where it does not, if
    invert==True.)

    >>> list(filter([
    ...   {"C": "A"},
    ...   {"C": "An"},
    ...   {"C": "T"},
    ...   {"C": "E"},
    ... ], "C", re.compile("A"), invert=True))
    [{'C': 'T'}, {'C': 'E'}]

    """
    n_row = 0
    n_included = 0
    for row in table:
        n_row += 1
        # TODO: Treat list-valued columns better.
        string = str(row[column])
        row_matches = bool(filter.search(string))
        if row_matches ^ invert:
            n_included += 1
            yield row
    logger.info(
        "Filtered %d rows down to %d (%1.0f%%)",
        n_row,
        n_included,
        # Guard against division by zero on an empty table.
        n_included / n_row * 100 if n_row else 100,
    )
def parse(
    self,
    cell: op.cell.Cell,
    language_id: str,
    cell_identifier: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[Judgement]:
    try:
        url = cell.hyperlink.target
        text = clean_cell_value(cell)
        comment = get_cell_comment(cell)
        if "{" not in text:
            slice, alignment = alignment_from_braces("{" + text + "}")
        else:
            slice, alignment = alignment_from_braces(text)
        try:
            form_id = self.extractor.search(url)["ID"]
        except (TypeError, IndexError):
            logger.error(
                f"Could not extract group ID from URL {url} using regular expression {self.extractor.pattern}"
            )
            cli.Exit.INVALID_ID()
        properties = {
            self.c["c_id"]: form_id,
            self.c.get("c_segments"): ["{:}:{:}".format(i, j) for i, j in slice],
            self.c.get("c_alignment"): alignment,
            self.c.get("c_comment"): comment,
        }
        properties.pop(None, None)
        yield Judgement(properties)
    except AttributeError:
        pass
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I "
            "cannot add any definitions from Concepticon to it. Try running "
            "lexedata.edit.add_concepticon to have me guess those references.")
        return

    # Create a concepticon_definition column
    try:
        dataset["ParameterTable", column_name]
        logger.info(
            "Overwriting existing {:} column in concepts table".format(column_name))
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.

    # write concepticon definitions
    write_back = []
    for row in cli.tq(
            dataset["ParameterTable"],
            task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[concepticon_ids]].definition
        except KeyError:
            pass
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
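# Usage sketch (the metadata path, and the module-level pyconcepticon
# `concepticon` API used above, are assumptions of this sketch):
#
#   add_concepticon_definitions(
#       pycldf.Dataset.from_metadata("Wordlist-metadata.json"))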
def update_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    mapping: t.Mapping[str, str],
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys, according to mapping."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    rows = []
    for row in cli.tq(
            ds[table],
            task=f"Updating ids of {table.url.string}",
            total=ds[table].common_props.get("dc:extent"),
    ):
        row[c_id.name] = mapping.get(row[c_id.name], row[c_id.name])
        rows.append(row)

    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    c_id.datatype.format = ID_FORMAT.pattern

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }
    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        logger.info(
            f"Applying changed foreign key to columns {columns:} in {other_table:}…")
        rows = []
        for row in cli.tq(
                ds[other_table],
                total=ds[other_table].common_props.get("dc:extent"),
                task="Replacing changed IDs",
        ):
            for column in columns:
                # TODO: Is this enough to handle columns with a separator,
                # like parameterReference in the FormTable?
                if isinstance(row[column], list):
                    row[column] = [mapping.get(v, v) for v in row[column]]
                else:
                    row[column] = mapping.get(row[column], row[column])
            rows.append(row)

        logger.info(f"Writing {other_table} back to file…")
        ds[other_table].write(rows)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are
    # present, but it does not consolidate the columns further.
    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.terms.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check whether that cognatesetReference is already a foreign key
    # to elsewhere (could be a CognatesetTable, could be whatever), because
    # then we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs,
    # segments, segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in ["segments", "segmentSlice", "cognatesetReference", "alignment"]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(),
                          task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id(
                    "{:}-{:}".format(form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable", "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your "
                            "cognates. I will probably mess them up a bit, "
                            "because I have not been taught properly how to "
                            "deal with them. Sorry!")
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments < 5:
                        # Warn for the first few forms individually; a
                        # summary warning follows below.
                        logger.warning(
                            f"No segments found for form {f} ({form['form']}).")
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using "
            "`lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)
    dataset.write(CognateTable=cognate_judgements)
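# Usage sketch (hypothetical path): promote an implicit cognatesetReference
# column on the FormTable into an explicit CognateTable, splitting cognateset
# codes by concept:
#
#   dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
#   add_cognate_table(dataset, split=True)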
def update_integer_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    max_id = 0
    no_integer_rows: t.Set[str] = set()
    # logger.info("Checking IDs that are already integers…")
    for row in cli.tq(
            ds[table],
            task="Checking IDs that are already integers…",
            total=ds[table].common_props.get("dc:extent"),
    ):
        try:
            max_id = max(int(row[c_id.name]), max_id)
        except ValueError:
            no_integer_rows.add(row[c_id.name])

    logger.info("Adding integer IDs to other rows…")
    mapping: t.Dict[str, int] = dict()
    rows: t.List[t.Dict[str, t.Any]] = []
    for row in cli.tq(
            ds[table],
            task="Updating integer ids",
            total=ds[table].common_props.get("dc:extent"),
    ):
        original = row[c_id.name]
        if row[c_id.name] in no_integer_rows:
            max_id += 1
            row[c_id.name] = max_id
        else:
            row[c_id.name] = int(row[c_id.name])
        mapping[original] = row[c_id.name]
        rows.append(row)

    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }
    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        rows = []
        for row in cli.tq(
                ds[other_table],
                task=f"Applying changed foreign key to {other_table}…",
                total=ds[other_table].common_props.get("dc:extent"),
        ):
            for column in columns:
                # TODO: Is this enough to handle columns with a separator,
                # like parameterReference in the FormTable?
                if isinstance(row[column], list):
                    row[column] = [mapping[v] for v in row[column]]
                else:
                    row[column] = mapping[row[column]]
            rows.append(row)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype

        logger.info(f"Writing {other_table} back to file…")
        ds[other_table].write(rows)
def read_wordlist(
    dataset: types.Wordlist[types.Language_ID,
                            types.Form_ID,
                            types.Parameter_ID,
                            types.Cognate_ID,
                            types.Cognateset_ID,
                            ],
    code_column: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> t.MutableMapping[types.Language_ID,
                      t.MutableMapping[types.Parameter_ID, t.Set]]:
    col_map = dataset.column_names

    if code_column:
        # Just in case that column was specified by property URL. We
        # definitely want the name. In any case, this will also throw a
        # helpful KeyError when the column does not exist.
        form_table_form = col_map.forms.form
        form_table_column = col_map.forms.id
        cognatesets = util.cache_table(
            dataset,
            columns={
                "form": form_table_column,
                "transcription": form_table_form,
                "code": dataset["FormTable", code_column].name,
            },
            filter=lambda row: bool(row[col_map.forms.form]),
        )
    else:
        # We search for cognatesetReferences in the FormTable or a separate
        # CognateTable.

        # Try the FormTable first.
        code_column = col_map.forms.cognatesetReference

        if code_column:
            # This is not the CLDF way, warn the user.
            form_table_column = col_map.forms.id
            form_table_form = col_map.forms.form
            logger.warning(
                "Your dataset has a cognatesetReference in the FormTable. "
                "Consider running lexedata.edit.add_cognate_table to create "
                "an explicit cognate table.")
            cognatesets = util.cache_table(
                dataset,
                columns={
                    "form": form_table_column,
                    "transcription": form_table_form,
                    "code": code_column,
                },
            )
        else:
            # There was no cognatesetReference in the form table. If we
            # find them in the CognateTable (I mean, they should be
            # there!), we store them keyed with formReference.
            if (col_map.cognates
                    and col_map.cognates.cognatesetReference
                    and col_map.cognates.formReference):
                code_column = col_map.cognates.cognatesetReference
                form_reference = col_map.cognates.formReference
                (foreign_key,) = [
                    key
                    for key in dataset["CognateTable"].tableSchema.foreignKeys
                    if key.columnReference == [form_reference]
                ]
                (form_table_column,) = foreign_key.reference.columnReference
                cognatesets = util.cache_table(
                    dataset,
                    "CognateTable",
                    {"form": form_reference, "code": code_column},
                )
            else:
                raise ValueError(
                    "Dataset has no cognatesetReference column in its "
                    "primary table or in a separate cognate table. "
                    "Is this a metadata-free wordlist and you forgot to "
                    "specify code_column explicitly?")

    # Cognate sets have been loaded. Consolidate.
    cognates_by_form: t.MutableMapping[
        types.Form_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for judgement in cognatesets.values():
        cognates_by_form[judgement["form"]].add(judgement["code"])
    parameter_column = col_map.forms.parameterReference

    # If one form can have multiple concepts,
    if dataset["FormTable", parameter_column].separator:
        def all_parameters(parameter):
            return list(parameter)
    else:
        def all_parameters(parameter):
            return [parameter]

    data: t.MutableMapping[types.Language_ID,
                           t.MutableMapping[types.Parameter_ID, t.Set]]
    if "LanguageTable" in dataset:
        (langref_target,) = [
            key
            for key in dataset["FormTable"].tableSchema.foreignKeys
            if key.columnReference
            == [dataset["FormTable", "languageReference"].name]
        ]
        ref_col = langref_target.reference.columnReference[0]
        data = {
            lang[ref_col]: t.DefaultDict(set)
            for lang in dataset["LanguageTable"]
        }
    else:
        data = t.DefaultDict(lambda: t.DefaultDict(set))

    for row in dataset["FormTable"].iterdicts():
        if not row[col_map.forms.form]:
            # Transcription is empty, should not be a form. Skip, but maybe
            # warn if it was in a cognateset.
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as empty (i.e. the source noted that "
                    "the form is unknown), but it was judged to be in "
                    "cognateset %s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    cognates_by_form[row[form_table_column]],
                )
            continue

        language = row[col_map.forms.languageReference]
        if row[col_map.forms.form] == "-":
            if cognates_by_form[row[form_table_column]]:
                logger.warning(
                    "Form %s was given as '-' (i.e. “concept is not available "
                    "in language %s”), but it was judged to be in cognateset "
                    "%s. I will ignore that cognate judgement.",
                    row[col_map.forms.id],
                    language,
                    cognates_by_form[row[form_table_column]],
                )
                cognates_by_form[row[form_table_column]] = set()
            for parameter in all_parameters(row[parameter_column]):
                if data[language][parameter]:
                    logger.warning(
                        "Form %s claims concept %s is not available in "
                        "language %s, but cognatesets %s are allocated to "
                        "that concept in that language already.",
                        row[col_map.forms.id],
                        parameter,
                        row[col_map.forms.languageReference],
                        data[language][parameter],
                    )
        for parameter in all_parameters(row[parameter_column]):
            data[language][parameter] |= cognates_by_form[row[form_table_column]]
    return data
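# Usage sketch (hypothetical dataset): with code_column=None, the
# cognatesetReference is taken from the FormTable if present, otherwise from
# an explicit CognateTable. The result maps languages to concepts to the set
# of cognate set IDs attested for that concept:
#
#   codes = read_wordlist(dataset, code_column=None)
#   codes["some_language"]["some_concept"]  # → e.g. {"cognateset_3"}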
def root_presence_code(
    dataset: t.Mapping[types.Language_ID,
                       t.Mapping[types.Parameter_ID,
                                 t.Set[types.Cognateset_ID]]],
    relevant_concepts: t.Mapping[types.Cognateset_ID,
                                 t.Iterable[types.Parameter_ID]],
    ascertainment: t.Sequence[Literal["0", "1", "?"]] = ["0"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Mapping[types.Language_ID, t.List[Literal["0", "1", "?"]]],
             t.Mapping[types.Cognateset_ID, int],
             ]:
    """Create a root-presence/absence coding from cognate codes in a dataset

    Take the cognate code information from a wordlist, i.e. a mapping of the
    form {Language ID: {Concept ID: {Cognateset ID}}}, and generate a binary
    alignment from it that lists for every root whether it is present in that
    language or not. Return that, and the association between cognatesets and
    characters.

    >>> alignment, roots = root_presence_code(
    ...     {"Language": {"Meaning": {"Cognateset 1"}}},
    ...     relevant_concepts={"Cognateset 1": ["Meaning"]})
    >>> alignment
    {'Language': ['0', '1']}
    >>> roots
    {'Cognateset 1': 1}

    The first entry in each sequence is always '0': The configuration where a
    form is absent from all languages is never observed, but always possible,
    so we add this entry for the purposes of ascertainment correction.

    If a root is attested at all, in any concept, it is considered present.
    Because the word list is never a complete description of the language's
    lexicon, the function employs a heuristic to generate ‘absent’ states. If
    a root is unattested, and at least half of the relevant concepts
    associated with this root are attested, but each expressed by another
    root, the root is assumed to be absent in the target language. (If there
    is exactly one central concept, then that central concept being attested
    or unknown is a special case of this general rule.) Otherwise the
    presence/absence of the root is considered unknown.

    >>> alignment, roots = root_presence_code(
    ...     {"l1": {"m1": {"c1"}},
    ...      "l2": {"m1": {"c2"}, "m2": {"c1", "c3"}}},
    ...     relevant_concepts={"c1": ["m1"], "c2": ["m1"], "c3": ["m2"]})
    >>> sorted(roots)
    ['c1', 'c2', 'c3']
    >>> sorted_roots = sorted(roots.items())
    >>> {language: [sequence[k[1]] for k in sorted_roots]
    ...  for language, sequence in alignment.items()}
    {'l1': ['1', '0', '?'], 'l2': ['1', '1', '1']}
    >>> list(zip(*sorted(zip(*alignment.values()))))
    [('0', '0', '1', '?'), ('0', '1', '1', '1')]

    """
    all_roots: t.Set[types.Cognateset_ID] = set(relevant_concepts)
    language_roots: t.MutableMapping[
        types.Language_ID, t.Set[types.Cognateset_ID]] = t.DefaultDict(set)
    for language, lexicon in dataset.items():
        for concept, cognatesets in lexicon.items():
            if not cognatesets:
                logger.warning(
                    f"The root presence coder script got a language ({language}) "
                    f"with an improper lexicon: There is a form associated with "
                    f"Concept {concept}, but no cognate sets are associated with it."
                )
            for cognateset in cognatesets:
                language_roots[language].add(cognateset)

    all_roots_sorted: t.Sequence[types.Cognateset_ID] = sorted(all_roots)

    alignment = {}
    roots = {}
    for language, lexicon in dataset.items():
        alignment[language] = list(ascertainment)
        for root in all_roots_sorted:
            roots[root] = len(alignment[language])
            if root in language_roots[language]:
                alignment[language].append("1")
            else:
                n_concepts = 0
                n_filled_concepts = 0
                for concept in relevant_concepts[root]:
                    n_concepts += 1
                    if lexicon.get(concept):
                        n_filled_concepts += 1
                if 2 * n_filled_concepts >= n_concepts:
                    alignment[language].append("0")
                else:
                    alignment[language].append("?")

    return alignment, roots
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Compute the relevant concepts for cognatesets, depending on the heuristic.

    These concepts will be considered when deciding whether a root is deemed
    absent in a language.

    For the CentralConcept heuristic, the relevant concepts are the central
    concept of a cognateset, as given by the #parameterReference column of
    the CognatesetTable. A central concept not included in the
    primary_concepts is ignored with a warning.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one
    central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    For the HalfPrimaryConcepts heuristic, the relevant concepts are all
    primary concepts connected to a cognateset.

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True

    NOTE: This function cannot guarantee that every cognateset has at least
    one relevant concept; there may be cognatesets without. A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    """
    heuristic = (heuristic
                 if heuristic is not None
                 else (AbsenceHeuristic.CENTRALCONCEPT
                       if ("CognatesetTable", "parameterReference") in dataset
                       else AbsenceHeuristic.HALFPRIMARYCONCEPTS))

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_f = dataset["CognateTable", "formReference"].name
        c_s = dataset["CognateTable", "cognatesetReference"].name
        concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": dataset["FormTable", "parameterReference"].name},
        )
        for j in dataset["CognateTable"]:
            form = concepts[j[c_f]]
            for concept in util.ensure_list(form["concepts"]):
                relevant_concepts[j[c_s]].add(concept)

    elif heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable", "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept not in primary_concepts:
                    logger.warning(
                        f"The central concept {concept} of cognateset {c[c_id]} "
                        "was not part of your list of primary concepts to be "
                        "included in the coding, so the cognateset will be ignored."
                    )
                else:
                    relevant_concepts[c[c_id]].add(concept)

    else:
        raise TypeError(
            f"Value of heuristic, {heuristic}, did not correspond to a known "
            "AbsenceHeuristic.")

    return relevant_concepts
def read_single_excel_sheet(
    dataset: pycldf.Dataset,
    sheet: openpyxl.worksheet.worksheet.Worksheet,
    logger: cli.logging.Logger = cli.logger,
    match_form: t.Optional[t.List[str]] = None,
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Optional[str] = None,
    ignore_missing: bool = False,
    ignore_superfluous: bool = False,
    status_update: t.Optional[str] = None,
) -> t.Mapping[str, ImportLanguageReport]:
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)

    concept_columns: t.Tuple[str, str]
    if concept_column is None:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            dataset["FormTable", "parameterReference"].name,
        )
    else:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            concept_column,
        )
    db = DB(dataset)
    db.cache_dataset()
    # required cldf fields of a form
    c_f_id = db.dataset["FormTable", "id"].name
    c_f_language = db.dataset["FormTable", "languageReference"].name
    c_f_form = db.dataset["FormTable", "form"].name
    c_f_value = db.dataset["FormTable", "value"].name
    c_f_concept = db.dataset["FormTable", "parameterReference"].name
    if not match_form:
        match_form = [c_f_form, c_f_language]
    if not db.dataset["FormTable", c_f_concept].separator:
        logger.warning(
            "Your metadata does not allow polysemous forms. According to your "
            "specifications, identical forms with different concepts will "
            "always be considered homophones, not a single polysemous form. "
            "To include polysemous forms, add a separator to your FormTable "
            "#parameterReference in the Metadata.json. To find potential "
            "polysemies, run lexedata.report.list_homophones.")
        match_form.append(c_f_concept)
    else:
        if c_f_concept in match_form:
            logger.info(
                "Matching by concept enabled: To find potential polysemies, "
                "run lexedata.report.list_homophones.")

    sheet_header = get_headers_from_excel(sheet)
    form_header = list(db.dataset["FormTable"].tableSchema.columndict.keys())

    # These columns don't need to be given, we can infer them from the sheet
    # title and from the other data:
    implicit: t.Dict[Literal["languageReference", "id", "value"], str] = {}
    if c_f_language not in sheet_header:
        implicit["languageReference"] = c_f_language
    if c_f_id not in sheet_header:
        implicit["id"] = c_f_id
    if c_f_value not in sheet_header:
        implicit["value"] = c_f_value

    found_columns = set(sheet_header) - {concept_column} - set(implicit.values())
    expected_columns = set(form_header) - {c_f_concept} - set(implicit.values())
    if not found_columns >= expected_columns:
        if ignore_missing:
            logger.info(
                f"Your Excel sheet {sheet.title} is missing columns "
                f"{expected_columns - found_columns}. For the newly imported "
                f"forms, these columns will be left empty in the dataset.")
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} is missing columns "
                f"{expected_columns - found_columns}. Clean up your data, or "
                f"use --ignore-missing-excel-columns to import anyway and "
                f"leave these columns empty in the dataset for the newly "
                f"imported forms.")
    if not found_columns <= expected_columns:
        if ignore_superfluous:
            logger.info(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. These columns will be "
                f"ignored.")
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. Clean up your data, or "
                f"use --ignore-superfluous-excel-columns to import the data "
                f"anyway and ignore these columns.")

    # Check whether the language already exists.
    c_l_name = db.dataset["LanguageTable", "name"].name
    c_l_id = db.dataset["LanguageTable", "id"].name
    language_name_to_language_id = {
        row[c_l_name]: row[c_l_id]
        for row in db.cache["LanguageTable"].values()
    }
    language_name = normalize_string(sheet.title)
    if language_name in language_name_to_language_id:
        language_id = language_name_to_language_id[language_name]
        report[language_id].is_new_language = False
    else:
        language_id = language_name
        report[language_id].is_new_language = True

    # Read new data from the sheet.
    for form in cli.tq(
            import_data_from_sheet(
                sheet,
                sheet_header=sheet_header,
                implicit=implicit,
                language_id=language_id,
                concept_column=concept_columns,
            ),
            task=f"Parsing cells of sheet {sheet.title}",
            total=sheet.max_row,
    ):
        # If the concept is not in the dataset, don't add the form.
        try:
            concept_entry = form[c_f_concept]
            entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the "
                f"concepts.csv file manually. The corresponding form was "
                f"ignored and not added to the dataset.")
            report[language_id].skipped += 1
            continue
        # Otherwise, look for candidates, and link to an existing form or
        # add a new one.
        for item, value in form.items():
            try:
                sep = db.dataset["FormTable", item].separator
            except KeyError:
                continue
            if sep is None:
                continue
            form[item] = value.split(sep)
        form_candidates = db.find_db_candidates(form, match_form)
        if form_candidates:
            new_concept_added = False
            for form_id in form_candidates:
                logger.info(f"Form {form[c_f_value]} was already in dataset.")

                if db.dataset["FormTable", c_f_concept].separator:
                    for new_concept in form[c_f_concept]:
                        if new_concept not in db.cache["FormTable"][form_id][c_f_concept]:
                            db.cache["FormTable"][form_id][c_f_concept].append(
                                new_concept)
                            logger.info(
                                f"New form-concept association: Concept {form[c_f_concept]} was added to existing form "
                                f"{form_id}. If this was not intended "
                                f"(because it is a homophonous form, not a polysemy), "
                                f"you need to manually remove that concept from the old form in forms.csv "
                                f"and create a separate new form. If you want to treat identical forms "
                                f"as homophones in general, add "
                                f"--match-forms={' '.join(match_form)}, "
                                f"{db.dataset['FormTable', 'parameterReference']} "
                                f"when you run this script.")
                            new_concept_added = True
                # Only the first candidate is linked to.
                break

            if new_concept_added:
                report[language_id].concepts += 1
            else:
                report[language_id].existing += 1
        else:
            # No candidate matched: add the form as a new form.
            # make_id_unique appends an integer suffix to the ID if needed.
            form[c_f_language] = language_id
            if "id" in implicit:
                # TODO: check for type of form id column
                form_concept = form[c_f_concept]
                concept_reference = (
                    form_concept[0]
                    if isinstance(form_concept, list)
                    else form_concept)
                form[c_f_id] = string_to_id(
                    f"{form[c_f_language]}_{concept_reference}")
            db.make_id_unique(form)
            if status_update:
                form["Status_Column"] = status_update
            db.insert_into_db(form)
            report[language_id].new += 1
    # write to cldf
    db.write_dataset_from_cache()
    return report
def add_single_languages(
    metadata: Path,
    sheets: t.Iterable[openpyxl.worksheet.worksheet.Worksheet],
    match_form: t.Optional[t.List[str]],
    concept_name: t.Optional[str],
    ignore_missing: bool,
    ignore_superfluous: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger,
) -> t.Mapping[str, ImportLanguageReport]:
    if status_update == "None":
        status_update = None
    # Initialize the dataset from the metadata file or, for a metadata-free
    # wordlist, from forms.csv, depending on the command-line arguments.
    if metadata:
        if metadata.name == "forms.csv":
            dataset = pycldf.Dataset.from_data(metadata)
        else:
            dataset = pycldf.Dataset.from_metadata(metadata)
    concepts: t.Mapping[str, str]
    try:
        cid = dataset["ParameterTable", "id"].name
        if concept_name is None:
            concepts = {c[cid]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = dataset["FormTable", "parameterReference"].name
        else:
            name = dataset["ParameterTable", "name"].name
            concepts = {c[name]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = concept_name
    except (KeyError, FileNotFoundError) as err:
        if isinstance(err, KeyError):
            logger.warning(
                "Did not find a well-formed ParameterTable. Importing all "
                "forms independent of concept")
        elif isinstance(err, FileNotFoundError):
            logger.warning(
                f"Did not find {dataset['ParameterTable'].url.string}. "
                f"Importing all forms independent of concept")
        concepts = KeyKeyDict()
        if concept_name:
            concept_column = concept_name
        else:
            concept_column = dataset["FormTable", "parameterReference"].name
    # add Status_Column if not existing and status_update given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="FormTable")
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)
    # import all selected sheets
    for sheet in sheets:
        for lang, subreport in read_single_excel_sheet(
                dataset=dataset,
                sheet=sheet,
                logger=logger,
                match_form=match_form,
                entries_to_concepts=concepts,
                concept_column=concept_column,
                ignore_missing=ignore_missing,
                ignore_superfluous=ignore_superfluous,
                status_update=status_update,
        ).items():
            report[lang] += subreport
    return report
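# Invocation sketch (workbook path and sheet selection are assumptions):
#
#   wb = openpyxl.load_workbook("new_languages.xlsx")
#   report = add_single_languages(
#       metadata=Path("Wordlist-metadata.json"),
#       sheets=[wb[sheet] for sheet in wb.sheetnames],
#       match_form=None, concept_name=None,
#       ignore_missing=False, ignore_superfluous=False,
#       status_update=None, logger=cli.logger)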
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID,
                            types.Form_ID,
                            types.Parameter_ID,
                            types.Cognate_ID,
                            types.Cognateset_ID,
                            ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            "Edictor export requires your dataset to have segments in the "
            "FormTable. Run `lexedata.edit.add_segments` to automatically add "
            "segments based on your forms.")

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns
        if c.separator
    }

    # Prepare the header for the tsv output: the first column must be named
    # ID and contain 1-based integer IDs.
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())
    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # Select forms and cognates given the restriction to languages and
    # concepts, and cognatesets respectively.
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    form["_parameterReference"] = d.join(str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue
                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments "
                    "using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in form[c] or "\\!n" in form[c]:
                        logger.warning(
                            "Your data contains the special characters '\\!t' "
                            "or '\\!n', which I will introduce for escaping "
                            "tabs and newlines for edictor. These characters "
                            "will not survive the back-import.")
                    form[c] = form[c].replace("\t", "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[id] in cognatesets
        }
    else:
        if cognatesets is None:
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    judgements_about_form: t.Mapping[
        types.Form_ID, t.Tuple[t.List[str], t.List[int]]] = {
            id: ([f"({s})" for s in form["segments"]], [])
            for id, form in forms.items()
        }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(
                        segment_slices=j["segmentSlice"], enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[j["formReference"]]
            segment_start, segment_end = min(segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
def __init__(
    self,
    dataset: pycldf.Dataset,
    element_semantics: t.Iterable[t.Tuple[str, str, str, bool]] = [
        # ("[", "]", "phonetic", True),
        ("<", ">", "form", True),
        # ("/", "/", "phonemic", True),
        ("(", ")", "comment", False),
        ("{", "}", "source", False),
    ],
    separation_pattern: str = r"([;,])",
    variant_separator: t.Optional[t.List[str]] = ["~", "%"],
    add_default_source: t.Optional[str] = "{1}",
    logger: cli.logging.Logger = cli.logger,
):
    super().__init__(dataset)

    # Columns implied by element semantics
    self.bracket_pairs = {start: end for start, end, _, _ in element_semantics}
    self.element_semantics = {
        start: (term, transcription)
        for start, _, term, transcription in element_semantics
    }
    for start, end, term, transcription in element_semantics:
        # Ensure that all terms required by the element semantics are
        # fields we can write to.
        self.cc(short=term, long=("FormTable", term), dataset=dataset)
    assert self.transcriptions, (
        "Your metadata json file and your cell parser don’t match: Your cell parser "
        f"{self.__class__.__name__} expects to work with transcriptions "
        "(at least one of 'orthographic', 'phonemic', and 'phonetic') to derive a #form "
        "in #FormTable, but your metadata defines no such column.")

    # Columns necessary for word list
    self.cc(short="source", long=("FormTable", "source"), dataset=dataset)
    self.cc(short="comment", long=("FormTable", "comment"), dataset=dataset)

    try:
        self.comment_separator = dataset["FormTable", "comment"].separator or "\t"
    except KeyError:
        logger.info("No #comment column found.")
        self.comment_separator = ""

    try:
        # As long as there is no CLDF term #variants, this will either be
        # 'variants' or raise a KeyError. However, it is a transparent
        # re-use of an otherwise established idiom in this module, so we
        # use this minor overhead.
        self.c["variants"] = dataset["FormTable", "variants"].name
    except KeyError:
        logger.warning(
            "No 'variants' column found for FormTable in Wordlist-metadata.json. "
            "Form variants will be added to #comment.")

    # Other class attributes
    self.separation_pattern = separation_pattern
    self.variant_separator = variant_separator
    self.add_default_source = add_default_source
def merge_forms(
    data: types.Wordlist[types.Language_ID,
                         types.Form_ID,
                         types.Parameter_ID,
                         types.Cognate_ID,
                         types.Cognateset_ID,
                         ],
    mergers: t.Mapping[str, Merger],
    homophone_groups: t.MutableMapping[types.Form_ID,
                                       t.Sequence[types.Form_ID]],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[types.Form]:
    """Merge forms from a dataset.

    TODO: Construct an example that shows that the order given in
    `homophone_groups` is maintained.

    Side Effects
    ============
    Changes homophone_groups: Groups that are skipped are removed

    """
    merge_targets = {
        variant: target
        for target, variants in homophone_groups.items()
        for variant in variants
    }
    for target in homophone_groups:
        assert merge_targets[target] == target

    c_f_id = data["FormTable", "id"].name

    buffer: t.Dict[types.Form_ID, types.Form] = {}

    unknown = set()
    form: types.Form
    for form in cli.tq(
            data["FormTable"],
            task="Going through forms and merging",
            logger=logger,
            total=data["FormTable"].common_props.get("dc:extent"),
    ):
        id: types.Form_ID = form[c_f_id]
        buffer[id] = form
        if id in merge_targets:
            unknown.add(id)
            target_id = merge_targets[id]
            group = homophone_groups[target_id]
            if all(i in buffer for i in group):
                try:
                    buffer[target_id] = merge_group(
                        [buffer[i] for i in group],
                        buffer[target_id].copy(),  # type: ignore
                        mergers,
                        data,
                        logger,
                    )
                    for i in group:
                        if i != target_id:
                            del buffer[i]
                except Skip:
                    logger.info(
                        f"Merging form {id} with forms {[f[c_f_id] for f in group]} was skipped."
                    )
                    # Remove the group, keyed by its target, so that the
                    # variants are passed through unmerged.
                    del homophone_groups[target_id]
                for i in group:
                    unknown.remove(i)

        for f in list(buffer):
            if f in unknown:
                break
            yield buffer.pop(f)
def parse_form(
    self,
    form_string: str,
    language_id: str,
    cell_identifier: str = "",
    logger: cli.logging.Logger = cli.logger,
) -> t.Optional[Form]:
    """Create a dictionary of columns from a form description.

    Extract each value (transcriptions, comments, sources etc.) from a
    string describing a single form.
    """
    # Fields that are not required
    c_comment = self.c.get("comment")
    c_variants = self.c.get("variants", c_comment)

    # If the string is only whitespace, there is no form.
    if not form_string.strip():
        return None

    properties: t.Dict[str, t.Any] = {
        self.c["lang"]: language_id,
        self.c["value"]: form_string,
    }

    # Semantics: 'None' for no variant expected, any string for the
    # decorator that introduces variant forms. Currently we expect '~' and
    # '%', see below.
    expect_variant: t.Optional[str] = None
    # Iterate over the delimiter-separated elements of the form.
    for element in components_in_brackets(form_string, self.bracket_pairs):
        element = element.strip()

        if not element:
            continue

        # If the element has mismatched brackets (tends to happen only for
        # the last element, because a mismatched opening bracket means we
        # are still waiting for the closing one), warn.
        if not check_brackets(element, self.bracket_pairs):
            try:
                delimiter = self.bracket_pairs[element[0]]
            except KeyError:
                delimiter = element[0]
            raise ValueError(
                f"{cell_identifier}In form {form_string}: Element {element} had mismatching delimiters "
                f"{delimiter}. This could be a bigger problem in the cell, "
                f"so the form was not imported.")

        # Check what kind of element we have.
        for start, (term, transcription) in self.element_semantics.items():
            field = self.c[term]
            if element.startswith(start):
                break
        else:
            # TODO: Another check, catching '-', might be necessary here.
            # The only thing we expect outside delimiters is the variant
            # separators, '~' and '%'.
            if self.variant_separator and element in self.variant_separator:
                expect_variant = element
            else:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} could not be parsed, ignored"
                )
            continue

        # If we encounter a field for the first time, we add it to the
        # dictionary. If repeatedly, to the variants, with a decorator that
        # shows how expected the variant was.
        # This drops sources and comments in variants if more than one
        # source or comment is provided; clean this up in
        # self.postprocess_form.
        if field in properties:
            if (not expect_variant
                    and field != c_comment
                    and field != self.c["source"]):
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} was an unexpected variant for {field}"
                )
            properties.setdefault(c_variants, []).append(
                (expect_variant or "") + element)
        else:
            if expect_variant:
                logger.warning(
                    f"{cell_identifier}In form {form_string}: Element {element} was supposed to be a variant, but there is no earlier {field}"
                )
            properties[field] = element
        expect_variant = None

    self.postprocess_form(properties, language_id)
    return Form(properties)
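# Behaviour sketch (hypothetical parser configured with the default element
# semantics of the __init__ above): a cell entry like
#
#   "<tapa> (cf. tapya) {2}"
#
# is parsed into, roughly, {value: "<tapa> (cf. tapya) {2}", form: "<tapa>",
# comment: "(cf. tapya)", source: "{2}"}; how the delimiters are stripped is
# an assumption about postprocess_form, which is not shown here.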