def rename(
    ds,
    old_values_to_new_values,
    logger: cli.logging.Logger,
    status_update: t.Optional[str],
):
    concepts = ds["ParameterTable"]
    for table in ds.tables:
        if table == concepts:
            continue
        _, component = table.common_props["dc:conformsTo"].split("#")
        try:
            c_concept = ds[component, "parameterReference"]
            columns = {c_concept.name}
        except KeyError:
            columns = set()
        for reference in table.tableSchema.foreignKeys:
            if reference.reference.resource.string == concepts.url.string:
                (column,) = reference.columnReference
                columns.add(column)
        if columns:
            logger.info(f"Changing columns {columns:} in {component:}…")
            ds.write(
                **{
                    component: [
                        substitute_many(
                            r,
                            columns,
                            old_values_to_new_values,
                            status_update=status_update,
                        )
                        for r in table
                    ]
                }
            )
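# Hedged usage sketch (not part of the library): how `rename` above could be driven
# with a hand-written concept-ID mapping. The metadata path, the old/new IDs and the
# status text are hypothetical placeholders.
def _example_rename_concepts():
    import pycldf

    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    # Map old ParameterTable IDs to their replacements (hypothetical IDs).
    mapping = {"hand_arm": "hand", "foot_leg": "foot"}
    rename(dataset, mapping, cli.logger, status_update="concept ids renamed")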
def on_form_not_found(
    self,
    form: t.Dict[str, t.Any],
    cell_identifier: t.Optional[str] = None,
    language_id: t.Optional[str] = None,
    logger: cli.logging.Logger = cli.logger,
) -> bool:
    """Should I add a missing object? No, but inform the user.

    Send a warning (ObjectNotFoundWarning) reporting the missing object and cell.

    Returns
    =======
    False: The object should not be added.

    """
    rep = form.get("cldf_id", repr(form))
    logger.warning(
        f"Unable to find form {rep} in cell {cell_identifier} in the dataset. "
        f"This cognate judgement was skipped. "
        f"Please make sure that the form is present in forms.csv or in the file "
        f"used for the Wordlist importation."
    )
    # Do a fuzzy search
    for row in self.db.find_db_candidates(
        form, self.check_for_match, edit_dist_threshold=4
    ):
        logger.info(f"Did you mean {row} ?")
    return False
def check_id_format(dataset: pycldf.Dataset, logger: cli.logging.Logger = cli.logger):
    correct = True
    for table in dataset.tables:
        # Every table SHOULD have an ID column
        try:
            id_column = dataset[table, "id"]
        except KeyError:
            log_or_raise(f"Table {table.url} has no identifier column.", logger)
            correct = False
            continue

        # All IDs SHOULD be [a-zA-Z0-9_-]+
        datatype = id_column.datatype
        if datatype.base == "string":
            if not datatype.format:
                correct = False
                log_or_raise(
                    f"Table {table.url} has an unconstrained ID column {id_column.name}. Consider setting "
                    f"its format to [a-zA-Z0-9_-]+ and/or running `lexedata.edit.simplify_ids`.",
                    logger,
                )
            else:
                if datatype.format not in {
                    "[a-zA-Z0-9_\\-]+",
                    "[a-zA-Z0-9_-]+",
                    "[a-zA-Z0-9\\-_]+",
                    "[a-z0-9_]+",
                }:
                    log_or_raise(
                        f"Table {table.url} has a string ID column {id_column.name} with format {datatype.format}. "
                        f"I am too dumb to check whether that's a subset of [a-zA-Z0-9_-]+ (which is fine) "
                        f"or not (in which case maybe change it).",
                        logger,
                    )
        elif datatype.base == "integer":
            logger.info(
                "Table %s has integer ID column %s. This is okay, I hope I will not mess it up.",
                table.url,
                id_column.name,
            )

        # IDs should be primary keys and primary keys IDs (not official part of the CLDF specs)
        if table.tableSchema.primaryKey != [id_column.name]:
            log_or_raise(
                f"Table {table.url} has ID column {id_column.name}, but primary key {table.tableSchema.primaryKey}",
                logger,
            )
            correct = False

    return correct
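# Hedged usage sketch (illustrative only): run the ID-format check on a dataset and
# act on the boolean result. The metadata path is a hypothetical placeholder.
def _example_check_id_format():
    import pycldf

    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    if not check_id_format(dataset, logger=cli.logger):
        cli.logger.warning(
            "Some ID columns are unconstrained or not primary keys; "
            "consider running lexedata.edit.simplify_ids."
        )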
def clean_forms(
    table: t.Iterable[R],
    form_column_name="form",
    variants_column_name="variants",
    split_at=[",", ";"],
    split_at_and_keep=["~"],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Split all forms that contain separators into form+variants.

    >>> for row in clean_forms([
    ...   {'F': 'a ~ æ', 'V': []},
    ...   {'F': 'bə-, be-', 'V': ['b-']}],
    ...   "F", "V"):
    ...   print(row)
    {'F': 'a', 'V': ['~æ']}
    {'F': 'bə-', 'V': ['b-', 'be-']}

    """
    for r, row in enumerate(table):
        forms = [("", row[form_column_name])]
        for separator in split_at:
            forms = [
                ("", form.strip())
                for _, chunk in forms
                for form in chunk.split(separator)
            ]
        for separator in split_at_and_keep:
            forms = [
                (first_separator if f == 0 else separator, form.strip())
                for first_separator, chunk in forms
                for f, form in enumerate(chunk.split(separator))
            ]
        if len(forms) > 1:
            logger.info(
                "Line %d: Split form '%s' into %d elements.",
                r,
                row[form_column_name],
                len(forms),
            )
            if forms[0][0]:
                logger.warning(
                    "First element was marked as variant using %s, ignoring the marker",
                    forms[0][0],
                )
            row[form_column_name] = forms[0][1]
            row[variants_column_name].extend(
                [f"{separator}{form}" for separator, form in forms[1:]]
            )
        yield row
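# Hedged usage sketch: `clean_forms` is a pure generator over row dicts, so it can be
# exercised on in-memory data. The column names "Form"/"Variants" and the example
# strings are hypothetical.
def _example_clean_forms():
    rows = [
        {"Form": "kau ~ kaw", "Variants": []},
        {"Form": "na-, ni-", "Variants": []},
    ]
    cleaned = list(
        clean_forms(rows, form_column_name="Form", variants_column_name="Variants")
    )
    # Expected with the default separators: the first row keeps "kau" with variant
    # "~kaw", the second keeps "na-" with variant "ni-".
    return cleaned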
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # Add a status column if it does not exist yet and a status update was given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id" or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the concept table (ParameterTable)."

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new] for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        # dataset["ParameterTable"].tableSchema.columns["c_id"]
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]
        c_id = dataset["ParameterTable", "id"].name
        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(
            ParameterTable=[
                substitute_many(r, [c_id], {original: replacement}, status_update=None)
                for r in concepts
            ]
        )
        rename(dataset, {original: replacement}, logger, status_update=status_update)
def import_cognates_from_excel(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    dataset: pycldf.Dataset,
    extractor: re.Pattern = re.compile("/(?P<ID>[^/]*)/?$"),
    logger: cli.logging.Logger = cli.logger,
) -> None:
    logger.info("Loading sheet…")
    logger.info(
        f"Importing cognate sets from sheet {ws.title} into {dataset.tablegroup._fname}…"
    )
    row_header, _ = header_from_cognate_excel(ws, dataset, logger=logger)
    excel_parser_cognate = CognateEditParser(
        dataset,
        top=2,
        # When the dataset has cognateset comments, that column is not a header
        # column, so this value is one higher than the actual number of header
        # columns, which is actually correct for the 1-based indices. When there
        # is no comment column, we need to compensate for the 1-based Excel
        # indices.
        cellparser=cell_parsers.CellParserHyperlink(dataset, extractor=extractor),
        row_header=row_header,
        check_for_language_match=[dataset["LanguageTable", "name"].name],
        check_for_match=[dataset["FormTable", "id"].name],
        check_for_row_match=[dataset["CognatesetTable", "id"].name],
    )
    excel_parser_cognate.db.cache_dataset()
    excel_parser_cognate.db.drop_from_cache("CognatesetTable")
    excel_parser_cognate.db.drop_from_cache("CognateTable")
    logger.info("Parsing cognate Excel…")
    excel_parser_cognate.parse_cells(ws, status_update=None)
    excel_parser_cognate.db.write_dataset_from_cache(
        ["CognateTable", "CognatesetTable"]
    )
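# Hedged usage sketch: load a cognates workbook with openpyxl and feed its first
# worksheet to `import_cognates_from_excel`. The file names are hypothetical.
def _example_import_cognates():
    import openpyxl
    import pycldf

    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    workbook = openpyxl.load_workbook("cognates.xlsx")  # assumed Excel export
    import_cognates_from_excel(workbook.active, dataset, logger=cli.logger)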
def cache_dataset(self, logger: cli.logging.Logger = cli.logger):
    logger.info("Caching dataset into memory…")
    for table in self.dataset.tables:
        table_type = (
            table.common_props.get("dc:conformsTo", "").rsplit("#", 1)[1]
            or table.url
        )
        (id,) = table.tableSchema.primaryKey
        # Extent may be wrong, but it's usually at least roughly correct
        # and a better indication of the table size than none at all.
        try:
            self.cache[table_type] = {
                row[id]: row
                for row in cli.tq(
                    table,
                    task="Cache the dataset",
                    total=table.common_props.get("dc:extent"),
                )
            }
        except FileNotFoundError:
            self.cache[table_type] = {}
    for source in self.dataset.sources:
        self.source_ids.add(source.id)
def filter(
    table: t.Iterable[R],
    column: str,
    filter: re.Pattern,
    invert: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterator[R]:
    """Return all rows matching a filter

    Match the filter regular expression and return all rows in the table where
    the filter matches the column. (Or all where it does not, if invert==True.)

    >>> list(filter([
    ...   {"C": "A"},
    ...   {"C": "An"},
    ...   {"C": "T"},
    ...   {"C": "E"},
    ... ], "C", re.compile("A"), invert=True))
    [{'C': 'T'}, {'C': 'E'}]

    """
    n_row = 0
    n_included = 0
    for row in table:
        n_row += 1
        # TODO: Treat list-valued columns better.
        string = str(row[column])
        row_matches = bool(filter.search(string))
        if row_matches ^ invert:
            n_included += 1
            yield row
    logger.info(
        "Filtered %d rows down to %d (%1.0f%%)",
        n_row,
        n_included,
        n_included / n_row * 100,
    )
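# Hedged usage sketch: stream FormTable rows through `filter`, keeping only rows whose
# (hypothetical) "Language_ID" column matches a regular expression. Path and column
# name are placeholders.
def _example_filter_forms():
    import pycldf

    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    kept = list(filter(dataset["FormTable"], "Language_ID", re.compile("^proto_")))
    return kept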
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add "
            "any definitions from Concepticon to it. Try running "
            "lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Create a concepticon_definition column
    try:
        dataset["ParameterTable", column_name]
        logger.info(
            "Overwriting existing {:} column in concepts table".format(column_name)
        )
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.

    # write concepticon definitions
    write_back = []
    for row in cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[concepticon_ids]
            ].definition
        except KeyError:
            pass
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
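# Hedged usage sketch: add Concepticon glosses to the concepts table, writing them
# into a custom column. The metadata path and column name are hypothetical choices.
def _example_add_definitions():
    import pycldf

    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    add_concepticon_definitions(dataset, column_name="Concepticon_Definition")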
def update_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    mapping: t.Mapping[str, str],
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys, according to mapping."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    rows = []
    for row in cli.tq(
        ds[table],
        task=f"Updating ids of {table.url.string}",
        total=ds[table].common_props.get("dc:extent"),
    ):
        row[c_id.name] = mapping.get(row[c_id.name], row[c_id.name])
        rows.append(row)

    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    c_id.datatype.format = ID_FORMAT.pattern

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)
            ]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }
    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        logger.info(
            f"Applying changed foreign key to columns {columns:} in {other_table:}…"
        )
        rows = []
        for row in cli.tq(
            ds[other_table],
            total=ds[other_table].common_props.get("dc:extent"),
            task="Replacing changed IDs",
        ):
            for column in columns:
                # TODO: is this enough to handle columns with a separator? like parameterReference in forms table
                if isinstance(row[column], list):
                    row[column] = [mapping.get(v, v) for v in row[column]]
                else:
                    row[column] = mapping.get(row[column], row[column])
            rows.append(row)

        logger.info(f"Writing {other_table} back to file…")
        ds[other_table].write(rows)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype
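# Hedged usage sketch: rewrite LanguageTable IDs (and every foreign key pointing to
# them) according to an explicit mapping. The metadata path and IDs are hypothetical.
def _example_update_language_ids():
    import pycldf

    ds = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    mapping = {"lang1": "standard_arabic", "lang2": "gulf_arabic"}  # hypothetical IDs
    update_ids(ds, ds["LanguageTable"], mapping, logger=cli.logger)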
def update_integer_ids(
    ds: pycldf.Dataset,
    table: csvw.metadata.Table,
    logger: cli.logging.Logger = cli.logger,
):
    """Update all IDs of the table in the database, also in foreign keys."""
    c_id = table.get_column("http://cldf.clld.org/v1.0/terms.rdf#id")
    max_id = 0
    no_integer_rows: t.Set[str] = set()
    # logger.info("Checking IDs that are already integers…")
    for row in cli.tq(
        ds[table],
        task="Checking IDs that are already integers…",
        total=ds[table].common_props.get("dc:extent"),
    ):
        try:
            max_id = max(int(row[c_id.name]), max_id)
        except ValueError:
            no_integer_rows.add(row[c_id.name])

    logger.info("Adding integer IDs to other rows…")
    mapping: t.Dict[str, int] = dict()
    rows: t.List[t.Dict[str, t.Any]] = []
    for row in cli.tq(
        ds[table],
        task="Updating integer ids",
        total=ds[table].common_props.get("dc:extent"),
    ):
        original = row[c_id.name]
        if row[c_id.name] in no_integer_rows:
            max_id += 1
            row[c_id.name] = max_id
        else:
            row[c_id.name] = int(row[c_id.name])
        mapping[original] = row[c_id.name]
        rows.append(row)

    logger.info(f"Writing {table.url.string} back to file…")
    table.write(rows)

    foreign_keys_to_here = {
        other_table.url.string: {
            foreign_key.columnReference[
                foreign_key.reference.columnReference.index(c_id.name)
            ]
            for foreign_key in other_table.tableSchema.foreignKeys
            if foreign_key.reference.resource == table.url
            if c_id.name in foreign_key.reference.columnReference
        }
        for other_table in ds.tables
    }
    for other_table, columns in foreign_keys_to_here.items():
        if not columns:
            continue
        rows = []
        for row in cli.tq(
            ds[other_table],
            task=f"Applying changed foreign key to {other_table}…",
            total=ds[other_table].common_props.get("dc:extent"),
        ):
            for column in columns:
                # TODO: is this enough to handle columns with a separator? like parameterReference in forms table
                if isinstance(row[column], list):
                    row[column] = [mapping[v] for v in row[column]]
                else:
                    row[column] = mapping[row[column]]
            rows.append(row)

        for column in columns:
            ds[other_table, column].datatype = c_id.datatype

        logger.info(f"Writing {other_table} back to file…")
        ds[other_table].write(rows)
def read_single_excel_sheet(
    dataset: pycldf.Dataset,
    sheet: openpyxl.worksheet.worksheet.Worksheet,
    logger: cli.logging.Logger = cli.logger,
    match_form: t.Optional[t.List[str]] = None,
    entries_to_concepts: t.Mapping[str, str] = KeyKeyDict(),
    concept_column: t.Optional[str] = None,
    ignore_missing: bool = False,
    ignore_superfluous: bool = False,
    status_update: t.Optional[str] = None,
) -> t.Mapping[str, ImportLanguageReport]:
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)

    concept_columns: t.Tuple[str, str]
    if concept_column is None:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            dataset["FormTable", "parameterReference"].name,
        )
    else:
        concept_columns = (
            dataset["FormTable", "parameterReference"].name,
            concept_column,
        )

    db = DB(dataset)
    db.cache_dataset()
    # required cldf fields of a form
    c_f_id = db.dataset["FormTable", "id"].name
    c_f_language = db.dataset["FormTable", "languageReference"].name
    c_f_form = db.dataset["FormTable", "form"].name
    c_f_value = db.dataset["FormTable", "value"].name
    c_f_concept = db.dataset["FormTable", "parameterReference"].name
    if not match_form:
        match_form = [c_f_form, c_f_language]
    if not db.dataset["FormTable", c_f_concept].separator:
        logger.warning(
            "Your metadata does not allow polysemous forms. According to your specifications, "
            "identical forms with different concepts will always be considered homophones, not a single "
            "polysemous form. To include polysemous forms, add a separator to your FormTable #parameterReference "
            "in the Metadata.json. To find potential polysemies, run lexedata.report.list_homophones."
        )
        match_form.append(c_f_concept)
    else:
        if c_f_concept in match_form:
            logger.info(
                "Matching by concept enabled: To find potential polysemies, run lexedata.report.list_homophones."
            )

    sheet_header = get_headers_from_excel(sheet)
    form_header = list(db.dataset["FormTable"].tableSchema.columndict.keys())

    # These columns don't need to be given, we can infer them from the sheet title and from the other data:
    implicit: t.Dict[Literal["languageReference", "id", "value"], str] = {}
    if c_f_language not in sheet_header:
        implicit["languageReference"] = c_f_language
    if c_f_id not in sheet_header:
        implicit["id"] = c_f_id
    if c_f_value not in sheet_header:
        implicit["value"] = c_f_value

    found_columns = set(sheet_header) - {concept_column} - set(implicit.values())
    expected_columns = set(form_header) - {c_f_concept} - set(implicit.values())
    if not found_columns >= expected_columns:
        if ignore_missing:
            logger.info(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"For the newly imported forms, these columns will be left empty in the dataset."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} is missing columns {expected_columns - found_columns}. "
                f"Clean up your data, or use --ignore-missing-excel-columns to import anyway and leave these "
                f"columns empty in the dataset for the newly imported forms."
            )
    if not found_columns <= expected_columns:
        if ignore_superfluous:
            logger.info(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. These columns will be ignored."
            )
        else:
            raise ValueError(
                f"Your Excel sheet {sheet.title} contained unexpected columns "
                f"{found_columns - expected_columns}. Clean up your data, or use "
                f"--ignore-superfluous-excel-columns to import the data anyway and ignore these columns."
            )

    # check whether the language exists
    c_l_name = db.dataset["LanguageTable", "name"].name
    c_l_id = db.dataset["LanguageTable", "id"].name
    language_name_to_language_id = {
        row[c_l_name]: row[c_l_id] for row in db.cache["LanguageTable"].values()
    }
    language_name = normalize_string(sheet.title)
    if language_name in language_name_to_language_id:
        language_id = language_name_to_language_id[language_name]
        report[language_id].is_new_language = False
    else:
        language_id = language_name
        report[language_id].is_new_language = True

    # read new data from sheet
    for form in cli.tq(
        import_data_from_sheet(
            sheet,
            sheet_header=sheet_header,
            implicit=implicit,
            language_id=language_id,
            concept_column=concept_columns,
        ),
        task=f"Parsing cells of sheet {sheet.title}",
        total=sheet.max_row,
    ):
        # if the concept is not in the dataset, don't add the form
        try:
            concept_entry = form[c_f_concept]
            entries_to_concepts[concept_entry]
        except KeyError:
            logger.warning(
                f"Concept {concept_entry} was not found. Please add it to the concepts.csv file manually. "
                f"The corresponding form was ignored and not added to the dataset."
            )
            report[language_id].skipped += 1
            continue
        # else, look for candidates, link to existing form or add new form
        for item, value in form.items():
            try:
                sep = db.dataset["FormTable", item].separator
            except KeyError:
                continue
            if sep is None:
                continue
            form[item] = value.split(sep)
        form_candidates = db.find_db_candidates(form, match_form)
        if form_candidates:
            new_concept_added = False
            for form_id in form_candidates:
                logger.info(f"Form {form[c_f_value]} was already in dataset.")

                if db.dataset["FormTable", c_f_concept].separator:
                    for new_concept in form[c_f_concept]:
                        if (
                            new_concept
                            not in db.cache["FormTable"][form_id][c_f_concept]
                        ):
                            db.cache["FormTable"][form_id][c_f_concept].append(
                                new_concept
                            )
                            logger.info(
                                f"New form-concept association: Concept {form[c_f_concept]} was added to existing form "
                                f"{form_id}. If this was not intended "
                                f"(because it is a homophonous form, not a polysemy), "
                                f"you need to manually remove that concept from the old form in forms.csv "
                                f"and create a separate new form. If you want to treat identical forms "
                                f"as homophones in general, add "
                                f"--match-forms={' '.join(match_form)}, "
                                f"{db.dataset['FormTable', 'parameterReference']} "
                                f"when you run this script."
                            )
                            new_concept_added = True
                break

            if new_concept_added:
                report[language_id].concepts += 1
            else:
                report[language_id].existing += 1
        else:
            # No candidate form matched, so add this form as a new entry;
            # make_id_unique appends an integer suffix to the ID if necessary.
            form[c_f_language] = language_id
            if "id" in implicit:
                # TODO: check for type of form id column
                form_concept = form[c_f_concept]
                concept_reference = (
                    form_concept[0] if isinstance(form_concept, list) else form_concept
                )
                form[c_f_id] = string_to_id(f"{form[c_f_language]}_{concept_reference}")
            db.make_id_unique(form)
            if status_update:
                form["Status_Column"] = status_update
            db.insert_into_db(form)
            report[language_id].new += 1
    # write to cldf
    db.write_dataset_from_cache()
    return report
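# Hedged usage sketch: import every language sheet of a long-format Excel workbook
# and print the per-language report. The workbook name is a hypothetical placeholder;
# the default KeyKeyDict() means no concept filtering is applied.
def _example_read_single_sheet():
    import openpyxl
    import pycldf

    dataset = pycldf.Dataset.from_metadata("Wordlist-metadata.json")  # assumed path
    workbook = openpyxl.load_workbook("new_forms.xlsx")  # assumed workbook
    for sheet_name in workbook.sheetnames:
        report = read_single_excel_sheet(
            dataset,
            workbook[sheet_name],
            ignore_missing=True,
            status_update="new import",
        )
        print(report)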
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'."
        )
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json")
    )
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.
    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable."
        )

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10
    )

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]

    return ds
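# Hedged usage sketch: generate a Wordlist-metadata.json next to a metadata-free
# forms.csv and write it to disk. The path is a hypothetical placeholder.
def _example_add_metadata():
    from pathlib import Path

    ds = add_metadata(Path("forms.csv"))  # assumed location of the bare word list
    ds.write_metadata()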
def merge_forms(
    data: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    mergers: t.Mapping[str, Merger],
    homophone_groups: t.MutableMapping[types.Form_ID, t.Sequence[types.Form_ID]],
    logger: cli.logging.Logger = cli.logger,
) -> t.Iterable[types.Form]:
    """Merge forms from a dataset.

    TODO: Construct an example that shows that the order given in
    `homophone_groups` is maintained.

    Side Effects
    ============
    Changes homophone_groups: Groups that are skipped are removed

    """
    merge_targets = {
        variant: target
        for target, variants in homophone_groups.items()
        for variant in variants
    }
    for target in homophone_groups:
        assert merge_targets[target] == target

    c_f_id = data["FormTable", "id"].name

    buffer: t.Dict[types.Form_ID, types.Form] = {}

    unknown = set()
    form: types.Form
    for form in cli.tq(
        data["FormTable"],
        task="Going through forms and merging",
        logger=logger,
        total=data["FormTable"].common_props.get("dc:extent"),
    ):
        id: types.Form_ID = form[c_f_id]
        buffer[id] = form
        if id in merge_targets:
            unknown.add(id)
            target_id = merge_targets[id]
            group = homophone_groups[target_id]
            if all(i in buffer for i in group):
                try:
                    buffer[target_id] = merge_group(
                        [buffer[i] for i in group],
                        buffer[target_id].copy(),  # type: ignore
                        mergers,
                        data,
                        logger,
                    )
                    for i in group:
                        if i != target_id:
                            del buffer[i]
                except Skip:
                    logger.info(
                        f"Merging form {id} with forms {[f[c_f_id] for f in group]} was skipped."
                    )
                    del homophone_groups[id]
                    pass
                for i in group:
                    unknown.remove(i)

        for f in list(buffer):
            if f in unknown:
                break
            yield buffer.pop(f)
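# Hedged usage sketch: merge two homophonous forms into one. `data` is a CLDF word
# list and `mergers` stands in for whatever column-wise merge functions the
# surrounding module defines; the form IDs in the mapping are hypothetical.
def _example_merge_forms(data, mergers):
    homophone_groups = {"form1": ["form1", "form2"]}  # hypothetical form IDs
    merged_rows = list(merge_forms(data, mergers, homophone_groups, logger=cli.logger))
    return merged_rows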
def __init__(
    self,
    dataset: pycldf.Dataset,
    element_semantics: t.Iterable[t.Tuple[str, str, str, bool]] = [
        # ("[", "]", "phonetic", True),
        ("<", ">", "form", True),
        # ("/", "/", "phonemic", True),
        ("(", ")", "comment", False),
        ("{", "}", "source", False),
    ],
    separation_pattern: str = r"([;,])",
    variant_separator: t.Optional[t.List[str]] = ["~", "%"],
    add_default_source: t.Optional[str] = "{1}",
    logger: cli.logging.Logger = cli.logger,
):
    super().__init__(dataset)

    # Columns implied by element semantics
    self.bracket_pairs = {start: end for start, end, _, _ in element_semantics}
    self.element_semantics = {
        start: (term, transcription)
        for start, _, term, transcription in element_semantics
    }
    for start, end, term, transcription in element_semantics:
        # Ensure that all terms required by the element semantics are fields we can write to.
        self.cc(short=term, long=("FormTable", term), dataset=dataset)
    assert self.transcriptions, (
        "Your metadata json file and your cell parser don’t match: Your cell parser "
        f"{self.__class__.__name__} expects to work with transcriptions "
        "(at least one of 'orthographic', 'phonemic', and 'phonetic') to derive a #form "
        "in #FormTable, but your metadata defines no such column."
    )

    # Columns necessary for word list
    self.cc(short="source", long=("FormTable", "source"), dataset=dataset)
    self.cc(short="comment", long=("FormTable", "comment"), dataset=dataset)

    try:
        self.comment_separator = dataset["FormTable", "comment"].separator or "\t"
    except KeyError:
        logger.info("No #comment column found.")
        self.comment_separator = ""

    try:
        # As long as there is no CLDF term #variants, this will either be
        # 'variants' or raise a KeyError. However, it is a transparent
        # re-use of an otherwise established idiom in this module, so we
        # use this minor overhead.
        self.c["variants"] = dataset["FormTable", "variants"].name
    except KeyError:
        logger.warning(
            "No 'variants' column found for FormTable in Wordlist-metadata.json. "
            "Form variants will be added to #comment."
        )

    # Other class attributes
    self.separation_pattern = separation_pattern
    self.variant_separator = variant_separator
    self.add_default_source = add_default_source
def load_forms_from_tsv(
    dataset: types.Wordlist[
        types.Language_ID,
        types.Form_ID,
        types.Parameter_ID,
        types.Cognate_ID,
        types.Cognateset_ID,
    ],
    input_file: Path,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[
    t.Mapping[int, t.Sequence[t.Tuple[types.Form_ID, range, t.Sequence[str]]]],
    t.Set[types.Form_ID],
]:
    """
    Side effects
    ============
    This function overwrites dataset's FormTable
    """
    input = csv.DictReader(
        input_file.open(encoding="utf-8"),
        delimiter="\t",
    )

    # These days, all dicts are ordered by default. Still, better make this explicit.
    forms = util.cache_table(dataset)

    edictor_cognatesets: t.Dict[
        int, t.List[t.Tuple[types.Form_ID, range, t.Sequence[str]]]
    ] = collections.defaultdict(list)

    form_table_upper = {
        (util.cldf_property(column.propertyUrl) or column.name).upper(): (
            util.cldf_property(column.propertyUrl) or column.name
        )
        for column in dataset["FormTable"].tableSchema.columns
    }
    form_table_upper.update(
        {
            "DOCULECT": "languageReference",
            "CONCEPT": "parameterReference",
            "IPA": "form",
            "COGID": "cognatesetReference",
            "ALIGNMENT": "alignment",
            "TOKENS": "segments",
            "CLDF_ID": "id",
            "ID": "",
        }
    )
    if "_PARAMETERREFERENCE" in [f.upper() for f in input.fieldnames]:
        form_table_upper["_PARAMETERREFERENCE"] = "parameterReference"
        form_table_upper["CONCEPT"] = ""

    separators: t.MutableMapping[str, t.Optional[str]] = {}
    # TODO: What's the logic behind going backwards through this? We are not modifying fieldnames.
    for i in range(len(input.fieldnames)):
        if i == 0 and input.fieldnames[0] != "ID":
            raise ValueError(
                "When importing from Edictor, expected the first column to be named "
                f"'ID', but found {input.fieldnames[0]}"
            )

        lingpy = input.fieldnames[i]
        try:
            input.fieldnames[i] = form_table_upper[lingpy.upper()]
        except KeyError:
            logger.warning(
                "Your edictor file contained a column %s, which I could not interpret.",
                lingpy,
            )

        if input.fieldnames[i] == "cognatesetReference":
            separators[input.fieldnames[i]] = " "
        elif input.fieldnames[i] == "alignment":
            separators[input.fieldnames[i]] = " "

        try:
            separators[input.fieldnames[i]] = dataset[
                "FormTable", input.fieldnames[i]
            ].separator
        except KeyError:
            pass

    logger.info(
        "The header of your edictor file will be interpreted as %s.", input.fieldnames
    )

    affected_forms: t.Set[types.Form_ID] = set()
    for line in cli.tq(
        input, task="Importing form rows from edictor…", total=len(forms)
    ):
        # Column "" is the re-named Lingpy-ID column, so the first one.
        if not any(line.values()) or line[""].startswith("#"):
            # One of Edictor's comment rows, storing settings
            continue

        for (key, value) in line.items():
            value = value.replace("\\!t", "\t").replace("\\!n", "\n")
            sep = separators[key]
            if sep is not None:
                if not value:
                    line[key] = []
                else:
                    line[key] = value.split(sep)
            else:
                line[key] = value

        affected_forms.add(line["id"])

        try:
            for segments, cognateset, alignment in extract_partial_judgements(
                line["segments"],
                line["cognatesetReference"],
                line["alignment"],
                logger,
            ):
                edictor_cognatesets[cognateset].append(
                    (line["id"], segments, alignment)
                )
            forms[line["id"]] = line
        except IndexError:
            logger.warning(
                f"In form with Lingpy-ID {line['']}: Cognateset judgements "
                f"{line['cognatesetReference']} and alignment {line['alignment']} "
                f"did not match. At least one morpheme skipped."
            )
    edictor_cognatesets.pop(0, None)

    columns = {
        (util.cldf_property(column.propertyUrl) or column.name): column.name
        for column in dataset["FormTable"].tableSchema.columns
    }
    # Deliberately make use of the property of `write` to discard any entries
    # that don't correspond to existing columns. Otherwise, we'd still have to
    # get rid of the alignment, cognatesetReference and Lingpy-ID columns.
    dataset["FormTable"].write(
        (
            {
                columns[property]: value
                for property, value in form.items()
                if columns.get(property)
            }
            for form in forms.values()
        )
    )
    return edictor_cognatesets, affected_forms
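# Hedged usage sketch: re-import an Edictor/LingPy TSV export, overwriting the
# FormTable and collecting the cognate judgements for further processing. The metadata
# and TSV paths are hypothetical placeholders.
def _example_load_edictor_export():
    from pathlib import Path

    import pycldf

    dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # assumed path
    cognatesets, affected = load_forms_from_tsv(dataset, Path("edictor_export.tsv"))
    cli.logger.info(
        "Read %d cognate sets affecting %d forms.", len(cognatesets), len(affected)
    )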