def import_data_from_sheet(
    sheet,
    sheet_header,
    language_id: str,
    implicit: t.Optional[
        t.Mapping[Literal["languageReference", "id", "value"], str]
    ] = None,
    concept_column: t.Tuple[str, str] = ("Concept_ID", "Concept_ID"),
) -> t.Iterable[Form]:
    """Yield one ``Form`` per data row of ``sheet``.

    The first row of the sheet is treated as a header and skipped. Every
    following row is paired positionally with ``sheet_header`` and each cell
    is passed through ``clean_cell_value``. Rows in which any cleaned value
    equals ``"?"`` are skipped entirely.

    Args:
        sheet: Worksheet-like object providing ``iter_rows()`` and ``title``.
        sheet_header: Column names, matched positionally to the cells of a row.
        language_id: Value stored in the implicit ``languageReference`` column.
        implicit: Maps the roles ``"value"``, ``"id"`` and
            ``"languageReference"`` to the column names that should be filled
            in automatically. ``"value"`` receives all cell values of the row
            joined by tabs (computed before the concept column is renamed),
            ``"id"`` is set to ``None`` and ``"languageReference"`` to
            ``language_id``. ``None`` (the default) means no implicit columns.
        concept_column: ``(target name, name in the sheet header)``; the value
            found under the second name is moved to the first.

    Raises:
        AssertionError: If ``concept_column[1]`` is not in ``sheet_header``.
            As this is a generator, the check runs on first iteration.
    """
    if implicit is None:
        implicit = {}

    row_iter = sheet.iter_rows()

    # TODO?: compare header of this sheet to format of given dataset process
    # row. Maybe unnecessary.
    # The default keeps an empty sheet from leaking StopIteration out of this
    # generator, which Python would turn into a RuntimeError.
    next(row_iter, None)

    assert (
        concept_column[1] in sheet_header
    ), f"Could not find concept column {concept_column[1]} in your excel sheet {sheet.title}."

    for row in row_iter:
        data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
        if "?" in data.values():
            continue
        if "value" in implicit:
            data[implicit["value"]] = "\t".join(map(str, data.values()))
        # Rename the concept column from its sheet name to its target name.
        concept_entry = data.pop(concept_column[1])
        data[concept_column[0]] = concept_entry
        if "id" in implicit:
            data[implicit["id"]] = None
        if "languageReference" in implicit:
            data[implicit["languageReference"]] = language_id
        yield data
def language_from_column(self, column: t.List[openpyxl.cell.Cell]) -> Language:
    """Create a ``Language`` from the leading cells of a language column.

    The first ``self.top - 1`` cells of ``column`` are cleaned with
    ``clean_cell_value``; the first cleaned value is stored under the name of
    the dataset's LanguageTable "name" column. Cell comments are not read.
    """
    header_values = list(map(clean_cell_value, column[: self.top - 1]))
    # Open question: do we need the language comment, i.e.
    # get_cell_comment(column[0])?
    name_column = self.db.dataset["LanguageTable", "name"].name
    return Language({name_column: header_values[0]})
def language_from_column(self, column: t.List[openpyxl.cell.Cell]) -> Language:
    """Create a ``Language`` with ID, name and comment from a language column.

    The first ``self.top - 1`` cells of ``column`` are cleaned with
    ``clean_cell_value``. The first cleaned value becomes the ``Name`` and,
    passed through ``string_to_id``, the ``ID``. The comment attached to the
    first cell becomes the ``Comment``.
    """
    header_values = [clean_cell_value(cell) for cell in column[: self.top - 1]]
    cell_comment = get_cell_comment(column[0])
    name = header_values[0]
    # An id candidate must be provided, which is transformed into a unique id.
    id_candidate = string_to_id(name)
    return Language(ID=id_candidate, Name=name, Comment=cell_comment)
def properties_from_row(
    self, row: t.List[openpyxl.cell.Cell]
) -> t.Optional[CogSet]:
    """Build a ``self.row_type`` object from the metadata cells of a row.

    The first ``self.left - 1`` cells are cleaned and paired with
    ``self.row_header``; an entry keyed by ``None`` is discarded. The comment
    of the first cell is stored in the table's comment column. If the ID
    column is missing or falsy, the value of the name column is used as ID
    candidate. This implementation always returns an object, never ``None``.
    """
    # TODO: get_cell_comment with unicode normalization or not? -> yes, comments also
    dataset = self.db.dataset
    table = self.row_type.__table__
    id_column = dataset[table, "id"].name
    comment_column = dataset[table, "comment"].name
    name_column = dataset[table, "name"].name

    values = [clean_cell_value(cell) for cell in row[: self.left - 1]]
    properties = dict(zip(self.row_header, values))
    # Header cells without a name all collapse onto the single key None.
    properties.pop(None, None)

    properties[comment_column] = get_cell_comment(row[0])

    # The name serves as ID candidate when no ID was given.
    if not properties.get(id_column):
        properties[id_column] = properties[name_column]

    return self.row_type(properties)
def properties_from_row(
    self, row: t.List[openpyxl.cell.Cell]
) -> t.Optional[RowObject]:
    """Build a ``CogSet`` from the metadata cells of one row.

    The first ``self.left - 1`` cells of ``row`` are cleaned and paired with
    ``self.row_header``. Pairs with a falsy header or a falsy value are
    dropped. Where the matching CognatesetTable column declares a separator,
    the value is split on it into a list.

    As a side effect, ``self.row_prop_separators`` is recomputed from
    ``self.row_header`` on every call.

    Returns:
        ``None`` if the row yields no properties at all, or (after logging a
        warning) if it has neither a cognateset name nor an ID. Otherwise a
        ``CogSet`` whose comment column holds the tab-joined, stripped
        comments of the metadata cells.
    """
    self.row_prop_separators = [
        self.db.dataset["CognatesetTable", k].separator for k in self.row_header
    ]
    data = [clean_cell_value(cell) for cell in row[: self.left - 1]]
    properties: t.Dict[str, t.Any] = {
        # A cleaned cell value is not necessarily a string (numeric cells can
        # come back as numbers), so convert before splitting instead of
        # failing with AttributeError.
        n: (v if sep is None else str(v).split(sep))
        for n, sep, v in zip(self.row_header, self.row_prop_separators, data)
        if n
        if v
    }
    if not properties:
        return None

    # The name column is optional in the dataset; the ID column is not.
    try:
        c_s_name = self.db.dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = None
    if not properties.get(c_s_name) and not properties.get(
        self.db.dataset["CognatesetTable", "id"].name
    ):
        # TODO: Get official logger, or turn this into an Error that can be caught elsewhere.
        cli.logger.warning(
            "Row %d had no cognateset name and no ID, but other metadata: %s. If there are any entries in this row, they have been grouped with the previous row.",
            row[0].row,
            properties,
        )
        return None

    # Collect the comments of all metadata cells, skipping cells without one.
    comments: t.List[str] = [
        c
        for c in (get_cell_comment(cell) for cell in row[: self.left - 1])
        if c is not None
    ]
    comment = "\t".join(comments).strip()
    properties[self.db.dataset["CognatesetTable", "comment"].name] = comment
    return CogSet(properties)
def test_cell_value():
    """Round-trip cells through an .xlsx file and check ``clean_cell_value``.

    Covers an empty cell, an int, a float, a numeric string, NFC and NFD
    spellings of the same word, and a multi-line string.
    """
    import os

    wb = op.Workbook()
    ws = wb.active
    ws["A2"] = 2
    ws["A3"] = 3.14
    ws["A4"] = "4"
    ws["B1"] = "über"
    ws["B2"] = unicodedata.normalize("NFD", "über")
    ws["B3"] = "Line\nover\nline"

    # mkstemp would hand back an open file descriptor and leave the file
    # behind; a temporary directory leaks neither and cleans up after itself.
    with tempfile.TemporaryDirectory() as directory:
        filename = os.path.join(directory, "cell_values.xlsx")
        wb.save(filename)
        # Drop the in-memory workbook so the values below come from the file.
        del wb, ws
        wb = op.load_workbook(filename)
        ws = wb.active

        assert clean_cell_value(ws["A1"]) == ""
        assert clean_cell_value(ws["A2"]) == 2
        assert clean_cell_value(ws["A3"]) == 3.14
        assert clean_cell_value(ws["A4"]) == "4"
        assert clean_cell_value(ws["B1"]) == unicodedata.normalize("NFC", "über")
        assert clean_cell_value(ws["B2"]) == unicodedata.normalize("NFC", "über")
        assert clean_cell_value(ws["B3"]) == "Line;\tover;\tline"
def import_interleaved(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    logger: logging.Logger = cli.logger,
    ids: t.Optional[t.Set[types.Cognateset_ID]] = None,
) -> t.Iterable[
    t.Tuple[
        types.Form_ID,
        types.Language_ID,
        types.Parameter_ID,
        str,
        None,
        types.Cognateset_ID,
    ]
]:
    """Parse a worksheet with forms and cognate sets in alternating rows.

    Layout as read by this function: the first column, from row 2 on, holds
    one concept per pair of rows. Every further column is one language: its
    first cell is the language name, followed by alternating form cells and
    cognate-set cells.

    A form cell may hold several forms separated by a comma or semicolon;
    separators inside parentheses do not split. The cognate-set cell is split
    on the same separators. Forms and cognate sets are paired in order, with
    one trailing ``None`` appended to the cognate sets; a pair is skipped if
    either side is ``"?"``.

    Args:
        ws: The worksheet to read.
        logger: Receives warnings about empty form cells that have cognate
            sets and about mismatched form/cognate-set counts.
        ids: IDs already in use. The set is updated in place with every ID
            generated here; a fresh set is used if ``None``.

    Yields:
        ``(form id, language name, concept, form, None, cognate set)``.
    """
    ids = set() if ids is None else ids

    comma_or_semicolon = re.compile("[,;]\\W*")

    concepts = []
    for concept_metadata in ws.iter_cols(min_col=1, max_col=1, min_row=2):
        # Only complete (form row, cognate-set row) pairs contribute a concept.
        for entry, _ in zip(concept_metadata[::2], concept_metadata[1::2]):
            try:
                concept = clean_cell_value(entry)
            except AttributeError:
                break
            concepts.append(concept)

    language_columns = cli.tq(
        ws.iter_cols(min_col=2), task="Parsing cells", total=ws.max_column
    )
    for language in language_columns:
        language_name = clean_cell_value(language[0])
        for c, (entry, cogset) in enumerate(zip(language[1::2], language[2::2])):
            if not entry.value:
                if cogset.value:
                    logger.warning(
                        f"Cell {entry.coordinate} was empty, but cognatesets {cogset.value} were given in {cogset.coordinate}."
                    )
                continue

            text = clean_cell_value(entry)
            try:
                len(text)
            except TypeError:
                cli.Exit.INVALID_INPUT(
                    "I expected one or more forms (so, text) in cell {}, but found {}. Do you have more than one header row?".format(
                        entry.coordinate, text
                    )
                )

            # Split the cell text at separators outside parentheses. After a
            # split, `text` is cut down to the unprocessed remainder and the
            # scan restarts at its beginning.
            forms = []
            depth = 0
            pos = 0
            while pos < len(text):
                char = text[pos]
                if char == "(":
                    depth += 1
                elif char == ")":
                    depth -= 1
                elif not depth:
                    separator = comma_or_semicolon.match(text[pos:])
                    if separator:
                        forms.append(text[:pos].strip())
                        text = text[pos + separator.end():]
                        pos = 0
                        continue
                pos += 1
            forms.append(text.strip())

            cogset_value = clean_cell_value(cogset)
            if isinstance(cogset_value, int):
                cogsets = [str(cogset_value)]
            else:
                cogsets = comma_or_semicolon.split(cogset_value.strip())

            if len(cogsets) != 1 and len(cogsets) != len(forms):
                logger.warning(
                    "{:}: Forms ({:}) did not match cognates ({:})".format(
                        entry.coordinate, ", ".join(forms), ", ".join(cogsets)
                    )
                )

            for form, cogset_id in zip(forms, cogsets + [None]):
                if form == "?" or cogset_id == "?":
                    continue
                base_id = util.string_to_id(f"{language_name}_{concepts[c]}")
                # Number synonyms (_s2, _s3, ...) until the ID is unused.
                form_id = base_id
                synonym = 1
                while form_id in ids:
                    synonym += 1
                    form_id = f"{base_id}_s{synonym:d}"
                yield (form_id, language_name, concepts[c], form, None, cogset_id)
                ids.add(form_id)
def cells_are_empty(cells: t.Iterable[openpyxl.cell.Cell]) -> bool:
    """Return True if no cell in ``cells`` has a truthy cleaned value.

    Every cell is passed through ``clean_cell_value`` before the check, so
    the whole iterable is always consumed.
    """
    cleaned_values = [clean_cell_value(cell) for cell in cells]
    return all(not value for value in cleaned_values)