예제 #1
0
def import_data_from_sheet(
    sheet,
    sheet_header,
    language_id: str,
    implicit: t.Mapping[Literal["languageReference", "id", "value"], str] = {},
    concept_column: t.Tuple[str, str] = ("Concept_ID", "Concept_ID"),
) -> t.Iterable[Form]:
    """Yield one Form for every data row of an Excel sheet.

    The first row of ``sheet`` is skipped as a header row. Each following
    row is zipped with ``sheet_header`` and every cell is passed through
    ``clean_cell_value``. Rows in which any cleaned value equals ``"?"`` are
    skipped entirely.

    Args:
        sheet: Worksheet-like object providing ``iter_rows()`` and ``title``.
        sheet_header: Column names, in sheet order, used as keys of each Form.
        language_id: Value stored under ``implicit["languageReference"]``.
        implicit: Maps the roles ``"languageReference"``, ``"id"`` and
            ``"value"`` to the Form keys that should be filled in
            automatically. The default mapping is only read, never mutated.
        concept_column: ``(target_key, source_key)``. The value found under
            ``source_key`` in the sheet is moved to ``target_key``.

    Yields:
        One Form per remaining row.

    Raises:
        AssertionError: If ``concept_column[1]`` is not in ``sheet_header``.
    """
    row_iter = sheet.iter_rows()

    # TODO?: compare header of this sheet to format of given dataset process
    # row. Maybe unnecessary.
    # Skip the header row. The default keeps a sheet without any rows from
    # raising StopIteration inside this generator, which Python would turn
    # into a RuntimeError (PEP 479).
    next(row_iter, None)

    # Raise explicitly instead of using `assert`, so the check survives
    # `python -O`; exception type and message are unchanged.
    if concept_column[1] not in sheet_header:
        raise AssertionError(
            f"Could not find concept column {concept_column[1]} in your excel sheet {sheet.title}."
        )

    for row in row_iter:
        data = Form({k: clean_cell_value(cell) for k, cell in zip(sheet_header, row)})
        if "?" in data.values():
            continue
        if "value" in implicit:
            # The implicit value is the tab-joined content of the whole row.
            data[implicit["value"]] = "\t".join(map(str, data.values()))
        # Move the concept from the sheet's column name to the target name.
        concept_entry = data.pop(concept_column[1])
        data[concept_column[0]] = concept_entry
        if "id" in implicit:
            # Placeholder only; nothing in this function assigns a real ID.
            data[implicit["id"]] = None
        if "languageReference" in implicit:
            data[implicit["languageReference"]] = language_id
        yield data
예제 #2
0
 def language_from_column(self,
                          column: t.List[openpyxl.cell.Cell]) -> Language:
     """Build a Language from the top cells of a spreadsheet column.

     Only the cells above row ``self.top`` are cleaned; the first of them
     is stored under the dataset's LanguageTable "name" column.
     """
     header_cells = column[:self.top - 1]
     cleaned = [clean_cell_value(cell) for cell in header_cells]
     # Cell comments are deliberately not read here; whether language
     # comments are needed is still an open question.
     name_column = self.db.dataset["LanguageTable", "name"].name
     return Language({name_column: cleaned[0]})
예제 #3
0
 def language_from_column(self,
                          column: t.List[openpyxl.cell.Cell]) -> Language:
     """Build a Language from the top cells of a spreadsheet column.

     The first cleaned cell supplies the language name, the comment attached
     to the first cell becomes the language comment, and the name is passed
     through ``string_to_id`` to obtain an ID candidate.
     """
     header_values = [
         clean_cell_value(cell) for cell in column[:self.top - 1]
     ]
     note = get_cell_comment(column[0])
     name = header_values[0]
     # Only a candidate: it still has to be turned into a unique ID later.
     id_candidate = string_to_id(name)
     return Language(ID=id_candidate, Name=name, Comment=note)
예제 #4
0
    def properties_from_row(
            self, row: t.List[openpyxl.cell.Cell]) -> t.Optional[CogSet]:
        """Build a ``self.row_type`` object from the leading cells of a row.

        The cells left of column ``self.left`` are cleaned and paired with
        ``self.row_header``. The comment of the first cell is stored in the
        table's "comment" column, and the "name" value is used as the ID
        when no ID was given.
        """
        # TODO: get_cell_comment with unicode normalization or not? -> yes, comments also
        table = self.row_type.__table__
        c_id, c_comment, c_name = (
            self.db.dataset[table, key].name
            for key in ("id", "comment", "name")
        )

        cleaned = [clean_cell_value(cell) for cell in row[:self.left - 1]]
        properties = dict(zip(self.row_header, cleaned))
        # Header slots without a name show up as a None key; discard it.
        properties.pop(None, None)

        # The comment is taken from the first cell of the row only.
        properties[c_comment] = get_cell_comment(row[0])

        # The name serves as ID candidate when no ID was supplied.
        if not properties.get(c_id):
            properties[c_id] = properties[c_name]

        return self.row_type(properties)
예제 #5
0
    def properties_from_row(
            self, row: t.List[openpyxl.cell.Cell]) -> t.Optional[RowObject]:
        """Build a CogSet from the leading cells of a row, or return None.

        The cells left of column ``self.left`` are cleaned and paired with
        ``self.row_header``; values of columns that declare a separator are
        split on it. ``None`` is returned when the row has no properties at
        all, or (after logging a warning) when it has neither a cognateset
        name nor an ID. Otherwise the comments of all leading cells are
        joined with tabs and stored in the "comment" column.

        Side effect: ``self.row_prop_separators`` is recomputed on each call.
        """
        dataset = self.db.dataset
        self.row_prop_separators = [
            dataset["CognatesetTable", header].separator
            for header in self.row_header
        ]

        leading_cells = row[:self.left - 1]
        values = [clean_cell_value(cell) for cell in leading_cells]

        properties: t.Dict[str, t.Any] = {}
        for name, separator, value in zip(
                self.row_header, self.row_prop_separators, values):
            # Skip unnamed header slots and empty cells.
            if not name or not value:
                continue
            if separator is None:
                properties[name] = value
            else:
                properties[name] = value.split(separator)

        if not properties:
            return None

        try:
            name_column = dataset["CognatesetTable", "name"].name
        except KeyError:
            # The dataset has no name column for cognate sets.
            name_column = None

        # The ID column is only looked up when the name is missing or empty.
        if not (properties.get(name_column) or properties.get(
                dataset["CognatesetTable", "id"].name)):
            # TODO: Get official logger, or turn this into an Error that can be caught elsewhere.
            cli.logger.warning(
                "Row %d had no cognateset name and no ID, but other metadata: %s. If there are any entries in this row, they have been grouped with the previous row.",
                row[0].row,
                properties,
            )
            return None

        notes: t.List[str] = [
            note for note in map(get_cell_comment, leading_cells)
            if note is not None
        ]
        properties[dataset["CognatesetTable", "comment"].name] = (
            "\t".join(notes).strip())

        return CogSet(properties)
예제 #6
0
def test_cell_value():
    """Check ``clean_cell_value`` on cells round-tripped through an xlsx file.

    The workbook is saved and reloaded so that the cells under test are
    what openpyxl actually reads back from disk. The assertions cover an
    untouched cell, an int, a float, a numeric string, NFC and NFD input
    strings, and a multi-line string.
    """
    # Local import: only the temporary path handling below needs it.
    import os

    wb = op.Workbook()
    ws = wb.active
    ws["A2"] = 2
    ws["A3"] = 3.14
    ws["A4"] = "4"
    ws["B1"] = "über"
    ws["B2"] = unicodedata.normalize("NFD", "über")
    ws["B3"] = "Line\nover\nline"

    # A temporary directory replaces `tempfile.mkstemp`, which returned an
    # open file descriptor that was never closed and left the file behind.
    with tempfile.TemporaryDirectory() as directory:
        filename = os.path.join(directory, "cells.xlsx")
        wb.save(filename)
        del wb, ws

        wb = op.load_workbook(filename)
        ws = wb.active

        # A1 was never written.
        assert clean_cell_value(ws["A1"]) == ""
        assert clean_cell_value(ws["A2"]) == 2
        assert clean_cell_value(ws["A3"]) == 3.14
        assert clean_cell_value(ws["A4"]) == "4"
        # Both the NFC and the NFD input must come back NFC-normalized.
        assert clean_cell_value(ws["B1"]) == unicodedata.normalize("NFC", "über")
        assert clean_cell_value(ws["B2"]) == unicodedata.normalize("NFC", "über")
        # Line breaks inside a cell are expected to become ";\t".
        assert clean_cell_value(ws["B3"]) == "Line;\tover;\tline"
예제 #7
0
def import_interleaved(
    ws: openpyxl.worksheet.worksheet.Worksheet,
    logger: logging.Logger = cli.logger,
    ids: t.Optional[t.Set[types.Cognateset_ID]] = None,
) -> t.Iterable[
    t.Tuple[
        types.Form_ID,
        types.Language_ID,
        types.Parameter_ID,
        str,
        None,
        types.Cognateset_ID,
    ]
]:
    """Parse a worksheet whose rows alternate between forms and cognate sets.

    The first column is read from row 2 on: every first cell of a pair of
    rows holds a concept. Every further column starts with a language name,
    followed by alternating form cells and cognate set cells. A form cell
    may list several forms separated by commas or semicolons (separators
    inside parentheses are ignored); the cognate set cell lists the matching
    cognate sets.

    Args:
        ws: The worksheet to parse.
        logger: Receives warnings about inconsistent cells.
        ids: IDs that are already taken. The set is extended in place with
            every ID generated here; a fresh set is used when omitted.

    Yields:
        Tuples ``(form_id, language_name, concept, form, None, cogset)``.
    """
    if ids is None:
        ids = set()

    comma_or_semicolon = re.compile("[,;]\\W*")

    concepts = []
    for concept_cells in ws.iter_cols(min_col=1, max_col=1, min_row=2):
        # Pairing with the following row means a trailing unpaired cell is
        # not treated as a concept.
        for concept_cell, _ in zip(concept_cells[::2], concept_cells[1::2]):
            try:
                concepts.append(clean_cell_value(concept_cell))
            except AttributeError:
                break

    language_columns = cli.tq(
        ws.iter_cols(min_col=2), task="Parsing cells", total=ws.max_column
    )
    for language in language_columns:
        language_name = clean_cell_value(language[0])
        cell_pairs = zip(language[1::2], language[2::2])
        for c, (entry, cogset_cell) in enumerate(cell_pairs):
            if not entry.value:
                if cogset_cell.value:
                    logger.warning(
                        f"Cell {entry.coordinate} was empty, but cognatesets {cogset_cell.value} were given in {cogset_cell.coordinate}."
                    )
                continue

            text = clean_cell_value(entry)
            try:
                len(text)
            except TypeError:
                cli.Exit.INVALID_INPUT(
                    "I expected one or more forms (so, text) in cell {}, but found {}. Do you have more than one header row?".format(
                        entry.coordinate, text
                    )
                )

            # Split the cell text on separators found outside parentheses.
            # After each split, `text` holds only the still-unparsed rest.
            forms = []
            depth = 0
            pos = 0
            while pos < len(text):
                char = text[pos]
                if char == "(":
                    depth += 1
                elif char == ")":
                    depth -= 1
                elif not depth:
                    separator = comma_or_semicolon.match(text[pos:])
                    if separator:
                        forms.append(text[:pos].strip())
                        text = text[pos + separator.span()[1]:]
                        pos = 0
                        continue
                pos += 1
            forms.append(text.strip())

            cogset_value = clean_cell_value(cogset_cell)
            if isinstance(cogset_value, int):
                cogsets = [str(cogset_value)]
            else:
                cogsets = comma_or_semicolon.split(cogset_value.strip())

            # Either one cognate set for the whole cell or one per form.
            if len(cogsets) not in (1, len(forms)):
                logger.warning(
                    "{:}: Forms ({:}) did not match cognates ({:})".format(
                        entry.coordinate, ", ".join(forms), ", ".join(cogsets)
                    )
                )

            for form, cogset in zip(forms, cogsets + [None]):
                if "?" in (form, cogset):
                    continue
                concept = concepts[c]
                base_id = util.string_to_id(f"{language_name}_{concept}")
                form_id = base_id
                synonym = 1
                # Append a running synonym suffix until the ID is unused.
                while form_id in ids:
                    synonym += 1
                    form_id = f"{base_id}_s{synonym:d}"
                yield (form_id, language_name, concept, form, None, cogset)
                ids.add(form_id)
예제 #8
0
def cells_are_empty(cells: t.Iterable[openpyxl.cell.Cell]) -> bool:
    """Return True if none of the cells has a truthy cleaned value.

    An empty iterable counts as empty. Evaluation stops at the first cell
    whose ``clean_cell_value`` result is truthy, so no intermediate list is
    built.
    """
    return not any(clean_cell_value(cell) for cell in cells)