Example #1
def test_singletons():
    dataset, _ = copy_to_temp_no_bib(
        Path(__file__).parent /
        "data/cldf/smallmawetiguarani/cldf-metadata.json")
    add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")

    all_cogsets, judgements = create_singletons(dataset=dataset,
                                                status="automatic singleton")
    c_c_id = dataset["CognateTable", "id"].name
    c_cs_id = dataset["CognatesetTable", "id"].name
    cognates = [c for c in judgements if c[c_c_id].startswith("X")]
    cogsets = [c for c in all_cogsets if c[c_cs_id].startswith("X")]
    assert cognates == [
        {
            "ID": "X_old_paraguayan_guarani_two_1",
            "Form_ID": "old_paraguayan_guarani_two",
            "Comment": None,
            "Segment_Slice": ["1:5"],
            "Alignment": ["p", "a", "t", "h", "á"],
            "FIXME_IF_you_set_this_column_name_to_Value_it_messes_up_translations_due_to_conflict": "X_old_paraguayan_guarani_two_1",
        },
        {
            "ID": "X_paraguayan_guarani_five_1",
            "Form_ID": "paraguayan_guarani_five",
            "Comment": None,
            "Segment_Slice": ["1:2"],
            "Alignment": ["p", "o"],
            "FIXME_IF_you_set_this_column_name_to_Value_it_messes_up_translations_due_to_conflict": "X_paraguayan_guarani_five_1",
        },
    ]

    assert cogsets == [
        {
            "ID": "X_old_paraguayan_guarani_two_1",
            "Set": None,
            "Comment": None,
            "Name": "two",
            "Status_Column": "automatic singleton",
        },
        {
            "ID": "X_paraguayan_guarani_five_1",
            "Set": None,
            "Comment": None,
            "Name": "five",
            "Status_Column": "automatic singleton",
        },
    ]
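
A minimal usage sketch of the function under test, outside pytest (the metadata path and status string are hypothetical, and the same imports as in the test are assumed; the write-back step is an assumption, since the test only inspects the return values):

dataset = pycldf.Dataset.from_metadata(Path("cldf/cldf-metadata.json"))
add_status_column_to_table(dataset=dataset, table_name="CognatesetTable")
all_cogsets, judgements = create_singletons(dataset=dataset,
                                            status="automatic singleton")
# Persist the new singleton cognate sets and judgements (assumed here,
# not part of the test above):
dataset.write(CognatesetTable=all_cogsets, CognateTable=judgements)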
Example #2
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("Clics could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon,
                                                  clics)
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParameterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")
    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="CognatesetTable")
    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
            dataset["CognatesetTable"],
            task="Write cognatesets with central concepts to dataset",
            total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
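
A minimal invocation sketch (metadata path and status string are hypothetical; assumes the usual imports and a CLDF dataset whose CognateTable links forms to cognate sets):

dataset = pycldf.Dataset.from_metadata(Path("cldf-metadata.json"))
# Keep manually curated central concepts and only fill in missing ones:
dataset = add_central_concepts_to_cognateset_table(
    dataset,
    add_column=True,
    overwrite_existing=False,
    status_update="automatic central concepts",
)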
Example #3
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # add Status_Column if not existing and a status update was given
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id"
            or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new]
            for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]

        c_id = dataset["ParameterTable", "id"].name

        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(ParameterTable=[
            substitute_many(r, [c_id], {original: replacement},
                            status_update=None) for r in concepts
        ])
        rename(dataset, {original: replacement},
               logger,
               status_update=status_update)
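
A minimal usage sketch (the replacement column and status string are hypothetical; with column_replace=True the assertion above requires original to be the #id column of the ParameterTable):

dataset = pycldf.Dataset.from_metadata(Path("cldf-metadata.json"))
replace_column(
    dataset,
    original="id",
    replacement="Name",  # assumes the ParameterTable has a Name column
    column_replace=True,
    smush=False,
    status_update="ids replaced",
)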
Example #4
def aligne_cognate_table(dataset: pycldf.Dataset,
                         status_update: t.Optional[str] = None):
    # add Status_Column if not existing – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")

    forms = util.cache_table(dataset, "FormTable")

    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name

    cognatesets: t.Dict[str, t.List[t.Tuple[t.Tuple[str, t.List[str]], str]]] = {}
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
            dataset["CognateTable"],
            task="Aligning the cognate segments",
            total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        morpheme = []
        if not judgement[c_slice]:
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id]))

    for cognateset, morphemes in cognatesets.items():
        for alignment, id in align(morphemes):
            judgements[id][c_alignment] = alignment
            if status_update:
                judgements[id]["Status_Column"] = status_update
    dataset.write(CognateTable=judgements.values())
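
A minimal invocation sketch (metadata path and status string are hypothetical; assumes the FormTable carries a segments column, which the alignment reads):

dataset = pycldf.Dataset.from_metadata(Path("cldf-metadata.json"))
aligne_cognate_table(dataset, status_update="automatically aligned")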
Example #5
def create_concepticon_for_concepts(
    dataset: pycldf.Dataset,
    language: t.Sequence[t.Tuple[str, str]],
    concepticon_glosses: bool,
    concepticon_definition: bool,
    overwrite: bool,
    status_update: t.Optional[str],
):
    # add Status_Column if status update
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="ParameterTable")
    # add Concepticon_ID column to ParameterTable
    if dataset.column_names.parameters.concepticonReference is None:
        # Create a concepticonReference column
        dataset.add_columns("ParameterTable", "Concepticon_ID")
        c = dataset["ParameterTable"].tableSchema.columns[-1]
        c.valueUrl = "http://concepticon.clld.org/parameters/{Concepticon_ID}"
        c.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#concepticonReference")
        dataset.write_metadata()
    if not language:
        language = [(dataset.column_names.parameters.id, "en")]

    gloss_languages: t.Dict[str, str] = dict(language)
    add_concepticon_references(
        dataset,
        gloss_languages=gloss_languages,
        status_update=status_update,
        overwrite=overwrite,
    )

    if concepticon_glosses:
        add_concepticon_names(dataset)
    if concepticon_definition:
        add_concepticon_definitions(dataset=dataset)
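
A minimal invocation sketch (all argument values are hypothetical; each pair in language maps a gloss column of the ParameterTable to its language code, as consumed by add_concepticon_references):

dataset = pycldf.Dataset.from_metadata(Path("cldf-metadata.json"))
create_concepticon_for_concepts(
    dataset,
    language=[("Name", "en")],
    concepticon_glosses=True,
    concepticon_definition=False,
    overwrite=False,
    status_update="automatic Concepticon link",
)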
Example #6
def add_single_languages(
    metadata: Path,
    sheets: t.Iterable[openpyxl.worksheet.worksheet.Worksheet],
    match_form: t.Optional[t.List[str]],
    concept_name: t.Optional[str],
    ignore_missing: bool,
    ignore_superfluous: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger,
) -> t.Mapping[str, ImportLanguageReport]:
    if status_update == "None":
        status_update = None
    # initialize the dataset from metadata or forms.csv, depending on the command line arguments
    if metadata:
        if metadata.name == "forms.csv":
            dataset = pycldf.Dataset.from_data(metadata)
        else:
            dataset = pycldf.Dataset.from_metadata(metadata)

    concepts: t.Mapping[str, str]
    try:
        cid = dataset["ParameterTable", "id"].name
        if concept_name is None:
            concepts = {c[cid]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = dataset["FormTable", "parameterReference"].name
        else:
            name = dataset["ParameterTable", "name"].name
            concepts = {c[name]: c[cid] for c in dataset["ParameterTable"]}
            concept_column = concept_name
    except (KeyError, FileNotFoundError) as err:
        if isinstance(err, KeyError):
            logger.warning(
                "Did not find a well-formed ParameterTable. Importing all forms independent of concept"
            )
        elif isinstance(err, FileNotFoundError):
            logger.warning(
                f"Did not find {dataset['ParameterTable'].url.string}. "
                f"Importing all forms independent of concept"
            )
        concepts = KeyKeyDict()
        if concept_name:
            concept_column = concept_name
        else:
            concept_column = dataset["FormTable", "parameterReference"].name
    # add Status_Column if not existing and status_update given
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="FormTable")
    report: t.Dict[str, ImportLanguageReport] = defaultdict(ImportLanguageReport)
    # import all selected sheets
    for sheet in sheets:
        for lang, subreport in read_single_excel_sheet(
            dataset=dataset,
            sheet=sheet,
            logger=logger,
            match_form=match_form,
            entries_to_concepts=concepts,
            concept_column=concept_column,
            ignore_missing=ignore_missing,
            ignore_superfluous=ignore_superfluous,
            status_update=status_update,
        ).items():
            report[lang] += subreport
    return report
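
A minimal invocation sketch (file names and status string are hypothetical; each worksheet is expected to hold the forms of a single language):

report = add_single_languages(
    metadata=Path("Wordlist-metadata.json"),
    sheets=openpyxl.load_workbook("new_forms.xlsx").worksheets,
    match_form=None,
    concept_name=None,
    ignore_missing=False,
    ignore_superfluous=False,
    status_update="new import",
    logger=cli.logger,
)
for language, subreport in report.items():
    print(language, subreport)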
Example #7
def load_dataset(
    metadata: Path,
    lexicon: t.Optional[str],
    cognate_lexicon: t.Optional[str] = None,
    status_update: t.Optional[str] = None,
    logger: logging.Logger = cli.logger,
):
    dataset = pycldf.Dataset.from_metadata(metadata)
    # load dialect from metadata
    try:
        dialect = argparse.Namespace(
            **dataset.tablegroup.common_props["special:fromexcel"])
    except KeyError:
        dialect = None

    if not lexicon and not cognate_lexicon:
        raise argparse.ArgumentError(
            None,
            "At least one of WORDLIST and COGNATESETS excel files must be specified",
        )
    if lexicon:
        # load dialect from metadata
        if dialect:
            try:
                EP = excel_parser_from_dialect(dataset, dialect, cognate=False)
            except (AttributeError, KeyError) as err:
                field = re.match(r".*?'(.+?)'.+?'(.+?)'$", str(err)).group(2)
                logger.warning(
                    f"User-defined format specification in the json-file was missing the key {field}, "
                    f"falling back to default parser")
                EP = ExcelParser
        else:
            logger.warning(
                "User-defined format specification in the json-file was missing, falling back to default parser"
            )
            EP = ExcelParser
        # The intermediate storage lives in an in-memory DB (unless specified otherwise)
        # add Status_Column if not existing
        if status_update:
            add_status_column_to_table(dataset=dataset, table_name="FormTable")
        EP = EP(dataset, row_type=Concept)

        EP.db.empty_cache()

        lexicon_wb = openpyxl.load_workbook(lexicon).active
        EP.parse_cells(lexicon_wb, status_update=status_update)
        EP.db.write_dataset_from_cache()

    # load cognate dataset if provided by metadata
    if cognate_lexicon:
        if dialect:
            try:
                ECP = excel_parser_from_dialect(
                    dataset,
                    argparse.Namespace(**dialect.cognates),
                    cognate=True)
            except (AttributeError, KeyError) as err:
                field = re.match(r".*?'(.+?)'.+?'(.+?)'$", str(err)).group(2)
                logger.warning(
                    f"User-defined format specification in the json-file was missing the key {field}, "
                    f"falling back to default parser")
                ECP = ExcelCognateParser
        else:
            logger.warning(
                "User-defined format specification in the json-file was missing, falling back to default parser"
            )
            ECP = ExcelCognateParser
        # add Status_Column if not existing
        if status_update:
            add_status_column_to_table(dataset=dataset,
                                       table_name="CognateTable")
        ECP = ECP(dataset, row_type=CogSet)
        ECP.db.cache_dataset()
        for sheet in openpyxl.load_workbook(cognate_lexicon).worksheets:
            ECP.parse_cells(sheet, status_update=status_update)
        ECP.db.write_dataset_from_cache()
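
A minimal invocation sketch (file names and status string are hypothetical; at least one of lexicon and cognate_lexicon must be given, as enforced above):

load_dataset(
    metadata=Path("Wordlist-metadata.json"),
    lexicon="wordlist.xlsx",
    cognate_lexicon="cognatesets.xlsx",
    status_update="initial import",
)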