Example #1
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
    logger: cli.logging.Logger = cli.logger,
    status_update: t.Optional[str] = None,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        logger.warning("Clics could not be loaded.")
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon,
                                                  clics)
    else:
        logger.warning(
            f"Dataset {dataset:} had no concepticonReference in a ParamterTable."
        )
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")
    # if status update given, add status column
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="CognatesetTable")
    # write cognatesets with central concepts
    write_back = []
    for row in cli.tq(
            dataset["CognatesetTable"],
            task="Write cognatesets with central concepts to dataset",
            total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            # Keep rows that already have a central concept, so the rewrite
            # below does not drop them from the table.
            write_back.append(row)
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
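A minimal usage sketch for the function above, assuming the usual lexedata imports (typing as t, pycldf, networkx, lexedata.cli as cli) are in scope; the metadata path and status string are assumptions:

import pycldf

dataset = pycldf.Dataset.from_metadata("cldf/Wordlist-metadata.json")  # path assumed
add_central_concepts_to_cognateset_table(
    dataset,
    add_column=True,
    overwrite_existing=False,
    status_update="automatic central concepts",  # hypothetical status string
)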
Example #2
def replace_column(
    dataset: pycldf.Dataset,
    original: str,
    replacement: str,
    column_replace: bool,
    smush: bool,
    status_update: t.Optional[str],
    logger: cli.logging.Logger = cli.logger,
) -> None:
    # add Status_column if not existing and status update given
    if status_update:
        add_status_column_to_table(dataset=dataset,
                                   table_name="ParameterTable")

    if column_replace:
        assert (
            original == "id"
            or original == dataset["ParameterTable", "id"].name
        ), f"Replacing an entire column is only meaningful when you change the #id column ({dataset['ParameterTable', 'id'].name}) of the ConceptTable."

        c_id = dataset["ParameterTable", original].name
        c_new = dataset["ParameterTable", replacement].name
        mapping = {
            concept[c_id]: concept[c_new]
            for concept in dataset["ParameterTable"]
        }
        assert smush or len(mapping) == len(
            set(mapping.values())
        ), "Would collapse some concepts that were distinct before! Add '--smush' if that is intended."
        # dataset["ParameterTable"].tableSchema.columns["c_id"]
        rename(dataset, mapping, logger, status_update=status_update)
    else:
        concepts = dataset["ParameterTable"]

        c_id = dataset["ParameterTable", "id"].name

        logger.info(f"Changing {c_id:} of ParameterTable…")
        dataset.write(ParameterTable=[
            substitute_many(r, [c_id], {original: replacement},
                            status_update=None) for r in concepts
        ])
        rename(dataset, {original: replacement},
               logger,
               status_update=status_update)
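A minimal usage sketch: renaming a single concept ID throughout the dataset. The metadata path and the concept IDs shown are hypothetical:

import pycldf

dataset = pycldf.Dataset.from_metadata("cldf/Wordlist-metadata.json")  # path assumed
# Rewrite one concept ID and all references to it, refusing to merge
# concepts that were distinct before (no smushing):
replace_column(dataset, original="tree_1", replacement="tree",
               column_replace=False, smush=False, status_update=None)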
Example #3
def add_central_concepts_to_cognateset_table(
    dataset: pycldf.Dataset,
    add_column: bool = True,
    overwrite_existing: bool = True,
) -> pycldf.Dataset:
    # create mapping cognateset to central concept
    try:
        clics: t.Optional[networkx.Graph] = load_clics()
    except FileNotFoundError:
        clics = None
    concepts_of_cognateset: t.Mapping[
        CognatesetID, t.Counter[ConceptID]] = connected_concepts(dataset)
    central: t.MutableMapping[str, str] = {}
    if clics and dataset.column_names.parameters.concepticonReference:
        concept_to_concepticon = concepts_to_concepticon(dataset)
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts,
                                                  concept_to_concepticon,
                                                  clics)
    else:
        for cognateset, concepts in concepts_of_cognateset.items():
            central[cognateset] = central_concept(concepts, {}, None)
    dataset = reshape_dataset(dataset, add_column=add_column)
    c_core_concept = dataset.column_names.cognatesets.parameterReference
    if c_core_concept is None:
        raise ValueError(
            f"Dataset {dataset:} had no parameterReference column in a CognatesetTable"
            " and is thus not compatible with this script.")

    # write cognatesets with central concepts
    write_back = []
    for row in tqdm(
            dataset["CognatesetTable"],
            total=dataset["CognatesetTable"].common_props.get("dc:extent"),
    ):
        if not overwrite_existing and row[c_core_concept]:
            # Keep rows with an existing central concept so they survive the rewrite.
            write_back.append(row)
            continue
        row[c_core_concept] = central.get(
            row[dataset.column_names.cognatesets.id])
        write_back.append(row)
    dataset.write(CognatesetTable=write_back)
    return dataset
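For illustration only, a sketch of what a central-concept chooser with this call shape might look like; this is an assumption, not lexedata's actual central_concept:

import typing as t
import networkx

def central_concept_sketch(
    concepts: t.Counter,
    concept_to_concepticon: t.Mapping[str, str],
    clics: t.Optional[networkx.Graph],
) -> str:
    # Hypothetical stand-in for lexedata's central_concept helper.
    # Without CLICS data, fall back to the most frequent concept.
    if clics is None or not concept_to_concepticon:
        return concepts.most_common(1)[0][0]
    # Otherwise rank candidates by how central their Concepticon node is
    # within the CLICS subgraph spanned by this cognate set's concepts.
    nodes = {concept_to_concepticon[c] for c in concepts
             if c in concept_to_concepticon}
    centrality = networkx.closeness_centrality(clics.subgraph(nodes))
    return max(concepts,
               key=lambda c: centrality.get(concept_to_concepticon.get(c), 0.0))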
Example #4
def aligne_cognate_table(dataset: pycldf.Dataset,
                         status_update: t.Optional[str] = None):
    # add Status_Column if not existing – TODO: make configurable
    if status_update:
        add_status_column_to_table(dataset=dataset, table_name="CognateTable")

    forms = util.cache_table(dataset, "FormTable")

    c_id = dataset["CognateTable", "id"].name
    c_form_id = dataset["CognateTable", "formReference"].name
    c_cognateset_id = dataset["CognateTable", "cognatesetReference"].name
    c_slice = dataset["CognateTable", "segmentSlice"].name
    c_alignment = dataset["CognateTable", "alignment"].name

    cognatesets: t.Dict[str, t.List[t.Tuple[t.Tuple[str, t.List[str]], str]]] = {}
    judgements: t.Dict[str, t.Dict[str, t.Any]] = {}
    for judgement in cli.tq(
            dataset["CognateTable"],
            task="Aligning the cognate segments",
            total=dataset["CognateTable"].common_props.get("dc:extent"),
    ):
        judgements[judgement[c_id]] = judgement
        form = forms[judgement[c_form_id]]
        if not judgement[c_slice]:
            morpheme = form["segments"]
        else:
            morpheme = [
                form["segments"][i]
                for i in util.parse_segment_slices(judgement[c_slice])
            ]
        cognatesets.setdefault(judgement[c_cognateset_id], []).append(
            ((form["languageReference"], morpheme), judgement[c_id]))

    for cognateset, morphemes in cognatesets.items():
        for alignment, judgement_id in align(morphemes):
            judgements[judgement_id][c_alignment] = alignment
            if status_update:
                judgements[judgement_id]["Status_Column"] = status_update
    dataset.write(CognateTable=judgements.values())
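The slice handling above delegates to lexedata's util.parse_segment_slices. Purely as an illustration (not lexedata's implementation), a sketch under the CLDF convention that segment slices such as "2:4" are 1-based and inclusive:

import typing as t

def parse_segment_slices_sketch(slices: t.Sequence[str]) -> t.Iterator[int]:
    # Hypothetical helper: "2:4" yields the 0-based indices 1, 2, 3;
    # a bare "3" yields just the 0-based index 2.
    for s in slices:
        start, _, end = s.partition(":")
        yield from range(int(start) - 1, int(end or start))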
Example #5
def add_segments_to_dataset(dataset: pycldf.Dataset, transcription: str,
                            overwrite_existing: bool):
    if dataset.column_names.forms.segments is None:
        # Create a Segments column in FormTable
        dataset.add_columns("FormTable", "Segments")
        c = dataset["FormTable"].tableSchema.columns[-1]
        c.separator = " "
        c.propertyUrl = URITemplate(
            "http://cldf.clld.org/v1.0/terms.rdf#segments")
        dataset.write_metadata()

    write_back = []
    c_f_segments = dataset["FormTable", "Segments"].name
    for row in dataset["FormTable"]:
        if row[c_f_segments] and not overwrite_existing:
            # Keep already-segmented rows so they are not dropped on rewrite.
            write_back.append(row)
            continue
        if row[transcription]:
            form = row[transcription].strip()
            row[c_f_segments] = segment_form(form)
        write_back.append(row)
    dataset.write(FormTable=write_back)
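A minimal usage sketch, segmenting the plain #form column; the metadata path is an assumption:

import pycldf

dataset = pycldf.Dataset.from_metadata("cldf/Wordlist-metadata.json")  # path assumed
c_form = dataset["FormTable", "form"].name
add_segments_to_dataset(dataset, transcription=c_form, overwrite_existing=False)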
Example #6
def add_concepticon_definitions(
    dataset: pycldf.Dataset,
    column_name: str = "Concepticon_Definition",
    logger: cli.logging.Logger = cli.logger,
) -> None:
    concepticon_ids = dataset.column_names.parameters.concepticonReference
    if concepticon_ids is None:
        logger.error(
            "Your concepts table has no #concepticonReference column, so I cannot add any definitions from Concepticon to it. Try running lexedata.edit.add_concepticon to have me guess those references."
        )
        return

    # Create a concepticon_definition column
    try:
        dataset["ParameterTable", column_name]
        logger.info("Overwriting existing {:} column in concepts table".format(
            column_name))
    except KeyError:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
        # Now if this throws an exception, it's an unexpected exception.

    # write concepticon definitions
    write_back = []
    for row in cli.tq(
            dataset["ParameterTable"],
            task="Write concepts with concepticon definitions to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[concepticon_ids]].definition
        except KeyError:
            pass
        write_back.append(row)

    dataset.write(ParameterTable=write_back)
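The function above relies on a module-level concepticon object with an api attribute. One way such an object might be provided, assuming pyconcepticon and a local clone of the Concepticon data (the path and namespace shape are assumptions, not lexedata's verified setup):

import types
from pyconcepticon import Concepticon

# concepticon.api.conceptsets maps Concepticon IDs to concept sets
# that carry a .definition attribute.
concepticon = types.SimpleNamespace(api=Concepticon("path/to/concepticon-data"))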