Exemplo n.º 1
0
def check_na_form_has_no_alternative(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    logger: cli.logging.Logger = cli.logger,
) -> bool:
    """Check that NA forms are the only form for their language and concept.

    A form whose form column equals ``"-"`` is treated as an NA form. For
    every concept such a form is linked to, the set of forms sharing both
    that concept and the NA form's language must consist of the NA form
    alone. Every violation is reported through ``log_or_raise``.

    Parameters
    ----------
    dataset:
        The wordlist whose FormTable is inspected.
    logger:
        Logger handed to ``log_or_raise`` for each violation.

    Returns
    -------
    bool
        ``True`` if no violation was found, ``False`` otherwise.
    """
    c_f_id = dataset["FormTable", "id"].name
    c_f_form = dataset["FormTable", "form"].name
    c_f_concept = dataset["FormTable", "parameterReference"].name
    c_f_language = dataset["FormTable", "languageReference"].name

    forms_by_concept: t.Dict[types.Parameter_ID,
                             t.Set[types.Form_ID]] = t.DefaultDict(set)
    forms_by_language: t.Dict[types.Language_ID,
                              t.Set[types.Form_ID]] = t.DefaultDict(set)
    na_forms = []

    # Build both indexes and collect the NA forms in a single pass over the
    # table. The check itself has to wait until the indexes are complete.
    for f in dataset["FormTable"]:
        for c in util.ensure_list(f[c_f_concept]):
            forms_by_concept[c].add(f[c_f_id])
        forms_by_language[f[c_f_language]].add(f[c_f_id])
        if f[c_f_form] == "-":
            na_forms.append(f)

    valid = True
    for form in na_forms:
        same_language = forms_by_language[form[c_f_language]]
        for c in util.ensure_list(form[c_f_concept]):
            # Anything other than exactly the NA form itself means another
            # form shares this language/concept slot.
            if forms_by_concept[c] & same_language != {form[c_f_id]}:
                log_or_raise(
                    message=
                    f"Non empty forms exist for the NA form {form[c_f_id]} with identical parameter and language reference",
                    log=logger,
                )
                valid = False
    return valid
Exemplo n.º 2
0
def test_toexcel_filtered(cldf_wordlist, working_and_nonworking_bibfile,
                          caplog):
    """Run the matrix export restricted to the parameter keyed "Woman"."""
    dataset, url = working_and_nonworking_bibfile(cldf_wordlist)
    writer = MatrixExcelWriter(dataset=dataset, database_url=str(url))
    exporter = MatrixExcelWriter(
        dataset, database_url="https://example.org/lexicon/{:}")

    forms = util.cache_table(dataset)
    language_rows = util.cache_table(dataset, "LanguageTable").values()
    languages = sorted(language_rows, key=lambda row: row["name"])

    # One pseudo-judgement per (form, concept) pair, so that concepts play
    # the role of cognate sets in the matrix.
    judgements = []
    for f in forms.values():
        for parameter in util.ensure_list(f["parameterReference"]):
            judgements.append({
                "formReference": f["id"],
                "cognatesetReference": parameter,
            })

    parameter_cache = util.cache_table(dataset, "ParameterTable")
    parameters = [
        row for key, row in parameter_cache.items() if key == "Woman"
    ]

    with caplog.at_level(logging.WARNING):
        exporter.create_excel(rows=parameters,
                              judgements=judgements,
                              forms=forms,
                              languages=languages)

    # NOTE(review): this inspects `writer`, on which create_excel was never
    # called, not the exporter populated above -- confirm which worksheet the
    # assertion is meant to check.
    row_count = len(list(writer.ws.iter_rows()))
    assert row_count in {0, 2}
Exemplo n.º 3
0
def test_cell_comments_export():
    """The exported matrix attaches the form's comment to its cell.

    Exports the minimal test dataset with concepts as rows and checks that
    the last cell of the last worksheet column carries the comment
    "A Comment!".
    """
    dataset, _ = copy_to_temp(
        Path(__file__).parent / "data/cldf/minimal/cldf-metadata.json")

    E = MatrixExcelWriter(dataset,
                          database_url="https://example.org/lexicon/{:}")
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    # One pseudo-judgement per (form, concept) pair.
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(rows=parameters,
                   judgements=judgements,
                   forms=forms,
                   languages=languages)

    # Materialize the columns so that an empty sheet fails with a clear
    # assertion instead of a NameError on an unbound loop variable.
    columns = list(E.ws.iter_cols())
    assert columns, "The exported worksheet should contain at least one column."
    last_cell = columns[-1][-1]
    assert (
        last_cell.comment and last_cell.comment.content
    ), "Last row of last column should contain a form, with a comment attached to it."
    assert (last_cell.comment.content == "A Comment!"
            ), "Comment should match the comment from the form table"
Exemplo n.º 4
0
def test_toexcel_runs(cldf_wordlist, working_and_nonworking_bibfile):
    """Smoke test: build the matrix workbook and save it without errors."""
    import os

    dataset, filename = working_and_nonworking_bibfile(cldf_wordlist)
    E = MatrixExcelWriter(
        dataset=dataset,
        database_url=str(filename),
    )
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    # One pseudo-judgement per (form, concept) pair.
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(rows=parameters,
                   judgements=judgements,
                   forms=forms,
                   languages=languages)
    # Save into a temporary directory that is removed afterwards. mkstemp
    # would leave both an open file descriptor and the file itself behind.
    with tempfile.TemporaryDirectory() as out_dir:
        E.wb.save(filename=os.path.join(out_dir, "cognates.xlsx"))
Exemplo n.º 5
0
def apply_heuristics(
    dataset: types.Wordlist,
    heuristic: t.Optional[AbsenceHeuristic] = None,
    primary_concepts: t.Union[
        types.WorldSet[types.Parameter_ID],
        t.AbstractSet[types.Parameter_ID]] = types.WorldSet(),
    logger: cli.logging.Logger = cli.logger,
) -> t.Mapping[types.Cognateset_ID, t.Set[types.Parameter_ID]]:
    """Map every cognateset to its relevant concepts, according to a heuristic.

    The relevant concepts are the ones consulted when deciding whether a
    root counts as absent in a language. If no heuristic is given,
    CentralConcept is used when the CognatesetTable has a
    #parameterReference column, and HalfPrimaryConcepts otherwise.

    With the CentralConcept heuristic, the relevant concepts of a cognateset
    are its central concepts, read from the #parameterReference column of
    the CognatesetTable. A central concept that is not contained in
    primary_concepts is skipped, and a warning is logged.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concept",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference"))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concept": "concept1"}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {'cognateset1': {'concept1'}}
    True

    This extends to the case where a cognateset may have more than one central concept.

    >>> ds = util.fs.new_wordlist()
    >>> cst = ds.add_component("CognatesetTable")
    >>> ds["CognatesetTable"].tableSchema.columns.append(
    ...     pycldf.dataset.Column(
    ...         name="Central_Concepts",
    ...         propertyUrl="http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
    ...         separator=","))
    >>> ds.auto_constraints(cst)
    >>> ds.write(CognatesetTable=[
    ...     {"ID": "cognateset1", "Central_Concepts": ["concept1", "concept2"]}
    ... ])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.CENTRALCONCEPT) == {
    ...     'cognateset1': {'concept1', 'concept2'}}
    True

    With the HalfPrimaryConcepts heuristic, the relevant concepts of a
    cognateset are the concepts of all forms that have a judgement in that
    cognateset. (This branch does not consult primary_concepts.)

    >>> ds = util.fs.new_wordlist(
    ...     FormTable=[
    ...         {"ID": "f1", "Parameter_ID": "c1", "Language_ID": "l1", "Form": "x"},
    ...         {"ID": "f2", "Parameter_ID": "c2", "Language_ID": "l1", "Form": "x"}],
    ...     CognateTable=[
    ...         {"ID": "1", "Form_ID": "f1", "Cognateset_ID": "s1"},
    ...         {"ID": "2", "Form_ID": "f2", "Cognateset_ID": "s1"}])
    >>> apply_heuristics(ds, heuristic=AbsenceHeuristic.HALFPRIMARYCONCEPTS) == {
    ...     's1': {'c1', 'c2'}}
    True


    NOTE: There is no guarantee that every cognateset ends up with at least
    one relevant concept; cognatesets without any may occur. A cognateset
    with 0 relevant concepts will always be included, because 0 is at least
    half of 0.

    Raises TypeError if the heuristic is not a known AbsenceHeuristic.

    """
    if heuristic is None:
        # Pick a default depending on what the dataset offers.
        if ("CognatesetTable", "parameterReference") in dataset:
            heuristic = AbsenceHeuristic.CENTRALCONCEPT
        else:
            heuristic = AbsenceHeuristic.HALFPRIMARYCONCEPTS

    relevant_concepts: t.MutableMapping[
        types.Cognateset_ID, t.Set[types.Parameter_ID]] = t.DefaultDict(set)

    if heuristic is AbsenceHeuristic.CENTRALCONCEPT:
        c_cognateset_concept = dataset["CognatesetTable",
                                       "parameterReference"].name
        c_id = dataset["CognatesetTable", "id"].name
        for c in dataset["CognatesetTable"]:
            for concept in util.ensure_list(c[c_cognateset_concept]):
                if concept in primary_concepts:
                    relevant_concepts[c[c_id]].add(concept)
                    continue
                logger.warning(
                    f"The central concept {concept} of cognateset {c[c_id]} was not part of your list of primary concepts to be included in the coding, so the cognateset will be ignored."
                )
        return relevant_concepts

    if heuristic is AbsenceHeuristic.HALFPRIMARYCONCEPTS:
        c_form_reference = dataset["CognateTable", "formReference"].name
        c_cogset_reference = dataset["CognateTable",
                                     "cognatesetReference"].name
        concept_column = dataset["FormTable", "parameterReference"].name
        forms_with_concepts = util.cache_table(
            dataset,
            "FormTable",
            {"concepts": concept_column},
        )
        for judgement in dataset["CognateTable"]:
            judged_form = forms_with_concepts[judgement[c_form_reference]]
            # Add concept by concept, so that a form without concepts does
            # not create an (empty) entry for its cognateset.
            for concept in util.ensure_list(judged_form["concepts"]):
                relevant_concepts[judgement[c_cogset_reference]].add(concept)
        return relevant_concepts

    raise TypeError(
        f"Value of heuristic, {heuristic}, did not correspond to a known AbsenceHeuristic."
    )
Exemplo n.º 6
0
def coverage_report(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    min_percentage: float = 0.0,
    with_concept: t.Iterable[types.Parameter_ID] = set(),
    missing: Missing = Missing.KNOWN,
    only_coded: bool = True,
) -> t.List[t.List[str]]:
    """Summarize, per language, how many concepts are covered by forms.

    Parameters
    ----------
    dataset:
        The wordlist to report on.
    min_percentage:
        Languages whose coverage (in percent of the total number of
        concepts) is below this value are left out of the report.
    with_concept:
        Concepts that a language must have a counted form for in order to be
        included in the report.
    missing:
        How to treat missing forms. With ``Missing.IGNORE``, forms that are
        empty or ``"-"`` are not counted; with ``Missing.KNOWN``, only empty
        forms are not counted. For any other value, every form is counted.
    only_coded:
        If true, count only forms that are referenced by a judgement in the
        CognateTable; ``cli.Exit.NO_COGNATETABLE`` is invoked when the
        dataset has no such table.

    Returns
    -------
    One row per reported language, containing the language ID, the language
    name (the ID if there is no LanguageTable entry), the number of primary
    concepts covered, the fraction of concepts covered, and the average
    number of counted forms per covered concept. The last two values are
    NaN where their denominator would be zero.
    """
    coded: t.Container[types.Form_ID]
    if only_coded:
        try:
            c_j_form = dataset["CognateTable", "formReference"].name
        except KeyError:
            cli.Exit.NO_COGNATETABLE(
                message=
                "You requested that I only count cognate coded forms, but you have no CognateTable containing judgements."
            )
        coded = {judgement[c_j_form] for judgement in dataset["CognateTable"]}
    else:
        coded = types.WorldSet()

    # Language names are optional: without a usable LanguageTable, the
    # languages are discovered from the forms below and named by their ID.
    languages: t.Dict[types.Language_ID, str] = {}
    try:
        c_l_id = dataset["LanguageTable", "id"].name
        c_l_name = dataset["LanguageTable", "name"].name
        for language in dataset["LanguageTable"]:
            languages[language[c_l_id]] = language[c_l_name]
    except KeyError:
        pass

    concepts: t.DefaultDict[types.Language_ID,
                            t.Counter[types.Parameter_ID]] = t.DefaultDict(
                                t.Counter)
    c_f_id = dataset["FormTable", "id"].name
    c_concept = dataset["FormTable", "parameterReference"].name
    c_language = dataset["FormTable", "languageReference"].name
    c_form = dataset["FormTable", "form"].name
    for form in dataset["FormTable"]:
        languages.setdefault(form[c_language], form[c_language])
        if form[c_f_id] not in coded:
            continue
        if missing == Missing.IGNORE and (not form[c_form]
                                          or form[c_form] == "-"):
            continue
        if missing == Missing.KNOWN and not form[c_form]:
            continue
        c: types.Parameter_ID
        for c in util.ensure_list(form[c_concept]):
            concepts[form[c_language]][c] += 1

    # load primary concepts and number of concepts
    primary_concepts: t.Container[types.Parameter_ID]
    try:
        c_c_id = dataset["ParameterTable", "id"].name
        primary_concept_ids = [
            c[c_c_id] for c in dataset["ParameterTable"] if c["Primary"]
        ]
        total_number_concepts = len(primary_concept_ids)
        # A set makes the per-language membership tests below constant-time.
        primary_concepts = set(primary_concept_ids)
    except KeyError:
        cli.logger.warning(
            "ParameterTable doesn't contain a column 'Primary'. Primary concepts couldn't be loaded. "
            "Loading all concepts.")
        primary_concepts = types.WorldSet()

        try:
            total_number_concepts = len(list(dataset["ParameterTable"]))
        except KeyError:
            # `set().union(...)` also works when no form was counted at all,
            # unlike the unbound `set.union(*())`, which raises a TypeError.
            total_number_concepts = len(
                set().union(*(set(cs) for cs in concepts.values())))

    # Materialize once: a one-shot iterable would otherwise be exhausted
    # after the first language and silently stop filtering.
    required_concepts = tuple(with_concept)

    data_languages = []
    for language, name in languages.items():
        conceptlist = concepts[language]
        try:
            synonyms = sum(conceptlist.values()) / len(conceptlist)
        except ZeroDivisionError:
            synonyms = float("nan")

        # percentage of all concepts covered by this language
        try:
            conceptlist_percentage = len(conceptlist) / total_number_concepts
        except ZeroDivisionError:
            # No concepts to measure against: coverage is undefined.
            conceptlist_percentage = float("nan")
        if conceptlist_percentage * 100 < min_percentage:
            continue

        if not all(c in conceptlist for c in required_concepts):
            continue

        # count primary concepts
        primary_count = sum(1 for c in conceptlist if c in primary_concepts)
        data_languages.append([
            language,
            name,
            primary_count,
            conceptlist_percentage,
            synonyms,
        ])
    return data_languages
Exemplo n.º 7
0
def forms_to_tsv(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    languages: t.Iterable[str],
    concepts: t.Set[str],
    cognatesets: t.Iterable[str],
    logger: cli.logging.Logger = cli.logger,
):
    """Collect forms and their cognate judgements for an Edictor export.

    Parameters
    ----------
    dataset:
        The wordlist to export. ``cli.Exit.NO_SEGMENTS`` is invoked if its
        FormTable has no segments column.
    languages:
        Only forms of these languages are selected.
    concepts:
        Only forms linked to at least one of these concepts are selected.
    cognatesets:
        Only judgements for these cognatesets are taken into account.
    logger:
        Receives warnings about forms without segments, about escape
        sequences already present in the data, and about judgements that
        had to be skipped.

    Returns
    -------
    A triple ``(forms, judgements_about_form, cognateset_cache)``: the
    selected forms by ID, normalized so that list-valued columns other than
    the segments are joined into strings and tabs/newlines are escaped; for
    each selected form, a pair of its alignment and a list that
    ``glue_in_alignment`` fills in alongside it; and the mapping from
    cognateset IDs to integer IDs.
    """
    try:
        dataset["FormTable", "segments"].name
    except KeyError:
        cli.Exit.NO_SEGMENTS(
            """Edictor export requires your dataset to have segments in the FormTable.
        Run `lexedata.edit.add_segments` to automatically add segments based on your forms."""
        )

    delimiters = {
        util.cldf_property(c.propertyUrl) or c.name: c.separator
        for c in dataset["FormTable"].tableSchema.columns if c.separator
    }

    # prepare the header for the tsv output
    # the first column must be named ID and contain 1-based integer IDs
    # set header for tsv
    tsv_header = list(dataset["FormTable"].tableSchema.columndict.keys())

    tsv_header.insert(0, "LINGPY_ID")
    tsv_header.append("cognatesetReference")
    if "alignment" not in tsv_header:
        tsv_header.append("alignment")
    if "parameterReference" in delimiters:
        tsv_header.append("_parameterReference")

    # select forms and cognates given restriction of languages and concepts, cognatesets respectively
    forms = {}
    for f, form in util.cache_table(dataset).items():
        if form["form"] is None or form["form"] == "-":
            continue
        if form["languageReference"] in languages and concepts.intersection(
                ensure_list(form["parameterReference"])):
            # Normalize the form:
            # 1. No list-valued entries
            for c, d in delimiters.items():
                if c == "segments":
                    continue
                if c == "parameterReference":
                    # Keep the full concept list in a separate column and
                    # only the first concept in the regular one.
                    form["_parameterReference"] = d.join(
                        str(e) for e in form[c])
                    form["parameterReference"] = form["parameterReference"][0]
                    continue

                form[c] = d.join(str(e) for e in form[c])

            if not form.get("segments"):
                logger.warning(
                    "No segments found for form %s. You can generate segments using `lexedata.edit.add_segments`.",
                    form["id"],
                )

            # 2. No tabs, newlines in entries
            for c, v in form.items():
                if isinstance(v, str):
                    if "\\!t" in v or "\\!n" in v:
                        logger.warning(
                            "Your data contains the special characters '\\!t' or '\\!n', which I will introduce for escaping tabs and newlines for edictor. These characters will not survive the back-import."
                        )
                    form[c] = v.replace("\t", "\\!t").replace("\n", "\\!n")

            forms[f] = form

    cognateset_cache: t.Mapping[t.Optional[str], int]
    if "CognatesetTable" in dataset:
        c_cogset_id = dataset["CognatesetTable", "id"].name
        cognateset_cache = {
            cognateset[c_cogset_id]: c
            for c, cognateset in enumerate(dataset["CognatesetTable"], 1)
            if cognateset[c_cogset_id] in cognatesets
        }
    else:
        if cognatesets is None:
            # NOTE(review): the lookup below uses `.get`, which never
            # invokes a defaultdict's factory, so in this branch no
            # judgement passes the filter -- confirm the intended behaviour.
            cognateset_cache = t.DefaultDict(itertools.count().__next__)
        else:
            cognateset_cache = {c: i for i, c in enumerate(cognatesets, 1)}

    # Warn about unexpected non-concatenative ‘morphemes’
    lexedata.report.nonconcatenative_morphemes.segment_to_cognateset(
        dataset, cognatesets, logger)

    # Every form starts out with all of its segments parenthesized and with
    # no cognate sets attached.
    judgements_about_form: t.Mapping[types.Form_ID,
                                     t.Tuple[t.List[str], t.List[int]]] = {
                                         form_id:
                                         ([f"({s})"
                                           for s in form["segments"]], [])
                                         for form_id, form in forms.items()
                                     }
    # Compose all judgements, last-one-rules mode.
    for j in util.cache_table(dataset, "CognateTable").values():
        if j["formReference"] in forms and cognateset_cache.get(
                j["cognatesetReference"]):
            if j.get("alignment"):
                j["alignment"] = [s or "" for s in j["alignment"]]
            else:
                j["alignment"] = forms[j["formReference"]]["segments"]

            try:
                segments_judged = list(
                    parse_segment_slices(segment_slices=j["segmentSlice"],
                                         enforce_ordered=False))
            except TypeError:
                logger.warning(
                    "In judgement %s: No segment slice given. Assuming whole form.",
                    j["id"],
                )
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except KeyError:
                # No segment slice column at all: judge the whole form.
                segments_judged = list(
                    range(len(forms[j["formReference"]]["segments"])))
            except ValueError:
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j["segmentSlice"]),
                )
                continue
            global_alignment, cogsets = judgements_about_form[
                j["formReference"]]
            segment_start, segment_end = min(
                segments_judged), max(segments_judged) + 1
            try:
                glue_in_alignment(
                    global_alignment,
                    cogsets,
                    j["alignment"],
                    j["cognatesetReference"],
                    slice(segment_start, segment_end),
                )
            except IndexError:
                # The slice may be missing or None here (see the fallbacks
                # above), so the log call must not index or join it blindly.
                logger.warning(
                    "In judgement %s: Index error due to bad segment slice %s. Skipped.",
                    j["id"],
                    ",".join(j.get("segmentSlice") or []),
                )
                continue

    return forms, judgements_about_form, cognateset_cache
Exemplo n.º 8
0
def create_singletons(
    dataset: types.Wordlist[types.Language_ID, types.Form_ID,
                            types.Parameter_ID, types.Cognate_ID,
                            types.Cognateset_ID, ],
    status: t.Optional[str] = None,
    by_segment: bool = False,
    logger: cli.logging.Logger = cli.logger,
) -> t.Tuple[t.Sequence[types.CogSet], t.Sequence[types.Judgement]]:
    """Add singleton cognatesets and judgements for material not yet coded.

    With by_segment set, a singleton is created for every range of segments
    that does not belong to any cognate set yet; otherwise one is created
    for every form that has no judgement at all.

    Returns the cognatesets and the judgements, each consisting of the
    existing entries followed by the newly created singletons. The dataset
    itself is not written to by this function.

    """
    forms = util.cache_table(dataset)

    # Column names of the CognateTable; segment slice and alignment are
    # optional and are None when the dataset does not have them.
    c_j_id = dataset["CognateTable", "id"].name
    c_j_cogset = dataset["CognateTable", "cognatesetReference"].name
    c_j_form = dataset["CognateTable", "formReference"].name
    try:
        c_j_segmentslice = dataset["CognateTable", "segmentSlice"].name
    except KeyError:
        c_j_segmentslice = None
    try:
        c_j_alignment = dataset["CognateTable", "alignment"].name
    except KeyError:
        c_j_alignment = None

    has_status_column = dataset.get(("CognatesetTable", "Status_Column"))
    if not has_status_column:
        logger.warning(
            "No Status_Column in CognatesetTable. I will proceed without. Run `lexedata.edit.add_status_column`` in default mode or with table-names CognatesetTable to add a Status_Column."
        )

    try:
        c_s_id = dataset["CognatesetTable", "id"].name
        all_cognatesets = {s[c_s_id]: s for s in dataset["CognatesetTable"]}
    except KeyError:
        # Fall back to the cognatesets that the judgements refer to.
        c_s_id = "id"
        judged_cogset_ids = {j[c_j_cogset] for j in dataset["CognateTable"]}
        all_cognatesets = {}
        for cogset_id in judged_cogset_ids:
            all_cognatesets[cogset_id] = types.Judgement({
                "id": cogset_id,
                "name": cogset_id
            })
    try:
        c_s_name = dataset["CognatesetTable", "name"].name
    except KeyError:
        c_s_name = c_s_id

    all_judgements = list(dataset["CognateTable"])
    if by_segment:
        judgements = segment_to_cognateset(dataset, types.WorldSet(), logger)
        forms_and_segments = uncoded_segments(judgements, logger)
    else:
        judged_forms = {j[c_j_form] for j in all_judgements}
        forms_and_segments = uncoded_forms(forms.values(), judged_forms)

    for form, segment_indices in forms_and_segments:
        # Find the first free ID of the shape X_<form>_<number>.
        i = 1
        while True:
            singleton_id = f"X_{form}_{i:d}"
            if singleton_id not in all_cognatesets:
                break
            i += 1

        new_cogset = types.CogSet({})
        all_cognatesets[singleton_id] = new_cogset
        cogset_properties = {
            c_s_name: util.ensure_list(forms[form]["parameterReference"])[0],
            c_s_id: singleton_id,
            "Status_Column": status,
        }
        try:
            # Fill every declared column, with None where nothing is known.
            for column in dataset["CognatesetTable"].tableSchema.columns:
                new_cogset[column.name] = cogset_properties.get(column.name)
        except KeyError:
            pass

        judgement = types.Judgement({})
        judgement_properties = {
            c_j_id: singleton_id,
            c_j_cogset: singleton_id,
            c_j_form: form,
            c_j_segmentslice: indices_to_segment_slice(segment_indices),
            c_j_alignment: [
                forms[form]["segments"][index] for index in segment_indices
            ],
            "Status_Column": status,
        }
        for column in dataset["CognateTable"].tableSchema.columns:
            judgement[column.name] = judgement_properties.get(column.name)
        all_judgements.append(judgement)

    return all_cognatesets.values(), all_judgements
Exemplo n.º 9
0
        type=str,
        default="https://example.org/lexicon/{:}",
        help=
        "A template string for URLs pointing to individual forms. For example, to"
        " point to lexibank, you would use https://lexibank.clld.org/values/{:}."
        " (default: https://example.org/lexicon/{:})",
    )
    args = parser.parse_args()
    logger = cli.setup_logging(args)

    dataset = (pycldf.Wordlist.from_metadata(args.metadata), )
    E = MatrixExcelWriter(
        dataset,
        database_url=args.url_template,
        logger=logger,
    )
    forms = util.cache_table(dataset)
    languages = sorted(util.cache_table(dataset, "LanguageTable").values(),
                       key=lambda x: x["name"])
    judgements = [{
        "formReference": f["id"],
        "cognatesetReference": parameter
    } for f in forms.values()
                  for parameter in util.ensure_list(f["parameterReference"])]
    parameters = util.cache_table(dataset, "ParameterTable").values()
    E.create_excel(rows=parameters,
                   judgements=judgements,
                   forms=forms,
                   languages=languages)
    E.wb.save(filename=args.excel, )