Example #1
def iter_alignments(dataset, cognate_sets, column='Segments', method='library'):
    """
    Function computes automatic alignments and writes them to file.
    """
    if not isinstance(dataset, lingpy.basic.parser.QLCParser):
        wordlist = _cldf2wordlist(dataset)
        cognates = {r['Form_ID']: r for r in cognate_sets}
        wordlist.add_entries(
            'cogid',
            'lid',
            lambda x: cognates[x]['Cognateset_ID'] if x in cognates else 0)
        alm = lingpy.Alignments(
            wordlist,
            ref='cogid',
            row='parameter_id',
            col='language_id',
            transcription='form',
            segments=column.lower())
        alm.align(method=method)
        for k in alm:
            if alm[k, 'lid'] in cognates:
                cognate = cognates[alm[k, 'lid']]
                cognate['Alignment'] = alm[k, 'alignment']
                cognate['Alignment_Method'] = method
    else:
        alm = lingpy.Alignments(dataset, ref='cogid')
        alm.align(method=method)

        for cognate in cognate_sets:
            idx = cognate['ID'] or cognate['Form_ID']
            cognate['Alignment'] = alm[int(idx), 'alignment']
            cognate['Alignment_Method'] = 'SCA-' + method
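A minimal usage sketch (hypothetical names): `cldf_dataset` stands for a CLDF-style dataset accepted by the surrounding module's `_cldf2wordlist` helper, and `cognate_sets` is assumed to be a list of dicts carrying 'Form_ID' and 'Cognateset_ID' keys, e.g. rows read from a CLDF CognateTable.

cognate_sets = [
    {'Form_ID': 'word-1', 'Cognateset_ID': 'hand-1'},
    {'Form_ID': 'word-2', 'Cognateset_ID': 'hand-1'},
]
iter_alignments(cldf_dataset, cognate_sets, column='Segments', method='library')
# every matched dict in cognate_sets now also carries 'Alignment' and
# 'Alignment_Method'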
Example #2
def test_partial_alignments_with_lexstat():
    lex = lp.LexStat(test_data('test-partial-alignments.tsv'),
                     segments='tokens')
    alms = lp.Alignments(test_data('test-partial-alignments.tsv'),
                         fuzzy=True,
                         ref='cogids',
                         sonar=True,
                         segments='tokens')
    alms.align(scorer=lex.bscorer)
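    # the last aligned sequence of partial cognate set 12 should contain a gap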
    assert '-' in alms.msa['cogids'][12]['alignment'][-1]
Example #3
def iter_alignments(dataset,
                    cognate_sets,
                    column='Segments',
                    method='library',
                    prefix=''):
    """
    Function computes automatic alignments and writes them to file.
    """
    if not isinstance(dataset, lp.basic.parser.QLCParser):
        wordlist = _cldf2wordlist(dataset)
        cognates = {r[0]: r for r in cognate_sets}
        wordlist.add_entries('cogid', 'lid', lambda x: cognates[x][3]
                             if x in cognates else '')
        for i, k in enumerate(wordlist):
            if not wordlist[k, 'cogid']:
                wordlist[k][wordlist.header['cogid']] = 'empty-%s' % i
        alm = lp.Alignments(wordlist,
                            ref='cogid',
                            row='parameter_name',
                            col='language_name',
                            segments=column.lower())
        alm.align(method=method)
        for k in alm:
            if alm[k, 'lid'] in cognates:
                row = list(cognates[alm[k, 'lid']])
                row[7] = alm[k, 'alignment']
                row[8] = method
                yield row
    else:
        alm = lp.Alignments(dataset, ref='cogid')
        alm.align(method=method)
        for row in cognate_sets:
            try:
                idx = int(row[0].split('-')[1])
            except (IndexError, ValueError):
                print(row)
                raise
            row[7] = alm[idx, 'alignment']
            row[8] = 'SCA-' + method
            yield row
Example #4
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens',
                           'ipa',
                           lp.ipa2tokens,
                           merge_vowels=False,
                           expand_nasals=True)
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'Cognacy', 'Loan'),
                         dataset,
                         subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])

                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid, lang2glot[doculect], wl[k, 'doculect'], '',
                    gloss2con.get(wl[k, 'concept'], ''), wl[k, 'concept'],
                    wl[k, 'ipa'], srckey, ' '.join(wl[k, 'tokens'] or ['']),
                    cogid, wl[k, 'loan']
                ])

                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''
                ])

            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
Example #5
def cldf(dataset, concepticon, **kw):
    concepts = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    D = {}  # dictionary to be passed to lingpy
    D[0] = [
        'doculect', 'glottolog', 'concept', 'concepticon', 'ipa', 'segments',
        'cogid', 'alignment'
    ]
    idx = 1
    for f in FILES:
        msa = lp.MSA(
            dataset.raw.joinpath('phonalign_{0}.msa'.format(f)).as_posix())
        concept = msa.seq_id[1:-1]  # strip quotation marks from concept
        cid = concepts.get(concept, '')
        for i, taxon in enumerate(msa.taxa):
            if taxon in languages:
                tid = languages[taxon]
                alignment = ' '.join(msa.alignment[i])
                tokens = ' '.join([x for x in msa.alignment[i] if x != '-'])
                ipa = tokens.replace(' ', '')
                cogid = '{0}-{1}'.format(concept, f)
                D[idx] = [
                    taxon, tid, concept, cid, ipa, tokens, cogid, alignment
                ]
                idx += 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Value', 'Segments', 'Cognacy', 'Source'),
            dataset) as ds:
        src = getEvoBibAsSource('Heggarty2007')
        ds.sources.add(src)
        src = getEvoBibAsSource('List2014e')
        ds.sources.add(src)

        alm = lp.Alignments(D)
        for k in alm:
            ds.add_row(
                ['Heggarty2007-{0}'.format(k)] +
                [alm[k, x] or '' for x in
                 ['glottolog', 'taxon', 'iso', 'concepticon', 'concept', 'ipa']] +
                [' '.join(alm[k, 'tokens']), alm[k, 'cogid'], 'Heggarty2007'])
            dataset.cognates += [[
                'Heggarty2007-{0}'.format(k), ds.name, alm[k, 'ipa'],
                alm[k, 'cogid'], '', 'expert', 'Heggarty2007',
                alm[k, 'alignment'], 'expert', 'List2014e'
            ]]
        dataset.write_cognates()
Example #6
def cldf(dataset, concepticon, **kw):
    orig_ds = Dataset.from_name('baidial')
    orig_ds.commands.cldf(dataset, concepticon, **kw)

    for cldfds in dataset.iter_cldf_datasets():
        for attr in ['dc:isVersionOf', 'dc:provenance']:
            cldfds.table[attr] = dataset.md[attr]
        cldfds.write(outdir=dataset.cldf_dir)

        # Assuming we need nothing else here, we just load the wordlist, align it
        # with lingpy, and create cognates and alignments. There is currently no
        # real source, so we use the placeholder source "List2016i"; ideally the
        # dataset should be published via Zenodo.
        alm = lp.Alignments(
            dataset.raw.joinpath('BDS-cognates.tsv').as_posix())

        cognates = wordlist2cognates(alm, cldfds, 'List2016i')
        dataset.cognates.extend(iter_alignments(alm, cognates))
Example #7
def cldf(dataset, concepticon, **kw):
    wl = lp.Alignments(dataset.raw.joinpath('tukano.tsv').as_posix())
    src1 = getEvoBibAsSource('Chacon2014')
    src2 = getEvoBibAsSource('Chacon2015')
    gloss2conc = {r['GLOSS']: r['CONCEPTICON_ID'] for r in dataset.concepts}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Segments',
            'Cognacy',
    ), dataset) as ds:

        ds.sources.add(src1)
        ds.sources.add(src2)
        for k in wl:
            lid = wl[k, 'language']
            cogid = wl[k, 'cogid']
            concept = wl[k, 'concept']
            segments = wl[k, 'tokens']
            value = wl[k, 'ipa']
            alignment = wl[k, 'alignment']
            name, iso = abbr2lang[lid]
            cid = gloss2conc.get(concept)
            ds.add_row(('Chacon2014-' + str(k),
                        dataset.glottocode_by_iso.get(iso, ''), name, iso, cid,
                        concept, value, 'Chacon2014', ' '.join(segments),
                        str(cogid)))

            cogid = '-'.join([slug(wl[k, 'concept']), '%s' % cogid])
            dataset.cognates.append([
                'Chacon2014-' + str(k), ds.name, wl[k, 'ipa'], cogid, '',
                'expert', 'Chacon2014', alignment, 'expert', 'Chacon2015'
            ])
Example #8
def cognate_code_to_file(
    metadata: Path,
    ratio: float,
    soundclass: str,
    cluster_method: str,
    threshold: float,
    initial_threshold: float,
    gop: float,
    mode: str,
    output_file: Path,
) -> None:
    dataset = pycldf.Wordlist.from_metadata(metadata)
    assert (
        dataset.column_names.forms.segments is not None
    ), "Dataset must have a CLDF #segments column."

    def filter(row: t.Dict[str, t.Any]) -> bool:
        row["tokens"] = [
            str(x)
            for x in clean_segments(row[dataset.column_names.forms.segments.lower()])
        ]
        row["tokens"] = ["+" if x == "_" else x for x in row["tokens"]]
        # TODO: Find the official LingPy way to consider word boundaries to
        # also be morpheme boundaries – just adding them in
        # `partial_cluster(sep=...+'_')` did not work, and why isn't it the
        # default anyway?
        row["doculect"] = row[dataset.column_names.forms.languageReference.lower()]
        row["concept"] = row[dataset.column_names.forms.parameterReference.lower()]
        return row["segments"] and row["concept"]

    lex = lingpy.compare.partial.Partial.from_cldf(
        metadata,
        filter=filter,
        columns=["doculect", "concept", "tokens"],
        model=lingpy.data.model.Model(soundclass),
        check=True,
    )

    if ratio != 1.5:
        if ratio == float("inf"):
            ratio_pair = (1, 0)
            ratio_str = "-inf"
        elif ratio == int(ratio) >= 0:
            r = int(ratio)
            ratio_pair = (r, 1)
            ratio_str = "-{:d}".format(r)
        elif ratio > 0:
            ratio_pair = (ratio, 1)
            ratio_str = "-" + str(ratio)
        else:
            raise ValueError("LexStat ratio must be in [0, ∞]")
    else:
        ratio_pair = (3, 2)
        ratio_str = ""
    if initial_threshold != 0.7:
        ratio_str += "-t{:02d}".format(int(initial_threshold * 100))
    try:
        scorers_etc = lingpy.compare.lexstat.LexStat(
            filename="lexstats-{:}-{:s}{:s}.tsv".format(
                sha1(metadata), soundclass, ratio_str
            )
        )
        lex.scorer = scorers_etc.scorer
        lex.cscorer = scorers_etc.cscorer
        lex.bscorer = scorers_etc.bscorer
    except (OSError, ValueError):
        lex.get_scorer(runs=10000, ratio=ratio_pair, threshold=initial_threshold)
        lex.output(
            "tsv",
            filename="lexstats-{:}-{:s}{:s}".format(
                sha1(metadata), soundclass, ratio_str
            ),
            ignore=[],
        )
    # For some purposes it is useful to have monolithic cognate classes.
    lex.cluster(
        method="lexstat",
        threshold=threshold,
        ref="cogid",
        cluster_method=cluster_method,
        verbose=True,
        override=True,
        gop=gop,
        mode=mode,
    )
    # But actually, in most cases partial cognates are much more useful.
    lex.partial_cluster(
        method="lexstat",
        threshold=threshold,
        cluster_method=cluster_method,
        ref="partialcognateids",
        override=True,
        verbose=True,
        gop=gop,
        mode=mode,
    )
    lex.output("tsv", filename="auto-clusters")
    alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
    alm.align(method="progressive")
    alm.output("tsv", filename=str(output_file), ignore="all", prettify=False)

    try:
        dataset.add_component("CognateTable")
    except ValueError:
        ...
    try:
        dataset.add_component("CognatesetTable")
    except ValueError:
        ...

    read_back = csv.DictReader(
        open(str(output_file) + ".tsv", encoding="utf-8"), delimiter="\t")
    cognatesets = {}
    judgements = []
    i = 1
    for line in read_back:
        partial = line["PARTIALCOGNATEIDS"].split()
        alignment = line["ALIGNMENT"].split(" + ")
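        # Illustrative example (hypothetical values): for ALIGNMENT "t a - + k o"
        # and PARTIALCOGNATEIDS "17 4", the first morpheme gets Segment_Slice
        # ["0:3"] and Alignment ['t', 'a', '-'], the second ["3:5"] and ['k', 'o'].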
        slice_start = 0
        for cs, alm in zip(partial, alignment):
            cognatesets.setdefault(cs, {"ID": cs})
            length = len(alm.split())
            judgements.append(
                {
                    "ID": i,
                    "Form_ID": line["ID"],
                    "Cognateset_ID": cs,
                    "Segment_Slice": [
                        "{:d}:{:d}".format(slice_start, slice_start + length)
                    ],
                    "Alignment": alm.split(),
                    "Source": ["LexStat"],
                }
            )
            i += 1
            slice_start += length
    dataset.write(CognatesetTable=cognatesets.values())
    dataset.write(CognateTable=judgements)
Example #9
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []

    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None
         for l in dataset.languages})

    sources = {}
    for src, langs in groupby(sorted(languages.values(), key=lambda r: r[6]),
                              lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                glottolog.get(lang, glottolog.get(languages[lang][7])), lang,
                languages[lang][7], concepticon[row[1]], row[1], form,
                segments, sources[lang].id, None
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]
        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))

    unmapped.pprint()
Example #10
            'tsv',
            filename='lexstats-{:}-{:s}{:s}'.format(
                sha1(args.wordlist),
                args.soundclass, ratio_str),
            ignore=[])
    # For some purposes it is useful to have monolithic cognate classes.
    lex.cluster(method='lexstat', threshold=args.threshold, ref='cogid',
                cluster_method=args.cluster_method, verbose=True, override=True,
                gop=args.gop, mode=args.mode)
    # But actually, in most cases partial cognates are much more useful.
    lex.partial_cluster(method='lexstat', threshold=args.threshold,
                        cluster_method=args.cluster_method, ref='partialcognateids',
                        override=True, verbose=True, gop=args.gop,
                        mode=args.mode)
    lex.output("tsv", filename="auto-clusters")
    alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
    alm.align(method='progressive')
    alm.output('tsv', filename=args.output, ignore='all', prettify=False)

    try:
        dataset.add_component("CognateTable")
    except ValueError:
        ...
    try:
        dataset.add_component("CognatesetTable")
    except ValueError:
        ...

    read_back = csv.DictReader(open(args.output + ".tsv"), delimiter="\t")
    cognatesets = {}
    judgements = []
Example #11
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }

    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)
    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments'), dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(col,
                                                  merge_vowels=False,
                                                  preparse=PREPARSE,
                                                  rules=CONVERSION,
                                                  semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx), language_map[lang],
                        clean_langs.get(lang, lang), concepticon[concept],
                        concept, col, cleaned_string
                    ])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string
                    ]
                    idx += 1

        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]
        bad_cogs = 1
        cognates = []
        for k in wl:
            if k in id2cog:
                cogid = id2cog[k]
            else:
                cogid = str(bad_cogs)
                bad_cogs += 1
                id2cog[k] = cogid

        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k), ds.name, wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']), '', 'expert',
                'Kitchen2012', '', '', ''
            ]]

        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
Example #12
    def setUp(self):

        self.alm = lp.Alignments(os.path.join('data', 'kessler.qlc'),
                                 loans=False)
Example #13
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary files as follows:
        - the JSON object from `metadata.json` is available as `dataset.md`
        - items from languages.csv are available as `dataset.languages`
        - items from concepts.csv are available as `dataset.concepts`
        - if a Concepticon conceptlist was specified in metadata.json, its ID is
          available as `dataset.conceptlist`
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: all arguments passed on the command line.
    """

    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])
    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify.get(row['CONCEPT'], row['CONCEPT'])
        cids[concept] = row['CONCEPT_SET']

    # language ids
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # track whether the proto-language has already been visited
    visited = []
    idx = max([k for k in wl]) + 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Parameter_Chinese_name', 'Value',
         'Value_Chinese_characters', 'Source', 'Segments', 'Cognacy', 'Rank',
         'Comment'), dataset) as ds:

        ds.sources.add(src)
        ds.sources.add(src2)

        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(wl[k, 'ipa'],
                                   merge_vowels=False,
                                   expand_nasals=True)
            # remove the sandhi annotation from tokens, as it confuses clpa
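            # (e.g. an illustrative token 'tsɔ⁻²¹' becomes 'tsɔ')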
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']
            ]
            if wl[k, 'cogid'] not in visited:
                # we need to add the missing tones, otherwise this won't work: we
                # split into syllables first, then check whether each syllable ends
                # in a tone mark and append one if it does not
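                # Illustrative example (hypothetical form): 'kɑk.ci' would become
                # ['kɑk⁴', 'ci¹'] before being re-joined and re-tokenised below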
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(wl[k, 'concept'], idx), 'sini1245',
                    'Middle Chinese', '', cids[wl[k, 'concept']],
                    wl[k, 'concept'], '', wl[k, 'proto'], wl[k, 'counterpart'],
                    SOURCE, ' '.join(tokens), wl[k, 'cogid'], '', ''
                ])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'], tokens,
                    wl[k, 'cogid']
                ]
                idx += 1
                visited += [wl[k, 'cogid']]
        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k), ds.name, alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']),
                      str(alms[k, 'cogid'])]), '', 'expert', SOURCE, '', '', ''
        ] for k in alms]

        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))
Example #14
def cognate_code_to_file(
    metadata: Path,
    ratio: float,
    soundclass: str,
    cluster_method: str,
    threshold: float,
    initial_threshold: float,
    gop: float,
    mode: str,
    output_file: Path,
) -> None:
    dataset = pycldf.Wordlist.from_metadata(metadata)
    assert (dataset.column_names.forms.segments
            is not None), "Dataset must have a CLDF #segments column."

    lex = lingpy.compare.partial.Partial.from_cldf(
        metadata,
        filter=filter_function_factory(dataset),
        columns=["doculect", "concept", "tokens"],
        model=lingpy.data.model.Model(soundclass),
        check=True,
    )

    if ratio != 1.5:
        if ratio == float("inf"):
            ratio_pair = (1, 0)
            ratio_str = "-inf"
        elif ratio == int(ratio) >= 0:
            r = int(ratio)
            ratio_pair = (r, 1)
            ratio_str = "-{:d}".format(r)
        elif ratio > 0:
            ratio_pair = (ratio, 1)
            ratio_str = "-" + str(ratio)
        else:
            raise ValueError("LexStat ratio must be in [0, ∞]")
    else:
        ratio_pair = (3, 2)
        ratio_str = ""
    if initial_threshold != 0.7:
        ratio_str += "-t{:02d}".format(int(initial_threshold * 100))
    try:
        scorers_etc = lingpy.compare.lexstat.LexStat(
            filename="lexstats-{:}-{:s}{:s}.tsv".format(
                sha1(metadata), soundclass, ratio_str))
        lex.scorer = scorers_etc.scorer
        lex.cscorer = scorers_etc.cscorer
        lex.bscorer = scorers_etc.bscorer
    except (OSError, ValueError):
        lex.get_scorer(runs=10000,
                       ratio=ratio_pair,
                       threshold=initial_threshold)
        lex.output(
            "tsv",
            filename="lexstats-{:}-{:s}{:s}".format(sha1(metadata), soundclass,
                                                    ratio_str),
            ignore=[],
        )
    # For some purposes it is useful to have monolithic cognate classes.
    lex.cluster(
        method="lexstat",
        threshold=threshold,
        ref="cogid",
        cluster_method=cluster_method,
        verbose=True,
        override=True,
        gop=gop,
        mode=mode,
    )
    # But actually, in most cases partial cognates are much more useful.
    lex.partial_cluster(
        method="lexstat",
        threshold=threshold,
        cluster_method=cluster_method,
        ref="partialcognateids",
        override=True,
        verbose=True,
        gop=gop,
        mode=mode,
    )
    lex.output("tsv", filename="auto-clusters")
    alm = lingpy.Alignments(lex, ref="partialcognateids", fuzzy=True)
    alm.align(method="progressive")
    alm.output("tsv", filename=str(output_file), ignore="all", prettify=False)

    try:
        dataset.add_component("CognateTable")
    except ValueError:
        ...
    try:
        dataset.add_component("CognatesetTable")
    except ValueError:
        ...

    read_back = csv.DictReader(open(str(output_file) + ".tsv",
                                    encoding="utf-8"),
                               delimiter="\t")
    cognatesets = {}
    judgements = []
    i = 1
    for line in read_back:
        partial = line["PARTIALCOGNATEIDS"].split()
        alignment = line["ALIGNMENT"].split(" + ")
        slice_start = 0
        for cs, alm in zip(partial, alignment):
            # TODO: @Gereon: is it alright to add the same content to Name and ID?
            cognatesets.setdefault(cs, {"ID": cs, "Name": cs})
            length = len(alm.split())
            judgements.append({
                "ID": i,
                "Form_ID": line["ID"],
                "Cognateset_ID": cs,
                "Segment_Slice": [
                    "{:d}:{:d}".format(slice_start, slice_start + length)
                ],
                "Alignment": alm.split(),
                "Source": ["LexStat"],
            })
            i += 1
            slice_start += length
    dataset.write(CognatesetTable=cognatesets.values())
    dataset.write(CognateTable=judgements)
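    # NOTE: `out` below is not defined in this excerpt; it is assumed to be a
    # wordlist file or dictionary prepared earlier in the original script.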
    lex = lingpy.compare.partial.Partial(out,
                                         col="doculect",
                                         row="concept",
                                         segments="tokens",
                                         transcription="ipa")
    lex.get_scorer(runs=1000)
    lex.output('tsv', filename='lexstats', ignore=[])
    # For some purposes it is useful to have monolithic cognate classes.
    lex.cluster(method='lexstat',
                threshold=0.55,
                ref='cogid',
                cluster_method="infomap",
                verbose=True)
    # But actually, in most cases partial cognates are much more useful.
    lex.partial_cluster(method='lexstat',
                        threshold=0.55,
                        ref='partialids',
                        cluster_method="infomap",
                        verbose=True)
    lex.output("tsv", filename="auto-clusters")
    alm = lingpy.Alignments(lex,
                            row="concept",
                            col="doculect",
                            segments="tokens",
                            transcription="ipa",
                            ref="partialids",
                            fuzzy=True)
    alm.align(method='progressive')
    alm.output('tsv', filename='aligned', ignore='all', prettify=False)