Example #1
def cldf(dataset, concepticon, **kw):
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                         dataset,
                         subset=dset.split('-')[0]) as ds:

            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join([str(x) for x in wl[k, 'partialids']]),
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'], idf,
                    '', 'expert', srckey, '', '', ''
                ]]

            dataset.cognates.extend(
                iter_alignments(wl,
                                cognates,
                                method='progressive',
                                prefix=srckey + '-'))
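The `iter_alignments` helper belongs to the pylexibank tooling these scripts import; in lingpy terms it aligns each cognate set of the wordlist. A minimal, self-contained sketch of that step with made-up data, using the same dict convention (header under key 0) that the examples below build by hand:

import lingpy as lp

# made-up data; key 0 holds the header, the other integer keys hold rows
D = {
    0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid'],
    1: ['German', 'hand', 'hant', ['h', 'a', 'n', 't'], 1],
    2: ['English', 'hand', 'hænd', ['h', 'æ', 'n', 'd'], 1],
    3: ['Dutch', 'hand', 'hɑnt', ['h', 'ɑ', 'n', 't'], 1],
}
alm = lp.Alignments(D, ref='cogid')
alm.align(method='progressive')  # 'library' selects consistency-based alignment
print(alm.msa['cogid'][1]['alignment'])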
Example #2
def cldf(dataset, concepticon, **kw):
    concepts = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    D = {}  # dictionary to be passed to lingpy
    D[0] = [
        'doculect', 'glottolog', 'concept', 'concepticon', 'ipa', 'segments',
        'cogid', 'alignment'
    ]
    idx = 1
    for f in FILES:
        msa = lp.MSA(
            dataset.raw.joinpath('phonalign_{0}.msa'.format(f)).as_posix())
        concept = msa.seq_id[1:-1]  # strip quotation marks from concept
        cid = concepts.get(concept, '')
        for i, taxon in enumerate(msa.taxa):
            if taxon in languages:
                tid = languages[taxon]
                alignment = ' '.join(msa.alignment[i])
                tokens = ' '.join([x for x in msa.alignment[i] if x != '-'])
                ipa = tokens.replace(' ', '')
                cogid = '{0}-{1}'.format(concept, f)
                D[idx] = [
                    taxon, tid, concept, cid, ipa, tokens, cogid, alignment
                ]
                idx += 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Value', 'Segments', 'Cognacy', 'Source'),
            dataset) as ds:
        src = getEvoBibAsSource('Heggarty2007')
        ds.sources.add(src)
        src = getEvoBibAsSource('List2014e')
        ds.sources.add(src)

        alm = lp.Alignments(D)
        for k in alm:
            ds.add_row(
                ['Heggarty2007-{0}'.format(k)] +
                [alm[k, x] or '' for x in
                 ['glottolog', 'taxon', 'iso', 'concepticon', 'concept', 'ipa']] +
                [' '.join(alm[k, 'tokens']), alm[k, 'cogid'], 'Heggarty2007'])
            dataset.cognates += [[
                'Heggarty2007-{0}'.format(k), ds.name, alm[k, 'ipa'],
                alm[k, 'cogid'], '', 'expert', 'Heggarty2007',
                alm[k, 'alignment'], 'expert', 'List2014e'
            ]]
        dataset.write_cognates()
Example #3
def cldf(dataset, concepticon, **kw):
    wl = lp.Alignments(dataset.raw.joinpath('tukano.tsv').as_posix())
    src1 = getEvoBibAsSource('Chacon2014')
    src2 = getEvoBibAsSource('Chacon2015')
    gloss2conc = {r['GLOSS']: r['CONCEPTICON_ID'] for r in dataset.concepts}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Segments',
            'Cognacy',
    ), dataset) as ds:

        ds.sources.add(src1)
        ds.sources.add(src2)
        for k in wl:
            lid = wl[k, 'language']
            concept = wl[k, 'concept']
            cogid = wl[k, 'cogid']
            segments = wl[k, 'tokens']
            value = wl[k, 'ipa']
            alignment = wl[k, 'alignment']
            name, iso = abbr2lang[lid]
            cid = gloss2conc.get(concept)
            ds.add_row(('Chacon2014-' + str(k),
                        dataset.glottocode_by_iso.get(iso, ''), name, iso, cid,
                        concept, value, 'Chacon2014', ' '.join(segments),
                        str(cogid)))

            cogid = '-'.join([slug(concept), '%s' % cogid])
            dataset.cognates.append([
                'Chacon2014-' + str(k), ds.name, value, cogid, '',
                'expert', 'Chacon2014', alignment, 'expert', 'Chacon2015'
            ])
Example #4
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }

    with UnicodeReader(
            dataset.raw.joinpath(FILENAME.replace('xls', 'Sheet1.csv'))) as r:
        rows = [row for row in r]

    concepts = [(i, rows[0][i].replace('_', ' ').strip())
                for i in range(1, len(rows[0]), 2)]
    assert all(concept in concepticon for _, concept in concepts)

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments', 'Source'),
                     dataset) as ds:
        ds.table.schema.columns['Value']['dc:format'] = 'IPA'
        ds.sources.add(getEvoBibAsSource(SOURCE))

        for row in rows[3:]:
            row = [col.strip() for col in row]
            if not row[0]:
                continue
            lname = row[0]
            for i, concept in concepts:
                for j, form in iterforms(row[i]):
                    if form != '?' and form.strip():
                        ds.add_row([
                            '%s-%s-%s' % (slug(lname), (i + 1) // 2, j),
                            language_map[lname],
                            lname.replace('_', ' '), concepticon[concept],
                            concept, form, ' '.join(clean_string(form)), SOURCE
                        ])
        # three methods: turchin, sca, lexstat; turchin is fast (needs no threshold)
        cognates = iter_cognates(ds,
                                 column='Segments',
                                 method='turchin',
                                 threshold=0.55)

        # two methods for alignments: progressive or library
        dataset.cognates.extend(
            iter_alignments(ds,
                            cognates,
                            column='Segments',
                            method='progressive'))

    dataset.write_cognates()
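The comment above names lingpy's three cognate-detection methods, which `iter_cognates` presumably wraps. A sketch of the underlying lingpy calls with made-up data:

import lingpy as lp

D = {
    0: ['doculect', 'concept', 'ipa', 'tokens'],
    1: ['German', 'hand', 'hant', ['h', 'a', 'n', 't']],
    2: ['English', 'hand', 'hænd', ['h', 'æ', 'n', 'd']],
    3: ['Dutch', 'hand', 'hɑnt', ['h', 'ɑ', 'n', 't']],
}
lex = lp.LexStat(D)
# turchin: consonant-class matching, no threshold to tune
lex.cluster(method='turchin', ref='cogid')
# sca: distance-based, e.g. lex.cluster(method='sca', threshold=0.45, ref='cogid')
# lexstat: needs a language-specific scorer first, e.g.
# lex.get_scorer(runs=1000); lex.cluster(method='lexstat', threshold=0.55, ref='cogid')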
Example #5
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens',
                           'ipa',
                           lp.ipa2tokens,
                           merge_vowels=False,
                           expand_nasals=True)
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'Cognacy', 'Loan'),
                         dataset,
                         subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])

                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid, lang2glot[doculect], wl[k, 'doculect'], '',
                    gloss2con.get(concept, ''), wl[k, 'concept'],
                    wl[k, 'ipa'], srckey,
                    ' '.join(wl[k, 'tokens'] or ['']), cogid, wl[k, 'loan']
                ])

                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''
                ])

            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
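`add_entries`, used above to supply a missing 'tokens' column, applies a function to an existing column and forwards extra keyword arguments to it. A minimal sketch with made-up data:

import lingpy as lp

wl = lp.Wordlist({
    0: ['doculect', 'concept', 'ipa'],
    1: ['German', 'hand', 'hant'],
})
wl.add_entries('tokens', 'ipa', lp.ipa2tokens,
               merge_vowels=False, expand_nasals=True)
print(wl[1, 'tokens'])  # ['h', 'a', 'n', 't']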
Example #6
def cldf(dataset, concepticon, **kw):
    with UnicodeReader(dataset.raw.joinpath('Wang2004.csv'),
                       delimiter='\t') as reader:
        lines = list(reader)
    lmap = {x['ABBREVIATION']: (x['GLOTTOCODE'], x['ISO'], x['NAME'])
            for x in dataset.languages}
    cmap = {c.english: c.concepticon_id
            for c in dataset.conceptlist.concepts.values()}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        idx = 1
        cogids = {0: 0}
        for i, line in enumerate(lines[1:]):
            concept = line[0]
            cid = cmap[concept]

            for t, cogs in zip(lines[0][1:], line[1:]):
                glottocode, iso, taxon = lmap[t]
                for cog in cogs.split('/'):
                    if cog in cogids:
                        cogid = cogids[cog]
                    else:
                        cogid = max(list(cogids.values()) or [0]) + 1
                        cogids[cog] = cogid
                    ds.add_row((
                        idx, glottocode, taxon, iso, cid, concept, cog, SOURCE,
                        cogid))
                    dataset.cognates.append([
                        idx,
                        ds.name,
                        cog,
                        '-'.join([slug(concept), str(cogid)]),
                        '',
                        'expert',
                        SOURCE,
                        '', '', ''])
                    idx += 1
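The incremental id assignment above can be written more compactly with `dict.setdefault`; an equivalent sketch:

cogids = {}
for cog in ['a', 'b', 'a']:
    # first sight inserts the next free id, later sights reuse it
    cogid = cogids.setdefault(cog, len(cogids) + 1)
assert cogids == {'a': 1, 'b': 2}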
Example #7
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                      'Parameter_ID', 'Parameter_name',
                      'Parameter_Chinese_name', 'Value', 'Segments', 'Source'),
                     dataset) as ds:
        ds.sources.add(src)

        for k in wl:
            if wl[k, 'value'].strip() and wl[k, 'value'] not in ('-', '--', '---'):
                ds.add_row([
                    wl[k, 'lid'], gcode[wl[k, 'doculect']], wl[k, 'doculect'],
                    '', wl[k, 'concepticon_id'], wl[k, 'concept'],
                    wl[k, 'chinese'], wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0], SOURCE
                ])
Example #8
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []

    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None
         for l in dataset.languages})

    sources = {}
    for src, langs in groupby(sorted(languages.values(), key=lambda r: r[6]),
                              lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in ('*', '-', '*-'):
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                glottolog.get(lang, glottolog.get(languages[lang][7])), lang,
                languages[lang][7], concepticon[row[1]], row[1], form,
                segments, sources[lang].id, None
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]
        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))

    unmapped.pprint()
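`Wordlist.renumber`, used above and again in Example #9, converts string cognate labels into the numeric `cogid` column that lingpy's alignment code expects. A minimal sketch with made-up data:

import lingpy as lp

wl = lp.Wordlist({
    0: ['doculect', 'concept', 'ipa', 'cog'],
    1: ['German', 'hand', 'hant', 'hand-1'],
    2: ['English', 'hand', 'hænd', 'hand-1'],
    3: ['French', 'hand', 'mɛ̃', 'hand-2'],
})
wl.renumber('cog')  # adds a numeric 'cogid' column derived from 'cog'
assert wl[1, 'cogid'] == wl[2, 'cogid']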
Example #9
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }

    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)
    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments'), dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(col,
                                                  merge_vowels=False,
                                                  preparse=PREPARSE,
                                                  rules=CONVERSION,
                                                  semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx), language_map[lang],
                        clean_langs.get(lang, lang), concepticon[concept],
                        concept, col, cleaned_string
                    ])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string
                    ]
                    idx += 1

        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]
        bad_cogs = 1
        cognates = []
        for k in wl:
            if k not in id2cog:
                id2cog[k] = str(bad_cogs)
                bad_cogs += 1

        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k), ds.name, wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']), '', 'expert',
                'Kitchen2012', '', '', ''
            ]]

        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
Example #10
def cldf(dataset, concepticon, **kw):
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    glotto_map = {c['NAME']: c['GLOTTOCODE'] for c in dataset.languages}

    # retrieve coordinates
    coords = {}
    langs = []
    # language map, as the names are not identical
    language_map = {
        "Namhsan": "Nam Hsan",
        "Pangkham": "Pang Kham",
        "Xiang Zhai Tang  (Xiang Cai Tang)": "Xiang Zhai Tang"
    }
    with UnicodeReader(
            dataset.raw.joinpath('100item-phylo.Sheet2.csv')) as reader:
        for i, (num, lat, lon, village, country) in enumerate(reader):
            if i >= 1:
                coords[language_map.get(village, village)] = (lat, lon)
                langs.append(language_map.get(village, village))

    cognates = []
    idx = 1
    with UnicodeReader(dataset.raw.joinpath('100item-phylo.Sheet1.csv'),
                       delimiter=',') as reader, \
            CldfDataset((
                'ID',
                'Language_ID',
                'Language_name',
                'Language_iso',
                'Parameter_ID',
                'Parameter_name',
                'Value',
                'Source',
                'Segments',
                'Cognacy',
            ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource('Deepadung2015'))
        ds.metadata['coordinates'] = coords
        data = list(reader)
        header = data[2][2:]
        for i, row in enumerate(data[5:]):
            row = [c.strip() for c in row]
            concept = row[1]
            cid = concept_map[concept]
            for j in range(0, len(header), 2):
                lang = language_map.get(header[j], header[j])
                gcid = glotto_map[lang]
                cog = row[2:][j + 1]
                certainty = 0
                if ' or ' in cog:
                    cog = cog.split(' ')[0]
                    certainty = 1
                word = CORRECT.get(row[2:][j], row[2:][j])
                if word.strip() and ''.join(set(word.strip())) != '-':
                    segments = lp.sequence.sound_classes.clean_string(
                        word,
                        splitters=',',
                        rules=CONVERSION,
                        preparse=PREPARSE,
                        semi_diacritics="")[0]
                    cogid = slug(concept) + '-' + cog
                    ds.add_row([
                        idx, gcid, lang, '', cid, concept, word, PROVIDER,
                        segments, cogid
                    ])
                    cognates.append([
                        idx, ds.name, word, cogid,
                        str(certainty), 'expert', PROVIDER, '', '', ''
                    ])
                    idx += 1
    dataset.cognates.extend(
        iter_alignments(
            ds,
            cognates,
            method='progressive',
        ))
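`clean_string` returns one space-joined segment string per word it detects in the input, which is why the calls above always take element `[0]`; `splitters`, `rules` and `preparse` tune word splitting and orthographic conversion. A minimal sketch with default settings and a made-up form:

from lingpy.sequence.sound_classes import clean_string

segments = clean_string('tʰɔxtər')[0]  # first (and here only) word
print(segments)  # e.g. 'tʰ ɔ x t ə r'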
Example #11
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {x.english: x.concepticon_id for x in
             dataset.conceptlist.concepts.values()}
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2015d')

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:

        ds.sources.add(src, src2)

        # map proto-forms to their cognate-set ids
        p2c = {}

        for k in wl:
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                gcode[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                ccode[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'ipa'],
                SOURCE,
                wl[k, 'cogid']
            ])
            dataset.cognates += [[
                '{0}-{1}'.format(SOURCE, k),
                ds.name,
                wl[k, 'ipa'],
                '-'.join([slug(wl[k, 'concept']), str(wl[k, 'cogid'])]),
                '', 
                'expert',
                SOURCE,
                '',
                '',
                ''
            ]]
            p2c[wl[k, 'proto']] = wl[k, 'cogid']
        idx = max([k for k in wl]) + 1
        for line in lp.csv2list(dataset.raw.joinpath('old_chinese.csv').as_posix()):
            for val in line[1].split(', '):
                ds.add_row((
                    '{0}-{1}'.format(SOURCE, idx),
                    'sini1245',
                    'Old Chinese',
                    '',
                    ccode[line[0]],
                    line[0],
                    val,
                    SOURCE,
                    p2c.get(val, val)
                ))
                dataset.cognates += [[
                    '{0}-{1}'.format(SOURCE, idx),
                    ds.name,
                    val,
                    '-'.join([slug(line[0]), text_type(p2c.get(val, val))]),
                    '',
                    'expert',
                    SOURCE,
                    '',
                    '',
                    '']]
                idx += 1
Example #12
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    # 'year' is missing from the conceptlist mapping; add its Concepticon id manually
    concept_map['year'] = '1226'
    wordlists = list(read_csv(dataset))
    cogsets = defaultdict(lambda: defaultdict(list))
    for wl in wordlists:
        for concept, (words, cogids) in wl.words.items():
            if len(cogids) == 1:
                cogsets[concept][cogids[0]].append(words[0])

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        cognates = []
        for wl in wordlists:
            for concept, (words, cogids) in wl.words.items():
                if len(cogids) > 1:
                    if len(words) < len(cogids):
                        if len(words) == 1:
                            if ':' in words[0]:
                                words = words[0].split(':')
                            if ',' in words[0]:
                                words = words[0].split(',')
                        assert len(words) >= len(cogids)
                    assert (wl.language, concept) in COGSET_MAP
                if (wl.language, concept) in COGSET_MAP:
                    word_to_cogid = COGSET_MAP[(wl.language, concept)]
                else:
                    word_to_cogid = dict(izip_longest(words, cogids))
                for i, word in enumerate(words):
                    if word.startswith('(') and word.endswith(')'):
                        word = word[1:-1].strip()
                    wid = '%s-%s-%s' % (slug(
                        wl.language), slug(concept), i + 1)
                    ds.add_row([
                        wid,
                        '',
                        wl.language,
                        concept_map.get(concept, ''),
                        concept,
                        word,
                        clean_string(word, splitters='?')[0],
                        SOURCE,
                        '',
                    ])
                    if word_to_cogid.get(word):
                        cognates.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (slug(concept), word_to_cogid[word]),
                            False,
                            'expert',
                            SOURCE,
                            '',
                            '',
                            '',
                        ])
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments'))
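`izip_longest` pads the shorter sequence with `None`, so surplus words end up without a cognate id and are later skipped by the `word_to_cogid.get(word)` check. A small sketch (Python 3 spelling):

from itertools import zip_longest  # izip_longest on Python 2

words, cogids = ['mano', 'brazo'], ['1']
word_to_cogid = dict(zip_longest(words, cogids))
assert word_to_cogid == {'mano': '1', 'brazo': None}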
Example #13
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary files as follows:
     - the JSON object from `metadata.json` is available as `dataset.md`
     - items from languages.csv are available as `dataset.languages`
     - items from concepts.csv are available as `dataset.concepts`
     - if a Concepticon conceptlist was specified in metadata.json, its ID is available
       as `dataset.conceptlist`
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: all arguments passed on the command line.
    """

    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])
    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify.get(row['CONCEPT'], row['CONCEPT'])
        cids[concept] = row['CONCEPT_SET']

    # sources
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # track which cognate sets already have a proto-language (Middle Chinese) entry
    visited = []
    idx = max([k for k in wl]) + 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Parameter_Chinese_name', 'Value',
         'Value_Chinese_characters', 'Source', 'Segments', 'Cognacy', 'Rank',
         'Comment'), dataset) as ds:

        ds.sources.add(src)
        ds.sources.add(src2)

        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(wl[k, 'ipa'],
                                   merge_vowels=False,
                                   expand_nasals=True)
            # remove sandhi annotation from the tokens, as it confuses clpa
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']
            ]
            if wl[k, 'cogid'] not in visited:
                # Middle Chinese forms need explicit tone marks before
                # segmentation: split into syllables and append tone '¹', or
                # '⁴' for checked syllables ending in p/t/k, where missing
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(SOURCE, idx), 'sini1245',
                    'Middle Chinese', '', cids[wl[k, 'concept']],
                    wl[k, 'concept'], '', wl[k, 'proto'], wl[k, 'counterpart'],
                    SOURCE, ' '.join(tokens), wl[k, 'cogid'], '', ''
                ])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'], tokens,
                    wl[k, 'cogid']
                ]
                idx += 1
                visited += [wl[k, 'cogid']]
        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k), ds.name, alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']),
                      str(alms[k, 'cogid'])]), '', 'expert', SOURCE, '', '', ''
        ] for k in alms]

        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))
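The tone patching above is needed because `ipa2tokens` treats superscript tone marks as their own (tone) segment; a syllable without an explicit tone would not segment as intended. A minimal sketch with a made-up form:

import lingpy as lp

print(lp.ipa2tokens('tuŋ¹'))  # e.g. ['t', 'u', 'ŋ', '¹']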