Example #1
def cldf(dataset, concepticon, **kw):
    unmapped = set()
    for ods in clld.itercldf(dataset, __name__):
        lid = ods.name.split('-')[-1]
        fields = list(ods.fields) + [
            'Language_local_ID', 'Parameter_local_ID', 'Loan', 'Context'
        ]
        with CldfDataset(fields, dataset, subset=lid) as ds:
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                clld.url(__name__, path='/meaning/{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                clld.url(__name__, path='/language/{Language_local_ID}')
            ds.table.schema.columns['Word_ID'].valueUrl = \
                clld.url(__name__, path='/word/{Word_ID}')
            ds.metadata.update(
                {k: v
                 for k, v in ods.metadata.items() if k.startswith('dc:')})
            ds.sources.add(*ods.sources.items())
            for row in ods.rows:
                if row['Language_ID'] == 'None':
                    row['Language_ID'] = None
                    unmapped.add((row['Language_name'], lid))
                keys = list(row.keys())
                for i, (form, context) in enumerate(split(row['Value'])):
                    _row = row.to_list()
                    _row[keys.index('Value')] = form
                    _row[keys.index('ID')] = '%s-%s' % (row['ID'], i + 1)
                    # Note: We count words marked as "probably borrowed" as loans.
                    _row.extend([
                        lid, row['WOLD_Meaning_ID'],
                        float(row['Borrowed_score']) > 0.6, context
                    ])
                    ds.add_row(_row)
    assert not unmapped
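
Examples #1 and #16 call a `split` helper (Example #19 uses `util.split`) that turns a raw value into `(form, context)` pairs; its implementation is not part of this listing. A minimal sketch, assuming comma-separated variants with an optional parenthesized context:

import re

def split(value):
    # Hypothetical sketch of the helper used above: yield (form, context)
    # pairs, assuming comma-separated variants with optional parenthesized
    # context, e.g. 'ba (rare), bo' -> ('ba', 'rare'), ('bo', None).
    for chunk in value.split(','):
        m = re.match(r'(?P<form>[^(]+)(?:\((?P<context>[^)]+)\))?\s*$',
                     chunk.strip())
        if m:
            yield m.group('form').strip(), m.group('context')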
Example #2
def cldf(dataset, concepticon, **kw):
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                         dataset,
                         subset=dset.split('-')[0]) as ds:

            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join(str(x) for x in wl[k, 'partialids']),
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'], idf,
                    '', 'expert', srckey, '', '', ''
                ]]

            dataset.cognates.extend(
                iter_alignments(wl,
                                cognates,
                                method='progressive',
                                prefix=srckey + '-'))
Example #3
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }

    with UnicodeReader(
            dataset.raw.joinpath(FILENAME.replace('xls', 'Sheet1.csv'))) as r:
        rows = list(r)

    concepts = [(i, rows[0][i].replace('_', ' ').strip())
                for i in range(1, len(rows[0]), 2)]
    assert all(concept in concepticon for _, concept in concepts)

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments', 'Source'),
                     dataset) as ds:
        ds.table.schema.columns['Value']['dc:format'] = 'IPA'
        ds.sources.add(getEvoBibAsSource(SOURCE))

        for row in rows[3:]:
            row = [col.strip() for col in row]
            if not row[0]:
                continue
            lname = row[0]
            for i, concept in concepts:
                for j, form in iterforms(row[i]):
                    if form != '?' and form.strip():
                        ds.add_row([
                            '%s-%s-%s' % (slug(lname), (i + 1) // 2, j),
                            language_map[lname],
                            lname.replace('_', ' '), concepticon[concept],
                            concept, form, ' '.join(clean_string(form)), SOURCE
                        ])
        # three methods are available: turchin, sca, lexstat; turchin is fast
        # and needs no threshold (the threshold is only used by sca and lexstat)
        cognates = iter_cognates(ds,
                                 column='Segments',
                                 method='turchin',
                                 threshold=0.55)

        # two methods for alignments: progressive or library
        dataset.cognates.extend(
            iter_alignments(ds,
                            cognates,
                            column='Segments',
                            method='progressive'))

    dataset.write_cognates()
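
The comments above name three cognate-detection methods and two alignment methods. Assuming `iter_cognates` keeps the same signature for the slower methods, only the `method` argument changes; a hedged sketch:

# hypothetical variant: 'lexstat' (or 'sca') is slower than 'turchin' but
# actually makes use of the threshold
cognates = iter_cognates(ds, column='Segments', method='lexstat', threshold=0.55)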
Example #4
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens',
                           'ipa',
                           lp.ipa2tokens,
                           merge_vowels=False,
                           expand_nasals=True)
        src = getEvoBibAsSource(srckey)

        with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                          'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                          'Segments', 'Cognacy', 'Loan'),
                         dataset,
                         subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
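                # negative cognate IDs encode borrowed forms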
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])

                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid,
                    lang2glot[doculect],
                    wl[k, 'doculect'],
                    '',
                    # look up the corrected concept, not the raw gloss
                    gloss2con.get(concept, ''),
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens'] or ['']),
                    cogid,
                    loan,
                ])

                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''
                ])

            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
Example #5
def cldf(dataset, concepticon, **kw):
    gcode = {x['ID']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {
        x.english: x.concepticon_id
        for x in dataset.conceptlist.concepts.values()
    }
    data = defaultdict(dict)
    for fname in dataset.raw.glob('*.csv'):
        read_csv(fname, data)

    cognatesets = []
    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments'), dataset) as ds:
        for doculect, wl in data.items():
            for concept, (form, loan, cogset) in wl.items():
                wid = '%s-%s' % (slug(doculect), slug(concept))
                if concept in ccode:
                    csid = ccode[concept]
                elif concept.startswith('to ') and concept[3:] in ccode:
                    csid = ccode[concept[3:]]
                else:
                    csid = None

                ds.add_row([
                    wid,
                    gcode[doculect.split('-')[0]],
                    doculect,
                    csid,
                    concept,
                    form,
                    '',
                ])
                if cogset:
                    cognatesets.append([
                        wid,
                        ds.name,
                        form,
                        '%s-%s' % (slug(concept), cogset),
                        False,
                        'expert',
                        '',
                        '',
                        '',
                        '',
                    ])
        segmentize(ds, clean=lambda s: s.split(' ~ ')[0])
    dataset.cognates.extend(iter_alignments(ds, cognatesets,
                                            column='Segments'))
Example #6
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary files as follows:
     - the JSON object from `metadata.json` is available as `dataset.md`
     - items from languages.csv are available as `dataset.languages`
     - items from concepts.csv are available as `dataset.concepts`
     - if a Concepticon conceptlist was specified in metadata.json, its ID is available
       as `dataset.conceptlist`
    :param glottolog: a `pyglottolog.api.Glottolog` instance.
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: all arguments passed on the command line.
    """
    with CldfDataset(REQUIRED_FIELDS, dataset) as ds:
        pass
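
The stub above only opens the dataset and writes nothing. A minimal sketch of a concrete implementation, modeled on the other examples in this listing (the `iter_raw_records` reader and the concepts.csv/languages.csv column names are assumptions):

def cldf(dataset, concepticon, **kw):
    # hypothetical minimal implementation, assuming concepts.csv has
    # GLOSS/CONCEPTICON_ID and languages.csv has NAME/GLOTTOCODE columns
    concept_map = {c['GLOSS']: c['CONCEPTICON_ID'] or None for c in dataset.concepts}
    language_map = {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages}

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value'), dataset) as ds:
        # iter_raw_records is a hypothetical reader for the raw files
        for i, (lang, concept, form) in enumerate(iter_raw_records(dataset), 1):
            ds.add_row([
                '%s-%s' % (slug(lang), i),
                language_map.get(lang),
                lang,
                concept_map.get(concept),
                concept,
                form,
            ])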
Example #7
def cldf(dataset, concepticon, **kw):
    with UnicodeReader(dataset.raw.joinpath('Wang2004.csv'), delimiter='\t') as reader:
        lines = list(reader)
    lmap = {
        x['ABBREVIATION']: (x['GLOTTOCODE'], x['ISO'], x['NAME'])
        for x in dataset.languages
    }
    cmap = {c.english: c.concepticon_id for c in dataset.conceptlist.concepts.values()}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        idx = 1
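        # intern cognate labels as integer IDs; '0' marks "no cognate set"
        # and keeps ID 0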
        cogids = {'0': 0}
        for i, line in enumerate(lines[1:]):
            concept = line[0]
            cid = cmap[concept]

            for t, cogs in zip(lines[0][1:], line[1:]):
                glottocode, iso, taxon = lmap[t]
                for cog in cogs.split('/'):
                    if cog in cogids:
                        cogid = cogids[cog]
                    else:
                        cogid = max(cogids.values()) + 1
                        cogids[cog] = cogid
                    ds.add_row((
                        idx, glottocode, taxon, iso, cid, concept, cog, SOURCE,
                        cogid))
                    dataset.cognates.append([
                        idx,
                        ds.name,
                        cog,
                        '-'.join([slug(concept), str(cogid)]),
                        '',
                        'expert',
                        SOURCE,
                        '', '', ''])
                    idx += 1
Example #8
def cldf(dataset, concepticon, **kw):
    concepts = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    D = {}  # dictionary to be passed to lingpy
    D[0] = [
        'doculect', 'glottolog', 'concept', 'concepticon', 'ipa', 'segments',
        'cogid', 'alignment'
    ]
    idx = 1
    for f in FILES:
        msa = lp.MSA(
            dataset.raw.joinpath('phonalign_{0}.msa'.format(f)).as_posix())
        concept = msa.seq_id[1:-1]  # strip quotation marks from concept
        cid = concepts.get(concept, '')
        for i, taxon in enumerate(msa.taxa):
            if taxon in languages:
                tid = languages[taxon]
                alignment = ' '.join(msa.alignment[i])
                tokens = ' '.join([x for x in msa.alignment[i] if x != '-'])
                ipa = tokens.replace(' ', '')
                cogid = '{0}-{1}'.format(concept, f)
                D[idx] = [
                    taxon, tid, concept, cid, ipa, tokens, cogid, alignment
                ]
                idx += 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Value', 'Segments', 'Cognacy', 'Source'),
            dataset) as ds:
        src = getEvoBibAsSource('Heggarty2007')
        ds.sources.add(src)
        src = getEvoBibAsSource('List2014e')
        ds.sources.add(src)

        alm = lp.Alignments(D)
        for k in alm:
            ds.add_row(
                ['Heggarty2007-{0}'.format(k)] +
                [alm[k, x] or '' for x in
                 ['glottolog', 'taxon', 'iso', 'concepticon', 'concept', 'ipa']] +
                [' '.join(alm[k, 'tokens']), alm[k, 'cogid'], 'Heggarty2007'])
            dataset.cognates += [[
                'Heggarty2007-{0}'.format(k), ds.name, alm[k, 'ipa'],
                alm[k, 'cogid'], '', 'expert', 'Heggarty2007',
                alm[k, 'alignment'], 'expert', 'List2014e'
            ]]
        dataset.write_cognates()
Example #9
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                      'Parameter_ID', 'Parameter_name',
                      'Parameter_Chinese_name', 'Value', 'Segments', 'Source'),
                     dataset) as ds:
        ds.sources.add(src)

        for k in wl:
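            # skip empty values and dash placeholders ('not in' is a substring
            # test here, so it filters '', '-', '--' and '---')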
            if wl[k, 'value'] not in '---' and wl[k, 'value'].strip():
                ds.add_row([
                    wl[k, 'lid'], gcode[wl[k, 'doculect']], wl[k, 'doculect'],
                    '', wl[k, 'concepticon_id'], wl[k, 'concept'],
                    wl[k, 'chinese'], wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0], SOURCE
                ])
Example #10
def cldf(dataset, concepticon, **kw):
    wl = lp.Alignments(dataset.raw.joinpath('tukano.tsv').as_posix())
    src1 = getEvoBibAsSource('Chacon2014')
    src2 = getEvoBibAsSource('Chacon2015')
    gloss2conc = {r['GLOSS']: r['CONCEPTICON_ID'] for r in dataset.concepts}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Segments',
            'Cognacy',
    ), dataset) as ds:

        ds.sources.add(src1)
        ds.sources.add(src2)
        for k in wl:
            lid = wl[k, 'language']
            concept = wl[k, 'concept']
            value = wl[k, 'ipa']
            segments = wl[k, 'tokens']
            cogid = wl[k, 'cogid']
            alignment = wl[k, 'alignment']
            name, iso = abbr2lang[lid]
            cid = gloss2conc.get(concept)
            ds.add_row(('Chacon2014-' + str(k),
                        dataset.glottocode_by_iso.get(iso, ''), name, iso, cid,
                        concept, value, 'Chacon2014', ' '.join(segments),
                        str(cogid)))

            cogid = '-'.join([slug(wl[k, 'concept']), '%s' % cogid])
            dataset.cognates.append([
                'Chacon2014-' + str(k), ds.name, wl[k, 'ipa'], cogid, '',
                'expert', 'Chacon2014', alignment, 'expert', 'Chacon2015'
            ])
Example #11
def cldf(dataset, concepticon, **kw):
    unmapped = set()
    for ods in clld.itercldf(dataset, __name__):
        lid = ods.name.split('-')[-1]
        fields = list(ods.fields) + [
            'Language_local_ID', 'Parameter_local_ID', 'Value_in_source'
        ]
        with CldfDataset(fields, dataset, subset=lid) as ds:
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                clld.url(__name__, path='/parameters/{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                clld.url(__name__, path='/contributions/{Language_local_ID}')
            ds.metadata.update(
                {k: v
                 for k, v in ods.metadata.items() if k.startswith('dc:')})
            ds.sources.add(*ods.sources.items())
            for row in ods.rows:
                if row['Language_ID'] == 'None':
                    row['Language_ID'] = None
                    unmapped.add((row['Language_name'], lid))
                val, row['Value'] = row['Value'], clean_form(row['Value'])
                ds.add_row(row.to_list() +
                           [lid, '-'.join(row['ID'].split('-')[:2]), val])
Example #12
def cldf(dataset, concepticon, **kw):
    data = get_all(dataset)
    gl_map = {k: v.id for k, v in dataset.glottolog_languoids.items()}
    gl_map.update(dataset.glottocode_by_iso)

    swadesh_concepts = {
        k: v
        for k, v in data['word'].items() if v['id'] in data['concept_ids']
    }

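    # e.g. 'to lie down' -> 'lie down'; 'right (hand)' -> 'right'; 'who?' -> 'who'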
    def normalized_gloss(gloss):
        if gloss.startswith('to '):
            gloss = gloss[3:].strip()
        if '/' in gloss:
            gloss = gloss.split('/')[0].strip()
        if '(' in gloss:
            gloss = gloss.split('(')[0].strip()
        if gloss.endswith('?'):
            gloss = gloss[:-1]
        return gloss

    swadesh2concepticon = {
        'right (hand)': '2183',
        'we incl. (pronoun d:1p, incl)': '1131',
        'left (hand)': '2182',
        'right (correct, true)': '1725',
        'in, inside': '1460',
        'to lie down': '215',
    }
    for conceptlist in [
            'Swadesh-1960-200', 'Swadesh-1971-100', 'Swadesh-1955-100',
            'Swadesh-1950-215', 'Swadesh-1955-215'
    ]:
        for d in concepticon.conceptlists[conceptlist].concepts.values():
            swadesh2concepticon.setdefault(d.english, d.concepticon_id)

    concept_map = {}
    for concept in swadesh_concepts.values():
        gloss = normalized_gloss(concept['word'])
        if gloss in swadesh2concepticon:
            concept_map[concept['id']] = swadesh2concepticon[gloss]
        elif concept['word'] in swadesh2concepticon:
            concept_map[concept['id']] = swadesh2concepticon[concept['word']]
        else:
            raise ValueError(concept['word'])
    assert len(concept_map) == len(set(concept_map.values()))

    for c in dataset.concepts:
        if c['CONCEPTICON_ID']:
            concept_map[int(c['ID'])] = c['CONCEPTICON_ID']

    uc = Counter()
    unmapped = Unmapped(lambda r: int(r[0]))
    for language_url, words in groupby(
            sorted(data['lexicon'].values(), key=lambda i: i['language']),
            lambda i: i['language']):
        contribution = data['language'][language_url]
        with CldfDataset((
                'ID',
                'Language_ID',
                'Language_iso',
                'Language_name',
                'Language_local_ID',
                'Parameter_ID',
                'Parameter_name',
                'Parameter_local_ID',
                'Value',
                'Source',
                'Cognate_Set',
                'Comment',
                'Loan',
        ),
                         dataset,
                         subset=contribution['id']) as ds:
            cname = contribution['language']
            if contribution['dialect']:
                cname += ' (%s Dialect)' % contribution['dialect']
            lid = gl_map.get(contribution['glottocode'])
            if not lid:
                lid = gl_map.get(contribution['isocode'])
                if not lid:
                    unmapped.languages.add(
                        (contribution['id'], cname, contribution['isocode']))
            if contribution['information']:
                ds.metadata['dc:description'] = contribution['information']

            ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                '%s/word/{Parameter_local_ID}' % BASE_URL
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                '%s/language/{Language_local_ID}' % BASE_URL

            for word in words:
                concept = data['word'][word['word']]
                if concept['id'] not in concept_map:
                    unmapped.concepts.add((concept['id'], concept['word']))
                    uc.update([concept['word']])
                src = data['source'].get(word['source'])
                if src:
                    ds.sources.add(
                        Source('misc',
                               src['slug'],
                               author=src['author'],
                               year=src['year'],
                               transnewguinea_id=BASE_URL + '/source/' +
                               src['slug'],
                               title=src['reference']))
                ds.add_row([
                    word['id'],
                    lid,
                    contribution['isocode'],
                    cname,
                    contribution['slug'],
                    concept_map.get(concept['id']),
                    concept['word'],
                    concept['slug'],
                    word['entry'],
                    src['slug'] if src else None,
                    None,
                    word['annotation'],
                    word['loan'],
                ])
    unmapped.pprint()
Example #13
def cldf(dataset, concepticon, **kw):
    concept_map = {
        int(c['GLOSS']): c['CONCEPTICON_ID'] or None
        for c in dataset.concepts
    }

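    # glottocodes: four lowercase alphanumerics plus four digits (first digit
    # nonzero), e.g. 'stan1295'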
    gc_pattern = re.compile('[a-z0-9]{4}[1-9][0-9]{3}$')
    meta = {}
    for row in read_csv(dataset, 'META'):
        meta[(row[5], row[9])] = dict(
            zip(
                'NAME,COUNTRY,ISO,GLOTTO_NAME,GLOTTO_CODE,LG_LINK,AUDIO,SOURCE,NR_SETS,VARIANT'
                .lower().split(','), row))

    sources = {}
    sid = 0
    for spec in meta.values():
        if spec['source'] and spec['source'] not in sources:
            sid += 1
            sources[spec['source']] = Source('misc',
                                             's%s' % sid,
                                             title=spec['source'])

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Comment',
    ), dataset) as ds:
        for key, items in groupby(
                sorted(read_csv(dataset, 'NUMERAL'),
                       key=lambda r: (r[2], r[3], r[0])),
                lambda r: (r[2], r[3])):
            if key not in meta:
                continue
            if int(float(key[1])) > 1:
                continue
            md = meta[key]
            source, ref = sources.get(md['source']), None
            if source:
                ds.sources.add(source)
                ref = source.id
            if gc_pattern.match(md['glotto_code']):
                for concept, rows in groupby(items, lambda k: k[0]):
                    if not concept.endswith('.0'):
                        continue
                    iconcept = int(float(concept))
                    if iconcept not in concept_map:
                        unmapped.concepts.add((iconcept, iconcept))
                    for k, row in enumerate(rows):
                        ds.add_row([
                            '%s-%s-%s' % (lgid(row[2]), iconcept, k + 1),
                            md['glotto_code'],
                            md['name'],
                            concept_map.get(iconcept),
                            '%s' % iconcept,
                            row[1],
                            ref,
                            row[4] or None,
                        ])
    unmapped.pprint()
Example #14
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }

    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)
    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments'), dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(col,
                                                  merge_vowels=False,
                                                  preparse=PREPARSE,
                                                  rules=CONVERSION,
                                                  semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx), language_map[lang],
                        clean_langs.get(lang, lang), concepticon[concept],
                        concept, col, cleaned_string
                    ])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string
                    ]
                    idx += 1

        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]
        bad_cogs = 1
        cognates = []
        for k in wl:
            if k in id2cog:
                cogid = id2cog[k]
            else:
                # entries without an expert judgement get a fresh singleton set
                cogid = str(bad_cogs)
                bad_cogs += 1
                id2cog[k] = cogid

        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k), ds.name, wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']), '', 'expert',
                'Kitchen2012', '', '', ''
            ]]

        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
Example #15
def cldf(dataset, concepticon, **kw):
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    glotto_map = {c['NAME']: c['GLOTTOCODE'] for c in dataset.languages}

    # retrieve coordinates
    coords = {}
    langs = []
    # language map, as the names are not identical
    language_map = {
        "Namhsan": "Nam Hsan",
        "Pangkham": "Pang Kham",
        "Xiang Zhai Tang  (Xiang Cai Tang)": "Xiang Zhai Tang"
    }
    with UnicodeReader(
            dataset.raw.joinpath('100item-phylo.Sheet2.csv')) as reader:
        for i, (num, lat, lon, village, country) in enumerate(reader):
            if i >= 1:
                coords[language_map.get(village, village)] = (lat, lon)
                langs.append(language_map.get(village, village))

    cognates = []
    idx = 1
    with UnicodeReader(dataset.raw.joinpath('100item-phylo.Sheet1.csv'),
                       delimiter=',') as reader, \
            CldfDataset((
                'ID',
                'Language_ID',
                'Language_name',
                'Language_iso',
                'Parameter_ID',
                'Parameter_name',
                'Value',
                'Source',
                'Segments',
                'Cognacy',
            ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource('Deepadung2015'))
        ds.metadata['coordinates'] = coords
        data = list(reader)
        header = data[2][2:]
        for i, row in enumerate(data[5:]):
            row = [c.strip() for c in row]
            concept = row[1]
            cid = concept_map[concept]
            for j in range(0, len(header), 2):
                lang = language_map.get(header[j], header[j])
                gcid = glotto_map[lang]
                # raw cognate-set label; the concept prefix is added below
                cog = row[2:][j + 1]
                certainty = 0
                if ' or ' in cog:
                    cog = cog.split(' ')[0]
                    certainty = 1
                word = CORRECT.get(row[2:][j], row[2:][j])
                if word.strip() and ''.join(set(word.strip())) != '-':
                    segments = lp.sequence.sound_classes.clean_string(
                        word,
                        splitters=',',
                        rules=CONVERSION,
                        preparse=PREPARSE,
                        semi_diacritics="")[0]
                    cogid = slug(concept) + '-' + cog
                    ds.add_row([
                        idx, gcid, lang, '', cid, concept, word, PROVIDER,
                        segments, cogid
                    ])
                    cognates.append([
                        idx, ds.name, word, cogid,
                        str(certainty), 'expert', PROVIDER, '', '', ''
                    ])
                    idx += 1
    dataset.cognates.extend(
        iter_alignments(
            ds,
            cognates,
            method='progressive',
        ))
Example #16
def cldf(dataset, concepticon, **kw):
    concept_map = {
        re.sub(r'^(\*|\$)', '', c.english): c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    for c in dataset.concepts:
        concept_map[(c['ID'], c['GLOSS'])] = c['CONCEPTICON_ID'] or None
    language_map = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}

    concepts = []
    languages = {}
    for path in dataset.raw.glob('languages-language-*.json'):
        data = jsonlib.load(path)
        data['glottocode'] = language_map[data['id']]
        languages[data['id']] = data

    for path in sorted(
            dataset.raw.glob('lexical-feature-*.json'),
            key=lambda p: int(p.stem.split('-')[-1])):
        data = jsonlib.load(path)
        data['concepticon'] = concept_map.get(data['concept'])
        if not data['concepticon']:
            data['concepticon'] = concept_map[(data['id'], data['concept'])]
        concepts.append(data)

    fields = defaultdict(Counter)
    sources = {}
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_iso',
            'Language_name',
            'Language_local_ID',
            'Parameter_ID',
            'Parameter_name',
            'Parameter_local_ID',
            'Semantic_field',
            'Value',
            'Context',
            'Loan',
            'Phonemic',
            'Source',
            'Creator',
            'Comment',
    ), dataset) as ds:
        ds.table.schema.columns['Loan'].datatype = 'boolean'
        ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/lexical/feature/{Parameter_local_ID}'
        ds.table.schema.columns['Language_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/languages/language/{Language_local_ID}'

        for param in concepts:
            for lid, items in groupby(
                    sorted(param['items'], key=lambda i: i['Language']),
                    lambda i: i['Language']):
                lid = lid.split('/')[-1]
                if lid in missing_languages:
                    continue
                lang = languages[lid]
                i = 0
                for item in items:
                    form = item['Orthographic Form'].strip()
                    refs = [ref for ref in itersources(item, lang, sources) if ref]
                    ds.sources.add(*[ref.source for ref in refs])
                    for k, v in item.items():
                        if v:
                            fields[k].update([v])
                    for fform, context in split(form):
                        i += 1
                        ds.add_row([
                            '%s-%s-%s' % (lid, param['id'], i),
                            lang['glottocode'],
                            lang['ISO 639-3'],
                            lang['name'],
                            lang['id'],
                            param['concepticon'],
                            param['concept'],
                            param['id'],
                            param['Semantic Field'],
                            fform,
                            context,
                            bool(item['Loan Source'] or item['Wanderwort Status']),
                            item['Phonemicized Form'] or None,
                            ';'.join('%s' % ref for ref in refs),
                            item.get('Created By'),
                            item.get('General Notes'),
                        ])
Example #17
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {x.english: x.concepticon_id for x in
             dataset.conceptlist.concepts.values()}
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2015d')

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(src, src2)

        # map proto-forms to their cognate-set IDs
        p2c = {}

        for k in wl:
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                gcode[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                ccode[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'ipa'],
                SOURCE,
                wl[k, 'cogid']
            ])
            dataset.cognates += [[
                '{0}-{1}'.format(SOURCE, k),
                ds.name,
                wl[k, 'ipa'],
                '-'.join([slug(wl[k, 'concept']), str(wl[k, 'cogid'])]),
                '', 
                'expert',
                SOURCE,
                '',
                '',
                ''
            ]]
            p2c[wl[k, 'proto']] = wl[k, 'cogid']
        idx = max([k for k in wl]) + 1
        for line in lp.csv2list(dataset.raw.joinpath('old_chinese.csv').as_posix()):
            for val in line[1].split(', '):
                ds.add_row((
                    '{0}-{1}'.format(SOURCE, idx),
                    'sini1245',
                    'Old Chinese',
                    '',
                    ccode[line[0]],
                    line[0],
                    val,
                    SOURCE,
                    p2c.get(val, val)
                ))
                dataset.cognates += [[
                    '{0}-{1}'.format(SOURCE, idx),
                    ds.name,
                    val,
                    '-'.join([slug(line[0]), text_type(p2c.get(val, val))]),
                    '',
                    'expert',
                    SOURCE,
                    '',
                    '',
                    '']]
                idx += 1
Example #18
def cldf(dataset, concepticon, **kw):
    concepticon = {
        x.english: x.concepticon_id for x in
        dataset.conceptlist.concepts.values()}
    lmap = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}
    lmap_name = {l['ID']: l['NAME'] or None for l in dataset.languages}

    cognate_sets = defaultdict(list)
    for (cid, c), w, missing in parse(dataset.raw.joinpath('galucio-tupi.txt'), lmap):
        assert c in concepticon
        if c in LANGUAGE_ID_FIXES:
            f, t = LANGUAGE_ID_FIXES[c]
            w = re.sub(f + r'\s+', t + ' ', w, count=1)
            missing = re.sub(f + r'\s+', t + ' ', missing, count=1)

        if missing:
            assert re.match(
                r'((?P<lid>%s)\s*\?\s*)+$' % '|'.join(lmap.keys()), missing)
        missing = missing.replace('?', ' ').split()

        lids = set(missing[:])
        for m in re.finditer(r'(?P<lid>[A-Z][a-z])\s+', w):
            lids.add(m.group('lid'))
        # make sure all language IDs are valid
        assert not lids.difference(set(lmap.keys()))

        nlids = missing[:]
        for cs in iter_cogsets(w, lmap):
            cognate_sets[(cid, c)].append(cs)
            nlids.extend(list(cs.keys()))
        nlids = set(nlids)
        assert nlids == lids  # make sure we found all expected language IDs

    cognatesets = []
    with CldfDataset(
            ('ID',
             'Language_ID',
             'Language_name',
             'Language_local_ID',
             'Parameter_ID',
             'Parameter_name',
             'Parameter_local_ID',
             'Value',
             'Segments'),
            dataset) as ds:
        for (cid, concept), cogsets in cognate_sets.items():
            for j, cogset in enumerate(cogsets):
                for lid, words in sorted(cogset.items(), key=lambda k: k[0]):
                    for i, word in enumerate(words):
                        wid = '%s-%s-%s-%s' % (lid, cid, j + 1, i + 1)
                        ds.add_row([
                            wid,
                            lmap[lid],
                            lmap_name[lid],
                            lid,
                            concepticon[concept],
                            concept,
                            cid,
                            word,
                            '',
                        ])
                        cognatesets.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (cid, j + 1),
                            False,
                            'expert',
                            '',
                            '',
                            '',
                            '',
                        ])
        segmentize(ds, clean=lambda s: s.split(' ~ ')[0])
    dataset.cognates.extend(iter_alignments(ds, cognatesets, column='Segments'))
Example #19
    def to_cldf(self,
                concept_map,
                unmapped,
                citekey=None,
                source=None,
                concept_key=None):
        if concept_key is None:
            concept_key = lambda entry: entry.word_id

        if not self.language.glottocode:
            unmapped.languages.add(
                (self.language.id, self.language.name, self.language.iso))

        with CldfDataset((
                'ID',
                'Language_ID',
                'Language_iso',
                'Language_name',
                'Language_local_ID',
                'Parameter_ID',
                'Parameter_name',
                'Parameter_local_ID',
                'Value',
                'Value_in_source',
                'Segments',
                'Context',
                'Source',
                'Cognate_Set',
                'Comment',
                'Loan',
        ),
                         self.dataset,
                         subset=self.language.id) as ds:
            ds.metadata['dc:creator'] = self.language.author
            ds.metadata['dc:identifier'] = self.url('language.php?id=%s' %
                                                    self.language.id)
            # record both the typist and the checker as contributors
            contributors = [
                x for x in (self.language.typedby, self.language.checkedby) if x]
            if contributors:
                ds.metadata['dc:contributor'] = ' and '.join(contributors)
            if self.language.notes:
                ds.metadata['dc:description'] = self.language.notes

            ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                self.url('word.php?v=1{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                self.url('language.php?id={Language_local_ID}')

            ref = None
            if citekey and source:
                ref = citekey
                ds.sources.add(Source('misc', citekey, title=source))

            for entry in self.entries:
                if entry.name == '?':
                    continue
                if not (citekey and source):
                    src = entry.e.find('source')
                    if src is not None and getattr(src, 'text', None):
                        ref = slug(text_type(src.text))
                        ds.sources.add(Source('misc', ref, title=src.text))
                cid = concept_map.get(concept_key(entry))
                if not cid:
                    unmapped.concepts.add((entry.word_id, entry.word))
                for i, (form, context) in enumerate(util.split(entry.name)):
                    ds.add_row([
                        '{0}-{1}'.format(entry.id, i + 1),
                        self.language.glottocode,
                        self.language.iso,
                        self.language.name,
                        self.language.id,
                        cid,
                        entry.word,
                        entry.word_id,
                        util.clean_form(form),
                        form,
                        '',
                        context,
                        ref,
                        entry.cognacy,
                        entry.comment or '',
                        entry.loan == 'L',
                    ])
            segmentize(ds)
        return ds
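
A hedged usage sketch for `to_cldf` (the `wordlist` object, the concept map, and the citation details are assumptions):

# hypothetical caller: `wordlist` exposes .language, .entries and .dataset
# as assumed by to_cldf above
unmapped = Unmapped()
ds = wordlist.to_cldf(
    concept_map,                       # word_id -> Concepticon ID
    unmapped,
    citekey='smith2011',               # assumed citation key
    source='Smith 2011. A wordlist.',  # assumed source title
)
unmapped.pprint()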
Example #20
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []

    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None
         for l in dataset.languages})

    sources = {}
    for src, langs in groupby(sorted(languages.values(), key=lambda r: r[6]),
                              lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
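        # lingpy wordlist input: key 0 holds the header row, consecutive
        # integer keys hold the data rows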
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                glottolog.get(lang, glottolog.get(languages[lang][7])), lang,
                languages[lang][7], concepticon[row[1]], row[1], form,
                segments, sources[lang].id, None
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]
        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))

    unmapped.pprint()
Example #21
def cldf(dataset, concepticon, **kw):
    """
    Implements the conversion of the raw data to CLDF dataset(s).

    :param dataset: provides access to the information in supplementary files as follows:
     - the JSON object from `metadata.json` is available as `dataset.md`
     - items from languages.csv are available as `dataset.languages`
     - items from concepts.csv are available as `dataset.concepts`
     - if a Concepticon conceptlist was specified in metadata.json, its ID is available
       as `dataset.conceptlist`
    :param glottolog: a `pyglottolog.api.Glottolog` instance.
    :param concepticon: a `pyconcepticon.api.Concepticon` instance.
    :param kw: all arguments passed on the command line.
    """

    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())

    # get language identifiers
    lids, cids, coords = {}, {}, {}
    for row in dataset.languages:
        language = row['NAME']
        lids[language] = row['GLOTTOCODE']
    coords = dict([wl.coords[taxon] for taxon in lids])
    modify = {
        'thunder (verb)': 'thunder',
        'flash (verb)': 'lightning',
        'room': 'flat',
        'have diarrea': 'have diarrhoea',
        'watery': 'light'
    }
    for row in dataset.concepts:
        concept = modify.get(row['CONCEPT'], row['CONCEPT'])
        cids[concept] = row['CONCEPT_SET']

    # sources
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2014b')

    # get partial identifiers
    partial_ids = defaultdict(list)
    partial_converter = {}
    idx = 1
    for k in wl:
        for char in wl[k, 'counterpart']:
            if char in partial_converter:
                pidx = partial_converter[char]
            else:
                pidx = idx
                partial_converter[char] = idx
                idx += 1
            partial_ids[k] += [pidx]

    # track which cognate sets already have their Middle Chinese proto-form
    visited = []
    idx = max([k for k in wl]) + 1

    with CldfDataset(
        ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID',
         'Parameter_name', 'Parameter_Chinese_name', 'Value',
         'Value_Chinese_characters', 'Source', 'Segments', 'Cognacy', 'Rank',
         'Comment'), dataset) as ds:

        ds.sources.add(src)
        ds.sources.add(src2)

        D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']}
        for k in wl:
            tokens = lp.ipa2tokens(wl[k, 'ipa'],
                                   merge_vowels=False,
                                   expand_nasals=True)
            # remove sandhi annotation from tokens, as it confuses clpa
            for i, t in enumerate(tokens):
                if '⁻' in t:
                    tokens[i] = t[:t.index('⁻')]
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                lids[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                cids[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'mandarin'],
                wl[k, 'ipa'],
                wl[k, 'counterpart'],
                SOURCE,
                ' '.join(tokens),
                wl[k, 'cogid'],
                wl[k, 'order'],
                wl[k, 'note'] if wl[k, 'note'] != '-' else '',
            ])
            D[k] = [
                wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens,
                wl[k, 'cogid']
            ]
            if wl[k, 'cogid'] not in visited:
                # we need to add new tones, otherwise it won't work, so we
                # split syllables first, then check if the syllable ends with
                # tone or not and add a '1' if this is not the case
                syllables = wl[k, 'mch'].split('.')
                for i, s in enumerate(syllables):
                    if s[-1] not in '²³':
                        if s[-1] not in 'ptk':
                            syllables[i] += '¹'
                        else:
                            syllables[i] += '⁴'
                tokens = lp.ipa2tokens(''.join(syllables))
                ds.add_row([
                    '{0}-{1}'.format(wl[k, 'concept'], idx), 'sini1245',
                    'Middle Chinese', '', cids[wl[k, 'concept']],
                    wl[k, 'concept'], '', wl[k, 'proto'], wl[k, 'counterpart'],
                    SOURCE, ' '.join(tokens), wl[k, 'cogid'], '', ''
                ])
                D[idx] = [
                    'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'], tokens,
                    wl[k, 'cogid']
                ]
                idx += 1
                visited += [wl[k, 'cogid']]
        alms = lp.Alignments(D)
        cognates = [[
            '{0}-{1}'.format(SOURCE, k), ds.name, alms[k, 'ipa'],
            '-'.join([slug(alms[k, 'concept']),
                      str(alms[k, 'cogid'])]), '', 'expert', SOURCE, '', '', ''
        ] for k in alms]

        dataset.cognates.extend(
            iter_alignments(alms, cognates, method='library'))
Example #22
def cldf(dataset, concepticon, **kw):
    language_map = {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages}
    concept_map = {
        x.english: x.concepticon_id for x in dataset.conceptlist.concepts.values()}

    data = OrderedDict()

    # The english concept labels in the two excel sheets differ in one place:
    gloss_map = {'road/path': 'road'}

    header, rows = read_csv(dataset, 'Data')
    for row in rows:
        data[row[0]] = {
            'language': row[0],
            'source': row[-1],
            'items': OrderedDict(zip(header[1:-2], row[1:-2])),
        }

    ids = [slug(l['language']) for l in data.values()]
    assert len(set(ids)) == len(ids)

    header, rows = read_csv(dataset, 'Multistate')
    for row in rows:
        ldata = data[row[0]]
        for j, csid in enumerate(row[1:]):
            concept = header[j + 1]
            try:
                csid = '%s' % int(float(csid))
            except ValueError:
                assert csid == '?'
            ldata['items'][gloss_map.get(concept, concept)] = (
                ldata['items'][gloss_map.get(concept, concept)],
                csid)

    unmapped = Unmapped()
    sources = {}
    with CldfDataset((
                'ID',
                'Language_ID',
                'Language_name',
                'Parameter_ID',
                'Parameter_name',
                'Value',
                'Segments',
                'Source',
                'Cognacy',
            ), dataset) as ds:
        for lang in data.values():
            if not language_map[lang['language']]:
                unmapped.languages.add((lang['language'], lang['language'], ''))
            ref = ''
            if lang['source']:
                ref = get_ref(lang, sources)
                if ref:
                    ds.sources.add(ref.source)
                    ref = '%s' % ref

            for concept, item in lang['items'].items():
                if concept not in concept_map:
                    unmapped.concepts.add((slug(concept), concept))
                wid = '%s-%s' % (slug(lang['language']), slug(concept))
                
                if ds.add_row([
                    wid,
                    language_map[lang['language']],
                    lang['language'],
                    concept_map.get(concept),
                    concept,
                    item[0] if clean_string_with_validation(item[0]) else None,
                    clean_string_with_validation(item[0]),
                    ref,
                    item[1],
                ]) and item[1] != '?':
                    dataset.cognates.append([
                        wid,
                        ds.name,
                        item[0],
                        '%s-%s' % (slug(concept), item[1]),
                        False,
                        'expert',
                        '',
                        '',
                        '',
                        '',
                    ])
        dataset.write_cognates()
        unmapped.pprint()
Example #23
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    # 'year' is missing from the conceptlist mapping, so add it manually
    concept_map['year'] = '1226'
    wordlists = list(read_csv(dataset))
    cogsets = defaultdict(lambda: defaultdict(list))
    for wl in wordlists:
        for concept, (words, cogids) in wl.words.items():
            if len(cogids) == 1:
                cogsets[concept][cogids[0]].append(words[0])

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        cognates = []
        for wl in wordlists:
            for concept, (words, cogids) in wl.words.items():
                if len(cogids) > 1:
                    if len(words) < len(cogids):
                        if len(words) == 1:
                            if ':' in words[0]:
                                words = words[0].split(':')
                            if ',' in words[0]:
                                words = words[0].split(',')
                        assert len(words) >= len(cogids)
                    assert (wl.language, concept) in COGSET_MAP
                    if len(words) > len(cogids):
                        assert (wl.language, concept) in COGSET_MAP
                if (wl.language, concept) in COGSET_MAP:
                    word_to_cogid = COGSET_MAP[(wl.language, concept)]
                else:
                    word_to_cogid = dict(izip_longest(words, cogids))
                for i, word in enumerate(words):
                    if word.startswith('(') and word.endswith(')'):
                        word = word[1:-1].strip()
                    wid = '%s-%s-%s' % (
                        slug(wl.language), slug(concept), i + 1)
                    ds.add_row([
                        wid,
                        '',
                        wl.language,
                        concept_map.get(concept, ''),
                        concept,
                        word,
                        clean_string(word, splitters='?')[0],
                        SOURCE,
                        '',
                    ])
                    if word_to_cogid.get(word):
                        cognates.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (slug(concept), word_to_cogid[word]),
                            False,
                            'expert',
                            SOURCE,
                            '',
                            '',
                            '',
                        ])
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments'))
Example #24
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    wordsh, words = read_csv(dataset, 'supplementary.Sheet1.csv', 0)
    cognatesh, cognates = read_csv(dataset, 'Japonic_recovered.Sheet1.csv', 1)

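    # map every step-th column index (starting at column 2) to its concept label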
    def concepts(h, step):
        l = h[2:]
        return {i + 2: l[i] for i in range(0, len(l), step)}

    word_index_to_concept = concepts(wordsh, 1)

    assert all(c in concept_map for c in word_index_to_concept.values())
    assert len(words) == len(cognates)

    def sorted_(l):
        return sorted(l, key=lambda r: r[:2])

    cognatesets = []
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'AltTranscription',
    ), dataset) as ds:
        for i, (word,
                cognate) in enumerate(zip(sorted_(words), sorted_(cognates))):
            if not word[1]:
                continue
            if word[1] == 'Nigata':
                word[1] = 'Niigata'
            assert word[:2] == cognate[:2]

            lname = word[1]
            lid = slug(lname)

            for index, concept in word_index_to_concept.items():
                if word[index] == '?':
                    continue
                wid = '%s-%s' % (lid, index - 1)
                cindex = (index - 1) * 2
                assert cognatesh[cindex] == concept
                ds.add_row([
                    wid,
                    language_map[lname],
                    lname,
                    concept_map[concept],
                    concept,
                    word[index],
                    '',
                    cognate[cindex],
                ])
                cs = cognate[cindex + 1]
                for css in cs.split('&'):
                    css = css.strip()
                    if css != '?':
                        css = int(float(css))
                        cognatesets.append([
                            wid,
                            ds.name,
                            word[index],
                            '%s-%s' % (index - 1, css),
                            False,
                            'expert',
                            '',
                            '',
                            '',
                            '',
                        ])
        segmentize(ds)
    dataset.cognates.extend(iter_alignments(ds, cognatesets,
                                            column='Segments'))