예제 #1
0
파일: initializedb.py 프로젝트: clld/lsi
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name=
        'The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(
        ['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                          'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id',
                           'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    inventories = collections.defaultdict(set)
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form',
                          'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']] = inventories[
            form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(inventory=[(str(c), c.name)
                                                        for c in inv
                                                        if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
예제 #2
0
def main(args):

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name',
                          'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        # print(cognate)
        data.add(models.Cognate_,
                 cognate['Cognateset_ID'],
                 name=cognate['Form'],
                 language=cognate['Language_ID'],
                 description=cognate['Description'])

    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(
            iteritems(args.cldf, 'ParameterTable', 'ID', 'Name',
                      'Concepticon_ID', 'Description')):
        data.add(models.Concept,
                 param['ID'],
                 id=param['ID'],
                 name='{} [{}]'.format(param['Name'], param['ID']),
                 description=param['Description'],
                 count=len(counts[param['ID']]))

    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?': continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )

            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)

            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] +
                '/' if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
예제 #3
0
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    clts = CLTS(
        input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(
            vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=ord,
                          dataset=ds,
                          contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(Counterpart,
                 form['id'],
                 id=form['id'],
                 name=form['form'],
                 valueset=vs,
                 audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv
                       if getattr(c, 'name', None)])
예제 #4
0
파일: initializedb.py 프로젝트: clld/tppsr
def main(args):  # pragma: no cover
    data = Data()
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name)
        )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
        sorted(data['Variety'].values(), key=lambda l: l.canton),
        lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        f = data.add(
            models.Form,
            form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))
        ))
예제 #5
0
def main(args):

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                          'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id',
                           'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form',
                          'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
예제 #6
0
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0]
            if glang and len(glang.lineage) > 1 else None,
        )
    colors = dict(
        zip(
            set(l.subgroup for l in data['Variety'].values()),
            qualitative_colors(
                len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][
                form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
예제 #7
0
파일: initializedb.py 프로젝트: clld/ewave
def main(args):
    data = Data()
    doi = input('DOI of the released dataset: ')

    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor,
            c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType, vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety, lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution, lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i+1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature, param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})


    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement, de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence, example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])

        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet, value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value, value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)

        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
예제 #8
0
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
예제 #9
0
def main(args):  # pragma: no cover
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )
    for i, name in enumerate(
        ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(dataset=thedataset,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    for rec in bibtex.Database.from_file(pathlib.Path(__file__).parent /
                                         'HK_website.bib',
                                         lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)
    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        for lang in ds.iter_rows('LanguageTable', 'id', 'glottocode', 'name',
                                 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )

        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows('ParameterTable', 'id',
                                      'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals') else
                    'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )

            audio = {
                r['ID']: r
                for r in ds.iter_rows('media.csv')
                if r['mimetype'] == 'audio/mpeg'
            }
            for form in ds.iter_rows('FormTable', 'id', 'form',
                                     'languageReference', 'parameterReference',
                                     'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                mp3 = next(
                    iter([
                        audio[aid] for aid in form['Audio_Files']
                        if aid in audio
                    ]), None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(audio=ds.get_row_url('media.csv', mp3
                                                       ) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name',
                                      'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description'])
                    if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows('CodeTable', 'id', 'name', 'description',
                                     'parameterReference'):
                data.add(common.DomainElement,
                         code['id'],
                         id=code['id'],
                         name=code['name'],
                         description=code['description'],
                         parameter=data['Param'][code['parameterReference']],
                         jsondata={
                             'color': {
                                 'absent': 'ff0000',
                                 'present': '0000ff',
                                 'indeterminate': 'cccccc',
                             }.get(code['description'])
                         })
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows('ValueTable', 'id', 'value',
                                     'languageReference', 'parameterReference',
                                     'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )