def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')

    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name='The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    inventories = collections.defaultdict(set)
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']] = \
            inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )

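# Note: every loader in this collection collects bibliographic references with the
# same idiom: pycldf's Sources.parse() splits a CLDF source reference such as
# "meier2005[12-15]" into a source id and a page specification, and the pages
# collected per (valueset, source) pair are joined when the ValueSetReferences
# are created. A minimal sketch of that idiom in isolation (the reference strings
# below are made up for illustration):

import collections

from pycldf.sources import Sources

refs_in_forms = ['meier2005[12-15]', 'meier2005[30]', 'smith1911']

pages_by_source = collections.defaultdict(list)
for ref in refs_in_forms:
    sid, pages = Sources.parse(ref)  # ('meier2005', '12-15'); pages is None without "[...]"
    pages_by_source[sid].append(pages)

for sid, pages in pages_by_source.items():
    # nfilter in the loaders drops the None entries before joining:
    print(sid, '; '.join(p for p in pages if p))
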
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        data.add(
            models.Cognate_,
            cognate['Cognateset_ID'],
            name=cognate['Form'],
            language=cognate['Language_ID'],
            description=cognate['Description'])

    # First pass over the forms: count the attesting languages per concept.
    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(iteritems(
            args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(iteritems(args.cldf, 'ParameterTable', 'ID', 'Name', 'Concepticon_ID', 'Description')):
        data.add(
            models.Concept,
            param['ID'],
            id=param['ID'],
            name='{} [{}]'.format(param['Name'], param['ID']),
            description=param['Description'],
            count=len(counts[param['ID']]))

    # Second pass: create valuesets and lexemes. A form may be linked to several
    # concepts (separated by ";" or "+"); such forms are added once per concept,
    # with the enumeration index appended to the form id.
    print("Forms...")
    for form in tqdm(iteritems(
            args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source')):
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?':
                continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )
            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)
            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] + '/' if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

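# Note: the jambu loader above splits multi-concept links on ";" and "+" and
# derives per-concept lexeme ids by suffixing the enumeration index. A tiny
# self-contained illustration of that behavior (form id "f1" is hypothetical):

import re

parts = re.split(r";|\+", 'water;?+river')  # -> ['water', '?', 'river']
ids = ['f1-{}'.format(i) if len(parts) > 1 else 'f1'
       for i, p in enumerate(parts) if p != '?']
print(ids)  # ['f1-0', 'f1-2'] -- the '?' placeholder is skipped, indices are kept
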
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(common.Editor(ord=ord, dataset=ds, contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if getattr(c, 'name', None)])

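# Note: lsi, vanuatuvoices and tppsr all derive per-language sound inventories by
# mapping the collected segments through the BIPA transcription system of CLTS.
# A minimal sketch, assuming a local clone of cldf-clts/clts-data (the path and
# the example segments are assumptions):

from pyclts import CLTS

clts = CLTS('../../cldf-clts/clts-data')
segments = {'t', 'a', 'kʰ', '+'}  # '+' marks a morpheme boundary, not a sound

inventory = []
for seg in segments:
    sound = clts.bipa[seg]
    # Symbols BIPA cannot resolve to a sound come back without a usable `name`;
    # the loaders filter on exactly that (the hasattr/getattr checks above).
    if getattr(sound, 'name', None):
        inventory.append((str(sound), sound.name))
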
def main(args):  # pragma: no cover
    data = Data()
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name),
        )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )

    # Assign one color per canton.
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
            sorted(data['Variety'].values(), key=lambda l: l.canton),
            lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = \
            inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        f = data.add(
            models.Form,
            form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages)),
        ))

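# Note: the tppsr loader computes scan links by expanding the `valueUrl` URI
# template attached to the FormTable's "Scan" column. Sketched with csvw
# directly (the template string is an assumption; the real one lives in the
# dataset's CLDF metadata):

from csvw.metadata import URITemplate

tmpl = URITemplate('https://example.org/scans/{Scan}.png')
print(tmpl.expand(Scan='480-1'))  # -> https://example.org/scans/480-1.png
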
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )

    for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )

def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    # Determine which proto-languages a variety descends from, based on its
    # Glottolog lineage.
    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            # Only 'Nizaviguiti' is expected to lack a Glottocode.
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0] if glang and len(glang.lineage) > 1 else None,
        )

    colors = dict(zip(
        set(l.subgroup for l in data['Variety'].values()),
        qualitative_colors(len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        # Ensure every '►' placeholder form has associated audio.
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        # Attach the reconstructions of all proto-language ancestors.
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

def main(args):
    data = Data()
    doi = input('DOI of the released dataset: ')
    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor,
            c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory,
            fc['ID'],
            id=fc['ID'],
            name=fc['Name'],
            description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType,
            vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution,
            lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i + 1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature,
            param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})

    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement,
            de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence,
            example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])
        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet,
            value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value,
            value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)
        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))

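# Note: all of these loaders lean on clld's `Data` helper: data.add(Model, key, **kw)
# instantiates Model(**kw), registers it with the database session and caches it
# under data['Model'][key], so later rows can link to it without a query. A minimal
# sketch of the pattern (ids are placeholders; `Data` lives in clld.cliutil in
# recent clld releases, clld.scripts.util in older ones):

from clld.cliutil import Data
from clld.db.models import common

data = Data()
src = data.add(common.Source, 'meier2005', id='meier2005', name='Meier 2005')

# Later lookups hit the in-memory cache, not the database:
assert data['Source']['meier2005'] is src
assert data['ValueSet'].get(('lang1', 'param1')) is None  # nothing cached yet
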
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )

def main(args):  # pragma: no cover
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(
            dataset=thedataset,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    for rec in bibtex.Database.from_file(pathlib.Path(__file__).parent / 'HK_website.bib', lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        for lang in ds.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )

        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals')
                    else 'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )
            audio = {
                r['ID']: r
                for r in ds.iter_rows('media.csv')
                if r['mimetype'] == 'audio/mpeg'
            }
            for form in ds.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                mp3 = next(
                    iter([audio[aid] for aid in form['Audio_Files'] if aid in audio]),
                    None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(audio=ds.get_row_url('media.csv', mp3) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name', 'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description']) if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows('CodeTable', 'id', 'name', 'description', 'parameterReference'):
                data.add(
                    common.DomainElement,
                    code['id'],
                    id=code['id'],
                    name=code['name'],
                    description=code['description'],
                    parameter=data['Param'][code['parameterReference']],
                    jsondata={
                        'color': {
                            'absent': 'ff0000',
                            'present': '0000ff',
                            'indeterminate': 'cccccc',
                        }.get(code['description'])})
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows('ValueTable', 'id', 'value', 'languageReference', 'parameterReference', 'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )