def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(
            args.cldf, 'LanguageTable',
            'id', 'name', 'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        data.add(
            models.Cognate_,
            cognate['Cognateset_ID'],
            name=cognate['Form'],
            language=cognate['Language_ID'],
            description=cognate['Description'])

    # First pass over the forms: count attesting languages per concept.
    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(iteritems(
            args.cldf, 'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(iteritems(
            args.cldf, 'ParameterTable', 'ID', 'Name', 'Concepticon_ID', 'Description')):
        data.add(
            models.Concept,
            param['ID'],
            id=param['ID'],
            name='{} [{}]'.format(param['Name'], param['ID']),
            description=param['Description'],
            count=len(counts[param['ID']]))

    # Second pass over the forms: create lexemes and valuesets.
    print("Forms...")
    for form in tqdm(iteritems(
            args.cldf, 'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source')):
        params = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(params):
            if paramref == '?':
                continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )
            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)
            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(params) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(params) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] + '/' if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
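
# A minimal, self-contained sketch (not part of the loader above) of the
# compound-gloss handling: parameterReference cells may pack several concept
# ids separated by ';' or '+', and '?' entries are skipped. The input string
# below is made up for illustration.
def _demo_split_parameter_reference():
    import re
    parts = re.split(r";|\+", 'eye;see+?')
    assert parts == ['eye', 'see', '?']
    # '?' entries are skipped, just as in the loop above.
    assert [p for p in parts if p != '?'] == ['eye', 'see']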
def main(args):  # pragma: no cover
    data = Data()
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name)
        )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    colors = qualitative_colors(
        len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
            sorted(data['Variety'].values(), key=lambda l: l.canton),
            lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(
            args.cldf, 'FormTable',
            'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = \
            inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],  # e.g. 'Gauchat-1925-480-1_'
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))
        ))
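
# Hedged sketch of the inventory lookup used above: pyclts' BIPA transcription
# system resolves segment strings to Sound objects; resolvable sounds carry a
# .name attribute while unknown graphemes do not, which is what the
# hasattr(c, 'name') filter relies on. Assumes a local cldf-clts/clts clone,
# as the script prompts for; the segment strings are made-up examples.
def _demo_bipa_inventory(clts, segments=('t', 'a')):
    inv = [clts.bipa[c] for c in segments]
    return [(str(c), c.name) for c in inv if hasattr(c, 'name')]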
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')

    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name='The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(
            args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    inventories = collections.defaultdict(set)
    for form in iteritems(
            args.cldf, 'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']] = \
            inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(
            args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )

    for form in iteritems(
            args.cldf, 'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
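
# Hedged sketch of the reference bookkeeping shared by these loaders:
# pycldf's Sources.parse splits a CLDF source reference such as
# 'meier2005[12-15]' into a source id and a page specification, and pages are
# aggregated per (valueset, source) pair before ValueSetReference rows are
# created. The reference string and keys below are made up for illustration.
def _demo_collect_refs():
    import collections
    from pycldf import Sources
    refs = collections.defaultdict(list)
    vsid = ('lang1', 'param1')  # hypothetical (language, parameter) ValueSet key
    sid, pages = Sources.parse('meier2005[12-15]')
    refs[(vsid, sid)].append(pages)
    return refs  # {(('lang1', 'param1'), 'meier2005'): ['12-15']}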
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(common.Editor(ord=ord, dataset=ds, contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if getattr(c, 'name', None)])
def cmd_makecldf(self, args):
    glottolog = Glottolog(args.glottolog.dir)
    clts = CLTS(Config.from_file().get_clone('clts'))
    bipa = clts.bipa
    clts_eurasian = clts.transcriptiondata_dict['eurasian']

    args.writer.cldf.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Value_in_Source', 'datatype': 'string'})
    args.writer.cldf.add_columns(
        'ParameterTable',
        {'name': 'CLTS_BIPA', 'datatype': 'string'},
        {'name': 'CLTS_Name', 'datatype': 'string'})
    args.writer.cldf.add_component('LanguageTable', 'Family', 'Glottolog_Name')

    # Load the language mapping and build inventory info.
    languages = []
    lang_map = {}
    all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
    unknowns = defaultdict(list)
    for row in progressbar(self.etc_dir.read_csv("languages.csv", dicts=True)):
        lang_map[row["name"]] = slug(row["name"])
        lang_dict = {"ID": slug(row["name"]), "Name": row["name"]}
        if row["glottocode"] in all_glottolog:
            lang = all_glottolog[row["glottocode"]]
            lang_dict.update({
                "Family": lang.family if lang.lineage else None,
                "Glottocode": lang.id,
                "ISO639P3code": lang.iso_code,
                "Latitude": lang.latitude,
                "Longitude": lang.longitude,
                "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
                "Glottolog_Name": lang.name,
            })
        languages.append(lang_dict)

    # Read the raw data.
    with open(self.raw_dir.joinpath('phono_dbase.json').as_posix()) as handler:
        raw_data = json.load(handler)

    # Iterate over the raw data.
    values = []
    parameters = []
    counter = 1
    with open(self.raw_dir.joinpath('sources.txt').as_posix()) as f:
        sources = [source.strip() for source in f.readlines()][1:]
    sources_ = Sources.from_file(self.raw_dir / "sources.bib")
    args.writer.cldf.add_sources(*sources_)

    for idx, (language, langdata) in enumerate(raw_data.items()):
        cons = langdata["cons"]
        vows = langdata["vows"]
        tones = [tone for tone in langdata["tones"] if tone]
        source = sources[idx]

        # Prepare the language key.
        lang_key = language.split("#")[0].replace(",", "")

        # Add consonants and vowels to values, also collecting parameters.
        for segment in cons + vows:
            marginal = bool(segment[0] == "(")

            # Obtain the corresponding BIPA grapheme, if possible.
            normalized = normalize_grapheme(segment)
            par_id = compute_id(normalized)
            if normalized in clts_eurasian.grapheme_map:
                sound = bipa[clts_eurasian.grapheme_map[normalized]]
            else:
                sound = bipa['<NA>']
                unknowns[normalized] += [(segment, lang_key)]
            if sound.type == 'unknownsound':
                bipa_grapheme = ''
                desc = ''
            else:
                bipa_grapheme = str(sound)
                desc = sound.name
            parameters.append((par_id, normalized, bipa_grapheme, desc))

            values.append({
                "ID": str(counter),
                "Language_ID": lang_map[lang_key],
                "Marginal": marginal,
                "Parameter_ID": par_id,
                "Value": normalized,
                "Value_in_Source": segment,
                "Source": [source],
            })
            counter += 1

    # Build the segment data.
    segments = [
        {"ID": pid, "Name": normalized, "BIPA": bipa_grapheme, "Description": desc}
        for pid, normalized, bipa_grapheme, desc in set(parameters)]

    # Write the data and validate.
    args.writer.write(**{
        "ValueTable": values,
        "LanguageTable": languages,
        "ParameterTable": segments,
    })
    for g, rest in unknowns.items():
        print('\t'.join([repr(g), str(len(rest)), g]))
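
# Hedged sketch of the grapheme lookup above: transcription data in pyclts
# expose a grapheme_map from dataset-specific graphemes to BIPA; anything
# unmapped falls back to bipa['<NA>'], whose .type is 'unknownsound' -- the
# case collected in `unknowns`. The grapheme argument is a made-up example.
def _demo_grapheme_to_bipa(clts, grapheme='tʰ'):
    td = clts.transcriptiondata_dict['eurasian']
    if grapheme in td.grapheme_map:
        sound = clts.bipa[td.grapheme_map[grapheme]]
    else:
        sound = clts.bipa['<NA>']
    return None if sound.type == 'unknownsound' else (str(sound), sound.name)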
def cmd_makecldf(self, args):
    args.writer.cldf.add_component('ParameterTable')
    args.writer.cldf.add_component(
        'LanguageTable',
        'Continent', 'Genus', 'WALSCode',  # we add more language metadata
    )
    args.writer.cldf.add_component('CodeTable')

    args.writer.objects['ParameterTable'] = [
        {
            'ID': 'sortalclassifier',
            'Name': 'sortal classifier',
            'Description':
                'Does the language have sortal classifiers, '
                'regardless of whether they are optional or obligatory?',
        },
        {
            'ID': 'morphosyntacticplural',
            'Name': 'morphosyntactic plural',
            'Description': 'Does the language have morphosyntactic plural markers?',
        },
    ]
    args.writer.objects['CodeTable'] = [
        {'ID': 'sortalclassifier-1', 'Parameter_ID': 'sortalclassifier', 'Name': 'yes'},
        {'ID': 'sortalclassifier-0', 'Parameter_ID': 'sortalclassifier', 'Name': 'no'},
        {'ID': 'morphosyntacticplural-1', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'yes'},
        {'ID': 'morphosyntacticplural-0', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'no'},
    ]

    l2s = collections.defaultdict(list)
    sources = []
    for src in sorted(Sources.from_file(self.raw_dir / 'sources.bib').items(), key=lambda i: i.id):
        if src.get('Wals_code'):
            for code in split_text(src['Wals_code'], ';', strip=True):
                l2s[code].append(src.id)
            sources += [src]
    args.writer.cldf.add_sources(*sources)

    for row in self.raw_dir.read_csv('GSSG_ListOfLanguages.csv', delimiter=';', dicts=True):
        lidx = slug(row['language_name'], lowercase=False)
        args.writer.objects['LanguageTable'].append({
            'ID': lidx,
            'Name': row['language_name'],
            'Latitude': row['latitude'],
            'Longitude': row['longitude'],
            'Glottocode': row['glottocode'],
            'ISO639P3code': row['iso_code'],
            'Continent': row['continent'],
            'Genus': row['genus'],
            'WALSCode': row['wals_code'],
        })
        for param in ['sortal_classifier', 'morphosyntactic_plural']:
            pid = param.replace('_', '')
            args.writer.objects['ValueTable'].append({
                'ID': '{}-{}'.format(lidx, pid),
                'Value': row[param],
                'Language_ID': lidx,
                'Parameter_ID': pid,
                'Code_ID': '{}-{}'.format(pid, '1' if row[param] == 'yes' else '0'),
                'Source': l2s.get(row['wals_code'], []),
            })
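
# Hedged sketch of the value-to-code convention used above: every raw
# 'yes'/'no' cell maps onto a CodeTable id of the form
# '<parameter>-1' / '<parameter>-0'.
def _demo_code_id(pid, raw_value):
    return '{}-{}'.format(pid, '1' if raw_value == 'yes' else '0')

assert _demo_code_id('sortalclassifier', 'yes') == 'sortalclassifier-1'
assert _demo_code_id('morphosyntacticplural', 'no') == 'morphosyntacticplural-0'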
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0] if glang and len(glang.lineage) > 1 else None,
        )
    colors = dict(zip(
        set(l.subgroup for l in data['Variety'].values()),
        qualitative_colors(len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        # A '►' placeholder form apparently stands in for an audio-only entry,
        # so such forms must come with an audio file.
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
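
# Hedged sketch of the reconstruction lookup above, with made-up ids: each
# form inherits the proto-forms of whatever ancestor proto-languages its
# variety was assigned via the Glottolog lineage checks, keyed by the
# ancestor's display name.
def _demo_reconstructions(ancestors, proto_forms, lnames, lid, pid):
    import collections
    proto = collections.OrderedDict()
    for ancestor in ancestors.get(lid, []):
        forms = proto_forms[ancestor].get(pid)
        if forms:
            proto[lnames[ancestor]] = forms
    return proto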
def main(args):
    data = Data()

    doi = input('DOI of the released dataset: ')
    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor,
            c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType,
            vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for region in args.cldf['regions.csv']:
        data.add(models.Region, region['ID'], id=region['ID'], name=region['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution,
            lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i + 1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature,
            param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})

    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement,
            de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence,
            example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])
        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet,
            value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value,
            value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)
        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
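
# Hedged sketch: editor order in eWAVE is encoded in contributor ids that
# match ed_pattern, e.g. a hypothetical id 'ed1' yields ord=1; contributor
# ids that do not match simply do not become editors.
def _demo_editor_ord(contributor_id):
    import re
    m = re.compile('ed(?P<ord>[0-9]+)$').match(contributor_id)
    return int(m.group('ord')) if m else None

assert _demo_editor_ord('ed2') == 2
assert _demo_editor_ord('kortmann') is None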
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def get_data(cldf, args):
    relscount = 0
    cldf.sources = Sources.from_file(args.repos.path('sources.bib'))
    categorical_variables = set()
    data = collections.defaultdict(list)
    dsids = [ds.id for ds in args.repos.datasets]

    for ds in args.repos.datasets:
        data['datasets.csv'].append({
            'ID': ds.id,
            'Name': ds.name,
            'Description': ds.description,
            'Type': ds.type,
            'Year': ds.year,
            'Author': ds.author,
            'Reference': ds.reference,
            'URL': ds.url,
        })
        for soc in ds.societies:
            data['LanguageTable'].append({
                'ID': soc.id,
                'Dataset_ID': ds.id,
                'Name': soc.pref_name_for_society,
                'Glottocode': soc.glottocode,
                'Latitude': soc.Lat,
                'Longitude': soc.Long,
                'Comment': soc.Comment,
                'Glottocode_Comment': soc.glottocode_comment,
                'xd_id': soc.xd_id,
                'ORIG_name_and_ID_in_this_dataset': soc.ORIG_name_and_ID_in_this_dataset,
                'alt_names_by_society': soc.alt_names_by_society,
                'main_focal_year': soc.main_focal_year,
                'HRAF_ID': soc.HRAF_name_ID.id if soc.HRAF_name_ID else None,
                'HRAF_Name': soc.HRAF_name_ID.name if soc.HRAF_name_ID else None,
                'HRAF_Link': soc.HRAF_link,
                'origLat': soc.origLat,
                'origLong': soc.origLong,
            })
        for soc in ds.society_relations:
            for rel in soc.related:
                relscount += 1
                data['society_relations.csv'].append({
                    'ID': str(relscount),
                    'Society_ID': soc.id,
                    'Related_Society_ID': rel.id if rel.dataset in dsids else None,
                    'Related_Society_External_ID': rel.id if rel.dataset not in dsids else None,
                    'Related_Society_Name': rel.name,
                    'Related_Society_Dataset': rel.dataset,
                })
        for param in ds.variables:
            data['ParameterTable'].append({
                'ID': param.id.replace('.', '_'),
                'Dataset_ID': ds.id,
                'Name': param.title,
                'Description': param.definition,
                'Category': param.category,
                'Type': param.type,
                'Units': param.units,
                'Source': param.source,
                'Changes': param.changes,
                'Notes': param.notes,
            })
            for code in param.codes:
                if code.code == 'NA':
                    continue
                categorical_variables.add(code.var_id)
                data['CodeTable'].append({
                    'ID': '{}-{}'.format(code.var_id, code.code).replace('.', '_'),
                    'Parameter_ID': code.var_id.replace('.', '_'),
                    'Name': code.name,
                    'Description': code.description,
                })
        codes = set(c['ID'] for c in data['CodeTable'])
        for i, d in enumerate(ds.data, start=1):
            code_id = None \
                if (d.var_id not in categorical_variables) or d.code == 'NA' \
                else '{}-{}'.format(d.var_id, d.code).replace('.', '_')
            if code_id and (code_id not in codes) and args.fix_code_id:
                # This is a backwards compatibility fix. New releases should not have
                # references to undefined codes!
                code_id = None  # pragma: no cover
            data['ValueTable'].append({
                'ID': '{}-{}'.format(ds.id, i),
                'Language_ID': d.soc_id,
                'Parameter_ID': d.var_id.replace('.', '_'),
                'Dataset_ID': ds.id,
                'Code_ID': code_id,
                'Value': d.code,
                'Comment': d.comment,
                'Sub_Case': d.sub_case,
                'Year': d.year,
                'Source': [ref.format_cldf() for ref in d.references],
                'Source_Coded_Data': d.source_coded_data,
                'Admin_Comment': d.admin_comment,
            })
    return data
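
# Hedged usage sketch (the wrapper name is an assumption, not part of the
# repository): get_data returns a dict keyed by CLDF table/component name,
# which a pycldf Dataset's write method can consume as one keyword argument
# per table.
def _demo_write_dataset(cldf, args):
    cldf.write(**get_data(cldf, args))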
def main(args):  # pragma: no cover
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(
            ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(
            dataset=thedataset,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    for rec in bibtex.Database.from_file(
            pathlib.Path(__file__).parent / 'HK_website.bib', lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    refs = collections.defaultdict(list)

    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        for lang in ds.iter_rows(
                'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )
        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals')
                    else 'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )
            audio = {
                r['ID']: r for r in ds.iter_rows('media.csv') if r['mimetype'] == 'audio/mpeg'}
            for form in ds.iter_rows(
                    'FormTable',
                    'id', 'form', 'languageReference', 'parameterReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                mp3 = next(
                    iter([audio[aid] for aid in form['Audio_Files'] if aid in audio]), None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(audio=ds.get_row_url('media.csv', mp3) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name', 'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description']) if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows('CodeTable', 'id', 'name', 'description', 'parameterReference'):
                data.add(
                    common.DomainElement,
                    code['id'],
                    id=code['id'],
                    name=code['name'],
                    description=code['description'],
                    parameter=data['Param'][code['parameterReference']],
                    jsondata={
                        'color': {
                            'absent': 'ff0000',
                            'present': '0000ff',
                            'indeterminate': 'cccccc',
                        }.get(code['description'])},
                )
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows(
                    'ValueTable',
                    'id', 'value', 'languageReference', 'parameterReference', 'codeReference',
                    'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )