def vs_copy_lang(session, timestamp, vs, lang):  # pragma: no cover
    """Copy a ValueSet (plus its first value and all references) to another language.

    The new ValueSet gets the id ``<parameter-id>-<target-language-id>`` and the
    given *timestamp* as both ``created`` and ``updated``.

    :param session: DB session used for lookups and for adding the new records.
    :param timestamp: datetime stamped onto the created objects.
    :param vs: ValueSet (or identifier resolvable by ``get_vs``) to copy from.
    :param lang: target ``common.Language`` instance, or its id as a string.
    :raises AssertionError: if a ValueSet with the target id already exists
        (its ``updated`` timestamp is touched before raising).
    """
    # FIX: was `isinstance(lang, basestring)` — a Python 2 name that raises
    # NameError on Python 3; `str` is the py3 equivalent for this check.
    if isinstance(lang, str):
        lang = common.Language.get(lang, session=session)
    vs1 = get_vs(session, vs)
    # ValueSet ids have the shape "<parameter id>-<language id>"; build the
    # target id by swapping in the target language's id.
    pid, lid = vs1.id.split('-')
    id_ = '-'.join([pid, lang.id])
    try:
        vs2 = get_vs(session, id_)
        # Target already exists: record that we touched it, then fail loudly —
        # copying over an existing ValueSet is treated as a programming error.
        vs2.updated = timestamp
        raise AssertionError
    except NoResultFound:
        vs2 = common.ValueSet(
            id=id_,
            description=vs1.description,
            language=lang,
            parameter=vs1.parameter,
            contribution=vs1.contribution,
            updated=timestamp,
            created=timestamp,
            source=vs1.source)
        session.add(vs2)
        # copy values and references:
        # NOTE(review): only values[0] is copied — assumes single-valued
        # valuesets; confirm against the data model before relying on this.
        session.add(
            common.Value(
                id=vs2.id,
                valueset=vs2,
                domainelement=vs1.values[0].domainelement,
                created=timestamp,
                updated=timestamp))
        for ref in vs1.references:
            session.add(
                common.ValueSetReference(
                    valueset=vs2,
                    source=ref.source,
                    key=ref.key,
                    description=ref.description))
def prime(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because it
    will have to be run periodically whenever data has been updated.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    # meds maps language pk -> list of (source pk, source id, source name,
    # med_type, year, pages) tuples, in the order produced by the query below.
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from languagesource as ls, language as l, source as s, ref as r
where ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        meds[lpk].append((spk, sid, sname, med_type, year, pages))  # The last one is the overall MED

    # Now weed out the "newer but worse" sources: iterate each language's
    # sources in reverse (overall MED first) and keep only sources that are
    # strictly older than the last one kept.
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # drop sources without a year or not older than the last kept one
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))):
        # Store the full (pruned) MED history on the language's jsondata.
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname)
            for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue
        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        # flush so vs.pk is assigned before we reference it below
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    # domainelement pk -> (id, name) for the 'macroarea' parameter
    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    # Propagate macroarea valuesets up the classification: for each language
    # that has no macroarea valueset of its own, aggregate the distinct
    # macroareas of its descendants (via treeclosuretable).
    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk)
from language as l, treeclosuretable as t, parameter as p, valueset as vs, value as v
where l.pk = t.parent_pk and t.child_pk = vs.language_pk and vs.parameter_pk = p.pk
and p.id = 'macroarea' and v.valueset_pk = vs.pk and l.pk not in (
    select language_pk from valueset as _vs, parameter as _p
    where _vs.parameter_pk = _p.pk and _p.id = 'macroarea'
) group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                # one ValueSet per language, shared by all its macroarea values
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    # Denormalize: store the comma-joined macroarea names on each language.
    for vs in DBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'macroarea')\
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join([macroareas[v.domainelement_pk][1] for v in vs.values])

    # Sanity check: negative page counts indicate broken source data.
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        raise ValueError(row)

    # Tag languages that are new in this release in the legacy mapping file.
    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    # Lookup tables: ValueSet id -> pk, and Refprovider id -> ref pk.
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1] for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    # Attach the canonical macroarea reference to every macroarea valueset.
    # NOTE(review): reads the default `reference_id` off the bound function's
    # __defaults__ — fragile if that signature changes; confirm.
    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    # Attach references recorded in jsondata to 'aes' (endangerment) valuesets.
    for vs in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    # Attach classification-comment references ('sc' = subclassification,
    # 'fc' = family classification) where the refs differ from the comment.
    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
def main(args):  # pragma: no cover
    """Populate the tppsr CLLD database from the CLDF dataset in ``args.cldf``.

    Loads dataset metadata, editors, varieties, sources, concepts, forms
    (with scan URLs and phoneme inventories), examples and references.
    """
    data = Data()
    # CLTS catalog is needed to resolve segment strings to BIPA sounds below.
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name)
        )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    # Varieties: survey points with sociolinguistic metadata from LanguageTable.
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    # Assign one color per canton (groupby requires the same key as the sort).
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
        sorted(data['Variety'].values(), key=lambda l: l.canton),
        lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )
    # Per-variety phoneme inventories, accumulated from form segments.
    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference',
                          'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = \
            inventories[form['languageReference']].union(form['Segments'])
        # One ValueSet per (variety, concept) pair, created lazily.
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        # Collect references per (valueset, source); attached in bulk below.
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        f = data.add(
            models.Form,
            form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )
    # Examples, linked to both concepts and forms.
    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))
    # Store each variety's inventory as (grapheme, BIPA name) pairs; sounds
    # CLTS cannot name (no `name` attribute) are silently dropped.
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])
    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))
        ))
def main(args):
    """Populate the IE-CoR CLLD database.

    Loads dataset metadata, authors/editors, sources, meanings, clades,
    varieties (one contribution per variety), lexemes, cognate classes and
    cognacy judgements, loan events, and two phylogenies.

    NOTE(review): ``ds`` (the source CLDF dataset), ``photos`` and
    ``data_file_path`` are not defined in this function — presumably
    module-level globals; confirm against the rest of the file.
    """
    data = Data()
    dataset = common.Dataset(
        id=cobl2.__name__,
        name="IE-CoR",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='iecor.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    # Fixed editor order; filled with author IDs while loading authors.csv.
    editors = OrderedDict([('Heggarty', None), ('Anderson', None), ('Scarborough', None)])
    for row in sorted(ds['authors.csv'], key=lambda x: [
            x['Last_Name'].lower(), x['First_Name'].lower()]):
        if row['Last_Name'] in editors:
            editors[row['Last_Name']] = row['ID']
        data.add(
            models.Author,
            row['ID'],
            id=row['ID'],
            name='{0} {1}'.format(row['First_Name'], row['Last_Name']),
            url=row['URL'],
            photo=data_uri(photos[row['Last_Name']], 'image/jpg') if row['Last_Name'] in photos else None)

    for i, cid in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=data['Author'][cid], ord=i + 1)

    for src in ds.sources.items():
        # Strip BibTeX fields the Source model does not accept as kwargs.
        for invalid in ['isbn', 'part', 'institution']:
            if invalid in src:
                del src[invalid]
        data.add(
            common.Source,
            src.id,
            id=src.id,
            name=src.get('author', src.get('editor')),
            description=src.get('title', src.get('booktitle')),
            bibtex_type=getattr(EntryType, src.genre, EntryType.misc),
            **src)

    # Markdown-ish cross-references like "[label](lex-123)" are rewritten to
    # HTML links; unknown types are reported and left in their original form.
    re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)')
    link_map = {
        'cog': '/cognatesets/',
        'lex': '/values/',
        'src': '/sources/',
    }

    def parse_links(m):
        try:
            return '<a href="{}{}">{}</a>'.format(
                link_map[m.group('type')], m.group('id'), m.group('label'))
        except KeyError:
            print("parse_links: type error in '{}'".format(":".join(m.groups())))
            return '[{}]({}-{})'.format(m.group('label'), m.group('type'), m.group('id'))

    for param in ds['ParameterTable']:
        data.add(
            models.Meaning,
            param['ID'],
            id=slug(param['Name']),
            name=param['Name'],
            description_md=param['Description_md'],
            concepticon_id=int(param['Concepticon_ID']) if param['Concepticon_ID'] != '0' else None,
        )

    for row in ds['clades.csv']:
        data.add(
            models.Clade,
            row['ID'],
            id=row['ID'],
            level0_name=row['level0_name'],
            level1_name=row['level1_name'],
            level2_name=row['level2_name'],
            level3_name=row['level3_name'],
            clade_level0=row['clade_level0'],
            clade_level1=row['clade_level1'],
            clade_level2=row['clade_level2'],
            clade_level3=row['clade_level3'],
            clade_name=row['clade_name'],
            short_name=row['short_name'],
            color=row['color'],
        )

    # One Contribution per language, plus the Variety itself.
    for row in ds['LanguageTable']:
        c = data.add(
            common.Contribution,
            row['ID'],
            id=row['ID'],
            name=row['Name'],
        )
        for i, cid in enumerate(row['Author_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c, contributor=data['Author'][cid], ord=i + 1))
        data.add(
            models.Variety,
            row['ID'],
            id=slug(row['Name']),
            name=row['Name'],
            latitude=float(row['Latitude']) if row['Latitude'] is not None else None,
            longitude=float(row['Longitude']) if row['Longitude'] is not None else None,
            contribution=c,
            color=rgb_as_hex(row['Color']),
            clade=', '.join(filter(None, row['Clade'])),
            clade_name=row['clade_name'],
            glottocode=row['Glottocode'],
            historical=row['historical'],
            distribution=row['distribution'],
            logNormalMean=row['logNormalMean'],
            logNormalOffset=row['logNormalOffset'],
            logNormalStDev=row['logNormalStDev'],
            normalMean=row['normalMean'],
            normalStDev=row['normalStDev'],
            ascii_name=row['ascii_name'],
            iso=row['ISO639P3code'],
            lang_description=row['Description'],
            variety=row['Variety'],
            loc_justification=row['loc_justification'] or None,
            sort_order=row['sort_order']
        )

    # vsrs de-duplicates (valueset, source, pages) reference triples.
    vsrs = set()
    for row in ds['FormTable']:
        vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID']))
        if not vs:
            vs = data.add(
                common.ValueSet,
                (row['Language_ID'], row['Parameter_ID']),
                id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']),
                language=data['Variety'][row['Language_ID']],
                parameter=data['Meaning'][row['Parameter_ID']],
                contribution=data['Contribution'][row['Language_ID']],
            )
        v = data.add(
            models.Lexeme,
            row['ID'],
            id=row['ID'],
            name=row['Form'],
            native_script=row['native_script'],
            phonetic=row['phon_form'],
            phonemic=row['Phonemic'],
            comment=re_links.sub(parse_links, row['Comment'] or ''),
            url=row['url'],
            gloss=row['Gloss'],
            valueset=vs
        )
        for src in row['Source']:
            sid, pages = ds.sources.parse(src)
            # NOTE: the dedup key uses the *unnormalized* pages string;
            # '|' is replaced with ';' only in the stored description.
            key = (vs.id, sid, pages)
            if pages:
                pages = pages.replace('|', ';')
            if key not in vsrs:
                DBSession.add(common.ValueSetReference(
                    valueset=vs, source=data['Source'][sid], description=pages))
                vsrs.add(key)

    for row in ds['CognatesetTable']:
        # Prefer the *_calc variants when present and non-empty.
        cc = data.add(
            models.CognateClass,
            row['ID'],
            id=row['ID'],
            name=row['ID'],
            root_form=row['Root_Form_calc'] if row['Root_Form_calc'] is not None and len(row['Root_Form_calc']) else row['Root_Form'],
            root_form_calc=row['Root_Form_calc'] or None,
            root_gloss=row['Root_Gloss'] or None,
            root_language=row['Root_Language_calc'] if row['Root_Language_calc'] is not None and len(row['Root_Language_calc']) else row['Root_Language'],
            root_language_calc=row['Root_Language_calc'] or None,
            comment=re_links.sub(parse_links, row['Comment'] or ''),
            justification=re_links.sub(parse_links, row['Justification'] or ''),
            ideophonic=row['Ideophonic'] or None,
            parallel_derivation=row['parallelDerivation'] or None,
            revised_by=','.join(row['revised_by']) or None,
            superset_id=int(row['supersetid']) if row['supersetid'] else None,
        )
        for src in row['Source']:
            sid, pages = ds.sources.parse(src)
            if pages:
                pages = pages.replace('|', ';')
            DBSession.add(clld_cognacy_plugin.models.CognatesetReference(
                cognateset=cc, source=data['Source'][sid], description=pages))

    # flush so cognate class pks exist before linking proposed-cognate pairs
    DBSession.flush()
    cc_id_pk_map = {str(ccid): cc.pk for ccid, cc in data['CognateClass'].items()}
    for row in ds['CognatesetTable']:
        if row['proposedAsCognateTo_pk']:
            DBSession.add(models.ProposedCognates(
                cc1_pk=data['CognateClass'][row['ID']].pk,
                cc2_pk=cc_id_pk_map[str(row['proposedAsCognateTo_pk'])],
                scale=row['proposedAsCognateToScale']
            ))
    DBSession.flush()

    # Mark loan events on cognate classes.
    loans = {ln['Cognateset_ID']: ln for ln in ds['loans.csv']}
    for ccid, cc in data['CognateClass'].items():
        if ccid in loans:
            le = loans[ccid]
            if le['SourceCognateset_ID']:
                cc.loan_source_pk = data['CognateClass'][le['SourceCognateset_ID']].pk
            else:
                cc.loan_source_pk = None
            cc.loan_notes = le['Comment']
            cc.loan_source_languoid = le['Source_languoid']
            cc.loan_source_form = le['Source_form']
            cc.parallel_loan_event = le['Parallel_loan_event']
            cc.is_loan = True

    for row in ds['CognateTable']:
        cc = data['CognateClass'][row['Cognateset_ID']]
        # Each cognate class is tied to exactly one meaning; set it from the
        # first judgement and assert consistency for the rest.
        if cc.meaning_pk is None:
            cc.meaning_pk = data['Lexeme'][row['Form_ID']].valueset.parameter_pk
        else:
            assert data['Lexeme'][row['Form_ID']].valueset.parameter_pk == cc.meaning_pk
        data.add(
            clld_cognacy_plugin.models.Cognate,
            row['ID'],
            cognateset=data['CognateClass'][row['Cognateset_ID']],
            counterpart=data['Lexeme'][row['Form_ID']],
            doubt=row['Doubt'],
        )

    # Phylogeny 1: Bouckaert et al. 2012, taxa matched by glottocode.
    l_by_gc = {}
    for s in DBSession.query(models.Variety):
        l_by_gc[s.glottocode] = s.pk
    tree = Phylogeny(
        id='1',
        name='Bouckaert et al.',
        description='',
        newick=Path.read_text(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'newick.txt'),
    )
    for k, taxon in enumerate(reader(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'taxa.csv', namedtuples=True)):
        label = TreeLabel(
            id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1),
            name=taxon.taxon,
            phylogeny=tree,
            description=taxon.glottocode)
        if taxon.glottocode in l_by_gc:
            LanguageTreeLabel(language_pk=l_by_gc[taxon.glottocode], treelabel=label)
    DBSession.add(tree)

    # Phylogeny 2: CoBL consensus tree, taxa matched by ascii name.
    l_by_ascii = {}
    for s in DBSession.query(models.Variety):
        l_by_ascii[s.ascii_name] = s.pk
    tree = Phylogeny(
        id='2',
        name='CoBL consensu',
        description='',
        newick=Path.read_text(data_file_path / 'raw' / 'ie122' / 'newick.txt'),
    )
    for k, taxon in enumerate(reader(data_file_path / 'raw' / 'ie122' / 'taxa.csv', namedtuples=True)):
        label = TreeLabel(
            id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1),
            name=taxon.taxon,
            phylogeny=tree)
        if taxon.taxon in l_by_ascii:
            LanguageTreeLabel(language_pk=l_by_ascii[taxon.taxon], treelabel=label)
    DBSession.add(tree)
def main(args):
    """Populate the PHOIBLE CLLD database.

    Builds varieties and inventories from the phoible-by-phoneme data, keyed
    to Glottolog languoids via the InventoryID->Glottocode mapping file.

    WARNING(review): there is a bare ``return`` after the inventory-loading
    loop (marked FIXME below) — everything after it is currently DEAD CODE
    retained from an earlier version of this script.
    """
    data = Data()
    # InventoryID -> Glottocode, and InventoryID -> set of BibTeX keys.
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings', 'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    # Sort so groupby over (InventoryID, ...) below sees contiguous groups.
    phonemes = sorted(list(
        reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
        key=lambda r: (r['InventoryID'], r['GlyphID']))
    # (glottolog name, dialect, SOURCE) -> set of (InventoryID, LanguageName)
    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(languoids[glottocodes[p['InventoryID']]].name,
                         p['SpecificDialect'],
                         p['Source'].upper())].add(
                (p['InventoryID'], p['LanguageName']))

    # Compute display names, disambiguating multiple inventories for the same
    # (language, dialect, source) either by language name or by index.
    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname, dname) + ' [%s]' % source
        else:
            # use the per-inventory language name only if names are unique
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid, lname) in enumerate(sorted(invids, key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1, )
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(
                    glname, dname) + '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes,
            lambda p: (p['InventoryID'], p['LanguageName'], p[
                'SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety, gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            # Dialects without coordinates inherit them from an ancestor language.
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory, invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    # WARNING: everything below this return is DEAD CODE (see FIXME).
    return
    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset, ord=i + 1,
                          contributor=common.Contributor(id=spec[0], name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), delimiter='\t', namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)
        # NOTE(review): `lnames` is not defined in this function — presumably
        # a module-level dict in the old version of this script.
        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname
        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    # pull in Glottolog families instead? or in addition?
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                # NOTE(review): `genera` and `geocoords` are likewise undefined
                # here — globals in the old script.
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)
                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]
            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(
                        source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib, contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            # Segment id is derived from the Unicode names of its codepoints.
            # NOTE(review): `md5(description)` requires bytes on Python 3 —
            # this dead code predates the py3 port; would need .encode().
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref], valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    # First row of the features file is the header; subsequent rows are
    # per-segment feature vectors, stored as Parameter_data key/value pairs.
    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!

    DBSession.flush()
def main(args):
    """Populate the Papuan Voices CLLD database from the CLDF dataset.

    Loads dataset metadata, a single editor, varieties, sources, concepts,
    audio-linked counterparts and references, then attaches Glottolog
    family information.

    :param args: CLI args carrying ``cldf`` (the CLDF dataset) and
        ``glottolog`` (path to a Glottolog repository clone).
    """
    # FIX: renamed local `license` -> `lic`; the old name shadowed the
    # `license` builtin. Behavior is unchanged.
    lic = licenses.find(args.cldf.properties['dc:license'])
    assert lic and lic.id.startswith('CC-')
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=lic.url,
        jsondata={
            # e.g. "CC-BY-4.0" -> "cc-by.png" (version suffix dropped)
            'license_icon': '{}.png'.format('-'.join(
                [p.lower() for p in lic.id.split('-')[:-1]])),
            'license_name': lic.name
        },
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)
    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    refs = collections.defaultdict(list)
    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    # Map form ids to their audio files (if any).
    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        # One ValueSet per (variety, concept), created lazily.
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        # Collect references per (valueset, source); attached in bulk below.
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def setUp(self):
    """Create a fixture database covering most `common` model types.

    Builds one fully-linked cluster (dataset, contribution with four
    contributors, a language with identifier, parameters with and without
    domain, values, units, a sentence) plus 100 extra languages with
    ISO identifiers for pagination-style tests.
    """
    TestWithDb.setUp(self)

    DBSession.add(
        common.Dataset(id='dataset', name='dataset', description='desc', domain='clld'))

    source = common.Source(id='source')
    # Replace the name strings with actual Contributor instances in place.
    contributors = {
        'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name'
    }
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name)

    contribution = common.Contribution(id='contribution', name='Contribution')
    cr = common.ContributionReference(contribution=contribution, source=source)
    # Mix of primary and secondary contributors; `assert` just guards that
    # construction succeeded (the association objects attach themselves).
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(contribution=contribution,
                                          primary=False,
                                          contributor=contributors['b'])
    assert common.ContributionContributor(contribution=contribution,
                                          primary=True,
                                          contributor=contributors['c'])
    assert common.ContributionContributor(contribution=contribution,
                                          primary=False,
                                          contributor=contributors['d'])
    DBSession.add(contribution)

    language = common.Language(
        id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)
    identifier = common.Identifier(type='iso639-3', id='iso')
    li = common.LanguageIdentifier(language=language, identifier=identifier)

    # 100 additional languages (l2..l101), each with its own ISO identifier.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        _li = common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    # Parameter with a two-element domain and one value using it.
    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    # Parameter without a domain, with a referenced valueset.
    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    vr = common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    # Unit-level data: one unit with values for parameters with and without
    # a unit domain.
    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(
        common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(
            id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    # Interlinear glossed sentence with a reference.
    sentence = common.Sentence(
        id='sentence',
        name='sentence name',
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=language)
    sr = common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))
    DBSession.flush()
def main(args):
    """Load the LSI CLDF dataset into the clld database.

    Creates the Dataset/Editors/Contribution scaffolding, then imports
    languages, sources, concepts and forms from ``args.cldf``, records
    per-language segment inventories (mapped through CLTS BIPA), attaches
    ValueSet references, and finally assigns Glottolog families.

    :param args: CLI argument object; ``args.cldf`` is the CLDF dataset and
        ``args.glottolog`` the path to a Glottolog repository clone (required).
    """
    assert args.glottolog, 'The --glottolog option is required!'
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name='The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    for lang in iteritems(
            args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    # (valueset-key, source-id) -> list of page specs, attached after the form loop.
    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )
    # language id -> set of segment strings seen in any of its forms.
    inventories = collections.defaultdict(set)
    for form in iteritems(
            args.cldf, 'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source'):
        # FIX: mutate the defaultdict's set in place instead of rebinding with
        # `x = x.union(...)`, which allocated a fresh set per form and made
        # defaultdict(set) pointless. Behavior is identical, one copy cheaper.
        inventories[form['languageReference']].update(form['Segments'])
        # ValueSets are shared by all forms for the same (language, concept) pair.
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            # '+' marks morpheme boundaries in Segments; shown as a space.
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )
    # Store each variety's inventory (BIPA symbol, sound name) as jsondata;
    # segments CLTS cannot name are silently dropped.
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def main(args):
    """Load the Polyglotta Africana CLDF dataset into the clld database.

    Creates the Dataset and the single CLDF Contribution, imports languages,
    bibliography, concepts and forms, attaches ValueSet references, and
    finally assigns Glottolog families to the varieties.

    :param args: CLI argument object; ``args.cldf`` is the CLDF dataset and
        ``args.glottolog`` the path to a Glottolog repository clone (required).
    """
    assert args.glottolog, 'The --glottolog option is required!'
    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        # NOTE(review): domain is empty and no `name` is set — looks like an
        # unfinished stub compared to the sibling loaders; confirm intent.
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    for lang in iteritems(
            args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    # (valueset-key, source-id) -> list of page specs, attached after the form loop.
    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )
    for form in iteritems(
            args.cldf, 'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source'):
        # ValueSets are shared by all forms for the same (language, concept) pair.
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def prime(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    Steps performed:
    1. rebuild the tree closure table;
    2. propagate macroarea assignments up the languoid tree;
    3. recompute page counts for sources flagged with ``pages_int < 0``;
    4. record languoids new to this Glottolog version in the legacy mapping;
    5. add ValueSetReferences for sub-classification refs where the
       classification comment changed.
    """
    recreate_treeclosure()
    # Assign to every languoid the union of macroareas of its descendants,
    # but only for languoids that have no macroarea assigned yet.
    for lpk, mas in DBSession.execute("""\
select
  l.pk, array_agg(distinct lma.macroarea_pk)
from
  language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where
  l.pk = t.parent_pk and t.child_pk = lma.languoid_pk
  and lma.macroarea_pk = ma.pk
  and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))
    # Recompute page info for sources whose pages_int is a negative sentinel;
    # materialize the result list first since we issue updates while iterating.
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))
    # Languoids not yet in the glottocode->version mapping are new in this
    # release: flag them and record the version (args.args[0]) in the JSON file.
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        # Normalize a whitespace-separated ref-spec string into a set of
        # canonical tokens: page suffixes after '**:' and trailing commas are
        # stripped so differing page info does not count as a change.
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    # Lookup tables: ref provider id -> ref pk, valueset id -> valueset pk.
    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1]
        for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                # Only add references when the 'subrefs' set actually differs
                # from the previous 'sub' set in the languoid config.
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    # 'sc-<glottocode>' is the subclassification valueset id.
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        # NOTE(review): refs.get() may return None for unknown
                        # keys, yielding a reference with source_pk=None — confirm
                        # that is intended.
                        spk = refs.get(ref.key)
                        DBSession.add(
                            common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
def populate_test_db(engine):
    """Create a canonical set of test fixtures in the database behind ``engine``.

    Stamps the alembic version, then adds one instance of each core clld
    model (Dataset, Contributor, Source, Contribution, Language with all
    identifier types, Parameter with domain, ValueSets/Values, Units,
    Sentence, Config) plus 100 filler languages, and flushes the session.
    """
    set_alembic_version(engine, '58559d4eea0d')
    data = TestData()
    data.add_default(
        common.Dataset,
        domain='clld',
        jsondata={'license_icon': 'cc-by', 'license_url': 'http://example.org'})
    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name',
    }.items():
        data.add(common.Contributor, id_, id=id_, name=name, url='http://example.org')
    DBSession.add(
        common.Editor(dataset=data[common.Dataset], contributor=data[common.Contributor]))
    data.add_default(common.Source)
    # A deactivated source pointing at its replacement via jsondata.
    data.add(
        common.Source, 'replaced',
        id='replaced', active=False, jsondata={'__replacement_id__': 'source'})
    data.add_default(common.Contribution)
    common.ContributionReference(
        contribution=data[common.Contribution], source=data[common.Source])
    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'), (False, 'd')]:
        common.ContributionContributor(
            contribution=data[common.Contribution],
            primary=primary,
            contributor=data['Contributor'][c])
    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])
    # One identifier of every known IdentifierType, plus a plain 'name' identifier.
    for i, type_ in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=type_.value,
                id=type_.value + str(i),
                name='abc' if type_.name == 'iso' else 'glot1234'))
    common.LanguageIdentifier(
        language=data[common.Language],
        identifier=common.Identifier(type='name', id='name', name='a'))
    # 100 extra languages (l2..l101), presumably to exercise paging — TODO confirm.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc')
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)
    # Parameter with a two-element domain and two values selecting de/de2.
    param = data.add_default(common.Parameter)
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = data.add_default(
        common.ValueSet,
        language=data[common.Language],
        parameter=param,
        contribution=data[common.Contribution])
    common.ValueSetReference(
        valueset=valueset, source=data[common.Source], description='10-20')
    data.add_default(
        common.Value,
        domainelement=de, valueset=valueset, frequency=50, confidence='high')
    data.add(
        common.Value, 'value2',
        id='value2', domainelement=de2, valueset=valueset, frequency=50,
        confidence='high')
    # Second parameter without a domain; its value has no domainelement.
    paramnd = data.add(
        common.Parameter, 'no-domain', id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2',
        language=data[common.Language],
        parameter=paramnd,
        contribution=data[common.Contribution])
    common.ValueSetReference(
        valueset=valueset, source=data[common.Source], description='10-20')
    common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    # Units: a plain UnitParameter and one with a UnitDomainElement.
    unit = data.add_default(common.Unit, language=data[common.Language])
    up = data.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up)
    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(
            id='uv2', name='UnitValue2', unit=unit, unitparameter=up2,
            unitdomainelement=de))
    DBSession.add(common.Source(id='s'))
    # Fully glossed sentence (IGT: analyzed/gloss are tab-separated and aligned).
    sentence = data.add_default(
        common.Sentence,
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=data[common.Language],
        jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])
    DBSession.add(common.Config(key='key', value='value'))
    # Redirect configuration: 'replaced' points to 'language', 'gone' to nothing.
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    # Flush (not commit) so callers can read back assigned primary keys.
    DBSession.flush()
def main(args):
    """Load the TuLaR (Tupían Language Resources) CLDF data into the clld db.

    Resolves the latest OAI record per constituent database, creates the
    Dataset/Editors, imports contributors from each database's
    CONTRIBUTORS.md, loads languages, examples (TuDeT), concepts, forms and
    cognate sets (TuLeD), and finally the phoneme inventories.

    :param args: CLI argument object; ``args.cldf`` is the TuLeD CLDF dataset.
    """
    # Pick the newest record (highest version, sorted descending) for each
    # repository listed in DATASETS.
    for (org, repos), recs in itertools.groupby(
            sorted(
                oai.Records('tular'),
                key=lambda r: (r.repos.org, r.repos.repos, r.version),
                reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            DATASETS[repos] = next(recs)
    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        description="Tupían Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon': 'cc-by-sa.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 4.0 International License'},
    )
    # Project root defaults to three levels above this package; interactive override.
    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')
    # One Database contribution per constituent dataset, with its contributors.
    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database, db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        # First markdown table in CONTRIBUTORS.md lists the contributors.
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor, cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))
    for i, cid in enumerate(['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid], dataset=dataset, ord=i))

    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    subgroups = []
    for row in args.cldf['LanguageTable']:
        # Collect subgroups in first-seen order (list used as an ordered set).
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(
                Family, row['Family'], id=slug(row['Family']), name=row['Family'])
        data.add(
            Doculect, row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )
    # TuDeT examples: dedupe on ID since the table may contain duplicates.
    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' / 'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(
                id=row['ID'],
                name=row['Primary_Text'],
                description=row['Translated_Text'],
                language=data['Doculect'][row['Language_ID']],
                conllu=row['conllu']))

    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept, row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    # One ValueSet per (language, concept) pair; groupby requires the rows to
    # be pre-sorted by the same key.
    for (lid, pid), rows in itertools.groupby(
            sorted(
                args.cldf.iter_rows('FormTable', 'languageReference', 'parameterReference'),
                key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet, vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word, row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join(
                    [k for k in row['PartialCognates']]) if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])
        for ref in refs:
            if ref in source_ids:
                # NOTE(review): membership is tested against raw `ref` but the
                # pk lookup uses slug(ref, lowercase=False) — presumably
                # add_sources() yields un-slugged ids; confirm the asymmetry.
                DBSession.add(
                    common.ValueSetReference(
                        valueset=vs,
                        source_pk=sources[slug(ref, lowercase=False)]))
    load_inventories(args.cldf, clts, data['Doculect'])

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset, row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate, row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )
def setUp(self):
    """Populate the test database (variant fixture with replacements).

    Like the basic fixture but additionally creates a deactivated 'replaced'
    Source, identifiers for every IdentifierType, a second domain Value, and
    Config-based redirects ('replaced' -> 'language', 'gone' -> removed).
    """
    TestWithDb.setUp(self)
    DBSession.add(
        common.Dataset(
            id='dataset', name='dataset', description='desc', domain='clld',
            jsondata={'license_icon': 'cc-by'}))
    # A deactivated source pointing at its replacement via jsondata.
    DBSession.add(
        common.Source(
            id='replaced', active=False, jsondata={'__replacement_id__': 'source'}))
    source = common.Source(id='source')
    # id -> display name; the loop below replaces each value with the ORM object.
    contributors = {
        'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name'}
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(
            id=id_, name=name, url='http://example.org')
    contribution = common.Contribution(id='contribution', name='Contribution')
    common.ContributionReference(contribution=contribution, source=source)
    # Association objects attach themselves to `contribution` on construction;
    # the asserts only guard against a constructor unexpectedly returning None.
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['b'])
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['c'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['d'])
    DBSession.add(contribution)
    language = common.Language(
        id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)
    # One identifier of every known IdentifierType for the main language.
    for i, type_ in enumerate(common.IdentifierType):
        id_ = common.Identifier(type=type_.value, id=type_.value + str(i), name='abc')
        common.LanguageIdentifier(language=language, identifier=id_)
    # 100 extra languages (l2..l101), presumably to exercise paging — TODO confirm.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)
    # Parameter with a two-element domain and a value for each domain element.
    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)
    value2 = common.Value(
        id='value2', domainelement=de2, valueset=valueset, frequency=50,
        confidence='high')
    DBSession.add(value2)
    # Second parameter without a domain; its value has no domainelement.
    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)
    # Units: a plain UnitParameter and one with a UnitDomainElement.
    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(
        common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))
    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(
            id='uv2', name='UnitValue2', unit=unit, unitparameter=up2,
            unitdomainelement=de))
    DBSession.add(common.Source(id='s'))
    # Fully glossed sentence (IGT: analyzed/gloss are tab-separated and aligned).
    sentence = common.Sentence(
        id='sentence',
        name='sentence name',
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=language,
        jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))
    # Redirect configuration: 'replaced' points to 'language', 'gone' to nothing.
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    # Flush (not commit) so tests can read back assigned primary keys.
    DBSession.flush()
def main(args):
    """Load the Mixe-Zoquean Voices CLDF dataset into the clld database.

    Creates the Dataset/Contribution/Editors, imports varieties (resolving
    Glottolog lineages to determine proto-language ancestors and subgroups),
    colors varieties by subgroup, loads sources and concepts (with
    proto-form reconstructions attached as jsondata), then loads forms with
    linked audio, and finally the ValueSet references.

    :param args: CLI argument object; ``args.cldf`` is the CLDF dataset and
        ``args.glottolog`` the path to a Glottolog repository clone (required).
    """
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            # e.g. 'CC-BY-4.0' -> 'cc-by.png' (version segment dropped).
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)
    # language id -> list of proto-language names it descends from.
    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            # Map Glottolog lineage nodes onto the proto-language varieties.
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            # Nizaviguiti is the single variety known to lack a glottocode.
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            # Second lineage entry (below the family) is the subgroup.
            subgroup=glang.lineage[1][0] if glang and len(glang.lineage) > 1 else None,
        )
    # Assign one qualitative color per subgroup, stored without the '#' prefix.
    colors = dict(
        zip(
            set(l.subgroup for l in data['Variety'].values()),
            qualitative_colors(
                len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    refs = collections.defaultdict(list)
    # Store proto-forms for later lookup: proto-language id -> concept id -> forms.
    proto_forms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][
                form['parameterReference']].append(form['form'])
    for param in args.cldf.iter_rows(
            'ParameterTable', 'id', 'concepticonReference', 'name'):
        # Attach all known proto-language reconstructions for this concept.
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )
    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable',
            'id', 'form', 'languageReference', 'parameterReference', 'source'):
        # '►' marks audio-only entries, which must have a linked audio file.
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        # ValueSets are shared by all forms for the same (language, concept) pair.
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        # Reconstructions restricted to this variety's actual ancestors.
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License' }) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict( (row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} 
for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add(common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [ float(c.strip()) for c in row['map_coordinates'].split(',') ] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in 
enumerate(infobox[lect.id]): DBSession.add( common.Language_data(object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get( 'Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup(row[ 'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'] ) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add(common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add(common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add(common.Identifier, 
row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][ row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={ 'sort': row['Order_number'], 'alt_translation': (row['Translation_other'] or '').strip() or None }, language=lang) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'])) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read(args, 'Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add( common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # 
global counter for features - across feature types # feature_count = 0 for row in read(args, 'Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None desc = row['Feature_annotation_publication'] if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add(models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=desc, markup_description=normalize_markup( row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] \ or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) assert de if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append( models.FeatureAuthor(ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict( zip(primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read(args, 'Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 
't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue number_map[ row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\ = str(feature_count) p = data.add(models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if truth(row['Vowel']) else ('Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add(common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add(models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, description=row['Sociolinguistic_feature_annotation'], area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 10): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row.get('Value%s' % i) and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: continue kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) data.add(common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={ 'color': 
colors.get(row['Value%s_colour_ID' % i], colors.values()[i]) }) sd = {} for row in read(args, 'Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement'][ '%s-%s' % (number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = add_sentence(args, data, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add( common.ValueSetReference(valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[ row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read(args, 'wals'): if row['z_calc_WALS_value_number']: wals_value_number[ row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]: num_values = 10 for row in read(args, 
prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print('no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)]) continue lid = row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup( row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i) if iid not in data['DomainElement']: print(iid, row[prefix('data_record_id', _prefix)], '--> no domainelement!') continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), domainelement=data['DomainElement']['%s-%s' % (row[prefix( 'feature_code', _prefix)], i)], 
confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = { 'wals_value_number': wals_value_number.pop(row[prefix( 'data_record_id', _prefix)]) } valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][ row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for corresponding primary features! # if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int( parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add( common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add( common.ValueSetReference(valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(args, prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] \ or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.ValueSetReference( valueset=vs, 
source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: continue DBSession.flush() missing = 0 for row in read(args, 'Value_examples'): try: DBSession.add( common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence'][ '%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read(args, 'Contributors')): kw = dict(contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']]) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
def issue20(session, timestamp):  # pragma: no cover
    """Apply the batch of curational datapoint fixes tracked as issue 20.

    Moves, copies or deletes WALS datapoints (ValueSets), and updates ISO
    codes / glottocodes / genealogical classification, via the helpers
    ``vs_switch_lang``, ``vs_copy_lang``, ``vs_delete``, ``update_iso``,
    ``update_glottocode`` and ``update_classification`` defined elsewhere
    in this module.

    :param session: database session all changes are made on.
    :param timestamp: passed through to the helpers, recorded as \
    created/updated time on touched objects.
    """
    # Datapoint http://wals.info/datapoint/121A/wals_code_bej should be changed to be
    # about Kemant (wals_code_kem). The same applies to the Rossini source for that
    # datapoint. (This is the only datapoint for this source.)
    vs_switch_lang(session, timestamp, '121A-bej', 'kem')

    # Eastern Ojibwa (wals_code_oji) should link to two ISO codes, ojg (as it is now)
    # but also otw.
    update_iso(session, timestamp, 'oji', otw='Ottawa')

    # There should be two ISO codes for Campa (Axininca) (wals_code_cax): cni and cpc
    update_iso(session, timestamp, 'cax', cpc='Ajyíninka Apurucayali')

    # All of the datapoints for Fula (Nigerian) (wals_code_fni) based on Arnott (1970)
    # need to be moved to Fula (Cameroonian) (wals_code_fua). In some cases, this
    # involves merging these datapoints with existing datapoints for wals_code_fua.
    source = common.Source.get('Arnott-1970', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'fni':
            vs_switch_lang(session, timestamp, vs, 'fua')

    # The one datapoint for Fulani (Gombe) fgo needs to be moved to Fula (Cameroonian)
    # (wals_code_fua), thus removing Fulani (Gombe) as a language.
    vs_switch_lang(session, timestamp, '27A-fgo', 'fua')

    # Tlapanec (wals_code_tlp) should link to ISO code tcf rather than tpx.
    update_iso(session, timestamp, 'tlp', 'tpx', tcf="Malinaltepec Me'phaa")

    # Kongo (wals_code_kon) should link to two ISO codes, kwy and kng.
    update_iso(session, timestamp, 'kon', kwy=None)

    # One of the sources for Vili (wals_code_vif), namely Carrie (1890) turns out not
    # to be a source for Vili but another source for Kongo (wals_code_kon).
    # This means:
    # - the page numbers given for Vili for 81A and 82A should be added to the
    #   corresponding datapoints for Kongo
    # - the value and source given for Vili for 91A should be transferred to Congo
    #   (which currently does not have a value for that feature)
    # - all the datapoints for Vili for which Carrie was the source should be removed
    # - the values given for Vili for which Carrie was the source for the features
    #   associated with chapters 112, 143, and 144 are NOT being transferred to Kongo
    #   since they are inconsistent with the existing values for these features for
    #   Kongo
    source = common.Source.get('Carrie-1890', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'vif':
            if vs.parameter.id in ['81A', '82A', '91A']:
                vs_switch_lang(session, timestamp, vs, 'kon')
            else:
                vs_delete(session, timestamp, vs)

    # One of the sources for Chichewa (wals_code_cic), namely Mateene 1980, turns out
    # to be a source for Nyanga (wals_code_nng). What this entails is
    # - the values listed for Chichewa for features 81A, 82A, 83A, 86A, 87A, and 88A,
    #   need to be added to Nyanga
    # - Mateene 1980 should be added as a source for Nyanga
    # - the references to Mateene as a source for datapoints for Chichewa need to be
    #   removed
    # - there is one datapoint for Chichewa were Mateene is listed as the only source,
    #   namely for 83A, but this is an error: the source for this datapoint should be
    #   Price 1966: passim; Mchombo 2004: 19 (the sources listed for 81A)
    source = common.Source.get('Mateene-1980', session=session)
    for vsr in source.valuesetreferences:
        vs = vsr.valueset
        if vs.language.id == 'cic':
            if vs.parameter.id in ['81A', '82A', '83A', '86A', '87A', '88A']:
                vs_copy_lang(session, timestamp, vs, 'nng')
            else:
                vs_delete(session, timestamp, vs)
            # Drop the Mateene reference from the Chichewa datapoint in any case.
            session.delete(vsr)
            if vs.parameter.id == '83A':
                # Replace the erroneous Mateene-only sourcing for 83A with the
                # sources listed for 81A.
                session.add(
                    common.ValueSetReference(
                        valueset=vs,
                        source=common.Source.get('Price-1966', session=session),
                        description='passim'))
                session.add(
                    common.ValueSetReference(
                        valueset=vs,
                        source=common.Source.get('Mchombo-2004', session=session),
                        description='19'))

    # [gby] should be removed as an ISO code for Gwari (wals_code_gwa); the only one
    # should be [gbr]
    update_iso(session, timestamp, 'gwa', 'gby', gbr=None)

    # The ISO code for Grebo (wals_code_grb) should be corrected to [grj].
    update_iso(session, timestamp, 'grb', 'gry', grj="Southern Grebo")

    # The only ISO code for Lega is [lea]; please remove the second one.
    update_iso(session, timestamp, 'leg', 'lgm')

    # The sources for Ngbaka (wals_code_ngb) are actually for two different, only
    # distantly related languages. GrandEury is the source for Ngbaka (Minagende),
    # which has the same ISO code [nga] and location we are currently using for
    # Ngbaka, so we should keep the WALS code for that Ngbaka (but should change the
    # name to Ngbaka (Minagende)). Thomas (1963) is a source for what will be a new
    # WALS language, Ngbaka (Ma'bo). Its ISO code is [nbm]. We could use the same
    # code nbm as the WALS code. It belongs to the Ubangi genus, as Ngbaka
    # (Minagende) does in the current WALS classification, but see below where
    # Ngbaka (Minagende) is being moved out of Ubangi into a new genus. I would use
    # the Glottolog location for it, but I can't find that in the new Glottolog. It
    # is also in the Democratic Republic of the Congo.
    #
    # This means that all the datapoints in the current WALS that use Thomas 1963 as
    # a source for Ngbaka need to be moved or copied to the new Ngbaka (Ma'bo). Those
    # datapoints in the current Ngbaka that only use Thomas as a source will need to
    # be removed (since that language is the new Ngbaka (Minagende)). Those
    # datapoints that use both sources in the current WALS will now become two
    # datapoints, one for each of these two languages.
    nbm = models.WalsLanguage(
        id='nbm',
        name="Ngbaka (Ma'bo)",
        ascii_name=slug("Ngbaka (Ma'bo)"),
        latitude=3.56,
        longitude=18.36,
        genus=models.Genus.get('ubangi', session=session))
    nbm.countries.append(models.Country.get('CD', session=session))
    session.add(nbm)
    update_iso(session, timestamp, nbm, nbm="Ngbaka Ma'bo")
    update_glottocode(session, timestamp, nbm, 'ngba1284')
    ngb = common.Language.get('ngb', session=session)
    ngb.name = 'Ngbaka (Minagende)'
    ngb.ascii_name = slug(ngb.name)
    for vs in ngb.valuesets:
        if 'Thomas-1963' in [ref.source.id for ref in vs.references]:
            # Copy when Thomas is one of several sources, move when it is the only
            # one.
            if len(vs.references) > 1:
                vs_copy_lang(session, timestamp, vs, nbm)
            else:
                vs_switch_lang(session, timestamp, vs, nbm)

    # The ISO code for Sisaala (wals_code_ssa) needs to be changed from [ssl] to
    # [sld].
    update_iso(session, timestamp, 'ssa', 'ssl', sld='Sissala')

    # The ISO code for Makua (wals_code_mua) should be changed to [mgh] and [xsq].
    update_iso(session, timestamp, 'mua', 'vmw',
               mgh='Makhuwa-Meetto', xsq='Makhuwa-Saka')

    # A change to the genealogical classification: Four languages need to be taken
    # out of the Ubangi genus and put into a new genus within Niger-Congo called
    # Gbaya-Manza-Ngbaka: (first below is WALS code, last is ISO code):
    #
    # gbb Gbeya Bossangoa gbp
    # gbk Gbaya Kara gya
    # mdo Mbodomo gmm
    # ngb Ngbaka nga
    #
    update_classification(
        session, timestamp, ['gbb', 'gbk', 'mdo', 'ngb'], 'gbayamanzangbaka',
        genus_name='Gbaya-Manza-Ngbaka', family_id='nigercongo')
def main(args):
    """Populate the PHOIBLE Online database from the aggregated TSV dumps.

    Reads ``phoible-aggregated.tsv``, ``phoible-phonemes.tsv`` and
    ``phoible-segments-features.tsv`` (via ``args.data_file``) plus several
    auxiliary row sources (``get_rows``), and creates the Dataset,
    Varieties, Inventories (contributions), Segments (parameters) and
    phoneme Values in the database session.

    :param args: CLI/environment object providing ``data_file`` lookup \
    (and whatever ``get_rows``/``add_sources`` expect from it).
    """
    # determine if we run on a machine where other databases are available for
    # lookup locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        # Pull glottocode/name/coordinate info from a local glottolog3 database.
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    # Map inventory id -> list of bibtex keys; an explicit "NO SOURCE GIVEN"
    # row yields an empty list.
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))

    # Disambiguate inventory names: a language with multiple inventories from
    # the same source gets a running number appended.
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)
        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname
        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (
                    lname, i + 1, key[1])

    # Manual overrides mapping (genus, family-root code) -> family name.
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)
                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            # NOTE(review): if neither branch above assigns, `coords` is either
            # unbound (NameError) or stale from a previous row — presumably the
            # data guarantees coordinates for every new language; confirm.
            lang = data.add(models.Variety, row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor, row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory, row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)
        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' % (contrib.id,
                                                                  j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            # Describe the phoneme by the Unicode names of its codepoints; the
            # equivalence class strips combining/modifier characters.
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' % (row.Phoneme,
                                data['Inventory'][row.InventoryID].name),
                valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    # First row of the feature TSV is the header; non-'0' cells become
    # Parameter_data entries on the segment.
    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
def main(args):
    """Populate the Jambu database from its CLDF dataset.

    Loads languages, sources, cognate sets, concepts and forms from
    ``args.cldf`` and creates the corresponding clld objects.

    :param args: CLI object exposing ``cldf`` (a pycldf dataset) and \
    ``glottolog``; the latter is only asserted to be present.
    """
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(
                          id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name',
                          'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        # print(cognate)
        data.add(models.Cognate_,
                 cognate['Cognateset_ID'],
                 name=cognate['Form'],
                 language=cognate['Language_ID'],
                 description=cognate['Description'])

    # First pass over the forms: count distinct languages per concept, so the
    # Concept objects can be created with their counts below.
    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(
            iteritems(args.cldf, 'ParameterTable', 'ID', 'Name',
                      'Concepticon_ID', 'Description')):
        data.add(models.Concept,
                 param['ID'],
                 id=param['ID'],
                 name='{} [{}]'.format(param['Name'], param['ID']),
                 description=param['Description'],
                 count=len(counts[param['ID']]))

    # Second pass: create ValueSets and Lexemes. A form may reference several
    # concepts (separated by ';' or '+'); such forms get one Lexeme per
    # concept, with '-<i>' appended to the id.
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?':
                continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )
            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)
            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] + '/'
                if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
def main(args):  # pragma: no cover
    """Populate the Vanuatu Voices database from its CLDF dataset.

    Creates the Dataset, contributors/editors, per-language Contributions,
    Varieties, Concepts, ValueSets and Counterparts (with audio), plus a
    CLTS-normalized phoneme inventory stored on each Variety's jsondata.

    :param args: CLI object exposing ``cldf`` (a pycldf dataset).

    Note: prompts interactively for the path to a cldf-clts/clts checkout.
    """
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    clts = CLTS(
        input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name
        },
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(
            vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=ord,
                          dataset=ds,
                          contributor=data['Contributor'][cid]))

    # Map language id -> {contributor id -> [roles]}, parsed from the
    # ' and '-separated name lists in contributions.csv.
    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    # One Contribution (wordlist) per language, linked to its Variety.
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    # Accumulate per-language segment counts while creating the lexical data.
    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(Counterpart,
                 form['id'],
                 id=form['id'],
                 name=form['form'],
                 valueset=vs,
                 audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))

    # Resolve each language's observed segments against CLTS/BIPA and store
    # the named sounds on the Variety.
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv
                       if getattr(c, 'name', None)])
def main(args):  # pragma: no cover
    """Populate the PHOIBLE 2.0 clld database from the CLDF StructureDataset.

    Reads the dataset at ``DS`` plus the extra BibTeX in ``BIB``, then creates
    (in order): sources, contributors, the Dataset record, editors, varieties,
    family groupings with color codes, segments, legacy-ID redirects,
    inventories (contributions) and finally the phoneme values.  Order matters:
    later sections look up objects registered in ``data`` by earlier ones.
    """
    ds = StructureDataset.from_metadata(DS)
    data = Data()

    # Sources: first those shipped with the CLDF dataset, ...
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))
    # ... then extra records parsed out of the module-level BIB string.
    # Splitting on '@' loses the marker, so it is prepended again per record.
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        # CLDF-shipped sources win on ID collisions.
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # Contributors, each linked to the sources they are responsible for.
    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    # The singleton Dataset record describing the web app itself.
    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    # Editors, ordered; reuse the Contributor created above when the ID matches.
    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    # Glottolog clone expected as a sibling checkout of this repository
    # (path relative to the installed phoible package).
    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    # Only 8-character IDs are Glottocodes eligible for family assignment.
    load_families(data, [(l.id, l) for l in data['Variety'].values()
                         if len(l.id) == 8], glottolog.repos)
    DBSession.flush()

    # assign color codes: one qualitative color per family, biggest family first.
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)
    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(), key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    # Segments: the equivalence class strips combining/modifier characters
    # from the segment name (judged by the Unicode character name).
    for segment in ds['ParameterTable']:
        # NOTE(review): the trailing comma below makes equivalence_class a
        # 1-tuple, not a string — looks unintended; confirm against the
        # Segment.equivalence_class column type before changing.
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ]),
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent / 'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                # Candidate redirect targets: descendants of the old ISO
                # languoid that are actually in PHOIBLE.
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid), languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    # NOTE(review): this ambiguity warning fires only when
                    # more than two candidates existed (the check runs after
                    # pop()); presumably '> 0' was meant — confirm.
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                # Redirect to the new segment if it still exists, else a 410.
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    # Remaining ParameterTable columns become key/value Parameter_data rows.
    # NOTE: ord=i is the index over *all* sorted columns, including skipped
    # ones, so ordinals are not contiguous.
    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

    # Inventories (contributions), linked to contributor and sources.
    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

    # Phoneme values: one ValueSet + Phoneme per ValueTable row.
    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        # The first phoneme seen for an inventory fixes its language.
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])
        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))
        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' % (
                    phoneme['Value'],
                    data['Inventory'][phoneme['Contribution_ID']].name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))
    return
def main(args):  # pragma: no cover
    """Populate the Hindu Kush Areal Typology clld database.

    Discovers all CLDF datasets under ``args.cldf.directory`` (keyed by CLDF
    module name), creates the Dataset/Editor records and sources, then loads
    languages, parameters and values per dataset: Wordlist modules contribute
    lexical forms (with optional audio), StructureDataset modules contribute
    coded typological values.  References are collected per (valueset, source)
    pair and flushed at the end; family assignment comes from Glottolog.
    """
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    # NOTE: keyed by module, so only the last dataset per module is kept.
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    # Singleton Dataset record for the web app.
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        },
    )
    # Editors in fixed order; contributor IDs are slugs of the last name.
    for i, name in enumerate(
            ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(dataset=thedataset,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last), name=name))

    # Sources: website bibliography first, then the CLDF bibliography.
    for rec in bibtex.Database.from_file(
            pathlib.Path(__file__).parent / 'HK_website.bib', lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # Accumulates page citations per (valueset-id, source-id); turned into
    # ValueSetReference rows after all datasets are processed.
    refs = collections.defaultdict(list)

    # Deterministic processing order: sort by module name.
    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        # Languages are shared across datasets; first dataset seen wins.
        for lang in ds.iter_rows('LanguageTable', 'id', 'glottocode', 'name',
                                 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )
        # One Contribution per CLDF dataset/module.
        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows('ParameterTable', 'id',
                                      'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    # Zero-pad numeral concept IDs so they sort numerically.
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals') else
                    'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )
            # Only mp3 media are considered as form audio.
            audio = {
                r['ID']: r
                for r in ds.iter_rows('media.csv')
                if r['mimetype'] == 'audio/mpeg'
            }
            for form in ds.iter_rows('FormTable', 'id', 'form',
                                     'languageReference', 'parameterReference',
                                     'source'):
                # One ValueSet per (language, parameter) pair, created lazily.
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                # First referenced audio file that is a known mp3, if any.
                mp3 = next(
                    iter([
                        audio[aid] for aid in form['Audio_Files'] if aid in audio
                    ]), None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(
                        audio=ds.get_row_url('media.csv', mp3) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name',
                                      'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description'])
                    if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows('CodeTable', 'id', 'name', 'description',
                                     'parameterReference'):
                data.add(
                    common.DomainElement,
                    code['id'],
                    id=code['id'],
                    name=code['name'],
                    description=code['description'],
                    parameter=data['Param'][code['parameterReference']],
                    # Map color by code description; unknown descriptions
                    # get no color (dict.get returns None).
                    jsondata={
                        'color': {
                            'absent': 'ff0000',
                            'present': '0000ff',
                            'indeterminate': 'cccccc',
                        }.get(code['description'])
                    })
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows('ValueTable', 'id', 'value',
                                     'languageReference', 'parameterReference',
                                     'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    # Flush accumulated references; pages joined with '; ', empties dropped.
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    # Family assignment via Glottolog; a throwaway Data() is passed because
    # only the (glottocode, language) pairs matter here.
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )