def add_doculects(lang_dataset, session, sources=None):
    """Create and add to *session* the Doculect instances harvested from
    *lang_dataset*.

    Returns a dict of the added model instances with the respective ISO
    codes being the keys.

    The optional *sources* arg should contain common.Source instances with
    the keys being strings starting with the ISO code of the language that
    the source is for.

    Helper for the main function.
    """
    # NOTE(review): default changed from a mutable ``{}`` to ``None`` —
    # mutable defaults are shared across calls.
    if sources is None:
        sources = {}
    d = {}
    for lang in lang_dataset.gen_langs():
        # Skip languages with any missing required attribute.
        if not (lang.name and lang.subfamily and lang.iso_code
                and lang.glotto_code and lang.longitude and lang.latitude
                and lang.id):
            print(f'SKIP: Missing data for {lang.name}.', file=sys.stderr)
            continue
        # BUGFIX: key by ISO code, as the docstring promises and as the
        # ``key[:3] in d`` lookup below requires — the original keyed by
        # ``lang.name``, so source keys (which start with the ISO code)
        # could essentially never match.
        d[lang.iso_code] = Doculect(
            id=lang.id,
            name=lang.name,
            subfamily=lang.subfamily,
            iso_code=lang.iso_code,
            glotto_code=lang.glotto_code,
            longitude=lang.longitude,
            latitude=lang.latitude)
        session.add(d[lang.iso_code])
    # Flush so the Doculect rows get primary keys before linking sources.
    session.flush()
    for key, source in sources.items():
        # The first three characters of a source key are the ISO 639-3 code.
        if key[:3] in d:
            session.add(common.LanguageSource(
                language_pk=d[key[:3]].pk, source_pk=source.pk))
    session.flush()
    return d
def compute_language_sources(*references):
    """Compute relations between languages and sources by going through the
    relevant models derived from the HasSource mixin.

    *references* are additional ``(model, attribute)`` pairs, where
    *attribute* names the relation on *model* that leads to an object with a
    ``language_pk``.  ValueSetReference and SentenceReference are always
    included.  Only (source, language) pairs not already present in the
    LanguageSource table are added to the session.
    """
    # Existing (source_pk, language_pk) pairs; a set replaces the original
    # dict-with-True-values idiom.
    old_sl = set()
    for pair in DBSession.query(common.LanguageSource):
        old_sl.add((pair.source_pk, pair.language_pk))

    references = list(references)
    references.extend([
        (common.ValueSetReference, 'valueset'),
        (common.SentenceReference, 'sentence')])

    # Pairs implied by the reference models.
    sl = set()
    for model, attr in references:
        for ref in DBSession.query(model):
            sl.add((ref.source_pk, getattr(ref, attr).language_pk))

    # Insert only the pairs that are not already recorded.
    for s, l in sl - old_sl:
        DBSession.add(common.LanguageSource(language_pk=l, source_pk=s))
def load_ref(data, entry, lgcodes, lgsources):
    """Create a models.Ref from a bibliography *entry* and add it to the session.

    :param data: dict-like registry holding already-created objects \
        (keys used here: 'Provider', 'Languoid', 'Doctype').
    :param entry: bibliography entry; field values are read from \
        ``entry.fields`` and precomputed attributes such as \
        ``publisher_and_address``, ``year_int``, ``weight``, ``med_type``.
    :param lgcodes: passed through to ``entry.languoids`` for computerized \
        languoid assignment.
    :param lgsources: maps a source key (as found in the ``srctrickle`` \
        field) to a list of languoid ids manually assigned to that source.
    :return: the created and flushed models.Ref instance.
    """
    kw = {'jsondata': {}, 'language_note': entry.fields.get('lgcode')}
    # Copy entry fields that map directly onto Source/Ref table columns;
    # everything else is stashed in jsondata.
    for col in common.Source.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in models.Ref.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in entry.fields:
        if col not in kw:
            kw['jsondata'][col] = entry.fields[col]
    # Unknown BibTeX entry types fall back to "misc".
    try:
        btype = EntryType.from_string(entry.type.lower())
    except ValueError:
        btype = EntryType.misc
    kw.update(
        publisher=entry.publisher_and_address[0],
        address=entry.publisher_and_address[1],
        year_int=entry.year_int,
        pages_int=entry.pages_int,
        # med_index is negated so that a smaller stored value means a
        # "better" document — presumably for ascending sorts; confirm
        # against the Ref model's ordering.
        med_index=-entry.weight[0],
        med_pages=entry.weight[1],
        med_type=entry.med_type.id,
        id=entry.fields['glottolog_ref_id'],
        # Full-text-search vector over all fields except the abstract.
        fts=fts.tsvector('\n'.join(
            v for k, v in entry.fields.items() if k != 'abstract')),
        name='{} {}'.format(
            entry.fields.get('author', 'na'), entry.fields.get('year', 'nd')),
        description=entry.fields.get('title') or entry.fields.get('booktitle'),
        bibtex_type=btype)
    ref = models.Ref(**kw)
    DBSession.add(ref)
    # Flush so ref.pk is available for the association objects below.
    DBSession.flush()
    reflangs, trigger = [], None
    # Providers for which computerized languoid assignment is suppressed.
    no_ca = [{'degruyter'}, {'benjamins'}]
    provs = set()
    # srctrickle holds comma-separated "provider#key" tokens; collect
    # manual languoid assignments (keyed on the full token) and record one
    # Refprovider per token.
    for key in entry.fields['srctrickle'].split(','):
        key = key.strip()
        if key:
            reflangs.extend(lgsources.get(key, []))
            prov, key = key.split('#', 1)
            provs.add(prov)
            DBSession.add(models.Refprovider(
                provider_pk=data['Provider'][prov].pk,
                ref_pk=ref.pk,
                id='{0}:{1}'.format(prov, key)))
    if not reflangs:
        # No manual assignments: fall back to computerized assignment.
        reflangs, trigger = entry.languoids(lgcodes)
        if trigger and ((provs in no_ca) or (reflangs)):
            # Discard computerized assigned languoids for bibs where this does not make sense,
            # or for bib entries that have been manually assigned in a Languoid's ini file.
            reflangs, trigger = [], None
    for lid in set(reflangs):
        DBSession.add(
            common.LanguageSource(
                language_pk=data['Languoid'][lid].pk,
                source_pk=ref.pk,
                # Computerized assignments (trigger set) are marked inactive.
                active=not bool(trigger)))
    if trigger:
        ref.ca_language_trigger = trigger
    # Document types, with the same suppression rule for no_ca providers.
    doctypes, trigger = entry.doctypes(data['Doctype'])
    if trigger is None or provs not in no_ca:
        for dt in set(doctypes):
            DBSession.add(models.Refdoctype(doctype_pk=dt.pk, ref_pk=ref.pk))
    if trigger:
        ref.ca_doctype_trigger = trigger
    return ref
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    Denormalizations performed: per-cognate-class clade counts/colors,
    lexeme counts, per-meaning cognate-class colors and counts, per-language
    counts, and LanguageSource rows derived from valueset references.
    """
    # clade_name -> color, ordered by clade_level0; clades without a
    # short_name are excluded.
    ordered_clade_colors = {k: v for k, v in DBSession.query(
        models.Clade.clade_name, models.Clade.color)
        .filter(models.Clade.short_name != '')
        .order_by(models.Clade.clade_level0).all()}

    # Group (CognateClass, clade_name) pairs per cognate class.  The query
    # is ordered by CognateClass.pk, as groupby requires sorted input.
    for _, cc in groupby(
            DBSession.query(models.CognateClass, models.Variety.clade_name)
            .join(clld_cognacy_plugin.models.Cognate,
                  and_(models.CognateClass.pk
                       == clld_cognacy_plugin.models.Cognate.cognateset_pk))
            .join(models.Value,
                  and_(clld_cognacy_plugin.models.Cognate.counterpart_pk
                       == models.Value.pk))
            .join(common.ValueSet,
                  and_(models.Value.valueset_pk == common.ValueSet.pk))
            .join(models.Variety,
                  and_(common.ValueSet.language_pk == models.Variety.pk))
            .distinct().order_by(models.CognateClass.pk),
            lambda c: c[0].pk):
        cc = sorted(list(cc))
        # cc[0][0] is the CognateClass instance shared by the whole group.
        cc[0][0].count_clades = len(cc)
        involved_clades = [c[1] for c in cc]
        # Build a color string aligned with ordered_clade_colors; '0' is the
        # placeholder for clades not involved in this cognate class.
        r = []
        for cl, col in ordered_clade_colors.items():
            if cl in involved_clades:
                r.append(col)
            else:
                r.append('0')
        cc[0][0].involved_clade_colors = ' '.join(r)
        cc[0][0].clades = ', '.join(involved_clades)

    # Lexeme count per cognate class.
    for c in DBSession.query(models.CognateClass,
                             func.count(models.CognateClass.id)) \
            .join(clld_cognacy_plugin.models.Cognate) \
            .group_by(models.CognateClass.pk, models.Cognateset.pk,
                      models.CognateClass.id):
        c[0].count_lexemes = c[1]

    # Assign qualitative colors to the cognate classes of each meaning.
    for _, ccs in groupby(
            DBSession.query(models.CognateClass)
            .order_by(models.CognateClass.meaning_pk),
            lambda c: c.meaning_pk):
        ccs = list(ccs)
        colors = qualitative_colors(len(ccs))
        for i, cc in enumerate(ccs):
            cc.color = colors[i]

    # Per-meaning counts; relations are eager-loaded to avoid N+1 queries.
    for meaning in DBSession.query(models.Meaning).options(
            joinedload(models.Meaning.cognateclasses),
            joinedload(common.Parameter.valuesets, common.ValueSet.language)):
        meaning.count_cognateclasses = len(meaning.cognateclasses)
        meaning.count_languages = len([vs.language for vs in meaning.valuesets])
        meaning.count_loan_cognateclasses = len(
            [cc for cc in meaning.cognateclasses if cc.is_loan])

    # Lexeme count per meaning.
    for meaning in DBSession.query(
            models.Meaning, func.count(common.Parameter.pk))\
            .join(common.Parameter).join(common.ValueSet).join(common.Value)\
            .group_by(models.Meaning.pk, common.Parameter.pk):
        meaning[0].count_lexemes = meaning[1]

    # Per-language counts, plus LanguageSource rows for every distinct
    # source referenced by the language's valuesets.
    for language in DBSession.query(common.Language).options(
            joinedload(common.Language.valuesets, common.ValueSet.references)):
        language.count_meanings = len(language.valuesets)
        language.count_lexemes = len(
            DBSession.query(common.Value.id)
            .filter(common.ValueSet.language_pk == language.pk)
            .join(common.ValueSet).all())
        spks = set()
        for vs in language.valuesets:
            for ref in vs.references:
                spks.add(ref.source_pk)
        for spk in spks:
            DBSession.add(common.LanguageSource(
                language_pk=language.pk, source_pk=spk))
def load_ref(data, entry, lgcodes, lgsources):
    """Create a models.Ref from a bibliography *entry* and add it to the session.

    Unlike precomputed-attribute variants, this version parses numeric year,
    publisher/address and page information out of the raw BibTeX fields.

    :param data: dict-like registry of already-created objects \
        (keys used here: 'Provider', 'Languoid', 'Doctype').
    :param entry: bibliography entry exposing ``fields``, ``type``, \
        ``languoids`` and ``doctypes``.
    :param lgcodes: passed through to ``entry.languoids``.
    :param lgsources: maps a source key (as found in ``srctrickle``) to a \
        list of manually assigned languoid ids.
    :return: the created and flushed models.Ref instance.
    """
    kw = {'jsondata': {}, 'language_note': entry.fields.get('lgcode')}
    # Copy entry fields that map directly onto Source/Ref table columns;
    # everything else is stashed in jsondata.
    for col in common.Source.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in models.Ref.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in entry.fields:
        if col not in kw:
            kw['jsondata'][col] = entry.fields[col]
    # Unknown BibTeX entry types fall back to "misc".
    try:
        btype = EntryType.from_string(entry.type.lower())
    except ValueError:
        btype = EntryType.misc
    # try to extract numeric year, startpage, endpage, numberofpages, ...
    if kw.get('year'):
        # prefer years in brackets over the first 4-digit number.
        match = PREF_YEAR_PATTERN.search(kw.get('year'))
        if match:
            kw['year_int'] = int(match.group('year'))
        else:
            match = YEAR_PATTERN.search(kw.get('year'))
            if match:
                kw['year_int'] = int(match.group('year'))
    if kw.get('publisher'):
        p = kw.get('publisher')
        if ':' in p:
            # Split "Address: Publisher"; only apply when it does not
            # contradict an explicit address field.
            address, publisher = [
                s.strip() for s in kw['publisher'].split(':', 1)]
            if 'address' not in kw or kw['address'] == address:
                kw['address'], kw['publisher'] = address, publisher
    if kw.get('numberofpages'):
        # Non-numeric values are silently ignored.
        try:
            kw['pages_int'] = int(kw.get('numberofpages').strip())
        except ValueError:
            pass
    if kw.get('pages'):
        start, end, number = compute_pages(kw['pages'])
        if start is not None:
            kw['startpage_int'] = start
        if end is not None:
            kw['endpage_int'] = end
        # 'numberofpages' takes precedence over the derived page count.
        if number is not None and 'pages_int' not in kw:
            kw['pages_int'] = number
    kw.update(
        id=entry.fields['glottolog_ref_id'],
        # Full-text-search vector over all fields except the abstract.
        fts=fts.tsvector(
            '\n'.join(v for k, v in entry.fields.items() if k != 'abstract')),
        name='%s %s' % (
            entry.fields.get('author', 'na'), entry.fields.get('year', 'nd')),
        description=entry.fields.get('title') or entry.fields.get('booktitle'),
        bibtex_type=btype)
    ref = models.Ref(**kw)
    DBSession.add(ref)
    # Flush so ref.pk is available for the association objects below.
    DBSession.flush()
    reflangs = []
    # Providers for which computerized languoid assignment is suppressed.
    no_ca = [{'degruyter'}, {'benjamins'}]
    provs = set()
    # srctrickle holds comma-separated "provider#key" tokens; collect
    # manual languoid assignments and record one Refprovider per token.
    for key in entry.fields['srctrickle'].split(','):
        key = key.strip()
        if key:
            if key in lgsources:
                reflangs.extend(lgsources[key])
            prov, key = key.split('#', 1)
            provs.add(prov)
            DBSession.add(models.Refprovider(
                provider_pk=data['Provider'][prov].pk,
                ref_pk=ref.pk,
                id='{0}:{1}'.format(prov, key)))
    langs, trigger = entry.languoids(lgcodes)
    if trigger and ((provs in no_ca) or (reflangs)):
        # Discard computerized assigned languoids for bibs where this does not make sense,
        # or for bib entries that have been manually assigned in a Languoid's ini file.
        langs, trigger = [], None
    # Link both manual and (surviving) computerized assignments.
    for lid in set(reflangs + langs):
        DBSession.add(
            common.LanguageSource(
                language_pk=data['Languoid'][lid].pk, source_pk=ref.pk))
    if trigger:
        ref.ca_language_trigger = trigger
    # Document types, with the same suppression rule for no_ca providers.
    doctypes, trigger = entry.doctypes(data['Doctype'])
    if trigger is None or provs not in no_ca:
        for dt in set(doctypes):
            DBSession.add(models.Refdoctype(doctype_pk=dt.pk, ref_pk=ref.pk))
    if trigger:
        ref.ca_doctype_trigger = trigger
    return ref
def main(args):
    """Populate the ASJP database from wordlist files and metadata.

    Reads metadata and sources, sets up the Dataset and its contributors,
    creates Meaning and Doculect objects from the parsed wordlists, and
    links each doculect to its transcribers and sources.
    """
    meta = parse_meta(args)
    print(len(meta))
    print(sum(len(m.sources) for m in meta.values()))
    # Collect distinct sources across all metadata entries, then create a
    # Source object for each (numbered from 1).
    sources = {}
    for m in meta.values():
        for s in m.sources:
            sources[s] = None
    print(len(sources), 'distinct')
    for i, s in enumerate(sources):
        sources[s] = get_source(s, i + 1)
    # ISO code -> glottocode mapping, pulled from a local glottolog DB.
    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')
    data = Data()
    wals = create_engine('postgresql://robert@/wals3')
    # Map both WALS family names and ids to the id, so lookups work
    # regardless of which form the classification string uses.
    wals_families = {}
    for row in wals.execute('select name, id from family'):
        wals_families[row[0]] = row[1]
        wals_families[row[1]] = row[1]
    #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'):
    #    name = item.FAMILY
    #    if name not in wals_families:
    #        name = slug(name)
    #        if name not in wals_families:
    #            print('missing wals family:', item.FAMILY)
    #            name = None
    #    if name:
    #        wals_families[item.ABBREVIATION] = wals_families[name]
    wals_genera = {
        row[0]: row[0] for row in wals.execute('select id from genus')}
    # Each wordlist is a block of lines from the raw ASJP dump.
    with args.data_file('listss17.txt').open(encoding='latin1') as fp:
        wordlists = ['\n'.join(lines) for lines in parse(fp)]
    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    transcribers = get_transcriber_map(args)
    # Known contributors; entries also present in the transcriber map are
    # removed from it (and their names must agree).
    for i, spec in enumerate([
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]):
        id_, name = spec
        if id_ in transcribers:
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        # SW, CB and EH are the dataset editors.
        if id_ in ['SW', 'CB', 'EH']:
            DBSession.add(
                common.Editor(
                    dataset=dataset, ord=i + 1, contributor=contributor))
    # Remaining transcribers become plain contributors.
    for id_, name in transcribers.items():
        data.add(common.Contributor, id_, id=id_, name=name)
    for id_ in sorted(models.MEANINGS_ALL.keys()):
        data.add(
            models.Meaning, id_,
            id=str(id_),
            name=models.MEANINGS_ALL[id_],
            # core marks the reduced 40-item list; presumably — confirm
            # against models.MEANINGS.
            core=id_ in models.MEANINGS)
    for n, l in enumerate(wordlists):
        #if n > 100:
        #    break
        lang = models.Doculect.from_txt(l)
        if lang.classification_wals:
            # classification_wals is "family.genus".
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        # Flush so lang.pk exists for the LanguageSource rows below.
        DBSession.flush()
        md = meta.pop(lang.id, None)
        assert md
        # associate transcribers and sources
        for i, transcriber in enumerate(md.transcribers):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=i + 1)
        for source in md.sources:
            DBSession.add(
                common.LanguageSource(
                    language_pk=lang.pk, source_pk=sources[source].pk))
    # Every metadata entry must have been consumed by a wordlist.
    assert not list(meta.keys())
def _main(data, glottolog):
    """Populate the LDH database from PURE repository posts and Zenodo items.

    :param data: dict-like registry used to create/look up Description and \
        LDHLanguage objects.
    :param glottolog: Glottolog API object providing ``languoids()`` and \
        ``repos``.
    """
    languoids = list(glottolog.languoids())
    # ISO code -> languoid, for resolving the iso codes attached to items.
    lbyi = {l.iso: l for l in languoids if l.iso}
    dataset = common.Dataset(
        id='ldh',
        name='Language Description Heritage',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='ldh.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    DBSession.add(
        common.Editor(
            dataset=dataset,
            contributor=common.Contributor(id='forkel', name='Robert Forkel')))
    # (item id, iso) pairs already linked, to avoid duplicate
    # LanguageSource rows.
    ls = set()
    for post in iter_posts():
        if post.pure_item_id:
            item = pure.Item.from_json(post.pure_item_id)
            src = data['Description'].get(item.id)
            if not src:
                src = data.add(
                    models.Description, item.id,
                    id=item.id,
                    description=item.title,
                    name=item.name,
                    bibtex_type=EntryType.get(item.bibtex_type),
                    year=item.year,
                    title=item.title,
                    address=item.publisher.get('place')
                        if item.publisher else None,
                    publisher=item.publisher.get('publisher')
                        if item.publisher else None,
                    author=' and '.join(item.authors),
                    editor=' and '.join(item.editors),
                    pid=item.doi or item.pid,
                    pid_type='doi' if item.doi else 'hdl',
                )
                DBSession.flush()
                # NOTE(review): file attachment appears to run only for a
                # newly created Description — confirm this nesting against
                # the original (pre-flattening) indentation.
                for file in item.files:
                    # Only public, managed full-text PDFs are attached.
                    if file.visibility == 'PUBLIC' \
                            and file.metadata["contentCategory"] == "any-fulltext" \
                            and file.storage == 'INTERNAL_MANAGED':
                        assert file.mimeType == 'application/pdf'
                        DBSession.add(
                            common.Source_files(
                                id=file.pid.replace('/', '__'),
                                name=file.name,
                                object_pk=src.pk,
                                mime_type=file.mimeType,
                                jsondata=dict(
                                    size=file.size,
                                    license=attr.asdict(file.license)
                                        if file.license else None),
                            ))
            for iso in item.isocodes:
                if iso in lbyi:
                    gl = lbyi[iso]
                    l = data['LDHLanguage'].get(iso)
                    if not l:
                        l = data.add(
                            models.LDHLanguage, iso, id=iso, name=gl.name)
                    DBSession.flush()
                    if (item.id, iso) not in ls:
                        DBSession.add(
                            common.LanguageSource(
                                language_pk=l.pk, source_pk=src.pk))
                        ls.add((item.id, iso))
    # Zenodo items: always create a new Description.
    for item in zenodo.iter_items():
        src = data.add(
            models.Description, item.id,
            id=item.id,
            description=item['metadata']['title'],
            name=item.name,
            bibtex_type=EntryType.get(item.bibtex_type),
            year=item.year,
            title=item['metadata']['title'],
            publisher='Zenodo',
            author=' and '.join(
                a['name'] for a in item['metadata']['creators']),
            pid=item['metadata']['doi'],
            pid_type='doi',
        )
        DBSession.flush()
        for file in item['files']:
            license = licenses.find(item['metadata']['license']['id'])
            DBSession.add(
                common.Source_files(
                    # md5 checksum (without prefix) serves as the file id.
                    id=file['checksum'].replace('md5:', ''),
                    name=file['key'],
                    object_pk=src.pk,
                    mime_type='application/' + file['type'],
                    jsondata=dict(
                        size=file['size'],
                        url=file['links']['self'],
                        license=attr.asdict(license) if license else None),
                ))
        # Languages are encoded as "iso:xxx" keywords on the Zenodo record.
        for kw in item['metadata']['keywords']:
            if not kw.startswith('iso:'):
                continue
            iso = kw.replace('iso:', '')
            if iso in lbyi:
                gl = lbyi[iso]
                l = data['LDHLanguage'].get(iso)
                if not l:
                    l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                DBSession.flush()
                if (item.id, iso) not in ls:
                    DBSession.add(
                        common.LanguageSource(
                            language_pk=l.pk, source_pk=src.pk))
                    ls.add((item.id, iso))
    load_families(
        data,
        data['LDHLanguage'].values(),
        glottolog_repos=glottolog.repos,
        isolates_icon='tcccccc')
def main(args):
    """Populate the GeLaTo database from its CLDF dataset.

    Creates the Dataset, editors, sources, panels, languoids/samples,
    measures and measurements from the CLDF tables reachable via
    ``args.cldf``.
    """
    data = Data()
    # Cycled so each new language family gets the next map icon.
    icons = cycle(ORDERED_ICONS)
    dataset = common.Dataset(
        id=gelato.__name__,
        name="GeLaTo",
        description="Genes and Languages together",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='gelato.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License'})
    for i, (id_, name) in enumerate([
            ('barbierichiara', 'Chiara Barbieri'),
            ('blasidamian', 'Damián Blasi'),
            ('forkelrobert', 'Robert Forkel')]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    # Glottocode of a language family -> assigned icon.
    families = {}
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    for r in args.cldf.iter_rows(
            'ContributionTable', 'id', 'name', 'description'):
        ds = data.add(
            models.Panel, r['id'],
            id=r['id'], name=r['name'], description=r['description'])
    for row in args.cldf.iter_rows(
            'LanguageTable', 'id', 'name', 'contributionReference'):
        icon = families.get(row['LanguageFamily_Glottocode'])
        if not icon:
            families[row['LanguageFamily_Glottocode']] = icon = next(icons)
        # One Languoid per glottocode, shared by all its samples.
        lang = data['Languoid'].get(row['Glottocode'])
        if not lang:
            lang = data.add(
                models.Languoid,
                row['Glottocode'],
                id=row['Glottocode'],
                name=row['Language_Name'],
                family_id=row['LanguageFamily_Glottocode'],
                family_name=row['LanguageFamily'],
                jsondata=dict(icon=icon.name),
            )
        s = data.add(
            models.Sample,
            row['id'],
            id=row['id'],
            name=row['Name'],
            panel=data['Panel'][row['contributionReference']],
            languoid=lang,
            latitude=row['Latitude'],
            longitude=row['Longitude'],
            samplesize=int(row['samplesize']),
            #source=row.get('dataSet.of.origin'),
            region=row['geographicRegion'],
            #location=row['location'],
            jsondata=dict(color=REGIONS[row['geographicRegion']]),
        )
        # Flush so s.pk exists for the LanguageSource rows below.
        DBSession.flush()
        for bibkey in row['Source']:
            DBSession.add(
                common.LanguageSource(
                    language_pk=s.pk, source_pk=data['Source'][bibkey].pk))
    # Measure id -> CLDF datatype, used to parse raw values below.
    types = {}
    for row in args.cldf.iter_rows(
            'ParameterTable', 'id', 'name', 'description',
            'contributionReference'):
        types[row['id']] = Datatype.fromvalue(row['datatype'])
        data.add(
            models.Measure, row['id'],
            id=row['id'],
            name=row['name'],
            description=row['description'],
            panel=data['Panel'][row['contributionReference']])
    for row in args.cldf.iter_rows(
            'ValueTable', 'id', 'parameterReference', 'languageReference'):
        v = types[row['parameterReference']].read(row['Value'])
        # Only numeric (float) measurements are stored.
        if isinstance(v, float):
            vs = data.add(
                common.ValueSet,
                row['id'],
                id=row['id'],
                language=data['Sample'][row['languageReference']],
                parameter=data['Measure'][row['parameterReference']],
                #contribution=ds,
                #jsondata=dict(color=REGIONS[sample.region]),
            )
            data.add(
                models.Measurement, row['id'],
                id=row['id'],
                valueset=vs,
                name=row['Value'],
                value=v)