def main(args):  # pragma: no cover
    """Apply languoid changes from ``changes.json`` to the database.

    For each attribute dict read from the JSON file, an existing Languoid
    row is updated in place, or a new one is created together with its
    ISO/Glottolog name identifiers.  Finally the tree closure table is
    recomputed.

    NOTE(review): the original file's indentation was lost; statement
    nesting below (e.g. the placement of ``DBSession.flush()`` inside the
    loop and ``recreate_treeclosure()`` inside the transaction) was
    reconstructed from syntax — confirm against version control.
    """
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        # Remember the currently highest Identifier pk, so identifiers
        # created below can be assigned pks above it.
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            # These two keys are handled specially below rather than being
            # set as plain Languoid attributes.
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            # Convert enum-valued attributes from their string representation.
            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                # Existing languoid: update attributes in place.
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                # New languoid: create it together with its identifiers.
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                # A three-letter hid doubles as the ISO 639-3 code.
                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                # Record the superseded languoid pointing at its replacement.
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
def main(args):  # pragma: no cover
    """Recompute the denormalized tree closure table for the languoid tree."""
    recreate_treeclosure()
def main(args):  # pragma: no cover
    """Apply languoid updates from a versioned ``languoids.json`` file.

    Updates existing Languoid rows (or creates new ones), maintains ISO
    and Glottolog name identifiers as well as macroarea associations, and
    recomputes the tree closure table.

    NOTE(review): the original file's indentation was lost; statement
    nesting below (in particular whether ``if ma:`` and the Glottolog-name
    identifier creation apply only to newly created languoids) was
    reconstructed from syntax — confirm against version control.
    """
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        # Remember the currently highest Identifier pk, so identifiers
        # created below can be assigned pks above it.
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        # Existing Glottolog name identifiers keyed by name, for reuse.
        gc_names = {i.name: i for i in DBSession.query(Identifier).filter(
            Identifier.type == 'name').filter(Identifier.description == 'Glottolog')}

        ma_map = get_map(Macroarea)
        languoids = dict((l.pk, l) for l in DBSession.query(Languoid))
        with open(args.data_file(args.version, 'languoids.json')) as fp:
            for attrs in json.load(fp):
                # These keys are handled specially below rather than being
                # set as plain Languoid attributes.
                ma = attrs.pop('macroarea', None)
                replacement = attrs.pop('replacement', None)
                hname = attrs.pop('hname', None)

                # Convert enum-valued attributes from their string representation.
                for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                    if name in attrs:
                        attrs[name] = enum.from_string(attrs[name])

                l = languoids.get(attrs['pk'])
                if l:
                    # Existing languoid: update attributes in place.
                    for k, v in attrs.items():
                        if k == 'globalclassificationcomment':
                            # deliberately skipped here — presumably curated
                            # separately; TODO confirm
                            continue
                        setattr(l, k, v)
                    # Existing language with a three-letter hid but no ISO
                    # identifier yet: create one.
                    if len(l.hid or '') == 3:
                        if not l.iso_code:
                            create_identifier(
                                None, l, name=l.hid, type=IdentifierType.iso.value)
                else:
                    # New languoid: create it together with its identifiers.
                    l = Languoid(**attrs)
                    DBSession.add(l)
                    languoids[l.pk] = l

                    # A three-letter hid doubles as the ISO 639-3 code.
                    if len(attrs.get('hid', '')) == 3:
                        create_identifier(
                            None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                    if ma:
                        l.macroareas.append(ma_map[ma])

                    create_identifier(
                        gc_names.get(l.name),
                        l,
                        name=l.name,
                        description='Glottolog',
                        type='name')

                if hname:
                    l.update_jsondata(hname=hname)

                if replacement:
                    # Record the superseded languoid pointing at its replacement.
                    DBSession.add(Superseded(
                        languoid_pk=l.pk,
                        replacement_pk=replacement,
                        relation='classification update'))

                DBSession.flush()

        recreate_treeclosure()
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    Steps performed (in order):
    1. compute per-language MED (Most Extensive Description) values,
    2. recompute the tree closure table,
    3. propagate macroareas up the languoid tree,
    4. sanity-check page counts,
    5. record which languages are new in this release,
    6. attach references to macroarea, AES and classification value sets.

    NOTE(review): the original file's indentation (and line breaks inside
    the SQL string literals) was lost; both were reconstructed — confirm
    against version control.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from languagesource as ls, language as l, source as s, ref as r
where ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        # The last one is the overall MED
        meds[lpk].append((spk, sid, sname, med_type, year, pages))

    # Now weed out the "newer but worse" sources:
    # walking from the overall MED backwards, keep only sources whose year
    # is strictly older than anything kept so far.
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # If year is more recent, this is a "newer but worse" item
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    for l in DBSession.query(common.Language).filter(common.Language.pk.in_(list(meds.keys()))):
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname)
            for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue

        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        # Flush so vs.pk is populated before referencing it below.
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    # Macroarea domain elements: pk -> (id, name).
    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name
from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    # For each languoid without an explicit macroarea valueset, aggregate
    # the distinct macroareas of its descendants (via the tree closure).
    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select l.id, l.pk, vs.contribution_pk, vs.parameter_pk,
  array_agg(distinct v.domainelement_pk)
from language as l, treeclosuretable as t, parameter as p, valueset as vs, value as v
where l.pk = t.parent_pk and t.child_pk = vs.language_pk
  and vs.parameter_pk = p.pk and p.id = 'macroarea' and v.valueset_pk = vs.pk
  and l.pk not in (
    select language_pk from valueset as _vs, parameter as _p
    where _vs.parameter_pk = _p.pk and _p.id = 'macroarea'
  )
group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                # One ValueSet per language, shared by all its macroarea values.
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    # Denormalize macroarea names onto the language rows as a comma list.
    for vs in DBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'macroarea')\
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join(
            [macroareas[v.domainelement_pk][1] for v in vs.values])

    # Sanity check: any source with an unparsed (negative) page count aborts.
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        raise ValueError(row)

    # Record languages that are new in this release in the legacy mapping.
    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    # Attach the canonical macroarea reference to every macroarea valueset.
    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    # Attach the reference recorded in jsondata to each AES valueset.
    for vs in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    # Attach sub-/family-classification refs where the refs differ from the
    # refs already embedded in the classification comment text.
    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(
                                        source_pk=spk, valueset_pk=vspk))
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    Steps performed (in order): recompute the tree closure table, propagate
    macroareas up the languoid tree, fix up unparsed page counts, record
    which languages are new in this version, and attach
    sub-classification references.

    NOTE(review): the original file's indentation (and line breaks inside
    the SQL string literals) was lost; both were reconstructed — confirm
    against version control.
    """
    recreate_treeclosure()

    # Propagate macroareas up the tree: each languoid without explicit
    # macroareas gets the distinct macroareas of its descendants.
    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk and t.child_pk = lma.languoid_pk and lma.macroarea_pk = ma.pk
  and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    # Recompute page counts that could not be parsed on import (marked < 0).
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    # Record languages that are new in this version in the legacy mapping.
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = args.args[0]

    def items(s):
        """Normalize a whitespace-separated list of ref keys into a set."""
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                # Strip trailing page info from '**<id>**:<pages>' keys.
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    # Attach sub-classification refs where they differ from the refs already
    # embedded in the classification comment text.
    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        # FIX: refs.get() may return None for a ref key not in
                        # the database; adding a ValueSetReference with
                        # source_pk=None would violate the NOT NULL constraint
                        # at flush time. Guard like the later revisions of this
                        # function do.
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(
                                    source_pk=spk, valueset_pk=vspk))
def prime(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    Steps performed (in order): recompute the tree closure table, propagate
    macroareas up the languoid tree, fix up unparsed page counts, record
    which languages are new in this release, and attach sub-/family-
    classification references.

    NOTE(review): the original file's indentation (and line breaks inside
    the SQL string literals) was lost; both were reconstructed — confirm
    against version control.
    """
    recreate_treeclosure()

    # Propagate macroareas up the tree: each languoid without explicit
    # macroareas gets the distinct macroareas of its descendants.
    for lpk, mas in DBSession.execute("""\
select l.pk, array_agg(distinct lma.macroarea_pk)
from language as l, treeclosuretable as t, languoidmacroarea as lma, macroarea as ma
where l.pk = t.parent_pk and t.child_pk = lma.languoid_pk and lma.macroarea_pk = ma.pk
  and l.pk not in (select languoid_pk from languoidmacroarea)
group by l.pk"""):
        for mapk in mas:
            DBSession.add(models.Languoidmacroarea(languoid_pk=lpk, macroarea_pk=mapk))

    # Recompute page counts that could not be parsed on import (marked < 0).
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s"
                % (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s" % (_end, pk))

    # Record languages that are new in this release in the legacy mapping.
    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    def items(s):
        """Normalize a whitespace-separated list of ref keys into a set."""
        if not s:
            return set()
        r = []
        for ss in set(s.strip().split()):
            if '**:' in ss:
                # Strip trailing page info from '**<id>**:<pages>' keys.
                ss = ss.split('**:')[0] + '**'
            if ss.endswith(','):
                ss = ss[:-1].strip()
            r.append(ss)
        return set(r)

    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}
    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}

    # Attach sub-/family-classification refs where they differ from the refs
    # already embedded in the classification comment text.
    for lang in args.repos.languoids():
        if lang.category == models.BOOKKEEPING:
            continue
        clf = lang.classification_comment
        if clf:
            if clf.subrefs:
                if items(lang.cfg['classification']['subrefs']) != \
                        items(lang.cfg['classification'].get('sub')):
                    vspk = valuesets['sc-{0}'.format(lang.id)]
                    for ref in clf.subrefs:
                        spk = refs.get(ref.key)
                        # FIX: guard against refs.get() returning None for a
                        # ref key not in the database — consistent with the
                        # familyrefs branch below; source_pk=None would
                        # violate the NOT NULL constraint at flush time.
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(
                                    source_pk=spk, valueset_pk=vspk))
            if clf.familyrefs:
                if items(lang.cfg['classification']['familyrefs']) != \
                        items(lang.cfg['classification'].get('family')):
                    vspk = valuesets['fc-{0}'.format(lang.id)]
                    for ref in clf.familyrefs:
                        spk = refs.get(ref.key)
                        if spk:
                            DBSession.add(
                                common.ValueSetReference(
                                    source_pk=spk, valueset_pk=vspk))