def query(self, req):
    self._domainelements = DBSession.query(DomainElement).all()
    return DBSession.query(Language)\
        .order_by(Language.id)\
        .options(
            joinedload_all(Language.valuesets, ValueSet.values),
            joinedload_all(WalsLanguage.genus, Genus.family))

def getRefs(params):
    query = DBSession.query(Ref)
    filtered = False

    for param, value in params['biblio'].items():
        if value:
            filtered = True
            query = query.filter(icontains(getattr(Ref, param), value))

    if params.get('languoids'):
        filtered = True
        lids = DBSession.query(TreeClosureTable.child_pk)\
            .filter(TreeClosureTable.parent_pk.in_(
                [l.pk for l in params['languoids']]))\
            .subquery()
        query = query.join(LanguageSource, LanguageSource.source_pk == Ref.pk)\
            .filter(LanguageSource.language_pk.in_(lids))

    if params.get('doctypes'):
        filtered = True
        query = query.join(Refdoctype)\
            .filter(Refdoctype.doctype_pk.in_([l.pk for l in params['doctypes']]))

    if params.get('macroareas'):
        filtered = True
        query = query.join(Refmacroarea)\
            .filter(Refmacroarea.macroarea_pk.in_([l.pk for l in params['macroareas']]))

    if not filtered:
        return []

    return query.distinct()

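# A minimal, self-contained sketch of the incremental filter-composition
# pattern used in getRefs above: each optional search parameter adds one
# .filter() to the query, and a `filtered` flag guards against returning
# the whole table when no criterion was given. The Ref model and the
# in-memory engine here are illustrative stand-ins, not the real clld schema.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Ref(Base):
    __tablename__ = 'ref'
    pk = Column(Integer, primary_key=True)
    author = Column(String)
    year = Column(String)


def get_refs(session, **biblio):
    query = session.query(Ref)
    filtered = False
    for param, value in biblio.items():
        if value:
            filtered = True
            # case-insensitive substring match, in the spirit of icontains()
            query = query.filter(getattr(Ref, param).ilike('%' + value + '%'))
    return query.all() if filtered else []


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add_all([Ref(author='Haspelmath'), Ref(author='Dryer')])
    session.commit()
    assert len(get_refs(session, author='dry')) == 1
    assert get_refs(session, author=None) == []  # nothing filtered -> no results
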
def langdoccomplexquery(request):
    res = {
        'dt': None,
        'doctypes': DBSession.query(Doctype).order_by(Doctype.id),
        'macroareas': DBSession.query(Macroarea).order_by(Macroarea.id),
        'ms': {}
    }

    for name, cls, kw in [
        ('languoids', LanguoidsMultiSelect, dict(
            url=request.route_url('glottolog.childnodes'))),
        ('macroareas', MultiSelect, dict(collection=res['macroareas'])),
        ('doctypes', MultiSelect, dict(collection=res['doctypes'])),
    ]:
        res['ms'][name] = cls(request, name, 'ms' + name, **kw)

    res['params'], reqparams = get_params(request.params, **res)
    res['refs'] = getRefs(res['params'])

    if res['refs']:
        res['dt'] = Refs(request, Source, cq=1, **reqparams)

    fmt = request.params.get('format')
    if fmt:
        db = bibtex.Database([ref.bibtex() for ref in res['refs']])
        for name, adapter in request.registry.getAdapters([db], IRepresentation):
            if name == fmt:
                return adapter.render_to_response(db, request)
        return HTTPNotAcceptable()

    return res

def dataset_detail_html(context=None, request=None, **kw):
    res = dict(
        (row[0], row[1]) for row in
        DBSession.execute("select source, count(pk) from inventory group by source"))
    res["inventory_count"] = DBSession.query(Inventory).count()
    res["segment_count"] = DBSession.query(Parameter).count()
    res["language_count"] = DBSession.query(Language).count()
    res["contributors"] = (
        DBSession.query(Contributor)
        .order_by(Contributor.name)
        .options(
            joinedload(Contributor.contribution_assocs),
            joinedload(Contributor.references))
        .all()
    )
    res["sources"] = {
        k: Source.get(k)
        for k in [
            "moisikesling2011",
            "ipa2005",
            "hayes2009",
            "moran2012a",
            "moranetal2012",
            "cysouwetal2012",
            "mccloyetal2013",
        ]
    }
    res["descriptions"] = {
        c.id: desc(request, c.description, res["sources"])
        for c in res["contributors"]}
    return res

def parameter_detail_html(context=None, request=None, **kw):
    values = DBSession.query(common.Value.pk)\
        .join(common.ValueSet)\
        .filter(common.ValueSet.parameter_pk == context.pk)\
        .subquery()
    return {
        'examples': DBSession.query(common.Sentence)
        .join(common.ValueSentence)
        .filter(common.ValueSentence.value_pk.in_(values))}

def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
            sorted(vs.values, key=lambda v: v.description),
            key=lambda v: v.description,
        ):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))
        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))):
            if not instance.taxa:
                instance.active = False

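# A self-contained sketch of the description-building step above: values are
# sorted and grouped by their generic term, then folded into one string like
# 'bird: kaka, manu; fish: ika'. nfilter() comes from clldutils and just drops
# falsy items; a stand-in is defined here for illustration, and the sort key
# additionally tolerates a missing (None) generic term.
from itertools import groupby


def nfilter(seq):
    # stand-in for clldutils' nfilter: keep only truthy items
    return [e for e in seq if e]


values = [('bird', 'kaka'), ('fish', 'ika'), ('bird', 'manu'), (None, 'x')]
d = []
for generic_term, words in groupby(
        sorted(values, key=lambda v: v[0] or ''), key=lambda v: v[0]):
    prefix = generic_term + ': ' if generic_term else ''
    d.append(prefix + ', '.join(nfilter([w[1] for w in words])))
print('; '.join(d))  # -> 'x; bird: kaka, manu; fish: ika'
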
def glottologmeta(request):
    q = DBSession.query(Languoid)\
        .filter(Language.active == true())\
        .filter(Languoid.status.in_(
            (LanguoidStatus.established, LanguoidStatus.unattested)))
    qt = q.filter(Languoid.father_pk == null())
    res = {
        'last_update': DBSession.query(Language.updated)
        .order_by(Language.updated.desc()).first()[0],
        'number_of_families': qt.filter(
            Languoid.level == LanguoidLevel.family).count(),
        'number_of_isolates': qt.filter(
            Languoid.level == LanguoidLevel.language).count(),
    }
    ql = q.filter(Languoid.hid != null())
    res['number_of_languages'] = {'all': ql.count()}
    res['special_families'] = OrderedDict()
    for name in SPECIAL_FAMILIES:
        l = qt.filter(Language.name == name).one()
        res['special_families'][name] = l
        res['number_of_languages'][name] = l.child_language_count

    res['number_of_languages']['l1'] = res['number_of_languages']['all'] \
        - res['number_of_languages']['Pidgin']\
        - res['number_of_languages']['Artificial Language']\
        - res['number_of_languages']['Sign Language']
    return res

def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for concept in DBSession.query(models.ConceptSet):
        concept.representation = len(concept.valuesets)

    ul = []
    for clist in DBSession.query(models.Conceptlist):
        clist.items = len(clist.valuesets)
        ul.append((clist.name, uniqueness(clist)))
    #for i, (n, u) in enumerate(sorted(ul, key=lambda t: t[1], reverse=True)):
    #    if i > 10:
    #        break
    #    print(n, u)

    similarities = {}
    for cl1, cl2 in combinations(DBSession.query(models.Conceptlist), 2):
        s = similarity(cl1, cl2)
        similarities[(cl1.name, cl2.name)] = s

    for i, ((l1, l2), s) in enumerate(
            sorted(similarities.items(), key=lambda i: i[1], reverse=True)):
        if i < 20:
            print(l1, l2, s)
        if s == 0:
            pass

def macroareas(args, languages, stats):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated
    # macroareas for families easier
    lang_map = {}

    for hid, info in get_lginfo(args, lambda x: x.macro_area):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        a, r = update_relationship(
            languages[hid].macroareas, [ma_map[info.macro_area]])
        if a or r:
            stats.update(['macroarea'])

    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == true()):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk)\
                .filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        a, r = update_relationship(family.macroareas, mas)
        if a or r:
            stats.update(['macroarea'])

    args.log.info('macroareas done')

def dataset_detail_html(context=None, request=None, **kw):
    """
    #unique language names: 6895
    #Ethnologue families: 223
    #Glottolog families: 381
    #languages with unique ISO codes: 4424 match [a-z]{3}!

    asjp=# select count(*) from (select distinct name from identifier where type = 'iso639-3') as s;
    -[ RECORD 1 ]
    count | 4401

    #words in the database (not counting synonyms): 238976
    and counting synonyms: ...
    """
    stats = {
        'wordlists': DBSession.query(common.Language.pk).count(),
        'ethnologue_families': DBSession.query(models.Doculect.ethnologue_family)
        .distinct().count(),
        'glottolog_families': DBSession.query(models.Doculect.glottolog_family)
        .distinct().count(),
        'iso_langs': DBSession.query(common.Identifier.name)
        .filter(common.Identifier.type == common.IdentifierType.iso.value)
        .distinct().count(),
        'synsets': DBSession.execute(
            'select count(*) from (select distinct valueset_pk from value) as s')
        .fetchone()[0],
        'words': DBSession.query(common.Value.pk).count(),
        'missing_iso': len(missing_iso()),
    }
    return {k: '{0:,}'.format(n) for k, n in stats.items()}

def match_obsolete_refs(args):
    with open(args.data_file(args.version, 'obsolete_refs.json')) as fp:
        refs = json.load(fp)
    matched = args.data_file(args.version, 'obsolete_refs_matched.json')
    if matched.exists():
        with open(matched) as fp:
            matched = json.load(fp)
    else:
        matched = {}

    #
    # TODO: optionally re-evaluate known-unmatched refs!
    #

    count = 0
    f, m = 0, 0
    for id_ in refs:
        if id_ in matched:
            continue
        count += 1
        if count > 1000:
            print('1000 obsolete refs processed!')
            break
        ref = Ref.get(id_)
        found = False
        if ref.description and len(ref.description) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.description.contains(ref.description))\
                    .filter(or_(Source.author == ref.author, Source.year == ref.year))\
                    .limit(10):
                print('++', ref.id, '->', match.id,
                      '++', ref.author, '->', match.author,
                      '++', ref.year, '->', match.year)
                matched[ref.id] = match.id
                found = True
                break
        if not found and ref.name and len(ref.name) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.name == ref.name)\
                    .limit(10):
                try:
                    if match.description and ref.description \
                            and slug(match.description) == slug(ref.description):
                        print('++', ref.id, '->', match.id,
                              '++', ref.description, '->', match.description)
                        matched[ref.id] = match.id
                        found = True
                        break
                except AssertionError:
                    continue
        if not found:
            m += 1
            print('--', ref.id, ref.name, ref.description)
            matched[ref.id] = None
        else:
            f += 1

    print(f, 'found')
    print(m, 'missed')

    with open(args.data_file(args.version, 'obsolete_refs_matched.json'), 'w') as fp:
        json.dump(matched, fp)

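# The matching above compares slug(description) values. slug() comes from
# clldutils; a rough, illustrative stand-in is sketched below (lowercase,
# strip accents, keep only ASCII letters and digits). The real helper can
# raise AssertionError on input it cannot normalize, which is why the loop
# above catches that exception.
import re
import unicodedata


def slug(s):
    # normalize, drop combining marks, keep [a-z0-9] only
    s = unicodedata.normalize('NFKD', s.lower())
    s = ''.join(c for c in s if not unicodedata.combining(c))
    return re.sub('[^a-z0-9]', '', s)


assert slug('Grammaire Comparée') == slug('grammaire comparee')
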
def extract_data(endangerment):  # pragma: no cover
    status = {}
    lpks = DBSession.query(common.Language.pk) \
        .filter(common.Language.active == True) \
        .filter(common.Language.latitude != None) \
        .filter(Languoid.level == LanguoidLevel.language) \
        .order_by(common.Language.pk).all()
    print(len(lpks))

    sql = """\
select ls.source_pk, count(ls.language_pk)
from languagesource as ls, ref as r
where ls.source_pk = r.pk
    and r.ca_doctype_trigger is null
    and r.ca_language_trigger is null
group by source_pk
"""
    lcounts = {r[0]: r[1] for r in DBSession.execute(sql)}

    # loop over active, established languages with geo-coords
    for i, lpk in enumerate(lpks):
        l = DBSession.query(common.Language).filter(common.Language.pk == lpk).one()

        # let's collect the relevant sources in a way that allows computation of med.
        # Note: we limit refs to the ones without computerized assignments.
        sources = list(DBSession.query(Ref).join(common.LanguageSource)
                       .filter(common.LanguageSource.language_pk == lpk)
                       .filter(Ref.ca_doctype_trigger == None)
                       .filter(Ref.ca_language_trigger == None)
                       .options(joinedload(Ref.doctypes)))
        sources = sorted([Source(s, lcounts.get(s.pk, 0)) for s in sources])

        # keep the overall med
        # note: this source may not be included in the potential meds computed
        # below, e.g. because it may not have a year.
        med = sources[0].__json__() if sources else None

        # now we have to compute meds respecting a cut-off year.
        # to do so, we collect eligible sources per year and then
        # take the med of this collection.
        potential_meds = []

        # we only have to loop over publication years within all sources, because
        # only in these years something better might have come along.
        for year in set(s.year for s in sources if s.year):
            # let's see if something better was published!
            eligible = [s for s in sources if s.year and s.year <= year]
            if eligible:
                potential_meds.append(sorted(eligible)[0])

        # we store the precomputed sources information as jsondata:
        status[l.id] = [
            med,
            [s.__json__() for s in
             sorted(set(potential_meds), key=lambda s: -s.year)],
            endangerment.get(l.id, {}).get('source'),
        ]
        if i and i % 1000 == 0:
            print(i)

    DBSession.close()
    return status

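# A self-contained sketch of the cut-off-year MED logic above: for each
# publication year seen among the sources, the "most extensive description"
# is the best-ranked source published in or before that year. Sources here
# are illustrative (rank, year) tuples where a smaller rank means better,
# standing in for the real Source ordering.
sources = [(2, 1950), (1, 1990), (3, 1920)]  # (rank, year)

potential_meds = []
for year in set(y for _, y in sources):
    eligible = [s for s in sources if s[1] <= year]
    if eligible:
        potential_meds.append(min(eligible))

# Deduplicated and ordered by descending year: the MED improved from the
# 1920 source to the 1950 one, and again in 1990.
print(sorted(set(potential_meds), key=lambda s: -s[1]))
# -> [(1, 1990), (2, 1950), (3, 1920)]
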
def quicksearch(request):
    message = None
    query = DBSession.query(Languoid)
    term = request.params['search'].strip()
    titlecase = term.istitle()
    term = term.lower()
    params = {'iso': '', 'country': '', 'name': '',
              'namequerytype': 'part', 'multilingual': ''}

    if not term:
        query = None
    elif len(term) < 3:
        query = None
        message = ('Please enter at least four characters for a name search '
                   'or three characters for an iso code')
    elif len(term) == 3 and not titlecase:
        query = query.filter(Languoid.identifiers.any(
            type=IdentifierType.iso.value, name=term))
        kind = 'ISO 639-3'
    elif len(term) == 8 and GLOTTOCODE_PATTERN.match(term):
        query = query.filter(Languoid.id == term)
        kind = 'Glottocode'
    else:
        _query = query.filter(func.lower(Languoid.name) == term)
        if DBSession.query(_query.exists()).scalar():
            query = _query
        else:
            query = query.filter(or_(
                func.lower(Languoid.name).contains(term),
                Languoid.identifiers.any(and_(
                    Identifier.type == u'name',
                    Identifier.description == Languoid.GLOTTOLOG_NAME,
                    func.lower(Identifier.name).contains(term)))))
        kind = 'name part'
        params['name'] = term

    if query is None:
        languoids = []
    else:
        languoids = query.order_by(Languoid.name)\
            .options(joinedload(Languoid.family)).all()
        # `kind` is always bound here, since query is only non-None in the
        # branches above that set it.
        if not languoids:
            term_pre = HTML.kbd(term, style='white-space: pre')
            message = 'No matching languoids found for %s "' % kind + term_pre + '"'
        elif len(languoids) == 1:
            raise HTTPFound(request.resource_url(languoids[0]))

    map_, icon_map, family_map = get_selected_languages_map(request, languoids)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None

    countries = json.dumps(['%s (%s)' % (c.name, c.id) for c in
                            DBSession.query(Country).order_by(Country.description)])

    return {'message': message, 'params': params, 'languoids': languoids,
            'map': map_, 'countries': countries}

def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could
                # be that the ISO code is now assigned to a family node, due to a
                # change request, e.g. see
                # https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()

def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for concept in DBSession.query(Concept):
        concept.representation = DBSession.query(common.ValueSet)\
            .filter(common.ValueSet.parameter_pk == concept.pk)\
            .count()

def language_detail_html(context=None, request=None, **kw):
    # makes sure all display elements have a value
    param_word = {p.id: '#' for p in DBSession.query(Parameter)}

    # override the param_word dict with values from the DB
    for word in DBSession.query(Value)\
            .join(ValueSet)\
            .filter(ValueSet.language_pk == context.pk)\
            .options(joinedload_all(Value.valueset, ValueSet.parameter)):
        param_word[word.valueset.parameter.id] = word.name

    def thead(*cols):
        return HTML.thead(
            HTML.tr(
                HTML.th("", style="height:26px; font-weight:"),
                *[HTML.th(col) for col in cols],
                **dict(style="background: #F2F2F2")
            )
        )

    def td(p):
        return HTML.td(param_word.get(p, '') if p else '')

    def tr(name, *params):
        return HTML.tr(
            HTML.td(
                name,
                style="height:26px; font-weight: bold; "
                      "background: #F2F2F2; padding: 5px"),
            *[td(p) for p in params])

    def table(*cols, **kw):
        male_cols = kw.get('male', ['m' + col for col in cols])
        female_cols = kw.get('female', ['f' + col for col in cols])
        return HTML.table(
            thead(*cols),
            HTML.tbody(tr('male', *male_cols), tr('female', *female_cols)))

    # create a paradigm_tables dict for the HTML rendering
    paradigm_tables = {
        'pronouns': HTML.table(
            thead("A", "S", "O", "P"),
            HTML.tbody(
                tr('1st (excl) Person Singular', '1sg_a', '1sg_s', '1sg_o', '1sg_p'),
                tr('1st (excl) Person Dual', '1du_a', '1du_s', '1du_o', '1du_p'),
                tr('1st (excl) Person Plural', '1pl_a', '1pl_s', '1pl_o', '1pl_p'),
                tr('1st (incl) Person Dual', '12du_a', '12du_s', '12du_o', '12du_p'),
                tr('1st (incl) Person Plural', '12pl_a', '12pl_s', '12pl_o', '12pl_p'),
                tr('2nd Person Singular', '2sg_a', '2sg_s', '2sg_o', '2sg_p'),
                tr('2nd Person Dual', '2du_a', '2du_s', '2du_o', '2du_p'),
                tr('2nd Person Plural', '2pl_a', '2pl_s', '2pl_o', '2pl_p'),
                tr('3rd Person Singular Gender 1',
                   '3sg_gen1_a', '3sg_gen1_s', '3sg_gen1_o', '3sg_gen1_p'),
                tr('3rd Person Singular Gender 2',
                   '3sg_gen2_a', '3sg_gen2_s', '3sg_gen2_o', '3sg_gen2_p'),
                tr('3rd Person Dual',
                   '3du_gen1_a', '3du_gen1_s', '3du_gen1_o', '3du_gen1_p'),
                tr('3rd Person Plural',
                   '3pl_gen1_a', '3pl_gen1_s', '3pl_gen1_o', '3pl_gen1_p'),
            )
        ),
    }
    return paradigm_tables

def query(self, req):
    self._domainelements = DBSession.query(DomainElement).all()
    return DBSession.query(Language)\
        .order_by(Language.id)\
        .options(
            subqueryload_all('languageidentifier', 'identifier'),
            subqueryload_all('countries'),
            joinedload_all(Language.valuesets, ValueSet.values),
            joinedload_all(WalsLanguage.genus, Genus.family))

def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for chapter in DBSession.query(models.Chapter).options(
            joinedload(models.Chapter.entries)):
        chapter.count_entries = len(chapter.entries)

    for entry in DBSession.query(models.Entry).options(
            joinedload(common.Parameter.valuesets)):
        entry.representation = len(entry.valuesets)

def languoids(request):
    if request.params.get('id'):
        if '-' not in request.params['id']:
            return HTTPNotFound()
        m, id_ = request.params['id'].split('-', 1)
        model = dict(w=Language, g=Genus, f=Family).get(m)
        if not model:
            return HTTPNotFound()
        obj = model.get(id_, default=None)
        if not obj:
            return HTTPNotFound()
        return HTTPFound(location=request.resource_url(obj))

    max_results = 20
    qs = request.params.get('q')
    if not qs:
        return []

    query = DBSession.query(Language)\
        .filter(icontains(Language.name, qs))\
        .order_by(WalsLanguage.ascii_name).limit(max_results)
    res = [l for l in query]

    if len(res) < max_results:
        max_results = max_results - len(res)
        # fill up suggestions with matching alternative names:
        for l in DBSession.query(Language)\
                .join(Language.languageidentifier, LanguageIdentifier.identifier)\
                .filter(icontains(Identifier.name, qs))\
                .order_by(WalsLanguage.ascii_name).limit(max_results):
            if l not in res:
                res.append(l)

    if len(res) < max_results:
        max_results = max_results - len(res)
        # fill up with matching genera:
        for l in DBSession.query(Genus)\
                .filter(icontains(Genus.name, qs))\
                .order_by(Genus.name).limit(max_results):
            res.append(l)

    if len(res) < max_results:
        max_results = max_results - len(res)
        # fill up with matching families:
        for l in DBSession.query(Family)\
                .filter(icontains(Family.name, qs))\
                .order_by(Family.name).limit(max_results):
            res.append(l)

    ms = LanguoidSelect(request, None, None, url='x')
    return dict(results=list(map(ms.format_result, res)), context={}, more=False)

def intro(req):
    lds = ldstatus()
    count, i = Counter(), 0
    for i, (lid, spec) in enumerate(lds.items()):
        count.update([SIMPLIFIED_DOCTYPE_MAP[spec[0][1] if spec[0] else None].ord])

    return {
        'macroareas': DBSession.query(Macroarea).order_by(Macroarea.name),
        'families': family_query().options(joinedload(Languoid.macroareas)),
        'sdts': [(sdt, count[sdt.ord], i + 1) for sdt in SIMPLIFIED_DOCTYPES],
        'doctypes': [
            (dt, SIMPLIFIED_DOCTYPE_MAP[dt.id].ord)
            for dt in DBSession.query(Doctype).order_by(Doctype.ord)],
    }

def main(args):
    # raw strings, so the regex escapes are not interpreted by Python
    source_pattern = re.compile(r"__Source_(?P<id>[0-9]+)__")
    ref_in_markup_pattern = re.compile(r"\*\*(?P<id>[0-9]+)\*\*")

    with transaction.manager:
        redirect_map = RedirectMap()
        for key, value in DBSession.query(Config.key, Config.value):
            m = source_pattern.match(key)
            if m and value != "__gone__":
                redirect_map[m.group("id")] = value

        for cfg in (
            DBSession.query(Config)
            .filter(Config.key.startswith("__Source_"))
            .filter(Config.value.in_(list(redirect_map.keys())))
        ):
            try:
                new = redirect_map.get_final(cfg.value)
            except ValueError:
                args.log.error("invalid redirect loop: %s" % (cfg.value,))
                new = cfg.value
            if new != cfg.value:
                args.log.info("fixed redirect: %s %s" % (cfg.value, new))
                cfg.value = new

        def repl(m):
            try:
                new = redirect_map.get_final(m.group("id"))
            except ValueError:
                new = m.group("id")
            return "**%s**" % new

        vs_rid = sa.select(
            [
                ValueSet.pk,
                sa.func.unnest(
                    sa.func.regexp_matches(
                        ValueSet.description, r"\*\*(\d+)\*\*", "g")
                ).label("ref_id"),
            ]
        ).alias()

        for vs in (
            DBSession.query(ValueSet)
            .filter(
                ValueSet.pk.in_(
                    DBSession.query(vs_rid.c.pk).filter(
                        ~DBSession.query(Ref)
                        .filter_by(id=vs_rid.c.ref_id).exists())
                )
            )
            .order_by(ValueSet.id)
        ):
            new = ref_in_markup_pattern.sub(repl, vs.description)
            if new != vs.description:
                args.log.info(
                    "fixed obsolete ref id in markup: %s %s" % (vs.description, new))
                vs.description = new

def test_set_glottocode(db):
    c = Connection(DBSession)
    lpk = c.insert(common.Language, id='l', name='Language')
    c.set_glottocode('l', 'abcd1234')
    c.set_glottocode('l', 'abcd1234')
    l = DBSession.query(common.Language).get(lpk)
    assert l.glottocode == 'abcd1234'

    c.set_glottocode('l', 'dcba1234')
    DBSession.expire_all()
    l = DBSession.query(common.Language).get(lpk)
    assert l.glottocode == 'dcba1234'
    c.set_glottocode('l', 'abcd1234')

def __init__(self, fname):
    self.fname = fname
    self.authors = [c.id for c in DBSession.query(Contributor)]
    self.languages = {l.id: l.name for l in DBSession.query(Language)}
    self.id = self.get_id(fname)
    self.refs = {slug(s.name): s for s in DBSession.query(Source) if s.name}
    self.examples = defaultdict(list)
    for row in DBSession.query(Sentence):
        if row.description:
            self.examples[slug(row.description.split('OR:')[0])].append(
                (row.name, row.id))
    # turn each list of (name, id) pairs into a dict keyed by slugified name
    for key in self.examples.keys():
        self.examples[key] = {slug(name): id_ for name, id_ in self.examples[key]}

def dataset_detail_html(context=None, request=None, **kw):
    def vnum(*ids):
        return DBSession.query(models.Variety).join(models.VarietyType)\
            .filter(models.VarietyType.id.in_(ids)).count()

    stats = {
        'vl': vnum('L1t', 'L1c', 'L2'),
        'vpc': vnum('P', 'Cr'),
        'features': DBSession.query(models.Feature).count(),
        'informants': DBSession.query(common.Contributor)
        .filter(common.Contributor.contribution_assocs.any()).count(),
    }
    return {
        'stats': stats,
        'citation': get_adapter(IRepresentation, context, request, ext='md.txt')}

def test_CustomModelMixin_polymorphic(self):
    from clld.tests.fixtures import CustomLanguage

    lang = Language(id='def', name='Name')
    clang = CustomLanguage(id='abc', name='Name', custom='c')
    DBSession.add_all([lang, clang])
    DBSession.flush()
    DBSession.expunge_all()
    lang = DBSession.query(Language).filter_by(id='def').one()
    clang = DBSession.query(Language).filter_by(id='abc').one()

    self.assertEqual(lang.polymorphic_type, 'base')
    self.assertEqual(clang.polymorphic_type, 'custom')
    self.assertIs(type(lang), Language)
    self.assertIs(type(clang), CustomLanguage)

def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    stats = Counter()
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_names = glottolog_names()
        for l in DBSession.query(Languoid).options(joinedload_all(
                Language.languageidentifier, LanguageIdentifier.identifier)):
            stats.update(create_name(gl_names, l))
    args.log.info('%s' % stats)

def test_set_glottocode(self):
    from clld.db.migration import Connection

    c = Connection(DBSession)
    lpk = c.insert(common.Language, id='l', name='Language')
    c.set_glottocode('l', 'abcd1234')
    c.set_glottocode('l', 'abcd1234')
    l = DBSession.query(common.Language).get(lpk)
    assert l.glottocode == 'abcd1234'

    c.set_glottocode('l', 'dcba1234')
    DBSession.expire_all()
    l = DBSession.query(common.Language).get(lpk)
    assert l.glottocode == 'dcba1234'
    c.set_glottocode('l', 'abcd1234')

def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    entries = {e.name.lower(): e for e in DBSession.query(models.Entry)}
    hit, miss = [], []

    def lemma_repl(match):
        label = match.group('lemma').strip()
        if label.endswith(','):
            label = label[:-1].strip()
        lookup = re.sub(r'\s+', ' ', label.lower())
        if lookup in entries:
            label = "**%s**" % entries[lookup].id
            hit.append(label)
        elif match.group('cf'):
            print(" '%s'" % label)
            miss.append(label)
            label = "‘%s’" % label
        if match.group('cf'):
            label = 'Cf. %s' % label
        return label

    lemma_pattern = re.compile(r"(?P<cf>Cf\.\s*)?‘(?P<lemma>[^’]+)’", re.MULTILINE)

    def language_repl(m):
        return '**%s**' % m.group('id')

    language_pattern = re.compile(
        '(?P<id>%s)' % '|'.join(k.upper() for k in LANGUAGES.keys()))

    for entry in entries.values():
        if entry.description:
            #print('\\lx %s' % entry.name)
            entry.description = lemma_pattern.sub(lemma_repl, entry.description)
            entry.description = language_pattern.sub(language_repl, entry.description)

    print('hits:', len(hit))
    print('miss:', len(miss))

    def level(l):
        _level = 0
        while l.parent:
            _level += 1
            l = l.parent
        return _level

    for lang in DBSession.query(models.Languoid):
        lang.level = level(lang)

def get_values(self, p, language_url_pattern):
    q = DBSession.query(Value).join(Value.valueset)\
        .filter(ValueSet.parameter_pk == p.pk)\
        .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(
                Value.valueset, ValueSet.references, ValueSetReference.source)
        ).order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

    with UnicodeWriter() as writer:
        writer.writerow([
            'ID',
            'Language_ID',
            'Parameter_ID',
            'Contribution_ID',
            'Value',
            'Source',
            'Comment',
        ])
        for v in page_query(q):
            writer.writerow([
                v.id,
                language_url_pattern.format(v.valueset.language.id),
                p.id,
                v.valueset.contribution.id,
                v.domainelement.name if v.domainelement else v.name,
                ';'.join(self.format_sources(v)),
                getattr(v, 'comment', v.valueset.source) or '',
            ])
    return writer.read()

def test_FamilyCol(self):
    from clld_glottologfamily_plugin.datatables import FamilyCol

    col = FamilyCol(MagicMock(), 'family', language_cls=LanguageWithFamily)
    q = DBSession.query(LanguageWithFamily).outerjoin(Family)
    assert q.filter(col.search('isolate')).all()
    assert q.filter(col.search('f')).order_by(col.order()).all()

def feature_iterator(self, ctx, req):
    return DBSession.query(Value).join(DomainElement)\
        .filter(DomainElement.id == req.params.get('domainelement'))\
        .options(
            joinedload(Value.valueset).joinedload(ValueSet.language),
            joinedload(Value.domainelement),
        )

def __init__(self, model, taxon_col, *args, **kw):
    self.taxon_col = taxon_col
    kw['choices'] = [
        (o.id, '%s %s' % (o.id, o.name))
        for o in DBSession.query(model).filter(
            model.active == true()).order_by(model.id)]
    kw['model_col'] = getattr(Taxon, self.taxon_col)
    Col.__init__(self, *args, **kw)

def format_comment(req, comment):
    """
    We collect source ids found in comment, retrieve the corresponding source
    objects from the database in a single query and then replace the ids with
    formatted source links.
    """
    parts = []
    sources = {}
    pos = 0
    comment = comment.replace('~', ' ')
    for match in Reference.pattern.finditer(comment):
        preceding = comment[pos:match.start()]
        parts.append(preceding)
        parts.append(match.group('key'))
        sources[match.group('key')] = None
        if match.group('pages'):
            parts.append(': {0}'.format(match.group('pages')))
        pos = match.end()
    parts.append(comment[pos:])

    for rp in DBSession.query(Refprovider).filter(
            Refprovider.id.in_(sources.keys())):
        sources[rp.id] = rp.ref

    return ' '.join(
        link(req, sources[p]) if sources.get(p) else p for p in parts
    ).replace(' : ', ': ')

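# A self-contained sketch of the tokenize-then-batch-lookup pattern used in
# format_comment above: one regex pass splits the text into literal chunks
# and reference keys, then a single lookup (here simulated by a dict instead
# of a database query) resolves all keys at once rather than one query per
# reference. The key syntax and the fake resolver are illustrative.
import re

PATTERN = re.compile(r'\*\*(?P<key>\w+)\*\*')
LOOKUP = {'smith2001': '<a href="/smith2001">Smith 2001</a>'}


def format_comment(comment):
    parts, keys, pos = [], set(), 0
    for match in PATTERN.finditer(comment):
        parts.append(comment[pos:match.start()])
        parts.append(match.group('key'))
        keys.add(match.group('key'))
        pos = match.end()
    parts.append(comment[pos:])
    # one batched lookup for all collected keys:
    resolved = {k: LOOKUP.get(k) for k in keys}
    return ''.join(resolved.get(p) or p for p in parts)


print(format_comment('See **smith2001** and **unknown42**.'))
# -> 'See <a href="/smith2001">Smith 2001</a> and unknown42.'
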
def query(self, model):
    q = DBSession.query(model).order_by(model.pk)
    if model == Contribution:
        return q.options(
            joinedload(Contribution.contributor_assocs).joinedload(
                ContributionContributor.contributor))
    if model == Sentence:
        return q.options(joinedload(Sentence.language))
    if model == DomainElement:
        return q.order_by(None).order_by(
            model.parameter_pk, model.number, model.pk)
    if model == Value:
        return q.join(ValueSet)\
            .order_by(None)\
            .order_by(
                ValueSet.parameter_pk,
                ValueSet.language_pk,
                ValueSet.contribution_pk,
                Value.pk)\
            .options(
                joinedload(Value.valueset).joinedload(ValueSet.references),
                joinedload(Value.domainelement))
    return q

def __init__(self, dt, name, **kw):
    kw['choices'] = [
        r[0] for r in DBSession.query(ImageData.value).filter(
            ImageData.key == 'permission').order_by(ImageData.value).distinct()
    ]
    Col.__init__(self, dt, name, **kw)

def language_query(req=None):
    query = DBSession.query(common.Language) \
        .filter(common.Language.active == True) \
        .filter(common.Language.latitude != None) \
        .filter(Languoid.level == LanguoidLevel.language) \
        .filter(Languoid.category.in_(CATEGORIES))
    if req:
        macroarea = req.params.get('macroarea')
        if macroarea:
            query = query.filter(Languoid.macroareas.contains(macroarea))
        families = [f for f in req.params.get('family', '').split(',') if f]
        if families:
            family = aliased(Languoid)
            query = query.join(family, Languoid.family_pk == family.pk)\
                .filter(family.id.in_(families))
        countries = []
        for c in req.params.getall('country'):
            countries.extend(c.split())
        if countries:
            query = query\
                .join(common.ValueSet)\
                .join(common.Parameter)\
                .join(common.Value)\
                .join(common.DomainElement)\
                .filter(common.Parameter.id == 'country')\
                .filter(common.DomainElement.name.in_(countries))
    return query

def get(cls, id_, **kw):
    params = []
    for pid in set(id_.split(cls.delimiter)):
        params.append(
            DBSession.query(Parameter).filter(Parameter.id == pid).options(
                joinedload_all(Parameter.domain)).one())
    return cls(*params)

def base_query(self, query):
    query = DBSession.query(self.model)\
        .join(ValueSet)\
        .options(joinedload_all(
            Value.valueset, ValueSet.references, ValueSetReference.source
        )).distinct()

    if not self.parameter:
        query = query.join(ValueSet.parameter).filter(Parameter.id != '0')
        if self.ftype:
            query = query.filter(Feature.feature_type == self.ftype)

    if self.language:
        return query\
            .options(joinedload(Value.domainelement))\
            .filter(ValueSet.language_pk.in_(
                [l.pk for l in [self.language] + self.language.lects]))

    if self.parameter:
        query = query.join(ValueSet.contribution)\
            .join(self.vs_lang, ValueSet.language_pk == self.vs_lang.pk)\
            .join(self.vs_lect, ValueSet.language_pk == self.vs_lect.pk)\
            .join(DomainElement)\
            .options(joinedload(Value.domainelement))
        return query.filter(ValueSet.parameter_pk == self.parameter.pk)

    return query

def sentence_index_html(context=None, request=None, **kw):
    if request.params.get('language'):
        lang = common.Language.get(request.params['language'])
        return dict(
            lang=lang,
            sentences=DBSession.query(common.Sentence).filter(
                common.Sentence.language_pk == lang.pk).all())
    return dict(lang=None, sentences=[])

def get_layers(self):
    for panel in DBSession.query(Panel).options(joinedload(Panel.samples)):
        yield Layer(
            panel.id,
            panel.name,
            PanelSamples(None).render(panel, self.req, dump=False),
        )

def segments(language):
    """
    :return: dict mapping segment numbers to tuples \
    (label, symbol, segment class, parameter, exists, valueset)
    """
    valuesets = {
        v.parameter.id: v for v in language.valuesets
        if v.parameter.feature_type == 'segment' and v.values}
    domainelements = {pid: v.values[0].domainelement for pid, v in valuesets.items()}
    return {
        sm.jsondata['number']: (
            '%s - %s' % (
                sm.name,
                domainelements[sm.id].name
                if sm.id in domainelements else 'Does not exist'),
            sm.jsondata['symbol'],
            SEGMENT_VALUES[domainelements[sm.id].number][2]
            if sm.id in domainelements else 'segment inexistent',
            sm,
            sm.id in domainelements and domainelements[sm.id].number != 4,
            valuesets.get(sm.id))
        for sm in DBSession.query(Parameter).filter(
            Feature.feature_type == 'segment')}

def language_index_html(request=None, **kw):
    res = dict(
        countries=dumps([
            '%s (%s)' % (c.name, c.id) for c in
            DBSession.query(Country).order_by(Country.description)]),
        params={
            'name': '',
            'iso': '',
            'namequerytype': 'part',
            'country': ''},
        message=None)

    for param, default in res['params'].items():
        res['params'][param] = request.params.get(param, default).strip()
    res['params']['multilingual'] = 'multilingual' in request.params

    if request.params.get('alnum'):
        l = Languoid.get(request.params.get('alnum'), default=None)
        if l:
            raise HTTPFound(location=request.resource_url(l))
        res['message'] = 'No matching languoids found'

    languoids = list(getLanguoids(**res['params']))
    if not languoids and \
            (res['params']['name'] or res['params']['iso'] or res['params']['country']):
        res['message'] = 'No matching languoids found'
    map_ = LanguoidsMap(languoids, request)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None
    res.update(map=map_, languoids=languoids)
    return res

def browser(req):
    ms = MultiSelect(
        req, 'families', 'msfamily',
        collection=family_query(req), selected=_get_families(req))

    focus = req.params.get('focus', 'ed')
    if focus == 'sdt':
        colors, shapes = SIMPLIFIED_DOCTYPES, ENDANGERMENTS
    else:
        shapes, colors = SIMPLIFIED_DOCTYPES, ENDANGERMENTS
    icon_map = {}
    for shape in [o.shape for o in shapes]:
        for color in [o.color for o in colors] + ['ffffff']:
            spec = shape + color
            icon_map[spec] = req.static_url('clld:web/static/icons/%s.png' % spec)

    return {
        'families': ms,
        'macroareas': DBSession.query(Macroarea).all(),
        'map': DescStatsMap(language_query(req), req, icon_map, focus),
        'icon_map': icon_map,
        'focus': focus,
        'doctypes': SIMPLIFIED_DOCTYPES,
        'endangerments': ENDANGERMENTS,
    }

def geo(args):
    with_session(args)
    fname = args.pkg_dir.joinpath(
        'static', 'download', 'languages-and-dialects-geo.csv')
    with transaction.manager, UnicodeWriter(fname) as writer:
        writer.writerow([
            'glottocode', 'name', 'isocodes', 'level', 'macroarea',
            'latitude', 'longitude'])
        for l in DBSession.query(models.Languoid)\
                .filter(or_(
                    models.Languoid.level == models.LanguoidLevel.dialect,
                    models.Languoid.level == models.LanguoidLevel.language))\
                .options(
                    joinedload(models.Languoid.macroareas),
                    joinedload_all(
                        common.Language.languageidentifier,
                        common.LanguageIdentifier.identifier))\
                .order_by(common.Language.name):
            writer.writerow([
                l.id,
                l.name,
                ' '.join(
                    i.name
                    for i in l.get_identifier_objs(common.IdentifierType.iso)),
                l.level,
                l.macroareas[0].name if l.macroareas else '',
                l.latitude if l.latitude is not None else '',
                l.longitude if l.longitude is not None else ''])

    args.log.info('{0} written'.format(fname))

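# UnicodeWriter above comes from clldutils; a minimal stand-in using the
# stdlib csv module, writing the same header and one illustrative row, to
# show the shape of the emitted file. The row values are made up.
import csv

with open('languages-and-dialects-geo.csv', 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow([
        'glottocode', 'name', 'isocodes', 'level', 'macroarea',
        'latitude', 'longitude'])
    # empty strings stand in for missing coordinates, as in geo() above
    writer.writerow(['abcd1234', 'Example', 'abc', 'language', 'Eurasia', '', ''])
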
def test_CustomModelMixin_polymorphic(db, custom_language):
    lang = Language(id='def', name='Name')
    assert repr(lang).startswith("<Language ")
    assert is_base(Language)
    assert not is_base(custom_language)

    clang = custom_language(id='abc', name='Name', custom='c')
    DBSession.add_all([lang, clang])
    DBSession.flush()
    DBSession.expunge_all()
    lang = DBSession.query(Language).filter_by(id='def').one()
    clang = DBSession.query(Language).filter_by(id='abc').one()

    assert lang.polymorphic_type == 'base'
    assert clang.polymorphic_type == 'custom'
    assert type(lang) is Language
    assert type(clang) is custom_language

def langdoccomplexquery(request):
    res = {
        'dt': None,
        'doctypes': DBSession.query(Doctype).order_by(Doctype.id),
        'macroareas': get_parameter('macroarea').domain,
        'ms': {}
    }

    for name, cls, kw in [
        ('languoids', LanguoidsMultiSelect, dict(
            url=request.route_url('glottolog.childnodes'))),
        ('macroareas', MultiSelect, dict(collection=res['macroareas'])),
        ('doctypes', MultiSelect, dict(collection=res['doctypes'])),
    ]:
        res['ms'][name] = cls(request, name, 'ms' + name, **kw)

    res['params'], reqparams = get_params(request.params, **res)
    res['refs'] = getRefs(res['params'])

    if res['refs']:
        res['dt'] = Refs(request, Source, cq=1, **reqparams)

    fmt = request.params.get('format')
    if fmt:
        db = bibtex.Database([ref.bibtex() for ref in res['refs']])
        for name, adapter in request.registry.getAdapters([db], IRepresentation):
            if name == fmt:
                return adapter.render_to_response(db, request)
        return HTTPNotAcceptable()

    return res

def test_crud(self):
    from clld.db.migration import Connection

    migration = Connection(DBSession)

    assert len(list(migration.select(common.Identifier))) == 0
    pk = migration.insert(
        common.Identifier,
        id='iso-csw', name='csw', type=common.IdentifierType.iso.value)
    assert migration.pk(common.Identifier, 'iso-csw') == pk
    assert len(list(migration.select(common.Identifier))) == 1

    identifier = DBSession.query(common.Identifier).get(pk)
    assert identifier.active
    assert identifier.version == 1
    assert identifier.created
    assert identifier.updated

    migration.update(common.Identifier, [('name', 'cea')], pk=pk)
    DBSession.refresh(identifier)
    assert identifier.name == 'cea'

    migration.delete(common.Identifier, pk=pk)
    self.assertRaises(InvalidRequestError, DBSession.refresh, identifier)

def _query(req, rsc):
    """Ordered sqlalchemy query.

    We must make sure each query is ordered, so that limit and offset
    make sense.
    """
    return DBSession.query(rsc.model.id, rsc.model.updated).order_by(rsc.model.pk)

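# Why the ordering matters: limit/offset paging over an unordered query can
# repeat or skip rows between pages, because the database is free to return
# rows in any order. A minimal sketch of paging over an ordered query, in the
# spirit of the page_query helper used elsewhere in this code; `q` is assumed
# to be an ordered SQLAlchemy query like the one returned by _query above.
def page_query(q, size=1000):
    """Yield all rows of an ordered query in chunks of `size`."""
    offset = 0
    while True:
        chunk = q.limit(size).offset(offset).all()
        if not chunk:
            break
        for row in chunk:
            yield row
        offset += size
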
def format_comment(req, comment):
    """
    We collect source ids found in comment, retrieve the corresponding source
    objects from the database in a single query and then replace the ids with
    formatted source links.
    """
    parts = []
    sources = {}
    pos = 0
    comment = comment.replace('~', ' ')
    for match in REF_PATTERN.finditer(comment):
        preceding = comment[pos:match.start()]
        parts.append(preceding)
        add_braces = (
            preceding.strip().split() or ['aaa']
        )[-1] not in ['in', 'of', 'per', 'by']
        if add_braces:
            parts.append('(')
        parts.append(match.group('id'))
        sources[match.group('id')] = None
        if add_braces:
            parts.append(')')
        pos = match.end()
    parts.append(comment[pos:])

    for source in DBSession.query(Source).filter(Source.id.in_(sources.keys())):
        sources[source.id] = source

    return HTML.p(*[link(req, sources[p]) if p in sources else p for p in parts])

def contribution_detail_html(context=None, request=None, **kw):
    langs = (
        DBSession.query(Language)
        .filter(Language.pk.in_(context.jsondata["language_pks"]))
        .options(joinedload(LexibankLanguage.family))
    )
    return {"map": SelectedLanguagesMap(context, request, list(langs))}

def test_crud(db):
    migration = Connection(DBSession)

    assert len(list(migration.select(common.Identifier))) == 0
    pk = migration.insert(
        common.Identifier,
        id='iso-csw', name='csw', type=common.IdentifierType.iso.value)
    assert migration.pk(common.Identifier, 'iso-csw') == pk
    assert len(list(migration.select(common.Identifier))) == 1

    identifier = DBSession.query(common.Identifier)\
        .options(undefer('*')).get(pk)
    assert identifier.active
    assert identifier.version == 1
    assert identifier.created
    assert identifier.updated

    migration.update(common.Identifier, [('name', 'cea')], pk=pk)
    DBSession.refresh(identifier)
    assert identifier.name == 'cea'

    migration.delete(common.Identifier, pk=pk)
    with pytest.raises(InvalidRequestError):
        DBSession.refresh(identifier)

def childnodes(request):
    if request.params.get('t') == 'select2':
        query = DBSession.query(Languoid.id, Languoid.name, Languoid.level)\
            .filter(icontains(Languoid.name, request.params.get('q')))
        total = query.count()
        ms = LanguoidsMultiSelect(request, None, None, url='x')
        return dict(
            results=[ms.format_result(l) for l in query.limit(100)],
            context={},
            more=total > 500)

    query = DBSession.query(
        Languoid.pk,
        Languoid.id,
        Languoid.name,
        Languoid.level,
        func.count(TreeClosureTable.child_pk).label('children'))\
        .filter(Language.pk == TreeClosureTable.parent_pk)\
        .filter(Language.active == true())

    if request.params.get('node'):
        query = query.filter(Languoid.father_pk == int(request.params['node']))
    else:
        # narrow down selection of top-level nodes in the tree:
        query = query.filter(Languoid.father_pk == null())
        if request.params.get('q'):
            query = query.filter(Language.name.contains(request.params.get('q')))

    query = query.group_by(
        Languoid.pk, Languoid.id, Languoid.name, Languoid.level)\
        .order_by(Language.name)

    return [
        {
            'label': ('%s (%s)' % (l.name, l.children - 1))
                     if l.children > 1 else l.name,
            'glottocode': l.id,
            'lname': l.name,
            'id': l.pk,
            'level': l.level.value,
            #'children': l.children
            'load_on_demand': l.children > 1,
        }
        for l in query]

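# A self-contained sketch of the closure-table idea behind TreeClosureTable
# above: storing one (ancestor, descendant) row per pair, with each node also
# paired with itself, makes "count all descendants" a flat lookup with no
# recursion. The glottocodes and edges below are illustrative; the count per
# parent corresponds to the `children` label in the query (it includes the
# node itself, hence the `l.children - 1` in the output above).
from collections import defaultdict

edges = {'indo1319': ['germ1287', 'ital1284'], 'germ1287': ['stan1295']}
nodes = ['indo1319', 'germ1287', 'ital1284', 'stan1295']


def closure(node):
    # yield (ancestor, descendant) pairs rooted at `node`, self included
    yield node, node
    for child in edges.get(node, []):
        for _, desc in closure(child):
            yield node, desc


rows = [pair for node in nodes for pair in closure(node)]
counts = defaultdict(int)
for parent, _ in rows:
    counts[parent] += 1
print(counts['indo1319'])  # -> 4 (the node itself plus three descendants)
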
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    q = DBSession.query(common.Parameter).join(common.ValueSet).distinct()
    n = q.count()
    m = DBSession.query(models.Inventory).count()
    for segment in q:
        #
        # TODO: this ratio (number of inventories a segment appears in by the
        # total number of distinct segments) doesn't make much sense, does it?
        #
        segment.frequency = float(len(segment.valuesets)) / float(n)
        segment.in_inventories = len(segment.valuesets)
        segment.total_inventories = m

    for inventory in DBSession.query(models.Inventory).options(
            joinedload_all(
                common.Contribution.valuesets, common.ValueSet.parameter)):
        if '(UPSID)' not in inventory.name:
            inventory.count_tone = 0

        for vs in inventory.valuesets:
            attr = 'count_' + vs.parameter.segment_class
            if hasattr(inventory, attr):
                val = getattr(inventory, attr) or 0
                setattr(inventory, attr, val + 1)

    ficons = cycle(ORDERED_ICONS)
    gicons = cycle(ORDERED_ICONS)
    for root, genus in groupby(
            DBSession.query(models.Genus).order_by(models.Genus.description),
            lambda g: g.description):
        ficon = next(ficons).name
        for g in genus:
            g.ficon = ficon
            g.gicon = next(gicons).name

    for variety in DBSession.query(models.Variety).options(
            joinedload(models.Variety.inventories)):
        variety.count_inventories = len(variety.inventories)

    if 0:
        ia_func('update', args)
        gbs_func('update', args)
        print('added', add_wikipedia_urls(args), 'wikipedia urls')

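# A self-contained sketch of the icon-assignment pattern above: one icon per
# group (via the outer cycle) and one per member (via the inner cycle), with
# itertools cycling through a fixed palette. The icon names and genus names
# are illustrative.
from itertools import cycle, groupby

ORDERED_ICONS = ['c0000dd', 'cdd0000', 'c00dd00']
genera = sorted(['Slavic', 'Romance', 'Slavic', 'Germanic'])

ficons, gicons = cycle(ORDERED_ICONS), cycle(ORDERED_ICONS)
for family, group in groupby(genera):
    ficon = next(ficons)  # shared by every member of the group
    for g in group:
        print(g, ficon, next(gicons))  # each member also gets its own gicon
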
def languages(request):
    if request.params.get('search'):
        return quicksearch(request)

    res = dict(
        countries=json.dumps([
            '%s (%s)' % (c.name, c.id) for c in
            DBSession.query(Country).order_by(Country.description)]),
        params={
            'name': '', 'iso': '', 'namequerytype': 'part', 'country': ''},
        message=None)

    for param, default in res['params'].items():
        res['params'][param] = request.params.get(param, default).strip()

    if res['params']['country']:
        country = res['params']['country']
        try:
            alpha2 = country.split('(')[1].split(')')[0] \
                if len(country) > 2 else country.upper()
            raise HTTPFound(location=request.route_url(
                'languages_alt', ext='map.html', _query=dict(country=alpha2)))
        except IndexError:
            pass

    res['params']['multilingual'] = 'multilingual' in request.params

    if request.params.get('alnum'):
        l = Languoid.get(request.params.get('alnum'), default=None)
        if l:
            raise HTTPFound(location=request.resource_url(l))
        res['message'] = 'No matching languoids found'

    if (res['params']['iso'] and len(res['params']['iso']) < 2) or (
            res['params']['name']
            and len(res['params']['name']) < 2
            and res['params']['namequerytype'] == 'part'):
        res.update(
            message='Please enter at least two characters to search',
            map=None,
            languoids=[])
        return res

    languoids = list(getLanguoids(**res['params']))
    if not languoids and \
            (res['params']['name'] or res['params']['iso'] or res['params']['country']):
        res['message'] = 'No matching languoids found'
    #if len(languoids) == 1:
    #    raise HTTPFound(request.resource_url(languoids[0]))

    map_, icon_map, family_map = get_selected_languages_map(request, languoids)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None
    res.update(map=map_, languoids=languoids)
    return res

def count(ppk):
    return DBSession.query(common.DomainElement.pk, func.count(common.Value.pk))\
        .join(common.Value)\
        .join(common.ValueSet)\
        .join(common.Language)\
        .filter(Languoid.category.in_(CATEGORIES))\
        .filter(common.DomainElement.parameter_pk == ppk)\
        .group_by(common.DomainElement.pk)

def get_x_data(model_name=None, context=None, request=None, **kw):
    m = getattr(amsd.models, model_name)
    x_m = getattr(amsd.models, 'x_%s' % model_name)
    q = [
        n for n, in DBSession.query(m.name).join(x_m).filter(
            x_m.object_pk == context.pk).order_by(m.name)
    ]
    return ', '.join(q)

def feature_iterator(self, ctx, req):
    q = DBSession.query(ValueSet).join(Value)\
        .filter(ValueSet.parameter_pk == ctx.pk)\
        .options(joinedload(ValueSet.values), joinedload(ValueSet.language))
    de = req.params.get('domainelement')
    if de:
        return [
            vs for vs in ctx.valuesets
            if vs.values and vs.values[0].domainelement.id == de]
    return q

def dataset_detail_html(context=None, request=None, **kw):
    res = {}
    #res = dict(
    #    (row[0], row[1]) for row in
    #    DBSession.execute("select source, count(pk) from inventory group by source"))
    res['inventory_count'] = DBSession.query(Inventory).count()
    res['segment_count'] = DBSession.query(Parameter).count()
    res['language_count'] = DBSession.query(Language).count()
    res['contributors'] = DBSession.query(Contributor).order_by(Contributor.name)\
        .options(
            joinedload(Contributor.contribution_assocs),
            joinedload(Contributor.references)).all()
    res['sources'] = {
        k: Source.get(k) for k in [
            'MoisikEsling2011', 'IPA2005', 'Hayes2009', 'Moran2012a',
            'Moran_etal2012', 'Cysouw_etal2012', 'mccloy_etal2013']}
    res['descriptions'] = {
        c.id: desc(request, c.description, res['sources'])
        for c in res['contributors']}
    return res

def resourcemap(req):
    res = {'properties': {'dataset': req.dataset.id}, 'resources': []}
    rsc = req.params.get('rsc')
    if rsc:
        res['properties']['uri_template'] = get_url_template(req, rsc, relative=False)

    if rsc == 'language':
        q = DBSession.query(
            common.Language.id,
            common.Language.name,
            common.Language.latitude,
            common.Language.longitude,
            common.Identifier.type,
            common.Identifier.name)\
            .join(common.Language.languageidentifier)\
            .join(common.LanguageIdentifier.identifier)\
            .filter(common.Language.active == True)\
            .filter(common.Identifier.type != 'name')\
            .order_by(common.Language.id)
        for lang, codes in groupby(q, lambda r: (r[0], r[1], r[2], r[3])):
            res['resources'].append({
                'id': lang[0],
                'name': lang[1],
                'latitude': lang[2],
                'longitude': lang[3],
                'identifiers': [
                    {
                        'type': c.type,
                        'identifier': c.name.lower()
                        if c.type.startswith('WALS') else c.name,
                    } for c in codes],
            })
        return res

    if rsc == 'parameter':
        for id, name in DBSession.query(
            common.Parameter.id,
            common.Parameter.name,
        ).order_by(common.Parameter.pk):
            res['resources'].append({'id': id, 'name': name})
        return res

    return HTTPNotFound()

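# A self-contained sketch of the nesting step above: rows from a joined query
# arrive flat (one row per language/identifier pair, ordered by language),
# and itertools.groupby folds them back into one record per language with a
# list of identifiers. The rows below are illustrative.
from itertools import groupby

rows = [
    ('abkh1244', 'Abkhaz', 'iso639-3', 'abk'),
    ('abkh1244', 'Abkhaz', 'WALS', 'ABK'),
    ('adyg1241', 'Adyghe', 'iso639-3', 'ady'),
]

resources = []
for (lid, name), codes in groupby(rows, lambda r: (r[0], r[1])):
    resources.append({
        'id': lid,
        'name': name,
        'identifiers': [{'type': t, 'identifier': n} for _, _, t, n in codes],
    })
print(resources[0]['identifiers'])  # both Abkhaz codes grouped together
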
def get_stats(cls):
    return {
        row[0]: row[1] for row in
        DBSession.query(Provider.pk, func.count(cls.ref_pk).label('c'))
        .filter(Provider.pk == cls.provider_pk)
        .group_by(Provider.pk)
        .order_by(desc('c'))
        .all()}
