def canonicalize(cls):
    """Write canonical UIDs for the current project's entities and links.

    Every ``Entity`` and ``Link`` row of the project is first reset so its
    canonical column(s) equal its own uid column(s). Then, for each cluster
    returned by ``cls.generate_clusters()``, the largest uid of the cluster
    (``max(uids)``) is written as canonical uid onto all member entities
    and onto links whose source or target uid is a member.
    """
    def project_entities():
        # Entities restricted to the current project.
        return session.query(Entity).filter(Entity.project == project.name)

    def project_links():
        # Links restricted to the current project.
        return session.query(Link).filter(Link.project == project.name)

    # Phase 1: reset every row to be its own canonical representative.
    project_entities().update(
        {Entity.canonical_uid: Entity.uid},
        synchronize_session='fetch')
    all_links = project_links()
    all_links.update(
        {Link.source_canonical_uid: Link.source_uid},
        synchronize_session='fetch')
    all_links.update(
        {Link.target_canonical_uid: Link.target_uid},
        synchronize_session='fetch')

    clusters = cls.generate_clusters()
    project.log.info("Canonicalize: %d clusters", len(clusters))

    # Phase 2: collapse each cluster onto its largest uid.
    for members in clusters:
        representative = max(members)

        member_entities = project_entities().filter(Entity.uid.in_(members))
        member_entities.update(
            {Entity.canonical_uid: representative},
            synchronize_session='fetch')

        outgoing = project_links().filter(Link.source_uid.in_(members))
        outgoing.update(
            {Link.source_canonical_uid: representative},
            synchronize_session='fetch')

        incoming = project_links().filter(Link.target_uid.in_(members))
        incoming.update(
            {Link.target_canonical_uid: representative},
            synchronize_session='fetch')
def find_judgements(cls, judgement):
    """Yield the UID pairs of mappings that carry the given judgement.

    Only mappings of the current project are considered. Each
    ``(left_uid, right_uid)`` row is passed through ``cls.sort_uids``
    before being yielded.
    """
    rows = (
        session.query(cls.left_uid, cls.right_uid)
        .filter(cls.project == project.name)
        .filter(cls.judgement == judgement)
    )
    for left, right in rows:
        yield cls.sort_uids(left, right)
def cleanup(cls):
    """Delete the current project's generated mappings that are undecided.

    Rows are removed only when ``decided`` is false *and* ``generated`` is
    true; everything else is left untouched.
    """
    stale = session.query(cls).filter(
        cls.project == project.name,
        cls.decided == False,  # noqa
        cls.generated == True,  # noqa
    )
    stale.delete(synchronize_session='fetch')
def find_by_result(cls, query_uid=None, match_uid=None):
    """Return a query over the current project's rows, optionally by UIDs.

    The ``query_uid`` / ``match_uid`` filters are applied only when *both*
    arguments are given; if either one is ``None`` the query is restricted
    by project alone.
    """
    results = session.query(cls).filter(cls.project == project.name)
    if query_uid is None or match_uid is None:
        return results
    results = results.filter(cls.query_uid == query_uid)
    return results.filter(cls.match_uid == match_uid)
def get(cls, uida, uidb):
    """Load a mapping by its end points.

    The two UIDs are ordered with ``cls.sort_uids`` before the lookup.
    Returns the first matching row of the current project, or whatever
    ``Query.first()`` yields when there is none.
    """
    lower, upper = cls.sort_uids(uida, uidb)
    lookup = (
        session.query(cls)
        .filter(cls.project == project.name)
        .filter(cls.left_uid == lower)
        .filter(cls.right_uid == upper)
    )
    return lookup.first()
def update(self, normalized, latitude, longitude):
    """Store normalized text and coordinates for this address' slug.

    Issues a bulk UPDATE against all ``Address`` rows of the current
    project whose ``slug`` equals ``self.slug``.
    """
    new_values = {
        Address.normalized: normalized,
        Address.latitude: latitude,
        Address.longitude: longitude,
    }
    same_slug = (
        session.query(Address)
        .filter(Address.project == project.name)
        .filter(Address.slug == self.slug)
    )
    same_slug.update(new_values, synchronize_session='fetch')
def entities():
    """Render a paginated list of the current project's active entities.

    Query-string parameters:
        q: optional search text. When non-empty, only entities whose
            ``data['name']`` matches ``%<q>%`` with ``ILIKE`` are listed.
        offset: zero-based index of the first row to show. Missing,
            non-integer or negative values are treated as 0.

    Returns the rendered ``entities.html`` template with the page of
    entities plus paging context (``total``, ``has_prev``, ``has_next``,
    ``next``, ``prev``, ``text_query``).
    """
    text_query = request.args.get('q', '').strip()

    # A malformed offset should show the first page instead of raising,
    # and a negative one must never reach the SQL OFFSET clause.
    try:
        offset = max(0, int(request.args.get('offset', '0')))
    except ValueError:
        offset = 0
    limit = 50

    q = session.query(Entity)
    q = q.filter(Entity.project == project.name)
    q = q.filter(Entity.active == True)  # noqa
    if text_query:
        # NOTE(review): '%' and '_' typed by the user act as wildcards here.
        q = q.filter(Entity.data['name'].astext.ilike('%' + text_query + '%'))

    total = q.count()
    context = {
        'total': total,
        'has_prev': offset > 0,
        # Strictly greater: when total == offset + limit the current page
        # is the last one and a "next" page would be empty.
        'has_next': total > (offset + limit),
        'next': offset + limit,
        'prev': max(0, offset - limit),
        'text_query': text_query,
    }
    q = q.offset(offset).limit(limit)
    return render_template('entities.html', entities=q, **context)
def entity(uid):
    """Render the detail page for one entity and its mappings.

    Loads the entity through ``Entity.get(uid)`` and collects the current
    project's mappings that reference it on either side, ordered by
    descending score and split into an undecided and a decided section.
    """
    entity = Entity.get(uid)

    references_entity = or_(Mapping.left_uid == entity.uid,
                            Mapping.right_uid == entity.uid)
    mappings = (
        session.query(Mapping)
        .filter(Mapping.project == project.name)
        .filter(references_entity)
        .order_by(Mapping.score.desc())
    )

    decisions = Mapping.get_decisions()
    sections = (
        ('Undecided', mappings.filter(Mapping.decided == False)),  # noqa
        ('Decided', mappings.filter(Mapping.decided == True)),  # noqa
    )
    return render_template('entity.html', entity=entity,
                           sections=sections, decisions=decisions)
def review_entity_get(offset=None):
    """Jump to the next entity that needs disambiguation.

    For every uid appearing on the left or the right side of an undecided
    mapping of the current project, the mapping scores are summed. Active
    entities are then ranked by ``Entity.tasked`` (descending), by that
    total score (descending) and finally at random, and the view redirects
    to the top-ranked entity. If no candidate exists it redirects to the
    entity listing instead.

    ``offset`` is accepted for interface compatibility but is not used.
    """
    def undecided_scores(uid_column):
        # Per-uid score totals of undecided mappings, keyed on one side.
        side = session.query(uid_column.label('uid'),
                             func.sum(Mapping.score).label('num'))
        side = side.filter(Mapping.project == project.name)
        side = side.filter(Mapping.decided == False)  # noqa
        return side.group_by(uid_column)

    qa = undecided_scores(Mapping.left_uid)
    # The right-hand query must be grouped on its own column; grouping
    # ``qa`` again here would drop all right-side uids from the ranking.
    qb = undecided_scores(Mapping.right_uid)

    # UNION ALL keeps both rows when a uid has the same total on each side;
    # a plain UNION would merge them and under-count the summed score.
    sq = qa.union_all(qb).subquery()

    q = session.query(sq.c.uid, func.sum(sq.c.num))
    # NOTE(review): the join matches on uid only, not on Entity.project -
    # confirm uids are unique across projects.
    q = q.join(Entity, Entity.uid == sq.c.uid)
    q = q.filter(Entity.active == True)  # noqa
    q = q.group_by(sq.c.uid, Entity.tasked)
    q = q.order_by(Entity.tasked.desc())
    q = q.order_by(func.sum(sq.c.num).desc())
    q = q.order_by(func.random())

    # A single LIMIT 1 fetch answers both "is there a candidate?" and
    # "which one?", avoiding a separate COUNT over the same query.
    best = q.first()
    if best is None:
        return redirect(url_for('.entities'))
    return redirect(url_for('.entity', uid=best[0]))
def iter_composite(cls, origins=(), tasked=None):
    """Yield one ``CompositeEntity`` per canonical uid in the project.

    Args:
        origins: optional collection of origin values. When non-empty,
            only canonical uids having at least one active entity with
            one of these origins are selected. Empty or ``None`` means
            no origin restriction.
        tasked: when not ``None``, only canonical uids having at least one
            active entity whose ``tasked`` equals this value are selected.

    For every selected canonical uid, *all* active entities of the project
    sharing it are grouped (not only those that matched the filters above)
    and wrapped in a single ``CompositeEntity``.
    """
    from itertools import groupby

    # Sub-select: the canonical uids that satisfy the optional filters.
    sq = session.query(cls.canonical_uid.distinct())
    sq = sq.filter(cls.project == project.name)
    sq = sq.filter(cls.active == True)  # noqa
    if origins:
        sq = sq.filter(cls.origin.in_(origins))
    if tasked is not None:
        sq = sq.filter(cls.tasked == tasked)

    q = session.query(cls)
    q = q.filter(cls.project == project.name)
    q = q.filter(cls.active == True)  # noqa
    q = q.filter(cls.canonical_uid.in_(sq))
    # groupby only merges adjacent rows, so the ordering is required.
    q = q.order_by(cls.canonical_uid.asc())

    for _, members in groupby(q, key=lambda entity: entity.canonical_uid):
        yield CompositeEntity(list(members))
def find_undecided(cls, limit=10, offset=0):
    """Return candidates for manual matching.

    Walks the current project's mappings that are undecided and have no
    judgement, highest score first, starting at ``offset``. A mapping is
    deleted (via ``mapping.delete()``) and skipped when its
    ``(left_uid, right_uid)`` pair is contained in ``cls.get_decided()``
    or when its ``left`` or ``right`` is ``None``. Collection stops once
    ``limit`` mappings were gathered; the session is committed before the
    list is returned.
    """
    already_decided = cls.get_decided()

    pending = (
        session.query(cls)
        .filter(cls.project == project.name)
        .filter(cls.decided == False)  # noqa
        .filter(cls.judgement == None)  # noqa
        .order_by(cls.score.desc())
        .offset(offset)
    )

    candidates = []
    for candidate in pending.yield_per(limit):
        # Evaluation order matters: the cheap pair lookup comes first so
        # ``left`` / ``right`` are only touched when it does not match.
        stale = (
            (candidate.left_uid, candidate.right_uid) in already_decided
            or candidate.left is None
            or candidate.right is None
        )
        if stale:
            candidate.delete()
        else:
            candidates.append(candidate)
            if len(candidates) == limit:
                break

    session.commit()
    return candidates
def find_by_decision(cls, decided):
    """Return a query over the project's rows with the given decided flag.

    The result is the un-executed query, restricted to the current project
    and to rows whose ``decided`` column equals ``decided``.
    """
    return (
        session.query(cls)
        .filter(cls.project == project.name)
        .filter(cls.decided == decided)  # noqa
    )
def find_by_origins(cls, origins):
    """Return a query over the project's rows, optionally by origin.

    When ``origins`` has at least one element the query is narrowed to
    rows whose ``origin`` is one of them; an empty collection leaves the
    query restricted by project only.
    """
    matches = session.query(cls).filter(cls.project == project.name)
    if len(origins) == 0:
        return matches
    return matches.filter(cls.origin.in_(origins))
def find(cls):
    """Return a query over all rows of this class in the current project."""
    return session.query(cls).filter(cls.project == project.name)