def generate_scored_mappings(cls, origins=None, threshold=.5):
    """Do a cross-product comparison of entities and generate mappings.

    Each active entity is compared against the candidates the search index
    returns for it. Every pair scoring above ``threshold`` is saved as a
    generated mapping with no judgement.

    :param origins: optional collection of origin names. When non-empty,
        only entities whose ``origin`` is in it are used as the left-hand
        side of a comparison. ``None`` or empty means no restriction.
    :param threshold: candidates whose score is less than or equal to this
        value are discarded.
    """
    if origins is None:
        origins = []

    index = EntityIndex()
    index.build()

    # NOTE(review): an empty list is passed here on purpose, independent of
    # the ``origins`` argument -- presumably so candidates of any origin can
    # be resolved below. Confirm against Entity.find_by_origins.
    q = Entity.find_by_origins(origins=[])
    q = q.filter(Entity.active == True)  # noqa
    entities = {e.uid: e for e in q.all()}
    decided = cls.get_decided()

    for entity in entities.values():
        if origins and entity.origin not in origins:
            continue

        # Exclude every uid that already shares a pair with this entity,
        # including pairs saved earlier in this run.
        skip = set()
        for pair in decided:
            if entity.uid in pair:
                skip.update(pair)

        for uid in index.search_similar(entity, skip=skip):
            match = entities.get(uid)
            if match is None:
                # The index returned a uid that is not among the active
                # entities loaded above, so there is nothing to compare.
                continue
            score = entity.compare(match)
            if score <= threshold:
                continue
            project.log.info("Candidate [%.3f]: %s <-> %s",
                             score, entity.name, match.name)
            cls.save(entity.uid, match.uid, judgement=None,
                     score=score, generated=True)
            decided.add((entity.uid, match.uid))
    session.commit()
def emit_entity(self, data):
    """Save ``data`` as an entity for this emitter's origin and commit.

    A ``dict`` copy of ``data`` is passed to ``Entity.save`` together with
    the emitter's ``origin``, ``query_uid`` and ``match_uid``. The saved
    entity is returned.
    """
    payload = dict(data)
    saved = Entity.save(
        payload,
        self.origin,
        query_uid=self.query_uid,
        match_uid=self.match_uid,
    )
    session.commit()
    return saved
def emit_judgement(self, uida, uidb, judgement, score=None, decided=False):
    """Record a judgement on the link between two entities and commit.

    The arguments are forwarded to ``Mapping.save``; the object it returns
    is handed back to the caller.
    """
    saved = Mapping.save(
        uida,
        uidb,
        judgement,
        decided=decided,
        score=score,
    )
    session.commit()
    return saved
def emit_document(self, entity_uid, url, title, publisher=None):
    """Save a document for ``entity_uid`` under this emitter's origin.

    The arguments plus ``self.origin`` are forwarded to ``Document.save``,
    the session is committed and the saved document is returned.
    """
    saved = Document.save(
        entity_uid,
        url,
        title,
        self.origin,
        publisher=publisher,
    )
    session.commit()
    return saved
def enrich_documents(origin, entity):
    """Replace the documents stored for ``entity`` with fresh search results.

    Existing documents are deleted for every uid in ``entity.uids`` and the
    deletion is committed. If the entity's schema is one of PERSON, COMPANY,
    ORGANIZATION or OTHER, a query is built from the entity and each document
    found is emitted through ``origin`` under ``entity.uid``.

    :param origin: emitter providing ``emit_document`` and ``log``.
    :param entity: the entity to find documents for.
    """
    for uid in entity.uids:
        # Delete per uid: the loop variable was previously ignored and
        # ``entity.uid`` was deleted repeatedly instead.
        Document.delete_by_entity(uid)
    session.commit()

    if entity.schema not in [PERSON, COMPANY, ORGANIZATION, OTHER]:
        return

    total = 0
    query = search_entity(entity)
    for url, title, publisher in search_documents(query):
        origin.emit_document(entity.uid, url, title, publisher=publisher)
        total += 1
    origin.log.info('Query [%s]: %s -> %s', entity.name, query, total)
def enrich(origin, entity):
    """Geocode the addresses of ``entity`` that have no normalized form.

    For each uid in ``entity.uids`` the addresses whose ``normalized``
    column is NULL are sent to the geocoder. The first result, if any,
    supplies the normalized text, latitude and longitude.
    """
    client = googlemaps.Client(key=API_KEY)
    for uid in entity.uids:
        pending = Address.find_by_entity(uid)
        pending = pending.filter(Address.normalized == None)  # noqa
        for address in pending:
            origin.log.info("Geocoding [%s] %s", entity.name, address.clean)
            hits = geocode(client, address.clean)
            if len(hits) == 0:
                origin.log.info("No results: %s" % address.clean)
            # Only the first hit is applied; the loop exits right after it.
            for hit in hits:
                normalized = hit['formatted_address']
                location = hit['geometry']['location']
                address.update(
                    normalized=normalized,
                    latitude=location['lat'],
                    longitude=location['lng'],
                )
                break
    session.commit()
def find_undecided(cls, limit=10, offset=0):
    """Collect up to ``limit`` mappings that still need a manual decision.

    Rows of the current project with ``decided`` false and no judgement are
    read in descending score order, starting at ``offset``. A row is deleted
    instead of returned when its uid pair is in ``get_decided()`` or when
    either of its entities is missing.
    """
    known_pairs = cls.get_decided()
    query = (
        session.query(cls)
        .filter(cls.project == project.name)
        .filter(cls.decided == False)  # noqa
        .filter(cls.judgement == None)  # noqa
        .order_by(cls.score.desc())
        .offset(offset)
    )

    pending = []
    for candidate in query.yield_per(limit):
        pair = (candidate.left_uid, candidate.right_uid)
        is_stale = (
            pair in known_pairs
            or candidate.left is None
            or candidate.right is None
        )
        if not is_stale:
            pending.append(candidate)
            if len(pending) == limit:
                break
        else:
            candidate.delete()

    session.commit()
    return pending
def emit_entity(self, data):
    """Save an enrichment result entity and decide whether it is active.

    The entity is saved through the parent emitter's ``emit_entity``. It is
    then flagged inactive when this emitter has no mapping, when the mapping
    is not decided, or when its judgement is ``False`` or ``None``.

    Returns the saved entity.
    """
    # Enrichment results are first held as inactive and become active only
    # once the judgement between the query and result entities is confirmed
    entity = super(ResultEmitter, self).emit_entity(data)
    if (self.mapping is None) or \
       (not self.mapping.decided) or \
       (self.mapping.judgement is False):
        entity.active = False

    # NOTE(review): the tentative-mapping block is laid out as nested under
    # this check so that it only runs while no judgement exists -- confirm
    # the nesting against the intended layout.
    if self.mapping is None or self.mapping.judgement is None:
        entity.active = False
        if entity.uid == self.match_uid:
            # Generate a tentative mapping.
            query = Entity.get(self.query_uid)
            if query is not None:
                # Saved with no judgement; the score comes from comparing
                # the query entity with the entity just emitted.
                Mapping.save(self.match_uid, self.query_uid, None,
                             score=query.compare(entity))
    session.commit()
    return entity
def clear(self):
    """Delete this emitter's entities and commit.

    Calls ``Entity.delete_by_origin`` with the emitter's ``origin``,
    ``query_uid`` and ``match_uid``.
    """
    Entity.delete_by_origin(
        self.origin,
        query_uid=self.query_uid,
        match_uid=self.match_uid,
    )
    session.commit()
def emit_link(self, data):
    """Save ``data`` as a link for this emitter's origin and commit.

    A ``dict`` copy of ``data`` is passed to ``Link.save``; the object it
    returns is handed back to the caller.
    """
    payload = dict(data)
    link = Link.save(payload, self.origin)
    session.commit()
    return link
def mappings_cleanup():
    """Delete undecided generated mappings.

    Runs ``Mapping.cleanup()`` and commits the session.
    """
    Mapping.cleanup()
    # Persist whatever the cleanup changed.
    session.commit()
def mappings_apply():
    """Apply mapped canonical IDs to all entities.

    Runs ``Mapping.canonicalize()`` and commits the session.
    """
    Mapping.canonicalize()
    # Persist whatever the canonicalization changed.
    session.commit()