def finalize(self):
    self.document.delete_references(origin=self.origin)
    for entity_id, weight in self.entities.items():
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    log.info('Aho-Corasick extracted %s entities.', len(self.entities))

def analyze(self, document, meta):
    begin_time = time()
    self.cache.generate()
    entities = defaultdict(int)
    for text, rec in document.text_parts():
        text = normalize_strong(text)
        if text is None or len(text) <= 2:
            continue
        for rex in self.cache.regexes:
            for match in rex.finditer(text):
                match = match.group(2)
                for entity_id in self.cache.matches.get(match, []):
                    entities[entity_id] += 1

    Reference.delete_document(document.id, origin=self.origin)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)

    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Regex tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Regex found no entities on %r (%sms)",
                 document, duration_time)

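# analyze() above relies on a pre-built keyword cache exposing `regexes`
# (compiled patterns) and `matches` (normalized term -> entity ids). The
# class below is a hypothetical minimal sketch of such a cache, not the
# actual implementation; the class name and constructor shape are
# assumptions, and terms are expected to be pre-normalized the same way
# normalize_strong() normalizes document text.
import re
from collections import defaultdict


class EntityTermCache(object):
    """Hypothetical cache mapping normalized terms to entity ids."""

    def __init__(self, terms):
        # terms: iterable of (normalized_term, entity_id) pairs.
        self.matches = defaultdict(list)
        for term, entity_id in terms:
            self.matches[term].append(entity_id)
        # A single alternation pattern; group(2) holds the matched term,
        # mirroring the `match.group(2)` access in analyze().
        pattern = '|'.join(re.escape(t) for t in self.matches)
        self.regexes = [re.compile('( |^)(%s)( |$)' % pattern)]

    def generate(self):
        # The real cache presumably (re-)builds itself lazily; this
        # sketch builds everything up front, so it is a no-op here.
        pass
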
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)

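# Why the '( |^)(...)( |$)' wrapper and group(2): the outer groups anchor
# the term to spaces or string edges, acting as a crude word boundary on
# the whitespace-normalized text, and group 2 is the matched term itself.
# A quick self-contained demonstration (the term is illustrative):
import re

rex = re.compile('( |^)(%s)( |$)' % 'acme corp')
assert rex.search('acme corp filed papers').group(2) == 'acme corp'
assert rex.search('the acme corporation') is None  # no partial-word hit
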
def generate_entities(document):
    entities = []
    for entity_id, collection_id in Reference.index_references(document.id):
        entities.append({
            'id': entity_id,
            'collection_id': collection_id
        })
    return entities

def finalize(self):
    Reference.delete_document(self.document.id, origin=self.origin)
    for entity_id, weight in self.entities.items():
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    log.info('Regex extracted %s entities.', len(self.entities))

def analyze(self, document, meta):
    if not document.source.generate_entities:
        return

    begin_time = time()
    try:
        entities = self.extract_entities(document, meta)
    except Exception as ex:
        log.warning(ex)
        return

    Reference.delete_document(document.id, origin=self.origin)
    for name, weight, schema in entities:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        # Set the origin so the delete call above clears these references
        # on the next run, as the other analyzers do.
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)

    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Polyglot tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Polyglot found no entities on %r (%sms)",
                 document, duration_time)

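# extract_entities() is not shown above. A hedged sketch of what it might
# do with the polyglot NER library; the function name suffix, the schema
# mapping, and the per-mention weight of 1 are all assumptions:
from polyglot.text import Text

SCHEMAS = {'I-PER': 'Person', 'I-ORG': 'Company'}  # hypothetical mapping


def extract_entities_sketch(document, meta):
    entities = []
    for text, rec in document.text_parts():
        # Text(...).entities yields chunks of words tagged I-PER/I-ORG/I-LOC.
        for chunk in Text(text).entities:
            schema = SCHEMAS.get(chunk.tag)
            if schema is None:
                continue
            entities.append((' '.join(chunk), 1, schema))
    return entities
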
def finalize(self):
    output = []
    for entity_name, schemas in self.entities.items():
        schema = max(set(schemas), key=schemas.count)
        output.append((entity_name, len(schemas), schema))

    self.document.delete_references(origin=self.origin)
    for name, weight, schema in output:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    log.info('Polyglot extracted %s entities.', len(output))

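# max(set(schemas), key=schemas.count) is a majority vote: it picks the
# schema most frequently proposed for a given entity name across all of
# its mentions. A standalone illustration with made-up values:
schemas = ['Person', 'Company', 'Person']
assert max(set(schemas), key=schemas.count) == 'Person'
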
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)

    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)

def generate_entities(document):
    colls = defaultdict(set)
    for entity_id, collection_id in Reference.index_references(document.id):
        colls[entity_id].add(collection_id)

    entities = []
    for entity_id, collections in colls.items():
        entities.append({
            'id': entity_id,
            'collection_id': list(collections)
        })
    return entities

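# This de-duplicated variant emits one entry per entity with a list of the
# collections it appears in, e.g. (values illustrative, not real ids):
#
#   [{'id': 'entity-1', 'collection_id': [2, 5]}]
#
# whereas the earlier generate_entities() yields one entry per
# (entity, collection) reference row.
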
def index_document(document):
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document: %r", document)
    data = document.to_index_dict()

    data['entities'] = []
    for entity_id, collection_id in Reference.index_references(document.id):
        data['entities'].append({
            'id': entity_id,
            'collection_id': collection_id
        })

    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)

def finalize(self):
    output = []
    for entity_name, schemas in self.entities.items():
        schema = max(set(schemas), key=schemas.count)
        output.append((entity_name, len(schemas), schema))

    Reference.delete_document(self.document.id, origin=self.origin)
    for name, weight, schema in output:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    log.info('Polyglot extracted %s entities.', len(output))

def save(self, document, meta, entities):
    if len(entities):
        log.info("Tagged %r with %d entities", document, len(entities))

    Reference.delete_document(document.id)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.weight = weight
        db.session.add(ref)
    super(EntityAnalyzer, self).save(document, meta)

def finalize(self):
    if self.disabled:
        return

    self.document.delete_references(origin=self.origin)
    for regno, name, full in self.entities:
        entity_id = self.load_entity(regno, name, full)
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = 1
        db.session.add(ref)
    log.info('za_companies extracted %s entities.', len(self.entities))

def finalize(self):
    self.document.delete_references(origin=self.origin)
    for fk, schemas in self.entity_schemata.items():
        schema = max(set(schemas), key=schemas.count)
        name = self.entity_names.get(fk)
        entity = self.load_entity(fk, name, schema)
        if entity.deleted_at is not None:
            continue
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity.id
        ref.origin = self.origin
        ref.weight = len(schemas)
        db.session.add(ref)
    log.info('Polyglot extracted %s entities.', len(self.entity_schemata))

def finalize(self):
    if self.disabled:
        return

    log.debug("%s deleting old refs for document %d",
              self, self.document.id)
    self.document.delete_references(origin=self.origin)
    for sa_id, name, full in self.entities:
        log.debug("%s linking %s to document %d",
                  self, sa_id, self.document.id)
        entity_id = self.load_entity(sa_id, name, full)
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = 1
        db.session.add(ref)
    log.info('za_persons extracted %s entities.', len(self.entities))

def analyze(self, document, meta):
    entities = defaultdict(int)
    for text, rec in document.text_parts():
        text = normalize_strong(text)
        if text is None or not len(text):
            continue
        for rex, matches in self.matchers:
            for match in rex.finditer(text):
                match = match.group(2)
                for entity_id in matches.get(match, []):
                    entities[entity_id] += 1

    if len(entities):
        log.info("Tagged %r with %d entities", document, len(entities))

    Reference.delete_document(document.id)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)

def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('(%s)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)

    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity)

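# A usage sketch: this refresh would typically run out-of-band after an
# entity is created or edited, e.g. from a background task. The task
# wiring below is an assumption, not shown in the code above:
#
#   entity = Entity.by_id(entity_id)
#   if entity is not None:
#       generate_entity_references(entity)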