# Shared imports for the snippets below. Application-level names (db, Entity,
# Document, Reference, the index_* helpers, scan_entity_mentions,
# normalize_strong, match_form) are assumed to come from the surrounding
# project.
import re
import logging
from collections import defaultdict

from sqlalchemy import func

log = logging.getLogger(__name__)


def reindex_entity(entity, references=True):
    log.info('Index [%s]: %s', entity.id, entity.name)
    if entity.state != Entity.STATE_ACTIVE:
        # Inactive entities are removed from the index, optionally together
        # with their document references.
        index_delete(entity.id)
        if references:
            delete_entity_references(entity.id)
    else:
        index_entity(entity)
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)
    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')
    # Drop the existing regex-derived references before re-creating them.
    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')
    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)
    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
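# Illustrative sketch (not part of the module): how the space-anchored
# pattern above counts whole-term mentions per document. The terms and
# sample rows are hypothetical stand-ins for entity.regex_terms and the
# output of scan_entity_mentions(entity).
def _demo_mention_counting():
    terms = ['acme corp', 'acme holdings']
    rex = re.compile('( |^)(%s)( |$)' % '|'.join(terms))
    mentions = [
        (1, 'payment to acme corp on friday'),  # whole term, space-delimited
        (1, 'acme corp wired the funds'),       # anchored at start of text
        (2, 'acme corporation ltd'),            # substring only: no match
    ]
    documents = defaultdict(int)
    for document_id, text in mentions:
        for _ in rex.finditer(text):
            documents[document_id] += 1
    assert dict(documents) == {1: 2}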
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)
    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')
    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')
    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)
    db.session.commit()
    delete_entity_references(entity.id)
    # Re-index every document that still references this entity.
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
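# Illustrative sketch (not part of the module): the distinct-document query
# above yields one-column rows, hence the 'for document_id, in q' loop. The
# same idiom over literal rows:
def _demo_row_unpacking():
    rows = [(101,), (102,)]    # what a single-column query iterates as
    seen = []
    for document_id, in rows:  # trailing comma unpacks each 1-tuple
        seen.append(document_id)
    assert seen == [101, 102]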
def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return
    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('(%s)' % rex)
    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)
    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)
    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity)
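# Illustrative sketch (not part of the module): unlike the earlier revision,
# the pattern '(%s)' above has no space anchors, so it also matches inside
# longer tokens; presumably match_form() and the generated regex_terms are
# expected to handle word boundaries upstream (an assumption). Terms below
# are hypothetical.
def _demo_anchoring_difference():
    anchored = re.compile('( |^)(acme)( |$)')
    bare = re.compile('(acme)')
    text = 'acmeco paid acme twice: acme'
    assert len(anchored.findall(text)) == 2  # whole-word occurrences only
    assert len(bare.findall(text)) == 3      # also matches inside 'acmeco'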