示例#1
0
def reindex_entity(entity, references=True):
    log.info('Index [%s]: %s', entity.id, entity.name)
    if entity.state != Entity.STATE_ACTIVE:
        index_delete(entity.id)
        if references:
            delete_entity_references(entity.id)
    else:
        index_entity(entity)
示例#2
0
def reindex_entity(entity, references=True):
    log.info('Index [%s]: %s', entity.id, entity.name)
    if entity.state != Entity.STATE_ACTIVE:
        index_delete(entity.id)
        if references:
            delete_entity_references(entity.id)
    else:
        index_entity(entity)
示例#3
0
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
示例#4
0
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
示例#5
0
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
示例#6
0
文件: entities.py 项目: wcyn/aleph
def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return

    log.info("Updating document references: %r", entity)
    rex = '|'.join([t for t in entity.regex_terms])
    rex = re.compile('(%s)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity)
示例#7
0
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)