Example #1
    def analyze(self, document, meta):
        begin_time = time()
        self.cache.generate()
        entities = defaultdict(int)
        for text, rec in document.text_parts():
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for rex in self.cache.regexes:
                for match in rex.finditer(text):
                    match = match.group(2)
                    # match = match.group(1)
                    for entity_id in self.cache.matches.get(match, []):
                        entities[entity_id] += 1

        Reference.delete_document(document.id, origin=self.origin)
        for entity_id, weight in entities.items():
            ref = Reference()
            ref.document_id = document.id
            ref.entity_id = entity_id
            ref.origin = self.origin
            ref.weight = weight
            db.session.add(ref)
        self.save(document, meta)

        duration_time = int((time() - begin_time) * 1000)
        if len(entities):
            log.info("Regex tagged %r with %d entities (%sms)",
                     document, len(entities), duration_time)
        else:
            log.info("Regex found no entities on %r (%sms)",
                     document, duration_time)
Example #2
    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return

        self.latest = latest
        self.matches = defaultdict(set)

        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                self.matches[normalize_strong(term)].add(entity.id)

        self.regexes = []
        terms = self.matches.keys()
        terms = [t for t in terms if len(t) > 2]
        for i in count(0):
            terms_slice = terms[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
            if not len(terms_slice):
                break
            body = '|'.join(terms_slice)
            rex = re.compile('( |^)(%s)( |$)' % body)
            # rex = re.compile('(%s)' % body)
            self.regexes.append(rex)

        log.info('Generating entity tagger: %r (%s terms)',
                 latest, len(terms))
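The batched regexes built here are consumed by analyze() in Example #1. Below is a minimal, self-contained sketch of that matching step, assuming normalize_strong() lower-cases text and collapses punctuation and whitespace to single spaces, so that '( |^)' and '( |$)' act as crude word boundaries; the matches data is hypothetical sample content:

    import re
    from collections import defaultdict

    # Hypothetical cache contents: normalized term -> set of entity ids.
    matches = {'al qaeda': {42}, 'acme gmbh': {7}}

    body = '|'.join(matches.keys())  # note: the terms are not re.escape()d
    rex = re.compile('( |^)(%s)( |$)' % body)

    entities = defaultdict(int)
    text = 'report on al qaeda and acme gmbh'  # already normalized
    for match in rex.finditer(text):
        for entity_id in matches.get(match.group(2), []):
            entities[entity_id] += 1
    print(dict(entities))  # {42: 1, 7: 1}

Because '( |$)' consumes the separating space, two terms divided by a single space cannot both match with this pattern; the automaton-based variants in Examples #4 and #5 do not share that limitation.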
Example #3
    def on_text(self, text):
        text = normalize_strong(text)
        if text is None or len(text) <= 2:
            return
        for rex in self.cache.regexes:
            for match in rex.finditer(text):
                match = match.group(2)
                for entity_id in self.cache.matches.get(match, []):
                    self.entities[entity_id] += 1
Example #4
    def on_text(self, text):
        if self.cache.automaton is None:
            return
        text = normalize_strong(text)
        if text is None or len(text) <= 2:
            return
        text = text.encode('utf-8')
        for match in self.cache.automaton.iter(text):
            for entity_id in match[1]:
                self.entities[entity_id] += 1
Example #5
    def on_text(self, text):
        if self.cache.automaton is None:
            return
        text = normalize_strong(text)
        if text is None or len(text) <= 2:
            return
        text = ' %s ' % text.encode('utf-8')
        for match in self.cache.automaton.iter(text):
            for entity_id in match[1]:
                self.entities[entity_id] += 1
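These two on_text() variants only show the consuming side of the automaton cache. Below is a minimal sketch of the building side, assuming the pyahocorasick package supplies the automaton (the .encode('utf-8') calls above suggest a Python 2 codebase; on Python 3 the library indexes str directly, as shown). The matches data is hypothetical:

    import ahocorasick
    from collections import defaultdict

    # Hypothetical cache contents: normalized term -> set of entity ids.
    matches = {'al qaeda': {42}, 'acme gmbh': {7}}

    automaton = ahocorasick.Automaton()
    for term, entity_ids in matches.items():
        # Pad each term with spaces so that, with the haystack padded the
        # same way, only whole-word occurrences are reported.
        automaton.add_word(' %s ' % term, entity_ids)
    automaton.make_automaton()

    entities = defaultdict(int)
    text = ' %s ' % 'report on al qaeda and acme gmbh'
    for match in automaton.iter(text):  # yields (end_index, value) tuples
        for entity_id in match[1]:
            entities[entity_id] += 1
    print(dict(entities))  # {42: 1, 7: 1}

Unlike the alternation regex, Aho-Corasick reports overlapping occurrences, so padded terms that share a separating space are all counted.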
Example #6
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
Example #7
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([normalize_strong(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms
Example #8
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = [' %s ' % normalize_strong(t) for t in self.terms]
        regex_terms = set()
        for term in terms:
            if len(term) < 5:
                continue
            contained = False
            for other in terms:
                if other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term.strip())
        return regex_terms
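A quick illustration of the containment pruning above, with hypothetical aliases. Padding each term with spaces before comparing makes the substring test effectively whole-word, so 'man' does not suppress 'mandate':

    terms = [' %s ' % t for t in ('al qaeda',
                                  'al qaeda in iraq syria and the levant',
                                  'man', 'mandate')]
    kept = set()
    for term in terms:
        # Drop any alias that contains another, shorter alias outright.
        if not any(other != term and other in term for other in terms):
            kept.add(term.strip())
    print(kept)  # al qaeda, man, mandate (the long alias is dropped)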
Example #9
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)