Exemplo n.º 1
0
class AhoCorasickEntityAnalyzer(Analyzer):
    """Tag documents with known entity names using an Aho-Corasick automaton."""

    ORIGIN = 'regex'
    # Skip text fragments at or below this many characters.
    MIN_LENGTH = 100

    # Shared, lazily-generated automaton cache for all instances.
    cache = AutomatonCache()

    def prepare(self):
        """Build the automaton (unless disabled) and set up the tag collector."""
        self.disabled = not get_config('REGEX_ENTITIES', True)
        if not self.disabled:
            self.cache.generate()
        self.collector = DocumentTagCollector(self.document, self.ORIGIN)

    def on_text(self, text):
        """Scan one text fragment for known entity names and emit tags."""
        if self.cache.automaton.kind == EMPTY:
            return
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        text = match_form(text)
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        # The automaton operates on byte strings.
        text = text.encode('utf-8')
        for match in self.cache.automaton.iter(text):
            # Renamed loop vars: avoid shadowing ``text`` and the ``type`` builtin.
            for (label, tag) in match[1]:
                self.collector.emit(label, tag)

    def finalize(self):
        """Log the extraction count and persist collected tags."""
        # Fixed typo in log message ("extraced" -> "extracted").
        log.info('Aho Corasick extracted %s entities.', len(self.collector))
        self.collector.save()
Exemplo n.º 2
0
    def analyze(self, document):
        """Run extraction on an NLP-capable document; always persist tags."""
        if not document.supports_nlp:
            return

        tags = DocumentTagCollector(document, self.ORIGIN)
        try:
            self.extract(tags, document)
        finally:
            # Save even on failure so partial results are not lost.
            tags.save()
Exemplo n.º 3
0
    def analyze(self, document):
        """Run extraction on documents whose schema is not ignored."""
        if document.schema in self.IGNORED:
            return

        tags = DocumentTagCollector(document, self.ORIGIN)
        try:
            self.extract(tags, document)
        finally:
            # Persist whatever was collected, even if extraction raised.
            tags.save()
Exemplo n.º 4
0
    def update(self):
        """Apply the outcome of the result to the document."""
        # Mirror the ingest status; a failure also demotes the document's
        # type and records the error message.
        if self.status == self.STATUS_SUCCESS:
            self.document.status = Document.STATUS_SUCCESS
            self.document.error_message = None
        else:
            self.document.status = Document.STATUS_FAIL
            self.document.type = Document.TYPE_OTHER
            self.document.error_message = self.error_message

        self.document.foreign_id = stringify(self.id)
        # Only overwrite the content hash when the result actually has one.
        if self.checksum:
            self.document.content_hash = self.checksum

        # Keep the previous uploader if this result carries no role.
        self.document.uploader_id = self.role_id or self.document.uploader_id
        self.document.file_size = self.size
        self.document.file_name = self.file_name
        self.document.title = stringify(self.title)
        self.document.summary = stringify(self.summary)
        self.document.author = stringify(self.author)
        self.document.keywords = self.keywords
        self.document.mime_type = stringify(self.mime_type)
        self.document.encoding = self.encoding
        self.document.languages = self.languages
        self.document.headers = self.headers
        self.document.pdf_version = self.pdf_hash
        self.document.columns = self.columns.keys()

        # Persist entities and emails found during ingest as document tags.
        collector = DocumentTagCollector(self.document, 'ingestors')
        for entity in self.entities:
            collector.emit(entity, DocumentTag.TYPE_PERSON)
        for email in self.emails:
            collector.emit(email, DocumentTag.TYPE_EMAIL)
        collector.save()
Exemplo n.º 5
0
 def extract(self, collector, document):
     """Call the entity-extraction service and emit recognized tags."""
     # Clear out any stale tags written by the legacy NER origins.
     DocumentTagCollector(document, 'polyglot').save()
     DocumentTagCollector(document, 'spacy').save()
     try:
         stub = EntityExtractStub(self.channel)
         response = stub.Extract(self.text_iterator(document))
         for entity in response.entities:
             tag_type = self.TYPES.get(entity.type)
             # Only emit entity types we have a mapping for.
             if tag_type is not None:
                 collector.emit(entity.label, tag_type, weight=entity.weight)
         log.info('Extracted %s entities.', len(collector))
     except self.Error as e:
         log.warning("gRPC [%s]: %s", e.code(), e.details())
Exemplo n.º 6
0
 def extract(self, collector, document):
     """Extract entities via gRPC; reset the channel on service errors."""
     # Clear out any stale tags from the legacy in-process NER origins.
     DocumentTagCollector(document, 'polyglot').save()
     DocumentTagCollector(document, 'spacy').save()
     try:
         stub = EntityExtractStub(self.channel)
         response = stub.Extract(self.text_iterator(document))
         for entity in response.entities:
             tag_type = self.TYPES.get(entity.type)
             # Skip entity types with no local mapping.
             if tag_type is not None:
                 collector.emit(entity.label, tag_type, weight=entity.weight)
         log.info('Extracted %s entities.', len(collector))
     except self.Error as exc:
         log.exception("gRPC Error: %s", self.SERVICE)
         # Drop the broken channel so the next call reconnects.
         self.reset_channel()
Exemplo n.º 7
0
Arquivo: regex.py Projeto: wcyn/aleph
class RegexAnalyzer(Analyzer):
    """Base analyzer that tags documents using a compiled regular expression."""

    REGEX = None
    FLAG = None

    def prepare(self):
        """Compile the pattern and decide whether this analyzer is active."""
        # TODO: re-think this.
        self.disabled = self.document.type == self.document.TYPE_TABULAR
        self.collector = DocumentTagCollector(self.document, self.ORIGIN)
        self.regex = re.compile(self.REGEX, self.FLAG)

    def on_text(self, text):
        """Forward each regex match in the fragment to ``on_match``."""
        if self.disabled:
            return
        for mobj in self.regex.finditer(text):
            self.on_match(mobj)

    def finalize(self):
        """Persist the collected tags."""
        self.collector.save()
Exemplo n.º 8
0
 def extract(self, collector, document):
     """Extract entities via gRPC; also record country/language hints."""
     # Clear out stale tags written by the legacy NER origins.
     DocumentTagCollector(document, 'polyglot').save()
     DocumentTagCollector(document, 'spacy').save()
     try:
         stub = EntityExtractStub(self.channel)
         response = stub.Extract(self.text_iterator(document))
         for entity in response.entities:
             # Countries and languages enrich the document itself.
             if entity.type == ExtractedEntity.COUNTRY:
                 document.add_country(entity.label)
             if entity.type == ExtractedEntity.LANGUAGE:
                 document.add_language(entity.label)
             tag_type = self.TYPES.get(entity.type)
             if tag_type is not None:
                 collector.emit(entity.label, tag_type, weight=entity.weight)
         log.info('Extracted %s entities.', len(collector))
     except self.Error as e:
         log.warning("gRPC [%s]: %s", e.code(), e.details())
Exemplo n.º 9
0
    def update(self):
        """Apply the outcome of the result to the document."""
        doc = self.document
        # Record success/failure; on failure keep a stringified error message.
        if self.status == self.STATUS_SUCCESS:
            doc.status = Document.STATUS_SUCCESS
            doc.error_message = None
        else:
            doc.status = Document.STATUS_FAIL
            doc.error_message = stringify(self.error_message)

        # Pick the most specific schema whose ingestor flag was set; the
        # last matching entry in SCHEMATA wins.
        schema = model['Document']
        for flag, name in self.SCHEMATA:
            if flag in self.flags:
                schema = model[name]

        doc.schema = schema.name
        doc.foreign_id = safe_string(self.id)
        # Prefer fresh values from this result; fall back to stored metadata.
        doc.content_hash = self.checksum or doc.content_hash
        doc.pdf_version = self.pdf_checksum
        doc.title = self.title or doc.meta.get('title')
        doc.file_name = self.file_name or doc.meta.get('file_name')
        doc.file_size = self.size or doc.meta.get('file_size')
        doc.summary = self.summary or doc.meta.get('summary')
        doc.author = self.author or doc.meta.get('author')
        doc.generator = self.generator or doc.meta.get('generator')
        doc.mime_type = self.mime_type or doc.meta.get('mime_type')
        doc.encoding = self.encoding or doc.meta.get('encoding')
        doc.date = self.date or doc.meta.get('date')
        doc.authored_at = self.created_at or doc.meta.get('authored_at')
        doc.modified_at = self.modified_at or doc.meta.get('modified_at')
        doc.published_at = self.published_at or doc.meta.get('published_at')
        doc.message_id = self.message_id or doc.meta.get('message_id')
        doc.in_reply_to = ensure_list(self.in_reply_to)
        doc.columns = list(self.columns.keys())
        doc.body_raw = self.body_html
        doc.body_text = self.body_text
        doc.headers = self.headers

        for kw in self.keywords:
            doc.add_keyword(safe_string(kw))
        for lang in self.languages:
            doc.add_language(safe_string(lang))

        # Flush so the document has an id before tags reference it.
        db.session.flush()

        # Persist entities and emails found during ingest as document tags.
        collector = DocumentTagCollector(doc, 'ingestors')
        for entity in self.entities:
            collector.emit(entity, DocumentTag.TYPE_PERSON)
        for email in self.emails:
            collector.emit(email, DocumentTag.TYPE_EMAIL)
        collector.save()
Exemplo n.º 10
0
    def update(self):
        """Apply the outcome of the result to the document."""
        doc = self.document
        # Record success/failure; on failure keep a stringified error message.
        if self.status == self.STATUS_SUCCESS:
            doc.status = Document.STATUS_SUCCESS
            doc.error_message = None
        else:
            doc.status = Document.STATUS_FAIL
            doc.error_message = stringify(self.error_message)

        # Pick the most specific schema whose ingestor flag was set; the
        # last matching entry in SCHEMATA wins.
        schema = model['Document']
        for flag, name in self.SCHEMATA:
            if flag in self.flags:
                schema = model[name]

        doc.schema = schema.name
        doc.foreign_id = stringify(self.id)
        # Prefer fresh values from this result; fall back to stored metadata.
        doc.content_hash = self.checksum or doc.content_hash
        doc.uploader_id = self.role_id or doc.uploader_id
        doc.title = stringify(self.title) or doc.meta.get('title')
        doc.file_name = stringify(self.file_name) or doc.meta.get('file_name')
        doc.file_size = self.size or doc.meta.get('file_size')
        # (Removed a duplicate ``doc.title`` assignment that was here.)
        doc.summary = stringify(self.summary) or doc.meta.get('summary')
        doc.author = stringify(self.author) or doc.meta.get('author')
        doc.generator = stringify(self.generator) or doc.meta.get('generator')
        doc.mime_type = stringify(self.mime_type) or doc.meta.get('mime_type')
        doc.encoding = stringify(self.encoding) or doc.meta.get('encoding')

        doc.date = self.date or doc.meta.get('date')
        doc.authored_at = self.created_at or doc.meta.get('authored_at')
        doc.modified_at = self.modified_at or doc.meta.get('modified_at')
        doc.published_at = self.published_at or doc.meta.get('published_at')

        for kw in self.keywords:
            doc.add_keyword(kw)
        for lang in self.languages:
            doc.add_language(lang)

        doc.headers = self.headers or doc.meta.get('headers')
        doc.columns = self.columns.keys()

        # Join extracted pages into a single body text, if any exist.
        if len(self.pages):
            doc.body_text = '\n\n'.join(self.pages)

        # Persist entities and emails found during ingest as document tags.
        collector = DocumentTagCollector(doc, 'ingestors')
        for entity in self.entities:
            collector.emit(entity, DocumentTag.TYPE_PERSON)
        for email in self.emails:
            collector.emit(email, DocumentTag.TYPE_EMAIL)
        collector.save()
Exemplo n.º 11
0
    def analyze(self, document):
        """Run polyglot NER over the document text and collect entity tags."""
        if document.type in [document.TYPE_TABULAR, document.TYPE_OTHER]:
            return
        collector = DocumentTagCollector(document, self.ORIGIN)
        text = document.text
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        try:
            # A single known language is used as a hint for the tagger.
            hint = None
            if len(document.languages) == 1:
                hint = document.languages[0]
            parsed = Text(text, hint_language_code=hint)
            for entity in parsed.entities:
                # Skip locations and single-token entities.
                if entity.tag == 'I-LOC' or len(entity) == 1:
                    continue
                label = ' '.join(entity)
                # Only labels of a sensible length are emitted.
                if 4 <= len(label) <= 200:
                    collector.emit(label, self.TYPES[entity.tag])
        except ValueError as ve:
            log.info('NER value error: %r', ve)
        except Exception as ex:
            log.warning('NER failed: %r', ex)
        finally:
            log.info('Polyglot extracted %s entities.', len(collector))
            collector.save()
Exemplo n.º 12
0
    def analyze(self, document):
        """Extract multi-word named entities from the document text."""
        if document.schema in self.IGNORED:
            return

        collector = DocumentTagCollector(document, self.ORIGIN)
        text = document.text
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        try:
            # A single known language serves as a tagging hint.
            hint = None
            if len(document.languages) == 1:
                hint = document.languages[0]
            parsed = Text(text, hint_language_code=hint)
            for entity in parsed.entities:
                if entity.tag == 'I-LOC':
                    continue

                # Normalize whitespace and punctuation in the label.
                label = collapse_spaces(self.CLEAN.sub(' ', ' '.join(entity)))
                # Require a multi-word label of sensible length.
                if ' ' not in label or not (4 <= len(label) <= 200):
                    continue
                collector.emit(label, self.TYPES[entity.tag])

        except ValueError as ve:
            log.warning('NER value error: %r', ve)
        except Exception as ex:
            log.warning('NER failed: %r', ex)
        finally:
            collector.save()
            log.info('Polyglot extracted %s entities.', len(collector))
Exemplo n.º 13
0
 def analyze(self, document):
     """Emit a tag for every regex match found in the document text."""
     collector = DocumentTagCollector(document, self.ORIGIN)
     for match in self.RE.finditer(document.text):
         value = self.extract_match(document, match)
         # extract_match may reject a match by returning None.
         if value is not None:
             collector.emit(value, self.TYPE)
     collector.save()
Exemplo n.º 14
0
class PolyglotEntityAnalyzer(Analyzer):
    """Named-entity extraction using polyglot's NER tagger."""

    ORIGIN = 'polyglot'
    MIN_LENGTH = 100
    TYPES = {
        'I-PER': DocumentTag.TYPE_PERSON,
        'I-ORG': DocumentTag.TYPE_ORGANIZATION,
        'I-LOC': DocumentTag.TYPE_LOCATION
    }

    def prepare(self):
        """Only plain-text documents are analyzed; set up the collector."""
        self.disabled = self.document.type != self.document.TYPE_TEXT
        self.collector = DocumentTagCollector(self.document, self.ORIGIN)

    def on_text(self, text):
        """Run NER on one text fragment and emit entity tags."""
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        try:
            # A single known document language is used as a tagging hint.
            hint = None
            if len(self.document.languages) == 1:
                hint = self.document.languages[0]
            parsed = Text(text, hint_language_code=hint)
            for entity in parsed.entities:
                # Skip locations and single-token entities.
                if entity.tag == 'I-LOC' or len(entity) == 1:
                    continue
                label = ' '.join(entity)
                # Only labels of a sensible length are emitted.
                if 4 <= len(label) <= 200:
                    self.collector.emit(label, self.TYPES.get(entity.tag))

        except ValueError as ve:
            log.info('NER value error: %r', ve)
        except Exception as ex:
            log.warning('NER failed: %r', ex)

    def finalize(self):
        """Log the count and persist the collected tags."""
        log.info('Polyglot extracted %s entities.', len(self.collector))
        self.collector.save()
Exemplo n.º 15
0
    def analyze(self, document):
        """Match known entity names against the document text via the automaton."""
        text = match_form(document.text)
        if text is None or len(text) <= self.MIN_LENGTH:
            return

        collector = DocumentTagCollector(document, self.ORIGIN)
        self.cache.generate()
        if self.cache.automaton.kind != EMPTY:
            # The automaton operates on byte strings.
            text = text.encode('utf-8')
            for match in self.cache.automaton.iter(text):
                # Renamed loop var: avoid shadowing the outer ``text``.
                for (label, tag) in match[1]:
                    collector.emit(label, tag)

        # Fixed typo in log message ("extraced" -> "extracted").
        log.info('Aho Corasick extracted %s entities.', len(collector))
        collector.save()
Exemplo n.º 16
0
def extract_document_tags(document):
    """Run NER over a successfully ingested document and store its tags."""
    if document.status != Document.STATUS_SUCCESS:
        return
    load_places()
    log.info("Tagging [%s]: %s", document.id, document.name)

    # Fall back to the configured default when no language is known.
    languages = list(document.languages) or [settings.DEFAULT_LANGUAGE]

    aggregator = EntityAggregator()
    for text in document.texts:
        aggregator.extract(text, languages)

    # Clear out tags written by the legacy per-library origins.
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    collector = DocumentTagCollector(document, 'ner')
    for (label, category, weight) in aggregator.entities:
        collector.emit(label, category, weight=weight)
    log.info("Extracted tags: %s", len(collector))
    collector.save()
    db.session.add(document)
    db.session.commit()
Exemplo n.º 17
0
    def analyze(self, document):
        """Match known entity names in the document's texts via the automaton."""
        collector = DocumentTagCollector(document, 'corasick')
        if self.automaton is None:
            return

        for text in document.texts:
            # Skip short fragments and texts rejected by normalization.
            if len(text) <= self.MIN_LENGTH:
                continue
            text = self.match_form(text)
            if text is None:
                continue
            for match in self.automaton.iter(text):
                for (match_text, tag) in match[1]:
                    collector.emit(match_text, tag)

        if len(collector):
            # Fixed typo in log message ("extraced" -> "extracted").
            log.info('Aho Corasick extracted %s entities.', len(collector))
        collector.save()
Exemplo n.º 18
0
    def analyze(self, document):
        """Tag a document with NER results for its supported languages."""
        if document.schema in self.IGNORED:
            return

        collector = DocumentTagCollector(document, self.ORIGIN)
        try:
            # Restrict to languages this tagger supports, falling back
            # to the configured default when none remain.
            langs = set(document.languages)
            if len(self.languages):
                langs = langs.intersection(self.languages)
            if not len(langs):
                langs = [settings.DEFAULT_LANGUAGE]

            for text in document.texts:
                if len(text) > self.MIN_LENGTH:
                    for label, tag in self.tag_text(text, langs):
                        collector.emit(label, self.TYPES[tag])
        except ValueError as ve:
            log.warning('NER value error: %r', ve)

        collector.save()
        if len(collector):
            log.info('Polyglot extracted %s entities.', len(collector))
Exemplo n.º 19
0
 def prepare(self):
     """Build the match automaton unless regex entities are disabled."""
     enabled = get_config('REGEX_ENTITIES', True)
     self.disabled = not enabled
     if enabled:
         self.cache.generate()
     self.collector = DocumentTagCollector(self.document, self.ORIGIN)
Exemplo n.º 20
0
Arquivo: regex.py Projeto: wcyn/aleph
 def prepare(self):
     """Compile the tag regex and decide whether this analyzer runs."""
     # TODO: re-think this.
     doc = self.document
     self.disabled = doc.type == doc.TYPE_TABULAR
     self.collector = DocumentTagCollector(doc, self.ORIGIN)
     self.regex = re.compile(self.REGEX, self.FLAG)
Exemplo n.º 21
0
 def prepare(self):
     """Only analyze plain-text documents; set up the tag collector."""
     doc = self.document
     self.disabled = doc.type != doc.TYPE_TEXT
     self.collector = DocumentTagCollector(doc, self.ORIGIN)