Пример #1
0
def analyze_document(document):
    """Run analyzers (such as NER) on a given document.

    Each class returned by ``get_analyzers()`` is instantiated with the
    document and prepared. Every text fragment from
    ``document.text_parts()`` is then passed to each analyzer's ``on_text``,
    after which the analyzers are finalized. The document is added to
    ``db.session`` and committed, and finally handed to ``index_document``
    and ``index_records``.

    Analyzers whose ``disabled`` attribute is truthy are skipped for
    ``on_text`` and ``finalize``; ``prepare`` is called for all of them. The
    flag is re-read before every call rather than once up front.
    """
    log.info("Analyze document: %r", document)
    start = timer()

    # initialise the analyzers
    analyzers = []
    for cls in get_analyzers():
        analyzer = cls(document)
        analyzer.prepare()
        analyzers.append(analyzer)

    # run the analyzers on each fragment of text in the given
    # document (row cells or text pages).
    for text in document.text_parts():
        for analyzer in analyzers:
            if not analyzer.disabled:
                analyzer.on_text(text)

    # collect outputs.
    for analyzer in analyzers:
        if not analyzer.disabled:
            analyzer.finalize()
    db.session.add(document)
    db.session.commit()

    end = timer()
    # The message reports milliseconds, so scale the delta accordingly.
    # NOTE(review): assumes timer() returns seconds (as
    # timeit.default_timer does) -- confirm against the module's import.
    elapsed_ms = (end - start) * 1000
    log.info("Completed analysis: %r (elapsed: %.2fms)", document, elapsed_ms)

    # next: update the search index.
    index_document(document)
    index_records(document)
Пример #2
0
def analyze_document(document_id):
    """Analyze the document with the given id, then index it.

    Calls ``clear_session()`` and loads the document via ``Document.by_id``.
    A missing document is logged and nothing further happens. Otherwise one
    instance of every class from ``get_analyzers()`` has its ``analyze``
    method called with the document and the document's ``meta``, and
    ``index_document`` is called with the id afterwards.

    Exceptions raised by an analyzer are not caught here.
    """
    clear_session()
    doc = Document.by_id(document_id)
    if doc is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Analyze document: %r", doc)
    for analyzer_cls in get_analyzers():
        analyzer = analyzer_cls()
        analyzer.analyze(doc, doc.meta)

    index_document(document_id)
Пример #3
0
def analyze_document(document_id):
    """Run every analyzer on the document with the given id, then index it.

    The document is loaded via ``Document.by_id``; when it does not exist
    the miss is logged and nothing further happens. Otherwise one instance
    of every class from ``get_analyzers()`` has its ``analyze`` method
    called with the document and the document's ``meta``.

    Analysis is best-effort: an exception raised by one analyzer is logged
    and the remaining analyzers still run. ``index_document`` is always
    called for a document that was found.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            # Guard each analyzer on its own so that one failure does not
            # prevent the analyzers after it from running.
            try:
                cls().analyze(document, document.meta)
            except Exception as ex:
                log.exception(ex)
    except Exception as ex:
        # Failures while obtaining or iterating the analyzer classes are
        # logged as well, so that indexing below is still reached.
        log.exception(ex)
    index_document(document_id)
Пример #4
0
def analyze_document(document_id):
    """Run every analyzer on the document with the given id, then index it.

    The document is loaded via ``Document.by_id``; a missing document is
    logged and nothing further happens. Each class from ``get_analyzers()``
    is instantiated and its ``analyze`` method is called with the document
    and the document's ``meta``.

    A failing analyzer does not stop the others: its exception is logged and
    reported through ``process.exception`` under ``process.ANALYZE``.
    ``index_document`` is called with the id once all analyzers were tried.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Analyze document: %r", document)
    for analyzer_cls in get_analyzers():
        try:
            analyzer = analyzer_cls()
            analyzer.analyze(document, document.meta)
        except Exception as err:
            log.exception(err)
            # Context identifying which analyzer failed on which document.
            failure = dict(
                component=analyzer_cls.__name__,
                document_id=document.id,
                meta=document.meta,
                source_id=document.source_id,
                exception=err,
            )
            process.exception(process.ANALYZE, **failure)

    index_document(document_id)
Пример #5
0
def analyze_document(document):
    """Apply all enabled analyzers (e.g. NER) to a document and commit it.

    Every class from ``get_analyzers()`` is instantiated without arguments;
    instances whose ``disabled`` attribute is truthy are skipped, the rest
    have ``analyze`` called with the document. The document is then added to
    ``db.session`` and committed. Start and completion are logged, the
    latter with the elapsed time between the two ``timer()`` readings.
    """
    log.info("Analyze document [%s]: %s", document.id, document.title)
    started_at = timer()

    # The generator is lazy: each analyzer is created right before it runs.
    for analyzer in (analyzer_cls() for analyzer_cls in get_analyzers()):
        if analyzer.disabled:
            continue
        analyzer.analyze(document)

    db.session.add(document)
    db.session.commit()

    elapsed = timer() - started_at
    log.info(
        "Completed analysis [%s]: %s (elapsed: %.2fs)",
        document.id,
        document.title,
        elapsed,
    )
Пример #6
0
def _run_analyzer(analyzer_cls, document):
    """Run one analyzer class on ``document``, logging and reporting failure."""
    try:
        analyzer_cls().analyze(document, document.meta)
    except Exception as exc:
        log.exception(exc)
        process.exception(
            process.ANALYZE,
            component=analyzer_cls.__name__,
            document_id=document.id,
            meta=document.meta,
            source_id=document.source_id,
            exception=exc,
        )


def analyze_document(document_id):
    """Run every analyzer on the document with the given id, then index it.

    The document is loaded via ``Document.by_id``; a missing document is
    logged and nothing further happens. Each class from ``get_analyzers()``
    is run on its own, so an exception in one analyzer is logged and
    reported via ``process.exception`` without stopping the others.
    ``index_document`` is called with the id at the end.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Analyze document: %r", document)
    for analyzer_cls in get_analyzers():
        _run_analyzer(analyzer_cls, document)

    index_document(document_id)
Пример #7
0
def analyze_document(document):
    """Feed a document's text parts through all analyzers and save it.

    Each class from ``get_analyzers()`` is instantiated with the document
    and its ``meta`` and then prepared. Every item yielded by
    ``document.text_parts()`` is passed to each analyzer's ``on_text``, and
    all analyzers are finalized afterwards. The ``meta`` object is assigned
    back to the document, which is committed through ``db.session`` and
    passed to ``index_document``.
    """
    log.info("Analyze document: %r", document)
    meta = document.meta

    def _make_ready(analyzer_cls):
        # Instantiate and prepare in one step so each analyzer is fully
        # set up before the next class is touched.
        instance = analyzer_cls(document, meta)
        instance.prepare()
        return instance

    active = [_make_ready(analyzer_cls) for analyzer_cls in get_analyzers()]

    for fragment in document.text_parts():
        for analyzer in active:
            analyzer.on_text(fragment)

    for analyzer in active:
        analyzer.finalize()

    # The analyzers were handed this same meta object; assign it back to
    # the document before saving.
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document)
Пример #8
0
def analyze_document(document):
    """Run the analyzer pipeline over a document and persist the outcome.

    The pipeline has three phases: set up one analyzer per class from
    ``get_analyzers()``, pass every item of ``document.text_parts()`` to
    each analyzer, and finalize the analyzers. Afterwards ``meta`` is
    assigned back to the document, the document is committed through
    ``db.session`` and ``index_document`` is called with it.
    """
    log.info("Analyze document: %r", document)

    # Phase 1: build and prepare the analyzers, all sharing one meta object.
    prepared = []
    meta = document.meta
    for analyzer_cls in get_analyzers():
        current = analyzer_cls(document, meta)
        current.prepare()
        prepared.append(current)

    # Phase 2: every analyzer sees every text part, part by part.
    for part in document.text_parts():
        for current in prepared:
            current.on_text(part)

    # Phase 3: let each analyzer wrap up.
    for current in prepared:
        current.finalize()

    # Store the result and hand the document to the indexer.
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document)
Пример #9
0
def _prepare_analyzers(document, meta):
    """Instantiate and prepare each analyzer, leaving out those that fail."""
    ready = []
    for analyzer_cls in get_analyzers():
        try:
            instance = analyzer_cls(document, meta)
            instance.prepare()
            ready.append(instance)
        except Exception as exc:
            log.exception(exc)
    return ready


def _feed_pages(document, analyzers):
    """Pass each page, then each of its text parts, to every analyzer."""
    for page in document.pages:
        for analyzer in analyzers:
            analyzer.on_page(page)
        for text in page.text_parts():
            for analyzer in analyzers:
                analyzer.on_text(text)


def _feed_records(document, analyzers):
    """Pass each record, then each of its text parts, to every analyzer."""
    for record in document.records:
        for analyzer in analyzers:
            analyzer.on_record(record)
        for text in record.text_parts():
            for analyzer in analyzers:
                analyzer.on_text(text)


def _finalize_analyzers(analyzers):
    """Finalize every analyzer, logging failures instead of raising them."""
    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as exc:
            log.exception(exc)


def analyze_document(document_id):
    """Analyze the document with the given id, save it and index it.

    The document is loaded via ``Document.by_id``; a missing document is
    logged and nothing further happens. Analyzers that raise while being
    created or prepared are logged and left out. Text documents are fed
    page by page and tabular documents record by record. Exceptions from
    ``finalize`` are logged; exceptions from the ``on_*`` hooks are not
    caught and propagate before anything is committed or indexed.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Analyze document: %r", document)
    meta = document.meta
    analyzers = _prepare_analyzers(document, meta)

    if document.type == Document.TYPE_TEXT:
        _feed_pages(document, analyzers)

    if document.type == Document.TYPE_TABULAR:
        _feed_records(document, analyzers)

    _finalize_analyzers(analyzers)

    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)
Пример #10
0
def analyze_document(document_id):
    """Analyze the document with the given id, save it and index it.

    The document is loaded via ``Document.by_id``; a missing document is
    logged and nothing further happens. One analyzer per class from
    ``get_analyzers()`` is created with the document and its ``meta`` and
    prepared; a class that raises during this step is logged and skipped.

    For ``Document.TYPE_TEXT`` every page goes to ``on_page`` and each of
    its text parts to ``on_text``; for ``Document.TYPE_TABULAR`` the same
    happens with records and ``on_record``. These hooks are not guarded.
    ``finalize`` failures are logged. Finally ``meta`` is assigned back, the
    document is committed via ``db.session`` and ``index_document`` is
    called with the id.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)

    active = []
    meta = document.meta
    for analyzer_cls in get_analyzers():
        try:
            candidate = analyzer_cls(document, meta)
            candidate.prepare()
            active.append(candidate)
        except Exception as err:
            log.exception(err)

    def notify(hook, payload):
        # Call the named hook on every prepared analyzer, in order.
        for analyzer in active:
            getattr(analyzer, hook)(payload)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            notify("on_page", page)
            for text in page.text_parts():
                notify("on_text", text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            notify("on_record", record)
            for text in record.text_parts():
                notify("on_text", text)

    for analyzer in active:
        try:
            analyzer.finalize()
        except Exception as err:
            log.exception(err)

    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)