def analyze_document(document):
    """Run analyzers (such as NER) on a given document.

    Each analyzer is instantiated for the document and prepared; every
    text fragment of the document (row cells or text pages) is then fed
    to the non-disabled analyzers, results are finalized and committed,
    and finally the search index is updated.
    """
    log.info("Analyze document: %r", document)
    start = timer()
    # Initialise the analyzers.  prepare() may flip the ``disabled``
    # flag, which is why it is only checked after this loop.
    analyzers = []
    for cls in get_analyzers():
        analyzer = cls(document)
        analyzer.prepare()
        analyzers.append(analyzer)
    # Run the analyzers on each fragment of text in the given
    # document (row cells or text pages).
    for text in document.text_parts():
        for analyzer in analyzers:
            if not analyzer.disabled:
                analyzer.on_text(text)
    # Collect outputs.
    for analyzer in analyzers:
        if not analyzer.disabled:
            analyzer.finalize()
    db.session.add(document)
    db.session.commit()
    end = timer()
    # BUG FIX: timer() deltas are in seconds, but the message previously
    # labelled the value "ms" — use "s" like the sibling implementation.
    log.info("Completed analysis: %r (elapsed: %.2fs)", document, end - start)
    # Next: update the search index.
    index_document(document)
    index_records(document)
def analyze_document(document_id):
    """Load a document by its id, run every registered analyzer on it,
    and refresh its search index entry.

    Missing documents are logged and skipped.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for analyzer_cls in get_analyzers():
        analyzer = analyzer_cls()
        analyzer.analyze(document, document.meta)
    index_document(document_id)
def analyze_document(document_id):
    """Run all analyzers against the document with the given id.

    Any analyzer failure is logged and swallowed so that the document
    still gets re-indexed afterwards.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for analyzer_cls in get_analyzers():
            instance = analyzer_cls()
            instance.analyze(document, document.meta)
    except Exception as err:
        log.exception(err)
    index_document(document_id)
def analyze_document(document_id):
    """Run each analyzer over the given document, recording failures.

    A failing analyzer is logged and reported to the process log, but
    does not prevent the remaining analyzers from running, nor the
    final re-indexing step.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for analyzer_cls in get_analyzers():
        try:
            analyzer_cls().analyze(document, document.meta)
        except Exception as err:
            log.exception(err)
            # Record the failure against this document so it shows up
            # in the processing log for later inspection.
            process.exception(process.ANALYZE,
                              component=analyzer_cls.__name__,
                              document_id=document.id,
                              meta=document.meta,
                              source_id=document.source_id,
                              exception=err)
    index_document(document_id)
def analyze_document(document):
    """Run analyzers (such as NER) on a given document and commit the
    results, logging the elapsed wall-clock time in seconds."""
    log.info("Analyze document [%s]: %s", document.id, document.title)
    begin = timer()
    for analyzer_cls in get_analyzers():
        analyzer = analyzer_cls()
        # Skip analyzers that have switched themselves off.
        if analyzer.disabled:
            continue
        analyzer.analyze(document)
    db.session.add(document)
    db.session.commit()
    elapsed = timer() - begin
    log.info("Completed analysis [%s]: %s (elapsed: %.2fs)",
             document.id, document.title, elapsed)
def analyze_document(document):
    """Prepare all analyzers for the document, feed them every text
    part, finalize them, persist the (possibly updated) metadata and
    re-index the document."""
    log.info("Analyze document: %r", document)
    meta = document.meta
    # Instantiate first, then prepare each analyzer.
    analyzers = [analyzer_cls(document, meta)
                 for analyzer_cls in get_analyzers()]
    for analyzer in analyzers:
        analyzer.prepare()
    for text in document.text_parts():
        for analyzer in analyzers:
            analyzer.on_text(text)
    for analyzer in analyzers:
        analyzer.finalize()
    # Write back the metadata object the analyzers may have mutated.
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document)
def analyze_document(document_id):
    """Look up a document by id and run the full analyzer pipeline.

    Analyzers are prepared, fed each page or record (plus its text
    parts) depending on the document type, then finalized.  A failure
    in a single analyzer is logged and does not abort the pipeline.
    The (possibly updated) metadata is persisted and the document
    re-indexed at the end.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)

    meta = document.meta
    analyzers = []
    for analyzer_cls in get_analyzers():
        try:
            instance = analyzer_cls(document, meta)
            instance.prepare()
            analyzers.append(instance)
        except Exception as err:
            # A broken analyzer is dropped; the rest still run.
            log.exception(err)

    def feed_text(fragment):
        # Hand one text fragment to every prepared analyzer.
        for analyzer in analyzers:
            analyzer.on_text(fragment)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            for analyzer in analyzers:
                analyzer.on_page(page)
            for text in page.text_parts():
                feed_text(text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            for analyzer in analyzers:
                analyzer.on_record(record)
            for text in record.text_parts():
                feed_text(text)

    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as err:
            log.exception(err)

    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)