def index_document(document_id):
    """Load a document by id and (re-)index it in elasticsearch.

    Indexes the document body together with generated entities and
    latinized title/summary, then replaces its child pages or records.
    Missing documents are logged and skipped.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    clear_children(document)
    try:
        # The two document types are mutually exclusive; pick the child
        # generator once instead of duplicating the bulk() call.
        generator = None
        if document.type == Document.TYPE_TEXT:
            generator = generate_pages(document)
        elif document.type == Document.TYPE_TABULAR:
            generator = generate_records(document)
        if generator is not None:
            bulk(es, generator, stats_only=True, chunk_size=2000,
                 request_timeout=60.0)
    except Exception as ex:
        # Best-effort: a failure indexing children must not fail the
        # whole task; the parent document is already indexed above.
        log.exception(ex)
def ingest_url(source_id, metadata, url):
    """Download ``url`` into a temporary file, archive it, and queue ingest.

    Any failure is logged and recorded via ``process.exception``; the
    ingest task is dispatched only after a successful download/archive.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                # Bug fix: previously this fell through and archived the
                # HTTP error page as if it were the document. Abort instead.
                log.error("Error ingesting %r: %r", url, res.status_code)
                return
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
def analyze_document(document_id):
    """Run every registered analyzer over a document, then re-index it.

    Unknown document ids are logged and ignored.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for analyzer_cls in get_analyzers():
        analyzer = analyzer_cls()
        analyzer.analyze(document, document.meta)
    index_document(document_id)
def ingest_url(source_id, metadata, url):
    """Download ``url`` into a temporary file, archive it, and queue ingest.

    NOTE(review): unlike the try/except variants of this function, errors
    here propagate to the caller (presumably the task queue's retry
    handling) -- confirm that is intended.
    """
    clear_session()
    meta = Metadata(data=metadata)
    with NamedTemporaryFile() as fh:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            # Bug fix: previously this fell through and archived the HTTP
            # error page as if it were the document. Abort instead.
            log.error("Error ingesting %r: %r", url, res.status_code)
            return
        for chunk in res.iter_content(chunk_size=1024):
            if chunk:
                fh.write(chunk)
        fh.flush()
        if not meta.has("source_url"):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(fh.name, meta, move=True)
    ingest.delay(source_id, meta.data)
def ingest_url(source_id, metadata, url):
    """Fetch a remote URL, store it in the archive and schedule ingest.

    Download errors and archive failures are logged and reported through
    ``process.exception``; ``ingest`` is only queued on success.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
                # Bug fix: without this return, the HTTP error page body
                # would be archived and ingested as the document.
                return
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
def ingest(source_id, metadata):
    """Hand a queued ingest off to the Ingestor dispatch machinery."""
    clear_session()
    Ingestor.dispatch(source_id, Metadata(data=metadata))