def init():
    """Create or upgrade the search index and database."""
    upgrade_db()
    init_search()
    upgrade_search()
    install_analyzers()
    get_archive().upgrade()

@classmethod
def dispatch(cls, source_id, meta):
    local_path = get_archive().load_file(meta)
    best_cls = cls.auction_file(meta, local_path)
    if best_cls is None:
        message = "No ingestor found: %r" % meta.file_name
        process.log(process.INGEST, component=cls.__name__, meta=meta,
                    source_id=source_id, error_type='NoIngestorFound',
                    error_message=message)
        return
    log.debug("Dispatching %r to %r", meta.file_name, best_cls.__name__)
    try:
        best_cls(source_id).ingest(meta, local_path)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component=best_cls.__name__,
                          exception=ex, meta=meta, source_id=source_id)
    finally:
        get_archive().cleanup_file(meta)

@classmethod
def dispatch(cls, source_id, meta):
    local_path = get_archive().load_file(meta)
    try:
        best_cls = cls.auction_file(meta, local_path)
        log.debug("Dispatching %r to %r", meta.file_name, best_cls)
        best_cls(source_id).ingest(meta, local_path)
        CrawlerState.store_ok(meta, source_id)
        db.session.commit()
    except Exception as exception:
        cls.handle_exception(meta, source_id, exception)
    finally:
        get_archive().cleanup_file(meta)

@classmethod
def dispatch(cls, collection_id, meta):
    local_path = get_archive().load_file(meta)
    try:
        best_cls = cls.auction_file(meta, local_path)
        log.debug("Dispatching %r to %r", meta.file_name, best_cls)
        best_cls(collection_id).ingest(meta, local_path)
        CrawlerState.store_ok(meta, collection_id)
        db.session.commit()
    except Exception as exc:
        cls.handle_exception(meta, collection_id, exc)
    finally:
        get_archive().cleanup_file(meta)

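# A minimal usage sketch for the dispatch() variants above, assuming the same
# module context as the snippets (Metadata, get_archive, an Ingestor base
# class carrying dispatch). Import paths are not shown in the source, so they
# are omitted here as well; the file path is purely illustrative.
def example_dispatch(collection_id, raw_meta):
    # Build a metadata object from a plain dict (constructor per the
    # ingest_url snippets below), archive the file so load_file() can
    # retrieve it, then let dispatch() auction it to the best ingestor.
    meta = Metadata.from_data(raw_meta)
    meta = get_archive().archive_file('/tmp/example.pdf', meta, move=False)
    Ingestor.dispatch(collection_id, meta)
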
def file(document_id):
    document = get_document(document_id)
    enable_cache(server_side=True)
    url = get_archive().generate_url(document.meta)
    if url is not None:
        return redirect(url)
    local_path = get_archive().load_file(document.meta)
    fh = open(local_path, 'rb')
    return send_file(fh, as_attachment=True,
                     attachment_filename=document.meta.file_name,
                     mimetype=document.meta.mime_type)

def pdf(document_id):
    document = get_document(document_id)
    enable_cache(server_side=True)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = get_archive().generate_url(pdf)
    if url is not None:
        return redirect(url)
    local_path = get_archive().load_file(pdf)
    fh = open(local_path, 'rb')
    return send_file(fh, mimetype=pdf.mime_type)

def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        # Response bodies are bytes, so write the temp file in binary mode.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)

def ingest_url(source_id, metadata, url):
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)

def pdf(document_id):
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = get_archive().generate_url(pdf)
    if url is not None:
        return redirect(url)
    try:
        local_path = get_archive().load_file(pdf)
        fh = open(local_path, 'rb')
    except Exception as ex:
        raise NotFound("Missing PDF file: %r" % ex)
    return send_file(fh, mimetype=pdf.mime_type)

def view(document_id):
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    data['data_url'] = get_archive().generate_url(doc.meta)
    if data['data_url'] is None:
        data['data_url'] = url_for('documents_api.file',
                                   document_id=document_id)
    if doc.meta.is_pdf:
        data['pdf_url'] = data['data_url']
    else:
        data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
    if data['pdf_url'] is None:
        data['pdf_url'] = url_for('documents_api.pdf',
                                  document_id=document_id)
    data['source'] = doc.source
    return jsonify(data)

def ingest_file(source_id, meta, file_path, move=False):
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(source_id, meta.data)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)

def view(document_id):
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    data['data_url'] = get_archive().generate_url(doc.meta)
    if data['data_url'] is None:
        data['data_url'] = url_for('documents_api.file',
                                   document_id=document_id)
    if doc.meta.is_pdf:
        data['pdf_url'] = data['data_url']
    else:
        try:
            data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        except Exception as ex:
            log.info('Could not generate PDF url: %r', ex)
    if data.get('pdf_url') is None:
        data['pdf_url'] = url_for('documents_api.pdf',
                                  document_id=document_id)
    data['source'] = doc.source
    return jsonify(data)

def view(document_id):
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    log_event(request, document_id=doc.id)
    data['data_url'] = get_archive().generate_url(doc.meta)
    if data['data_url'] is None:
        data['data_url'] = url_for('documents_api.file',
                                   document_id=document_id)
    if doc.meta.is_pdf:
        data['pdf_url'] = data['data_url']
    else:
        try:
            data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        except Exception as ex:
            log.info('Could not generate PDF url: %r', ex)
    if data.get('pdf_url') is None:
        data['pdf_url'] = url_for('documents_api.pdf',
                                  document_id=document_id)
    return jsonify(data)

def view(document_id):
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    data['data_url'] = get_archive().generate_url(doc.meta)
    if data['data_url'] is None:
        data['data_url'] = url_for('documents_api.file',
                                   document_id=document_id)
    if doc.meta.is_pdf:
        data['pdf_url'] = data['data_url']
    else:
        try:
            data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        except Exception as ex:
            log.info('Could not generate PDF url: %r', ex)
    if data.get('pdf_url') is None:
        data['pdf_url'] = url_for('documents_api.pdf',
                                  document_id=document_id)
    data['source'] = doc.source
    # data['metadata'] = {k: v for k, v in data['metadata'].items()
    #                     if k in ALLOWED_METADATA}
    return jsonify(data)

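# The view() variants above decorate the document dict with download links.
# A response sketch, limited to keys visible in the code (whatever else
# to_dict() returns is omitted here):
#
#     {
#         "data_url": "...",  # direct archive URL, or documents_api.file fallback
#         "pdf_url": "...",   # equals data_url for PDFs, else the PDF derivative
#         "source": ...       # set in all variants except the log_event one
#     }
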
def ingest_file(collection_id, meta, file_path, move=False):
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(collection_id, meta.to_attr_dict())
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()

def ingest_file(source_id, meta, file_name, move=False):
    try:
        if not os.path.isfile(file_name):
            raise ValueError("No such file: %r" % file_name)
        if not meta.has('source_path'):
            meta.source_path = file_name
        meta = get_archive().archive_file(file_name, meta, move=move)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_file',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)

def ingest_file(collection_id, meta, file_path, move=False,
                queue=WORKER_QUEUE, routing_key=WORKER_ROUTING_KEY):
    # The queue and routing key arguments are a workaround to
    # expedite user uploads over long-running batch imports.
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue, routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()

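# Sketch of how the queue/routing-key override above might be used to push a
# user upload ahead of batch imports. USER_QUEUE and USER_ROUTING_KEY are
# hypothetical names; the source only shows the worker defaults.
def example_user_upload(collection_id, meta, file_path):
    ingest_file(collection_id, meta, file_path, move=True,
                queue=USER_QUEUE,              # hypothetical priority queue
                routing_key=USER_ROUTING_KEY)  # hypothetical routing key
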
def ingest_url(source_id, metadata, url):
    meta = Metadata(data=metadata)
    try:
        fh, tmp_path = mkstemp()
        os.close(fh)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            raise Exception("HTTP Error %r: %r" % (url, res.status_code))
        # Response bodies are bytes, so write the temp file in binary mode.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(source_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)

def ingest_url(collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # Response bodies are bytes, so write the temp file in binary mode.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)

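# Sketch of queueing the URL fetch as a background task. This assumes
# ingest_url is registered as a Celery task exposing .delay() (the
# self.retry/self.request usage in the earlier variant suggests a bound
# task, but the decorator is not shown in the source).
def example_queue_url(collection_id, raw_meta, url):
    ingest_url.delay(collection_id, raw_meta, url)  # assumed, mirroring ingest.delay above
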
def store_pdf(self, meta, pdf_path, move=True):
    get_archive().archive_file(pdf_path, meta.pdf, move=move)

def store_pdf(self, meta, pdf_path):
    get_archive().archive_file(pdf_path, meta.pdf, move=False)

def upgrade():
    """Create or upgrade the search index and database."""
    upgrade_db()
    upgrade_search()
    upgrade_graph()
    get_archive().upgrade()