Example #1
 def _document_to_pdf(self, file_path, entity):
     """Converts an office document to PDF."""
     file_name = entity_filename(entity)
     mime_type = entity.first('mimeType')
     log.info('Converting [%s] to PDF...', file_name)
     for attempt in count(1):
         try:
             with open(file_path, 'rb') as fh:
                 files = {'file': (file_name, fh, mime_type)}
                 res = requests.post(CONVERT_URL,
                                     params={'timeout': CONVERT_TIMEOUT},
                                     files=files,
                                     timeout=CONVERT_TIMEOUT + 10,
                                     stream=True)
             res.raise_for_status()
             out_path = self.make_work_file('out.pdf')
             with open(out_path, 'wb') as fh:
                 bytes_written = 0
                 for chunk in res.iter_content(chunk_size=None):
                     bytes_written += len(chunk)
                     fh.write(chunk)
                 if bytes_written > 50:
                     return out_path
             raise ProcessingException("Could not be converted to PDF.")
         except HTTPError as exc:
             if exc.response.status_code == 400:
                 raise ProcessingException(res.text)
             msg = "Converter not availble: %s (attempt: %s)"
             log.info(msg, exc, attempt)
             backoff(failures=math.sqrt(attempt))
         except RequestException as exc:
             msg = "Converter not availble: %s (attempt: %s)"
             log.error(msg, exc, attempt)
             backoff(failures=math.sqrt(attempt))
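Most of the examples on this page pair service_retries() with backoff() from servicelayer, but neither helper is shown. As a minimal sketch of the pattern they implement, assuming a fixed retry budget and a randomized sleep that grows with the failure count (the RETRIES constant and the delay formula below are illustrative, not the project's exact code):

import random
import time

# Hypothetical retry budget; the real value would come from settings.
RETRIES = 5

def service_retries():
    """Yield attempt numbers for retrying a call to an external service."""
    return range(1, RETRIES + 1)

def backoff(failures=0):
    """Sleep for a random delay that grows with the number of failures."""
    time.sleep(random.random() * max(1, failures))

That backoff() accepts fractional failure counts is why several examples pass math.sqrt(attempt): the delay still grows, but more slowly.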
Example #2
 def _document_to_pdf(self, file_path, result, work_path):
     """Converts an office document to PDF."""
     log.info('Converting [%s] to PDF...', result.file_name)
     out_path = os.path.basename(file_path)
     out_path = join_path(work_path, '%s.pdf' % out_path)
     file_name = result.file_name or 'data'
     mime_type = result.mime_type or DEFAULT
     attempt = 1
     for attempt in service_retries():
         fh = open(file_path, 'rb')
         try:
             files = {'file': (file_name, fh, mime_type)}
             res = requests.post(self.SERVICE_URL,
                                 files=files,
                                 timeout=(5, 305),
                                 stream=True)
             res.raise_for_status()
             with open(out_path, 'wb') as out_fh:
                 for chunk in res.iter_content(chunk_size=None):
                     out_fh.write(chunk)
             return out_path
         except RequestException as exc:
             if isinstance(exc, HTTPError):
                 if exc.response.status_code == 400:
                     raise ProcessingException(exc.response.text)
             log.error("Conversion failed: %s", exc)
             backoff(failures=attempt)
         finally:
             fh.close()
     raise ProcessingException("Document could not be converted to PDF.")
Example #3
 def _document_to_pdf(self, file_path, entity):
     """Converts an office document to PDF."""
     if UNOSERVICE_URL is None:
         raise RuntimeError("No UNOSERVICE_URL for document conversion.")
     log.info('Converting [%s] to PDF...', entity.first('fileName'))
     file_name = entity.first('fileName') or 'data'
     mime_type = entity.first('mimeType') or DEFAULT
     attempt = 1
     for attempt in service_retries():
         fh = open(file_path, 'rb')
         try:
             files = {'file': (file_name, fh, mime_type)}
             res = requests.post(UNOSERVICE_URL,
                                 files=files,
                                 timeout=(5, 305),
                                 stream=True)
             if res.status_code > 399:
                 raise ProcessingException(res.text)
             out_path = self.make_work_file('out.pdf')
             with open(out_path, 'wb') as out_fh:
                 bytes_written = 0
                 for chunk in res.iter_content(chunk_size=None):
                     bytes_written += len(chunk)
                     out_fh.write(chunk)
                 if bytes_written > 50:
                     return out_path
         except RequestException as exc:
             log.error("Conversion failed: %s", exc)
             backoff(failures=attempt)
         finally:
             fh.close()
     raise ProcessingException("Document could not be converted to PDF.")
Example #4
 def _delete_blob(self, blob):
     for attempt in service_retries():
         try:
             blob.delete()
             return
         except NotFound:
             return
         except FAILURES:
             log.exception("Delete error in GS")
             backoff(failures=attempt)
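FAILURES in this and the other Google Storage examples is not defined on this page; it is presumably a tuple of transient google.cloud exception types that justify a retry. An illustrative guess, not the project's exact list:

from google.api_core.exceptions import (InternalServerError,
                                        ServiceUnavailable,
                                        TooManyRequests)

# Transient errors that make a retry with backoff() worthwhile;
# permanent conditions such as NotFound are handled separately above.
FAILURES = (InternalServerError, ServiceUnavailable, TooManyRequests)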
Example #5
def wait_for_redis(conn):
    """Wait for redis to load its data into memory on initial system
    bootup."""
    for attempt in service_retries():
        try:
            conn.get('test_redis_ready')
            return conn
        except BusyLoadingError:
            log.info("Waiting for redis to load...")
            backoff(failures=attempt)
    raise RuntimeError("Redis is not ready.")
Example #6
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body['id'] = str(id)
            body.pop('text', None)
            return body
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
Example #7
def wait_for_redis(pool):
    """Wait for redis to load its data into memory on initial system
    bootup."""
    for attempt in service_retries():
        try:
            conn = Redis(connection_pool=pool, decode_responses=True)
            conn.ping()
            return
        except BusyLoadingError:
            log.info("Waiting for redis to load...")
            backoff(failures=attempt)
    raise RuntimeError("Redis is not ready.")
Example #8
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body["id"] = str(id)
            body.pop("text", None)
            return body
        except TransportError as exc:
            if exc.status_code in ("400", "403"):
                raise
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
Example #9
 def handle_done(cls, queue):
     if not queue.is_done():
         return
     # HACK: randomly wait a little to avoid double-triggering the
     # index process.
     backoff()
     index = ServiceQueue(queue.conn,
                          ServiceQueue.OP_INDEX,
                          queue.dataset,
                          priority=queue.priority)
     if index.is_done():
         log.info("Ingest %r finished, queue index...", queue.dataset)
         index.queue_task({}, {})
     queue.remove()
Example #10
def get_es():
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                es = Elasticsearch(url, timeout=timeout)
                es.info()
                settings._es_instance = es
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
Example #11
File: util.py Project: pudo/aleph
def query_delete(index, query, sync=False, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               **kwargs)
            return
        except TransportError as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
Example #12
    def load_file(self, content_hash, file_name=None, temp_path=None):
        """Retrieve a file from Google storage and put it onto the local file
        system for further processing."""
        for attempt in service_retries():
            try:
                blob = self._locate_contenthash(content_hash)
                if blob is not None:
                    path = self._local_path(content_hash, file_name, temp_path)
                    blob.download_to_filename(path)
                    return path
            except FAILURES:
                log.exception("Load error in GS")
                backoff(failures=attempt)

        # Returns None for "persistent error" as well as "file not found" :/
        log.debug("[%s] not found, or the backend is down.", content_hash)
Example #13
def query_delete(index, query, sync=False, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               request_timeout=MAX_REQUEST_TIMEOUT,
                               timeout=MAX_TIMEOUT,
                               **kwargs)
            return
        except TransportError as exc:
            if int(exc.status_code) in (400, 403):
                raise
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
Example #14
    def _document_to_pdf(self, file_path, entity):
        """Converts an office document to PDF."""
        # Attempt to guess an appropriate time for processing
        # Guessed: 15s per MB of data, max.
        file_size = file_path.stat().st_size
        if file_size < 100:
            raise ProcessingException("Document too small.")
        file_size = (file_size / 1024) / 1024  # megabyte
        timeout = int(min(600, max(20, file_size * 15)))

        file_name = entity_filename(entity)
        mime_type = entity.first('mimeType')
        log.info('Converting [%s] to PDF (%ds timeout)...',
                 file_name, timeout)
        failed = ProcessingException("Document could not be converted to PDF.")
        for attempt in service_retries():
            try:
                with open(file_path, 'rb') as fh:
                    files = {'file': (file_name, fh, mime_type)}
                    res = requests.post(CONVERT_URL,
                                        params={'timeout': timeout},
                                        files=files,
                                        timeout=timeout + 3,
                                        stream=True)
                res.raise_for_status()
                out_path = self.make_work_file('out.pdf')
                with open(out_path, 'wb') as fh:
                    bytes_written = 0
                    for chunk in res.iter_content(chunk_size=None):
                        bytes_written += len(chunk)
                        fh.write(chunk)
                    if bytes_written > 50:
                        return out_path
                raise failed
            except RequestException as exc:
                if isinstance(exc, HTTPError) and \
                        exc.response.status_code == 400:
                    raise ProcessingException(res.text)
                log.error("Conversion failed: %s", exc)
                backoff(failures=math.sqrt(attempt))
        raise failed
Example #15
def get_es():
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                # When logging structured logs, use a custom transport to log
                # all es queries and their response time
                if sls.LOG_FORMAT == LOG_FORMAT_JSON:
                    es = Elasticsearch(url,
                                       transport_class=LoggingTransport,
                                       timeout=timeout)
                else:
                    es = Elasticsearch(url, timeout=timeout)
                es.info()
                settings._es_instance = es
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
Example #16
 def _document_to_pdf(self, file_path, entity):
     """Converts an office document to PDF."""
     file_name = entity_filename(entity)
     mime_type = entity.first("mimeType")
     for attempt in count(1):
         log.debug("Converting [%s] to PDF (attempt %d)...", entity,
                   attempt)
         try:
             with open(file_path, "rb") as fh:
                 files = {"file": (file_name, fh, mime_type)}
                 res = requests.post(
                     CONVERT_URL,
                     params={"timeout": CONVERT_TIMEOUT},
                     files=files,
                     timeout=CONVERT_TIMEOUT + 10,
                     stream=True,
                 )
             res.raise_for_status()
             out_path = self.make_work_file("out.pdf")
             with open(out_path, "wb") as fh:
                 bytes_written = 0
                 for chunk in res.iter_content(chunk_size=None):
                     bytes_written += len(chunk)
                     fh.write(chunk)
                 if bytes_written > 50:
                     return out_path
             raise ProcessingException("Could not be converted to PDF.")
         except HTTPError as exc:
             if exc.response.status_code in (400, 500):
                 # For error 500, this might also be a temporary error
                 # in the conversion service, but all attempts to tell
                 # these cases apart have failed so far.
                 raise ProcessingException(res.text)
             msg = "Converter not available: %s (attempt: %s)"
             log.info(msg, exc, attempt)
             backoff(failures=math.sqrt(attempt))
         except RequestException as exc:
             msg = "Converter not available: %s (attempt: %s)"
             log.error(msg, exc, attempt)
             backoff(failures=math.sqrt(attempt))
Example #17
    def archive_file(self, file_path, content_hash=None, mime_type=None):
        """Store the file located at the given path on Google, based on a path
        made up from its SHA1 content hash."""
        file_path = ensure_path(file_path)
        if content_hash is None:
            content_hash = checksum(file_path)

        if content_hash is None:
            return

        file_path = ensure_posix_path(file_path)
        for attempt in service_retries():
            try:
                # blob = self._locate_contenthash(content_hash)
                # if blob is not None:
                #     return content_hash

                path = os.path.join(path_prefix(content_hash), "data")
                blob = Blob(path, self.bucket)
                blob.upload_from_filename(file_path, content_type=mime_type)
                return content_hash
            except FAILURES:
                log.exception("Store error in GS")
                backoff(failures=attempt)