Example #1
File: util.py Project: pudo/aleph
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body['id'] = str(id)
            body.pop('text', None)
            return body
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
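Every example on this page leans on two helpers, service_retries() and backoff(), which come from the servicelayer package and are not shown above. A minimal sketch of what they might look like (the default retry count and delay formula here are assumptions, not the actual servicelayer implementation):

import time
import random

def service_retries(max_failures=5):  # assumed default retry count
    """Yield attempt numbers (0, 1, 2, ...) for retrying an external service."""
    return range(max_failures)

def backoff(failures=0):
    """Sleep for a small, slightly randomized delay that grows with failures."""
    sleep = max(1, failures) + random.random()
    time.sleep(sleep)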
Example #2
File: util.py Project: pudo/aleph
def query_delete(index, query, sync=False, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               **kwargs)
            return
        except TransportError as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
Example #3
def query_delete(index, query, sync=False, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               timeout=TIMEOUT,
                               request_timeout=REQUEST_TIMEOUT,
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               **kwargs)
            return
        except TransportError as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
Example #4
    def load_file(self, content_hash, file_name=None, temp_path=None):
        """Retrieve a file from Google storage and put it onto the local file
        system for further processing."""
        for attempt in service_retries():
            try:
                blob = self._locate_contenthash(content_hash)
                if blob is not None:
                    path = self._local_path(content_hash, file_name, temp_path)
                    blob.download_to_filename(path)
                    return path
            except FAILURES:
                log.exception("Load error in GS")
                backoff(failures=attempt)

        # Returns None for "persistent error" as well as "file not found" :/
        log.debug("[%s] not found, or the backend is down.", content_hash)
Example #5
def query_delete(index, query, sync=False, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={"query": query},
                               conflicts="proceed",
                               wait_for_completion=sync,
                               refresh=refresh_sync(sync),
                               request_timeout=MAX_REQUEST_TIMEOUT,
                               timeout=MAX_TIMEOUT,
                               **kwargs)
            return
        except TransportError as exc:
            # Don't retry client errors; status_code is an int in
            # elasticsearch-py, so comparing against strings never matched.
            if exc.status_code in (400, 403):
                raise
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
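A typical call site passes the inner query dict directly, since the function wraps it in {'query': ...} itself. A usage sketch (the index and field names are hypothetical):

# sync=True waits for the delete-by-query to complete and refreshes the
# index so the deletion is immediately visible to searches.
query_delete("entities-v1", {"term": {"collection_id": "42"}}, sync=True)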
Example #6
    def _document_to_pdf(self, file_path, entity):
        """Converts an office document to PDF."""
        # Attempt to guess an appropriate time for processing
        # Guessed: 15s per MB of data, max.
        file_size = file_path.stat().st_size
        if file_size < 100:
            # Raise, don't return: handing back an exception instance would be
            # treated as a result by the caller instead of signalling failure.
            raise ProcessingException("Document too small.")
        file_size = (file_size / 1024) / 1024  # megabyte
        timeout = int(min(600, max(20, file_size * 15)))

        file_name = entity_filename(entity)
        mime_type = entity.first('mimeType')
        log.info('Converting [%s] to PDF (%ds timeout)...',
                 file_name, timeout)
        failed = ProcessingException("Document could not be converted to PDF.")
        for attempt in service_retries():
            try:
                with open(file_path, 'rb') as fh:
                    files = {'file': (file_name, fh, mime_type)}
                    res = requests.post(CONVERT_URL,
                                        params={'timeout': timeout},
                                        files=files,
                                        timeout=timeout + 3,
                                        stream=True)
                res.raise_for_status()
                out_path = self.make_work_file('out.pdf')
                with open(out_path, 'wb') as fh:
                    bytes_written = 0
                    for chunk in res.iter_content(chunk_size=None):
                        bytes_written += len(chunk)
                        fh.write(chunk)
                    if bytes_written > 50:
                        return out_path
                raise failed
            except RequestException as exc:
                if isinstance(exc, HTTPError) and \
                        exc.response.status_code == 400:
                    raise ProcessingException(res.text)
                log.error("Conversion failed: %s", exc)
                backoff(failures=math.sqrt(attempt))
        raise failed
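The timeout heuristic above guesses 15 seconds per megabyte and clamps the result to the range [20, 600]. Reproducing the arithmetic for a few sample sizes:

for size_mb in (1, 10, 100):
    timeout = int(min(600, max(20, size_mb * 15)))
    print(f"{size_mb} MB -> {timeout} s")
# 1 MB -> 20 s (floor), 10 MB -> 150 s, 100 MB -> 600 s (cap)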
Example #7
def get_es():
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                # When logging structured logs, use a custom transport to log
                # all es queries and their response time
                if sls.LOG_FORMAT == LOG_FORMAT_JSON:
                    es = Elasticsearch(url,
                                       transport_class=LoggingTransport,
                                       timeout=timeout)
                else:
                    es = Elasticsearch(url, timeout=timeout)
                es.info()
                settings._es_instance = es
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
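Because the client is memoized on the settings object via the hasattr() check, repeated calls are cheap and share one connection pool:

es = get_es()
assert get_es() is es  # second call returns the cached client, no new es.info() ping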
Example #8
    def archive_file(self, file_path, content_hash=None, mime_type=None):
        """Store the file located at the given path on Google, based on a path
        made up from its SHA1 content hash."""
        file_path = ensure_path(file_path)
        if content_hash is None:
            content_hash = checksum(file_path)

        if content_hash is None:
            return

        file_path = ensure_posix_path(file_path)
        for attempt in service_retries():
            try:
                # blob = self._locate_contenthash(content_hash)
                # if blob is not None:
                #     return content_hash

                path = os.path.join(path_prefix(content_hash), "data")
                blob = Blob(path, self.bucket)
                blob.upload_from_filename(file_path, content_type=mime_type)
                return content_hash
            except FAILURES:
                log.exception("Store error in GS")
                backoff(failures=attempt)
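Examples #4 and #8 form a round trip: archive_file() uploads a file under a path derived from its content hash, and load_file() fetches it back by that hash. A usage sketch (the class name and constructor arguments are assumptions, not the actual servicelayer API):

# Hypothetical archive instance; construction details are an assumption.
archive = GoogleStorageArchive(bucket="my-archive-bucket")
content_hash = archive.archive_file("/tmp/report.pdf", mime_type="application/pdf")
if content_hash is not None:
    local_path = archive.load_file(content_hash)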
Example #9
    def test_basic(self):
        assert len(list(service_retries()))