def index_safe(index, id, body, **kwargs):
    """Index a single document, retrying transient errors until it sticks.

    On success the indexed ``body`` is returned with its ``id`` stored as a
    string and the bulky ``text`` field stripped out. Returns ``None`` if all
    retries are exhausted.
    """
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
        else:
            body['id'] = str(id)
            # Drop the full text to keep the returned document light-weight.
            body.pop('text', None)
            return body
def query_delete(index, query, sync=False, **kwargs):
    """Delete every document in ``index`` matching ``query``, with retries."""
    for attempt in service_retries():
        try:
            es.delete_by_query(
                index=index,
                body={'query': query},
                # Don't abort the whole operation on version conflicts.
                conflicts='proceed',
                wait_for_completion=sync,
                refresh=refresh_sync(sync),
                **kwargs,
            )
        except TransportError as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
        else:
            return
def query_delete(index, query, sync=False, **kwargs):
    """Delete every document in ``index`` matching ``query``, with retries."""
    for attempt in service_retries():
        try:
            es.delete_by_query(
                index=index,
                body={'query': query},
                # Don't abort the whole operation on version conflicts.
                conflicts='proceed',
                timeout=TIMEOUT,
                request_timeout=REQUEST_TIMEOUT,
                wait_for_completion=sync,
                refresh=refresh_sync(sync),
                **kwargs,
            )
        except TransportError as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
        else:
            return
def load_file(self, content_hash, file_name=None, temp_path=None):
    """Retrieve a file from Google storage and put it onto the local
    file system for further processing."""
    for attempt in service_retries():
        try:
            blob = self._locate_contenthash(content_hash)
            if blob is None:
                # Not found on this attempt — retry without backing off.
                continue
            destination = self._local_path(content_hash, file_name, temp_path)
            blob.download_to_filename(destination)
            return destination
        except FAILURES:
            log.exception("Load error in GS")
            backoff(failures=attempt)
    # Returns None for "persistent error" as well as "file not found" :/
    log.debug("[%s] not found, or the backend is down.", content_hash)
def query_delete(index, query, sync=False, **kwargs):
    """Delete every document in ``index`` matching ``query``, with retries.

    Transient transport errors are retried with backoff; permanent client
    errors (bad request, forbidden) are re-raised immediately.
    """
    for attempt in service_retries():
        try:
            es.delete_by_query(
                index=index,
                body={"query": query},
                # Don't abort the whole operation on version conflicts.
                conflicts="proceed",
                wait_for_completion=sync,
                refresh=refresh_sync(sync),
                request_timeout=MAX_REQUEST_TIMEOUT,
                timeout=MAX_TIMEOUT,
                **kwargs,
            )
            return
        except TransportError as exc:
            # BUG FIX: ``status_code`` is an int in elasticsearch-py, so the
            # previous comparison against the strings ("400", "403") never
            # matched and permanent errors were retried forever.
            if exc.status_code in (400, 403):
                raise
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
def _document_to_pdf(self, file_path, entity):
    """Convert an office document to PDF via the conversion service.

    Returns the path of the generated PDF work file.
    Raises ProcessingException if the input is too small, the service
    rejects the document, or conversion keeps failing after retries.
    """
    file_size = file_path.stat().st_size
    if file_size < 100:
        # BUG FIX: this exception was previously *returned* instead of
        # raised, so callers received an exception instance as the
        # supposed output path.
        raise ProcessingException("Document too small.")
    # Guess an appropriate processing time: 15s per MB, clamped to [20, 600].
    megabytes = (file_size / 1024) / 1024
    timeout = int(min(600, max(20, megabytes * 15)))
    file_name = entity_filename(entity)
    mime_type = entity.first('mimeType')
    log.info('Converting [%s] to PDF (%ds timeout)...', file_name, timeout)
    failed = ProcessingException("Document could not be converted to PDF.")
    for attempt in service_retries():
        try:
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(CONVERT_URL,
                                    params={'timeout': timeout},
                                    files=files,
                                    timeout=timeout + 3,
                                    stream=True)
            res.raise_for_status()
            out_path = self.make_work_file('out.pdf')
            with open(out_path, 'wb') as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            # Anything smaller than ~50 bytes cannot be a real PDF.
            if bytes_written > 50:
                return out_path
            raise failed
        except RequestException as exc:
            if isinstance(exc, HTTPError) and \
                    exc.response.status_code == 400:
                # Permanent client error — surface the service's message
                # (read it from exc.response, which is always bound, rather
                # than the loop-local ``res``).
                raise ProcessingException(exc.response.text)
            log.error("Conversion failed: %s", exc)
            backoff(failures=math.sqrt(attempt))
    raise failed
def get_es():
    """Return a process-wide ElasticSearch client, creating it on first use.

    The client is cached on the settings object; connection attempts are
    retried with backoff before giving up.
    """
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                kwargs = {"timeout": timeout}
                # When logging structured logs, use a custom transport to log
                # all es queries and their response time
                if sls.LOG_FORMAT == LOG_FORMAT_JSON:
                    kwargs["transport_class"] = LoggingTransport
                client = Elasticsearch(url, **kwargs)
                # Probe the connection before caching the instance.
                client.info()
                settings._es_instance = client
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
def archive_file(self, file_path, content_hash=None, mime_type=None):
    """Store the file located at the given path on Google, based on
    a path made up from its SHA1 content hash.

    Returns the content hash on success, or ``None`` when no hash could
    be computed or all upload attempts failed.
    """
    file_path = ensure_path(file_path)
    if content_hash is None:
        content_hash = checksum(file_path)
    if content_hash is None:
        return
    file_path = ensure_posix_path(file_path)
    for attempt in service_retries():
        try:
            # NOTE: no existence check — uploading simply overwrites any
            # blob already stored under this content hash.
            blob_path = os.path.join(path_prefix(content_hash), "data")
            blob = Blob(blob_path, self.bucket)
            blob.upload_from_filename(file_path, content_type=mime_type)
            return content_hash
        except FAILURES:
            log.exception("Store error in GS")
            backoff(failures=attempt)
def test_basic(self):
    # The retry schedule must yield at least one attempt.
    attempts = list(service_retries())
    assert len(attempts)