def _serialize(self, obj):
    """Serialize an entity, resolving its collection and linked entities."""
    pk = obj.get("id")
    collection_id = obj.pop("collection_id", None)
    obj["collection"] = self.resolve(
        Collection, collection_id, CollectionSerializer
    )
    proxy = model.get_proxy(obj)
    properties = obj.get("properties", {})
    # Replace entity-type property values with their serialized form,
    # falling back to the raw ID when resolution yields nothing.
    for prop in proxy.iterprops():
        if prop.type != registry.entity:
            continue
        raw_values = ensure_list(properties.get(prop.name))
        resolved = []
        for value in raw_values:
            entity = self.resolve(Entity, value, EntitySerializer)
            resolved.append(entity or value)
        properties[prop.name] = resolved
    links = {
        "self": url_for("entities_api.view", entity_id=pk),
        "references": url_for("entities_api.references", entity_id=pk),
        "tags": url_for("entities_api.tags", entity_id=pk),
        "ui": entity_url(pk),
    }
    if proxy.schema.is_a(Document.SCHEMA):
        # Expose archive download links for stored file artifacts.
        content_hash = first(properties.get("contentHash"))
        if content_hash:
            links["file"] = archive_url(
                content_hash,
                file_name=entity_filename(proxy),
                mime_type=first(properties.get("mimeType")),
                expire=request.authz.expire,
            )
        pdf_hash = first(properties.get("pdfHash"))
        if pdf_hash:
            links["pdf"] = archive_url(
                pdf_hash,
                file_name=entity_filename(proxy, extension="pdf"),
                mime_type=PDF,
                expire=request.authz.expire,
            )
        csv_hash = first(properties.get("csvHash"))
        if csv_hash:
            links["csv"] = archive_url(
                csv_hash,
                file_name=entity_filename(proxy, extension="csv"),
                mime_type=CSV,
                expire=request.authz.expire,
            )
    obj["links"] = links
    obj["latinized"] = transliterate_values(proxy)
    obj["writeable"] = check_write_entity(obj, request.authz)
    obj["shallow"] = obj.get("shallow", True)
    return obj
def _serialize(self, obj):
    """Serialize an entity for API output, resolving collection and links."""
    pk = obj.get('id')
    obj['id'] = str(pk)
    authz = request.authz
    collection_id = obj.pop('collection_id', None)
    obj['collection'] = self.resolve(Collection, collection_id, CollectionSerializer)
    proxy = model.get_proxy(obj)
    obj['schemata'] = proxy.schema.names
    properties = obj.get('properties', {})
    # Resolve entity-type property values into nested serializations.
    for prop in proxy.iterprops():
        if prop.type != registry.entity:
            continue
        linked = ensure_list(properties.get(prop.name))
        properties[prop.name] = [
            self.resolve(Entity, value, EntitySerializer) for value in linked
        ]
    links = {
        'self': url_for('entities_api.view', entity_id=pk),
        'references': url_for('entities_api.references', entity_id=pk),
        'tags': url_for('entities_api.tags', entity_id=pk),
        'ui': entity_url(pk)
    }
    if proxy.schema.is_a(Document.SCHEMA):
        links['content'] = url_for('entities_api.content', entity_id=pk)
        content_hash = first(properties.get('contentHash'))
        if content_hash:
            links['file'] = archive_url(
                authz.id, content_hash,
                file_name=entity_filename(proxy),
                mime_type=first(properties.get('mimeType')))
        pdf_hash = first(properties.get('pdfHash'))
        if pdf_hash:
            links['pdf'] = archive_url(
                authz.id, pdf_hash,
                file_name=entity_filename(proxy, extension='pdf'),
                mime_type=PDF)
        csv_hash = first(properties.get('csvHash'))
        if csv_hash:
            links['csv'] = archive_url(
                authz.id, csv_hash,
                file_name=entity_filename(proxy, extension='csv'),
                mime_type=CSV)
    obj['links'] = links
    obj['writeable'] = authz.can(collection_id, authz.WRITE)
    obj.pop('_index', None)
    return self._clean_response(obj)
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF.

    Posts the file to the conversion service, retrying indefinitely with
    backoff while the service is unavailable. Raises ProcessingException
    when the document itself cannot be converted (HTTP 400) or when the
    service returns an implausibly small response.
    """
    file_name = entity_filename(entity)
    mime_type = entity.first('mimeType')
    log.info('Converting [%s] to PDF...', file_name)
    for attempt in count(1):
        try:
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(CONVERT_URL,
                                    params={'timeout': CONVERT_TIMEOUT},
                                    files=files,
                                    timeout=CONVERT_TIMEOUT + 10,
                                    stream=True)
            res.raise_for_status()
            out_path = self.make_work_file('out.pdf')
            with open(out_path, 'wb') as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            # A response smaller than ~50 bytes cannot be a valid PDF.
            if bytes_written > 50:
                return out_path
            raise ProcessingException("Could not be converted to PDF.")
        except HTTPError as exc:
            # A 400 signals a permanently unconvertible document; give up.
            if exc.response.status_code == 400:
                raise ProcessingException(res.text)
            # Fixed typo in log message: "availble" -> "available".
            msg = "Converter not available: %s (attempt: %s)"
            log.info(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
        except RequestException as exc:
            msg = "Converter not available: %s (attempt: %s)"
            log.error(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
def test_entity_filename(self):
    """entity_filename derives a name from fileName, extension or mimeType."""
    cases = [
        ({}, "banana"),
        ({"extension": [".doc"]}, "banana.doc"),
        ({"mimeType": ["application/pdf"]}, "banana.pdf"),
        ({"fileName": ["bla.doc"]}, "bla.doc"),
    ]
    for properties, expected in cases:
        proxy = model.get_proxy(
            {
                "id": "banana",
                "schema": "Document",
                "properties": properties,
            }
        )
        file_name = entity_filename(proxy)
        assert expected == file_name, file_name
    # An explicit extension argument overrides the fileName's extension:
    file_name = entity_filename(proxy, extension="pdf")
    assert "bla.pdf" == file_name, file_name
def ingest_entity(self, entity):
    """Fetch every stored file attached to the entity and ingest it."""
    for content_hash in entity.get("contentHash", quiet=True):
        file_name = entity_filename(entity)
        local_path = self.load(content_hash, file_name=file_name)
        # Skip hashes that cannot be materialized from the archive.
        if local_path is None:
            continue
        if not local_path.exists():
            continue
        self.ingest(local_path, entity)
    return self.finalize(entity)
def write_document(export_dir, zf, collection, entity):
    """Add the stored file of a Document entity to an open zip archive."""
    content_hash = entity.first("contentHash", quiet=True)
    if content_hash is None:
        return
    arcname = "{0}-{1}".format(entity.id, entity_filename(entity))
    arcname = os.path.join(collection.get("label"), arcname)
    try:
        local_path = archive.load_file(content_hash, temp_path=export_dir)
        if local_path is not None and os.path.exists(local_path):
            zf.write(local_path, arcname=arcname)
    finally:
        # Always drop the temporary copy, even if zipping failed.
        archive.cleanup_file(content_hash, temp_path=export_dir)
def test_entity_filename(self):
    """Check file-name derivation from various Document properties."""
    def make(properties=None):
        # Build a Document proxy, optionally with a properties mapping.
        data = {'id': 'banana', 'schema': 'Document'}
        if properties is not None:
            data['properties'] = properties
        return model.get_proxy(data)

    assert entity_filename(make()) == 'banana'
    assert entity_filename(make({'extension': ['.doc']})) == 'banana.doc'
    assert entity_filename(make({'mimeType': ['application/pdf']})) == 'banana.pdf'
    proxy = make({'fileName': ['bla.doc']})
    assert entity_filename(proxy) == 'bla.doc'
    # An explicit extension overrides the one taken from fileName:
    assert entity_filename(proxy, extension='pdf') == 'bla.pdf'
def document_to_pdf(self, file_path, entity):
    """Return a PDF rendition of the document, re-using the cache if possible."""
    key = self.cache_key('pdf', entity.first('contentHash'))
    cached_hash = self.get_cache_value(key)
    if cached_hash is not None:
        file_name = entity_filename(entity, extension='pdf')
        cached_path = self.manager.load(cached_hash, file_name=file_name)
        if cached_path is not None:
            log.info("Using PDF cache: %s", file_name)
            entity.set('pdfHash', cached_hash)
            return cached_path
    pdf_file = self._document_to_pdf(file_path, entity)
    if pdf_file is None:
        return None
    # Store the fresh rendition and remember its hash for next time.
    content_hash = self.manager.store(pdf_file)
    entity.set('pdfHash', content_hash)
    self.set_cache_value(key, content_hash)
    return pdf_file
def document_to_pdf(self, file_path, entity):
    """Convert a document to PDF, caching the result hash in the tags store."""
    key = self.cache_key("pdf", entity.first("contentHash"))
    cached_hash = self.tags.get(key)
    if cached_hash is not None:
        name = entity_filename(entity, extension="pdf")
        cached = self.manager.load(cached_hash, file_name=name)
        if cached is not None:
            log.info("Using PDF cache: %s", name)
            entity.set("pdfHash", cached_hash)
            return cached
    pdf_file = self._document_to_pdf(file_path, entity)
    if pdf_file is None:
        return None
    # Persist the fresh rendition and record its hash for reuse.
    content_hash = self.manager.store(pdf_file)
    entity.set("pdfHash", content_hash)
    self.tags.set(key, content_hash)
    return pdf_file
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF.

    Picks a timeout proportional to the file size, posts the file to the
    conversion service and retries transient failures with backoff.

    Raises ProcessingException when the document is too small, cannot be
    converted (HTTP 400), or all retries are exhausted.
    """
    # Attempt to guess an appropriate time for processing
    # Guessed: 15s per MB of data, max.
    file_size = file_path.stat().st_size
    if file_size < 100:
        # BUG FIX: this previously `return`ed the exception instance,
        # handing it to the caller as if it were a converted file path.
        raise ProcessingException("Document too small.")
    file_size = (file_size / 1024) / 1024  # megabyte
    timeout = int(min(600, max(20, file_size * 15)))
    file_name = entity_filename(entity)
    mime_type = entity.first('mimeType')
    log.info('Converting [%s] to PDF (%ds timeout)...', file_name, timeout)
    failed = ProcessingException("Document could not be converted to PDF.")
    for attempt in service_retries():
        try:
            with open(file_path, 'rb') as fh:
                files = {'file': (file_name, fh, mime_type)}
                res = requests.post(CONVERT_URL,
                                    params={'timeout': timeout},
                                    files=files,
                                    timeout=timeout + 3,
                                    stream=True)
            res.raise_for_status()
            out_path = self.make_work_file('out.pdf')
            with open(out_path, 'wb') as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            # A response smaller than ~50 bytes cannot be a valid PDF.
            if bytes_written > 50:
                return out_path
            raise failed
        except RequestException as exc:
            # A 400 signals a permanently unconvertible document.
            if isinstance(exc, HTTPError) and \
                    exc.response.status_code == 400:
                raise ProcessingException(res.text)
            log.error("Conversion failed: %s", exc)
            backoff(failures=math.sqrt(attempt))
    raise failed
def _document_to_pdf(self, file_path, entity):
    """Converts an office document to PDF.

    Posts the file to the conversion service and retries with backoff
    until it succeeds or fails permanently (HTTP 400/500).
    """
    file_name = entity_filename(entity)
    mime_type = entity.first("mimeType")
    # Retry forever; only a 400/500 response breaks out via an exception.
    for attempt in count(1):
        log.debug("Converting [%s] to PDF (attempt %d)...", entity, attempt)
        try:
            with open(file_path, "rb") as fh:
                files = {"file": (file_name, fh, mime_type)}
                res = requests.post(
                    CONVERT_URL,
                    params={"timeout": CONVERT_TIMEOUT},
                    files=files,
                    timeout=CONVERT_TIMEOUT + 10,
                    stream=True,
                )
            res.raise_for_status()
            out_path = self.make_work_file("out.pdf")
            with open(out_path, "wb") as fh:
                bytes_written = 0
                for chunk in res.iter_content(chunk_size=None):
                    bytes_written += len(chunk)
                    fh.write(chunk)
            # A response smaller than ~50 bytes cannot be a valid PDF.
            if bytes_written > 50:
                return out_path
            raise ProcessingException("Could not be converted to PDF.")
        except HTTPError as exc:
            if exc.response.status_code in (400, 500):
                # For error 500, this might also be a temporary error
                # in the conversion service. But all attempts to divvy
                # these phenomena apart have failed so far.
                raise ProcessingException(res.text)
            msg = "Converter not available: %s (attempt: %s)"
            log.info(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
        except RequestException as exc:
            msg = "Converter not available: %s (attempt: %s)"
            log.error(msg, exc, attempt)
            backoff(failures=math.sqrt(attempt))
def _serialize(self, obj):
    """Serialize an entity for API output.

    Resolves linked entities, builds download/UI links, enforces read
    authorization on the owning collection (returning None on denial),
    and resolves the collection and role objects.
    """
    pk = obj.get("id")
    proxy = model.get_proxy(obj)
    properties = {}
    for prop, value in proxy.itervalues():
        properties.setdefault(prop.name, [])
        if prop.type == registry.entity:
            # Swap entity IDs for their serialized form where available.
            entity = self.resolve(Entity, value, EntitySerializer)
            value = entity or value
        if value is not None:
            properties[prop.name].append(value)
    obj["properties"] = properties
    links = {
        "self": url_for("entities_api.view", entity_id=pk),
        "expand": url_for("entities_api.expand", entity_id=pk),
        "tags": url_for("entities_api.tags", entity_id=pk),
        "ui": entity_url(pk),
    }
    if proxy.schema.is_a(Document.SCHEMA):
        # Expose download links for file artifacts stored in the archive.
        content_hash = proxy.first("contentHash", quiet=True)
        if content_hash:
            name = entity_filename(proxy)
            mime = proxy.first("mimeType", quiet=True)
            links["file"] = archive_url(content_hash, file_name=name, mime_type=mime)
        pdf_hash = proxy.first("pdfHash", quiet=True)
        if pdf_hash:
            name = entity_filename(proxy, extension="pdf")
            links["pdf"] = archive_url(pdf_hash, file_name=name, mime_type=PDF)
        csv_hash = proxy.first("csvHash", quiet=True)
        if csv_hash:
            name = entity_filename(proxy, extension="csv")
            links["csv"] = archive_url(csv_hash, file_name=name, mime_type=CSV)
    collection = obj.get("collection") or {}
    coll_id = obj.pop("collection_id", collection.get("id"))
    # This is a last resort catcher for entities nested in other
    # entities that get resolved without regard for authz.
    if not request.authz.can(coll_id, request.authz.READ):
        return None
    obj["collection"] = self.resolve(Collection, coll_id, CollectionSerializer)
    role_id = obj.pop("role_id", None)
    obj["role"] = self.resolve(Role, role_id, RoleSerializer)
    obj["links"] = links
    obj["latinized"] = transliterate_values(proxy)
    obj["writeable"] = check_write_entity(obj, request.authz)
    obj["shallow"] = obj.get("shallow", True)
    # Phasing out multi-values here (2021-01):
    obj["created_at"] = min(ensure_list(obj.get("created_at")), default=None)
    obj["updated_at"] = max(ensure_list(obj.get("updated_at")), default=None)
    return obj