def get_document_pdf(request, pk):
    """
    Retrieve a document by primary key, as a PDF file.

    Note that this is not a JSON view.

    The document is looked up through ``request.user.documents``. If it
    already has an asset of class DOCUMENT with the PDF mime type, that
    asset is instantiated locally and its bytes are returned together
    with the Content-Type, ETag and Last-Modified values reported in the
    asset's metadata. Otherwise the PDF is rendered directly into the
    response.

    Arguments:
        request -- the incoming request; ``request.user`` owns the lookup
        pk      -- primary key of the document to return

    Returns an HttpResponse carrying the PDF as an attachment named
    ``doc-<pk>.pdf``.

    Any exception raised by the document lookup, asset instantiation or
    rendering is propagated to the caller unchanged.
    """
    document = request.user.documents.get(pk=pk)
    disposition = 'attachment; filename=doc-%d.pdf' % document.pk

    assets = document.assets.filter(
        asset_class__name=models.AssetClass.DOCUMENT,
        mime_type__name=models.MimeType.PDF)

    if len(assets) != 0:
        meta = operations.instantiate_asset(assets[0])
        local_path = meta['Local-Path']
        try:
            # Read the bytes while the temporary copy still exists and
            # close the handle deterministically; the response must not
            # depend on a file whose directory is removed just below.
            with open(local_path, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()
        finally:
            # The instantiated asset lives in its own directory, which is
            # removed whether or not reading succeeded.
            shutil.rmtree(os.path.dirname(local_path))

        response = HttpResponse(pdf_bytes)
        response['Content-Disposition'] = disposition
        response['Content-Type'] = meta['Content-Type']
        response['Content-Length'] = len(pdf_bytes)
        response['ETag'] = meta['ETag']
        response['Last-Modified'] = meta['Last-Modified']
    else:
        # No stored PDF asset: render the document straight into the
        # response object.
        response = HttpResponse(content_type='application/pdf')
        response['Content-Disposition'] = disposition
        pdf.render_document(document,
                            response,
                            request.user.username,
                            str(document.pk))

    return response
def handle_work_item(processor, item):
    """
    Render the PDF for the document behind a work item and store it.

    The work item is supplied by the process driver framework, which also
    cleans up the item's local temp directory. If this method returns
    without raising an exception, the work item is removed from the work
    queue as well.

    Raises NotReadyException while the number of pages that have a
    PAGE_TEXT asset differs from the document's page count.

    Returns nothing when handle_trial_account() reports a truthy result;
    otherwise returns a one-element list holding the PDF asset that was
    updated or created.
    """
    doc = item['Asset-Instance'].related_document

    # Every page needs its OCR text before the PDF can be produced.
    text_pages = doc.pages.filter(
        assets__asset_class__name=models.AssetClass.PAGE_TEXT)
    ocr_page_count = text_pages.count()
    if doc.num_pages != ocr_page_count:
        raise NotReadyException(
            "Postponing PDF generation, OCR not complete for pages")

    # Render into a scratch buffer, then wrap the result in a fresh
    # stream for the upload below.
    rendered = pdf.render_document(doc,
                                   output_buffer=StringIO(),
                                   username=doc.owner.username,
                                   title=doc.title)
    pdf_stream = StringIO(rendered.getvalue())

    # trial account handling -- send the PDF in attachment
    # and delete all associated assets
    trial_handled = handle_trial_account(doc, pdf_stream.getvalue())
    if trial_handled:
        return None

    # classify the document based on the creation time of its PDF asset
    aggregate_window = datetime.timedelta(0, UPLOAD_AGGREGATE_TIME_TRESHOLD)
    tag_document(doc, aggregate_window)

    existing_pdfs = doc.assets.filter(
        asset_class__name=models.AssetClass.DOCUMENT,
        mime_type__name=models.MimeType.PDF)

    if len(existing_pdfs) == 0:
        # First PDF for this document: create the asset from the stream.
        pdf_asset = operations.create_asset_from_stream(
            data_stream=pdf_stream,
            owner=item['Owner'],
            producer=processor,
            asset_class=models.AssetClass.DOCUMENT,
            related_document=doc,
            file_name=doc.title,
            parent=item['Asset-Instance'],
            child_number=1,
            mime_type=models.MimeType.PDF)
    else:
        # Re-use the first existing PDF asset and upload the new content.
        pdf_asset = existing_pdfs[0]
        pdf_asset.producer = processor
        operations.upload_asset_stream(pdf_asset, pdf_stream)

    return [pdf_asset]
def handle_work_item(processor, item):
    """
    Generate the PDF of a work item's document and attach it as an asset.

    The process driver framework provides the work item and cleans up its
    local temp directory. When this method finishes without raising, the
    framework also removes the work item from the work queue.

    Raises NotReadyException if the count of pages with a PAGE_TEXT asset
    does not match the document's number of pages.

    Returns None when handle_trial_account() returns a truthy value;
    otherwise a list containing the single PDF asset that was uploaded to
    or newly created.
    """
    target = item["Asset-Instance"].related_document

    # PDF generation is postponed until OCR text exists for all pages.
    pages_with_text = target.pages.filter(
        assets__asset_class__name=models.AssetClass.PAGE_TEXT
    ).count()
    if target.num_pages != pages_with_text:
        raise NotReadyException("Postponing PDF generation, OCR not complete for pages")

    render_buffer = pdf.render_document(
        target,
        output_buffer=StringIO(),
        username=target.owner.username,
        title=target.title,
    )
    # A fresh stream over the rendered content is what gets stored.
    pdf_stream = StringIO(render_buffer.getvalue())

    # trial account handling -- send the PDF in attachment
    # and delete all associated assets
    if handle_trial_account(target, pdf_stream.getvalue()):
        return

    # classify the document based on the creation time of its PDF asset
    tag_document(target, datetime.timedelta(0, UPLOAD_AGGREGATE_TIME_TRESHOLD))

    stored_pdfs = target.assets.filter(
        asset_class__name=models.AssetClass.DOCUMENT,
        mime_type__name=models.MimeType.PDF,
    )

    if len(stored_pdfs) == 0:
        # No PDF asset yet: create one as child number 1 of the work
        # item's asset instance.
        return [
            operations.create_asset_from_stream(
                data_stream=pdf_stream,
                owner=item["Owner"],
                producer=processor,
                asset_class=models.AssetClass.DOCUMENT,
                related_document=target,
                file_name=target.title,
                parent=item["Asset-Instance"],
                child_number=1,
                mime_type=models.MimeType.PDF,
            )
        ]

    # Otherwise overwrite the content of the first existing PDF asset.
    current_pdf = stored_pdfs[0]
    current_pdf.producer = processor
    operations.upload_asset_stream(current_pdf, pdf_stream)
    return [current_pdf]