def test_preview(self): doc = Document.create_document( title="berlin.pdf", user=self.testcase_user, lang="ENG", file_name="berlin.pdf", size=1222, page_count=3 ) copy2doc_url( src_file_path=os.path.join( BASE_DIR, "data", "berlin.pdf" ), doc_url=doc.path.url() ) ret = self.client.post( reverse('core:preview', args=(doc.id, 1, 1)) ) self.assertEqual( ret.status_code, 200 ) page_path = PagePath( document_path=doc.path, page_num=1, step=Step(1), page_count=3 ) self.assertTrue( os.path.exists( default_storage.abspath(page_path.img_url()) ) )
def test_download_hocr_which_does_not_exists(self):
    """
    HOCR might not be available. This is a normal case
    (the page OCR task is still in the queue/in progress).
    Missing HOCR file => HTTP 404 return code is expected.
    """
    doc = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    # Doc is available (for get_pagecount on the server side).
    copy2doc_url(
        src_file_path=os.path.join(
            BASE_DIR, "data", "berlin.pdf"
        ),
        doc_url=doc.path.url()
    )
    # But the HOCR file is missing.
    ret = self.client.get(
        reverse('core:hocr', args=(doc.id, 1, 1))
    )
    self.assertEqual(
        ret.status_code,
        404
    )
def test_preview(self): doc = Document.create_document( title="andromeda.pdf", user=self.testcase_user, lang="ENG", file_name="andromeda.pdf", size=1222, page_count=3 ) copy2doc_url( src_file_path=os.path.join( BASE_DIR, "data", "andromeda.pdf" ), doc_url=doc.doc_ep.url() ) ret = self.client.post( reverse('core:preview', args=(doc.id, 1, 1)) ) self.assertEqual( ret.status_code, 200 ) page_url = PageEp( document_ep=doc.doc_ep, page_num=1, step=Step(1), page_count=3 ) self.assertTrue( os.path.exists(page_url.img_exists()) )
def test_download(self): doc = Document.create_document(title="andromeda.pdf", user=self.testcase_user, lang="ENG", file_name="andromeda.pdf", size=1222, page_count=3) copy2doc_url(src_file_path=os.path.join(BASE_DIR, "data", "andromeda.pdf"), doc_url=doc.doc_ep.url()) ret = self.client.post( reverse('core:document_download', args=(doc.id, ))) self.assertEqual(ret.status_code, 200)
def test_download_hocr(self):
    doc = Document.create_document(
        title="berlin.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="berlin.pdf",
        size=1222,
        page_count=3
    )
    copy2doc_url(
        src_file_path=os.path.join(
            BASE_DIR, "data", "berlin.pdf"
        ),
        doc_url=default_storage.abspath(doc.path.url())
    )
    # Build the page path.
    page_path = doc.page_paths[1]
    # Remember that at the end of the test the
    # copied file must be deleted. (1)
    copy2doc_url(
        src_file_path=os.path.join(
            BASE_DIR, "data", "page-1.hocr"
        ),
        doc_url=default_storage.abspath(page_path.hocr_url())
    )
    ret = self.client.get(
        reverse('core:hocr', args=(doc.id, 1, 1))
    )
    self.assertEqual(
        ret.status_code,
        200
    )
    # Delete the file created at (1).
    os.remove(
        default_storage.abspath(page_path.hocr_url())
    )
def import_file(
    filepath,
    username=None,
    file_title=None,
    inbox_title="Inbox",
    delete_after_import=False,
    start_ocr_async=True,
    upload=True
):
    """
    Gets as input a path to a file on the local file system and:

    1. Creates a document instance (if there is storage space available).
    2. Copies the file to doc_instance.url().
    3. (optionally) Uploads the document to S3 storage.
    4. (optionally) Starts the ocr_async task.

    Used on customer instances by:

    * the import_file command - to import files from the SFTP directory
    * the import_attachment command - to import attachments from the mailbox
    """
    logger.debug(f"Importing file {filepath}")

    if username is None:
        user = get_root_user()
    else:
        user = User.objects.get(username=username)

    if file_title is None:
        file_title = get_file_title(filepath)

    if not is_storage_left(filepath, user=user):
        logger.error(f"{user.username} reached his disk quota")
        return False

    lang = Document.get_default_language()

    # get_pagecount() might raise an exception if the file is
    # either invalid (not a PDF) or its upload is not yet complete.
    try:
        page_count = get_pagecount(filepath)
    except Exception:
        # The document is not yet fully uploaded by the SFTP client.
        logger.error(f"File {filepath} not yet ready for importing.")
        return False

    inbox, _ = Folder.objects.get_or_create(
        title=inbox_title,
        parent=None,
        user=user
    )
    doc = Document.create_document(
        user=user,
        title=file_title,
        size=get_file_size(filepath),
        lang=lang,
        file_name=file_title,
        parent_id=inbox.id,
        page_count=page_count
    )
    logger.debug(f"Uploading file {filepath} to {doc.doc_ep.url()}")

    # import_file is executed as root (import-file.service), because it
    # needs to access/delete SFTP files and folders owned by another
    # system user. Thus, after copying the file into the (newly created)
    # folders, it needs to change the permissions of newly created files
    # and folders to the app user/app group.
    copy2doc_url(
        src_file_path=filepath,
        doc_url=doc.doc_ep.url(),
        user=settings.APP_USER,
        group=settings.APP_GROUP
    )

    if upload and settings.S3:
        upload_document_to_s3(doc.doc_ep)

    if start_ocr_async and settings.OCR:
        Document.ocr_async(
            document=doc,
            page_count=page_count,
            lang=lang,
            s3_enabled=settings.S3
        )

    if delete_after_import:
        os.remove(filepath)

    return True
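# Hedged usage sketch, not part of the original module: the docstring of
# import_file() says it is driven by the import_file management command,
# which pulls files from an SFTP directory. The command below shows one
# way such a loop might call import_file(). The IMPORTER_DIR setting name
# and the command class itself are assumptions for illustration only.
import os

from django.conf import settings
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Import documents dropped into the SFTP import directory"

    def handle(self, *args, **options):
        importer_dir = settings.IMPORTER_DIR  # assumed setting name

        for entry in os.scandir(importer_dir):
            if not entry.is_file():
                continue
            # delete_after_import keeps the SFTP directory clean once the
            # file has been copied into document storage.
            import_file(
                filepath=entry.path,
                delete_after_import=True
            )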
def post(self, request):
    files = request.FILES.getlist('file')

    if not files:
        # Without a file there is nothing to do; bail out instead of
        # crashing on files[0] below.
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return HttpResponse(
            json.dumps({}),
            content_type="application/json",
            status=400
        )

    if len(files) > 1:
        logger.warning("More than one file per ajax request? How come?")
        return HttpResponse(
            json.dumps({}),
            content_type="application/json",
            status=400
        )

    f = files[0]

    logger.debug("upload for f=%s user=%s", f, request.user)

    if not is_storage_left(f.temporary_file_path()):
        logger.warning("Storage is full for user=%s.", request.user)
        msg = "Cannot upload file {}. Storage is full.".format(f.name)
        return HttpResponse(
            json.dumps({'error': msg}),
            status=400,
            content_type="application/json"
        )

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    parent_id = request.POST.get('parent', "-1")

    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    page_count = get_pagecount(f.temporary_file_path())

    logger.info("creating document {}".format(f.name))

    doc = Document.create_document(
        user=user,
        title=f.name,
        size=size,
        lang=lang,
        file_name=f.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count
    )

    logger.debug("uploading to {}".format(doc.doc_ep.url()))

    copy2doc_url(
        src_file_path=f.temporary_file_path(),
        doc_url=doc.doc_ep.url()
    )

    if settings.S3:
        upload_document_to_s3(doc.doc_ep)

    if settings.OCR:
        Document.ocr_async(
            document=doc,
            page_count=page_count,
            lang=lang
        )

    # Upload only one file at a time.
    # After each upload return a json object with
    # the following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting the selected item
    #
    # With that info a new thumbnail will be created.
    action_url = reverse(
        'boss:core_basetreenode_change', args=(doc.id, )
    )
    preview_url = reverse(
        'core:preview', args=(doc.id, 200, 1)
    )

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': action_url,
        'preview_url': preview_url
    }
    logger.info("upload response=%s", result)

    return HttpResponse(
        json.dumps(result),
        content_type="application/json"
    )
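# Hedged test sketch, not part of the original code: the comment block in
# post() documents the JSON fields (title, doc_id, action_url, preview_url)
# the frontend needs to render a new thumbnail. A test in the style of the
# preview/download tests above could pin them down. Assumptions for
# illustration only: the 'core:upload' URL name, and forcing uploads to be
# spooled to disk (post() calls f.temporary_file_path(), which only exists
# for uploads handled by TemporaryFileUploadHandler). Requires
# `from django.test import override_settings` at module level.
@override_settings(
    FILE_UPLOAD_HANDLERS=[
        'django.core.files.uploadhandler.TemporaryFileUploadHandler'
    ]
)
def test_upload_returns_thumbnail_info(self):
    with open(os.path.join(BASE_DIR, "data", "berlin.pdf"), "rb") as fh:
        ret = self.client.post(
            reverse('core:upload'),  # assumed URL name
            {'file': fh, 'language': "ENG"}
        )

    self.assertEqual(ret.status_code, 200)

    result = json.loads(ret.content)
    # Every field needed to build the thumbnail must be present.
    for key in ('title', 'doc_id', 'action_url', 'preview_url'):
        self.assertIn(key, result)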