def test_preview(self):
    """
    Requesting a page preview answers 200 and produces the page image.

    Creates a 3-page document record for ``andromeda.pdf``, copies the
    fixture file from ``BASE_DIR/data`` to the document's endpoint url,
    POSTs to the ``core:preview`` route and then checks that the image
    for page 1 / ``Step(1)`` is reported as existing.
    """
    doc = Document.create_document(
        title="andromeda.pdf",
        user=self.testcase_user,
        lang="ENG",
        file_name="andromeda.pdf",
        size=1222,
        page_count=3
    )
    copy2doc_url(
        src_file_path=os.path.join(BASE_DIR, "data", "andromeda.pdf"),
        doc_url=doc.doc_ep.url()
    )

    ret = self.client.post(
        reverse('core:preview', args=(doc.id, 1, 1))
    )
    self.assertEqual(ret.status_code, 200)

    page_url = PageEp(
        document_ep=doc.doc_ep,
        page_num=1,
        step=Step(1),
        page_count=3
    )
    # img_exists() is itself the existence check (the preview view uses it
    # as a truth test). Wrapping its result in os.path.exists() would make
    # the assertion vacuous: a bool is an int, and os.path.exists() treats
    # ints as file descriptors, so True/False map to the always-open
    # stdout/stdin and the check passes no matter what.
    self.assertTrue(page_url.img_exists())
def preview(request, id, step=None, page="1"):
    """
    Return the image of one document page as an ``image/jpeg`` response.

    Raises ``Http404`` when no ``Document`` with the given ``id`` exists.
    Users without ``Access.PERM_READ`` on the document are redirected to
    ``core:index``. For permitted users the document is downloaded if its
    endpoint does not exist yet, and the page image is extracted if it is
    missing, before the image file is read and returned.

    An ``IOError`` raised while reading the image propagates to the caller.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    # Guard clause: anyone without read access goes back to the index.
    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    document_endpoint = doc.doc_ep
    if not document_endpoint.exists():
        download(document_endpoint)

    page_ep = doc.get_page_ep(page_num=page, step=Step(step))
    if not page_ep.img_exists():
        extract_img(page_ep)

    with open(page_ep.img_url(), "rb") as image_file:
        payload = image_file.read()
    return HttpResponse(payload, content_type="image/jpeg")
def test_hocr_exists(self):
    """
    ``hocr_exists()`` is true for page 1 and false for page 2.

    The local endpoint points at the ``test/media`` directory located two
    levels above this file; both pages belong to the same document
    endpoint (user 1, document 3, ``x.pdf``) and use ``Step(1)``.
    """
    package_root = os.path.dirname(os.path.dirname(__file__))
    media_dir = os.path.join(package_root, "test", "media")

    document = DocumentEp(
        remote_endpoint=Endpoint("s3:/test-papermerge/"),
        local_endpoint=Endpoint(f"local:{media_dir}"),
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )

    def page(number):
        # Both pages differ only in their page number.
        return PageEp(
            document_ep=document,
            page_num=number,
            step=Step(1),
            page_count=3
        )

    self.assertTrue(page(1).hocr_exists())
    self.assertFalse(page(2).hocr_exists())
def test_ppmroot(self):
    """
    ``ppmroot`` of page 1 at ``Step(1)`` equals the expected results path.

    Uses the fixture endpoints ``self.remote_ep`` / ``self.local_ep`` with
    user 1, document 3 and file name ``x.pdf``.
    """
    expected = (
        "/var/media/results/user_1/"
        "document_3/pages/page_1/100/page"
    )
    document = DocumentEp(
        remote_endpoint=self.remote_ep,
        local_endpoint=self.local_ep,
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    first_page = PageEp(
        document_ep=document,
        page_num=1,
        step=Step(1),
        page_count=3
    )

    self.assertEqual(first_page.ppmroot, expected)
def test_txt_url(self):
    """
    Called without arguments, ``page_ep.url()`` equals ``page_ep.txt_url()``.
    """
    document = DocumentEp(
        remote_endpoint=self.remote_ep,
        local_endpoint=self.local_ep,
        user_id=1,
        document_id=3,
        file_name="x.pdf"
    )
    page_ep = PageEp(
        document_ep=document,
        page_num=1,
        step=Step(1),
        page_count=3
    )

    default_url = page_ep.url()
    text_url = page_ep.txt_url()
    self.assertEqual(default_url, text_url)
def ocr_page_pdf(doc_ep, page_num, lang):
    """
    Run image, text and hOCR extraction for one page of a PDF document.

    The page count is read from ``doc_ep.url()``. For a page number within
    that count, the page image and text are extracted at ``Step(1)``; then,
    for every step yielded by ``Steps()``, the image is extracted again and
    hOCR is extracted for each step that is not a thumbnail.

    :param doc_ep: document endpoint; only ``doc_ep.url()`` is used here and
        the object is handed on to ``PageEp``.
    :param page_num: number of the page to process, compared against the
        document's page count.
    :param lang: language passed through to ``extract_txt`` and
        ``extract_hocr``.
    :return: the ``PageEp`` that was processed (its ``step`` is left at the
        last step iterated), or ``None`` when ``page_num`` exceeds the
        document's page count.
    """
    page_count = get_pagecount(doc_ep.url())
    logger.debug(f"page_count={page_count}")

    if page_num > page_count:
        # Nothing to process. Return explicitly instead of falling through
        # to a ``return`` of a name that was never bound.
        logger.warning(
            "page_num=%s is beyond page_count=%s; skipping OCR",
            page_num,
            page_count
        )
        return None

    page_url = PageEp(
        document_ep=doc_ep,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_img(page_url)
    extract_txt(page_url, lang=lang)

    for step in Steps():
        page_url.step = step
        extract_img(page_url)
        # hOCR is only produced for non-thumbnail steps.
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang)

    return page_url
def test_step(self):
    """``Step(1)`` must not report itself as a thumbnail."""
    step = Step(1)
    failure_message = f"{step} is is_thumbnail, but it should not be!"

    self.assertFalse(step.is_thumbnail, msg=failure_message)