Exemplo n.º 1
0
 def test_preview(self):
     """
     POSTing to the ``core:preview`` route for a freshly created document
     must answer with HTTP 200, after which the page endpoint for
     page 1 / step 1 must report that its image exists.
     """
     doc = Document.create_document(
         title="andromeda.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="andromeda.pdf",
         size=1222,
         page_count=3
     )
     # Put the fixture PDF where the document endpoint expects to find it.
     copy2doc_url(
         src_file_path=os.path.join(
             BASE_DIR, "data", "andromeda.pdf"
         ),
         doc_url=doc.doc_ep.url()
     )
     ret = self.client.post(
         reverse('core:preview', args=(doc.id, 1, 1))
     )
     self.assertEqual(
         ret.status_code,
         200
     )
     page_url = PageEp(
         document_ep=doc.doc_ep,
         page_num=1,
         step=Step(1),
         page_count=3
     )
     # img_exists() is a predicate, not a path. Feeding its result to
     # os.path.exists() would make Python interpret the bool as a file
     # descriptor (0 or 1), so the check would not depend on the image
     # at all. Assert on the predicate itself.
     self.assertTrue(
         page_url.img_exists()
     )
Exemplo n.º 2
0
def preview(request, id, step=None, page="1"):
    """
    Return the JPEG image of one page of a document.

    Parameters:
        request: the incoming HTTP request; ``request.user`` is checked
            for ``Access.PERM_READ`` on the document.
        id: primary key of the document to preview (the name shadows the
            builtin, but it is part of the view's signature and is kept).
        step: value handed to ``Step(...)`` when building the page endpoint.
        page: page number handed to ``doc.get_page_ep`` as ``page_num``.

    Returns:
        An ``HttpResponse`` with content type ``image/jpeg`` carrying the
        bytes of the page image, or a redirect to ``core:index`` when the
        user lacks read permission on the document.

    Raises:
        Http404: if no document with the given id exists.
        IOError: propagated unchanged if the image file cannot be read.
    """
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    # Guard clause: users without read access are sent to the index page.
    if not request.user.has_perm(Access.PERM_READ, doc):
        return redirect('core:index')

    doc_ep = doc.doc_ep
    if not doc_ep.exists():
        download(doc_ep)

    page_ep = doc.get_page_ep(
        page_num=page,
        step=Step(step),
    )
    if not page_ep.img_exists():
        extract_img(page_ep)

    # No try/except here: the former ``except IOError: raise`` handler only
    # re-raised, so letting the error propagate directly is equivalent.
    with open(page_ep.img_url(), "rb") as f:
        return HttpResponse(f.read(), content_type="image/jpeg")
Exemplo n.º 3
0
 def test_hocr_exists(self):
     """
     For the fixture document (user 1, document 3) rooted in the local
     ``test/media`` directory, ``hocr_exists()`` must be true for page 1
     and false for page 2.
     """
     package_dir = os.path.dirname(os.path.dirname(__file__))
     local_media = os.path.join(package_dir, "test", "media")
     doc_ep = DocumentEp(
         remote_endpoint=Endpoint("s3:/test-papermerge/"),
         local_endpoint=Endpoint(f"local:{local_media}"),
         user_id=1,
         document_id=3,
         file_name="x.pdf",
     )

     def page_ep_for(number):
         # Same document, same step, same page count; only the page varies.
         return PageEp(
             document_ep=doc_ep,
             page_num=number,
             step=Step(1),
             page_count=3,
         )

     self.assertTrue(page_ep_for(1).hocr_exists())
     self.assertFalse(page_ep_for(2).hocr_exists())
Exemplo n.º 4
0
 def test_ppmroot(self):
     """
     ``ppmroot`` of page 1 / step 1 for user 1, document 3 must equal the
     expected path below the results directory.
     """
     expected_root = (
         f"/var/media/results/user_1/"
         f"document_3/pages/page_1/100/page"
     )
     document_endpoint = DocumentEp(
         remote_endpoint=self.remote_ep,
         local_endpoint=self.local_ep,
         user_id=1,
         document_id=3,
         file_name="x.pdf",
     )
     first_page = PageEp(
         document_ep=document_endpoint,
         page_num=1,
         step=Step(1),
         page_count=3,
     )
     self.assertEqual(first_page.ppmroot, expected_root)
Exemplo n.º 5
0
 def test_txt_url(self):
     """
     Called without arguments, ``page_ep.url()`` must give the same
     result as ``page_ep.txt_url()``.
     """
     document_endpoint = DocumentEp(
         remote_endpoint=self.remote_ep,
         local_endpoint=self.local_ep,
         user_id=1,
         document_id=3,
         file_name="x.pdf",
     )
     page_ep = PageEp(
         document_ep=document_endpoint,
         page_num=1,
         step=Step(1),
         page_count=3,
     )
     default_url = page_ep.url()
     self.assertEqual(default_url, page_ep.txt_url())
Exemplo n.º 6
0
def ocr_page_pdf(doc_ep, page_num, lang):
    """
    Run image, text and hOCR extraction for a single page of a PDF document.

    The page count is read from the document itself via
    ``get_pagecount(doc_ep.url())``. For a page inside the document, the
    image and text are extracted at ``Step(1)``; then, for every step
    yielded by ``Steps()``, the image is extracted again and hOCR is
    extracted for all steps that are not thumbnails.

    Parameters:
        doc_ep: document endpoint; its ``url()`` locates the PDF.
        page_num: number of the page to process.
        lang: language passed on to the text and hOCR extraction.

    Returns:
        The ``PageEp`` used for the extraction (its ``step`` is left at the
        last step iterated), or ``None`` when ``page_num`` is greater than
        the number of pages in the document.
    """
    page_count = get_pagecount(doc_ep.url())
    logger.debug(f"page_count={page_count}")

    # Nothing to process for a page beyond the end of the document.
    # Returning here avoids referencing an unassigned local at the final
    # return, which used to raise UnboundLocalError.
    if page_num > page_count:
        logger.warning(
            "page_num=%s exceeds page_count=%s; nothing to OCR",
            page_num,
            page_count,
        )
        return None

    page_url = PageEp(document_ep=doc_ep,
                      page_num=page_num,
                      step=Step(1),
                      page_count=page_count)
    extract_img(page_url)
    extract_txt(page_url, lang=lang)

    for step in Steps():
        # The same endpoint object is reused; only its step changes.
        page_url.step = step
        extract_img(page_url)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang)

    return page_url
Exemplo n.º 7
0
 def test_step(self):
     """``Step(1)`` must not be flagged as a thumbnail step."""
     step = Step(1)
     failure_msg = f"{step} is is_thumbnail, but it should not be!"
     self.assertFalse(step.is_thumbnail, failure_msg)