def preview(request, id, step=None, page="1"):
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exist")

    if request.user.has_perm(Access.PERM_READ, doc):
        doc_ep = doc.doc_ep

        # Fetch the original document if there is no local copy yet.
        if not doc_ep.exists():
            download(doc_ep)

        page_ep = doc.get_page_ep(
            page_num=page,
            step=Step(step),
        )

        # Render the page image on demand if it has not been extracted yet.
        if not page_ep.img_exists():
            extract_img(page_ep)

        try:
            with open(page_ep.img_url(), "rb") as f:
                return HttpResponse(f.read(), content_type="image/jpeg")
        except IOError:
            # Propagate unexpected read errors (results in a 500 response).
            raise

    return redirect('core:index')
def hocr(request, id, step=None, page="1"):
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exist")

    doc_ep = doc.doc_ep

    if request.user.has_perm(Access.PERM_READ, doc):

        if not doc_ep.exists():
            download(doc_ep)

        page_count = get_pagecount(doc_ep.url())

        # The page number may arrive as a string from the URL;
        # convert it before comparing against the integer page count.
        page = int(page)
        if page > page_count or page < 0:
            raise Http404("Page does not exist")

        page_ep = doc.page_eps[page]
        logger.debug(f"Extract words from {page_ep.hocr_url()}")

        if not page_ep.hocr_exists():
            # check if HOCR data exists on S3
            if settings.S3 and page_ep.hocr_exists(ep=Endpoint.S3):
                # ok, it should be able to download it.
                download_hocr(page_ep)
            else:
                # normal scenario, HOCR is not yet ready
                raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(
            hocr_file_path=page_ep.hocr_url()
        )
        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
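# Hypothetical URL wiring for the preview() and hocr() views above -- a
# minimal sketch, not taken from the project. The 'core' namespace is
# inferred from the redirect('core:index') call in preview(); the route
# patterns, names, and import path are assumptions.
from django.urls import path

from core import views  # assumed import path for the views above

urlpatterns = [
    path('preview/<int:id>/<int:step>/<str:page>/', views.preview, name='preview'),
    path('hocr/<int:id>/<int:step>/<str:page>/', views.hocr, name='hocr'),
]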
def update_text_field(self):
    """Update the text field from the associated .txt file.

    Returns a non-empty text string if the .txt file was found;
    otherwise returns an empty string.
    """
    if not settings.OCR:
        return ''

    text = ''
    logger.debug(f"Checking {self.txt_url}")

    if not self.txt_exists:
        logger.debug(
            f"Missing page txt {self.txt_url}."
        )
        # skip download to local media storage if S3
        # is disabled.
        if not settings.S3:
            logger.info("S3 disabled")
            return ''

        if not storage.download(self.page_ep):
            logger.info(
                f"document_log "
                f" username={self.user.username}"
                f" doc_id={self.document.id}"
                f" page_num={self.number}"
                f" text_len={len(text.strip())}"
            )
            return ''
    else:
        logger.debug(f"Page txt {self.txt_url} exists.")

    with open(self.txt_url) as file_handle:
        self.text = file_handle.read()
        self.save()
        logger.debug(
            f"text saved. len(page.text)=={len(self.text)}"
        )
        text = self.text
        logger.info(
            f"document_log "
            f" username={self.user.username}"
            f" doc_id={self.document.id}"
            f" page_num={self.number}"
            f" text_len={len(self.text.strip())}"
        )

    return text
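# Hypothetical usage sketch, not part of the original model code. It assumes
# the Document model exposes its pages through a related manager named
# "pages"; the "number" field is taken from the logging calls above.
def refresh_document_text(document):
    """Collect per-page text by calling update_text_field() on every page."""
    collected = []
    for page in document.pages.order_by('number'):  # "pages" manager is assumed
        page_text = page.update_text_field()
        if page_text:
            collected.append(page_text)
    return "\n".join(collected)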
def ocr_page(
    self,
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    s3_upload=True,
    s3_download=True,
    test_local_alternative=None
):
    # A bound task (bind=True) means the first argument to the task
    # will always be the task instance (self).
    # https://celery.readthedocs.io/en/latest/userguide/tasks.html#bound-tasks
    logger.info(
        f"worker_log task_id={self.request.id}"
        f" user_id={user_id} doc_id={document_id}"
        f" page_num={page_num}"
    )
    t1 = time.time()
    lang = lang.lower()

    doc_ep = DocumentEp(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )
    logger.debug(f"Received document_url={doc_ep.url(Endpoint.S3)}")

    if not doc_ep.exists():
        logger.debug(
            f"doc_ep={doc_ep.url()} does not exist. "
            f"Proceeding with download."
        )
        download(
            doc_ep,
            s3_download=s3_download,
            test_local_alternative=test_local_alternative
        )
    else:
        logger.debug(f"Local copy {doc_ep.url()} exists.")

    mime_type = mime.Mime(doc_ep.url())
    page_ep = None
    page_type = ''

    if mime_type.is_pdf():
        tx1 = time.time()
        page_ep = ocr_page_pdf(
            doc_ep=doc_ep,
            page_num=page_num,
            lang=lang
        )
        page_type = 'pdf'
        tx2 = time.time()
        logger.info(
            f"worker_log task_id={self.request.id}"
            f" user_id={user_id}"
            f" doc_id={document_id}"
            f" page_num={page_num} page_type=pdf"
            f" page_ocr_time={tx2 - tx1:.2f}"
        )
    else:
        logger.info(
            f"worker_log task_id={self.request.id}"
            f" user_id={user_id}"
            f" doc_id={document_id}"
            f" page_num={page_num} error=Unknown file type"
        )
        return True

    if page_ep and s3_upload:
        upload_page(page_ep)
        logger.info(
            f"worker_log task_id={self.request.id}"
            f" user_id={user_id}"
            f" doc_id={document_id}"
            f" page_num={page_num} uploaded={page_ep.url(Endpoint.S3)}"
        )

    t2 = time.time()
    logger.info(
        f"worker_log success task_id={self.request.id}"
        f" user_id={user_id} doc_id={document_id}"
        f" page_num={page_num} page_type={page_type}"
        f" total_exec_time={t2 - t1:.2f}"
    )
    return True
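# Hypothetical enqueue sketch, not part of the original task module. It assumes
# ocr_page is registered as a bound Celery task (e.g. @shared_task(bind=True),
# as the comment inside the task body suggests) and that a worker is running.
# document.page_count and document.file_name are assumed attribute names.
def enqueue_ocr(document, user, lang="eng"):
    """Queue one ocr_page task per page of the given document."""
    for page_num in range(1, document.page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': document.id,
            'file_name': document.file_name,
            'page_num': page_num,
            'lang': lang,
        })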