def page_eps(self):
    """
    Enables document instance to get quickly page endpoints:

        page_ep = doc.page_eps[2]
        page_ep.url()  # local url to second page of the doc.

    This is shortcut method when most used Step(1) is required.
    """
    # Index 0 is a placeholder so page numbering starts at 1.
    results = [None]

    # doc.page_count might be stale because per-page logic was
    # added just recently. So, let's use this opportunity and
    # correct it!
    actual_count = get_pagecount(self.doc_ep.url())
    if actual_count != self.page_count:
        self.page_count = actual_count
        self.save()

    results.extend(
        endpoint.PageEp(
            document_ep=self.doc_ep,
            page_num=number,
            step=step.Step(1),
            page_count=self.page_count,
        )
        for number in range(1, actual_count + 1)
    )
    return results
def delete_pages(doc_ep, page_numbers):
    """
    Physically remove ``page_numbers`` from the document behind
    ``doc_ep`` by re-assembling the PDF with pdftk.

    Returns the new (incremented) document version number.
    """
    src_url = doc_ep.url()
    page_count = get_pagecount(src_url)

    ranges_to_keep = cat_ranges_for_delete(
        page_count,
        page_numbers
    )

    # Bump the version first; doc_ep.url() below then points at the
    # destination of the new document version.
    doc_ep.inc_version()

    cmd = ["pdftk", src_url, "cat"]
    cmd.extend(str(rng) for rng in ranges_to_keep)
    cmd.append("output")

    dst_url = doc_ep.url()
    make_sure_path_exists(dst_url)
    cmd.append(dst_url)

    run(cmd)
    return doc_ep.version
def import_file(self,
                file_title=None,
                inbox_title="Inbox",
                delete_after_import=True,
                skip_ocr=False):
    """
    Gets as input a path to a file on a local file system and:

        1. Creates a document instance.
        2. Copies file to doc_instance.url().
        3. OCRs the document (unless ``skip_ocr`` is True).

    Used with:

        ./manage.py local_importer
        ./manage.py imap_importer

    commands.

    Returns the created Document on success, False otherwise.
    """
    logger.debug(f"Importing file {self.filepath}")

    if file_title is None:
        file_title = os.path.basename(self.filepath)

    try:
        page_count = get_pagecount(self.filepath)
    except Exception:
        # Log with traceback (logger.exception) so the cause is not
        # swallowed: failure here usually means the file is not a
        # valid PDF or is still being uploaded.
        logger.exception(
            f"Error while getting page count of {self.filepath}."
        )
        return False

    inbox, _ = Folder.objects.get_or_create(title=inbox_title,
                                            parent=None,
                                            user=self.user)
    doc = Document.create_document(user=self.user,
                                   title=file_title,
                                   size=os.path.getsize(self.filepath),
                                   lang=self.user_ocr_language,
                                   file_name=file_title,
                                   parent_id=inbox.id,
                                   page_count=page_count)
    logger.debug(f"Uploading file {self.filepath} to {doc.path.url()}")

    default_storage.copy_doc(
        src=self.filepath,
        dst=doc.path.url(),
    )

    if not skip_ocr:
        DocumentImporter.ocr_document(
            document=doc,
            page_count=page_count,
            lang=self.user_ocr_language,
        )

    if delete_after_import:
        # Usually we want to delete files when importing
        # them from local directory.
        # When importing from Email attachment - deleting
        # files does not apply.
        os.remove(self.filepath)

    logger.debug("Import complete.")

    return doc
def recreate_pages(self):
    """
    Drop all existing page models of this document and rebuild
    them from the current document file.
    """
    # Remove stale page records first.
    self.page_set.all().delete()

    # Page count may have changed since pages were last created.
    fresh_count = get_pagecount(self.doc_ep.url())
    self.page_count = fresh_count
    self.save()

    self.create_pages()
def hocr(request, id, step=None, page="1"):
    """
    Return HOCR words + metadata of one document page as JSON.

    ``page`` arrives from the URL as a string. Raises Http404 for a
    missing document, an out-of-range page, or HOCR data that is not
    yet available locally (or on S3).
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_ep = doc.doc_ep

    if request.user.has_perm(Access.PERM_READ, doc):
        if not doc_ep.exists():
            download(doc_ep)

        page_count = get_pagecount(doc_ep.url())
        # BUGFIX: ``page`` is a string captured from the URL; comparing
        # it against the int page_count raised TypeError. Pages are
        # 1-based (page_eps[0] is a None placeholder), so 0 is invalid.
        page = int(page)
        if page > page_count or page < 1:
            raise Http404("Page does not exists")

        page_ep = doc.page_eps[page]

        logger.debug(f"Extract words from {page_ep.hocr_url()}")

        if not page_ep.hocr_exists():
            # check if HOCR data exists on S3
            if settings.S3 and page_ep.hocr_exists(ep=Endpoint.S3):
                # ok, it should be able to download it.
                download_hocr(page_ep)
            else:
                # normal scenario, HOCR is not yet ready
                raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(
            hocr_file_path=page_ep.hocr_url()
        )

        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
def hocr(request, id, step=None, page="1"):
    """
    Return HOCR words + metadata of one document page as JSON.

    ``page`` arrives from the URL as a string. Raises Http404 for a
    missing document, an out-of-range page, or HOCR data that is not
    yet available on the local storage.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_path = doc.path

    if request.user.has_perm(Access.PERM_READ, doc):
        # document absolute path
        doc_abs_path = default_storage.abspath(doc_path.url())
        if not os.path.exists(doc_abs_path):
            raise Http404("HOCR data not yet ready.")

        page_count = get_pagecount(doc_abs_path)
        # BUGFIX: ``page`` is a string captured from the URL; comparing
        # it against the int page_count raised TypeError. Pages are
        # 1-based, so 0 is invalid as well.
        page = int(page)
        if page > page_count or page < 1:
            raise Http404("Page does not exists")

        page_path = doc.page_paths[page]
        hocr_abs_path = default_storage.abspath(page_path.hocr_url())

        logger.debug(f"Extract words from {hocr_abs_path}")

        if not os.path.exists(hocr_abs_path):
            raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(
            hocr_file_path=hocr_abs_path
        )

        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
def reorder_pages(doc_ep, new_order):
    """
    Reorder the pages of the document behind ``doc_ep`` with pdftk.

    ``new_order`` is a list of following format:

        [
            {'page_num': 2, page_order: 1},
            {'page_num': 1, page_order: 2},
            {'page_num': 3, page_order: 3},
            {'page_num': 4, page_order: 4},
        ]

    Example above means that in current document of 4 pages,
    first page was swapped with second one:

        page_num   = older page order
        page_order = current page order

    So in human language, each hash is read:
    <page_num> now should be <page_order>.

    Returns the new (incremented) document version number.
    """
    src_url = doc_ep.url()
    page_count = get_pagecount(src_url)

    reordered_ranges = cat_ranges_for_reorder(
        page_count=page_count,
        new_order=new_order
    )

    # Bump the version first; doc_ep.url() below then points at the
    # destination of the new document version.
    doc_ep.inc_version()

    cmd = ["pdftk", src_url, "cat"]
    cmd.extend(str(rng) for rng in reordered_ranges)
    cmd.append("output")

    dst_url = doc_ep.url()
    make_sure_path_exists(dst_url)
    cmd.append(dst_url)

    run(cmd)
    return doc_ep.version
def ocr_page_pdf(doc_ep, page_num, lang):
    """
    Extract image, text and HOCR data for one page of the PDF
    behind ``doc_ep``.

    Returns the PageEp of the processed page, or None when
    ``page_num`` is out of range.
    """
    page_count = get_pagecount(doc_ep.url())
    logger.debug(f"page_count={page_count}")

    # BUGFIX: originally ``page_url`` stayed unbound when page_num
    # exceeded page_count, so the trailing return raised NameError.
    # Return None explicitly for out-of-range pages instead.
    if page_num > page_count:
        return None

    page_url = PageEp(document_ep=doc_ep,
                      page_num=page_num,
                      step=Step(1),
                      page_count=page_count)
    extract_img(page_url)
    extract_txt(page_url, lang=lang)

    for step in Steps():
        page_url.step = step
        extract_img(page_url)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang)

    return page_url
def ocr_page_pdf(doc_path, page_num, lang):
    """
    doc_path is an mglib.path.DocumentPath instance.

    Extract image, text and HOCR data for one page of the PDF.
    Returns the PagePath of the processed page, or None when
    ``page_num`` is out of range.
    """
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    # BUGFIX: originally ``page_url`` stayed unbound when page_num
    # exceeded page_count, so the trailing return raised NameError.
    # Return None explicitly for out-of-range pages instead.
    if page_num > page_count:
        return None

    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_img(page_url, media_root=settings.MEDIA_ROOT)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        # tesseract unterhalt-1.jpg page-1 -l deu hocr
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
def post(self, request):
    """
    Handle an AJAX file upload: create the Document, copy the file
    into storage and schedule one OCR task per page.

    Exactly one file per request is accepted; responds with JSON
    containing title, doc_id, action_url and preview_url.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return HttpResponseBadRequest("Missing input file")

    if len(files) > 1:
        logger.warning("More then one files per ajax? how come?")
        # BUGFIX: HttpResponse has no ``status_code`` kwarg -- passing
        # it raised TypeError. The keyword is ``status``.
        return HttpResponse(json.dumps({}),
                            content_type="application/json",
                            status=400)

    f = files[0]
    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    # "-1" means "no parent folder" on the client side.
    parent_id = request.POST.get('parent', "-1")
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    page_count = get_pagecount(f.temporary_file_path())
    logger.info("creating document {}".format(f.name))

    doc = Document.create_document(user=user,
                                   title=f.name,
                                   size=size,
                                   lang=lang,
                                   file_name=f.name,
                                   parent_id=parent_id,
                                   notes=notes,
                                   page_count=page_count)
    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(src=f.temporary_file_path(),
                             dst=doc.path.url())

    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang})

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    action_url = reverse('boss:core_basetreenode_change',
                         args=(doc.id, ))
    preview_url = reverse('core:preview',
                          args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': action_url,
        'preview_url': preview_url
    }
    logger.info("and response is!")

    return HttpResponse(json.dumps(result),
                        content_type="application/json")
def import_file(filepath,
                username=None,
                file_title=None,
                inbox_title="Inbox",
                delete_after_import=False,
                start_ocr_async=True,
                upload=True):
    """
    Gets as input a path to a file on a local file system and:

        1. Creates a document instance (if there is available space).
        2. Copies file to doc_instance.url().
        3. (optionally) uploads the document to S3 storage.
        4. (optionally) starts ocr_async task.

    Is used on customers instance by:

        * import_file command - to import files from SFTP directory
        * import_attachment command - to import attachments from mailbox

    Returns True on success, False otherwise.
    """
    logger.debug(f"Importing file {filepath}")

    if username is None:
        user = get_root_user()
    else:
        user = User.objects.get(username=username)

    if file_title is None:
        file_title = get_file_title(filepath)

    if not is_storage_left(filepath, user=user):
        # BUGFIX: the original f-string was missing braces and logged
        # the literal text "user.username".
        logger.error(f"{user.username} reached his disk quota")
        return False

    lang = Document.get_default_language()

    # get_pagecount() might raise an exception in case
    # file is either wrong (not a PDF) or not yet
    # completed to upload
    try:
        page_count = get_pagecount(filepath)
    except Exception:
        # which means that document is not yet fully
        # uploaded by SFTP client.
        logger.error(f"File {filepath} not yet ready for importing.")
        return False

    inbox, _ = Folder.objects.get_or_create(title=inbox_title,
                                            parent=None,
                                            user=user)
    doc = Document.create_document(user=user,
                                   title=file_title,
                                   size=get_file_size(filepath),
                                   lang=lang,
                                   file_name=file_title,
                                   parent_id=inbox.id,
                                   page_count=page_count)
    logger.debug(f"Uploading file {filepath} to {doc.doc_ep.url()}")

    # Import file is executed as root (import-file.service)
    # (because import-file need to access/delete sftp files, folder
    # as of another system user)
    # Thus, after copying file into (newly created) folders,
    # it need to change permissions (of newly created files and folders)
    # to the app_user/app_group.
    copy2doc_url(src_file_path=filepath,
                 doc_url=doc.doc_ep.url(),
                 user=settings.APP_USER,
                 group=settings.APP_GROUP)

    if upload and settings.S3:
        upload_document_to_s3(doc.doc_ep)

    if start_ocr_async and settings.OCR:
        Document.ocr_async(document=doc,
                           page_count=page_count,
                           lang=lang,
                           s3_enabled=settings.S3)

    if delete_after_import:
        os.remove(filepath)

    return True
def post(self, request):
    """
    Handle an AJAX file upload: validate the storage quota, create
    the Document, copy the file into storage and trigger S3 upload
    and OCR where configured.

    Exactly one file per request is accepted; responds with JSON
    containing title, doc_id, action_url and preview_url.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        # BUGFIX: without an early return execution fell through to
        # ``files[0]`` and raised IndexError on an empty upload.
        return HttpResponse(json.dumps({'error': "Missing input file"}),
                            status=400,
                            content_type="application/json")

    if len(files) > 1:
        logger.warning("More then one files per ajax? how come?")
        # BUGFIX: HttpResponse has no ``status_code`` kwarg -- passing
        # it raised TypeError. The keyword is ``status``.
        return HttpResponse(json.dumps({}),
                            content_type="application/json",
                            status=400)

    f = files[0]
    logger.debug("upload for f=%s user=%s", f, request.user)

    if not is_storage_left(f.temporary_file_path()):
        logger.warning("Storage is full for user=%s.", request.user)
        msg = "Cannot upload file {}. Storage is full.".format(f.name)
        return HttpResponse(json.dumps({'error': msg}),
                            status=400,
                            content_type="application/json")

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    # "-1" means "no parent folder" on the client side.
    parent_id = request.POST.get('parent', "-1")
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    page_count = get_pagecount(f.temporary_file_path())
    logger.info("creating document {}".format(f.name))

    doc = Document.create_document(user=user,
                                   title=f.name,
                                   size=size,
                                   lang=lang,
                                   file_name=f.name,
                                   parent_id=parent_id,
                                   notes=notes,
                                   page_count=page_count)
    logger.debug("uploading to {}".format(doc.doc_ep.url()))

    copy2doc_url(src_file_path=f.temporary_file_path(),
                 doc_url=doc.doc_ep.url())

    if settings.S3:
        upload_document_to_s3(doc.doc_ep)

    if settings.OCR:
        Document.ocr_async(document=doc,
                          page_count=page_count,
                          lang=lang)

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    action_url = reverse('boss:core_basetreenode_change',
                         args=(doc.id, ))
    preview_url = reverse('core:preview',
                          args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': action_url,
        'preview_url': preview_url
    }
    logger.info("and response is!")

    return HttpResponse(json.dumps(result),
                        content_type="application/json")
def paste_pages_into_existing_doc(
    dest_doc_ep,
    src_doc_ep_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Insert pages taken from the documents in ``src_doc_ep_list``
    into the document behind ``dest_doc_ep`` using pdftk.

    Pages are inserted after/before the given page number of the
    destination document. Returns the new document version number.
    """
    page_count = get_pagecount(dest_doc_ep.url())
    keep_before, keep_after = split_ranges(
        total=page_count,
        after=after_page_number,
        before=before_page_number
    )

    # notice missing A
    # Letter A is assigned to current folder and
    # pages from keep_before and keep_after
    letters = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    handle_map = [f"A={dest_doc_ep.url()}"]
    inserted_pages = []

    for idx, src in enumerate(src_doc_ep_list):
        letter = letters[idx]
        handle_map.append(f"{letter}={src['doc_ep'].url()}")
        inserted_pages.extend(
            f"{letter}{page}" for page in src['page_nums']
        )

    dest_doc_ep.inc_version()

    pages_before = [f"A{page}" for page in keep_before]
    pages_after = [f"A{page}" for page in keep_after]

    cmd = ["pdftk"]
    # add A=doc1_path, B=doc2_path
    cmd.extend(handle_map)
    cmd.append("cat")
    # existing doc pages (may be empty)
    cmd.extend(pages_before)
    # newly inserted pages
    cmd.extend(inserted_pages)
    # existing doc pages (may be empty)
    cmd.extend(pages_after)
    cmd.append("output")

    make_sure_path_exists(dest_doc_ep.url())
    cmd.append(dest_doc_ep.url())

    run(cmd)
    return dest_doc_ep.version
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents from a tar backup archive.

    The archive contains a ``backup.json`` manifest describing each
    document; folder structure is recreated from the archived paths.
    Documents that already exist (same title + parent) are skipped.
    """
    restore_file.seek(0)
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        # NOTE: loop variable renamed so it no longer shadows the
        # ``restore_file`` parameter.
        for entry_name in restore_archive.getnames():
            if entry_name == "backup.json":
                continue

            # Locate the manifest entry matching this archive member.
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == entry_name:
                    break

            path_parts = PurePath(entry_name).parts
            parent = None

            # we first have to create a folder structure
            if len(path_parts) > 1:
                for folder_title in path_parts[:-1]:
                    existing_folder = Folder.objects.filter(
                        title=folder_title).filter(parent=parent).first()
                    if existing_folder is None:
                        parent = Folder.objects.create(title=folder_title,
                                                       parent=parent,
                                                       user=user)
                    else:
                        parent = existing_folder

            existing_doc = Document.objects.filter(
                title=path_parts[-1]).filter(parent=parent).first()

            if existing_doc is not None:
                logger.error("Document %s already exists, skipping",
                             entry_name)
                continue

            with NamedTemporaryFile("w+b") as temp_output:
                temp_output.write(
                    restore_archive.extractfile(entry_name).read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)
                page_count = get_pagecount(temp_output.name)

                parent_id = parent.id if parent else None

                new_doc = Document.create_document(
                    user=user,
                    title=path_parts[-1],
                    size=size,
                    lang=document_info['lang'],
                    file_name=path_parts[-1],
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count)
                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

                for page_num in range(1, page_count + 1):
                    if not skip_ocr:
                        ocr_page.apply_async(
                            kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': path_parts[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })