def ocr_page_pdf(doc_path, page_num, lang):
    """
    OCR a single page of a PDF document.

    ``doc_path`` is an mglib.path.DocumentPath instance.

    Returns the ``PagePath`` of the processed page, or ``None`` when
    ``page_num`` is out of range. (Previously the trailing
    ``return page_url`` raised UnboundLocalError in that case, because
    ``page_url`` was only bound inside the ``if`` blocks.)
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    # Guard: nothing to OCR for a non-existing page.
    if page_num > page_count:
        return None

    # first quickly generate preview images
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # extract plain text, then HOCR for every non-thumbnail step
    page_url = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        if not step.is_thumbnail:
            extract_hocr(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    return page_url
def page_paths(self):
    """
    Return the document's page paths indexed by page number.

    Index 0 holds a ``None`` placeholder so that ``doc.page_paths[1]``
    is the first page::

        page_path = doc.page_paths[2]
        page_path.url()  # local url to second page of the doc

    Shortcut for the most common case, Step(1). As a side effect a
    stale ``self.page_count`` is corrected and saved.
    """
    # doc.page_count might be wrong because per-page logic was added
    # just recently — take the opportunity to correct it here.
    real_count = get_pagecount(self.absfilepath)
    if real_count != self.page_count:
        self.page_count = real_count
        self.save()

    paths = [
        PagePath(document_path=self.path,
                 page_num=number,
                 step=step.Step(1),
                 page_count=self.page_count)
        for number in range(1, real_count + 1)
    ]
    # indexing starts from 1
    return [None] + paths
def reorder_pages(src, dst, new_order):
    """
    Reorder the pages of PDF ``src`` into ``dst`` via pdftk.

    ``new_order`` is a list of the following format::

        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]

    The example above means that in a 4-page document the first page
    was swapped with the second one.

    page_num   = older page order
    page_order = current page order

    In human language each hash reads:
    <page_num> now should be <page_order>.
    """
    total = get_pagecount(src)
    ranges = cat_ranges_for_reorder(page_count=total,
                                    new_order=new_order)

    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd += [str(rng) for rng in ranges]
    cmd += ["output", dst]
    run(cmd)
def import_file(self,
                file_title=None,
                inbox_title="Inbox",
                delete_after_import=True,
                skip_ocr=False):
    """
    Import a file from the local file system.

    1. creates a document instance
    2. Copies file to doc_instance.url()
    4. OCR the doc

    Used with ./manage.py local_importer and ./manage.py imap_importer
    commands.

    Returns the new Document on success, ``False`` when the page
    count of the input file cannot be determined.
    """
    logger.debug(f"Importing file {self.filepath}")

    if file_title is None:
        file_title = os.path.basename(self.filepath)

    try:
        n_pages = get_pagecount(self.filepath)
    except Exception:
        logger.error(f"Error while getting page count of {self.filepath}.")
        return False

    inbox_folder, _ = Folder.objects.get_or_create(title=inbox_title,
                                                   parent=None,
                                                   user=self.user)
    document = Document.create_document(
        user=self.user,
        title=file_title,
        size=os.path.getsize(self.filepath),
        lang=self.user_ocr_language,
        file_name=file_title,
        parent_id=inbox_folder.id,
        page_count=n_pages
    )

    logger.debug(f"Uploading file {self.filepath} to {document.path.url()}")
    default_storage.copy_doc(
        src=self.filepath,
        dst=document.path.url(),
    )

    if not skip_ocr:
        DocumentImporter.ocr_document(
            document=document,
            page_count=n_pages,
            lang=self.user_ocr_language,
        )

    if delete_after_import:
        # Usually we want to delete files when importing
        # them from local directory
        # When importing from Email attachment - deleting
        # files does not apply
        os.remove(self.filepath)

    logger.debug("Import complete.")
    return document
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    OCR a single PDF page and fire progress notifications.

    ``doc_path`` is an mglib.path.DocumentPath instance.

    On success returns ``mglib.path.PagePath`` instance; returns
    ``None`` when ``page_num`` exceeds the page count (previously the
    trailing ``return page_path`` raised UnboundLocalError then).
    """
    logger.debug("OCR PDF document")

    # callers may override the file name used in notifications
    file_name = kwargs.pop('file_name', None)
    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    # Guard: out-of-range page — nothing to OCR.
    if page_num > page_count:
        return None

    # first quickly generate preview images
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # Consistency fix: use the resolved ``file_name`` here as well —
    # previously this call bypassed the kwargs override by passing
    # doc_path.file_name, unlike notify_txt_ready/notify_hocr_ready.
    notify_pre_page_ocr(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        step=Step(1),
        page_count=page_count
    )
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(
        page_path,
        page_num=page_num,
        lang=lang,
        file_name=file_name,
        **kwargs
    )

    for step in Steps():
        page_path.step = step
        if not step.is_thumbnail:
            extract_hocr(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs
            )

    return page_path
def recreate_pages(self):
    """
    Drop all page models and rebuild them from the stored file.
    """
    self.pages.all().delete()

    abs_path = default_storage.abspath(self.path.url())
    self.page_count = get_pagecount(abs_path)
    self.save()

    self.create_pages()
def delete_pages(src, dst, page_numbers):
    """
    Remove ``page_numbers`` (1-based) from PDF ``src``, writing the
    result to ``dst`` via the external ``stapler`` binary.

    Raises ValueError when a page number is out of range. Previously
    ``page_count`` was computed and silently discarded, so invalid
    input was only discovered by the external tool.
    """
    page_count = get_pagecount(src)

    out_of_range = [p for p in page_numbers if p < 1 or p > page_count]
    if out_of_range:
        raise ValueError(
            f"Page number(s) {out_of_range} out of range 1..{page_count}"
        )

    cmd = [settings.BINARY_STAPLER, "del", src]
    cmd.extend(str(page) for page in page_numbers)
    cmd.append(dst)
    run(cmd)
def handle(self, *args, **options):
    """
    Management command handler: log the page count of the file given
    via the ``file_path`` option.
    """
    # Default to None, not False: the original default made
    # os.path.exists(False) raise TypeError when the option was absent.
    file_path = options.get('file_path')

    if not file_path or not os.path.exists(file_path):
        # message typo fixed: "exit" -> "exist"
        logger.debug(f"Path {file_path} does not exist. Quit.")
        return

    page_count = get_pagecount(file_path)
    logger.debug(f"Page count={page_count}")
def paste_pages_into_existing_doc(src, dst, data_list,
                                  after_page_number=False,
                                  before_page_number=False):
    """
    Paste pages from other documents into ``src``, writing the merged
    PDF to ``dst`` via a single pdftk ``cat`` invocation.

    ``data_list`` items are dicts with keys ``src`` (source pdf path)
    and ``page_nums`` (pages to take from it). ``after_page_number`` /
    ``before_page_number`` select the insertion point inside ``src``.
    """
    page_count = get_pagecount(src)
    list1, list2 = split_ranges(total=page_count,
                                after=after_page_number,
                                before=before_page_number)

    # notice missing A:
    # Letter A is assigned to the current document; its pages from
    # list1 and list2 are addressed through it. B..Z address the
    # documents being pasted in.
    letters = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    doc_handles = [f"A={src}"]
    inserted_pages = []
    for idx, item in enumerate(data_list):
        letter = letters[idx]
        doc_handles.append(f"{letter}={item['src']}")
        inserted_pages.extend(f"{letter}{p}" for p in item['page_nums'])

    pages_before = [f"A{p}" for p in list1]
    pages_after = [f"A{p}" for p in list2]

    cmd = [settings.BINARY_PDFTK]
    # add A=doc1_path, B=doc2_path
    cmd.extend(doc_handles)
    cmd.append("cat")
    # existing doc pages (may be empty)
    cmd.extend(pages_before)
    # newly inserted pages
    cmd.extend(inserted_pages)
    # existing doc pages (may be empty)
    cmd.extend(pages_after)
    cmd.append("output")
    cmd.append(dst)
    run(cmd)
def delete_pages(src, dst, page_numbers):
    """
    Write ``dst`` as a copy of PDF ``src`` without ``page_numbers``,
    using pdftk's ``cat`` with the complementary page ranges.
    """
    total = get_pagecount(src)
    keep_ranges = cat_ranges_for_delete(total, page_numbers)

    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd += [str(rng) for rng in keep_ranges]
    cmd += ["output", dst]
    run(cmd)
def hocr(request, id, step=None, page="1"):
    """
    Return HOCR words + metadata for one page of a document as JSON.

    404 when the document, the page, or the HOCR data is missing;
    403 when the user lacks read permission.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")
    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_path = doc.path

    if request.user.has_perm(Access.PERM_READ, doc):
        # document absolute path
        doc_abs_path = default_storage.abspath(doc_path.url())
        if not os.path.exists(doc_abs_path):
            raise Http404("HOCR data not yet ready.")

        page_count = get_pagecount(doc_abs_path)

        # BUG FIX: ``page`` arrives as a string from the URL (note the
        # default "1"); comparing str with int raises TypeError in
        # Python 3, and page_paths needs an int index.
        page = int(page)

        # pages are 1-indexed (page_paths[0] is a None placeholder),
        # so reject page < 1, not page < 0.
        if page > page_count or page < 1:
            raise Http404("Page does not exists")

        page_path = doc.page_paths[page]
        hocr_abs_path = default_storage.abspath(page_path.hocr_url())

        logger.debug(f"Extract words from {hocr_abs_path}")

        if not os.path.exists(hocr_abs_path):
            raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(hocr_file_path=hocr_abs_path)

        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
def upload(request):
    """
    AJAX single-file upload view.

    Creates a Document for the uploaded file, copies the file into
    storage and queues one OCR task per page. Returns either a result
    dict or a ``(message, status)`` tuple — to understand the returned
    value, have a look at the
    papermerge.core.views.decorators.json_reponse decorator
    (presumably it converts these returns to JSON responses — confirm
    against that decorator).
    """
    files = request.FILES.getlist('file')
    # exactly one file per request is supported
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400
    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)
        return msg, 400

    f = files[0]
    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    # "-1" is the client's sentinel for "no parent folder"
    parent_id = request.POST.get('parent', "-1")
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')

    try:
        page_count = get_pagecount(f.temporary_file_path())
    except exceptions.FileTypeNotSupported:
        status = 400
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, status

    logger.debug("creating document {}".format(f.name))
    doc = Document.create_document(user=user,
                                   title=f.name,
                                   size=size,
                                   lang=lang,
                                   file_name=f.name,
                                   parent_id=parent_id,
                                   notes=notes,
                                   page_count=page_count)

    logger.debug("uploading to {}".format(doc.path.url()))
    default_storage.copy_doc(src=f.temporary_file_path(),
                             dst=doc.path.url())

    # queue one async OCR task per page
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang})

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }
    return result
def test_basic_tiff(self):
    # A .tiff input makes get_pagecount dispatch internally to its
    # get_tiff_pagecount method.
    result = get_pagecount(get_filepath("text.tiff"))
    self.assertEqual(result, 2)
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (and optionally users) from a tar backup archive.

    ``restore_file`` is an in-memory tarball containing a
    ``backup.json`` manifest plus the document files. When ``user`` is
    None the manifest is expected to list users and the archive paths
    to carry a leading username component; users are recreated first
    and each file is attributed to the user named in its path.
    Folder structure is recreated from the path components, each file
    becomes a Document, and one OCR task per page is queued unless
    ``skip_ocr`` is set.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                user = User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser']
                )
                # in case --include-user-password switch was used
                # update user (raw digest of) password field
                password = backup_user.get('password')
                if password:
                    user.password = password
                    user.save()

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue

            logger.debug(f"Restoring file {restore_file}...")

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1])
            )

            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            # old backups carry a flat 'documents' list; newer ones
            # group documents per user.
            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # find the manifest entry matching this archive member
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None

            # variables used only to shorten debug message
            _sp = splitted_path
            _rf = restore_file
            logger.debug(
                f"{_rf}: splitted_path={_sp} len(splitted_path)={len(_sp)}"
            )

            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user
                    ).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=_user
                        )
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                logger.debug(f"Extracting {restore_file}...")

                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.objects.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                tag_attributes = document_info.get('tags', [])
                for attrs in tag_attributes:
                    attrs['user'] = _user
                    tag, created = Tag.objects.get_or_create(**attrs)
                    new_doc.tags.add(tag)

                default_storage.copy_doc(
                    src=temp_output.name,
                    dst=new_doc.path.url()
                )

                if not skip_ocr:
                    # queue one async OCR task per page
                    for page_num in range(1, page_count + 1):
                        ocr_page.apply_async(kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })
def post(self, request):
    """
    Handle an AJAX file upload: create a Document, copy the file into
    storage and queue one OCR task per page. Returns a JSON payload
    with title/doc_id/action_url/preview_url used to build the
    thumbnail on the client.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return HttpResponseBadRequest("Missing input file")

    if len(files) > 1:
        logger.warning("More then one files per ajax? how come?")
        # BUG FIX: HttpResponse takes ``status``, not ``status_code``;
        # the original kwarg raised TypeError at runtime.
        return HttpResponse(
            json.dumps({}),
            content_type="application/json",
            status=400
        )

    f = files[0]
    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    # "-1" is the client's sentinel for "no parent folder"
    parent_id = request.POST.get('parent', "-1")
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    page_count = get_pagecount(f.temporary_file_path())

    logger.info("creating document {}".format(f.name))
    doc = Document.create_document(user=user,
                                   title=f.name,
                                   size=size,
                                   lang=lang,
                                   file_name=f.name,
                                   parent_id=parent_id,
                                   notes=notes,
                                   page_count=page_count)

    logger.debug("uploading to {}".format(doc.path.url()))
    default_storage.copy_doc(src=f.temporary_file_path(),
                             dst=doc.path.url())

    # queue one async OCR task per page
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang
        })

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }
    logger.info("and response is!")
    return HttpResponse(
        json.dumps(result),
        content_type="application/json"
    )
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents for one user from a tar backup archive.

    ``restore_file`` is an in-memory tarball containing a
    ``backup.json`` manifest plus document files. Folder structure is
    recreated from the archive path components; files whose Document
    title already exists under the same parent are skipped. One OCR
    task per page is queued unless ``skip_ocr`` is set.
    """
    restore_file.seek(0)
    # NOTE(review): .first() may return None when the username is
    # unknown — subsequent user.id would then fail; confirm callers
    # always pass an existing username.
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue

            # find the manifest entry matching this archive member
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == restore_file:
                    break

            splitted_path = PurePath(restore_file).parts
            parent = None

            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(title=folder).filter(
                        parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=user)
                        parent = new_folder
                    else:
                        parent = folder_object

            # skip files whose document already exists at this location
            document_object = Document.objects.filter(
                title=splitted_path[-1]).filter(parent=parent).first()

            if document_object is not None:
                logger.error("Document %s already exists, skipping",
                             restore_file)
            else:
                with NamedTemporaryFile("w+b") as temp_output:
                    temp_output.write(
                        restore_archive.extractfile(restore_file).read())
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)
                    page_count = get_pagecount(temp_output.name)

                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None

                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count)

                    default_storage.copy_doc(src=temp_output.name,
                                             dst=new_doc.path.url())

                    for page_num in range(1, page_count + 1):
                        if not skip_ocr:
                            ocr_page.apply_async(kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': splitted_path[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })
def apply_automates(document_id, page_num):
    """
    Run the document owner's Automates against one page's OCR text.

    Loads the page text of the original document version, applies every
    matching Automate (with its optional plugin) and emits an
    ``automates_matching`` signal summarizing the result.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(
        document.path,
        version=0
    )

    page_count = get_pagecount(
        default_storage.abspath(doc_path.url())
    )
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )
    user = document.user

    # read the page's extracted OCR text
    text_path = default_storage.abspath(page_path.txt_url())
    text = ""
    with open(text_path, "r") as f:
        text = f.read()

    automates = Automate.objects.filter(user=user)
    # are there automates for the user?
    if automates.count() == 0:
        logger.debug(
            f"No automates for user {user}. Quit."
        )
        return

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if automate.is_a_match(text):
            logger.debug(f"Automate {automate} matched document={document}")
            plugin_klass = get_plugin_by_module_name(
                automate.plugin_name
            )
            plugin = plugin_klass() if plugin_klass else None
            automate.apply(
                document=document,
                page_num=page_num,
                hocr=text,
                # Notice () - plugin passed is instance of the class
                plugin=plugin
            )
            matched.append(automate)
        else:
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )

    # build the user-facing summary message for the signal
    message = ""
    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % {
        'count': len(matched),
        'total': automates.count()
    }
    if len(matched) > 0:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )
def page_count(self):
    """Number of pages in the underlying document file."""
    count = get_pagecount(self.path)
    return count
def test_basic_png(self):
    # a PNG image counts as a single-page document
    self.assertEqual(get_pagecount(get_filepath("berlin.png")), 1)
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (and optionally users) from a tar backup archive.

    ``restore_file`` is an in-memory tarball containing a
    ``backup.json`` manifest plus the document files. When ``user`` is
    None the manifest is expected to list users and archive paths to
    carry a leading username component; users are recreated first and
    each file is attributed to the user named in its path. Folder
    structure is recreated from path components, each file becomes a
    Document, and one OCR task per page is queued unless ``skip_ocr``
    is set.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser']
                )

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1])
            )

            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            # old backups carry a flat 'documents' list; newer ones
            # group documents per user.
            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # find the manifest entry matching this archive member
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None

            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user
                    ).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=_user
                        )
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                default_storage.copy_doc(
                    src=temp_output.name,
                    dst=new_doc.path.url()
                )

                if not skip_ocr:
                    # queue one async OCR task per page
                    for page_num in range(1, page_count + 1):
                        ocr_page.apply_async(kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })