def import_file(
    self,
    file_title=None,
    inbox_title=Folder.INBOX_NAME,
    delete_after_import=True,
    skip_ocr=False,
    apply_async=False
):
    """
    Gets as input a path to a file on a local file system and:

        1. Creates a document instance
        2. Copies file to doc_instance.url()
        3. OCRs the document

    Used with
        ./manage.py local_importer
        ./manage.py imap_importer
    command

    Arguments:
        file_title (str, optional): document title; defaults to the
            basename of ``self.filepath``.
        inbox_title (str, optional): title of the inbox folder the
            document lands in.
        delete_after_import (bool, optional): remove the source file
            after a successful import.
        skip_ocr (bool, optional): skip OCR entirely.
        apply_async (bool, optional): schedule OCR per page via celery
            instead of running it synchronously.

    Returns:
        Document on success, False if the page count could not be read.
    """
    logger.debug(f"Importing file {self.filepath}.")

    if file_title is None:
        file_title = os.path.basename(self.filepath)

    try:
        page_count = get_pagecount(self.filepath)
    except Exception:
        # logger.exception records the traceback as well, which
        # logger.error silently dropped.
        logger.exception(f"Error while getting page count of {self.filepath}.")
        return False

    inbox, _ = Folder.objects.get_or_create(
        title=inbox_title,
        parent=None,
        user=self.user
    )
    doc = Document.create_document(
        user=self.user,
        title=file_title,
        size=os.path.getsize(self.filepath),
        lang=self.user_ocr_language,
        file_name=file_title,
        parent_id=inbox.id,
        page_count=page_count,
        rebuild_tree=False
    )
    logger.debug(
        f"Uploading file {self.filepath} to {doc.path.url()}"
    )
    default_storage.copy_doc(
        src=self.filepath,
        dst=doc.path.url(),
    )

    if not skip_ocr:
        if apply_async:
            for page_num in range(1, page_count + 1):
                ocr_page_task.apply_async(kwargs={
                    'user_id': self.user.id,
                    'document_id': doc.id,
                    'file_name': file_title,
                    'page_num': page_num,
                    'lang': self.user_ocr_language}
                )
        else:
            DocumentImporter.ocr_document(
                document=doc,
                page_count=page_count,
                lang=self.user_ocr_language,
            )

    if delete_after_import:
        # Usually we want to delete files when importing them from a
        # local directory. When importing from an email attachment,
        # deleting files does not apply.
        os.remove(self.filepath)

    logger.debug("Import complete.")
    return doc
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (with their folder structure and tags) from a
    backup tar archive.

    The archive must contain a ``backup.json`` manifest next to the
    document files.  If ``user`` is None, the manifest is expected to
    list users: they are recreated first and the leading path component
    of each archive member is interpreted as the owner's username.

    Arguments:
        restore_file (io.BytesIO): seekable tar archive stream.
        user (User): owner of the restored documents, or None for a
            multi-user backup.
        skip_ocr (bool, optional): do not schedule per-page OCR tasks.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that backup.json
            # contains a list of users. Thus recreate users first.
            for backup_user in backup_info['users']:
                user = User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser'])
                # in case --include-user-password switch was used
                # update user (raw digest of) password field
                password = backup_user.get('password')
                if password:
                    user.password = password
                    user.save()

        # NOTE: the loop variable used to be named ``restore_file``,
        # which shadowed the function's archive-stream argument.
        for member_path in restore_archive.getnames():
            if member_path == "backup.json":
                continue
            logger.debug(f"Restoring file {member_path}...")
            splitted_path = PurePath(member_path).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))
            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # pick the manifest entry matching this archive member
            for info in backup_info_documents:
                document_info = info
                if info['path'] == member_path:
                    break

            parent = None

            # variables used only to shorten debug message
            _sp = splitted_path
            _rf = member_path
            logger.debug(
                f"{_rf}: splitted_path={_sp} len(splitted_path)={len(_sp)}")

            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()
                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                logger.debug(f"Extracting {member_path}...")
                ff = restore_archive.extractfile(member_path)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)
                page_count = get_pagecount(temp_output.name)
                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.objects.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                tag_attributes = document_info.get('tags', [])
                for attrs in tag_attributes:
                    attrs['user'] = _user
                    tag, created = Tag.objects.get_or_create(**attrs)
                    new_doc.tags.add(tag)

                default_storage.copy_doc(
                    src=temp_output.name,
                    dst=new_doc.path.url())

            if not skip_ocr:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(kwargs={
                        'user_id': _user.id,
                        'document_id': new_doc.id,
                        'file_name': new_doc.file_name,
                        'page_num': page_num,
                        'lang': document_info['lang']
                    })
def upload(request):
    """
    To understand returned value, have a look at
    papermerge.core.views.decorators.json_reponse decorator

    Accepts exactly one file per request; returns either a
    (message, status) tuple on error or a dict with title,
    doc_id, action_url and preview_url on success.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400

    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)
        return msg, 400

    f = files[0]
    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    parent_id = request.POST.get('parent', "-1")
    # "-1" means "no parent folder". Compare for equality: the previous
    # substring check ('"-1" in parent_id') wrongly matched ids such as
    # "-12". Treat an empty value as "no parent" as well.
    if not parent_id or parent_id == "-1":
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    try:
        page_count = get_pagecount(f.temporary_file_path())
    except exceptions.FileTypeNotSupported:
        status = 400
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, status

    logger.debug("creating document {}".format(f.name))

    doc = Document.create_document(
        user=user,
        title=f.name,
        size=size,
        lang=lang,
        file_name=f.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count)

    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(
        src=f.temporary_file_path(),
        dst=doc.path.url())

    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang})

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }

    return result
def post(self, request):
    """
    Handle a single-file AJAX upload; responds with JSON describing
    the created document (title, doc_id, action_url, preview_url).
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return HttpResponseBadRequest("Missing input file")

    if len(files) > 1:
        logger.warning("More then one files per ajax? how come?")
        # BUGFIX: HttpResponse takes `status=`, not `status_code=`;
        # the old keyword raised TypeError instead of returning a 400.
        return HttpResponse(
            json.dumps({}),
            content_type="application/json",
            status=400)

    f = files[0]
    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    parent_id = request.POST.get('parent', "-1")
    # "-1" means "no parent folder". Compare for equality: the previous
    # substring check ('"-1" in parent_id') wrongly matched ids such as
    # "-12". Treat an empty value as "no parent" as well.
    if not parent_id or parent_id == "-1":
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    page_count = get_pagecount(f.temporary_file_path())

    logger.info("creating document {}".format(f.name))

    doc = Document.create_document(
        user=user,
        title=f.name,
        size=size,
        lang=lang,
        file_name=f.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count)

    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(
        src=f.temporary_file_path(),
        dst=doc.path.url())

    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang})

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    action_url = reverse('boss:core_basetreenode_change', args=(doc.id, ))
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': action_url,
        'preview_url': preview_url
    }
    logger.info("and response is!")

    return HttpResponse(
        json.dumps(result),
        content_type="application/json")
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (with their folder structure) from a backup tar
    archive containing a ``backup.json`` manifest.

    If ``user`` is None, users listed in the manifest are recreated and
    the leading path component of each archive member is interpreted as
    the owner's username.

    Arguments:
        restore_file (io.BytesIO): seekable tar archive stream.
        user (User): owner of the restored documents, or None for a
            multi-user backup.
        skip_ocr (bool, optional): do not schedule per-page OCR tasks.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that backup.json
            # contains a list of users. Thus recreate users first.
            for backup_user in backup_info['users']:
                User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser'])

        # NOTE: the loop variable used to be named ``restore_file``,
        # which shadowed the function's archive-stream argument.
        for member_path in restore_archive.getnames():
            if member_path == "backup.json":
                continue
            splitted_path = PurePath(member_path).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))
            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # pick the manifest entry matching this archive member
            for info in backup_info_documents:
                document_info = info
                if info['path'] == member_path:
                    break

            parent = None
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()
                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                ff = restore_archive.extractfile(member_path)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)
                page_count = get_pagecount(temp_output.name)
                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                default_storage.copy_doc(
                    src=temp_output.name,
                    dst=new_doc.path.url())

            if not skip_ocr:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(kwargs={
                        'user_id': _user.id,
                        'document_id': new_doc.id,
                        'file_name': new_doc.file_name,
                        'page_num': page_num,
                        'lang': document_info['lang']
                    })
def apply(self, user=None, parent=None, lang=None, notes=None, name=None,
          skip_ocr=False, apply_async=False, create_document=True, **kwargs):
    """
    Apply the pipeline. The document is created or modified here.
    This method is not supposed to throw errors.

    Arguments:
    - user (User, optional): document owner.
    - parent (Folder, optional): folder containing the document.
    - lang (str, optional): OCR language.
    - notes (str, optional): document notes.
    - name (str, optional): document name.
    - skip_ocr (bool, optional): whether to skip OCR processing.
      Defaults to False.
    - apply_async (bool, optional): whether to apply OCR asynchronously.
      Defaults to False.
    - create_document (bool, optional): whether to create or update a
      document. Defaults to True.

    Returns:
        Document: the created or updated document
    """
    if parent is None:
        # user/lang/inbox are only resolved on this branch; when a
        # parent IS given, the caller-supplied user and lang are used
        # as-is further below.
        user, lang, inbox = self.get_user_properties(user)
        # in case of upload via WEB interface, documents
        # must land in root directory (as opposite to inbox)
        if self.processor != WEB:
            parent = inbox.id
    if name is None:
        name = basename(self.path)
    page_count = self.page_count()
    size = getsize(self.path)
    if create_document and self.doc is None:
        try:
            doc = Document.objects.create_document(
                user=user,
                title=name,
                size=size,
                lang=lang,
                file_name=name,
                parent_id=parent,
                page_count=page_count,
                notes=notes)
            self.doc = doc
        except ValidationError as error:
            logger.error(f"{self.processor} importer: validation failed")
            raise error
    elif self.doc is not None:
        # Update path: bump the version and refresh metadata on the
        # existing document instead of creating a new one.
        doc = self.doc
        doc.version = doc.version + 1
        doc.page_count = page_count
        doc.file_name = name
        doc.size = size
        doc.save()
        try:
            doc.recreate_pages()
        except ValueError:
            doc.create_pages()
    # NOTE(review): if create_document is False and self.doc is None,
    # neither branch above runs and ``doc`` is unbound here -- the next
    # line would raise NameError. Confirm callers never hit that case.
    doc.full_clean()
    self.move_tempfile(doc)
    self.payload.close()
    if not skip_ocr:
        # NOTE(review): ``doc.path()`` is called here while sibling code
        # uses ``doc.path.url()`` (attribute, not call) -- verify which
        # form the Document model actually exposes.
        namespace = default_storage.upload(doc_path_url=doc.path().url())
        if apply_async:
            for page_num in range(1, page_count + 1):
                ocr_page.apply_async(kwargs={
                    'user_id': user.id,
                    'document_id': doc.id,
                    'file_name': name,
                    'page_num': page_num,
                    'lang': lang,
                    'namespace': namespace
                })
        else:
            self.ocr_document(
                document=doc,
                page_count=page_count,
                lang=lang,
            )
    logger.debug(f"{self.processor} importer: import complete.")
    return doc
def apply(self, user=None, parent=None, lang=None, notes=None, name=None,
          skip_ocr=False, apply_async=False, delete_after_import=False,
          create_document=True, *args, **kwargs):
    """
    Apply the importer pipeline: create (or update) a document from the
    temporary file, optionally run OCR, optionally delete the source.

    Returns:
        None if the file's mimetype is not supported, otherwise a dict
        ``{'doc': doc}`` with the created/updated document.
    """
    if not self.check_mimetype():
        logger.debug(f"{self.processor} importer: invalid filetype")
        return None
    if self.processor != WEB:
        # Non-web importers resolve owner, language and inbox themselves;
        # documents land in the user's inbox folder.
        user, lang, inbox = self.get_user_properties(user)
        parent = inbox.id
    if name is None:
        name = basename(self.tempfile.name)
    page_count = self.page_count()
    size = getsize(self.temppath)
    if create_document:
        try:
            doc = Document.objects.create_document(
                user=user,
                title=name,
                size=size,
                lang=lang,
                file_name=name,
                parent_id=parent,
                page_count=page_count,
                notes=notes)
            self.doc = doc
        except ValidationError as e:
            logger.error("{} importer: validation failed".format(
                self.processor))
            raise e
    elif self.doc is not None:
        # Update path: bump the version and refresh metadata on the
        # existing document instead of creating a new one.
        doc = self.doc
        doc.version = doc.version + 1
        doc.page_count = page_count
        doc.file_name = name
        doc.save()
        try:
            doc.recreate_pages()
        except ValueError:
            doc.create_pages()
        except Exception:
            # best-effort: a failed page rebuild is logged, not raised
            logger.error(
                f"{self.processor} importer: could not create pages")
    # NOTE(review): if create_document is False and self.doc is None,
    # neither branch above binds ``doc`` and the next line raises
    # NameError. Confirm callers never hit that combination.
    self.move_tempfile(doc)
    self.tempfile.close()
    if not skip_ocr:
        if apply_async:
            for page_num in range(1, page_count + 1):
                ocr_page.apply_async(kwargs={
                    'user_id': user.id,
                    'document_id': doc.id,
                    'file_name': name,
                    'page_num': page_num,
                    'lang': lang})
        else:
            self.ocr_document(
                document=doc,
                page_count=page_count,
                lang=lang,
            )
    if delete_after_import:
        os.remove(self.temppath)
    logger.debug("{} importer: import complete.".format(self.processor))
    return {'doc': doc}
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents (with their folder structure) from a backup tar
    archive for a single named user.

    Arguments:
        restore_file (io.BytesIO): seekable tar archive stream with a
            ``backup.json`` manifest next to the document files.
        username (str): owner of the restored documents.
        skip_ocr (bool, optional): do not schedule per-page OCR tasks.
    """
    restore_file.seek(0)
    # NOTE(review): filter().first() yields None for an unknown
    # username, which would crash later at ``user.id`` -- confirm
    # callers validate the username.
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        # NOTE: loop variable renamed so it no longer shadows the
        # function's archive-stream argument.
        for member_path in restore_archive.getnames():
            if member_path == "backup.json":
                continue
            # pick the manifest entry matching this archive member
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == member_path:
                    break

            splitted_path = PurePath(member_path).parts
            parent = None
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:
                    # NOTE(review): unlike sibling restore functions,
                    # this lookup does not filter by user -- it may
                    # match another user's folder. Confirm intended.
                    folder_object = Folder.objects.filter(
                        title=folder).filter(parent=parent).first()
                    if folder_object is None:
                        new_folder = Folder.objects.create(
                            title=folder,
                            parent=parent,
                            user=user)
                        parent = new_folder
                    else:
                        parent = folder_object

            document_object = Document.objects.filter(
                title=splitted_path[-1]).filter(parent=parent).first()

            if document_object is not None:
                logger.error(
                    "Document %s already exists, skipping", member_path)
            else:
                with NamedTemporaryFile("w+b") as temp_output:
                    temp_output.write(
                        restore_archive.extractfile(member_path).read())
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)
                    page_count = get_pagecount(temp_output.name)
                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None

                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count)

                    default_storage.copy_doc(
                        src=temp_output.name,
                        dst=new_doc.path.url())

                # skip_ocr is loop-invariant: test it once instead of
                # once per page (behavior unchanged).
                if not skip_ocr:
                    for page_num in range(1, page_count + 1):
                        ocr_page.apply_async(kwargs={
                            'user_id': user.id,
                            'document_id': new_doc.id,
                            'file_name': splitted_path[-1],
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })