Example #1
    def test_download_hocr_which_does_not_exists(self):
        """
        HOCR might not be available yet. This is a normal case
        (the page's OCR task is still queued or in progress).

        A missing HOCR file => an HTTP 404 return code is expected.
        """
        doc = Document.create_document(
            title="berlin.pdf",
            user=self.testcase_user,
            lang="ENG",
            file_name="berlin.pdf",
            size=1222,
            page_count=3
        )
        # Doc is available (for get_pagecount on server side).
        default_storage.copy_doc(
            src=os.path.join(
                BASE_DIR, "data", "berlin.pdf"
            ),
            dst=doc.path.url()
        )
        # But HOCR file is missing.
        ret = self.client.get(
            reverse('core:hocr', args=(doc.id, 1, 1))
        )
        self.assertEqual(
            ret.status_code,
            404
        )
Example #2
 def test_preview(self):
     doc = Document.create_document(
         title="berlin.pdf",
         user=self.testcase_user,
         lang="ENG",
         file_name="berlin.pdf",
         size=1222,
         page_count=3
     )
     default_storage.copy_doc(
         src=os.path.join(
             BASE_DIR, "data", "berlin.pdf"
         ),
         dst=doc.path.url(),
     )
     ret = self.client.post(
         reverse('core:preview', args=(doc.id, 1, 1))
     )
     self.assertEqual(
         ret.status_code,
         200
     )
     page_path = PagePath(
         document_path=doc.path,
         page_num=1,
         step=Step(1),
         page_count=3
     )
     self.assertTrue(
         os.path.exists(
             default_storage.abspath(page_path.img_url())
         )
     )
Example #3
    def import_file(self,
                    file_title=None,
                    inbox_title="Inbox",
                    delete_after_import=True,
                    skip_ocr=False):
        """
        Takes a path to a file on the local file system and:
            1. creates a document instance
            2. copies the file to doc_instance.url()
            3. runs OCR on the document

        Used by the
            ./manage.py local_importer
            ./manage.py imap_importer
        commands.
        """
        logger.debug(f"Importing file {self.filepath}")

        if file_title is None:
            file_title = os.path.basename(self.filepath)

        try:
            page_count = get_pagecount(self.filepath)
        except Exception:
            logger.error(f"Error while getting page count of {self.filepath}.")
            return False

        inbox, _ = Folder.objects.get_or_create(title=inbox_title,
                                                parent=None,
                                                user=self.user)
        doc = Document.create_document(user=self.user,
                                       title=file_title,
                                       size=os.path.getsize(self.filepath),
                                       lang=self.user_ocr_language,
                                       file_name=file_title,
                                       parent_id=inbox.id,
                                       page_count=page_count)
        logger.debug(f"Uploading file {self.filepath} to {doc.path.url()}")
        default_storage.copy_doc(
            src=self.filepath,
            dst=doc.path.url(),
        )
        if not skip_ocr:
            DocumentImporter.ocr_document(
                document=doc,
                page_count=page_count,
                lang=self.user_ocr_language,
            )

        if delete_after_import:
            # We usually want to delete files when importing them
            # from a local directory. When importing from an email
            # attachment, deleting the file does not apply.
            os.remove(self.filepath)

        logger.debug("Import complete.")

        return doc
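For context, a rough sketch of how a caller such as the local_importer command might drive import_file over a directory. The DocumentImporter constructor arguments here are assumptions made for illustration; only the import_file signature above comes from the example.

# Hypothetical caller -- constructor arguments are assumptions; only
# import_file(...) itself appears in the example above.
import logging
import os

logger = logging.getLogger(__name__)


def import_directory(importer_cls, directory, **importer_kwargs):
    """Import every regular file found directly inside `directory`."""
    for entry in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, entry)
        if not os.path.isfile(filepath):
            continue
        importer = importer_cls(filepath, **importer_kwargs)
        # delete_after_import=True mirrors the local_importer use case
        # described in the docstring: imported files are removed afterwards.
        doc = importer.import_file(delete_after_import=True)
        if doc is False:
            # import_file returns False when the page count cannot be read
            logger.warning("Skipped %s", filepath)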
Example #4
 def test_download(self):
     doc = Document.objects.create_document(title="berlin.pdf",
                                            user=self.testcase_user,
                                            lang="ENG",
                                            file_name="berlin.pdf",
                                            size=1222,
                                            page_count=3)
     default_storage.copy_doc(src=os.path.join(BASE_DIR, "data",
                                               "berlin.pdf"),
                              dst=doc.path.url())
     ret = self.client.post(reverse('core:node_download', args=(doc.id, )))
     self.assertEqual(ret.status_code, 200)
Example #5
    def test_backup_single_document(self):
        document_path = os.path.join(
            BASE_DIR, "data", "berlin.pdf"
        )
        doc = Document.create_document(
            user=self.testcase_user,
            title='berlin.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            file_name='berlin.pdf',
            parent_id=None,
            page_count=3
        )

        default_storage.copy_doc(
            src=document_path,
            dst=doc.path.url(),
        )

        with io.BytesIO() as memoryfile:
            backup_documents(memoryfile, self.testcase_user)
            memoryfile.seek(0)

            self.assertTrue(
                _can_restore(memoryfile),
                'generated backup.tar is not valid'
            )
            memoryfile.seek(0)

            backup_file = tarfile.open(fileobj=memoryfile, mode='r')
            backup_json = backup_file.extractfile('backup.json')
            backup_info = json.loads(backup_json.read())

            self.assertIsNotNone(
                backup_info.get('documents'),
                'backup.json did not have a key "documents"'
            )
            self.assertIs(
                len(backup_info.get('documents')), 1,
                'backup.json key documents had more or less than one entry'
            )
            self.assertIs(
                len(backup_file.getnames()),
                2,
                'backup.tar had more or less than 2 entries'
            )
            self.assertTrue(
                'berlin.pdf' in backup_file.getnames(),
                'berlin.pdf was not in the backup.tar'
            )
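Since backup_documents only needs a writable binary file object, the same call can target a file on disk instead of an in-memory buffer. A minimal sketch, with an illustrative output path:

# Minimal sketch: write the backup archive straight to disk.
# Assumes backup_documents and a User instance `user` are in scope,
# exactly as in the test above.
with open("/tmp/backup.tar", "wb") as outfile:
    backup_documents(outfile, user)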
Example #6
    def test_download_hocr(self):
        doc = Document.create_document(
            title="berlin.pdf",
            user=self.testcase_user,
            lang="ENG",
            file_name="berlin.pdf",
            size=1222,
            page_count=3
        )

        default_storage.copy_doc(
            src=os.path.join(
                BASE_DIR, "data", "berlin.pdf"
            ),
            dst=default_storage.abspath(doc.path.url())
        )
        # build page url
        page_path = doc.page_paths[1]

        # Just remember that at the end of the test the
        # copied file must be deleted. (1)
        default_storage.copy_doc(
            src=os.path.join(
                BASE_DIR, "data", "page-1.hocr"
            ),
            dst=default_storage.abspath(page_path.hocr_url())
        )
        ret = self.client.get(
            reverse('core:hocr', args=(doc.id, 1, 1))
        )
        self.assertEqual(
            ret.status_code,
            200
        )
        # Deleting file created at (1)
        os.remove(
            default_storage.abspath(page_path.hocr_url())
        )
Example #7
def upload(request):
    """
    To understand the returned value, have a look at the
    papermerge.core.views.decorators.json_reponse decorator
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400

    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)

        return msg, 400

    f = files[0]

    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())
    parent_id = request.POST.get('parent', "-1")
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')
    try:
        page_count = get_pagecount(f.temporary_file_path())
    except exceptions.FileTypeNotSupported:
        status = 400
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, status

    logger.debug("creating document {}".format(f.name))

    doc = Document.create_document(user=user,
                                   title=f.name,
                                   size=size,
                                   lang=lang,
                                   file_name=f.name,
                                   parent_id=parent_id,
                                   notes=notes,
                                   page_count=page_count)
    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(src=f.temporary_file_path(), dst=doc.path.url())
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(
            kwargs={
                'user_id': user.id,
                'document_id': doc.id,
                'file_name': f.name,
                'page_num': page_num,
                'lang': lang
            })

    # Upload only one file at a time.
    # After each upload, return a JSON object with the
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url  -> needed for renaming/deleting the selected item
    #
    # With that info a new thumbnail will be created.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    result = {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }

    return result
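The docstring above defers to the json_reponse decorator for how these return values become HTTP responses. A rough sketch of what such a decorator could look like, using only standard Django; this illustrates the pattern and is not the library's actual implementation:

# Hypothetical sketch -- NOT the actual
# papermerge.core.views.decorators.json_reponse implementation.
import functools
import json

from django.http import HttpResponse


def json_response(view):
    @functools.wraps(view)
    def wrapper(request, *args, **kwargs):
        result = view(request, *args, **kwargs)
        status = 200
        if isinstance(result, tuple):
            # views may return ("error message", status_code)
            result, status = result
        return HttpResponse(
            json.dumps(result),
            content_type="application/json",
            status=status
        )

    return wrapper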
Example #8
    def test_user_download_document(self):
        """
        If a user has read access to a document
        (even if he/she is not its owner), then
        he/she must be able to download it.

        Scenario:
            The admin (root) user creates a document and assigns
            read-only access to margaret
            (thus, root is the owner of the document).

        Expected:

            Margaret and the root user must be able to download the document.
            Elizabet, on the other hand, must not have access to the document
            (she was not assigned any permissions for it).
        """
        document_path = os.path.join(
            BASE_DIR, "data", "berlin.pdf"
        )

        doc = Document.create_document(
            user=self.root_user,
            title='berlin.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            file_name='berlin.pdf',
            page_count=3
        )
        # Copy the document from its place in test/data
        # to the media storage, as if it had been uploaded.
        default_storage.copy_doc(
            src=document_path,
            dst=doc.path.url(),
        )
        create_access(
            node=doc,
            name=self.margaret_user.username,
            model_type=Access.MODEL_USER,
            access_type=Access.ALLOW,
            access_inherited=False,
            permissions={
                READ: True
            }  # allow read access to margaret
        )
        self.client.login(
            testcase_user=self.margaret_user
        )

        url = reverse(
            'core:document_download', args=(doc.id,)
        )

        ret = self.client.get(url)

        self.assertEqual(
            ret.status_code,
            200
        )

        # also, root/admin must be able to download it
        self.client.logout()
        self.client.login(
            testcase_user=self.root_user
        )

        ret = self.client.get(url)

        self.assertEqual(
            ret.status_code,
            200
        )

        self.client.logout()

        # for elizabet on the other hand, access is forbidden.
        self.client.login(testcase_user=self.elizabet_user)
        ret = self.client.get(url)

        self.assertEqual(
            ret.status_code,
            403
        )
Example #9
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):

    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # No user was specified. It is assumed that
            # backup.json contains a list of users,
            # so recreate the users first.
            for backup_user in backup_info['users']:
                user = User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser'])
                # In case the --include-user-password switch was used,
                # update the user's (raw digest of the) password field.
                password = backup_user.get('password')
                if password:
                    user.password = password
                    user.save()

        for restore_file in restore_archive.getnames():

            if restore_file == "backup.json":
                continue

            logger.debug(f"Restoring file {restore_file}...")

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))

            # If there is a leading username component, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None

            # variables used only to shorten debug message
            _sp = splitted_path
            _rf = restore_file
            logger.debug(
                f"{_rf}: splitted_path={_sp} len(splitted_path)={len(_sp)}")
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                logger.debug(f"Extracting {restore_file}...")

                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.objects.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                tag_attributes = document_info.get('tags', [])

                for attrs in tag_attributes:
                    attrs['user'] = _user
                    tag, created = Tag.objects.get_or_create(**attrs)
                    new_doc.tags.add(tag)

                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

            if not skip_ocr:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })
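Paired with backup_documents from Example #5, this gives an in-memory round trip. A minimal sketch, assuming both functions are importable as shown above (skip_ocr=True avoids queuing OCR tasks during the restore):

import io

# Round-trip sketch: back up one user's documents, then restore them.
# Assumes backup_documents, restore_documents and a User instance `user`
# are in scope, as in the examples above.
with io.BytesIO() as buffer:
    backup_documents(buffer, user)
    restore_documents(buffer, user, skip_ocr=True)  # restore seeks to 0 itself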
Example #10
    def post(self, request):
        files = request.FILES.getlist('file')
        if not files:
            logger.warning("POST request.FILES is empty. Forgot adding file?")
            return HttpResponseBadRequest("Missing input file")

        if len(files) > 1:
            logger.warning("More then one files per ajax? how come?")
            return HttpResponse(json.dumps({}),
                                content_type="application/json",
                                status_code=400)

        f = files[0]

        logger.debug("upload for f=%s user=%s", f, request.user)

        user = request.user
        size = os.path.getsize(f.temporary_file_path())
        parent_id = request.POST.get('parent', "-1")
        if parent_id and "-1" in parent_id:
            parent_id = None

        lang = request.POST.get('language')
        notes = request.POST.get('notes')
        page_count = get_pagecount(f.temporary_file_path())
        logger.info("creating document {}".format(f.name))

        doc = Document.create_document(user=user,
                                       title=f.name,
                                       size=size,
                                       lang=lang,
                                       file_name=f.name,
                                       parent_id=parent_id,
                                       notes=notes,
                                       page_count=page_count)
        logger.debug("uploading to {}".format(doc.path.url()))

        default_storage.copy_doc(src=f.temporary_file_path(),
                                 dst=doc.path.url())

        for page_num in range(1, page_count + 1):
            ocr_page.apply_async(
                kwargs={
                    'user_id': user.id,
                    'document_id': doc.id,
                    'file_name': f.name,
                    'page_num': page_num,
                    'lang': lang
                })

        # Upload only one file at a time.
        # After each upload, return a JSON object with the
        # following fields:
        #
        # - title
        # - preview_url
        # - doc_id
        # - action_url  -> needed for renaming/deleting the selected item
        #
        # With that info a new thumbnail will be created.

        action_url = reverse('boss:core_basetreenode_change', args=(doc.id, ))

        preview_url = reverse('core:preview', args=(doc.id, 200, 1))

        result = {
            'title': doc.title,
            'doc_id': doc.id,
            'action_url': action_url,
            'preview_url': preview_url
        }
        logger.info("and response is!")
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
Example #11
    def test_documents_retains_per_page_metadata_after_page_delete(self):
        """
        DocM is a document with 3 pages. DocM has two associated metadata
        fields, X and Y, with values X=10 and Y=20.

        The second page of document DocM is deleted.
        Expected:
            the document's values of metadata fields X and Y are preserved:
            DocM.X is still 10 and DocM.Y is still 20.

        Important!

        In the document browser and document viewer,
        if the user does not explicitly select a page, by default the
        metadata associated with the first page of the respective document
        is returned.
        """
        document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
        docm = Document.objects.create_document(
            user=self.user,
            title='berlin.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            file_name='berlin.pdf',
            parent_id=None,
            page_count=3)

        default_storage.copy_doc(
            src=document_path,
            dst=docm.path.url(),
        )

        for number in range(1, 4):
            page = docm.pages.get(number=number)
            # filesystem absolute path /home/eugen/x/y/
            fs_abs_path = default_storage.abspath(page.path.url())
            # filesystem absolute dir
            fs_abs_dir = os.path.dirname(fs_abs_path)
            Path(fs_abs_dir).mkdir(parents=True, exist_ok=True)
            # create an empty file
            open(fs_abs_path, "w+").close()

        # indeed, docm has 3 pages
        self.assertEqual(docm.pages.count(), 3)
        docm.kv.update([{
            'key': 'X',
            'kv_type': TEXT,
        }, {
            'key': 'Y',
            'kv_type': TEXT,
        }])
        # In the document browser and document viewer,
        # if the user does not explicitly select a page, by default the
        # metadata associated with the first page of the respective document
        # is returned.
        page = docm.pages.get(number=1)
        page.kv['X'] = 10
        page.kv['Y'] = 20

        page.refresh_from_db()

        self.assertEqual(page.kv['X'], '10')

        self.assertEqual(page.kv['Y'], '20')

        # Even if the user deletes the second page, all data (incl. metadata)
        # associated with the remaining pages (first and last)
        # MUST be preserved!
        docm.delete_pages([2])

        page = docm.pages.get(number=1)

        self.assertEqual(page.kv['X'], '10')
        self.assertEqual(page.kv['Y'], '20')
Example #12
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):

    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # No user was specified. It is assumed that
            # backup.json contains a list of users,
            # so recreate the users first.
            for backup_user in backup_info['users']:
                User.objects.create(username=backup_user['username'],
                                    email=backup_user['email'],
                                    is_active=backup_user['is_active'],
                                    is_superuser=backup_user['is_superuser'])

        for restore_file in restore_archive.getnames():

            if restore_file == "backup.json":
                continue

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))

            # If there is a leading username component, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None
            # we first have to create a folder structure

            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

            if not skip_ocr:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })
Example #13
 def move_tempfile(self, doc):
     default_storage.copy_doc(src=self.path, dst=doc.path().url())
     return None
Example #14
    def test_backup_document_hierachy(self):
        folder_1 = Folder.objects.create(title='1',
                                         parent=None,
                                         user=self.testcase_user)
        folder_2 = Folder.objects.create(title='2',
                                         parent=folder_1,
                                         user=self.testcase_user)
        folder_3 = Folder.objects.create(title='3',
                                         parent=folder_1,
                                         user=self.testcase_user)
        Folder.objects.create(title='4', parent=None, user=self.testcase_user)

        document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")

        doc_1 = Document.create_document(user=self.testcase_user,
                                         title='berlin.pdf',
                                         size=os.path.getsize(document_path),
                                         lang='deu',
                                         file_name='berlin.pdf',
                                         parent_id=folder_2.id,
                                         page_count=3)

        default_storage.copy_doc(
            src=document_path,
            dst=doc_1.path.url(),
        )

        doc_2 = Document.create_document(user=self.testcase_user,
                                         title='berlin.pdf',
                                         size=os.path.getsize(document_path),
                                         lang='deu',
                                         file_name='berlin.pdf',
                                         parent_id=folder_3.id,
                                         page_count=3)

        default_storage.copy_doc(
            src=document_path,
            dst=doc_2.path.url(),
        )

        with io.BytesIO() as memoryfile:
            backup_documents(memoryfile, self.testcase_user)
            memoryfile.seek(0)

            self.assertTrue(_can_restore(memoryfile),
                            'generated backup.tar is not valid')
            memoryfile.seek(0)

            backup_file = tarfile.open(fileobj=memoryfile, mode='r')
            backup_json = backup_file.extractfile('backup.json')
            backup_info = json.loads(backup_json.read())

            self.assertIsNotNone(backup_info.get('documents'),
                                 'backup.json did not have a key "documents"')
            self.assertIs(
                len(backup_info.get('documents')), 2,
                'backup.json key documents had more or less than two entries')
            self.assertIs(len(backup_file.getnames()), 3,
                          'backup.tar had more or less than 3 entries')
            self.assertTrue(
                f"1/2/berlin.pdf__{doc_1.id}" in backup_file.getnames(),
                'berlin.pdf was not in the backup.tar at folder 1/2/')
            self.assertTrue(
                f"1/3/berlin.pdf__{doc_2.id}" in backup_file.getnames(),
                'berlin.pdf was not in the backup.tar at folder 1/3/')
            self.assertFalse(
                '4' in backup_file.getnames(),
                'Folder 4 was in backup.tar but should have been ignored')
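The archive entry names above carry a "__<node id>" suffix (as in f"1/2/berlin.pdf__{doc_1.id}"), which the restore code in Examples #9 and #12 strips via remove_backup_filename_id. A plausible sketch of such a helper, under the assumption that the suffix is exactly a trailing "__<id>"; this is an illustration, not necessarily the library's actual implementation:

def remove_backup_filename_id(name):
    # Hypothetical sketch -- assumes entries are named "<file name>__<node id>"
    # as in the assertions above, and strips the trailing "__<id>" suffix.
    base, sep, _node_id = name.rpartition("__")
    return base if sep else name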
Example #15
    def test_basic_two_folders(self):
        """
        Creates the following hierarchy:

            + Folder_1
            +   berlin_f_1.pdf
            + Folder_2
            +   berlin_f_2.pdf
            + berlin_root_1.pdf
            + berlin_root_2.pdf
        """
        f1 = Folder.objects.create(title='Folder_1',
                                   parent=None,
                                   user=self.testcase_user)
        f2 = Folder.objects.create(title='Folder_2',
                                   parent=None,
                                   user=self.testcase_user)

        document_path = os.path.join(BASE_DIR, "data", "berlin.pdf")

        doc_in_root_1 = Document.objects.create_document(
            user=self.testcase_user,
            title='berlin_root_1.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            file_name='berlin_root_1.pdf',
            page_count=3)
        default_storage.copy_doc(
            src=document_path,
            dst=doc_in_root_1.path.url(),
        )

        doc_in_root_2 = Document.objects.create_document(
            user=self.testcase_user,
            title='berlin_root_2.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            file_name='berlin_root_2.pdf',
            page_count=3)
        default_storage.copy_doc(
            src=document_path,
            dst=doc_in_root_2.path.url(),
        )

        doc_in_f_1 = Document.objects.create_document(
            user=self.testcase_user,
            title='berlin_f_1.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            parent_id=f1.id,
            file_name='berlin_f_1.pdf',
            page_count=3)
        default_storage.copy_doc(
            src=document_path,
            dst=doc_in_f_1.path.url(),
        )

        doc_in_f_2 = Document.objects.create_document(
            user=self.testcase_user,
            title='berlin_f_2.pdf',
            size=os.path.getsize(document_path),
            lang='deu',
            parent_id=f2.id,
            file_name='berlin_f_2.pdf',
            page_count=3)
        default_storage.copy_doc(
            src=document_path,
            dst=doc_in_f_2.path.url(),
        )
        """
        The user selected two documents in the root dir, berlin_root_1.pdf
        and berlin_root_2.pdf, plus Folder_1 and Folder_2.
        The selection is marked with square brackets [...]:

            + [Folder_1]
            +   berlin_f_1.pdf
            + [Folder_2]
            +   berlin_f_2.pdf
            + [berlin_root_1.pdf]
            + [berlin_root_2.pdf]
        """
        selected_ids = [doc_in_root_1.id, doc_in_root_2.id, f1.id, f2.id]

        with io.BytesIO() as memoryfile:
            build_tar_archive(  # <-- THIS IS WHAT WE ARE TESTING
                fileobj=memoryfile,
                node_ids=selected_ids)
            memoryfile.seek(0)
            archive_file = tarfile.open(fileobj=memoryfile, mode='r')
            berlin_root_1_handle = archive_file.extractfile(
                'berlin_root_1.pdf')
            data = berlin_root_1_handle.read()
            self.assertTrue(len(data) > 0)

            berlin_f_1_handle = archive_file.extractfile(
                'Folder_1/berlin_f_1.pdf')
            data = berlin_f_1_handle.read()
            self.assertTrue(len(data) > 0)

            berlin_f_2_handle = archive_file.extractfile(
                'Folder_2/berlin_f_2.pdf')
            data = berlin_f_2_handle.read()
            self.assertTrue(len(data) > 0)

            with self.assertRaises(KeyError):
                # There is no file Accounting/Expenses/Paris.pdf
                # in the archive, thus a KeyError exception is expected.
                archive_file.extractfile('Accounting/Expenses/Paris.pdf')
Example #16
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):

    restore_file.seek(0)
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:

        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == restore_file:
                    break

            splitted_path = PurePath(restore_file).parts
            parent = None
            # we first have to create a folder structure

            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    folder_object = Folder.objects.filter(title=folder).filter(
                        parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=user)
                        parent = new_folder
                    else:
                        parent = folder_object

            document_object = Document.objects.filter(
                title=splitted_path[-1]).filter(parent=parent).first()

            if document_object is not None:
                logger.error("Document %s already exists, skipping",
                             restore_file)
            else:

                with NamedTemporaryFile("w+b") as temp_output:

                    temp_output.write(
                        restore_archive.extractfile(restore_file).read())
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)
                    page_count = get_pagecount(temp_output.name)
                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None
                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count)
                    default_storage.copy_doc(src=temp_output.name,
                                             dst=new_doc.path.url())

                for page_num in range(1, page_count + 1):
                    if not skip_ocr:
                        ocr_page.apply_async(
                            kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': splitted_path[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })
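This variant takes a username rather than a User instance. A minimal sketch of feeding it a backup archive from disk (the path and username are illustrative):

import io

# Load an on-disk backup.tar into memory and restore it for user "admin".
# Assumes restore_documents is importable as shown above.
with open("/tmp/backup.tar", "rb") as infile:
    restore_documents(io.BytesIO(infile.read()), "admin", skip_ocr=True)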