示例#1
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    OCR a single page of a PDF document.

    doc_path is an mglib.path.DocumentPath instance.
    page_num is the 1-based page number; lang is the OCR language code.

    Returns the PagePath of the processed page, or None when page_num
    is out of range (the original code raised UnboundLocalError then).
    """
    logger.debug("OCR PDF document")

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    # Bug fix: page_url was referenced at the end unconditionally, which
    # raised UnboundLocalError when page_num > page_count. Bail out early.
    if page_num > page_count:
        return None

    # first quickly generate preview images
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    for step in Steps():
        page_url.step = step
        extract_img(page_url, media_root=settings.MEDIA_ROOT)

    # then extract plain text and hOCR data
    page_url = PagePath(document_path=doc_path,
                        page_num=page_num,
                        step=Step(1),
                        page_count=page_count)
    extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

    for step in Steps():
        page_url.step = step
        # thumbnails carry no text layer, so no hOCR for them
        if not step.is_thumbnail:
            extract_hocr(page_url,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)

    return page_url
示例#2
0
    def page_paths(self):
        """
        Return a list of Step(1) PagePath objects for this document,
        indexed by page number:

            page_path = doc.page_path[2]
            page_path.url() # local url to second page of the doc.

        Index 0 holds None so that page numbering starts at 1.
        """
        # doc.page_count might be stale because per-page logic was
        # added only recently -- correct it while we are here.
        actual_count = get_pagecount(self.absfilepath)

        if actual_count != self.page_count:
            self.page_count = actual_count
            self.save()

        return [None] + [
            PagePath(document_path=self.path,
                     page_num=number,
                     step=step.Step(1),
                     page_count=self.page_count)
            for number in range(1, actual_count + 1)
        ]
示例#3
0
def reorder_pages(src, dst, new_order):
    """
    Rewrite src into dst with its pages rearranged via pdftk.

    new_order is a list of following format:

        [
            {'page_num': 2, page_order: 1},
            {'page_num': 1, page_order: 2},
            {'page_num': 3, page_order: 3},
            {'page_num': 4, page_order: 4},
        ]

    The example above means that in a 4-page document the first page
    was swapped with the second one.
    page_num    = older page order
    page_order  = current page order
    In human language each hash reads:
        <page_num> now should be <page_order>
    """
    page_count = get_pagecount(src)

    cat_ranges = cat_ranges_for_reorder(page_count=page_count,
                                        new_order=new_order)

    # pdftk <src> cat <ranges...> output <dst>
    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd.extend(str(page) for page in cat_ranges)
    cmd.extend(["output", dst])
    run(cmd)
示例#4
0
    def import_file(self,
                    file_title=None,
                    inbox_title="Inbox",
                    delete_after_import=True,
                    skip_ocr=False):
        """
        Import the file at self.filepath:

            1. create a document instance in the inbox folder
            2. copy the file to doc.path.url()
            3. OCR the document (unless skip_ocr is True)

        Used with
            ./manage.py local_importer
            ./manage.py imap_importer
        command

        Returns the created Document, or False when the page count
        cannot be determined.
        """
        logger.debug(f"Importing file {self.filepath}")

        if file_title is None:
            file_title = os.path.basename(self.filepath)

        try:
            page_count = get_pagecount(self.filepath)
        except Exception:
            logger.error(f"Error while getting page count of {self.filepath}.")
            return False

        inbox, _ = Folder.objects.get_or_create(title=inbox_title,
                                                parent=None,
                                                user=self.user)
        doc = Document.create_document(
            user=self.user,
            title=file_title,
            size=os.path.getsize(self.filepath),
            lang=self.user_ocr_language,
            file_name=file_title,
            parent_id=inbox.id,
            page_count=page_count,
        )
        logger.debug(f"Uploading file {self.filepath} to {doc.path.url()}")
        default_storage.copy_doc(src=self.filepath, dst=doc.path.url())

        if not skip_ocr:
            DocumentImporter.ocr_document(
                document=doc,
                page_count=page_count,
                lang=self.user_ocr_language,
            )

        if delete_after_import:
            # Deleting is desired when importing from a local directory;
            # when importing from an email attachment it does not apply.
            os.remove(self.filepath)

        logger.debug("Import complete.")

        return doc
示例#5
0
def ocr_page_pdf(doc_path, page_num, lang, **kwargs):
    """
    OCR a single PDF page and emit progress notifications.

    doc_path is an mglib.path.DocumentPath instance.
    An optional ``file_name`` kwarg overrides doc_path.file_name in the
    notifications; remaining kwargs are forwarded to the notify hooks.

    On success returns ``mglib.path.PagePath`` instance.
    Returns None when page_num is out of range (the original code
    raised UnboundLocalError in that case).
    """
    logger.debug("OCR PDF document")

    file_name = kwargs.pop('file_name', None)

    if not file_name:
        file_name = doc_path.file_name

    page_count = get_pagecount(default_storage.abspath(doc_path.url()))

    # Bug fix: page_path was used below (notify_pre_page_ocr and the
    # final return) even when page_num was out of range, which raised
    # UnboundLocalError. Bail out early instead.
    if page_num > page_count:
        return None

    # first quickly generate preview images
    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    for step in Steps():
        page_path.step = step
        extract_img(page_path, media_root=settings.MEDIA_ROOT)

    # Bug fix: pass the resolved file_name (honoring the file_name
    # kwarg), consistent with notify_txt_ready/notify_hocr_ready below;
    # previously doc_path.file_name was passed here unconditionally.
    notify_pre_page_ocr(page_path,
                        page_num=page_num,
                        lang=lang,
                        file_name=file_name,
                        **kwargs)

    page_path = PagePath(document_path=doc_path,
                         page_num=page_num,
                         step=Step(1),
                         page_count=page_count)
    extract_txt(page_path, lang=lang, media_root=settings.MEDIA_ROOT)
    notify_txt_ready(page_path,
                     page_num=page_num,
                     lang=lang,
                     file_name=file_name,
                     **kwargs)

    for step in Steps():
        page_path.step = step
        # thumbnails carry no text layer, so no hOCR for them
        if not step.is_thumbnail:
            extract_hocr(page_path,
                         lang=lang,
                         media_root=settings.MEDIA_ROOT)
            notify_hocr_ready(
                page_path,
                page_num=page_num,
                lang=lang,
                # step as integer number
                step=step.current,
                file_name=file_name,
                **kwargs)

    return page_path
示例#6
0
 def recreate_pages(self):
     """
     Drop all existing page models for this document, refresh the
     stored page count from the file on disk, and build pages anew.
     """
     abs_path = default_storage.abspath(self.path.url())
     self.pages.all().delete()
     self.page_count = get_pagecount(abs_path)
     self.save()
     self.create_pages()
示例#7
0
def delete_pages(src, dst, page_numbers):
    """
    Remove the pages listed in page_numbers from src and write the
    result to dst using the stapler "del" command.
    """
    # Result was never used; the call is kept for its side effect of
    # validating that src is a readable, supported document.
    get_pagecount(src)

    # stapler del <src> <page> <page> ... <dst>
    cmd = [settings.BINARY_STAPLER, "del", src]
    cmd.extend(str(page) for page in page_numbers)
    cmd.append(dst)

    run(cmd)
示例#8
0
    def handle(self, *args, **options):
        """
        Management-command entry point: log the page count of the file
        given via the 'file_path' option.
        """
        file_path = options.get('file_path')

        # Bug fix: the previous default of False made os.path.exists(False)
        # check file descriptor 0 (False == 0), which always exists, so a
        # missing option slipped through and crashed in get_pagecount.
        if not file_path or not os.path.exists(file_path):
            logger.debug(f"Path {file_path} does not exit. Quit.")
            return

        page_count = get_pagecount(file_path)

        logger.debug(f"Page count={page_count}")
示例#9
0
def paste_pages_into_existing_doc(src,
                                  dst,
                                  data_list,
                                  after_page_number=False,
                                  before_page_number=False):
    """
    Insert pages from the documents described by data_list into src —
    either after after_page_number or before before_page_number — and
    write the combined PDF to dst via pdftk's cat operation.

    Each data_list item is a dict with 'src' (source document path)
    and 'page_nums' (pages to paste from it).
    """
    page_count = get_pagecount(src)
    list1, list2 = split_ranges(total=page_count,
                                after=after_page_number,
                                before=before_page_number)
    # Letter A is reserved for the current document (src); pages from
    # list1/list2 refer to it. B, C, ... name the pasted documents.
    letters = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    letters_2_doc_map = [f"A={src}"]
    letters_pages = []

    for idx, item in enumerate(data_list):
        letter = letters[idx]
        letters_2_doc_map.append(f"{letter}={item['src']}")
        letters_pages.extend(
            f"{letter}{p}" for p in item['page_nums']
        )

    letters_pages_before = [f"A{p}" for p in list1]
    letters_pages_after = [f"A{p}" for p in list2]

    cmd = [settings.BINARY_PDFTK]
    # add A=doc1_path, B=doc2_path
    cmd.extend(letters_2_doc_map)
    cmd.append("cat")
    # existing doc pages (may be empty)
    cmd.extend(letters_pages_before)
    # newly inserted pages
    cmd.extend(letters_pages)
    # existing doc pages (may be empty)
    cmd.extend(letters_pages_after)
    cmd.extend(["output", dst])

    run(cmd)
示例#10
0
def delete_pages(src, dst, page_numbers):
    """
    Write a new PDF at dst containing every page of src except those
    listed in page_numbers, via pdftk's cat of the remaining ranges.
    """
    page_count = get_pagecount(src)

    cat_ranges = cat_ranges_for_delete(page_count, page_numbers)

    # pdftk <src> cat <ranges...> output <dst>
    cmd = [settings.BINARY_PDFTK, src, "cat"]
    cmd.extend(str(page) for page in cat_ranges)
    cmd.extend(["output", dst])

    run(cmd)
示例#11
0
def hocr(request, id, step=None, page="1"):
    """
    Return the hOCR words and metadata of one page of a document as JSON.

    Raises Http404 when the document, its file, the page, or the hOCR
    data is missing; returns HttpResponseForbidden when the user lacks
    read permission.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_path = doc.path

    if request.user.has_perm(Access.PERM_READ, doc):
        # document absolute path
        doc_abs_path = default_storage.abspath(doc_path.url())
        if not os.path.exists(doc_abs_path):
            raise Http404("HOCR data not yet ready.")

        # Bug fix: page arrives from the URL as a string (default "1");
        # comparing str to int below raised TypeError on Python 3.
        page = int(page)

        page_count = get_pagecount(doc_abs_path)
        # Bug fix: pages are 1-based (page_paths[0] is a placeholder),
        # so reject page 0 as well, not only negative numbers.
        if page > page_count or page < 1:
            raise Http404("Page does not exists")

        page_path = doc.page_paths[page]
        hocr_abs_path = default_storage.abspath(page_path.hocr_url())

        logger.debug(f"Extract words from {hocr_abs_path}")

        if not os.path.exists(hocr_abs_path):
            raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(
            hocr_file_path=hocr_abs_path
        )

        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
示例#12
0
def upload(request):
    """
    Handle a single-file AJAX upload: create the document, copy the
    file into storage and schedule per-page OCR.

    To understand returned value, have a look at
    papermerge.core.views.decorators.json_reponse decorator
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400

    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)
        return msg, 400

    f = files[0]

    logger.debug("upload for f=%s user=%s", f, request.user)

    user = request.user
    size = os.path.getsize(f.temporary_file_path())

    parent_id = request.POST.get('parent', "-1")
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')

    try:
        page_count = get_pagecount(f.temporary_file_path())
    except exceptions.FileTypeNotSupported:
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, 400

    logger.debug("creating document {}".format(f.name))

    doc = Document.create_document(
        user=user,
        title=f.name,
        size=size,
        lang=lang,
        file_name=f.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count,
    )
    logger.debug("uploading to {}".format(doc.path.url()))

    default_storage.copy_doc(src=f.temporary_file_path(), dst=doc.path.url())

    # schedule one OCR task per page
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': f.name,
            'page_num': page_num,
            'lang': lang
        })

    # Only one file is uploaded at a time. The response carries the
    # fields (title, preview_url, doc_id, action_url) the client needs
    # to build a thumbnail for the new document; action_url is used for
    # renaming/deleting the selected item.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    return {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }
示例#13
0
    def test_basic_tiff(self):
        # a .tiff extension makes get_pagecount delegate internally
        # to the get_tiff_pagecount method
        result = get_pagecount(get_filepath("text.tiff"))
        self.assertEqual(result, 2)
示例#14
0
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (and, when ``user`` is None, users) from a tar
    backup archive.

    restore_file is a seekable tar archive containing a 'backup.json'
    manifest plus the backed-up document files. When user is None the
    archive is treated as a multi-user backup: users are recreated
    from the manifest and each archived path is expected to start with
    the owner's username.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                user = User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser'])
                # in case --include-user-password switch was used
                # update user (raw digest of) password field
                password = backup_user.get('password')
                if password:
                    user.password = password
                    user.save()

        # NOTE(review): the loop variable shadows the restore_file
        # parameter; the parameter is no longer needed at this point.
        for restore_file in restore_archive.getnames():

            if restore_file == "backup.json":
                continue

            logger.debug(f"Restoring file {restore_file}...")

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))

            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # find the manifest entry whose path matches this archive
            # member. NOTE(review): if none matches, document_info ends
            # up as the last entry — confirm this cannot happen for a
            # well-formed backup.
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None

            # variables used only to shorten debug message
            _sp = splitted_path
            _rf = restore_file
            logger.debug(
                f"{_rf}: splitted_path={_sp} len(splitted_path)={len(_sp)}")
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    # reuse an existing folder of same title/parent/user
                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            # extract the member into a temp file so that size and page
            # count can be read from a real file on disk
            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                logger.debug(f"Extracting {restore_file}...")

                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.objects.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                # restore the document's tags from the manifest
                tag_attributes = document_info.get('tags', [])

                for attrs in tag_attributes:
                    attrs['user'] = _user
                    tag, created = Tag.objects.get_or_create(**attrs)
                    new_doc.tags.add(tag)

                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

            if not skip_ocr:
                # schedule one OCR task per page
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })
示例#15
0
    def post(self, request):
        """
        Handle a single-file AJAX upload: create the document, copy the
        file into storage, schedule per-page OCR and return a JSON
        response describing the new document.
        """
        files = request.FILES.getlist('file')
        if not files:
            logger.warning("POST request.FILES is empty. Forgot adding file?")
            return HttpResponseBadRequest("Missing input file")

        if len(files) > 1:
            logger.warning("More then one files per ajax? how come?")
            # Bug fix: HttpResponse accepts `status`, not `status_code`;
            # the old kwarg raised TypeError instead of returning a 400.
            return HttpResponse(json.dumps({}),
                                content_type="application/json",
                                status=400)

        f = files[0]

        logger.debug("upload for f=%s user=%s", f, request.user)

        user = request.user
        size = os.path.getsize(f.temporary_file_path())
        parent_id = request.POST.get('parent', "-1")
        if parent_id and "-1" in parent_id:
            parent_id = None

        lang = request.POST.get('language')
        notes = request.POST.get('notes')
        page_count = get_pagecount(f.temporary_file_path())
        logger.info("creating document {}".format(f.name))

        doc = Document.create_document(user=user,
                                       title=f.name,
                                       size=size,
                                       lang=lang,
                                       file_name=f.name,
                                       parent_id=parent_id,
                                       notes=notes,
                                       page_count=page_count)
        logger.debug("uploading to {}".format(doc.path.url()))

        default_storage.copy_doc(src=f.temporary_file_path(),
                                 dst=doc.path.url())
        # schedule one OCR task per page
        for page_num in range(1, page_count + 1):
            ocr_page.apply_async(
                kwargs={
                    'user_id': user.id,
                    'document_id': doc.id,
                    'file_name': f.name,
                    'page_num': page_num,
                    'lang': lang
                })

        # Only one file is uploaded at a time. The JSON response carries
        # the fields (title, preview_url, doc_id, action_url) the client
        # needs to build a new thumbnail; action_url is used for
        # renaming/deleting the selected item.
        preview_url = reverse('core:preview', args=(doc.id, 200, 1))

        result = {
            'title': doc.title,
            'doc_id': doc.id,
            'action_url': "",
            'preview_url': preview_url
        }
        logger.info("and response is!")
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
示例#16
0
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents from a tar backup archive into the account of
    the given username.

    restore_file is a seekable tar archive containing a 'backup.json'
    manifest plus the backed-up document files. Documents that already
    exist (same title under the same parent) are skipped.
    """
    restore_file.seek(0)
    # NOTE(review): .first() returns None when the username does not
    # exist; later user.id access would then fail — confirm callers
    # always pass a valid username.
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:

        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue
            # find the manifest entry matching this archive member.
            # NOTE(review): if none matches, document_info ends up as
            # the last entry — confirm this cannot happen for a
            # well-formed backup.
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == restore_file:
                    break

            splitted_path = PurePath(restore_file).parts
            parent = None
            # we first have to create a folder structure

            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    # reuse an existing folder with same title/parent
                    folder_object = Folder.objects.filter(title=folder).filter(
                        parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=user)
                        parent = new_folder
                    else:
                        parent = folder_object

            document_object = Document.objects.filter(
                title=splitted_path[-1]).filter(parent=parent).first()

            if document_object is not None:
                logger.error("Document %s already exists, skipping",
                             restore_file)
            else:

                # extract the member into a temp file so that size and
                # page count can be read from a real file on disk
                with NamedTemporaryFile("w+b") as temp_output:

                    temp_output.write(
                        restore_archive.extractfile(restore_file).read())
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)
                    page_count = get_pagecount(temp_output.name)
                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None
                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count)
                    default_storage.copy_doc(src=temp_output.name,
                                             dst=new_doc.path.url())

                # schedule one OCR task per page
                for page_num in range(1, page_count + 1):
                    if not skip_ocr:
                        ocr_page.apply_async(
                            kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': splitted_path[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })
示例#17
0
def apply_automates(document_id, page_num):
    """
    Run every automate of the document's owner against the extracted
    text of one page, apply the matching ones, and emit an
    automates_matching signal summarizing the outcome.
    """
    logger.debug("apply_automates: Begin.")
    try:
        document = Document.objects.get(id=document_id)
    except Document.DoesNotExist:
        logger.error(f"Provided document_id={document_id}, does not exists")
        return

    # use text files from the original version of the document
    doc_path = DocumentPath.copy_from(document.path, version=0)
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    page_path = PagePath(
        document_path=doc_path,
        page_num=page_num,
        page_count=page_count,
        step=Step(),
    )
    user = document.user

    # read the OCR-ed plain text of the page
    text_path = default_storage.abspath(page_path.txt_url())
    with open(text_path, "r") as f:
        text = f.read()

    automates = Automate.objects.filter(user=user)
    # nothing to do when the user defined no automates
    if automates.count() == 0:
        logger.debug(f"No automates for user {user}. Quit.")
        return

    # check all automates for given user (the owner of the document)
    matched = []
    for automate in automates:
        if not automate.is_a_match(text):
            logger.debug(
                f"No match for automate={automate}"
                f" doc_id={document_id} page_num={page_num}"
            )
            continue

        logger.debug(f"Automate {automate} matched document={document}")

        plugin_klass = get_plugin_by_module_name(automate.plugin_name)
        # Notice () - plugin passed is instance of the class
        plugin = plugin_klass() if plugin_klass else None

        automate.apply(
            document=document,
            page_num=page_num,
            hocr=text,
            plugin=plugin
        )
        matched.append(automate)

    message = _(
        "%(count)s of %(total)s Automate(s) matched. ") % {
        'count': len(matched),
        'total': automates.count()
    }

    if len(matched) > 0:
        message += _("List of matched Automates: %(matched_automates)s") % {
            'matched_automates': matched
        }

    automates_matching.send(
        sender="papermerge.core.automate",
        user_id=document.user.id,
        document_id=document_id,
        level=logging.INFO,
        message=message,
        page_num=page_num,
        text=text
    )
示例#18
0
 def page_count(self):
     """Return the number of pages of the underlying file at self.path."""
     total = get_pagecount(self.path)
     return total
示例#19
0
    def test_basic_png(self):
        # a PNG is a single image, so it counts as exactly one page
        result = get_pagecount(get_filepath("berlin.png"))
        self.assertEqual(result, 1)
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (and, when ``user`` is None, users) from a tar
    backup archive.

    restore_file is a seekable tar archive containing a 'backup.json'
    manifest plus the backed-up document files. When user is None the
    archive is treated as a multi-user backup: users are recreated from
    the manifest and each archived path starts with the owner's username.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                User.objects.create(username=backup_user['username'],
                                    email=backup_user['email'],
                                    is_active=backup_user['is_active'],
                                    is_superuser=backup_user['is_superuser'])

        for restore_file in restore_archive.getnames():

            if restore_file == "backup.json":
                continue

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))

            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # find the manifest entry matching this archive member
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            # Bug fix: this restore block (and the OCR scheduling below)
            # was nested inside the folder-creation `if` above, so any
            # document stored at the top level of the backup (no folder
            # in its path) was silently never restored.
            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

            if not skip_ocr:
                # schedule one OCR task per page
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })