Exemplo n.º 1
0
    def test_document_url_with_another_version(self):
        """Check that url() honours an explicitly requested version.

        Each requested version number is expected to appear as a ``v<N>``
        path segment between the document folder and the file name.
        """
        path = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")
        expected_by_version = (
            (3, "docs/user_1/document_15/v3/x.pdf"),
            (2, "docs/user_1/document_15/v2/x.pdf"),
        )

        for version, expected_url in expected_by_version:
            self.assertEqual(path.url(version=version), expected_url)
Exemplo n.º 2
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
):
    """Run OCR on one page of a stored document.

    The document's mime type selects the OCR routine: PDFs and images are
    processed directly, TIFFs are converted to PDF first and then processed
    as PDF. Any other type is logged as an error and skipped.

    Args:
        user_id: user id used to build the document path.
        document_id: document id used to build the document path.
        file_name: file name of the document.
        page_num: page number, forwarded unchanged to the OCR routine.
        lang: OCR language; lower-cased before being forwarded.

    Returns:
        Always ``True``, including when the file type is not recognised.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))

    logger.debug(f"Mime Type = {mime_type}")

    # page_type is recorded in every handled branch so that the timing log
    # at the end always reports which kind of document was processed.
    if mime_type.is_pdf():
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'pdf'
    elif mime_type.is_image():  # e.g. jpeg or png
        ocr_page_image(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'image'
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # The conversion returns the file name of the generated pdf.
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # Point the document path at the pdf and continue as for any pdf.
        doc_path.file_name = new_filename
        ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
        page_type = 'tiff'
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True
Exemplo n.º 3
0
    def test_document_url_none_vs_0(self):
        """Contrast ``version=None`` with ``version=0`` in url().

        After three increments, ``None`` is expected to resolve to the
        latest version (3), while ``0`` is expected to resolve to the
        original, unversioned path.
        """
        path = DocumentPath(user_id=1, document_id=15, file_name="x.pdf")
        # Bump the version three times: 0 -> 1 -> 2 -> 3.
        for _ in range(3):
            path.inc_version()

        # version=None asks for the latest version of the document (3 here).
        latest_url = path.url(version=None)
        self.assertEqual(latest_url, "docs/user_1/document_15/v3/x.pdf")

        # version=0 explicitly selects the original document.
        original_url = path.url(version=0)
        self.assertEqual(original_url, "docs/user_1/document_15/x.pdf")
Exemplo n.º 4
0
    def test_inc_version(self):
        """
        Every inc_version() call must be reflected in the document URL.

        Document endpoints are versioned and start at version 0. While
        the version is 0 the "old" endpoint path applies, i.e. the version
        is not part of the path. Once the document is modified (a blank
        page deleted, for example) its version is incremented, and any
        version > 0 is included in the path.
        """
        path = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
        expected_urls = (
            "docs/user_1/document_3/v1/x.pdf",
            "docs/user_1/document_3/v2/x.pdf",
        )

        # One increment per expected URL: v1 after the first, v2 after the
        # second.
        for expected_url in expected_urls:
            path.inc_version()
            self.assertEqual(path.url(), expected_url)
Exemplo n.º 5
0
def ocr_page(user_id, document_id, file_name, page_num, lang):
    """Run OCR on one page of a stored PDF document.

    Only PDF documents are processed; any other mime type is logged as an
    error and skipped.

    Args:
        user_id: user id used to build the document path.
        document_id: document id used to build the document path.
        file_name: file name of the document.
        page_num: page number, forwarded unchanged to the OCR routine.
        lang: OCR language; lower-cased before being forwarded.

    Returns:
        Always ``True``, including when the file type is not recognised.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()

    doc_path = DocumentPath(
        user_id=user_id, document_id=document_id, file_name=file_name
    )
    abs_path = default_storage.abspath(doc_path.url())
    mime_type = mime.Mime(abs_path)

    # Guard clause: anything that is not a pdf is reported and skipped.
    if not mime_type.is_pdf():
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    ocr_page_pdf(doc_path=doc_path, page_num=page_num, lang=lang)
    page_type = 'pdf'

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True
Exemplo n.º 6
0
 def test_document_url(self):
     """A DocumentPath with no version increments has an unversioned URL."""
     path = DocumentPath(user_id=1, document_id=3, file_name="x.pdf")
     actual_url = path.url()
     self.assertEqual(actual_url, "docs/user_1/document_3/x.pdf")
Exemplo n.º 7
0
def ocr_page(
    user_id,
    document_id,
    file_name,
    page_num,
    lang,
    namespace=None,
):
    """Run OCR on one page of a stored document.

    If the document is not present in the default storage it is downloaded
    first. The document's mime type then selects the OCR routine: PDFs and
    images are processed directly, TIFFs are converted to PDF first and then
    processed as PDF. Any other type is logged as an error and skipped.

    Args:
        user_id: user id used to build the document path; also forwarded
            to the OCR routine.
        document_id: document id used to build the document path; also
            forwarded to the OCR routine.
        file_name: file name of the document.
        page_num: page number, forwarded unchanged to the OCR routine.
        lang: OCR language; lower-cased before being forwarded.
        namespace: forwarded unchanged to the storage download and to the
            OCR routine. Defaults to ``None``.

    Returns:
        Always ``True``, including when the file type is not recognised.
    """
    logger.debug(f" ocr_page user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num}")
    t1 = time.time()
    lang = lang.lower()
    doc_path = DocumentPath(
        user_id=user_id,
        document_id=document_id,
        file_name=file_name,
    )

    if not default_storage.exists(doc_path.url()):
        # In case of distributed deployment, a document uploaded
        # by the webapp is not directly available to the worker (which runs
        # on a separate computer). Thus, if the document is not locally
        # available, the worker downloads it from whatever remote location.
        default_storage.download(doc_path_url=doc_path.url(),
                                 namespace=namespace)

    mime_type = mime.Mime(default_storage.abspath(doc_path.url()))
    logger.debug(f"Mime Type = {mime_type}")

    # page_type is recorded in every handled branch so that the timing log
    # at the end always reports which kind of document was processed.
    if mime_type.is_pdf():
        ocr_page_pdf(doc_path=doc_path,
                     page_num=page_num,
                     lang=lang,
                     user_id=user_id,
                     document_id=document_id,
                     namespace=namespace)
        page_type = 'pdf'
    elif mime_type.is_image():  # e.g. jpeg or png
        ocr_page_image(doc_path=doc_path,
                       page_num=page_num,
                       lang=lang,
                       user_id=user_id,
                       document_id=document_id,
                       namespace=namespace)
        page_type = 'image'
    elif mime_type.is_tiff():
        logger.debug("TIFF type detected")
        # The conversion returns the file name of the generated pdf.
        new_filename = convert_tiff2pdf(
            doc_url=default_storage.abspath(doc_path.url()))
        # Remember the tiff name before pointing the path at the pdf.
        orig_file_name = doc_path.file_name
        doc_path.file_name = new_filename
        # and continue as usual
        ocr_page_pdf(
            doc_path=doc_path,
            page_num=page_num,
            lang=lang,
            user_id=user_id,
            document_id=document_id,
            # Pass original file_name i.e. tiff file name as well.
            file_name=orig_file_name,
            namespace=namespace)
        page_type = 'tiff'
    else:
        logger.error(f" user_id={user_id}"
                     f" doc_id={document_id}"
                     f" page_num={page_num} error=Unkown file type")
        return True

    t2 = time.time()
    logger.debug(f" user_id={user_id} doc_id={document_id}"
                 f" page_num={page_num} page_type={page_type}"
                 f" total_exec_time={t2-t1:.2f}")

    return True