Exemplo n.º 1
0
    def test_get_pdf_info(self):
        """Check get_pdf_info(pdf_content: bytes) on a real PDF and on empty input.

        The fixture 'test_eng.pdf' is looked up in the parent of the
        directory that contains this test module. For empty bytes the
        function is expected to print a PyPDF2 error line to stdout and to
        return a PdfInfo whose fields are all empty (zero pages).
        """
        fixtures_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))
        pdf_path: Path = Path(os.path.join(fixtures_dir, 'test_eng.pdf'))
        pdf_bytes: bytes = _u.read_binary_file(pdf_path)
        info: _u.PdfInfo = _u.get_pdf_info(pdf_bytes)

        # Expected metadata of the fixture, checked in the listed order.
        expected_for_fixture = (
            ('author', ''),
            ('creation_date', '2019-03-10 07:57:51+0000'),
            ('creator', ''),
            ('mod_date', ''),
            ('producer', 'Tesseract 4.0.0-beta.1'),
            ('title', ''),
            ('num_pages', 1),
        )
        for field_name, expected_value in expected_for_fixture:
            self.assertEqual(getattr(info, field_name), expected_value)

        # Empty input: capture stdout to verify the reported error line.
        with patch('sys.stdout', new_callable=StringIO) as fake_stdout:
            empty_info: _u.PdfInfo = _u.get_pdf_info(bytes())
            printed: str = fake_stdout.getvalue()
            self.assertEqual(
                printed,
                "PyPDF2.PdfFileReader exception: Cannot read an empty file\n")
            expected_for_empty = (
                ('author', ''),
                ('creation_date', ''),
                ('creator', ''),
                ('mod_date', ''),
                ('producer', ''),
                ('title', ''),
                ('num_pages', 0),
            )
            for field_name, expected_value in expected_for_empty:
                self.assertEqual(getattr(empty_info, field_name),
                                 expected_value)
Exemplo n.º 2
0
 def create_pdf(self, admin_obj=None, request=None):
     """
     Create the OCRed PDF file for this instance and record it in
     ``self.ocred_pdf`` / ``self.ocred_pdf_md5``.

     Nothing happens unless ``self.can_create_pdf`` is truthy. For an image
     upload the PDF bytes produced by ``ocr_img2pdf`` are written to disk;
     for a PDF upload ``ocr_pdf`` is asked to write the output file itself.
     The parent ``save()`` is then called directly, bypassing this class's
     own ``save()``.

     :param admin_obj: optional admin instance of the model; together with
         ``request`` it is used to show a 'PDF created' message
     :param request: optional current http request, see ``admin_obj``
     :return: None
     """
     # checking that instance of OCRedFile is saved; per the original
     # comment this raises DoesNotSaved otherwise
     self.is_saved()
     if not self.can_create_pdf:
         return
     content = self.file.file.read()
     # rewind so later readers of the uploaded file start at the beginning
     self.file.file.seek(0)
     created = False
     if 'image' in self.file_type:
         pdf_content = ocr_img2pdf(content)
         filename = set_pdffile_name(self, True)
         # Write the OCRed PDF bytes (not the original image bytes), so the
         # file on disk matches the md5 stored below.
         with open(filename, 'wb') as pdf:
             pdf.write(pdf_content)
         self.ocred_pdf.name = filename
         self.ocred_pdf_md5 = md5(pdf_content)
         created = True
     elif 'pdf' in self.file_type:
         filename = set_pdffile_name(self, True)
         ocr_pdf(content, filename)
         self.ocred_pdf.name = filename
         self.ocred_pdf_md5 = md5(read_binary_file(filename))
         created = True
     if created:
         OCRedFile.Counters.num_created_pdf += 1
         if admin_obj and request:
             admin_obj.message_user(request, 'PDF created')
     super(OCRedFile, self).save()
Exemplo n.º 3
0
 def test_pdf2text(self):
     """Check pdf2text(pdf_content: bytes) against three fixtures.

     Fixtures are looked up in the parent of the directory that contains
     this test module: a PDF expected to yield no text, a PDF expected to
     yield a known sentence, and a PNG that must make pdf2text raise
     pdftotext.Error.
     """
     fixtures_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))

     # PDF expected to decode to an empty string
     notext_path: Path = Path(
         os.path.join(fixtures_dir, 'test_eng_notext.pdf'))
     notext_bytes: bytes = _u.read_binary_file(notext_path)
     self.assertEqual(_u.pdf2text(notext_bytes), '')

     # PDF expected to decode to a known sentence
     withtext_path: Path = Path(
         os.path.join(fixtures_dir, 'the_pdf_withtext.pdf'))
     withtext_bytes: bytes = _u.read_binary_file(withtext_path)
     self.assertEqual(_u.pdf2text(withtext_bytes),
                      'The test if pdf with text')

     # Non-PDF content must be rejected with a pdftotext error
     png_path: Path = Path(os.path.join(fixtures_dir, 'test_eng.png'))
     png_bytes: bytes = _u.read_binary_file(png_path)
     with self.assertRaisesMessage(pdftotext.Error,
                                   'poppler error creating document'):
         _u.pdf2text(png_bytes)
Exemplo n.º 4
0
 def test_ocr_img2str(self):
     """Testing ocr_img2str(stdin: bytes)

     Reads the 'test_eng.png' fixture from the parent of the directory
     that contains this test module, runs OCR on it and checks that the
     recognized text contains the expected sentence.
     """
     tests_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))
     test_eng_png: Path = Path(os.path.join(tests_dir, 'test_eng.png'))
     test_eng_png_content: bytes = _u.read_binary_file(test_eng_png)
     test_eng_ocred_text: str = _u.ocr_img2str(test_eng_png_content)
     # assertTrue(value, msg) would only check that the text is non-empty
     # (its second argument is the failure message), so compare the content.
     # assertIn is used because the exact surrounding whitespace returned by
     # ocr_img2str is not visible from this test -- TODO confirm and tighten
     # to assertEqual if the output is known to be exact.
     self.assertIn('A some english text to test Tesseract',
                   test_eng_ocred_text)
Exemplo n.º 5
0
 def test_ocr_img2pdf(self):
     """Check ocr_img2pdf(stdin: bytes) by round-tripping through pdf2text.

     The 'test_eng.png' fixture (located in the parent of the directory
     that contains this test module) is converted to a PDF; the result
     must not be None and its extracted text must equal the expected
     sentence.
     """
     base_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))
     png_path: Path = Path(os.path.join(base_dir, 'test_eng.png'))
     png_bytes: bytes = _u.read_binary_file(png_path)

     ocred_pdf: bytes = _u.ocr_img2pdf(png_bytes)
     self.assertIsNotNone(ocred_pdf)

     extracted_text: str = _u.pdf2text(ocred_pdf)
     self.assertEqual(extracted_text,
                      'A some english text to test Tesseract')
Exemplo n.º 6
0
 def test_read_binary_file(self):
     """Testing read_binary_file(path: str)

     Covers four cases, all relative to the parent of the directory that
     contains this test module: an empty file, a directory
     (IsADirectoryError), a file with known content, and a missing file
     (FileNotFoundError).
     """
     tests_dir: Path = Path(os.path.dirname(os.path.dirname(__file__)))

     # empty file -> empty bytes
     empty_file: Path = Path(os.path.join(tests_dir, 'empty_file.txt'))
     empty_content: bytes = _u.read_binary_file(empty_file)
     self.assertEqual(empty_content, bytes())

     # directory -> IsADirectoryError; os.path.join returns a str here
     folder: str = os.path.join(tests_dir, 'some_dir')
     with self.assertRaisesMessage(
             IsADirectoryError, f"[Errno 21] Is a directory: '{folder}'"):
         _u.read_binary_file(folder)

     # regular file -> its exact bytes
     not_empty_file: Path = Path(
         os.path.join(tests_dir, 'not_empty_file.txt'))
     not_empty_content: bytes = _u.read_binary_file(not_empty_file)
     self.assertIsInstance(not_empty_content, bytes)
     self.assertEqual(not_empty_content, 'content\n'.encode())

     # missing file -> FileNotFoundError
     no_file: Path = Path(os.path.join(tests_dir, 'no_file.txt'))
     with self.assertRaisesMessage(
             FileNotFoundError,
             f"[Errno 2] No such file or directory: '{no_file}'"):
         _u.read_binary_file(no_file)
Exemplo n.º 7
0
 def save(self,
          force_insert=False,
          force_update=False,
          using=None,
          update_fields=None):
     """
     OCR the uploaded file when needed, then save a new instance.

     If ``is_saved`` reports that the instance is already saved, the method
     returns immediately without doing anything. Otherwise it validates the
     file type and md5, fills ``text`` (and the PDF metadata / OCRed PDF
     fields) when ``text`` is empty, optionally removes the uploaded file,
     and finally delegates to the parent ``save()``.

     :param force_insert: forwarded unchanged to the parent ``save()``
     :param force_update: forwarded unchanged to the parent ``save()``
     :param using: forwarded unchanged to the parent ``save()``
     :param update_fields: forwarded unchanged to the parent ``save()``
     :return: None
     """
     if self.is_saved(raise_exception=False):
         return
     if not self.file_type:
         self.file_type = self.file.file.content_type
     OCRedFile.is_valid_file_type(file_type=self.file_type,
                                  raise_exception=True)
     # read content of the 'file' field
     content = self.file.file.read()
     # return the reading pointer of the 'file' file to start position
     self.file.file.seek(0)
     # calculate md5 of the 'file' field if it does not exist
     if not self.md5:
         self.md5 = md5(content)
     OCRedFile.is_valid_ocr_md5(md5_value=self.md5, raise_exception=True)
     # extract or OCR the content of the 'file' field if 'text' does not exist
     if not self.text:
         print(f'OCRedFile->save start OCR {self.md5}')
         ocr_started_datetime = timezone.now()
         if 'image' in self.file_type:
             pdf_content = ocr_img2pdf(content)
             self.text = pdf2text(pdf_content)
             if self.text:
                 # create ocred_pdf only for an image that contains a text
                 self.ocred_pdf_md5 = md5(pdf_content)
                 if ocr_settings.OCR_STORE_PDF:
                     self.ocred_pdf.save(set_pdffile_name(self),
                                         BytesIO(pdf_content), False)
                 else:
                     # only the name is recorded, no file is stored
                     self.ocred_pdf.name = set_pdffile_name(self)
             self.ocred = timezone.now()
         elif 'pdf' in self.file_type:
             pdf_info: PdfInfo = get_pdf_info(content)
             self.pdf_num_pages = pdf_info.num_pages
             self.pdf_author = pdf_info.author
             # dates are assigned only when present, leaving the field's
             # current value otherwise
             if pdf_info.creation_date:
                 self.pdf_creation_date = pdf_info.creation_date
             self.pdf_creator = pdf_info.creator
             if pdf_info.mod_date:
                 self.pdf_mod_date = pdf_info.mod_date
             self.pdf_producer = pdf_info.producer
             self.pdf_title = pdf_info.title
             pdf_text = pdf2text(content)
             # check that loaded PDF file contains text
             if pdf_need_ocr(pdf_text):
                 print(
                     f'OCRedFile PDF OCR processing via OCRmyPDF {self.md5}'
                 )
                 pdf_filename = set_pdffile_name(self)
                 self.text = ocr_pdf(content, pdf_filename)
                 # save datetime when uploaded PDF was ocred
                 self.ocred = timezone.now()
                 if self.text:
                     # create ocred_pdf only for a pdf file that contains images with text
                     self.ocred_pdf.name = pdf_filename
                     self.ocred_pdf_md5 = md5(
                         read_binary_file(pdf_filename))
                     if not ocr_settings.OCR_STORE_PDF:
                         if os.path.isfile(pdf_filename):
                             os.remove(pdf_filename)
                 else:
                     # remove the PDF file created by ocr_pdf(content, pdf_filename)
                     if os.path.isfile(pdf_filename):
                         os.remove(pdf_filename)
             else:
                 print(
                     f'OCRedFile->save use text from loaded pdf {self.md5}')
                 self.text = pdf_text
         ocr_finished_datetime = timezone.now()
         ocr_duration: timedelta = ocr_finished_datetime - ocr_started_datetime
         # NOTE(review): microseconds are not zero-padded and the value is
         # labelled 'ms'; the message is kept as is because its text may be
         # relied upon elsewhere -- confirm before changing.
         print(
             f"OCRedFile->save finished OCR '{ocr_duration.seconds}.{ocr_duration.microseconds}' ms {self.md5}"
         )
     if not ocr_settings.OCR_STORE_FILES:
         # NOTE(review): assumes the uploaded file already exists at
         # self.file.path at this point -- confirm against the storage used.
         os.remove(self.file.path)
     # update counters
     OCRedFile.Counters.num_created_instances += 1
     # checking database connection
     if not connection.is_usable():
         try:
             connection.connect()
         except Exception:
             # best effort: report and let the parent save() surface any
             # remaining connection problem
             print(f"database reconnection exception {self.md5}")
     # parent method: pass the caller's arguments through instead of
     # silently replacing them with the defaults
     super(OCRedFile, self).save(force_insert=force_insert,
                                 force_update=force_update,
                                 using=using,
                                 update_fields=update_fields)