Пример #1
0
def decode_pdf(pdf_filepath, image_directory, image_format='PNG'):
    """Rebuild a PDF from rendered images of its own pages.

    Every page of ``pdf_filepath`` is rendered to an image with
    ``OCR_utils.pdf_pages_to_images``, each image is converted back into a
    PDF with ``OCR_utils.image_to_pdf``, and the first page of each of those
    PDFs is merged into a new document written next to the input as
    ``<stem>_decoded.pdf``.

    Args:
        pdf_filepath: Path of the input PDF. Its suffix must be exactly
            ``'.pdf'``.
        image_directory: Directory forwarded to
            ``OCR_utils.pdf_pages_to_images`` for the page images.
        image_format: Image format forwarded to
            ``OCR_utils.pdf_pages_to_images``.

    Returns:
        The path of the newly written PDF, as a string.

    Raises:
        ValueError: If ``pdf_filepath`` does not have a ``'.pdf'`` suffix.
    """
    source = Path(pdf_filepath)
    if source.suffix != '.pdf':
        raise ValueError('Must specify a \'.pdf\' file extension for input pdf_filepath.')
    image_filepaths = OCR_utils.pdf_pages_to_images(pdf_filepath, image_directory, image_format=image_format)

    new_filepath = str(source.parent / (source.stem + '_decoded' + source.suffix))
    output = PdfFileWriter()
    page_handles = []
    try:
        for image_fp in image_filepaths:
            page_pdf_fp = OCR_utils.image_to_pdf(image_fp)   # Create a pdf from the image
            # The handle is kept open until the merged document has been
            # written, because the writer may still read page data from it.
            handle = open(page_pdf_fp, "rb")
            page_handles.append(handle)
            output.addPage(PdfFileReader(handle).getPage(0))  # Add the page to the new document

        with open(new_filepath, "wb") as output_stream:
            output.write(output_stream)
    finally:
        for handle in page_handles:
            handle.close()
        # Delete the temporary image files created, even when a step failed.
        # NOTE(review): the per-page PDFs returned by OCR_utils.image_to_pdf
        # are left in place, as before - confirm whether they should be
        # removed as well.
        for fp in image_filepaths:
            os.remove(fp)

    return new_filepath
Пример #2
0
def remove_greyscale_watermark(PDF_file_path, to_black_upperbound, to_white_lowerbound,
                               compression_factor = 1,
                               replacement_watermark='',
                               replacement_watermark_font = 'Arial',
                               replacement_watermark_text_size=20,
                               replacement_watermark_colour=(50,50,50,255),
                               replacement_watermark_text_center = (200, 200),
                               replacement_watermark_rotation_angle=0,
                               output_file_path = '',
                               jpg_quality = 75):
    """Threshold the pages of a PDF to strip a grey watermark and rebuild it.

    Each page is rendered to a BMP next to the input file. Every pixel whose
    first channel is at or above ``to_white_lowerbound`` is set to white, and
    otherwise every pixel whose first channel is at or below
    ``to_black_upperbound`` is set to black; pixels in between are left
    untouched. The modified pages are saved as JPEGs and combined with
    ``OCR_utils.images_to_pdf``. All temporary image files are removed
    afterwards, including when a step fails.

    Args:
        PDF_file_path: Path of the input PDF.
        to_black_upperbound: First-channel values at or below this become
            ``(0, 0, 0)``.
        to_white_lowerbound: First-channel values at or above this become
            ``(255, 255, 255)``. This test is applied first.
        compression_factor: Forwarded to ``OCR_utils.pdf_pages_to_images``.
        replacement_watermark: Text drawn on every page when non-empty.
        replacement_watermark_font: Font name for the replacement text.
        replacement_watermark_text_size: Text size for the replacement text.
        replacement_watermark_colour: RGBA colour of the replacement text.
        replacement_watermark_text_center: Position passed to the text helper
            for the replacement text.
        replacement_watermark_rotation_angle: Rotation of the replacement
            text, in degrees.
        output_file_path: Forwarded to ``OCR_utils.images_to_pdf``.
        jpg_quality: ``quality`` value used when saving the modified pages.

    Returns:
        None.
    """
    image_fps = OCR_utils.pdf_pages_to_images(PDF_file_path, str(Path(PDF_file_path).parent), 'BMP', compression_factor=compression_factor)
    mod_image_fps = []
    try:
        for image_fp in image_fps:
            im = Image.open(image_fp)
            pix = im.load()
            width, height = im.size

            # Only the first channel is inspected; this assumes the rendered
            # pages are multi-band images whose pixels are tuples - TODO confirm.
            for i in range(width):
                for j in range(height):
                    level = pix[i, j][0]
                    if level >= to_white_lowerbound:
                        pix[i, j] = (255, 255, 255)
                    elif level <= to_black_upperbound:
                        pix[i, j] = (0, 0, 0)

            if replacement_watermark:
                # NOTE(review): `np` must be a project module that provides
                # add_text_line_to_image (NumPy has no such function) - confirm
                # the alias at the top of the file.
                _unused_fp, im = np.add_text_line_to_image(im, replacement_watermark, replacement_watermark_text_center,
                                text_size=replacement_watermark_text_size,
                                text_box_pixel_width = 0,
                                RGBA=replacement_watermark_colour,
                                text_background_RGBA = (0,0,0,0),
                                text_box_RGBA = (0,0,0,0),
                                rot_degrees=replacement_watermark_rotation_angle,
                                font_name = replacement_watermark_font,
                                show_result = False)

            # Strip the real extension instead of assuming it is 3 characters.
            mod_fp = os.path.splitext(image_fp)[0] + '_mod.jpg'
            # Registered before saving so a partially written file is cleaned up too.
            mod_image_fps.append(mod_fp)
            im.save(mod_fp, quality=jpg_quality)

        OCR_utils.images_to_pdf(mod_image_fps, output_file_path=output_file_path)
    finally:
        # Delete the temporary image files created
        for fp in image_fps + mod_image_fps:
            if os.path.exists(fp):
                os.remove(fp)
Пример #3
0
    def search_in_pdfs(self, file_path):
        """Search one PDF for every configured search string and record hits.

        The page texts are inspected first: if any page is ``None`` or is
        flagged by ``pdf_utils.is_probably_encrypted``, the file is treated
        as probably encrypted. Then one of three paths is taken:

        1. Probably encrypted and ``self.parameters['allow_OCR']`` is truthy:
           pages are rendered to images in ``self.temp_directory`` and the
           OCR text of each page is searched.
        2. Probably encrypted and OCR not allowed: the file is recorded in
           ``self.results['failed_file_paths']['fancytext']``.
        3. Otherwise: the extracted page texts are searched, and, when both
           ``search_in_doc_images`` and ``allow_OCR`` are set, the images
           embedded in the PDF are OCR'd and searched as well.

        Hits are stored under
        ``self.results['containing_file_paths'][search_string]['fancytext'][str(file_path)]``
        keyed by ``'page N'`` (value: line numbers) or
        ``'image J on page P'`` (value: ``'K occurrences'``). The step taken
        is appended to ``self.results['pdf_reading_steps'][file_path]``.

        Args:
            file_path: Path of the PDF to search.

        Returns:
            None. Results are written into ``self.results``.
        """

        def record_hit(search_string, label, value):
            # Store one hit, creating the per-file dict on first use.
            per_file = self.results['containing_file_paths'][search_string]['fancytext']
            per_file.setdefault(str(file_path), {})[label] = value

        def search_page_text(page_index, page_text):
            # Look for every search string in one page and record line numbers.
            for search_string in self.parameters['search_strings']:
                line_numbers = nm.count_text_occurrences(
                    page_text,
                    search_string,
                    self.parameters['case_sensitive'],
                    self.parameters['whole_phrase_only'],
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    record_hit(search_string, 'page ' + str(page_index + 1), line_numbers)

        # Read the text with tika and parse into individual pages with BeautifulSoup
        # TODO: test with 1 page pdf
        # NOTE(review): file_path is not passed to tika_read() here, which
        # looks unintended - confirm the helper's signature before changing it.
        pages = pdf_utils.tika_read()

        # If the file contains 4 or more consecutive symbols, it is probably encrypted
        file_probably_encrypted = any(
            page_text is None or pdf_utils.is_probably_encrypted(
                page_text, n_consecutive_symbols=4)
            for page_text in pages)

        # 1. If probably encrypted and OCR allowed, analyse the pages with Tesseract OCR (Optical Character Recognition)
        if file_probably_encrypted and self.parameters['allow_OCR']:
            self.results['pdf_reading_steps'][file_path].append(
                'encrypted: used page OCR')
            # Convert pdf pages to image files and save list of their filepaths
            page_image_filepaths = OCR_utils.pdf_pages_to_images(
                file_path, self.temp_directory, 'jpg')
            try:
                # Use OCR on each page to get a text string for each
                for i, image_fp in enumerate(page_image_filepaths):
                    search_page_text(
                        i, OCR_utils.image_to_text(image_fp, language='eng'))
            finally:
                # Delete the temporary image files created, even on failure
                for fp in page_image_filepaths:
                    os.remove(fp)
        # 2. If probably encrypted but cannot use OCR, add to failed file paths store
        elif file_probably_encrypted:
            self.results['failed_file_paths']['fancytext'][(
                str(file_path)
            )] = 'File appears to be encrypted and OCR has not been allowed/is not available.'
            self.results['pdf_reading_steps'][file_path].append(
                'encrypted: OCR not allowed')
        # 3. If probably not encrypted, analyse the tika text of each page
        else:
            self.results['pdf_reading_steps'][file_path].append(
                'unencrypted: analyse tika text')
            for i, page_text in enumerate(pages):
                search_page_text(i, page_text)

            # Check if the pdf has any images - if desired, these can be analysed separately with OCR (not needed in encrypted case)
            if self.parameters['search_in_doc_images'] and self.parameters[
                    'allow_OCR']:
                self.results['pdf_reading_steps'][file_path].append(
                    'unencrypted: search in images')
                n_images, saved_image_filepaths = pdf_utils.count_extract_pdf_images(
                    file_path, save_images=True)
                if n_images > 0:
                    for j, image_fp in enumerate(saved_image_filepaths):
                        image_text = OCR_utils.image_to_text(image_fp,
                                                             language='eng')
                        # The page number is parsed from the extracted image's
                        # own file name, not from the PDF's name.
                        # Assumes the extractor embeds '_page_<n>' in each
                        # image name - TODO confirm against
                        # pdf_utils.count_extract_pdf_images.
                        page_number = Path(image_fp).stem.split('_page_')[-1]
                        for search_string in self.parameters['search_strings']:
                            occurrences = nm.count_text_occurrences(
                                image_text, search_string,
                                self.parameters['case_sensitive'],
                                self.parameters['whole_phrase_only'])
                            if occurrences > 0:
                                record_hit(
                                    search_string,
                                    'image {} on page {}'.format(
                                        j + 1, page_number),
                                    '{} occurrences'.format(occurrences))