def decode_pdf(pdf_filepath, image_directory, image_format='PNG'):
    """Re-create a pdf whose pages are plain images of the original pages.

    Each page of the input pdf is rendered to an image, each image is turned
    back into a single-page pdf, and the pages are re-assembled into a new
    document. This strips any problematic text encoding from the original.

    Args:
        pdf_filepath: Path to the input file; must have a '.pdf' suffix.
        image_directory: Directory in which the intermediate page images
            are written (they are deleted before returning).
        image_format: Image format name forwarded to
            OCR_utils.pdf_pages_to_images (default 'PNG').

    Returns:
        str: Path of the new '<stem>_decoded.pdf' written next to the input.

    Raises:
        ValueError: If pdf_filepath does not end in '.pdf'.
    """
    src = Path(pdf_filepath)
    if src.suffix != '.pdf':
        # ValueError (a subclass of Exception) is more specific than the
        # original bare Exception and remains catchable by existing callers.
        raise ValueError('Must specify a \'.pdf\' file extension for input pdf_filepath.')

    # Render every page of the pdf to an image file.
    image_filepaths = OCR_utils.pdf_pages_to_images(
        pdf_filepath, image_directory, image_format=image_format)

    output = PdfFileWriter()
    pdf_page_fps = []
    # Fix: the original opened one file handle per page and never closed any
    # of them. The streams must stay open until output.write() completes
    # because PdfFileReader may read page data lazily.
    open_streams = []
    try:
        for image_fp in image_filepaths:
            pdf_page_fps.append(OCR_utils.image_to_pdf(image_fp))  # Create a pdf from the image
            stream = open(pdf_page_fps[-1], "rb")  # Open the image's pdf
            open_streams.append(stream)
            output.addPage(PdfFileReader(stream).getPage(0))  # Add the page to the new document

        new_filepath = str(src.parent / (src.stem + '_decoded' + src.suffix))
        with open(new_filepath, "wb") as output_stream:
            output.write(output_stream)
    finally:
        for stream in open_streams:
            stream.close()

    # Delete the temporary image files created.
    # NOTE(review): the single-page pdfs in pdf_page_fps are left on disk,
    # matching the original behaviour — confirm whether they should also be
    # removed here.
    for fp in image_filepaths:
        os.remove(fp)

    return new_filepath
def remove_greyscale_watermark(PDF_file_path, to_black_upperbound, to_white_lowerbound,
                               compression_factor=1, replacement_watermark='',
                               replacement_watermark_font='Arial',
                               replacement_watermark_text_size=20,
                               replacement_watermark_colour=(50, 50, 50, 255),
                               replacement_watermark_text_center=(200, 200),
                               replacement_watermark_rotation_angle=0,
                               output_file_path='', jpg_quality=75):
    """Remove a greyscale watermark from a pdf by thresholding pixel values.

    Every page is rendered to a BMP, each pixel is pushed to pure white when
    its value is >= to_white_lowerbound or pure black when it is
    <= to_black_upperbound (values in between are left untouched), an optional
    replacement watermark is stamped on, and the modified pages are
    re-assembled into a pdf.

    Args:
        PDF_file_path: Path to the input pdf.
        to_black_upperbound: Greyscale values at or below this become black.
        to_white_lowerbound: Greyscale values at or above this become white.
        compression_factor: Forwarded to OCR_utils.pdf_pages_to_images.
        replacement_watermark: Text to stamp on each page ('' for none).
        replacement_watermark_font: Font name for the replacement text.
        replacement_watermark_text_size: Point size of the replacement text.
        replacement_watermark_colour: RGBA colour of the replacement text.
        replacement_watermark_text_center: (x, y) centre of the text.
        replacement_watermark_rotation_angle: Rotation of the text in degrees.
        output_file_path: Destination forwarded to OCR_utils.images_to_pdf.
        jpg_quality: JPEG quality used when saving the modified pages.

    Returns:
        Whatever OCR_utils.images_to_pdf returns (the original discarded it).
    """
    # Render pages alongside the source pdf as BMPs so pixel access is lossless.
    image_fps = OCR_utils.pdf_pages_to_images(
        PDF_file_path, str(Path(PDF_file_path).parent), 'BMP',
        compression_factor=compression_factor)
    mod_image_fps = []
    try:
        for image_fp in image_fps:
            im = Image.open(image_fp)
            pix = im.load()
            width, height = im.size
            # Threshold every pixel. Only channel 0 is inspected because the
            # rendered pages are assumed greyscale (R == G == B) — TODO confirm.
            for i in range(width):
                for j in range(height):
                    value = pix[i, j][0]
                    if value >= to_white_lowerbound:
                        pix[i, j] = (255, 255, 255)
                    elif value <= to_black_upperbound:
                        pix[i, j] = (0, 0, 0)
            if replacement_watermark:
                # NOTE(review): `np` here is a project helper module (it has
                # add_text_line_to_image), not numpy — verify the import.
                fp, im = np.add_text_line_to_image(
                    im, replacement_watermark, replacement_watermark_text_center,
                    text_size=replacement_watermark_text_size,
                    text_box_pixel_width=0,
                    RGBA=replacement_watermark_colour,
                    text_background_RGBA=(0, 0, 0, 0),
                    text_box_RGBA=(0, 0, 0, 0),
                    rot_degrees=replacement_watermark_rotation_angle,
                    font_name=replacement_watermark_font,
                    show_result=False)
            # '[:-4]' strips the 4-character '.bmp' suffix, as in the original.
            mod_fp = image_fp[:-4] + '_mod.jpg'
            im.save(mod_fp, quality=jpg_quality)
            mod_image_fps.append(mod_fp)
        result = OCR_utils.images_to_pdf(mod_image_fps,
                                         output_file_path=output_file_path)
    finally:
        # Fix: cleanup now runs even if an intermediate step raises; the
        # original leaked every temporary image on failure.
        for fp in image_fps + mod_image_fps:
            if os.path.exists(fp):
                os.remove(fp)
    # Fix: propagate images_to_pdf's return value (the original returned None).
    return result
def search_in_pdfs(self, file_path):
    """Search a pdf for the configured search strings and record hits.

    The pdf's per-page text is obtained via tika. If the text looks encrypted
    (runs of consecutive symbols) the pages are instead rendered and OCR'd
    (when self.parameters['allow_OCR'] is set) or the file is recorded as
    failed. Hits are written into self.results['containing_file_paths']
    [search_string]['fancytext'][str(file_path)] keyed by page/image label;
    each processing decision is appended to
    self.results['pdf_reading_steps'][file_path].
    """
    # Read the text with tika and parse into individual pages with BeautifulSoup
    # TODO: test with 1 page pdf
    # NOTE(review): tika_read is called with no arguments — it looks like it
    # should receive file_path; verify against pdf_utils.tika_read's signature.
    pages = pdf_utils.tika_read()
    # If the file contains 4 or more consecutive symbols, it is probably encrypted
    file_probably_encrypted = False
    for page_text in pages:
        if page_text is None or pdf_utils.is_probably_encrypted(
                page_text, n_consecutive_symbols=4):
            file_probably_encrypted = True
            break
    # 1. If probably encrypted and OCR allowed, analyse the pages with Tesseract OCR (Optical Character Recognition)
    if file_probably_encrypted and self.parameters['allow_OCR']:
        self.results['pdf_reading_steps'][file_path].append(
            'encrypted: used page OCR')
        page_image_filepaths = OCR_utils.pdf_pages_to_images(
            file_path, self.temp_directory, 'jpg'
        )  # Convert pdf pages to image files and save list of their filepaths
        for i, image_fp in enumerate(
                page_image_filepaths
        ):  # Use OCR on each page to get a text string for each
            page_text = OCR_utils.image_to_text(image_fp, language='eng')
            for search_string in self.parameters['search_strings']:
                # Line numbers of every occurrence of search_string on this page.
                line_numbers = nm.count_text_occurrences(
                    page_text, search_string,
                    self.parameters['case_sensitive'],
                    self.parameters['whole_phrase_only'],
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    # First hit for this file: create its per-file sub-dict.
                    if str(file_path) not in list(
                            self.results['containing_file_paths']
                            [search_string]['fancytext'].keys()):
                        self.results['containing_file_paths'][
                            search_string]['fancytext'][str(
                                file_path)] = {}
                    # Record the hit under a human-readable 1-based page key.
                    self.results['containing_file_paths'][search_string][
                        'fancytext'][str(file_path)]['page ' +
                                                     str(i + 1)] = line_numbers
        # Delete the temporary image files created
        if page_image_filepaths:
            for fp in page_image_filepaths:
                os.remove(fp)
    # 2. If probably encrypted but cannot use OCR, add to failed file paths store
    elif file_probably_encrypted and not self.parameters['allow_OCR']:
        self.results['failed_file_paths']['fancytext'][(
            str(file_path)
        )] = 'File appears to be encrypted and OCR has not been allowed/is not available.'
        self.results['pdf_reading_steps'][file_path].append(
            'encrypted: OCR not allowed')
    # 3. If probably not encrypted, analyse the tika text of each page
    else:
        self.results['pdf_reading_steps'][file_path].append(
            'unencrypted: analyse tika text')
        for i, page_text in enumerate(pages):
            for search_string in self.parameters['search_strings']:
                line_numbers = nm.count_text_occurrences(
                    page_text, search_string,
                    self.parameters['case_sensitive'],
                    self.parameters['whole_phrase_only'],
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    # Same results-dict bookkeeping as the OCR branch above.
                    if str(file_path) not in list(
                            self.results['containing_file_paths']
                            [search_string]['fancytext'].keys()):
                        self.results['containing_file_paths'][
                            search_string]['fancytext'][str(
                                file_path)] = {}
                    self.results['containing_file_paths'][search_string][
                        'fancytext'][str(file_path)]['page ' +
                                                     str(i + 1)] = line_numbers
        # Check if the pdf has any images - if desired, these can be analysed separately with OCR (not needed in encrypted case)
        if self.parameters['search_in_doc_images'] and self.parameters[
                'allow_OCR']:
            self.results['pdf_reading_steps'][file_path].append(
                'unencrypted: search in images')
            n_images, saved_image_filepaths = pdf_utils.count_extract_pdf_images(
                file_path, save_images=True)
            if n_images > 0:
                for j, image_fp in enumerate(saved_image_filepaths):
                    image_text = OCR_utils.image_to_text(image_fp,
                                                         language='eng')
                    for search_string in self.parameters['search_strings']:
                        # Only a count here (no line numbers), unlike the
                        # page-text branches above.
                        occurrences = nm.count_text_occurrences(
                            image_text, search_string,
                            self.parameters['case_sensitive'],
                            self.parameters['whole_phrase_only'])
                        if occurrences > 0:
                            # Page number is recovered from the extracted
                            # image's '<stem>_page_N' filename convention —
                            # presumably set by count_extract_pdf_images;
                            # TODO confirm.
                            page_number = Path(file_path).stem.split(
                                '_page_')[-1]
                            if str(file_path) not in list(
                                    self.results['containing_file_paths']
                                    [search_string]['fancytext'].keys()):
                                self.results['containing_file_paths'][
                                    search_string]['fancytext'][str(
                                        file_path)] = {}
                            self.results['containing_file_paths'][
                                search_string]['fancytext'][str(
                                    file_path
                                )]['image {} on page {}'.format(
                                    j + 1, page_number
                                )] = '{} occurrences'.format(occurrences)