def search_in_presentations(self):
    """Search every queued PowerPoint presentation for the search strings.

    Each path in ``self.results['files_to_search_inside']['presentation']``
    is opened through PowerPoint's COM interface. For every shape:

    * the non-empty paragraphs of its text frame are searched, and
    * when both ``search_in_doc_images`` and ``allow_OCR`` are set and the
      shape has one of the image-like shape types, the shape is exported
      to a temporary image, read with OCR, and the recognised text is
      searched as well.

    Hits are appended to the list stored at
    ``self.results['containing_file_paths'][search_string]['presentation']
    [str(file_path)]['Slide N']``. The slide size of each file is stored in
    ``self.results['file_slide_sizes'][file_path]`` as ``(width, height)``.
    """
    presentation_paths = self.results['files_to_search_inside']['presentation']
    search_strings = self.parameters['search_strings']
    case_sensitive = self.parameters['case_sensitive']
    whole_phrase_only = self.parameters['whole_phrase_only']
    # MsoShapeType values whose content is read with OCR.
    # NOTE(review): presumably chart (3), diagram (21), graphic (28),
    # linked picture (11) and picture (13) -- confirm against MsoShapeType.
    image_shape_types = (3, 21, 28, 11, 13)

    # The application object is deliberately not quit afterwards: Dispatch
    # may hand back a PowerPoint instance the user already has open.
    ppt_instance = win32com.client.Dispatch('PowerPoint.Application')

    for index_file, file_path in enumerate(presentation_paths):
        print('Searching in presentation file {} of {}...'.format(
            index_file + 1, len(presentation_paths)))
        read_only, has_title, window = False, False, False
        prs = ppt_instance.Presentations.open(file_path, read_only,
                                              has_title, window)
        try:
            self.results['file_slide_sizes'][file_path] = (
                prs.PageSetup.SlideWidth, prs.PageSetup.SlideHeight)

            for index_slide, slide in enumerate(prs.Slides):
                slide_string = 'Slide ' + str(index_slide + 1)
                for index_shape, shape in enumerate(slide.Shapes):
                    object_string = 'Object ' + str(index_shape + 1)

                    # --- Text held directly in the shape ---
                    if shape.HasTextFrame and shape.TextFrame.HasText:
                        # Skip paragraphs that are only a carriage return;
                        # paragraph numbers therefore count non-empty
                        # paragraphs only.
                        paragraph_texts = [
                            p.Text
                            for p in shape.TextFrame.TextRange.Paragraphs()
                            if p.Text != '\r'
                        ]
                        for index_paragraph, paragraph_text in enumerate(
                                paragraph_texts):
                            for search_string in search_strings:
                                occurrences = nm.count_text_occurrences(
                                    paragraph_text, search_string,
                                    case_sensitive, whole_phrase_only)
                                if occurrences > 0:
                                    combined_string = '{}, {}, {} {}'.format(
                                        object_string,
                                        'Paragraph ' + str(index_paragraph + 1),
                                        occurrences,
                                        'occurrence' if occurrences == 1
                                        else 'occurrences')
                                    file_hits = self.results[
                                        'containing_file_paths'][
                                            search_string]['presentation']
                                    file_hits.setdefault(
                                        str(file_path), {}).setdefault(
                                            slide_string, []).append(
                                                combined_string)

                    # --- Text inside image-like shapes, read with OCR ---
                    if (shape.Type in image_shape_types
                            and self.parameters['search_in_doc_images']
                            and self.parameters['allow_OCR']):
                        img_fp = str(
                            self.temp_directory / '{}_{}_{}.jpg'.format(
                                str(Path(file_path).stem).replace('.', ''),
                                slide_string, object_string))
                        # NOTE(review): the format argument 3 is kept from
                        # the original code although the file is named
                        # .jpg -- confirm the intended export format.
                        shape.Export(img_fp, 3)
                        try:
                            try:
                                image_text = OCR_utils.image_to_text(
                                    img_fp, language='eng')
                            except Exception:
                                # Best effort: an unreadable image simply
                                # yields no matches.
                                image_text = ''
                            for search_string in search_strings:
                                occurrences = nm.count_text_occurrences(
                                    image_text, search_string,
                                    case_sensitive, whole_phrase_only)
                                if occurrences > 0:
                                    combined_string = (
                                        '{} (image), {} {}'.format(
                                            object_string, occurrences,
                                            'occurrence' if occurrences == 1
                                            else 'occurrences'))
                                    file_hits = self.results[
                                        'containing_file_paths'][
                                            search_string]['presentation']
                                    file_hits.setdefault(
                                        str(file_path), {}).setdefault(
                                            slide_string, []).append(
                                                combined_string)
                        finally:
                            # Always remove the temporary export, even if
                            # the search above raised.
                            os.remove(img_fp)
        finally:
            # Release the document so it does not stay open in PowerPoint.
            prs.Close()
def search_in_pdfs(self, file_path):
    """Search one PDF file for the configured search strings.

    The text is first read page by page with tika. Three cases follow:

    1. The text looks encrypted and OCR is allowed: every page is rendered
       to a temporary image and searched through OCR.
    2. The text looks encrypted and OCR is not allowed: the file is
       recorded in ``self.results['failed_file_paths']['fancytext']``.
    3. Otherwise the tika text of each page is searched and, when both
       ``search_in_doc_images`` and ``allow_OCR`` are set, the images
       embedded in the PDF are additionally searched through OCR.

    Page hits are stored at
    ``self.results['containing_file_paths'][search_string]['fancytext']
    [str(file_path)]['page N']``; the step taken is appended to
    ``self.results['pdf_reading_steps'][file_path]``.

    Args:
        file_path: Path of the PDF file to search.
    """
    search_strings = self.parameters['search_strings']
    case_sensitive = self.parameters['case_sensitive']
    whole_phrase_only = self.parameters['whole_phrase_only']

    def record_page_hits(page_index, page_text):
        # Store the matching line numbers of one page for every search string.
        for search_string in search_strings:
            line_numbers = nm.count_text_occurrences(
                page_text, search_string, case_sensitive,
                whole_phrase_only, get_line_numbers=True)
            if len(line_numbers) > 0:
                file_hits = self.results['containing_file_paths'][
                    search_string]['fancytext']
                file_hits.setdefault(str(file_path), {})[
                    'page ' + str(page_index + 1)] = line_numbers

    # Read the text with tika and parse into individual pages with BeautifulSoup
    # TODO: test with 1 page pdf
    # The path is passed explicitly: the original call had no argument, so
    # the helper had no way of knowing which file to read.
    # NOTE(review): assumes tika_read accepts the path as its first
    # argument, like the other pdf/OCR helpers used here -- confirm.
    pages = pdf_utils.tika_read(file_path)

    # If the file contains 4 or more consecutive symbols, it is probably encrypted
    file_probably_encrypted = any(
        page_text is None or pdf_utils.is_probably_encrypted(
            page_text, n_consecutive_symbols=4)
        for page_text in pages)

    # 1. If probably encrypted and OCR allowed, analyse the pages with
    #    Tesseract OCR (Optical Character Recognition)
    if file_probably_encrypted and self.parameters['allow_OCR']:
        self.results['pdf_reading_steps'][file_path].append(
            'encrypted: used page OCR')
        # Convert pdf pages to image files and save list of their filepaths
        page_image_filepaths = OCR_utils.pdf_pages_to_images(
            file_path, self.temp_directory, 'jpg')
        try:
            # Use OCR on each page to get a text string for each
            for i, image_fp in enumerate(page_image_filepaths):
                page_text = OCR_utils.image_to_text(image_fp, language='eng')
                record_page_hits(i, page_text)
        finally:
            # Delete the temporary image files created, even if OCR or the
            # search failed part-way through.
            if page_image_filepaths:
                for fp in page_image_filepaths:
                    os.remove(fp)

    # 2. If probably encrypted but cannot use OCR, add to failed file paths store
    elif file_probably_encrypted:
        self.results['failed_file_paths']['fancytext'][str(file_path)] = (
            'File appears to be encrypted and OCR has not been '
            'allowed/is not available.')
        self.results['pdf_reading_steps'][file_path].append(
            'encrypted: OCR not allowed')

    # 3. If probably not encrypted, analyse the tika text of each page
    else:
        self.results['pdf_reading_steps'][file_path].append(
            'unencrypted: analyse tika text')
        for i, page_text in enumerate(pages):
            record_page_hits(i, page_text)

        # Check if the pdf has any images - if desired, these can be analysed
        # separately with OCR (not needed in encrypted case)
        if (self.parameters['search_in_doc_images']
                and self.parameters['allow_OCR']):
            self.results['pdf_reading_steps'][file_path].append(
                'unencrypted: search in images')
            n_images, saved_image_filepaths = (
                pdf_utils.count_extract_pdf_images(file_path,
                                                   save_images=True))
            if n_images > 0:
                for j, image_fp in enumerate(saved_image_filepaths):
                    image_text = OCR_utils.image_to_text(image_fp,
                                                         language='eng')
                    # Take the page number from the extracted image's own
                    # name; the PDF path is identical for every image and
                    # cannot identify a page.
                    # NOTE(review): presumes the image names end in
                    # '_page_<n>' -- confirm against
                    # pdf_utils.count_extract_pdf_images.
                    page_number = Path(image_fp).stem.split('_page_')[-1]
                    for search_string in search_strings:
                        occurrences = nm.count_text_occurrences(
                            image_text, search_string, case_sensitive,
                            whole_phrase_only)
                        if occurrences > 0:
                            file_hits = self.results[
                                'containing_file_paths'][search_string][
                                    'fancytext']
                            file_hits.setdefault(str(file_path), {})[
                                'image {} on page {}'.format(
                                    j + 1, page_number
                                )] = '{} occurrences'.format(occurrences)