Пример #1
0
    def search_in_presentations(self):
        """Search every queued PowerPoint file for the configured search strings.

        The files listed in
        ``self.results['files_to_search_inside']['presentation']`` are opened
        one at a time through PowerPoint's COM automation interface. For each
        file the slide size is stored in
        ``self.results['file_slide_sizes'][file_path]``. Hits are appended to
        ``self.results['containing_file_paths'][search_string]['presentation']
        [str(file_path)]['Slide N']`` as human-readable strings:

        * text hits: ``'Object N, Paragraph N, N occurrence(s)'``
        * image hits (OCR): ``'Object N (image), N occurrence(s)'``

        Picture-like shapes are only OCR'd when both the
        ``search_in_doc_images`` and ``allow_OCR`` parameters are truthy.

        Each presentation is closed again once it has been searched, even if
        the search raises. The PowerPoint application itself is left running.
        """
        presentation_paths = self.results['files_to_search_inside'][
            'presentation']
        search_strings = self.parameters['search_strings']
        case_sensitive = self.parameters['case_sensitive']
        whole_phrase_only = self.parameters['whole_phrase_only']
        ocr_images = (self.parameters['search_in_doc_images']
                      and self.parameters['allow_OCR'])

        # Shape.Type values treated as images. Per the MsoShapeType
        # enumeration: 3 chart, 21 diagram, 28 graphic, 11 linked picture,
        # 13 picture.
        picture_like_types = (3, 21, 28, 11, 13)

        def describe_count(count):
            # Singular for exactly one hit, plural otherwise.
            if count == 1:
                return str(count) + ' occurrence'
            return str(count) + ' occurrences'

        def record_hit(search_string, file_key, slide_label, description):
            # Create the per-file and per-slide containers on first use.
            hits_for_file = self.results['containing_file_paths'][
                search_string]['presentation'].setdefault(file_key, {})
            hits_for_file.setdefault(slide_label, []).append(description)

        ppt_instance = win32com.client.Dispatch('PowerPoint.Application')

        for index_file, file_path in enumerate(presentation_paths):
            print('Searching in presentation file {} of {}...'.format(
                index_file + 1, len(presentation_paths)))

            # Positional arguments of Presentations.Open after the file name:
            # ReadOnly, Untitled, WithWindow. ``Open`` is spelled with its
            # documented capitalisation so it also resolves on early-bound
            # (makepy-generated) wrappers, which are case-sensitive.
            read_only, untitled, with_window = False, False, False
            prs = ppt_instance.Presentations.Open(file_path, read_only,
                                                  untitled, with_window)
            try:
                self.results['file_slide_sizes'][file_path] = (
                    prs.PageSetup.SlideWidth, prs.PageSetup.SlideHeight)
                file_key = str(file_path)

                for index_slide, slide in enumerate(prs.Slides):
                    slide_string = 'Slide ' + str(index_slide + 1)

                    for index_shape, shape in enumerate(slide.Shapes):
                        object_string = 'Object ' + str(index_shape + 1)

                        if shape.HasTextFrame and shape.TextFrame.HasText:
                            # Paragraphs consisting only of a carriage return
                            # are dropped, so paragraph numbers count the
                            # remaining paragraphs only.
                            paragraphs = [
                                p.Text for p in
                                shape.TextFrame.TextRange.Paragraphs()
                                if p.Text != '\r'
                            ]
                            for index_paragraph, paragraph in enumerate(
                                    paragraphs):
                                paragraph_string = 'Paragraph ' + str(
                                    index_paragraph + 1)
                                for search_string in search_strings:
                                    occurrences = nm.count_text_occurrences(
                                        paragraph, search_string,
                                        case_sensitive, whole_phrase_only)
                                    if occurrences > 0:
                                        record_hit(
                                            search_string, file_key,
                                            slide_string,
                                            object_string + ', ' +
                                            paragraph_string + ', ' +
                                            describe_count(occurrences))

                        if ocr_images and shape.Type in picture_like_types:
                            img_fp = str(
                                self.temp_directory / '{}_{}_{}.jpg'.format(
                                    str(Path(file_path).stem).replace(
                                        '.', ''), slide_string,
                                    object_string))
                            # NOTE(review): export filter 3 looks like BMP in
                            # the PpShapeFormat enumeration although the file
                            # is named .jpg - confirm this is intended.
                            shape.Export(img_fp, 3)
                            try:
                                image_text = OCR_utils.image_to_text(
                                    img_fp, language='eng')
                            except Exception:
                                # OCR is best-effort: an unreadable image is
                                # treated as containing no text.
                                image_text = ''
                            finally:
                                # The temporary image is only needed for OCR.
                                if os.path.exists(img_fp):
                                    os.remove(img_fp)

                            for search_string in search_strings:
                                occurrences = nm.count_text_occurrences(
                                    image_text, search_string,
                                    case_sensitive, whole_phrase_only)
                                if occurrences > 0:
                                    record_hit(
                                        search_string, file_key,
                                        slide_string,
                                        object_string + ' (image), ' +
                                        describe_count(occurrences))
            finally:
                # Release the presentation so it does not stay open (and
                # locked) inside the PowerPoint instance.
                prs.Close()
Пример #2
0
    def search_in_pdfs(self, file_path):
        """Search one PDF file for the configured search strings.

        The page texts are read via ``pdf_utils.tika_read``. If any page is
        ``None`` or is flagged by ``pdf_utils.is_probably_encrypted`` (with
        ``n_consecutive_symbols=4``), the file is treated as probably
        encrypted. One of three paths is then taken, and the path taken is
        appended to ``self.results['pdf_reading_steps'][file_path]``:

        1. Probably encrypted and OCR allowed: each page is converted to an
           image and searched through OCR.
        2. Probably encrypted and OCR not allowed: the file is added to
           ``self.results['failed_file_paths']['fancytext']``.
        3. Otherwise: the extracted page texts are searched directly and, if
           both ``search_in_doc_images`` and ``allow_OCR`` are truthy, the
           images embedded in the PDF are searched through OCR as well.

        Hits are stored in ``self.results['containing_file_paths']
        [search_string]['fancytext'][str(file_path)]`` under ``'page N'``
        (value: the line numbers returned by ``nm.count_text_occurrences``)
        or ``'image N on page P'`` (value: ``'N occurrence(s)'``).

        Args:
            file_path: Path of the PDF file to search.
        """
        search_strings = self.parameters['search_strings']
        case_sensitive = self.parameters['case_sensitive']
        whole_phrase_only = self.parameters['whole_phrase_only']
        allow_ocr = self.parameters['allow_OCR']
        file_key = str(file_path)

        def record(search_string, location, value):
            # Create the per-file container on first use, then store the hit.
            hits_for_file = self.results['containing_file_paths'][
                search_string]['fancytext'].setdefault(file_key, {})
            hits_for_file[location] = value

        def search_page_text(page_index, page_text):
            # Record the matching line numbers of one page per search string.
            for search_string in search_strings:
                line_numbers = nm.count_text_occurrences(
                    page_text,
                    search_string,
                    case_sensitive,
                    whole_phrase_only,
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    record(search_string, 'page ' + str(page_index + 1),
                           line_numbers)

        # Read the text with tika and parse into individual pages.
        # The file to read is passed explicitly; previously nothing was passed.
        # NOTE(review): assumes tika_read takes the file path as its first
        # argument - confirm against pdf_utils.
        # TODO: test with 1 page pdf
        pages = pdf_utils.tika_read(file_path)

        # If a page contains 4 or more consecutive symbols (or has no text at
        # all), the file is probably encrypted.
        file_probably_encrypted = False
        for page_text in pages:
            if page_text is None or pdf_utils.is_probably_encrypted(
                    page_text, n_consecutive_symbols=4):
                file_probably_encrypted = True
                break

        reading_steps = self.results['pdf_reading_steps'][file_path]

        # 1. Probably encrypted and OCR allowed: OCR every page.
        if file_probably_encrypted and allow_ocr:
            reading_steps.append('encrypted: used page OCR')
            # Convert the pdf pages to image files and keep their file paths.
            page_image_filepaths = OCR_utils.pdf_pages_to_images(
                file_path, self.temp_directory, 'jpg')
            try:
                for i, image_fp in enumerate(page_image_filepaths):
                    page_text = OCR_utils.image_to_text(image_fp,
                                                        language='eng')
                    search_page_text(i, page_text)
            finally:
                # Delete the temporary page images even if OCR or the search
                # failed part-way through.
                if page_image_filepaths:
                    for fp in page_image_filepaths:
                        os.remove(fp)

        # 2. Probably encrypted but OCR not allowed: record the failure.
        elif file_probably_encrypted:
            self.results['failed_file_paths']['fancytext'][
                file_key] = 'File appears to be encrypted and OCR has not been allowed/is not available.'
            reading_steps.append('encrypted: OCR not allowed')

        # 3. Probably not encrypted: search the extracted text of each page.
        else:
            reading_steps.append('unencrypted: analyse tika text')
            for i, page_text in enumerate(pages):
                search_page_text(i, page_text)

            # If desired, the images embedded in the pdf are analysed
            # separately with OCR (not needed in the encrypted case, where
            # whole pages are already OCR'd).
            if self.parameters['search_in_doc_images'] and allow_ocr:
                reading_steps.append('unencrypted: search in images')
                n_images, saved_image_filepaths = pdf_utils.count_extract_pdf_images(
                    file_path, save_images=True)
                if n_images > 0:
                    for j, image_fp in enumerate(saved_image_filepaths):
                        image_text = OCR_utils.image_to_text(image_fp,
                                                             language='eng')
                        # The page number is parsed from the extracted
                        # image's file name, not from the pdf's own path.
                        # NOTE(review): assumes image file stems end in
                        # '_page_<n>' - confirm against
                        # pdf_utils.count_extract_pdf_images.
                        page_number = Path(image_fp).stem.split(
                            '_page_')[-1]
                        for search_string in search_strings:
                            occurrences = nm.count_text_occurrences(
                                image_text, search_string, case_sensitive,
                                whole_phrase_only)
                            if occurrences > 0:
                                # Singular for exactly one hit, matching the
                                # wording used for presentation results.
                                if occurrences == 1:
                                    occurrences_string = '{} occurrence'.format(
                                        occurrences)
                                else:
                                    occurrences_string = '{} occurrences'.format(
                                        occurrences)
                                record(
                                    search_string,
                                    'image {} on page {}'.format(
                                        j + 1, page_number),
                                    occurrences_string)