def get_input_to_mapping(input_image_file, methods_to_account,
                         dict_preparation):
    """Run the requested automatic analysers on a single image.

    Args:
        input_image_file: Path (or path string) of the image to analyse.
        methods_to_account: Iterable of method names. The recognised values
            are "semantic_segmentation", "OCR" and "scene_recognition";
            any other value is silently ignored.
        dict_preparation: Resources prepared beforehand. The "OCR" branch
            reads its "words_dict" and "ss" entries; the
            "scene_recognition" branch forwards the whole dict to
            ``eval_u.get_predictions``.

    Returns:
        A 3-tuple ``(semantic_segmentation, OCR, scene)`` of lists. Each
        list receives one entry every time the matching method name occurs
        in ``methods_to_account``.
    """
    inputs_to_mapping_semantic_segmentation = []
    inputs_to_mapping_OCR = []
    inputs_to_mapping_scene = []

    for automatic_method in methods_to_account:
        if automatic_method == "semantic_segmentation":
            # Get the outputs of the semantic segmentation.
            print("TODO: choose whether to get the probability")
            # Only the prepared image and its shapes are needed below; the
            # image name and ratio returned by prepare_sample are unused.
            img, _img_name, _img_ratio, img_shapes = eval_u.prepare_sample(
                Path(input_image_file), 'deeplab', True)
            # Get predictions.
            model_deeplab = eval_u.load_model('deeplab')
            output_pred_deeplab = eval_u.get_predictions([img], 'deeplab',
                                                         model_deeplab)
            # Post-process the first prediction to polygons.
            inputs = dl_u.deeplab_pred_to_output(output_pred_deeplab[0][1],
                                                 False, True,
                                                 output_pred_deeplab[0][0],
                                                 True, img_shapes)
            inputs_to_mapping_semantic_segmentation.append(inputs)

        elif automatic_method == "OCR":
            # Get the outputs of the Optical Character Recognition.
            output_pred_OCR = eval_u.get_predictions([str(input_image_file)],
                                                     'OCR', True, False)
            # Expand the readings to account for misspellings.
            inputs = OCR_u.accountForMisspellings(
                output_pred_OCR[0], dict_preparation["words_dict"],
                dict_preparation["ss"])
            inputs_to_mapping_OCR.append(inputs)

        elif automatic_method == "scene_recognition":
            output_pred = eval_u.get_predictions([str(input_image_file)],
                                                 'vgg_places365', True, False,
                                                 dict_preparation)
            inputs_to_mapping_scene.append(output_pred)

    return (inputs_to_mapping_semantic_segmentation, inputs_to_mapping_OCR,
            inputs_to_mapping_scene)
Пример #2
0
def decode_pdf(pdf_filepath, image_directory,image_format='PNG'):
    """Rebuild a PDF from rendered images of its own pages.

    Every page is rendered to an image with
    ``OCR_utils.pdf_pages_to_images``, each image is turned back into a
    one-page PDF with ``OCR_utils.image_to_pdf``, and those pages are merged
    into ``<stem>_decoded.pdf`` in the same directory as the input.

    Args:
        pdf_filepath: Path of the source PDF; its suffix must be exactly
            ``.pdf``.
        image_directory: Directory handed to ``pdf_pages_to_images`` for the
            intermediate page images.
        image_format: Image format handed to ``pdf_pages_to_images``.

    Returns:
        The path of the newly written PDF, as a string.

    Raises:
        Exception: If ``pdf_filepath`` does not end in ``.pdf``.
    """
    from contextlib import ExitStack

    source = Path(pdf_filepath)
    if source.suffix != '.pdf':
        raise Exception('Must specify a \'.pdf\' file extension for input pdf_filepath.')
    image_filepaths = OCR_utils.pdf_pages_to_images(pdf_filepath, image_directory, image_format=image_format)

    new_filepath = str(source.parent / (source.stem + '_decoded' + source.suffix))
    try:
        output = PdfFileWriter()
        # The per-page PDFs are kept open until the merged document has been
        # written (the reader may still pull page data from its stream at
        # write time), then all of them are closed together.
        with ExitStack() as open_pages:
            for image_fp in image_filepaths:
                page_pdf_fp = OCR_utils.image_to_pdf(image_fp)   # Create a pdf from the image
                page_stream = open_pages.enter_context(open(page_pdf_fp, "rb"))
                output.addPage(PdfFileReader(page_stream).getPage(0))
            with open(new_filepath, "wb") as output_stream:
                output.write(output_stream)
    finally:
        # Delete the temporary image files created, even if merging failed.
        if image_filepaths:
            for fp in image_filepaths:
                os.remove(fp)

    return new_filepath
Пример #3
0
def remove_greyscale_watermark(PDF_file_path, to_black_upperbound, to_white_lowerbound,
                               compression_factor = 1,
                               replacement_watermark='',
                               replacement_watermark_font = 'Arial',
                               replacement_watermark_text_size=20,
                               replacement_watermark_colour=(50,50,50,255),
                               replacement_watermark_text_center = (200, 200),
                               replacement_watermark_rotation_angle=0,
                               output_file_path = '',
                               jpg_quality = 75):
    """Threshold every page of a PDF to strip a mid-grey watermark.

    Pages are rendered to BMP images next to the PDF. On each page, pixels
    whose first channel is at or above ``to_white_lowerbound`` become white
    and pixels whose first channel is at or below ``to_black_upperbound``
    become black; everything in between is left alone. Optionally a
    replacement text watermark is drawn, then the pages are saved as JPEG
    and handed to ``OCR_utils.images_to_pdf``. All intermediate images are
    deleted at the end.

    Args:
        PDF_file_path: Path of the PDF to process.
        to_black_upperbound: First-channel values <= this become (0, 0, 0).
        to_white_lowerbound: First-channel values >= this become
            (255, 255, 255); this test is applied first.
        compression_factor: Forwarded to ``OCR_utils.pdf_pages_to_images``.
        replacement_watermark: Text to draw on each page; nothing is drawn
            when empty.
        replacement_watermark_font: Font name for the replacement text.
        replacement_watermark_text_size: Text size for the replacement text.
        replacement_watermark_colour: RGBA colour of the replacement text.
        replacement_watermark_text_center: Centre position of the text.
        replacement_watermark_rotation_angle: Rotation of the text, degrees.
        output_file_path: Forwarded to ``OCR_utils.images_to_pdf``.
        jpg_quality: JPEG quality used when saving the modified pages.
    """
    pdf_directory = str(Path(PDF_file_path).parent)
    image_fps = OCR_utils.pdf_pages_to_images(PDF_file_path, pdf_directory, 'BMP', compression_factor=compression_factor)
    mod_image_fps = []

    white, black = (255,255,255), (0, 0, 0)
    for image_fp in image_fps:
        im = Image.open(image_fp)
        pix = im.load()
        width, height = im.size

        # Push near-white pixels to white and near-black pixels to black;
        # only the first channel is inspected.
        for x in range(width):
            for y in range(height):
                level = pix[x, y][0]
                if level >= to_white_lowerbound:
                    pix[x, y] = white
                elif level <= to_black_upperbound:
                    pix[x, y] = black

        if replacement_watermark:
            # The helper returns (file path, image); only the image is used.
            _watermarked_fp, im = np.add_text_line_to_image(
                im, replacement_watermark, replacement_watermark_text_center,
                text_size=replacement_watermark_text_size,
                text_box_pixel_width = 0,
                RGBA=replacement_watermark_colour,
                text_background_RGBA = (0,0,0,0),
                text_box_RGBA = (0,0,0,0),
                rot_degrees=replacement_watermark_rotation_angle,
                font_name = replacement_watermark_font,
                show_result = False)

        # Drop the last four characters of the page path (its extension)
        # and save the modified page beside it as a JPEG.
        mod_image_fp = image_fp[:-4]+'_mod.jpg'
        im.save(mod_image_fp, quality=jpg_quality)
        mod_image_fps.append(mod_image_fp)

    OCR_utils.images_to_pdf(mod_image_fps, output_file_path=output_file_path)

    # Delete the temporary image files created.
    if image_fps or mod_image_fps:
        for temporary_fp in image_fps+mod_image_fps:
            os.remove(temporary_fp)
def prepare_needed_elements(methods_to_account):
    """Prepare the shared resources required by the requested methods.

    Args:
        methods_to_account: Collection of method names. Only "OCR" needs
            preparation here.

    Returns:
        A dict with the keys "ss" and "words_dict". Both hold the values
        returned by ``OCR_u.prepareDictForMisspellings()`` when "OCR" is
        requested, and ``None`` otherwise.
    """
    # Bind both names up front so the return statement also works when OCR
    # was not requested.
    ss, words_dict = None, None
    if "OCR" in methods_to_account:
        ss, words_dict = OCR_u.prepareDictForMisspellings()
    return {"ss": ss, "words_dict": words_dict}
# Segmentation labels whose polygons are treated as potentially private.
# NOTE(review): three entries end with ", " or a trailing space and are
# followed by what looks like the remainder of the same label ("bus, ...",
# "computer, ...", "television receiver, ..."). They may be single labels
# that were split by accident -- confirm against the labels the model emits.
_PRIVATE_DEEPLAB_LABELS = (
    "person, individual, someone, somebody, mortal, soul",
    "car, auto, automobile, machine, motorcar",
    "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, ",
    "motorcoach, omnibus, passenger vehicle",
    "truck, motortruck",
    "van",
    "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
    "minibike, motorbike",
    "bicycle, bike, wheel, cycle",
    "poster, posting, placard, notice, bill, card",
    "signboard sign",
    "bulletin board, notice board",
    "screen door, screen",
    "screen, silver screen, projection screen",
    "crt screen",
    "plate",
    "monitor, monitoring device",
    "bookcase",
    "blind, screen",
    "book",
    "computer, computing machine, computing device, data processor ",
    "electronic computer, information processing system",
    "television receiver, television, television set, tv, tv set, idiot ",
    "trade name, brand name, brand, marque",
    "flag",
)


def _text_box_to_polygon(poly_text):
    """Build the rectangle Polygon spanned by an OCR box.

    The box is read as four numbers; (poly_text[0], poly_text[1]) and
    (poly_text[2], poly_text[3]) are used as opposite corners.
    """
    return Polygon([(poly_text[0], poly_text[1]),
                    (poly_text[2], poly_text[1]),
                    (poly_text[2], poly_text[3]),
                    (poly_text[0], poly_text[3])])


def _ocr_value_is_private(potential_value, needed_elements):
    """Tell whether one OCR reading must be obfuscated under "simple_rule"."""
    # Ignore readings that are a single character or only whitespace.
    string_without_space = potential_value.translate(
        {ord(c): None for c in string.whitespace})
    if len(string_without_space) <= 1:
        return False

    # Obfuscate any number. Every digit character is counted (repeated
    # digits included), so "1111" counts as four digits. The threshold is a
    # parameter to tune: four digits already covers a year.
    if sum(ch.isdigit() for ch in potential_value) > 3:
        return True

    # Obfuscate anything recognised as a location, person or organization
    # by either named-entity recogniser.
    continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
    continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
    recognized_entities = [chunk[1] for chunk in continuous_chunk_1]
    recognized_entities += [chunk[1] for chunk in continuous_chunk_2]
    if any(tag in recognized_entities
           for tag in ("LOCATION", "PERSON", "ORGANIZATION", "GPE")):
        return True

    # Obfuscate words that closely match an entry of the reference list,
    # trying several casings of each word.
    for value in potential_value.split():
        for word in (value, value.upper(), value.lower(), value.title()):
            similar_words = difflib.get_close_matches(
                word, needed_elements["lines"], n=3, cutoff=0.9)
            if len(similar_words) > 0:
                return True
    return False


def ruleBasedMapping(type_semantic_seg_rule, type_OCR_rule, type_scene_rule,
                     list_semantic_segmentation, list_OCR, list_scene,
                     needed_elements):
    """Select the polygons to obfuscate from the analysers' outputs.

    Args:
        type_semantic_seg_rule: Rule for segmentation output; only
            "simple_list" is implemented.
        type_OCR_rule: Rule for OCR output: "simple_rule" (numbers, named
            entities, close matches to a reference list) or "simplest_rule"
            (every non-blank text box).
        type_scene_rule: Unused by this function.
        list_semantic_segmentation: Items indexed as ``item[1]`` (label) and
            ``item[0]`` (iterable of elements whose ``[0]`` has an ``area``).
        list_OCR: Items indexed as ``item[0]`` (box of four coordinates) and
            ``item[1]`` (iterable of candidate readings for that box).
        list_scene: Unused by this function.
        needed_elements: Mapping; "simple_rule" reads its "lines" entry as
            the reference word list.

    Returns:
        A list of polygons to obfuscate: the segmentation polygons that pass
        the label and area filters, followed by at most one rectangle per
        OCR box.
    """
    polys_to_obfuscate = []

    if len(list_semantic_segmentation) > 0 and type_semantic_seg_rule == "simple_list":
        print("Dealing with the polygons from semantic segmentation.")
        for poly in list_semantic_segmentation:
            if poly[1] not in _PRIVATE_DEEPLAB_LABELS:
                continue
            for poly_elem in poly[0]:
                # Skip polygons too small to show anything on the image.
                if poly_elem[0].area > 4.0:
                    polys_to_obfuscate.append(poly_elem[0])

    if len(list_OCR) > 0:
        if type_OCR_rule == "simple_rule":
            print("Dealing with the polygons from OCR.")
            for text_recognized in list_OCR:
                poly_text = text_recognized[0]
                # One private reading is enough to hide the whole box.
                if any(_ocr_value_is_private(potential_value, needed_elements)
                       for potential_value in text_recognized[1]):
                    polys_to_obfuscate.append(_text_box_to_polygon(poly_text))

        elif type_OCR_rule == "simplest_rule":
            print("Dealing with the polygons from OCR.")
            # Obfuscate every box that has at least one non-blank reading.
            for text_recognized in list_OCR:
                poly_text = text_recognized[0]
                if any(potential_value.strip()
                       for potential_value in text_recognized[1]):
                    polys_to_obfuscate.append(_text_box_to_polygon(poly_text))

    return polys_to_obfuscate
Пример #6
0
    def search_in_presentations(self):
        """Search every queued presentation for the configured strings.

        Each file in ``self.results['files_to_search_inside']['presentation']``
        is opened through the PowerPoint COM application. Its slide size is
        stored in ``self.results['file_slide_sizes']``. Hits are stored in
        ``self.results['containing_file_paths'][search_string]['presentation']``
        as ``{str(file_path): {'Slide N': [description, ...]}}``.

        Text frames are searched paragraph by paragraph. When both the
        'search_in_doc_images' and 'allow_OCR' parameters are set, shapes of
        the listed types are exported to a temporary image, read with OCR
        and searched as well; the temporary image is always removed.

        NOTE(review): opened presentations are never closed here -- confirm
        whether the caller is expected to close them.
        """

        def describe_count(occurrences):
            # "1 occurrence" versus "N occurrences".
            if occurrences == 1:
                return str(occurrences) + ' occurrence'
            return str(occurrences) + ' occurrences'

        def record_hit(search_string, file_path, slide_string, description):
            # Create the per-file and per-slide containers on first use.
            per_file = self.results['containing_file_paths'][search_string][
                'presentation'].setdefault(str(file_path), {})
            per_file.setdefault(slide_string, []).append(description)

        ppt_instance = win32com.client.Dispatch('PowerPoint.Application')
        presentation_paths = self.results['files_to_search_inside'][
            'presentation']
        for index_file, file_path in enumerate(presentation_paths):
            print('Searching in presentation file {} of {}...'.format(
                index_file + 1, len(presentation_paths)))
            read_only, has_title, window = False, False, False
            prs = ppt_instance.Presentations.open(file_path, read_only,
                                                  has_title, window)
            self.results['file_slide_sizes'][file_path] = (
                prs.PageSetup.SlideWidth, prs.PageSetup.SlideHeight)

            for index_slide, Slide in enumerate(prs.Slides):
                slide_string = 'Slide ' + str(index_slide + 1)
                for index_shape, Shape in enumerate(Slide.Shapes):
                    object_string = 'Object ' + str(index_shape + 1)

                    if Shape.HasTextFrame and Shape.TextFrame.HasText:
                        # Paragraphs that are only a carriage return are
                        # dropped before numbering.
                        paragraph_texts = [
                            p.Text
                            for p in Shape.TextFrame.TextRange.Paragraphs()
                            if p.Text != '\r'
                        ]
                        for index_paragraph, paragraph_text in enumerate(
                                paragraph_texts):
                            paragraph_string = 'Paragraph ' + str(
                                index_paragraph + 1)
                            for search_string in self.parameters[
                                    'search_strings']:
                                occurrences = nm.count_text_occurrences(
                                    paragraph_text, search_string,
                                    self.parameters['case_sensitive'],
                                    self.parameters['whole_phrase_only'])
                                if occurrences > 0:
                                    record_hit(
                                        search_string, file_path, slide_string,
                                        object_string + ', ' +
                                        paragraph_string + ', ' +
                                        describe_count(occurrences))

                    # NOTE(review): the numbers are presumably MsoShapeType
                    # codes for picture-like shapes -- confirm.
                    if Shape.Type in [3, 21, 28, 11, 13] and self.parameters[
                            'search_in_doc_images'] and self.parameters[
                                'allow_OCR']:
                        img_fp = str(
                            self.temp_directory / '{}_{}_{}.jpg'.format(
                                str(Path(file_path).stem).replace('.', ''),
                                slide_string, object_string))
                        # NOTE(review): 3 is the export format code while the
                        # file is named .jpg -- confirm the code means JPEG.
                        Shape.Export(img_fp, 3)
                        try:
                            try:
                                image_text = OCR_utils.image_to_text(
                                    img_fp, language='eng')
                            except Exception:
                                # Best effort: an unreadable image is treated
                                # as containing no text.
                                image_text = ''
                            for search_string in self.parameters[
                                    'search_strings']:
                                occurrences = nm.count_text_occurrences(
                                    image_text, search_string,
                                    self.parameters['case_sensitive'],
                                    self.parameters['whole_phrase_only'])
                                if occurrences > 0:
                                    record_hit(
                                        search_string, file_path, slide_string,
                                        object_string + ' (image), ' +
                                        describe_count(occurrences))
                        finally:
                            # Remove the temporary image even if the search
                            # raised.
                            os.remove(img_fp)
Пример #7
0
    def search_in_pdfs(self, file_path):
        """Search one PDF for the configured search strings.

        Three cases are handled and logged in
        ``self.results['pdf_reading_steps'][file_path]``:

        1. The extracted text looks encrypted and OCR is allowed: every page
           is rendered to a temporary image and read with OCR.
        2. The text looks encrypted and OCR is not allowed: the file is
           recorded in ``self.results['failed_file_paths']['fancytext']``.
        3. Otherwise the extracted page texts are searched directly and,
           when 'search_in_doc_images' and 'allow_OCR' are both set, the
           images embedded in the PDF are read with OCR as well.

        Hits are stored in
        ``self.results['containing_file_paths'][search_string]['fancytext']``
        under ``str(file_path)``.

        Args:
            file_path: Path of the PDF file to search.
        """

        def hits_for_file(search_string):
            # Create the per-file container on first use.
            return self.results['containing_file_paths'][search_string][
                'fancytext'].setdefault(str(file_path), {})

        def record_page_hits(page_index, page_text):
            # Store the matching line numbers of one page per search string.
            for search_string in self.parameters['search_strings']:
                line_numbers = nm.count_text_occurrences(
                    page_text,
                    search_string,
                    self.parameters['case_sensitive'],
                    self.parameters['whole_phrase_only'],
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    hits_for_file(search_string)[
                        'page ' + str(page_index + 1)] = line_numbers

        # Read the text with tika and parse into individual pages.
        # TODO: test with 1 page pdf
        # NOTE(review): tika_read() is called without file_path -- confirm
        # this is intended.
        pages = pdf_utils.tika_read()

        # If a page has no text or contains 4 or more consecutive symbols,
        # the file is probably encrypted.
        file_probably_encrypted = any(
            page_text is None or pdf_utils.is_probably_encrypted(
                page_text, n_consecutive_symbols=4) for page_text in pages)

        # 1. Probably encrypted and OCR allowed: OCR every page image.
        if file_probably_encrypted and self.parameters['allow_OCR']:
            self.results['pdf_reading_steps'][file_path].append(
                'encrypted: used page OCR')
            page_image_filepaths = OCR_utils.pdf_pages_to_images(
                file_path, self.temp_directory, 'jpg')
            try:
                for i, image_fp in enumerate(page_image_filepaths):
                    record_page_hits(
                        i, OCR_utils.image_to_text(image_fp, language='eng'))
            finally:
                # Delete the temporary image files created, even on error.
                if page_image_filepaths:
                    for fp in page_image_filepaths:
                        os.remove(fp)
        # 2. Probably encrypted but OCR not allowed: record the failure.
        elif file_probably_encrypted:
            self.results['failed_file_paths']['fancytext'][(
                str(file_path)
            )] = 'File appears to be encrypted and OCR has not been allowed/is not available.'
            self.results['pdf_reading_steps'][file_path].append(
                'encrypted: OCR not allowed')
        # 3. Probably not encrypted: analyse the tika text of each page.
        else:
            self.results['pdf_reading_steps'][file_path].append(
                'unencrypted: analyse tika text')
            for i, page_text in enumerate(pages):
                record_page_hits(i, page_text)

            # If desired, the images inside the pdf are analysed separately
            # with OCR (not needed in the encrypted case).
            if self.parameters['search_in_doc_images'] and self.parameters[
                    'allow_OCR']:
                self.results['pdf_reading_steps'][file_path].append(
                    'unencrypted: search in images')
                n_images, saved_image_filepaths = pdf_utils.count_extract_pdf_images(
                    file_path, save_images=True)
                if n_images > 0:
                    for j, image_fp in enumerate(saved_image_filepaths):
                        image_text = OCR_utils.image_to_text(image_fp,
                                                             language='eng')
                        for search_string in self.parameters['search_strings']:
                            occurrences = nm.count_text_occurrences(
                                image_text, search_string,
                                self.parameters['case_sensitive'],
                                self.parameters['whole_phrase_only'])
                            if occurrences > 0:
                                # NOTE(review): the page number is taken from
                                # file_path, not image_fp -- confirm which
                                # name carries the '_page_' suffix.
                                page_number = Path(file_path).stem.split(
                                    '_page_')[-1]
                                hits_for_file(search_string)[
                                    'image {} on page {}'.format(
                                        j + 1, page_number
                                    )] = '{} occurrences'.format(occurrences)
Пример #8
0
def _ocr_bbox_polygon(poly_text):
    """Build the rectangle Polygon spanned by an OCR box.

    The box is read as four numbers; (poly_text[0], poly_text[1]) and
    (poly_text[2], poly_text[3]) are used as opposite corners.
    """
    return Polygon([(poly_text[0], poly_text[1]),
                    (poly_text[2], poly_text[1]),
                    (poly_text[2], poly_text[3]),
                    (poly_text[0], poly_text[3])])


def _ocr_value_category(potential_value, needed_elements):
    """Return the privacy category of one OCR reading, or None if harmless."""
    # Ignore readings that are a single character or only whitespace.
    string_without_space = potential_value.translate(
        {ord(c): None for c in string.whitespace})
    if len(string_without_space) <= 1:
        return None

    # Any number: every digit character is counted (repeated digits
    # included), so "1111" counts as four digits. The threshold is a
    # parameter to tune: four digits already covers a year.
    if sum(ch.isdigit() for ch in potential_value) > 3:
        return "hasNumbers"

    # Named entities found by either recogniser; when several kinds are
    # present the first one in this priority order is reported.
    continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
    continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
    recognized_entities = [chunk[1] for chunk in continuous_chunk_1]
    recognized_entities += [chunk[1] for chunk in continuous_chunk_2]
    for tag in ("LOCATION", "PERSON", "ORGANIZATION", "GPE"):
        if tag in recognized_entities:
            return tag

    # Close matches against the reference lists of names and locations,
    # trying several casings of each word.
    for value in potential_value.split():
        for word in (value, value.upper(), value.lower(), value.title()):
            if len(word) <= 3:
                # Design choice: short words are not matched.
                continue
            if difflib.get_close_matches(
                    word, needed_elements["lines_names"], n=3, cutoff=0.9):
                return "PERSON"
            if difflib.get_close_matches(
                    word, needed_elements["lines_location"], n=3,
                    cutoff=0.9):
                return "LOCATION"
    return None


def postProcessOCROutputs(OCR_outputs, needed_elements):
    """Tag OCR text boxes that contain potentially private content.

    Args:
        OCR_outputs: Items indexed as ``item[0]`` (box of four coordinates)
            and ``item[1]`` (iterable of candidate readings for that box).
        needed_elements: Mapping whose "lines_names" and "lines_location"
            entries are the reference word lists for close matching.

    Returns:
        A list of ``(polygon, reading, category)`` tuples, at most one per
        box, where category is "hasNumbers", "LOCATION", "PERSON",
        "ORGANIZATION" or "GPE". The first reading (in input order,
        duplicates removed) that yields a category is the one reported.
    """
    OCR_processed_output = []
    for text_recognized in OCR_outputs:
        poly_text = text_recognized[0]
        # Deduplicate while keeping the input order so the reported reading
        # does not depend on set/hash ordering.
        for potential_value in dict.fromkeys(text_recognized[1]):
            category = _ocr_value_category(potential_value, needed_elements)
            if category is not None:
                OCR_processed_output.append(
                    (_ocr_bbox_polygon(poly_text), potential_value, category))
                break
    return OCR_processed_output