示例#1
0
 def test_extract_dates_frm_pdf(self):
     """Smoke-run extract_dates_frm_pdf on a small hand-written fixture.

     The test makes no assertions: it prepares the network and lines for
     the fixture rows, runs the date extraction and prints the resulting
     ``PDF_Lines`` and ``dates`` for manual inspection.
     """
     pdf_info = PDF_Information()
     # One row per term; the last element of each row is the term text.
     # NOTE(review): remaining columns look like OCR layout/pixel fields
     # -- confirm against PDF_Information.
     pdf_info.rawData = [
         [5, 1, 5, 4, 1, 7, 2518, 1528, 154, 42, 95, '95014'],
         [5, 1, 5, 4, 1, 8, 3653, 1515, 321, 42, 88, 'AE10071871'],
         [5, 1, 5, 5, 1, 1, 275, 1624, 132, 42, 94, 'Israel'],
         [5, 1, 5, 5, 1, 2, 2129, 1608, 157, 42, 95, '$'],
         [5, 1, 5, 5, 1, 3, 2309, 1607, 157, 43, 96, '1,000'],
         [5, 1, 5, 5, 1, 4, 3331, 1602, 208, 42, 95, 'Mar2020'],
         [5, 1, 5, 5, 1, 5, 3562, 1598, 282, 53, 95, '$60,000.00'],
         [5, 1, 5, 5, 1, 6, 3867, 1601, 114, 43, 94, 'USD'],
         [5, 1, 5, 5, 2, 1, 2129, 1689, 172, 42, 95, 'Phone:'],
         [5, 1, 5, 5, 2, 2, 2325, 1689, 52, 41, 95, '+1'],
         [5, 1, 5, 5, 2, 3, 2406, 1688, 128, 54, 95, '(650)'],
         [5, 1, 5, 5, 2, 4, 2555, 1689, 219, 42, 95, '6186116'],
         [5, 1, 5, 5, 2, 5, 3726, 1683, 205, 42, 96, 'Version:'],
         [5, 1, 5, 5, 2, 6, 3958, 1683, 16, 41, 96, '1'],
         [5, 1, 5, 5, 3, 1, 2129, 1770, 102, 42, 93, 'Fax:'],
     ]
     search_radius = 50

     # Build the term network and the line structure before extracting.
     preparer = Prepare_Data_PDF()
     preparer.get_pdf_network(pdf_info, 0)
     preparer.get_pdf_lines(pdf_info)

     # NOTE(review): get_exact_match is looked up on the test case itself;
     # confirm the test class really provides it.
     self.x.extract_dates_frm_pdf(
         pdf_info, self.get_exact_match, search_radius)

     print(pdf_info.PDF_Lines)
     print(pdf_info.dates)
    def extract_information_frm_PDF(self,
                                    p_pdf_path,
                                    p_keyword_dict,
                                    p_exact_match_only,
                                    p_distance,
                                    p_currency_distance,
                                    p_scan_for_tbl=0):
        """
        Extract information from a PDF together with its location in the
        document. Extracts:
                => keywords
                => currency
                => integers
                => dates
                => table information (only when p_scan_for_tbl is truthy)
        Args:
            p_pdf_path (string): path to pdf document.
            p_keyword_dict (dictionary): keyword dictionary: key = list name, value = list.
            p_exact_match_only (Boolean): 1 = returns only exact match, 0 = return partial match as well.
            p_distance (int): distance in pixel while looking for term in proximity
            p_currency_distance (int): distance handed to the currency
                extractor (presumably pixels, like p_distance -- confirm).
            p_scan_for_tbl (int/Boolean): when truthy, the table extraction
                step is run as well. Defaults to 0 (skipped).
        Returns:
            PDF_Information: the object that was filled in by the
            extraction steps.
        """
        # get pdf term network.
        PDF_Info = PDF_Information()
        self.get_pdf_term_network(PDF_Info, p_pdf_path)

        # Everything after the network is built runs under try/finally so
        # the image clean-up below still happens when a step raises; the
        # exception itself keeps propagating to the caller.
        try:
            # get pdf lines
            self.get_pdf_lines(PDF_Info)

            # get keywords from term network
            PDF_Info.keywords = self.extract_keywords_frm_PDF(
                PDF_Info, p_keyword_dict, p_exact_match_only, p_distance)

            # search dates
            self.EDFP.extract_dates_frm_pdf(PDF_Info, self.get_exact_match,
                                            p_distance)

            # search currency
            self.ECFP.extract_currency_frm_pdf(PDF_Info, p_currency_distance)

            # search numbers
            self.extract_numbers_frm_PDF(PDF_Info)

            # extract table
            if p_scan_for_tbl:
                self.extract_information_frm_table(PDF_Info, p_keyword_dict)
        finally:
            # remove all images created
            self.pdf_r.clear_image_files(PDF_Info.page_count)

        return PDF_Info
    def test_get_distance(self):
        """Check the min/max distance reported by get_distance.

        Three scenarios are run in order: a regular alignment list, an
        empty list (expects -1 for both bounds) and a list in which the
        maximum distance occurs twice.
        """
        pdf_info = PDF_Information()

        # Term templates. They are tuples so that every alignment entry
        # below gets its own fresh list copy rather than a shared object.
        issued = (5, 1, 4, 1, 1, 1, 454, 973, 203, 53, 95, 'Issued', 0)
        on = (5, 1, 4, 1, 1, 2, 685, 986, 74, 40, 96, 'on', 0)
        february = (5, 1, 4, 1, 1, 4, 1134, 973, 286, 67, 95, 'February', 0)
        twenty = (5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, '20', 0)
        comma = (5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, ',', 1)
        year = (5, 1, 4, 1, 1, 6, 1564, 973, 157, 53, 95, '2020', 0)
        neighbours = (february, twenty, comma, year)

        def alignment(anchor, distances):
            # [anchor term, [[neighbour term, ('right', distance)], ...]]
            return [
                list(anchor),
                [[list(term), ('right', dist)]
                 for term, dist in zip(neighbours, distances)],
            ]

        scenarios = (
            # regular input
            ([alignment(issued, (477, 785, 785, 907)),
              alignment(on, (375, 683, 683, 805))],
             {'min': 375, 'max': 907}),
            # empty
            ([],
             {'min': -1, 'max': -1}),
            # two max at same distance
            ([alignment(issued, (477, 785, 785, 907)),
              alignment(on, (375, 683, 907, 805))],
             {'min': 375, 'max': 907}),
        )

        for alignment_list, expected in scenarios:
            result = self.x.get_distance(alignment_list,
                                         pdf_info.get_distance)
            self.assertEqual(result, expected)
示例#4
0
 def test_search_dates_in_pdf(self):
     """Check the ``dates`` mapping produced by search_dates_in_pdf.

     The same PDF_Information object is reused for every scenario, so each
     expectation also covers that results of the previous run do not
     linger in ``dates``.
     """
     pdf_info = PDF_Information()

     scenarios = (
         # several lines, each with a differently formatted date
         ([[['hey todat is 5/7/2020!']],
           [['hey todat is May 7th 2020']],
           [['hey todat is May 2020!']]],
          {'hey todat is 5/7/2020!': ['5/7/2020'],
           'hey todat is May 7th 2020': ['May 7th 2020'],
           'hey todat is May 2020!': ['May 2020']}),
         # single entry
         ([[['hey todat is 5/7/2020!']]],
          {'hey todat is 5/7/2020!': ['5/7/2020']}),
         # no date found
         ([[['hey todat is !']],
           [['hey todat is May ']],
           [['hey todat is !']]],
          {}),
         # empty dates
         ([],
          {}),
     )

     for pdf_lines, expected_dates in scenarios:
         pdf_info.PDF_Lines = pdf_lines
         self.x.search_dates_in_pdf(pdf_info)
         self.assertEqual(pdf_info.dates, expected_dates)
示例#5
0
    def test_get_unique_dates(self):
        """Check that get_unique_dates returns the set of distinct dates.

        Each scenario assigns a ``dates`` mapping (line text -> list of
        dates) on one shared PDF_Information object and compares the
        returned value with the expected set.
        """
        pdf_info = PDF_Information()

        scenarios = (
            # two distinct dates, one of them repeated on another line
            ({'a mar 2020': ['mar 2020'],
              'a mar 2021': ['mar 2021'],
              'a mar 2020 hello': ['mar 2020']},
             {'mar 2020', 'mar 2021'}),
            # the same date on two lines
            ({'a mar 2020': ['mar 2020'],
              'a mar 2020 hello': ['mar 2020']},
             {'mar 2020'}),
            # single entry
            ({'a mar 2020': ['mar 2020']},
             {'mar 2020'}),
            # empty dates
            ({},
             set()),
        )

        for dates_by_line, expected in scenarios:
            pdf_info.dates = dates_by_line
            unique_dates = self.x.get_unique_dates(pdf_info)
            self.assertEqual(unique_dates, expected)
    def test_is_same(self):
        """Check is_same for identical, differing and empty terms.

        Every scenario is ``(term1, term2, expected_truthiness)``; the
        result is checked with assertTrue/assertFalse accordingly.
        """
        pdf_info = PDF_Information()

        scenarios = (
            # identical terms
            ([5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
             [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
             True),
            # same pixel different text
            ([5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
             [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$.00'],
             True),
            # not equal
            ([5, 1, 13, 1, 3, 8, 93, 104712, 282, 53, 96, '$60,000.00'],
             [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
             False),
            # term1 blank
            ([],
             [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
             False),
            # term2 blank
            ([5, 1, 13, 1, 3, 8, 93, 104712, 282, 53, 96, '$60,000.00'],
             [],
             False),
            # both term blank
            ([],
             [],
             True),
        )

        for first, second, should_match in scenarios:
            verdict = self.x.is_same(first, second,
                                     pdf_info.get_indexes_with_pixel_info)
            if should_match:
                self.assertTrue(verdict)
            else:
                self.assertFalse(verdict)
 def setUp(self):
     """Prepare the fixture object and the navigator under test.

     ``self.PDF_Info`` is taken from the ``z_test_data`` module and
     ``self.x`` is a fresh ``Network_Navigtor`` for every test.
     """
     # The previous throw-away ``PDF_Information()`` assigned here was
     # overwritten immediately, so it is no longer constructed.
     # NOTE(review): this is the module-level object itself, not a copy --
     # a test that mutates it would affect later tests; confirm that is
     # intended.
     self.PDF_Info = z_test_data.PDF_Info
     self.x = Network_Navigtor()
示例#8
0
    #     box.append(key)

    # print('-------------------boxes-------------------------')
    # print(box)
    # print(len(box))

    # cells = get_cells(box)

    # for cell in cells:
    #     print('----------------------------------cell----------------------')
    #     print(cell)
    #     print(out_put[cell])
    # print(len(cells))

    # ---------------------------------------------------pdf_info-------------------------------------------------------
    pdf_info = PDF_Information()
    pdf_info.cleanData = [
        [5, 1, 1, 1, 1, 1, 681, 2770, 33, 25, 96, 'c.'],
        [5, 1, 1, 1, 1, 2, 732, 2762, 106, 42, 95, 'Long'],
        [5, 1, 1, 1, 1, 3, 856, 2761, 183, 34, 95, 'Distance'],
        [5, 1, 1, 1, 1, 4, 1057, 2761, 146, 34, 95, 'Radius'],
        [5, 1, 2, 1, 1, 1, 3031, 2848, 145, 34, 95, 'Radius'],
        [5, 1, 2, 1, 1, 2, 3193, 2848, 117, 34, 96, 'Class'],
        [5, 1, 3, 1, 1, 1, 1441, 2975, 196, 34, 95, 'Business'],
        [5, 1, 3, 1, 1, 2, 1655, 2975, 78, 34, 96, 'Use'],
        [5, 1, 3, 1, 1, 3, 2837, 2978, 105, 42, 95, 'Long'],
        [5, 1, 3, 1, 1, 4, 2960, 2977, 184, 34, 96, 'Distance'],
        [5, 1, 3, 1, 1, 5, 3160, 2977, 117, 43, 96, '(Over'],
        [5, 1, 3, 1, 1, 6, 3290, 2977, 73, 34, 96, '200'],
        [5, 1, 3, 1, 1, 7, 3381, 2977, 123, 43, 94, 'Miles)'],
        [5, 1, 4, 1, 1, 1, 1528, 3053, 116, 34, 94, 'Class'],