def test_extract_dates_frm_pdf(self):
    """Smoke-run date extraction over a small hand-written term fixture.

    The test makes no assertions: it builds the term network and the
    lines from ``rawData``, runs ``extract_dates_frm_pdf`` and prints the
    resulting ``PDF_Lines`` and ``dates`` for manual inspection.
    """
    pdf_info = PDF_Information()
    # NOTE(review): each row looks like an OCR word record (level, page,
    # block, paragraph, line, word, left, top, width, height, confidence,
    # text) -- confirm against the producer of rawData.
    pdf_info.rawData = [
        [5, 1, 5, 4, 1, 7, 2518, 1528, 154, 42, 95, '95014'],
        [5, 1, 5, 4, 1, 8, 3653, 1515, 321, 42, 88, 'AE10071871'],
        [5, 1, 5, 5, 1, 1, 275, 1624, 132, 42, 94, 'Israel'],
        [5, 1, 5, 5, 1, 2, 2129, 1608, 157, 42, 95, '$'],
        [5, 1, 5, 5, 1, 3, 2309, 1607, 157, 43, 96, '1,000'],
        [5, 1, 5, 5, 1, 4, 3331, 1602, 208, 42, 95, 'Mar2020'],
        [5, 1, 5, 5, 1, 5, 3562, 1598, 282, 53, 95, '$60,000.00'],
        [5, 1, 5, 5, 1, 6, 3867, 1601, 114, 43, 94, 'USD'],
        [5, 1, 5, 5, 2, 1, 2129, 1689, 172, 42, 95, 'Phone:'],
        [5, 1, 5, 5, 2, 2, 2325, 1689, 52, 41, 95, '+1'],
        [5, 1, 5, 5, 2, 3, 2406, 1688, 128, 54, 95, '(650)'],
        [5, 1, 5, 5, 2, 4, 2555, 1689, 219, 42, 95, '6186116'],
        [5, 1, 5, 5, 2, 5, 3726, 1683, 205, 42, 96, 'Version:'],
        [5, 1, 5, 5, 2, 6, 3958, 1683, 16, 41, 96, '1'],
        [5, 1, 5, 5, 3, 1, 2129, 1770, 102, 42, 93, 'Fax:'],
    ]
    proximity = 50

    # Derive the network and the lines from the raw terms before the
    # date extractor is invoked.
    preparer = Prepare_Data_PDF()
    preparer.get_pdf_network(pdf_info, 0)
    preparer.get_pdf_lines(pdf_info)

    self.x.extract_dates_frm_pdf(pdf_info, self.get_exact_match, proximity)

    # Output is for eyeballing only; nothing is asserted.
    print(pdf_info.PDF_Lines)
    print(pdf_info.dates)
def extract_information_frm_PDF(self, p_pdf_path, p_keyword_dict,
                                p_exact_match_only, p_distance,
                                p_currency_distance, p_scan_for_tbl=0):
    """Run the full extraction pipeline over one PDF document.

    Populates a fresh ``PDF_Information`` object step by step: term
    network, lines, keywords, dates, currency, numbers and (optionally)
    table information. Temporary image files are cleared at the end.

    Args:
        p_pdf_path (string): path to the pdf document.
        p_keyword_dict (dictionary): keyword dictionary:
            key = list name, value = list.
        p_exact_match_only (Boolean): 1 = return only exact matches,
            0 = return partial matches as well.
        p_distance (int): distance in pixels used while looking for
            terms in proximity (keywords and dates).
        p_currency_distance: distance handed to the currency extractor.
        p_scan_for_tbl: when truthy, table extraction is run as well.
            Defaults to 0 (skipped).

    Returns:
        The populated ``PDF_Information`` object.
    """
    pdf_info = PDF_Information()

    # Build the term network first, then the lines.
    self.get_pdf_term_network(pdf_info, p_pdf_path)
    self.get_pdf_lines(pdf_info)

    # Keywords are returned by the extractor and stored on the info object.
    found_keywords = self.extract_keywords_frm_PDF(
        pdf_info, p_keyword_dict, p_exact_match_only, p_distance)
    pdf_info.keywords = found_keywords

    # Dates, currency and numbers are handed the info object itself.
    self.EDFP.extract_dates_frm_pdf(
        pdf_info, self.get_exact_match, p_distance)
    self.ECFP.extract_currency_frm_pdf(pdf_info, p_currency_distance)
    self.extract_numbers_frm_PDF(pdf_info)

    # Table extraction is opt-in.
    if p_scan_for_tbl:
        self.extract_information_frm_table(pdf_info, p_keyword_dict)

    # Remove all images created while processing the document.
    self.pdf_r.clear_image_files(pdf_info.page_count)

    return pdf_info
def test_get_distance(self):
    """Check the min/max distance summary returned by ``get_distance``.

    Covers a populated alignment list, an empty list (expects -1 for
    both bounds) and a list in which the maximum distance occurs twice.
    """
    PDF_Info = PDF_Information()

    # Anchor terms and the four terms found to their right.
    issued = (5, 1, 4, 1, 1, 1, 454, 973, 203, 53, 95, 'Issued', 0)
    on = (5, 1, 4, 1, 1, 2, 685, 986, 74, 40, 96, 'on', 0)
    date_terms = (
        (5, 1, 4, 1, 1, 4, 1134, 973, 286, 67, 95, 'February', 0),
        (5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, '20', 0),
        (5, 1, 4, 1, 1, 5, 1442, 973, 94, 62, 95, ',', 1),
        (5, 1, 4, 1, 1, 6, 1564, 973, 157, 53, 95, '2020', 0),
    )

    def aligned(anchor, gaps):
        # One alignment entry: [anchor, [[term, ('right', gap)], ...]].
        # Fresh lists are built on every call so no entry shares state.
        neighbours = [
            [list(term), ('right', gap)]
            for term, gap in zip(date_terms, gaps)
        ]
        return [list(anchor), neighbours]

    l_alignment_list = [
        aligned(issued, (477, 785, 785, 907)),
        aligned(on, (375, 683, 683, 805)),
    ]
    out_put = self.x.get_distance(l_alignment_list, PDF_Info.get_distance)
    self.assertEqual(out_put, {'min': 375, 'max': 907})

    # empty
    l_alignment_list = []
    out_put = self.x.get_distance(l_alignment_list, PDF_Info.get_distance)
    self.assertEqual(out_put, {'min': -1, 'max': -1})

    # two max at same distance
    l_alignment_list = [
        aligned(issued, (477, 785, 785, 907)),
        aligned(on, (375, 683, 907, 805)),
    ]
    out_put = self.x.get_distance(l_alignment_list, PDF_Info.get_distance)
    self.assertEqual(out_put, {'min': 375, 'max': 907})
def test_search_dates_in_pdf(self):
    """Check that ``search_dates_in_pdf`` fills ``dates`` per line.

    Each scenario sets ``PDF_Lines`` on the same ``PDF_Information``
    object, runs the search and compares ``dates`` with the expected
    mapping of line text to the dates found in it.
    """
    PDF_Info = PDF_Information()

    scenarios = (
        # three lines, each holding one date in a different format
        (
            [[['hey todat is 5/7/2020!']],
             [['hey todat is May 7th 2020']],
             [['hey todat is May 2020!']]],
            {'hey todat is 5/7/2020!': ['5/7/2020'],
             'hey todat is May 7th 2020': ['May 7th 2020'],
             'hey todat is May 2020!': ['May 2020']},
        ),
        # single entry
        (
            [[['hey todat is 5/7/2020!']]],
            {'hey todat is 5/7/2020!': ['5/7/2020']},
        ),
        # no date found
        (
            [[['hey todat is !']],
             [['hey todat is May ']],
             [['hey todat is !']]],
            {},
        ),
        # empty dates
        ([], {}),
    )

    for pdf_lines, expected_dates in scenarios:
        PDF_Info.PDF_Lines = pdf_lines
        self.x.search_dates_in_pdf(PDF_Info)
        self.assertEqual(PDF_Info.dates, expected_dates)
def test_get_unique_dates(self):
    """Check that ``get_unique_dates`` returns the distinct dates as a set.

    Scenarios: two distinct dates with one repeated, one date repeated
    on two lines, a single entry, and an empty ``dates`` mapping.
    """
    PDF_Info = PDF_Information()

    scenarios = (
        (
            {'a mar 2020': ['mar 2020'],
             'a mar 2021': ['mar 2021'],
             'a mar 2020 hello': ['mar 2020']},
            {'mar 2020', 'mar 2021'},
        ),
        (
            {'a mar 2020': ['mar 2020'],
             'a mar 2020 hello': ['mar 2020']},
            {'mar 2020'},
        ),
        # single entry
        ({'a mar 2020': ['mar 2020']}, {'mar 2020'}),
        # empty dates
        ({}, set()),
    )

    for dates_by_line, expected_unique in scenarios:
        PDF_Info.dates = dates_by_line
        out_put = self.x.get_unique_dates(PDF_Info)
        self.assertEqual(out_put, expected_unique)
def test_is_same(self):
    """Check ``is_same`` on pairs of term records.

    Expected outcomes, as asserted below: identical terms match; terms
    that differ only in their text still match; terms that differ in a
    numeric field do not; one blank term never matches a populated one;
    two blank terms match.
    """
    PDF_Info = PDF_Information()

    scenarios = (
        # identical terms
        ([5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
         [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
         True),
        # same pixel different text
        ([5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
         [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$.00'],
         True),
        # not equal
        ([5, 1, 13, 1, 3, 8, 93, 104712, 282, 53, 96, '$60,000.00'],
         [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
         False),
        # term1 blank
        ([],
         [5, 1, 13, 1, 3, 8, 2864, 104712, 282, 53, 96, '$60,000.00'],
         False),
        # term2 blank
        ([5, 1, 13, 1, 3, 8, 93, 104712, 282, 53, 96, '$60,000.00'],
         [],
         False),
        # both term blank
        ([], [], True),
    )

    for term1, term2, should_match in scenarios:
        out_put = self.x.is_same(
            term1, term2, PDF_Info.get_indexes_with_pixel_info)
        if should_match:
            self.assertTrue(out_put)
        else:
            self.assertFalse(out_put)
def setUp(self):
    """Prepare the fixture used by every test in this case.

    ``self.PDF_Info`` is bound to the prepared ``z_test_data.PDF_Info``
    object and ``self.x`` to a fresh ``Network_Navigtor``.

    A throw-away ``PDF_Information()`` used to be created and then
    immediately overwritten by the assignment below; that dead store has
    been removed.
    """
    # NOTE(review): this is the module-level object from z_test_data, not
    # a copy -- a test that mutates it would affect later tests. Confirm
    # whether a per-test copy is wanted.
    self.PDF_Info = z_test_data.PDF_Info
    self.x = Network_Navigtor()
# box.append(key) # print('-------------------boxes-------------------------') # print(box) # print(len(box)) # cells = get_cells(box) # for cell in cells: # print('----------------------------------cell----------------------') # print(cell) # print(out_put[cell]) # print(len(cells)) # ---------------------------------------------------pdf_info------------------------------------------------------- pdf_info = PDF_Information() pdf_info.cleanData = [ [5, 1, 1, 1, 1, 1, 681, 2770, 33, 25, 96, 'c.'], [5, 1, 1, 1, 1, 2, 732, 2762, 106, 42, 95, 'Long'], [5, 1, 1, 1, 1, 3, 856, 2761, 183, 34, 95, 'Distance'], [5, 1, 1, 1, 1, 4, 1057, 2761, 146, 34, 95, 'Radius'], [5, 1, 2, 1, 1, 1, 3031, 2848, 145, 34, 95, 'Radius'], [5, 1, 2, 1, 1, 2, 3193, 2848, 117, 34, 96, 'Class'], [5, 1, 3, 1, 1, 1, 1441, 2975, 196, 34, 95, 'Business'], [5, 1, 3, 1, 1, 2, 1655, 2975, 78, 34, 96, 'Use'], [5, 1, 3, 1, 1, 3, 2837, 2978, 105, 42, 95, 'Long'], [5, 1, 3, 1, 1, 4, 2960, 2977, 184, 34, 96, 'Distance'], [5, 1, 3, 1, 1, 5, 3160, 2977, 117, 43, 96, '(Over'], [5, 1, 3, 1, 1, 6, 3290, 2977, 73, 34, 96, '200'], [5, 1, 3, 1, 1, 7, 3381, 2977, 123, 43, 94, 'Miles)'], [5, 1, 4, 1, 1, 1, 1528, 3053, 116, 34, 94, 'Class'],