def convert_to_text(fname):
    """Extract text from the PDF *fname*, dump it to fixed txt/csv output
    paths and hand the pre-processed text to convert_to_table().

    The first three extracted lines are treated as a header (``txt``); the
    rest is flattened to one line and rebroken at a few marker words
    ('customer', 'scheme', 'currency', 'for', 'statement', 'account') to
    form CSV rows.  Scanning stops at the word 'tran'.
    """
    # Page selection is hard-coded to "all pages" (the original carried a
    # dead `pages = None` branch).
    pagenums = set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()  # BUG FIX: was `output.close` (attribute access, never called)
    text_list = text.split('\n')
    txt = text_list[:3]            # header lines
    text = ' '.join(text_list[3:])
    print("###################")
    print(txt)
    ## splitting words from the flattened string and re-inserting newlines
    ## in front of the section marker words
    word_list = text.split(' ')
    string_input = ""
    flag = 0
    for word in word_list:
        if word.lower() == 'tran':
            break  # transaction table starts here; stop collecting
        if word.lower() in ('customer', 'scheme', 'currency', 'for'):
            word = '\n' + word
        elif word.lower() == 'statement':
            word = '\n' + word
            flag = 1
        elif word.lower() == 'account' and flag == 1:
            # 'account' only starts a new row once 'statement' was seen
            word = '\n' + word
        string_input += word + " "
    print("::::::::::::::::::::::")
    file_name = fname.split('/')[-1]
    file_name = file_name.split('.')[0]
    # Write the body text with runs of whitespace squeezed to one space.
    text = re.sub("\s\s+", " ", text)
    with open(
            "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/txt/output_"
            + file_name + ".txt", "w") as text_file:
        text_file.write("%s" % text)
    file_name_main = "output_" + file_name + ".csv"
    # NOTE(review): the squeezed value below is discarded -- the raw
    # string_input is written, matching the original behaviour.
    text = re.sub("\s\s+", " ", string_input)
    with open(
            "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/csv/"
            + file_name_main, "w") as csv_file:
        csv_file.write("%s" % string_input)
    length_lines = len(string_input.split('\n'))
    convert_to_table(fname, string_input, txt)
def extract_text_from_pdf():
    """Pick the first PDF under D:\\Data\\send\\, extract its first three
    pages, normalise the run-together words with a battery of regexes and
    render the recovered company-registry fields into a .docx template."""
    # The search pattern is the empty string, so the first entry matches.
    entries = os.listdir('D:\\Data\\send\\')
    files_name = ''
    for entry in entries:
        if re.search(str(), entry):
            files_name = 'D:\\Data\\send\\' + entry
            break
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    pages_done = 0
    with open(files_name, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)
            pages_done += 1
            if pages_done == 3:  # only the first three pages are needed
                break
    text = fake_file_handle.getvalue()
    converter.close()
    fake_file_handle.close()
    # Boundary patterns: a '&' or ' ' separator is injected wherever the
    # extraction glued tokens together (case changes, digits, dates, ...).
    p1 = re.compile(r'([а-я])([А-Я])')
    p2 = re.compile(r'([А-Я]{7})([а-я])')
    p3 = re.compile(r'([A-я])([0-9])')
    p4 = re.compile(r'([0-9])([A-я])')
    p5 = re.compile(r'(\w)(№)')
    p6 = re.compile(r'([.!?)])([A-я])')
    p7 = re.compile(r'(\d{2}.\d{2}.\d{4})([A-я])')
    p8 = re.compile(r'([A-я])(\d{2}.\d{2}.\d{4})')
    p9 = re.compile(r'([,])([A-я])')
    p10 = re.compile(r'([А-я]")([A-я])')
    normalized = text
    for pattern, repl in (
            (p1, r"\1&\2"), (p2, r"\1 \2"), (p3, r"\1&\2"), (p4, r"\1&\2"),
            (p5, r"\1 \2"), (p6, r"\1&\2"), (p7, r"\1 \2"), (p8, r"\1 \2"),
            (p9, r"\1 \2"), (p10, r'\1&\2')):
        normalized = re.sub(pattern, repl, normalized)
    # Field patterns: each captures one registry field out of the
    # '&'-separated normalised text.
    p_vipiska_date = re.compile(
        r'(.+)(ИСКА из Единого государственного реестра юридических лиц&)(\d{2}.\d{2}.\d{4})(.+)'
    )
    p_number_date = re.compile(r'(.+)(№.+)(&дата формирования выписки)(.+)')
    p_full_title = re.compile(
        r'(.+)(Настоящая выписка содержит сведения о юридическом лице&)([А-я\s"]+)(.+)'
    )
    p_inn = re.compile(
        r'(.+)(Сведения об учете в налоговом органе&\d\d&ИНН&)([0-9]+)(\d\d&КПП&)(.+)'
    )
    p_ogrn = re.compile(
        r'(.+)(полное наименование юридического лица&ОГРН&)([0-9]+)(.+)')
    p_name = re.compile(r'(.+)(Сокращенное наименование&)([А-я\s"]+)(.+)')
    p_date_egrul = re.compile(
        r'(.+)(3&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)([0-9\.]+)(\d{2}.\d{2}.\d{4})(&Адрес )(.+)'
    )
    p_index = re.compile(
        r'(.+)(4&Почтовый индекс&)([0-9]{6})(5&Субъект Российской Федерации&)(.+)'
    )
    p_subject_RF = re.compile(
        r'(.+)(5&Субъект Российской Федерации&)([А-я\s]+)(&6&Улица )(.+)')
    p_street = re.compile(
        r'(.+)(проспект, переулок и т\.&д\..&)(.+)(&7&Дом )(.+)')
    p_house = re.compile(r'(.+)(&ДОМ )([0-9]+)(&Корпус )(.+)')
    p_corpus = re.compile(r'(.+)(&СТРОЕНИЕ )([0-9]+)(&Офис )(.+)')
    p_flat = re.compile(r'(.+)(&)(.+)(10&ГРН и дата внесения в ЕГРЮЛ)(.+)')
    p_date_egrul2 = re.compile(
        r'(.+)(10&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)([0-9\.]+)(\d{2}.\d{2}.\d{4})(&Сведения о регистрации&11&)(.+)'
    )
    p_registration = re.compile(
        r'(.+)(11&Способ образования&)([А-я\s]+)(&12&ОГРН)(.+)')
    p_date_registration = re.compile(
        r'(.+)(13&Дата регистрации&)(\d{2}.\d{2}.\d{4})(14&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)(.+)'
    )
    p_date_egrul3 = re.compile(
        r'(.+)(14&ГРН и дата внесения в ЕГРЮЛ записи, содержащей указанные сведения&)([0-9\.]+)(\d{2}.\d{2}.\d{4})(&Сведения о регистрирующем органе по месту нахождения юридического лица&15)(.+)'
    )
    p_kpp = re.compile(r'(.+)(&КПП&)([0-9]+)(&Дата постановки на учет&)(.+)')
    p_date_inn = re.compile(
        r'(.+)(&Дата постановки на учет&)(\d{2}.\d{2}.\d{4})(21&Наименование налогового органа&)(.+)'
    )
    p_tax_office = re.compile(
        r'(.+)(&Наименование налогового органа&)(.+)(22&ГРН и дата внесения в ЕГРЮЛ записи)(.+)'
    )
    p_capital = re.compile(r'(.+)(31&Вид&)(.+)(&32&Размер )(.+)')
    p_capital2 = re.compile(
        r'(.+)(&32&Размер .в рублях.)(.+)(33&ГРН и дата внесения в ЕГРЮЛ)(.+)')
    p_surname = re.compile(r'(.+)(35&Фамилия&)(.+)(&36&Имя&)(.+)')
    p_name_gener = re.compile(r'(.+)(&36&Имя&)(.+)(&37&Отчество&)(.+)')
    p_patronymic = re.compile(r'(.+)(&37&Отчество&)(.+)(&38&ИНН&)(.+)')
    p_inn_gener = re.compile(
        r'(.+)(&38&ИНН&)(.+)(39&ГРН и дата внесения в ЕГРЮЛ записи,)(.+)')
    p_position = re.compile(
        r'(.+)(40&Должность&)(.+)(&41&ГРН и дата внесения)(.+)')
    p_founder = re.compile(
        r'(.+)(43&Полное наименование&)(.+)(&44&ГРН и дата внесения)(.+)')
    p_fou_country = re.compile(
        r'(.+)(45&Страна происхождения&)(.+)(&46&Дата регистрации&)(.+)')
    p_fou_address = re.compile(
        r'(.+)(&49&Адрес .место нахождения. в странепроисхождения&)(.+)(&50&ГРН и дата внесения )(.+)'
    )
    p_fou_capital = re.compile(
        r'(.+)(51&Номинальная стоимость доли .в рублях.)(.+)(52&Размер доли)(.+)'
    )
    p_percent = re.compile(
        r'(.+)(52&Размер доли .в процентах.)(.+)(53&ГРН и дата внесения в ЕГРЮЛ записи)(.+)'
    )
    p_activity = re.compile(
        r'(.+)(&54&Код и наименование вида деятельности&)(.+)(&55&ГРН и дата внесения в ЕГРЮЛ записи)(.+)'
    )
    # Template key -> (pattern, replacement) used to cut each field out of
    # the normalised text via re.sub.
    field_specs = {
        'framing_date': (p_vipiska_date, r'\3'),
        'vipiska_number': (p_number_date, r'\2'),
        'full_title': (p_full_title, r'\3'),
        'inn': (p_inn, r'\3'),
        'ogrn': (p_ogrn, r'\3'),
        'name': (p_name, r'\3'),
        'date_egrul': (p_date_egrul, r'\3 \4'),
        'index': (p_index, r'\3'),
        'subject_RF': (p_subject_RF, r'\3'),
        'street': (p_street, r'\3'),
        'house': (p_house, r'\3'),
        'corpus': (p_corpus, r'\3'),
        'flat': (p_flat, r'\3'),
        'date_egrul2': (p_date_egrul2, r'\3 \4'),
        'registration': (p_registration, r'\3'),
        'date_registration': (p_date_registration, r'\3'),
        'date_egrul3': (p_date_egrul3, r'\3 \4'),
        'kpp': (p_kpp, r'\3'),
        'date_inn': (p_date_inn, r'\3'),
        'tax_office': (p_tax_office, r'\3'),
        'capital': (p_capital, r'\3'),
        'capital2': (p_capital2, r'\3'),
        'surname': (p_surname, r'\3'),
        'name_gener': (p_name_gener, r'\3'),
        'patronymic': (p_patronymic, r'\3'),
        'inn_gener': (p_inn_gener, r'\3'),
        'position': (p_position, r'\3'),
        'founder': (p_founder, r'\3'),
        'fou_country': (p_fou_country, r'\3'),
        'fou_address': (p_fou_address, r'\3'),
        'fou_capital': (p_fou_capital, r'\3'),
        'percent': (p_percent, r'\3'),
        'activity': (p_activity, r'\3'),
    }
    context = {key: re.sub(pattern, repl, normalized)
               for key, (pattern, repl) in field_specs.items()}
    print(normalized)
    # Render the extracted fields into the .docx template.
    doc = DocxTemplate("D:\\Data\\document\\выписка.docx")
    doc.render(context)
    doc.save('D:\\Data\\document\\выписка_' + str(context['inn']) + '.docx')
def pdf2txt(argv):
    """Command-line style PDF converter (text / xml / html / tag).

    *argv* is a getopt-style list: options first, then one or more PDF
    file names.  Output goes to the ``-o`` file or stdout.  Returns
    ``usage()`` on an unknown output type, otherwise None.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # page numbers are 1-based on the command line, 0-based internally
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # NOTE(review): -n followed by -A/-V/-M/... would crash on
            # None.laparams, matching the original behaviour.
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Infer the output type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # BUG FIX: the Python 2 builtin `file()` does not exist in Python 3.
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # BUG FIX: `file()` -> `open()` here as well.
        fp = open(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching, check_extractable=True)
        fp.close()
    device.close()
    outfp.close()
    return
def generate_pdf_pages(fp, maxpages=0, logger=None):
    """Return a list with one extracted-text string per page of the PDF
    open at *fp* (the file object is closed before returning).

    Each entry is prefixed with '<page number> ===========...\\n' and has
    runs of spaces collapsed to single spaces.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.cmapdb import CMapDB
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    import re

    password = ''
    pagenos = set()
    imagewriter = None
    rotation = 0
    caching = True
    laparams = LAParams()
    resource_manager = PDFResourceManager(caching=caching)
    extracted = []
    page_iter = PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True)
    for page_number, page in enumerate(page_iter, 1):
        # A fresh buffer and converter per page keeps pages independent.
        buffer = StringIO()
        buffer.write('{} ===========================\n'.format(page_number))
        device = TextConverter(resource_manager, buffer, codec='utf-8',
                               laparams=laparams, imagewriter=imagewriter)
        PDFPageInterpreter(resource_manager, device).process_page(page)
        if logger:
            logger.info("Processing page: {}".format(page_number))
        device.close()
        raw = buffer.getvalue()
        buffer.close()
        # Get rid of all of those damn spaces.
        extracted.append(re.sub(r'[ ]+', ' ', raw))
    fp.close()
    return extracted
def _extract_pdf_text(path):
    """Extract the full text of the PDF at *path* using pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    text = retstr.getvalue()
    device.close()
    retstr.close()
    return text


def _append_currency(d, i, pattern):
    """Append (currency-code, amount) for the first matched symbol to d[i].

    *pattern* is the list of (symbol, amount) tuples found in the text; when
    empty, nothing is appended (the original would have raised IndexError).
    """
    if not pattern:
        return
    symbol, amount = pattern[0]
    if '₹' in symbol:
        d[i].append('INR')
        d[i].append(amount)
    elif '$' in symbol:
        d[i].append('USD')
        d[i].append(amount)
    elif '€' in symbol:
        d[i].append('EUR')  # BUG FIX: the euro branch appended 'USD'
        d[i].append(amount)


def processPdf(pdf_filenames, d, i):
    """Classify each PDF receipt in *pdf_filenames* (taxi / flight) and
    append category, vendor, date, currency, amount and file name to d[i].

    Returns the updated mapping *d*.  NOTE: the incoming *i* is discarded
    and recomputed from len(d), matching the original behaviour.
    """
    i = len(d) + 1
    print("in process pdf " + str(d) + " value i " + str(i))
    for file in pdf_filenames:
        text = _extract_pdf_text(file)
        matches = list(datefinder.find_dates(text))
        # (currency symbol, amount) pairs, e.g. ('₹', '123.45')
        pattern = re.findall(r'([£$€$₹])[\s]?(\d+(?:\.\d{2})?)', text)
        print(matches)
        print(pattern)
        if 'Ola' in text:
            d[i].append('taxi')
            d[i].append('OOLA')
            d[i].append(str(matches[3].date()))
            _append_currency(d, i, pattern)
        elif 'Uber' in text:
            d[i].append('taxi')
            d[i].append('UBER')
            d[i].append(str(matches[3].date()))
            _append_currency(d, i, pattern)
        elif 'Invoice' in text:
            d[i].append('flight')
            d[i].append('international')
            d[i].append(str(matches[0].date()))
            _append_currency(d, i, pattern)
        else:
            # Domestic flight: fare is read from the line after 'Total Fare'.
            d[i].append('flight')
            d[i].append('domestic')
            d[i].append(str(matches[3].date()))
            d[i].append('INR')
            words = text.split("\n")
            if 'Total Fare' in words:
                val = words.index('Total Fare') + 1
                d[i].append(words[val])
        d[i].append(file)
        i += 1
    print("after all " + str(d))
    return d
def convert(self):
    """Extract an itemised phone bill from self.fname (a PDF), match its
    fields with the regexes entered in the UI line edits, and write an
    .xls next to the source file.  Returns the raw extracted text.

    Tariffs per destination prefix are cached in 'tarif.txt' (pickle) and
    asked from the user when missing.
    """
    self.ui.progressBar.setValue(0)
    if not self.pages:
        pagenums = set()
    else:
        pagenums = set(self.pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # BUG FIX: the Python 2 builtin `file()` does not exist in Python 3.
    infile = open(self.fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    data = output.getvalue()
    output.close()  # BUG FIX: was `output.close` (never called)
    # Cell styles: thin-bordered body cells (with date/time formats),
    # double-bordered header/total cells.
    style1 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz center; borders: top thin, bottom thin, left thin, right thin;')
    style1.num_format_str = 'DD-MM-YY'
    style2 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz center; borders: top thin, bottom thin, left thin, right thin;')
    style2.num_format_str = 'HH:MM'
    style4 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz center; borders: top thin, bottom thin, left thin, right thin;')
    style4.num_format_str = 'HH:MM:SS'
    style0 = xlwt.easyxf('font: bold off; align: wrap off, vert centre, horiz left; borders: top thin, bottom thin, left thin, right thin;')
    style3 = xlwt.easyxf('font: bold on; align: wrap off, vert centre, horiz center; borders: top double, bottom double, left double, right double;')
    style5 = xlwt.easyxf('font: bold on; align: wrap off, vert centre, horiz center; borders: top double, bottom double, left double, right double;')
    style5.num_format_str = '[h]:mm:ss;@'
    wb = xlwt.Workbook()
    ws = wb.add_sheet('A Test Sheet')
    # Header row (row 3), data starts at row 4.
    ws.write(3, 0, 'Day', style3)
    ws.write(3, 1, 'Date', style3)
    ws.write(3, 2, 'Time', style3)
    ws.write(3, 3, 'E/Stn', style3)
    ws.write(3, 4, 'Service Name', style3)
    ws.write(3, 5, 'Destination', style3)
    ws.write(3, 6, 'Code', style3)
    ws.write(3, 7, 'Band', style3)
    ws.write(3, 8, 'Peak/Off-Peak', style3)
    ws.write(3, 9, 'Amount', style3)
    ws.write(3, 10, 'Unit', style3)
    ws.write(3, 11, 'Cost', style3)
    ws.write(3, 12, 'Tarif', style3)
    ws.write(3, 13, 'Cost_2', style3)
    i = 4
    self.ui.progressBar.setValue(10)
    # Dates (e.g. '\d\d-\d\d-\d\d'); their count drives the row loops below.
    regDate = str(self.ui.lineEdit_2.text())
    matchesDate = re.findall(regDate, data)
    for Date in matchesDate:
        ws.write(i, 1, Date, style1)
        i += 1
    i = 4
    # Day names (e.g. '\n(Tue|Wed|Fri|Thu|Mon|Sat|Sun)').
    regDay = str(self.ui.lineEdit_3.text())
    matchesDay = re.findall(regDay, data)
    for Day in matchesDay:
        ws.write(i, 0, Day, style0)
        i += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(20)
    # Times (e.g. '[^\n:]\d\d:\d\d').
    regTime = str(self.ui.lineEdit_4.text())
    matchesTime = re.findall(regTime, data)
    while k < len(matchesDate):
        ws.write(i, 2, matchesTime[k], style2)
        k += 1
        i += 1
    i = 4
    k = 0
    # Service name (e.g. 'Voice').
    regService = str(self.ui.lineEdit_5.text())
    matchesService = re.findall(regService, data)
    while k < len(matchesDate):
        ws.write(i, 4, matchesService[k], style0)
        i += 1
        k += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(30)
    # Destinations (e.g. '\d{7,}'); first match is the subscriber number.
    trueDest = []
    truedest2 = []
    regDest = str(self.ui.lineEdit_6.text())
    matchesDest = re.findall(regDest, data)
    ws.write(0, 0, 'Mobile No', style3)
    ws.write(0, 1, matchesDest[0], style3)
    for Dest in matchesDest:
        if Dest != matchesDest[0]:
            trueDest.append(Dest)
    while k < len(matchesDate):
        ws.write(i, 5, trueDest[k], style0)
        truedest2.append(trueDest[k][0:4])  # 4-digit prefix keys the tariff
        i += 1
        k += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(40)
    regCode = str(self.ui.lineEdit_7.text())
    matchesCode = re.findall(regCode, data)
    while k < len(matchesDate):
        ws.write(i, 6, matchesCode[k], style0)
        i += 1
        k += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(50)
    regBand = str(self.ui.lineEdit_8.text())
    matchesBand = re.findall(regBand, data)
    while k < len(matchesDate):
        ws.write(i, 7, matchesBand[k], style0)
        i += 1
        k += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(60)
    regPeak = str(self.ui.lineEdit_9.text())
    matchesPeak = re.findall(regPeak, data)
    while k < len(matchesDate):
        ws.write(i, 8, matchesPeak[k], style0)
        i += 1
        k += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(70)
    # Call durations as 'HH:MM:SS'.
    regAmount = str(self.ui.lineEdit_10.text())
    matchesAmount = re.findall(regAmount, data)
    while k < len(matchesDate):
        hms = matchesAmount[k].split(':')
        ws.write(i, 9, datetime.time(int(hms[0]), int(hms[1]), int(hms[2])), style4)
        i += 1
        k += 1
    i = 4
    regUnit = str(self.ui.lineEdit_11.text())
    matchesUnit = re.findall(regUnit, data)
    for Unit in matchesUnit:
        ws.write(i, 10, Unit, style0)
        i += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(80)
    # Costs with comma decimal separator.
    regCost = str(self.ui.lineEdit_12.text())
    matchesCost = re.findall(regCost, data)
    while k < len(matchesDate):
        ws.write(i, 11, float(matchesCost[k].replace(',', '.')), style0)
        i += 1
        k += 1
    i = 4
    k = 0
    self.ui.progressBar.setValue(90)
    tarif = {}
    try:
        # BUG FIX: `file()` -> `open()`; pickle needs binary mode on Py3.
        tariff = open('tarif.txt', 'rb')
        tarif = pickle.load(tariff)
        tariff.close()
    except Exception:
        pass  # best effort: start with an empty tariff table
    while k < len(matchesDate):
        # BUG FIX: dict.has_key() was removed in Python 3.
        if truedest2[k] in tarif:
            ws.write(i, 12, tarif[truedest2[k]], style0)
        else:
            tarifinput = QInputDialog.getText(self, 'Tarif', 'Please enter tarif for ' + truedest2[k] + ':')
            tarif[truedest2[k]] = float(tarifinput[0])
            ws.write(i, 12, tarif[truedest2[k]], style0)
        # Cost_2 = duration in minutes * tariff (spreadsheet formula).
        ws.write(i, 13, xlwt.Formula('(HOUR(J' + str(i + 1) + ')*60+MINUTE(J' + str(i + 1) + ')+SECOND(J' + str(i + 1) + ')/60)*M' + str(i + 1) + ''), style0)
        i += 1
        k += 1
    # Persist the (possibly extended) tariff table.
    tariff = open('tarif.txt', 'wb')  # BUG FIX: binary mode for pickle
    pickle.dump(tarif, tariff)
    tariff.close()
    ws.write(len(matchesDate) + 6, 0, 'Total:', style3)
    ws.write(len(matchesDate) + 6, 9, xlwt.Formula('SUM(J5:J' + str(len(matchesDate) + 4) + ')'), style5)
    ws.write(len(matchesDate) + 6, 11, xlwt.Formula('SUM(L5:L' + str(len(matchesDate) + 4) + ')'), style3)
    ws.write(len(matchesDate) + 6, 13, xlwt.Formula('SUM(N5:N' + str(len(matchesDate) + 4) + ')'), style3)
    wb.save(self.fname + '.xls')
    self.ui.progressBar.setValue(100)
    if self.ui.checkBox.isChecked():
        os.startfile(str(self.fname) + '.xls',)
    return data
def run(filepath):
    """Convert the PDF at *filepath* into XML written to
    pdfparser/minute_store/minutes.xml."""
    import getopt  # kept from the original; unused below
    # Debug / input configuration.
    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    # Output configuration (fixed to XML output).
    outfile = 'pdfparser/minute_store/minutes.xml'
    outtype = 'xml'
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    #
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile, 'wb') if outfile else sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(filepath, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()
    outfp.close()
    print("Converted PDF to XML")
    return
def get_text_from_pdf(pdfname, limit=1000):
    """Extract text from *pdfname* and regroup it into chunks of roughly
    *limit* characters, split at sentence boundaries.

    Returns a list of chunks (plus a trailing '\\n' entry), or '' when the
    name is empty or the file cannot be opened.
    """
    if pdfname == '':
        return ''
    # Open the PDF; on failure return an empty result instead of raising.
    try:
        fp = open(pdfname, 'rb')
    except OSError:  # BUG FIX: was a bare except
        return ''
    # Extract text with pdfminer (vertical text detection enabled).
    rsrcmgr = PDFResourceManager()
    out_fp = StringIO()
    la_params = LAParams()
    la_params.detect_vertical = True
    device = TextConverter(rsrcmgr, out_fp, codec='utf-8', laparams=la_params)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                  password=None, caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = out_fp.getvalue()
    fp.close()
    device.close()
    out_fp.close()
    # The whole text is handled as one "line" (splitlines() was
    # deliberately disabled upstream).
    lines = []
    lines.append(text)
    outputs = []
    output = ""
    # UTF-8 byte sequences to strip out.
    replace_strs = [b'\x00']
    is_blank_line = False
    for line in lines:
        # Round-trip through bytes to remove the unwanted sequences.
        line_utf8 = line.encode('utf-8')
        for replace_str in replace_strs:
            line_utf8 = line_utf8.replace(replace_str, b'')
        line = line_utf8.decode()
        # Squeeze runs of spaces and trim.
        line = re.sub("[ ]+", " ", line)
        line = line.strip()
        # Ignore blank lines (remember we saw one).
        if len(line) == 0:
            is_blank_line = True
            continue
        # Ignore number-only lines.
        if is_float(line):
            continue
        # BUG FIX: the original compared the bound method `.count` to 1
        # (always False).  Skip single-word lines not ending in '.'.
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue
        if is_blank_line or output.endswith("."):
            # Sentence boundary: flush once the chunk exceeds `limit`.
            if len(output) > limit:
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        elif not is_blank_line and output.endswith("-"):
            # Hyphenated continuation from the previous line.
            output = output[:-1]
        else:
            # Plain word boundary.
            output += " "
        output += str(line)
        is_blank_line = False
    outputs.append(output)
    outputs.append('\n')
    return outputs
# setting absolute paths
prjPath = r'C:\Users\Natarajan\Desktop\PDFParser'


def convert(case, fname, pages=None):
    """Convert the PDF *fname* to text ('text') or HTML ('HTML').

    pages: optional iterable of 0-based page numbers (default: all pages).
    Returns the converted document (str for 'text', bytes for 'HTML').
    Raises ValueError on an unknown *case* (the original fell through to a
    NameError instead).
    """
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True
    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec,
                                  laparams=LAParams())
    elif case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec,
                                  laparams=LAParams())
    else:
        raise ValueError("unsupported case: %r" % (case,))
    interpreter = PDFPageInterpreter(manager, converter)
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
    convertedPDF = output.getvalue()
    converter.close()
    output.close()
    # BUG FIX: the original computed convertedPDF and then dropped it.
    return convertedPDF
# pip install pdfminer.six
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter
from pdfminer.layout import LAParams
import io

# BUG FIX: the original 'C:\\Users\somepath\filename.pdf' contained an
# unescaped '\f' (a literal form feed); a raw string keeps the backslashes.
pdf_path = r'C:\Users\somepath\filename.pdf'  # path to your pdf file
mem = io.StringIO()
lp = LAParams()
rm = PDFResourceManager()
cnv = TextConverter(rm, mem, laparams=lp)
ip = PDFPageInterpreter(rm, cnv)
# FIX: close the PDF handle when done (it leaked in the original).
with open(pdf_path, 'rb') as pdf:
    for i in PDFPage.get_pages(pdf):
        ip.process_page(i)
text = mem.getvalue()
# FIX: context manager + don't shadow the builtin name `file`.
with open("F:\\AIB\\covertedtext.txt", 'wb') as out:  # path to your destination file
    out.write(text.encode('utf-8'))
print("DONE")
def extract_text_from_pdf(pdf_files):
    """For each PDF in *pdf_files*, extract its text and append the
    sentences mentioning goodwill / acquisition / divestiture topics to a
    sibling .txt file (same basename, '.txt' extension, opened in append
    mode)."""
    # Sentences containing any of these markers are kept.  BUG FIX: the
    # original repeated "GOODWILL AND OTHER" and checked both
    # "ACQUISITION" and "ACQUISITIONS" (a substring), so matching lines
    # were written two or three times; each line is now written once.
    keywords = (
        "GOODWILL AND OTHER",   # also covers "... INTANGIBLE ASSETS"
        "ACQUISITION",          # also covers "ACQUISITIONS"
        "BUSINESS COMBINATION",
        "DIVESTITURE",
    )
    for PDF_file in pdf_files:
        # Fresh converter state per document.
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(PDF_file, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        # BUG FIX: close the handles for every file, not just the last one.
        converter.close()
        fake_file_handle.close()
        output_file = PDF_file[:-4]
        output_file_name = os.path.join(output_file + ".txt")
        with open(output_file_name, "a") as f:
            # Crude sentence split on '. '.
            for line in text.split(". "):
                upper_line = line.upper()
                if any(keyword in upper_line for keyword in keywords):
                    f.write(line)
                    f.write("\n")
def get_report_startpage(pdf):
    """Find the page on which the financial statements start.

    Arguments:
        pdf (str): path to the PDF file.
    Returns:
        int: 1-based start page of the financial report, or 1 when it
        cannot be determined (short files, no report markers, no TOC).
    """
    plumber_doc = pdfplumber.open(pdf)
    total = len(plumber_doc.pages)
    page_no = 0        # current page counter (only the first 10 matter)
    start_page = 1     # fallback result
    is_report = False  # True once the doc looks like an annual/quarterly report
    # Shared pdfminer resources for text extraction.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    codec = 'utf-8'
    outfp = StringIO()
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, codec=codec,
                           laparams=laparams)
    if total > 30:
        print('总页数', total)
        with open(pdf, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            # Walk the pages, extracting text as we go.
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page_no += 1
                interpreter.process_page(page)
                # NOTE(review): outfp accumulates the text of every page
                # processed so far, so the searches below scan all pages up
                # to the current one -- matching the original behaviour.
                teststr = outfp.getvalue()
                # Does the text mention an annual/quarterly report?
                rs = re.search('(年\s*|季\s*)度?\s*报\s*告?', teststr)
                if rs is not None and page_no == 1:
                    # Found the report marker on page 1; look for the TOC next.
                    is_report = True
                    continue
                elif rs is None and page_no == 1:
                    # Page 1 may carry a seal that breaks extraction; try page 2.
                    print('第1页未检测到年/季报等文字,检测第二页')
                    continue
                elif rs is not None and page_no == 2:
                    # Marker found on page 2; look for the TOC from page 3 on.
                    is_report = True
                    continue
                elif rs is None and page_no == 2:
                    # Neither of the first two pages looks like a report.
                    if is_report == False:
                        device.close()
                        outfp.close()
                        print('当前文件的财务报表起始页为', start_page)
                        return start_page
                if is_report == True:
                    # 1) only scan the first 10 pages for the TOC
                    if page_no < 11:
                        if re.search('目\s*录', teststr, flags=0):
                            # Does the TOC page reference the financial report?
                            ret = re.search('财务报告\s*(.)*\d', teststr)
                            if ret is not None:
                                ret = ret.group()
                                # Strip dot leaders; entries are
                                # [section title, page number].
                                tstr = [y.strip()
                                        for y in re.split(r'[…¨ .]', ret)
                                        if len(y) != 0]
                                start_page = int(tstr[1])
                                device.close()
                                outfp.close()
                                print('当前文件的财务报表起始页为', start_page)
                                return start_page
                            else:
                                # TOC page without the report entry; move on.
                                page_no += 1
                                continue
                        else:
                            # No TOC marker on this page; keep looking.
                            print('第', page_no, '页未找到目录二字,查找下一页')
                            continue
                    else:
                        print('10页内未找到目录二字')
                        # No TOC within the first 10 pages; give up.
                        break
    else:
        # Files of 30 pages or fewer are not scanned at all.
        print('当前文件的财务报表起始页为', start_page)
        return start_page
    device.close()
    outfp.close()
    print('当前文件的财务报表起始页为', start_page)
    return start_page
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from subprocess import call
from pdfminer.layout import LAParams
import os
# BUG FIX: `urllib2` was used below but never imported (and it does not
# exist on Python 3); urllib.request provides the same build_opener API.
import urllib.request as urllib2

url = 'http://www.ird.gov.hk/chi/pdf/c_s88list.pdf'
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
pdfdata = opener.open(url).read()
# FIX: context manager; don't shadow the builtin name `file`.
with open('document.pdf', 'wb') as fh:
    fh.write(pdfdata)
# Strip the (empty) password so pdfminer can parse the document.
call('qpdf --password= --decrypt {0}/document.pdf {0}/decrypted.pdf'.format(
    os.getcwd()).split())
outfp = open('modifiedla.txt', 'w')
parser = PDFParser(open('decrypted.pdf', 'rb'))
document = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
# A large char_margin keeps words on the same visual line together.
laparams = LAParams(char_margin=10)
device = TextConverter(rsrcmgr, outfp, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
outfp.close()
def arc():
    # Convert the globally selected file (``diretorio``) to the format chosen
    # in the ``formato`` Tk widget, saving it under ``filename``; shows a
    # message box and destroys the Tk root window when done.
    # NOTE(review): relies on module-level globals ``formato``, ``diretorio``,
    # ``filename``, ``file_extension``, ``ArqDOCX``/``ArqPPT``/``ArqXLSX``,
    # ``messagebox`` and ``root`` -- confirm they are defined at file level.
    destino = str(formato.get())
    # map the human-readable combo-box labels to target extensions
    if destino == "Arquivo do Word": destino = "docx"
    if destino == "Arquivo do Power-Point": destino = "ppt"
    if destino == "Arquivo do Excel": destino = "xlsx"
    if destino == "Arquivo de Texto": destino = "txt"
    import win32com.client as win32
    from os import path
    in_file = path.abspath(diretorio)
    out_file = path.abspath(filename)
    if destino == "docx":
        # anything Word can open is re-saved as .docx via COM automation
        if file_extension in ArqDOCX or file_extension.lower(
        ) == ".pdf" or file_extension.lower() == ".txt":
            word = win32.DispatchEx("Word.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=16)  # 16 = wdFormatDocumentDefault
            doc.Close()
            word.Quit()
    elif destino.lower() == "pdf":
        if file_extension.lower() in ArqPPT:
            word = win32.DispatchEx("PowerPoint.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Presentations.Open(in_file)
            doc.SaveAs(out_file, FileFormat=32)  # 32 = ppSaveAsPDF
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqXLSX:
            word = win32.DispatchEx("Excel.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Workbooks.Open(in_file)
            doc.ExportAsFixedFormat(0, out_file)  # 0 = xlTypePDF
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqDOCX or file_extension.lower(
        ) == ".txt":
            # NOTE(review): only ``win32com.client as win32`` was imported
            # above, so the bare ``win32com`` name is likely undefined here;
            # also SaveAs targets ``in_file`` (the source) rather than
            # ``out_file`` -- verify both.
            word = win32com.client.Dispatch('Word.Application')
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(in_file, FileFormat=17)  # 17 = wdFormatPDF
            doc.Close()
            word.Quit()
    elif destino.lower() == "xlsx":
        if file_extension.lower() == ".pdf":
            import pdftables_api
            # NOTE(review): hard-coded third-party API key
            c = pdftables_api.Client('to7jluln0hvr')
            c.xlsx(diretorio, filename + '.xlsx')
        elif file_extension.lower() == ".txt" or file_extension.lower(
        ) in ArqDOCX:
            import pandas as pd
            df = pd.read_csv(diretorio, header=None, delim_whitespace=True)
            df.to_excel(filename + '.xlsx', index=False, header=None)
    elif destino.lower() == "txt":
        if file_extension in ArqDOCX:
            import docx2txt
            text = docx2txt.process(diretorio)
            with open(filename + ".txt", "w") as file:
                print(text, file=file)
        elif file_extension.lower() == ".pdf":
            # extract plain text with pdfminer
            from io import StringIO
            from pdfminer.pdfparser import PDFParser
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfpage import PDFPage
            output_string = StringIO()
            with open(diretorio, 'rb') as in_file:
                parser = PDFParser(in_file)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            with open(filename + ".txt", "w") as final:
                final.write(output_string.getvalue())
        elif file_extension.lower() in ArqXLSX:
            import pandas as pd
            read_file = pd.read_excel(diretorio, header=None)
            read_file.to_csv(filename + ".txt", index=None, header=True)
    messagebox.showinfo(
        "Formato convertido", "Formato de ficheiro convertido com sucesso.\n\n" +
        file_extension[1:].upper() + " para " + destino.upper() +
        "\n\nSalvo em: " + out_file + "." + destino)
    root.destroy()
def _get_pdf_content(url, page_nums=[0]):
    """
    (str) --> list

    Downloads the .pdf resume and parses it.
    """
    # NOTE(review): Python 2 code -- relies on ``unicode``, ``file()``,
    # ``urllib.URLopener`` and the ``print`` statement.
    # NOTE(review): the ``page_nums`` parameter (a mutable default) is never
    # read; only ``maxpages = 1`` limits how much of the PDF is parsed.
    resume = urllib.URLopener()
    # Just in case, try opening the .pdf with
    # the url found
    try:
        if type(url) != str and type(url) != unicode:
            raise TypeError
    except TypeError:
        raise TypeError(bcolors.FAIL +
                        "Provided URL is bad type. URL must be a string" +
                        bcolors.ENDC)
    try:
        # If we are redirected, follow it
        r = requests.get(url)
        # Call the download file, "resume.pdf"
        resume.retrieve(r.url, "resume.pdf")
        content = ""
        # Use the PDFMiner package to grab the
        # text from the .pdf file
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        # Open the downloaded file here,
        # ---> 'rb' means hard read, regardless of the unicode
        pdf = file("resume.pdf", 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 1
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(pdf, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        # Store the text as a string here
        text = retstr.getvalue()
        # Lower the text, for easier parse
        text = text.lower()
        # Use find_skills function, to parse the
        # text as it would with the HTML
        lstSkills = _find_skills(text)
        os.remove("resume.pdf")
        # Return the string of Skills
        return lstSkills
    except:
        # If we run into an error, continue on, and
        # move onto the next try statement
        pass
    # We found that some people don't like to
    # add "www" with their http://, which we
    # then attempt to fix here
    try:
        # Add www. to the string here ("http://" is 7 characters)
        url = url[:7] + "www." + url[7:]
        #######################Try request
        # Do the same as the above try statement
        resume.retrieve(url, "resume.pdf")
        content = ""
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        pdf = file("resume.pdf", 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 1
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(pdf, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
        text = text.lower()
        lstSkills = _find_skills(text)
        return lstSkills
    except:
        pass
    # Finally try a non-redirected route, and
    # parse the .pdf as we did in the first
    # try statement
    try:
        # drop the "www." inserted above (characters 7..9)
        url = url[:7] + url[10:]
        resume.retrieve(url, "resume.pdf")
        content = ""
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        pdf = file("resume.pdf", 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 1
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(pdf, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
        text = text.lower()
        lstSkills = _find_skills(text)
        return lstSkills
    except:
        # Return an empty array, indicating that
        # we were unable to retrieve any
        # information from your resume
        print bcolors.FAIL + "Unable to parse pdf with given URL" + bcolors.ENDC
        return []
def create_candidates(path, sel_id, min_req, desire_req, remote_ind, scorer, model_type):
    """
    Build candidate records from resumes stored in the AWS S3 bucket ``rosev0``.

    For .pdf and .docx objects the bytes are processed in memory; .doc files
    are downloaded and converted with ``antiword`` (which may fail on oddly
    formatted files -- those resumes are skipped).

    model_type: '3c' runs three separate spaCy NER models (work / education /
    personal); 'whole' runs a single combined model.
    When ``scorer == 'scorer'`` each parsed resume is ranked via
    ``score_candidate``.

    Returns: (candidates, high_ind, low_ind, medium_ind) -- the candidate
    JSON list plus counts of high/low/medium-ranked candidates.
    """
    #connecting to AWS S3
    s3 = boto3.resource("s3", region_name='us-east-2',
                        aws_access_key_id=os.environ.get('AWS_KEY'),
                        aws_secret_access_key=os.environ.get('AWS_SECRET'))
    # loading models (spaCy NER models trained per resume section)
    work_nlp = spacy.load('selection/models/work')
    ed_nlp = spacy.load('selection/models/education')
    per_nlp = spacy.load('selection/models/personal')
    whole_nlp = spacy.load('selection/models/whole')
    rose_bucket = s3.Bucket(r'rosev0')
    low_ind = 0
    high_ind = 0
    medium_ind = 0
    candidates = []
    for resume in rose_bucket.objects.filter(Prefix=path):
        key = resume.key
        body = resume.get()['Body'].read()
        buffer = io.BytesIO()
        buffer.write(body)
        # extension decides which extraction path to use
        ext = re.search('\.[a-z]+$', key)
        #print(key)
        ###body comes in binary stream, we have to decode it
        if ext == None:
            continue
        elif ext.group() == '.docx':
            document = Document(buffer)
            text = "\n".join([paragraph.text for paragraph in document.paragraphs])
            #print(string)
        elif ext.group() == '.pdf':
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos=set()
            for page in PDFPage.get_pages(buffer, pagenos, maxpages=maxpages,
                                          password=password,caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            #fp.close()
            #device.close()
            #retstr.close()
            text = retstr.getvalue()
            #print(string)
        elif ext.group() == '.doc':
            #LINUX version handles damaged files and text in docs
            split = str(key).split('/')
            #replace special characters, linux problem reading path
            filename = str(split[-1]).replace('$','_').replace('#','_')
            pathdoc = 'selection/tmp/' + filename
            #print('trying download in ' + pathdoc)
            rose_bucket.download_file(key, pathdoc)
            #doc_text = os.system('antiword "' + pathdoc + '"')
            try:
                output = subprocess.check_output('antiword "' + pathdoc + '"', shell=True)
                text = output.decode('utf-8')
            except:
                # antiword failed (unsupported/damaged .doc) -> skip resume
                continue
        # NOTE(review): ``text`` keeps its value from a previous loop
        # iteration if this object's extension matched none of the branches.
        if text != None:
            d = {}
            results = []
            # first we create list for work classes, we need to procces them, get them together
            comp_work = []
            desig_work = []
            years_work = []
            other_work = []
            desig_ind = []
            years_ind = []
            idioms = []
            skills = []
            comb = []
            # then we do the same with college attributes
            grad_ed = []
            colleges = []
            degrees = []
            certif = []
            # then personal
            names = []
            locations = []
            mails = []
            phones = []
            if model_type == '3c':
                doc_work = work_nlp(text)
                doc_ed = ed_nlp(text)
                doc_per = per_nlp(text)
                #create array with entity text from algorithm inference
                for ent in doc_work.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'companies worked at':
                            comp_work.append(value)
                        elif ent.label_ == 'designation':
                            desig_work.append(value.lower())
                            desig_ind.append(ent.start_char)
                        elif ent.label_ == 'years of experience':
                            years_work.append(value)
                            years_ind.append(ent.start_char)
                        elif ent.label_ == 'idioms':
                            idioms.append(value.lower())
                        elif ent.label_ == 'skills':
                            skills.append(value.lower())
                        else:
                            other_work.append([ent.label_, value])
                        results.append([ent.label_, value, text.index(value)])
                for ent in doc_ed.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'graduation year':
                            grad_ed.append(value)
                        elif ent.label_ == 'college':
                            colleges.append(value.lower())
                        elif ent.label_ == 'degree':
                            degrees.append(value.lower())
                        elif ent.label_ == 'certifications':
                            certif.append(value.lower())
                        results.append([ent.label_, value, text.index(value)])
                for ent in doc_per.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'name':
                            names.append(value)
                        elif ent.label_ == 'location':
                            locations.append(value)
                        elif ent.label_ == 'mail':
                            mails.append(value)
                        elif ent.label_ == 'phone':
                            phones.append(value)
                        results.append([ent.label_, value, text.index(value)])
            elif model_type == 'whole':
                # single combined model covering all entity labels
                doc = whole_nlp(text)
                for ent in doc.ents:
                    for value in [ent.text]:
                        if ent.label_ == 'companies worked at':
                            comp_work.append(value)
                        elif ent.label_ == 'designation':
                            desig_work.append(value.lower())
                            desig_ind.append(ent.start_char)
                        elif ent.label_ == 'years of experience':
                            years_work.append(value)
                            years_ind.append(ent.start_char)
                        elif ent.label_ == 'idioms':
                            idioms.append(value.lower())
                        elif ent.label_ == 'skills':
                            skills.append(value.lower())
                        elif ent.label_ == 'location':
                            locations.append(value)
                        elif ent.label_ == 'mail':
                            mails.append(value)
                        elif ent.label_ == 'phone':
                            phones.append(value)
                        elif ent.label_ == 'name':
                            names.append(value)
                        elif ent.label_ == 'graduation year':
                            grad_ed.append(value)
                        elif ent.label_ == 'college':
                            colleges.append(value.lower())
                        elif ent.label_ == 'degree':
                            degrees.append(value.lower())
                        elif ent.label_ == 'certifications':
                            certif.append(value.lower())
                        else:
                            other_work.append([ent.label_, value])
                        results.append([ent.label_, value, text.index(value)])
            if scorer == 'scorer':
                cand_json, high, low, medium = score_candidate(comp_work, desig_work,
                                                               years_work, other_work,
                                                               desig_ind, remote_ind,
                                                               years_ind, idioms, skills,
                                                               comb, grad_ed, colleges,
                                                               degrees, certif, names,
                                                               locations, mails, phones,
                                                               min_req, desire_req, sel_id)
                # drop candidates with 5+ empty fields and a zero rank
                match = re.findall('(\[])', str(cand_json))
                if len(match) >= 5 and cand_json['info']['rank'] == 0:
                    pass
                else:
                    candidates.append(cand_json)
                # tally the rank buckets
                if high:
                    high_ind = high_ind + 1
                if low:
                    low_ind = low_ind + 1
                if medium:
                    medium_ind = medium_ind + 1
    return candidates, high_ind, low_ind, medium_ind
def extract_text_from_pdf(pdf_path): ''' Helper function to extract the plain text from .pdf files :param pdf_path: path to PDF file to be extracted (remote or local) :return: iterator of string of extracted text ''' # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/ if not isinstance(pdf_path, io.BytesIO): # extract text from local pdf file with open(pdf_path, 'rb') as fh: try: for page in PDFPage.get_pages(fh, caching=True, check_extractable=True): resource_manager = PDFResourceManager() fake_file_handle = io.StringIO() converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams()) page_interpreter = \ PDFPageInterpreter(resource_manager, converter) page_interpreter.process_page(page) text = fake_file_handle.getvalue() yield text # close open handles converter.close() fake_file_handle.close() except PDFSyntaxError: return else: # extract text from remote pdf file try: for page in PDFPage.get_pages(pdf_path, caching=True, check_extractable=True): resource_manager = PDFResourceManager() fake_file_handle = io.StringIO() converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams()) page_interpreter = PDFPageInterpreter(resource_manager, converter) page_interpreter.process_page(page) text = fake_file_handle.getvalue() yield text # close open handles converter.close() fake_file_handle.close() except PDFSyntaxError: return
def synthesize_pdf(
    pdf_file,
    json_file,
    dst_dir,
    max_fonts,
    max_pages,
    num_outputs_per_document,
    synthesizer_class,
):
    """Produce ``num_outputs_per_document`` synthesized variants of *pdf_file*.

    For each output index i still missing from *dst_dir*, rewrites every
    page's content stream via ``synthesizer_class`` and saves
    ``<json-stem>-<i>.pdf`` plus its ground-truth ``.json``.

    Raises:
        AlreadyProcessed -- every requested output already exists.
        TooManyPagesException / TooManyFontsException -- document exceeds
            the ``max_pages`` / ``max_fonts`` limits (0/None disables).
        NoTextException -- pdfminer extracted no non-whitespace text.

    NOTE(review): pdf_file/json_file appear to be pathlib.Path objects
    (read_bytes/read_text/stem are used) -- confirm with callers.
    """
    ground_truth = json.loads(json_file.read_text())
    pdf_io = BytesIO(pdf_file.read_bytes())
    output_string = StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, output_string, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    interpreter_fonts = {}

    def _out_path(_i, suffix):
        # output file <dst_dir>/<json-stem>-<i><suffix>
        return dst_dir / f'{json_file.stem}-{_i}{suffix}'

    # indices whose .pdf/.json pair does not exist yet
    k_to_process = []
    for i in range(num_outputs_per_document):
        if not (_out_path(i, '.pdf').exists() and _out_path(i, '.json').exists()):
            k_to_process.append(i)
    if not k_to_process:
        raise AlreadyProcessed(f'Already processed {pdf_file} {json_file}')
    # first pass: run pdfminer alongside pikepdf to collect fonts and text
    with pikepdf.Pdf.open(pdf_file) as pdf:
        if max_pages and len(pdf.pages) > max_pages:
            raise TooManyPagesException(
                f'Too many pages {len(pdf.pages)} > {max_pages} in PDF, skipping!'
            )
        for page_number, (page, miner) in enumerate(
                zip(pdf.pages, PDFPage.get_pages(pdf_io))):
            interpreter.process_page(miner)
            interpreter_fonts.update(interpreter.fontmap)
            if max_fonts and len(interpreter_fonts) > max_fonts:
                raise TooManyFontsException(
                    f'Too many fonts {len(interpreter_fonts)} > {max_fonts} in PDF, skipping!'
                )
    # bail out when the document contains no non-whitespace text at all
    if not re.sub(f'[{re.escape(string.whitespace)}]', '',
                  output_string.getvalue()):
        raise NoTextException('PDF does not have any text! Skipping')
    # pdfminer font keys are re-prefixed with '/' to match PDF name syntax
    font_map = {
        f'/{k}': Font(f'/{k}', v)
        for k, v in interpreter_fonts.items()
    }
    synthesizer = synthesizer_class(ground_truth, font_map)
    # second pass: synthesize new content streams, then write each variant
    with pikepdf.Pdf.open(pdf_file) as pdf:
        new_contents = collections.defaultdict(list)
        new_ground_truths = {}
        for i in k_to_process:
            for page_number, page in enumerate(pdf.pages):
                new_content_stream = parse_text(page, font_map, synthesizer)
                new_contents[i].append(
                    pdf.make_stream(
                        pikepdf.unparse_content_stream(new_content_stream)))
            new_ground_truths[i] = synthesizer.create_new_ground_truth()
            synthesizer.reset()  # fresh state per output variant
        for i in k_to_process:
            for page_number, page in enumerate(pdf.pages):
                page.Contents = new_contents[i][page_number]
            pdf.save(_out_path(i, '.pdf'))
            _out_path(i, '.json').write_text(
                json.dumps(new_ground_truths[i], indent=2))
def pdfToText(file, cid, catID, fileid, docType):
    """Extract text from the PDF open on *file* and store it in the DB.

    Three layouts are handled: 'Syllabus' groups text boxes around category
    keywords, 'Lecture' stores one chunk per page, and 'Assignment' splits
    text into numbered questions.  Each chunk is persisted through
    ``restructForDB`` / ``parsedToDB`` with (cid, catID, fileid).

    NOTE(review): uses the legacy pdfminer API (PDFDocument() + set_parser)
    -- confirm the pinned pdfminer version supports it.
    """
    #Create empty string for text to be extracted into
    keyWordObj = keywords.objects.exclude(categoryKeyWords=None)
    keyWords = []
    for n in keyWordObj:
        keyWords.append(n.categoryKeyWords)
    extracted_text = ''
    #Sets the cursor back to 0 in f to be parsed and sets the documents and parser
    file.seek(0)
    parser = PDFParser(file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    # doc.encode('utf-8')
    rsrcmgr = PDFResourceManager()
    #sets parameters for analysis
    laparams = LAParams()
    #Required to define separation of text within pdf
    laparams.char_margin = 1
    laparams.word_margin = 1
    #Device takes LAPrams and uses them to parse individual pdf objects
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    dataArray = []
    #testing PRP
    # botSearchArray = []
    if docType == 'Syllabus':
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                #if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    extracted_text = lt_obj.get_text()
                    filtText = restructString(extracted_text)
                    added = False
                    # NOTE(review): keyWords is mutated (remove) while being
                    # iterated, which skips the element after each removal.
                    for n in keyWords:
                        # long keyword found for the first time -> start a
                        # new section with this text box
                        if (n.lower() in filtText.lower()) and (
                                added == False) and (len(n) > 2):
                            dataArray.append(filtText)
                            keyWords.remove(n)
                            added = True
                        # short keywords (<3 chars) only match whole words
                        elif (len(n) < 3) and (re.match(
                                ('[a-zA-Z]*' + re.escape(n) + '[a-zA-Z]*'),
                                filtText) != None):
                            if (n in filtText) and (added == False):
                                dataArray.append(filtText)
                                keyWords.remove(n)
                                added = True
                        # keyword seen again in an already-added box -> just
                        # consume the keyword
                        elif (n.lower() in filtText.lower()) and (added == True):
                            keyWords.remove(n)
                    if (len(dataArray) > 0) and (added == False):
                        #checking for empty lines or lines with just a page number
                        if (filtText != ' <br>') and (re.match(
                                '[0-9]* <br>', filtText) == None) and (filtText != '<br>'):
                            # continuation text -> append to current section
                            dataArray[-1] += ' ' + filtText
                    elif len(dataArray) == 0:
                        # nothing matched yet -> open a default section
                        dataArray.append('Course Information: ' + filtText)
        for n in dataArray:
            restructForDB(n, cid, catID, fileid)
    elif docType == 'Lecture':
        # one accumulated text chunk per page
        for page in doc.get_pages():
            dataArray.append('')
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                #if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    extracted_text = lt_obj.get_text()
                    filtText = restructString(extracted_text)
                    #checking for empty lines or lines with just a page number
                    if (filtText != ' <br>') and (re.match(
                            '[0-9]* <br>', filtText) == None) and (filtText != '<br>'):
                        dataArray[-1] += '' + filtText
        for n in dataArray:
            restructForDB(n, cid, catID, fileid)
            # botSearchArray.append(restructForDB(n))
    elif docType == 'Assignment':
        #Sets the cursor back to 0 in f to be parsed and sets the documents and parser
        file.seek(0)
        parser = PDFParser(file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        #sets parameters for analysis
        laparams = LAParams()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        #num = False
        data = ""
        head = ""
        for page in doc.get_pages():
            # remember where this page's text starts in the shared buffer
            read_position = retstr.tell()
            interpreter.process_page(page)
            retstr.seek(read_position, 0)
            page_text = retstr.read()
            # normalise newlines and odd private-use glyphs
            page_text = re.sub('\\n', ' <br> ', page_text)
            page_text = re.sub('\\uf0b7|\\uf020|\\ufb01|\\uf8ff', '?', page_text)
            #print('CHECK: ' + page_text)
            for word in page_text.split():
                skip = False
                # short tokens like "1." mark the start of a new question
                if len(word) < 4:
                    for ind, c in enumerate(word):
                        if c.isdigit() and ind < len(word) - 1:
                            if word[ind + 1] == '.':
                                #print(data)
                                tempStr = data.replace('<br>', '')
                                while (tempStr[-1:] == ' '):
                                    tempStr = tempStr[:-1]
                                if tempStr != "":
                                    # flush the previous question to the DB
                                    data = head + ' ' + data
                                    parsedToDB(data, cid, catID, fileid)
                                head = 'Question ' + word
                                data = ""
                                #num = True
                                skip = True
                                break
                if skip == False:
                    if data == "":
                        data = word
                    else:
                        data = data + ' ' + word
        # flush the final question after all pages are read
        if data != "":
            #print(data)
            data = head + ' ' + data
            parsedToDB(data, cid, catID, fileid)
    file.close()
def convert_pdf_to_txt(self, skiprows):
    """Download the PDF named in ``self.objectj``, convert its tables to a
    CSV via tabula's ``read_pdf``, and optionally let the user edit the
    JSON options file interactively.

    :param skiprows: number of leading rows to skip, converted with int().
    NOTE(review): reads self.objectj / self.doc set elsewhere on the class.
    """
    try:
        # pull the job options out of the loaded JSON object
        self.url = str(self.objectj['object']['url'])
        self.list_columns = self.objectj['object']['columns']
        self.name_doc = self.objectj['object']['document_name']
        self.list_skiprows = self.objectj['object']['skiprows']
        print(self.url + "|\n URL\n")
        print(str(self.list_columns) + "|\n COLUMNS\n")
        print(self.name_doc + "\n Document Name\n")
        print(str(self.list_skiprows) + " \n Skiprows\n")
    except Exception as exobjason:
        # malformed options object -> abort the whole run
        print("Json Object Error: " + str(exobjason))
        time.sleep(5)
        quit()
    pdfname = r'.\_download_' + str(self.name_doc) + ".pdf"
    filename = wget.download(self.url, out=pdfname)
    iskiprows = int(skiprows)
    # NOTE(review): these pdfminer objects are created but never used below;
    # the conversion is done entirely by tabula's read_pdf.
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8-sig'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    path_pdf = open(filename, 'rb')
    pdfr = read_pdf(
        path_pdf,
        #guess=False,
        pages='all',
        pandas_options={
            'skiprows': iskiprows,
            'header': None
        },
        output_format="csv")
    #headers = ['NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'PHONE']
    #pdfr.columns = headers
    csvname = r'.\_generate_' + str(self.name_doc) + ".csv"
    pdfre = pdfr.to_csv(csvname)
    #pdfre = tabula.convert_into('.\ppp2.xlsx', "output.csv", output_format="csv")
    print("Head Csv File Generate\n" + str(pdfr.head()))
    print("Change Options in Json Object? ")
    print("-y Change options -enter pass ")
    optdocop = input(" Change Json Document Options? >:")
    if optdocop == "-y":
        url = input(" url >:")
        # NOTE(review): str has no .content() method -- this almost
        # certainly means .startswith('http://'); as written a non-empty
        # url raises AttributeError.
        if url != "" and str(url).content('http://'):
            self.objectj['object']['url'] = str(url)
            print("Change Url" + str(self.objectj['object']['url']))
        columns = input(" columns (separate fields with ,) >:")
        lcolumns = []
        lcolumns = str(columns).split(",")
        if columns != "":
            self.objectj['object']['columns'] = lcolumns
            print("Change columns" + str(self.objectj['object']['columns']))
        document_name = input(" document_name >:")
        if document_name != "":
            self.objectj['object']['document_name'] = str(document_name)
            print("Change document_name" + str(self.objectj['object']['document_name']))
        skiprows = input(" skiprows (separate fields with ,) >:")
        lskiprows = []
        lskiprows = str(skiprows).split(",")
        if skiprows != "":
            self.objectj['object']['skiprows'] = lskiprows
            print("Change skiprows" + str(self.objectj['object']['skiprows']))
        doctype = input(" doctype >:")
        if doctype != "":
            self.objectj['object']['doctype'] = str(doctype)
            print("Change doctype" + str(self.objectj['object']['doctype']))
        try:
            # write the edited options back to disk (keep a parsed copy)
            with open(self.doc, 'r') as filerin:
                self.objectj_copy = json.load(filerin)
            with open(self.doc, 'w') as filewin:
                json.dump(self.objectj, filewin, indent=5)
        except Exception as exchangejson:
            print("Error I-O Json file" + str(exchangejson))
            time.sleep(4)
        try:
            # reload the file and restart the object with the new options
            with open(self.doc, 'r') as filerin1:
                self.objectj = json.load(filerin1)
            print("NEW OBJECT\nURL: " + str(self.objectj['object']['url']))
            print("COLUMNS: " + str(self.objectj['object']['columns']))
            print("DOCUMENT NAME: " + str(self.objectj['object']['document_name']))
            print("SKIPROWS: " + str(self.objectj['object']['skiprows']))
            print("DOCUMENT_TYPE: " + str(self.objectj['object']['doctype']))
            self.__init__()
        except Exception as exchangejson:
            print("Error I-O Json file" + str(exchangejson))
# PyPDF2로 페이지 수 계산하기 filename = "north_korea_economic_growth.pdf" filepath = os.path.join(os.getcwd(), "data", filename) fp = open(filepath, 'rb') total_pages = PyPDF2.PdfFileReader(fp).numPages print(total_pages) # pdfminer로 페이지별 텍스트 가져오기 page_text = {} for page_no in range(total_pages): rsrcmgr = PDFResourceManager() retstr = StringIO() codec = 'utf-8' laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) fp = open(filepath, 'rb') password = None maxpages = 0 interpreter = PDFPageInterpreter(rsrcmgr, device) caching = True pagenos = [page_no] for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True): interpreter.process_page(page) page_text[page_no] = retstr.getvalue() fp.close() device.close()
def parsePDF(url): # Open the url provided as an argument to the function and read the content global data, data open = urllib2.urlopen(Request(url)).read() # Cast to StringIO object from StringIO import StringIO memory_file = StringIO(open) # Create a PDF parser object associated with the StringIO object parser = PDFParser(memory_file) # Create a PDF document object that stores the document structure document = PDFDocument(parser) # Define parameters to the PDF device objet rsrcmgr = PDFResourceManager() retstr = StringIO() laparams = LAParams() codec = 'utf-8' # Create a PDF device object device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) # Create a PDF interpreter object interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document for page in PDFPage.create_pages(document): interpreter.process_page(page) data = retstr.getvalue() # Get values for Iowa B100 prices reg = '(?<=\n---\n\n)\d.\d{2}-\d.\d{2}' data_u = ('u', data) matches = re.findall(reg, data) # Our data are contained in matches[0] # Compute the average # Extract value from previous regex low = re.search('\d.\d{2}(?=-)', matches[0]) lpos = low.pos high = re.search('(?<=-)\d.\d{2}', matches[0]) hpos = high.pos # Cast string variables to float type low_val = float(low.group(0)) high_val = float(high.group(0)) # Calculate the average #import numpy #value = [high_val, low_val] #print value.mean ave = (high_val + low_val) / 2 # Search the date of the report reg = '\w{3},\s\w{3}\s\d{2},\s\d{4}' match = re.search(reg, data) # Result is contained in matches[0] dat = match.group(0) # Cast to date format #import datetime #form = datetime.datetime.strptime(match.group(0), '%a, %b %d, %Y') #print form # http://stackoverflow.com/questions/9752958/how-can-i-return-two-values-from-a-function-in-python return (dat, ave)
def readpdf(mypdf): rsrcmgr = PDFResourceManager() sio = StringIO() codec = 'utf-8' laparms = LAParams() device = TextConverter(rsrcmgr,sio,codec = codec, laparms)
from os import path from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfinterp import PDFResourceManager, process_pdf input_file_path = "../../../../etc" print("-" * 40) input_file = path.join(input_file_path, "SampleDoc.pdf") print("Input file name:", input_file) res_mgr = PDFResourceManager() raw = StringIO() laparams = LAParams() device = TextConverter(res_mgr, raw, laparams=laparams) pdf_file = open(input_file, "rb") process_pdf(res_mgr, device, pdf_file) device.close() pdf_file.close() content = raw.getvalue() raw.close() print("-" * 40) print(content) print("-" * 40) if __name__ == '__main__':
def main(argv):
    """Command-line driver (pdf2txt-style): convert the PDF files listed in
    *argv* to text/HTML/XML/tag output according to the getopt flags.

    Returns 100 on a usage error, otherwise falls through (None) after
    processing every input file.
    """
    def usage():
        # print the usage banner and return the CLI error status
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    # NOTE(review): '-n' sets laparams to None, so a later '-A'/'-V'/'-M'...
    # on the same command line would raise AttributeError (option-order bug
    # inherited from pdf2txt.py).
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))  # CLI pages are 1-based
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None  # disable layout analysis entirely
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # no explicit -t: infer the output type from the file extension
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False  # never close stdout
    # build the converter device matching the requested output type
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                              outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               outdir=outdir, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    # convert every input file into the single output stream
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
def extract_text_from_pdf(pdf_path):
    """Extract the full text of a PDF invoice and print parsed fields.

    The vendor is detected by a marker string in the extracted text; one of
    four vendor-specific branches then pulls out PO number, date, name,
    address, invoice number, GST/PAN, totals, tax and item description via
    exact regex/replace patterns tuned to each vendor's layout.

    NOTE(review): every branch assumes its re.search() matches — a layout
    change would raise AttributeError on ``.group()``; confirm inputs are
    always the expected invoice formats.  All results are printed, nothing
    is returned.
    """
    # Render all pages of the PDF into an in-memory string via pdfminer.
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    converter.close()
    fake_file_handle.close()
    print(text)
    print('\n\n')
    if "Paramount Trading Corporation" in text:
        # --- Paramount Trading Corporation invoice layout ---
        PO = re.search("PO(.*?)Invoice", text)
        PO = PO.group()
        PO = PO.replace("PO Ref : ", " ")
        PO = PO.replace(" Invoice", " ")
        print(PO)
        # date is a fixed-width 8-character field after "Date :"
        date = re.search("Date :.{8}", text)
        date = date.group()
        date = date.replace("Date :", " ")
        print(date)
        name = "Paramount Trading Corporation"
        print(name)
        add = re.search("Billing Address (.*?)Date", text)
        add = add.group().replace("Billing Address ", " ").replace("Date", " ")
        print(add)
        inv = re.search("Invoice No(.*?)%", text)
        inv = inv.group().replace("Invoice No:- ", " ").replace("%", " ")
        print(inv)
        #cpan = re.search("Customer PAN (.*?)Ship",text)
        #cpan = cpan.group().replace("Customer PAN No"," ").replace("Ship"," ")
        #print(cpan)
        #cgst = re.search("Customer GST (.*?)Customer", text)
        #cgst = cgst.group().replace("Customer GST No"," ").replace("Customer"," ")
        #print(cgst)
        gst = re.search("GST No : (.*?)PAN", text)
        gst = gst.group().replace("GST No : ", " ").replace("PAN", " ")
        gst = gst.replace("Paramount Trading Corporation ", " ")
        print(gst)
        pan = re.search("PAN No : (.*?)Declaration", text)
        pan = pan.group().replace("PAN No : ", " ").replace("Declaration", " ")
        print(pan)
        # total/tax: grab 300 chars after "18%" and slice around the decimal
        # points — positions are layout-dependent (TODO confirm stability)
        total = re.search("18%.{300}", text)
        total = total.group().split(".")
        total = total[1][2:] + "." + total[2][:2]
        print(total)
        tax = re.search("18%.{300}", text)
        tax = tax.group().split(".")
        tax[0] = tax[0].replace("18%", " ")
        tax = tax[0] + "." + tax[1][:2]
        print(tax)
        # description: take the block up to "#8", then strip every known
        # boilerplate string plus the already-extracted fields
        des = re.search("Paramount Trading Corporation(.*?)#8", text)
        des = des.group()
        des = des.replace("Description", " ")
        des = des.replace("#8", " ")
        des = des.replace("Commercial Invoice", " ")
        des = des.replace("Shipping Method", " ")
        des = des.replace("Mode of Payment", " ")
        des = des.replace("Shipment Date", " ")
        des = des.replace("Hero MotoCorp Ltd.C/o", " ")
        des = des.replace(
            "The Grand New Delhi, Nelson Mandel Road, Vasant Kunj, Phase IINew Delhi, India. Pin - 110070",
            " ")
        des = des.replace("Contact : Avinash +919557971063", " ")
        des = des.replace("Total", " ")
        des = des.replace("Paramount Trading Corporation", " ")
        des = des.replace("Road", " ")
        des = des.replace("11th June 2019", " ")
        des = des.replace("Hero MotoCorp Ltd.", " ")
        des = des.replace("Customer PO Ref : ", " ")
        des = des.replace(PO, " ")
        des = des.replace("Invoice No:- ", " ")
        des = des.replace("GST No : ", " ")
        des = des.replace(gst, " ")
        des = des.replace("PAN No : ", " ")
        des = des.replace("%", " ")
        des = des.replace(
            "Declaration:We declare that this invoice shows the actual price of the goodsdescribed and that all particulars are true and correct.",
            " ")
        des = des.replace("Authorised Signatory", " ")
        des = des.replace("advance balance 60 ", " ")
        des = des.replace("against delivery", " ")
        des = des.replace(inv, " ")
        des = des.replace(pan, " ")
        des = des.replace("(round off)", " ")
        print(des)
    elif "SONATA" in text:
        # --- SONATA INFORMATION TECHNOLOGY LIMITED invoice layout ---
        PO = re.search("Cust PO Ref & Date(.*?)/", text)
        PO = PO.group().replace("Cust PO Ref & Date: ", " ").replace("/", " ")
        print(PO)
        date = re.search("Invoice Date: (.*?)BILL", text)
        date = date.group().replace("Invoice Date: ", " ").replace("BILL", " ")
        print(date)
        name = "SONATA INFORMATION TECHNOLOGY LIMITED"
        print(name)
        add = re.search("INVOICESONATA INFORMATION TECHNOLOGY LIMITED(.*?)TEL",
                        text)
        add = add.group().replace(
            "INVOICESONATA INFORMATION TECHNOLOGY LIMITED", " ").replace("TEL", " ")
        print(add)
        inv = re.search("Invoice No.:(.*?)Invoice", text)
        inv = inv.group().replace("Invoice No.:", " ").replace("Invoice", " ")
        print(inv)
        gst = re.search("GSTIN : (.*?)PAN", text)
        gst = gst.group().replace("GSTIN : ", " ").replace("PAN", " ")
        print(gst)
        pan = re.search("Our PAN is (.*?)and", text)
        pan = pan.group().replace("Our PAN is ", " ").replace("and", " ")
        print(pan)
        # keep only two decimal places of the total
        total = re.search("Total Invoice Value (.*?)of", text)
        total = total.group().split(".")
        total[0] = total[0].replace("Total Invoice Value ", " ")
        total = total[0] + "." + total[1][:2]
        print(total)
        tax = re.search("Total Tax Value(.*?)Total", text)
        tax = tax.group().replace("Total Tax Value", " ").replace("Total", " ")
        print(tax)
        des = re.search("Description of Goods/Services(.*?)Each", text)
        des = des.group()
        des = des.replace("Description of Goods/Services", " ")
        des = des.replace("Each", " ")
        des = des.replace("Qty", " ")
        des = des.replace("UOM", " ")
        des = des.replace("Rate", " ")
        des = des.replace("(INR)", " ")
        des = des.replace("Amount", " ")
        print(des)
    elif "Concoct Human Resources Practitioners India" in text:
        # --- Concoct Human Resources Practitioners India invoice layout ---
        # PO and date sit at fixed word offsets after the eWay Bill number
        PO = re.search("eWay Bill No#.{300}", text)
        PO = PO.group().split(" ")
        PO = PO[13]
        print(PO)
        date = re.search("eWay Bill No#.{300}", text)
        date = date.group().split(" ")
        date = date[12]
        print(date)
        name = "Concoct Human Resources Practitioners India"
        print(name)
        add = re.search("#(.*?)Proforma", text)
        add = add.group().replace("Proforma", " ")
        print(add)
        inv = re.search("Invoice No: (.*?)PAN", text)
        inv = inv.group().replace("Invoice No: ", " ").replace("PAN", " ")
        print(inv)
        gst = re.search("IGST No#:(.*?)IEC", text)
        gst = gst.group().replace("IGST No#:", " ").replace("IEC", " ")
        print(gst)
        pan = re.search("PAN No: (.*?)GSTIN", text)
        pan = pan.group().replace("PAN No: ", " ").replace("GSTIN", " ")
        print(pan)
        total = re.search("Total Inc. of GST @ 18%(.*?)Amount", text)
        total = total.group().replace("Total Inc. of GST @ 18%",
                                      " ").replace("Amount", " ")
        print(total)
        tax = "Not given separately"
        print(tax)
        des = re.search("Particulars(.*?)Total", text)
        des = des.group()
        des = des.replace("Particulars", " ")
        des = des.replace("Product", " ")
        des = des.replace("S/N", " ")
        des = des.replace("No# of Units", " ")
        des = des.replace("Price Per Unit", " ")
        des = des.replace("GST @ 18%", " ")
        des = des.replace("Amount", " ")
        des = des.replace("(INR)", " ")
        des = des.split(".")
        #des = re.findall("[a-z]",des)
        # print the text before "Unit" in every sentence that mentions it
        l = len(des)
        for i in range(0, l - 1):
            if "Unit" in des[i]:
                desi = des[i].split("Unit")
                desi = desi[0]
                print(desi)
    elif "MicroGenesis CADSoft" in text:
        # --- MicroGenesis CADSoft invoice layout ---
        PO = "Not given"
        print(PO)
        date = re.search("Despatched throughDated(.*?)Mode", text)
        date = date.group().replace("Despatched throughDated",
                                    " ").replace("Mode", " ")
        print(date)
        name = "MicroGenesis CADSoft"
        print(name)
        add = re.search("MicroGenesis CADSoft(.*?)MSMED", text)
        add = add.group().replace("MSMED", " ").replace(
            "MicroGenesis CADSoft Pvt Ltd", " ")
        print(add)
        inv = re.search("Invoice No.(.*?)Delivery", text)
        inv = inv.group().replace("Invoice No.", " ").replace("Delivery", " ")
        print(inv)
        gst = re.search("GSTIN/UIN:(.*?)State", text)
        gst = gst.group().replace("GSTIN/UIN:", " ").replace("State", " ")
        print(gst)
        pan = re.search("Company's PAN :(.*?)Dec", text)
        pan = pan.group().replace("Company's PAN :", " ").replace("Dec", " ")
        print(pan)
        total = re.search("Total₹(.*?)No", text)
        total = total.group().replace("Total", " ").replace("No", " ")
        print(total)
        tax = re.search("IGST @ 18%(.*?)%", text)
        tax = tax.group().replace("IGST @ 18%", " ").replace("%", " ")
        print(tax)
        des = re.search("SACNo.Services(.*?)No", text)
        des = des.group().replace("SACNo.Services", " ").replace("No", " ")
        print(des)
def main(argv):
    """Command-line driver (extended variant with image extraction, page
    rotation and control-char stripping) for converting PDFs via pdfminer.

    Returns 100 on usage error, otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # CLI page numbers are 1-based; pdfminer expects 0-based
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A' and laparams is not None:
            laparams.all_texts = True
        elif k == '-V' and laparams is not None:
            laparams.detect_vertical = True
        elif k == '-M' and laparams is not None:
            laparams.char_margin = float(v)
        elif k == '-L' and laparams is not None:
            laparams.line_margin = float(v)
        elif k == '-W' and laparams is not None:
            laparams.word_margin = float(v)
        elif k == '-F' and laparams is not None:
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    #
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            # infer output type from the output file extension
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # BUGFIX: the Python 2 builtin file() no longer exists; use open()
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                # with-block ensures the input is closed even if a page fails
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        # apply the user-requested rotation to every page
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # BUGFIX: the original closed outfp unconditionally, which would
        # close sys.stdout when no -o file was given
        if close_outfp:
            outfp.close()
    return
def split(path):
    """Split a multi-page student-report PDF into one anonymized PDF per page.

    Pass 1 (pdfminer): extract each page's text and parse out the student
    number (between "Student ID:" and "Parent") and name (between
    "Report for" and "School").
    Pass 2 (PyPDF2): write page N to a file named with a fresh UUID under the
    module-level ``reportdir``; the DB UPDATE mapping UUID -> student number
    is present but currently commented out.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    stu_numbers = []
    stu_names = []
    indices = []
    # NOTE(review): the DB connection/cursor are only used by the
    # commented-out UPDATE below; credentials are placeholders.
    dsn_tns = cx_Oracle.makedsn('connection', 'database ssid')
    db = cx_Oracle.connect(user='******', password='******', dsn=dsn_tns)
    cursor = db.cursor()  # assign db operation to cursor variable
    sql = '''UPDATE u_studentsuserfields u set u.ausd_hashkey = :hashid
             where u.studentsdcid = (SELECT s.dcid from students s
             where s.dcid = u.studentsdcid
             and s.student_number = :stu_numid) '''
    # begin the process extracting text data from reports
    anum = "Student ID:"   # begin text search param
    bnum = "Parent"        # end text search param
    aname = "Report for"
    bname = "School"
    count = 0
    # --- Pass 1: PDF text extraction ---
    with open(path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            indices.append(count)
            page_interpreter.process_page(page)
            count += 1
            # StringIO accumulates across pages; split(...)[-1] therefore
            # always picks up the most recent page's fields
            text = fake_file_handle.getvalue()
            # Number processing
            stu_num = text.split(anum)[-1].split(bnum)[0]
            stu_num = str(stu_num)
            stu_num = stu_num.strip()
            stu_num = stu_num.replace(" ", "")
            stu_numbers.append(stu_num)
            # Name processing (strip NULs and boilerplate)
            stu_name = text.split(aname)[-1].split(bname)[0]
            stu_name = str(stu_name)
            stu_name = stu_name.strip()
            stu_name = stu_name.replace("/x00", "")
            stu_name = stu_name.replace("\x00", "")
            stu_name = stu_name.replace("Student", "")
            stu_names.append(stu_name)
            print(stu_num + " " + stu_name + "'s report has been parsed")
    converter.close()
    fake_file_handle.close()
    # --- Pass 2: PDF write-out, one file per page ---
    pdf_file_obj = open(path, 'rb')
    try:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
        num_pages = pdf_reader.getNumPages()
        count2 = 0
        # BUGFIX: the original wrapped this while-loop in a redundant
        # "for page in range(getNumPages())" whose later iterations were
        # no-ops; a single while over the pages is equivalent.
        while count2 < num_pages:
            pageobj = pdf_reader.getPage(count2)
            cur_stunum = stu_numbers[count2]
            pdf_writer = PyPDF2.PdfFileWriter()
            pdf_writer.addPage(pageobj)
            ausdhash = str(uuid.uuid4())
            # params = {'hashid': ausdhash, 'stu_numid': cur_stunum}
            # cursor.execute(sql, params)
            output_filename = '{}.pdf'.format(ausdhash)
            outdir = os.path.join(reportdir, output_filename)
            count2 += 1
            with open(outdir, "wb") as out:
                pdf_writer.write(out)
            print("generated " + cur_stunum + " " + output_filename)
    finally:
        # BUGFIX: the source PDF handle was never closed
        pdf_file_obj.close()
def extractAccountingPolicySection(path):
    """Extract the "significant accounting policies" section from an annual
    report PDF (Chinese text).

    Pages before page 40 are skipped.  Text is scanned line by line: the
    section title (a line containing both "会计政策" and "会计估计") starts
    candidate collection, the subtitle "遵循企业会计" marks the real section
    body, and the taxation heading "税项" ends it.

    Returns the accumulated section text (may be "" if never found).
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        extracted = ""
        foundTitle = 0      # 1 once a candidate title line has been seen
        ifInSection = 0     # 1 once inside the target section body
        pageCount = 1
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            # the section never appears before page 40, so skip early pages
            if pageCount >= 40:
                interpreter.process_page(page)
                # Get all text in the current page
                data = retstr.getvalue()
                for line in data.splitlines():
                    # if in the section of accounting disclosure
                    if ifInSection == 1:
                        if line.strip():
                            # if we reach the taxation section, we are done
                            if "税项" in line:
                                return extracted
                            extracted += line
                    else:
                        if line.strip():
                            # if keywords "accounting policy" present
                            if "会计政策" in line and "会计估计" in line:
                                # a second title before entering the section
                                # means the first hit was a false positive:
                                # discard what was collected so far
                                if foundTitle == 1 and ifInSection == 0:
                                    extracted = ""
                                    foundTitle = 0
                                foundTitle = 1
                                extracted += line
                            # subtitle of the target section
                            elif "遵循企业会计" in line:
                                ifInSection = 1
                                extracted += line
                data = ''
                # reset the buffer so each page is scanned in isolation
                retstr.truncate(0)
                retstr.seek(0)
            pageCount = pageCount + 1
        return extracted
    finally:
        # BUGFIX: the early return inside the loop used to leak fp, device
        # and retstr; finally guarantees cleanup on every exit path
        fp.close()
        device.close()
        retstr.close()
def __init__(self, pdf_stream, password='', pagenos=None, maxpages=0):
    """Read a PDF stream: collect document + XMP metadata, extract all page
    text, and gather references (annotation links, URLs, arXiv ids, DOIs).

    :param pdf_stream: binary file-like object positioned at the PDF start
    :param password: document password, if any
    :param pagenos: iterable of 0-based page numbers to process, or None for
        all pages (BUGFIX: was a mutable default ``[]``)
    :param maxpages: stop after this many pages (0 = no limit)
    """
    ReaderBackend.__init__(self)
    if pagenos is None:
        pagenos = []
    self.pdf_stream = pdf_stream

    # Extract Metadata from the document info dictionary
    parser = PDFParser(pdf_stream)
    doc = PDFDocument(parser, password=password, caching=True)
    if doc.info:
        for k in doc.info[0]:
            v = doc.info[0][k]
            # print(repr(v), type(v))
            if isinstance(v, (bytes, str, unicode)):
                self.metadata[k] = make_compat_str(v)
            elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                self.metadata[k] = make_compat_str(v.name)

    # Secret Metadata: raw XMP stream from the catalog, if present
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        # print(metadata)  # The raw XMP metadata
        # print(xmp_to_dict(metadata))
        self.metadata.update(xmp_to_dict(metadata))
        # print("---")

    # Extract Content: render every selected page into an in-memory buffer
    text_io = BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    converter = TextConverter(rsrcmgr, text_io, codec="utf-8",
                              laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, converter)
    self.metadata["Pages"] = 0
    self.curpage = 0
    for page in PDFPage.get_pages(self.pdf_stream, pagenos=pagenos,
                                  maxpages=maxpages, password=password,
                                  caching=True, check_extractable=False):
        # Read page contents
        interpreter.process_page(page)
        self.metadata["Pages"] += 1
        self.curpage += 1
        # Collect URL annotations
        # try:
        if page.annots:
            refs = self.resolve_PDFObjRef(page.annots)
            if refs:
                if isinstance(refs, list):
                    for ref in refs:
                        if ref:
                            self.references.add(ref)
                elif isinstance(refs, Reference):
                    self.references.add(refs)
        # except Exception as e:
        #     logger.warning(str(e))

    # Remove empty metadata entries
    self.metadata_cleanup()

    # Get text from stream
    self.text = text_io.getvalue().decode("utf-8")
    text_io.close()
    converter.close()
    # print(self.text)

    # Extract URL references from text
    for url in extractor.extract_urls(self.text):
        self.references.add(Reference(url, self.curpage))
    for ref in extractor.extract_arxiv(self.text):
        self.references.add(Reference(ref, self.curpage))
    for ref in extractor.extract_doi(self.text):
        self.references.add(Reference(ref, self.curpage))