def __parse_word(self, pdf_fullpath):
    """Extract the text of one PDF into ``<self.filedir>/<basename>.txt``.

    Returns a result dict: errCode 0 with per-object statistics on success,
    or errCode 2001 when the PDF has no extractable horizontal text (e.g. a
    scanned, image-only document).
    """
    # BUG FIX: the PDF handle was opened with a bare open() and never closed.
    with open(pdf_fullpath, 'rb') as fp:
        # Build the document from the parser (pdfminer.six API).
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        # Bail out early when the document forbids text extraction.
        if not doc.is_extractable:
            return {
                'errCode': 2001,
                'result': {
                    'message': 'PDF文件不支持文字版',
                    'pdf_fullpath': pdf_fullpath
                }
            }
        # Resource manager shares fonts/images across pages.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Counters: pages, images, curves, figures, horizontal text boxes.
        num_page = num_image = num_curve = num_figure = num_TextBoxHorizontal = 0
        # Output file: input basename without its extension, under self.filedir.
        file_without_suffix = os.path.basename(
            pdf_fullpath[:str(pdf_fullpath).rfind('.')])
        txt_path = self.filedir + '/%s.txt' % file_without_suffix
        # BUG FIX: the txt file used to be reopened in append mode once per
        # text box; now it is opened lazily, once, and closed reliably.
        out_f = None
        try:
            for page in PDFPage.create_pages(doc):
                num_page += 1
                interpreter.process_page(page)
                layout = device.get_result()  # LTPage for this page
                for x in layout:
                    if isinstance(x, LTImage):
                        num_image += 1
                    if isinstance(x, LTCurve):
                        num_curve += 1
                    if isinstance(x, LTFigure):
                        num_figure += 1
                    if isinstance(x, LTTextBoxHorizontal):
                        num_TextBoxHorizontal += 1
                        if out_f is None:
                            out_f = open(txt_path, 'a', encoding='utf-8')
                        out_f.write(x.get_text())
                        out_f.write('\n')
        finally:
            if out_f is not None:
                out_f.close()
    if num_TextBoxHorizontal <= 0:
        # No horizontal text boxes at all -> treat as non-text PDF.
        return {
            'errCode': 2001,
            'result': {
                'message': 'PDF文件不支持文字版',
                'pdf_fullpath': pdf_fullpath
            }
        }
    return {
        'errCode': 0,
        'result': {
            '对象数量': num_figure,
            '页面数': num_page,
            '图片数': num_image,
            '曲线数': num_curve,
            '水平文本框': num_TextBoxHorizontal
        }
    }
def parse():
    """Extract the text of a fixed local PDF, print it and append it to test.txt.

    Uses the legacy pdfminer API (no-argument PDFDocument(),
    doc.set_parser / doc.initialize / doc.get_pages).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # BUG FIX: the input file was opened but never closed; use a context manager.
    with open('半监督模糊聚类及其应用_杨昔阳.pdf', 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Wire parser and document together (legacy two-way linkage).
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password on this document, so pass an empty string
        # (a protected file would need e.g. doc.initialize("lianxipython")).
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        resource = PDFResourceManager()
        laparams = LAParams()
        # Aggregator collects the laid-out objects of each page.
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Anything exposing get_text() carries extractable text.
                if hasattr(out, "get_text"):
                    print(out.get_text())
                    with open('test.txt', 'a') as f:
                        f.write(out.get_text() + '\n')


if __name__ == '__main__':
    parse()
lines=[line.rstrip('\n') for line in map_file] #for each article for line in lines: # copy filename from pdf document filename=line.replace('.pdf','.xml') # open new xml file xfp = open('xml_data/' + filename, 'w') # open pdf file fp = open('fichedir/'+line, 'rb') # set pdfminer resources rsrcmgr = PDFResourceManager() laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) pages = PDFPage.get_pages(fp) # counter current_page = 1 xfp.write('<pages>') for page in pages: # output monitor if(current_page==1): print current_row, ' ', filename # markup page-level data xfp.write("<page>\n")
def main(argv):
    """Parse each PDF named on the command line with pdfminer and feed the
    page layouts to x86ManParser, which writes HTML into the html/ directory.

    Returns 1 when a document forbids text extraction, None otherwise.
    """
    for arg in argv[1:]:
        fd = open(arg, 'rb')
        parser = PDFParser(fd)
        document = PDFDocument(parser)
        if not document.is_extractable:
            print("Document not extractable.")
            fd.close()  # BUG FIX: the file was leaked on this early exit
            return 1
        params = LAParams(char_margin=1)
        resMan = PDFResourceManager(caching=True)
        device = PDFPageAggregator(resMan, laparams=params)
        interpreter = PDFPageInterpreter(resMan, device)
        # Renamed from "parser", which shadowed the PDFParser bound above.
        man_parser = x86ManParser("html", params)
        i = 1
        for page in PDFPage.get_pages(fd, set(), caching=True,
                                      check_extractable=True):
            print("Processing page %i" % i)
            interpreter.process_page(page)
            page = device.get_result()
            man_parser.process_page(page)
            i += 1
        man_parser.flush()
        fd.close()
        # RG: the whole parse lands in a single file (html/AAA.html). The
        # original repo apparently post-processed this by hand into
        # per-instruction pages plus an index.html (splitting on <h3>, adding
        # closing tags and a style.css). Descriptions come out as <p>, <table>
        # and <svg>; the published site only shows <p>, so clean-up was done.
        # Given the known issues (e.g. footnotes in MOV) it may be better to
        # start from pdf2txt/pdftohtml/pdf2htmlEX output; pdf2htmlEX produced
        # the best-looking pages, and unilex-transcript
        # (https://github.com/fmalina/unilex-transcript) promises clean
        # semantic HTML from it. PDF parsing is like unscrambling scrambled
        # eggs...
        # NOTE_: success/fail are never incremented anywhere, so this always
        # prints 0/0 -- they are effectively dead variables.
        print("Conversion result: %i/%i" %
              (man_parser.success, man_parser.success + man_parser.fail))
def arc():
    """Convert the selected file (*diretorio* -> *filename*) to the format
    chosen in the *formato* widget, show a confirmation box and close the UI.

    Relies on module-level state: formato, diretorio, filename,
    file_extension, ArqDOCX, ArqPPT, ArqXLSX, messagebox, root.
    """
    # Map the UI label onto a target extension.
    destino = str(formato.get())
    if destino == "Arquivo do Word":
        destino = "docx"
    if destino == "Arquivo do Power-Point":
        destino = "ppt"
    if destino == "Arquivo do Excel":
        destino = "xlsx"
    if destino == "Arquivo de Texto":
        destino = "txt"
    import win32com.client as win32
    from os import path
    in_file = path.abspath(diretorio)
    out_file = path.abspath(filename)
    if destino == "docx":
        # Word/PDF/TXT -> docx via Word COM automation.
        if file_extension in ArqDOCX or file_extension.lower(
        ) == ".pdf" or file_extension.lower() == ".txt":
            word = win32.DispatchEx("Word.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=16)  # 16 = wdFormatDocumentDefault
            doc.Close()
            word.Quit()
    elif destino.lower() == "pdf":
        # NOTE(review): the label mapping above never yields "pdf", so this
        # branch is reachable only if formato.get() returns "pdf" directly.
        if file_extension.lower() in ArqPPT:
            word = win32.DispatchEx("PowerPoint.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Presentations.Open(in_file)
            doc.SaveAs(out_file, FileFormat=32)  # 32 = ppSaveAsPDF
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqXLSX:
            word = win32.DispatchEx("Excel.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Workbooks.Open(in_file)
            doc.ExportAsFixedFormat(0, out_file)  # 0 = xlTypePDF
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqDOCX or file_extension.lower(
        ) == ".txt":
            # BUG FIX: was win32com.client.Dispatch(...) -> NameError, since
            # the module is imported above under the alias "win32".
            word = win32.Dispatch('Word.Application')
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            # BUG FIX: was SaveAs(in_file, ...), which wrote the generated PDF
            # over the input path instead of the chosen output file.
            doc.SaveAs(out_file, FileFormat=17)  # 17 = wdFormatPDF
            doc.Close()
            word.Quit()
    elif destino.lower() == "xlsx":
        if file_extension.lower() == ".pdf":
            import pdftables_api
            # SECURITY: hard-coded third-party API key -- move to config.
            c = pdftables_api.Client('to7jluln0hvr')
            c.xlsx(diretorio, filename + '.xlsx')
        elif file_extension.lower() == ".txt" or file_extension.lower(
        ) in ArqDOCX:
            import pandas as pd
            df = pd.read_csv(diretorio, header=None, delim_whitespace=True)
            df.to_excel(filename + '.xlsx', index=False, header=None)
    elif destino.lower() == "txt":
        if file_extension in ArqDOCX:
            import docx2txt
            text = docx2txt.process(diretorio)
            with open(filename + ".txt", "w") as file:
                print(text, file=file)
        elif file_extension.lower() == ".pdf":
            from io import StringIO
            from pdfminer.pdfparser import PDFParser
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfpage import PDFPage
            output_string = StringIO()
            with open(diretorio, 'rb') as pdf_in:
                parser = PDFParser(pdf_in)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, output_string,
                                       laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            with open(filename + ".txt", "w") as final:
                final.write(output_string.getvalue())
        elif file_extension.lower() in ArqXLSX:
            import pandas as pd
            read_file = pd.read_excel(diretorio, header=None)
            read_file.to_csv(filename + ".txt", index=None, header=True)
    messagebox.showinfo(
        "Formato convertido",
        "Formato de ficheiro convertido com sucesso.\n\n" +
        file_extension[1:].upper() + " para " + destino.upper() +
        "\n\nSalvo em: " + out_file + "." + destino)
    root.destroy()
def parse_webpages(webpages):
    """Crawl *webpages*: honour robots.txt, collect outlinks, download and
    extract linked txt/md/pdf documents, and register the result via
    Crawling_L_REST.

    Returns the output dict of the first robots-allowed page.
    """
    for page in webpages:
        # Check robots.txt before touching the page.
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if (robots.allowed(page, '*')):
            # All sitemaps declared for this site.
            sitemaps_list = list(robots.sitemaps)
            html = requests.get(page)  # raw HTML of the page
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all anchor tags
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # documents found on the page
            for fname in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + fname
                # Can be expanded to other file types with a comma.
                if fname.endswith(('txt', 'md')):
                    # Absolute URLs are fetched as-is; relative ones are
                    # resolved against the page directory.
                    target = fname if fname.startswith(('http://', 'www.')) else link
                    text = bs4.BeautifulSoup(
                        requests.get(target).text, "html.parser")
                    ext = target.rsplit(".", 1)[-1]
                    docs.append([target, ext, text])
                elif fname.endswith(('pdf')):  # special case for PDFs
                    x = fname
                    try:
                        pdf = fname.rsplit("/", 1)[-1]
                        if fname.startswith(('http://', 'www.')):
                            response = urlopen(fname)
                        else:
                            # Must first check whether the pdf is reachable.
                            response = urlopen(link)
                    except urllib.error.HTTPError:
                        # On a 404 record "404" as the document text.
                        docs.append([link, "pdf", "404"])
                    else:
                        # Save the pdf locally so pdfminer can read it.
                        # BUG FIX: file handles are now closed via "with".
                        with open(pdf, 'wb') as fh:
                            fh.write(response.read())
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        with open(pdf, 'rb') as fh:
                            parser = PDFParser(fh)
                            document = PDFDocument(parser)
                            rsrcmgr = PDFResourceManager()
                            laparams = LAParams()
                            device = PDFPageAggregator(rsrcmgr,
                                                       laparams=laparams)
                            interpreter = PDFPageInterpreter(rsrcmgr, device)
                            for p in PDFPage.create_pages(document):
                                interpreter.process_page(p)
                                layout = device.get_result()
                                # Only LTTextBox / LTTextLine carry text.
                                for lt_obj in layout:
                                    if isinstance(lt_obj, LTTextBox) or isinstance(
                                            lt_obj, LTTextLine):
                                        txt += lt_obj.get_text()
                        os.remove(pdf)  # remove the saved file when done
                        docs.append([link, "pdf", txt])
            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }
            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)
            # NOTE(review): the original returned here, inside the loop, so
            # only the first allowed page is processed -- confirm intent.
            return output
def get_text_box(pdf_path):
    """Return, for every page of *pdf_path*, its text regions and page size.

    Each list element has the shape::

        {
            "region": [  # one entry per text block found by parse_obj
                {"cordinate": [x0, y0, x1, y1], "text": "..."},
                ...
            ],
            "media_box": [x1, y1],  # page width/height from the MediaBox
        }

    Coordinates are relative to the page origin (x0=0, y0=0).

    Raises:
        PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    # BUG FIX: the file handle was never closed; use a context manager.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # (The original also built an unused PDFDevice here -- dead code,
        # removed: it was immediately overwritten by the aggregator.)
        laparams = LAParams()
        # PDF page aggregator collects the laid-out objects per page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        list_all_box = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Page width/height taken from the MediaBox upper corner.
            media_texbox = (int(page.mediabox[2]), int(page.mediabox[3]))
            MEDIA_Y1 = int(page.mediabox[3])
            sub_box = parse_obj(layout._objs, MEDIA_Y1)
            list_all_box.append({
                'region': sub_box,
                'media_box': media_texbox,
            })
    return list_all_box
def main(argv):
    """pdf2txt-style command-line driver.

    Parses the command line, configures layout analysis, picks an output
    converter (shape/text/html/xml/tag) and runs every given PDF through it.
    """
    # Defaults; most are overwritten from the parsed arguments below.
    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    using_optparse = False
    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    if using_optparse:
        # Legacy escape hatch: emulate the argparse API on top of optparse.
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()
    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument('-p', '--pages', dest='pagenos', action='store',
                        type=str, default='',
                        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store',
                        type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument('-t', '--type', dest='outtype', action='store',
                        type=str, default='shape',
                        choices=['text', 'html', 'xml', 'tag', 'shape'],
                        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument('-m', dest='maxpages', action='store', type=int,
                        default=0,
                        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    parser.add_argument('-P', '--password', dest='password', action='store',
                        type=str, default='',
                        help='Provides the user password to access PDF contents.')
    parser.add_argument('-o', '--output', dest='outfile', action='store',
                        type=str, default=None,
                        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument('-C', '--no-caching', dest='caching',
                        action='store_false', default=True,
                        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout',
                        action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno',
                        action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument('-A', '--analyze-all', dest='all_texts',
                        action='store_true', default=False,
                        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical',
                        action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument('-M', dest='char_margin', action='store', type=float,
                        default=2.0,
                        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    parser.add_argument('-L', dest='line_margin', action='store', type=float,
                        default=0.5,
                        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument('-W', dest='word_margin', action='store', type=float,
                        default=0.1,
                        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    parser.add_argument('-F', dest='boxes_flow', action='store', type=float,
                        default=0.5,
                        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    parser.add_argument('-Y', '--layout-mode', dest='layoutmode',
                        action='store', type=str, default='normal',
                        choices=['exact', 'normal', 'loose'],
                        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter',
                        action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store',
                        type=int, default=0, help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol',
                        action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument('-s', dest='scale', action='store', type=float,
                        default=1,
                        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument('--draw-lines', dest='draw_lines',
                        action='store_true',
                        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument('--draw-boxes', dest='draw_boxes',
                        action='store_true',
                        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument('--draw-blocks', dest='draw_blocks',
                        action='store_true',
                        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument('--shear-limit', dest='shear_limit', action='store',
                        default=0.1, type=float,
                        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    parser.add_argument('--rotation-limit', dest='rotation_limit',
                        action='store', default=2, type=float,
                        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    parser.add_argument('--line-height-diff', dest='line_height_diff',
                        action='store', type=float, default=0.1,
                        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before',
                        action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after',
                        action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument('--box-separator', dest='box_separator',
                        action='store', type=str, default=r'\n\n',
                        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--block-separator', dest='block_separator',
                        action='store', type=str, default=r'\n\n',
                        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-separator', dest='indent_separator',
                        action='store', type=str, default=r'\n\n',
                        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-string', dest='indent_string',
                        action='store', type=str, default=r'\t',
                        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-limit', dest='indent_limit', action='store',
                        type=float, default=3,
                        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    parser.add_argument('--page-separator', dest='page_separator',
                        action='store', type=str, default=r'\n\n',
                        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--norm-whitespace', dest='norm_whitespace',
                        action='store_true', default=False,
                        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument('--print-stats', dest='print_stats',
                        action='store_true', default=False,
                        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument('--max-blocks', dest='max_blocks', action='store',
                        default=0, type=int,
                        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument('--max-textlines', dest='max_textlines',
                        action='store', default=0, type=int,
                        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument('--line-height-method', dest='line_height_method',
                        action='store', type=str, default='bbox',
                        choices=['bbox', 'mean', 'median'],
                        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile', help='List of PDF files to go through',
                        default=None, nargs='+')
    args, rest = parser.parse_known_args()
    global debuglevel
    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)
    if args.pagenos:
        # The command line is 1-based; pdfminer page numbers are 0-based.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale
    # Turn the literal \n / \t escapes typed on the command line into the
    # real characters.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)
    global options
    options = args
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
    if outfile:
        # Infer the output type from the output file extension.
        if outfile.endswith('.htm') or outfile.endswith('.html'):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'
        # BUG FIX: the Python 2 builtin file() no longer exists -- use open().
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    if outtype == 'shape':
        device = ShapeTextConverter(rsrcmgr, outfp, codec=codec,
                                    laparams=laparams, showpageno=showpageno,
                                    imagewriter=imagewriter)
    elif outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in options.pdffile:
        DEBUG(2, 'processing', fname)
        # BUG FIX: file() -> open() here as well.
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    # BUG FIX: do not close sys.stdout when no output file was given.
    if outfp is not sys.stdout:
        outfp.close()
    DEBUG(2, 'finished.')
    return
def get_text_from_pdf(path, page_nums=None):
    """Extract positioned words from a PDF as a list of TextItem records.

    Each text line is split into words with coordinates interpolated evenly
    across the line. Y-coordinates are flipped top-down (the page height is
    assumed to be 1400 -- TODO confirm), then items on nearly the same
    baseline are snapped together over five merge passes.

    :param path: path of the PDF file to read.
    :param page_nums: optional iterable of 0-based page numbers to process.
    :return: sorted list of TextItem objects.
    """
    r = []
    fp = open(path, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp, pagenos=page_nums)

    def parse_obj(lt_objs):
        # https://stackoverflow.com/questions/31819862/python-pdf-mining-get-position-of-text-on-every-line
        # Emit one TextItem per word; recurse into containers.
        for obj in lt_objs:
            if isinstance(obj, LTTextLine):
                x1, y1, x2, y2 = obj.bbox
                assert x1 < x2
                assert y1 < y2
                # Flip to top-down coordinates; 1400 is an assumed page
                # height -- TODO confirm for other page sizes.
                y1 = 1400 - y1
                y2 = 1400 - y2
                y1, y2 = y2, y1
                text = obj.get_text()
                width = obj.width
                height = obj.height
                text = text.replace('東久留米武蔵村山',
                                    '東久留米 武蔵村山')  # HACK!
                for line_i, line in enumerate(
                        text.split('\n')):  # CHECK WHETHER THIS IS NEEDED!
                    for word_j, word in enumerate(line.split()):
                        # Interpolate each word's box across the line.
                        each_height = height / text.count('\n')
                        i_y1 = y1 + each_height * line_i
                        i_y2 = y2 + each_height * (line_i + 1)
                        each_width = width / len(line.split())
                        i_x1 = x1 + each_width * word_j
                        i_x2 = x2 + each_width * (word_j + 1)
                        r.append(
                            TextItem(text=word,
                                     x1=i_x1,
                                     y1=i_y1,
                                     x2=i_x2,
                                     y2=i_y2,
                                     width=each_width,
                                     height=each_height))
            # If it's a textbox, also recurse.
            if isinstance(obj, LTTextBoxHorizontal):
                parse_obj(obj._objs)
            # If it's a container, recurse.
            elif isinstance(obj, LTFigure):
                parse_obj(obj._objs)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                parse_obj(lobj)
    fp.close()  # BUG FIX: the input file was never closed

    # Five passes of snapping near-identical baselines together.
    for xx in range(5):
        dists = []
        for x in range(len(r)):
            for y in range(len(r)):
                dists.append((abs(r[x].y1 - r[y].y1), x, y))
        merged = set()
        for dist, x, y in sorted(dists):
            text_item_1 = r[x]
            text_item_2 = r[y]
            # Numeric-ish items (digits plus ,() punctuation) are only merged
            # with other numeric-ish items, and vice versa.
            text_1_num = all(i.isnumeric() or i in ',()'
                             for i in text_item_1.text.strip())
            text_2_num = all(i.isnumeric() or i in ',()'
                             for i in text_item_2.text.strip())
            if not dist:
                continue
            elif text_1_num != text_2_num:
                continue
            elif y in merged:
                continue
            merged.add(y)
            if dist <= 18:  # NOTE ME: This threshold may need to be tuned!!!
                r[y] = TextItem(text=text_item_2.text,
                                x1=text_item_2.x1,
                                y1=text_item_1.y1,
                                x2=text_item_2.x2,
                                y2=text_item_1.y1 + text_item_2.height,
                                width=text_item_2.width,
                                height=text_item_2.height)
    # Reading order: top-to-bottom, then left-to-right.
    r.sort(key=lambda t: (t.y1, t.x1, t.x2, t.y2))
    return r
from urllib.request import urlopen

# Fetch the PDF straight from the web instead of a local file
# (local equivalent: fp = open('./sample/sampl1.pdf', 'rb')).
fp = urlopen("https://www.tencent.com/zh-cn/articles/8003261479985013.pdf")

# Legacy pdfminer wiring: parser and document reference each other,
# and the (empty) password is supplied via initialize().
parser = PDFParser(fp)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize("")

resource_manager = PDFResourceManager()
layout_params = LAParams()
aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

# Walk every page and print whatever layout objects carry text.
for pdf_page in doc.get_pages():
    page_interpreter.process_page(pdf_page)
    page_layout = aggregator.get_result()
    for element in page_layout:
        if hasattr(element, "get_text"):
            print(element.get_text())
def get_pdf_file_content(path_to_pdf):
    """Return the entire text content of the PDF at *path_to_pdf* as a string.

    :param path_to_pdf: path of the PDF file to extract text from.
    :return: the extracted text of all pages, concatenated.
    """
    # Resource manager stores shared resources (fonts, images, ...).
    resource_manager = PDFResourceManager(caching=True)
    # String buffer that will receive the extracted text.
    out_text = StringIO()
    # Layout parameters with their default values.
    laParams = LAParams()
    # TextConverter renders each page's text into out_text.
    text_converter = TextConverter(resource_manager, out_text,
                                   laparams=laParams)
    try:
        # BUG FIX: the input file used to be leaked if an exception occurred
        # mid-extraction; "with"/finally now guarantee cleanup.
        with open(path_to_pdf, 'rb') as fp:
            interpreter = PDFPageInterpreter(resource_manager, text_converter)
            for page in PDFPage.get_pages(fp,
                                          pagenos=set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            # Grab the buffer contents before the StringIO is closed below.
            text = out_text.getvalue()
    finally:
        text_converter.close()
        out_text.close()
    return text
parser = PDFParser(pd_file) # print(parser) # pdf文档对象 document = PDFDocument(parser) parser.set_document(document) document.set_parser(parser) # 初始化文档密码 document.initialize() if document.is_extractable: print(True) else: raise PDFTextExtractionNotAllowed # 存储文档资源 src = PDFResourceManager() # 设备对象 device = PDFPageAggregator(src, laparams=LAParams()) # 解释器对象 inter = PDFPageInterpreter(src, device) pages = document.get_pages() for page in pages: # print(page.contents) inter.process_page(page) layout = device.get_result() for x in layout:
def pdf_translate(pdf_path, fgmt, make_marian_conf=None, logger=None):
    """Extract the text of *pdf_path* page by page and translate each page
    with *fgmt*, restarting the marian worker processes on translation errors.

    :param pdf_path: PDF file to translate.
    :param fgmt: translator object exposing translate_text(),
        detected_marian_err and get_and_clear_logs().
    :param make_marian_conf: optional dict of marian process settings.
    :param logger: optional logger for progress/recovery messages.
    :return: list of translated page strings; pages that still fail after
        all retries get the get_err_translated() placeholder.
    """
    page_split_tag = '\n\n<<PAGE_SPLIT_TAG>>\n\n'
    output_string = StringIO()
    with open(pdf_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr,
                               output_string,
                               laparams=LAParams(boxes_flow=0.3,
                                                 line_margin=1.0))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Mark the page boundary so the text can be split back into pages.
            output_string.write(page_split_tag)
        device.close()  # BUG FIX: the TextConverter device was never closed
    pdf_text = output_string.getvalue()
    pdf_pages = pdf_text.split(page_split_tag)
    marian_processes = []
    if make_marian_conf:
        marian_processes = make_marian_process(
            make_marian_conf["marian_command"],
            make_marian_conf["marian_args_pdf_translator"],
            make_marian_conf["pdf_ports"])
    ret = []
    # The element after the final split tag is empty -> skip it ([:-1]).
    for pdf_idx, pdf_page in enumerate(pdf_pages[:-1]):
        retry_max = 3
        translated = None
        for i in range(retry_max):
            if logger:
                logger.info("translate page={}".format(pdf_idx))
            to_translate = pre_proc_text(pdf_page)
            translated = fgmt.translate_text(to_translate)
            if not fgmt.detected_marian_err:
                ret.append(translated)
                break
            # Translation failed: recycle the marian processes and retry.
            translated = None
            close_marian_process(marian_processes)
            marian_processes = make_marian_process(
                make_marian_conf["marian_command"],
                make_marian_conf["marian_args_pdf_translator"],
                make_marian_conf["pdf_ports"])
            fgmt.detected_marian_err = False
            if logger:
                logger.info(fgmt.get_and_clear_logs())
                logger.warning("recovery marian processes {}/{}".format(
                    i, retry_max - 1))
        if translated is None:
            ret.append(get_err_translated())
        # NOTE(review): this assumes make_marian_conf is not None here; it
        # would raise TypeError otherwise -- confirm callers always pass it.
        marian_processes = ckeck_restart_marian_process(
            marian_processes,
            make_marian_conf["max_marian_memory"],
            make_marian_conf["marian_command"],
            make_marian_conf["marian_args_pdf_translator"],
            make_marian_conf["pdf_ports"],
            logger=logger)
        if logger:
            logger.info(fgmt.get_and_clear_logs())
    if make_marian_conf:
        close_marian_process(marian_processes)
    return ret
def get_context_data(self, **kwargs):
    """Build template context from the most recently uploaded Document.

    Extracts the PDF's text with pdfminer, scrapes patient fields
    (name, dob, vitals, medications, ...) out of it with regex
    heuristics, stores them on a DocumentDetail row and returns them
    as the template context.

    NOTE(review): the regexes are tuned to one specific report layout;
    several lookups index into match lists without bounds checks and
    will raise IndexError on other layouts (flagged inline below).
    """
    context = {}
    # Most recently uploaded document (assumes at least one exists).
    document_obj = Document.objects.filter().order_by('-uploaded_at')[0]
    context['document_name'] = document_obj.name
    path = document_obj.document.path
    context['path'] = path
    doc_detail = DocumentDetail(document=document_obj)
    if path.endswith('.pdf'):
        # --- pdfminer text extraction ---
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=laparams)
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
        fp.close()
        device.close()
        retstr.close()
        # # with open("testtttttttttt.txt", "w") as text_file:
        # text_file.write(text)
        text = text.lower()
        # --- regex field scraping (text is already lower-cased; the
        # extra .lower() calls below are redundant but harmless) ---
        name = re.findall(r"patient's name:|patient:(.*)", text.lower())
        if not name:
            name = re.findall(r"re:(.*)", text.lower())
        address = self.__remove_null_from_list(
            re.findall(r"address:(.*)", text.lower()))
        dob = self.__remove_null_from_list(
            re.findall(r"dob:(.*),", text.lower()))
        if not dob:
            dob = re.findall(r"date of birth:\s*(.*)", text.lower())
        sex = re.findall(r"sex:(.*)", text.lower())
        injury = re.findall(r"consultation:(.+?)\.", text, re.DOTALL)
        if not injury:
            injury = self.__remove_null_from_list(
                re.findall(r"injury:(.*)", text.lower()))
        date_of_surgery = re.findall(r"date of surgery:(.*)", text.lower())
        claim_no = re.findall(r"(claim #|claim#:)(.*)", text.lower())
        allergies = self.__remove_null_from_list(
            re.findall(r"allergies:(.*)", text.lower()))
        social_history = re.findall(
            r"(social history|family history|social history :)(.*?)(?:(?:\r*\n){2})",
            text.lower(), re.DOTALL)
        if 'no past medical history on file' in text.lower():
            medical_history = ['no past medical history on file']
        else:
            medical_history = re.findall(r"medical history:(.+?)\.",
                                         text.lower(), re.DOTALL)
        impression_list = re.findall(r"(impression.*?)(?:(?:\r*\n){2})",
                                     text.lower(), re.DOTALL)
        impression = [
            i.replace('impression: ', '') for i in impression_list
        ]
        impression = list(filter(None, impression))
        doctor = re.findall(r"(doctor.|physician:)(.*)", text.lower())
        # Medication table: either a MAR-style table or a
        # "current medications" block.
        medicines = re.findall(r'mar action action date dose(.+?)iven',
                               text.lower(), re.DOTALL)
        if medicines:
            medicine_list = list(filter(None, medicines[0].split('\n')))
            medicines_list = ''.join(medicine_list[3:])
        if not medicines:
            # NOTE(review): medicines[0] raises IndexError when this
            # fallback pattern also finds nothing.
            medicines = re.findall(
                r'current medications :(.+?)groves, steven j',
                text.lower(), re.DOTALL)
            medicines_list = list(filter(None, medicines[0].split('\n')))
            del medicines_list[5:11]
            medicines_list = ''.join(medicines_list[1:])
        # NOTE(review): [1] assumes at least two "general:" matches.
        general = re.findall(r'general:(.*)', text.lower())[1]
        vital_signs = re.findall(
            r'last filed vital signs(.+?)vital\s*s~ig', text, re.DOTALL)
        if not vital_signs:
            # Fallback layout: "vital signs: ... bilateral" block, read
            # by fixed positions from the end of the block.
            vital_dict = {}
            vital_sign_string = ''
            vital_signss = re.findall(r'vital signs:(.+?)bilateral',
                                      text.lower(), re.DOTALL)
            vital_signss = list(filter(None, vital_signss[0].split('\n')))
            vital_dict['blood_pressure'] = vital_signss[-8].strip()
            vital_dict['pulse'] = vital_signss[-7].strip()
            vital_dict['temperature'] = vital_signss[-6].strip()
            vital_dict['respiratory_rate'] = vital_signss[-5].strip()
            vital_dict['Body Mass Index'] = vital_signss[-1].strip()
            vital_dict['Sp02'] = vital_signss[-2].strip()
            vital_dict['Weight'] = vital_signss[-3].strip()
            vital_dict['Height'] = vital_signss[-4].strip()
            for key, value in vital_dict.items():
                vital_sign_string += key
                vital_sign_string += " : " + value + ', '
        if vital_signs:
            # "last filed vital signs" layout, read by fixed line index.
            vital_sign_string = ''
            vital_dict = {}
            vital_signs = vital_signs[0].split('\n')
            vital_dict['blood_pressure'] = vital_signs[4].strip().strip(
                '·').strip()
            vital_dict['pulse'] = vital_signs[5].strip()
            vital_dict['temperature'] = vital_signs[6].strip()
            vital_dict['respiratory_rate'] = vital_signs[7].strip()
            vital_dict['oxygen_sat'] = vital_signs[8].strip()
            for key, value in vital_dict.items():
                vital_sign_string += key
                vital_sign_string += " : " + value + ', '
        # --- copy scraped fields into the context and the detail row ---
        context['name'] = ''
        context['address'] = ''
        context['dob'] = ''
        context['sex'] = ''
        context['date_of_surgery'] = ''
        context['doctor'] = ''
        if name:
            context['name'] = name[-1].replace('(cid:9)', '').strip()
            doc_detail.patient_name = context['name']
        if address:
            context['address'] = address[-1].replace('(cid:9)', '').strip()
            doc_detail.address = context['address']
        if dob:
            context['dob'] = dob[-1].replace('(cid:9)', '').strip()
            doc_detail.dob = context['dob']
        if sex:
            context['sex'] = sex[-1].replace('(cid:9)', '').strip()
            doc_detail.sex = context['sex']
        if date_of_surgery:
            context['date_of_surgery'] = date_of_surgery[-1].replace(
                '(cid:9)', '').strip()
            doc_detail.date_of_surgery = context['date_of_surgery']
        if injury:
            context['injury'] = injury[0]
            doc_detail.injury = context['injury']
        if claim_no:
            context['claim_no'] = claim_no[0][1]
            doc_detail.claim_no = context['claim_no']
        if allergies:
            context['allergies'] = allergies[0]
            doc_detail.allergies = context['allergies']
        if social_history:
            context['social_history'] = social_history[0][1]
            doc_detail.social_history = context['social_history']
        if medical_history:
            context['medical_history'] = medical_history[0]
            doc_detail.medical_history = context['medical_history']
        if impression:
            context['impression'] = impression[0]
            doc_detail.impression = context['impression']
        if doctor:
            context['doctor'] = doctor[0][1]
            doc_detail.doctor = context['doctor']
        if vital_sign_string:
            context['vital_signs'] = vital_sign_string.strip().strip(',')
            doc_detail.vital_signs = context['vital_signs']
        if medicines_list:
            context['medicines'] = medicines_list
            doc_detail.medicines = context['medicines']
        if general:
            context['general'] = general
            doc_detail.general = context['general']
        doc_detail.save()
    return context
# encoding=utf8
# Script: extract horizontal text boxes from a PDF and strip whitespace
# from each box's text.
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser      # FIX: was used but never imported
from pdfminer.pdfdocument import PDFDocument  # FIX: was used but never imported
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator  # FIX: was used but never imported
from pdfminer.layout import LAParams
from pdfminer.layout import LTTextBoxHorizontal
import re
import sys

fp = open(u'./jlpea-04-00214.pdf', 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
rsrcmgr = PDFResourceManager(caching=False)
laparams = LAParams()
# Aggregator collects the layout objects of each processed page.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# FIX: was re.compile(r's+'), which stripped runs of the letter "s";
# the intent is to strip whitespace, i.e. r'\s+'.
replace = re.compile(r'\s+')
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    layout = device.get_result()
    for x in layout:
        if (isinstance(x, LTTextBoxHorizontal)):
            # Remove all whitespace from the text box content.
            text = re.sub(replace, '', x.get_text())
def parse_loske_pdf(pdf, is_ipp=True):
    """Parse a Loske canteen menu PDF (Python 2 code) and fill
    config["meals"] with (meal_type, description) tuples keyed by date.

    :param pdf: raw PDF bytes
    :param is_ipp: True => tag meals TYPE_IPP, otherwise TYPE_FMI
    """
    # Regexes for cleaning pdfminer output: (cid:..) artifacts, weekday
    # headers used as line breaks, and the trailing legend.
    stripcid_re = re.compile(u"\(cid:.*?\)", re.UNICODE)
    newline_heuristic_re = re.compile(u"Montag, den |Dienstag, den |Mittwoch" \
            u", den |Donnerstag, den |Freitag, den ", re.IGNORECASE | re.UNICODE)
    bnw_endheuristic_re = re.compile(u"B\.n\.W\.=Beilage.*", re.UNICODE)
    dow_beginheuristic_re = re.compile(u".*?Montag, den ",
                                       re.IGNORECASE | re.UNICODE)
    # Groups: (1) meal number "N.", (2) description, (3).(4) euro price.
    meal_detect_re = re.compile(u"(\d\.)(.*?)(\d).(\d\d)", re.UNICODE)
    #meal_detect_re = re.compile(u"(\d\.)(\D)", re.UNICODE)
    date_re = re.compile(u"(\d{1,2})\.(\d{1,2})\.(\d{1,4})(.*)", re.UNICODE)
    meal_props = re.compile(ur'\b[VKRS](?:\+[VKRS])*\b\s*', re.UNICODE)
    meal_numbers = re.compile(ur'([^/]|^)\s*\b[1-6](?:,[1-6])*\b([^/]|$)',
                              re.UNICODE)
    # --- pdfminer (legacy API) text extraction from the in-memory PDF ---
    rsrcmgr = PDFResourceManager()
    outtxt = cStringIO.StringIO()
    device = TextConverter(rsrcmgr, outtxt)
    pdfp = PDFParser(cStringIO.StringIO(pdf))
    doc = PDFDocument()
    pdfp.set_document(doc)
    doc.set_parser(pdfp)
    doc.initialize("")
    if not doc.is_extractable:
        print >> sys.stderr, u"PDF Document not extractable"
        sys.exit(1)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for (pageno, page) in enumerate(doc.get_pages()):
        #print pageno
        interpreter.process_page(page)
    device.close()
    # Clean up the extracted text and split it into per-day lines.
    fulltext = outtxt.getvalue().decode('utf-8', 'replace')
    fulltext = stripcid_re.sub(u'', fulltext)
    fulltext = dow_beginheuristic_re.sub(u'', fulltext)
    fulltext = bnw_endheuristic_re.sub(u'', fulltext)
    fulltext = newline_heuristic_re.sub(u'\n', fulltext)
    lines = fulltext.split(u'\n')
    now = datetime.date(1, 1, 1)
    for line in lines:
        ret = date_re.search(line)
        if ret:
            day, month, year, meals = ret.groups()
            try:
                now = datetime.date(int(year), int(month), int(day))
            except ValueError:
                # some weird date in pdf (like 29.02.2013), skipping these
                # entries is the easiest solution
                continue
            #meals = meal_detect_re.sub(ur'\n\2(\3.\4 €)', meals).strip()
            meals = meal_detect_re.finditer(meals)
            for meal_match in meals:
                # Strip property letters (V/K/R/S), side-dish numbers and
                # asterisks, then re-append the price.
                m = meal_match.group(2)
                m = meal_props.sub(u'', m)
                m = meal_numbers.sub(lambda x: x.group(1) + x.group(2), m)
                m = m.replace(u'*', u'')
                m = m.split()
                m.append(u'({0}.{1} €)'.format(meal_match.group(3),
                                               meal_match.group(4)))
                m = u' '.join(m)
                meal_type = TYPE_IPP if is_ipp else TYPE_FMI
                try:
                    tmp = config["meals"][now]
                    config["meals"][now].append((meal_type, m))
                except KeyError, e:
                    config["meals"][now] = [(meal_type, m)]
def Converting_Function(Path_To_TXTs, new_file):
    """
    Convert every PDF/XML file in a directory to a plain .txt file.

    PDFs are first extracted with a pdfminer-based detailed aggregator;
    when the layout looks multi-column (short average line length) or
    extraction fails, the code falls back to Tika (`parser.from_file`).
    Files producing fewer than 1500 characters are skipped.

    :param Path_To_TXTs: path to PDFs or/and XML files
    :param new_file: the path to save the TXT format

    NOTE(review): indentation was reconstructed from collapsed source —
    the try/except/elif nesting below is the most plausible reading;
    verify against the original before relying on the fallback order.
    """
    files_short = np.array([
        f for f in os.listdir(Path_To_TXTs)
        if os.path.isfile(os.path.join(Path_To_TXTs, f))
    ])
    files = np.array([Path_To_TXTs + '/' + f for f in files_short])
    for file in files:
        if file.endswith('.pdf'):
            Not_Good = False
            Prob = False
            try:
                # pdfminer pass: collect (page, x, ..., text) rows.
                fp = open(file, 'rb')
                parser_pdf = PDFParser(fp)
                doc = PDFDocument(parser_pdf)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    device.get_result()
                rows = device.rows
                lines = [item[5] for item in rows]
                if average_len(lines) >= 20:
                    try:
                        # First page via plain conversion, remaining pages
                        # reassembled column-by-column using the two most
                        # common x-offsets (left/right column).
                        text_all = convert_pdf_to_txt(file, pages=[0])
                        rows_pages = [item for item in rows if item[0] != 0]
                        words = [item[1] for item in rows_pages]
                        words_1 = [item for item in words if item <= 200]
                        words_2 = [item for item in words if item > 200]
                        first = most_common(words_1)
                        second = most_common(words_2)
                        pages = [item[0] for item in rows_pages]
                        pages = list(set(pages))
                        pages.sort()
                        for page in pages:
                            page_lines = [
                                line for line in rows_pages
                                if line[0] == page
                            ]
                            text1 = ''
                            text2 = ''
                            text_middle = ''
                            for item in page_lines:
                                # Drop bare page numbers (pure digits not
                                # ending in '.').
                                if item[1] <= (first + 20) and not (
                                        item[5].isdigit()
                                        and not item[5].endswith('.')):
                                    text1 = text1 + '\n' + item[5]
                                elif item[1] >= (
                                        second - 20) and item[1] <= 500 and not (
                                            item[5].isdigit()
                                            and not item[5].endswith('.')):
                                    text2 = text2 + '\n' + item[5]
                                else:
                                    if not (item[5].isdigit()
                                            and not item[5].endswith('.')):
                                        text_middle = text_middle + '\n' + item[
                                            5]
                            if len(text1 + text2) > len(text_middle):
                                text_all = text_all + text1 + text_middle + text2
                            else:
                                # Column heuristic failed for this page.
                                Not_Good = True
                        if len(text_all) >= 1500 and Not_Good == False:
                            text_all = text_all.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                                f.close()
                            #print('Article ', name, ' is successfully converted')
                        elif len(text_all) >= 1500 and Not_Good == True:
                            # Tika fallback, single-line normalised text.
                            rawText = parser.from_file(file)
                            text = rawText['content']
                            text = os.linesep.join(
                                [s for s in text.splitlines() if s])
                            text_all = text.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            text_all = " ".join(text_all.split())
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                                f.close()
                            #print('Article ', name, ' is successfully converted')
                        else:
                            # Too little text: Tika fallback.
                            raw = parser.from_file(file)
                            text_all = raw['content']
                            text_all = "\n".join([
                                ll.rstrip() for ll in text_all.splitlines()
                                if ll.strip()
                            ])
                            if len(text_all) >= 1500:
                                text_all = text_all.replace(
                                    ' ac.', '~').replace(' a.c.',
                                                         '~').replace(' a.c', '~')
                                name = file.split('/')[-1][:-4]
                                path = new_file + '/' + name + '.txt'
                                with open(path, 'w', encoding='utf8') as f:
                                    f.write(text_all)
                                    f.close()
                                #print('Article ', name, ' is successfully converted')
                            else:
                                pass
                                #print('The PDF "' + file + '" contain less than 1500 characters !!!')
                    except:
                        Prob = True
                elif average_len(lines) < 20 or Prob == True:
                    # Short lines => layout unsuitable: Tika fallback.
                    raw = parser.from_file(file)
                    text_all = raw['content']
                    text_all = "\n".join([
                        ll.rstrip() for ll in text_all.splitlines()
                        if ll.strip()
                    ])
                    if len(text_all) >= 1500:
                        text_all = text_all.replace(' ac.', '~').replace(
                            ' a.c.', '~').replace(' a.c', '~')
                        name = file.split('/')[-1][:-4]
                        path = new_file + '/' + name + '.txt'
                        with open(path, 'w', encoding='utf8') as f:
                            f.write(text_all)
                            f.close()
                        #print('Article ', name, ' is successfully converted')
                    else:
                        pass
                        #print('The PDF "' + file + '" contain less than 1500 characters !!!')
            except:
                Prob = True
            if Prob == True:
                # Any failure above: last-resort Tika extraction.
                raw = parser.from_file(file)
                text_all = raw['content']
                text_all = "\n".join([
                    ll.rstrip() for ll in text_all.splitlines()
                    if ll.strip()
                ])
                if len(text_all) >= 1500:
                    text_all = text_all.replace(' ac.', '~').replace(
                        ' a.c.', '~').replace(' a.c', '~')
                    name = file.split('/')[-1][:-4]
                    path = new_file + '/' + name + '.txt'
                    with open(path, 'w', encoding='utf8') as f:
                        f.write(text_all)
                        f.close()
                    #print('Article ', name, ' is successfully converted')
                else:
                    pass
                    #print('The PDF "' + file + '" contain less than 1500 characters !!!')
        elif file.endswith('.xml'):
            # XML input: strip everything after the competing-interest
            # statement and save directly.
            text_all = get_text_from_XML_without_saving(file)
            text_all = text_all.split('competing financial interest')[0]
            text_all = text_all.replace(' ac.', '~').replace(' a.c.',
                                                             '~').replace(' a.c', '~')
            name = file.split('/')[-1][:-4]
            path = new_file + '/' + name + '.txt'
            with open(path, 'w', encoding='utf8') as f:
                f.write(text_all)
                f.close()
def get_text(url, parse=True, laparams=laparams):
    """Download a preprint PDF and reflow its text into paragraphs.

    Fetches ``url + 'v1.full.pdf'`` (with exponential backoff on HTTP
    429), converts it to pdfminer XML, then rebuilds paragraph text
    using font-change and layout heuristics.

    :param url: base article URL (biorxiv-style; 'v1.full.pdf' appended)
    :param parse: False => return the raw pdfminer XML instead of text
    :param laparams: pdfminer LAParams (defaults to a module-level value)
    :return: (suffix, payload) tuple — ('.txt', text) on success,
        ('.xml', xml) when parse is False, ('.raw.txt', bytes) when PDF
        conversion fails, ('.raw.xml', xml) when text reflow fails.

    NOTE(review): indentation restored from collapsed source; the
    nesting inside the margin/font branches is the most plausible
    reading and should be verified.
    """
    url += 'v1.full.pdf'
    max_attempts = 4
    attempts = 0
    print(url)
    while attempts < max_attempts:
        r = requests.get(url)
        if r.status_code != 429:
            break
        # If rate limited, wait and try again (in seconds)
        time.sleep((2**attempts) + random.random())
        attempts = attempts + 1
    data = r.content
    try:
        # --- PDF -> pdfminer XML ---
        f = io.BytesIO(data)
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        codec = 'utf-8'
        device = XMLConverter(rsrcmgr, retstr, codec=codec,
                              laparams=laparams)  # , rect_colors=rect_colors)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0  # is for all
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(f,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        device.close()
        pdf_data = retstr.getvalue()
        retstr.close()
    except:
        return ('.raw.txt', data)
    try:
        if parse == False:
            return ('.xml', pdf_data)
        else:
            # xmltest = convert_pdf_to_xml(pdf_data)
            # --- walk the XML char by char, inserting <<NEWLINE>> and
            # <<NEWFONT>> markers on line breaks / font changes ---
            root = ET.fromstring(pdf_data)
            temp = root.find('.//text')
            curr_font = temp.get('font')
            curr_size = float(temp.get('size'))
            text = ''
            rmargin = 70
            i = 0
            newline_pos = []
            for l in root.iterfind('.//textline'):
                for t in l.findall('./text'):
                    if (t.get('font') or t.get('size')) is None:
                        # Character without font info == line separator.
                        if t.text[0] == ' ':
                            text += ' '
                        else:
                            text += '<<NEWLINE>>'
                            newline_pos.append([])
                    else:
                        x0, y0, x1, y1 = [
                            float(z) for z in t.get('bbox').split(',')
                        ]
                        char_size = float(t.get('size', 0))
                        char_font = t.get('font', '')
                        # Skip headers/footers by vertical position.
                        if y0 > 750 or y0 < 75:
                            continue
                        if x0 < rmargin:
                            # Left-margin text: keep only if alphabetic,
                            # and widen the margin accordingly.
                            if re.search('[A-Za-z]+', t.text) is not None:
                                print('changing rmargin to ', str(x0 - 1))
                                rmargin = x0 - 1
                                text += t.text
                            continue
                        else:
                            if (char_size != curr_size) or (char_font !=
                                                            curr_font):
                                # Tiny fonts (sub/superscripts) dropped.
                                if (char_size) <= 8.:
                                    continue
                                text += '<<NEWFONT>>' + t.text
                                curr_font = t.get('font')
                                curr_size = float(t.get('size'))
                            else:
                                text += t.text
            # --- join lines into a doc, tracking unbalanced parens so
            # wrapped citations are not split into paragraphs ---
            lines = text.split('<<NEWLINE>>')
            [print(l) for l in lines[:min(len(lines), 5)]]
            doc = lines[0]
            open_parens = False
            parens = []
            if len(re.findall(r'\(', doc)) > len(re.findall(r'\)', doc)):
                parens.append(True)
            else:
                parens.append(False)
            for i, t in enumerate(lines):
                if (i == 0):
                    if re.search(r'^\s*[a-z(]', lines[1]) is None:
                        doc += '\n'
                    continue
                if len(t) < 1:
                    if open_parens == False:
                        doc += '\n'
                    else:
                        continue
                    continue
                else:
                    o = len(re.findall(r'\(', t))
                    if open_parens == True:
                        o += 1
                    c = len(re.findall(r'\)', t))
                    if o > c:
                        open_parens = True
                    else:
                        open_parens = False
                if open_parens == False:
                    # Promote indents / font changes after a sentence end
                    # to paragraph breaks.
                    if t.startswith(' '):
                        t = re.sub(r'^ +', '<<PARAGRAPH>>', t)
                    if t.lstrip(' ').startswith('<<NEWFONT>>') and lines[
                            i - 1].rstrip(' ').endswith('.'):
                        t = re.sub(r'^<<NEWFONT>>', '<<PARAGRAPH>>',
                                   t.lstrip(' '))
                    if t.rstrip(' ').endswith('.'):
                        t += '<<PARAGRAPH>>'
                    if re.match(r'^\d{1,3}\.<<NEWFONT>>', t):
                        t = '<<PARAGRAPH>>' + t
                doc += t
                parens.append(open_parens)
            doc = re.sub(r'(?<=[^.])\n+', '', doc)
            doc = re.sub(r' {3,}', '<<PARAGRAPH>>', doc)
            print(doc[:50])
            # --- clean each paragraph: collapse markers, repair URLs,
            # hyphenation and parenthesised spans ---
            parsed = []
            for _text in doc.split(r'<<PARAGRAPH>>'):
                _text = re.sub('(<<NEWLINE>>)+', '\n', _text)
                _text = re.sub(r' ', r'\n', _text)
                _text = re.sub(
                    r'<<NEWFONT>>(?P<url>http[a-zA-Z0-9./+?_=:-]+)( <<NEWFONT>>)?',
                    r'\g<url>', _text)
                _text = re.sub(r'<<NEWFONT>> <<NEWFONT>>', r' ', _text)
                _text = re.sub(r'\(<<NEWFONT>>(.+)<<NEWFONT>>\)', r'(\g<1>)',
                               _text, re.M)
                pattern = re.compile(
                    r'<<NEWFONT>>(((\W|\d)+)|([A-Za-z_-]{1,2}\n?))<<NEWFONT>>')
                _text = pattern.sub(r'\g<1>', _text)
                pat2 = re.compile(
                    r'<<NEWFONT>>([A-Za-z- :]+)<<NEWFONT>>([.:]?)')
                _text = pat2.sub(r'\g<1>\g<2>\n', _text)
                pat3 = re.compile(
                    r'<<NEWFONT>>([A-Za-z_-]{1,3} *\n?)<<NEWFONT>>')
                _text = pat3.sub(r'\g<1>', _text)
                _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>([a-z]+)',
                               r'\g<1> \g<2>', _text)
                _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>(\W*)\.?',
                               r'\g<1> \g<2>', _text)
                _text = re.sub(r'-\n', r'-', _text)
                _text = re.sub(r'\((.+)(?:\n)(.+)\)', r'(\g<1>\g<2>))', _text)
                _text = re.sub(r'\((.+)<<NEWFONT>>(.+)\)', r'(\g<1>\g<2>)',
                               _text, re.M)
                if len(_text.strip(' \n')) > 0:
                    if len(re.findall(r'<<NEWFONT>>', _text)) == 1:
                        _text = re.sub(r'<<NEWFONT>>', '\n', _text)
                    parsed.append(_text)
            # Merge paragraphs starting lowercase into their predecessor.
            parsed2 = [
                parsed[0],
            ]
            for i, p in enumerate(parsed):
                if i > 0:
                    if re.search(r'^\s*\n*[a-z]', p) is not None:
                        parsed2[i - 1] += p
                        p = ''
                    parsed2.append(p)
            parsed2 = '\n===================================\n'.join(
                [p for p in parsed2 if p != ''])
            print(parsed2[:50])
            return ('.txt', parsed2)
    except:
        return ('.raw.xml', pdf_data)
def read_pdf(filename):
    """Scan a requirements PDF for <KPOC-REQ...> tags.

    Walks every character of the document, accumulating text into
    ``current_section`` until the font size or font name changes (which
    is treated as a section boundary), then interprets sections that
    start with "<KPOC-REQ"/"KPOC-REQ": index entries (containing dotted
    leaders) create new result records; body entries fill the matching
    record's Content.

    :param filename: path to the PDF file
    :return: list of dicts with keys "Section", "Title", "ReqId",
        "Content"
    """
    results = []
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    print(u'定位并解析数据')
    content = ""
    # Dump the document outline, if any (debug aid).
    try:
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            print(level, title)
    except PDFNoOutlines:
        print(u'没有大纲')
    # Resource manager for shared PDF resources.
    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters.
    laparams = LAParams()
    # Aggregator device collecting layout objects per page.
    # device=PDFDevice(rsrcmgr)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    current_section = ""  # use for save tag
    last_section = ""  # use for save title
    current_char_size = 0
    current_fontname = ""
    debug_line = False
    debug_section = False
    debug_title = True
    # Process each page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object for this page.
        layout = device.get_result()
        for listItem in layout:
            if (isinstance(listItem, LTTextBox)):
                # print("LTTextBox: %s" % listItem.get_text().encode('utf-8')+'\n')
                for textLine in listItem:
                    if (isinstance(textLine, LTTextLine)):
                        if debug_line:
                            if (len(textLine.get_text().strip())):
                                print("LTTextLine: %s" %
                                      textLine.get_text().encode('utf-8') +
                                      '\n')
                        for char in textLine:
                            if (isinstance(char, LTChar)):
                                #if(len(char.get_text().strip()) > 0):
                                # print(" LTChar: %s size:%s font: %s " % (char,char.size,char.fontname))
                                # new section ?
                                if (abs(char.size - current_char_size) >
                                        0.00001 and
                                        len(current_section.strip()) > 0
                                        or current_fontname != char.fontname):
                                    #print(" size chang from : %s to: %s %s" % (char.size, current_char_size,char))
                                    try:
                                        if debug_section:
                                            print(u"current_section : %s" %
                                                  (current_section))
                                            print(u"last_section : %s" %
                                                  (last_section))
                                    except UnicodeEncodeError:
                                        clr.print_red_text(
                                            "UnicodeEncodeError")
                                    # print("section: %s : %f - %f" % (current_section,current_char_size,char.size))
                                    # print(" %s - %s " % (type(current_char_size),type(char.size)))
                                    current_section = current_section.strip()
                                    if current_section.startswith(
                                            "<KPOC-REQ"
                                    ) or current_section.startswith(
                                            "KPOC-REQ"):
                                        # this is req tag
                                        if current_section.startswith(
                                                "KPOC-REQ"):
                                            # Re-add the '<' lost by strip.
                                            current_section = "<" + current_section
                                        if current_section.count(
                                                ".......") > 0:
                                            # index , new result
                                            result = {
                                                "Section": "",
                                                'Title': None,
                                                'ReqId': None,
                                                'Content': "",
                                            }
                                            ReqId_end = current_section.find(
                                                '>')
                                            Title_end = current_section.find(
                                                '............')
                                            ReqId = current_section[
                                                0:ReqId_end + 1]
                                            Title = current_section[
                                                ReqId_end + 1:Title_end]
                                            Section = last_section
                                            result["Title"] = Title.strip()
                                            result["ReqId"] = ReqId.strip()
                                            result["Section"] = Section.strip()
                                            results.append(
                                                result)  # save the result
                                            if debug_title:
                                                print(u"Text: %s" %
                                                      (current_section))
                                                print(u"Section: %s" %
                                                      (Section))
                                                print(u"ReqId: %s" % (ReqId))
                                                print(u"Title: %s" % (Title))
                                                print(u"\n")
                                        else:
                                            # not index , old result
                                            ReqId_end = current_section.find(
                                                '>')
                                            ReqId = current_section[
                                                0:ReqId_end + 1]
                                            for result in results:
                                                if result["ReqId"] == ReqId:
                                                    Content_Begin = current_section.find(
                                                        '> ') + 2
                                                    result[
                                                        "Content"] = current_section[
                                                            Content_Begin:]
                                            #exit(0)
                                            result = None  # clean result
                                        # NOTE(review): placement of this
                                        # reset (covering both branches
                                        # above) reconstructed from
                                        # collapsed source — verify.
                                        current_section = ""  # remove the title from next content
                                        if debug_section:
                                            print(
                                                u"current_section clear for ReqId"
                                            )
                                    else:
                                        # endif with current_section.startswith
                                        # after all , when a new section is found , we should clean this
                                        if (len(current_section.strip())):
                                            last_section = current_section
                                            # section number maybe here
                                        current_section = ""
                                        if debug_section:
                                            print(
                                                u"current_section clear for Not Found ReqId"
                                            )
                                        #print(u"last_section : %s" % (last_section))
                                # same section
                                current_section = current_section + char.get_text(
                                )
                                current_char_size = char.size
                                current_fontname = char.fontname
    #save the last para
    if len(results) > 0:
        # skip the first one
        # NOTE(review): `content` is never appended to above, so this
        # always stores an empty string — looks like dead/unfinished code.
        content = content.strip()
        results[-1]["Content"] = content  # save the last one content
        content = ""  # empty the content
    fp.close()
    print("count %d" % len(results))
    return results
def main(argv):
    """pdf2txt-style CLI: convert PDFs to text/html/xml/tag output.

    :param argv: full argument vector (argv[0] is the program name)
    :return: 100 on usage error, None on success
    """
    import getopt

    def usage():
        # Print usage and return the CLI error status.
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    # NOTE(review): '-n' sets laparams to None, so any later -A/-V/-M/-W/
    # -L/-F flag would raise AttributeError — flag order matters here.
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            # Page numbers are 1-based on the CLI, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when -t was
    # not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    # Pick the converter device matching the requested output type.
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    # Process every input file through the shared device.
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
def convert_to_text(fname):
    """Extract text from a bank-statement PDF and dump it to txt/csv.

    :param fname: path to the input PDF

    Side effects: writes ``output_<name>.txt`` and ``output_<name>.csv``
    under the hard-coded PdfAnalyser output directories, then hands the
    reshaped text to ``convert_to_table()``.
    """
    # All pages (the original had a dead `pages = None` branch that
    # always produced an empty page-number set).
    pagenums = set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # Context manager guarantees the input file is closed even on error.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    # BUG FIX: was `output.close` (attribute access without a call), so
    # the buffer was never actually closed.
    output.close()
    text_list = text.split('\n')
    txt = text_list[:3]  # first three lines = statement header
    text = ' '.join(text_list[3:])
    print("###################")
    print(txt)
    ## spliting word from string
    word_list = text.split(' ')
    string_input = ""
    flag = 0
    for word in word_list:
        # Stop at the start of the transaction table ("tran...").
        if (word.lower() == 'tran'):
            break
        else:
            # Insert line breaks before known field labels so the text
            # becomes one field per line.
            if (word.lower() == 'customer' or word.lower() == 'scheme'
                    or word.lower() == 'currency' or word.lower() == 'for'):
                word = '\n' + word
            elif (word.lower() == 'statement'):
                word = '\n' + word
                flag = 1
            elif (word.lower() == 'account' and flag == 1):
                # 'account' only breaks a line after 'statement' was seen.
                word = '\n' + word
            string_input += word + " "
    print("::::::::::::::::::::::")
    file_name = fname.split('/')[-1]
    file_name = file_name.split('.')[0]
    # Write the whitespace-normalised body text.
    text = re.sub(r"\s\s+", " ", text)
    with open(
            "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/txt/output_"
            + file_name + ".txt", "w") as text_file:
        text_file.write("%s" % text)
    file_name_main = "output_" + file_name + ".csv"
    text = re.sub(r"\s\s+", " ", string_input)
    # NOTE(review): the normalised result above is assigned to `text`,
    # but `string_input` is written below — preserved as-is to keep the
    # existing output files identical; confirm which was intended.
    with open(
            "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/csv/"
            + file_name_main, "w") as csv_file:
        csv_file.write("%s" % string_input)
    convert_to_table(fname, string_input, txt)
def parse():
    """Count English word frequencies in a PDF and store them in MySQL.

    Reads the module-level ``path`` PDF with the legacy pdfminer API,
    tokenises its text, counts lowercase English words, then writes
    (word, count, probability-per-10000) rows into ``tablename`` via a
    module-level ``connect()``.

    NOTE(review): the SQL strings are built by concatenating
    ``tablename``; safe only because tablename is a trusted module
    constant — never pass user input through it.
    """
    # Open in binary read mode
    fb = open(path, 'rb')
    # PDF parser (legacy pdfminer API)
    parser = PDFParser(fb)
    # PDF document object
    doc = PDFDocument()
    # Link parser and document
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize with the document password
    # (no password => empty initialization)
    doc.initialize()
    obj = {}
    amount = 0
    # Abort if the document does not allow text extraction
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Resource manager for shared resources
        resource = PDFResourceManager()
        # Layout-analysis parameters
        laparam = LAParams()
        # Aggregator device
        device = PDFPageAggregator(resource, laparams=laparam)
        # PDF interpreter
        interpreter = PDFPageInterpreter(resource, device)
        # Iterate over pages; doc.get_pages() yields the page list
        for index, page in enumerate(doc.get_pages()):
            # if index < 3:
            # continue
            # if index == 4:
            # break
            # Render the page with the interpreter
            interpreter.process_page(page)
            # Fetch the aggregated layout
            layout = device.get_result()
            for out in layout:
                if hasattr(out, "get_text"):
                    # print(out.get_text())
                    # Strip unrecognised glyph codes like "(cid:12)"
                    t = re.sub(r'\(cid:[\d]*\)', '', out.get_text())
                    # Strip digits and contractions ('s 'm 're n't)
                    tx = re.sub(r'(\d+|\'s|\'m|\'re|n\'t)', '', t)
                    # Strip punctuation (ASCII + full-width) and collapse
                    # whitespace runs to a single space
                    txt = re.sub(
                        r'[\s+\?\.\!\/_,`:;\-$%^*\[\]\{\})(+\"\']+|[+——!,。?、‘’“”~@#¥%……&*():]+',
                        ' ', tx)
                    for word in txt.split():
                        # Skip non-English tokens
                        if not is_english(word):
                            continue
                        # Count the lowercase form
                        w = word.lower()
                        amount = amount + 1
                        if obj.__contains__(w):
                            obj[w] = obj[w] + 1
                        else:
                            obj[w] = 1
    db = connect()
    # Get a cursor for the session
    cursor = db.cursor()
    # Create the table if it does not exist yet
    cursor.execute('CREATE TABLE IF NOT EXISTS ' + tablename +
                   '(word varchar(255) NOT NULL, ' +
                   'count int NOT NULL, probability float NOT NULL, ' +
                   'PRIMARY KEY (word))')
    # Empty the table so earlier runs do not skew the counts
    cursor.execute('truncate table ' + tablename)
    for key in obj:
        # Upsert one (word, count, probability) row
        sql = 'REPLACE INTO ' + tablename + ' (word, count, probability) VALUES(%s, %s, %s)'
        cursor.execute(sql,
                       (key, obj[key], round(obj[key] / amount * 10000, 2)))
    # Commit the batch
    db.commit()
    # Close the connection
    db.close()
    print("总词数: %s" % amount)
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech) — a path string or a
        binary file-like object
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename, "close"):
        # Already a file-like object; use it as-is.
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    file_type: Optional[FileType] = None
    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise IncorrectPasswordError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        # Kfintech statements pack their lines tighter than CAMS ones,
        # so pick the layout line margin per detected source.
        margin_by_source = {FileType.KFINTECH: 0.1, FileType.CAMS: 0.2}
        line_margin = margin_by_source.get(detect_pdf_source(document), 0.2)

        resource_manager = PDFResourceManager()
        layout_params = LAParams(line_margin=line_margin, detect_vertical=True)
        aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
        interpreter = PDFPageInterpreter(resource_manager, aggregator)

        investor_info = None
        pages: List[Iterator[LTTextBoxHorizontal]] = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = aggregator.get_result()
            if file_type is None:
                # Watermark text identifying the statement source is laid
                # out vertically.
                for element in layout:
                    if not isinstance(element, LTTextBoxVertical):
                        continue
                    if re.search("CAMSCASWS", element.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", element.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            # Lazily yield only the horizontal text boxes of this page.
            pages.append(x for x in layout
                         if isinstance(x, LTTextBoxHorizontal))

    lines = group_similar_rows(pages)
    return PartialCASData(file_type=file_type,
                          investor_info=investor_info,
                          lines=lines)
def __init__(
        self,
        file,
        merge_tags=('LTChar', 'LTAnno'),
        round_floats=True,
        round_digits=3,
        input_text_formatter=None,
        normalize_spaces=True,
        resort=True,
        parse_tree_cacher=None,
        laparams=None,
):
    """Open *file* with pdfminer and prepare it for layout parsing.

    :param file: file object or filesystem path of the PDF.
    :param merge_tags: layout element tag names merged during tree building.
    :param round_floats: whether coordinate floats are rounded.
    :param round_digits: decimal digits kept when rounding floats.
    :param input_text_formatter: callable applied to extracted text; when
        given it overrides ``normalize_spaces``.
    :param normalize_spaces: collapse whitespace runs to single spaces.
    :param resort: whether layout elements are re-sorted.
    :param parse_tree_cacher: optional cache object for parsed trees.
    :param laparams: dict of ``LAParams`` keyword arguments, an ``LAParams``
        instance, or ``None`` (which means
        ``{'all_texts': True, 'detect_vertical': True}``).  A ``None``
        sentinel replaces the old mutable dict default, which would have
        been shared across every call.

    :raises TypeError: if *file* is neither a file object nor a path string.
    """
    # Resolve the laparams default per-call (never share a mutable default).
    if laparams is None:
        laparams = {'all_texts': True, 'detect_vertical': True}

    # store input
    self.merge_tags = merge_tags
    self.round_floats = round_floats
    self.round_digits = round_digits
    self.resort = resort

    # set up input text formatting function, if any
    if input_text_formatter:
        self.input_text_formatter = input_text_formatter
    elif normalize_spaces:
        r = re.compile(r'\s+')
        self.input_text_formatter = lambda s: re.sub(r, ' ', s)
    else:
        self.input_text_formatter = None

    # open doc
    if not hasattr(file, 'read'):
        try:
            file = open(file, 'rb')
        except TypeError:
            raise TypeError("File must be file object or filepath string.")
    parser = PDFParser(file)
    if hasattr(QPDFDocument, 'set_parser'):
        # pdfminer < 20131022: document and parser are wired up manually.
        doc = QPDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
    else:
        # pdfminer >= 20131022: the parser is passed to the constructor.
        doc = QPDFDocument(parser)
        parser.set_document(doc)
    if hasattr(doc, 'initialize'):
        # as of pdfminer==20140328, "PDFDocument.initialize() method is
        # removed and no longer needed."
        doc.initialize()
    self.doc = doc
    self.parser = parser
    self.tree = None
    self.pq = None
    self.file = file

    if parse_tree_cacher:
        self._parse_tree_cacher = parse_tree_cacher
        self._parse_tree_cacher.set_hash_key(self.file)
    else:
        # Null-object cache: same interface, stores nothing.
        self._parse_tree_cacher = DummyCache()

    # set up layout parsing
    rsrcmgr = PDFResourceManager()
    if isinstance(laparams, dict):
        # isinstance instead of type(...) == dict: accepts dict subclasses.
        laparams = LAParams(**laparams)
    self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

    # caches
    self._pages = []
    self._pages_iter = None
    self._elements = []
# Open a PDF file. fp = open(f'{file_path}/{input_filename}', 'rb') # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. document = PDFDocument(parser) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. resource_manager = PDFResourceManager() # Create a PDF device object. device = PDFDevice(resource_manager) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(resource_manager, device) # Process each page contained in the document. pdf_text = '' for page in PDFPage.get_pages(fp, set(), 0, document): # convert hex to data interpreter.process_page(page) page_data = page.contents[0].data if page_data:
def pdf_to_csv(filename):
    """Convert the PDF *filename* to semicolon-separated text, one output
    line per visual line of the page, and return the result as a string.

    Python 2 only: relies on ``cStringIO`` and byte-string ``encode``.
    Pages are delimited with ``START PAGE n`` / ``END PAGE n`` markers.
    """
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        # Groups LTChar objects by line and emits them as ";"-joined cells.
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            # Maps the negated integer y coordinate to {x: char}, so that
            # iterating sorted keys walks the page top-to-bottom.
            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec)  # <-- changed
            for y in sorted(lines.keys()):
                line = lines[y]
                # Characters within a line are ordered by their x coordinate.
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # because my test documents are utf-8 (note: utf-8 is the default codec)

    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    # doc.set_parser(parser)
    # doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    pagenos = set()
    rotation = 0
    i = 1
    for page in PDFPage.get_pages(fp, pagenos):
        page.rotate = (page.rotate + rotation) % 360
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)
        i += 1
    # for i, page in enumerate(doc.get_pages()):
    #     outfp.write("START PAGE %d\n" % i)
    #     if page is not None:
    #         interpreter.process_page(page)
    #     outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing
    instructions and tries to find rectangles and arrange them in rows, then
    arrange text in the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')
    # pdfminer >= 20131022 moved PDFDocument/PDFPage to new modules; probe
    # for the new layout and remember which API is in use.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        # Malformed PDF: yield nothing instead of raising.
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        # Debug mode: dump each processing stage as a PNG into a temp dir.
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random
        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        # Collect all text-bearing layout objects, flattened into one list
        # of multiline text items.
        texts = sum([
            list(lttext_to_multilines(obj, page_layout))
            for obj in page_layout._objs
            if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
        ], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Stage 1 image: text bounding boxes.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            # Without layout analysis pdfminer gives no reading order;
            # approximate one by sorting on (y, x).
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [
            lt_to_coords(obj, page_layout) for obj in page_layout._objs
            if isinstance(obj, (LTRect, LTLine, LTCurve))
        ]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Stage 2 image: raw drawing lines.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows),
                     sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Stage 3 image: detected row boxes.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings',
                     sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            # Stage 4 image: text assigned to cells.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    device.close()
def __init__(self, pdf_stream, password='', pagenos=None, maxpages=0):
    """Read *pdf_stream* and populate metadata, text and references.

    :param pdf_stream: binary file-like object holding the PDF.
    :param password: optional PDF password.
    :param pagenos: iterable of zero-based page numbers to process; the
        ``None`` default (processing all pages) replaces the old mutable
        ``[]`` default argument.
    :param maxpages: stop after this many pages (0 = no limit).
    """
    ReaderBackend.__init__(self)
    self.pdf_stream = pdf_stream
    # Resolve the default per-call; never share a mutable default list.
    if pagenos is None:
        pagenos = []

    # Extract Metadata
    parser = PDFParser(pdf_stream)
    doc = PDFDocument(parser, password=password, caching=True)
    if doc.info:
        # doc.info is a list of info dicts; only the first is used.
        for k in doc.info[0]:
            v = doc.info[0][k]
            if isinstance(v, (bytes, str, unicode)):
                self.metadata[k] = make_compat_str(v)
            elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                self.metadata[k] = make_compat_str(v.name)

    # Secret Metadata: merge raw XMP metadata from the catalog, if present.
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        self.metadata.update(xmp_to_dict(metadata))

    # Extract Content
    text_io = BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    converter = TextConverter(rsrcmgr, text_io, codec="utf-8",
                              laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, converter)

    self.metadata["Pages"] = 0
    self.curpage = 0
    for page in PDFPage.get_pages(self.pdf_stream, pagenos=pagenos,
                                  maxpages=maxpages, password=password,
                                  caching=True, check_extractable=False):
        # Read page contents
        interpreter.process_page(page)
        self.metadata["Pages"] += 1
        self.curpage += 1

        # Collect URL annotations attached to this page.
        if page.annots:
            refs = self.resolve_PDFObjRef(page.annots)
            if refs:
                if isinstance(refs, list):
                    for ref in refs:
                        if ref:
                            self.references.add(ref)
                elif isinstance(refs, Reference):
                    self.references.add(refs)

    # Remove empty metadata entries
    self.metadata_cleanup()

    # Get text from stream
    self.text = text_io.getvalue().decode("utf-8")
    text_io.close()
    converter.close()

    # Extract URL, arXiv and DOI references from the extracted text.
    for url in extractor.extract_urls(self.text):
        self.references.add(Reference(url, self.curpage))
    for ref in extractor.extract_arxiv(self.text):
        self.references.add(Reference(ref, self.curpage))
    for ref in extractor.extract_doi(self.text):
        self.references.add(Reference(ref, self.curpage))
def main(argv):
    """Command-line entry point: convert the PDF files listed in *argv* to
    text/html/xml/tag output via pdfminer.

    Returns 100 on a usage error, otherwise ``None``.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # NOTE(review): disables layout analysis.  If -n precedes any of
            # -A/-V/-M/-L/-W/-F, the later option would fail on a None
            # laparams -- confirm whether that ordering can occur in practice.
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if debug:
        set_debug_logging()

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: default to text, or infer from the output extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        # Writing to stdout: do not close it afterwards.
        outfp = sys.stdout
        close_outfp = False
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                              outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               outdir=outdir, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                    password=password, caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
def main(argv):
    """Python 2 command-line entry point: batch-convert PDFs (wildcard specs
    allowed) to text/html/xml/tag files written next to each source PDF."""
    import getopt

    def usage():
        print 'Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:'
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Page numbers are 1-based on the command line, 0-based internally.
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    # Propagate the -d verbosity count into pdfminer's class debug flags.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
    if outfile:
        # Infer the output type from the -o file extension.
        if outfile.endswith('.htm') or outfile.endswith('.html'):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    for fname in args:
        # Each positional argument may be a wildcard spec.
        l = glob.glob(fname)
        count = len(l)
        print 'Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format'
        for pdf in l:
            # print pdf
            # One output file per input PDF, extension chosen by out type.
            d = {'html' : 'htm', 'tag' : 'tag', 'text' : 'txt', 'xml' : 'xml'}
            ext = '.' + d[outtype]
            outfile = pdf[0:-4] + ext
            print outfile
            outfp = file(outfile, 'wb')
            if outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=codec,
                                       laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                      laparams=laparams,
                                      imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                       layoutmode=layoutmode, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'tag':
                device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
            else:
                return usage()
            fp = file(pdf, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching,
                                          check_extractable=True):
                # Apply the requested extra rotation before rendering.
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
            fp.close()
            device.close()
            outfp.close()
    print 'Done'
    return