def pdf2txt(filePath, outPath):
    """Append the text of every horizontal text box in a PDF to a text file.

    Args:
        filePath: Either a path to the PDF, or an already opened binary file
            object (anything exposing ``read``). A file object is handed to
            ``PDFParser`` as-is and is left open for the caller.
        outPath: Path of the output text file. It is opened once, in append
            mode and as UTF-8, so existing content is kept.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    owns_file = not hasattr(filePath, 'read')
    pdf_file = open(filePath, 'rb') if owns_file else filePath
    try:
        # The parser reads from the file; the document stores its structure.
        parser = PDFParser(pdf_file)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box. Note that the
        # file is therefore created even when no text box is found.
        with open(outPath, 'a', encoding='utf-8') as out_file:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # get_result() returns the layout of the page just processed.
                for item in device.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        out_file.write(item.get_text() + '\n')
    finally:
        # Only close what this function opened itself.
        if owns_file:
            pdf_file.close()
def parse():
    """Print every horizontal text box of a PDF and append it to ``./2.txt``.

    The PDF location is taken from the name ``path``, which is not a
    parameter and must be defined in an enclosing scope.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Keep the PDF open for the whole parse and close it on every exit path.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once rather than once per layout item.
        with open(r'./2.txt', 'a', encoding="UTF-8") as out_file:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The layout holds the objects parsed from this page; only
                # horizontal text boxes are exported.
                for item in device.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        results = item.get_text()
                        print(results)
                        out_file.write(results + '\n')
def test_pdf(self):
    """Check PDF capture through the library API and through the service."""
    # Library API
    self.check_pdf(self.capture.pdf(url=server.base_url + self.url))

    # Service: absolute URL, parent-relative URL and plain relative URL
    candidate_urls = (server.base_url + self.url, '..' + self.url, self.url)
    for candidate in candidate_urls:
        response = self.fetch(self.src, params={'url': candidate})
        self.check_filename(response, 'screenshot.pdf')
        self.check_pdf(response.content)

    # delay=: after 500ms the page changes text and color to blue.
    # file=: changes the download filename.
    delayed = self.fetch(
        self.src, params={'url': self.url, 'delay': 600, 'file': 'delay'})
    self.check_filename(delayed, 'delay.pdf')
    self.assertIn('Blueblock', normalize(get_text(delayed.content)))

    # format= and orientation=
    a3_landscape = self.fetch(self.src, params={
        'url': self.url, 'format': 'A3', 'orientation': 'landscape'})
    document = PDFDocument(PDFParser(io.BytesIO(a3_landscape.content)))
    first_page = next(PDFPage.create_pages(document))
    media_box = [round(x) for x in first_page.attrs['MediaBox']]
    accepted_boxes = (
        [0, 0, 1188, 842],  # noqa: Chrome uses 1188 x 842 for A3
        [0, 0, 1191, 842],  # noqa: PhantomJS uses 1191 x 842 for A3
    )
    self.assertIn(media_box, accepted_boxes)

    # cookie=: the cookie is printed on the page via JS
    with_param = self.fetch(
        self.src, params={'url': self.url + '?show-cookie', 'cookie': 'a=x'})
    self.assertIn('a=x', normalize(get_text(with_param.content)))

    # A Cookie: header behaves like ?cookie=.
    # Old request cookies vanish; only the new ones remain.
    with_header = self.fetch(
        self.src,
        params={'url': self.url + '?show-cookie'},
        headers={'Cookie': 'b=z'})
    page_text = normalize(get_text(with_header.content))
    self.assertIn('js:cookie=b=z', page_text)
    self.assertIn('server:cookie=b=z', page_text)
def extract_pdf(path, languages=None):
    """Extract metadata and per-page text from a PDF file.

    Each page is first run through pdfminer. When that fails, or yields
    fewer than three characters, the page is handed to
    ``_extract_image_page`` instead (logged as OCR).

    Args:
        path: Path of the PDF file.
        languages: Passed through unchanged to ``_extract_image_page``.

    Returns:
        A dict with a ``'pages'`` list holding one entry per page, plus one
        key per entry of the last document-info dictionary (lower-cased and
        stripped; a key named ``pages`` is skipped).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Close the device even when parsing raises part-way through.
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')
            result = {'pages': []}

            if doc.info:
                for raw_key, value in doc.info[-1].items():
                    key = raw_key.lower().strip()
                    # 'pages' is reserved for the extracted page list.
                    if key != 'pages':
                        result[key] = string_value(value)

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    # Best effort: a broken page falls through to the
                    # image-based extraction below.
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
        finally:
            device.close()
        return result
def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page of a PDF.

    Args:
        filename: Path of the PDF file.

    Returns:
        A dict ``{"filename": filename, "pages": [...]}`` where each page
        entry has the keys ``"index"`` (0-based), ``"bounding_box"`` (from
        ``get_bounding_box``) and ``"labels"`` (from ``get_text_labels``).

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    # The context manager closes the file on every exit path, including the
    # extraction-not-allowed error below.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append({
                "index": page_index,
                "bounding_box": bounding_box,
                "labels": labels,
            })

    return result
def PDFreader(pdfPATH, TXTname=""):
    """Extract the text of a PDF and hand it to ``writeFile``.

    Args:
        pdfPATH: Path of the PDF file.
        TXTname: Target name passed to ``writeFile``. When empty, it is
            derived from ``pdfPATH`` by swapping the extension for ``.txt``.
    """
    import os

    with open(pdfPATH, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise with an empty password.
        doc.initialize("")

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)

        texts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Keep only the layout items that expose text.
            for item in device.get_result():
                if hasattr(item, "get_text"):
                    texts.append(item.get_text())

    if TXTname == "":
        # Replace only the final extension. A plain ".pdf" -> ".txt"
        # substitution would leave e.g. "x.PDF" unchanged and the PDF itself
        # would then be overwritten.
        TXTname = os.path.splitext(pdfPATH)[0] + ".txt"
    writeFile(TXTname, "\n".join(texts))
def pdfparse(url, name):
    """Download a PDF, save it in the working directory and return its text.

    Args:
        url: Address of the PDF; fetched through the session ``s`` with a
            generated user-agent header.
        name: File name; the part before the first ``.`` becomes the stem of
            the saved ``<stem>.pdf``.

    Returns:
        The text of all layout items that expose ``get_text``, concatenated
        with leading and trailing newlines stripped from each piece.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    res = s.get(url, headers={"user-agent": generate_user_agent()})
    # os.path.join instead of a hard-coded "\\" so the path is valid on
    # every platform.
    pdf_path = os.path.join(os.getcwd(), "%s.pdf" % name.split(".")[0])
    with open(pdf_path, 'wb') as out_file:
        out_file.write(res.content)

    # The file has to stay open until every page has been read; closing it
    # before iterating the pages leaves the parser with a closed file.
    with open(pdf_path, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                # Skip layout items that carry no text accessor.
                if not hasattr(item, "get_text"):
                    continue
                results = item.get_text()
                if results:
                    pieces.append(results.strip('\n'))
    return ''.join(pieces)
def changePdfToText(self, filePath):
    """Write the text of a PDF to a ``.txt`` file next to it.

    The output path is ``filePath`` with its extension replaced by ``.txt``.
    Each extracted text chunk is printed and written followed by a newline,
    encoded as UTF-8.

    Args:
        filePath: Path of the PDF file.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Open the file named by the parameter (the original read an unrelated
    # name ``path``) and close it once parsing is done.
    with open(filePath, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        texts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                if hasattr(item, "get_text"):
                    texts.append(item.get_text())

    fileNames = os.path.splitext(filePath)
    # The file is opened in binary mode, so the text is encoded explicitly.
    with open(fileNames[0] + '.txt', 'wb') as f:
        for results in texts:
            print(results)
            f.write((results + '\n').encode('utf-8'))
def parse():
    """Append every horizontal text box of a PDF to a fixed UTF-8 text file.

    The PDF location is taken from the name ``path``, which is not a
    parameter and must be defined in an enclosing scope.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Keep the PDF open for the whole parse and close it on every exit path.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once rather than once per text box.
        with open(r'傲慢与偏见英文版.txt', 'a', encoding='utf-8') as out_file:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The layout holds the objects parsed from this page; only
                # horizontal text boxes are exported.
                for item in device.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        out_file.write(item.get_text() + '\n')
def character_extraction(self, address):
    """Pass every text box and text line of a PDF to ``self.fetch_chars``.

    Args:
        address: Path of the PDF file. The file is closed again when the
            method returns or raises.
    """
    with open(address, 'rb') as pdf_file:
        # Parser -> document (opened with an empty password).
        document = PDFDocument(PDFParser(pdf_file), '')

        # The resource manager is shared by the aggregating device and the
        # interpreter that drives it.
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
            # Of all layout objects on the page, only LTTextBox and
            # LTTextLine instances are forwarded.
            for element in aggregator.get_result():
                if not isinstance(element, (LTTextBox, LTTextLine)):
                    continue
                self.fetch_chars(element)
def pdf2txt(self, path,
            out_path=r'E:\pycharm_len\py_learn\learn\office\file\linux_pdf.txt'):
    """Extract the text of a PDF, append it to a text file and print it.

    Args:
        path: Path of the PDF file.
        out_path: Text file the extracted text is appended to. Defaults to
            the location that used to be hard-coded in this method.

    Failures while reading a single layout item or while writing the output
    file are reported with ``print`` and do not abort the run. The collected
    text is printed even if the output file could not be written.
    """
    print('解析pdf中...')
    chunks = []
    with open(path, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        pdfrm = PDFResourceManager()
        device = PDFPageAggregator(pdfrm, laparams=LAParams())
        interpreter = PDFPageInterpreter(pdfrm, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            for item in device.get_result():
                if not hasattr(item, "get_text"):
                    continue
                # Deliberately best-effort: one bad item must not stop the
                # whole extraction.
                try:
                    chunks.append(item.get_text())
                except Exception as err:
                    print('error', err)

    result = ''.join(chunks)
    # A single append replaces reopening the output for every item. A
    # separate handle name avoids shadowing the PDF file handle.
    try:
        with open(out_path, 'a') as out_file:
            out_file.write(result)
    except (OSError, UnicodeError) as err:
        print('error_write', err)

    print('__________' * 10)
    print(result)
def process_attachment(name: str, data: bytes) -> str:
    """Return the text content of an attachment, chosen by file extension.

    Supported extensions are ``.txt``, ``.docx``, ``.pdf``, ``.pptx`` and
    ``.xlsx``. Every format except ``.txt`` is first written to the scratch
    file ``./data/temp`` and parsed from there.

    Args:
        name: Attachment file name; only its suffix is inspected.
        data: Raw attachment bytes.

    Returns:
        The extracted text. An empty string is returned for unsupported
        extensions and for ``.txt`` data that is not valid UTF-8 (a message
        is printed in the latter case).
    """
    if name.endswith(".txt"):
        try:
            return data.decode("utf-8")
        except UnicodeDecodeError:
            print("unable to decode the given text by 'utf-8'")
            return ""

    # Unsupported types yield no text, so skip writing the scratch file.
    if not name.endswith((".docx", ".pdf", ".pptx", ".xlsx")):
        return ""

    temp_file_path = "./data/temp"
    with open(temp_file_path, mode='wb') as temp_file:
        temp_file.write(data)

    if name.endswith(".docx"):
        return docx2txt.process(temp_file_path)

    if name.endswith(".pdf"):
        output_string = StringIO()
        with open(temp_file_path, mode='rb') as pdf:
            parser = PDFParser(pdf)
            doc = PDFDocument(parser)
            resource_manager = PDFResourceManager()
            device = TextConverter(resource_manager, output_string,
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        return output_string.getvalue()

    if name.endswith(".pptx"):
        ppt = Presentation(temp_file_path)
        return "".join(
            shape.text
            for slide in ppt.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    # Remaining supported type: .xlsx
    parts = []
    # A dedicated name keeps the ``data`` parameter intact, and the context
    # manager closes the workbook.
    with pd.ExcelFile(temp_file_path) as workbook:
        for sheet in workbook.sheet_names:
            parts.append(str(workbook.parse(sheet).columns))
        # NOTE(review): the sheet-name list is appended once, after all
        # sheets - confirm this matches the intended output layout.
        parts.append(str(workbook.sheet_names))
    return "".join(parts)
def noimgpdf_change_word(self, _path):
    """Return the text of the horizontal text boxes of a PDF.

    Args:
        _path: Local path of the PDF, or a URL. A value containing
            ``http://www`` is downloaded with a user agent picked at random
            from ``self.user_agent``.

    Returns:
        The concatenated text, or ``None`` if anything goes wrong (including
        a document that does not allow text extraction).
    """
    import io

    try:
        if 'http://www' in _path:
            request = Request(
                url=_path,
                headers={'User-Agent': random.choice(self.user_agent)})
            # The parser needs a seekable stream, so the download is
            # buffered in memory first.
            with urlopen(request) as response:
                fp = io.BytesIO(response.read())
        else:
            fp = open(_path, 'rb')

        # Close the stream on every exit path.
        with fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            chunks = []
            for page in doc.get_pages():
                interpreter.process_page(page)
                for out in device.get_result():
                    if isinstance(out, LTTextBoxHorizontal):
                        chunks.append(out.get_text())
            return ''.join(chunks)
    except Exception:
        # Best effort by contract: callers get None on any failure.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
def parse(path):
    """Return the first horizontal text box of a PDF that mentions '表'.

    A text box matches when its text is longer than six characters and
    contains '表' at any position other than the very first character.

    Args:
        path: Path of the PDF file.

    Returns:
        The matching text with all newlines removed, or ``""`` when no text
        box matches.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # The context manager closes the file on every exit path, including the
    # early return below.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                if not isinstance(item, LTTextBoxHorizontal):
                    continue
                results = item.get_text()
                if (results is not None and len(results) > 6
                        and results.find('表') > 0):
                    return results.replace('\n', '')
    return ""
def readPDF(self, path, callback=None, toPath=''):
    """Read the horizontal text boxes of a PDF and report them.

    Args:
        path: Path of the PDF file.
        callback: Optional callable invoked with the text of each text box.
            When omitted, the text is printed instead.
        toPath: When non-empty, text is neither printed nor passed to
            ``callback``; a fixed message is printed per text box instead.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Keep the PDF open for the whole parse and close it on every exit path.
    with open(path, 'rb') as f:
        parser = PDFParser(f)
        pdfFile = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise with an empty password.
        pdfFile.initialize('')

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)

        # One page at a time; only text is read.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                if not isinstance(item, LTTextBoxHorizontal):
                    continue
                if toPath == '':
                    text = item.get_text()
                    if callback is not None:
                        callback(text)
                    else:
                        print(text)
                else:
                    # NOTE(review): this branch only announces a write;
                    # nothing is written to ``toPath`` - confirm whether
                    # file output is still to be implemented.
                    print('将PDF数据写入文件')
def parse_file(file: Path):
    """Build the chained list of ``PDFBox`` objects for a PDF.

    On each page the horizontal text boxes whose text does not contain
    '猿题库' are collected; the first and last of them are dropped (the
    original author treats them as page header and footer). Every remaining
    box is wrapped in ``PDFBox(box, page_index, previous_pdf_box)``.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    with open(file, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        layout_params = LAParams()
        text_boxes = []  # cleaned boxes across all pages

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        previous_box = None
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            page_interpreter.process_page(page)
            # Raw boxes of this page, before trimming.
            page_boxes = [
                element for element in aggregator.get_result()
                if isinstance(element, LTTextBoxHorizontal)
                and '猿题库' not in element.get_text()
            ]
            # Trim the first and last box of the page.
            for raw_box in page_boxes[1:-1]:
                previous_box = PDFBox(raw_box, page_index, previous_box)
                text_boxes.append(previous_box)

        print('parse end')
        return text_boxes
def parse(pdf_path, toPath):
    """Append every horizontal text box of a PDF to a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF file.
        toPath: Path of the output text file, opened in append mode.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Keep the PDF open for the whole parse and close it on every exit path.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once rather than once per text box.
        with open(toPath, 'a', encoding="utf-8") as out_file:
            for page in doc.get_pages():
                interpreter.process_page(page)
                for item in device.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        out_file.write(item.get_text() + '\n')
def process_pdf(title, path):
    """Convert a PDF into a ``DrocerDocument`` of pages and text boxes.

    @param title string Title to apply to the document.
    @param path string Path to the input PDF.
    @returns DrocerDocument

    Pages and boxes are numbered from 1. Only ``LTTextBox`` layout objects
    become boxes; their text is stored UTF-8 encoded.
    """
    output_document = DrocerDocument(title, path)
    with open(path, 'rb') as pdf_file:
        # Reader setup; the document is opened with an empty password.
        document = PDFDocument(PDFParser(pdf_file), '')
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)

        numbered_pages = enumerate(PDFPage.create_pages(document), start=1)
        for page_number, pdf_page in numbered_pages:
            logger.info("processing %s page number %s" % (title, page_number))
            output_page = DrocerPage(page_number)
            interpreter.process_page(pdf_page)

            # Non-text layout objects are ignored and do not consume a
            # box number.
            text_boxes = (obj for obj in aggregator.get_result()
                          if isinstance(obj, LTTextBox))
            for box_number, text_box in enumerate(text_boxes, start=1):
                output_page.boxes.append(
                    DrocerBox(page_number, box_number,
                              text_box.x0, text_box.y0,
                              text_box.x1, text_box.y1,
                              text_box.get_text().encode('utf8')))
            output_document.pages.append(output_page)
    return output_document
def pdf_text(filename):
    """Extract the text and the "largest text" record of a PDF's first page.

    Figures are handled by ``extract_figure_text``. Text boxes and text
    lines whose whitespace-stripped length exceeds ``MAX_CHARS * 2`` are
    skipped as body text; the others are fed to ``extract_largest_text``
    and appended to the returned text.

    Args:
        filename: Path of the PDF file.

    Returns:
        A tuple ``(largest_text, text)``. ``largest_text`` is a dict with
        the keys ``'contents'``, ``'y0'`` and ``'size'``; ``(cid:...)``
        sequences are removed from ``'contents'``. For a document without
        pages the initial empty record and an empty string are returned.
    """
    text_parts = []
    largest_text = {'contents': '', 'y0': 0, 'size': 0}

    # The context manager closes the file on every exit path.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, '')
        parser.set_document(doc)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                log('lt_obj: ' + str(lt_obj))
                if isinstance(lt_obj, LTFigure):
                    (largest_text, figure_text) = extract_figure_text(
                        lt_obj, largest_text)
                    text_parts.append(figure_text)
                elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    # Ignore body text blocks
                    stripped_to_chars = re.sub(
                        r'[ \t\n]', '', lt_obj.get_text().strip())
                    if len(stripped_to_chars) > MAX_CHARS * 2:
                        continue
                    largest_text = extract_largest_text(lt_obj, largest_text)
                    text_parts.append(lt_obj.get_text() + '\n')
            # Only parse the first page
            break

    # Remove unprocessed CID text
    largest_text['contents'] = re.sub(
        r'(\(cid:[0-9 \t-]*\))*', '', largest_text['contents'])
    return (largest_text, ''.join(text_parts))
def readPDF(path, toPath):
    """Print every horizontal text box of a PDF and append it to a file.

    Args:
        path: Path of the PDF file.
        toPath: Path of the output text file, opened in append mode with
            the platform default encoding.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Keep the PDF open for the whole parse and close it on every exit path.
    with open(path, "rb") as pdf_handle:
        parser = PDFParser(pdf_handle)
        pdfFile = PDFDocument()
        # Link parser and document to each other.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # No password is supplied.
        pdfFile.initialize()

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)

        # Open the output once, under its own name so the PDF handle is not
        # shadowed.
        with open(toPath, "a") as out_file:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                for item in device.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        text = item.get_text()
                        print(text)
                        out_file.write(text + "\n")
def parse_pdf(path):
    """Return the value following the "准考证号:" label in a PDF.

    The first horizontal text box whose text starts with that label is
    used.

    Args:
        path: Path of the PDF file.

    Returns:
        The text after the label with all newlines removed, or ``None``
        when no text box starts with the label.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    label = "准考证号:"
    # The context manager closes the file on every exit path, including the
    # early return below.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                if not isinstance(item, LTTextBoxHorizontal):
                    continue
                results = item.get_text()
                if results.startswith(label):
                    return results[len(label):].replace("\n", "")
    return None
def get_page_num(fpath):
    """Return the page count of a PDF, cached in a JSON side file.

    The cache file is ``"<get_tmp_path(fpath)>.page_num.json"``. When it
    exists, its ``'page_num'`` entry is returned without opening the PDF.
    Otherwise the count is read from the ``Count`` entry of the document
    catalog's ``Pages`` node and stored in the cache.

    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    Args:
        fpath: Path of the PDF file.

    Returns:
        The page count.
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        return load_general(cache_path)['page_num']

    # The context manager closes the PDF once the count has been read.
    with open(fpath, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        page_count = resolve1(document.catalog['Pages'])['Count']

    dump_general({'page_num': page_count}, cache_path)
    return page_count
def process(path):
    """Count word-category hits over the horizontal text boxes of a PDF.

    The lower-cased text of each box is split on whitespace and every part
    is scored with ``count_word`` against seven word collections.

    Args:
        path: Path of the PDF file.

    Returns:
        ``[aud, cur, dat, gen, genlong, geo, nam]`` - the summed
        ``count_word`` results for ``auditor``, ``currency``, ``datesand``,
        ``generic``, ``genericlong``, ``geographic`` and ``names``.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    aud = cur = dat = gen = genlong = geo = nam = 0

    # The file must stay open while the pages are read, so it is closed
    # only after the loop, not straight after initialize().
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                if not isinstance(item, LTTextBoxHorizontal):
                    continue
                for part in item.get_text().lower().split():
                    aud += count_word(part, auditor)
                    cur += count_word(part, currency)
                    dat += count_word(part, datesand)
                    gen += count_word(part, generic)
                    genlong += count_word(part, genericlong)
                    geo += count_word(part, geographic)
                    nam += count_word(part, names)
    return [aud, cur, dat, gen, genlong, geo, nam]
def get_pdf_metadata(self, pdf_file_stream):
    """Read author, title and year from the first info dictionary of a PDF.

    Args:
        pdf_file_stream: Open binary stream of the PDF, passed to
            ``PDFParser``.

    Returns:
        A dict with the keys ``'author'``, ``'title'`` and ``'year'``. A
        key keeps its ``'UNKNOWN_*'`` placeholder when the corresponding
        ``Author`` / ``Title`` / ``ModDate`` entry is missing or converts
        to an empty value. The year is derived from ``ModDate``.
    """
    metadata = {
        'author': 'UNKNOWN_AUTHOR',
        'title': 'UNKNOWN_TITLE',
        'year': 'UNKNOWN_YEAR'
    }
    pdf_parser = PDFParser(pdf_file_stream)
    pdf_doc = PDFDocument(pdf_parser)

    # A document may carry no info dictionary, or one that lacks some of
    # the entries; fall back to the placeholders instead of raising.
    info = pdf_doc.info[0] if pdf_doc.info else {}

    if 'Author' in info:
        author = make_pdf_metadata_str(info['Author'])
        if author:
            metadata['author'] = author

    if 'Title' in info:
        title = make_pdf_metadata_str(info['Title'])
        if title:
            metadata['title'] = title

    if 'ModDate' in info:
        year = pdf_metadata_moddate_to_year(
            make_pdf_metadata_str(info['ModDate']))
        if year:
            metadata['year'] = year

    return metadata
def parse_case(case_path):
    """Parse all the pdf files in a folder and dump them to ``parsed.json``.

    Files whose name starts with ``.`` or does not end in ``.pdf`` are
    skipped. For every remaining file, each page contributes its size and
    the result of ``parse_text(layout)``, keyed by ``layout.pageid`` under
    the document id (the file name up to the first ``.``).

    Args:
        case_path: Folder containing the PDFs. Its last path component is
            used as the case id; a trailing separator is optional.

    Returns:
        Always ``None``. On any error ``"Error <case_path>"`` is printed
        and ``parsed.json`` is not written.
    """
    try:
        # normpath drops a trailing separator, so the folder name is found
        # with or without one.
        case_id = os.path.basename(os.path.normpath(case_path))
        result = {'id': case_id, 'docs': {}}

        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj

            # Close each PDF as soon as its pages have been processed.
            with open(os.path.join(case_path, name), 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # Closing the output guarantees the JSON is flushed to disk.
        with open(os.path.join(case_path, 'parsed.json'), 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best effort per case folder; ``Exception`` (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        print("Error " + case_path)
    return None
def fetch_raw_outline(self):
    """Load document info and outlines from ``self.file_path``.

    The first info dictionary is stored in ``self.paper_info``. If it
    cannot be read, ``self.paper_dict['HasInfo']`` is set to ``False``; if
    the outlines cannot be read, ``self.paper_dict['HasOLF']`` is set to
    ``False``. Both failures are reported with ``print``.

    With outlines present ``self.meta_helper(doc)`` is called, otherwise
    ``self.add_manual_title()``; ``self.gen_outlines()`` runs afterwards.
    """
    with open(self.file_path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)

        try:
            self.paper_info = doc.info[0]
        except Exception as e:
            self.paper_dict['HasInfo'] = False
            print('No paper-info. ERROR: {}'.format(e))

        raw_outlines = []
        try:
            raw_outlines = list(doc.get_outlines())
        except Exception as e:
            self.paper_dict['HasOLF'] = False
            # The message now has a placeholder, so the error is actually
            # shown instead of being dropped by format().
            print('[WARN] The file does not contain outline-frame. '
                  'ERROR: {}'.format(e))

        if raw_outlines:
            self.meta_helper(doc)
        else:
            self.add_manual_title()
        # NOTE(review): gen_outlines() is assumed to run for both branches;
        # confirm it was not meant to run only when no outlines exist.
        self.gen_outlines()
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    Args:
        pdf_doc: Path of the PDF file.
        fn: Callable invoked as ``fn(doc, *args)`` with the parsed
            ``PDFDocument``.
        pdf_pwd: Password handed to ``PDFDocument``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the document is not
        extractable or an ``IOError`` occurs (e.g. the file is missing).
    """
    result = None
    try:
        # The context manager closes the file even when ``fn`` raises.
        with open(pdf_doc, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser, pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
def auto_rename(pdf_file, file, finalname_list):
    """Rename a PDF after its creation date plus a supplied name.

    The new name is ``"<YYYY>-<MM>-<DD>-<HHmm>-<n> - <name>"``: the date
    parts come from the ``CreationDate`` entry of the first info
    dictionary, ``n`` is a random integer from 1 to 99 meant to reduce name
    clashes, and ``name`` is ``''.join(finalname_list[0])``.

    Args:
        pdf_file: Unused.
        file: Path of the PDF to rename.
        finalname_list: Sequence whose first element supplies the name part.
    """
    with open(file, 'rb') as file_date:
        parser = PDFParser(file_date)
        doc = PDFDocument(parser)
        raw_date = doc.info[0]['CreationDate']

    # Decode byte values properly instead of slicing their repr().
    if isinstance(raw_date, bytes):
        raw_date = raw_date.decode('latin-1')
    else:
        raw_date = str(raw_date)
    # Drop the leading "D:" marker when present.
    digits = raw_date[2:] if raw_date.startswith('D:') else raw_date

    # digits[8:12] is hour plus minute; the earlier 10-character slice cut
    # the minutes off.
    final_date = '-'.join([
        digits[0:4], digits[4:6], digits[6:8], digits[8:12],
        str(random.randint(1, 99)),
    ])
    final_filename = final_date + ' - ' + ''.join(finalname_list[0])

    # Only warns; the rename below is still attempted.
    if final_filename in os.listdir():
        print(f'You Are Overwritting a File {final_filename}')
    print(f'this is the filename BEFORE {file}')
    os.rename(file, final_filename)
    print(f'this is the filename AFTER {final_filename}')
def parsePDF(pdfPath, pdfPwd='', imgFolderPath='/tmp', saveImgs=False):
    """Process the pages of a PDF and write their text to ``text.txt``.

    The text returned by ``parsePages`` is written, UTF-8 encoded, to
    ``<imgFolderPath>/text.txt``. Nothing is written when the document is
    not extractable or when an ``IOError`` occurs.

    NOTE(review): an earlier docstring promised a list of page strings as
    return value, but this function returns ``None``.

    Args:
        pdfPath: Path of the PDF file.
        pdfPwd: Password used to initialise the document.
        imgFolderPath: Output folder, created if missing; also passed to
            ``parsePages``.
        saveImgs: Passed through to ``parsePages``.
    """
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(imgFolderPath, exist_ok=True)

    try:
        # The context manager closes the PDF even when parsing raises.
        with open(pdfPath, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            # Link parser and document to each other.
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pdfPwd)

            if doc.is_extractable:
                text = parsePages(doc, imgFolderPath, saveImgs=saveImgs)
                with open('{0}/text.txt'.format(imgFolderPath), 'w',
                          encoding='utf-8') as f:
                    f.writelines(text)
    except IOError:
        # the file doesn't exist or similar problem
        pass
def parse_pdf(f_name):
    """Return the layout analysis result of every page of a PDF.

    Args:
        f_name: Path of the PDF file.

    Returns:
        A list with one ``device.get_result()`` value per page, in page
        order.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # The context manager closes the file on every exit path.
    with open(f_name, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, '')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager, aggregating device and a single
        # interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        analysis = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            analysis.append(device.get_result())
    return analysis