def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path* as a single string.

    Every page of the document is laid out, and the text of each
    ``LTTextBox`` or ``LTTextLine`` object found in the layout is
    concatenated in page order.

    Args:
        path: Filesystem path of the PDF file; it is opened in binary mode.

    Returns:
        The concatenated text (an empty string when no text objects are
        found).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The parser keeps using `fp` while pages are processed, so all of the
    # work stays inside the `with` block; the file is closed afterwards even
    # if processing fails.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link the parser and the document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize with an empty password.
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        chunks = []
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            chunks.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    return ''.join(chunks)
def parse(path):
    """Extract the text of the PDF at *path*, one string per page.

    For every page, the text of each ``LTTextBoxHorizontal`` object in the
    page layout is concatenated into a single string.

    Args:
        path: Filesystem path of the PDF file; it is opened in binary mode.

    Returns:
        A list with one string per page, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
        OSError: If the file cannot be opened.
    """
    # The parser keeps using `fp` while pages are processed, so all of the
    # work stays inside the `with` block; the file is closed afterwards,
    # including when extraction is refused or fails.
    with open(path, 'rb') as fp:  # binary read mode
        parser = PDFParser(fp)
        # Create the PDF document object.
        doc = PDFDocument()
        # Connect the parser and the document object to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize the document; no password is supplied.
        doc.initialize()

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Device object that aggregates each page into a layout.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that feeds pages to the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        fulltext = []
        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout object produced for this page.
            layout = device.get_result()
            page_text = "".join(
                item.get_text()
                for item in layout
                if isinstance(item, LTTextBoxHorizontal)
            )
            fulltext.append(page_text)
        return fulltext
def extract_pdf(self):
    """Fill ``self.properties`` with metadata and a text preview of the PDF.

    Reads ``self.file``, parses it as a PDF and stores the following keys
    in ``self.properties``:

    * ``auteur``, ``creation_date``, ``modification_date``, ``creator``,
      ``producer`` -- taken from the document info dictionary; each stays
      ``None`` when the field is missing or cannot be converted.
    * ``content`` -- the first 300 characters of the text returned by
      ``self.convert_pdf_to_txt()``, always followed by ``'(...)'``.
    * ``page_count`` -- the ``Count`` entry of the page tree (0 if absent).

    Returns:
        ``self.properties``.

    Raises:
        AssertionError: If ``self.extension`` is not ``'pdf'``.
    """
    import logging

    assert self.extension in ['pdf']
    self.content = self.file.read()
    parser = PDFParser(self.file)
    doc = PDFDocument(parser)

    # Fall back to an empty mapping when the document exposes no info
    # dictionary, instead of failing with IndexError on ``doc.info[0]``.
    info = doc.info[0] if doc.info else {}

    for prop in ('auteur', 'creation_date', 'modification_date',
                 'creator', 'producer'):
        self.properties[prop] = None

    def as_date(raw):
        return str(self.convertPdfDatetime(raw))

    # (info key, property name, converter), processed in this order.
    # NOTE(review): Author is decoded as UTF-8 while Creator and Producer
    # are decoded as UTF-16; kept as found -- confirm this is intended.
    fields = (
        ('CreationDate', 'creation_date', as_date),
        ('ModDate', 'modification_date', as_date),
        ('Author', 'auteur', lambda raw: raw.decode("utf-8")),
        ('Creator', 'creator', lambda raw: raw.decode("utf-16")),
        ('Producer', 'producer', lambda raw: raw.decode("utf-16")),
    )
    for key, prop, convert in fields:
        if key not in info:
            continue
        raw = info[key]
        if isinstance(raw, PDFObjRef):
            raw = resolve1(raw)
        try:
            self.properties[prop] = convert(raw)
        except Exception:
            # Metadata is best effort: an unreadable field stays None, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logging.getLogger(__name__).debug(
                "Could not read PDF info field %s", key, exc_info=True)

    parser.set_document(doc)
    pages = resolve1(doc.catalog['Pages'])
    pages_count = pages.get('Count', 0)

    # Only the first 300 characters for clarity
    self.content = self.convert_pdf_to_txt()
    self.properties['content'] = self.content[:300] + '(...)'
    self.properties['page_count'] = pages_count
    return self.properties