class PDFReader:
    """Print basic information about a PDF file.

    The file ``<filename>.pdf`` is opened in binary mode on construction and
    wrapped in a ``PdfFileReader``. Call :meth:`close` (or use the instance
    as a context manager) to release the file handle when done.
    """

    def __init__(self, filename):
        """Open ``filename + ".pdf"`` and create the underlying reader."""
        self.filename = filename
        self.file = open(self.filename + ".pdf", 'rb')
        try:
            self.pdf = PdfFileReader(self.file)
        except Exception:
            # Do not leak the handle when the file is not a readable PDF.
            self.file.close()
            raise

    def close(self):
        """Close the underlying file handle."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def printBookDetails(self):
        """Print the page count, title and author of the document."""
        print("Details of the book")
        print("Number of pages:", self.pdf.getNumPages())
        # getDocumentInfo() returns None when the PDF has no /Info dictionary.
        info = self.pdf.getDocumentInfo()
        print("Title:", info.title if info is not None else None)
        print("Author:", info.author if info is not None else None)

    def printPage(self, pageNo):
        """Print the extracted text of the page with index ``pageNo``."""
        print("Reading Page ", pageNo)
        page = self.pdf.getPage(pageNo)
        print(page.extractText())

    def printOutline(self):
        """Print the titles of the top-level outline entries.

        Nested lists (child bookmarks) are skipped.
        """
        print("Book Outline")
        for heading in self.pdf.getOutlines():
            if not isinstance(heading, list):
                print(dict(heading).get('/Title'))
def add_watermark_to_pdf(self, src, dst, water):
    """Merge the first page of ``water`` onto every page of ``src``.

    The result is written to ``dst`` together with the outline of ``src``
    (via ``self._add_bookmark``). Progress is reported through
    ``self.progress_changed.emit(current, total)``; a final emit with
    ``total + 1`` is sent after the file has been written.

    Args:
        src: Path of the PDF to watermark.
        dst: Path of the PDF to create.
        water: Path of the PDF whose first page is the watermark.
    """
    # Both inputs must stay open until the writer has been flushed, because
    # the page objects are read from the streams while writing.
    with open(water, 'rb') as file_water, open(src, 'rb') as file_src:
        water_reader = PdfFileReader(file_water)
        water_page = water_reader.getPage(0)
        source_reader = PdfFileReader(file_src)
        dest_write = PdfFileWriter()
        page_count = source_reader.numPages

        for pageNum in range(page_count):
            self.progress_changed.emit(pageNum + 1, page_count)
            pdf_page = source_reader.getPage(pageNum)
            pdf_page.mergePage(water_page)
            dest_write.addPage(pdf_page)

        # NOTE(review): the metadata is copied from the watermark file, not
        # from the source file — confirm this is intended.
        # getDocumentInfo() returns None when there is no /Info dictionary.
        org_info = water_reader.getDocumentInfo() or {}
        infos = {}
        for k in org_info:
            infos[k] = org_info[k]
            print(k, org_info[k])
        infos['/Producer'] = 'LiuShengKun'
        infos['/Title'] = os.path.basename(src)
        dest_write.addMetadata(infos)

        outlines = source_reader.getOutlines()
        self._add_bookmark(dest_write, outlines)

        with open(dst, 'wb') as f:
            dest_write.write(f)

    self.progress_changed.emit(page_count + 1, page_count)
def page_extract():
    """Render a page range of ``PDF_DIR`` to PNGs and build cards from them.

    For every page index from ``int(start) - 1`` up to ``int(end) - 1`` the
    page is written to a temporary single-page PDF below ``TMP_DIR``,
    converted with ``gs_pdf_to_png`` and paired with the value returned by
    ``closest(bmrks, page_number)``. The flat list is then regrouped in
    chunks of four by ``group`` and passed to ``make_cards``.

    Relies on the module-level names ``PDF_DIR``, ``TMP_DIR``, ``start`` and
    ``end``.
    """
    # The file name parts do not change per page, so compute them once.
    name, ext = os.path.splitext(os.path.basename(PDF_DIR))
    png_list = []

    with open(PDF_DIR, 'rb') as pdf_file:
        pdf_in = PdfFileReader(pdf_file)
        pg_id_num_map = page_id_to_num(pdf_in)
        outlines = pdf_in.getOutlines()
        bmrks = bookmarks(outlines, pg_id_num_map)

        for i in range(int(start) - 1, int(end)):
            output = PdfFileWriter()
            output.addPage(pdf_in.getPage(i))
            pdf_out = '{}{}'.format(
                TMP_DIR, '{}-{}{}'.format(name, str(i).zfill(6), ext))
            try:
                with open(pdf_out, 'wb') as output_stream:
                    output.write(output_stream)
                png_list.append(gs_pdf_to_png(pdf_out))
            finally:
                # Always remove the temporary page, even if rendering failed.
                if os.path.exists(pdf_out):
                    os.remove(pdf_out)
            png_list.append(closest(bmrks, i + 1))

    png_list = group(png_list, 4)
    for tup in png_list:
        make_cards(tup[0], tup[2], tup[3])
        # Single-argument form prints identically on Python 2 and 3.
        print("Current Tag Processed: " + tup[3])
def test_get_destination_age_number():
    """Resolve the page number of every top-level outline entry.

    The test passes when ``getDestinationPageNumber`` does not raise for any
    non-nested entry of ``pdflatex-outline.pdf``.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
    reader = PdfFileReader(pdf_path)
    top_level = [
        entry for entry in reader.getOutlines()
        if not isinstance(entry, list)
    ]
    for entry in top_level:
        reader.getDestinationPageNumber(entry)
def bookmarks(PDF):
    """Build a table of contents from the top-level bookmarks of a PDF.

    Nested bookmark lists and bookmarks whose title is in the module-level
    ``exclude`` collection are ignored.

    Args:
        PDF: Path of the PDF file to read.

    Returns:
        list: ``(first_page, last_page, title)`` tuples with 1-based page
        numbers, where ``last_page`` is the page before the next bookmark.
        The final bookmark has no successor and is therefore not included.
    """

    def page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
        """Map the object id of every page-tree node to a 0-based page index."""
        if _result is None:
            _result = {}
        if pages is None:
            _num_pages = []
            pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
        t = pages["/Type"]
        if t == "/Pages":
            for page in pages["/Kids"]:
                _result[page.idnum] = len(_num_pages)
                page_id_to_num(pdf, page.getObject(), _result, _num_pages)
        elif t == "/Page":
            # _num_pages is only used as a counter of leaf pages seen so far.
            _num_pages.append(1)
        return _result

    def collect_bookmarks(outlines, pg_id_num_map, result=None):
        """Flatten ``outlines`` into ``(1-based page, title)`` tuples."""
        if result is None:
            result = []
        if isinstance(outlines, list):
            for outline in outlines:
                result = collect_bookmarks(outline, pg_id_num_map, result)
        elif isinstance(outlines, PyPDF2.pdf.Destination):
            result.append((pg_id_num_map[outlines.page.idnum] + 1,
                           outlines['/Title']))
        return result

    # Keep the file open while PyPDF2 objects are resolved, then close it.
    with open(PDF, 'rb') as pdf_file:
        pdf_in = PdfFileReader(pdf_file)
        pg_id_num_map = page_id_to_num(pdf_in)
        outlines = [
            item for item in pdf_in.getOutlines()
            if not isinstance(item, list) and item['/Title'] not in exclude
        ]
        bmrks = collect_bookmarks(outlines, pg_id_num_map)

    # Pair each bookmark with its successor; zip stops at the last pair.
    return [
        (first_page, next_page - 1, title)
        for (first_page, title), (next_page, _next_title)
        in zip(bmrks, bmrks[1:])
    ]
def extract_bookmarks(pdf_filename):
    """Return the bookmarks of ``pdf_filename`` as a list.

    The outline is read with ``PdfFileReader.getOutlines`` and passed to
    ``_recursive_extract_bookmarks`` together with the page-id map built by
    ``_construct_page_id_to_page_number_map``; that helper fills the list
    that is returned.

    Args:
        pdf_filename: Path of the PDF file to read.
    """
    bookmarks = []
    # The context manager closes the file even when parsing raises.
    with open(pdf_filename, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)
        page_numbers = _construct_page_id_to_page_number_map(pdf)
        outlines = pdf.getOutlines()
        _recursive_extract_bookmarks(outlines, page_numbers, bookmarks)
    return bookmarks
def get_page_numbers(pdf_name):
    """Return the bookmark pages and the page count of ``pdf_name``.

    Returns:
        tuple: ``(pages, total_pages)`` where ``pages`` maps each bookmark
        title to its page, as reported by ``outlines_pg_zoom_info``.
    """
    with open(pdf_name, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        total_pages = reader.numPages

        # map page ids to page numbers
        id_to_number = _setup_page_id_to_num(reader)
        bookmarks_info = outlines_pg_zoom_info(reader.getOutlines(),
                                               id_to_number)

        pages = {}
        for meta in bookmarks_info.values():
            pages[meta["title"]] = meta["page"]
        return pages, total_pages
def getPdfOutlines(pdfpath, listpath, isPage):
    """Write the outline of a PDF into a Word document and return its path.

    Args:
        pdfpath: Path of the PDF file to read.
        listpath: Directory in which ``list.docx`` is created.
        isPage: Passed through unchanged to ``getOutline``.

    Returns:
        str: Path of the saved Word document.
    """
    # Module-level list holding the outline; it is reset before each run.
    # presumably getOutline() appends to it — verify against that helper.
    global returnlist
    with open(pdfpath, "rb") as pdf_file:
        doc = PdfFileReader(pdf_file)
        outlines = doc.getOutlines()  # read the outline
        returnlist = []
        mylist = getOutline(outlines, isPage)  # collect the outline recursively
        outpath = os.path.join(listpath, 'list.docx')  # target document path

        w = DispatchEx("Word.Application")  # start a Word application object
        try:
            w.Visible = 1
            w.DisplayAlerts = 0
            doc1 = w.Documents.Add()  # add a new Word document
            range1 = doc1.Range(0, 0)
            for item in mylist:  # insert every outline entry into the document
                range1.InsertAfter(item)
            doc1.SaveAs(outpath)  # save the document
            doc1.Close()  # close the document
        finally:
            # Quit Word even on failure so no orphaned process is left behind.
            w.Quit()
    return outpath
def getPdfOutlines(pdfpath, listpath, isList):
    """Write the outline of a PDF into a Word document and return its path.

    Args:
        pdfpath: Path of the PDF file to read.
        listpath: Directory in which ``list.docx`` is created.
        isList: Passed through unchanged to ``getOutline``.

    Returns:
        str: Path of the saved Word document.
    """
    # The original declared ``global returnlist`` but assigned ``returnList``,
    # so the module-level list was never reset between calls.
    global returnlist
    with open(pdfpath, 'rb') as pdf_file:
        doc = PdfFileReader(pdf_file)
        outlines = doc.getOutlines()
        returnlist = []
        mylist = getOutline(outlines, isList)
        outpath = os.path.join(listpath, 'list.docx')

        w = DispatchEx('Word.Application')
        try:
            w.Visible = 1
            w.DisplayAlerts = 0
            doc1 = w.Documents.Add()
            range1 = doc1.Range(0, 0)
            for item in mylist:
                range1.InsertAfter(item)
            doc1.SaveAs(outpath)
            # Use the documented casing of the Word method name.
            doc1.Close()
        finally:
            # Quit Word even on failure so no orphaned process is left behind.
            w.Quit()
    return outpath
def merge_pdf_template(src_pdf_path, template_pdf_path, dst_pdf_path):
    """Prepend a cover page to a PDF and watermark all of its pages.

    The template must have at least two pages: page 0 is used as the cover
    and page 1 is merged onto every page of the source document. The outline
    of the source is transferred with ``write_outline``.

    Any exception is caught and its message printed; nothing is raised to
    the caller.

    Args:
        src_pdf_path: PDF to decorate.
        template_pdf_path: PDF providing the cover and watermark pages.
        dst_pdf_path: Path of the PDF to create.
    """
    try:
        template = PdfFileReader(template_pdf_path, strict=False)
        if template.getNumPages() < 2:
            print('%s %s' % (
                template_pdf_path,
                'page num must >=2, page 0 for cover page 1 for watermark and header footer!!!!'))
            return

        cover_page = template.getPage(0)
        watermark_page = template.getPage(1)

        pdf_reader = PdfFileReader(src_pdf_path, strict=False)
        pdf_outlines = pdf_reader.getOutlines()

        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(cover_page)
        pdf_writer.appendPagesFromReader(pdf_reader)

        # The writer holds the cover at index 0, so the source pages are at
        # 1..N. Iterating 0..N-1 watermarked the cover and skipped the last
        # source page.
        for page_index in range(1, pdf_reader.getNumPages() + 1):
            pdf_page = pdf_writer.getPage(page_index)
            pdf_page.mergePage(watermark_page)

        write_outline(pdf_reader, pdf_writer, pdf_outlines, None)

        # To password-protect the result, call pdf_writer.encrypt(...) here.
        with open(dst_pdf_path, 'wb') as pdfOutputFile:
            pdf_writer.write(pdfOutputFile)
    except Exception as err:
        message = str(err)
        # Python 2 byte strings may carry escaped non-ASCII characters;
        # Python 3 str has no decode() and is printed as is.
        if hasattr(message, 'decode'):
            message = message.decode("string_escape")
        print(message)
def test_investment_report_pdf(self):
    """Generate an investment report and check its first outline entry.

    One instance of every content model is created for a test market and
    sector, the PDF is generated unmoderated, and the first outline entry
    is expected to be titled 'Contents'.
    """
    # TODO create example data for everything - after lunch
    market = Market.objects.create(name='test')
    sector = Sector.objects.create(name='test')

    sample_content = '# Lorem Ipsum\n\nHello'

    FrontPage.objects.create(sector=sector)
    SectorOverview.objects.create(sector=sector, content=sample_content)
    KillerFacts.objects.create(sector=sector, content=sample_content)
    MacroContextBetweenCountries.objects.create(market=market, content=sample_content)
    UKMarketOverview.objects.create()
    SectorInitiatives.objects.create(sector=sector, content=sample_content)
    LastPage.objects.create(content=sample_content)

    pdf_io = investment_report_pdf_generator(market, sector, 'Test', moderated=False)
    reader = PdfFileReader(pdf_io)

    # Nothing else one can really do other than visual
    # inspection of the PDF
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(reader.getOutlines()[0]['/Title'], 'Contents')
def splitAccordingToBookmarks(PDF, basePDFName):
    """Split ``PDF`` into several files at the pages its bookmarks point to.

    Whenever a page index is the target of an outline entry, the pages
    collected so far are written to ``<basePDFName><count>.pdf`` and a new
    part is started with that page. The remaining pages are written after
    the loop.

    Returns:
        int: Number of entries in the outline list.
    """
    with open(PDF, 'rb') as source:
        reader = PdfFileReader(source)
        id_to_page = _setup_page_id_to_num(reader)
        entries = reader.getOutlines()
        split_points = {id_to_page[entry.page.idnum] for entry in entries}

        writer = PdfFileWriter()
        part_index = 0
        for page_index in range(reader.numPages):
            if page_index in split_points:
                print('Found ' + str(page_index) + ' in s')
                part_name = str(basePDFName) + str(part_index) + ".pdf"
                with open(part_name, "wb") as part_file:
                    writer.write(part_file)
                part_index += 1
                writer = PdfFileWriter()
            else:
                print('Just added ' + str(page_index) + ' to o/p')
            # Either way the current page belongs to the part being built.
            writer.addPage(reader.getPage(page_index))

        last_name = str(basePDFName) + str(part_index) + ".pdf"
        with open(last_name, "wb") as part_file:
            writer.write(part_file)
        return len(entries)
srcfile = os.path.join(original_folder, filename)

# Check that the file exists; log the failure and stop otherwise.
try:
    filepath = os.stat(srcfile)
except OSError:
    upload = False
    errormsg = "file does not exist"
    error_log(filename, upload, errormsg)
    sys.exit()

with open(srcfile, "rb") as f:
    pdf = PdfFileReader(f)

    # Read the bookmarks; a failure here is logged as nested bookmarks.
    # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
    try:
        bookmarks = pdf.getOutlines()
    except Exception:
        upload = False
        errormsg = "this file contains bookmarks with child"
        error_log(filename, upload, errormsg)
        sys.exit()

    # Read bookmarks
    if bookmarks:
        for b in bookmarks:
            invID = b['/Title']
            # Raw string: '\w' is an invalid escape in a normal literal.
            if len(invID) < 22 and re.match(r'\w', invID):
                i = pdf.getDestinationPageNumber(b)

                # Search InvID in database
                # Connect to db
                db = client.iportalDevDB19
                # Connect to collection
import PyPDF2
from PyPDF2 import PdfFileReader

# Print basic details and the top-level outline titles of example.pdf.
# Every print call takes a single pre-formatted argument so the output is
# the same on Python 2 and Python 3.
with open("example.pdf", 'rb') as pdf:
    reader = PdfFileReader(pdf)
    print("PDF Reader object is: %s" % (reader,))
    print("Number of pages: %s" % (reader.getNumPages(),))

    # getDocumentInfo() returns None when the PDF has no /Info dictionary.
    info = reader.getDocumentInfo()
    print("Title:  %s" % (info.title if info is not None else None,))
    print("Author:  %s" % (info.author if info is not None else None,))

    print("Book Outline")
    for heading in reader.getOutlines():
        # Nested lists are child bookmarks; only top-level titles are shown.
        if not isinstance(heading, list):
            print(dict(heading).get('/Title'))
def __call__(self, value, system):
    """
    Implements a subclass of pyramid_oereb.core.renderer.extract.json_.Renderer to create a print result
    out of a json. The json extract is reformatted to fit the structure of mapfish print.

    When ``compute_toc_pages`` is enabled in the print configuration, the
    generated PDF is inspected and, if the number of table-of-contents pages
    differs from the prediction, the PDF is requested a second time.

    Args:
        value (tuple): A tuple containing the generated extract record and the params
            dictionary.
        system (dict): The available system properties.

    Returns:
        buffer: The pdf content as received from configured mapfish print instance url.
        If the request carries a ``getspec`` parameter other than ``no``, the
        print specification is returned as a JSON string instead.

    Raises:
        HTTPBadRequest: If the parameters request images.
        HTTPInternalServerError: If the returned PDF cannot be read.
    """
    log.debug("Parameter webservice is {}".format(value[1]))

    if value[1].images:
        raise HTTPBadRequest('With image is not allowed in the print')

    self._request = self.get_request(system)

    # Create a lower case GET dict to be able to accept all cases of upper and lower case writing
    self._lowercase_GET_dict = dict(
        (k.lower(), v.lower()) for k, v in self._request.GET.items())

    # If a language is specified in the request, use it. Otherwise, use the language from base class
    self._fallback_language = Config.get('default_language')
    if 'lang' in self._lowercase_GET_dict:
        self._language = self._lowercase_GET_dict.get('lang')

    self._static_error_message = Config.get('static_error_message').get(self._language) or \
        Config.get('static_error_message').get(self._fallback_language)

    # Based on extract record and webservice parameter, render the extract data as JSON
    extract_record = value[0]
    extract_as_dict = self._render(extract_record, value[1])
    feature_geometry = mapping(extract_record.real_estate.limit)

    if Config.get('print', {}).get('compute_toc_pages', False):
        extract_as_dict['nbTocPages'] = TocPages(
            extract_as_dict).getNbPages()
    else:
        extract_as_dict['nbTocPages'] = 1

    self.convert_to_printable_extract(extract_as_dict, feature_geometry)

    print_config = Config.get('print', {})

    extract_as_dict[
        'Display_RealEstate_SubunitOfLandRegister'] = print_config.get(
            'display_real_estate_subunit_of_land_register', True)

    extract_as_dict['Display_Certification'] = print_config.get(
        'display_certification', False)

    # spec keeps a reference to extract_as_dict, so later changes to the
    # dict are picked up when spec is serialised again.
    spec = {
        'layout': Config.get('print', {})['template_name'],
        'outputFormat': 'pdf',
        'lang': self._language,
        'attributes': extract_as_dict,
    }

    response = self.get_response(system)

    if self._request.GET.get('getspec', 'no') != 'no':
        response.headers[
            'Content-Type'] = 'application/json; charset=UTF-8'
        return json.dumps(spec, sort_keys=True, indent=4)

    pdf_url = urlparse.urljoin(
        Config.get('print', {})['base_url'] + '/', 'buildreport.pdf')
    pdf_headers = Config.get('print', {})['headers']
    print_result = requests.post(pdf_url,
                                 headers=pdf_headers,
                                 data=json.dumps(spec))
    try:
        if Config.get('print', {}).get('compute_toc_pages', False):
            with io.BytesIO() as pdf:
                pdf.write(print_result.content)
                pdf_reader = PdfFileReader(pdf)
                # Read the outline once instead of once per loop iteration.
                outlines = pdf_reader.getOutlines()
                x = [
                    outline['/Page']['/StructParents']
                    for outline in outlines
                ]
                try:
                    true_nb_of_toc = min(x) - 1
                except ValueError:
                    # min() of an empty list: the PDF has no outline entries.
                    true_nb_of_toc = 1

                if true_nb_of_toc != extract_as_dict['nbTocPages']:
                    log.warning(
                        'nbTocPages in result pdf: {} are not equal to the one predicted : {}, request new pdf'
                        .format(true_nb_of_toc,
                                extract_as_dict['nbTocPages']))  # noqa
                    extract_as_dict['nbTocPages'] = true_nb_of_toc
                    print_result = requests.post(pdf_url,
                                                 headers=pdf_headers,
                                                 data=json.dumps(spec))
    except PdfReadError as e:
        err_msg = 'a problem occurred while generating the pdf file'
        log.error(err_msg + ': ' + str(e))
        raise HTTPInternalServerError(self._static_error_message)

    try:
        content = print_result.content
    except PdfReadError as e:
        err_msg = 'No contents from print result available!'
        log.error(err_msg + ': ' + str(e))
        raise HTTPInternalServerError(self._static_error_message)

    # Save printed file to the specified path.
    pdf_archive_path = print_config.get('pdf_archive_path', None)
    if pdf_archive_path is not None:
        self.archive_pdf_file(pdf_archive_path, content, extract_as_dict)

    response.status_code = print_result.status_code
    response.headers = print_result.headers
    if 'Transfer-Encoding' in response.headers:
        del response.headers['Transfer-Encoding']
    if 'Connection' in response.headers:
        del response.headers['Connection']
    return content
def test_get_outlines(src, outline_elements):
    """Check that ``src`` yields the expected number of outline entries."""
    outline_count = len(PdfFileReader(src).getOutlines())
    assert outline_count == outline_elements
class PdfBookmark(object):
    """
    This class supports import/export PDF's bookmarks from/to a file.
    """

    def __init__(self, pdfPathName):
        """Open the PDF, read its outline and annotate it with page ratios."""
        self.pdfFileName = pdfPathName
        self._pdfStream = open(self.pdfFileName, 'rb')
        self._pdfReader = PdfFileReader(self._pdfStream)

        self.pageLabels = self._getPageLabels()
        self.outlines = self._pdfReader.getOutlines()
        self._addPageRatio(self.outlines, self.pageLabels)

    def getBookmark(self):
        """
        Retrieve this pdf's bookmark.
        """
        return self.outlines

    def exportBookmark(self, bookmarkFile):
        """
        Export bookmarks to a file.
        """
        # Close the stream explicitly so the content is flushed to disk.
        with codecs.open(bookmarkFile, 'w', encoding='utf8') as stream:
            _writeBookmarkToStream(self.outlines, stream, 0)
        print("Export %s's bookmarks to %s finished!" %
              (self.pdfFileName, bookmarkFile))

    def importBookmark(self, bookmarkFile, saveAsPdfName=None):
        """
        Import the contents from a bookmark file and add these bookmarks
        to the current pdf file or another pdf file.

        When saveAsPdfName is None the result is written next to the source
        file, with '_bookmark.pdf' replacing its last four characters.
        """
        outlines = readBookmarkFromFile(bookmarkFile)
        output = PdfFileWriter()
        for i in range(0, self._pdfReader.getNumPages()):
            output.addPage(self._pdfReader.getPage(i))
        _writeOutlinesToPdf(outlines, output, None)

        if saveAsPdfName is None:
            saveAsPdfName = self.pdfFileName[0:-4] + '_bookmark.pdf'
        with open(saveAsPdfName, 'wb') as stream:
            output.write(stream)
        print("Add bookmarks in %s to %s finished!" %
              (bookmarkFile, saveAsPdfName))

    def _getPageLabels(self):
        """
        Get the map from IndirectObject id to real page number.
        """
        pageLabels = {}
        for pageNum, page in enumerate(self._pdfReader.pages, 1):
            pageLabels[page.indirectRef.idnum] = pageNum
        return pageLabels

    def _addPageRatio(self, outlines, pageLabels):
        """
        Retrieves page ratio from Destination list.

        Every Destination in outlines is replaced in place by a plain dict
        that additionally carries a '/Ratio' entry: the 1-based page number
        plus the relative vertical position of the target on that page.
        Nested lists are processed recursively. The process exits with
        status -1 when an entry has no '/Page' key or its page is unknown.

        param outlines: Destination list
        param pageLabels: map from IndirectObject id to real page number
        """
        for i in range(0, len(outlines)):
            outline = outlines[i]
            if isinstance(outline, list):
                self._addPageRatio(outlines[i], pageLabels)
                continue
            # 'in' replaces dict.has_key(), which does not exist on Python 3.
            elif '/Page' not in outline:
                print("Error: outline has no key '/Page'")
                sys.exit(-1)
            pageHeight = outline['/Page']['/MediaBox'][-1]
            idIndirect = outline.page.idnum
            if idIndirect in pageLabels:
                pageNum = pageLabels[idIndirect]
            else:
                print(
                    'Error: Page corresponds to IndirectObject %d not Found' %
                    idIndirect)
                sys.exit(-1)
            # Missing coordinates default to the top of the page at zoom 1.
            if '/Top' in outline:
                top = outline['/Top']
            else:
                top = pageHeight
            if '/Zoom' in outline:
                zoom = outline['/Zoom']
            else:
                zoom = 1
            outline = dict(outline)
            try:
                outline['/Ratio'] = pageNum + (1 - top / zoom / pageHeight)
            except Exception:
                # Best effort: values that cannot be divided (null objects,
                # zero zoom) leave the entry without a '/Ratio'.
                pass
            outlines[i] = outline
def __call__(self, value, system):
    """
    Implements a subclass of pyramid_oereb.lib.renderer.extract.json_.Renderer to create a print result
    out of a json. The json extract is reformatted to fit the structure of mapfish print.

    When ``compute_toc_pages`` is enabled in the print configuration, the
    generated PDF is inspected and, if the number of table-of-contents pages
    differs from the prediction, the PDF is requested a second time. For a
    non-reduced extract that was printed successfully, the documents
    collected in ``pdf_to_join`` are downloaded and appended with ``pdftk``.

    Args:
        value (tuple): A tuple containing the generated extract record and the params
            dictionary.
        system (dict): The available system properties.

    Returns:
        buffer: The pdf content as received from configured mapfish print instance url.
        If the request carries a ``getspec`` parameter other than ``no``, the
        print specification is returned as a JSON string instead.

    Raises:
        HTTPBadRequest: If the parameters request images.
        subprocess.CalledProcessError: If ``pdftk`` exits with an error.
    """
    log.debug("Parameter webservice is {}".format(value[1]))

    if value[1].images:
        raise HTTPBadRequest('With image is not allowed in the print')

    self._request = self.get_request(system)

    # Create a lower case GET dict to be able to accept all cases of upper and lower case writing
    self._lowercase_GET_dict = dict(
        (k.lower(), v.lower()) for k, v in self._request.GET.iteritems())

    # If a language is specified in the request, use it. Otherwise, use the language from base class
    self._fallback_language = Config.get('default_language')
    if 'lang' in self._lowercase_GET_dict:
        self._language = self._lowercase_GET_dict.get('lang')

    # Based on extract record and webservice parameter, render the extract data as JSON
    extract_record = value[0]
    extract_as_dict = self._render(extract_record, value[1])
    feature_geometry = mapping(extract_record.real_estate.limit)
    pdf_to_join = set()

    if Config.get('print', {}).get('compute_toc_pages', False):
        extract_as_dict['nbTocPages'] = TocPages(
            extract_as_dict).getNbPages()
    else:
        extract_as_dict['nbTocPages'] = 1

    self.convert_to_printable_extract(extract_as_dict, feature_geometry,
                                      pdf_to_join)

    print_config = Config.get('print', {})

    extract_as_dict[
        'Display_RealEstate_SubunitOfLandRegister'] = print_config.get(
            'display_real_estate_subunit_of_land_register', True)

    extract_as_dict['Display_Certification'] = print_config.get(
        'display_certification', True)

    # spec keeps a reference to extract_as_dict, so later changes to the
    # dict are picked up when spec is serialised again.
    spec = {
        'layout': Config.get('print', {})['template_name'],
        'outputFormat': 'pdf',
        'lang': self._language,
        'attributes': extract_as_dict,
    }

    response = self.get_response(system)

    if self._request.GET.get('getspec', 'no') != 'no':
        response.headers[
            'Content-Type'] = 'application/json; charset=UTF-8'
        return json.dumps(spec, sort_keys=True, indent=4)

    pdf_url = urlparse.urljoin(
        Config.get('print', {})['base_url'] + '/', 'buildreport.pdf')
    pdf_headers = Config.get('print', {})['headers']
    print_result = requests.post(pdf_url,
                                 headers=pdf_headers,
                                 data=json.dumps(spec))

    if Config.get('print', {}).get('compute_toc_pages', False):
        with io.BytesIO() as pdf:
            pdf.write(print_result.content)
            pdf_reader = PdfFileReader(pdf)
            # Read the outline once instead of once per loop iteration.
            outlines = pdf_reader.getOutlines()
            x = [
                outline['/Page']['/StructParents']
                for outline in outlines
            ]
            try:
                true_nb_of_toc = min(x) - 1
            except ValueError:
                # min() of an empty list: the PDF has no outline entries.
                true_nb_of_toc = 1

            if true_nb_of_toc != extract_as_dict['nbTocPages']:
                log.warning(
                    'nbTocPages in result pdf: {} are not equal to the one predicted : {}, request new pdf'
                    .format(true_nb_of_toc,
                            extract_as_dict['nbTocPages']))  # noqa
                extract_as_dict['nbTocPages'] = true_nb_of_toc
                print_result = requests.post(pdf_url,
                                             headers=pdf_headers,
                                             data=json.dumps(spec))

    if not extract_as_dict['isReduced'] and print_result.status_code == 200:
        temp_files = []
        try:
            main = tempfile.NamedTemporaryFile(suffix='.pdf')
            temp_files.append(main)
            main.write(print_result.content)
            main.flush()
            cmd = ['pdftk', main.name]
            for url in pdf_to_join:
                result = requests.get(url)
                content_type = result.headers.get('content-type')
                # format() instead of '+': the header may be missing (None).
                log.debug("document url: {} => content_type: {}".format(
                    url, content_type))
                if content_type != 'application/pdf':
                    msg = "Skipped document inclusion (url: '{}') because content_type: '{}'"
                    log.warning(msg.format(url, content_type))
                    continue
                tmp_file = tempfile.NamedTemporaryFile(suffix='.pdf')
                temp_files.append(tmp_file)
                tmp_file.write(result.content)
                tmp_file.flush()
                cmd.append(tmp_file.name)
            out = tempfile.NamedTemporaryFile(suffix='.pdf')
            temp_files.append(out)
            cmd += ['cat', 'output', out.name]
            sys.stdout.flush()
            time.sleep(0.1)
            subprocess.check_call(cmd)
            content = out.file.read()
        finally:
            # Closing a NamedTemporaryFile also deletes it from disk.
            for temp_file in temp_files:
                try:
                    temp_file.close()
                except OSError as e:
                    log.debug("could not remove temporary file: {}".format(e))
    else:
        content = print_result.content

    # Save printed file to the specified path.
    pdf_archive_path = print_config.get('pdf_archive_path', None)
    if pdf_archive_path is not None:
        self.archive_pdf_file(pdf_archive_path, content, extract_as_dict)

    response.status_code = print_result.status_code
    response.headers = print_result.headers
    if 'Transfer-Encoding' in response.headers:
        del response.headers['Transfer-Encoding']
    if 'Connection' in response.headers:
        del response.headers['Connection']
    return content
result = [] if type(outlines) == list: for outline in outlines: result = bookmarks(outline, pg_id_num_map, result) elif type(outlines) == PyPDF2.pdf.Destination: result.append((pg_id_num_map[outlines.page.idnum]+1, outlines['/Title'])) return result PDF_IN = PdfFileReader(open(PDF, 'rb')) pg_id_num_map = page_id_to_num(PDF_IN) outlines = PDF_IN.getOutlines() outlines = [item for item in outlines if not type(item) == list] outlines = [item for item in outlines if not item['/Title'] in exclude] bmrks = bookmarks(outlines, pg_id_num_map) it = iter(bmrks[1:]) TOC = [] for x in bmrks: try: TOC.append( (x[0], (next(it)[0] - 1), x[1]) ) except: pass print TOC
from PyPDF2 import PdfFileReader, PdfFileWriter
import translate

# Print the outline (bookmark tree) of the PDF below.
readFile = 'pdf/wtf_trans.pdf'
# The context manager closes the file once the outline has been printed.
with open(readFile, "rb") as pdf_file:
    pdf = PdfFileReader(pdf_file)
    print(pdf.getOutlines())
def merge_pdf(firstpdf,secondpdf,insertpage):
    """Insert secondpdf into firstpdf at position insertpage.

    The combined document is always written to 'merge_pdf.pdf' in the
    current working directory.
    """
    # Create the merger instance used to combine the files
    pdf_merger = PdfFileMerger()
    pdf_merger.append(firstpdf)
    pdf_merger.merge(insertpage, secondpdf)
    # # Add a bookmark
    # pdf_merger.addBookmark('This is a bookmark', 1)
    pdf_merger.write('merge_pdf.pdf')

#
# def split_by_num(filename, nums, password=None):
filename = r'F:\研一下\量化投资资料\量化教材\Hands-On_Machine_Learning_for_Algorithmic_Trading.pdf'
pdf_reader = PdfFileReader(open(filename, mode='rb' ))
pages = pdf_reader.getNumPages()
outline = pdf_reader.getOutlines()
# Titles of the outline entries that contain 'Chapter'
outlinchapter = []
# Hard-coded chapter start pages, each shifted by 18
# (presumably the front-matter page offset of this book — confirm)
outlinepage = [i+18 for i in [8,33,65,88,119,147,175,224,260,284,312,351,389,418,441,458]]
for o in outline:
    # The title is parsed out of the string representation of the entry
    res = re.findall(r"'/Title': '(.*?)', '/Page': IndirectObject\((.*?), 0\)",str(o),re.S)
    if 'Chapter' in res[0][0]:
        outlinchapter.append(res[0][0])
#print(list(outlinedict[0].keys())[0],list(outlinedict[0].values())[0])
# One single-entry dict {chapter title: start page} per chapter
outlinedict =[{i[0]:i[1]} for i in zip(outlinchapter,outlinepage)]
# NOTE(review): range(len(outlinedict)+1) together with outlinedict[i+1]
# below indexes past the end of the list on the last iterations — confirm.
for i in range(len(outlinedict)+1):
    pdf_writer = PdfFileWriter()
    # Output file name: the chapter title without colons
    split_pdf_name = list(outlinedict[i].keys())[0].replace(':','') + '.pdf'
    start = list(outlinedict[i].values())[0]
    end = list(outlinedict[i+1].values())[0]
result.update(bookmark_dict(item)) else: result[reader.getDestinationPageNumber(item)] = item.title return result print("Enter path to File(Example:: C:/Bob/Documents/)\n make sure it ends with /:",end='') pa=input() print("Enter the PDF file name(Example:: Bob.pdf):",end='') th=input() path = pa+th writer = PdfFileWriter() reader = PdfFileReader(path) BookMarks = bookmark_dict(reader.getOutlines()) Total_Number_pages = reader.getNumPages() Bname = "" ###################### Cleaning Bookmarks for i in BookMarks.keys(): Bname = str(BookMarks[i]) Bname = Bname.replace("b'","") Bname = Bname.replace(r"\r'","") Bname = Bname.replace("&","AND") BookMarks[i] = Bname ###################### j = 0 ListOfList = [] Total_Number_Pages = reader.getNumPages()
# (title, outline entries) pairs, one per top-level chapter with children
chapters = []


def flatten(A):
    """Return a flat list with the non-list items of the nested list A."""
    rt = []
    for i in A:
        if isinstance(i, list):
            rt.extend(flatten(i))
        else:
            rt.append(i)
    return rt


file_stream = open(file_to_read, 'rb')
pdf_content = PdfFileReader(file_stream)
outlines = pdf_content.getOutlines()
max_number_of_characters = 100

for i, item in enumerate(outlines):
    # A chapter is a Destination directly followed by the list of its
    # children. The bounds check avoids an IndexError when the last outline
    # entry is a Destination.
    has_children = (i + 1 < len(outlines)
                    and isinstance(outlines[i + 1], list))
    if isinstance(item, generic.Destination) and has_children:
        title = item.title
        # Make the title usable as a name: no slashes or spaces, and at
        # most max_number_of_characters characters.
        title = '_'.join(title.strip().replace('/', '_').split(' '))
        title = title[:max_number_of_characters]
        # The chapter's own entry becomes the first element of its children.
        outlines[i + 1].insert(0, item)
        content = outlines[i + 1]
        chapters.append((title, content))

for chapter in chapters:
    subchapters = flatten(chapter[1])
readFile) # 或者这个方式:pdfFileReader = PdfFileReader(open(readFile, 'rb')) # 获取 PDF 文件的文档信息 documentInfo = pdfFileReader.getDocumentInfo() print('documentInfo = %s' % documentInfo) # 获取页面布局 pageLayout = pdfFileReader.getPageLayout() print('pageLayout = %s ' % pageLayout) # 获取页模式 pageMode = pdfFileReader.getPageMode() print('pageMode = %s' % pageMode) xmpMetadata = pdfFileReader.getXmpMetadata() print('xmpMetadata = %s ' % xmpMetadata) # 获取页面大纲 outLines = pdfFileReader.getOutlines() print('outLine = %s' % outLines) # 获取 pdf 文件页数 pageCount = pdfFileReader.getNumPages() print('pageCount = %s' % pageCount) for index in range(0, pageCount): # 返回指定页编号的 pageObject pageObj = pdfFileReader.getPage(index) print('index = %d , pageObj = %s' % (index, type(pageObj))) # <class 'PyPDF2.pdf.PageObject'> # 获取 pageObject 在 PDF 文档中处于的页码 pageNumber = pdfFileReader.getPageNumber(pageObj) print('pageNumber = %s ' % pageNumber)