def test_get_destination_age_number():
    """Smoke-test page lookup for the outline entries of a sample PDF.

    Opens ``pdflatex-outline.pdf`` from ``RESOURCE_ROOT`` and asks the reader
    for the destination page number of every outline entry that is not a
    list.  Nothing is asserted; the test passes if no call raises.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
    reader = PdfFileReader(pdf_path)
    for entry in reader.getOutlines():
        if isinstance(entry, list):
            # List entries are skipped; only non-list items are looked up.
            continue
        reader.getDestinationPageNumber(entry)
def __choose(self, reader: PdfFileReader, outlines: list, idx: int):
    """Copy the pages of the chapter at ``outlines[idx]`` into the writer.

    The chapter length comes from ``__chapter_pages`` and the first page
    from the reader's destination lookup.  Each copied page also increments
    ``self._written_pages``.

    Args:
        reader: Source PDF reader the pages are taken from.
        outlines: Outline list that contains the chapter entry.
        idx: Index of the chapter entry inside ``outlines``.
    """
    chapter = outlines[idx]
    page_count = self.__chapter_pages(reader, outlines, idx)
    first_page = reader.getDestinationPageNumber(chapter)
    for offset in range(page_count):
        self._writer.addPage(reader.getPage(first_page + offset))
        self._written_pages += 1
def __chapter_pages(self, reader: PdfFileReader, outlines: list,
                    idx: int) -> int:
    """Return how many pages the chapter at ``outlines[idx]`` spans.

    Resolution order:
      1. Distance to the next ``Destination`` that follows ``idx`` in
         ``outlines``.
      2. Otherwise, when the level stored in the last ``state_list`` entry
         is not ``START_LEVEL``, the positive distance reported by
         ``__pages_to_next_upper_chapter``.
      3. Otherwise, the number of pages from the chapter start to the end
         of the document.

    Args:
        reader: Source PDF reader.
        outlines: Outline list that contains the chapter entry.
        idx: Index of the chapter entry inside ``outlines``.

    Returns:
        The page count computed by the first rule that applies.
    """
    start_page = reader.getDestinationPageNumber(outlines[idx])

    # First sibling after idx that is a real destination (not a nested list).
    following = next(
        (outlines[pos] for pos in range(idx + 1, len(outlines))
         if isinstance(outlines[pos], Destination)),
        None,
    )
    if following is not None:
        return reader.getDestinationPageNumber(following) - start_page

    level, *_ = self.state_list[-1]
    if level != START_LEVEL:
        distance = self.__pages_to_next_upper_chapter(
            reader, start_page, level)
        if distance > 0:
            return distance

    return reader.numPages - start_page
def split_by_sections(path):
    """Split a PDF into one file per top-level bookmark.

    Only first-level outline entries are used as split points.  Nested lists
    in the outline are ignored, so their pages stay with the enclosing
    top-level section.  Each section runs from its bookmark's page up to the
    page before the next top-level bookmark; the last section runs to the
    end of the document.  Every section is written relative to the current
    working directory as ``<basename>-<n>-<title>.pdf``.

    Args:
        path (str): Path of the PDF file to split.
    """
    # Base name (no directory, no extension) used as the output file prefix.
    filename = os.path.splitext(os.path.basename(path))[0]
    pdf_reader = PdfFileReader(path)

    # Indexing a nested list with '/Title' raises TypeError, so keep only
    # the first-level (non-list) outline entries.
    top_level = [
        entry for entry in pdf_reader.outlines if not isinstance(entry, list)
    ]

    titles = [entry['/Title'] for entry in top_level]
    # 0-based index of the first page of every section.
    start_pages = [
        pdf_reader.getDestinationPageNumber(entry) for entry in top_level
    ]
    # Section i stops where section i + 1 starts; the last one stops at the
    # end of the document.
    stop_pages = start_pages[1:] + [pdf_reader.numPages]

    sections = zip(titles, start_pages, stop_pages)
    for number, (title, start, stop) in enumerate(sections, start=1):
        pdf_writer = PdfFileWriter()
        for page_index in range(start, stop):
            pdf_writer.addPage(pdf_reader.getPage(page_index))

        # A path separator inside a title would be read as a directory.
        safe_title = str(title).replace('/', '_').replace('\\', '_')
        output_filename = f'{filename}-{number}-{safe_title}.pdf'
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
def __pages_to_next_upper_chapter(self, reader: PdfFileReader,
                                  current_page: int,
                                  current_level: int) -> int:
    """Return the page distance to the next chapter of a lower level number.

    Walks ``self.state_list`` from the most recent entry backwards.  For
    every state whose level is smaller than ``current_level``, the outlines
    after that state's index are scanned for a ``Destination`` located on a
    page after ``current_page``.

    Args:
        reader: Source PDF reader used to resolve destination pages.
        current_page: Page the current chapter starts on.
        current_level: Level of the current chapter.

    Returns:
        The first positive page distance found, or ``0`` when none exists.
    """
    for level, level_outlines, level_idx in reversed(self.state_list):
        if not level < current_level:
            continue
        remaining = level_outlines[(level_idx + 1):]
        for candidate in remaining:
            if not isinstance(candidate, Destination):
                continue
            distance = (
                reader.getDestinationPageNumber(candidate) - current_page
            )
            if distance > 0:
                return distance
    return 0
# Split the PDF into one file per chapter, where a chapter is a bookmark that
# is immediately followed by a list of child entries.  The source stream must
# stay open while pages are copied; ``with`` closes it even when reading or
# writing fails.
with open(file_to_read, 'rb') as file_stream:
    pdf_content = PdfFileReader(file_stream)
    outlines = pdf_content.getOutlines()
    # Upper bound on the length of a generated file name (without suffix).
    max_number_of_characters = 100

    for i, item in enumerate(outlines):
        # Check the bound first: a bookmark in the last position has no
        # following entry, and indexing i + 1 would raise IndexError.
        has_children = (
            i + 1 < len(outlines) and isinstance(outlines[i + 1], list)
        )
        if isinstance(item, generic.Destination) and has_children:
            # Build a file-system friendly title: no slashes, no spaces.
            title = item.title
            title = '_'.join(title.strip().replace('/', '_').split(' '))
            title = title[:max_number_of_characters]
            # Put the chapter bookmark in front of its children so that the
            # first flattened entry is the start of the chapter.
            outlines[i + 1].insert(0, item)
            content = outlines[i + 1]
            chapters.append((title, content))

    for chapter in chapters:
        subchapters = flatten(chapter[1])
        file_to_write = dir_to_save_chapters / f'{chapter[0]}.pdf'
        pdf_writer = PdfFileWriter()
        # Copy from the page of the first entry through the page of the last
        # entry, inclusive.
        start_page = pdf_content.getDestinationPageNumber(subchapters[0])
        end_page = pdf_content.getDestinationPageNumber(subchapters[-1])
        for i in range(start_page, end_page + 1):
            pdf_writer.addPage(pdf_content.getPage(i))
        with open(file_to_write, 'wb') as f:
            pdf_writer.write(f)
# Read the bookmarks of ``srcfile`` and, for every bookmark whose title looks
# like an investor id, look up the matching fund and investor records.
with open(srcfile, "rb") as f:
    pdf = PdfFileReader(f)
    # Try bookmarks without child
    try:
        bookmarks = pdf.getOutlines()
    except Exception:
        # Any failure to read the outline is logged with the message below
        # and ends the program.  ``Exception`` rather than a bare ``except``
        # so that KeyboardInterrupt/SystemExit are not swallowed.
        upload = False
        errormsg = "this file contains bookmarks with child"
        error_log(filename, upload, errormsg)
        sys.exit()
    # Read Bookmarks
    if bookmarks:
        for b in bookmarks:
            invID = b['/Title']
            # Accept only titles shorter than 22 characters that start with
            # a word character.  Raw string: '\w' is an invalid escape
            # sequence in a normal string literal.
            if len(invID) < 22 and re.match(r'\w', invID):
                i = pdf.getDestinationPageNumber(b)
                # Search InvID in database
                # Connect to db
                db = client.iportalDevDB19
                # Connect to collection
                collection = db.investors
                collection2 = db.fundinvestors
                rinvID = ''
                fundID = ''
                # The last matching document wins in both lookups.
                for y in collection2.find({"invID": invID}):
                    fundID = str(y['fundID'])
                    print(fundID)
                if fundID:
                    for x in collection.find({"invID": invID}):
                        rinvID = str(x['_id'])
def __is_the_end(self, reader: PdfFileReader, outlines: list,
                 idx: int) -> bool:
    """Tell whether the chapter at ``outlines[idx]`` runs to the last page.

    Args:
        reader: Source PDF reader.
        outlines: Outline list that contains the chapter entry.
        idx: Index of the chapter entry inside ``outlines``.

    Returns:
        ``True`` when the chapter length reported by ``__chapter_pages``
        equals the number of pages from the chapter start to the end of the
        document.
    """
    total_pages = reader.numPages
    start_page = reader.getDestinationPageNumber(outlines[idx])
    pages_until_end = total_pages - start_page
    chapter_length = self.__chapter_pages(reader, outlines, idx)
    return chapter_length == pages_until_end