def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
    """Collect the toc entries for the matching headings of one block.

    Argument
      block: a block dictionary
      {
        'bbox': (float, float, float, float),
        'lines': [dict],
        'type': int
      }
      page: page number stored on every entry that is produced
    Returns
      a list of toc entries, concatenated from the result of lines
    """
    is_text_block = block.get('type') == 0
    if not is_text_block:
        return []

    # the second bbox component becomes the vertical position of each entry
    bbox = block.get('bbox', (0, 0))
    vpos = bbox[1]

    try:
        # every line is processed eagerly, so FoundGreedy can surface here
        per_line = [self._extract_line(line) for line in block.get('lines')]
        headings = concatFrag(chain.from_iterable(per_line))

        entries = []
        for level, title in headings.items():
            entries.append(ToCEntry(level, title, page, vpos))
        return entries
    except FoundGreedy as greedy:
        # greedy match: the whole block text becomes one single entry
        whole_block = blk_to_str(block)
        return [ToCEntry(greedy.level, whole_block, page, vpos)]
def build_final_toc(path_title_level_list, toc):
    """Merge an ordered outline with the entries of an existing toc.

    `path_title_level_list` yields `(path, title, level)` tuples and `toc`
    is a sequence of objects exposing `.title` and `.pagenum`. Both are
    walked in order, with a single cursor into `toc`:

    - an item whose `path` is None is always kept; it takes the page number
      of the toc entry under the cursor and does not advance the cursor
    - any other item is kept only when its title equals the title under the
      cursor after all whitespace is removed, in which case the cursor
      advances; otherwise it is reported on stdout as a missing chapter
      and dropped

    Once the cursor has moved past the last toc entry (or `toc` is empty),
    items without a path fall back to the page number of the last matched
    entry (1 if nothing has matched yet) and items with a path are reported
    as missing, instead of failing with an IndexError.

    Returns a list of `ToCEntry(level, title, pagenum)`.
    """
    final_toc = []
    cursor = 0
    last_page_num = 1  # fallback page once `toc` is exhausted

    for path, title, level in path_title_level_list:
        current = toc[cursor] if cursor < len(toc) else None

        if path is None:
            # kept unconditionally; borrows the page of the upcoming entry
            pagenum = current.pagenum if current is not None else last_page_num
            final_toc.append(ToCEntry(level, title, pagenum))
            continue

        # titles are compared with every whitespace character stripped out
        title_match = (
            current is not None
            and "".join(title.split()) == "".join(current.title.split())
        )
        if title_match:
            final_toc.append(ToCEntry(level, title, current.pagenum))
            last_page_num = current.pagenum
            cursor += 1
        else:
            print("missing chapter: ", title)

    return final_toc
def parse_entry(entry: List) -> ToCEntry:
    """Build a toc entry from one row of the csv-formatted toc.

    Raises IndexError (after printing a hint to stderr) when the row has
    no heading or no page number after its indentation.
    """
    # The csv reader reads each leading space as an empty '' field, so the
    # number of '' fields in front of the heading is the indentation width.
    indent = sum(1 for _ in takewhile(lambda field: field == '', entry))
    level = indent // 4 + 1  # 4 spaces = 1 level

    try:
        heading = entry[indent]
        pagenum = int(entry[indent + 1])
        extras = entry[indent + 2:]  # vpos
        return ToCEntry(level, heading, pagenum, *extras)
    except IndexError:
        print(f"Unable to parse toc entry {entry};",
              f"Need at least {indent + 2} parts but only have {len(entry)}.",
              "Make sure the page number is present.",
              file=sys.stderr)
        raise
def read_toc(doc: Document) -> List[ToCEntry]:
    """Read table of contents from a document

    Every row returned by `doc.get_toc()` is unpacked positionally into a
    ToCEntry; the rows keep their original order.
    """
    entries = []
    for row in doc.get_toc():
        entries.append(ToCEntry(*row))
    return entries
import toml from mamba import description, it, before from fitzutils import ToCEntry from pdftocgen.tocgen import gen_toc dirpath = os.path.dirname(os.path.abspath(__file__)) with description("gen_toc") as self: with before.all: self.level2 = fitz.open(os.path.join(dirpath, "files/level2.pdf")) self.level2_recipe = toml.load( open(os.path.join(dirpath, "files/level2_recipe.toml"))) self.level2_expect = [ ToCEntry(level=1, title='1 Section One', pagenum=1, vpos=237.6484375), ToCEntry(level=1, title='2 Section Two', pagenum=1, vpos=567.3842163085938), ToCEntry(level=2, title='2.1 Subsection Two.One', pagenum=2, vpos=452.56671142578125), ToCEntry( level=1, title='3 Section Three, with looong loooong looong ti- tle', pagenum=3, vpos=335.569580078125), ToCEntry(
from mamba import description, it, before
from fitzutils import ToCEntry
from pdftocio.tocio import read_toc, write_toc

# Directory that contains this spec; fixture paths are resolved against it.
dirpath = os.path.dirname(os.path.abspath(__file__))

# Paths of the two PDF fixtures opened in the setup below.
level2 = os.path.join(dirpath, "files/level2.pdf")
hastoc = os.path.join(dirpath, "files/hastoc.pdf")

with description("read_toc") as self:
    # Shared setup (mamba `before.all` hook): open both fixtures and build
    # the expected entries.
    with before.all:
        self.doc = fitz.open(level2)
        self.reference = fitz.open(hastoc)
        # Expected entries, given without a vpos.
        # NOTE(review): presumably what read_toc should return for one of
        # the fixtures above; confirm against the examples that use it.
        self.expect = [
            ToCEntry(level=1, title='Section One', pagenum=1),
            ToCEntry(level=1, title='Section Two', pagenum=1),
            ToCEntry(level=2, title='Subsection Two.One', pagenum=2),
            ToCEntry(level=1,
                     title='Section Three, with looong loooong looong title',
                     pagenum=3),
            ToCEntry(
                level=2,
                title='Subsection Three.One, '
                'with even loooooooooooonger title, and probably even more',
                pagenum=3),
            ToCEntry(level=2, title='Subsection Three.Two', pagenum=4),
            ToCEntry(level=2, title='Subsection Three.Three', pagenum=5),
            ToCEntry(level=1, title='The End', pagenum=5)
        ]
import os
import io

from mamba import description, it, before
from fitzutils import (dump_toc, ToCEntry)
from pdftocio.tocparser import parse_toc

# Directory that contains this spec; fixture paths are resolved against it.
dirpath = os.path.dirname(os.path.abspath(__file__))

valid_file = os.path.join(dirpath, "files/level2.pdf")
# NOTE(review): presumably a path with no usable PDF behind it; confirm
# against the fixtures directory.
invalid_file = os.path.join(dirpath, "files/nothing.pdf")

with description("parse_toc") as self:
    # Shared setup (mamba `before.all` hook): two in-memory tocs with the
    # same levels, titles and page numbers.
    with before.all:
        # Entries that carry an explicit vertical position.
        self.toc = [
            ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),
            ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0),
            ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0),
            ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0),
            ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0),
            ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0)
        ]
        # The same entries constructed without a vpos argument.
        self.toc_novpos = [
            ToCEntry(level=1, title="title1", pagenum=1),
            ToCEntry(level=2, title="title2", pagenum=1),
            ToCEntry(level=3, title="title3", pagenum=2),
            ToCEntry(level=2, title="title4", pagenum=2),
            ToCEntry(level=2, title="title5", pagenum=3),
            ToCEntry(level=1, title="title6", pagenum=5)
        ]
with it("exits if pdf file is invalid and exit_on_error is true"): try: with open_pdf(invalid_file, True) as doc: assert False, "should have exited" except AssertionError as err: raise err except: pass with description("ToCEntry") as self: with it("matches fitz's representation"): fitz_entry = [1, "title", 2] fitz_entry2 = [1, "title", 2, 100.0] toc_entry = ToCEntry(level=1, title="title", pagenum=2) toc_entry2 = ToCEntry(level=1, title="title", pagenum=2, vpos=100.0) assert toc_entry.to_fitz_entry() == fitz_entry assert toc_entry2.to_fitz_entry() == fitz_entry2 assert ToCEntry(*fitz_entry) == toc_entry assert ToCEntry(*fitz_entry2) == toc_entry2 with it("is sorted correctly"): entries = [ ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0), ToCEntry(level=1, title="title3", pagenum=2, vpos=90.0), ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0), ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0), ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),