Пример #1
0
    def extract_block(self, block: dict, page: int) -> List[ToCEntry]:
        """Extract matching headings in a block.

        Arguments
          block: a block dictionary
          {
            'bbox': (float, float, float, float),
            'lines': [dict],
            'type': int
          }
          page: page number stored in every entry built from this block
        Returns
          a list of toc entries, concatenated from the result of lines;
          an empty list when the block is not a text block
        """
        if block.get('type') != 0:
            # not a text block
            return []

        # the second bbox coordinate is stored as the entries' vertical position
        vpos = block.get('bbox', (0, 0))[1]

        try:
            # A block without a 'lines' key is handled like a block with no
            # lines instead of failing on iterating None. The list is built
            # eagerly so every line is processed before concatenation starts.
            frags = chain.from_iterable([
                self._extract_line(ln) for ln in block.get('lines') or []
            ])
            titles = concatFrag(frags)

            return [
                ToCEntry(level, title, page, vpos)
                for level, title in titles.items()
            ]
        except FoundGreedy as e:
            # return the entire block as a single entry, at the level carried
            # by the exception
            return [ToCEntry(e.level, blk_to_str(block), page, vpos)]
Пример #2
0
def build_final_toc(path_title_level_list, toc):
    """Merge a list of (path, title, level) items with an extracted toc.

    The entries of `toc` are consumed in order: an item with a path is kept
    only when its title equals the title of the next unconsumed toc entry
    (all whitespace ignored), and that toc entry is then consumed. An item
    whose path is None is always kept and does not consume a toc entry.

    Arguments
      path_title_level_list: iterable of (path, title, level) tuples
      toc: sequence of objects exposing `title` and `pagenum`
    Returns
      a list of ToCEntry(level, title, pagenum); items with a path that do
      not match are reported on stdout and left out
    """
    final_toc = []
    toc_idx = 0
    # page of the most recently matched toc entry; used for path-less items
    # once every toc entry has been consumed
    last_page_num = 1

    for path, title, level in path_title_level_list:
        # guard against running past the end of toc (or an empty toc)
        current = toc[toc_idx] if toc_idx < len(toc) else None

        if path is None:
            # path-less items take the page of the upcoming toc entry
            pagenum = current.pagenum if current is not None else last_page_num
            final_toc.append(ToCEntry(level, title, pagenum))
            continue

        title_match = (
            current is not None
            and "".join(title.split()) == "".join(current.title.split())
        )
        if title_match:
            final_toc.append(ToCEntry(level, title, current.pagenum))
            last_page_num = current.pagenum
            toc_idx += 1
        else:
            print("missing chapter: ", title)

    return final_toc
Пример #3
0
def parse_entry(entry: List) -> ToCEntry:
    """Turn one csv row into a toc entry.

    The row is expected to be laid out as: zero or more empty cells, the
    heading, the page number, then any remaining cells (vpos), which are
    forwarded to ToCEntry unchanged.

    Raises
      IndexError: when the row ends before the page number; a hint is
      printed to stderr before the error is re-raised
    """
    # The csv reader yields each leading space as an empty '' cell, so the
    # number of leading '' cells is the indentation of the heading.
    indent = 0
    for cell in entry:
        if cell != '':
            break
        indent += 1

    try:
        level = indent // 4 + 1  # 4 spaces = 1 level
        heading = entry[indent]
        pagenum = int(entry[indent + 1])
        rest = entry[indent + 2:]  # vpos
        return ToCEntry(level, heading, pagenum, *rest)
    except IndexError:
        print(f"Unable to parse toc entry {entry};",
              f"Need at least {indent + 2} parts but only have {len(entry)}.",
              "Make sure the page number is present.",
              file=sys.stderr)
        raise
Пример #4
0
def read_toc(doc: Document) -> List[ToCEntry]:
    """Return the document's table of contents as a list of ToCEntry.

    Every item returned by doc.get_toc() is unpacked positionally into the
    ToCEntry constructor, in the order get_toc() yields them.
    """
    entries = []
    for raw_entry in doc.get_toc():
        entries.append(ToCEntry(*raw_entry))
    return entries
Пример #5
0
import toml

from mamba import description, it, before
from fitzutils import ToCEntry
from pdftocgen.tocgen import gen_toc

# absolute path of the directory containing this spec file; the fixture
# files are located relative to it
dirpath = os.path.dirname(os.path.abspath(__file__))

with description("gen_toc") as self:
    with before.all:
        self.level2 = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        self.level2_recipe = toml.load(
            open(os.path.join(dirpath, "files/level2_recipe.toml")))
        self.level2_expect = [
            ToCEntry(level=1,
                     title='1 Section One',
                     pagenum=1,
                     vpos=237.6484375),
            ToCEntry(level=1,
                     title='2 Section Two',
                     pagenum=1,
                     vpos=567.3842163085938),
            ToCEntry(level=2,
                     title='2.1 Subsection Two.One',
                     pagenum=2,
                     vpos=452.56671142578125),
            ToCEntry(
                level=1,
                title='3 Section Three, with looong loooong looong ti- tle',
                pagenum=3,
                vpos=335.569580078125),
            ToCEntry(
Пример #6
0
from mamba import description, it, before
from fitzutils import ToCEntry
from pdftocio.tocio import read_toc, write_toc

# absolute path of the directory containing this spec file
dirpath = os.path.dirname(os.path.abspath(__file__))

# PDF fixtures, located relative to this spec file
level2 = os.path.join(dirpath, "files/level2.pdf")
hastoc = os.path.join(dirpath, "files/hastoc.pdf")

with description("read_toc") as self:
    # setup hook for this spec group: opens both PDF fixtures and builds the
    # list of entries stored on self.expect
    with before.all:
        self.doc = fitz.open(level2)
        self.reference = fitz.open(hastoc)
        # NOTE(review): presumably the entries a toc read from one of the
        # documents above should equal -- confirm against the examples that
        # use self.expect. None of the entries specifies a vpos.
        self.expect = [
            ToCEntry(level=1, title='Section One', pagenum=1),
            ToCEntry(level=1, title='Section Two', pagenum=1),
            ToCEntry(level=2, title='Subsection Two.One', pagenum=2),
            ToCEntry(level=1,
                     title='Section Three, with looong loooong looong title',
                     pagenum=3),
            ToCEntry(
                level=2,
                title='Subsection Three.One, '
                'with even loooooooooooonger title, and probably even more',
                pagenum=3),
            ToCEntry(level=2, title='Subsection Three.Two', pagenum=4),
            ToCEntry(level=2, title='Subsection Three.Three', pagenum=5),
            ToCEntry(level=1, title='The End', pagenum=5)
        ]
Пример #7
0
import os
import io

from mamba import description, it, before
from fitzutils import (dump_toc, ToCEntry)
from pdftocio.tocparser import parse_toc

# absolute path of the directory containing this spec file
dirpath = os.path.dirname(os.path.abspath(__file__))

# fixture paths, located relative to this spec file
# NOTE(review): judging by the names, nothing.pdf is meant to be a file that
# cannot be opened as a PDF -- confirm against the fixtures directory
valid_file = os.path.join(dirpath, "files/level2.pdf")
invalid_file = os.path.join(dirpath, "files/nothing.pdf")

with description("parse_toc") as self:
    # setup hook for this spec group: builds two reference toc lists
    with before.all:
        # six entries across heading levels 1-3, each with an explicit vpos
        self.toc = [
            ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),
            ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0),
            ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0),
            ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0),
            ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0),
            ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0)
        ]

        # same levels, titles and page numbers as self.toc, with vpos omitted
        self.toc_novpos = [
            ToCEntry(level=1, title="title1", pagenum=1),
            ToCEntry(level=2, title="title2", pagenum=1),
            ToCEntry(level=3, title="title3", pagenum=2),
            ToCEntry(level=2, title="title4", pagenum=2),
            ToCEntry(level=2, title="title5", pagenum=3),
            ToCEntry(level=1, title="title6", pagenum=5)
        ]
Пример #8
0
    with it("exits if pdf file is invalid and exit_on_error is true"):
        # Only SystemExit counts as "exited". A bare `except` would also hide
        # unrelated failures (NameError, TypeError, ...) and let this example
        # pass for the wrong reason.
        # NOTE(review): assumes open_pdf terminates through sys.exit(), which
        # raises SystemExit -- confirm against open_pdf's implementation.
        try:
            with open_pdf(invalid_file, True):
                # reaching the body means open_pdf did not exit; the
                # AssertionError is not caught below and fails the example
                assert False, "should have exited"
        except SystemExit:
            pass

with description("ToCEntry") as self:
    with it("matches fitz's representation"):
        fitz_entry = [1, "title", 2]
        fitz_entry2 = [1, "title", 2, 100.0]

        toc_entry = ToCEntry(level=1, title="title", pagenum=2)
        toc_entry2 = ToCEntry(level=1, title="title", pagenum=2, vpos=100.0)

        assert toc_entry.to_fitz_entry() == fitz_entry
        assert toc_entry2.to_fitz_entry() == fitz_entry2

        assert ToCEntry(*fitz_entry) == toc_entry
        assert ToCEntry(*fitz_entry2) == toc_entry2

    with it("is sorted correctly"):
        entries = [
            ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0),
            ToCEntry(level=1, title="title3", pagenum=2, vpos=90.0),
            ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0),
            ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0),
            ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0),