Пример #1
0
def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False) -> List[dict]:
    """Collect the search results for `pattern` in a pdf document

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: page number (1-based index); when None, every page of the
            document is searched, which is highly discouraged.
      ign_case: compile the pattern with re.IGNORECASE when true
    Returns
      the results of `search_in_page` for each selected page, joined into
      one list; an empty list when `page` is out of range
    """
    # guard: an explicit page number outside 1..pageCount yields nothing
    if page is not None and not (1 <= page <= doc.pageCount):
        return []

    # a single page is wrapped in a list so both cases iterate the same way
    targets = doc.pages() if page is None else [doc[page - 1]]

    flags = re.IGNORECASE if ign_case else 0
    regex = re.compile(pattern, flags)

    # pages are processed sequentially; callers are expected to pass a
    # page number, so parallelizing the whole-document case is not worth it
    return [
        found
        for current in targets
        for found in search_in_page(regex, current)
    ]
Пример #2
0
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Gather toc entries from every page of a document

    Arguments
      doc: a pdf document
      recipe: recipe from user; its `extract_block` is called once per block
    Returns
      a list of toc entries in the document, in page and block order
    """
    # Each block of the page's text dict is handed to the recipe together
    # with `page.number + 1`; pages without a 'blocks' key contribute nothing.
    return [
        entry
        for pg in doc.pages()
        for block in pg.getTextPage().extractDICT().get('blocks', [])
        for entry in recipe.extract_block(block, pg.number + 1)
    ]