Exemplo n.º 1
0
def pdf_2_text(pdf_file_name: str,
               password='',
               page_numbers=None,
               maxpages=0,
               caching=True,
               laparams=None) -> str:
    """
    Extract the text of a PDF file, page by page, and return it as one string.

    This is a re-write of the function pdfminer.high_level.extract_text
    https://github.com/pdfminer/pdfminer.six/blob/0b44f7771462363528c109f263276eb254c4fcd0/pdfminer/high_level.py#L90
    It produces result, which does not have this issue: https://github.com/pdfminer/pdfminer.six/issues/466

    :param pdf_file_name: name of the input PDF file
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: zero-indexed page numbers to operate on
    :param maxpages: How many pages to stop parsing after
    :param caching: forwarded unchanged to PDFPage.get_pages
    :param laparams: forwarded unchanged to _PDFpage2txt for every page
    :return: the per-page texts returned by _PDFpage2txt, concatenated in
        the order the pages are yielded
    """
    with open_filename(pdf_file_name, "rb") as pdf_file_object:
        pages = PDFPage.get_pages(
            pdf_file_object,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        # Join once instead of growing a string with "+" per page; the join
        # must happen inside the "with" so the file is still open while the
        # page generator is being consumed.
        return "".join(_PDFpage2txt(page, laparams) for page in pages)
Exemplo n.º 2
0
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        # Actually apply the documented default; previously this branch
        # re-assigned None, so the converter never received layout parameters.
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(
                fp,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
        ):
            interpreter.process_page(page)

        # Read the buffer before the "with" closes the StringIO.
        return output_string.getvalue()
def extract_text_by_page(pdf_file, password='', page_numbers=None, maxpages=0,
                         caching=True, codec='utf-8', laparams=None):
    """
    Parse and return the text contained in each page of a PDF file. Taken from
    https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py#L90-L123
    and adapted to return the text of each page separately as a dictionary obj.
    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a dict containing the text from each page (keys = page numbers)
    """
    if laparams is None:
        laparams = LAParams()

    text_by_page = {}

    with open_filename(pdf_file, "rb") as fp:
        # Honour the caller's caching choice for the resource manager as well,
        # the same way extract_text does.
        rsrcmgr = PDFResourceManager(caching=caching)
        pages = PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
                                  password=password, caching=caching)
        if page_numbers is None:
            numbered_pages = enumerate(pages)
        else:
            # NOTE(review): PDFPage.get_pages is understood to yield the
            # selected pages in document order, once each, regardless of how
            # page_numbers is ordered. Pair them with the sorted, de-duplicated
            # numbers so each text is filed under the right key. Numbers past
            # the end of the document would still shift nothing here, since
            # they sort last and zip stops at the shorter input.
            numbered_pages = zip(sorted(set(page_numbers)), pages)
        for page_num, page in numbered_pages:
            # A fresh buffer and converter per page keeps each page's text
            # separate.
            with StringIO() as output_string:
                device = TextConverter(rsrcmgr, output_string, codec=codec,
                                       laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                text_by_page[page_num] = output_string.getvalue()
    return text_by_page
Exemplo n.º 4
0
 def test_string_input(self):
     """open_filename built from a str path must report closing == True."""
     sample_path = absolute_sample_path("simple1.pdf")
     wrapper = open_filename(sample_path)
     assert_equal(wrapper.closing, True)
Exemplo n.º 5
0
 def test_file_input(self):
     """open_filename built from an open file must expose that same object
     as its file_handler."""
     sample_path = absolute_sample_path("simple1.pdf")
     with open(sample_path, "rb") as stream:
         wrapper = open_filename(stream)
         assert_equal(wrapper.file_handler, stream)
Exemplo n.º 6
0
 def test_pathlib_input(self):
     """open_filename built from a pathlib.Path must report closing == True."""
     sample_path = pathlib.Path(absolute_sample_path("simple1.pdf"))
     wrapper = open_filename(sample_path)
     assert_equal(wrapper.closing, True)