示例#1
0
def getPdfLayout(filename):
    """Print the text of the layout objects found on the first page of a PDF.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        None. Text is written to stdout with ``print``. Nothing is printed
        when the document yields no pages.

    Raises:
        OSError: If ``filename`` cannot be opened (for example
            ``FileNotFoundError`` when it does not exist).
    """
    # The context manager closes the file even when parsing fails.
    with open(filename, 'rb') as fp:
        # Imported here so the function does not rely on a module-level
        # import of the aggregator being present.
        from pdfminer.converter import PDFPageAggregator

        # Create the parser and the document, then link them to each other.
        parser = PDFParser(fp)
        document = PDFDocument()
        parser.set_document(document)
        document.set_parser(parser)

        # Shared resource manager and layout-analysis parameters.
        manager = PDFResourceManager()
        laparams = LAParams()

        # The layout is read back through get_result(), so the device has
        # to be the page aggregator rather than a text converter.
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Only the first page is needed; do not materialise the rest.
        first_page = next(iter(document.get_pages()), None)
        if first_page is None:
            return

        interpreter.process_page(first_page)
        layout = device.get_result()

        for item in layout:
            # Presumably not every layout object carries text (e.g. lines or
            # rectangles), so skip those without a get_text() method.
            get_text = getattr(item, 'get_text', None)
            if get_text is not None:
                print(get_text())
示例#2
0
def convert_pdf_to_txt(path, pgno):
    """Return the text of the first horizontal text box on one page of a PDF.

    Args:
        path: Path of the PDF file to read.
        pgno: Zero-based index of the page to analyse.

    Returns:
        The text of the first ``LTTextBoxHorizontal`` found on page ``pgno``,
        or ``None`` when that page does not exist or holds no such box.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    # The context manager closes the file on every exit path, including the
    # early return below.
    with open(path, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        # The interpreter renders into the device it was constructed with,
        # so it must be built around the aggregator whose get_result() is
        # read afterwards.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_number, page in enumerate(PDFPage.get_pages(fp)):
            if page_number != pgno:
                continue

            interpreter.process_page(page)

            # Receive the layout object for the page.
            for element in device.get_result():
                if isinstance(element, LTTextBoxHorizontal):
                    return element.get_text()

            # The requested page has been handled; later pages cannot match.
            break

    return None
示例#3
0
def text_extraction_example():
    """Demonstrate two ways of extracting text from a hard-coded PDF path.

    With ``use_text_converter`` set (the default) every page is rendered
    through a ``TextConverter`` and that page's text is printed. Otherwise a
    single page is analysed with a ``PDFPageAggregator`` and the
    (bounding box, text) pairs returned by ``extract_text_object`` are
    printed.

    Returns:
        None. All output goes to stdout; errors are reported with ``print``
        instead of being raised.
    """
    import io

    pdf_filepath = '/path/to/sample.pdf'
    # Set to False to run the layout-aggregator variant instead.
    use_text_converter = True

    try:
        # Open a PDF file; the context manager closes it on every path.
        with open(pdf_filepath, 'rb') as fp:
            # Create resource manager.
            rsrcmgr = PDFResourceManager()
            # Set parameters for layout analysis.
            laparams = LAParams(
                # If two characters have more overlap than this they are
                # considered to be on the same line.
                line_overlap=0.5,
                # If two characters are closer together than this margin
                # they are considered part of the same line.
                char_margin=2.0,
                # If two characters on the same line are further apart than
                # this margin then they are considered to be two separate
                # words, and an intermediate space will be added for
                # readability.
                word_margin=0.1,
                # If two lines are close together they are considered to be
                # part of the same paragraph.
                line_margin=0.5,
                # Specifies how much a horizontal and vertical position of a
                # text matters when determining the order of text boxes.
                boxes_flow=0.5,
                # If vertical text should be considered during layout
                # analysis.
                detect_vertical=False,
                # If layout analysis should be performed on text in figures.
                all_texts=False,
            )

            if use_text_converter:
                retstr = io.StringIO()
                device = TextConverter(rsrcmgr,
                                       retstr,
                                       pageno=1,
                                       laparams=laparams,
                                       showpageno=False,
                                       imagewriter=None)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # pagenos uses zero-based indices. pagenos is sorted inside
                # the function.
                for page in PDFPage.get_pages(
                        fp, pagenos=None, maxpages=0, password=b''):
                    interpreter.process_page(page)

                    texts = retstr.getvalue()  # All texts in a page.
                    # The converter keeps appending to the same buffer, so
                    # empty it here; otherwise the next page's output would
                    # repeat the text of every earlier page.
                    retstr.seek(0)
                    retstr.truncate(0)

                    print('------------------------------')
                    print(texts)
            else:
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                page_no = 1
                # pagenos uses zero-based indices. pagenos is sorted inside
                # the function.
                pages = PDFPage.get_pages(
                    fp, pagenos=[page_no], maxpages=0, password=b'')
                page = next(pages)

                interpreter.process_page(page)

                layout = device.get_result()
                print('Page bounding box = {}.'.format(layout.bbox))

                bbox_text_pairs = extract_text_object(layout, pdf_filepath,
                                                      page_no)
                for idx, (bbox, txt) in enumerate(bbox_text_pairs):
                    print(
                        '------------------------------ Block {} {} in page {} in {}.'
                        .format(idx, bbox, page_no, pdf_filepath))
                    print(txt)
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        # Demo boundary: report any other failure instead of propagating it.
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
示例#4
0
def extract_text(
        files=[],
        outfile='-',
        _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
        no_laparams=False,
        all_texts=None,
        detect_vertical=None,  # LAParams
        word_margin=None,
        char_margin=None,
        line_margin=None,
        boxes_flow=None,  # LAParams
        output_type='text',
        codec='utf-8',
        strip_control=False,
        maxpages=0,
        page_numbers=None,
        password="",
        scale=1.0,
        rotation=0,
        layoutmode='normal',
        output_dir=None,
        debug=False,
        disable_caching=False,
        **other):
    """Extract text from PDF files and print their horizontal text boxes.

    Each file is processed twice with the same page selection, password,
    caching setting and rotation: first through a ``TextConverter`` that
    writes to the output stream, then through a ``PDFPageAggregator`` whose
    ``LTTextBoxHorizontal`` elements are printed to stdout.

    Args:
        files: Paths of the PDF files to process; must not be empty. The
            default list is only read, never mutated.
        outfile: Output path, or ``'-'`` for stdout.
        _py2_no_more_posargs: Guard slot; must be left as ``None``.
        no_laparams: When true, no ``LAParams`` object is created and
            ``None`` is passed as ``laparams``.
        all_texts, detect_vertical, word_margin, char_margin, line_margin,
        boxes_flow: Optional overrides set as attributes on the ``LAParams``
            object; ``None`` leaves the corresponding default untouched.
        output_type: Re-derived from the extension of ``outfile`` but not
            otherwise used by this function.
        codec: Codec handed to the ``TextConverter``.
        maxpages, page_numbers, password: Forwarded to
            ``PDFPage.get_pages`` for both passes.
        rotation: Degrees added to each page's ``rotate`` value.
        output_dir: When set, an ``ImageWriter`` for this directory is
            passed to the ``TextConverter``.
        disable_caching: Inverted and passed as ``caching``.
        strip_control, scale, layoutmode, debug, **other: Accepted but not
            used by this function.

    Returns:
        The output stream: ``sys.stdout.buffer`` when ``outfile`` is
        ``'-'``, otherwise the file opened for ``outfile`` in binary mode.
        The stream is left open for the caller.

    Raises:
        ValueError: If too many positional arguments were passed or
            ``files`` is empty.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # Build the LAParams object unless layout analysis was switched off,
    # applying only the overrides that were actually supplied.
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        overrides = {
            "all_texts": all_texts,
            "detect_vertical": detect_vertical,
            "word_margin": word_margin,
            "char_margin": char_margin,
            "line_margin": line_margin,
            "boxes_flow": boxes_flow,
        }
        for param, paramv in overrides.items():
            if paramv is not None:
                setattr(laparams, param, paramv)

    imagewriter = ImageWriter(output_dir) if output_dir else None

    # NOTE(review): the derived output_type is not consumed further down.
    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"), (".html", "html"),
                                  (".xml", "xml"), (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    # Both passes must see the same pages; in particular the password is
    # needed again because every get_pages() call re-opens the document.
    page_options = dict(maxpages=maxpages,
                        password=password,
                        caching=not disable_caching,
                        check_extractable=True)

    for fname in files:
        with open(fname, "rb") as infp:
            rsrcmgr = PDFResourceManager_new(caching=not disable_caching)

            # First pass: plain text conversion into the output stream.
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
            # NOTE(review): the converter above was already given the text
            # stream; from here on (later files and the return value) the
            # binary stdout buffer is used instead. Kept as in the original.
            if outfp is sys.stdout:
                outfp = sys.stdout.buffer

            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(infp, page_numbers, **page_options):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)

            # Second pass: layout analysis, printing horizontal text boxes.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(infp, page_numbers, **page_options):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
                # Receive the layout object for the page.
                for element in device.get_result():
                    if isinstance(element, LTTextBoxHorizontal):
                        print(element.get_text())

    return outfp