Python Page示例

编程语言: Python

命名空间/包名称: pdfplumber.page

类/类型: Page

hotexamples.com的示例: 5

Python Page - 共找到5个示例。以下是从开源项目中提取的、评价最高的 pdfplumber.page.Page 实际使用示例（Python）。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

Page(2)

crop(1)

extract_text(1)

extract_words(1)

find_tables(1)

示例#1

显示文件

def extract_page(page: Page):
    """Extract a table from ``page`` using grid lines derived from its curves.

    The page is cropped to the bounding box that encloses every curve, and
    explicit vertical and horizontal line positions are then read off the
    cropped page's curves and passed to ``extract_table``.

    Args:
        page: The page to read; it must expose ``curves`` and ``crop``.

    Returns:
        Whatever ``extract_table`` returns for the cropped page with the
        explicit line settings built here.

    Raises:
        ValueError: If the page has fewer than two curves.
    """
    curves = page.curves
    if len(curves) < 2:
        raise ValueError(
            "PDF page contains less than 2 curves, formatting may have changed"
        )

    # Bounding box enclosing every curve, in (x0, top, x1, bottom) order.
    left = min(curve["x0"] for curve in curves)
    right = max(curve["x1"] for curve in curves)
    upper = min(curve["top"] for curve in curves)
    lower = max(curve["bottom"] for curve in curves)
    cropped = page.crop((left, upper, right, lower))
    cropped_curves = cropped.curves

    # Vertical lines: the distinct x coordinates of the first curve's points,
    # framed by the second curve's left and right edges.
    first_curve = cropped_curves[0]
    inner_xs = sorted({point[0] for point in first_curve["points"]})
    second_curve = cropped_curves[1]
    column_edges = [second_curve["x0"], *inner_xs, second_curve["x1"]]

    # Horizontal lines: the y of the first point of every curve after the
    # first, framed by the first curve's top and the last curve's bottom.
    inner_ys = sorted(curve["points"][0][1] for curve in cropped_curves[1:])
    row_edges = [first_curve["top"], *inner_ys, cropped_curves[-1]["bottom"]]

    table_settings = {
        "horizontal_strategy": "explicit",
        "explicit_vertical_lines": column_edges,
        "vertical_strategy": "explicit",
        "explicit_horizontal_lines": row_edges,
    }
    return cropped.extract_table(table_settings)

示例#2

显示文件

文件： page_utils.py 项目： thaiduongx26/document_extraction

def get_text_area(page: Page):
    """Return the bounding box that encloses every word on ``page``.

    Args:
        page: The page to measure; its ``extract_words()`` result is expected
            to be a sequence of mappings with ``x0``, ``top``, ``x1`` and
            ``bottom`` entries.

    Returns:
        A tuple ``(x0, top, x1, bottom)``: the smallest ``x0`` and ``top`` and
        the largest ``x1`` and ``bottom`` over all extracted words.

    Raises:
        ValueError: If the page yields no words, so there is no area to report.
    """
    words = page.extract_words()
    if not words:
        # min()/max() would raise an opaque "empty sequence" ValueError here;
        # raise the same exception type with a message that names the cause.
        raise ValueError("page contains no words; cannot compute text area")

    x0 = min(word['x0'] for word in words)
    top = min(word['top'] for word in words)
    x1 = max(word['x1'] for word in words)
    bottom = max(word['bottom'] for word in words)

    return x0, top, x1, bottom

示例#3

显示文件

def extract_tables(page: Page):
    """Collect the tables found on ``page`` that pass a size filter.

    A table found by ``page.find_tables()`` is kept only when it has more
    than one row and its cell count exceeds 1.5 times its row count. Each
    kept table is wrapped in ``Table`` before being returned.

    Args:
        page: The page to search for tables.

    Returns:
        A list of ``Table`` objects, in the order ``find_tables`` gave them.
    """
    return [
        Table(found)
        for found in page.find_tables()
        if len(found.rows) > 1 and len(found.cells) > 1.5 * len(found.rows)
    ]

示例#4

显示文件

    def pages(self):
        """Return the list of ``Page`` objects, building it on first use.

        The list is stored on ``self._pages``; later calls return that stored
        list without re-reading the document.

        Returns:
            The list of ``Page`` objects created from ``self.doc``, limited to
            the page numbers in ``self.pages_to_parse`` when that is not None.
        """
        if hasattr(self, "_pages"):
            return self._pages

        doctop = 0
        pp = self.pages_to_parse
        self._pages = []
        for page_number, page in enumerate(PDFPage.create_pages(self.doc), start=1):
            # pages_to_parse is matched against the 1-based page position;
            # None means no filtering.
            if pp is not None and page_number not in pp:
                continue
            p = Page(self, page, initial_doctop=doctop)
            self._pages.append(p)
            # Only pages that are kept advance the running top offset.
            doctop += p.height
        return self._pages

示例#5

显示文件

文件： PdfParser.py 项目： CuteyBoy/EatAllSystem

 def start_parser(self):
     """
     Walk the document page by page and scan each page's text for tables.

     Each page's extracted text is handed to the debt, profit and flow table
     finders in turn; iteration stops early once
     ``self.flow_table_start_index`` reaches 3.

     NOTE(review): the original author's note says the parsed data is then
     stored in a database, so stocks that meet the criteria can be screened
     while parsing. That storage is not visible in this method; confirm
     against the finder helpers.
     """
     print("start parser")
     running_top = 0
     # Page numbers passed to Page are 1-based.
     for number, raw_page in enumerate(PDFPage.create_pages(self.doc), start=1):
         # Build a page object so the page can be parsed.
         current = Page(
             self,
             raw_page,
             page_number=number,
             initial_doctop=running_top,
         )
         # Convert the current page to text and run every table finder on it.
         text = current.extract_text()
         self._find_debt_table(text)
         self._find_profit_table(text)
         self._find_flow_table(text)
         if self.flow_table_start_index >= 3:
             # All required tables have been found, so stop right away.
             break
         running_top += current.height