示例#1
0
def extract_page(page: Page):
    """Extract the table drawn on ``page`` using its curves as grid lines.

    The page is cropped to the bounding box of all its curves, explicit
    vertical and horizontal line positions are derived from the cropped
    page's curves, and ``extract_table`` is run with both strategies set to
    ``"explicit"``.

    Args:
        page: Page exposing ``curves`` and ``crop``; the cropped page must
            expose ``curves`` and ``extract_table``.

    Returns:
        Whatever ``extract_table`` returns for the cropped page.

    Raises:
        ValueError: If ``page`` has fewer than two curves.
    """
    curves = page.curves
    if len(curves) < 2:
        raise ValueError(
            "PDF page contains less than 2 curves, formatting may have changed"
        )

    # Smallest box enclosing every curve, in (x0, top, x1, bottom) order.
    left = min(curve["x0"] for curve in curves)
    right = max(curve["x1"] for curve in curves)
    upper = min(curve["top"] for curve in curves)
    lower = max(curve["bottom"] for curve in curves)
    table_page = page.crop((left, upper, right, lower))
    cropped_curves = table_page.curves

    # Vertical lines: the distinct x coordinates of the first curve's points,
    # enclosed by the second curve's left and right edges.
    # NOTE(review): presumably the first curve traces the column separators;
    # confirm against a sample PDF.
    inner_xs = sorted({point[0] for point in cropped_curves[0]["points"]})
    column_edges = [
        cropped_curves[1]["x0"],
        *inner_xs,
        cropped_curves[1]["x1"],
    ]

    # Horizontal lines: the y of the first point of every curve after the
    # first, enclosed by the first curve's top and the last curve's bottom.
    inner_ys = sorted(
        curve["points"][0][1] for curve in cropped_curves[1:]
    )
    row_edges = [
        cropped_curves[0]["top"],
        *inner_ys,
        cropped_curves[-1]["bottom"],
    ]

    table_settings = {
        "vertical_strategy": "explicit",
        "explicit_vertical_lines": column_edges,
        "horizontal_strategy": "explicit",
        "explicit_horizontal_lines": row_edges,
    }
    return table_page.extract_table(table_settings)
def get_text_area(page: Page):
    """Return the bounding box that encloses every word on ``page``.

    Args:
        page: Object whose ``extract_words()`` returns word dicts carrying
            ``x0``, ``top``, ``x1`` and ``bottom`` entries.

    Returns:
        A ``(x0, top, x1, bottom)`` tuple: the smallest ``x0`` and ``top``
        and the largest ``x1`` and ``bottom`` found among the words.

    Raises:
        ValueError: If the page yields no words, so no box can be computed.
    """
    words = page.extract_words()
    if not words:
        # min()/max() on an empty sequence would raise an opaque ValueError;
        # keep the exception type but make the cause explicit.
        raise ValueError(
            "PDF page contains no words, cannot compute text area"
        )

    x0 = min(word['x0'] for word in words)
    top = min(word['top'] for word in words)
    x1 = max(word['x1'] for word in words)
    bottom = max(word['bottom'] for word in words)

    return x0, top, x1, bottom
示例#3
0
def extract_tables(page: Page):
    """Wrap the plausible tables found on ``page`` in ``Table`` objects.

    A table returned by ``page.find_tables()`` is kept only when it has more
    than one row and its cell count exceeds 1.5 times its row count.

    Args:
        page: Page exposing ``find_tables()``.

    Returns:
        A list of ``Table`` instances, in the order the tables were found.
    """
    return [
        Table(candidate)
        for candidate in page.find_tables()
        if len(candidate.rows) > 1
        and len(candidate.cells) > 1.5 * len(candidate.rows)
    ]
示例#4
0
    def pages(self):
        """Return the document's ``Page`` objects, building them on first use.

        The list is cached on ``self._pages``; later calls return the cached
        list unchanged. When ``self.pages_to_parse`` is not ``None``, only
        pages whose 1-based number is contained in it are built.

        Returns:
            The list of built ``Page`` objects, in document order.
        """
        if hasattr(self, "_pages"):
            return self._pages

        doctop = 0
        wanted = self.pages_to_parse
        built = []
        for i, page in enumerate(PDFPage.create_pages(self.doc)):
            # ``i`` is 0-based while ``pages_to_parse`` is matched against
            # 1-based page numbers; ``None`` means no filtering.
            if wanted is not None and i + 1 not in wanted:
                continue
            p = Page(self, page, initial_doctop=doctop)
            built.append(p)
            # Only pages that were actually built contribute to the offset
            # handed to the next page.
            doctop += p.height

        # Publish the cache only once every page was built, so a failure
        # part-way through cannot leave a truncated list that later calls
        # would silently return.
        self._pages = built
        return built
示例#5
0
 def start_parser(self):
     """
     Walk the document page by page, searching each page's text for tables.

     Every page is converted to text and passed to the debt, profit and flow
     table finders, in that order. Iteration stops as soon as
     ``self.flow_table_start_index`` reaches 3.

     NOTE(review): the original note says the parsed data is stored in a
     database so stocks can be screened while parsing; no storage happens in
     this method itself, so confirm where that is done.
     """
     print("start parser")
     running_top = 0
     # Iterate over the PDF's pages, numbering them from 1.
     numbered_pages = enumerate(PDFPage.create_pages(self.doc), start=1)
     for page_number, raw_page in numbered_pages:
         # Build a page object so the page can be parsed.
         page = Page(
             self,
             raw_page,
             page_number=page_number,
             initial_doctop=running_top,
         )
         # Convert the current page to text and run each table finder on it.
         text = page.extract_text()
         self._find_debt_table(text)
         self._find_profit_table(text)
         self._find_flow_table(text)
         if self.flow_table_start_index >= 3:
             # Every required table has been found; stop scanning.
             break
         running_top += page.height