def extract_page(page: Page):
    """Extract the table drawn on ``page`` using its curves as grid lines.

    The page is cropped to the box enclosing all of its curves, explicit
    vertical and horizontal line positions are derived from the curves of
    the cropped page, and those positions are passed to ``extract_table``.

    Args:
        page: Page object exposing ``curves``, ``crop`` and, on the cropped
            result, ``curves`` and ``extract_table``.

    Returns:
        Whatever ``extract_table`` returns for the cropped page when called
        with the explicit line settings.

    Raises:
        ValueError: If the page has fewer than two curves.
    """
    curves = page.curves
    if len(curves) < 2:
        raise ValueError(
            "PDF page contains less than 2 curves, formatting may have changed"
        )

    # The table region is the smallest box that encloses every curve.
    left = min(curve["x0"] for curve in curves)
    right = max(curve["x1"] for curve in curves)
    upper = min(curve["top"] for curve in curves)
    lower = max(curve["bottom"] for curve in curves)
    cropped = page.crop((left, upper, right, lower))
    inner = cropped.curves

    # NOTE(review): presumably the first curve traces the column grid and the
    # remaining curves are row rules -- confirm against the PDF layout.
    # Column positions: distinct x values of the first curve's points, framed
    # by the horizontal extent of the second curve.
    interior_xs = sorted({point[0] for point in inner[0]["points"]})
    column_xs = [inner[1]["x0"], *interior_xs, inner[1]["x1"]]

    # Row positions: y of the first point of every curve after the first,
    # framed by the first curve's top and the last curve's bottom.
    interior_ys = sorted(curve["points"][0][1] for curve in inner[1:])
    row_ys = [inner[0]["top"], *interior_ys, inner[-1]["bottom"]]

    settings = {
        "horizontal_strategy": "explicit",
        "explicit_vertical_lines": column_xs,
        "vertical_strategy": "explicit",
        "explicit_horizontal_lines": row_ys,
    }
    return cropped.extract_table(settings)
def get_text_area(page: Page):
    """Return the bounding box that encloses every word on the page.

    Args:
        page: Object exposing ``extract_words()``; each returned word must
            provide ``x0``, ``top``, ``x1`` and ``bottom`` entries.

    Returns:
        A ``(x0, top, x1, bottom)`` tuple holding the smallest ``x0`` and
        ``top`` and the largest ``x1`` and ``bottom`` among the words.

    Raises:
        ValueError: If the page yields no words, so no box can be formed.
    """
    words = page.extract_words()
    if not words:
        # min()/max() would raise an unhelpful "empty sequence" ValueError;
        # keep the exception type but say what actually went wrong.
        raise ValueError("page contains no words, cannot compute text area")

    x0 = min(word["x0"] for word in words)
    top = min(word["top"] for word in words)
    x1 = max(word["x1"] for word in words)
    bottom = max(word["bottom"] for word in words)
    return x0, top, x1, bottom
def extract_tables(page: Page):
    """Collect the tables found on ``page`` that look like real tables.

    A table returned by ``page.find_tables()`` is kept only when it has more
    than one row and its cell count exceeds 1.5 times its row count. Each
    kept table is wrapped in ``Table``.

    Args:
        page: Object exposing ``find_tables()``; every returned table must
            provide ``rows`` and ``cells`` sequences.

    Returns:
        A list of ``Table`` wrappers, in the order the tables were found.
    """
    return [
        Table(candidate)
        for candidate in page.find_tables()
        if len(candidate.rows) > 1
        and len(candidate.cells) > 1.5 * len(candidate.rows)
    ]
def pages(self):
    """Return the document's parsed pages, building and caching them once.

    On the first call every page produced by ``PDFPage.create_pages`` is
    wrapped in a ``Page``; when ``self.pages_to_parse`` is not ``None``, only
    pages whose 1-based number is contained in it are kept. Each kept page
    receives the summed height of the previously kept pages as its
    ``initial_doctop``. Later calls return the cached list.

    Returns:
        The list of ``Page`` objects stored in ``self._pages``.
    """
    if hasattr(self, "_pages"):
        return self._pages

    doctop = 0
    pp = self.pages_to_parse
    # Build into a local list and publish it only after the loop finishes, so
    # an exception part-way through does not leave a truncated cache that
    # later calls would silently return.
    parsed = []
    for i, page in enumerate(PDFPage.create_pages(self.doc)):
        if pp is not None and i + 1 not in pp:
            continue
        p = Page(self, page, initial_doctop=doctop)
        parsed.append(p)
        # Skipped pages do not contribute to the running offset.
        doctop += p.height
    self._pages = parsed
    return self._pages
def start_parser(self):
    """
    Public entry point that starts parsing the target pages.

    Pages are read one at a time; the text of each page is passed to
    ``_find_debt_table``, ``_find_profit_table`` and ``_find_flow_table``.
    Scanning stops early once ``self.flow_table_start_index`` is 3 or more.
    According to the original author's note, the parsed data is stored in
    the database so that stocks meeting the requirements can be screened
    while parsing is still in progress (the storage step is not visible in
    this method).
    """
    print("start parser")
    offset = 0
    # Walk the pages produced from the loaded PDF document.
    for number, raw_page in enumerate(PDFPage.create_pages(self.doc), start=1):
        # Wrap the raw page in a Page object so it is easier to parse.
        current = Page(self, raw_page, page_number=number, initial_doctop=offset)
        # Convert the current page to text and run each table finder on it.
        text = current.extract_text()
        self._find_debt_table(text)
        self._find_profit_table(text)
        self._find_flow_table(text)
        if self.flow_table_start_index >= 3:
            # Every required table has been found, so stop scanning.
            break
        offset += current.height