def page_to_tables(page, extend_y=False, hints=None, atomise=False):
    """
    Get a rectangular list of list of strings from one page of a document.

    :param page: the page to analyse; must be an ``LTPage`` instance.
    :param extend_y: when true, the row comb is extended with
        ``comb_extend`` to the lowest ``bottom`` and highest ``top`` of the
        boxes on the page, and the diagnostic data reports the unfiltered
        box list instead of the table-only one.
    :param hints: forwarded to ``find_table_bounding_box`` as ``hints``.
        ``None`` (the default) is treated as an empty list.
    :param atomise: when true, the table is built from individual
        ``LTChar`` boxes and every resulting cell has its leading and
        trailing whitespace stripped.
    :returns: a ``(table_array, diagnostic_data)`` tuple. ``table_array`` is
        whatever ``apply_combs`` produces (rows of cell strings);
        ``diagnostic_data`` is a ``TableDiagnosticData``. When no table is
        found the result is ``([], TableDiagnosticData())``.
    :raises TypeError: if ``page`` is not an ``LTPage``.
    """
    if not isinstance(page, LTPage):
        raise TypeError("Page must be LTPage, not {}".format(page.__class__))

    # A fresh list per call: a literal ``[]`` default would be shared
    # between every invocation of this function.
    if hints is None:
        hints = []

    table_array = []

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    if atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    # flt = ['LTPage', 'LTTextLineHorizontal', 'LTFigure']

    box_list = LeafList().populate(page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(box_list, hints=hints)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        # Parenthesised single-argument form prints the same text under
        # both Python 2 and Python 3.
        print("found no tables")
        return table_array, TableDiagnosticData()

    if atomise:
        # The text lines were only needed to locate the table; from here on
        # work with the individual characters.
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Keep only the boxes that fall inside the table bounding box.
    filtered_box_list = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        filtered_box_list, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(math.floor(
        calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    # column_threshold = max(len(y_comb)*0.75,5)
    x_comb = comb_from_projection(
        column_projection, column_threshold, "column")

    # Pin the outermost comb teeth to the table bounding box.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if extend_y:
        pageminy = min([box.bottom for box in box_list])
        pagemaxy = max([box.top for box in box_list])
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    # NOTE(review): this uses the unfiltered box_list rather than
    # filtered_box_list -- presumably apply_combs ignores boxes that fall
    # outside the combs; confirm against apply_combs.
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true
    if atomise:
        # cell.strip() works for both unicode and byte strings, unlike
        # unicode.strip, and always yields real lists of cells.
        table_array = [[cell.strip() for cell in row] for row in table_array]

    diagnostic_data = TableDiagnosticData(
        filtered_box_list,
        column_projection,
        row_projection,
        x_comb,
        y_comb)

    return table_array, diagnostic_data
def page_to_tables(pdf_page, config=None):
    """
    Get a rectangular list of list of strings from one page of a document.

    :param pdf_page: the page to analyse; must be a ``PDFPage`` instance.
    :param config: object supplying the ``atomise``, ``extend_y``,
        ``table_top_hint`` and ``table_bottom_hint`` attributes read below.
        A falsy value (including the default ``None``) is replaced by a
        fresh ``ConfigParameters()``.
    :returns: a ``(table_array, diagnostic_data)`` tuple. ``table_array`` is
        whatever ``apply_combs`` produces (rows of cell strings);
        ``diagnostic_data`` is a ``TableDiagnosticData``. When no table is
        found the result is ``([], TableDiagnosticData())``.
    :raises TypeError: if ``pdf_page`` is not a ``PDFPage``.
    """
    if not isinstance(pdf_page, PDFPage):
        raise TypeError("Page must be PDFPage, not {}".format(
            pdf_page.__class__))

    if not config:
        config = ConfigParameters()

    table_array = []

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    if config.atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    # flt = ['LTPage', 'LTTextLineHorizontal', 'LTFigure']

    box_list = LeafList().populate(pdf_page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(
        box_list, config.table_top_hint, config.table_bottom_hint)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        # Parenthesised single-argument form prints the same text under
        # both Python 2 and Python 3.
        print("found no tables")
        return table_array, TableDiagnosticData()

    if config.atomise:
        # The text lines were only needed to locate the table; from here on
        # work with the individual characters.
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Keep only the boxes that fall inside the table bounding box.
    filtered_box_list = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        filtered_box_list, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(math.floor(
        calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    # column_threshold = max(len(y_comb)*0.75,5)
    x_comb = comb_from_projection(
        column_projection, column_threshold, "column")

    # Pin the outermost comb teeth to the table bounding box.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if config.extend_y:
        pageminy = min([box.bottom for box in box_list])
        pagemaxy = max([box.top for box in box_list])
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    # NOTE(review): this uses the unfiltered box_list rather than
    # filtered_box_list -- presumably apply_combs ignores boxes that fall
    # outside the combs; confirm against apply_combs.
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true
    if config.atomise:
        # cell.strip() works for both unicode and byte strings, unlike
        # unicode.strip, and always yields real lists of cells.
        table_array = [[cell.strip() for cell in row] for row in table_array]

    diagnostic_data = TableDiagnosticData(
        filtered_box_list,
        column_projection,
        row_projection,
        x_comb,
        y_comb)

    return table_array, diagnostic_data