def plotAllPages(fh):
    """Plot every page of the PDF read from ``fh``.

    For each page the leaf boxes of the listed layout types are collected,
    wrapped in a ``TableDiagnosticData`` with empty projections and combs,
    and handed to ``plotpage``.  The creator string, the per-page box count
    and the modal character height are printed along the way.

    Returns a ``(fig_list, ax1_list)`` pair holding one figure and one axes
    object per page, in page order.
    """
    # Layout element types passed to LeafList.populate as ``interested``.
    interested_types = [
        "LTPage",
        "LTFigure",
        "LTLine",
        "LTRect",
        "LTImage",
        "LTTextLineHorizontal",
        "LTCurve",
    ]
    figures = []
    axes = []

    pdf = PDFDocument(fh)
    print("Created by: %s" % pdf.get_creator())

    for page_index, page in enumerate(pdf.get_pages()):
        leaves = LeafList().populate(page, interested=interested_types)
        modal_height = pdftables.calculate_modal_height(leaves)

        # Only the boxes are known at this point; projections and combs are
        # deliberately left empty.
        diagnostics = pdftables.TableDiagnosticData(leaves, {}, {}, [], [])
        fig, ax1 = plotpage(diagnostics)
        figures.append(fig)
        axes.append(ax1)

        # Page titles are 1-based.
        fig.suptitle("page %d" % (page_index + 1))

        print(leaves.count())
        print("Modal character height: %d" % modal_height)

    return figures, axes
def page_contains_tables(layout, device):
    """Return True when ``layout`` appears to contain a table.

    The leaves of ``layout`` are histogrammed by ``Leaf._top`` (passed through
    ``rounder(1)``).  The page is reported as containing a table when the
    number of histogram entries whose count exceeds
    ``IS_TABLE_COLUMN_COUNT_THRESHOLD`` is itself greater than
    ``IS_TABLE_ROW_COUNT_THRESHOLD``.

    ``device`` is accepted but not used by this function.
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class.
    # It's silly that we have to care about these (see function signature!!)
    leaves = LeafList().populate(layout)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)
    crowded_positions = [
        position
        for position, count in top_histogram.items()
        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    ]
    return len(crowded_positions) > IS_TABLE_ROW_COUNT_THRESHOLD
def filter_box_list_by_position(box_list, minv, maxv, dir_fun):
    # TODO This should be in tree.py
    """Return a new ``LeafList`` of the boxes positioned within the bounds.

    ``dir_fun`` is applied to each box to obtain the value that is compared
    against ``minv`` and ``maxv``; both bounds are inclusive.  Boxes are kept
    in their original order.
    """
    selected = LeafList()
    for candidate in box_list:
        position = dir_fun(candidate)
        if position >= minv and position <= maxv:
            selected.append(candidate)
    return selected
def page_contains_tables(pdf_page):
    """Return True when ``pdf_page`` appears to contain a table.

    Leaves are histogrammed by ``Leaf._top`` (passed through ``rounder(1)``);
    the page qualifies when more than ``IS_TABLE_ROW_COUNT_THRESHOLD``
    histogram entries have a count above ``IS_TABLE_COLUMN_COUNT_THRESHOLD``.

    Raises ``TypeError`` if ``pdf_page`` is not a ``PDFPage``.
    """
    if not isinstance(pdf_page, PDFPage):
        message = "Page must be PDFPage, not {}".format(pdf_page.__class__)
        raise TypeError(message)

    leaves = LeafList().populate(pdf_page)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    # Count the histogram entries that are populated heavily enough.
    qualifying = 0
    for _position, count in top_histogram.items():
        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD:
            qualifying += 1
    return qualifying > IS_TABLE_ROW_COUNT_THRESHOLD
def page_contains_tables(pdf_page, interpreter, device):
    """Return True when ``pdf_page`` appears to contain a table.

    The page is run through ``interpreter.process_page`` and the resulting
    layout is fetched with ``device.get_result()``.  Its leaves are
    histogrammed by ``Leaf._top`` (passed through ``rounder(1)``); the page
    qualifies when more than ``IS_TABLE_ROW_COUNT_THRESHOLD`` histogram
    entries have a count above ``IS_TABLE_COLUMN_COUNT_THRESHOLD``.
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class.
    # It's silly that we have to care about these (see function signature!!)
    interpreter.process_page(pdf_page)
    # Receive the layout object for the page that was just processed.
    layout = device.get_result()

    leaves = LeafList().populate(layout)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)
    crowded = sum(
        1
        for _position, count in top_histogram.items()
        if count > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return crowded > IS_TABLE_ROW_COUNT_THRESHOLD
def plotAllPages(fh):
    """Plot every page of the PDF read from ``fh``.

    The document, interpreter and device come from
    ``pt.initialize_pdf_miner``.  Each page is processed by the interpreter,
    its layout is fetched from the device, and the ``LTPage``/``LTChar``
    leaves are plotted through ``plotpage``.  The creator string, the
    per-page box count and the modal character height are printed.

    Returns a ``(fig_list, ax1_list)`` pair with one entry per page.
    """
    figures, axes = [], []
    doc, interpreter, device = pt.initialize_pdf_miner(fh)

    creator = doc.info[0]['Creator']
    print("Created by: %s" % creator)

    # Element types passed to LeafList.populate as ``interested``.
    interested_types = ['LTPage', 'LTChar']

    # Page numbers are 1-based for the plot titles.
    for page_number, page in enumerate(doc.get_pages(), 1):
        interpreter.process_page(page)
        # Receive the layout object for the page that was just processed.
        layout = device.get_result()

        leaves = LeafList().populate(layout, interested=interested_types)
        modal_height = pt.calculate_modal_height(leaves)

        diagnostics = pt.TableDiagnosticData(leaves, {}, {}, [], [])
        fig, ax1 = plotpage(diagnostics)
        figures.append(fig)
        axes.append(ax1)
        fig.suptitle("page %d" % page_number)

        print(leaves.count())
        print("Modal character height: %d" % modal_height)

    return figures, axes
def __init__(self, box_list=None, top_plot=None, left_plot=None,
             x_comb=None, y_comb=None):
    """Store the diagnostic data supplied by the caller.

    Every argument is optional.  An omitted (or ``None``) argument is
    replaced by a fresh empty container: a ``LeafList`` for ``box_list``,
    a ``dict`` for ``top_plot`` and ``left_plot``, and a ``list`` for
    ``x_comb`` and ``y_comb``.

    The containers are created per call rather than in the signature:
    default values in a signature are evaluated once, so every instance
    built with defaults would otherwise share (and could mutate) the same
    objects.
    """
    self.box_list = LeafList() if box_list is None else box_list
    self.top_plot = {} if top_plot is None else top_plot
    self.left_plot = {} if left_plot is None else left_plot
    self.x_comb = [] if x_comb is None else x_comb
    self.y_comb = [] if y_comb is None else y_comb
def plotAllPages(fh):
    """Produce one diagnostic plot per page of the PDF read from ``fh``.

    Each page's leaves (restricted to the layout types listed below) are
    wrapped in a ``TableDiagnosticData`` carrying empty projections and
    combs, then drawn with ``plotpage``.  The creator, the box count and the
    modal character height are printed as the pages are visited.

    Returns ``(fig_list, ax1_list)``: the figures and axes in page order.
    """
    fig_list = []
    ax1_list = []

    document = PDFDocument(fh)
    print("Created by: %s" % document.get_creator())

    # Layout element types passed to LeafList.populate as ``interested``.
    wanted_types = ['LTPage', 'LTFigure', 'LTLine', 'LTRect', 'LTImage',
                    'LTTextLineHorizontal', 'LTCurve']

    page_number = 0
    for page in document.get_pages():
        page_number += 1  # titles are 1-based

        boxes = LeafList().populate(page, interested=wanted_types)
        modal_height = pdftables.calculate_modal_height(boxes)

        fig, ax1 = plotpage(
            pdftables.TableDiagnosticData(boxes, {}, {}, [], []))
        fig_list.append(fig)
        ax1_list.append(ax1)
        fig.suptitle("page %d" % page_number)

        print(boxes.count())
        print("Modal character height: %d" % modal_height)

    return fig_list, ax1_list
def page_to_tables(page, extend_y=False, hints=None, atomise=False):
    """
    Get a rectangular list of list of strings from one page of a document

    Parameters:
        page: the ``LTPage`` to analyse.
        extend_y: when true, ``y_comb`` is extended with ``comb_extend`` to
            the lowest ``bottom`` and highest ``top`` of the page's boxes,
            and the diagnostic data carries the unfiltered box list.
        hints: forwarded to ``find_table_bounding_box``.  Omitted or ``None``
            means an empty list; a fresh list is created per call so the
            default can never be shared between calls.
        atomise: when true, ``LTChar`` boxes are collected and used, and
            every cell of the result is stripped with ``unicode.strip``.

    Returns:
        ``(table_array, diagnostic_data)``.  If no table bounding box is
        found, ``table_array`` is empty and ``diagnostic_data`` is a
        default-constructed ``TableDiagnosticData``.

    Raises:
        TypeError: if ``page`` is not an ``LTPage``.
    """
    if not isinstance(page, LTPage):
        raise TypeError("Page must be LTPage, not {}".format(page.__class__))

    if hints is None:
        hints = []

    table_array = []

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    columnThreshold = 5  # 3 works for smaller tables
    rowThreshold = 3

    if atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    box_list = LeafList().populate(page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(box_list, hints=hints)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        print("found no tables")
        return table_array, TableDiagnosticData()

    if atomise:
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Keep only the boxes that fall inside the table bounding box.
    filtered_box_list = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        filtered_box_list, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(
        math.floor(calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, rowThreshold, "row")
    y_comb.reverse()

    x_comb = comb_from_projection(column_projection, columnThreshold,
                                  "column")
    # Pin the outermost comb entries to the bounding box edges.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if extend_y:
        pageminy = min([box.bottom for box in box_list])
        pagemaxy = max([box.top for box in box_list])
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true
    if atomise:
        table_array = [map(unicode.strip, row) for row in table_array]

    diagnostic_data = TableDiagnosticData(
        filtered_box_list,
        column_projection,
        row_projection,
        x_comb,
        y_comb)

    return table_array, diagnostic_data
def page_to_tables(pdf_page, config=None):
    """
    Get a rectangular list of list of strings from one page of a document

    ``config`` supplies ``atomise``, ``extend_y``, ``table_top_hint`` and
    ``table_bottom_hint``; a falsy ``config`` is replaced by a
    default-constructed ``ConfigParameters``.

    Returns ``(table_array, diagnostic_data)``.  When no table bounding box
    is found the table is empty and the diagnostic data is a
    default-constructed ``TableDiagnosticData``.

    Raises ``TypeError`` if ``pdf_page`` is not a ``PDFPage``.
    """
    if not isinstance(pdf_page, PDFPage):
        message = "Page must be PDFPage, not {}".format(pdf_page.__class__)
        raise TypeError(message)

    if not config:
        config = ConfigParameters()

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    wanted_types = ['LTPage', 'LTTextLineHorizontal']
    if config.atomise:
        wanted_types.append('LTChar')

    box_list = LeafList().populate(pdf_page, wanted_types).purge_empty_text()

    bounds = find_table_bounding_box(
        box_list, config.table_top_hint, config.table_bottom_hint)
    (minx, maxx, miny, maxy) = bounds

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        print("found no tables")
        return [], TableDiagnosticData()

    if config.atomise:
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Restrict to the bounding box: vertically first, then horizontally.
    in_rows = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        in_rows, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    modal_height = calculate_modal_height(filtered_box_list)
    erodelevel = int(math.floor(modal_height / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    x_comb = comb_from_projection(
        column_projection, column_threshold, "column")
    # Pin the outermost comb entries to the bounding box edges.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if config.extend_y:
        pageminy = min([box.bottom for box in box_list])
        pagemaxy = max([box.top for box in box_list])
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true
    if config.atomise:
        table_array = [map(unicode.strip, row) for row in table_array]

    return table_array, TableDiagnosticData(
        filtered_box_list, column_projection, row_projection, x_comb, y_comb)
def multi_column_detect(page):
    # TODO This function is under construction
    """
    Test for multiColumns from a box_list, returns an integer number of
    columns and a set of (left, right) pairs delineating any columns

    NOTE(review): as written, the function returns ``(pile, projection)``,
    where ``pile`` maps vertical positions to the summed width of the text
    lines there divided by the horizontal extent of all text lines, and
    ``projection`` is the ``project_boxes`` column projection.  It also
    draws and shows a plot of the box width histogram.
    """
    # Ways to identify multicolumns:
    # 1. High fill factor compared to tables
    # 2. Gullies at textwidth/2, (textwidth/3, 2*textwidth/3)...
    # 3. Histogram of boxwidths with peak at some fraction of page width

    # This is like project_boxes but we are projecting the length of the
    # textbox onto the axis
    text_lines = LeafList().populate(
        page, ['LTPage', 'LTTextLineHorizontal']).purge_empty_text()
    # Should use the LTPage object to get page bounding box
    text_lines = filter_box_list_by_type(text_lines, 'LTTextLineHorizontal')

    vstep = 5  # should be scaled by modal row height
    # The bounds are rounded with a literal 5, which currently equals vstep.
    minv = rounder(min(box.bottom for box in text_lines), 5)
    maxv = rounder(max(box.top for box in text_lines), 5)
    minx = round(min(box.left for box in text_lines))
    maxx = round(max(box.right for box in text_lines))

    # Initialise projection structure: one zeroed slot per vertical step.
    v_coords = range(int(minv), int(maxv) + vstep, vstep)
    pile = collections.OrderedDict((coord, 0) for coord in v_coords)

    # Accumulate text line widths at each (rounded) vertical position.
    for box in text_lines:
        slot = int(rounder(box.midline, vstep))
        pile[slot] += box.width

    # Normalise by the horizontal extent covered by the text lines.
    for position, total_width in pile.items():
        pile[position] = total_width / (maxx - minx)

    # Box width histogram
    bstep = 10
    narrowest = rounder(min(box.width for box in text_lines), bstep)
    widest = rounder(max(box.width for box in text_lines), bstep)
    w_coords = range(int(narrowest), int(widest) + bstep, bstep)
    boxhist = collections.OrderedDict((coord, 0) for coord in w_coords)

    for box in text_lines:
        width_bin = int(rounder(box.width, bstep))
        boxhist[width_bin] += 1

    # Convert the counts into fractions of the total number of boxes.
    nboxes = len(text_lines)
    for width_bin, count in boxhist.items():
        boxhist[width_bin] = float(count) / float(nboxes)

    # TODO: plt undefined
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    bin_positions = map(float, boxhist.keys())
    bin_fractions = map(float, boxhist.values())
    ax1.plot(bin_positions, bin_fractions, color='red')
    plt.show()

    # This is old fashion projection
    projection = project_boxes(text_lines, 'column')

    return pile, projection