Пример #1
0
def plotAllPages(fh):
    """Plot every page of the PDF read from ``fh``.

    A ``PDFDocument`` is built from ``fh``.  For each page it yields, the
    leaves of the layout types listed in ``flt`` are collected with
    ``LeafList().populate``, wrapped in a ``pdftables.TableDiagnosticData``
    whose projections and combs are left empty, and passed to ``plotpage``.
    The creator string, each page's ``box_list.count()`` and the value
    returned by ``pdftables.calculate_modal_height`` are printed on the way.

    Returns:
        ``(fig_list, ax1_list)``: the figures and axes returned by
        ``plotpage``, one entry per page, in page order.
    """
    fig_list = []
    ax1_list = []

    pdf = PDFDocument(fh)
    # Parenthesised single-argument print: same output as the Python 2
    # print statement, and valid syntax on Python 3 as well.
    print("Created by: %s" % pdf.get_creator())

    # Layout element types passed to LeafList.populate as `interested`.
    flt = [
        "LTPage",
        "LTFigure",
        "LTLine",
        "LTRect",
        "LTImage",
        "LTTextLineHorizontal",
        "LTCurve",
    ]

    for page_number, page in enumerate(pdf.get_pages(), 1):
        box_list = LeafList().populate(page, interested=flt)
        modal_height = pdftables.calculate_modal_height(box_list)

        # Only the boxes are plotted here; projections and combs stay empty.
        diagnostic_data = pdftables.TableDiagnosticData(
            box_list, {}, {}, [], [])

        fig, ax1 = plotpage(diagnostic_data)
        fig_list.append(fig)
        ax1_list.append(ax1)
        fig.suptitle("page %d" % page_number)

        print(box_list.count())
        print("Modal character height: %d" % modal_height)

    return fig_list, ax1_list
Пример #2
0
def page_contains_tables(layout, device):
    """Decide whether ``layout`` appears to contain a table.

    The leaves populated from ``layout`` are histogrammed with ``Leaf._top``
    and the histogram is passed through ``rounder(1)``.  The function returns
    True when the number of histogram entries whose value exceeds
    IS_TABLE_COLUMN_COUNT_THRESHOLD is itself greater than
    IS_TABLE_ROW_COUNT_THRESHOLD.  ``device`` is accepted but not used here.
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class.
    # It's silly that we have to care about these (see function signature!!)
    leaves = LeafList().populate(layout)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    crowded_entries = 0
    for _top, value in top_histogram.items():
        if value > IS_TABLE_COLUMN_COUNT_THRESHOLD:
            crowded_entries += 1

    return crowded_entries > IS_TABLE_ROW_COUNT_THRESHOLD
Пример #3
0
def filter_box_list_by_position(box_list, minv, maxv, dir_fun):
    """Return the boxes whose ``dir_fun`` value lies in ``[minv, maxv]``.

    ``dir_fun`` is called on each box of ``box_list``; boxes for which the
    result is between ``minv`` and ``maxv`` (both inclusive) are appended, in
    their original order, to a new ``LeafList`` which is returned.
    """
    # TODO This should be in tree.py
    selected = LeafList()
    for candidate in box_list:
        if minv <= dir_fun(candidate) <= maxv:
            selected.append(candidate)
    return selected
Пример #4
0
def filter_box_list_by_position(box_list, minv, maxv, dir_fun):
    """Keep only the boxes positioned between ``minv`` and ``maxv``.

    The position of a box is whatever ``dir_fun(box)`` returns.  Both bounds
    are inclusive.  The surviving boxes are collected, order preserved, in a
    freshly created ``LeafList``.
    """
    # TODO This should be in tree.py
    kept = LeafList()
    for leaf in box_list:
        position = dir_fun(leaf)
        within_bounds = minv <= position <= maxv
        if within_bounds:
            kept.append(leaf)
    return kept
Пример #5
0
def page_contains_tables(pdf_page):
    """Decide whether ``pdf_page`` appears to contain a table.

    Raises TypeError when ``pdf_page`` is not a ``PDFPage``.  Otherwise the
    page's leaves are histogrammed with ``Leaf._top`` and the histogram is
    passed through ``rounder(1)``.  The result is True when more than
    IS_TABLE_ROW_COUNT_THRESHOLD histogram entries have a value above
    IS_TABLE_COLUMN_COUNT_THRESHOLD.
    """
    if not isinstance(pdf_page, PDFPage):
        message = "Page must be PDFPage, not {}".format(pdf_page.__class__)
        raise TypeError(message)

    leaves = LeafList().populate(pdf_page)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    crowded_entries = 0
    for _top, value in top_histogram.items():
        if value > IS_TABLE_COLUMN_COUNT_THRESHOLD:
            crowded_entries += 1

    return crowded_entries > IS_TABLE_ROW_COUNT_THRESHOLD
Пример #6
0
def page_contains_tables(pdf_page):
    """Return True when ``pdf_page`` looks like it holds a table.

    A non-``PDFPage`` argument is rejected with TypeError.  The leaves of the
    page are histogrammed with ``Leaf._top`` (then ``rounder(1)``); entries
    whose value is above IS_TABLE_COLUMN_COUNT_THRESHOLD are counted, and the
    page qualifies when that count exceeds IS_TABLE_ROW_COUNT_THRESHOLD.
    """
    if not isinstance(pdf_page, PDFPage):
        raise TypeError(
            "Page must be PDFPage, not {}".format(pdf_page.__class__))

    page_leaves = LeafList().populate(pdf_page)
    for entry in page_leaves:
        assert isinstance(entry, Leaf), "NOT LEAF"

    histogram = page_leaves.histogram(Leaf._top).rounder(1)
    qualifying = sum(
        1 for _key, value in histogram.items()
        if value > IS_TABLE_COLUMN_COUNT_THRESHOLD
    )
    return qualifying > IS_TABLE_ROW_COUNT_THRESHOLD
Пример #7
0
def page_contains_tables(pdf_page, interpreter, device):
    """Decide whether ``pdf_page`` appears to contain a table.

    The page is run through ``interpreter.process_page`` and the resulting
    layout is fetched from ``device.get_result()``.  Its leaves are
    histogrammed with ``Leaf._top`` (then ``rounder(1)``).  The function
    returns True when more than IS_TABLE_ROW_COUNT_THRESHOLD histogram
    entries have a value above IS_TABLE_COLUMN_COUNT_THRESHOLD.
    """
    # TODO: hide doc, interpreter, device inside a higher level Pdf class.
    # It's silly that we have to care about these (see function signature!!)
    interpreter.process_page(pdf_page)
    page_layout = device.get_result()  # the LTPage object for the page

    leaves = LeafList().populate(page_layout)
    for leaf in leaves:
        assert isinstance(leaf, Leaf), "NOT LEAF"

    top_histogram = leaves.histogram(Leaf._top).rounder(1)

    crowded_entries = 0
    for _top, value in top_histogram.items():
        if value > IS_TABLE_COLUMN_COUNT_THRESHOLD:
            crowded_entries += 1

    return crowded_entries > IS_TABLE_ROW_COUNT_THRESHOLD
Пример #8
0
def plotAllPages(fh):
    """Plot every page of the PDF read from ``fh``.

    ``pt.initialize_pdf_miner(fh)`` supplies the document, interpreter and
    device.  Each page is processed by the interpreter, its layout is fetched
    from the device, and the 'LTPage'/'LTChar' leaves are wrapped in a
    ``pt.TableDiagnosticData`` (empty projections and combs) and handed to
    ``plotpage``.  The document creator, each page's ``count()`` and the value
    from ``pt.calculate_modal_height`` are printed.

    Returns the list of figures and the list of axes, one entry per page.
    """
    figures = []
    axes = []

    doc, interpreter, device = pt.initialize_pdf_miner(fh)
    print("Created by: %s" % doc.info[0]['Creator'])

    # Layout element types passed to LeafList.populate as `interested`.
    wanted_types = ['LTPage', 'LTChar']

    for page_number, page in enumerate(doc.get_pages(), 1):
        interpreter.process_page(page)
        layout = device.get_result()  # the LTPage object for the page
        leaves = LeafList().populate(layout, interested=wanted_types)

        modal_height = pt.calculate_modal_height(leaves)

        fig, ax1 = plotpage(pt.TableDiagnosticData(leaves, {}, {}, [], []))
        figures.append(fig)
        axes.append(ax1)
        fig.suptitle("page %d" % page_number)

        print(leaves.count())
        print("Modal character height: %d" % modal_height)

    return figures, axes
Пример #9
0
 def __init__(self,
              box_list=None,
              top_plot=None,
              left_plot=None,
              x_comb=None,
              y_comb=None):
     """Store the diagnostic inputs on the instance.

     Every argument defaults to None and is then replaced by a fresh empty
     container (``LeafList()``, ``{}``, ``{}``, ``[]``, ``[]`` respectively).
     Default values are evaluated once at definition time, so using the
     containers themselves as defaults would make all instances created
     without arguments share, and mutate, the same objects.
     """
     self.box_list = LeafList() if box_list is None else box_list
     self.top_plot = {} if top_plot is None else top_plot
     self.left_plot = {} if left_plot is None else left_plot
     self.x_comb = [] if x_comb is None else x_comb
     self.y_comb = [] if y_comb is None else y_comb
Пример #10
0
def plotAllPages(fh):
    """Plot every page of the PDF read from ``fh``.

    A ``PDFDocument`` is built from ``fh``.  For each page it yields, the
    leaves of the layout types listed in ``flt`` are collected with
    ``LeafList().populate``, wrapped in a ``pdftables.TableDiagnosticData``
    whose projections and combs are left empty, and passed to ``plotpage``.
    The creator string, each page's ``box_list.count()`` and the value
    returned by ``pdftables.calculate_modal_height`` are printed on the way.

    Returns:
        ``(fig_list, ax1_list)``: the figures and axes returned by
        ``plotpage``, one entry per page, in page order.
    """
    fig_list = []
    ax1_list = []

    pdf = PDFDocument(fh)
    # Parenthesised single-argument print: same output as the Python 2
    # print statement, and valid syntax on Python 3 as well.
    print("Created by: %s" % pdf.get_creator())

    # Layout element types passed to LeafList.populate as `interested`.
    flt = [
        'LTPage', 'LTFigure', 'LTLine', 'LTRect', 'LTImage',
        'LTTextLineHorizontal', 'LTCurve'
    ]

    for page_number, page in enumerate(pdf.get_pages(), 1):
        box_list = LeafList().populate(page, interested=flt)
        modal_height = pdftables.calculate_modal_height(box_list)

        # Only the boxes are plotted here; projections and combs stay empty.
        diagnostic_data = pdftables.TableDiagnosticData(
            box_list, {}, {}, [], [])

        fig, ax1 = plotpage(diagnostic_data)
        fig_list.append(fig)
        ax1_list.append(ax1)
        fig.suptitle("page %d" % page_number)

        print(box_list.count())
        print("Modal character height: %d" % modal_height)

    return fig_list, ax1_list
Пример #11
0
def page_to_tables(page, extend_y=False, hints=None, atomise=False):
    """
    Get a rectangular list of list of strings from one page of a document.

    Args:
        page: the ``LTPage`` to analyse.
        extend_y: when true, the row comb is passed through ``comb_extend``
            with the lowest ``bottom`` and highest ``top`` of the page's
            boxes, and the diagnostic data carries the unfiltered box list.
        hints: forwarded to ``find_table_bounding_box``.  ``None`` (the
            default) is replaced by a fresh empty list.
        atomise: when true, 'LTChar' leaves are collected too, the box list
            is narrowed with ``filterByType(['LTPage', 'LTChar'])`` before
            the combs are built, and every resulting cell is stripped of
            leading and trailing whitespace.

    Returns:
        ``(table_array, diagnostic_data)``.  When the bounding-box search
        yields ``None`` for both ``miny`` and ``maxy``, the result is an
        empty list and a default-constructed ``TableDiagnosticData``.

    Raises:
        TypeError: if ``page`` is not an ``LTPage``.
    """
    if not isinstance(page, LTPage):
        raise TypeError("Page must be LTPage, not {}".format(page.__class__))

    # A fresh list per call instead of a mutable default shared by all calls.
    if hints is None:
        hints = []

    table_array = []

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    if atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    box_list = LeafList().populate(page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(box_list, hints=hints)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        # Parenthesised form works as a Python 2 statement and on Python 3.
        print("found no tables")
        return table_array, TableDiagnosticData()

    if atomise:
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Restrict to the bounding box: first by Leaf._midline against the y
    # bounds, then by Leaf._centreline against the x bounds.
    filtered_box_list = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        filtered_box_list, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(math.floor(calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    x_comb = comb_from_projection(
        column_projection, column_threshold, "column")

    # Pin the outer comb teeth to the bounding box edges.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if extend_y:
        pageminy = min(box.bottom for box in box_list)
        pagemaxy = max(box.top for box in box_list)
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true.  Calling the
    # cell's own strip() accepts any string type and always yields lists,
    # unlike map(unicode.strip, row), which only exists on Python 2.
    if atomise:
        table_array = [[cell.strip() for cell in row] for row in table_array]

    diagnostic_data = TableDiagnosticData(
        filtered_box_list,
        column_projection,
        row_projection,
        x_comb,
        y_comb)

    return table_array, diagnostic_data
Пример #12
0
def page_to_tables(pdf_page, config=None):
    """
    Get a rectangular list of list of strings from one page of a document.

    Args:
        pdf_page: the ``PDFPage`` to analyse.
        config: object providing ``atomise``, ``extend_y``,
            ``table_top_hint`` and ``table_bottom_hint``.  A falsy value
            (including the default ``None``) is replaced by
            ``ConfigParameters()``.

    Returns:
        ``(table_array, diagnostic_data)``.  When the bounding-box search
        yields ``None`` for both ``miny`` and ``maxy``, the result is an
        empty list and a default-constructed ``TableDiagnosticData``.

    Raises:
        TypeError: if ``pdf_page`` is not a ``PDFPage``.
    """
    if not isinstance(pdf_page, PDFPage):
        raise TypeError("Page must be PDFPage, not {}".format(
            pdf_page.__class__))

    if not config:
        config = ConfigParameters()
    table_array = []

    # For LTTextLine horizontal column and row thresholds of 3 work ok
    column_threshold = 5  # 3 works for smaller tables
    row_threshold = 3

    if config.atomise:
        flt = ['LTPage', 'LTTextLineHorizontal', 'LTChar']
    else:
        flt = ['LTPage', 'LTTextLineHorizontal']
    box_list = LeafList().populate(pdf_page, flt).purge_empty_text()

    (minx, maxx, miny, maxy) = find_table_bounding_box(
        box_list, config.table_top_hint, config.table_bottom_hint)

    # If miny and maxy are None then we found no tables and should exit
    if miny is None and maxy is None:
        # Parenthesised form works as a Python 2 statement and on Python 3.
        print("found no tables")
        return table_array, TableDiagnosticData()

    if config.atomise:
        box_list = box_list.filterByType(['LTPage', 'LTChar'])

    # Restrict to the bounding box: first by Leaf._midline against the y
    # bounds, then by Leaf._centreline against the x bounds.
    filtered_box_list = filter_box_list_by_position(
        box_list, miny, maxy, Leaf._midline)
    filtered_box_list = filter_box_list_by_position(
        filtered_box_list, minx, maxx, Leaf._centreline)

    # Project boxes onto horizontal axis
    column_projection = project_boxes(filtered_box_list, "column")

    # Project boxes onto vertical axis
    # Erode row height by a fraction of the modal text box height
    erodelevel = int(math.floor(calculate_modal_height(filtered_box_list) / 4))
    row_projection = project_boxes(
        filtered_box_list, "row", erosion=erodelevel)

    y_comb = comb_from_projection(row_projection, row_threshold, "row")
    y_comb.reverse()

    x_comb = comb_from_projection(
        column_projection, column_threshold, "column")

    # Pin the outer comb teeth to the bounding box edges.
    x_comb[0] = minx
    x_comb[-1] = maxx

    # Extend y_comb to page size if extend_y is true
    if config.extend_y:
        pageminy = min(box.bottom for box in box_list)
        pagemaxy = max(box.top for box in box_list)
        y_comb = comb_extend(y_comb, pageminy, pagemaxy)
        filtered_box_list = box_list

    # Applying the combs
    table_array = apply_combs(box_list, x_comb, y_comb)

    # Strip out leading and trailing spaces when atomise true.  Calling the
    # cell's own strip() accepts any string type and always yields lists,
    # unlike map(unicode.strip, row), which only exists on Python 2.
    if config.atomise:
        table_array = [[cell.strip() for cell in row] for row in table_array]

    diagnostic_data = TableDiagnosticData(filtered_box_list, column_projection,
                                          row_projection, x_comb, y_comb)

    return table_array, diagnostic_data
Пример #13
0
def multi_column_detect(page):
    """
    Experimental multi-column test for a page (under construction).

    Builds two profiles from the non-empty 'LTTextLineHorizontal' boxes of
    ``page`` and plots a histogram of box widths as a side effect.

    Returns:
        ``(pile, projection)`` where ``pile`` is an ``OrderedDict`` keyed by
        vertical coordinate (steps of ``vstep``) holding the summed box
        widths at that coordinate divided by ``maxx - minx``, and
        ``projection`` is the result of ``project_boxes(box_list, 'column')``.

    NOTE(review): the intended result (an integer number of columns plus a
    set of (left, right) pairs delineating them) is not implemented yet.
    """
    # TODO This function is under construction
    # Ways to identify multicolumns:
    # 1. High fill factor compared to tables
    # 2. Gullies at textwidth/2, (textwidth/3, 2*textwidth/3)...
    # 3. Histogram of boxwidths with peak at some fraction of page width
    # This is like project_boxes but we are projecting the length of the
    # textbox onto the axis
    box_list = LeafList().populate(
        page, ['LTPage', 'LTTextLineHorizontal']).purge_empty_text()

    # Should use the LTPage object to get page bounding box
    box_list = filter_box_list_by_type(box_list, 'LTTextLineHorizontal')

    vstep = 5  # should be scaled by modal row height
    # Bounds are rounded with the same step used for the keys below.
    minv = rounder(min(box.bottom for box in box_list), vstep)
    maxv = rounder(max(box.top for box in box_list), vstep)

    minx = round(min(box.left for box in box_list))
    maxx = round(max(box.right for box in box_list))

    # Initialise projection structure: one zeroed slot per vertical step.
    coords = range(int(minv), int(maxv) + vstep, vstep)
    pile = collections.OrderedDict((coord, 0) for coord in coords)
    for box in box_list:
        pile[int(rounder(box.midline, vstep))] += box.width

    # Normalise by the horizontal extent of the boxes.  The items are
    # snapshotted first so values can be reassigned while looping.
    text_width = maxx - minx
    for key, value in list(pile.items()):
        pile[key] = value / text_width

    # Box width histogram
    bstep = 10
    boxwidthmin = rounder(min(box.width for box in box_list), bstep)
    boxwidthmax = rounder(max(box.width for box in box_list), bstep)

    coords = range(int(boxwidthmin), int(boxwidthmax) + bstep, bstep)
    boxhist = collections.OrderedDict((coord, 0) for coord in coords)
    for box in box_list:
        boxhist[int(rounder(box.width, bstep))] += 1

    # Convert counts to fractions of the total number of boxes.
    nboxes = len(box_list)
    for key, value in list(boxhist.items()):
        boxhist[key] = float(value) / float(nboxes)

    # TODO: plt undefined
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # Real lists rather than map(...): on Python 3 map returns an iterator.
    ax1.plot([float(key) for key in boxhist.keys()],
             [float(value) for value in boxhist.values()], color='red')
    plt.show()

    # This is old fashion projection
    projection = project_boxes(box_list, 'column')
    return pile, projection