示例#1
0
    def test_issue_13(self):
        """
        Count the checked checkboxes on the first page of the issue-13 sample.

        Slightly simplified from the gist at
        https://github.com/jsvine/pdfplumber/issues/13
        """
        pdf_path = os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf")
        pdf = pdfplumber.from_path(pdf_path)

        # Nominal checkbox dimensions and the slack allowed around them.
        RECT_WIDTH = 9.3
        RECT_HEIGHT = 9.3
        RECT_TOLERANCE = 2

        def is_checkbox_sized(box):
            # Height must lie strictly within RECT_TOLERANCE of RECT_HEIGHT.
            # NOTE(review): width is only bounded from above; a matching lower
            # bound may have been intended. Confirm against the fixture before
            # tightening, because the expected count below depends on it.
            height_ok = (
                box['height'] > RECT_HEIGHT - RECT_TOLERANCE
                and box['height'] < RECT_HEIGHT + RECT_TOLERANCE
            )
            return height_ok and box['width'] < RECT_WIDTH + RECT_TOLERANCE

        def overlaps(box, curve):
            # Bounding boxes overlap when their extents intersect on both axes
            # (touching edges count as overlapping).
            x_overlap = max(box['x0'], curve['x0']) <= min(box['x1'], curve['x1'])
            y_overlap = max(box['y0'], curve['y0']) <= min(box['y1'], curve['y1'])
            return x_overlap and y_overlap

        def filter_rects(rects):
            # Keep only the rects that are the right size to be checkboxes.
            return [rect for rect in rects if is_checkbox_sized(rect)]

        def determine_if_checked(checkbox, curve_list):
            # A checkbox counts as checked when it is checkbox-sized and the
            # bounding box of at least one curve (one stroke of the 'x')
            # overlaps it. This isn't foolproof, but works for this case.
            # There should be *two* matching curves per checked box; only the
            # presence of at least one is tested here.
            return any(
                is_checkbox_sized(checkbox) and overlaps(checkbox, curve)
                for curve in curve_list
            )

        first_page = pdf.pages[0]
        curves = first_page.objects["curve"]
        checkboxes = filter_rects(first_page.objects["rect"])

        n_checked = sum(determine_if_checked(box, curves) for box in checkboxes)

        assert n_checked == 5
示例#2
0
    def test_rotation(self):
        """Compare page geometry of the sample PDF with its rotated copy.

        The rotated file must have width and height swapped and a different
        bbox, while its cropbox stays equal to the unrotated one.
        """
        rotated_path = os.path.join(
            HERE, "pdfs/nics-background-checks-2015-11-rotated.pdf"
        )
        rotated = pdfplumber.from_path(rotated_path)

        upright_page = self.pdf.pages[0]
        rotated_page = rotated.pages[0]

        # Unrotated page is landscape: 1008 wide by 612 high.
        assert upright_page.width == 1008
        assert upright_page.height == 612

        # Rotated page has the two dimensions swapped.
        assert rotated_page.width == 612
        assert rotated_page.height == 1008

        assert rotated_page.cropbox == upright_page.cropbox
        assert rotated_page.bbox != upright_page.bbox
示例#3
0
def extract(pdf_path: str, filter=None, flavor='lattice', lang: str = 'eng', **imgOcrSettings):
    '''
    Extract table data from a PDF.

    Tables are first extracted with camelot. If the collected tables do not
    cover every page, each remaining page is handed to ``extract_imgbase``
    (the OCR-based fallback) and its result is merged in.

    :param pdf_path: path of the PDF file to read.
    :param filter: forwarded unchanged to ``extract_imgbase``. The name
        shadows the builtin but is kept so keyword callers keep working.
    :param flavor: forwarded to ``camelot.read_pdf`` and ``extract_imgbase``.
    :param lang: forwarded to ``extract_imgbase``.
    :param imgOcrSettings: extra keyword arguments forwarded to
        ``extract_imgbase``.
    :return: the list of tables accumulated via ``merge_table`` and
        ``merge_tables``.
    '''
    pdf = pdfplumber.from_path(pdf_path)
    try:
        total_page = len(pdf.pages)

        tables: "list[PageTable]" = []

        # Extract tables with camelot.
        print('use camelot extract tables')
        camelot_tables = camelot.read_pdf(pdf_path, pages='all',
                                          flavor=flavor, suppress_stdout=False)

        for t in camelot_tables:
            # camelot page numbers are 1-based; pdf.pages is 0-based.
            text = pdf.pages[t.page - 1].extract_text()
            merge_table(tables, t.page, t.data, text)

        # Every page is covered: nothing left for the OCR fallback.
        if len(tables) == total_page:
            return tables

        # Otherwise extract the tables of the remaining pages via OCR.
        extracted_pages = {t.page for t in tables}
        all_pages = set(range(1, total_page + 1))

        # Sorted so pages are processed (and merged) in a deterministic,
        # ascending order instead of arbitrary set order.
        for page_number in sorted(all_pages - extracted_pages):
            other_tables = extract_imgbase(
                pdf, page_number, flavor, lang, filter, **imgOcrSettings)

            if other_tables is not None:
                merge_tables(tables, page_number, other_tables)

        return tables
    finally:
        # Close the PDF on every exit path, including the early return above
        # and any exception raised during extraction.
        pdf.close()
示例#4
0
 def test_issue_33(self):
     """The issue-33 sample PDF must expose at least one metadata key."""
     pdf_path = os.path.join(HERE, "pdfs/issue-33-lorem-ipsum.pdf")
     pdf = pdfplumber.from_path(pdf_path)
     metadata_keys = pdf.metadata.keys()
     assert len(metadata_keys) > 0
示例#5
0
 def test_issue_21(self):
     """The issue-21 sample PDF must yield a non-empty ``objects`` collection."""
     pdf_path = os.path.join(HERE, "pdfs/150109DSP-Milw-505-90D.pdf")
     pdf = pdfplumber.from_path(pdf_path)
     assert len(pdf.objects) > 0
示例#6
0
 def test_issue_14(self):
     """The issue-14 sample PDF must yield a non-empty ``objects`` collection."""
     pdf_path = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
     pdf = pdfplumber.from_path(pdf_path)
     assert len(pdf.objects) > 0
示例#7
0
 def setUp(self):
     """Open the NICS background-checks sample PDF and store it on ``self.pdf``."""
     self.pdf = pdfplumber.from_path(
         os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     )
 def setUp(self):
     """Open a fixed local PDF and record the width of its first page."""
     print('i am running')
     # NOTE(review): absolute, machine-specific path; this fixture only works
     # where that file exists.
     pdf_location = '/home/sxs/yuhsuan/datasets/personal/003.pdf'
     self.pdf = pdfplumber.from_path(pdf_location)
     first_page = self.pdf.pages[0]
     self.PDF_WIDTH = first_page.width
     self.PDF_WIDTH = self.pdf.pages[0].width
 def setUp(self):
     """Open the NICS background-checks sample PDF and record its first-page width."""
     self.pdf = pdfplumber.from_path(
         os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     )
     first_page = self.pdf.pages[0]
     self.PDF_WIDTH = first_page.width
 def setUp(self):
     """Load the NICS background-checks fixture; keep the PDF and its page-one width."""
     fixture = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     opened = pdfplumber.from_path(fixture)
     self.pdf = opened
     self.PDF_WIDTH = opened.pages[0].width
 def setUp(self):
     """Open the WARN report sample PDF and record the width of its first page."""
     self.pdf = pdfplumber.from_path(
         os.path.join(HERE, "pdfs/WARN-Report-for-7-1-2015-to-03-25-2016.pdf")
     )
     first_page = self.pdf.pages[0]
     self.PDF_WIDTH = first_page.width
 def setUp(self):
     """Open the LA precinct bulletin sample PDF and record its first-page width."""
     self.pdf = pdfplumber.from_path(
         os.path.join(HERE, "pdfs/la-precinct-bulletin-2014-p1.pdf")
     )
     first_page = self.pdf.pages[0]
     self.PDF_WIDTH = first_page.width
 def test_load(self):
     """Opening the Cupertino USD sample PDF must complete without raising."""
     pdf = pdfplumber.from_path(
         os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
     )
示例#14
0
 def test_issue_53(self):
     """The issue-53 sample PDF must yield a non-empty ``objects`` collection."""
     pdf_path = os.path.join(HERE, "pdfs/issue-53-example.pdf")
     pdf = pdfplumber.from_path(pdf_path)
     assert len(pdf.objects) > 0
示例#15
0
 def setUp(self):
     """Load the WARN report fixture; keep the PDF and its page-one width."""
     fixture = os.path.join(HERE, "pdfs/WARN-Report-for-7-1-2015-to-03-25-2016.pdf")
     opened = pdfplumber.from_path(fixture)
     self.pdf = opened
     self.PDF_WIDTH = opened.pages[0].width
示例#16
0
 def test_issue_67(self):
     """The issue-67 sample PDF must expose at least one metadata key."""
     pdf_path = os.path.join(HERE, "pdfs/issue-67-example.pdf")
     pdf = pdfplumber.from_path(pdf_path)
     metadata_keys = pdf.metadata.keys()
     assert len(metadata_keys) > 0
 def setUp(self):
     """Load the LA precinct bulletin fixture; keep the PDF and its page-one width."""
     fixture = os.path.join(HERE, "pdfs/la-precinct-bulletin-2014-p1.pdf")
     opened = pdfplumber.from_path(fixture)
     self.pdf = opened
     self.PDF_WIDTH = opened.pages[0].width