示例#1
0
    def test_plain(self):
        """Extract a table from the first page and sanity-check its contents.

        The table is read from a cropped region of the page, each row is
        converted into a dict keyed by ``COLUMNS``, and every numeric column
        is checked against the value in the last row.  The text found in a
        band near the top of the page is checked as well.
        """
        first_page = self.pdf.pages[0]
        table_area = first_page.crop((0, 80, self.PDF_WIDTH, 485))

        # The smallest x0 among the cropped chars is the only explicit
        # vertical line handed to the table finder.
        left_edge = min(char["x0"] for char in table_area.chars)
        settings = {
            "horizontal_strategy": "text",
            "explicit_vertical_lines": [left_edge],
            "intersection_tolerance": 5,
        }
        raw_rows = table_area.extract_table(settings)

        def convert(position, raw):
            # Column 0 is passed through untouched; empty cells become None;
            # everything else is an integer with thousands separators.
            if position == 0:
                return raw
            if raw in (None, ""):
                return None
            return int(raw.replace(",", ""))

        def to_record(cells):
            return {
                COLUMNS[index]: convert(index, cell)
                for index, cell in enumerate(cells)
            }

        records = [to_record(cells) for cells in raw_rows]

        # Skip COLUMNS[0]: the first column is the state name.
        for column in COLUMNS[1:]:
            last_row_value = records[-1][column]
            # The sum covers every row, the last one included, so it is
            # expected to be exactly twice the last row's value.
            column_sum = sum(record[column] or 0 for record in records)
            assert column_sum == (last_row_value * 2)

        heading_chars = within_bbox(
            first_page.chars, (0, 35, self.PDF_WIDTH, 65))
        assert extract_text(heading_chars) == "November - 2015"
示例#2
0
    def extract_text(self,
                     x_tolerance=utils.DEFAULT_X_TOLERANCE,
                     y_tolerance=utils.DEFAULT_Y_TOLERANCE):
        """Return the result of ``utils.extract_text`` for ``self.chars``.

        ``x_tolerance`` and ``y_tolerance`` are forwarded unchanged as
        keyword arguments; they default to the corresponding ``utils``
        constants.
        """
        chars = self.chars
        return utils.extract_text(
            chars,
            x_tolerance=x_tolerance,
            y_tolerance=y_tolerance,
        )
示例#3
0
 def test_extract_text_layout(self):
     """Check layout-mode text extraction against a stored comparison file.

     The page-level ``extract_text(layout=True)`` result must equal both
     the output of ``utils.extract_text`` on the same chars and the
     contents of ``comparisons/scotus-transcript-p1.txt``.
     """
     target_path = os.path.join(HERE, "comparisons/scotus-transcript-p1.txt")
     # Use a context manager so the comparison file is closed right after
     # reading instead of leaving the handle to the garbage collector.
     with open(target_path) as target_file:
         target = target_file.read()
     page = self.pdf_scotus.pages[0]
     text = page.extract_text(layout=True)
     utils_text = utils.extract_text(page.chars, layout=True)
     assert text == utils_text
     assert text == target
示例#4
0
    def extract(self,
                x_tolerance=utils.DEFAULT_X_TOLERANCE,
                y_tolerance=utils.DEFAULT_Y_TOLERANCE):
        """Return the table's text as a list of rows of cell values.

        For every row in ``self.rows`` and every cell in ``row.cells`` the
        result holds:

        * ``None`` when the cell itself is ``None``;
        * ``""`` when no character's midpoint falls inside the cell;
        * otherwise the stripped text produced by ``utils.extract_text``
          from the characters inside the cell.

        Characters assigned to a cell are also appended to
        ``self.used_chars``.

        Args:
            x_tolerance: forwarded to ``utils.extract_text``.
            y_tolerance: forwarded to ``utils.extract_text``.

        Returns:
            A list (one entry per row) of lists (one entry per cell).
        """
        chars = self.page.chars
        table_arr = []

        def char_in_bbox(char, bbox):
            # A char belongs to a bbox when its midpoint lies inside it.
            # The bounds are half-open (>= low edge, < high edge), so a
            # midpoint on a shared edge matches only one of two neighbours.
            v_mid = (char["top"] + char["bottom"]) / 2
            h_mid = (char["x0"] + char["x1"]) / 2
            x0, top, x1, bottom = bbox
            return ((h_mid >= x0) and (h_mid < x1) and (v_mid >= top)
                    and (v_mid < bottom))

        for row in self.rows:
            arr = []
            # Narrow down to the row's chars once, so each cell only has to
            # scan this subset rather than every char on the page.
            row_chars = [
                char for char in chars if char_in_bbox(char, row.bbox)
            ]

            for cell in row.cells:
                # Identity check: `== None` would go through the cell's
                # __eq__ instead of testing for the None singleton.
                if cell is None:
                    cell_text = None
                else:
                    cell_chars = [
                        char for char in row_chars if char_in_bbox(char, cell)
                    ]

                    if cell_chars:
                        self.used_chars += cell_chars
                        cell_text = utils.extract_text(
                            cell_chars,
                            x_tolerance=x_tolerance,
                            y_tolerance=y_tolerance).strip()
                    else:
                        cell_text = ""
                arr.append(cell_text)
            table_arr.append(arr)

        return table_arr
def parse_page(page):
    """Parse one PDF page into a DataFrame of per-state rows.

    The month is read from the page's chars whose ``non_stroking_color`` is
    ``(1, 0, 0)``, and is prepended to every extracted table row.  The table
    is taken from the region between the bottom of the word "State" and the
    bottom of the page's last rect.

    Args:
        page: a page object exposing ``chars``, ``rects``, ``width``,
            ``extract_words()`` and ``crop()``.

    Returns:
        A DataFrame with columns ``COLUMNS``, with ``parse_value`` applied
        to every column from the third onward and rows with a blank
        ``state`` removed.

    Raises:
        Exception: if ``validate_data`` raises for the parsed table; the
            original error is chained as ``__cause__``.
    """
    # NOTE(review): (1, 0, 0) is presumably pure red in RGB -- confirm.
    month_chars = [
        c for c in page.chars if c["non_stroking_color"] == (1, 0, 0)
    ]
    month_text = extract_text(month_chars, x_tolerance=2)
    month = parse_month(month_text)
    # "\r" rewrites the same stderr line for each page processed.
    sys.stderr.write("\r" + month)

    table_crop = page.crop((
        0,
        [w for w in page.extract_words() if w["text"] == "State"][0]["bottom"],
        page.width,
        page.rects[-1]["bottom"],
    ))

    # De-duplicated x0 positions of the edges inside the cropped region.
    edge_xs = list(set(map(itemgetter("x0"), table_crop.edges)))
    leftmost_char = min(map(itemgetter("x0"), table_crop.chars))

    _table = table_crop.extract_table({
        "horizontal_strategy": "text",
        "vertical_strategy": "explicit",
        # The leftmost char position is added as an extra vertical line
        # alongside the edge positions.
        "explicit_vertical_lines": [leftmost_char] + edge_xs,
        "intersection_tolerance": 5,
        "text_y_tolerance": 0,
        "text_x_tolerance": 2,
    })

    table = pd.DataFrame([[month] + row for row in _table])

    table.columns = COLUMNS
    # The first two columns are left as-is; the rest go through parse_value.
    table[table.columns[2:]] = table[table.columns[2:]].applymap(parse_value)

    # Normalize the value "llinois" to "Illinois" (presumably a recurring
    # extraction glitch -- confirm against the source PDFs).
    table.loc[(table["state"] == "llinois"), "state"] = "Illinois"
    # Drop rows whose state is missing or blank.
    table = table.loc[lambda df: df["state"].fillna("").str.strip() != ""]

    try:
        validate_data(table)
    except Exception as err:
        # Catch Exception, not everything, so KeyboardInterrupt/SystemExit
        # still propagate; chain the cause so the real failure stays visible.
        raise Exception("Invalid data for " + month) from err

    return table
示例#6
0
 def extract_text(self, x_tolerance=0, y_tolerance=0):
     """Return the result of ``utils.extract_text`` for ``self.chars``.

     Both tolerances default to 0 and are forwarded unchanged as keyword
     arguments.
     """
     chars = self.chars
     return utils.extract_text(
         chars, x_tolerance=x_tolerance, y_tolerance=y_tolerance
     )
示例#7
0
 def extract_text(self, x_tolerance=0, y_tolerance=0):
     """Delegate text extraction for ``self.chars`` to ``utils.extract_text``.

     ``x_tolerance`` and ``y_tolerance`` (both 0 by default) are passed
     through as keyword arguments, and the helper's result is returned
     as-is.
     """
     source_chars = self.chars
     extracted = utils.extract_text(
         source_chars,
         x_tolerance=x_tolerance,
         y_tolerance=y_tolerance,
     )
     return extracted