def test_pandas(self):
        """Extract the first page's table into a DataFrame and sanity-check it.

        The table comes from a crop of the first page, using text-based
        horizontal lines and a single explicit vertical line.  Every column
        except the first is parsed to integers and must sum to twice the value
        in its last row (presumably a totals row -- confirm against the
        fixture).  The characters inside the box (0, 35, PDF_WIDTH, 65) of the
        full page must collate to "November - 2015".
        """
        first_page = self.pdf.pages[0]
        cropped = first_page.crop((0, 80, self.PDF_WIDTH, 485))

        # The smallest character x0 in the crop is the only explicit vertical line.
        leftmost_x0 = min(map(itemgetter("x0"), cropped.chars))
        settings = {
            "horizontal_strategy": "text",
            "explicit_vertical_lines": [leftmost_x0],
            "intersection_tolerance": 5,
        }
        frame = pd.DataFrame(cropped.extract_table(settings))

        def to_int(cell):
            # Null and empty cells carry no number.
            if pd.isnull(cell) or cell == "":
                return None
            return int(cell.replace(",", ""))

        frame.columns = COLUMNS
        value_columns = frame.columns[1:]
        frame[value_columns] = frame[value_columns].applymap(to_int)

        # [1:] because first column is state name
        for name in COLUMNS[1:]:
            last_row_value = frame[name].iloc[-1]
            column_sum = frame[name].sum()
            assert column_sum == last_row_value * 2

        header_chars = within_bbox(page_chars := first_page.chars, (0, 35, self.PDF_WIDTH, 65)) if False else within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
        assert collate_chars(header_chars) == "November - 2015"
    def test_pandas(self):
        """Extract the first page's table with the "gutters" strategy and check it.

        The extracted rows are loaded into a DataFrame, every column except the
        first is parsed to integers, and each of those columns must sum to twice
        the value in its last row (presumably a totals row -- confirm against
        the fixture).  The characters inside the box (0, 35, PDF_WIDTH, 65) of
        the full page must collate to "November - 2015".
        """
        first_page = self.pdf.pages[0]
        cropped = first_page.crop((0, 80, self.PDF_WIDTH, 485))

        rows = cropped.extract_table(
            h="gutters",
            x_tolerance=5,
            y_tolerance=5,
            gutter_min_height=5,
        )
        frame = pd.DataFrame(rows)

        def to_int(cell):
            # Null cells stay empty; everything else is a comma-grouped integer.
            return None if pd.isnull(cell) else int(cell.replace(",", ""))

        frame.columns = COLUMNS
        value_columns = frame.columns[1:]
        frame[value_columns] = frame[value_columns].applymap(to_int)

        # [1:] because first column is state name
        for name in COLUMNS[1:]:
            last_row_value = frame[name].iloc[-1]
            column_sum = frame[name].sum()
            assert column_sum == last_row_value * 2

        header_chars = within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
        header_text = collate_chars(header_chars, x_tolerance=2)
        assert header_text == "November - 2015"
    def test_plain(self):
        """Extract the first page's table without pandas and sanity-check it.

        The table comes from a crop of the first page, using text-based
        horizontal lines and a single explicit vertical line.  Each row is
        turned into a dict keyed by COLUMNS.  For every column except the
        first, the sum over all rows must equal twice the last row's value
        (presumably a totals row -- confirm against the fixture).  The
        characters inside the box (0, 35, PDF_WIDTH, 65) of the full page must
        collate to "November - 2015".
        """
        page = self.pdf.pages[0]
        cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
        table = cropped.extract_table({
            "horizontal_strategy": "text",
            # The smallest character x0 in the crop is the only explicit line.
            "explicit_vertical_lines": [min(map(itemgetter("x0"), cropped.chars))],
            "intersection_tolerance": 5,
        })

        def parse_value(k, x):
            """Return column 0 unchanged; parse other cells as ints, blanks as None."""
            if k == 0:
                return x
            if x in (None, ""):
                return None
            return int(x.replace(",", ""))

        def parse_row(row):
            """Map a raw row onto COLUMNS by position."""
            return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

        parsed_table = [parse_row(row) for row in table]

        # [1:] because first column is state name
        for c in COLUMNS[1:]:
            total = parsed_table[-1][c]
            # Blank cells were parsed to None; count them as zero.
            colsum = sum(row[c] or 0 for row in parsed_table)
            assert colsum == total * 2

        month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
        month_text = collate_chars(month_chars)
        assert month_text == "November - 2015"
    def test_plain(self):
        """Extract the first page's table with "gutters" and check it without pandas.

        Each extracted row is turned into a dict keyed by COLUMNS.  For every
        column except the first, the sum over all rows must equal twice the
        last row's value (presumably a totals row -- confirm against the
        fixture).  The characters inside the box (0, 35, PDF_WIDTH, 65) of the
        full page must collate to "November - 2015".
        """
        page = self.pdf.pages[0]
        cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
        table = cropped.extract_table(h="gutters",
                                      x_tolerance=5,
                                      y_tolerance=5,
                                      gutter_min_height=5)

        def parse_value(k, x):
            """Return column 0 unchanged; parse other cells as ints, None stays None."""
            if k == 0:
                return x
            # Identity check: None is a singleton and == can be overridden.
            if x is None:
                return None
            return int(x.replace(",", ""))

        def parse_row(row):
            """Map a raw row onto COLUMNS by position."""
            return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

        parsed_table = [parse_row(row) for row in table]

        # [1:] because first column is state name
        for c in COLUMNS[1:]:
            total = parsed_table[-1][c]
            # Missing cells were parsed to None; count them as zero.
            colsum = sum(row[c] or 0 for row in parsed_table)
            assert colsum == total * 2

        month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
        month_text = collate_chars(month_chars, x_tolerance=2)
        assert month_text == "November - 2015"
    def test_pandas(self):
        """Check the "gutters"-extracted first-page table via a DataFrame.

        Every column except the first is parsed to integers and must sum to
        twice the value in its last row (presumably a totals row -- confirm
        against the fixture).  The characters inside the box
        (0, 35, PDF_WIDTH, 65) of the full page must collate to
        "November - 2015".
        """
        pdf_page = self.pdf.pages[0]
        table_area = pdf_page.crop((0, 80, self.PDF_WIDTH, 485))

        extraction_options = {
            "h": "gutters",
            "x_tolerance": 5,
            "y_tolerance": 5,
            "gutter_min_height": 5,
        }
        df = pd.DataFrame(table_area.extract_table(**extraction_options))

        def as_number(raw):
            # Null cells stay empty; everything else is a comma-grouped integer.
            if pd.isnull(raw):
                return None
            digits = raw.replace(",", "")
            return int(digits)

        df.columns = COLUMNS
        numeric = df.columns[1:]
        df[numeric] = df[numeric].applymap(as_number)

        # [1:] because first column is state name
        for column_name in COLUMNS[1:]:
            final_value = df[column_name].iloc[-1]
            summed = df[column_name].sum()
            assert summed == final_value * 2

        title_chars = within_bbox(pdf_page.chars, (0, 35, self.PDF_WIDTH, 65))
        title = collate_chars(title_chars, x_tolerance=2)
        assert title == "November - 2015"
 def precinct(self):
     """Build a precinct identifier from the left half of the "h1" box.

     Characters inside the narrowed box are grouped by their "top" value and
     collated; the last resulting line is split on runs of two or more
     whitespace characters, and the second and third pieces are joined
     with "|".
     """
     left_box = list(self.bboxes["h1"])
     # Halve the second-to-last coordinate (presumably the right edge -- confirm).
     left_box[-2] = float(left_box[-2]) / 2

     left_chars = within_bbox(self.chars, left_box)
     collated_lines = left_chars.groupby("top").apply(_collate_chars)
     pieces = re.split(r"\s{2,}", collated_lines.iloc[-1])
     return "|".join(pieces[1:3])
 def precinct(self):
     """Return "<piece 2>|<piece 3>" taken from the last line of the left "h1" half.

     The "h1" box is copied and its second-to-last coordinate is halved before
     selecting characters; lines are formed by grouping on "top" and the final
     line is split wherever two or more whitespace characters occur.
     """
     box = list(self.bboxes["h1"])
     # NOTE(review): index -2 looks like the box's right edge -- confirm.
     box[-2] = float(box[-2]) / 2

     selected = within_bbox(self.chars, box)
     last_line = selected.groupby("top").apply(_collate_chars).iloc[-1]
     fields = re.split(r"\s{2,}", last_line)
     identifier = "|".join(fields[1:3])
     return identifier
Пример #8
0
 def objects(self):
     """Return the parent page's objects restricted to this object's bbox.

     The result is stored on ``self._objects`` the first time it is computed
     and returned from there afterwards.  The filter is called with
     ``strict=True`` when ``self.strict`` is truthy and ``crop=True`` otherwise.
     """
     if not hasattr(self, "_objects"):
         options = {"strict": True} if self.strict else {"crop": True}
         self._objects = utils.within_bbox(
             self.parent_page.objects, self.bbox, **options
         )
     return self._objects
Пример #9
0
 def objects(self):
     """Compute once, then reuse, the parent page objects inside ``self.bbox``.

     A previously stored ``self._objects`` is returned as-is.  Otherwise the
     parent page's objects are filtered with ``strict=True`` when
     ``self.strict`` is truthy, or with ``crop=True`` when it is not, and the
     result is stored on ``self._objects`` before being returned.
     """
     if hasattr(self, "_objects"):
         return self._objects

     if self.strict:
         found = utils.within_bbox(self.parent_page.objects, self.bbox, strict=True)
     else:
         found = utils.within_bbox(self.parent_page.objects, self.bbox, crop=True)

     self._objects = found
     return self._objects
 def results(self):
     """Return the parsed contents of boxes "c1" to "c4" as one list.

     For each box, in that order, the characters inside it are passed to
     ``self.parse_col`` and the items it returns are appended to the result.
     """
     collected = []
     for column_key in ("c1", "c2", "c3", "c4"):
         column_chars = within_bbox(self.chars, self.bboxes[column_key])
         collected.extend(self.parse_col(column_chars))
     return collected
 def registered_voters(self):
     """Return the integer that precedes " REGISTERED VOTERS" in the "h2" box.

     Characters inside the box are grouped by "top" and collated; the second
     resulting line must start with digits followed by " REGISTERED VOTERS".
     """
     header_chars = within_bbox(self.chars, self.bboxes["h2"])
     header_lines = header_chars.groupby("top").apply(collate_chars)
     second_line = header_lines.iloc[1]
     found = re.match(r"(\d+) REGISTERED VOTERS", second_line)
     return int(found.group(1))
 def ballots_cast(self):
     """Return the integer that precedes " BALLOTS CAST" in the "h2" box.

     Characters inside the box are grouped by "top" and collated; the first
     resulting line must start with digits followed by " BALLOTS CAST".
     """
     header_chars = within_bbox(self.chars, self.bboxes["h2"])
     header_lines = header_chars.groupby("top").apply(collate_chars)
     first_line = header_lines.iloc[0]
     found = re.match(r"(\d+) BALLOTS CAST", first_line)
     return int(found.group(1))
 def results(self):
     """Parse boxes "c1", "c2", "c3" and "c4" in turn and concatenate the output.

     Each box's characters are handed to ``self.parse_col``; whatever it
     returns is added to the end of a single list, which is returned.
     """
     rows = []
     for box_name in ("c1", "c2", "c3", "c4"):
         chars_in_box = within_bbox(self.chars, self.bboxes[box_name])
         parsed = self.parse_col(chars_in_box)
         rows += parsed
     return rows
 def registered_voters(self):
     """Read the registered-voter count from the second line of the "h2" box.

     Lines are formed by grouping the box's characters on "top" and collating
     each group; the second line must start with digits followed by
     " REGISTERED VOTERS", and those digits are returned as an int.
     """
     lines = within_bbox(self.chars, self.bboxes["h2"]).groupby("top").apply(
         _collate_chars
     )
     match = re.match(r"(\d+) REGISTERED VOTERS", lines.iloc[1])
     count_text = match.group(1)
     return int(count_text)
 def ballots_cast(self):
     """Read the ballots-cast count from the first line of the "h2" box.

     Lines are formed by grouping the box's characters on "top" and collating
     each group; the first line must start with digits followed by
     " BALLOTS CAST", and those digits are returned as an int.
     """
     lines = within_bbox(self.chars, self.bboxes["h2"]).groupby("top").apply(
         _collate_chars
     )
     match = re.match(r"(\d+) BALLOTS CAST", lines.iloc[0])
     count_text = match.group(1)
     return int(count_text)
    def test_plain(self):
        """Extract the first page's table with "gutters" and check it without pandas.

        Each extracted row is turned into a dict keyed by COLUMNS.  For every
        column except the first, the sum over all rows must equal twice the
        last row's value (presumably a totals row -- confirm against the
        fixture).  The characters inside the box (0, 35, PDF_WIDTH, 65) of the
        full page must collate to "November - 2015".
        """
        page = self.pdf.pages[0]
        cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
        table = cropped.extract_table(h="gutters", x_tolerance=5, y_tolerance=5, gutter_min_height=5)

        def parse_value(k, x):
            """Return column 0 unchanged; parse other cells as ints, None stays None."""
            if k == 0:
                return x
            # Identity check: None is a singleton and == can be overridden.
            if x is None:
                return None
            return int(x.replace(",", ""))

        def parse_row(row):
            """Map a raw row onto COLUMNS by position."""
            return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

        parsed_table = [parse_row(row) for row in table]

        # [1:] because first column is state name
        for c in COLUMNS[1:]:
            total = parsed_table[-1][c]
            # Missing cells were parsed to None; count them as zero.
            colsum = sum(row[c] or 0 for row in parsed_table)
            assert colsum == (total * 2)

        month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
        month_text = collate_chars(month_chars, x_tolerance=2)
        assert month_text == "November - 2015"