def parse_results_line(chars):
    """Split one results line into text, affiliation and vote-count fields.

    ``chars`` is filtered with boolean masks built from its ``"x0rel"``
    column (so presumably a pandas DataFrame of characters -- confirm
    against callers). Rows are assigned to a field by ``x0rel``:

    * ``x0rel < 125``        -> ``"text"``
    * ``125 < x0rel < 155``  -> ``"aff"``
    * ``x0rel > 155``        -> ``"votes"`` (passed through ``int``)

    A field is ``None`` when no rows fall inside its band.

    NOTE(review): every comparison is strict, so rows whose ``x0rel`` is
    exactly 125 or 155 land in no band -- confirm this is intended.
    """
    x0rel = chars["x0rel"]

    text_rows = chars[x0rel < 125]
    text = None if len(text_rows) == 0 else collate_chars(text_rows)

    vote_rows = chars[x0rel > 155]
    votes = None if len(vote_rows) == 0 else int(collate_chars(vote_rows))

    aff_rows = chars[(x0rel > 125) & (x0rel < 155)]
    aff = None if len(aff_rows) == 0 else collate_chars(aff_rows)

    return {"text": text, "aff": aff, "votes": votes}
def test_pandas(self):
    """Load the first page's table into pandas and check totals and header.

    The page is cropped to (0, 80, PDF_WIDTH, 485) and the table is
    extracted with text-based horizontal rules plus one explicit vertical
    line at the smallest ``x0`` among the cropped characters. Every column
    after the first is converted to integers; each of those columns must
    then sum to twice its last-row value (presumably the last row is a
    totals row, so the sum counts the total twice -- confirm). Finally the
    text collated from the box (0, 35, PDF_WIDTH, 65) must read
    "November - 2015".
    """
    first_page = self.pdf.pages[0]
    body = first_page.crop((0, 80, self.PDF_WIDTH, 485))

    leftmost_x0 = min(map(itemgetter("x0"), body.chars))
    settings = {
        "horizontal_strategy": "text",
        "explicit_vertical_lines": [leftmost_x0],
        "intersection_tolerance": 5,
    }
    frame = pd.DataFrame(body.extract_table(settings))

    def to_int(cell):
        # Null and empty cells stay None; other cells have their commas
        # stripped before being handed to int().
        if pd.isnull(cell) or cell == "":
            return None
        return int(cell.replace(",", ""))

    frame.columns = COLUMNS
    # [1:] because first column is state name
    numeric_cols = frame.columns[1:]
    frame[numeric_cols] = frame[numeric_cols].applymap(to_int)

    for name in COLUMNS[1:]:
        column = frame[name]
        last_row_value = column.iloc[-1]
        assert column.sum() == last_row_value * 2

    header_chars = within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
    assert collate_chars(header_chars) == "November - 2015"
def test_plain(self):
    """Extract the first page's table with plain Python and check it.

    The page is cropped to (0, 80, PDF_WIDTH, 485) and the table is
    extracted with text-based horizontal rules plus one explicit vertical
    line at the smallest ``x0`` among the cropped characters. Each row
    becomes a dict keyed by ``COLUMNS``; every column after the first is
    parsed as an integer (``None`` for missing or empty cells). Each of
    those columns must sum to twice its last-row value (presumably the
    last row is a totals row, so the sum counts the total twice --
    confirm). Finally the text collated from the box
    (0, 35, PDF_WIDTH, 65) must read "November - 2015".
    """
    page = self.pdf.pages[0]
    cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
    table = cropped.extract_table({
        "horizontal_strategy": "text",
        "explicit_vertical_lines": [
            min(map(itemgetter("x0"), cropped.chars)),
        ],
        "intersection_tolerance": 5,
    })

    def parse_value(k, x):
        # Column 0 is passed through untouched; all other columns are
        # comma-formatted integers, with None / "" meaning "no value".
        if k == 0:
            return x
        if x in (None, ""):
            return None
        return int(x.replace(",", ""))

    def parse_row(row):
        return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

    parsed_table = [parse_row(row) for row in table]

    # [1:] because first column is state name
    for c in COLUMNS[1:]:
        total = parsed_table[-1][c]
        # Missing values (None) count as zero in the column sum.
        colsum = sum(row[c] or 0 for row in parsed_table)
        assert colsum == (total * 2)

    month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
    month_text = collate_chars(month_chars)
    assert month_text == "November - 2015"
def test_pandas(self):
    """Load the first page's table into pandas and check totals and header.

    The page is cropped to (0, 80, PDF_WIDTH, 485) and the table is
    extracted with ``h="gutters"`` and tolerances of 5. Every column after
    the first is converted to integers; each of those columns must then
    sum to twice its last-row value (presumably the last row is a totals
    row, so the sum counts the total twice -- confirm). Finally the text
    collated (``x_tolerance=2``) from the box (0, 35, PDF_WIDTH, 65) must
    read "November - 2015".
    """
    first_page = self.pdf.pages[0]
    body = first_page.crop((0, 80, self.PDF_WIDTH, 485))
    rows = body.extract_table(
        h="gutters",
        x_tolerance=5,
        y_tolerance=5,
        gutter_min_height=5,
    )
    frame = pd.DataFrame(rows)

    def to_int(cell):
        # Null cells stay None; other cells have their commas stripped
        # before being handed to int().
        return None if pd.isnull(cell) else int(cell.replace(",", ""))

    frame.columns = COLUMNS
    # [1:] because first column is state name
    numeric_cols = frame.columns[1:]
    frame[numeric_cols] = frame[numeric_cols].applymap(to_int)

    for name in COLUMNS[1:]:
        column = frame[name]
        last_row_value = column.iloc[-1]
        assert column.sum() == last_row_value * 2

    header_chars = within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
    assert collate_chars(header_chars, x_tolerance=2) == "November - 2015"
def test_pandas(self):
    """Check the first page's table (via pandas) and its month header.

    Steps:

    1. Crop page 0 to (0, 80, PDF_WIDTH, 485) and extract the table with
       ``h="gutters"`` and tolerances of 5.
    2. Name the columns from ``COLUMNS`` and convert every column after
       the first to integers (null cells become ``None``).
    3. Assert each converted column sums to twice its last-row value
       (presumably the last row holds totals -- confirm).
    4. Assert the text collated (``x_tolerance=2``) from the box
       (0, 35, PDF_WIDTH, 65) is "November - 2015".
    """
    pdf_page = self.pdf.pages[0]
    table_area = pdf_page.crop((0, 80, self.PDF_WIDTH, 485))
    extract_options = {
        "h": "gutters",
        "x_tolerance": 5,
        "y_tolerance": 5,
        "gutter_min_height": 5,
    }
    df = pd.DataFrame(table_area.extract_table(**extract_options))

    def as_number(raw):
        if pd.isnull(raw):
            return None
        # Commas are stripped before the cell is handed to int().
        without_commas = raw.replace(",", "")
        return int(without_commas)

    df.columns = COLUMNS
    # [1:] because first column is state name
    value_columns = df.columns[1:]
    df[value_columns] = df[value_columns].applymap(as_number)

    for col in COLUMNS[1:]:
        expected = df[col].iloc[-1] * 2
        actual = df[col].sum()
        assert actual == expected

    header = within_bbox(pdf_page.chars, (0, 35, self.PDF_WIDTH, 65))
    header_text = collate_chars(header, x_tolerance=2)
    assert header_text == "November - 2015"
def test_plain(self):
    """Extract the first page's table with plain Python and check it.

    The page is cropped to (0, 80, PDF_WIDTH, 485) and the table is
    extracted with ``h="gutters"`` and tolerances of 5. Each row becomes a
    dict keyed by ``COLUMNS``; every column after the first is parsed as
    an integer (``None`` for missing cells). Each of those columns must
    sum to twice its last-row value (presumably the last row is a totals
    row, so the sum counts the total twice -- confirm). Finally the text
    collated (``x_tolerance=2``) from the box (0, 35, PDF_WIDTH, 65) must
    read "November - 2015".
    """
    page = self.pdf.pages[0]
    cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
    table = cropped.extract_table(
        h="gutters",
        x_tolerance=5,
        y_tolerance=5,
        gutter_min_height=5,
    )

    def parse_value(k, x):
        # Column 0 is passed through untouched; all other columns are
        # comma-formatted integers, with None meaning "no value".
        if k == 0:
            return x
        if x is None:
            return None
        return int(x.replace(",", ""))

    def parse_row(row):
        return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

    parsed_table = [parse_row(row) for row in table]

    # [1:] because first column is state name
    for c in COLUMNS[1:]:
        total = parsed_table[-1][c]
        # Missing values (None) count as zero in the column sum.
        colsum = sum(row[c] or 0 for row in parsed_table)
        assert colsum == (total * 2)

    month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
    month_text = collate_chars(month_chars, x_tolerance=2)
    assert month_text == "November - 2015"
def _collate_chars(x):
    """Call ``collate_chars`` on ``x`` with ``x_tolerance`` fixed to 1."""
    fixed_tolerance = 1
    return collate_chars(x, x_tolerance=fixed_tolerance)
def test_plain(self):
    """Check the first page's table (plain Python) and its month header.

    Steps:

    1. Crop page 0 to (0, 80, PDF_WIDTH, 485) and extract the table with
       ``h="gutters"`` and tolerances of 5.
    2. Turn each row into a dict keyed by ``COLUMNS``, parsing every
       column after the first as an integer (``None`` for missing cells).
    3. Assert each parsed column sums to twice its last-row value
       (presumably the last row holds totals -- confirm).
    4. Assert the text collated (``x_tolerance=2``) from the box
       (0, 35, PDF_WIDTH, 65) is "November - 2015".
    """
    page = self.pdf.pages[0]
    cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
    table = cropped.extract_table(
        h="gutters",
        x_tolerance=5,
        y_tolerance=5,
        gutter_min_height=5,
    )

    def parse_value(k, x):
        # Column 0 is returned unchanged; the remaining columns are
        # comma-formatted integers, with None meaning "no value".
        if k == 0:
            return x
        if x is None:
            return None
        return int(x.replace(",", ""))

    def parse_row(row):
        return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

    parsed_table = [parse_row(row) for row in table]

    # [1:] because first column is state name
    for c in COLUMNS[1:]:
        total = parsed_table[-1][c]
        # Missing values (None) count as zero in the column sum.
        colsum = sum(row[c] or 0 for row in parsed_table)
        assert colsum == (total * 2)

    month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
    month_text = collate_chars(month_chars, x_tolerance=2)
    assert month_text == "November - 2015"