def crop_and_extract(
    page: pdfplumber.pdf.Page,
    line: Dict,
    adjust=False,
    left_shift: int = 0,
    up_shift: int = 20,
) -> str:
    """Return the text found in the strip directly above a PDF line.

    The strip runs horizontally from ``x0 - left_shift`` to ``x1`` and
    vertically from ``top - up_shift`` down to the line's own ``top``
    (``x0`` and ``top`` are truncated to int for the left and upper edges).

    When ``adjust`` is set and the strip contains other lines whose ``top``
    differs from this line's, the upper edge is moved to the ``top`` of the
    last such line so the crop does not reach past it.

    :param page: Page to crop
    :param line: Line to crop above; ``x0``, ``x1`` and ``top`` are read
    :param adjust: Whether to shrink the crop when another line lies inside it
    :param left_shift: How far to extend the crop to the left of the line
    :param up_shift: How far to extend the crop above the line
    :return: Text of the cropped area after the ``keys_and_input_text`` filter
    """
    line_top = line["top"]
    left = int(line["x0"]) - left_shift
    upper = int(line_top) - up_shift
    right = line["x1"]

    region = page.crop((left, upper, right, line_top))

    if adjust:
        other_tops = [
            row["top"] for row in region.lines if row["top"] != line_top
        ]
        if other_tops:
            # NOTE(review): the last entry is used as the new upper edge, so
            # this relies on the order of ``region.lines`` - confirm that the
            # last line is the one closest to ``line``.
            region = page.crop(bbox=(left, other_tops[-1], right, line_top))

    return region.filter(keys_and_input_text).extract_text()
def extract_other_creditors_ef(
    page: pdfplumber.pdf.Page,
    start: Dict,
    stop: Dict,
    creditors: List,
) -> List:
    """Attach an "other creditor to be notified" entry to its creditor.

    Three areas are cropped relative to the ``start`` and ``stop`` markers:

    * key: the 20-unit strip directly above ``start``, same horizontal span
    * address: from the left page edge to 20 units left of ``start``,
      from 20 units above ``start`` down to ``stop``
    * account: from 150 units right of ``start`` to the right page edge,
      from 20 units below ``start`` down to ``stop``

    Every creditor whose ``"key"`` equals the extracted key gets a new
    ``{"address", "acct", "key"}`` dict appended to its
    ``"other_creditors"`` list.

    :param page: Page to crop
    :param start: Marker at the top of the section; ``x0``, ``x1`` and
        ``top`` are read
    :param stop: Marker at the bottom of the section; ``top`` is read
    :param creditors: Creditor dicts, each with a ``"key"`` and an
        ``"other_creditors"`` list; matching entries are modified in place
    :return: The same ``creditors`` list that was passed in
    """
    start_top = start["top"]
    stop_top = stop["top"]

    key_bbox = (start["x0"], start_top - 20, start["x1"], start_top)
    addy_bbox = (0, start_top - 20, start["x0"] - 20, stop_top)
    acct_bbox = (start["x1"] + 150, start_top + 20, page.width, stop_top)

    key = page.crop(key_bbox).filter(just_text_filter).extract_text()
    address = page.crop(addy_bbox).filter(keys_and_input_text).extract_text()
    acct = page.crop(acct_bbox).filter(keys_and_input_text).extract_text()

    # Creditor keys are compared as strings; convert once rather than on
    # every iteration.
    wanted_key = str(key)
    for creditor in creditors:
        if creditor["key"] == wanted_key:
            # The list is mutated in place, so no re-assignment is needed.
            creditor["other_creditors"].append({
                "address": address,
                "acct": acct,
                "key": key,
            })
    return creditors
def parse_unsecured_creditors(page: pdfplumber.pdf.Page, top: int,
                              bottom: int) -> Dict:
    """Collect the field values of one unsecured creditor section.

    Lines inside the section are visited from top to bottom and the text
    above each one is read with ``crop_and_extract``. Collection starts at
    the first line that is at most 20 units wide and whose text (newlines
    removed) equals the section key; from then on the text above every line
    is recorded.

    :param page: PDF page
    :param top: Y coordinate of the top of section
    :param bottom: Y coordinate of the bottom of section
    :return: Result of ``make_creditor_dict`` for the collected values, or an
        empty dict when no field was found
    """
    # The wider crop reaches up to 500 units above the section (never above
    # y=100) and is the area field text is read from.
    wide_crop = page.crop((0, max(100, top - 500), page.width, bottom))
    section = wide_crop.crop((0, top, page.width, bottom))

    key = section.filter(key_filter).extract_text().replace("\n", "")
    boxes = get_checkboxes(section)
    ordered_lines = sorted(
        section.filter(remove_margin_lines).lines,
        key=lambda row: row["top"],
    )

    data = []
    for row in ordered_lines:
        started = bool(data)

        # Before the first field is found, lines wider than 20 are skipped.
        if not started and row["width"] > 20:
            continue

        output = crop_and_extract(wide_crop, row, adjust=True, up_shift=100)

        # The first recorded field has to be the section key itself.
        if not started and (output is None
                            or key != output.replace("\n", "")):
            continue

        # Nothing more is recorded once 10 values ("2." keys) or 8 values
        # ("4." keys) have been collected.
        count = len(data)
        if (count == 10 and "2." in key) or (count == 8 and "4." in key):
            continue

        data.append(output)

    return make_creditor_dict(data, boxes, key) if data else {}
def extract_other_creditors_d(
    page: pdfplumber.pdf.Page,
    markers: List[Dict],
    creditors: List,
) -> List:
    """Attach an "other creditor" entry from the page to its creditor.

    The address, key and account number are cropped relative to the given
    markers. Every creditor whose ``"key"`` equals the extracted key gets a
    new ``{"key", "address", "acct"}`` dict appended to its
    ``"other_creditors"`` list.

    :param page: PDF page
    :param markers: Marker dicts for the section; the first, second, third
        from last and last entries are read (``x0``, ``x1``, ``top``)
    :param creditors: Creditor dicts, each with a ``"key"`` and an
        ``"other_creditors"`` list; matching entries are modified in place
    :return: The same ``creditors`` list that was passed in
    """
    first, second = markers[0], markers[1]
    key_marker, last = markers[-3], markers[-1]

    # With anything other than exactly five markers the key crop starts
    # 12 units higher.
    adjust = 0 if len(markers) == 5 else 12

    # Address: left 35% of the last marker's x1, between first and last.
    addy_bbox = (0, first["top"], int(last["x1"]) * 0.35, last["top"])
    key_bbox = (
        key_marker["x0"],
        first["top"] - adjust,
        key_marker["x1"],
        key_marker["top"],
    )
    # Account number: the 12-unit strip directly above the second marker.
    acct_bbox = (second["x0"], second["top"] - 12, second["x1"],
                 second["top"])

    address = page.crop(addy_bbox).filter(keys_and_input_text).extract_text()
    key_text = page.crop(key_bbox).filter(keys_and_input_text).extract_text()
    if key_text is None:
        # Other code in this file checks extracted text for None; without a
        # key there is nothing to match, so return instead of failing on
        # ``.strip()``.
        return creditors
    key = key_text.strip()
    acct = page.crop(acct_bbox).filter(keys_and_input_text).extract_text()

    for creditor in creditors:
        if creditor["key"] == key:
            # The list is mutated in place, so no re-assignment is needed.
            creditor["other_creditors"].append({
                "key": key,
                "address": address,
                "acct": acct,
            })
    return creditors
def get_1_to_2_from_a_b(only_page: pdfplumber.pdf.Page) -> List[Dict]:
    """Build one dict per property section found on an A/B page.

    Each section returned by ``find_property_sections`` is cropped to the
    full page width and its values are read. Sections whose key contains
    ``"1."`` are built with ``make_property_dict``, ``"3."`` with
    ``make_car_dict`` and ``"4."`` with ``make_other_dict``. Checkbox
    results are copied in, or ``"Checkbox unreadable"`` when none were read.

    :param only_page: The PDF page to extract from
    :return: Extracted sections, empty when the page has none
    """
    unreadable = "Checkbox unreadable"
    found = []

    for top, bottom, key in find_property_sections(only_page) or ():
        crop = only_page.crop((0, top, only_page.width, bottom))
        data = get_all_values_from_crop(crop.lines, only_page)

        if "1." in key:
            entry = make_property_dict(key, data)
            marks = get_checkboxes(crop)
            entry["property_interest"] = (marks["property"]
                                          if marks else unreadable)
            entry["debtor"] = marks["debtor"] if marks else unreadable
            found.append(entry)

        # Deliberately a separate test rather than an elif: a key matching
        # both patterns yields two entries.
        is_car = "3." in key
        if is_car or "4." in key:
            builder = make_car_dict if is_car else make_other_dict
            entry = builder(key, data)
            marks = get_checkboxes(crop)
            entry["debtor"] = marks["debtor"] if marks else unreadable
            found.append(entry)

    return found
def parse_secured_creditors(only_page: pdfplumber.pdf.Page, top: int,
                            bottom: int) -> Dict:
    """Collect the field values of one secured creditor section (106D).

    Lines inside the section are visited from top to bottom and the text
    above each one is read. Collection starts at the first line that is at
    most 20 units wide and whose text equals the section key; from then on
    the text above every line is recorded.

    :param only_page: PDF page
    :param top: Y coordinate for top of section
    :param bottom: Y coordinate of bottom of section
    :return: Result of ``make_secured_creditor_dict`` when more than 10
        values were collected, otherwise an empty dict
    """
    # The wider crop reaches up to 500 units above the section (never above
    # y=100) and is the area field text is read from.
    page = only_page.crop((0, max(100, top - 500), only_page.width, bottom))
    section = page.crop((0, top, only_page.width, bottom))

    key = section.filter(key_filter).extract_text()
    checkboxes = get_checkboxes(section)

    data = []
    ordered_lines = sorted(
        section.filter(remove_margin_lines).lines,
        key=lambda row: row["top"],
    )
    for line in ordered_lines:
        line_top = int(line["top"])

        # Before the first field is found, lines wider than 20 are skipped.
        if not data and line["width"] > 20:
            continue

        left, right = line["x0"], line["x1"]

        # Start with a 200-unit window above the line, then narrow it if
        # other lines fall inside that window.
        window = page.crop((left, line_top - 200, right, line_top))
        other_tops = [
            row["top"] for row in window.lines
            if int(row["top"]) != line_top
        ]
        if other_tops:
            collected = len(data)
            if collected == 6:
                # Seventh value: start 20 units above the last other line.
                bbox = (left, other_tops[-1] - 20, right, line_top)
            elif collected == 8:
                # Ninth value: fixed 50-unit window above the line.
                bbox = (left, line_top - 50, right, line_top)
            else:
                bbox = (left, other_tops[-1], right, line["top"])
            window = page.crop(bbox)

        output = window.filter(keys_and_input_text).extract_text()
        if data or key == output:
            data.append(output)

    if len(data) > 10:
        return make_secured_creditor_dict(data, checkboxes)
    return {}