def find_property_sections(
    only_page: pdfplumber.pdf.Page,
) -> Optional[Iterator[Tuple[Union[int, Any], int, Union[int, Any]]]]:
    """Locate the property sections of a page so they can be cropped

    Section headers are the words kept by the
    ``input_white_text_and_left_side`` filter whose text is longer than two
    characters, starts with one of ``P12345`` and has ``.`` as its second
    character. Each header is paired, in order, with a page line wider than
    530 that lies below the first header.

    :param only_page: PDF page to search
    :return: None when no header is found, otherwise an iterator of
             (top, bottom, header text) tuples
    """

    def _is_section_label(text):
        # e.g. "P.x" or "1.x": a marker character followed by a period
        return len(text) > 2 and text[0] in "P12345" and text[1] == "."

    words = only_page.filter(input_white_text_and_left_side).extract_words()

    tops = []
    keys = []
    for word in words:
        label = word["text"]
        if _is_section_label(label):
            tops.append(int(word["top"]))
            keys.append(label)

    if not tops:
        return None

    # Only rules below the first header and wide enough count as a section
    # bottom; at most one bottom per header is kept.
    first_top = tops[0]
    bottoms = []
    for line in only_page.lines:
        if line["top"] > first_top and line["width"] > 530:
            bottoms.append(int(line["top"]))
    bottoms = bottoms[:len(tops)]

    return zip(tops, bottoms, keys)
def get_3_to_8_form_a_b(
    page: pdfplumber.pdf.Page,
) -> Tuple[List[Union[Dict[str, Any], Dict[Optional[Any], str]]], list, Optional[dict]]:
    """Parse sections 3 to 8 of 106 A/B property form

    The page text (after ``filter_106_ab_content``) is split into lines and
    the debtor lines reported by ``get_ab_debtors`` are removed. The
    remaining lines, except the first, are then walked with a running part
    counter that is incremented on every line starting with ``Part <digit>:``.

    While the counter is between 3 and 7, lines that do not look like a
    numbered item are buffered; when the next numbered item arrives, the
    buffer is passed through ``clean_ab_data`` and stored under the text of
    the previous numbered line. While the counter is 8, lines are collected
    and handed to ``make_ab_totals`` once a line containing ``"63. "`` is
    seen.

    :param page: The pdf page to parse
    :return: Tuple of (list of single-entry dicts with the parsed items,
             debtors as returned by ``get_ab_debtors``, totals as returned by
             ``make_ab_totals`` or None when no ``"63. "`` line was reached)
    """
    part = 0
    totals, section, key = None, None, None
    results, data, part_eight = [], [], []
    rows = page.filter(filter_106_ab_content).extract_text().splitlines()
    debtors = get_ab_debtors(rows)

    # Remove debtor rows from lines: first exact matches, then any line that
    # merely contains a debtor string.
    rows = [r for r in rows if r not in debtors]
    for debtor in debtors:
        rows = [r for r in rows if debtor not in r]

    # The first remaining line is skipped.
    for row in rows[1:]:
        # A "Part N:" header only advances the counter; the digit in the
        # header is not read.
        match = re.match(r"Part \d:", row)
        if match:
            part += 1
            continue

        # Extract parts 3 to 7
        if part in [3, 4, 5, 6, 7]:
            # A numbered item starts with one or two digits and a period
            # (optionally followed by a space), or simply with "5".
            match = re.match(r"^\d{1,2}\. ?|^5", row)
            if not match:
                # Not an item header: buffer it as content of the current item.
                data.append(row)
                continue

            # The same header seen again is ignored; the buffer is kept.
            if section == row:
                continue

            # Line 54 is stored straight away, using the second
            # space-separated token of the row as its value.
            if "54. " in row:
                results.append({"54.": row.split(" ")[1]})

            if key:
                # Lines containing "[" are dropped from the buffer before
                # it is stored.
                data = [d for d in data if "[" not in d]
                if data:
                    if key == "24." and data == ["2"]:
                        # Special case: discard the buffer and move on without
                        # updating ``section``/``key`` to the current row.
                        data = []
                        continue
                    results.append({key: clean_ab_data(data)})

            # Start buffering for the item introduced by this row.
            data = []
            section = row
            key = row

        if part == 8:
            # Part 8 is the section containing grand totals.
            part_eight.append(row)
            if "63. " in row:
                # this is the final row of Part 8
                totals = make_ab_totals(part_eight)

    return results, debtors, totals
def _checked_labels(boxes: list, keywords, lowercase: bool = True) -> list:
    """Return the labels of ticked checkbox lines that mention a keyword

    :param boxes: Checkbox lines; the label is taken to be the text after
                  the first space of the line
    :param keywords: Substrings to look for in a line
    :param lowercase: Search the lower-cased line when True, the line as-is
                      when False
    :return: Stripped labels of the lines that contain "√" and any keyword
    """
    labels = []
    for box in boxes:
        if "√" not in box:
            continue
        haystack = box.lower() if lowercase else box
        if any(word in haystack for word in keywords):
            labels.append(box.split(" ", 1)[1].strip())
    return labels


def get_checkboxes(crop: pdfplumber.pdf.Page) -> Dict:
    """Find and identify checked checkboxes

    Using multiple tolerances, find checkboxes and identify them by the
    content to the right of the checkbox.

    The text is extracted once per ``y_tolerance`` in 3, 4 and 5. The first
    pass supplies every key of the result; later passes only overwrite a key
    when they found at least one value for it. If any pass yields text
    without an empty box (``"[]"``), the checkboxes are treated as unreadable
    and an empty dict is returned, discarding earlier passes.

    :param crop: Section of pdf to extract checkboxes from
    :return: Dictionary of selected checkboxes with the keys ``debtor``,
             ``community``, ``offset``, ``info``, ``claim_type`` and
             ``property`` (each a list), or ``{}`` when unreadable
    """
    # Keyword groups used to classify a ticked box; identical for every pass.
    debtor_words = ["debtor"]
    community_words = ["community", "see instructions", "claim relates"]
    yes_no_words = ["No", "Yes"]
    info_words = ["contingent", "unliquidated", "disputed"]
    claim_type_words = [
        "domestic",
        "taxes",
        "death",
        "specify",
        "loans",
        "obligations",
        "pension",
        "including",
        "judgment",
        "statutory",
        "agreement",
    ]

    results = {}
    # Use multiple tolerances to line up checkboxes on weird PDFs
    for tolerance in [3, 4, 5]:
        filtered_data = crop.filter(filter_boxes).extract_text(
            y_tolerance=tolerance)
        filtered_data = filtered_data.replace(
            "Type of NONPRIORITY unsecured claim:", "")
        if "[]" not in filtered_data:
            # Checkboxes unreadable
            return {}

        # Keep only lines holding a box and collapse doubled spaces so the
        # label text is spaced consistently.
        checkboxes = [
            line.replace("  ", " ")
            for line in filtered_data.splitlines()
            if "[" in line
        ]

        debtor = _checked_labels(checkboxes, debtor_words)
        community = _checked_labels(checkboxes, community_words)
        info = _checked_labels(checkboxes, info_words)
        claim_type = _checked_labels(checkboxes, claim_type_words)

        # Yes/No is matched case-sensitively, and only a label that is
        # exactly "Yes" or "No" counts as an answer.
        offset = [
            answer
            for answer in _checked_labels(
                checkboxes, yes_no_words, lowercase=False)
            if re.match(r"^(Yes|No)$", answer)
        ]

        # Report property options by their canonical text, in the order of
        # ``property_options``, rather than by the raw label.
        ticked_property = _checked_labels(
            checkboxes, property_options, lowercase=False)
        property_values = [
            option
            for option in property_options
            if any(option in label for label in ticked_property)
        ]

        # A first claim type mentioning "Specify" is normalised to one value.
        if claim_type and "Specify" in claim_type[0]:
            claim_type = ["Other. Specify"]

        data = {
            "debtor": debtor,
            "community": community,
            "offset": offset,
            "info": info,
            "claim_type": claim_type,
            "property": property_values,
        }
        if not results:
            results = data
        else:
            # Empty findings must not erase what an earlier pass found.
            non_empty = {k: v for k, v in data.items() if v != []}
            results = {**results, **non_empty}
    return results