示例#1
0
def find_property_sections(
    only_page: pdfplumber.pdf.Page,
) -> Optional[Iterator[Tuple[Union[int, Any], int, Union[int, Any]]]]:
    """Locate the property sections on a page.

    Section headers are taken from the words that survive the
    ``input_white_text_and_left_side`` filter, keeping only words longer
    than two characters that start with ``P``, ``1``-``5`` followed by a
    period. Each header is paired, in order, with the top of a wide
    horizontal line (width > 530) located below the first header.

    :param only_page: PDF page to scan
    :return: None when no section header is found, otherwise an iterator
        of (top, bottom, identifier) tuples
    """
    filtered_page = only_page.filter(input_white_text_and_left_side)

    # Collect (top, identifier) for every word that looks like a header.
    headers = []
    for word in filtered_page.extract_words():
        label = word["text"]
        if len(label) <= 2:
            continue
        if label[0] not in "P12345" or label[1] != ".":
            continue
        headers.append((int(word["top"]), label))

    if not headers:
        return None

    # Wide lines below the first header act as section bottoms; keep at
    # most one per header.
    first_top = headers[0][0]
    section_bottoms = []
    for line in only_page.lines:
        if line["top"] > first_top and line["width"] > 530:
            section_bottoms.append(int(line["top"]))
    section_bottoms = section_bottoms[:len(headers)]

    section_tops = [top for top, _ in headers]
    section_ids = [label for _, label in headers]
    return zip(section_tops, section_bottoms, section_ids)
示例#2
0
def get_3_to_8_form_a_b(
    page: pdfplumber.pdf.Page,
) -> Tuple[List[Union[Dict[str, Any], Dict[Optional[Any], str]]], list,
           Optional[dict]]:
    """Parse sections 3 to 8 of the 106 A/B property form.

    The page text is split into lines, debtor lines are removed, and the
    remaining lines (skipping the first) are walked while counting
    ``Part N:`` headings. Lines seen while the counter is 3-7 are grouped
    under the most recent numbered line; lines seen while it is 8 are
    collected and handed to ``make_ab_totals``.

    :param page: The pdf page to parse
    :return: Tuple of (list of single-key result dicts, the debtors
        returned by ``get_ab_debtors``, totals or None when no line
        containing "63. " was reached in part 8)
    """
    text_lines = page.filter(filter_106_ab_content).extract_text()
    text_lines = text_lines.splitlines()
    debtors = get_ab_debtors(text_lines)

    # Drop lines that are themselves a debtor entry, then any line that
    # contains one.
    text_lines = [ln for ln in text_lines if ln not in debtors]
    for name in debtors:
        text_lines = [ln for ln in text_lines if name not in ln]

    current_part = 0
    grand_totals = None
    last_section = None
    pending_key = None
    parsed = []
    buffered = []
    part_eight_lines = []

    for line in text_lines[1:]:
        # The counter is positional: the digit in the heading is not read.
        if re.match(r"Part \d:", line):
            current_part += 1
            continue

        if 3 <= current_part <= 7:
            # A numbered line ("12. ...") or any line starting with "5"
            # opens a new entry; everything else is content for the
            # entry currently being collected.
            if not re.match(r"^\d{1,2}\. ?|^5", line):
                buffered.append(line)
                continue
            if last_section == line:
                continue
            if "54. " in line:
                parsed.append({"54.": line.split(" ")[1]})
            if pending_key:
                # Lines containing "[" are discarded before storing.
                buffered = [b for b in buffered if "[" not in b]
                if buffered:
                    # NOTE(review): special case kept as-is; the reason
                    # for skipping key "24." with content ["2"] is not
                    # visible here.
                    if pending_key == "24." and buffered == ["2"]:
                        buffered = []
                        continue
                    parsed.append({pending_key: clean_ab_data(buffered)})
                buffered = []
                last_section = line
            pending_key = line
        elif current_part == 8:
            # Part 8 is the section containing grand totals.
            part_eight_lines.append(line)
            if "63. " in line:  # this is the final row of Part 8
                grand_totals = make_ab_totals(part_eight_lines)

    return parsed, debtors, grand_totals
示例#3
0
def _checked_labels(checkboxes, terms, ignore_case=True):
    """Return labels of ticked checkboxes whose line mentions any term.

    A line counts as ticked when it contains "√". The label is the text
    after the first space on the line, stripped of surrounding whitespace.

    :param checkboxes: Checkbox lines to inspect
    :param terms: Substrings to look for in each line
    :param ignore_case: Compare against the lower-cased line when True
    :return: List of labels, in the order the lines appear
    """
    labels = []
    for box in checkboxes:
        if "√" not in box:
            continue
        haystack = box.lower() if ignore_case else box
        if any(term in haystack for term in terms):
            labels.append(box.split(" ", 1)[1].strip())
    return labels


def get_checkboxes(crop: pdfplumber.pdf.Page) -> Dict:
    """Find and identify checked checkboxes

    Using multiple tolerances, find checkboxes and identify them by the
    content to the right of the checkbox. The first tolerance pass seeds
    the result; later passes only overwrite a key when they found a
    non-empty list for it. If any pass yields text without "[]", an empty
    dictionary is returned immediately, discarding earlier passes.

    :param crop: Section of pdf to extract checkboxes from
    :return: Dictionary of selected checkboxes with keys "debtor",
        "community", "offset", "info", "claim_type" and "property", or an
        empty dictionary when the checkboxes are unreadable
    """
    # Search terms are loop-invariant, so they are defined once up front.
    debtor_terms = ("debtor",)
    community_terms = ("community", "see instructions", "claim relates")
    offset_terms = ("No", "Yes")
    info_terms = ("contingent", "unliquidated", "disputed")
    claim_type_terms = (
        "domestic",
        "taxes",
        "death",
        "specify",
        "loans",
        "obligations",
        "pension",
        "including",
        "judgment",
        "statutory",
        "agreement",
    )

    results = {}
    # Use multiple tolerances to line up checkboxes on weird PDFs
    for tolerance in (3, 4, 5):
        filtered_data = crop.filter(filter_boxes).extract_text(
            y_tolerance=tolerance)
        # NOTE(review): extract_text appears to return None for an empty
        # crop in some pdfplumber releases; treat that as unreadable
        # instead of failing on None.replace.
        filtered_data = (filtered_data or "").replace(
            "Type of NONPRIORITY unsecured claim:", "")
        if "[]" not in filtered_data:
            # Checkboxes unreadable
            return {}

        checkboxes = [
            line.replace("  ", " ")
            for line in filtered_data.splitlines() if "[" in line
        ]

        # "Yes"/"No" are matched case-sensitively, then narrowed to labels
        # that are exactly one of the two words.
        offset = [
            answer
            for answer in _checked_labels(
                checkboxes, offset_terms, ignore_case=False)
            if answer in ("Yes", "No")
        ]

        claim_type = _checked_labels(checkboxes, claim_type_terms)
        if claim_type and "Specify" in claim_type[0]:
            claim_type = ["Other. Specify"]

        # Normalise property labels to the canonical option strings.
        property_labels = _checked_labels(
            checkboxes, property_options, ignore_case=False)
        property_values = [
            option for option in property_options
            if any(option in label for label in property_labels)
        ]

        data = {
            "debtor": _checked_labels(checkboxes, debtor_terms),
            "community": _checked_labels(checkboxes, community_terms),
            "offset": offset,
            "info": _checked_labels(checkboxes, info_terms),
            "claim_type": claim_type,
            "property": property_values,
        }
        if not results:
            results = data
        else:
            # Later passes must not wipe out values with empty lists.
            results.update({k: v for k, v in data.items() if v})
    return results