Python pdf.Page.crop示例，pdfplumber.pdf.Page.crop Python示例

示例#1

0

显示文件

def crop_and_extract(
    page: pdfplumber.pdf.Page,
    line: Dict,
    adjust=False,
    left_shift: int = 0,
    up_shift: int = 20,
) -> str:
    """Return the filtered text of a crop anchored on a PDF line.

    The crop runs horizontally from ``line["x0"] - left_shift`` to
    ``line["x1"]`` and vertically from ``line["top"] - up_shift`` to
    ``line["top"]``.  When ``adjust`` is true and the crop contains lines
    whose ``top`` differs from the anchor line's, the page is cropped again
    with the upper edge moved to the ``top`` of the last such line.

    :param page: Page to crop
    :param line: Line to crop around (read via ``x0``, ``x1`` and ``top``)
    :param adjust: Whether to shrink the crop when it contains another line
    :param left_shift: Amount subtracted from the line's ``x0``
    :param up_shift: Amount subtracted from the line's ``top``
    :return: Text extracted from the crop after ``keys_and_input_text``
    """
    left = int(line["x0"]) - left_shift
    upper = int(line["top"]) - up_shift
    right = line["x1"]
    lower = line["top"]

    region = page.crop((left, upper, right, lower))
    if adjust:
        # Tops of every line inside the crop other than the anchor itself.
        other_tops = [
            row["top"] for row in region.lines if row["top"] != lower
        ]
        if other_tops:
            # Re-crop from the full page so the new box is not limited by
            # the first crop.
            region = page.crop(bbox=(left, other_tops[-1], right, lower))
    return region.filter(keys_and_input_text).extract_text()

示例#2

0

显示文件

def extract_other_creditors_ef(page: pdfplumber.pdf.Page, start: Dict,
                               stop: Dict, creditors: List) -> List:
    """Attach an "other creditor to be notified" entry to its creditor.

    Three regions of the page are cropped relative to the ``start`` and
    ``stop`` markers and their text extracted:

    * key: between ``start["x0"]`` and ``start["x1"]``, the 20 units ending
      at ``start["top"]``
    * address: from the left page edge to ``start["x0"] - 20``, between
      ``start["top"] - 20`` and ``stop["top"]``
    * account: from ``start["x1"] + 150`` to the right page edge, between
      ``start["top"] + 20`` and ``stop["top"]``

    Every creditor whose ``"key"`` equals ``str(key)`` gets a new dict
    appended to its ``"other_creditors"`` list.

    :param page: Page to crop
    :param start: Marker for the top of the section (``x0``, ``x1``, ``top``)
    :param stop: Marker for the bottom of the section (``top``)
    :param creditors: Creditor dicts to update; modified in place
    :return: The same ``creditors`` list that was passed in
    """

    key_bbox = (start["x0"], start["top"] - 20, start["x1"], start["top"])
    addy_bbox = (0, start["top"] - 20, start["x0"] - 20, stop["top"])
    acct_bbox = (start["x1"] + 150, start["top"] + 20, page.width, stop["top"])

    key = page.crop(key_bbox).filter(just_text_filter).extract_text()
    address = page.crop(addy_bbox).filter(keys_and_input_text).extract_text()
    acct = page.crop(acct_bbox).filter(keys_and_input_text).extract_text()
    for creditor in creditors:
        # Compared as a string; the stored entry keeps the raw extracted key.
        if creditor["key"] == str(key):
            other_creditors = creditor["other_creditors"]
            other_creditors.append({
                "address": address,
                "acct": acct,
                "key": key
            })
            creditor["other_creditors"] = other_creditors

    return creditors

示例#3

0

显示文件

def parse_unsecured_creditors(page: pdfplumber.pdf.Page, top: int,
                              bottom: int) -> Dict:
    """Collect the field values of one unsecured-creditor section.

    The section (``top`` to ``bottom``) supplies the key text, the
    checkboxes and the lines to walk.  Text for each line is extracted from
    a taller crop that starts at ``max(100, top - 500)``.  Collection starts
    at the first line whose extracted text (newlines removed) equals the
    section key; until then, lines wider than 20 are ignored.

    :param page: PDF page
    :param top: Y coordinate of the top of section
    :param bottom: Y coordinate of the bottom of section
    :return: Result of ``make_creditor_dict``, or ``{}`` if nothing collected
    """
    context = page.crop((0, max(100, top - 500), page.width, bottom))
    section = context.crop((0, top, page.width, bottom))
    key = section.filter(key_filter).extract_text().replace("\n", "")
    boxes = get_checkboxes(section)
    rules = section.filter(remove_margin_lines).lines

    data = []
    for line in sorted(rules, key=lambda rule: rule["top"]):
        if not data and line["width"] > 20:
            continue
        output = crop_and_extract(context, line, adjust=True, up_shift=100)
        # Nothing is collected until a line's text matches the section key.
        if not data and (output is None or key != output.replace("\n", "")):
            continue
        collected = len(data)
        # One position is skipped, depending on which part the key is from.
        if (collected == 10 and "2." in key) or (collected == 8
                                                 and "4." in key):
            continue
        data.append(output)

    return make_creditor_dict(data, boxes, key) if data else {}

示例#4

0

显示文件

def extract_other_creditors_d(page: pdfplumber.pdf.Page, markers: List[Dict],
                              creditors: List) -> List:
    """Crop address, key and account number and attach them to a creditor.

    The crop boxes are derived from ``markers``:

    * address: from the left page edge to 35% of ``int(markers[-1]["x1"])``,
      between ``markers[0]["top"]`` and ``markers[-1]["top"]``
    * key: the horizontal span of ``markers[-3]``, from
      ``markers[0]["top"]`` (raised by 12 unless there are exactly five
      markers) down to ``markers[-3]["top"]``
    * account: the horizontal span of ``markers[1]``, the 12 units ending
      at ``markers[1]["top"]``

    Every creditor whose ``"key"`` equals the stripped key text gets a new
    dict appended to its ``"other_creditors"`` list.

    :param page: PDF page
    :param markers: Marker dicts (``x0``, ``x1``, ``top``) bounding the entry
    :param creditors: Creditor dicts to update; modified in place
    :return: The same ``creditors`` list that was passed in
    """
    adjust = 0 if len(markers) == 5 else 12

    addy_bbox = (
        0,
        markers[0]["top"],
        int(markers[-1]["x1"]) * 0.35,
        markers[-1]["top"],
    )
    key_bbox = (
        markers[-3]["x0"],
        markers[0]["top"] - adjust,
        markers[-3]["x1"],
        markers[-3]["top"],
    )
    acct_bbox = (
        markers[1]["x0"],
        markers[1]["top"] - 12,
        markers[1]["x1"],
        markers[1]["top"],
    )

    address = page.crop(addy_bbox).filter(keys_and_input_text).extract_text()
    key = page.crop(key_bbox).filter(
        keys_and_input_text).extract_text().strip()
    acct = page.crop(acct_bbox).filter(keys_and_input_text).extract_text()
    for creditor in creditors:
        if creditor["key"] == key:
            other_creditors = creditor["other_creditors"]
            other_creditors.append({
                "key": key,
                "address": address,
                "acct": acct
            })
            creditor["other_creditors"] = other_creditors
    return creditors

示例#5

0

显示文件

def get_1_to_2_from_a_b(only_page: pdfplumber.pdf.Page) -> List[Dict]:
    """Extract real estate, automobile, jet ski, boats etc, from A/B.

    Each section found by ``find_property_sections`` is cropped across the
    full page width and turned into a dict chosen by its key: ``"1."`` keys
    use ``make_property_dict``, ``"3."`` keys use ``make_car_dict`` and
    ``"4."`` keys use ``make_other_dict``.  Checkbox results are merged in,
    or the placeholder ``"Checkbox unreadable"`` when none are returned.

    :param only_page: The PDF page to extract from
    :return: One dict per recognised section, in section order
    """
    unreadable = "Checkbox unreadable"
    property_content = []

    sections = find_property_sections(only_page)
    if not sections:
        return property_content

    for top, bottom, key in sections:
        crop = only_page.crop((0, top, only_page.width, bottom))
        data = get_all_values_from_crop(crop.lines, only_page)

        if "1." in key:
            entry = make_property_dict(key, data)
            boxes = get_checkboxes(crop)
            entry["property_interest"] = (
                boxes["property"] if boxes else unreadable)
            entry["debtor"] = boxes["debtor"] if boxes else unreadable
            property_content.append(entry)

        # Deliberately a separate test, not an elif: a key may match both.
        if "3." in key or "4." in key:
            build = make_car_dict if "3." in key else make_other_dict
            entry = build(key, data)
            boxes = get_checkboxes(crop)
            entry["debtor"] = boxes["debtor"] if boxes else unreadable
            property_content.append(entry)

    return property_content

示例#6

0

显示文件

def parse_secured_creditors(only_page: pdfplumber.pdf.Page, top: int,
                            bottom: int) -> Dict:
    """Find and extract content from secured creditor portion of 106D

    The section (``top`` to ``bottom``) supplies the key text, the
    checkboxes and the lines to walk.  Text for each line is read from a
    taller crop starting at ``max(100, top - 500)``.  Collection starts at
    the first line whose extracted text equals the section key; until then,
    lines wider than 20 are ignored.

    :param only_page: PDF page
    :param top: Y coordinate for top of section
    :param bottom: Y coordinate of bottom of section
    :return: Result of ``make_secured_creditor_dict`` when more than ten
        values were collected, otherwise ``{}``
    """
    context = only_page.crop(
        (0, max(100, top - 500), only_page.width, bottom))
    section = context.crop((0, top, only_page.width, bottom))
    key = section.filter(key_filter).extract_text()
    checkboxes = get_checkboxes(section)
    data = []

    ordered_lines = sorted(section.filter(remove_margin_lines).lines,
                           key=lambda row: row["top"])
    for line in ordered_lines:
        line_top = int(line["top"])
        if not data and line["width"] > 20:
            continue

        left, right = line["x0"], line["x1"]
        # Probe up to 200 units before the line for other lines.
        window = context.crop((left, line_top - 200, right, line_top))
        other_tops = [
            row["top"] for row in window.lines
            if int(row["top"]) != line_top
        ]
        if other_tops:
            collected = len(data)
            # The crop box depends on how many values are already collected.
            if collected == 6:
                box = (left, other_tops[-1] - 20, right, line_top)
            elif collected == 8:
                box = (left, line_top - 50, right, line_top)
            else:
                box = (left, other_tops[-1], right, line["top"])
            window = context.crop(box)
        output = window.filter(keys_and_input_text).extract_text()

        if data or key == output:
            data.append(output)

    if len(data) > 10:
        return make_secured_creditor_dict(data, checkboxes)
    return {}