예제 #1
0
def _find_statement_date(text_boxes: Sequence[str], logger) -> datetime.datetime:
    """Return the end date of the statement period found in the text boxes.

    Only the period specification is inspected here (the original author notes that
    the real statement date lives on a later page that is not converted), so the
    last day of the period is used as the statement date.

    Args:
        text_boxes: The text boxes extracted from the document.
        logger: Logger used to report the period box that was found.

    Returns:
        Whatever ``dateparser.parse`` produces for the end-of-period date string.

    Raises:
        AssertionError: If no period box exists, or if it matches neither of the
            two known period layouts.
    """
    # Newer statements put the period on its own line; older ones use "label: value".
    period_box = find_box_starting_with(
        text_boxes, "Statement Period\n"
    ) or find_box_starting_with(text_boxes, "Statement Period: ")

    logger.debug("found period specification: %r", period_box)
    assert period_box

    # Layout 1: the whole period sits within one month ("Month D-D, YYYY"), so the
    # end date is the month name glued to the second day and the year.
    single_month = re.search(
        r"Statement Period(?:\n|: )([A-Z][a-z]+ )[0-9]{1,2}-([0-9]{1,2}, [0-9]{4})\n",
        period_box,
    )
    if single_month:
        month_part, day_and_year = single_month.groups()
        return dateparser.parse(month_part + day_and_year, languages=["en"])

    # Layout 2: two full dates joined by "to"; only the second one is captured.
    two_dates = re.search(
        r"Statement Period(?:\n|: )[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} to[\n ]([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
        period_box,
    )
    assert two_dates

    return dateparser.parse(two_dates.group(1), languages=["en"])
예제 #2
0
def try_americanexpress(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a UK American Express statement and extract its naming details.

    The document is accepted only when its first text box is the
    ``www.americanexpress.co.uk`` line. The account holder is read from the second
    line of the "Prepared for" box, and the date from the box that immediately
    follows the "Membership Number" box.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "americanexpress" is
            derived.

    Returns:
        A ``NameComponents`` built from the statement date, "American Express", the
        title-cased account holder name and the document type "Statement"; or
        ``None`` when the document is empty or is not recognised.

    Raises:
        AssertionError: If one of the expected boxes is missing or the date box does
            not start with a "Date" line.
        ValueError: If the date does not match the ``DD/MM/YY`` format.
    """
    logger = parent_logger.getChild("americanexpress")

    # An empty document cannot be a statement; bail out instead of raising
    # IndexError on the first-box check.
    if not text_boxes or text_boxes[0] != "www.americanexpress.co.uk\n":
        return None

    # NOTE: the document type is always reported as "Statement". Earlier code read
    # a type from a fixed box position but never used the value, which only made
    # short documents crash; that dead read has been removed.

    account_holder_box = find_box_starting_with(text_boxes, "Prepared for\n")
    assert account_holder_box
    account_holder_name = account_holder_box.split("\n")[1].strip().title()

    # The date is the box after the Membership Number. We can't look for the one
    # starting with "Date" because there's more than one.
    membership_box = find_box_starting_with(text_boxes, "Membership Number\n")
    assert membership_box
    membership_index = text_boxes.index(membership_box)

    date_box = text_boxes[membership_index + 1]
    date_fields = date_box.split("\n")
    assert date_fields[0] == "Date"

    logger.debug("found statement date: %r", date_fields[1])
    statement_date = datetime.datetime.strptime(date_fields[1], "%d/%m/%y")

    return NameComponents(
        statement_date,
        "American Express",
        account_holder_name,
        "Statement",
    )
예제 #3
0
def try_aws(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise an Amazon Web Services invoice and extract its naming details.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Accepted for signature parity with the other recognisers;
            it is not used here.

    Returns:
        A ``NameComponents`` built from the invoice date, "AWS", the name found on
        the "ATTN:" line of the billing address and "Invoice"; or ``None`` when no
        AWS invoice heading is present.

    Raises:
        AssertionError: If the invoice fields box or the billing address box is
            missing, or the address does not carry an "ATTN: " line.
    """
    # Two heading variants are known, with and without ", Inc.".
    invoice_heading = find_box_starting_with(
        text_boxes, "Amazon Web Services, Inc. Invoice\n"
    ) or find_box_starting_with(text_boxes, "Amazon Web Services Invoice\n")
    if not invoice_heading:
        return None

    labels_box = find_box_starting_with(text_boxes, "Invoice Number:\n")
    assert labels_box
    labels_index = text_boxes.index(labels_box)

    # The values are either in the very next box or in the one after it. The right
    # box is the one with as many lines as the labels box; if the first candidate
    # does not line up, the second is taken without further checks.
    values_box = text_boxes[labels_index + 1]
    if values_box.count("\n") != labels_box.count("\n"):
        values_box = text_boxes[labels_index + 2]

    invoice_info = build_dict_from_fake_table(labels_box, values_box)
    invoice_date = dateparser.parse(invoice_info["Invoice Date:"], languages=["en"])

    address_box = find_box_starting_with(text_boxes, "Bill to Address:\n")
    assert address_box

    attention_line = address_box.split("\n")[1]
    assert attention_line.startswith("ATTN: ")

    # Strip the "ATTN: " marker, keeping only the name that follows it.
    account_holder = attention_line[len("ATTN: "):]

    return NameComponents(invoice_date, "AWS", account_holder, "Invoice")
예제 #4
0
def try_chase(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Chase bank statement and extract its naming details.

    The statement date is the end of the "<date> through <date>" period line. The
    account holder is taken from the box following the "Deaf and Hard of Hearing"
    contact line when that line exists; otherwise from the address box that
    immediately precedes the period line.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "chase" is derived.

    Returns:
        A ``NameComponents`` built from the statement date, "Chase", the account
        holder name and "Statement"; or ``None`` when the document is not a Chase
        statement, the period line cannot be found, or no usable address box
        precedes the period line.
    """
    logger = parent_logger.getChild("chase")

    if not find_box_starting_with(text_boxes, "JPMorgan Chase Bank, N.A.\n"):
        return None

    # Period line changes from statement to statement, so try fuzzy-matching it
    # instead. Note that some more recent statements appear to have spacing issues,
    # so we can't match the space both sides.
    period_pattern = re.compile(
        r"^[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} ?through ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n"
    )

    # Remember the position of the matching box: the pattern is anchored at the
    # start of the box, so this is the first box that begins with the period line.
    period_match = None
    period_index = -1
    for period_index, box in enumerate(text_boxes):
        period_match = period_pattern.search(box)
        if period_match:
            break
    else:
        logger.debug("unable to find period line")
        return None

    logger.debug("found period specification: %r", period_match.group(0))

    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    # The address is normally anchored on the contact numbers on the side, but not
    # every template carries them.
    deaf_contact_box = find_box_starting_with(text_boxes, "Deaf and Hard of Hearing: ")
    if deaf_contact_box:
        deaf_contact_index = text_boxes.index(deaf_contact_box)

        account_holder_box = text_boxes[deaf_contact_index + 1]
        account_holder_name = account_holder_box.strip().title()
    else:
        # Without the contact number anchor, fall back to the box right before the
        # period line. If the period line is the very first box there is nothing
        # before it; a negative index would silently wrap around to the last box
        # of the page, so give up instead.
        if period_index == 0:
            logger.debug("unable to find the account holder name")
            return None

        address_box = text_boxes[period_index - 1]
        if address_box.count("\n") < 2:
            logger.debug("unable to find the account holder name")
            return None

        # Here's another corner case: when the statement has communications
        # attached in the first page, the mail routing number is attached to the
        # address. So instead, we need to drop that first line ourselves.
        if re.search(r"^[0-9]+ [A-Z]+ ", address_box):
            address_box = address_box.split("\n", 1)[1]

        account_holder_name = extract_account_holder_from_address(address_box)

    return NameComponents(
        statement_date,
        "Chase",
        account_holder_name,
        "Statement",
    )
예제 #5
0
def try_ms_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an M&S Bank statement and extract its naming details.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "ms_bank" is derived.

    Returns:
        A ``NameComponents`` built from the end date of the statement period,
        "M&S Bank", the account holder name and "Statement"; or ``None`` when the
        last text box does not mention "M&S Bank".

    Raises:
        AssertionError: If the "Account Name" box is missing or the box before it
            is not a recognisable period line.
    """
    logger = parent_logger.getChild("ms_bank")

    # The bank name is expected in the final box of the page.
    last_box = text_boxes[-1]
    if "M&S Bank" not in last_box:
        return None

    name_box = find_box_starting_with(text_boxes, "Account Name\n")
    assert name_box

    # The holder is on the line right under the "Account Name" label.
    name_lines = name_box.split("\n")
    account_holder_name = name_lines[1].strip()

    # The statement period sits in the box immediately preceding the name box.
    period_line = text_boxes[text_boxes.index(name_box) - 1]
    logger.debug("found period specification %r", period_line)

    # "DD Month [YYYY] to DD Month YYYY"; only the closing date is captured.
    period_match = re.search(
        r"^[0-9]{2} [A-Z][a-z]+(?: [0-9]{4})? to ([0-9]{2} [A-Z][a-z]+ [0-9]{4})\n$",
        period_line,
    )
    assert period_match

    period_end = period_match.group(1)
    statement_date = dateparser.parse(period_end, languages=["en"])

    return NameComponents(statement_date, "M&S Bank", account_holder_name, "Statement")
예제 #6
0
def try_enel(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an ENEL Energia bill and extract its naming details.

    The position of the account holder's address changed across the years, so
    several candidate boxes are tried in turn (see the inline comments).

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "enel" is derived.

    Returns:
        A ``NameComponents`` built from the bill date, "ENEL Energia", the account
        holder name and "Bolletta"; or ``None`` when the ENEL address box is not
        present.

    Raises:
        AssertionError: If one of the anchor boxes (due date, customer number,
            invoice number) is missing when it is needed.
        ValueError: If the box following the invoice number is not in the
            ``Del DD/MM/YYYY`` format.
    """
    logger = parent_logger.getChild("enel")

    enel_box = find_box_starting_with(
        text_boxes, "Enel Energia - Mercato libero dell'energia\n"
    )
    if not enel_box:
        return None
    enel_position = text_boxes.index(enel_box)

    # Late 2019 layout: the ENEL address comes first and the customer's address is
    # two boxes before the payment due date.
    due_box = find_box_starting_with(text_boxes, "Entro il ")
    assert due_box
    candidate_address = text_boxes[text_boxes.index(due_box) - 2]

    # 2020 layout: the customer's address is right _before_ the ENEL address. A
    # candidate with too few lines means the first guess was the wrong box.
    if candidate_address.count("\n") < 2:
        candidate_address = text_boxes[enel_position - 1]

    account_holder_name = extract_account_holder_from_address(candidate_address)

    # 2018 layout: the address precedes the customer number instead. Getting
    # "Periodo" back as a name is the tell-tale that the wrong box was parsed.
    if account_holder_name == "Periodo":
        customer_box = find_box_starting_with(text_boxes, "N° CLIENTE\n")
        assert customer_box

        candidate_address = text_boxes[text_boxes.index(customer_box) - 1]
        account_holder_name = extract_account_holder_from_address(candidate_address)

    # The date follows the invoice number: find the invoice number box, then take
    # the next one.
    invoice_box = find_box_starting_with(text_boxes, "N. Fattura ")
    assert invoice_box
    date_text = text_boxes[text_boxes.index(invoice_box) + 1]

    bill_date = datetime.datetime.strptime(date_text, "Del %d/%m/%Y\n")

    return NameComponents(bill_date, "ENEL Energia", account_holder_name, "Bolletta")
예제 #7
0
def try_scaleway(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Scaleway invoice and extract its naming details.

    Two generations of the template are supported for both the customer name and
    the issue date: the latest one keeps label and value in the same box, the
    previous ones do not.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "scaleway" is
            derived.

    Returns:
        A ``NameComponents`` built from the issue date, "Scaleway", the customer
        name and "Invoice"; or ``None`` when no box starts with "Online SAS,".

    Raises:
        ValueError: If neither customer layout is present (no "Customer" box).
        AssertionError: If no issue date can be located.
    """
    logger = parent_logger.getChild("scaleway")

    if not find_box_starting_with(text_boxes, "Online SAS,"):
        return None

    combined_customer_box = find_box_starting_with(text_boxes, "Customer \n")
    if not combined_customer_box:
        # Previous templates: the label is a box of its own and the name is the
        # box that follows it.
        label_position = text_boxes.index("Customer\n")
        account_holder = text_boxes[label_position + 1].strip()
    else:
        # Latest template: the name is the line right below the label.
        account_holder = combined_customer_box.split("\n")[1].strip()

    issued_box = find_box_starting_with(text_boxes, "Issued: \n")
    if issued_box:
        # Latest template: the date is the line right below the label.
        date_str = issued_box.split("\n")[1].strip()
    else:
        # The "Issued" line is mixed together with other items, so search every
        # box for it. The pattern is deliberately loose, but it only captures the
        # _date_ part; the time is also present but useless for renaming.
        candidates = (
            re.search(r"Issued: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4}) at [0-9]", box)
            for box in text_boxes
        )
        date_match = next((found for found in candidates if found), None)
        if date_match is None:
            logger.debug("Unable to find the invoice issue date.")

        assert date_match
        date_str = date_match.group(1)

    bill_date = dateparser.parse(date_str)

    return NameComponents(bill_date, "Scaleway", account_holder, "Invoice")
예제 #8
0
def try_hounslow(text_boxes: Sequence[str],
                 parent_logger) -> Optional[NameComponents]:
    """Recognise a London Borough of Hounslow council tax bill.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "hounslow" is
            derived.

    Returns:
        A ``NameComponents`` built from the bill date (parsed from the first box),
        "LB Hounslow", the account holder(s) and "Council Tax Bill"; or ``None``
        when the document is not from Hounslow or is not a council tax bill.
    """
    logger = parent_logger.getChild("hounslow")

    if not find_box_starting_with(text_boxes, "London Borough of Hounslow\n"):
        return None

    # The third box carries the subject line of the letter.
    subject = text_boxes[2]
    if not subject.startswith("Council Tax Bill "):
        logger.debug("Not a council tax bill, unknown format.")
        return None

    bill_date = dateparser.parse(text_boxes[0], languages=["en"])

    # Older bills fold the address into the subject box (everything after its
    # first line); newer ones keep it in the following box.
    subject_holds_address = subject.count("\n") > 1
    address_box = subject.split("\n", 1)[1] if subject_holds_address else text_boxes[3]

    account_holder = extract_account_holder_from_address(address_box)

    # Joint accounts list the holders separated by "&": strip the honorific from
    # each of them and join the names with commas.
    if "&" in account_holder:
        account_holder = ", ".join(
            drop_honorific(name.strip()) for name in account_holder.split("&")
        )

    return NameComponents(
        bill_date,
        "LB Hounslow",
        account_holder,
        "Council Tax Bill",
    )
예제 #9
0
def try_schwab(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise Charles Schwab documents and extract their naming details.

    Three families of documents are handled, tried in this order:

    * 2016 brokerage statements, whose first box starts with the product name
      immediately followed by a newline;
    * brokerage statements, trade confirmations and year-end documents from 2017
      onwards, whose first box starts with the product name and carries the
      account holder on its second line;
    * letters, recognised by the Schwab copyright notice appearing in any box.

    Args:
        text_boxes: The text boxes extracted from the document.
        parent_logger: Logger from which a child logger named "schwab" is derived.

    Returns:
        A ``NameComponents`` built from the document date, "Schwab", the account
        holder and the document type; or ``None`` when the document is empty, is
        not recognised, or is a trade confirmation (whose date is not on the first
        page).

    Raises:
        AssertionError: If an expected box or date pattern is missing, or the
            account holder cannot be determined.
        ValueError: If a 2016 statement has no "Mail To" box.
    """
    logger = parent_logger.getChild("schwab")

    # Nothing to recognise in an empty document; avoids IndexError below.
    if not text_boxes:
        return None

    # Older brokerage accounts (2016)
    if text_boxes[0].startswith("Schwab One® International Account\n"):
        logger.debug("Schwab One brokerage account statement (2016).")
        address_index = text_boxes.index("Mail To\n") + 1
        address_box = text_boxes[address_index]

        account_holder = extract_account_holder_from_address(address_box)
        assert account_holder

        statement_date = _find_statement_date(text_boxes, logger)

        return NameComponents(
            statement_date, "Schwab", account_holder, "Brokerage Statement"
        )

    # Brokerage Accounts, Trade Confirmations and Year-End documents from 2017 onwards.
    if text_boxes[0].startswith("Schwab One® International Account"):
        # The holder's name is the second line of the heading box.
        account_holder = text_boxes[0].split("\n")[1].strip().title()
        assert account_holder

        if len(text_boxes) > 2 and text_boxes[2] == "Trade Confirmation\n":
            logger.debug("Schwab One Trade Confirmation")
            logger.warning(
                "Cannot rename this document, as date is not present on the first page!"
            )
            return None

        # Look for different types of year end documents.
        gain_loss_box = next(
            (box for box in text_boxes if "Year-End Schwab Gain/Loss Report" in box),
            None,
        )
        has_year_end_summary = any("YEAR-END SUMMARY" in box for box in text_boxes)

        if gain_loss_box is not None:
            logger.debug("Year End Gain/Loss Report")
            date_match = re.search(
                r"\nPrepared on ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
                gain_loss_box,
            )
            assert date_match  # Else we don't have the right document.
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Gain-Losses Report"
        elif has_year_end_summary:
            logger.debug("Year End Summary")
            date_box = find_box_starting_with(text_boxes, "Date Prepared: ")
            assert date_box
            date_match = re.search(
                r"^Date Prepared:  ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n$", date_box
            )
            assert date_match

            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Summary"
        else:
            logger.debug("Schwab One brokerage account statement.")
            document_date = _find_statement_date(text_boxes, logger)
            # This used to be assigned to a misspelled name, which left
            # document_type unbound and crashed on every plain statement.
            document_type = "Brokerage Statement"

        return NameComponents(document_date, "Schwab", account_holder, document_type)

    # Letters
    if any(
        "Charles Schwab & Co., Inc. All rights reserved." in box for box in text_boxes
    ):
        logger.debug("Letter, possibly.")

        # Newer (2018) letters.
        if "Dear Client,\n" in text_boxes:
            # The date is the first line of the first box.
            date_str = text_boxes[0].split("\n")[0]
            logger.debug("Found date: %r", date_str)

            letter_date = dateparser.parse(date_str, languages=["en"])

            # The address is three boxes before the "Dear Client,".
            # NOTE(review): an earlier comment said "two boxes" while the code has
            # always used an offset of three - confirm against sample letters.
            address_index = text_boxes.index("Dear Client,\n") - 3

            account_holder = extract_account_holder_from_address(
                text_boxes[address_index]
            )
        else:
            # Older letters: address in the first box, date in the second.
            account_holder = extract_account_holder_from_address(text_boxes[0])
            letter_date = dateparser.parse(text_boxes[1], languages=["en"])

        assert account_holder

        return NameComponents(letter_date, "Schwab", account_holder, "Letter")

    # Not a Schwab document we know how to handle.
    return None