Пример #1
0
def try_tesco_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify Tesco Bank documents and build their name components.

    Two layouts are recognised: the "Annual Summary of Interest" letter, and
    current account documents whose text links to ``tescobank.com/mmc``.
    Returns None when neither layout is detected.
    """
    logger = parent_logger.getChild("tesco_bank")

    # Other communications are checked ahead of the statements.
    is_interest_summary = text_boxes[0].startswith("Tesco Bank\n") and any(
        box.startswith("Annual Summary of Interest\n") for box in text_boxes
    )
    if is_interest_summary:
        assert "Minicom:" in text_boxes[2]

        holder = text_boxes[4].strip()

        tax_year_boxes = [box for box in text_boxes if box.startswith("Tax Year:")]
        assert len(tax_year_boxes) == 1

        tax_year_match = re.search(
            r"^Tax Year: [0-9]{1,2} [A-Z][a-z]+ [0-9]{4} to ([0-9]{1,2} [A-Z][a-z]+ [0-9]{4})\n$",
            tax_year_boxes[0],
        )
        assert tax_year_match

        # The summary is dated with the closing day of the tax year it covers.
        tax_year_end = tax_year_match.group(1)
        return NameComponents(
            dateparser.parse(tax_year_end),
            "Tesco Bank",
            holder,
            "Annual Summary of Interest",
        )

    if all("tescobank.com/mmc" not in box for box in text_boxes):
        return None

    assert "Current Account\n" in text_boxes[0]

    # A monthly statement is simply a "Statement"; any other heading is used
    # as the document type after title-casing it.
    heading = text_boxes[1]
    document_type = (
        "Statement" if heading == "Monthly statement\n" else heading.strip().title()
    )

    holder = extract_account_holder_from_address(text_boxes[2])

    # Box 3 carries the field labels and box 4 the matching values.
    statement_info = build_dict_from_fake_table(text_boxes[3], text_boxes[4])
    statement_date = dateparser.parse(
        statement_info["Statement date:"], languages=["en"]
    )

    return NameComponents(statement_date, "Tesco Bank", holder, document_type)
Пример #2
0
def try_americanexpress(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify American Express (UK) statements and build their name components.

    Returns None unless the first text box is the American Express UK website
    address.
    """
    logger = parent_logger.getChild("americanexpress")

    if text_boxes[0] != "www.americanexpress.co.uk\n":
        return None

    # The document title is read and normalised here, although the components
    # returned below always use the literal "Statement".
    title = text_boxes[4].strip()
    document_type = "Statement" if title == "Statement of Account" else title

    holder_box = find_box_starting_with(text_boxes, "Prepared for\n")
    assert holder_box
    holder_index = text_boxes.index(holder_box)
    # The name sits on the line right below the "Prepared for" label.
    holder_name = holder_box.split("\n")[1].strip().title()

    # More than one box starts with "Date", so the statement date is located
    # as the box that follows the Membership Number one.
    membership_box = find_box_starting_with(text_boxes, "Membership Number\n")
    assert membership_box
    date_box = text_boxes[text_boxes.index(membership_box) + 1]

    date_lines = date_box.split("\n")
    assert date_lines[0] == "Date"
    statement_date = datetime.datetime.strptime(date_lines[1], "%d/%m/%y")

    return NameComponents(
        statement_date, "American Express", holder_name, "Statement"
    )
Пример #3
0
def try_soenergy(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify So Energy statements and build their name components.

    Returns None unless one of the text boxes is exactly the So Energy website
    address.
    """
    logger = parent_logger.getChild("soenergy")

    if "www.so.energy\n" not in text_boxes:
        return None

    assert text_boxes[1] == "Hello, here is your statement.\n"

    # The customer address is the very first box of the document.
    holder = extract_account_holder_from_address(text_boxes[0])

    period_line = text_boxes[2]
    logger.debug("found period specification: %r", period_line)
    period_match = re.match(
        r"^For the period of [0-9]{1,2} [A-Z][a-z]{2} [0-9]{4} - ([0-9]{1,2} [A-Z][a-z]{2} [0-9]{4})\n$",
        period_line,
    )
    assert period_match

    # The statement is dated with the last day of the billing period.
    period_end = period_match.group(1)
    statement_date = dateparser.parse(period_end, languages=["en"])

    return NameComponents(statement_date, "So Energy", holder, "Statement")
Пример #4
0
def try_thameswater(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify Thames Water documents and build their name components.

    Returns None unless the last text box contains a link to the Thames Water
    website.
    """
    logger = parent_logger.getChild("thameswater")

    # Since 2017 the box at the bottom of page 1 comes in at least two
    # variants, all of which link to TW's website.
    footer = text_boxes[-1]
    if "thameswater.co.uk/" not in footer:
        return None

    assert text_boxes[0].startswith("Page 1 of ")

    date_match = re.search(
        "^Date\n([0-9]{1,2} [A-Z][a-z]+ [0-9]{4})\n", text_boxes[1]
    )
    assert date_match
    document_date = dateparser.parse(date_match.group(1), languages=["en"])

    holder = extract_account_holder_from_address(text_boxes[5])

    # Map the subject line onto the document type used in the file name.
    subject = text_boxes[7]
    if subject in ("Your payment plan.\n", "Your new payment plan.\n"):
        document_type = "Payment Plan"
    elif subject == "Your water and wastewater bill.\n":
        document_type = "Bill"
    else:
        document_type = "Other"

    return NameComponents(document_date, "Thames Water", holder, document_type)
Пример #5
0
def try_ms_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify M&S Bank statements and build their name components.

    Returns None unless the last text box mentions "M&S Bank".
    """
    logger = parent_logger.getChild("ms_bank")

    if "M&S Bank" not in text_boxes[-1]:
        return None

    name_box = find_box_starting_with(text_boxes, "Account Name\n")
    assert name_box

    # The holder's name is the line following the "Account Name" label.
    holder = name_box.split("\n")[1].strip()

    # The statement period sits in the box just before the account name one.
    period_line = text_boxes[text_boxes.index(name_box) - 1]
    logger.debug("found period specification %r", period_line)

    period_match = re.search(
        r"^[0-9]{2} [A-Z][a-z]+(?: [0-9]{4})? to ([0-9]{2} [A-Z][a-z]+ [0-9]{4})\n$",
        period_line,
    )
    assert period_match

    period_end = period_match.group(1)
    statement_date = dateparser.parse(period_end, languages=["en"])

    return NameComponents(statement_date, "M&S Bank", holder, "Statement")
Пример #6
0
def try_aws(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Identify Amazon Web Services invoices and build their name components.

    Returns None unless a box starts with one of the two known invoice titles.
    """
    title_box = find_box_starting_with(
        text_boxes, "Amazon Web Services, Inc. Invoice\n"
    ) or find_box_starting_with(text_boxes, "Amazon Web Services Invoice\n")
    if not title_box:
        return None

    labels_box = find_box_starting_with(text_boxes, "Invoice Number:\n")
    assert labels_box
    labels_index = text_boxes.index(labels_box)

    # At least two layouts exist: the values come either right after the
    # labels or one box later. The line count tells which one matches.
    values_box = text_boxes[labels_index + 1]
    if labels_box.count("\n") != values_box.count("\n"):
        values_box = text_boxes[labels_index + 2]

    invoice_info = build_dict_from_fake_table(labels_box, values_box)
    invoice_date = dateparser.parse(invoice_info["Invoice Date:"], languages=["en"])

    address_box = find_box_starting_with(text_boxes, "Bill to Address:\n")
    assert address_box

    attention_line = address_box.split("\n")[1]
    attention_prefix = "ATTN: "
    assert attention_line.startswith(attention_prefix)

    # Keep only the name that follows the ATTN marker.
    account_holder = attention_line[len(attention_prefix):]

    return NameComponents(invoice_date, "AWS", account_holder, "Invoice")
Пример #7
0
def try_chase(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify Chase statements and build their name components.

    Returns None when the document is not from JPMorgan Chase Bank, or when
    either the statement period or the account holder cannot be located.
    """
    logger = parent_logger.getChild("chase")

    if not find_box_starting_with(text_boxes, "JPMorgan Chase Bank, N.A.\n"):
        return None

    # The period line moves around between statements, so every box is tried
    # against the pattern. Some recent statements have spacing issues, which
    # is why the space before "through" is optional.
    period_re = re.compile(
        r"^[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} ?through ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n"
    )
    candidates = (period_re.search(box) for box in text_boxes)
    period_match = next((found for found in candidates if found), None)
    if period_match is None:
        logger.debug("unable to find period line")
        return None

    period_text = period_match.group(0)
    logger.debug("found period specification: %r", period_text)

    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    # The address is anchored on the contact numbers printed on the side,
    # which does not work for older statements.
    deaf_contact_box = find_box_starting_with(
        text_boxes, "Deaf and Hard of Hearing: "
    )
    if deaf_contact_box:
        holder_box = text_boxes[text_boxes.index(deaf_contact_box) + 1]
        holder = holder_box.strip().title()
    else:
        # Without the contact number this is probably a newer version of the
        # template; the address box is then the one before the period line.
        period_box = find_box_starting_with(text_boxes, period_text)
        address_box = text_boxes[text_boxes.index(period_box) - 1]
        if address_box.count("\n") < 2:
            logger.debug("unable to find the account holder name")
            return None

        # Corner case: when communications are attached to the first page, the
        # mail routing number is glued on top of the address, so drop that
        # first line.
        if re.search(r"^[0-9]+ [A-Z]+ ", address_box):
            address_box = address_box.split("\n", 1)[1]

        holder = extract_account_holder_from_address(address_box)

    return NameComponents(statement_date, "Chase", holder, "Statement")
Пример #8
0
def try_enel(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify ENEL Energia bills and build their name components.

    Returns None unless a box starts with the Enel Energia header line.
    """
    logger = parent_logger.getChild("enel")

    enel_box = find_box_starting_with(
        text_boxes, "Enel Energia - Mercato libero dell'energia\n"
    )
    if not enel_box:
        return None
    enel_index = text_boxes.index(enel_box)

    # Late 2019: the ENEL address comes first and the customer address is two
    # boxes before the payment due date.
    due_date_box = find_box_starting_with(text_boxes, "Entro il ")
    assert due_date_box
    address_box = text_boxes[text_boxes.index(due_date_box) - 2]

    # 2020: the customer address is _before_ the ENEL address. A box with too
    # few lines means the wrong one was picked above.
    if address_box.count("\n") < 2:
        address_box = text_boxes[enel_index - 1]

    holder = extract_account_holder_from_address(address_box)

    # 2018: the address came right before the customer number instead.
    if holder == "Periodo":
        customer_id_box = find_box_starting_with(text_boxes, "N° CLIENTE\n")
        assert customer_id_box
        address_box = text_boxes[text_boxes.index(customer_id_box) - 1]
        holder = extract_account_holder_from_address(address_box)

    # The bill date is the box following the invoice number.
    invoice_number_box = find_box_starting_with(text_boxes, "N. Fattura ")
    assert invoice_number_box
    date_box = text_boxes[text_boxes.index(invoice_number_box) + 1]

    bill_date = datetime.datetime.strptime(date_box, "Del %d/%m/%Y\n")

    return NameComponents(bill_date, "ENEL Energia", holder, "Bolletta")
Пример #9
0
def try_o2(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify O2 UK bills and build their name components.

    Returns None unless the last text box mentions Telefónica UK Limited.
    """
    logger = parent_logger.getChild("o2")

    footer = text_boxes[-1]
    if "Telefónica UK Limited" not in footer:
        return None

    assert text_boxes[0] == "Copy Bill\n"

    # Boxes 1 and 2 form a two-column table: labels first, values second.
    bill_info = build_dict_from_fake_table(text_boxes[1], text_boxes[2])
    bill_date = dateparser.parse(bill_info["Bill date"], languages=["en"])

    holder = extract_account_holder_from_address(text_boxes[3])

    return NameComponents(bill_date, "O2 UK", holder, "Bill")
Пример #10
0
def try_scaleway(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify Scaleway (Online SAS) invoices and build their name components.

    Args:
        text_boxes: the document's text boxes, as a sequence of strings.
        parent_logger: logger under which a "scaleway" child logger is created.

    Returns:
        The name components of the invoice, or None when no box starts with
        "Online SAS," or when the issue date cannot be found in any box.

    Raises:
        ValueError: if neither customer layout is present (no "Customer \\n"
            box and no standalone "Customer\\n" label).
    """
    logger = parent_logger.getChild("scaleway")

    if not find_box_starting_with(text_boxes, "Online SAS,"):
        return None

    customer_box = find_box_starting_with(text_boxes, "Customer \n")
    if customer_box:
        # Latest template: the name is on the line below the label.
        account_holder = customer_box.split("\n")[1].strip()
    else:
        # Previous templates split this into two separate boxes.
        customer_label_idx = text_boxes.index("Customer\n")
        customer_box = text_boxes[customer_label_idx + 1]
        account_holder = customer_box.strip()

    date_box = find_box_starting_with(text_boxes, "Issued: \n")
    if date_box:
        # Latest template: the date is on the line below the label.
        date_str = date_box.split("\n")[1].strip()
    else:
        # The "Issued" line is mixed together with other items, so use a regex
        # to find it. The pattern is deliberately loose, but only captures the
        # _date_ part; the time is also present but useless for renaming.
        issued_re = re.compile(
            r"Issued: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4}) at [0-9]")

        date_match = None
        for box in text_boxes:
            date_match = issued_re.search(box)
            if date_match:
                break

        if date_match is None:
            # Give up on this document instead of crashing on a failed
            # assertion right after logging the problem.
            logger.debug("Unable to find the invoice issue date.")
            return None

        date_str = date_match.group(1)

    bill_date = dateparser.parse(date_str)

    return NameComponents(bill_date, "Scaleway", account_holder, "Invoice")
Пример #11
0
def _try_old_hyperoptic(text_boxes, logger) -> Optional[NameComponents]:
    """Identify older Hyperoptic bill templates and build their name components.

    Returns None when the Hyperoptic website address is neither in the first
    box nor in the eighth one.
    """
    first_box = text_boxes[0]
    if first_box in ("www.hyperoptic.com\n", "www.hyperoptic.com \n"):
        customer_box = text_boxes[1]
    elif len(text_boxes) > 8 and text_boxes[7] == "www.hyperoptic.com \n":
        customer_box = first_box
    else:
        return None

    logger.debug("looking for customer name in %r", customer_box)
    name_match = re.search(r"Customer Name: ([^\n]+)\n", customer_box)
    assert name_match
    holder = name_match.group(1)

    # The bill date lives in a "fake table".
    #
    # Older (2017~2018) Hyperoptic bills have two multi-line text boxes: one
    # holding all the labels, the other holding all of the values. They sit
    # next to each other, so finding the labels gives away the values too.
    label_boxes = [box for box in text_boxes if box.startswith("DD Ref:\n")]
    assert len(label_boxes) == 1
    labels = label_boxes[0]
    values = text_boxes[text_boxes.index(labels) + 1]

    document_info = build_dict_from_fake_table(labels, values)
    bill_date = datetime.datetime.strptime(
        document_info["Invoice date:"], "%d %b %Y"
    )

    return NameComponents(bill_date, "Hyperoptic", holder, "Bill")
Пример #12
0
def try_hounslow(text_boxes: Sequence[str],
                 parent_logger) -> Optional[NameComponents]:
    """Identify London Borough of Hounslow council tax bills.

    Returns None when the document is not from the borough, or when its
    subject box does not start with "Council Tax Bill ".
    """
    logger = parent_logger.getChild("hounslow")

    if not find_box_starting_with(text_boxes, "London Borough of Hounslow\n"):
        return None

    subject = text_boxes[2]
    if not subject.startswith("Council Tax Bill "):
        logger.debug("Not a council tax bill, unknown format.")
        return None

    bill_date = dateparser.parse(text_boxes[0], languages=["en"])

    # Older bills carry the address inside the subject box, right after the
    # subject line itself; otherwise it is the next box.
    has_embedded_address = subject.count("\n") > 1
    address_box = (
        subject.split("\n", 1)[1] if has_embedded_address else text_boxes[3]
    )

    account_holder = extract_account_holder_from_address(address_box)

    # Several holders may be listed, joined by "&"; strip the honorific from
    # each one and list them comma-separated.
    if "&" in account_holder:
        account_holder = ", ".join([
            drop_honorific(name.strip()) for name in account_holder.split("&")
        ])

    return NameComponents(
        bill_date, "LB Hounslow", account_holder, "Council Tax Bill"
    )
Пример #13
0
def try_hyperoptic(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify Hyperoptic bills and build their name components.

    The very old templates are delegated to ``_try_old_hyperoptic``. Returns
    None when the document does not look like a Hyperoptic bill.
    """
    logger = parent_logger.getChild("hyperoptic")

    def box_after(label):
        # Value boxes directly follow the box holding their label.
        return text_boxes[text_boxes.index(label) + 1]

    # Very old templates, used in 2017 to 2018, are handled separately.
    legacy_bill = _try_old_hyperoptic(text_boxes, logger)
    if legacy_bill:
        return legacy_bill

    # All Hyperoptic objects on the page are logos, not text, but "Hypernews"
    # is fairly specific too.
    is_hyperoptic = "Hypernews\n" in text_boxes

    # Older templates lack "Hypernews", so guess: a "DD Ref" field whose value
    # includes HYP is probably Hyperoptic.
    if not is_hyperoptic and "DD Ref:\n" in text_boxes:
        is_hyperoptic = "HYP" in box_after("DD Ref:\n")

    if not is_hyperoptic:
        return None

    holder = box_after("Name:\n").strip()
    bill_date = datetime.datetime.strptime(box_after("Bill date:\n"), "%d %b %Y\n")

    return NameComponents(bill_date, "Hyperoptic", holder, "Bill")
Пример #14
0
def try_santander(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Identify Santander documents and build their name components.

    Three families are recognised, in this order: credit card statements
    (regular and annual), Select / 123 current account statements, and
    statements of fees. Returns None when none of them is detected.
    """
    logger = parent_logger.getChild("santander")

    # --- Credit card statements -------------------------------------------
    if "Santander Credit Card \n" in text_boxes:
        # The account holder name is always in the second text box.
        holder = extract_account_holder_from_address(text_boxes[1])

        # An annual statement announces itself with a dedicated period line.
        annual_lines = [
            box for box in text_boxes if box.startswith("Annual Statement:")
        ]
        if annual_lines:
            document_type = "Annual Statement"
            assert len(annual_lines) == 1
            period_text = annual_lines[0]
            logger.debug("found period specification: %r", period_text)
            period_match = re.match(
                r"^Annual Statement: [0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4} to ([0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4})\n",
                period_text,
            )
        else:
            document_type = "Statement"
            summary_lines = [
                box for box in text_boxes
                if box.startswith("Account summary as at:")
            ]
            assert len(summary_lines) == 1
            period_text = summary_lines[0]
            logger.debug("found period specification: %r", period_text)
            period_match = re.match(
                r"^Account summary as at: ([0-9]{1,2}[a-z]{2} [A-Z][a-z]+ [0-9]{4}) for card number ending [0-9]{4}\n$",
                period_text,
            )

        assert period_match
        statement_date = dateparser.parse(period_match.group(1),
                                          languages=["en"])

        return NameComponents(
            statement_date,
            "Santander",
            holder,
            "Credit Card",
            additional_components=(document_type, ),
        )

    # --- Current account statements ---------------------------------------
    is_select = "Select Current Account\n" in text_boxes
    is_123 = "1l2l3 Current Account earnings\n" in text_boxes

    if is_select or is_123:
        # The account holder name is always in the third text box.
        holder = extract_account_holder_from_address(text_boxes[2])

        summary_lines = [
            box for box in text_boxes
            if box.startswith("Your account summary for  \n")
        ]
        assert len(summary_lines) == 1
        period_text = summary_lines[0]
        logger.debug("found period specification: %r", period_text)

        period_match = re.match(
            r"^Your account summary for  \n[0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4} to ([0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4})\n$",
            period_text,
        )
        assert period_match
        statement_date = dateparser.parse(period_match.group(1),
                                          languages=["en"])

        # Select takes precedence when both markers are present.
        account_type = (
            "Select Current Account" if is_select else "123 Current Account"
        )

        return NameComponents(
            statement_date,
            "Santander",
            holder,
            account_type,
            additional_components=("Statement", ),
        )

    # --- Statements of fees -----------------------------------------------
    if "Statement of Fees\n" in text_boxes:
        # The account holder name is always in the fourth text box.
        holder = extract_account_holder_from_address(text_boxes[3])

        # The account this refers to is the box after the "Account" title.
        account_box = text_boxes[text_boxes.index("Account\n") + 1]
        account_type = account_box.strip().title()

        # The issue date is the second box after the "Date" title.
        date_str = text_boxes[text_boxes.index("Date\n") + 2]

        # Unlike the other documents, this one uses a plain numeric date.
        statement_date = datetime.datetime.strptime(date_str, "%d/%m/%Y\n")

        return NameComponents(
            statement_date,
            "Santander",
            holder,
            account_type,
            additional_components=("Statement of Fees", ),
        )

    return None
Пример #15
0
def try_schwab(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Identify Charles Schwab documents and build their name components.

    The following layouts are tried in order:

    * 2016 brokerage statements, whose first box starts with the account
      title immediately followed by a newline;
    * 2017+ Schwab One documents: year-end gain/loss reports, year-end
      summaries and regular brokerage statements (trade confirmations are
      recognised, but have no date to rename them with);
    * letters, recognised by the Charles Schwab copyright line.

    Args:
        text_boxes: the document's text boxes.
        parent_logger: logger under which a "schwab" child logger is created.

    Returns:
        The name components of the document, or None when the document is a
        trade confirmation or is not recognised as coming from Schwab.
    """
    logger = parent_logger.getChild("schwab")

    # Older brokerage accounts (2016)
    if text_boxes[0].startswith("Schwab One® International Account\n"):
        logger.debug("Schwab One brokerage account statement (2016).")
        # The address is the box that follows the "Mail To" label.
        address_index = text_boxes.index("Mail To\n") + 1
        address_box = text_boxes[address_index]

        account_holder = extract_account_holder_from_address(address_box)
        assert account_holder

        statement_date = _find_statement_date(text_boxes, logger)

        return NameComponents(
            statement_date, "Schwab", account_holder, "Brokerage Statement"
        )

    # Brokerage Accounts, Trade Confirmations and Year-End documents from 2017 onwards.
    if text_boxes[0].startswith("Schwab One® International Account"):

        # The holder's name is the second line of the title box.
        account_holder = text_boxes[0].split("\n")[1].strip().title()
        assert account_holder

        if text_boxes[2] == "Trade Confirmation\n":
            logger.debug("Schwab One Trade Confirmation")
            logger.warning(
                "Cannot rename this document, as date is not present on the first page!"
            )
            return None

        # Look for different types of year end documents.
        year_end_gain_losses = [
            box for box in text_boxes if "Year-End Schwab Gain/Loss Report" in box
        ]
        year_end_summary = [box for box in text_boxes if "YEAR-END SUMMARY" in box]

        if year_end_gain_losses:
            logger.debug("Year End Gain/Loss Report")
            date_match = re.search(
                r"\nPrepared on ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
                year_end_gain_losses[0],
            )
            assert date_match  # Else we don't have the right document.
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Gain-Losses Report"
        elif year_end_summary:
            logger.debug("Year End Summary")
            date_box = find_box_starting_with(text_boxes, "Date Prepared: ")
            assert date_box
            date_match = re.search(
                r"^Date Prepared:  ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n$", date_box
            )
            assert date_match

            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Summary"
        else:
            logger.debug("Schwab One brokerage account statement.")
            document_date = _find_statement_date(text_boxes, logger)
            # This used to be assigned to a misspelled name, leaving
            # document_type unbound for regular statements.
            document_type = "Brokerage Statement"

        return NameComponents(document_date, "Schwab", account_holder, document_type)

    # Letters
    if any(
        "Charles Schwab & Co., Inc. All rights reserved." in box for box in text_boxes
    ):
        logger.debug("Letter, possibly.")

        # Newer (2018) letters.
        if "Dear Client,\n" in text_boxes:
            # The date is the first line of the first box.
            date_str = text_boxes[0].split("\n")[0]
            logger.debug("Found date: %r", date_str)

            letter_date = dateparser.parse(date_str, languages=["en"])

            # The address is taken three boxes before the "Dear Client,".
            # NOTE(review): an earlier comment here said "two boxes before"
            # while the offset is 3 -- confirm against a sample letter.
            address_index = text_boxes.index("Dear Client,\n") - 3

            account_holder = extract_account_holder_from_address(
                text_boxes[address_index]
            )
        else:
            # Otherwise the address comes first, followed by the date.
            account_holder = extract_account_holder_from_address(text_boxes[0])
            letter_date = dateparser.parse(text_boxes[1], languages=["en"])

        assert account_holder

        return NameComponents(letter_date, "Schwab", account_holder, "Letter")

    return None