def try_thameswater(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Thames Water document from the text boxes of its first page.

    Returns ``None`` when the last text box carries no link to the Thames
    Water website. Otherwise returns the ``NameComponents`` assembled from the
    document date (second box), the account holder (sixth box) and a document
    type derived from the subject line (eighth box).

    Raises ``AssertionError`` when the document links to Thames Water but the
    first box or the date box does not have the expected shape.
    """
    logger = parent_logger.getChild("thameswater")

    # The box closing page 1 has had at least two variants since 2017; what they
    # all share is a link to TW's website, so that is what identifies the document.
    footer_box = text_boxes[-1]
    if "thameswater.co.uk/" not in footer_box:
        return None

    assert text_boxes[0].startswith("Page 1 of ")

    date_match = re.search(
        "^Date\n([0-9]{1,2} [A-Z][a-z]+ [0-9]{4})\n", text_boxes[1]
    )
    assert date_match
    document_date = dateparser.parse(date_match.group(1), languages=["en"])

    account_holder_name = extract_account_holder_from_address(text_boxes[5])

    # Map the subject line onto a short document type; anything unknown is "Other".
    subject = text_boxes[7]
    if subject in ("Your payment plan.\n", "Your new payment plan.\n"):
        document_type = "Payment Plan"
    elif subject == "Your water and wastewater bill.\n":
        document_type = "Bill"
    else:
        document_type = "Other"

    return NameComponents(
        document_date,
        "Thames Water",
        account_holder_name,
        document_type,
    )
def try_soenergy(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a So Energy statement from the text boxes of its first page.

    Returns ``None`` unless one of the boxes is exactly ``"www.so.energy\\n"``.
    For a recognised statement the returned ``NameComponents`` uses the end of
    the statement period as the date.

    Raises ``AssertionError`` when the greeting box or the period line does
    not have the expected content.
    """
    logger = parent_logger.getChild("soenergy")

    if "www.so.energy\n" not in text_boxes:
        return None

    assert text_boxes[1] == "Hello, here is your statement.\n"

    # The first box of the PDF is the address the account holder is read from.
    account_holder_name = extract_account_holder_from_address(text_boxes[0])

    period_text = text_boxes[2]
    logger.debug("found period specification: %r", period_text)
    period_match = re.match(
        r"^For the period of [0-9]{1,2} [A-Z][a-z]{2} [0-9]{4} - ([0-9]{1,2} [A-Z][a-z]{2} [0-9]{4})\n$",
        period_text,
    )
    assert period_match

    # Only the closing date of the period is captured by the pattern.
    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    return NameComponents(
        statement_date,
        "So Energy",
        account_holder_name,
        "Statement",
    )
def try_chase(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Chase statement from the text boxes of its first page.

    Returns ``None`` when no box starts with the JPMorgan Chase banner, when
    no box starts with a "<date> through <date>" period line, or when the
    account holder cannot be located. Otherwise returns the ``NameComponents``
    dated with the end of the statement period.
    """
    logger = parent_logger.getChild("chase")

    if not find_box_starting_with(text_boxes, "JPMorgan Chase Bank, N.A.\n"):
        return None

    # Period line changes from statement to statement, so try fuzzy-matching it instead.
    # Note that some more recent statements appear to have spacing issues, so we can't
    # match the space both sides.
    for period_index, box in enumerate(text_boxes):
        period_match = re.search(
            r"^[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} ?through ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
            box,
        )
        if period_match:
            break
    else:
        logger.debug("unable to find period line")
        return None

    assert period_match
    logger.debug("found period specification: %r", period_match.group(0))

    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    # We anchor the address on the contact numbers on the side, but that's not working for
    # older statements.
    deaf_contact_box = find_box_starting_with(text_boxes, "Deaf and Hard of Hearing: ")
    if deaf_contact_box:
        deaf_contact_index = text_boxes.index(deaf_contact_box)
        account_holder_box = text_boxes[deaf_contact_index + 1]
        account_holder_name = account_holder_box.strip().title()
    else:
        # If we couldn't find the account holder through the contact number, it probably
        # is a newer version of the template. The address box is then the one right
        # before the period line, whose position the loop above already found.
        if period_index == 0:
            # Nothing precedes the period line; indexing with -1 would silently pick
            # up the *last* box of the page as if it were the address.
            logger.debug("unable to find the account holder name")
            return None

        address_box = text_boxes[period_index - 1]
        if address_box.count("\n") < 2:
            logger.debug("unable to find the account holder name")
            return None

        # Here's another corner case: when the statement has communications attached in
        # the first page, the mail routing number is attached to the address. So instead,
        # we need to drop that ourselves.
        if re.search(r"^[0-9]+ [A-Z]+ ", address_box):
            address_box = address_box.split("\n", 1)[1]

        account_holder_name = extract_account_holder_from_address(address_box)

    return NameComponents(
        statement_date,
        "Chase",
        account_holder_name,
        "Statement",
    )
def try_enel(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an ENEL Energia bill from the text boxes of its first page.

    Returns ``None`` when no box starts with the ENEL Energia banner.
    Otherwise the account holder is searched in up to three places (the
    layout differs between the 2018, 2019 and 2020 bills) and the bill date
    is read from the box following the invoice number.

    Raises ``AssertionError`` when one of the anchor boxes is missing, and
    ``ValueError`` when the date box is not of the form ``Del dd/mm/yyyy``.
    """
    logger = parent_logger.getChild("enel")

    banner_box = find_box_starting_with(
        text_boxes, "Enel Energia - Mercato libero dell'energia\n"
    )
    if not banner_box:
        return None
    banner_index = text_boxes.index(banner_box)

    # Late 2019: the ENEL address is at the beginning, the address is two boxes before the
    # payment due date.
    due_date_box = find_box_starting_with(text_boxes, "Entro il ")
    assert due_date_box
    holder_address = text_boxes[text_boxes.index(due_date_box) - 2]

    # In 2020: the account holder address is _before_ the ENEL address. A box with too
    # few lines tells us the first guess was not an address.
    if holder_address.count("\n") < 2:
        holder_address = text_boxes[banner_index - 1]

    account_holder_name = extract_account_holder_from_address(holder_address)

    # In 2018, the address was before the customer number instead, try again.
    if account_holder_name == "Periodo":
        customer_id_box = find_box_starting_with(text_boxes, "N° CLIENTE\n")
        assert customer_id_box
        holder_address = text_boxes[text_boxes.index(customer_id_box) - 1]
        account_holder_name = extract_account_holder_from_address(holder_address)

    # The date follows the invoice number: look for the invoice number, then take the
    # next box.
    invoice_number_box = find_box_starting_with(text_boxes, "N. Fattura ")
    assert invoice_number_box
    date_text = text_boxes[text_boxes.index(invoice_number_box) + 1]
    bill_date = datetime.datetime.strptime(date_text, "Del %d/%m/%Y\n")

    return NameComponents(
        bill_date,
        "ENEL Energia",
        account_holder_name,
        "Bolletta",
    )
def try_tesco_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Tesco Bank document from the text boxes of its first page.

    Two kinds of documents are handled:

    * the "Annual Summary of Interest", dated with the end of its tax year;
    * current account communications (identified by a ``tescobank.com/mmc``
      link), dated with the "Statement date:" entry of the fields/values
      table held in the fourth and fifth boxes.

    Returns ``None`` when the boxes match neither. Raises ``AssertionError``
    when a document is identified but does not have the expected layout.
    """
    logger = parent_logger.getChild("tesco_bank")

    # Before checking for statements, check other communications.
    if text_boxes[0].startswith("Tesco Bank\n") and any(
        box.startswith("Annual Summary of Interest\n") for box in text_boxes
    ):
        assert "Minicom:" in text_boxes[2]

        account_holder_name = text_boxes[4].strip()
        tax_year_line = [box for box in text_boxes if box.startswith("Tax Year:")]
        assert len(tax_year_line) == 1

        tax_year_match = re.search(
            r"^Tax Year: [0-9]{1,2} [A-Z][a-z]+ [0-9]{4} to ([0-9]{1,2} [A-Z][a-z]+ [0-9]{4})\n$",
            tax_year_line[0],
        )
        assert tax_year_match

        # Pin the language like every other date parsed in this file, rather than
        # leaving dateparser to auto-detect it.
        document_date = dateparser.parse(tax_year_match.group(1), languages=["en"])

        return NameComponents(
            document_date,
            "Tesco Bank",
            account_holder_name,
            "Annual Summary of Interest",
        )

    if not any("tescobank.com/mmc" in box for box in text_boxes):
        return None

    assert "Current Account\n" in text_boxes[0]

    # Monthly statements get a short name; any other title is used as-is (title-cased).
    if text_boxes[1] == "Monthly statement\n":
        document_type = "Statement"
    else:
        document_type = text_boxes[1].strip().title()

    account_holder_name = extract_account_holder_from_address(text_boxes[2])

    # Field names and their values come as two separate boxes.
    fields_box = text_boxes[3]
    values_box = text_boxes[4]
    statement_info = build_dict_from_fake_table(fields_box, values_box)

    statement_date = dateparser.parse(
        statement_info["Statement date:"], languages=["en"]
    )

    return NameComponents(
        statement_date,
        "Tesco Bank",
        account_holder_name,
        document_type,
    )
def try_o2(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an O2 UK bill from the text boxes of its first page.

    Returns ``None`` when the last box does not mention Telefónica UK Limited.
    Otherwise the bill date is taken from the "Bill date" entry of the
    fields/values table (second and third boxes) and the account holder from
    the address in the fourth box.

    Raises ``AssertionError`` when the first box is not ``"Copy Bill\\n"``.
    """
    logger = parent_logger.getChild("o2")

    footer_box = text_boxes[-1]
    if "Telefónica UK Limited" not in footer_box:
        return None

    assert text_boxes[0] == "Copy Bill\n"

    # Field names and their values come as two separate boxes.
    bill_info = build_dict_from_fake_table(text_boxes[1], text_boxes[2])
    bill_date = dateparser.parse(bill_info["Bill date"], languages=["en"])

    account_holder_name = extract_account_holder_from_address(text_boxes[3])

    return NameComponents(
        bill_date,
        "O2 UK",
        account_holder_name,
        "Bill",
    )
def try_hounslow(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise a London Borough of Hounslow council tax bill.

    Returns ``None`` when no box starts with the council's name, or when the
    third box is not a "Council Tax Bill" subject. Otherwise returns the
    ``NameComponents`` dated with the content of the first box; joint account
    holders (separated by ``&``) are listed comma-separated with their
    honorifics dropped.
    """
    logger = parent_logger.getChild("hounslow")

    if not find_box_starting_with(text_boxes, "London Borough of Hounslow\n"):
        return None

    subject = text_boxes[2]
    if not subject.startswith("Council Tax Bill "):
        logger.debug("Not a council tax bill, unknown format.")
        return None

    bill_date = dateparser.parse(text_boxes[0], languages=["en"])

    # In older bills the subject box also carries the address, right after the
    # subject line itself; otherwise the address is the following box.
    subject_has_address = subject.count("\n") > 1
    address_box = subject.split("\n", 1)[1] if subject_has_address else text_boxes[3]

    account_holder = extract_account_holder_from_address(address_box)

    # There can be more than one account holder, which makes things a bit more
    # complicated.
    if "&" in account_holder:
        account_holder = ", ".join(
            [drop_honorific(part.strip()) for part in account_holder.split("&")]
        )

    return NameComponents(
        bill_date,
        "LB Hounslow",
        account_holder,
        "Council Tax Bill",
    )
def try_santander(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Santander document from the text boxes of its first page.

    Three families of documents are handled, checked in this order:

    * credit card statements (regular or annual);
    * Select / 123 current account statements;
    * statements of fees.

    Returns ``None`` when the boxes match none of them. Raises
    ``AssertionError`` when a document is identified but its period line is
    missing, duplicated, or not in the expected format.
    """
    logger = parent_logger.getChild("santander")

    if "Santander Credit Card \n" in text_boxes:
        # Always include the account holder name, which is found in the second text box.
        account_holder_name = extract_account_holder_from_address(text_boxes[1])

        # Could be an annual statement, look for it. Either way exactly one box is
        # expected to carry the date, only its prefix and format differ.
        annual_lines = [
            box for box in text_boxes if box.startswith("Annual Statement:")
        ]
        if annual_lines:
            document_type = "Annual Statement"
            candidates = annual_lines
            pattern = r"^Annual Statement: [0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4} to ([0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4})\n"
        else:
            document_type = "Statement"
            candidates = [
                box for box in text_boxes if box.startswith("Account summary as at:")
            ]
            pattern = r"^Account summary as at: ([0-9]{1,2}[a-z]{2} [A-Z][a-z]+ [0-9]{4}) for card number ending [0-9]{4}\n$"

        assert len(candidates) == 1
        logger.debug("found period specification: %r", candidates[0])
        period_match = re.match(pattern, candidates[0])
        assert period_match

        statement_date = dateparser.parse(period_match.group(1), languages=["en"])

        return NameComponents(
            statement_date,
            "Santander",
            account_holder_name,
            "Credit Card",
            additional_components=(document_type,),
        )

    is_santander_select = "Select Current Account\n" in text_boxes
    is_santander_123 = "1l2l3 Current Account earnings\n" in text_boxes

    if is_santander_select or is_santander_123:
        # Always include the account holder name, which is found in the third text box.
        account_holder_name = extract_account_holder_from_address(text_boxes[2])

        summary_lines = [
            box for box in text_boxes if box.startswith("Your account summary for \n")
        ]
        assert len(summary_lines) == 1
        logger.debug("found period specification: %r", summary_lines[0])

        period_match = re.match(
            r"^Your account summary for \n[0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4} to ([0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4})\n$",
            summary_lines[0],
        )
        assert period_match

        statement_date = dateparser.parse(period_match.group(1), languages=["en"])

        # At least one of the two flags is set here; Select takes precedence.
        account_type = (
            "Select Current Account" if is_santander_select else "123 Current Account"
        )

        return NameComponents(
            statement_date,
            "Santander",
            account_holder_name,
            account_type,
            additional_components=("Statement",),
        )

    if "Statement of Fees\n" in text_boxes:
        # Always include the account holder name, which is found in the fourth text box.
        account_holder_name = extract_account_holder_from_address(text_boxes[3])

        # Find the account this refers to. It's the text box after the title column.
        account_title_index = text_boxes.index("Account\n")
        account_type = text_boxes[account_title_index + 1].strip().title()

        # Find the date this statement was issued. It's the second text box after the
        # title column (why?)
        date_title_index = text_boxes.index("Date\n")
        date_text = text_boxes[date_title_index + 2]

        # Unlike the other documents, this uses a normal date format.
        statement_date = datetime.datetime.strptime(date_text, "%d/%m/%Y\n")

        return NameComponents(
            statement_date,
            "Santander",
            account_holder_name,
            account_type,
            additional_components=("Statement of Fees",),
        )

    return None
def try_schwab(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise a Charles Schwab document from the text boxes of its first page.

    Handled documents, checked in this order:

    * 2016 brokerage statements (first box starts with the account title
      immediately followed by a newline);
    * 2017+ documents whose first box starts with the account title: year-end
      gain/loss reports, year-end summaries and brokerage statements. Trade
      confirmations are detected but cannot be renamed, since their first
      page carries no date;
    * letters, identified by the Schwab copyright notice.

    Returns ``None`` for trade confirmations and when nothing matches. Raises
    ``AssertionError`` when a document is identified but an expected field is
    missing or malformed.
    """
    logger = parent_logger.getChild("schwab")

    # Older brokerage accounts (2016)
    if text_boxes[0].startswith("Schwab One® International Account\n"):
        logger.debug("Schwab One brokerage account statement (2016).")

        # The address is the box right after the "Mail To" label.
        address_index = text_boxes.index("Mail To\n") + 1
        address_box = text_boxes[address_index]

        account_holder = extract_account_holder_from_address(address_box)
        assert account_holder

        statement_date = _find_statement_date(text_boxes, logger)

        return NameComponents(
            statement_date, "Schwab", account_holder, "Brokerage Statement"
        )

    # Brokerage Accounts, Trade Confirmations and Year-End documents from 2017 onwards.
    if text_boxes[0].startswith("Schwab One® International Account"):
        # The account holder is the second line of the first box.
        account_holder = text_boxes[0].split("\n")[1].strip().title()
        assert account_holder

        if text_boxes[2] == "Trade Confirmation\n":
            logger.debug("Schwab One Trade Confirmation")
            logger.warning(
                "Cannot rename this document, as date is not present on the first page!"
            )
            return None

        # Look for different types of year end documents.
        year_end_gain_losses = [
            box for box in text_boxes if "Year-End Schwab Gain/Loss Report" in box
        ]
        year_end_summary = [box for box in text_boxes if "YEAR-END SUMMARY" in box]

        if year_end_gain_losses:
            logger.debug("Year End Gain/Loss Report")
            date_match = re.search(
                r"\nPrepared on ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
                year_end_gain_losses[0],
            )
            assert date_match  # Else we don't have the right document.
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Gain-Losses Report"
        elif year_end_summary:
            logger.debug("Year End Summary")
            date_box = find_box_starting_with(text_boxes, "Date Prepared: ")
            assert date_box
            date_match = re.search(
                r"^Date Prepared: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n$", date_box
            )
            assert date_match
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Summary"
        else:
            logger.debug("Schwab One brokerage account statement.")
            document_date = _find_statement_date(text_boxes, logger)
            # This must bind the same name the return below reads; a misspelt
            # name here leaves ``document_type`` unbound for regular statements.
            document_type = "Brokerage Statement"

        return NameComponents(document_date, "Schwab", account_holder, document_type)

    # Letters
    if any(
        "Charles Schwab & Co., Inc. All rights reserved." in box for box in text_boxes
    ):
        logger.debug("Letter, possibly.")

        # Newer (2018) letters.
        if "Dear Client,\n" in text_boxes:
            # The date is the first line of the first box.
            date_str = text_boxes[0].split("\n")[0]
            logger.debug("Found date: %r", date_str)

            letter_date = dateparser.parse(date_str, languages=["en"])

            # The address sits three positions before the "Dear Client," box.
            address_index = text_boxes.index("Dear Client,\n") - 3
            account_holder = extract_account_holder_from_address(
                text_boxes[address_index]
            )
        else:
            # Other letters: address in the first box, date in the second.
            account_holder = extract_account_holder_from_address(text_boxes[0])
            letter_date = dateparser.parse(text_boxes[1], languages=["en"])

        assert account_holder

        return NameComponents(letter_date, "Schwab", account_holder, "Letter")