def _find_statement_date(text_boxes: Sequence[str], logger) -> datetime.datetime:
    """Derive the statement date from the "Statement Period" box.

    The end of the reported period is used as the statement date: the actual
    date would be on the third page, but only the first page is converted, so
    the period end is the closest value available.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        logger: Logger used to report the period box that was found.

    Returns:
        Whatever ``dateparser.parse`` yields for the end date of the period.

    Raises:
        AssertionError: If no period box is present, or if its content
            matches neither of the two known layouts.
    """
    # The label is either followed by a newline or, in older statements, by
    # a colon and the period on the same line.
    period_text = find_box_starting_with(
        text_boxes, "Statement Period\n"
    ) or find_box_starting_with(text_boxes, "Statement Period: ")
    logger.debug("found period specification: %r", period_text)
    assert period_text

    # Layout 1: the period is strictly within one month ("Month D-D, YYYY"),
    # so the end date is the month name glued to the closing "D, YYYY".
    same_month = re.search(
        r"Statement Period(?:\n|: )([A-Z][a-z]+ )[0-9]{1,2}-([0-9]{1,2}, [0-9]{4})\n",
        period_text,
    )
    if same_month:
        month, closing_day_and_year = same_month.groups()
        return dateparser.parse(month + closing_day_and_year, languages=["en"])

    # Layout 2: "<start date> to <end date>"; only the end date is captured.
    date_range = re.search(
        r"Statement Period(?:\n|: )[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} to[\n ]([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
        period_text,
    )
    assert date_range
    return dateparser.parse(date_range.group(1), languages=["en"])
def try_americanexpress(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an American Express (UK) statement and extract its naming data.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller. Accepted so that the signature
            matches the other ``try_*`` handlers; nothing is logged here.

    Returns:
        ``NameComponents`` holding the statement date, the issuer name, the
        account holder and the document type, or ``None`` when there are no
        text boxes or the first one is not the American Express UK address.

    Raises:
        AssertionError: If the "Prepared for" or "Membership Number" box is
            missing, or the box after the membership number is not a "Date"
            box.
        ValueError: If the date is not in ``DD/MM/YY`` form.
    """
    if not text_boxes or text_boxes[0] != "www.americanexpress.co.uk\n":
        return None

    # The document type is always reported as "Statement"; the title printed
    # on the page is not consulted.

    account_holder_box = find_box_starting_with(text_boxes, "Prepared for\n")
    assert account_holder_box
    # The holder's name is the line right below the "Prepared for" label.
    account_holder_name = account_holder_box.split("\n")[1].strip().title()

    # The date is the box after the Membership Number. We can't look for the
    # one starting with "Date" because there's more than one.
    membership_box = find_box_starting_with(text_boxes, "Membership Number\n")
    assert membership_box
    date_box = text_boxes[text_boxes.index(membership_box) + 1]

    date_fields = date_box.split("\n")
    assert date_fields[0] == "Date"
    statement_date = datetime.datetime.strptime(date_fields[1], "%d/%m/%y")

    return NameComponents(
        statement_date,
        "American Express",
        account_holder_name,
        "Statement",
    )
def try_aws(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise an Amazon Web Services invoice and extract its naming data.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller; not used by this handler.

    Returns:
        ``NameComponents`` with the invoice date and the addressee of the
        bill, or ``None`` when neither known invoice heading is present.

    Raises:
        AssertionError: If the "Invoice Number:" or "Bill to Address:" box is
            missing, or the addressee line does not start with "ATTN: ".
    """
    # Two spellings of the heading are known; either one identifies the
    # document.
    heading = find_box_starting_with(
        text_boxes, "Amazon Web Services, Inc. Invoice\n"
    ) or find_box_starting_with(text_boxes, "Amazon Web Services Invoice\n")
    if not heading:
        return None

    labels = find_box_starting_with(text_boxes, "Invoice Number:\n")
    assert labels
    labels_at = text_boxes.index(labels)

    # There's at least two versions of this, where the values are either in
    # the box right after the labels, or one further along. The right box is
    # the one with as many lines as the labels box.
    values = text_boxes[labels_at + 1]
    if labels.count("\n") != values.count("\n"):
        values = text_boxes[labels_at + 2]

    invoice_info = build_dict_from_fake_table(labels, values)
    invoice_date = dateparser.parse(invoice_info["Invoice Date:"], languages=["en"])

    bill_to = find_box_starting_with(text_boxes, "Bill to Address:\n")
    assert bill_to
    attention_line = bill_to.split("\n")[1]
    assert attention_line.startswith("ATTN: ")
    # Drop the "ATTN: " marker, keeping only the name.
    account_holder = attention_line[len("ATTN: "):]

    return NameComponents(invoice_date, "AWS", account_holder, "Invoice")
def try_chase(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a JPMorgan Chase statement and extract its naming data.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller; a "chase" child is used for
            debug output.

    Returns:
        ``NameComponents`` with the end date of the statement period and the
        account holder, or ``None`` when the document is not a Chase
        statement, no period line is found, or the address box is too short
        to hold a name.
    """
    logger = parent_logger.getChild("chase")

    if not find_box_starting_with(text_boxes, "JPMorgan Chase Bank, N.A.\n"):
        return None

    # Period line changes from statement to statement, so try fuzzy-matching
    # it instead. Note that some more recent statements appear to have
    # spacing issues, so we can't match the space both sides of "through".
    period_hits = (
        re.search(
            r"^[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} ?through ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
            candidate,
        )
        for candidate in text_boxes
    )
    # Lazily evaluated: scanning stops at the first box that matches.
    period_match = next((hit for hit in period_hits if hit), None)
    if period_match is None:
        logger.debug("unable to find period line")
        return None

    logger.debug("found period specification: %r", period_match.group(0))
    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    # The account holder normally follows the contact numbers printed on the
    # side of the page, but not every statement has that box.
    contact_box = find_box_starting_with(text_boxes, "Deaf and Hard of Hearing: ")
    if contact_box:
        holder_box = text_boxes[text_boxes.index(contact_box) + 1]
        account_holder_name = holder_box.strip().title()
    else:
        # Without the contact box, anchor on the period line instead: the
        # address is the box just before it.
        period_box = find_box_starting_with(text_boxes, period_match.group(0))
        address_box = text_boxes[text_boxes.index(period_box) - 1]

        if address_box.count("\n") < 2:
            logger.debug("unable to find the account holder name")
            return None

        # Here's another corner case: when the statement has communications
        # attached in the first page, the mail routing number is attached to
        # the address, so that first line has to be dropped here.
        if re.search(r"^[0-9]+ [A-Z]+ ", address_box):
            _, address_box = address_box.split("\n", 1)

        account_holder_name = extract_account_holder_from_address(address_box)

    return NameComponents(statement_date, "Chase", account_holder_name, "Statement")
def try_ms_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an M&S Bank statement and extract its naming data.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller; an "ms_bank" child is used for
            debug output.

    Returns:
        ``NameComponents`` with the end date of the statement period and the
        account holder, or ``None`` when the last text box does not mention
        "M&S Bank".

    Raises:
        AssertionError: If the "Account Name" box is missing, or the box
            before it is not a recognisable period line.
    """
    logger = parent_logger.getChild("ms_bank")

    # The bank is identified through the very last box of the page.
    if "M&S Bank" not in text_boxes[-1]:
        return None

    name_box = find_box_starting_with(text_boxes, "Account Name\n")
    assert name_box
    # The holder's name is the line right below the label.
    account_holder_name = name_box.split("\n")[1].strip()

    # The statement period is just before the account name box.
    period_line = text_boxes[text_boxes.index(name_box) - 1]
    logger.debug("found period specification %r", period_line)

    # The year of the start date is optional; only the end date is captured.
    period_match = re.search(
        r"^[0-9]{2} [A-Z][a-z]+(?: [0-9]{4})? to ([0-9]{2} [A-Z][a-z]+ [0-9]{4})\n$",
        period_line,
    )
    assert period_match
    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    return NameComponents(
        statement_date,
        "M&S Bank",
        account_holder_name,
        "Statement",
    )
def try_enel(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise an ENEL Energia bill and extract its naming data.

    Three layouts are handled; they differ in where the account holder's
    address sits relative to other boxes on the page.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller. Accepted so that the signature
            matches the other ``try_*`` handlers; nothing is logged here.

    Returns:
        ``NameComponents`` with the bill date and the account holder, or
        ``None`` when the ENEL Energia address box is not present.

    Raises:
        AssertionError: If the due date box, the invoice number box or (for
            the 2018 layout) the customer number box is missing.
        ValueError: If the box after the invoice number is not of the form
            ``Del DD/MM/YYYY``.
    """
    enel_address_box = find_box_starting_with(
        text_boxes, "Enel Energia - Mercato libero dell'energia\n"
    )
    if not enel_address_box:
        return None

    enel_address_index = text_boxes.index(enel_address_box)

    # Late 2019: the ENEL address is at the beginning, the address is two
    # boxes before the payment due date.
    due_date_box = find_box_starting_with(text_boxes, "Entro il ")
    assert due_date_box

    address_box_index = text_boxes.index(due_date_box) - 2
    address_box = text_boxes[address_box_index]

    # In 2020: the account holder address is _before_ the ENEL address. We
    # can tell if we got the wrong address box if it's too short in lines.
    if address_box.count("\n") < 2:
        address_box_index = enel_address_index - 1
        address_box = text_boxes[address_box_index]

    account_holder_name = extract_account_holder_from_address(address_box)

    # In 2018, the address was before the customer number instead. Getting
    # "Periodo" as the name means the wrong box was picked, so try again.
    if account_holder_name == "Periodo":
        customer_id_box = find_box_starting_with(text_boxes, "N° CLIENTE\n")
        assert customer_id_box
        customer_id_box_index = text_boxes.index(customer_id_box)

        address_box = text_boxes[customer_id_box_index - 1]
        account_holder_name = extract_account_holder_from_address(address_box)

    # The date follows the invoice number: look for the invoice number box,
    # then take the next one.
    invoice_number_box = find_box_starting_with(text_boxes, "N. Fattura ")
    assert invoice_number_box

    date_box_index = text_boxes.index(invoice_number_box) + 1
    date_box = text_boxes[date_box_index]

    bill_date = datetime.datetime.strptime(date_box, "Del %d/%m/%Y\n")

    return NameComponents(
        bill_date,
        "ENEL Energia",
        account_holder_name,
        "Bolletta",
    )
def try_scaleway(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise a Scaleway (Online SAS) invoice and extract its naming data.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller; a "scaleway" child is used for
            debug output.

    Returns:
        ``NameComponents`` with the issue date and the customer name, or
        ``None`` when the document is not a Scaleway invoice or no issue date
        can be located in it.

    Raises:
        ValueError: If neither a "Customer \\n" box nor a standalone
            "Customer\\n" label box exists.
    """
    logger = parent_logger.getChild("scaleway")

    if not find_box_starting_with(text_boxes, "Online SAS,"):
        return None

    customer_box = find_box_starting_with(text_boxes, "Customer \n")
    if customer_box:
        # Latest template: label and name share one box.
        account_holder = customer_box.split("\n")[1].strip()
    else:
        # Previous templates split this into two separate boxes.
        customer_label_idx = text_boxes.index("Customer\n")
        account_holder = text_boxes[customer_label_idx + 1].strip()

    date_box = find_box_starting_with(text_boxes, "Issued: \n")
    if date_box:
        # Latest template: the date is on the line below the label.
        date_str = date_box.split("\n")[1].strip()
    else:
        # We need to find the Issued line that is mixed together with other
        # items, so just use regex to find it.
        date_match = None
        for box in text_boxes:
            # We don't really use a strict regex here, but we do only extract
            # the _date_ part rather than the time, which is also present but
            # useless to the renaming.
            date_match = re.search(
                r"Issued: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4}) at [0-9]", box
            )
            if date_match:
                break

        if date_match is None:
            # Give up on this document instead of failing an assertion, so
            # the caller gets the usual "not handled" result.
            logger.debug("Unable to find the invoice issue date.")
            return None

        date_str = date_match.group(1)

    # The date is written with English month names in the regex-matched
    # template, so pin the language like the other handlers do.
    bill_date = dateparser.parse(date_str, languages=["en"])

    return NameComponents(bill_date, "Scaleway", account_holder, "Invoice")
def try_hounslow(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise a London Borough of Hounslow council tax bill.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller; a "hounslow" child is used for
            debug output.

    Returns:
        ``NameComponents`` with the bill date (parsed from the first box) and
        the account holder(s), or ``None`` when the document is not from the
        borough or its third box is not a council tax bill subject.
    """
    logger = parent_logger.getChild("hounslow")

    if not find_box_starting_with(text_boxes, "London Borough of Hounslow\n"):
        return None

    subject = text_boxes[2]
    if not subject.startswith("Council Tax Bill "):
        logger.debug("Not a council tax bill, unknown format.")
        return None

    bill_date = dateparser.parse(text_boxes[0], languages=["en"])

    # In older bills, the subject box includes the address (everything after
    # its first line); otherwise the address is the following box.
    address_box = (
        subject.partition("\n")[2] if subject.count("\n") > 1 else text_boxes[3]
    )

    account_holder = extract_account_holder_from_address(address_box)

    # There can be more than one account holder, joined by "&": clean up each
    # name on its own and list them separated by commas.
    if "&" in account_holder:
        account_holder = ", ".join(
            drop_honorific(name.strip()) for name in account_holder.split("&")
        )

    return NameComponents(
        bill_date,
        "LB Hounslow",
        account_holder,
        "Council Tax Bill",
    )
def try_schwab(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise a Charles Schwab document and extract its naming data.

    Handles 2016 brokerage statements, 2017+ brokerage statements, year-end
    gain/loss reports, year-end summaries, and letters.

    Args:
        text_boxes: Text boxes extracted from the first page of the document.
        parent_logger: Logger of the caller; a "schwab" child is used for
            debug and warning output.

    Returns:
        ``NameComponents`` with the document date, the account holder and the
        document type. ``None`` when there are no text boxes, when the
        document is a trade confirmation (its date is not on the first
        page), or when none of the known Schwab layouts match.

    Raises:
        AssertionError: If a recognised layout lacks the account holder or
            the date it is expected to contain.
        ValueError: If a 2016 statement has no "Mail To" box.
    """
    logger = parent_logger.getChild("schwab")

    if not text_boxes:
        return None

    first_box = text_boxes[0]

    # Older brokerage accounts (2016): the title is alone on its line and the
    # address follows the "Mail To" label.
    if first_box.startswith("Schwab One® International Account\n"):
        logger.debug("Schwab One brokerage account statement (2016).")

        address_index = text_boxes.index("Mail To\n") + 1
        address_box = text_boxes[address_index]

        account_holder = extract_account_holder_from_address(address_box)
        assert account_holder

        statement_date = _find_statement_date(text_boxes, logger)

        return NameComponents(
            statement_date, "Schwab", account_holder, "Brokerage Statement"
        )

    # Brokerage Accounts, Trade Confirmations and Year-End documents from
    # 2017 onwards: the account holder is the second line of the first box.
    if first_box.startswith("Schwab One® International Account"):
        account_holder = first_box.split("\n")[1].strip().title()
        assert account_holder

        if text_boxes[2] == "Trade Confirmation\n":
            logger.debug("Schwab One Trade Confirmation")
            logger.warning(
                "Cannot rename this document, as date is not present on the first page!"
            )
            return None

        # Look for different types of year end documents.
        year_end_gain_losses = [
            box for box in text_boxes if "Year-End Schwab Gain/Loss Report" in box
        ]
        year_end_summary = [box for box in text_boxes if "YEAR-END SUMMARY" in box]

        if year_end_gain_losses:
            logger.debug("Year End Gain/Loss Report")
            date_match = re.search(
                r"\nPrepared on ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
                year_end_gain_losses[0],
            )
            assert date_match  # Else we don't have the right document.
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Gain-Losses Report"
        elif year_end_summary:
            logger.debug("Year End Summary")
            date_box = find_box_starting_with(text_boxes, "Date Prepared: ")
            assert date_box
            date_match = re.search(
                r"^Date Prepared: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n$", date_box
            )
            assert date_match
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Summary"
        else:
            logger.debug("Schwab One brokerage account statement.")
            document_date = _find_statement_date(text_boxes, logger)
            # This must bind ``document_type``: it is read by the return
            # statement below for every branch.
            document_type = "Brokerage Statement"

        return NameComponents(document_date, "Schwab", account_holder, document_type)

    # Letters: identified by the copyright notice somewhere on the page.
    if any(
        "Charles Schwab & Co., Inc. All rights reserved." in box for box in text_boxes
    ):
        logger.debug("Letter, possibly.")

        # Newer (2018) letters.
        if "Dear Client,\n" in text_boxes:
            # The date is the first line of the first box.
            date_str = first_box.split("\n")[0]
            logger.debug("Found date: %r", date_str)
            letter_date = dateparser.parse(date_str, languages=["en"])

            # NOTE(review): the address is taken three boxes before the
            # "Dear Client," greeting; confirm the offset against a sample.
            address_index = text_boxes.index("Dear Client,\n") - 3
            account_holder = extract_account_holder_from_address(
                text_boxes[address_index]
            )
        else:
            # Otherwise the address comes first and the date right after it.
            account_holder = extract_account_holder_from_address(first_box)
            letter_date = dateparser.parse(text_boxes[1], languages=["en"])

        assert account_holder

        return NameComponents(letter_date, "Schwab", account_holder, "Letter")

    # Not a Schwab document this handler knows about.
    return None