def download_and_attach_pdf(
    self, bill_data: BillingDatum, billing_row: sce_pages.BillingDataRow
) -> BillingDatum:
    """Download the PDF for a billing row, upload it, and attach it to the bill.

    Args:
        bill_data: The bill the PDF belongs to. It is not modified; a copy
            with ``attachments`` set is returned instead.
        billing_row: The billing-history row handed to
            ``download_pdf_for_billing_row`` to fetch the PDF.

    Returns:
        A copy of ``bill_data`` whose ``attachments`` holds the uploaded
        entry, or ``bill_data`` unchanged when no PDF path was returned or
        the upload produced no attachment entry.
    """
    self.clear_pdf_downloads()
    bill_path = self.download_pdf_for_billing_row(billing_row)
    if not bill_path:
        log.info(
            "No pdf bill was available for this period: %s to %s",
            bill_data.start,
            bill_data.end,
        )
        return bill_data

    key = bill_upload.hash_bill_datum(self.service_id, bill_data) + ".pdf"
    with open(bill_path, "rb") as bill_file:
        attachment_entry = bill_upload.upload_bill_to_s3(
            bill_file,
            key,
            statement=bill_data.statement,
            source="sce.com",
            utility=self.utility,
            utility_account_id=self.utility_account_id,
        )

    # Other callers of upload_bill_to_s3 check its result for falsiness; do the
    # same here so a missing entry never ends up as attachments=[None].
    if not attachment_entry:
        return bill_data
    return bill_data._replace(attachments=[attachment_entry])
def _execute(self):
    """Log in, gather the bill history files, and return the parsed bills.

    Each history entry is a ``(pdf, xls)`` pair. Bill data is parsed from the
    Excel file when one is present, otherwise from the PDF. When a PDF is
    present it is uploaded once per parsed bill and attached to that bill if
    the upload returns an entry.

    Returns:
        ``Results`` whose ``bills`` are the output of ``adjust_bill_dates``.
    """
    home_page = LoginPage(self._driver).login(self.username, self.password)
    self.screenshot("home_page")

    history_page = home_page.to_bill_history()
    history_page.set_dates(self.start_date, self.end_date)
    self.screenshot("bill_history")
    history = history_page.gather_data()

    # Sizes of the non-empty files, used only for the summary log line.
    pdf_sizes = [len(entry[0]) for entry in history if entry[0]]
    xls_sizes = [len(entry[1]) for entry in history if entry[1]]
    log.info(
        "Acquired %s pdfs (%s bytes) and %s excel files (%s bytes)."
        % (len(pdf_sizes), sum(pdf_sizes), len(xls_sizes), sum(xls_sizes))
    )

    bills = []
    for pdf, xls in history:
        # Prefer the Excel export; fall back to the PDF text.
        if xls is not None:
            parsed = bill_data_from_xls(xls, self.service_account)
        elif pdf is not None:
            parsed = bill_data_from_pdf(pdf, self.service_account, self.meter_serial)
        else:
            parsed = []

        if pdf is not None and parsed:
            with_attachments = []
            for datum in parsed:
                key = bill_upload.hash_bill_datum(self.service_account, datum)
                # statement date is not visible in the bill PDF text; use end date
                entry = bill_upload.upload_bill_to_s3(
                    BytesIO(pdf),
                    key,
                    source="atmosenergy.com",
                    statement=datum.end,
                    utility=self.utility,
                    utility_account_id=self.utility_account_id,
                )
                with_attachments.append(
                    datum._replace(attachments=[entry]) if entry else datum
                )
            parsed = with_attachments

        if parsed:
            bills.extend(parsed)

    return Results(bills=adjust_bill_dates(bills))
def _execute(self):
    """Log in, download bill PDFs, parse and upload them, and return the bills.

    PDFs that ``parse_bill_pdf`` cannot parse (it returns ``None``) are
    skipped. Every parsed bill has its start date moved forward one day
    before ``adjust_bill_dates`` is applied.

    Returns:
        ``Results`` whose ``bills`` are the adjusted bills.
    """
    home_page = LoginPage(self._driver).login(self.keller_id, self.password)
    self.screenshot("home_page")
    history_page = home_page.to_bill_history()
    self.screenshot("bill_history_page")

    raw_pdfs = history_page.gather_data(
        self.keller_id, self.start_date, self.end_date
    )
    total_bytes = sum(len(raw) for raw in raw_pdfs)
    log.info("Acquired %d bills (%s bytes total)." % (len(raw_pdfs), total_bytes))

    parsed_bills = []
    for raw in raw_pdfs:
        datum = parse_bill_pdf(BytesIO(raw))
        if datum is None:
            continue

        key = bill_upload.hash_bill_datum(self.account_number, datum)
        # bill doesn't have a statement date; use end date
        entry = bill_upload.upload_bill_to_s3(
            BytesIO(raw),
            key,
            statement=datum.end,
            source="cityofkeller.com",
            utility=self.utility,
            utility_account_id=self.account_number,
        )
        parsed_bills.append(datum._replace(attachments=[entry]) if entry else datum)

    # bill periods overlap; adjust start dates
    one_day = timedelta(days=1)
    shifted_bills = [
        BillingDatum(
            start=bill.start + one_day,
            end=bill.end,
            statement=bill.statement,
            cost=bill.cost,
            used=bill.used,
            peak=bill.peak,
            items=bill.items,
            attachments=bill.attachments,
            utility_code=None,
        )
        for bill in parsed_bills
    ]

    final_bills = adjust_bill_dates(shifted_bills)
    show_bill_summary(final_bills, "Final Bill Summary")
    return Results(bills=final_bills)
def process_pdf(
    utility: str,
    utility_account_id: str,
    service_id: str,
    statement_dt: date,
    pdf_filename: str,
) -> BillingDatum:
    """Parse a bill PDF into a BillingDatum and upload the PDF as its attachment.

    Args:
        utility: Utility identifier passed through to the upload.
        utility_account_id: Utility account id passed through to the upload.
        service_id: Service id used as part of the bill hash key.
        statement_dt: Statement date of the bill; also used to correct the
            year of the dates extracted from the PDF text.
        pdf_filename: Path of the PDF file to parse and upload.

    Returns:
        A BillingDatum for the bill. ``attachments`` holds the uploaded entry,
        or is ``None`` when the upload produced no entry.
    """
    log.info("Parsing text from PDF %s", pdf_filename)
    text = pdfparser.pdf_to_str(pdf_filename)
    cost = extract_cost(text)
    used = extract_used(text)
    demand = extract_demand(text)
    start_date, end_date = extract_dates(text)

    # if the start date is in the wrong year, replace year
    # (start_date = 12/1, statement_dt=12/15/2020)
    if start_date > statement_dt:
        start_date = start_date.replace(year=statement_dt.year)
        end_date = end_date.replace(year=statement_dt.year)
    # end_date must be after start date (end_date = 1/5, start_date = 12/1)
    if end_date < start_date:
        end_date = end_date.replace(year=end_date.year + 1)
    # adjust end date because SVP bills overlap on start/end dates
    end_date = end_date - timedelta(days=1)

    line_items: List[BillingDatumItemsEntry] = extract_line_items(text)
    key = hash_bill(
        service_id,
        start_date,
        end_date,
        cost,
        demand,
        used,
    )
    with open(pdf_filename, "rb") as pdf_data:
        # NOTE(review): the upload is tagged with end_date as its statement
        # while the BillingDatum below uses statement_dt - confirm which is
        # intended before changing either.
        attachment_entry = upload_bill_to_s3(
            BytesIO(pdf_data.read()),
            key,
            source="mua.santaclaraca.gov",
            statement=end_date,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return BillingDatum(
        start=start_date,
        end=end_date,
        statement=statement_dt,
        cost=cost,
        used=used,
        peak=demand,
        items=line_items,
        # Never store a None placeholder when the upload yields no entry.
        attachments=[attachment_entry] if attachment_entry else None,
        utility_code=None,
    )
def make_billing_datum(self, bill_detail: BillPeriodDetails) -> BillingDatum:
    """Build a BillingDatum from a website billing summary and attach its PDF.

    The statement date is read from a ``Date=yyyy-mm-dd`` fragment of the
    download link when it is present and parseable; otherwise the bill's end
    date is used. If ``download_pdf`` returns data, it is uploaded and, when
    the upload returns an entry, attached to the result.
    """
    statement = None
    link = bill_detail.download_link
    # get statement date from link: Date=yyyy-mm-dd
    found = re.search(r"Date=(\d\d\d\d-\d\d-\d\d)", link) if link else None
    if found:
        date_text = found.group(1)
        try:
            statement = parse_date(date_text).date()
        except Exception as exc:
            log.warning("error parsing date %s: %s", date_text, exc)
    statement = statement or bill_detail.end

    bill_datum = BillingDatum(
        start=bill_detail.start,
        end=bill_detail.end,
        statement=statement,
        cost=bill_detail.total_charges,
        used=bill_detail.total_kwh,
        peak=bill_detail.max_kw,
        items=None,
        attachments=None,
        utility_code=bill_detail.utility_code,
    )

    pdf_bytes = self.download_pdf(bill_detail)
    if not pdf_bytes:
        return bill_datum

    key = bill_upload.hash_bill_datum(self.account_id, bill_datum)
    entry = bill_upload.upload_bill_to_s3(
        BytesIO(pdf_bytes),
        key,
        source="smud.org",
        statement=statement,
        utility=self.utility,
        utility_account_id=self.account_id,
    )
    return bill_datum._replace(attachments=[entry]) if entry else bill_datum
def _execute(self):
    """Log in, gather bill records with their PDFs, upload the PDFs, return bills.

    The requested window is widened to 90 days (by moving ``self.start_date``
    back) when it is narrower than that. Records without PDF bytes are kept
    as-is; records with a PDF get the uploaded entry attached when the upload
    returns one.

    Returns:
        ``Results`` whose ``bills`` are the output of ``adjust_bill_dates``.
    """
    minimum_window = timedelta(days=90)
    if self.end_date - self.start_date < minimum_window:
        self.start_date = self.end_date - minimum_window
        log.info(
            "Initial time window was too narrow for this utility. Expanding time window to: %s - %s"
            % (self.start_date, self.end_date)
        )

    home_page = LoginPage(self._driver).login(self.username, self.password)
    log.info("Login successful. Loading bill history.")
    self.screenshot("post_login")

    history_page = home_page.select_account(self.account_number)
    log.info("Loaded bill history page.")
    self.screenshot("bill_history")

    results = history_page.gather_data(self.start_date, self.end_date)
    pdf_count = len([pdf for _, pdf in results if pdf is not None])
    log.info("Obtained %s bill records and %s PDFs." % (len(results), pdf_count))

    bills = []
    for datum, pdf_bytes in results:
        entry = None
        if pdf_bytes is not None:
            key = bill_upload.hash_bill_datum(self.account_number, datum)
            entry = bill_upload.upload_bill_to_s3(
                BytesIO(pdf_bytes),
                key,
                statement=datum.statement,
                source="hudsonenergy.net",
                utility=self.utility,
                utility_account_id=self.account_number,
            )
        bills.append(datum._replace(attachments=[entry]) if entry else datum)

    return Results(bills=adjust_bill_dates(bills))
def parse_pdf(pdf_filename: str, utility: str, utility_account_id: str) -> BillingDatum:
    """Parse a bill PDF (new or old layout) and attach the uploaded PDF.

    The layout is chosen by whether the extracted text contains
    "Your Energy Bill".

    Args:
        pdf_filename: Path of the PDF to parse and upload.
        utility: Utility identifier passed through to the upload.
        utility_account_id: Account id used in the bill hash key and passed
            through to the upload.

    Returns:
        The parsed BillingDatum with ``attachments`` set to the uploaded
        entry, or the parsed BillingDatum unchanged when the upload produced
        no entry.
    """
    text = pdfparser.pdf_to_str(pdf_filename)
    if "Your Energy Bill" in text:
        log.info("parsing new-style PDF %s", pdf_filename)
        data = parse_new_pdf(text)
    else:
        log.info("parsing old-style PDF %s", pdf_filename)
        data = parse_old_pdf(text)

    key = hash_bill(
        utility_account_id, data.start, data.end, data.cost, data.peak, data.used
    )
    with open(pdf_filename, "rb") as pdf_data:
        attachment_entry = upload_bill_to_s3(
            BytesIO(pdf_data.read()),
            key,
            source="www.duke-energy.com",
            statement=data.end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    # Other upload call sites check the result for falsiness; do the same so
    # the bill never carries attachments=[None].
    if not attachment_entry:
        return data
    return data._replace(attachments=[attachment_entry])
def get_bills(self, utility: str, utility_account_id: str) -> List[BillingDatum]:
    """Scrape the bills listed on the page that fall inside the requested range.

    For every bill link whose date is within ``self.start_date`` ..
    ``self.end_date``: open the bill view, read the service dates, cost, kWh
    usage and kW demand from the page, print the bill image to a PDF in
    ``self.download_dir``, upload that PDF, and collect a BillingDatum.

    Args:
        utility: Passed through to ``upload_bill_to_s3``.
        utility_account_id: Used in the bill hash key and passed through to
            ``upload_bill_to_s3``.

    Returns:
        The scraped bills, in the order they were processed (the reverse of
        the order in which the links were found on the page).

    Raises:
        Exception: "Unable to download file for ..." when the
            ``file_exists_in_dir`` call raises.
    """
    billing_data = []
    available_dates = self.driver.find_elements(
        By.CSS_SELECTOR, "table.table-alt a.bill-view-link")
    # The link text of each bill link is parsed as that bill's date.
    available_dates = [parse_date(i.text).date() for i in available_dates]
    log.info("available dates: %s", [dt.strftime("%Y-%m-%d") for dt in available_dates])
    # XPath expressions used to pull values out of the bill view.
    xpath_locators = {
        # Finds the last KWH reading under Total Usage column
        "cost": "//table[contains(., 'NEW CHARGES')]/tbody/tr/td[3]",
        "used": "(//table[contains(.,'USAGE')]//tr/td[contains(., 'KWH')])",
        "usage_kw": "//table[contains(.,'USAGE')]//tr/td[contains(.,'KW') and not(contains(.,'KWH'))]",
    }
    # loop through dates in table in ascending order
    for pdf_date in reversed(available_dates):
        # skip if the date isn't in the specified range
        if not (self.start_date <= pdf_date <= self.end_date):
            log.debug("skipping date outside range: %s", pdf_date)
            continue
        # Find the link again by its date text and open the bill view.
        view_bill_link = self.driver.find_element_by_xpath(
            '//a[.="%s"]' % pdf_date.strftime("%m/%d/%Y"))
        scroll_to(self.driver, view_bill_link)
        self.driver.sleep(0.5)
        view_bill_link.click()
        self.driver.wait(30).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "div.billImage")))
        # Values stay None when the corresponding element/pattern is not found.
        start_date = None
        end_date = None
        cost = None
        used = None
        peak = None
        dates_line_text: str = self.driver.find_element_by_xpath(
            "//td[contains(., 'Service From:')]").text
        # The service dates on the page carry no year; it is derived from pdf_date.
        dates_match = re.search(
            r"Service From: (?P<from>\w+ \d\d) to (?P<to>\w+ \d\d) \(\d\d Days\)",
            dates_line_text,
        )
        if dates_match:
            # if from month is December, use previous year
            year = (pdf_date.year - 1 if "dec" in dates_match.group("from").lower() else pdf_date.year)
            start_date = parse_date("%s %s" % (dates_match.group("from"), year))
            end_date = parse_date(
                dates_match.group("to") + pdf_date.strftime(" %Y"))
        cost_match = self.driver.find(xpath_locators["cost"], xpath=True)
        if cost_match:
            cost = cost_match.text
            # Strip the currency symbol and thousands separators.
            cost = float(cost.replace("$", "").replace(",", ""))
        kwh_usages = []
        for match in self.driver.find_all(xpath_locators["used"], xpath=True):
            # include only if it has a reading values as siblings; exclude credit line items
            parent = match.find_element_by_xpath("..")
            # meter number, previous reading, current reading
            readings_text = ""
            # Concatenate the text of the first three cells of the row.
            for idx, child in enumerate(
                    parent.find_elements_by_xpath(".//td")):
                log.debug("\t%s\t%s", idx, child.text.strip())
                readings_text += child.text.strip()
                if idx == 2:
                    break
            if not readings_text:
                log.info("skipping non-reading line item: %s", parent.text)
                continue
            kwh_value = float(
                match.text.replace("KWH", "").replace(",", "").strip())
            kwh_usages.append(kwh_value)
        # Usage is the sum of all kWh readings; peak is the largest kW reading.
        if kwh_usages:
            used = sum(kwh_usages)
        kw_usages = []
        for usage_kw_match in self.driver.find_all(
                xpath_locators["usage_kw"], xpath=True):
            kw_usages.append(
                float(
                    usage_kw_match.text.replace("KW", "").replace(",", "").strip()))
        if kw_usages:
            peak = max(kw_usages)
        # NOTE(review): if the "Service From" pattern did not match, end_date is
        # still None here and the subtraction below raises TypeError.
        data = BillingDatum(
            start=start_date,
            end=end_date - timedelta(days=1),
            statement=end_date - timedelta(days=1),
            cost=cost,
            peak=peak,
            used=used,
            items=None,
            attachments=None,
            utility_code=None,
        )
        # Open the printable bill image, switch to the last window handle, and print it.
        self.driver.find("a#billImageToPrint").click()
        self.driver.sleep(1)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        # the filename of the printed pdf is f"{current page title}.pdf"
        self.driver.execute_script("window.print();")
        # NOTE(review): the result of file_exists_in_dir is neither used nor
        # waited on here - confirm this call actually blocks/raises until the
        # printed file exists.
        try:
            file_exists_in_dir(directory=self.download_dir,
                               pattern=r"^Bill View Bill Image.pdf$")
        except Exception:
            raise Exception("Unable to download file for %s" % pdf_date)
        # Rename the printed file so it is keyed by the bill date.
        curr_path = os.path.join(self.download_dir,
                                 "Bill View Bill Image.pdf")
        new_path = os.path.join(
            self.download_dir,
            f"bill_{pdf_date.strftime('%Y-%m-%d')}.pdf")
        os.rename(curr_path, new_path)
        log.info("parsed bill for %s - %s", data.start, data.end)
        # Close the print view and switch to the last remaining window handle.
        self.driver.find("a#close").click()
        self.driver.sleep(1)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.sleep(1)
        # upload PDF:
        key = hash_bill(
            utility_account_id,
            data.start,
            data.end,
            data.cost,
            data.peak,
            data.used,
        )
        with open(new_path, "rb") as pdf_data:
            attachment_entry = upload_bill_to_s3(
                BytesIO(pdf_data.read()),
                key,
                source="www.duke-energy.com",
                statement=data.end,
                utility=utility,
                utility_account_id=utility_account_id,
            )
        # Only attach when the upload returned an entry.
        if attachment_entry:
            data = data._replace(attachments=[attachment_entry])
        billing_data.append(data)
        # Click Bill Information in breadcrumbs to go back to bills list page
        self.driver.find("a#billInformation").click()
    return billing_data
def get_bills(self, account_id: str, start: date, end: date) -> List[BillingDatum]:
    """Build bills from the usage table and download each bill's PDF.

    For every table row (hidden cells are ignored):
      - statement date comes from column 1
      - end date comes from column 2 plus a year derived from the statement
        date (previous year when the statement is in January and the end
        date text starts with "12")
      - start date is end date - (days in column 7 - 1)
      - rows whose period does not overlap start / end are skipped
      - peak is column 3, used is the sum of columns 4-6, cost is column 8
      - the link in column 0 is clicked and the PDF is awaited in
        self.driver.download_dir

    The PDF is uploaded only when S3_BILL_UPLOAD is enabled.

    Raises:
        Exception: when downloading a bill PDF fails.
    """
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located(self.UsageTableBodyLocator)
    )
    table_rows = self.driver.find_elements(*self.UsageTableRowsLocator)
    bill_data: List[BillingDatum] = []
    self.driver.screenshot(BaseWebScraper.screenshot_path("bill table"))

    for row in table_rows:
        cols = [
            cell
            for cell in row.find_elements_by_tag_name("td")
            if "display: none" not in cell.get_attribute("style")
        ]

        def cell_text(idx):
            """Text of the visible cell at position idx."""
            return cols[idx].text

        def numeric_text(idx):
            """Digits and dots of the cell text; everything else is dropped."""
            return "".join(ch for ch in cell_text(idx) if ch.isdigit() or ch == ".")

        def as_float(idx):
            """Cell value as a float, or 0 when the cell has no digits."""
            digits = numeric_text(idx)
            return float(digits) if len(digits) > 0 else 0

        statement_text = cell_text(1)
        end_text = cell_text(2)
        log.debug(f"statement={statement_text} end={end_text} days={cell_text(7)}")

        # statement date
        statement_date = date_parser.parse(statement_text).date()

        # bill end: a December end date on a January statement is last year's
        wraps_year = statement_date.month == 1 and end_text.startswith("12")
        period_year = statement_date.year - 1 if wraps_year else statement_date.year
        bill_end = date_parser.parse(f"{end_text}/{period_year}").date()

        # bill start
        bill_start = bill_end - timedelta(days=int(as_float(7)) - 1)
        log.debug(f"start={bill_start} end={bill_end}")

        if not self._overlap(start, end, bill_start, bill_end):
            log.info(
                f"skipping bill {bill_start} - {bill_end}: does not overlap requested range {start} - {end}"
            )
            continue

        new_charges = as_float(8)
        used = as_float(4) + as_float(5) + as_float(6)
        peak = as_float(3)

        bill_datum = BillingDatum(
            start=bill_start,
            end=bill_end,
            statement=statement_date,
            cost=new_charges,
            used=used,
            peak=peak,
            items=None,
            attachments=None,
            utility_code=None,
        )

        try:
            bill_pdf_name = (
                f"SRPbill{statement_date.strftime('%B')}{statement_date.year}.pdf"
            )
            pdf_link = cols[0].find_element_by_tag_name("a")
            scroll_to(self.driver, pdf_link)
            pdf_link.click()
            log.info(
                "looking for %s in %s", bill_pdf_name, self.driver.download_dir
            )
            self.driver.wait(60).until(
                file_exists_in_dir(self.driver.download_dir, bill_pdf_name)
            )
        except Exception as e:
            raise Exception(
                f"Failed to download bill {bill_pdf_name} for statement date {statement_date}:\n {e}"
            )
        log.info(
            f"Bill {bill_pdf_name} for statement date {statement_date} downloaded successfully"
        )

        # open downloaded PDF and upload
        attachment_entry = None
        if config.enabled("S3_BILL_UPLOAD"):
            key = hash_bill_datum(account_id, bill_datum)
            pdf_path = f"{self.driver.download_dir}/{bill_pdf_name}"
            with open(pdf_path, "rb") as pdf_data:
                attachment_entry = upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="myaccount.srpnet.com",
                    statement=bill_datum.statement,
                    utility="utility:salt-river-project",
                    utility_account_id=account_id,
                )

        bill_data.append(
            bill_datum._replace(attachments=[attachment_entry])
            if attachment_entry
            else bill_datum
        )

    return bill_data
def extract_bill_data(pdf_filename, service_id, utility, utility_account_id) -> Optional[BillingDatum]:
    """Parse a bill PDF into a BillingDatum, uploading the PDF when enabled.

    Args:
        pdf_filename: Path of the PDF to parse (and upload).
        service_id: Service id used as part of the bill hash key.
        utility: Utility identifier passed through to the upload.
        utility_account_id: Utility account id passed through to the upload.

    Returns:
        The parsed bill, or ``None`` when the file cannot be parsed as a PDF.
        ``attachments`` is a list holding the uploaded entry, or an empty list
        when S3_BILL_UPLOAD is disabled or the upload produced no entry.

    Raises:
        AttributeError: when the "Current Charges", usage, or demand patterns
            are not found in the text (``re.search`` returns ``None``).
    """
    try:
        text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    current_charges_pattern = "Current Charges(.*?)Cycle"
    charges_section = re.search(current_charges_pattern, text, re.DOTALL).group(1)
    # NOTE(review): if no line in the section starts with a digit, comma or
    # dot, current_charges is never bound and its first use below raises
    # UnboundLocalError - confirm the desired behavior for that case.
    for line in charges_section.split("\n"):
        candidate = line.strip()
        # get the last number: later matches overwrite earlier ones
        if re.match(r"[\d,\.]", candidate):
            current_charges = candidate.replace(",", "")

    period_start, period_end = extract_bill_period(pdf_filename)

    usage_pattern = r"Energy Charges \((\d*) kWh\)"
    usage = re.search(usage_pattern, text).groups()[0]

    on_peak_demand_pattern = r"On-Peak Demand \((\d+\.\d+)\ KW"
    on_peak_demand = re.search(on_peak_demand_pattern, text).groups()[0]

    offpeak_demand_pattern = r"Off-Peak Demand \((\d+\.\d+)\ KW"
    offpeak_demand = re.search(offpeak_demand_pattern, text).groups()[0]

    cost = _format_number(current_charges)
    used = _format_number(usage)

    bill_attachment = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        # The hash key uses 0 for peak (not the parsed demand); kept as-is so
        # keys stay stable.
        key = hash_bill(service_id, period_start, period_end, cost, 0, used)
        with open(pdf_filename, "rb") as f:
            # no statement date; use end date
            attachment_entry = upload_bill_to_s3(
                f,
                key,
                source="portlandgeneral.com",
                statement=period_end,
                utility=utility,
                utility_account_id=utility_account_id,
            )
        # Only record (and report) an upload that actually produced an entry;
        # a falsy result must not end up inside the attachments list.
        if attachment_entry:
            bill_attachment.append(attachment_entry)
            log.info("Uploaded bill %s to s3", bill_attachment)

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=cost,
        used=used,
        peak=max(float(on_peak_demand), float(offpeak_demand)),
        items=[],
        attachments=bill_attachment,
        utility_code=None,
    )
def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    """Parse a City of Poway water bill PDF into a BillingDatum.

    Args:
        pdf_filename: Path of the PDF to parse (and upload when enabled).
        account_id: Account id used in the bill hash key and for the upload.

    Returns:
        A BillingDatum whose end date is the parsed read end date minus one
        day. ``attachments`` holds the uploaded entry, or is empty when
        S3_BILL_UPLOAD is disabled or the upload produced no entry.

    Raises:
        InvalidMeterDataException: when the dates, usage, or cost cannot be
            found in the PDF text.
    """
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"

    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})")

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(
            f"Couldn't parse dates from pdf: {text}")

    _dates = dates_match.group("read_date_start", "read_date_end", "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        # The f prefix belongs outside the quotes so {text} is interpolated.
        raise InvalidMeterDataException(
            f"Couldn't parse usage from pdf: {text}")
    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(
            f"Couldn't parse cost from pdf: {text}")
    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    attachments = []
    if config.enabled("S3_BILL_UPLOAD"):
        # The key is hashed with the unadjusted end date and 0 for peak.
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachment_entry = upload_bill_to_s3(
                BytesIO(pdf_data.read()),
                key,
                source="customerconnect.poway.org",
                statement=statement_date,
                utility="utility:city-of-poway",
                utility_account_id=account_id,
            )
        # Never store a None placeholder when the upload yields no entry.
        if attachment_entry:
            attachments.append(attachment_entry)

    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )
def download_bills(
    self,
    latest: date,
    utility_account: str,
    utility: str,
    gen_utility: Optional[str] = None,
    gen_utility_account_id: Optional[str] = None,
) -> List[BillPdf]:
    """Download bill PDFs from the billing history widget and upload them.

    Payment panels are skipped, as are bills whose approximate end date
    (issue date minus 7 days) is on or before ``latest``. A bill whose
    download link cannot be clicked, or whose file does not appear within
    60 seconds, is skipped.

    Args:
        latest: Bills with an approximate end date on or before this date are
            treated as already downloaded.
        utility_account: Utility account id recorded on the upload and on the
            returned BillPdf records.
        utility: Utility identifier passed through to the upload.
        gen_utility: Optional generation utility passed through to the upload.
        gen_utility_account_id: Optional generation utility account id,
            passed to the upload and recorded on the returned BillPdf records.

    Returns:
        One BillPdf per uploaded bill, in panel order.
    """
    pdfs: List[BillPdf] = []

    log.info("Opening billing history")
    click(self._driver, css_selector="#arrowBillPaymentHistory")
    self.wait_until_ready(self.BillingHistoryTableSel)
    self._driver.screenshot(BaseWebScraper.screenshot_path("bill history arrow"))
    wait_for_block_overlay(self._driver)

    log.info("Clicking 'view up to..' link")
    click(self._driver, css_selector=self.ViewMoreHistorySel)
    self.wait_until_ready(self.BillingHistoryTableSel)
    self._driver.screenshot(BaseWebScraper.screenshot_path("panels"))

    panels_count = len(self._driver.find_elements_by_css_selector(self.PanelxSel))
    log.info(f"found {panels_count} panels in billing widget")

    # Rather than get all matching elements and iterate through, use index
    # and manually get element each time to help avoid stale element errors
    for i in range(panels_count):
        panel = self._driver.find_elements_by_css_selector(self.PanelxSel)[i]

        # check if is a payment panel
        panel_header = panel.find_element_by_css_selector(".panel-title")
        header_text = panel_header.text
        if "Payment" in header_text:
            log.debug(f"Skipping panel {i} (payment)")
            # skip if is a payment panel
            continue

        log.debug(f"Processing panel {i} (bill): {header_text}")
        link_elem = panel.find_element_by_css_selector(
            "div.pge_coc-dashboard-viewPay_billed_history_panel_viewBill_para_block"
            " a.viewBill"
        )

        # Get date from the "data-date" attribute on link to download bill...
        # data-date is in milliseconds
        timestamp = int(link_elem.get_attribute("data-date")) / 1000.0
        # when bill was issued
        bill_date = datetime.fromtimestamp(timestamp).date()
        # bill issued about a week after end date; use this window to match dates
        approx_bill_end = bill_date - timedelta(days=7)
        approx_bill_start = approx_bill_end - timedelta(days=30)
        log.debug(f"bill date={bill_date}")

        # cost is in second column
        cost_text = panel.find_element_by_css_selector("td.text-right").text
        log.debug(f"cost text={cost_text}")
        # cost with $ and commas: $1,234.56 or -$1,234.56
        cost = float(cost_text.replace("$", "").replace(",", ""))

        log.info(f"Found bill issued {bill_date} with cost ${cost}")

        if approx_bill_end <= latest:
            log.info(f"ignoring bill, date: {approx_bill_end} already download")
            continue

        try:
            click(self._driver, elem=link_elem)
        except ElementNotInteractableException:
            # Fall back to the link inside the bill summary container.
            log.info("Download link not visible; looking for other")
            link_elem = panel.find_element_by_css_selector(
                "div#billSummaryContainer a.viewBill"
            )
            click(self._driver, elem=link_elem)
        except ElementClickInterceptedException as exc:
            log.info("download link failed: %s %s", exc, exc.msg)
            close_modal(self._driver)
            continue

        # Expected download name: characters 6-9 of the first dash-separated
        # part of the account id, then "custbill" and the issue date.
        last4 = self.account_id.split("-")[0][6:10]
        filename = f"{last4}custbill{bill_date.strftime('%m%d%Y')}.pdf"
        download_dir = "%s/current" % config.WORKING_DIRECTORY

        try:
            self._driver.wait(60).until(
                file_exists_in_dir(
                    # end pattern with $ to prevent matching filename.crdownload
                    directory=download_dir,
                    pattern=f"^{filename}$",
                )
            )
        except TimeoutException:
            log.error(f"ERROR waiting for file {filename} to download...skipping")
            # close the download failed modal if there is one
            close_modal(self._driver)
            continue

        with open("%s/%s" % (download_dir, filename), "rb") as f:
            key = hash_bill(
                self.account_id, approx_bill_start, approx_bill_end, cost, "", ""
            )
            upload_bill_to_s3(
                file_handle=f,
                key=key,
                source="pge.com",
                statement=bill_date,
                utility=utility,
                utility_account_id=utility_account,
                gen_utility=gen_utility,
                gen_utility_account_id=gen_utility_account_id,
            )

        log.info(f"Uploaded {filename} to {key}")
        pdfs.append(
            BillPdf(
                utility_account_id=utility_account,
                # Record the generation account id, matching what was sent
                # with the upload (previously gen_utility was passed here).
                gen_utility_account_id=gen_utility_account_id,
                start=approx_bill_start,
                end=approx_bill_end,
                statement=bill_date,
                s3_key=key,
            )
        )

    return pdfs
def _execute(self):
    """Log in, download bill PDFs, parse and upload them, and return the bills.

    After the downloads finish, every PDF in the working directory is parsed;
    the file is uploaded once (keyed by its first parsed bill) and the
    resulting entry is attached to every bill parsed from that file. When a
    bill has the same start date as one already collected, it replaces the
    earlier bill and inherits that bill's attachments.

    Returns:
        ``Results`` whose ``bills`` are the collected bills.

    Raises:
        Exception: "too many sessions" or "captcha failed" when the bill
            history page reports either condition (after logging out).
    """
    # Direct the driver to the login page
    self._driver.get(self.login_url)

    # Create page helpers
    login_page = LoginPage(self._driver)
    my_account_page = MyAccountPage(self._driver)
    bill_history_page = BillHistoryPage(self._driver)

    try:
        login_page.wait_until_ready()
    except Exception:
        self.screenshot("initial page load failed")
        # try one more time
        self._driver.get(self.login_url)
        login_page.wait_until_ready()

    login_page.login(self.username, self.password)
    self.screenshot("after login")

    my_account_page.wait_until_ready()
    my_account_page.navigate_to_bill_history()
    self.screenshot("bill history")

    if bill_history_page.too_many_sessions():
        # waiting 5 minutes doesn't seem to help
        bill_history_page.logout()
        raise Exception("too many sessions")

    bill_history_page.wait_until_ready()
    self.screenshot("after captcha")
    if not bill_history_page.solve_captcha():
        bill_history_page.logout()
        raise Exception("captcha failed")

    bill_history_page.wait_until_bills_ready()
    bill_history_page.select_account(
        self._configuration.utility_account_id, self._configuration.account_name
    )
    bill_history_page.wait_until_bills_ready()
    bill_history_page.download_bills(self.start_date, self.end_date)
    bill_history_page.logout()

    # get bills from download directory and parse
    bills: List[BillingDatum] = []
    prefix = f"{config.WORKING_DIRECTORY}/current"

    log.info("Waiting for downloads to finish")
    # Poll once a second until no partial (.crdownload) PDF remains.
    while any(".pdf.crdownload" in f for f in os.listdir(prefix)):
        time.sleep(1)

    start_dates: Set[date] = set()
    for filename in sorted(os.listdir(prefix)):
        if ".pdf" not in filename:
            continue

        log.info("parsing file %s" % filename)
        parsed_bills = parse_pdf(
            f"{prefix}/{filename}", self.meter_number, self.commodity
        )
        log.info(f"filename {filename} bills={parsed_bills}")
        if not parsed_bills:
            log.warning(f"no billing datum: filename={filename}")
            continue

        # The file is uploaded once, keyed by the first bill parsed from it.
        first_bill = parsed_bills[0]
        key = hash_bill(
            self._configuration.utility_account_id,
            first_bill.start,
            first_bill.end,
            first_bill.cost,
            first_bill.peak,
            first_bill.used,
        )
        with open(prefix + "/" + filename, "rb") as pdf_data:
            attachment_entry = upload_bill_to_s3(
                BytesIO(pdf_data.read()),
                key,
                source="www.ladwp.com",
                statement=first_bill.end,
                utility="utility:ladwp",
                utility_account_id=self._configuration.utility_account_id,
            )

        for bill in parsed_bills:
            # Never store a None placeholder when the upload yields no entry.
            attachments = [attachment_entry] if attachment_entry else []
            if bill.start in start_dates:
                # if we already have a bill with this start date, replace it
                prev_bill = [b for b in bills if b.start == bill.start][0]
                log.info(
                    "duplicate bill start: prev_bill = %s, bill = %s",
                    prev_bill,
                    bill,
                )
                bills.remove(prev_bill)
                # copy the attachment
                attachments += prev_bill.attachments or []
            bills.append(bill._replace(attachments=attachments))
            start_dates.add(bill.start)

    return Results(bills=bills)
def _execute(self):
    """Log in, capture bill PDFs for the date range, and return the bills.

    ``self.start_date`` is moved back so the range covers at least
    MINIMUM_BILL_DAYS days; the search range is then clamped to the last ten
    years and to today. PDFs that ``parse_bill_pdf`` cannot parse are logged
    and skipped. PDFs are uploaded only when S3_BILL_UPLOAD is enabled.

    Returns:
        ``Results`` whose ``bills`` are the output of ``adjust_bill_dates``.
    """
    minimum_span = timedelta(days=MINIMUM_BILL_DAYS)
    if self.end_date - self.start_date < minimum_span:
        log.info(f"Expanding date range to a minimum of {MINIMUM_BILL_DAYS} days.")
        self.start_date = self.end_date - timedelta(days=MINIMUM_BILL_DAYS)

    earliest_allowed = (datetime.now() - relativedelta(years=10)).date()
    today = datetime.now().date()
    start_date = max(self.start_date, earliest_allowed)
    end_date = min(self.end_date, today)
    log.info("Final date range to search: %s - %s" % (start_date, end_date))

    home_page = LoginPage(self._driver).login(self.username, self.password)
    self.screenshot("home_screen")
    log.info("Login successful.")

    history_page = home_page.to_bill_history()
    self.screenshot("bill_history_page")
    log.info("Loaded bill history.")

    history_page.select_account(self.account_number)
    self.screenshot("account_selected")
    log.info("Selected account.")

    history_page.set_dates(start_date, end_date)
    self.screenshot("dates_selected")
    log.info("Selected dates.")

    raw_pdfs = history_page.gather_data()
    log.info("PDF bills captured: %s" % len(raw_pdfs))
    log.info("Net bill pdf bytes captured: %s" % (sum(len(x) for x in raw_pdfs)))

    bill_data = []
    # The position is 1-based and only used in the parse-failure log message.
    for position, raw in enumerate(raw_pdfs, start=1):
        datum = parse_bill_pdf(BytesIO(raw), self.meter_number)
        if datum is None:
            log.info("There was a problem parsing a bill PDF #%d." % position)
            continue

        entry = None
        if config.enabled("S3_BILL_UPLOAD"):
            key = bill_upload.hash_bill_datum(self.meter_number, datum)
            entry = bill_upload.upload_bill_to_s3(
                BytesIO(raw),
                key,
                source="pacificpower.net",
                statement=datum.statement,
                utility=self.utility,
                utility_account_id=self.account_number,
            )
        bill_data.append(datum._replace(attachments=[entry]) if entry else datum)

    final_bills = adjust_bill_dates(bill_data)
    show_bill_summary(final_bills, "Final Bill Summary")
    return Results(bills=final_bills)