示例#1
0
def process_pdf(
    utility: str,
    utility_account_id: str,
    service_id: str,
    statement_dt: date,
    pdf_filename: str,
) -> BillingDatum:
    """Parse a downloaded bill PDF, upload it to S3, and return its billing datum.

    Cost, usage, demand, the service period and the line items are all
    extracted from the PDF text. The extracted period is then reconciled with
    ``statement_dt`` before the bill is hashed and uploaded.

    Args:
        utility: utility identifier passed through to the S3 upload.
        utility_account_id: utility account identifier passed through to the
            S3 upload.
        service_id: service identifier used as the first input of the bill hash.
        statement_dt: statement date of the bill; used to correct the year of
            the extracted period and stored as the datum's statement date.
        pdf_filename: path of the PDF file on local disk.

    Returns:
        A BillingDatum whose only attachment is the uploaded PDF.
    """
    log.info("Parsing text from PDF %s", pdf_filename)
    pdf_text = pdfparser.pdf_to_str(pdf_filename)

    cost = extract_cost(pdf_text)
    used = extract_used(pdf_text)
    demand = extract_demand(pdf_text)
    period_start, period_end = extract_dates(pdf_text)

    # A start date later than the statement date means the year is wrong
    # (e.g. start 12/1 with a statement dated 12/15/2020): pin both ends of
    # the period to the statement year.
    statement_year = statement_dt.year
    if period_start > statement_dt:
        period_start = period_start.replace(year=statement_year)
        period_end = period_end.replace(year=statement_year)
    # A period that wraps the new year (start 12/1, end 1/5) ends in the
    # following year.
    if period_end < period_start:
        period_end = period_end.replace(year=period_end.year + 1)

    # SVP bills overlap on their start/end dates, so pull the end back a day.
    period_end -= timedelta(days=1)

    line_items: List[BillingDatumItemsEntry] = extract_line_items(pdf_text)
    bill_key = hash_bill(service_id, period_start, period_end, cost, demand, used)

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = BytesIO(pdf_file.read())
        uploaded = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="mua.santaclaraca.gov",
            statement=period_end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=statement_dt,
        cost=cost,
        used=used,
        peak=demand,
        items=line_items,
        attachments=[uploaded],
        utility_code=None,
    )
示例#2
0
def parse_pdf(pdf_filename: str, utility: str,
              utility_account_id: str) -> BillingDatum:
    """Parse a bill PDF, upload it to S3, and return the datum with the attachment.

    PDFs whose text contains "Your Energy Bill" are handled by
    ``parse_new_pdf``; every other PDF goes to ``parse_old_pdf``.

    Args:
        pdf_filename: path of the PDF file on local disk.
        utility: utility identifier passed through to the S3 upload.
        utility_account_id: account identifier; used both for the bill hash
            and for the S3 upload.

    Returns:
        The parsed BillingDatum with ``attachments`` replaced by a
        single-element list holding the upload result.
    """
    pdf_text = pdfparser.pdf_to_str(pdf_filename)

    if "Your Energy Bill" not in pdf_text:
        log.info("parsing old-style PDF %s", pdf_filename)
        parsed = parse_old_pdf(pdf_text)
    else:
        log.info("parsing new-style PDF %s", pdf_filename)
        parsed = parse_new_pdf(pdf_text)

    bill_key = hash_bill(
        utility_account_id,
        parsed.start,
        parsed.end,
        parsed.cost,
        parsed.peak,
        parsed.used,
    )

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = BytesIO(pdf_file.read())
        # the parsed data carries no separate statement date here; the
        # period end is used for the upload
        uploaded = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="www.duke-energy.com",
            statement=parsed.end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return parsed._replace(attachments=[uploaded])
示例#3
0
    def get_bills(self, utility: str,
                  utility_account_id: str) -> List[BillingDatum]:
        """Scrape, print to PDF, and upload every listed bill within the date range.

        For each bill link whose date lies in ``[self.start_date, self.end_date]``
        the bill view is opened, the service period, cost, kWh usage and kW
        demand are read from the page, the bill image is printed to a PDF in
        ``self.download_dir``, and that PDF is uploaded to S3.

        Args:
            utility: utility identifier passed through to the S3 upload.
            utility_account_id: account identifier; used for the bill hash and
                for the S3 upload.

        Returns:
            One BillingDatum per processed bill, in the order processed.

        Raises:
            Exception: if the printed-PDF check fails for a bill.
        """
        billing_data = []

        # Collect the dates shown on the bill links of the bills table.
        available_dates = self.driver.find_elements(
            By.CSS_SELECTOR, "table.table-alt a.bill-view-link")
        available_dates = [parse_date(i.text).date() for i in available_dates]
        log.info("available dates: %s",
                 [dt.strftime("%Y-%m-%d") for dt in available_dates])

        xpath_locators = {
            # cost: third cell of the rows in the table containing 'NEW CHARGES'
            "cost":
            "//table[contains(., 'NEW CHARGES')]/tbody/tr/td[3]",
            # used: every cell containing 'KWH' in the table containing 'USAGE'
            "used":
            "(//table[contains(.,'USAGE')]//tr/td[contains(., 'KWH')])",
            # usage_kw: cells of the same table containing 'KW' but not 'KWH'
            "usage_kw":
            "//table[contains(.,'USAGE')]//tr/td[contains(.,'KW') and not(contains(.,'KWH'))]",
        }

        # loop through dates in table in ascending order
        # (presumably the page lists newest first, hence reversed() -- verify)
        for pdf_date in reversed(available_dates):
            # skip if the date isn't in the specified range
            if not (self.start_date <= pdf_date <= self.end_date):
                log.debug("skipping date outside range: %s", pdf_date)
                continue

            # Re-locate the link by its visible date text and open the bill view.
            view_bill_link = self.driver.find_element_by_xpath(
                '//a[.="%s"]' % pdf_date.strftime("%m/%d/%Y"))
            scroll_to(self.driver, view_bill_link)

            self.driver.sleep(0.5)
            view_bill_link.click()

            # Wait (up to 30s) for the bill image container to become visible.
            self.driver.wait(30).until(
                EC.visibility_of_element_located(
                    (By.CSS_SELECTOR, "div.billImage")))

            # Each value stays None unless it is found on the page below.
            start_date = None
            end_date = None
            cost = None
            used = None
            peak = None

            dates_line_text: str = self.driver.find_element_by_xpath(
                "//td[contains(., 'Service From:')]").text
            # The service period line carries month and day only, no year.
            dates_match = re.search(
                r"Service From: (?P<from>\w+ \d\d) to (?P<to>\w+ \d\d) \(\d\d Days\)",
                dates_line_text,
            )

            if dates_match:
                # if from month is December, use previous year
                # (the "to" date always takes the year of the bill link date)
                year = (pdf_date.year -
                        1 if "dec" in dates_match.group("from").lower() else
                        pdf_date.year)
                start_date = parse_date("%s %s" %
                                        (dates_match.group("from"), year))
                end_date = parse_date(
                    dates_match.group("to") + pdf_date.strftime(" %Y"))

            cost_match = self.driver.find(xpath_locators["cost"], xpath=True)
            if cost_match:
                # strip the currency symbol and thousands separators
                cost = cost_match.text
                cost = float(cost.replace("$", "").replace(",", ""))

            kwh_usages = []
            for match in self.driver.find_all(xpath_locators["used"],
                                              xpath=True):
                # include only rows that have reading values as siblings;
                # this excludes credit line items
                parent = match.find_element_by_xpath("..")
                # meter number, previous reading, current reading
                # (only the first three cells of the row are inspected)
                readings_text = ""
                for idx, child in enumerate(
                        parent.find_elements_by_xpath(".//td")):
                    log.debug("\t%s\t%s", idx, child.text.strip())
                    readings_text += child.text.strip()
                    if idx == 2:
                        break
                if not readings_text:
                    log.info("skipping non-reading line item: %s", parent.text)
                    continue
                kwh_value = float(
                    match.text.replace("KWH", "").replace(",", "").strip())
                kwh_usages.append(kwh_value)

            # Usage is the sum of all kWh readings found.
            if kwh_usages:
                used = sum(kwh_usages)

            kw_usages = []
            for usage_kw_match in self.driver.find_all(
                    xpath_locators["usage_kw"], xpath=True):
                kw_usages.append(
                    float(
                        usage_kw_match.text.replace("KW",
                                                    "").replace(",",
                                                                "").strip()))

            # Peak is the largest kW value found.
            if kw_usages:
                peak = max(kw_usages)

            # NOTE(review): end_date is still None when the "Service From"
            # regex did not match, so the subtraction below would raise
            # TypeError -- confirm whether such bills can occur.
            # The end date is pulled back one day and doubles as the statement date.
            data = BillingDatum(
                start=start_date,
                end=end_date - timedelta(days=1),
                statement=end_date - timedelta(days=1),
                cost=cost,
                peak=peak,
                used=used,
                items=None,
                attachments=None,
                utility_code=None,
            )

            # Open the printable bill image and switch to the newest window handle.
            self.driver.find("a#billImageToPrint").click()
            self.driver.sleep(1)
            self.driver.switch_to.window(self.driver.window_handles[-1])

            # the filename of the printed pdf is f"{current page title}.pdf"
            self.driver.execute_script("window.print();")

            # NOTE(review): the result of file_exists_in_dir is discarded and it
            # is not passed to a driver wait here -- confirm that the call
            # itself blocks/raises when the file is missing.
            try:
                file_exists_in_dir(directory=self.download_dir,
                                   pattern=r"^Bill View Bill Image.pdf$")
            except Exception:
                raise Exception("Unable to download file for %s" % pdf_date)

            # Rename the printed file so the next bill does not overwrite it.
            curr_path = os.path.join(self.download_dir,
                                     "Bill View Bill Image.pdf")
            new_path = os.path.join(
                self.download_dir, f"bill_{pdf_date.strftime('%Y-%m-%d')}.pdf")
            os.rename(curr_path, new_path)

            log.info("parsed bill for %s - %s", data.start, data.end)

            # Close the print view and switch to the last remaining window handle.
            self.driver.find("a#close").click()
            self.driver.sleep(1)
            self.driver.switch_to.window(self.driver.window_handles[-1])
            self.driver.sleep(1)

            # upload PDF:
            key = hash_bill(
                utility_account_id,
                data.start,
                data.end,
                data.cost,
                data.peak,
                data.used,
            )

            with open(new_path, "rb") as pdf_data:
                attachment_entry = upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="www.duke-energy.com",
                    statement=data.end,
                    utility=utility,
                    utility_account_id=utility_account_id,
                )

            # Attach the upload result only when it is truthy.
            if attachment_entry:
                data = data._replace(attachments=[attachment_entry])

            billing_data.append(data)

            # Click Bill Information in breadcrumbs to go back to bills list page
            self.driver.find("a#billInformation").click()

        return billing_data
def extract_bill_data(pdf_filename, service_id, utility,
                      utility_account_id) -> Optional[BillingDatum]:
    """Build a BillingDatum from a downloaded bill PDF.

    Current charges, energy usage and on-/off-peak demand are read from the
    PDF text with regular expressions; the billing period comes from
    ``extract_bill_period``. When the ``S3_BILL_UPLOAD`` feature is enabled
    the PDF is also uploaded to S3 and attached to the datum.

    Args:
        pdf_filename: path of the PDF file on local disk.
        service_id: service identifier used as the first input of the bill hash.
        utility: utility identifier passed through to the S3 upload.
        utility_account_id: account identifier passed through to the S3 upload.

    Returns:
        The BillingDatum, or None when the file cannot be parsed as a PDF.
    """
    try:
        text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    # Text between "Current Charges" and "Cycle"; DOTALL lets it span lines.
    charges_section = re.search(
        "Current Charges(.*?)Cycle", text, re.DOTALL).group(1)
    for raw_line in charges_section.split("\n"):
        candidate = raw_line.strip()
        # every numeric-looking line overwrites the previous one, so the
        # last number in the section wins
        if re.match(r"[\d,\.]", candidate):
            current_charges = candidate.replace(",", "")

    period_start, period_end = extract_bill_period(pdf_filename)

    usage_match = re.search(r"Energy Charges \((\d*) kWh\)", text)
    usage = usage_match.groups()[0]

    on_peak_match = re.search(r"On-Peak Demand \((\d+\.\d+)\ KW", text)
    on_peak_demand = on_peak_match.groups()[0]

    off_peak_match = re.search(r"Off-Peak Demand \((\d+\.\d+)\ KW", text)
    offpeak_demand = off_peak_match.groups()[0]

    bill_attachment = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        with open(pdf_filename, "rb") as pdf_file:
            # the hash is computed with a peak of 0
            key = hash_bill(
                service_id,
                period_start,
                period_end,
                _format_number(current_charges),
                0,
                _format_number(usage),
            )
            # no statement date; use end date
            uploaded = upload_bill_to_s3(
                pdf_file,
                key,
                source="portlandgeneral.com",
                statement=period_end,
                utility=utility,
                utility_account_id=utility_account_id,
            )
            bill_attachment.append(uploaded)
            log.info("Uploaded bill %s to s3", bill_attachment)

    cost = _format_number(current_charges)
    used = _format_number(usage)
    # peak demand is the larger of the on-peak and off-peak values
    peak = max(float(on_peak_demand), float(offpeak_demand))

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=cost,
        used=used,
        peak=peak,
        items=[],
        attachments=bill_attachment,
        utility_code=None,
    )
示例#5
0
def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    """Parse a City of Poway water bill PDF into a BillingDatum.

    The read period, statement date, consumption and water charges are
    extracted from the PDF text with regular expressions. When the
    ``S3_BILL_UPLOAD`` feature is enabled the PDF is uploaded to S3 and
    attached to the datum.

    Args:
        pdf_filename: path of the PDF file on local disk.
        account_id: utility account identifier; used for the bill hash and
            for the S3 upload.

    Returns:
        A BillingDatum whose end date is one day before the parsed read end
        date.

    Raises:
        InvalidMeterDataException: if the dates, the usage or the cost cannot
            be found in the PDF text.
    """
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"

    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    # read period, then due date and statement date with no separator
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})")

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(
            f"Couldn't parse dates from pdf: {text}")

    _dates = dates_match.group("read_date_start", "read_date_end",
                               "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        # Fixed: the "f" prefix was inside the quotes, so the message began
        # with a stray "f" and {text} was never interpolated.
        raise InvalidMeterDataException(
            f"Couldn't parse usage from pdf: {text}")

    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(
            f"Couldn't parse cost from pdf: {text}")

    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    if config.enabled("S3_BILL_UPLOAD"):
        # the hash uses the unadjusted read end date and a peak of 0
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachments = [
                upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="customerconnect.poway.org",
                    statement=statement_date,
                    utility="utility:city-of-poway",
                    utility_account_id=account_id,
                )
            ]
    else:
        attachments = []
    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )
示例#6
0
    def download_bills(
        self,
        latest: date,
        utility_account: str,
        utility: str,
        gen_utility: Optional[str] = None,
        gen_utility_account_id: Optional[str] = None,
    ) -> List[BillPdf]:
        """Download bill PDFs newer than ``latest`` and upload each one to S3.

        Opens the billing history widget, expands it, and walks its panels.
        Payment panels are skipped. For every bill panel the issue date and
        cost are read from the page; bills whose approximate end date is not
        after ``latest`` are skipped. The remaining bills are downloaded,
        uploaded to S3, and recorded.

        Args:
            latest: bills whose approximate end date is on or before this
                date are treated as already downloaded.
            utility_account: utility account identifier recorded with each bill.
            utility: utility identifier passed to the S3 upload.
            gen_utility: optional generation utility identifier passed to the
                S3 upload.
            gen_utility_account_id: optional generation utility account
                identifier passed to the S3 upload and recorded with each bill.

        Returns:
            One BillPdf per bill that was downloaded and uploaded.
        """
        pdfs: List[BillPdf] = []
        log.info("Opening billing history")

        click(self._driver, css_selector="#arrowBillPaymentHistory")

        self.wait_until_ready(self.BillingHistoryTableSel)
        self._driver.screenshot(
            BaseWebScraper.screenshot_path("bill history arrow"))
        wait_for_block_overlay(self._driver)

        log.info("Clicking 'view up to..' link")

        click(self._driver, css_selector=self.ViewMoreHistorySel)
        self.wait_until_ready(self.BillingHistoryTableSel)

        self._driver.screenshot(BaseWebScraper.screenshot_path("panels"))

        panels_count = len(
            self._driver.find_elements_by_css_selector(self.PanelxSel))
        log.info(f"found {panels_count} panels in billing widget")

        # Rather than get all matching elements and iterate through, use index
        # and manually get element each time to help avoid stale element errors
        for i in range(panels_count):
            panel = self._driver.find_elements_by_css_selector(
                self.PanelxSel)[i]

            # check if is a payment panel
            panel_header = panel.find_element_by_css_selector(".panel-title")
            header_text = panel_header.text
            if "Payment" in header_text:
                log.debug(f"Skipping panel {i} (payment)")
                # skip if is a payment panel
                continue

            log.debug(f"Processing panel {i} (bill): {header_text}")

            link_elem = panel.find_element_by_css_selector(
                "div.pge_coc-dashboard-viewPay_billed_history_panel_viewBill_para_block"
                " a.viewBill")
            # Get date from the "data-date" attribute on link to download bill...
            # data-date is in milliseconds
            timestamp = int(link_elem.get_attribute("data-date")) / 1000.0

            # when bill was issued
            bill_date = datetime.fromtimestamp(timestamp).date()
            # bill issued about a week after end date; use this window to match dates
            approx_bill_end = bill_date - timedelta(days=7)
            approx_bill_start = approx_bill_end - timedelta(days=30)
            log.debug(f"bill date={bill_date}")

            # cost is in second column
            cost_text = panel.find_element_by_css_selector(
                "td.text-right").text
            log.debug(f"cost text={cost_text}")
            # cost with $ and commas: $1,234.56 or -$1,234.56
            cost = float(cost_text.replace("$", "").replace(",", ""))

            log.info(f"Found bill issued {bill_date} with cost ${cost}")

            if approx_bill_end <= latest:
                log.info(
                    f"ignoring bill, date: {approx_bill_end} already download")
                continue

            try:
                click(self._driver, elem=link_elem)
            except ElementNotInteractableException:
                # the first link can be hidden; fall back to the one in the
                # bill summary container
                log.info("Download link not visible; looking for other")

                link_elem = panel.find_element_by_css_selector(
                    "div#billSummaryContainer a.viewBill")

                click(self._driver, elem=link_elem)
            except ElementClickInterceptedException as exc:
                log.info("download link failed: %s %s", exc, exc.msg)
                close_modal(self._driver)
                continue

            # The expected file name is built from characters 6-9 of the
            # first "-"-separated part of the account id plus the issue date.
            last4 = self.account_id.split("-")[0][6:10]
            filename = f"{last4}custbill{bill_date.strftime('%m%d%Y')}.pdf"
            download_dir = "%s/current" % config.WORKING_DIRECTORY

            try:
                self._driver.wait(60).until(
                    file_exists_in_dir(
                        # end pattern with $ to prevent matching filename.crdownload
                        directory=download_dir,
                        pattern=f"^{filename}$",
                    ))
            except TimeoutException:
                log.error(
                    f"ERROR waiting for file {filename} to download...skipping"
                )
                # close the download failed modal if there is one
                close_modal(self._driver)
                continue

            with open("%s/%s" % (download_dir, filename), "rb") as f:
                # usage and demand are not known at this point, so the hash
                # is built from the approximate period and the cost only
                key = hash_bill(self.account_id, approx_bill_start,
                                approx_bill_end, cost, "", "")

                upload_bill_to_s3(
                    file_handle=f,
                    key=key,
                    source="pge.com",
                    statement=bill_date,
                    utility=utility,
                    utility_account_id=utility_account,
                    gen_utility=gen_utility,
                    gen_utility_account_id=gen_utility_account_id,
                )

            log.info(f"Uploaded {filename} to {key}")
            pdfs.append(
                BillPdf(
                    utility_account_id=utility_account,
                    # Fixed: this previously passed gen_utility (the utility
                    # identifier) as the generation account id.
                    gen_utility_account_id=gen_utility_account_id,
                    start=approx_bill_start,
                    end=approx_bill_end,
                    statement=bill_date,
                    s3_key=key,
                ))

        return pdfs
示例#7
0
    def _execute(self):
        """Log in, download bill PDFs, then parse and upload each downloaded file.

        The browser part logs in (retrying the initial page load once), opens
        the bill history, solves the captcha, selects the configured account
        and downloads the bills for ``self.start_date`` to ``self.end_date``.
        Afterwards every PDF in the working directory is parsed, uploaded to
        S3, and attached to the bills parsed from it. A later bill with the
        same start date replaces the earlier one and inherits its attachments.

        Returns:
            Results holding the collected bills.

        Raises:
            Exception: when the site reports too many sessions or the captcha
                cannot be solved (the session is logged out first).
        """
        # Direct the driver to the login page
        self._driver.get(self.login_url)

        # Page helpers, all sharing the same driver
        login_page = LoginPage(self._driver)
        my_account_page = MyAccountPage(self._driver)
        bill_history_page = BillHistoryPage(self._driver)

        try:
            login_page.wait_until_ready()
        except Exception:
            self.screenshot("initial page load failed")
            # one retry of the initial page load
            self._driver.get(self.login_url)
            login_page.wait_until_ready()
        login_page.login(self.username, self.password)
        self.screenshot("after login")

        my_account_page.wait_until_ready()
        my_account_page.navigate_to_bill_history()
        self.screenshot("bill history")

        if bill_history_page.too_many_sessions():
            # waiting 5 minutes doesn't seem to help
            bill_history_page.logout()
            raise Exception("too many sessions")
        bill_history_page.wait_until_ready()
        self.screenshot("after captcha")
        captcha_solved = bill_history_page.solve_captcha()
        if not captcha_solved:
            bill_history_page.logout()
            raise Exception("captcha failed")

        bill_history_page.wait_until_bills_ready()
        bill_history_page.select_account(
            self._configuration.utility_account_id,
            self._configuration.account_name)
        bill_history_page.wait_until_bills_ready()
        bill_history_page.download_bills(self.start_date, self.end_date)
        bill_history_page.logout()

        # From here on: collect the bills from the download directory and parse
        bills: List[BillingDatum] = []
        download_dir = f"{config.WORKING_DIRECTORY}/current"

        log.info("Waiting for downloads to finish")
        # poll once a second until no partial download file remains
        while any(".pdf.crdownload" in name
                  for name in os.listdir(download_dir)):
            time.sleep(1)

        seen_starts: Set[date] = set()
        for filename in sorted(os.listdir(download_dir)):
            if ".pdf" not in filename:
                continue

            pdf_path = f"{download_dir}/{filename}"
            log.info("parsing file %s" % filename)
            parsed_bills = parse_pdf(pdf_path, self.meter_number,
                                     self.commodity)
            log.info(f"filename {filename} bills={parsed_bills}")
            if not parsed_bills:
                log.warning(f"no billing datum: filename={filename}")
                continue

            with open(pdf_path, "rb") as pdf_data:
                # the first parsed bill supplies the hash inputs and the
                # statement date for the whole file
                first_bill = parsed_bills[0]
                key = hash_bill(
                    self._configuration.utility_account_id,
                    first_bill.start,
                    first_bill.end,
                    first_bill.cost,
                    first_bill.peak,
                    first_bill.used,
                )
                attachment_entry = upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="www.ladwp.com",
                    statement=first_bill.end,
                    utility="utility:ladwp",
                    utility_account_id=self._configuration.utility_account_id,
                )

            for bill in parsed_bills:
                attachments = [attachment_entry]
                if bill.start in seen_starts:
                    # a bill with this start date was already collected:
                    # replace it with the new one
                    superseded = [
                        b for b in bills if b.start == bill.start
                    ][0]
                    log.info(
                        "duplicate bill start: prev_bill = %s, bill = %s",
                        superseded,
                        bill,
                    )
                    bills.remove(superseded)
                    # carry the replaced bill's attachments over
                    attachments.extend(superseded.attachments)
                bills.append(bill._replace(attachments=attachments))
                seen_starts.add(bill.start)

        return Results(bills=bills)