def _execute(self):
    # MeterWatch immediately spawns a popup when loaded, which is the actual
    # window we want. So we have to grab the main window handle and
    # THEN go looking for the popup window and switch to it.
    handles_before = self._driver.window_handles
    timeline = Timeline(self.start_date, self.end_date)
    main_window = _get_main_window(self._driver)
    login_window = None

    log.info(f"Navigating to {self.url}")
    self._driver.get(self.url)
    self.screenshot("initial_url")
    log.debug("Driver title: " + self._driver.title)
    assert "Seattle MeterWatch" in self._driver.title

    login_page = LoginPage(self._driver)
    meterdata_page = MeterDataPage(self._driver, self._configuration)

    login_page.login(self.username, self.password)
    self._driver.wait().until(
        lambda driver: len(handles_before) != len(driver.window_handles),
        "Issues loading login page.",
    )
    for handle in self._driver.window_handles:
        if handle != main_window:
            login_window = handle

    # We have our popup, so let's do stuff with it.
    log.info("switching to new window")
    self._driver.switch_to.window(login_window)
    # Resize: it opens as a tiny window.
    self._driver.set_window_size(1200, 800)

    for meter_number in self._configuration.meter_numbers:
        meterdata_page.select_account(meter_number)
        self.start_date, self.end_date = meterdata_page.adjust_start_and_end_dates(
            self.start_date, self.end_date
        )
        # Widen the timeline if necessary, since the dates may have been
        # adjusted from the original range.
        timeline.extend_timeline(self.start_date, self.end_date)
        date_range = DateRange(self.start_date, self.end_date)
        interval_size = relativedelta(days=MAX_DOWNLOAD_DAYS)
        for sub_range in date_range.split_iter(delta=interval_size):
            meterdata_page.enter_dates(sub_range.start_date, sub_range.end_date)
            csv_file_path = meterdata_page.download_data(meter_number)
            log.info(f"parsing kWh usage from downloaded data for {meter_number}")
            self._process_csv(csv_file_path, timeline)

    return Results(readings=timeline.serialize(include_empty=False))
def process_partial_bills(self):
    """Primary method. Goes through billing_data and uploads new partial bills
    directly to the partial bills table.

    If a new partial bill differs from an existing partial bill, a new partial
    bill is created, rather than overwriting the old one.
    """
    # Run initial validation of all the partial bills. Failures are caught
    # and the scraper run is marked as FAILED.
    try:
        PartialBillValidator(self.billing_data).run_prevalidation()
    except (OverlappedBillingDataDateRangeError, NoFutureBillsError):
        return Status.FAILED

    # Snap the start date of the first new bill, if applicable.
    self.billing_data = snap_first_start(self.billing_data, self.haves)

    for pending_partial in self.billing_data:
        found = False
        for existing_partial in self.haves:
            existing_cycle = DateRange(
                existing_partial.initial, existing_partial.closing
            )
            pending_cycle = DateRange(pending_partial.start, pending_partial.end)

            if existing_cycle == pending_cycle:  # Cycles match exactly.
                if (
                    existing_partial.differs(pending_partial)
                    and not self._bad_override_detected(
                        existing_partial, pending_partial
                    )
                    and not self._existing_is_manual(
                        existing_partial, pending_partial
                    )
                ):
                    # Mark the old partial bill as superseded
                    # and add a new partial bill.
                    self._supersede(existing_partial, pending_partial)
                found = True
                break
            elif existing_cycle.intersects(
                pending_cycle
            ):  # Cycle does not match exactly, but intersects.
                if not self._existing_is_manual(existing_partial, pending_partial):
                    # Create a new partial bill and supersede the old one.
                    self._supersede(existing_partial, pending_partial)
                    found = True

        if not found:
            # Pending partial bill does not already exist, so we stage a new one.
            pb = PartialBill.generate(
                self.meter.utility_service, self.bill_type, pending_partial
            )
            self.staged_partial.append(pb)

    return Status.SUCCEEDED if self.staged_partial else Status.COMPLETED
def _execute(self):
    # Direct the driver to the site url.
    # Currently a public URL, no credentials needed. Will have to be
    # refactored in the future if we start scraping private sites.
    self._driver.get(self.site_url)

    # Create page helpers
    overview_page = OverviewPage(self._driver)
    site_analytics_page = SiteAnalyticsPage(self._driver)
    date_picker_component = DatePickerSection(self._driver)

    # Navigate to the site analytics tab
    overview_page.wait_until_ready()
    self.screenshot("before clicking on site analytics tab")
    overview_page.navigate_to_site_analytics()

    # Select the inverter from both dropdowns
    site_analytics_page.wait_until_ready()
    self.screenshot("before selecting inverters")
    site_analytics_page.select_inverter_from_both_dropdowns(self.inverter_id)

    # Click on the AC Power button
    self.screenshot("before clicking on ac power button")
    site_analytics_page.click_ac_power_button()
    self.screenshot("after clicking on ac power button")

    self.install_date = self.string_to_date(site_analytics_page.get_install_date())

    # Adjust start and end dates, depending on the inverter install date
    self.adjust_start_and_end_dates()

    date_range = DateRange(self.start_date, self.end_date)
    interval_size = relativedelta(days=MAX_INTERVAL_LENGTH)

    # Loop through the desired interval in two-day chunks to pull down
    # the power generated.
    for sub_range in date_range.split_iter(delta=interval_size):
        start = sub_range.start_date
        end = sub_range.end_date
        file_path = date_picker_component.complete_form_and_download(start, end)
        intermediate_readings = CSVParser(self.inverter_id, file_path).process_csv()
        self.readings.update(intermediate_readings)
        log.info("Cleaning up download.")
        clear_downloads(self._driver.download_dir)
        # Adding a large pause
        self._driver.sleep(5)

    return Results(readings=self.readings)
def datafeed(
    account: SnapmeterAccount,
    meter: Meter,
    datasource: SnapmeterMeterDataSource,
    params: dict,
    task_id: Optional[str] = None,
) -> Status:
    meta = datasource.meta or {}
    configuration = Configuration(
        mvweb_id=meta.get("mvWebId"), interval=meter.interval
    )
    # Reduce load on MVWeb servers: skip if the meter has data from within the
    # last 3 days and there are no gaps.
    max_reading = meter.readings_range.max_date or date.today() - timedelta(days=365)
    interval_age = (date.today() - max_reading).days
    date_range = DateRange(
        *iso_to_dates(params.get("data_start"), params.get("data_end"))
    )
    # The freshest data we can expect is 3 days old.
    date_range = DateRange(
        date_range.start_date,
        min(date_range.end_date, date.today() - timedelta(days=3)),
    )
    expected = (date_range.end_date - date_range.start_date).days + 1
    days_with_data = (
        db.session.query(MeterReading)
        .filter(
            MeterReading.meter == meter.oid,
            MeterReading.occurred >= date_range.start_date,
            MeterReading.occurred <= date_range.end_date,
        )
        .count()
    )
    log.info(
        "days with data from %s - %s = %s",
        date_range.start_date,
        date_range.end_date,
        days_with_data,
    )
    if interval_age <= 3 and days_with_data == expected:
        log.info(
            "skipping MVWeb run: meter %s has recent interval data (%s) and no gaps",
            meter.oid,
            max_reading,
        )
        return Status.SKIPPED

    return run_datafeed(
        LADWPMVWebScraper,
        account,
        meter,
        datasource,
        params,
        configuration=configuration,
        task_id=task_id,
    )
def test_scraper(
    utility_account: str,
    service_id: str,
    account_group: str,
    account_number: str,
    start_date: date,
    end_date: date,
    username: str,
    password: str,
):
    """Launch a Chrome browser to test the scraper."""
    configuration = PortlandBizportalConfiguration(
        utility="utility:portland-ge",
        utility_account_id=utility_account,
        account_group=account_group,
        bizportal_account_number=account_number,
        service_id=service_id,
    )
    credentials = Credentials(username, password)
    scraper = PortlandBizportalScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    with mock.patch("datafeeds.scrapers.pge.bill_pdf.upload_bill_to_s3"):
        scraper.scrape(
            readings_handler=None,
            bills_handler=ft.partial(
                test_upload_bills, -1, service_id, None, "portland-bizportal"
            ),
            partial_bills_handler=None,
            pdfs_handler=None,
        )
    scraper.stop()
def energy_manager_date_range(self, min_start_date):
    if self.start_date:
        start_date = date(
            year=self.start_date.year, month=self.start_date.month, day=1
        )
    else:
        start_date = min_start_date

    if self.end_date:
        end_date = date(year=self.end_date.year, month=self.end_date.month, day=1)
    else:
        today = date.today()
        end_date = date(year=today.year, month=today.month, day=1)

    if start_date > end_date:
        msg = "The scraper start date must be before the end date (start={}, end={})".format(
            start_date, end_date
        )
        raise sce_errors.BillingDataDateRangeException(msg)

    if end_date < min_start_date:
        msg = "No billing data is available for the range {} to {}.".format(
            start_date, end_date
        )
        raise sce_errors.BillingDataDateRangeException(msg)

    if start_date < min_start_date:
        log.info("Adjusting start date to minimum start date: %s", min_start_date)
        start_date = min_start_date

    date_range = DateRange(start_date, end_date)
    return date_range
def test_scraper(
    meter_number: str,
    start_date: date,
    end_date: date,
    username: str,
    password: str,
):
    configuration = LADWPBillPdfConfiguration(
        meter_number=meter_number,
        utility_account_id=meter_number,
        commodity="False",
        account_name=None,
    )
    credentials = Credentials(username, password)
    scraper = LADWPBillPdfScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        bills_handler=ft.partial(
            test_upload_bills, -1, meter_number, None, "ladwp-bill-pdf"
        ),
        partial_bills_handler=None,
        readings_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()
def test_scraper(
    service_id: str,
    gen_service_id: str,
    utility_account_id: str,
    start_date: date,
    end_date: date,
    username: str,
    password: str,
):
    is_partial = gen_service_id is not None
    configuration = SceReactBasicBillingConfiguration(
        service_id=service_id,
        gen_service_id=gen_service_id,
        utility_account_id=utility_account_id,
        scrape_partial_bills=is_partial,
        scrape_bills=not is_partial,
    )
    credentials = Credentials(username, password)
    scraper = SceReactBasicBillingScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    fixture = setup_fixture().get(service_id)
    if fixture:
        scraper.utility_service = fixture["initial"]
        set_tariff_mock = MagicMock()
        set_tariff_mock.return_value = mock_set_tariff_from_utility_code
        scraper.utility_service.set_tariff_from_utility_code = set_tariff_mock
    scraper.start()
    scraper.scrape(
        bills_handler=ft.partial(
            test_upload_bills, -1, service_id, None, "sce-react-basic-billing"
        ),
        partial_bills_handler=ft.partial(test_upload_partial_bills, None, None, None),
        readings_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()

    if fixture:
        print("field\tactual\texpected\tmatch?")
        fields = [
            "service_id",
            "tariff",
            "utility_account_id",
            "gen_service_id",
            "gen_tariff",
            "gen_utility",
            "gen_utility_account_id",
            "provider_type",
        ]
        matches = []
        for field in fields:
            actual = getattr(scraper.utility_service, field)
            expected = getattr(fixture["expected"], field)
            print(f"{field}\t{actual}\t{expected}\t{actual == expected}")
            if actual == expected:
                matches.append(field)
        if matches == fields:
            print("\nOK")
        else:
            print(f"\nFAILED: mismatches = {set(fields) - set(matches)}")
def test_scraper(self):
    """The Synchronizer can extract partial bill data from the SMD tables."""
    self.add_customer_info("12345", "ABCDE")
    self.add_bill("ABCDE", datetime(2020, 1, 1), timedelta(days=30))
    self.add_bill("ABCDE", datetime(2020, 2, 1), timedelta(days=28))
    self.add_bill("ABCDE", datetime(2020, 3, 1), timedelta(days=30))

    config = SmdPartialBillingScraperConfiguration(self.meter)
    scraper = SmdPartialBillingScraper(
        Credentials(None, None),
        DateRange(date(2019, 12, 1), date(2020, 5, 1)),
        configuration=config,
    )
    results = scraper._execute()
    self.assertEqual(3, len(results.tnd_bills))

    # Perform a quick sanity check that we found the right bill dates.
    # Conversion from an SMD bill to a billing datum is tested elsewhere in depth.
    # Note: Dates intentionally do not line up with SMD records; this ensures
    # the dates agree with PDF bill data.
    expected = [
        (date(2020, 1, 2), date(2020, 1, 31)),
        (date(2020, 2, 2), date(2020, 2, 29)),
        (date(2020, 3, 2), date(2020, 3, 31)),
    ]
    actual = [(b.start, b.end) for b in results.tnd_bills]
    self.assertEqual(expected, actual)
def test_scraper(
    service_id: str,
    address: str,
    start_date: date,
    end_date: date,
    username: str,
    password: str,
):
    configuration = SceReactEnergyManagerGreenButtonConfiguration(
        service_id=service_id,
        meta={"serviceAccountAddress": address},
        meter=MagicMock(),
    )
    credentials = Credentials(username, password)
    scraper = SceReactEnergyManagerGreenButtonScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    with mock.patch("datafeeds.scrapers.pge.bill_pdf.upload_bill_to_s3"):
        scraper.scrape(
            readings_handler=ft.partial(test_upload_readings, None),
            bills_handler=None,
            pdfs_handler=None,
            partial_bills_handler=None,
        )
    scraper.stop()
def setUp(self):
    super().setUp()
    config = Configuration()
    self.scraper = powertrack.PowerTrackScraper(
        credentials=None,
        date_range=DateRange(self.start_date.date(), self.end_date.date()),
        configuration=config,
    )
def setUp(self):
    super().setUp()
    config = Configuration()
    config.site_name = "Xilinx - San Jose"
    self.scraper = bloom_interval.BloomScraper(
        credentials=None,
        date_range=DateRange(self.start_date, self.end_date),
        configuration=config,
    )
def setUp(self):
    super().setUp()
    config = Configuration()
    config.account_id = "s12345"
    config.meter_id = "2a566973506457484a43554b772b71553d-1"
    self.scraper = nautilus.NautilusScraper(
        credentials=None,
        date_range=DateRange(self.start_date.date(), self.end_date.date()),
        configuration=config,
    )
def test_scraper(
    account_number: str, start_date: date, end_date: date, username: str, password: str
):
    configuration = FPLMyAccountConfiguration(account_number=account_number)
    credentials = Credentials(username, password)
    scraper = FPLMyAccountScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        readings_handler=print,
        bills_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()
def test_scraper(
    username: str, password: str, account_id: str, start_date: date, end_date: date
):
    configuration = PowayWaterConfiguration(account_id)
    credentials = Credentials(username, password)
    scraper = PowayWaterScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        bills_handler=ft.partial(test_upload_bills, -1, account_id, None),
        partial_bills_handler=None,
        readings_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()
def test_dst_data(self):
    date_range = DateRange(date(2020, 10, 31), date(2020, 11, 6))
    timeline = Timeline(date_range.start_date, date_range.end_date, 15)
    scraper = heco.HECOScraper(
        Credentials(None, None),
        date_range,
        HECOGridConfiguration(meter_id=123, interval=15),
    )
    scraper._process_csv("datafeeds/scrapers/tests/fixtures/mvweb_dst.csv", timeline)
    with open("datafeeds/scrapers/tests/fixtures/mvweb_dst_expected.json") as f:
        expected = json.loads(f.read())
    self.assertEqual(expected, timeline.serialize())
def test_scraper(
    point_id: str, start_date: date, end_date: date, username: str, password: str
):
    configuration = SVPIntervalConfiguration(point_id=point_id)
    credentials = Credentials(username, password)
    scraper = SVPIntervalScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        readings_handler=print,
        bills_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()
def test_scraper(
    username: str, password: str, account_id: str, start_date: date, end_date: date
):
    configuration = DukeBillingConfiguration("utility:duke-carolinas-nc", account_id)
    credentials = Credentials(username, password)
    scraper = DukeBillingScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        bills_handler=ft.partial(
            test_upload_bills, -1, account_id, None, "duke-energy-billing"
        ),
        partial_bills_handler=None,
        readings_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()
def test_scraper(
    service_id: str, start_date: date, end_date: date, username: str, password: str
):
    configuration = SVPBillingConfiguration(
        utility="utility:default", utility_account_id="12345", service_id=service_id
    )
    credentials = Credentials(username, password)
    scraper = SVPBillingScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        readings_handler=None,
        bills_handler=print,
        pdfs_handler=None,
    )
    scraper.stop()
def test_scraper(
    utility_account: str, start_date: date, end_date: date, username: str, password: str
):
    configuration = PgeBillPdfConfiguration(
        utility="pge",
        utility_account=utility_account,
        gen_utility=None,
        gen_utility_account_id=None,
        datasource=MagicMock(),
    )
    credentials = Credentials(username, password)
    scraper = PgeBillPdfScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    with mock.patch("datafeeds.scrapers.pge.bill_pdf.upload_bill_to_s3"):
        scraper.scrape(
            readings_handler=None,
            bills_handler=None,
            pdfs_handler=ft.partial(test_pdf_upload, None),
        )
    scraper.stop()
def test_fall_daylight_savings(self):
    """Test that Fall DST values are not double counted."""
    date_range = DateRange(date(2020, 11, 1), date(2020, 11, 1))
    timeline = Timeline(date_range.start_date, date_range.end_date, 15)
    scraper = SCLMeterWatchScraper(
        Credentials(None, None),
        date_range,
        SCLMeterWatchConfiguration(meter_numbers=["803441"], meter=self.meter),
    )
    scraper._process_csv(
        "datafeeds/scrapers/tests/fixtures/scl_meterwatch_dst.csv", timeline
    )
    with open(
        "datafeeds/scrapers/tests/fixtures/scl_meterwatch_dst_expected.json"
    ) as f:
        expected = json.loads(f.read())
    self.assertEqual(expected, timeline.serialize())
def test_urjanet_data_range_for_partial_scrapers(self):
    datasource = test_util.FixtureDataSource(
        os.path.join(DATA_DIR, "simple_fixture_input.json")
    )
    transformer = PacificGasElectricTransformer()
    config = BaseUrjanetConfiguration(
        datasource,
        transformer,
        "pge",
        False,
        partial_type=PartialBillProviderType.GENERATION_ONLY,
    )
    date_range = DateRange(date(2020, 1, 1), date(2020, 6, 1))
    scraper = BaseUrjanetScraper(None, date_range, config)
    result = scraper._execute()
    self.assertEqual(
        expected,
        result.generation_bills,
        "partial urjanet scrapers return whatever partial bills we find, regardless of scraped range.",
    )
def test_scraper(
    utility_account_id: str,
    start_date: date,
    end_date: date,
    username: str,
    password: str,
):
    configuration = SaltRiverBillingConfiguration(account_id=utility_account_id)
    credentials = Credentials(username, password)
    scraper = SaltRiverBillingScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        bills_handler=ft.partial(
            test_upload_bills, -1, utility_account_id, None, "saltriver-billing"
        ),
        partial_bills_handler=None,
        readings_handler=None,
        pdfs_handler=None,
    )
    scraper.stop()
def test_scraper(
    utility_account_id: str,
    service_id: str,
    start_date: date,
    end_date: date,
    username: str,
    password: str,
):
    configuration = PepcoIntervalConfiguration(
        utility_account_id=utility_account_id, service_id=service_id, interval=15
    )
    credentials = Credentials(username, password)
    scraper = PepcoIntervalScraper(
        credentials, DateRange(start_date, end_date), configuration
    )
    scraper.start()
    scraper.scrape(
        readings_handler=print,
        bills_handler=None,
        pdfs_handler=None,
        partial_bills_handler=None,
    )
    scraper.stop()
def _execute(self):
    self._driver.get(self.site_url)
    log.info(self._configuration.__dict__)
    log.info(self._configuration.meter_oid)
    interval = self._get_meter_interval()
    log.info("meter interval is %s", interval)

    login_page = LoginPage(self._driver)
    landing_page = LandingPage(self._driver)
    extract_page = DataExtractPage(self._driver)
    if interval == 1440:
        extract_page.IntervalRadio = 'label[for="timeInterval-daily"]'

    login_page.wait_until_ready(login_page.SigninButtonSelector)
    self.screenshot("before login")
    login_page.login(self.username, self.password)
    self.screenshot("after login")

    landing_page.go_to_data_extract()
    log.info("Filling out data extract form")
    self.screenshot("data extract page")

    # Verify dates and break them into date ranges
    start_year = extract_page.get_earliest_year(extract_page)
    self.adjust_start_and_end_dates(start_year)
    date_range = DateRange(self.start_date, self.end_date)
    interval_size = relativedelta(days=MAX_DOWNLOAD_DAYS)

    readings = []
    self._export_data(extract_page, date_range, interval_size, interval=interval)
    if self.timeline:
        readings = self.timeline.serialize()
    return Results(readings=readings)
def energy_manager_basic_usage_action(
    self, page: sce_pages.SceEnergyManagerBasicUsagePage
):
    sce_pages.detect_and_close_survey(self._driver)
    rval = page.select_service_id(self.service_id)
    log.info("Result of select service id %s: %s", self.service_id, rval)
    self.screenshot("select_service_id")
    page.configure_report()

    date_range = DateRange(self.start_date, self.end_date)
    # The website seems to time out when trying to get more than this amount of data.
    interval_size = relativedelta(days=7)
    timeline = Timeline(self.start_date, self.end_date)

    for idx, subrange in enumerate(date_range.split_iter(delta=interval_size)):
        log.info("Requesting interval data for dates: %s", subrange)
        start = subrange.start_date
        end = subrange.end_date
        page.set_time_range(start, end)
        self.screenshot("set_time_range")

        try:
            page.generate_report()
            time.sleep(5)
            WebDriverWait(self._driver, 180).until(
                EC.invisibility_of_element_located(
                    sce_pages.GenericBusyIndicatorLocator
                )
            )
            self.screenshot(f"interval{idx}")
        except Exception as e:
            raise sce_errors.EnergyManagerReportException(
                "Failed to load data from Energy Manager"
            ) from e

        try:
            page.raise_on_report_error()
        except sce_errors.EnergyManagerDataNotFoundException:
            log.info("No data found for this time range, continuing...")
            # If a given date range has no interval data, just move on to the next one.
            continue

        log.info("Downloading the interval data report.")
        self.clear_csv_downloads()
        try:
            page.download_report()
        except Exception as e:
            raise sce_errors.EnergyManagerReportException(
                "Failed to load data from Energy Manager"
            ) from e

        try:
            # Wait two minutes for the download to finish.
            wait = WebDriverWait(self._driver, 120)
            csv_file_name = wait.until(
                file_exists_in_dir(self._driver.download_dir, r".*\.csv")
            )
            csv_file_path = os.path.join(self._driver.download_dir, csv_file_name)
            for reading in parse_sce_csv_file(csv_file_path, self.service_id):
                timeline.insert(reading.dt, reading.value)
        except TimeoutException:
            raise TimeoutException(
                "Downloading interval data from Energy Manager failed."
            )

    self.interval_data_timeline = timeline
def _execute(self):
    # Direct the driver to the login page
    self._driver.get(self.login_url)

    # Create page helpers
    login_page = LoginPage(self._driver)
    navigation = Navigation(self._driver)
    meter_selection_page = MeterSelectionPage(self._driver)
    export_csv_page = ExportCsvPage(self._driver)
    download_csv_page = DownloadCsvPage(self._driver)

    # Authenticate
    login_page.wait_until_ready()
    self.screenshot("before login")
    login_page.login(self.username, self.password)

    # Configure interval data generation, in two steps...
    meter_selection_page.wait_until_ready()
    self.screenshot("before meter selection")

    # 1) Specify that we are entering a custom date range
    meter_selection_page.select_date_range_option()
    self.screenshot("date range option selected")

    # 2) Locate the meter of interest and select it
    matching_meter = None
    meter_query = self.ngrid_meter_id
    log.info("Looking for a meter with ID == {0}".format(meter_query))
    for meter in meter_selection_page.iter_meters():
        log.info("Found a meter: {0}".format(meter))
        if meter.meter_id == meter_query:
            log.info("Found a matching meter.")
            matching_meter = meter
            break

    if matching_meter is None:
        log.info("No meter with ID {0} was found.".format(meter_query))
        raise InvalidMeterException("Meter {0} was not found".format(meter_query))
    else:
        matching_meter.select()
        self.screenshot("meter selected")

    # Two notes on time...
    # 1) Each meter specifies the date range for which data is
    #    available. If we don't respect this, the page will throw
    #    errors. So, we restrict our start and end dates based on
    #    this information.
    if self.start_date < matching_meter.min_date:
        log.info(
            "Adjusting start date from {0} to {1}".format(
                self.start_date, matching_meter.min_date
            )
        )
        self.start_date = matching_meter.min_date
    if self.end_date > matching_meter.max_date:
        log.info(
            "Adjusting end date from {0} to {1}".format(
                self.end_date, matching_meter.max_date
            )
        )
        self.end_date = matching_meter.max_date

    # 2) Only a limited amount of data can be extracted at a time.
    #    The page enforces this by restricting the number of days
    #    for which you can download data. Therefore, we pull down
    #    data in 180-day chunks. The actual restriction is a little
    #    hard to pin down, since it varies based on some nontransparent
    #    factors; 180 days is a very conservative estimate.
    date_range = DateRange(self.start_date, self.end_date)
    interval_size = relativedelta(days=180)

    readings = {}  # Maps dates to interval data, populated below
    for subrange in date_range.split_iter(delta=interval_size):
        log.info("Gathering meter data for: {0}".format(subrange))

        # First, set the date range for the selected meter
        meter_selection_page.set_date_range(subrange)

        # Navigate to the "Export" page, and request a CSV report
        navigation.goto_export()
        export_csv_page.wait_until_ready()
        export_csv_page.generate_report()

        # Wait for the report to generate, then download it
        # and extract interval data from it
        download_csv_page.wait_until_ready()
        csv_rows_iter = download_csv_page.get_csv_rows()
        header = next(csv_rows_iter)
        log.info("CSV Header row: {0}".format(header))
        for data_row in csv_rows_iter:
            result = NationalGridIntervalScraper.parse_csv_row(data_row)
            if result.units == UNITS_KWH:
                readings[self._iso_str(result.date)] = list(
                    NationalGridIntervalScraper.kwh_to_kw(result.interval_data)
                )

        # Navigate back to the meter selection page in preparation
        # for the next iteration. Note that we do not reselect the
        # meter, since our initial selections are cached.
        navigation.goto_meter_selection()
        meter_selection_page.wait_until_ready()

    return Results(readings=readings)
def run_datafeed(
    scraper_class,
    account: SnapmeterAccount,
    meter: Meter,
    datasource: MeterDataSource,
    params: dict,
    configuration=None,
    task_id=None,
    transforms: Optional[List[Transforms]] = None,
    disable_login_on_error: Optional[bool] = False,
    notify_on_login_error: Optional[bool] = True,
    meter_only: Optional[bool] = False,
) -> Status:
    transforms = [] if transforms is None else transforms
    bill_handler = ft.partial(
        upload_bills,
        meter.oid,
        meter.utility_service.service_id,
        task_id,
        datasource.name,
    )
    readings_handler = ft.partial(
        upload_readings, transforms, meter.oid, datasource.name, task_id
    )
    pdfs_handler = ft.partial(attach_bill_pdfs, meter.oid, task_id, meter_only)
    partial_bill_handler = ft.partial(upload_partial_bills, meter, task_id)

    date_range = DateRange(
        *iso_to_dates(params.get("data_start"), params.get("data_end"))
    )

    parent: Optional[AccountDataSource] = None
    if datasource.account_data_source:
        parent = datasource.account_data_source
        credentials = Credentials(parent.username, parent.password)
        if not datasource.account_data_source.enabled:
            raise DataSourceConfigurationError(
                "%s scraper for %s is disabled"
                % (datasource.account_data_source.name, meter.oid)
            )
    else:
        credentials = Credentials(None, None)

    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Uploading task information to Elasticsearch.")
        doc = index.starter_doc(meter.oid, datasource)
        doc["start_date"] = date_range.start_date
        doc["end_date"] = date_range.end_date
        doc["meter_data_source"] = datasource.oid
        if configuration:
            doc.update(
                {
                    "billScraper": configuration.scrape_bills
                    or configuration.scrape_partial_bills,
                    "intervalScraper": configuration.scrape_readings,
                }
            )
        index.index_etl_run(task_id, doc)

    index_doc: Dict[str, str] = {}
    # Create a non-persisted copy of the utility service.
    utility_service = UtilityService.copy_from(meter.utility_service)
    try:
        with scraper_class(credentials, date_range, configuration) as scraper:
            scraper.utility_service = utility_service
            scraper_status = scraper.scrape(
                readings_handler=readings_handler,
                bills_handler=bill_handler,
                pdfs_handler=pdfs_handler,
                partial_bills_handler=partial_bill_handler,
            )
            if scraper_status == Status.SUCCEEDED:
                # Avoid muddying Elasticsearch results
                index_doc = {"status": "SUCCESS"}
            else:
                index_doc = {"status": scraper_status.name}
            if scraper_status in [Status.SUCCEEDED, Status.COMPLETED]:
                retval = Status.SUCCEEDED
            else:
                retval = Status.FAILED
            # sce-metascraper needs to be able to get the completed status back
            if scraper.metascraper:
                retval = scraper_status
    except Exception as exc:
        log.exception("Scraper run failed.")
        retval = Status.FAILED
        index_doc = {
            "status": "FAILED",
            "error": repr(exc),
            "exception": type(exc).__name__,
        }
        # Disable the login if scraping threw a LoginError, the caller requested
        # disabling on error, and the meter data source has a parent account data source.
        if isinstance(exc, LoginError) and disable_login_on_error and parent:
            parent.enabled = False
            db.session.add(parent)
            log.warning(
                "disabling %s login %s", parent.source_account_type, parent.oid
            )
            if notify_on_login_error:
                alert.disable_logins(parent)

    index_doc.update(update_utility_service(meter.utility_service, utility_service))
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Uploading final task status to Elasticsearch.")
        index.index_etl_run(task_id, index_doc)

    return retval
def _execute(self):
    # Direct the driver to the login page
    self._driver.get(self.login_url)

    # Create page helpers
    download_page = DownloadPage(self._driver)
    meter_page = MeterPage(self._driver)
    search_result = MeterSearchResult(self._driver)
    available_dates = AvailableDateComponent(self._driver)
    interval_form = IntervalForm(self._driver)

    self.login_to_mvweb()

    # Navigate to the Download page.
    # Pause to let the IFrame settle down.
    time.sleep(5)
    download_page.wait_until_ready(selector=self.download_link_selector)
    self.screenshot("before clicking on download link")
    download_page.navigate_to_download_page(self.download_link_selector)
    time.sleep(10)

    # Enter the meter id in the search box
    meter_page.wait_until_ready(meter_page.MeterSearchInput)
    self.screenshot("before searching for meter")
    meter_page.search_by_meter_id(self.meter_id)
    time.sleep(10)

    # When the search results have settled down, click on the first meter result.
    # If the meter isn't found, throw an error.
    search_result.wait_until_text_visible(
        search_result.SearchResult,
        self.meter_id,
        error_selector=search_result.NoResultsFound,
        alt_text="No matching records found",
        error_cls=MeterNotFoundException,
        error_msg="No matching records found for Meter ID {}".format(self.meter_id),
    )
    self.screenshot("before clicking on meter result")
    search_result.click_on_meter_result()

    # Adjust start and end dates if the supplied start and end are out of range.
    adjusted_start, adjusted_end = available_dates.adjust_start_and_end_dates(
        self.start_date, self.end_date
    )

    date_range = DateRange(adjusted_start, adjusted_end)
    interval_size = relativedelta(days=MAX_INTERVAL_LENGTH)
    timeline = Timeline(adjusted_start, adjusted_end, self._configuration.interval)

    # Break the date range into small, manageable chunks and download a csv
    # of demands for each one.
    for sub_range in date_range.split_iter(delta=interval_size):
        log.info("Getting interval data for date range: {}".format(sub_range))
        start = sub_range.start_date
        end = sub_range.end_date

        # Fill out the interval form and click save to download the data
        interval_form.fill_out_interval_form_and_download(start, end)
        file_path = self.download_file("csv")

        # Extract intermediate info from the csv
        self._process_csv(file_path, timeline)

    return Results(readings=timeline.serialize(include_empty=False))
def _execute_internal(self):
    # Direct the driver to the login page
    self._driver.get(self.login_url)

    # Create page helpers
    login_page = LoginPage(self._driver)
    home_page = HomePage(self._driver)
    usage_page = UsagePage(self._driver)

    # Authenticate
    log.info("Logging in.")
    login_page.wait_until_ready()
    self.screenshot("before login")
    # Login seems to sometimes fail; try twice.
    try:
        login_page.login(self.username, self.password, self)
    except LoginError:
        log.info("login failed; trying login a second time in 30s")
        self._driver.get(self.login_url)
        self._driver.sleep(30)
        self.screenshot("before second login")
        login_page.login(self.username, self.password, self)
    self.screenshot("after login")

    # On the homepage, fetch the visible account information. This info
    # tells us (among other things) which account id is associated with
    # which account name.
    log.info("Waiting for home page to be ready.")
    home_page.wait_until_ready()
    self.screenshot("home page loaded")

    # Go to the 'Usage' page
    log.info("Navigating to 'Usage' page.")
    usage_page.navigate_to_usage_page()
    usage_page.wait_until_ready()
    self.screenshot("usage_page_initial")

    if usage_page.is_enterprise():
        log.info("Enterprise: selecting account: {0}".format(self.account_id))
        if not usage_page.select_account_from_portfolio(self.account_id):
            error_msg = f"Unable to find account with ID={self.account_id}"
            log.info(error_msg)
            raise InvalidAccountException(error_msg)
        self.screenshot("usage_account_selected")
    else:
        log.info("Selecting account: {0}".format(self.account_id))
        if not usage_page.select_account(self.account_id):
            available_accounts = set(usage_page.get_available_accounts())
            error_msg = "Unable to find account with ID={0}. Available accounts are: {1}".format(
                self.account_id, available_accounts
            )
            log.info(error_msg)
            raise InvalidAccountException(error_msg)
        self.screenshot("usage_account_selected")

    # Select the desired meter
    log.info("Selecting meter with id: {0}".format(self.service_id))
    if not usage_page.select_meter(self.service_id):
        available_meters = set(usage_page.get_available_meters())
        error_msg = "Unable to find meter with ID={0}. Available meters are: {1}".format(
            self.service_id, available_meters
        )
        raise InvalidMeterException(error_msg)
    self.screenshot("selected meter")

    date_range = DateRange(self.start_date, self.end_date)
    timeline = Timeline(
        self.start_date, self.end_date, interval=self._configuration.interval
    )

    excel_download = True
    if usage_page.is_enterprise():
        usage_page.enterprise_select_usage(self._configuration.interval)
    else:
        usage_page.select_usage(self._configuration.interval)
        excel_download = usage_page.excel_download_available()

    # Use the same JavaScript download for both regular and enterprise.
    if excel_download:
        if self._configuration.interval == 1440:
            for subrange in date_range.split_iter(delta=relativedelta(days=7)):
                usage_page.javascript_download(
                    subrange.start_date,
                    subrange.end_date,
                    self._configuration.interval,
                )
        else:
            dt = self.start_date
            while dt < self.end_date:
                usage_page.javascript_download(dt, dt, self._configuration.interval)
                dt += timedelta(days=1)
        for filename in glob(f"{self._driver.download_dir}/*.xlsx"):
            parse_xlsx(timeline, filename, self.adjustment_factor)
    else:
        log.info("starting GreenButton download")
        usage_page.open_green_button()
        self.screenshot("opened green button")
        # This page only allows you to download a certain amount of billing
        # data at a time. We will use a conservative chunk size of 90 days.
        interval_size = relativedelta(days=90)
        for subrange in date_range.split_iter(delta=interval_size):
            self.get_green_button(
                usage_page, timeline, subrange.start_date, subrange.end_date
            )

    return Results(readings=timeline.serialize())