def scrape(supplier):
    """Scrape the energizect.com supplier-comparison page and save its HTML.

    Saves the prettified page source to ./data/<supplier>.html and emails an
    alert when the new page matches the previous scrape by less than 50%.

    Parameters:
        supplier: territory key; "ui" selects the UI radio button, anything
            else leaves the default selection.
    """
    # Keep the previous scrape as a baseline; fall back to an empty string on
    # the first run instead of crashing on a missing file (old code crashed).
    try:
        with open("./data/" + supplier + ".html") as prev:
            oldHTML = prev.read()
    except OSError:
        oldHTML = ""
    # driver = webdriver.Chrome()
    driver = webdriver.Chrome(
        r'C:/Program Files/Chromedriver/chromedriver.exe',
        chrome_options=options)
    driver.get(
        "https://www.energizect.com/compare-energy-suppliers")  # get the page
    if supplier == "ui":
        ui_button = driver.find_element_by_id("radioTwo")
        ui_button.click()  # Click the button
    compare_now_button = driver.find_element_by_class_name(
        "supplier_form_submit")
    compare_now_button.click()
    # TEMPORARY: dismiss the popup about standard prices, if it appears
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, "ui-dialog-titlebar-close")))
        close_button = driver.find_element_by_class_name(
            "ui-dialog-titlebar-close")
        close_button.click()
    except Exception:  # narrow from bare except; still best-effort
        print("no seasonal popup")
    # Wait *up to* 20 seconds for the popup to show up
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "close_anchor")))
    except Exception:
        email_error.send_email("no close anchor")
    # click the x for a disclaimer
    close_button = driver.find_element_by_class_name("clostPopup")
    close_button.click()
    # Get the html
    html = driver.page_source
    # writing to a file
    soup = bs.BeautifulSoup(html, 'html.parser')
    html = soup.prettify()
    # Write in one call; substitute unencodable characters instead of the old
    # per-character loop that silently swallowed every write error with `1+1`.
    with open("./data/" + supplier + ".html", "w", errors="replace") as out:
        out.write(html)
    with open("./data/" + supplier + ".html") as saved:
        newHTML = saved.read()
    # calculates percentage difference between this and the last relevant HTML file
    matcher = SequenceMatcher(None, oldHTML, newHTML).quick_ratio()
    if matcher < 0.5:
        email_error.send_email("difference between HTML files is: ", matcher)
    print("percent match:", matcher)
def run(supplier):
    """Parse the saved <supplier>_PVD.html page and write supplier rows to CSV.

    Any failure is reported by email with the extracted traceback instead of
    propagating to the caller.
    """
    page_path = "./data/" + supplier + "_PVD.html"
    try:
        with open(page_path) as page:
            parsed = bs.BeautifulSoup(page, 'html.parser')
            collected = []
            fill_suppliers(parsed, collected)
            write_to_csv(supplier, collected)
    except Exception as err:
        tb_summary = traceback.extract_tb(err.__traceback__)
        print("error encountered: " + str(tb_summary))
        email_error.send_email("general error: " + str(tb_summary))
def billingCycle(row):
    """Return the billing-cycle month counts listed in a result row.

    Looks at the second div.mobilerate in `row` and collects the digits from
    every "Billing Cycle" entry of its companyShortData div.

    Parameters:
        row: a BeautifulSoup element for one supplier row.

    Returns:
        List of digit strings, one per "Billing Cycle" entry; empty when the
        expected div is missing.
    """
    try:
        mobilerateDiv = row.findAll("div", class_="mobilerate")[1]
    except IndexError:
        # Page layout changed: alert and bail out. (The old bare except fell
        # through and crashed on the unbound `mobilerateDiv` below.)
        email_error.send_email("There is no div with value mobilerate.")
        return []
    contractTerms = []
    for elem in mobilerateDiv.find("div", class_="companyShortData").contents:
        if "Billing Cycle" in elem:
            contractTerms.append(''.join(ch for ch in elem if ch.isdigit()))
    if len(contractTerms) > 2:
        email_error.send_email("more than two fixed tiers")
    return contractTerms
def varRate(row):
    """Extract the variable supply rates from a result row.

    Each b.supply_rate value is cleaned of non-numeric characters, divided
    by 100 (the page shows cents/percent) and formatted to four decimals.

    Parameters:
        row: a BeautifulSoup element for one supplier row.

    Returns:
        List of rate strings like '0.1250'; empty when no rates are present
        (an alert email is sent in that case).
    """
    supplyRates = row.findAll("b", class_="supply_rate")
    if not supplyRates:  # idiomatic emptiness test instead of `== []`
        email_error.send_email(
            "Empty array for supply_rate class (no variable rate corresponding to this)."
        )
    rates = []
    for rate in supplyRates:
        stripped = str(rate.contents[0]).replace("\n", "").strip()
        numeric = float(
            ''.join(ch for ch in stripped if ch.isdigit() or ch == '.')) / 100
        rates.append('{:,.4f}'.format(numeric))
    return rates
def getNum(row, attribute, value):
    """Return all digits found in the element matching attribute=value, as an int.

    Parameters:
        row: a BeautifulSoup element to search.
        attribute: HTML attribute name to match.
        value: attribute value to match.

    Returns:
        The concatenated digits of the matching element's contents as an int,
        or 0 when no digits are present or no element matches (an alert email
        is sent in the latter case).
    """
    try:
        content = str(row.find(attrs={attribute: value}).contents)
    except AttributeError:
        # find() returned None: the page no longer carries this attribute.
        email_error.send_email(
            "No such attribute and value exist. Attribute: " + attribute +
            " Value: " + value)
        # The old code fell through and crashed on the unbound `content`.
        return 0
    digits = ''.join(ch for ch in content
                     if ch.isdigit())  # gets all numbers within the contents
    return int(digits) if digits else 0
def diff_check(supplier):
    """Compare the two most recent CSVs for `supplier` and drop a redundant one.

    Sorts ./data/*.csv newest-first by mtime, picks the newest and
    second-newest files whose name contains `supplier`, diffs them on
    plan_id, and deletes the newest file when the diff shows no added or no
    removed rows.

    Parameters:
        supplier: substring identifying the supplier's CSV files.
    """
    files = sorted([x for x in os.listdir("./data/") if x.endswith(".csv")],
                   key=lambda x: os.path.getmtime("./data/" + x),
                   reverse=True)
    if len(files) < 2:
        email_error.send_email("not enough files to compare")
        return
    matches = [f for f in files if supplier in str(f)]
    if len(matches) < 2:
        # The old nested-loop search left `now`/`recent` unbound here and
        # crashed with a NameError; alert and bail out instead.
        email_error.send_email("not enough files to compare")
        return
    now, recent = matches[0], matches[1]
    # Close both handles deterministically (the old code leaked them).
    with open("./data/" + now) as f_now, open("./data/" + recent) as f_recent:
        diff = compare(load_csv(f_now, key="plan_id"),
                       load_csv(f_recent, key="plan_id"))
    # NOTE(review): `or` deletes when EITHER side of the diff is empty;
    # confirm this isn't meant to be `and` (delete only when nothing changed).
    if diff['added'] == [] or diff['removed'] == []:
        os.remove("./data/" + now)
        print('deleted')
def fill_suppliers(soup, suppliers):
    """Populate `suppliers` with Supplier objects parsed from the rates table.

    Walks every visible row of the comparison table, extracts territory,
    supplier name and plan id, then pulls twelve months of low/high lagged
    rates from the hidden low_value_/high_value_ inputs keyed by plan id.

    Parameters:
        soup: BeautifulSoup document of the scraped comparison page.
        suppliers: list that parsed Supplier objects are appended to
            (mutated in place).
    """
    table = soup.find_all(
        'table', class_="nice_table responsive highlight_table display nowrap")[0]
    first = True
    planNum = 0
    iterator = iter(table.find_all('tr'))
    # Lagged rates on the page are labelled with last year / this year.
    year = datetime.date.today().year - 1
    duplicate = []
    next(iterator)  # skip first entry, which is a header
    for row in iterator:
        info = {}
        rowString = str(row)
        info["date_downloaded"] = date.today()
        if row.attrs['style'] == "display: none;":
            continue  # hidden rows are not real offers
        service = getValue(rowString, "data-ratetitle")
        if "Eversource" in service:
            service = "Eversource"
        elif "UI" in service:
            service = "UI"
        info["TDU_service_territory"] = service
        if first:
            # First visible row carries the territory name as supplier name.
            info["supplier_name"] = info["TDU_service_territory"]
        elif getValue(rowString, "data-friendly-name") in duplicate:
            # print(getValue(rowString, "data-friendly-name"))
            continue  # skip suppliers already recorded
        else:
            duplicate.append(getValue(rowString, "data-friendly-name"))
            info["supplier_name"] = getValue(rowString, "data-friendly-name")
        info["plan_id"] = getValue(rowString, "id=\"plan-", 0)
        curr_id = info["plan_id"]
        curr_low = soup.find(id="low_value_" + curr_id)
        if curr_low and curr_low['value'].find(str(year)) != -1:
            indexes = find_all_indexes(curr_low['value'], str(year))
            indexes_2 = find_all_indexes(curr_low['value'], str(year + 1))
            low_list = []
            # The numeric rate sits 19-24 characters after each year marker
            # in the input's value string — presumably a fixed serialized
            # layout; TODO confirm against a live page.
            for i in indexes:
                low_list.append('{0:g}'.format(
                    float(
                        re.findall(r'\d*\.?\d+',
                                   curr_low['value'][i + 19:i + 24])[0]) / 100))
            for i in indexes_2:
                low_list.append('{0:g}'.format(
                    float(
                        re.findall(r'\d*\.?\d+',
                                   curr_low['value'][i + 19:i + 24])[0]) / 100))
            if len(low_list) > 12:
                low_list = low_list[-12:]  # keep the most recent 12 months
            while len(low_list) < 12:
                low_list.append('N/A')  # pad missing months
            curr_high = soup.find(id="high_value_" + curr_id)['value']
            indexes = find_all_indexes(curr_high, str(year))
            indexes_2 = find_all_indexes(curr_high, str(year + 1))
            high_list = []
            for i in indexes:
                try:
                    high_list.append('{0:g}'.format(
                        float(
                            re.findall(r'\d*\.?\d+',
                                       curr_high[i + 19:i + 24])[0]) / 100))
                except Exception:
                    email_error.send_email(
                        "Format of website changed, the value of high value is not numeric"
                    )
            for i in indexes_2:
                try:
                    high_list.append('{0:g}'.format(
                        float(
                            re.findall(r'\d*\.?\d+',
                                       curr_high[i + 19:i + 24])[0]) / 100))
                except Exception:
                    email_error.send_email(
                        "Format of website changed, the value of low value is not numeric"
                    )
            if len(high_list) > 12:
                high_list = high_list[-12:]
            while len(high_list) < 12:
                high_list.append('N/A')
            # Lag 1 is the most recent month; the lists are oldest-first.
            for i in range(12):
                info["Low_lag" + str(i + 1)] = low_list[11 - i]
                info["High_lag" + str(i + 1)] = high_list[11 - i]
            planNum += 1
            first = False
            if 0 not in low_list:
                suppliers.append(Supplier(info))
def scrape(supplier):
    """Scrape past-variable-data details from energizect.com and save the HTML.

    Expands every plan's compare section, saves the prettified page source to
    ./data/<supplier>_PVD.html, and emails an alert when the new page matches
    the previous scrape by less than 50%.

    Parameters:
        supplier: territory key; "UI" selects the UI radio button, anything
            else leaves the default selection.
    """
    # driver = webdriver.Chrome()
    driver = webdriver.Chrome(
        r'C:/Program Files/Chromedriver/chromedriver.exe',
        chrome_options=options)
    driver.get(
        "https://www.energizect.com/compare-energy-suppliers")  # get the page
    if supplier == "UI":
        ui_button = driver.find_element_by_id("radioTwo")
        ui_button.click()  # Click the button
    compare_now_button = driver.find_element_by_class_name(
        "supplier_form_submit")
    compare_now_button.click()
    # Wait *up to* 10 seconds to make sure the page has finished loading (check that the button no longer exists)
    # WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CLASS_NAME, "supplier_form_submit")))
    # TEMPORARY: dismiss the popup about standard prices, if it appears
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, "ui-dialog-titlebar-close")))
        close_button = driver.find_element_by_class_name(
            "ui-dialog-titlebar-close")
        close_button.click()
    except Exception:  # narrow from bare except; still best-effort
        print("no seasonal popup")
    # Wait *up to* 20 seconds for the popup to show up
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "close_anchor")))
    except Exception:
        email_error.send_email("no close anchor")
    # click the x for a disclaimer
    close_button = driver.find_element_by_class_name("clostPopup")
    close_button.click()
    action = ActionChains(driver)
    lists = driver.find_elements_by_class_name("compare_button1")
    count = 0
    # First run has no previous file: use an empty baseline instead of
    # leaving oldHTML unbound (the old code crashed at SequenceMatcher).
    try:
        with open("./data/" + supplier + "_PVD.html") as prev:
            oldHTML = prev.read()
    except OSError:
        oldHTML = ""
        print("didn't find old")
    for test_button1 in lists:
        try:
            action.move_to_element(test_button1).perform()
            test_button1.click()
        except Exception:
            time.sleep(5)  # give the page time to settle, then move on
            print("slept")
        count += 1
        # print(count)
    html = driver.page_source
    # writing to a file
    soup = bs.BeautifulSoup(html, 'html.parser')
    html = soup.prettify()
    print(html)
    # Write in one call; substitute unencodable characters instead of the old
    # per-character loop that silently swallowed every write error with `1+1`.
    with open("./data/" + supplier + "_PVD.html", "w",
              errors="replace") as out:
        out.write(html)
    with open("./data/" + supplier + "_PVD.html") as saved:
        newHTML = saved.read()
    matcher = SequenceMatcher(None, oldHTML, newHTML).quick_ratio()
    if matcher < 0.5:
        email_error.send_email("difference between HTML files is: ", matcher)
    print("percent match:", matcher)
# Check whether we have already scraped past variable rates today.
# **note: should add diff checker at some pt and will have to change this
today_tag = str(Dt.date.today())
scraped = (os.path.exists("./data/" + "PVD_ES_" + today_tag + ".csv")
           and os.path.exists("./data/" + "PVD_UI_" + today_tag + ".csv"))

# Scrape and parse past variable rates (if the hour is 5am or if the
# previous attempt failed)
if x.hour >= 5 and not scraped:
    pvd_total()

# Run all and send email with traceback if any unknown errors occur
try:
    run_all()
    stamp = dt.today().strftime('%m/%d/%y %H:%M:%S')
    if Path('run_history.txt').is_file():
        # Existing history: append the new timestamp on its own line.
        with open('run_history.txt', 'a', newline='') as run_file:
            run_file.write("\n" + stamp)
    else:
        # First run: create the file with the timestamp alone.
        with open('run_history.txt', 'w') as run_file:
            run_file.write(stamp)
except Exception as e:
    error_traceback = traceback.extract_tb(e.__traceback__)
    email_error.send_email(
        error=f"Traceback at {dt.today().strftime('%m/%d/%y %H:%M:%S')} from Scheduler: {error_traceback}")