def the_work(creds_path, backup_folder, idph_csv_folder, geo_folder, chrome_options, chrome_path='chromedriver', **kwargs):
    """Scrape the IDPH COVID-19 dashboard and sync results to Google Sheets.

    Pipeline: (1) open the totals/zip/county spreadsheets with service-account
    credentials, (2) drive a Chrome browser to the IDPH page and pull the
    daily totals plus the per-zip and per-county tables, (3) back up the raw
    HTML, (4) sanity-assert the tables, (5) diff against the most recent
    sheet data and, where changed, write dated CSVs and upload new daily and
    "long" (append-history) sheets, (6) roll counties up by NOFO region and
    metro area.

    Relies on module-level globals not visible here: `idph_link`,
    `gsheet_zip_link`, `gsheet_county_link`, `gsheet_totals_link`, `Spread`
    (presumably gspread_pandas — TODO confirm), plus the imported libraries.

    Parameters
    ----------
    creds_path : path to the service-account JSON keyfile.
    backup_folder, idph_csv_folder, geo_folder : pathlib.Path-like folders
        (each is used with the `/` operator) for HTML backups, CSV outputs,
        and the census mapping file respectively.
    chrome_options : selenium ChromeOptions passed through to the driver.
    chrome_path : path to the chromedriver executable.
    **kwargs : ignored; accepted for caller flexibility.

    Returns
    -------
    bool : True if either the zip or county table changed this run.

    Raises
    ------
    AssertionError : if the scraped tables/totals fail the sanity checks.
    """
    # Dates -- several formats of "now" in US/Central, used for sheet names,
    # file names, and log lines.
    the_date = datetime.now(timezone('US/Central')).strftime("%m-%d")  # NOTE(review): assigned but never used below
    the_date_year_yday = datetime.strftime(datetime.now(timezone('US/Central')) - timedelta(1), '%m-%d-%y')
    the_date_year = datetime.now(timezone('US/Central')).strftime("%m-%d-%y")
    the_date_YEAR = datetime.now(timezone('US/Central')).strftime("%m-%d-%Y")
    the_date_n_time = datetime.now(timezone('US/Central')).strftime("%m-%d-%H%M")
    the_time = datetime.now(timezone('US/Central')).strftime("%H:%M")
    print("Starting scraper...{} {}".format(the_date_year, the_time))

    credentials = ServiceAccountCredentials.from_json_keyfile_name(creds_path)
    # If you copy this, make sure the file you are opening is accessible to your service account
    # Ie. Give Sharing/Edit access to ExProc ([email protected])
    # Open all three spreadsheets, retrying every 5s on transient API/timeout
    # errors until every handle is non-None.
    # NOTE(review): an error other than APIError/ReadTimeout will escape this
    # loop; also there is no retry cap, so a persistent outage loops forever.
    totals_spread, zip_spread, county_spread = None, None, None
    while not (zip_spread and county_spread and totals_spread):
        try:
            if not zip_spread:
                zip_spread = Spread(spread=gsheet_zip_link, creds=credentials)
            if not county_spread:
                county_spread = Spread(spread=gsheet_county_link, creds=credentials)
            if not totals_spread:
                totals_spread = Spread(spread=gsheet_totals_link, creds=credentials)
        except gspread.exceptions.APIError as e:
            print(e)
            print("gspread API Error occurred")
            time.sleep(5)
            continue
        except requests.exceptions.ReadTimeout:
            print("requests ReadTimeout occurred")
            time.sleep(5)
            continue

    driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)  # Chrome options are passed into function
    driver.implicitly_wait(300)  # up to 300s for any element lookup

    # %%
    # Webdriver Work - First for Zips
    driver.get(idph_link)
    time.sleep(10)

    # Collecting totals -- headline counters rendered as comma-formatted text.
    daily_totals = {"tests_pos": int(driver.find_element_by_id("covid19positive").text.replace(",", "")),
                    "deaths": int(driver.find_element_by_id("covid19deaths").text.replace(",", "")),
                    "total_tests": int(driver.find_element_by_id("covid19totaltest").text.replace(",", ""))#,
                    #"recovered": int(driver.find_element_by_id("covid19recovered").text.replace(",", ""))
                    }

    # Click through the "By Zip" tab, expand to "All" rows, and snapshot the
    # rendered table HTML.
    by_county_button = driver.find_element_by_link_text("By County")
    by_zip_button = driver.find_element_by_link_text("By Zip")
    by_zip_button.click()
    time.sleep(2)
    all_zip_data_button = driver.find_element_by_link_text("All")
    all_zip_data_button.click()
    time.sleep(5)  # Excessive, I know
    zip_soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    zip_table_soup = zip_soup.find("table", class_="padded-table", id="detailedData")

    # Same dance for the "By County" tab.
    by_county_button.click()
    time.sleep(2)
    all_county_data_button = driver.find_element_by_link_text("All")
    all_county_data_button.click()
    time.sleep(5)  # Excessive, I know
    county_soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    county_table_soup = county_soup.find("table", class_="padded-table", id="detailedData")

    # Todo: Make sure to back this up as well
    # Raw-HTML backups, timestamped so each run keeps its own copy.
    # NOTE(review): str(soup.encode("utf-8")) writes a Python bytes repr
    # (b'...'), not clean HTML -- confirm downstream consumers expect that.
    with open(backup_folder / ("backup_soup_zip_%s.txt" % the_date_n_time), "w") as backup_zip_soup:
        backup_zip_soup.writelines(str(zip_soup.encode("utf-8")))
    with open(backup_folder / ("backup_soup_county_%s.txt" % the_date_n_time), "w") as backup_county_soup:
        backup_county_soup.writelines(str(county_soup.encode("utf-8")))

    # %%
    # Guard against IDPH silently changing the table layout.
    zip_table_headers = zip_table_soup.find_all("th")
    zip_table_headers_arr = [x.get_text() for x in zip_table_headers]
    assert zip_table_headers_arr == ['Zip', 'Tested', 'Positive Cases', 'Deaths']
    county_table_headers = county_table_soup.find_all("th")
    county_table_headers_arr = [x.get_text() for x in county_table_headers]
    assert county_table_headers_arr == ['County', 'Tested', 'Positive Cases', 'Deaths']

    #print("Zip first:")
    #print(zip_table_soup.find_all("tr")[1:])
    #print()
    #print("County second:")
    #print(county_table_soup.find_all("tr")[1:])

    # Build today's DataFrames row by row from the table body (skipping the
    # header row). All cell values are kept as strings at this point.
    df_zip_today = pd.DataFrame(columns=zip_table_headers_arr)
    for tr in zip_table_soup.find_all('tr')[1:]:
        tds = tr.find_all('td')
        df_zip_today = df_zip_today.append(
            {'Zip': tds[0].text, 'Tested': tds[1].text, 'Positive Cases': tds[2].text,
             'Deaths': tds[3].text}, ignore_index=True)

    df_county_today = pd.DataFrame(columns=county_table_headers_arr)
    for tr in county_table_soup.find_all('tr')[1:]:
        tds = tr.find_all('td')
        df_county_today = df_county_today.append(
            {'County': tds[0].text, 'Tested': tds[1].text, 'Positive Cases': tds[2].text,
             'Deaths': tds[3].text}, ignore_index=True)

    df_zip_today.set_index("Zip", inplace=True)
    df_county_today.set_index("County", inplace=True)

    # %%
    # Totals -- Assertions, Comparisons, and Long form production
    # Row 0 of the long sheet is the most recent prior record; totals should
    # be monotonically non-decreasing.
    df_totals_oldlong = totals_spread.sheet_to_df(index=0)
    assert daily_totals['tests_pos'] >= int(df_totals_oldlong.iloc[0]['tests_pos'])  # Was the number back on 4/18/2020
    assert daily_totals['deaths'] >= int(df_totals_oldlong.iloc[0]['deaths'])
    assert daily_totals['total_tests'] >= int(df_totals_oldlong.iloc[0]['total_tests'])
    #assert daily_totals['recovered'] >= int(df_totals_oldlong.iloc[0]['recovered'])
    print("Daily Totals Assertions passed.")
    totals_changed = False
    if (daily_totals['tests_pos'] != int(df_totals_oldlong.iloc[0]['tests_pos'])) or \
            (daily_totals['deaths'] != int(df_totals_oldlong.iloc[0]['deaths'])) or \
            (daily_totals['total_tests'] != int(df_totals_oldlong.iloc[0]['total_tests'])):  # or \
        # (daily_totals['recovered'] != int(df_totals_oldlong.iloc[0]['recovered'])):
        totals_changed = True
        print("Totals were detected as changed.")
    if totals_changed:
        # Derive the extra columns, prepend today's row to the history, save
        # a CSV snapshot, and replace the IL_long sheet.
        daily_totals['update_date'] = the_date_YEAR
        daily_totals['update_time'] = the_time
        daily_totals['tests_neg'] = daily_totals['total_tests'] - daily_totals['tests_pos']
        daily_totals['new_tests'] = daily_totals['total_tests'] - int(df_totals_oldlong.iloc[0]['total_tests'])
        df_totals_today = pd.DataFrame(daily_totals, index=[0])
        df_totals_newlong = df_totals_today.append(df_totals_oldlong)[
            ["update_date", "tests_pos", "deaths", "total_tests", #"recovered",
             "tests_neg", "new_tests"]].reset_index(drop=True)
        df_totals_newlong.to_csv(idph_csv_folder / ("Long Totals %s.csv" % the_date_n_time), index=False)
        totals_spread.df_to_sheet(df_totals_newlong, index=False, sheet="IL_long", start="A1", replace=True)

    # %% Assertions Zip - Does the table look as expected?
    assert df_zip_today.index[0] <= "60002"  # First zip code in Illinois
    assert int(df_zip_today.index[-1]) >= 62959  # This was the last zip code reported on 4/12/2020
    assert df_zip_today.shape[0] >= 353  # This was the size of the table on 4/12/2020
    assert df_zip_today.Deaths.str.contains("N/A").all()  # The Death column should be "N/A" for all zip codes.
    print("Zip Assertions passed.")

    # %% Assertions County - Does the table look as expected?
    assert df_county_today.index[0] == "Illinois"  # First "county" reported on their table
    assert df_county_today.index[-1] == "Woodford"  # This was the last county code reported on their table on 4/13/2020
    assert df_county_today.shape[0] >= 90  # This was the size of the table on 4/12/2020
    print("County Assertions passed.")

    # %% Basic Assertions passed -- Drop and rename things
    # Zip table first
    df_zip_today.drop(columns="Deaths", inplace=True)
    df_zip_today.rename(columns={"Positive Cases": 'Positive_Cases'}, inplace=True)
    # County next
    df_county_today.rename(columns={"Positive Cases": "Positive_Cases"}, inplace=True)

    # %%
    # Open the most recent Zip file for comparison
    zip_changed = False
    # Assume the sheet has already been updated for the day, try grabbing and comparing with that first.
    zip_spread_yday = Spread(spread=gsheet_zip_link, creds=credentials)
    df_zip_yday = zip_spread_yday.sheet_to_df(index=0, sheet=zip_spread.find_sheet(the_date_year))
    # "update_time" only appears on the "long" sheet, so its presence means
    # find_sheet fell through to a default -- presumably the first worksheet;
    # TODO confirm against Spread.sheet_to_df behavior for sheet=None.
    if "update_time" in df_zip_yday.columns:
        # Oh, so it didn't find it for today, let's grab the one from yesterday
        df_zip_yday = zip_spread_yday.sheet_to_df(index=0, sheet=zip_spread.find_sheet(the_date_year_yday))
    # Now explicitly compare tables to see if they're different
    df_zip_yday.set_index("Zip", inplace=True)
    if df_zip_today.equals(df_zip_yday):
        print("Zip values have not yet changed from latest data.")
    else:
        print("Zip values have been detected as different.")
        zip_changed = True

    # %%
    # Open the most recent County file for comparison (same pattern as Zip).
    county_changed = False
    # Assume the sheet has already been updated for the day, try grabbing and comparing with that first.
    county_spread_yday = Spread(spread=gsheet_county_link, creds=credentials)
    df_county_yday = county_spread_yday.sheet_to_df(index=0, sheet=county_spread.find_sheet(the_date_year))
    if "update_time" in df_county_yday.columns:
        # Oh, so it didn't find it for today, let's grab the one from yesterday
        df_county_yday = county_spread_yday.sheet_to_df(index=0, sheet=county_spread.find_sheet(the_date_year_yday))
    # Now explicitly compare tables to see if they're different
    df_county_yday.set_index("County", inplace=True)
    if df_county_today.equals(df_county_yday):
        print("County values have not yet changed from latest data.")
    else:
        print("County values have been detected as different.")
        county_changed = True

    # %% If they were different, let's save them and upload them to sheets
    print("\nUploading new sheets if changed.")
    if zip_changed:
        df_zip_today.to_csv(idph_csv_folder / ("IDPH Stats Zip %s.csv" % the_date_n_time), index=True)
        zip_spread.df_to_sheet(df_zip_today, index=True, sheet=the_date_year, start='A1', replace=True)
        print("\tNew Zip was uploaded to Sheets.")
    else:
        print("\tZip version was not uploaded.")
    if county_changed:
        df_county_today.to_csv(idph_csv_folder / ("IDPH Stats County %s.csv" % the_date_n_time), index=True)
        county_spread.df_to_sheet(df_county_today, index=True, sheet=the_date_year, start='A1', replace=True)
        print("\tNew County was uploaded to Sheets.")
    else:
        print("\tCounty version was not uploaded.")

    # %% Now deal with production of Long version
    print("\nProducing Zip and County long versions.")
    # Bring in mapping tools -- county name -> NOFO region / metro area.
    census_df = pd.read_csv(geo_folder / "Illinois_Census_200414_1816.csv")
    nofo_map_dict = dict(zip(census_df.CountyName, census_df.NOFO_Region))
    metro_map_dict = dict(zip(census_df.CountyName, census_df.Metro_area))

    if zip_changed:
        # Prepend today's stamped zip rows to the history and replace "long".
        df_zip_oldlong = zip_spread.sheet_to_df(index=0, sheet=zip_spread.find_sheet("long"))
        df_zip_today['update_date'] = the_date_YEAR
        df_zip_today['update_time'] = the_time
        df_zip_today.reset_index(inplace=True)
        df_zip_newlong = df_zip_today.append(df_zip_oldlong)[
            ["update_date", "update_time", "Zip", "Positive_Cases", "Tested"]].reset_index(drop=True)
        df_zip_newlong.to_csv(idph_csv_folder / ("Long Zip %s.csv" % the_date_n_time), index=False)
        zip_spread.df_to_sheet(df_zip_newlong, index=False, sheet="long", start="A1", replace=True)
        print("\tZip Long version made and uploaded.")
    else:
        print("\tZip version was not uploaded.")

    # NOTE(review): the county long-sheet update is forced to run every time;
    # the original guard was `county_changed` (see inline comment). Confirm
    # whether this override is still intended -- it re-prepends today's rows
    # to the county history even when nothing changed.
    if True:  ## CHANGED THIS was county_changed
        df_county_oldlong = county_spread.sheet_to_df(index=0, sheet=county_spread.find_sheet("long"))
        df_county_today['update_date'] = the_date_YEAR
        df_county_today['update_time'] = the_time
        # Add NOFO Region; unmapped counties fall back to their own name.
        df_county_today['NOFO_Region'] = df_county_today.index.map(nofo_map_dict)
        df_county_today.loc[df_county_today['NOFO_Region'].isna(), "NOFO_Region"] = df_county_today.index[
            df_county_today['NOFO_Region'].isna()]
        # Add Metro Area
        df_county_today['Metro_Area'] = df_county_today.index.map(metro_map_dict)
        # Chicago and Illinois are not mapped on the Metro mapping file.
        df_county_today.loc[df_county_today.index == "Chicago", "Metro_Area"] = 'Chicago'
        df_county_today.loc[df_county_today.index == "Illinois", "Metro_Area"] = 'Illinois'
        # Leaving unlabeled metro areas as Nulls.
        df_county_today.reset_index(inplace=True)
        df_county_newlong = df_county_today.append(df_county_oldlong)[
            ["update_date", "update_time", "County", "Positive_Cases", "Deaths", "Tested",
             "NOFO_Region", "Metro_Area"]].reset_index(drop=True)
        df_county_newlong.to_csv(idph_csv_folder / ("Long County %s.csv" % the_date_n_time), index=False)
        county_spread.df_to_sheet(df_county_newlong, index=False, sheet="long", start="A1", replace=True)
        print("\tLong editions made and uploaded.")
    else:
        print("\tCounty version was not uploaded.")

    # %% Now producing NOFO Region and Metro Area Rollups directly from County_long and Zip_long directly from gsheets
    if zip_changed or county_changed:
        # cln = county_long_now
        cln = county_spread.sheet_to_df(index=None, sheet=county_spread.find_sheet("long"))
        # I'm not converting the hourly one because as datetimes it includes the wrong day.
        (cln.update_date) = pd.to_datetime(cln.update_date)
        # Sheet blanks come back as "" -- coerce to "0" so the int cast works.
        cln.Tested = cln.Tested.apply(lambda x: "0" if x == "" else x)
        cln.Positive_Cases, cln.Deaths, cln.Tested = cln.Positive_Cases.astype(int), cln.Deaths.astype(
            int), cln.Tested.astype(int)
        # County data is ready for roll-up and upload to NOFO_long
        cln_nofo = cln.groupby(by=["update_date", "update_time", "NOFO_Region"]).sum()
        cln_nofo = cln_nofo.sort_index(axis=0, level=[-1, -3], sort_remaining=False)
        totals_spread.df_to_sheet(cln_nofo, index=True, sheet="NOFO_long", start="A1", replace=True)
        # County data is ready for roll-up and upload to Metro_long
        cln_metro = cln.groupby(by=["update_date", "update_time", "Metro_Area"]).sum()
        cln_metro = cln_metro.sort_index(axis=0, level=[-1, -3], sort_remaining=False)
        # Drop the rows whose Metro_Area was blank (unlabeled counties).
        cln_metro = cln_metro.drop(axis=0, labels="", level=-1)
        totals_spread.df_to_sheet(cln_metro, index=True, sheet="Metro_long", start="A1", replace=True)

    # %%
    # NOTE(review): driver.close() closes the window; driver.quit() would
    # also end the chromedriver process -- confirm which is wanted.
    driver.close()
    print("End of " + the_time + " run.")
    return zip_changed or county_changed
class GoogleSpread(object):
    """Thin convenience wrapper around a ``Spread`` object.

    Exposes the subset of the underlying spreadsheet API this project uses,
    and handles credential setup: pass ``creds`` directly, or leave it unset
    to build credentials from ``conf_file`` via ``ParseConfiguration``.
    """

    def __init__(self, spread, sheet=0, creds=None, create_sheet=False, conf_file=None):
        # Fall back to config-file credentials when none were supplied.
        if not creds:
            parsed_conf = ParseConfiguration(conf_file).get_google_creds()
            creds = get_creds(config=parsed_conf)
        self.creds = creds
        self.spread = Spread(spread, sheet=sheet, creds=self.creds, create_sheet=create_sheet)

    @property
    def spread_url(self):
        """URL of the wrapped spreadsheet."""
        return self.spread.url

    @property
    def worksheet_name(self):
        """Currently open worksheet."""
        return self.spread.sheet

    @property
    def spread_name(self):
        """Identifier of the wrapped spreadsheet."""
        return self.spread.spread

    def df_to_sheet(self, df, index=True, headers=True, start_cell=(1, 1), sheet=None, replace=False):
        """Write ``df`` to a worksheet, optionally replacing its contents."""
        self.spread.df_to_sheet(
            df, index=index, headers=headers, start=start_cell, sheet=sheet, replace=replace)

    def sheet_to_df(self, index=1, header_rows=1, start_row=1, sheet=None):
        """Read a worksheet back as a DataFrame."""
        return self.spread.sheet_to_df(
            index=index, header_rows=header_rows, start_row=start_row, sheet=sheet)

    def find_sheet(self, sheet):
        """Look up a worksheet by name."""
        return self.spread.find_sheet(sheet)

    def open_sheet(self, sheet, create=False):
        """Switch to ``sheet``, optionally creating it first."""
        self.spread.open_sheet(sheet, create=create)

    def update_cells(self, start, end, vals, sheet=None):
        """Write ``vals`` into the cell range ``start``..``end``."""
        self.spread.update_cells(start, end, vals, sheet=sheet)

    def get_sheet_dims(self, sheet=None):
        """Return the (rows, cols) dimensions of a worksheet."""
        return self.spread.get_sheet_dims(sheet=sheet)

    def clear_sheet(self, rows=1, cols=1, sheet=None):
        """Clear a worksheet down to ``rows`` x ``cols``."""
        self.spread.clear_sheet(rows=rows, cols=cols, sheet=sheet)

    def create_sheet(self, name, rows=1, cols=1):
        """Create a new worksheet named ``name``."""
        self.spread.create_sheet(name, rows=rows, cols=cols)

    def delete_sheet(self, sheet):
        """Delete a worksheet."""
        self.spread.delete_sheet(sheet)