def get_demographic_dataframe():
    """This function scrapes data from a public tableau dashboard.

    In order to scrape this data, several steps have to be done:
    1. Issue a request to the `BASE_URL.` Selenium is used here because
       multiple back and forth requests are needed to generate a valid
       "session"
    2. After a valid session is generated, we scrape the X-Session-Id from
       the response headers. Selenium-wire is specifically used here because
       it allows for request/response inspection.
    3. Once the X-Session-Id is scraped, that can be used to query another
       URL which contains the download link for Demographic data. A gotcha
       here is that the first request will fail, but subsequent requests
       will succeed.
    4. Once the subsequent request succeeds, we can find the CSV download
       link and obtain the data.

    Returns:
        pandas.DataFrame indexed by (Race, Name) with a single 'Value' column.
    """
    # 1/ Issue request to the BASE_URL
    BASE_URL = 'https://public.tableau.com/views/NCDHHS_COVID-19_DataDownload/Demographics'
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps().go_to_url(BASE_URL).wait_for_presence_of_elements([
            (By.XPATH, "//span[contains(text(),'Race')]")
        ]).get_x_session_id())

    # 2/ Get the Session-Id
    session_id = results.x_session_id
    assert session_id, 'No X-Session-Id found'

    # 3/ Make requests to DOWNLOAD URL
    DOWNLOAD_URL = (
        'https://public.tableau.com/vizql/w/NCDHHS_COVID-19_DataDownload/v/Demographics/viewData/'
        'sessions/{}/views/5649504231100340473_15757585069639442359'
        '?maxrows=200&viz=%7B%22worksheet%22%3A%22TABLE_RACE%22%2C%22dashboard%22%3A%22Demographics%22%7D'
    )
    # First visit is expected to land on an 'Unexpected Error' page; the
    # retry (second go_to_url) then exposes the CSV download link.
    results = runner.run(WebdriverSteps().go_to_url(
        DOWNLOAD_URL.format(session_id)
    ).wait_for_presence_of_elements([
        (By.XPATH,
         "//div[@id='tabBootErrTitle' and contains(text(),'Unexpected Error')]"
         )
    ]).go_to_url(
        DOWNLOAD_URL.format(session_id)).wait_for_presence_of_elements([
            (By.CLASS_NAME, 'csvLink_summary')
        ]).get_page_source())
    soup = results.page_source

    # 4/ scrape the download link
    link = soup.find('a', {'class': 'csvLink_summary'})
    assert link, 'No CSV link found'
    csv_href = link.get('href')
    assert csv_href, 'No CSV link found'

    # Read CSV, set first row as header, and the first two columns
    # (Race and Name) as indices.
    content = get_content_as_file(csv_href)
    df = pd.read_csv(content, header=0, index_col=[0, 1])

    # Remaining column is the `value`; rename accordingly
    assert len(df.columns) == 1
    return df.rename(columns={df.columns[0]: 'Value'})
def _scrape(self, **kwargs):
    """Scrape case and death counts by race from two Tableau dashboards.

    NOTE(review): assumes `self.SUMMARY_URL`, `self.RACE_CASES_URL` and
    `self.RACE_DEATHS_URL` are class attributes -- confirm against the
    enclosing class (not visible here).
    """
    runner = WebdriverRunner()

    # Get date
    cases_results = runner.run(
        WebdriverSteps()
        .go_to_url(self.SUMMARY_URL)
        .wait_for_presence_of_elements((By.XPATH, "//span[contains(text(),'Last updated')]"))
        .get_page_source())
    date = self.get_date(cases_results.page_source)

    # Cases for Race
    cases_by_race_results = runner.run(
        WebdriverSteps()
        .go_to_url(self.RACE_CASES_URL)
        .find_request('race_cases', find_by=find_tableau_request))
    assert cases_by_race_results.requests['race_cases'], 'No results for race_cases found'
    resp_body = cases_by_race_results.requests['race_cases'].response.body.decode('utf8')
    cases_for_race_json = TableauParser(resp_body).extract_data_from_key(key='Rates by Race for All Cases')
    cases_df = self.to_df(cases_for_race_json)
    cases = cases_df['Measure Values'].sum()
    # Drop the unreported bucket so percentages are over known-race totals.
    known_race_cases = cases_df.drop('Not Reported/Missing')['Measure Values'].sum()
    aa_cases = cases_df.loc['Black or African American', 'Measure Values'].sum()

    # Deaths for Race
    deaths_by_race_results = runner.run(
        WebdriverSteps()
        .go_to_url(self.RACE_DEATHS_URL)
        .find_request('race_deaths', find_by=find_tableau_request))
    assert deaths_by_race_results.requests['race_deaths'], 'No results for race_deaths found'
    resp_body = deaths_by_race_results.requests['race_deaths'].response.body.decode('utf8')
    deaths_for_race_json = TableauParser(resp_body).extract_data_from_key(key='Mortality by Race')
    deaths_df = self.to_df(deaths_for_race_json)
    deaths = deaths_df['Measure Values'].sum()
    known_race_deaths = deaths_df.drop('Not Reported/Missing')['Measure Values'].sum()
    aa_deaths = deaths_df.loc['Black or African American', 'Measure Values'].sum()

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_race_cases,
        known_race_deaths=known_race_deaths
    )]
def _scrape(self, **kwargs):
    """Scrape confirmed + probable cases and deaths by race from Tableau.

    The canvas-count waits (58 / 29) gate on the dashboards being fully
    rendered before the captured request is inspected.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.CASES_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 58)
        .find_request('cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 29)
        .find_request('deaths', find_by=tableau.find_tableau_request))

    parser = tableau.TableauParser(request=results.requests['cases'])
    raw_date_str = pydash.head(
        parser.extract_data_from_key('cases')['ATTR(Date Updated)'])
    date = datetime.strptime(raw_date_str, '%A, %B %d, %Y').date()
    confirmed_cases = pydash.head(
        parser.extract_data_from_key('cases')['SUM(# Lab Confirmed Cases)'])
    probable_cases = pydash.head(
        parser.extract_data_from_key('probable cases')['SUM(# probable)'])
    # Total cases = lab-confirmed + probable.
    cases = confirmed_cases + probable_cases
    cases_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth')).set_index('sub-category')
    aa_cases = cases_df.loc['Black']['SUM(count)']
    known_race_cases = cases - cases_df.loc['unknown']['SUM(count)']

    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths = pydash.head(
        parser.extract_data_from_key('death (2)')['SUM(# lab confirmed deaths)'])
    deaths_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth (death)')).set_index('sub-category')
    # Dashboard publishes death *percentages*; convert to counts.
    deaths_df = deaths_df.assign(Count=[
        round(v * deaths) for v in deaths_df['SUM(% of deaths)'].values
    ])
    aa_deaths = deaths_df.loc['Black']['Count']
    known_race_deaths = deaths - deaths_df.loc['unknown']['Count']

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_race_cases,
            known_race_deaths=known_race_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape totals and cases-by-race from a PowerBI dashboard.

    Request capture order matters: the race breakdown only loads after
    clicking the 'Cumulative Summary' element, so history is cleared and
    the click is performed before capturing `race_cases`.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .set_request_capture_scope(['.*querydata?synchronous=true'])
        .go_to_url(self.URL)
        .find_request('summary',
                      find_by=powerbi.filter_requests(
                          entity='factCase Data',
                          selects=['Case Data.Total Cases']))
        .find_request('deaths',
                      find_by=powerbi.filter_requests(
                          entity='factCase Data',
                          selects=['Sum(Case Data.Death)']))
        .find_request('last_updated',
                      find_by=powerbi.filter_requests(
                          entity='Today Dates',
                          selects=['Min(Today Dates.Today)']))
        .find_element_by_xpath("//span[contains(text(), 'Cumulative Summary')]/parent::*")
        .clear_request_history()
        .click_on_last_element_found()
        .find_request('race_cases',
                      find_by=powerbi.filter_requests(
                          selects=['Case Data.Race Group'])))

    parser = powerbi.PowerBIParser(request=results.requests['last_updated'])
    date_df = parser.get_dataframe_by_key('Date')
    # PowerBI timestamps are epoch milliseconds.
    timestamp = date_df.loc[0]['Min(Today Dates.Today)'] / 1000  # convert to seconds
    date = datetime.fromtimestamp(timestamp).date()

    parser = powerbi.PowerBIParser(request=results.requests['summary'])
    cases_df = parser.get_dataframe_by_key('Total Cases')
    cases = cases_df.loc[0]['Case Data.Total Cases']

    parser = powerbi.PowerBIParser(request=results.requests['deaths'])
    deaths_df = parser.get_dataframe_by_key('Death')
    deaths = deaths_df.loc[0]['Sum(Case Data.Death)']

    parser = powerbi.PowerBIParser(request=results.requests['race_cases'])
    race_cases_df = parser.get_dataframe_by_key('Race').set_index('Case Data.Race Group')
    aa_cases = race_cases_df.loc['Black']['Case Data.Total Cases']
    # Sum over every listed race group; unknowns are included here, matching
    # pct_includes_unknown_race=True below.
    known_race_cases = race_cases_df['Case Data.Total Cases'].sum()
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            pct_aa_cases=pct_aa_cases,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape Delaware case/death counts by race.

    Clicks through the acceptable-use form, navigates to the count charts,
    and parses the rendered page. Fix: the last-updated date was parsed
    twice (once into `date`, then re-parsed inside `_make_series`); the
    local `date` is now reused.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.ACCEPTABLE_USE_URL)
        .find_element_by_xpath("//input[@class='form-check-input']")
        .click_on_last_element_found()
        .find_element_by_xpath('//form/button')
        .click_on_last_element_found()
        .wait_for_presence_of_elements((By.XPATH, "//a[@data-chart-id='count-charts']"))
        .find_element_by_xpath("//a[@data-chart-id='count-charts']")
        .click_on_last_element_found()
        .wait_for_presence_of_elements(
            (By.XPATH,
             "//*[contains(text(), 'Total Cases by Race/Ethnicity & County')]"))
        .get_page_source())
    soup = results.page_source

    date = self.get_last_updated_date(soup)
    _logger.info(f'Processing data for {date}')

    cases = self.get_total_cases(soup)
    deaths = self.get_total_deaths(soup)

    cases_df = self.get_race_cases_df(soup)
    aa_cases = cases_df.loc['Non-Hispanic Black']['State of Delaware']
    # The 'Unknown' row is absent when the state reports zero unknowns.
    try:
        unknown_race_cases = cases_df.loc['Unknown']['State of Delaware']
    except KeyError:
        unknown_race_cases = 0
    known_race_cases = cases - unknown_race_cases

    deaths_df = self.get_race_deaths_df(soup)
    aa_deaths = deaths_df.loc['Non-Hispanic Black']['State of Delaware']
    try:
        unknown_race_deaths = deaths_df.loc['Unknown']['State of Delaware']
    except KeyError:
        unknown_race_deaths = 0
    known_race_deaths = deaths - unknown_race_deaths

    pct_aa_cases = to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=False,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]
def _scrape(self, **kwargs):
    """Scrape case/death counts by race from two Tableau dashboards,
    converting published race percentages into counts.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.CASES_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 38)
        .find_request('cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 20)
        .find_request('deaths', find_by=tableau.find_tableau_request))

    parser = tableau.TableauParser(request=results.requests['cases'])
    date_str = pydash.head(
        parser.extract_data_from_key('Footer')['AGG(Today)'])
    date = datetime.strptime(date_str, '%m-%d-%y').date()
    cases = pydash.head(
        parser.extract_data_from_key('Total Cases')['AGG(Total Cases)'])
    deaths = pydash.head(
        parser.extract_data_from_key('Total Deaths')['SUM(Count Of Deaths)'])

    # Note: the Tableau key really has a trailing space ('Race Breakdown ').
    cases_pct_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('Race Breakdown ')).set_index('Race')
    # Dashboard publishes fractions of total cases; convert to counts.
    cases_df = cases_pct_df.assign(Count=[
        round(v * cases) for v in cases_pct_df['CNTD(Caseid 1)'].values
    ])
    aa_cases = cases_df.loc['Black']['Count']
    known_race_cases = cases - cases_df.loc['Unknown']['Count']

    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths_pct_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('Bar | Race')).set_index('Race')
    deaths_df = deaths_pct_df.assign(Count=[
        round(v * deaths) for v in deaths_pct_df['SUM(Death Count)'].values
    ])
    aa_deaths = deaths_df.loc['Black']['Count']
    known_race_deaths = deaths - deaths_df.loc['Unknown']['Count']

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=True,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]
def _scrape(self, **kwargs):
    """Scrape date from the home page and demographic breakdowns from two
    Tableau dashboards; race values are fractions converted to counts.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.HOME_PAGE_URL)
        .get_page_source()
        .go_to_url(self.DEMOGRAPHIC_CASES_URL)
        .find_request(key='cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEMOGRAPHIC_DEATHS_URL)
        .find_request('deaths', find_by=tableau.find_tableau_request))

    # Date appears in a <strong> tag containing the word 'current'.
    date_str_element = results.page_source.find(
        'strong', string=re.compile('current'))
    assert date_str_element, 'No date element found'
    date_str = date_str_element.get_text()
    pattern = re.compile(r'(\d{1,2}\/\d{1,2}\/\d{4})')
    matches = pattern.search(date_str)
    assert matches, 'Date not found.'
    date = datetime.strptime(matches.group(), '%m/%d/%Y').date()

    parser = tableau.TableauParser(request=results.requests['cases'])
    cases_df = parser.get_dataframe_from_key('CaseRace').set_index('Measure Status')
    # NOTE(review): 'AGG(Calculation1)' on the Black row appears to hold the
    # statewide case total while 'SUM(Count)' holds the race fraction --
    # confirm against the live dashboard payload.
    cases = cases_df.loc['Black or African American']['AGG(Calculation1)']
    aa_cases = round(
        cases_df.loc['Black or African American']['SUM(Count)'] * cases)

    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths = parser.get_dataframe_from_key('Total Deaths (2)')['SUM(Deaths)'].sum()
    deaths_df = parser.get_dataframe_from_key('Race').set_index('Measure Status11')
    aa_deaths = round(
        deaths_df.loc['Black or African American']['SUM(Deaths)'] * deaths)

    pct_aa_cases = to_percentage(aa_cases, cases)
    pct_aa_deaths = to_percentage(aa_deaths, deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape lab-confirmed cases and deaths by race from two Tableau
    dashboards captured via selenium-wire.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.CASES_URL)
        .find_request('cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .find_request('deaths', find_by=tableau.find_tableau_request))

    parser = tableau.TableauParser(request=results.requests['cases'])
    raw_date_str = pydash.head(
        parser.extract_data_from_key('cases')['ATTR(dateupdated)'])
    date = datetime.strptime(raw_date_str, '%m/%d/%Y').date()
    cases = pydash.head(
        parser.extract_data_from_key('cases')['SUM(Laboratory Confirmed Cases)'])
    cases_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth')).set_index('subcategory')
    aa_cases = cases_df.loc['Black']['SUM(count)']
    known_race_cases = cases - cases_df.loc['Unknown']['SUM(count)']

    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths = pydash.head(
        parser.extract_data_from_key('death (2)')['SUM(Deaths)'])
    deaths_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth (death)')).set_index('subcategory')
    aa_deaths = deaths_df.loc['Black']['SUM(count)']
    known_race_deaths = deaths - deaths_df.loc['Unknown']['SUM(count)']

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_race_cases,
            known_race_deaths=known_race_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape New York data: statewide totals from Tableau plus NYC race
    deaths from a CSV, combined into one known-race death total.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.SUMMARY_URL)
        .find_request('summary', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .find_request('deaths', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.NYS_RACE_DEATHS_URL)
        .find_request('race_deaths', find_by=tableau.find_tableau_request))

    parser = tableau.TableauParser(request=results.requests['summary'])
    date_info = parser.extract_data_from_key('DASHBOARD 1 DATE (2)')
    date_str = pydash.get(date_info, 'MAX(Last_Reported_Test_Formatted).0')
    date = datetime.strptime(date_str, '%m/%d/%Y').date()

    cases_df = self.get_cases_df(
        parser.extract_data_from_key('TABLE VIEW (LARGE)'))
    # '%all%' is Tableau's aggregate row across all counties.
    cases = cases_df.loc['%all%']['Measure Values']

    parser = tableau.TableauParser(request=results.requests['deaths'])
    # Note: 'Fatalaties by County' reproduces the dashboard's own typo.
    deaths_df = self.get_deaths_df(
        parser.extract_data_from_key('Fatalaties by County'))
    deaths = deaths_df.loc['%all%']['Measure Values']

    parser = tableau.TableauParser(request=results.requests['race_deaths'])
    nys_race_deaths_df = self.get_nys_race_deaths_df(
        parser.extract_data_from_key('Race/Ethnicity Table (2)'))
    # NYC publishes its race breakdown separately as a CSV.
    nyc_race_deaths_df = pd.read_csv(
        self.NYC_RACE_DEATHS_URL).set_index('RACE_GROUP')
    aa_deaths = (
        nys_race_deaths_df.loc['Black']['Measure Values'] +
        nyc_race_deaths_df.loc['Black/African-American']['DEATH_COUNT'])
    known_race_deaths = nys_race_deaths_df.loc['Total'][
        'Measure Values'] + nyc_race_deaths_df['DEATH_COUNT'].sum()
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_deaths=aa_deaths,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=False,
                          known_race_deaths=known_race_deaths)
    ]
def _df_from_url(self, url, index_col=None):
    """Fetch the CSV behind a Tableau viewData page and load it as a frame.

    The first visit to the URL is expected to land on an 'Unexpected Error'
    page; reloading the same URL then renders the CSV download link, whose
    target is read into a DataFrame.
    """
    steps = (WebdriverSteps()
             .go_to_url(url)
             .wait_for_presence_of_elements([
                 (By.XPATH,
                  "//div[@id='tabBootErrTitle' and contains(text(),'Unexpected Error')]")])
             .go_to_url(url)
             .wait_for_presence_of_elements([(By.CLASS_NAME, 'csvLink_summary')])
             .get_page_source())
    page = WebdriverRunner().run(steps).page_source

    anchor = page.find('a', {'class': 'csvLink_summary'})
    assert anchor, 'No CSV link found'
    href = anchor.get('href')
    assert href, 'No CSV link found'

    return pd.read_csv(get_content_as_file(href), index_col=index_col)
def setup_session(self):
    """Open the base dashboard, capture the X-Session-Id, then click
    through the statewide- and death-demographics story points so both
    dashboards are loaded in the session. Returns the runner results.
    """
    steps = WebdriverSteps()
    steps = steps.go_to_url(self.BASE_URL)
    steps = steps.wait_for_presence_of_elements([(By.ID, 'dashboard-viewport')])
    steps = steps.get_x_session_id()
    steps = steps.wait_for_presence_of_elements([
        (By.CLASS_NAME, 'tabStoryPointContent'),
        (By.CLASS_NAME, 'tab-widget')])
    # Statewide demographics tab, then wait for its dashboard title.
    steps = steps.find_element_by_xpath(
        f"//*[contains(text(), '{self.STATEWIDE_DEMOGRAPHICS_TAB_TEXT}')]")
    steps = steps.click_on_last_element_found()
    steps = steps.wait_for_presence_of_elements([
        (By.XPATH,
         f"//*[contains(text(), '{self.STATEWIDE_DEMOGRAPHICS_DASHBOARD_TITLE}')]")])
    # Death demographics tab, then wait for its dashboard title.
    steps = steps.find_element_by_xpath(
        f"//*[contains(text(), '{self.DEATH_DEMOGRAPHICS_TAB_TEXT}')]")
    steps = steps.click_on_last_element_found()
    steps = steps.wait_for_presence_of_elements([
        (By.XPATH,
         f"//*[contains(text(), '{self.DEATH_DEMOGRAPHICS_DASHBOARD_TITLE}')]")])
    return WebdriverRunner().run(steps)
def _scrape(self, **kwargs):
    """Scrape totals and cases-by-race from a PowerBI dashboard.

    Note: deaths by race are not reported; only aa_cases is emitted.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.URL)
        .wait_for_number_of_elements((By.XPATH, "//*[name()='svg']"), 26)
        .find_request('race_cases',
                      find_by=powerbi.filter_requests(selects=[
                          'dashboard.Race/Ethnicity',
                          'Sum(dashboard.confirmed or probable death)'
                      ]))
        .find_request('summary_cases',
                      find_by=powerbi.filter_requests(selects=[
                          'Custom Totals.Test Results',
                          'Sum(Custom Totals.# of Cases)'
                      ]))
        .get_page_source())

    date = self.parse_date(results.page_source)

    parser = powerbi.PowerBIParser(results.requests['summary_cases'])
    df = parser.get_dataframe_by_key('Custom Totals').set_index('Label')
    cases = df.loc['Total Positive Cases*']['Sum(Custom Totals.# of Cases)']
    deaths = df.loc['Deaths***']['Sum(Custom Totals.# of Cases)']

    parser = powerbi.PowerBIParser(results.requests['race_cases'])
    df = parser.get_dataframe_by_key('Race/Ethnicity').set_index(
        'dashboard.Race/Ethnicity')
    # I believe the people who labeled the dashboards labeled them incorrectly
    # the results below actually reflect number of cases and not deaths.
    aa_cases = df.loc['Black, Non-Hispanic'][
        'Sum(dashboard.confirmed or probable death)1']
    known_race_cases = df['Sum(dashboard.confirmed or probable death)1'].sum()
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            pct_aa_cases=pct_aa_cases,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_race_cases,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape gender and race summaries from a Tableau bootstrapSession
    response; '%all%' rows hold the dashboard's aggregate totals.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.URL)
        .wait_for_presence_of_elements(
            (By.XPATH, "//span[contains(text(), 'Race/Ethnicity is known for ')]"))
        .find_request(key='results', find_by=lambda r: 'bootstrapSession' in r.path)
        .get_page_source())

    date = self.get_date(results.page_source)

    resp_body = results.requests['results'].response.body.decode('utf8')
    tableau_parser = TableauParser(resp_body)

    gender_dict = tableau_parser.extract_data_from_key('Summary by Gender')
    df = pd.DataFrame.from_dict(gender_dict).set_index(['Gender', 'Measure Names'])
    # Note: 'Cumulative Infections ' really has a trailing space.
    cases = parse.raw_string_to_int(
        df.loc['%all%', 'Cumulative Infections ']['Measure Values'])
    deaths = parse.raw_string_to_int(df.loc['%all%', 'Deaths']['Measure Values'])

    race_dict = tableau_parser.extract_data_from_key('Summary by Race-Eth')
    df = pd.DataFrame.from_dict(race_dict).set_index(['Race/Ethnicity', 'Measure Names'])
    aa_cases = parse.raw_string_to_int(
        df.loc['Black or African American**', 'Cumulative Infections ']['Measure Values'])
    aa_deaths = parse.raw_string_to_int(
        df.loc['Black or African American**', 'Deaths']['Measure Values'])
    # NOTE(review): '%all%' in the race table is taken as the known-race
    # total -- confirm the dashboard excludes unknowns from this row.
    known_race_cases = parse.raw_string_to_int(
        df.loc['%all%', 'Cumulative Infections ']['Measure Values'])
    known_race_deaths = parse.raw_string_to_int(
        df.loc['%all%', 'Deaths']['Measure Values'])

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=False,
        known_race_cases=known_race_cases,
        known_race_deaths=known_race_deaths
    )]
def _scrape(self, **kwargs):
    """Scrape race demographics from a single Tableau dashboard that
    publishes both hospitalization-status and survival-outcome tables.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 12)
        .find_request('summary', find_by=tableau.find_tableau_request))

    parser = tableau.TableauParser(request=results.requests['summary'])
    date_str = parser.get_dataframe_from_key('Date Stamp').loc[0]['Date Stamp']
    match = re.search(r'\d{1,2}\/\d{1,2}\/\d{4}', date_str)
    date = datetime.strptime(match.group(), '%m/%d/%Y').date()

    # Cases: '%all%' status crossed with the 'Race' demographic.
    cases_df = parser.get_dataframe_from_key('Demographic Data - Hospitalizaton Status')
    cases_df = cases_df.set_index(
        ['Status Value', 'Demographic', 'Categories']).sort_index().loc[('%all%', 'Race')]
    cases_df['SUM(Count)'] = cases_df['SUM(Count)'].apply(raw_string_to_int)
    cases = cases_df.loc['All', 'SUM(Count)']
    aa_cases = cases_df.loc['Black', 'SUM(Count)']
    known_race_cases = cases - cases_df.loc['Refused/Unknown', 'SUM(Count)']

    # Deaths: 'Died' outcome crossed with the 'Race' demographic.
    deaths_df = parser.get_dataframe_from_key('Demographic Data - Survival Outcomes')
    deaths_df = deaths_df.set_index(
        ['Status Value', 'Demographic', 'Categories']).sort_index().loc[('Died', 'Race')]
    deaths_df['SUM(Count)'] = deaths_df['SUM(Count)'].apply(raw_string_to_int)
    deaths = deaths_df.loc['All', 'SUM(Count)']
    aa_deaths = deaths_df.loc['Black', 'SUM(Count)']
    known_race_deaths = deaths - deaths_df.loc['Refused/Unknown', 'SUM(Count)']

    pct_aa_cases = to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_race_deaths)

    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_race_cases,
        known_race_deaths=known_race_deaths
    )]
def _scrape(self, **kwargs):
    """Scrape totals and a race/ethnicity table from a rendered HTML page.

    NOTE(review): unlike sibling scrapers, `self.get_date()` here takes no
    page source -- confirm it derives the date independently (not visible
    in this chunk).
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.DATA_URL)
        .wait_for_presence_of_elements([
            (By.CLASS_NAME, 'markdown-card'),
            (By.CLASS_NAME, 'ember-view')
        ])
        .get_page_source())
    soup = results.page_source

    date = self.get_date()
    _logger.info(f'Processing data for {date}')

    cases = self.get_total_cases(soup)
    deaths = self.get_total_deaths(soup)

    race_df = self.get_race_and_ethnicity_table(soup)
    known_cases = cases - race_df.loc['Data not available', 'Cases']
    known_deaths = deaths - race_df.loc['Data not available', 'Deaths']
    aa_cases = race_df.loc['African-American (NH)', 'Cases']
    aa_deaths = race_df.loc['African-American (NH)', 'Deaths']

    pct_aa_cases = to_percentage(aa_cases, known_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape totals from an ArcGIS geoservice and the race breakdown from
    a Tableau dashboard; the date comes from the main page's HTML.
    """
    # Totals come from the ArcGIS geoservice, not the dashboard.
    _, summary_df = arcgis.query_geoservice(**self.SUMMARY_QUERY)
    cases = summary_df.loc[0, 'Cases']
    deaths = summary_df.loc[0, 'Deaths']

    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.RACE_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 14)
        .find_request('race_cases', find_by=tableau.find_tableau_request)
        .go_to_url(self.MAIN_PAGE_URL)
        .get_page_source())
    soup = results.page_source
    date = self.get_date(soup)

    parser = tableau.TableauParser(request=results.requests['race_cases'])
    cases_df = parser.get_dataframe_from_key('Census')
    # Keep only the 'Case %' measure rows before indexing by race.
    cases_df = cases_df[cases_df['Measure Names'] == 'Case %'].set_index('Race')
    aa_cases = cases_df.loc['Black', 'SUM(Case Count)']
    known_race_cases = cases_df['SUM(Case Count)'].sum()
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            pct_aa_cases=pct_aa_cases,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_race_cases,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape Oklahoma totals and race percentages from Looker dashboards.

    Fixes: renamed misspelled locals (`percentage_unkown_*` ->
    `percentage_unknown_*`) and made `_get_demographic_data` fail with a
    clear assertion (instead of UnboundLocalError) when a legend label
    is missing.
    """
    # Pulling date, total cases, and deaths.
    runner = WebdriverRunner()
    cases_results_dashboard = runner.run(WebdriverSteps().go_to_url(
        self.OVERALL_DASHBOARD).wait_for_presence_of_elements([
            (By.XPATH, "//a[@target='_self']")
        ]).get_page_source())

    date = cases_results_dashboard.page_source.find(
        text='OK Summary').findNext('a').text
    total_cases = int(
        cases_results_dashboard.page_source.find(
            text='OK Cases').findNext('a').text.replace(',', ''))
    total_deaths = int(
        cases_results_dashboard.page_source.find(
            text='OK Deaths').findNext('a').text.replace(',', ''))

    # OK demographic breakdowns have to be obtained through WebdriverRunner
    # and by scraping the Looker dashboard
    runner = WebdriverRunner()
    cases_results = runner.run(WebdriverSteps().go_to_url(
        self.CASES_DASHBOARD_OK).wait_for_presence_of_elements([
            (By.XPATH, "//a[@target='_self']")
        ]).get_page_source())

    runner = WebdriverRunner()
    death_results = runner.run(WebdriverSteps().go_to_url(
        self.DEATH_DASHBOARD_OK).wait_for_presence_of_elements([
            (By.XPATH, "//a[@target='_self']")
        ]).get_page_source())

    # Once we have the page source for both dashboard, I'm extracting the "tspan"
    # tags which include the percentages for black lives
    def _get_demographic_data(page):
        """Return (unknown_pct, aa_pct) read from the 'by race' chart legend."""
        by_race_title = page.find(text=re.compile(r'by race', re.I))
        by_race_svg = by_race_title.find_next('svg')
        legend = by_race_svg.find('g', class_='highcharts-legend')
        pct_re = re.compile(r'([0-9.]+)%')
        unknown_pct = None
        aa_pct = None
        for legend_item in legend.find_all('g', class_='highcharts-legend-item'):
            text = ' '.join(
                (tspan.text.strip() for tspan in legend_item.find_all('tspan')))
            if text.find('Unknown') >= 0:
                match = pct_re.search(text)
                assert match is not None, 'Unable to find value for label "Unknown"'
                unknown_pct = float(match.group(1))
            elif text.find('African') >= 0:
                match = pct_re.search(text)
                assert match is not None, 'Unable to find value for label "African"'
                aa_pct = float(match.group(1))
        # Fail loudly (not with UnboundLocalError) if either label was absent.
        assert unknown_pct is not None, 'Legend item "Unknown" not found'
        assert aa_pct is not None, 'Legend item "African" not found'
        return (unknown_pct, aa_pct)

    percentage_unknown_cases, aa_cases_pct = _get_demographic_data(
        cases_results.page_source)
    percentage_unknown_deaths, aa_deaths_pct = _get_demographic_data(
        death_results.page_source)

    # Convert legend percentages into approximate counts.
    aa_cases = (aa_cases_pct / 100) * total_cases
    aa_deaths = (aa_deaths_pct / 100) * total_deaths
    known_cases = total_cases * (1 - (percentage_unknown_cases / 100))
    known_deaths = total_deaths * (1 - (percentage_unknown_deaths / 100))

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape cases and deaths by ethnicity from a PowerBI report; the
    deaths view only loads after clicking the 'Deaths' button.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.URL)
        .wait_for_presence_of_elements((
            By.XPATH,
            "//*[name()='svg']/*[name()='g']/*[name()='g']/*[name()='g']/*[name()='text']/*[name()='title' and contains(text(), 'Black')]"
        ))
        .wait_for_visibility_of_elements(
            (By.XPATH, "//button[span[contains(text(), 'Deaths')]]"))
        .find_request('date', find_by=powerbi.filter_requests(entity='Date_Uploaded'))
        .find_request('cases_by_race',
                      find_by=powerbi.filter_requests(entity='Cases_Ethnicity'))
        .find_element_by_xpath("//button[span[contains(text(), 'Deaths')]]")
        .clear_request_history()
        .click_on_last_element_found()
        .wait_for_number_of_elements(
            (By.XPATH, "//div[contains(@aria-label, 'Deaths -')]"), 6)
        .wait_for_number_of_elements(
            (By.XPATH, "//*[name()='svg']/*[name()='g']/*[name()='rect']"), 20)
        .find_request('deaths_by_race',
                      find_by=powerbi.filter_requests(entity='Deaths_Ethnicity')))

    assert 'date' in results.requests, '`date` request missing'
    assert 'cases_by_race' in results.requests, '`cases_by_race` request missing'
    assert 'deaths_by_race' in results.requests, '`deaths_by_race` request missing'

    # Date
    parser = powerbi.PowerBIParser(results.requests['date'])
    df = parser.get_dataframe_by_key('Date_Uploaded')
    unparsed_date = df.loc[0]['Date_Uploaded.Data as of']
    date = self.parse_date(unparsed_date)

    # Cases
    # NOTE(review): the index is the count column and the 'raceethnicity'
    # column holds the numbers -- the PowerBI payload appears to swap
    # labels; confirm against a live capture.
    parser = powerbi.PowerBIParser(results.requests['cases_by_race'])
    cases_by_race_df = parser.get_dataframe_by_key(
        'Cases_Ethnicity').set_index(
            'CountNonNull(Cases_Ethnicity.Total Cases)')
    cases = cases_by_race_df['Cases_Ethnicity.raceethnicity'].sum()
    aa_cases = cases_by_race_df.loc['Black or African American'][
        'Cases_Ethnicity.raceethnicity']
    known_cases = cases - cases_by_race_df.loc['Unknown'][
        'Cases_Ethnicity.raceethnicity']
    pct_aa_cases = misc.to_percentage(aa_cases, known_cases)

    # Deaths
    parser = powerbi.PowerBIParser(results.requests['deaths_by_race'])
    deaths_by_race_df = (parser.get_dataframe_by_key(
        key='Deaths_Ethnicity').set_index(
            'Sum(Deaths_Ethnicity.Total Cases)'))
    deaths = deaths_by_race_df['Deaths_Ethnicity.raceethnicity'].sum()
    aa_deaths = deaths_by_race_df.loc['Black or African American'][
        'Deaths_Ethnicity.raceethnicity']
    # if there are no unknown deaths, it will not appear in the dataframe.
    known_deaths = deaths
    if 'Unknown' in deaths_by_race_df.index:
        known_deaths -= deaths_by_race_df.loc['Unknown'][
            'Deaths_Ethnicity.raceethnicity']
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape cases and deaths by race from a Tableau dashboard.

    The report date is read from the landing page first; the dashboard
    itself only issues the deaths request after its story point is
    activated by a click, so request history is cleared before the
    click and `deaths` is captured afterwards.

    Returns a single-element list containing the assembled series.
    """

    def pct_to_counts(share_df, total):
        # The dashboard reports per-race fractions; turn each share
        # into an absolute (rounded) count column.
        shares = share_df['SUM(Number of Records)'].values
        return share_df.assign(
            Count=[round(share * total) for share in shares])

    runner = WebdriverRunner()

    home = runner.run(WebdriverSteps().go_to_url(
        self.HOME_URL).get_page_source())
    date = self.parse_date(home.page_source)

    dashboard = runner.run(
        WebdriverSteps()
        .go_to_url(self.URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 32)
        .find_request('cases', tableau.find_tableau_request)
        .clear_request_history()
        .find_element_by_xpath(
            '//*[@id="tabZoneId4"]/div/div/div/span[2]/div/span/span/span[2]/div[2]/div'
        )
        .click_on_last_element_found()
        .wait_for_number_of_elements(
            (By.XPATH, "//span[contains(text(), 'Deaths')]"), 6)
        .find_request('deaths',
                      find_by=lambda r: 'set-active-story-point' in r.path))

    # Cases
    cases_parser = tableau.TableauParser(
        request=dashboard.requests['cases'])
    cases = pydash.head(
        cases_parser.extract_data_from_key('Cases')
        ['SUM(Number of Records)'])
    cases_by_race = pct_to_counts(
        pd.DataFrame.from_dict(
            cases_parser.extract_data_from_key('Race_Cases')).set_index(
                'Assigned Race'), cases)
    aa_cases = cases_by_race.loc['Black']['Count']
    known_race_cases = cases - cases_by_race.loc['Unknown']['Count']

    # Deaths
    deaths_parser = tableau.TableauParser(
        request=dashboard.requests['deaths'])
    deaths = pydash.head(
        deaths_parser.extract_data_from_key('NumberDeaths')
        ['SUM(Number of Records)'])
    deaths_by_race = pct_to_counts(
        pd.DataFrame.from_dict(
            deaths_parser.extract_data_from_key('Race_Deaths')).set_index(
                'Assigned Race'), deaths)
    aa_deaths = deaths_by_race.loc['Black']['Count']
    known_race_deaths = deaths - deaths_by_race.loc['Unknown']['Count']

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=True,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]
def _scrape(self, **kwargs):
    """Scrape totals and Black/African-American counts from two
    dashboard pages (one for cases, one for deaths).

    Each page exposes its numbers through "card" requests matched by
    path; totals are summed over the rows of the total card, and the
    Black/African-American figure is pulled from the by-race card.
    Percentages are computed against the overall totals
    (pct_includes_unknown_race=True).

    Returns a single-element list containing the assembled series.
    """
    runner = WebdriverRunner()
    cases_results = runner.run(WebdriverSteps().go_to_url(
        self.CASES_DASHBOARD_URL).wait_for_number_of_elements(
            (By.XPATH, "//div[@class='badge-content-shield']"),
            10).wait_for_presence_of_elements(
                (By.XPATH, '//summary-number')).find_request(
                    key='cases',
                    find_by=lambda r: self.CASES_CARD_PATH in r.path
                ).find_request(
                    key='cases_by_race',
                    find_by=lambda r: self.AA_CASES_CARD_PATH in r.path
                ).get_page_source())

    deaths_results = runner.run(WebdriverSteps().go_to_url(
        self.DEATHS_DASHBOARD_URL).wait_for_number_of_elements(
            (By.XPATH, "//div[@class='kpi_chart']"), 14).find_request(
                key='deaths',
                find_by=lambda r: self.DEATHS_CARD_PATH in r.path
            ).find_request(
                key='deaths_by_race',
                find_by=lambda r: self.AA_DEATHS_CARD_PATH in r.path))

    date = self.get_date(cases_results.page_source)

    # total cases
    assert cases_results.requests['cases']
    case_data = self.load_response_json(cases_results, 'cases')
    cases_rows = self.extract_rows(case_data)
    cases = sum(pydash.pluck(cases_rows, 1))

    # aa cases
    assert cases_results.requests['cases_by_race']
    cases_by_race_data = self.load_response_json(cases_results,
                                                 'cases_by_race')
    cases_by_race_rows = self.extract_rows(cases_by_race_data)
    aa_row = pydash.find(
        cases_by_race_rows,
        lambda r: r[0] == 'Black or African-American') or []
    assert len(aa_row) == 2, 'Row is malformed'
    aa_cases = aa_row[1]

    # total deaths
    # Guard the deaths requests the same way as the cases requests so a
    # missing capture fails loudly here rather than deeper in the loader.
    assert deaths_results.requests['deaths']
    deaths_data = self.load_response_json(deaths_results, 'deaths')
    deaths_rows = self.extract_rows(deaths_data)
    deaths = sum(pydash.pluck(deaths_rows, 1))

    # aa_deaths
    assert deaths_results.requests['deaths_by_race']
    deaths_by_race_data = self.load_response_json(deaths_results,
                                                  'deaths_by_race')
    deaths_by_race_rows = self.extract_rows(deaths_by_race_data)
    aa_deaths_row = pydash.find(
        deaths_by_race_rows,
        lambda r: r[0] == 'Black or African-American') or []
    assert len(aa_deaths_row) == 2, 'Row is malformed'
    aa_deaths = aa_deaths_row[1]

    # Denominators are the overall totals, not known-race totals.
    pct_aa_deaths = to_percentage(aa_deaths, deaths)
    pct_aa_cases = to_percentage(aa_cases, cases)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=True,
                          pct_includes_hispanic_black=True)
    ]
def _scrape(self, **kwargs):
    """Scrape cases and deaths by race from two Tableau dashboards.

    The cases dashboard supplies the report date, the total case
    count, and the race/ethnicity breakdown; the deaths dashboard
    supplies the corresponding death figures. 'Unknown' rows are
    subtracted from the totals to form known-race denominators.

    Returns a single-element list containing the assembled series.
    """
    runner = WebdriverRunner()
    cases_results = runner.run(
        WebdriverSteps().go_to_url(self.CASES_URL).find_request(
            key='cases', find_by=find_tableau_request))
    deaths_results = runner.run(
        WebdriverSteps().go_to_url(self.DEATHS_URL).find_request(
            key='deaths', find_by=find_tableau_request))

    # Cases dashboard
    assert cases_results.requests['cases'], 'No results found for `cases`'
    cases_body = cases_results.requests['cases'].response.body.decode(
        'utf8')
    cases_parser = TableauParser(cases_body)

    parsed_date = cases_parser.extract_data_from_key('Date Updated')
    assert 'Date Updated' in parsed_date, 'Unable to parse date'
    assert len(parsed_date['Date Updated']) == 1, 'Unable to parse date'
    date = datetime.strptime(
        pydash.head(parsed_date['Date Updated']), '%m/%d/%Y').date()

    parsed_num_cases = cases_parser.extract_data_from_key(
        'Number of Cases')
    assert 'SUM(Number of Records)' in parsed_num_cases, 'Key not found, unable to parse number of records'
    assert len(parsed_num_cases['SUM(Number of Records)']
               ) == 1, 'Parsing error might have occurred'
    cases = pydash.head(parsed_num_cases['SUM(Number of Records)'])

    cases_by_race_df = pd.DataFrame.from_dict(
        cases_parser.extract_data_from_key(
            'Race/Ethnicity Epi')).set_index('Raceeth')
    aa_cases = cases_by_race_df.loc['Black, non-Hispanic'][
        'AGG(RecordCount)']
    known_race_cases = cases - cases_by_race_df.loc['Unknown'][
        'AGG(RecordCount)']

    # Deaths dashboard
    assert deaths_results.requests['deaths'], 'No results found for `deaths`'
    deaths_body = deaths_results.requests['deaths'].response.body.decode(
        'utf8')
    deaths_parser = TableauParser(deaths_body)

    parsed_death_cases = deaths_parser.extract_data_from_key(
        'Number of deaths')
    assert 'SUM(Death count)' in parsed_death_cases, 'Death count not found'
    assert len(parsed_death_cases['SUM(Death count)']
               ) == 1, 'Parsing error might have occurred.'
    deaths = pydash.head(parsed_death_cases['SUM(Death count)'])

    deaths_by_race_df = pd.DataFrame.from_dict(
        deaths_parser.extract_data_from_key(
            'Death Race/Ethnicity')).set_index('Raceeth')
    aa_deaths = deaths_by_race_df.loc['Black, non-Hispanic'][
        'AGG(RecordCount)']
    known_race_deaths = deaths - deaths_by_race_df.loc['Unknown'][
        'AGG(RecordCount)']

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=False,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]
def _scrape(self, **kwargs):
    """Scrape totals and race breakdowns from two Power BI pages.

    Flow: the main URL yields the `summary` request (total cases and
    deaths) plus the page source used for the date; the race URL
    yields `race_cases` directly, then a slicer click switches the
    view to Deaths (inside an iframe) to capture `race_deaths`.

    Returns a single-element list containing the assembled series.
    """
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps().go_to_url(self.URL).wait_for_number_of_elements(
            (By.XPATH, '//visual-modern'), 34
        ).wait_for_presence_of_elements(
            (By.XPATH, '//span[contains(text(), "Last updated")]')
        ).find_request(
            'summary',
            find_by=powerbi.filter_requests(
                entity='Trend Analysis',
                selects=['Sum(County.Deaths)'])
        # Page source is captured here for date parsing; history is
        # cleared so the race page's requests are matched cleanly.
        ).get_page_source().clear_request_history(
        ).go_to_url(self.race_url).wait_for_presence_of_elements(
            (By.XPATH, "//*[name()='title' and contains(text(), 'Black')]")
        ).wait_for_presence_of_elements(
            (By.XPATH, "//*[name()='title' and contains(text(), 'Asian')]")
        ).find_request(
            'race_cases',
            find_by=powerbi.filter_requests(
                entity='Demographics',
                selects=['CountNonNull(Demographics.Black)'])
        # Open the slicer and pick the 'Deaths' row to flip the
        # demographics view from cases to deaths.
        ).find_element_by_xpath(
            "//div[@class='slicer-restatement']"
        ).click_on_last_element_found().wait_for_visibility_of_elements(
            (By.XPATH, "//div[@class='row' and .//span/@title='Deaths']")
        ).find_element_by_xpath(
            "//div[@class='row' and .//span/@title='Deaths']"
        # Clear again so `race_deaths` only matches post-click traffic;
        # the deaths chart renders inside an iframe.
        ).clear_request_history().click_on_last_element_found(
        ).switch_to_iframe().wait_for_presence_of_elements((
            By.XPATH,
            "//*[name()='svg']/*[name()='g']/*[name()='g']/*[name()='text']/*[name()='title' and contains(text(), 'deaths')]"
        )).find_request(
            'race_deaths',
            find_by=powerbi.filter_requests(
                entity='Demographics',
                selects=['CountNonNull(Demographics.Black)'])))

    page_source = results.page_source
    date = self.get_date(page_source)

    # Statewide totals from the summary request.
    parser = powerbi.PowerBIParser(results.requests['summary'])
    cases_df = parser.get_dataframe_by_key('Sum(County.Total Cases)')
    cases = cases_df.loc[0, 'Sum(County.Total Cases)']
    deaths_df = parser.get_dataframe_by_key('Sum(County.Deaths)')
    deaths = deaths_df.loc[0, 'Sum(County.Deaths)']

    # Race breakdown: cases.
    # NOTE(review): `loc[0].sum()` assumes every column of row 0 is a
    # race count (i.e. the known-race total) — confirm the Demographics
    # payload has no non-race or 'Unknown' columns.
    parser = powerbi.PowerBIParser(results.requests['race_cases'])
    race_cases_df = parser.get_dataframe_by_key('Demographics')
    aa_cases = race_cases_df.loc[0, 'CountNonNull(Demographics.Black)']
    known_race_cases = race_cases_df.loc[0].sum()

    # Race breakdown: deaths (same schema assumption as above).
    parser = powerbi.PowerBIParser(results.requests['race_deaths'])
    race_deaths_df = parser.get_dataframe_by_key('Demographics')
    aa_deaths = race_deaths_df.loc[0, 'CountNonNull(Demographics.Black)']
    known_race_deaths = race_deaths_df.loc[0].sum()

    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=False,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]