def get_data(download=False):
    """Download FL DEM covid19 data from the web or read it from the database

    Keyword Arguments:
        download {bool} -- scrape data from the source pdf instead of
            reading the local database (default: {False})

    Returns:
        tuple(DataFrame, DataFrame) -- florida covid19 cases data and
            deaths data (docstring previously claimed a single DataFrame)
    """
    if download:
        # instantiate covid19 pdf scraper
        pdf = PdfScraper()
        pdf.get_pages()

        # covid19 cases and deaths scraped from the pdf report;
        # markers match 'Case'/'Death' not followed by an 's'
        cases = pdf.get_data(marker=r'Case[^s]')
        deaths = pdf.get_data(marker=r'Death[^s]')

        # persist the raw scrape for later offline runs
        _db = DataBase()
        _db.add_table(FL_CASES_TABLE, cases, index=False)
        _db.add_table(FL_DEATHS_TABLE, deaths, index=False)
        _db.close()
    else:
        # read from database
        _db = DataBase()
        cases = _db.get_table(FL_CASES_TABLE, parse_dates=['date'])
        deaths = _db.get_table(FL_DEATHS_TABLE, parse_dates=['date'])
        _db.close()

    return cases, deaths
def add_metadata():
    """Updates options, dates and levels database tables

    Updates:
        database table -- OPTIONS
        database table -- DATES
        database table -- LEVELS
    """
    _db = DataBase()
    _db.update(DROP_US_MAP_PIVOT_VIEW)
    _db.update(US_MAP_PIVOT_VIEW)
    # BUG FIX: the column is named 'date', not 'dates' -- pandas silently
    # ignores parse_dates entries that do not match a column, so the old
    # spelling left the column unparsed
    data = _db.get_table(NYTIMES_COUNTIES_TABLE, parse_dates=['date'])
    _db.close()

    # defensive: guarantee datetime dtype even if the table stored strings
    data['date'] = pd.to_datetime(data['date'])

    # last 15 days, counting back from the most recent reported date
    latest_date = data['date'].max()
    dates = [latest_date - pd.to_timedelta(day, 'days') for day in range(15)]

    # meta data
    _db = DataBase()
    _db.add_table(LEVELS_TABLE, pd.DataFrame({'level': LEVELS}), index=False)
    _db.add_table(DATES_TABLE, pd.DataFrame({'date': dates}), index=False)
    _db.update(DROP_OPTIONS_TABLE)
    _db.update(CREATE_OPTIONS_TABLE)
    _db.update(INSERT_USA_OPTION)
    _db.close()
def clean_states_data():
    """Clean US states data from NY Times

    Returns:
        DataFrame -- clean us states data

    Updates:
        database table -- NYTIMES_STATES_TABLE
        database view -- STATES_VIEW
    """
    # covid19 data and state metadata
    _db = DataBase()
    raw = _db.get_table(US_STATES_TABLE, parse_dates=['date'])
    states = _db.get_geotable(STATE_MAP_TABLE)
    _db.close()

    total_rows = len(raw)

    # map state names (case-insensitive) to state ids
    states['name'] = states['name'].str.lower()
    name_to_id = states.set_index('name')['state_id'].to_dict()
    raw['state_id'] = raw['state'].str.lower().map(name_to_id)

    # drop rows whose state is absent from the metadata
    known_ids = list(states['state_id'])
    raw = raw[raw['state_id'].isin(known_ids)].copy(deep=True)

    # days elapsed since the latest reported date (nullable Int32)
    one_day = pd.to_timedelta(1, unit='days')
    raw['day'] = (raw['date'].max() - raw['date']) / one_day
    raw['day'] = raw['day'].astype('Int32')

    kept_rows = len(raw)

    # ny times table columns only
    clean = raw[['state_id', 'date', 'day', 'cases', 'deaths']].copy(deep=True)
    clean.reset_index(drop=True, inplace=True)

    # report how many rows were discarded
    dropped = total_rows - kept_rows
    print(
        f'ignored lines: {dropped}/{total_rows} = '
        f'{(100*dropped/total_rows):.01f}%'
    )

    # table and view to database
    _db = DataBase()
    _db.add_table(NYTIMES_STATES_TABLE, clean.set_index('state_id'))
    _db.update(DROP_STATES_VIEW)
    _db.update(STATES_VIEW)
    _db.close()

    return clean
def download_fldem():
    """Get, clean and store covid19 data from FL DEM
    """
    # scrape the latest raw data into the staging tables first
    get_data(True)

    _db = DataBase()

    # clean and store cases, then deaths, each indexed by case id
    for raw_table, clean_table in ((FL_CASES_TABLE, FLDEM_CASES_TABLE),
                                   (FL_DEATHS_TABLE, FLDEM_DEATHS_TABLE)):
        cleaned = clean_data(raw_table)
        _db.add_table(clean_table, cleaned.set_index('case_id'))

    # rebuild the combined view
    _db.update(DROP_FLDEM_VIEW)
    _db.update(FLDEM_VIEW)
    _db.close()
def download_nytimes():
    """Read NY Times data from github
    """
    # fetch the county-level and state-level csv files from github and
    # store each one raw; fips kept as a string to preserve leading zeros
    for url, table in ((URL_COUNTIES, US_COUNTIES_TABLE),
                       (URL_STATES, US_STATES_TABLE)):
        frame = pd.read_csv(url, dtype={'fips': 'str'})
        _db = DataBase()
        _db.add_table(table, frame, index=False)
        _db.close()

    # derive the cleaned tables and metadata from the raw downloads
    clean_counties_data()
    clean_states_data()
    add_metadata()
def predict():
    """main module function to predict covid19 cases and deaths

    Inputs from database:
        US_STATES_TABLE {database table} -- nytimes covid19 data

    Outputs to database:
        ARIMA_CASES_TABLE {database table} -- cases[predict, upper, lower]
        ARIMA_DEATHS_TABLE {database table} -- deaths[predict, upper, lower]
    """
    _db = DataBase()
    data = _db.get_table(STATES_VIEW_TABLE, parse_dates=['date'])
    _db.close()

    # the cases and deaths pipelines were verbatim duplicates; run the
    # shared helper once per metric
    _predict_to_table(data, 'cases', ARIMA_CASES_TABLE)
    _predict_to_table(data, 'deaths', ARIMA_DEATHS_TABLE)


def _predict_to_table(data, column, table):
    """Run ARIMA for one metric and store the rounded predictions.

    Arguments:
        data {DataFrame} -- states view data with 'date' and 'state' columns
        column {str} -- metric to predict ('cases' or 'deaths')
        table {str} -- destination database table name
    """
    result = run_arima(data, column)

    # only data after 3/15/2020
    slicer = result['date'] > pd.to_datetime('3/15/2020')
    result = result.loc[slicer, :].copy(deep=True)
    result.sort_values(['date', 'state'], inplace=True)
    result = result.round(0)

    _db = DataBase()
    _db.add_table(table, data=result, index=False)
    _db.close()
def clean_counties_data():
    """Clean US counties data from NY Times

    Returns:
        DataFrame -- clean us counties data

    Updates:
        database table -- NYTIMES_COUNTIES_TABLE
        database view -- COUNTIES_VIEW
    """
    # covid19 data plus county and state metadata
    _db = DataBase()
    raw = _db.get_table(US_COUNTIES_TABLE, parse_dates=['date'])
    counties = _db.get_geotable(US_MAP_TABLE)
    states = _db.get_geotable(STATE_MAP_TABLE)
    _db.close()

    total_rows = len(raw)

    # use new york county fips for new york city
    raw.loc[raw['county'] == 'New York City', 'fips'] = '36061'

    # add state ids from the state-name lookup
    state_lookup = states.set_index('name')['state_id'].to_dict()
    raw['state_id'] = raw['state'].map(state_lookup)

    # add county ids - first attempt: join key "<county name><state id>"
    raw['id'] = raw['county'].str.lower() + raw['state_id']
    counties['id'] = counties['name'].str.lower() + counties['state_id']
    county_lookup = counties.set_index('id')['county_id'].to_dict()
    raw['county_id'] = raw['id'].map(county_lookup)

    # add county ids - last attempt: fall back to the reported fips code
    unmatched = (~raw['fips'].isna()) & (raw['county_id'].isna())
    raw.loc[unmatched, 'county_id'] = raw.loc[unmatched, 'fips']

    # get rid of data that is not in county meta data
    known_counties = list(counties['county_id'])
    raw = raw[raw['county_id'].isin(known_counties)].copy(deep=True)

    # state ids re-derived from the (authoritative) county ids
    id_to_state = counties.set_index('county_id')['state_id'].to_dict()
    raw['state_id'] = raw['county_id'].map(id_to_state)

    # days elapsed since the latest reported date (nullable Int32)
    one_day = pd.to_timedelta(1, unit='days')
    raw['day'] = (raw['date'].max() - raw['date']) / one_day
    raw['day'] = raw['day'].astype('Int32')

    kept_rows = len(raw)

    # ny times counties table columns only
    keep = ['county_id', 'state_id', 'date', 'day', 'cases', 'deaths']
    clean = raw[keep].copy(deep=True)
    clean.reset_index(drop=True, inplace=True)

    # bucket case counts into severity levels; values outside the bins
    # become NaN and are coerced to level 0
    clean['case_level'] = pd.cut(clean['cases'], LEVELS,
                                 labels=range(1, len(LEVELS)))
    clean['case_level'] = pd.to_numeric(clean['case_level'],
                                        'coerce').fillna(0)
    clean['case_level'] = clean['case_level'].astype('Int32')

    # report how many rows were discarded
    dropped = total_rows - kept_rows
    print(
        f'ignored lines: {dropped}/{total_rows} = '
        f'{(100*dropped/total_rows):.01f}%'
    )

    # tables to database
    _db = DataBase()
    _db.add_table(NYTIMES_COUNTIES_TABLE,
                  clean.set_index(['county_id', 'day']))
    _db.update(DROP_COUNTIES_VIEW)
    _db.update(COUNTIES_VIEW)
    _db.close()

    return clean