def data_read_and_prep(csv_path, epwk, yr, test_wks=4, wght=False, log_tr=False):
    """Load historical ILI data and split it around the epiweek (yr, epwk).

    Rows up to and including the week's start date form the training set;
    the following `test_wks` weeks form the test set.  `wght` selects the
    weighted ILI column, `log_tr` log-transforms both series.

    Returns (train, test, df, df_train, df_test); train/test are Series
    indexed by each epiweek's start date.
    """
    raw = pd.read_csv(csv_path, header=1)
    dropped = ["REGION", "REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64",
               "AGE 5-24", "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"]
    df = raw.drop(dropped, axis=1)
    # Translate (YEAR, WEEK) pairs into the calendar date each epiweek starts on.
    df['DATE'] = pd.to_datetime(df.apply(
        lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
        axis=1, result_type='reduce'))

    split_date = pd.to_datetime(epi.Week(yr, epwk).startdate())
    df_train = df[df['DATE'] <= split_date]
    in_test_window = (df['DATE'] > split_date) & \
                     (df['DATE'] <= split_date + timedelta(weeks=test_wks))
    df_test = df[in_test_window]

    column = '% WEIGHTED ILI' if wght else '%UNWEIGHTED ILI'
    train = df_train[column]
    test = df_test[column]
    if log_tr:
        train = np.log(train)
        test = np.log(test)
    train.index = df_train['DATE']
    test.index = df_test['DATE']
    return train, test, df, df_train, df_test
def test_year_weeks(year_cdc, year_iso):
    """Year.iterweeks must enumerate every epiweek of 2015 in each system
    (52 weeks for CDC, 53 for ISO)."""
    expected_cdc = [epiweeks.Week(2015, w) for w in range(1, 53)]
    assert list(year_cdc.iterweeks()) == expected_cdc
    expected_iso = [epiweeks.Week(2015, w, system="iso") for w in range(1, 54)]
    assert list(year_iso.iterweeks()) == expected_iso
def national():
    """Read data/national/ILINet.csv, drop the demographic/provider columns,
    and add a DATE column giving each epiweek's start date."""
    raw = pd.read_csv('data/national/ILINet.csv', header=1)
    unwanted = ["REGION", "REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64",
                "AGE 5-24", "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"]
    df = raw.drop(unwanted, axis=1)
    df['DATE'] = pd.to_datetime(df.apply(
        lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
        axis=1, result_type='reduce'))
    return df
def test_week_ordering(week_cdc, week_iso):
    """Relational operators must follow chronological order in each system.

    Note the differing last week of 2014: CDC has week 53, ISO week 52.
    """
    assert epiweeks.Week(2014, 53, system="cdc") < week_cdc
    assert epiweeks.Week(2015, 1, system="cdc") <= week_cdc
    assert epiweeks.Week(2015, 2, system="cdc") > week_cdc
    assert epiweeks.Week(2015, 1, system="cdc") >= week_cdc
    assert epiweeks.Week(2014, 52, system="iso") < week_iso
    assert epiweeks.Week(2015, 1, system="iso") <= week_iso
    assert epiweeks.Week(2015, 2, system="iso") > week_iso
    assert epiweeks.Week(2015, 1, system="iso") >= week_iso
def prepdata_retro(csv_path, epwk):
    """Load the national and HHS-regional retro ILINet snapshots for `epwk`.

    Files are expected at <csv_path>/national/ILINet_National_<epwk>.csv and
    <csv_path>/hhs/ILINet_HHS_<epwk>.csv, with 'X' cells read as NaN.
    Returns one DataFrame with missing REGION filled as 'National' and a
    DATE column giving each epiweek's start date.
    """
    nat_csv_file = csv_path + '/' + 'national/' + 'ILINet_National_' + str(epwk) + '.csv'
    df = pd.read_csv(nat_csv_file, na_values='X')
    df['REGION'] = df['REGION'].fillna('National')
    hhs_csv_file = csv_path + '/' + 'hhs/' + 'ILINet_HHS_' + str(epwk) + '.csv'
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # row-wise equivalent.
    df = pd.concat([df, pd.read_csv(hhs_csv_file, na_values='X')])
    df['REGION'] = df['REGION'].fillna('National')
    df['DATE'] = pd.to_datetime(df.apply(
        lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
        axis=1, result_type='reduce'))
    return df
def prepdata_append(csv_path):
    """Combine the live national, regional and state ILINet.csv files.

    Each file carries a banner row (hence header=1) and uses 'X' as its NA
    marker.  National rows get REGION filled as 'National'; a DATE column of
    epiweek start dates is added to the combined frame.
    """
    national = pd.read_csv(csv_path + 'national/ILINet.csv', na_values='X', header=1)
    national['REGION'] = national['REGION'].fillna('National')
    # DataFrame.append was removed in pandas 2.0; concatenate all three
    # levels in one pd.concat call instead.
    df = pd.concat([
        national,
        pd.read_csv(csv_path + 'regional/ILINet.csv', na_values='X', header=1),
        pd.read_csv(csv_path + 'state/ILINet.csv', na_values='X', header=1),
    ])
    df['DATE'] = pd.to_datetime(df.apply(
        lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
        axis=1, result_type='reduce'))
    return df
def region(number):
    """Return the ILINet series for HHS "Region <number>".

    `number` must be the region number as a string (it is concatenated into
    the lookup key), e.g. "4".
    """
    raw = pd.read_csv('data/regional/ILINet.csv', header=1)
    raw.drop(["REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64", "AGE 5-24",
              "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"],
             axis=1, inplace=True)
    # One DataFrame per region, keyed by the REGION label.
    per_region = {label: pd.DataFrame(raw.loc[raw['REGION'] == label])
                  for label in raw["REGION"].unique()}
    for frame in per_region.values():
        frame.drop(["REGION"], axis=1, inplace=True)
        frame['DATE'] = pd.to_datetime(frame.apply(
            lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
            axis=1, result_type='reduce'))
        #frame.drop(["YEAR", "WEEK"], axis = 1, inplace = True)
    return per_region["Region " + number]
def prepdata_flux(csv_path, epwk):
    """Load the FluX-format national, HHS and state ILINet snapshots for
    epiweek `epwk` (an object exposing .year and .week).

    Expects <csv_path>/ILINet_<level>_<YYYY>EW<WW>.csv with 'X' read as NaN.
    Returns one DataFrame with missing national `region` filled as
    'National' and a DATE column of epiweek start dates.  Note these files
    use lowercase 'region'/'year'/'week' headers.
    """
    tag = str(epwk.year) + 'EW' + str(epwk.week)
    nat_csv_file = csv_path + '/' + 'ILINet_national_' + tag + '.csv'
    df = pd.read_csv(nat_csv_file, na_values='X')
    df['region'] = df['region'].fillna('National')
    hhs_csv_file = csv_path + '/' + 'ILINet_hhs_' + tag + '.csv'
    state_csv_file = csv_path + '/' + 'ILINet_state_' + tag + '.csv'
    # DataFrame.append was removed in pandas 2.0; one pd.concat replaces the
    # two chained appends.
    df = pd.concat([
        df,
        pd.read_csv(hhs_csv_file, na_values='X'),
        pd.read_csv(state_csv_file, na_values='X'),
    ])
    df['DATE'] = pd.to_datetime(df.apply(
        lambda row: epi.Week(int(row["year"]), int(row["week"])).startdate(),
        axis=1, result_type='reduce'))
    return df
def main(args):
    """Run the autoregressive forecaster for the region/target/date in `args`
    (a namespace with REGION, TARGET, STARTDATE, ENDDATE — dates formatted
    like "2018EW05").

    Raises TypeError when a field fails validation.  Writes one forecast
    distribution per week under output/<year>/.
    """
    args = vars(args)
    if args["REGION"] not in regions:
        raise TypeError("REGION is not valid")
    if args["REGION"] == "national":
        args["REGION"] = "US National"
    if args["TARGET"] not in targets:
        raise TypeError("TARGET is not valid")
    # Raw strings: the former '\d' literals raise invalid-escape warnings on
    # modern Python.
    if re.fullmatch(r'\d{4}(EW)\d{2}', args["STARTDATE"]) is None:
        raise TypeError("STARTDATE is formatted incorrectly")
    if re.fullmatch(r'\d{4}(EW)\d{2}', args["ENDDATE"]) is None:
        raise TypeError("ENDDATE is formatted incorrectly")
    # CDC bin edges: 0.0, 0.1, ..., 13.0 plus a catch-all upper edge at 100
    # (132 edges = 131 bins, matching bn_mat.reshape([131, 4]) below).
    bin_ed = [round(0.1 * i, 1) for i in range(131)] + [100]
    startyear = args["STARTDATE"][:4]
    startweek = args["STARTDATE"][6:8]
    ww = epi.Week(int(startyear), int(startweek))
    region = args["REGION"]
    target = targets[args["TARGET"]]
    df = prepdata()
    directory = 'output/' + str(ww.year) + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    for i in range(0, 40):
        predictions, bn_mat = ARLR_module(df, region, target, ww + i)
        outputdistribution(predictions.reshape(4), bn_mat.reshape([131, 4]),
                           bin_ed, region, target, directory, ww + i)
    # Removed the trailing pdb.set_trace(): a leftover debugger breakpoint
    # would halt every non-interactive run.
def state(name):
    """Return the ILINet time series for the state called `name`."""
    raw = pd.read_csv('../data/state/ILINet.csv', header=1)
    raw = raw.drop(["REGION TYPE", "AGE 0-4", "AGE 25-49", "AGE 25-64",
                    "AGE 5-24", "AGE 50-64", "AGE 65", "NUM. OF PROVIDERS"],
                   axis=1)
    # One DataFrame per state, keyed by the REGION label.
    per_state = {label: pd.DataFrame(raw.loc[raw["REGION"] == label])
                 for label in raw["REGION"].unique()}
    for frame in per_state.values():
        frame.drop(["REGION"], axis=1, inplace=True)
        frame['DATE'] = pd.to_datetime(frame.apply(
            lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
            axis=1, result_type='reduce'))
        #frame.drop(["YEAR", "WEEK"], axis = 1, inplace = True)
    return per_state[name]
def state_data(csv_path, epwk, mode):
    """Build a state-by-week matrix of unweighted ILI.

    `mode` selects the file layout: "test" (live ILINet.csv with a banner
    row), "flux" (FluX snapshot with lowercase headers), or "retro"
    (archived snapshot).  Returns (df_state, st_dict): df_state has one
    column per state indexed by epiweek start date; st_dict['state'] lists
    the state names.  Raises ValueError for an unknown mode (previously
    this fell through to a NameError on `df`).
    """
    if mode == "test":
        state_csv_file = csv_path + 'state/ILINet.csv'
        df = pd.read_csv(state_csv_file, na_values='X', header=1)
    elif mode == "flux":
        state_csv_file = csv_path + '/' + 'ILINet_state_' + str(epwk.year) + 'EW' + str(epwk.week) + '.csv'
        df = pd.read_csv(state_csv_file, na_values='X')
    elif mode == "retro":
        state_csv_file = csv_path + '/' + 'state/' + 'ILINet_State_' + str(epwk.year) + str(epwk.week) + '.csv'
        df = pd.read_csv(state_csv_file, na_values='X')
    else:
        raise ValueError("unknown mode: " + str(mode))
    # Flux snapshots use lowercase 'year'/'week' headers (see prepdata_flux);
    # the other layouts use uppercase.  The former hard-coded "YEAR"/"WEEK"
    # raised KeyError in flux mode.
    year_col = "YEAR" if "YEAR" in df.columns else "year"
    week_col = "WEEK" if "WEEK" in df.columns else "week"
    df['DATE'] = pd.to_datetime(df.apply(
        lambda row: epi.Week(int(row[year_col]), int(row[week_col])).startdate(),
        axis=1, result_type='reduce'))
    df = df.rename(columns={'REGION TYPE': 'region_type',
                            'REGION': 'region',
                            '% WEIGHTED ILI': 'weighted_ili',
                            '%UNWEIGHTED ILI': 'unweighted_ili',
                            'DATE': 'date'})
    df = df.set_index('date')
    df_state = pd.DataFrame(columns=[], index=df.index.unique())
    st_dict = {'state': []}
    for st in df.region.unique():
        df_state[st] = df[df.region == st]['unweighted_ili']
        st_dict['state'].append(st)
    return df_state, st_dict
def prep_aw_data(st_id_path, **kwargs):
    '''Prepare weather data and return the corresponding DataFrame.

    kwargs maps level keys ("National", "HHS", and/or "States") to CSV
    paths; prepare this dictionary before calling.  st_id_path is a CSV
    mapping state FIPS codes ('state') to names ('state_name'), used for
    the "States" level.  The result is indexed by the start date of each
    row's epiweek (index name 'DATE').
    '''
    frames = []
    for key, value in kwargs.items():
        if value is None:
            # Levels without a supplied path are skipped instead of crashing
            # in pd.read_csv.
            continue
        if key == "National":
            part = pd.read_csv(value)
            part['region'] = part.apply(lambda x: "National", axis=1)
            part['region_type'] = part.apply(lambda x: "National", axis=1)
        elif key == "HHS":
            part = pd.read_csv(value)
            part['region'] = part.apply(
                lambda x: "Region {}".format(x['area_id']), axis=1)
            part['region_type'] = part.apply(lambda x: "HHS Regions", axis=1)
        elif key == "States":
            part = pd.read_csv(value)
            # 72/78 are Puerto Rico and the Virgin Islands; excluded.
            part = part[~part.area_id.isin([72, 78])]
            df_st_id = pd.read_csv(st_id_path)
            part['region'] = part.apply(
                lambda row: df_st_id[df_st_id['state'] == row['area_id']]
                ['state_name'].values[0], axis=1)
            part['region_type'] = part.apply(lambda x: "States", axis=1)
        else:
            # Unknown keys used to re-append the previous iteration's frame
            # (or raise NameError on the first pass); now they are ignored.
            continue
        frames.append(part)
    # DataFrame.append was removed in pandas 2.0; one concat also avoids the
    # quadratic copying of appending inside the loop.
    df_wtr = pd.concat(frames) if frames else pd.DataFrame()
    pp = pd.to_datetime([
        epi.Week(int(cdc_data.date2ew(d.date())[0]),
                 int(cdc_data.date2ew(d.date())[1])).startdate()
        for d in pd.to_datetime(df_wtr.date)
    ])
    df_wtr.index = pp
    df_wtr.index = df_wtr.index.rename('DATE')
    return df_wtr
def test_week_equality(week_cdc, week_iso, week_barc):
    """Weeks compare equal exactly when year, week and system all match."""
    same_cdc = epiweeks.Week(2015, 1, system="cdc")
    other_cdc = epiweeks.Week(2014, 1, system="cdc")
    assert week_cdc == same_cdc
    assert week_cdc != other_cdc
    same_iso = epiweeks.Week(2015, 1, system="iso")
    other_iso = epiweeks.Week(2014, 1, system="iso")
    assert week_iso == same_iso
    assert week_iso != other_iso
    assert week_barc == epiweeks.Week(2019, 53, system="barc")
def week_barc():
    # Presumably a pytest fixture (decorator not visible in this chunk):
    # a Week in the non-standard "barc" system — 2019, week 53.
    return epiweeks.Week(2019, 53, system="barc")
def week_iso():
    # Presumably a pytest fixture (decorator not visible in this chunk):
    # ISO epiweek 1 of 2015.
    return epiweeks.Week(2015, 1, system="iso")
def test_iso_week_to_startdate(test_input, expected):
    """Week.startdate must yield the expected (year, month, day) for ISO weeks."""
    year, week = test_input
    start = epiweeks.Week(year, week, "ISO").startdate()
    assert (start.year, start.month, start.day) == expected
def main():
    """Entry point for the exogenous-regressor ARLR forecaster.

    Loads ILI ground truth (retro/flux/test layouts), optionally joins
    AccuWeather, Google Health Trends and per-state exogenous series, runs
    the ARLR model per region, and writes CDC- and state-format forecast
    distributions plus two summary CSVs (point forecasts and seasonal
    estimates).  Only change from the original logic: the two
    DataFrame.append calls (removed in pandas 2.0) now use pd.concat.
    """
    config = configparser.ConfigParser()
    config_file = pkg_resources.resource_filename(__name__, 'config.ini')
    config.read(config_file)
    args = parse_args()
    # --- logging: stderr by default, file when --log is given ---
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    log.setLevel(level)
    if args.log is None:
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler(args.log)
    log_formatter = logging.Formatter(
        '%(asctime)s:%(levelname)s:'
        '%(name)s.%(funcName)s:%(message)s',
        datefmt='%Y%m%d-%H%M%S')
    handler.setFormatter(log_formatter)
    log.addHandler(handler)
    log.info('{} v{}'.format(__processor__, __version__))
    #if args.region not in regions:
    #    raise TypeError("region is not valid")
    #if args.region_type == "national":
    #    args.region_type = "US National"
    fct_weeks = args.weeks
    # csv_path = args.ground_truth
    st_id_path = args.st_fips
    epiyear = args.forecast_from
    startyear = epiyear[:4]  #args.forecast_from[:4]
    startweek = epiyear[4:]  #args.forecast_from[6:8]
    #trainweek = startweek
    ews = epi.Week(int(startyear), int(startweek))
    targets = get_targets()
    header_region_type = targets[
        'flux_region_type']  #"REGION TYPE" for retro or old datasets
    header_region = targets['flux_region']  #"REGION" for retro or old datasets
    end_date = args.end_date
    if args.mode == "retro":
        fdf = prepdata_retro(csv_path, ews)
    if args.mode == "flux":
        fdf = prepdata_flux(csv_path, ews)
    if args.mode == "test":
        fdf = prepdata_append(csv_path)
    fdf = fdf.rename(
        columns={
            'REGION TYPE': 'region_type',
            'REGION': 'region',
            '% WEIGHTED ILI': 'weighted_ili',
            '%UNWEIGHTED ILI': 'unweighted_ili',
            'DATE': 'date'
        })
    # Snap end_date to the Wednesday of its week (3 days past the Sunday
    # week start; isoweekday arithmetic for an explicit YYYYMMDD date).
    if end_date is None:
        end_date = fdf['date'].max().date() + timedelta(days=3)
    else:
        dt = datetime.strptime(end_date, '%Y%m%d').date()
        end_date = dt + timedelta(days=(3 - dt.isoweekday() % 7))
    if args.end_date is not None:
        fdf = fdf[fdf['date'] <= pd.Timestamp(end_date)]
    fdf = fdf[~fdf.region.isin(['Puerto Rico', 'Virgin Islands', 'New York City'])]
    fdf.index = fdf['date']
    fdf.index = fdf.index.rename('date')
    # DataFrame preparation part, integrating accuweather, ght time series with ILI
    kwargs_wtr = {
        "National": args.accu_data_nat,
        "HHS": args.accu_data_hhs,
        "States": args.accu_data_state
    }
    accu_data_fl = None
    for _, value in kwargs_wtr.items():
        accu_data_fl = accu_data_fl or value
    kwargs_ght = {
        "National": args.ght_data_nat,
        "HHS": args.ght_data_hhs,
        "States": args.ght_data_state
    }
    ght_data_fl = None
    for _, value in kwargs_ght.items():
        ght_data_fl = ght_data_fl or value
    # Four cases: neither / only weather / only GHT / both exogenous sources.
    if ght_data_fl is None and accu_data_fl is None:
        df_ght = pd.DataFrame()
        df_wtr = pd.DataFrame()
        targ_dict = {
            "target": [targets['flux_ili'], targets['flux_wili']],
            "ght_target": [],
            "aw_target": []
        }
    elif ght_data_fl is None and accu_data_fl is not None:
        df_ght = pd.DataFrame()
        aw_target = [
            'temperature_max', 'temperature_min', 'temperature_mean',
            'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean',
            'cloud_cover_mean', 'water_total', 'pressure_max', 'pressure_min',
            'pressure_mean', 'AH_max', 'AH_min', 'AH_mean', 'SH_max',
            'SH_min', 'SH_mean'
        ]
        targ_dict = {
            "target": [targets['ili'], targets['wili']],
            "ght_target": [],
            "aw_target": [
                'temperature_max', 'temperature_min', 'temperature_mean',
                'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean',
                'cloud_cover_mean', 'water_total', 'pressure_max',
                'pressure_min', 'pressure_mean', 'AH_max', 'AH_min',
                'AH_mean', 'SH_max', 'SH_min', 'SH_mean'
            ]
        }  #, 'wind_speed_mean']}
        #aw_csv_path = args.accu_data#'../data/data-aw-cumulative_20191018_1620-weekly-state.csv'
        df_wtr = prep_aw_data(st_id_path, **kwargs_wtr)
        #df_state = prepdata_state(csv_path, ews)
        #pdb.set_trace()
    elif accu_data_fl is None and ght_data_fl is not None:
        df_wtr = pd.DataFrame()
        targ_dict = {
            "target": [targets['ili'], targets['wili']],
            "ght_target": ['flu', 'cough', 'fever', 'influenza', 'cold'],
            "aw_target": []
        }
        #ght_csv_path = args.ght_data
        df_ght = prep_ght_data(**kwargs_ght)
        #df_ght.index = df_ght.date
        #df_ght.index = df_ght.index.rename('DATE')
        #df_ght = df_ght.rename(columns={'state':'REGION'})
        ght_target = ['flu', 'cough', 'fever', 'influenza', 'cold']
    else:
        aw_target = [
            'temperature_max', 'temperature_min', 'temperature_mean',
            'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean',
            'cloud_cover_mean', 'water_total', 'pressure_max', 'pressure_min',
            'pressure_mean', 'AH_max', 'AH_min', 'AH_mean', 'SH_max',
            'SH_min', 'SH_mean'
        ]
        targ_dict = {
            "target": [targets['flux_ili'], targets['flux_wili']],
            "ght_target": ['flu', 'cough', 'fever', 'influenza', 'cold'],
            "aw_target": [
                'temperature_max', 'temperature_min', 'temperature_mean',
                'RH_max', 'RH_min', 'RH_mean', 'wind_speed_mean',
                'cloud_cover_mean', 'water_total', 'pressure_max',
                'pressure_min', 'pressure_mean', 'AH_max', 'AH_min',
                'AH_mean', 'SH_max', 'SH_min', 'SH_mean'
            ]
        }  #, 'wind_speed_mean']}
        # weather data
        #aw_csv_path = args.accu_data
        df_wtr = prep_aw_data(st_id_path, **kwargs_wtr)
        # GHT data
        #ght_csv_path = args.ght_data
        df_ght = prep_ght_data(**kwargs_ght)
        #df_ght.index = df_ght.date
        #df_ght.index = df_ght.index.rename('DATE')
        #df_ght = df_ght.rename(columns={'state':'REGION'})
        ght_target = ['flu', 'cough', 'fever', 'influenza', 'cold']
    if args.state_exog is None:
        df_state = pd.DataFrame()
    else:
        df_state, state_dict = state_data(csv_path, ews, args.mode)
        targ_dict.update(state_dict)
        df_state, targ_dict = state_shifter(df_state, targ_dict, 0)
    directory_bst = args.out_folder + 'ARLR_bst/'  # + str(args.forecast_from[:4])
    directory_Gaussker = args.out_folder + 'ARLR_Gaussker/'  # + str(args.forecast_from[:4])
    if not os.path.exists(directory_bst):
        os.makedirs(directory_bst)
    if not os.path.exists(directory_Gaussker):
        os.makedirs(directory_Gaussker)
    bin_ed = get_bin()
    allw_lags_f = np.arange(
        1, 55
    )  # should have atleast "ms_fct" lags as we find "ms_fct" filters separately
    if args.sub_date is not None:
        sub_date = args.sub_date
    else:
        sub_date = ((ews + 1).enddate() + timedelta(days=2)).isoformat(
        )  #submission for epiweek N is (epiweek N+1).enddate() + timedelta(days=2)
    df_full_res = pd.DataFrame(columns=[
        'DATE', 'location', '1 week ahead', '2 week ahead', '3 week ahead',
        '4 week ahead', targ_dict['target'][0]
    ])
    df_full_res = df_full_res.set_index('DATE')
    df_full_seas = pd.DataFrame(columns=['season', 'location'])
    idx_fct = [(ews + i).startdate() for i in range(1, fct_weeks + 1)]
    df_full_seas = pd.DataFrame(columns=['season', 'location'])
    df_full_seas['DATE'] = idx_fct
    for region in fdf[header_region].unique():
        df_res = pd.DataFrame(columns=[
            'DATE', 'location', '1 week ahead', '2 week ahead',
            '3 week ahead', '4 week ahead', targ_dict['target'][0]
        ])
        idx_fct = [(ews + i).startdate() for i in range(1, fct_weeks + 1)]
        df_res['DATE'] = idx_fct
        df_res = df_res.set_index('DATE')
        targ_dict['target'] = [targets['flux_ili'], targets['flux_wili']]
        #targ_dict['aw_target'] = aw_target
        # States forecast unweighted ILI; national/HHS forecast weighted ILI.
        if fdf[header_region_type][fdf[header_region] == region].unique() == 'States':
            print(region)
            for v in targ_dict.values():
                if targets['flux_wili'] in v:
                    v.remove(targets['flux_wili'])
        else:
            for v in targ_dict.values():
                if targets['flux_ili'] in v:
                    v.remove(targets['flux_ili'])
        win = int(config['Forecasting']['win'])  # training window
        max_lag = np.max(allw_lags_f)  # maximum lag considered in the model
        # Check if datastream has no missing information for all lagged
        # regressors of length equal to training length window
        nan_chk_mask = (fdf[header_region] == region) & (
            fdf.index <= pd.to_datetime(ews.startdate())) & (
                fdf.index >= pd.to_datetime(
                    (ews - int(win + max_lag)).startdate()))
        if fdf[nan_chk_mask][targ_dict['target']].isna().values.any():
            print('Missing values in ILI data, cannot produce forecasts')
            continue
        diff_val = 'no_diff'
        df_m, df_ex = ARLR_regressor(fdf, df_wtr, df_ght, df_state, region,
                                     targ_dict, ews, diff_val)
        predictions, bn_mat_bst, bn_mat_Gaussker, seas, lags_app_f, coeffs_f = ARLR_exog_module(
            df_m, targ_dict, ews, fct_weeks, allw_lags_f)
        predictions = int_op(predictions, df_ex, targ_dict['target'], ews,
                             diff_val)
        for i in range(1, len(predictions[0, :]) + 1):
            print('Week: {}, Fct: {}'.format(i, (predictions[0, i - 1])))
            df_res.loc[(ews + i).startdate(), 'location'] = region
            df_res.loc[(ews + i).startdate(),
                       '{} week ahead'.format(i)] = predictions[0, i - 1]
            if args.eval is not None:
                df_res.loc[(ews + i).startdate(), targ_dict['target']] = fdf[
                    (fdf.index == pd.to_datetime((ews + i).startdate()))
                    & (fdf[header_region] == region)][
                        targ_dict['target']].values[0]
        idx = [(ews + i).startdate()
               for i in range(1, len(range((ews.week) - 40, 35)))]
        df_seas = pd.DataFrame(columns=['season', 'location'])
        df_seas['DATE'] = idx
        df_seas['location'] = df_seas.apply(lambda x: region, axis=1)
        df_seas.loc[:, 'season'] = seas
        df_seas = df_seas.set_index('DATE')
        #df_res = df_res.merge(df_seas, how='outer', left_index=True, right_index=True)
        # DataFrame.append was removed in pandas 2.0; accumulate via concat.
        df_full_res = pd.concat([df_full_res, df_res])
        df_full_seas = pd.concat([df_full_seas, df_seas])
        if int(args.CDC) and fdf[header_region_type][
                fdf[header_region] == region].unique() != 'States':
            target = targets['flux_wili']
            #outputdistribution_bst(predictions[0,0:4], bn_mat_bst[0,:,0:4], bin_ed, region, target, directory_bst, ews)
            #outputdistribution_Gaussker(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
            outputdistribution_fromtemplate_for_FSN(predictions[0, 0:4],
                                                    bn_mat_Gaussker[0, :, 0:4],
                                                    bin_ed, region, target,
                                                    directory_Gaussker, ews)
            #outputdistribution_fromtemplate_for_FluSight(predictions[0,0:4], bn_mat_Gaussker[0,:,0:4], bin_ed, region, target, directory_Gaussker, ews, sub_date)
        if fdf[header_region_type][fdf[header_region] == region].unique() == 'States':
            target = targets['flux_ili']
            accu_output(predictions.reshape(fct_weeks), region, args.out_state,
                        ews, args.st_fips)
            outputdistribution_state_fromtemplate(predictions[0, 0:4],
                                                  bn_mat_Gaussker[0, :, 0:4],
                                                  bin_ed, region, target,
                                                  directory_Gaussker, ews,
                                                  sub_date)
    df_full_res.to_csv('result_' + str(ews.year) + 'EW' + str(ews.week))
    df_full_seas.to_csv('result_seas_' + str(ews.year) + 'EW' + str(ews.week))
def _week_to_date(self, row):
    """Map a row carrying `iso_year`/`iso_week` attributes to the start date
    of that epiweek (default epiweeks system)."""
    wk = epiweeks.Week(row.iso_year, row.iso_week)
    return wk.startdate()
def week_to_date(year: int, week: int, output_fmt: str = DATE_FORMAT):
    """Return the end date of epiweek (year, week), formatted via clean_date."""
    end = epiweeks.Week(year, week).enddate()
    return clean_date(end, output_fmt=output_fmt)
def prepdata(csv_path):
    """Read an ILINet CSV (banner row skipped, 'X' as NaN), fill missing
    REGION values with 'National', and add a DATE column holding each
    epiweek's start date."""
    frame = pd.read_csv(csv_path, na_values='X', header=1)
    frame['REGION'] = frame['REGION'].fillna('National')
    frame['DATE'] = pd.to_datetime(frame.apply(
        lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).startdate(),
        axis=1, result_type='reduce'))
    return frame
def test_week_subtracting(week_cdc, week_iso):
    """Subtracting one week must roll back across the year boundary
    (CDC 2014 ends on week 53, ISO 2014 on week 52)."""
    assert week_cdc - 1 == epiweeks.Week(2014, 53, system="cdc")
    assert week_iso - 1 == epiweeks.Week(2014, 52, system="iso")
def test_week_addition(week_cdc, week_iso):
    """Adding one week must advance to week 2 of 2015 in both systems."""
    assert week_cdc + 1 == epiweeks.Week(2015, 2, system="cdc")
    assert week_iso + 1 == epiweeks.Week(2015, 2, system="iso")
def ARLR_module(df, region, target, epi_week):
    """Train an ARLR model on log-transformed ILI for `region`/`target`
    using data before `epi_week`, then produce multi-step forecasts.

    Returns (np.exp(yp_fct), bn_mat): point forecasts back on the original
    scale and the binned forecast distributions.

    Fixes vs. the original: Series.append (removed in pandas 2.0) replaced
    by pd.concat, and set_index now uses drop=False — the original inplace
    set_index dropped the DATE column that the very next statements read
    (the author's own trailing comment noted the problem).
    """
    config = configparser.ConfigParser()
    config_file = 'config.ini'
    config.read(config_file)
    ww_train = epi_week - 1
    ww_test = epi_week
    cdcdf = df
    starttraining_date = pd.to_datetime(ww_train.startdate())
    testing_date = pd.to_datetime(ww_test.startdate())
    #endtraining = pd.to_datetime(enddate.startdate())
    #startpredict = pd.to_datetime((enddate+1).startdate())
    #endpredict = pd.to_datetime((enddate+4).startdate())
    # Select the region's rows and attach each epiweek's start date.
    if region == 'US National':
        df = cdcdf[cdcdf['REGION TYPE'] == 'National']
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(), axis=1, result_type='reduce'))
        #df.set_index(['DATE'], inplace=True)
    elif region.isdigit():
        df = cdcdf[cdcdf['REGION'] == "Region " + str(region)]
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(), axis=1, result_type='reduce'))
        # drop=False keeps the DATE column accessible below.
        df.set_index(['DATE'], inplace=True, drop=False)
    else:
        df = cdcdf[cdcdf['REGION'] == region]
        df['DATE'] = pd.to_datetime(
            df.apply(lambda row: epi.Week(int(row["YEAR"]), int(row["WEEK"])).
                     startdate(), axis=1, result_type='reduce'))
        df.set_index(['DATE'], inplace=True, drop=False)
    df_train = df[(df['DATE'] < pd.to_datetime(ww_train.startdate()))]
    df_test = df[(df['DATE'] >= pd.to_datetime(ww_train.startdate()))]
    #targetdf = Series(df[target])
    #target_series = targetdf[:starttraining_date]
    #df_train = target_series[:-1]
    #df_test = target_series[-1:]
    # Model works on log-ILI; forecasts are exponentiated on return.
    train = np.log(np.array(df_train[target], 'float').astype(float))
    test = np.log(np.array(df_test[target], 'float').astype(float))
    train = pd.Series(train)
    train.index = df_train['DATE']
    test = pd.Series(test)
    test.index = df_test['DATE']
    config = configparser.ConfigParser()
    config_file = 'config.ini'
    config.read(config_file)
    # Multi-step forecast
    win = int(
        config['Forecasting']
        ['win'])  # Length of the historial training data to be considered
    fut_wks = int(
        config['Forecasting']
        ['fut_wks'])  # Number of weeks ahead to forecast from training data
    ms_fct = int(
        config['Forecasting']['ms_fct']
    )  # For every forecast week, give additional ms_fct weeks forecast
    test_win = fut_wks + ms_fct  # Number of true value to be fetched (testing accuracy)
    exp_max_lags = int(config['Forecasting']['exp_max_lags']
                       )  # expected maximum lags to be considered in the model
    llr_tol = 1e-2  # log-likelihood tolerance
    # Uncertainty analysis
    uncer_anl = int(config['CDC']['uncer_anl'])
    Nb = int(config['CDC']['Nb'])
    # create bins
    n_bins = int(config['CDC']['n_bins'])
    # CDC bin edges 0.0, 0.1, ..., 13.0 plus a catch-all edge at 100
    # (generated; value-identical to the former 132-element literal).
    bin_ed = [round(0.1 * i, 1) for i in range(131)] + [100]
    # Check data for stationarity in the training data with padding
    train_win = train[-1:(
        -win - exp_max_lags - 1):-1]  # training samples in the window period + buffer
    result = adfuller(train_win)
    #print(result)
    #if result[1] < 0.05:
    #    print('p-val of ADF test %e' %result[1])
    #    print('Stationary signal')
    #    plt.plot(train_win)
    # Check seasonality
    season_ind = get_season(train_win, fft_len=1024, figs=False)
    # train the model
    max_lags = 55
    coeffs = np.zeros([ms_fct, max_lags])
    train_pred_err = np.zeros([ms_fct, win])
    yp_train = np.zeros([ms_fct, win])
    lags_app = np.zeros([ms_fct, max_lags])
    # Train to obtain ARLR coeffs for all specified multi-step forecast:
    # Ex: For 1-step forecast, consider data from t-1 to t-p: ms_fct = 1
    # for 4-step forecast, consider data for t-4 to t-p: ms_fct = 4
    # similarly for 1 season, ms_fct = 52
    for wks in range(1, ms_fct + 1):
        allw_lags = (np.arange(wks, max_lags))
        coeffs_temp, yp_train[wks - 1, :], tr_tp1, llr1, train_pred_err[
            wks - 1, :], lags_temp = ARLR_model(train, allw_lags, win, llr_tol)
        lags_app[wks - 1, lags_temp] = lags_temp
        coeffs[wks - 1, :] = coeffs_temp
    yp_fct = np.zeros([fut_wks, ms_fct])
    yb_fct = np.zeros([fut_wks, ms_fct, Nb])
    log_scr = np.zeros([fut_wks, ms_fct])
    bn_mat = np.zeros([fut_wks, len(bin_ed) - 1, ms_fct])
    # Once trained, use the coeffs to forecast multi-steps given data frame.
    # For uncertainty in forecast estimates (Bootstrapping), uncer_anl = True.
    data_frame = train
    data_test = []  #test
    for new_wks in np.arange(0, fut_wks):
        # Series.append was removed in pandas 2.0; extend with pd.concat.
        data_frame = pd.concat([data_frame, test[new_wks:(new_wks + 1)]])
        data_test = data_test[1:]
        yp_fct[new_wks, :], yb_fct[new_wks, :, :], log_scr[new_wks, :], bn_mat[
            new_wks, :, :], train_pred_err = multi_step_fct(
                data_frame, coeffs, lags_app, train_pred_err, ms_fct, win, Nb,
                bin_ed, uncer_anl)
    return np.exp(yp_fct), bn_mat
def week_cdc():
    # Presumably a pytest fixture (decorator not visible in this chunk):
    # CDC epiweek 1 of 2015.
    return epiweeks.Week(2015, 1, system="cdc")
if save: l_outs.append(st) l_outs = [] say('Beginning generation of power ratings from raw evaluation files.',l_outs) datestamp = datetime.datetime.today().strftime('%Y-%m-%d') outstring = 'Datestamp: '+datestamp say(outstring,l_outs) say('Following these policies:',l_outs) for k, v in d_policy.items(): outstring = ' '+str(k)+': '+str(v) say(outstring,l_outs) # Create some handy dicts to move between YYYY-MM-DD and epiweek d_epiweeks = {i: epiweeks.Week(2020,i) for i in range(first_week,last_week+1)} d_wk_to_enddate = {k: w.enddate().strftime('%Y-%m-%d') for k, w in d_epiweeks.items()} d_enddate_to_wk = {v: k for k, v in d_wk_to_enddate.items()} d_wk_to_startdate = {k: (w.startdate()+datetime.timedelta(days=1)).strftime('%Y-%m-%d') for k, w in d_epiweeks.items()} d_startdate_to_wk = {v: k for k, v in d_wk_to_startdate.items()} # For each starting week, create list of week pairs that are (starting_week,ending_week). Store as dict. d_week_pairs = {i: [(j,i) for j in range(first_week,last_week+1) if i >= j] for i in range(first_week,last_week+1)} # Also do this with the full string YYYY-MM-DD version d_week_pairs_str = {} for k, v in d_week_pairs.items(): d_week_pairs_str[d_wk_to_enddate[k]] = [d_wk_to_startdate[tup[0]]+'_'+d_wk_to_enddate[tup[1]] for tup in v] # Read in all the files in the evaluations directory # We could just glob but curating it slightly might make the manipulations easier
def main():
    """Batch retro runner: for every epiweek from <forecast_from>EW40 through
    <forecast_from + 1>EW20, load the archived ILINet snapshot, forecast
    every region with the ARLR model, and write CDC- and state-format
    output distributions.

    Only change from the original: the duplicated `EWs = []` initialization
    was removed.
    """
    args = parse_args()
    # --- logging: stderr by default, file when --log is given ---
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    log.setLevel(level)
    if args.log is None:
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler(args.log)
    log_formatter = logging.Formatter(
        '%(asctime)s:%(levelname)s:'
        '%(name)s.%(funcName)s:%(message)s',
        datefmt='%Y%m%d-%H%M%S')
    handler.setFormatter(log_formatter)
    log.addHandler(handler)
    log.info('{} v{}'.format(__processor__, __version__))
    regions = get_regions()
    targets = get_targets()
    #if args.region not in regions:
    #    raise TypeError("region is not valid")
    #if args.region_type == "national":
    #    args.region_type = "US National"
    fct_weeks = args.weeks
    # csv_path = args.ground_truth
    if int(args.test):
        directory = 'dump/'
        if not os.path.exists(directory):
            os.makedirs(directory)
    directory_bst = args.out_folder + 'ARLR_bst/' + str(args.forecast_from)
    directory_Gaussker = args.out_folder + 'ARLR_Gaussker/' + str(
        args.forecast_from)
    if not os.path.exists(directory_bst):
        os.makedirs(directory_bst)
    if not os.path.exists(directory_Gaussker):
        os.makedirs(directory_Gaussker)
    bin_ed = get_bin()
    year_f = args.forecast_from
    year_t = str(int(year_f) + 1)
    # Collect the flu-season epiweeks <year_f>EW40 .. <year_t>EW20.
    EWs = []
    for y in range(int(year_f), int(year_t) + 1):
        for week in epi.Year(y).iterweeks():
            w = int(str(week))
            if (w < int(year_f + '40')) | (w > int(year_t + '20')):
                continue
            EWs.append(str(w))
    for wks in EWs:  #epi.Year(int(args.forecast_from)).iterweeks():
        startyear = wks[:4]  #args.forecast_from[:4]
        startweek = wks[4:]  #args.forecast_from[6:8]
        #trainweek = startweek
        fdf = prepdata_retro(csv_path, wks)
        fdf['REGION'] = fdf['REGION'].fillna('National')
        fdf.dropna(subset=['%UNWEIGHTED ILI'], inplace=True)
        # Territories/cities without wILI are excluded from forecasting.
        fdf = fdf.drop(fdf[(fdf['REGION'] == 'Puerto Rico')
                           | (fdf['REGION'] == 'Virgin Islands')
                           | (fdf['REGION'] == 'New York City')].index)
        ews = epi.Week(int(startyear), int(startweek))
        for region in fdf['REGION'].unique():
            #for i in range(0, 1):
            #if region=='National' or 'HHS Regions':
            #    target = targets["wili"]
            #else:
            #    target = targets["ili"]
            target = targets['wili']
            df = fdf[fdf['REGION'] == region]
            predictions, bn_mat_bst, bn_mat_Gaussker = ARLR_module(
                df, region, target, ews, fct_weeks)
            if int(args.CDC):
                outputdistribution_bst(predictions[0, 0:4],
                                       bn_mat_bst[0, :, 0:4], bin_ed, region,
                                       target, directory_bst, ews)
                #outputdistribution_Gaussker(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
                outputdistribution_fromtemplate(predictions[0, 0:4],
                                                bn_mat_Gaussker[:, 0:4],
                                                bin_ed, region, target,
                                                directory_Gaussker, ews)
            if df['REGION TYPE'].unique() == 'States':
                print(region)
                accu_output(predictions.reshape(fct_weeks), region,
                            args.out_state, ews, args.st_fips)
def main():
    # Entry point: run the ARLR forecaster on a FluX-format ILINet snapshot
    # for the single epiweek given by args.forecast_from (format "YYYYWW").
    args = parse_args()
    # Logging: stderr by default, a file when --log is given; --verbose
    # raises the level to DEBUG.
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    log.setLevel(level)
    if args.log is None:
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler(args.log)
    log_formatter = logging.Formatter('%(asctime)s:%(levelname)s:'
                                      '%(name)s.%(funcName)s:%(message)s',
                                      datefmt='%Y%m%d-%H%M%S')
    handler.setFormatter(log_formatter)
    log.addHandler(handler)
    log.info('{} v{}'.format(__processor__,__version__))
    regions = get_regions()
    targets = get_targets()
    #if args.region not in regions:
    #    raise TypeError("region is not valid")
    #if args.region_type == "national":
    #    args.region_type = "US National"
    fct_weeks = args.weeks
    # csv_path = args.ground_truth
    epiyear = args.forecast_from
    startyear = epiyear[:4]  #args.forecast_from[:4]
    startweek = epiyear[4:]  #args.forecast_from[6:8]
    #trainweek = startweek
    ews = epi.Week(int(startyear), int(startweek))
    # Column-name indirection: flux files use different headers than the
    # retro/old datasets.
    header_region_type = targets['flux_region_type']  #"REGION TYPE" for retro or old datasets
    header_region = targets['flux_region']  #"REGION" for retro or old datasets
    fdf = prepdata_flux(csv_path, ews)
    fdf[header_region] = fdf[header_region].fillna('National')
    # Territories/cities without wILI are excluded from forecasting.
    fdf = fdf.drop(fdf[(fdf[header_region] == 'Puerto Rico')|(fdf[header_region] == 'Virgin Islands')|(fdf[header_region] == 'New York City')].index)
    if int(args.test):
        directory = 'dump/'
        if not os.path.exists(directory):
            os.makedirs(directory)
    # Separate output trees for the bootstrap and Gaussian-kernel
    # distributions, one folder per forecast week.
    directory_bst = args.out_folder + 'ARLR_bst/' + str(args.forecast_from)
    directory_Gaussker = args.out_folder + 'ARLR_Gaussker/' + str(args.forecast_from)
    if not os.path.exists(directory_bst):
        os.makedirs(directory_bst)
    if not os.path.exists(directory_Gaussker):
        os.makedirs(directory_Gaussker)
    bin_ed = get_bin()
    for region in fdf[header_region].unique():
        #for i in range(0, 1):
        #if region=='National' or 'HHS Regions':
        #    target = targets["wili"]
        #else:
        #    target = targets["ili"]
        target = targets['flux_wili']  # "wili" for retro or old datasets
        df = fdf[fdf[header_region]==region]
        predictions, bn_mat_bst, bn_mat_Gaussker = ARLR_module(df, region, target, ews, fct_weeks)
        if int(args.CDC):
            #outputdistribution_bst(predictions[0,0:4], bn_mat_bst[0,:,0:4], bin_ed, region, target, directory_bst, ews)
            #outputdistribution_Gaussker(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
            outputdistribution_fromtemplate(predictions[0,0:4], bn_mat_Gaussker[:,0:4], bin_ed, region, target, directory_Gaussker, ews)
        # State-level forecasts additionally go out in the accu format.
        if df[header_region_type].unique() == 'States':
            print(region)
            accu_output(predictions.reshape(fct_weeks), region, args.out_state, ews, args.st_fips)