def fit_and_predict(df,
                    outcome: str = 'deaths',
                    method: str = 'exponential',
                    mode: str = 'predict_future',
                    target_day: np.ndarray = np.array([1]),
                    output_key: str = None,
                    demographic_vars=[],
                    verbose: bool = False):
    """Fit a forecasting method and store its predictions as a new column of df.

    Params
    ------
    df
        a df with county level deaths and cases and demographic information
    outcome
        key for the outcome to predict (the values in this column should
        have a list for each row)
    method
        what method to use to do forecasting: 'exponential', 'linear',
        'shared_exponential' or 'advanced_shared_model'.
        'AR' is deprecated and 'ensemble' is not handled here (see Returns
        and Raises).
    mode
        either 'predict_future' or 'eval_mode'.
        predict_future predicts the outcome on FUTURE days, so
        target_day=np.array([1]) means it predicts tomorrow's value.
        eval_mode is for evaluating performance: target_day=np.array([k])
        predicts the current day's value using information from k days ago;
        target_day=np.array([1, 2, ..., k]) predicts today's value,
        yesterday's value, ..., the value k-1 days ago using information
        from k days ago.
    target_day
        np.array([1, 2, ..., n]) predicts these numbers of days ahead
        (can just be np.array([3]) if you only want 3 days ahead)
    output_key
        key to save the output as. When None it defaults to
        f'predicted_{outcome}_{method}_{target_day[-1]}'. The suffix
        '_demographics' is appended whenever demographic_vars is non-empty.
    demographic_vars
        demographic columns, forwarded to the 'shared_exponential' fit
    verbose
        forwarded to the 'shared_exponential' fit

    Returns
    -------
    df
        the same dataframe object with the column output_key added.
        For method == 'ensemble' a hint is printed and None is returned.

    Raises
    ------
    AssertionError
        if mode is neither 'predict_future' nor 'eval_mode'
    NotImplementedError
        if method == 'AR'
    ValueError
        if method is not recognised
    """
    assert mode in ('predict_future', 'eval_mode'), 'unknown mode'

    if output_key is None:
        output_key = f'predicted_{outcome}_{method}_{target_day[-1]}'
    if len(demographic_vars) > 0:
        output_key += '_demographics'

    if method == 'AR':
        print('currently deprecated')
        raise NotImplementedError("method 'AR' is currently deprecated")

    if method == 'exponential':
        df[output_key] = exponential_modeling.exponential_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'linear':
        df[output_key] = exponential_modeling.linear_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'shared_exponential':
        # Fit a poisson GLM with shared parameters across counties.
        # Input to the poisson GLM is demographic_vars and
        # log(previous_days_deaths+1)
        df[output_key] = exponential_modeling.fit_and_predict_shared_exponential(
            df,
            mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day,
            verbose=verbose)
        return df

    if method == 'ensemble':
        # Kept as a soft failure: callers get a hint and a None result.
        print('please use fit_and_predict_ensemble instead')
        return None

    if method == 'advanced_shared_model':
        if 'neighbor_deaths' not in df.columns:
            _add_neighbor_features(df)

        def log_plus_one(x):
            return np.log(x + 1)

        # The default factory must take no arguments: defaultdict calls it
        # as factory() when a missing key is looked up.
        feat_transforms = defaultdict(lambda: [lambda x: x])
        for feat in ('deaths', 'cases', 'neighbor_deaths', 'neighbor_cases'):
            feat_transforms[feat] = [log_plus_one]

        default_values = defaultdict(lambda: 0)
        aux_feats = ['cases', 'neighbor_deaths', 'neighbor_cases']
        shared_model = SharedModel(df=df,
                                   outcome=outcome,
                                   demographic_variables=[],
                                   mode=mode,
                                   target_days=target_day,
                                   feat_transforms=feat_transforms,
                                   auxiliary_time_features=aux_feats,
                                   time_series_default_values=default_values,
                                   scale=True)
        shared_model.create_dataset()
        shared_model.fit_model()
        shared_model.predict()
        df[output_key] = shared_model.predictions
        return df

    print('Unknown method')
    raise ValueError(f'Unknown method: {method}')


def _add_neighbor_features(df):
    """Add 'neighbor_deaths' and 'neighbor_cases' columns to df in place.

    For every row, the 'deaths' and 'cases' series of all rows of df whose
    countyFIPS is listed as a neighbor of that row's county in the county
    adjacency csv are summed element-wise. df['countyFIPS'] is rewritten as
    a zero-padded 5-character string so it can be matched against the file.
    """
    adjacency = pd.read_csv(
        oj(parentdir,
           'data/county_level/raw/county_ids/county_adjacency2010.csv'))
    adjacency['fipscounty'] = adjacency['fipscounty'].astype(str).str.zfill(5)
    adjacency['fipsneighbor'] = adjacency['fipsneighbor'].astype(str).str.zfill(5)
    df['countyFIPS'] = df['countyFIPS'].astype(str).str.zfill(5)

    neighbor_deaths = []
    neighbor_cases = []
    for fips in list(df['countyFIPS']):
        neighbor_ids = list(
            adjacency.loc[adjacency['fipscounty'] == fips]['fipsneighbor'])
        # One mask serves both series, so they always have the same rows.
        neighbor_rows = df.loc[df['countyFIPS'].isin(neighbor_ids)]
        deaths_series = list(neighbor_rows['deaths'])
        cases_series = list(neighbor_rows['cases'])

        if len(deaths_series) == 0:
            # No neighboring rows found in df (e.g. the county is not in the
            # adjacency file): assume neighboring deaths/cases are 0, with
            # the same length as the county's own series.
            own_rows = df.loc[df['countyFIPS'] == fips]
            total_deaths = np.zeros(len(own_rows['deaths'].iloc[0]))
            total_cases = np.zeros(len(own_rows['cases'].iloc[0]))
        else:
            total_deaths = np.zeros(len(deaths_series[0]))
            for series in deaths_series:
                total_deaths += series
            # Size the cases accumulator from the cases series itself.
            total_cases = np.zeros(len(cases_series[0]))
            for series in cases_series:
                total_cases += series

        neighbor_deaths.append(total_deaths)
        neighbor_cases.append(total_cases)

    df['neighbor_deaths'] = neighbor_deaths
    df['neighbor_cases'] = neighbor_cases
def fit_and_predict(df,
                    outcome: str = 'deaths',
                    method: str = 'exponential',
                    mode: str = 'predict_future',
                    target_day: np.ndarray = np.array([1]),
                    output_key: str = None,
                    demographic_vars=[],
                    verbose: bool = False):
    """Fit a forecasting method and store its predictions as a new column of df.

    Params
    ------
    df
        a df with county level deaths and cases and demographic information
    outcome
        key for the outcome to predict (the values in this column should
        have a list for each row)
    method
        what method to use to do forecasting: 'exponential', 'linear' or
        'shared_exponential'. 'AR' is deprecated and 'ensemble' is not
        handled here (see Returns and Raises).
    mode
        either 'predict_future' or 'eval_mode'.
        predict_future predicts the outcome on FUTURE days, so
        target_day=np.array([1]) means it predicts tomorrow's value.
        eval_mode is for evaluating performance: target_day=np.array([k])
        predicts the current day's value using information from k days ago;
        target_day=np.array([1, 2, ..., k]) predicts today's value,
        yesterday's value, ..., the value k-1 days ago using information
        from k days ago.
    target_day
        np.array([1, 2, ..., n]) predicts these numbers of days ahead
        (can just be np.array([3]) if you only want 3 days ahead)
    output_key
        key to save the output as. When None it defaults to
        f'predicted_{outcome}_{method}_{target_day[-1]}'. The suffix
        '_demographics' is appended whenever demographic_vars is non-empty.
    demographic_vars
        demographic columns, forwarded to the 'shared_exponential' fit
    verbose
        optional flag; when truthy it is forwarded as verbose= to the
        'shared_exponential' fit, otherwise it is not passed at all

    Returns
    -------
    df
        the same dataframe object with the column output_key added.
        For method == 'ensemble' a hint is printed and None is returned.

    Raises
    ------
    AssertionError
        if mode is neither 'predict_future' nor 'eval_mode'
    NotImplementedError
        if method == 'AR'
    ValueError
        if method is not recognised
    """
    assert mode in ('predict_future', 'eval_mode'), 'unknown mode'

    if output_key is None:
        output_key = f'predicted_{outcome}_{method}_{target_day[-1]}'
    if len(demographic_vars) > 0:
        output_key += '_demographics'

    if method == 'AR':
        print('currently deprecated')
        raise NotImplementedError("method 'AR' is currently deprecated")

    if method == 'exponential':
        df[output_key] = exponential_modeling.exponential_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'linear':
        df[output_key] = exponential_modeling.linear_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'shared_exponential':
        # Fit a poisson GLM with shared parameters across counties.
        # Input to the poisson GLM is demographic_vars and
        # log(previous_days_deaths+1)
        # Forward `verbose` only when it was requested, so the default call
        # stays exactly the one made before the parameter existed.
        extra_kwargs = {'verbose': verbose} if verbose else {}
        df[output_key] = exponential_modeling.fit_and_predict_shared_exponential(
            df,
            mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day,
            **extra_kwargs)
        return df

    if method == 'ensemble':
        # Kept as a soft failure: callers get a hint and a None result.
        print('please use fit_and_predict_ensemble instead')
        return None

    print('Unknown method')
    raise ValueError(f'Unknown method: {method}')