def add_all_preds(df_county):
    """Add back-dated single-model predictions to ``df_county`` as new columns.

    Relies on module-level settings: ``methods``, ``ndays``, ``today``,
    ``horizon``, ``outcome`` and ``very_important_vars``.

    For every method and every offset ``t`` in ``1..ndays`` the frame is
    passed through ``exponential_modeling.leave_t_day_out(df_county, t)``
    (presumably dropping the ``t`` most recent days -- confirm against that
    helper) and a forecast for days ``1..horizon`` ahead is requested in
    ``'predict_future'`` mode.  The forecast column is then copied into
    ``df_county`` under
    ``all_{outcome}_pred_{month}_{day}_{method}_{horizon}``, where month/day
    come from ``today - t`` days.

    Method handling:
      * ``'demographic'``: fitted as ``'shared_exponential'`` with
        ``demographic_vars=very_important_vars``; dates before 2020-03-16
        are skipped.
      * ``'ensemble'``: no model is fitted here; the prediction column is
        read directly from the output of ``leave_t_day_out``.
        NOTE(review): this only works if that column already exists.
      * anything else: fitted with ``method`` as given.

    Returns:
        ``df_county`` itself, modified in place.
    """
    for method in methods:
        # Column that fit_and_predict is asked to write its forecast to.
        pred_key = f'predicted_{outcome}_{method}_{horizon}'

        for days_back in tqdm(range(1, ndays + 1)):
            pred_date = today - timedelta(days_back)
            if pred_date < date(2020, 3, 16) and method == 'demographic':
                continue

            truncated = exponential_modeling.leave_t_day_out(
                df_county, days_back)

            if method == 'demographic':
                truncated = fit_and_predict.fit_and_predict(
                    truncated,
                    target_day=np.arange(1, horizon + 1),
                    outcome=outcome,
                    method='shared_exponential',
                    mode='predict_future',
                    demographic_vars=very_important_vars,
                    output_key=pred_key)
            elif method != 'ensemble':
                truncated = fit_and_predict.fit_and_predict(
                    truncated,
                    target_day=np.arange(1, horizon + 1),
                    outcome=outcome,
                    method=method,
                    mode='predict_future',
                    output_key=pred_key)

            stored_as = (f'all_{outcome}_pred_{pred_date.month}_'
                         f'{pred_date.day}_{method}_{horizon}')
            df_county[stored_as] = truncated[pred_key]

    return df_county
def compute_pmdl_weight(df, methods, outcome):
    """Compute a weight per method from its recent one-day-ahead forecasts.

    For each method, seven one-day-ahead forecasts are produced: for
    ``t = 1..7`` the frame is passed through
    ``exponential_modeling.leave_t_day_out(df, t)`` and forecast with
    ``fit_and_predict.get_forecasts(..., target_day=np.array([1]))``.
    The forecast made with offset ``t`` is stored in column ``7 - t`` so it
    lines up with the last seven observed values of ``df[outcome]``.

    Observed and predicted values are compared on a ``log(x + 1)`` scale by
    ``pmdl_weight``.

    Args:
        df: frame whose ``outcome`` column holds one sequence per row.
        methods: iterable of method identifiers (used as result keys).
        outcome: name of the column holding the observed series.

    Returns:
        dict mapping each method to whatever ``pmdl_weight`` returns for it.
    """
    n_rows = len(df)
    # Last 7 observed values of every row's series.
    y = np.array([series[-7:] for series in df[outcome].values])

    weights = {}
    for method in methods:
        y_preds = np.zeros(y.shape)
        for days_back in range(1, 8):
            held_out = exponential_modeling.leave_t_day_out(df, days_back)
            forecast_df = fit_and_predict.get_forecasts(
                held_out,
                outcome=outcome,
                method=method,
                output_key='y_preds',
                target_day=np.array([1]))
            forecasts = forecast_df['y_preds'].values
            y_preds[:, 7 - days_back] = np.array(
                [forecasts[row][0] for row in range(n_rows)])
        weights[method] = pmdl_weight(np.log(y + 1), np.log(y_preds + 1))
    return weights
def compute_pmdl_weight(df, methods, outcome, target_day):
    """Compute a weight per model spec from its recent 3-day-ahead forecasts.

    For each model spec, seven forecasts are produced: for ``t = 1..7`` the
    frame is passed through
    ``exponential_modeling.leave_t_day_out(df, t + 2)`` and a 3-day-ahead
    forecast is requested from ``fit_and_predict.fit_and_predict`` in
    ``'predict_future'`` mode.  The forecast for offset ``t`` is stored in
    column ``7 - t`` so it lines up with the last seven observed values of
    ``df[outcome]``.

    Predictions are clamped at zero before the ``log(x + 1)`` transform, and
    the transformed observed/predicted arrays are handed to ``pmdl_weight``.

    Args:
        df: frame whose ``outcome`` column holds one sequence per row.
        methods: sequence of model specs; each needs a ``'model_type'``
            entry and may carry ``'demographic_vars'``.
        outcome: name of the column holding the observed series.
        target_day: accepted for interface compatibility; not used here --
            the evaluation horizon is fixed at 3 days.

    Returns:
        dict mapping the position of each spec in ``methods`` to whatever
        ``pmdl_weight`` returns for it.
    """
    n_rows = len(df)
    # Last 7 observed values of every row's series.
    y = np.array([series[-7:] for series in df[outcome].values])

    weights = {}
    for model_idx, model in enumerate(methods):
        demographic_vars = (model['demographic_vars']
                            if 'demographic_vars' in model else [])

        y_preds = np.zeros(y.shape)
        for days_back in range(1, 8):
            # Drop two extra days so the 3-day-ahead forecast targets the
            # observation `days_back` positions from the end.
            held_out = exponential_modeling.leave_t_day_out(
                df, days_back + 3 - 1)
            forecast_df = fit_and_predict.fit_and_predict(
                held_out,
                outcome=outcome,
                method=model['model_type'],
                mode='predict_future',
                target_day=np.array([3]),
                output_key='y_preds',
                demographic_vars=demographic_vars)
            forecasts = forecast_df['y_preds'].values
            y_preds[:, 7 - days_back] = np.array(
                [forecasts[row][-1] for row in range(n_rows)])

        clipped_preds = np.maximum(y_preds, 0)
        weights[model_idx] = pmdl_weight(np.log(y + 1),
                                         np.log(clipped_preds + 1))
    return weights
def previous_prediction_errors(df,
                               target_day: np.ndarray = np.array([1]),
                               outcome: str = 'deaths',
                               methods: list = [advanced_model, linear],
                               look_back_day: int = 5,
                               output_key: str = None):
    """Record how far off earlier ensemble forecasts were.

    For every horizon ``h`` in ``target_day`` and every
    ``b`` in ``range(look_back_day)``, the ensemble is re-run on
    ``exponential_modeling.leave_t_day_out(df, h + b)`` to forecast ``h``
    days ahead, and the forecast is compared with the observed value at
    index ``-b - 1`` of the row's ``outcome`` series.

    The stored error is the signed relative error
    ``actual / max(prediction, 1) - 1`` (the ``max`` guards against division
    by small or zero predictions).

    Args:
        df: frame whose ``outcome`` column holds one sequence per row.
        target_day: horizons (days ahead) to evaluate.
        outcome: name of the column holding the observed series.
        methods: model specs forwarded to ``fit_and_predict_ensemble``.
        look_back_day: number of past forecasts evaluated per horizon.
        output_key: column of ``df`` that receives the result.

    Returns:
        ``df`` with ``df[output_key]`` set to a list of ``len(df)``
        dictionaries; each maps a horizon to the list of its errors.
    """
    # Group horizons by how many days must be removed, so every truncated
    # frame is fitted only once.
    previous_start_days = defaultdict(list)
    for day in target_day:
        for back_day in range(look_back_day):
            previous_start_days[day + back_day].append(day)

    n_rows = len(df)
    previous_model_errors = [defaultdict(list) for _ in range(n_rows)]

    for days_dropped, horizons in previous_start_days.items():
        df_old = exponential_modeling.leave_t_day_out(df, days_dropped)
        old_preds = fit_and_predict_ensemble(
            df_old,
            target_day=np.array(horizons),
            outcome=outcome,
            methods=methods,
            mode='predict_future',
            output_key='old_predictions',
        )['old_predictions'].values

        for row in range(n_rows):
            for col, ahead in enumerate(horizons):
                pred = old_preds[row][col]
                actual = df[outcome].iloc[row][ahead - days_dropped - 1]
                previous_model_errors[row][ahead].append(
                    actual / max(pred, 1) - 1)

    df[output_key] = previous_model_errors
    return df
def fit_and_predict_ensemble(df,
                             target_day: np.ndarray = np.array([1]),
                             outcome: str = 'deaths',
                             methods: list = [shared_exponential, linear],
                             mode: str = 'predict_future',
                             output_key: str = None,
                             verbose: bool = False):
    """Combine several models into one weighted ensemble forecast.

    Each model spec in ``methods`` is fitted with ``fit_and_predict`` and
    its forecasts are read from column ``y_preds_{index}``.  Weights come
    from ``pmdl_weight.compute_pmdl_weight`` and are normalised per row so
    they sum to one before the forecasts are averaged.

    Args:
        df: input frame; also receives the ensemble column.
        target_day: horizons (days ahead) to forecast.
        outcome: name of the column holding the observed series.
        methods: list of model specs; each needs ``'model_type'`` and may
            carry ``'demographic_vars'``.
        mode: forwarded to ``fit_and_predict``.  Unless it is
            ``'predict_future'``, weights are computed on
            ``leave_t_day_out(df, target_day[-1])`` instead of ``df``.
        output_key: destination column; defaults to
            ``predicted_{outcome}_ensemble_{target_day[-1]}``.
        verbose: forwarded to ``fit_and_predict``; also prints each model's
            average normalised weight, largest first.

    Returns:
        ``df`` with the ensemble forecast stored under ``output_key``.
    """
    if output_key is None:
        output_key = f'predicted_{outcome}_ensemble_{target_day[-1]}'

    # Per-model forecasts, keyed by the model's position in `methods`.
    predictions = {}
    for idx, model in enumerate(methods):
        demographic_vars = (model['demographic_vars']
                            if 'demographic_vars' in model else [])
        pred_col = f'y_preds_{idx}'
        fitted = fit_and_predict(df,
                                 outcome=outcome,
                                 method=model['model_type'],
                                 mode=mode,
                                 target_day=target_day,
                                 output_key=pred_col,
                                 demographic_vars=demographic_vars,
                                 verbose=verbose)
        predictions[idx] = fitted[pred_col].values

    use_df = (df if mode == 'predict_future' else
              exponential_modeling.leave_t_day_out(df, target_day[-1]))

    weights = pmdl_weight.compute_pmdl_weight(use_df,
                                              methods=methods,
                                              outcome=outcome,
                                              target_day=target_day)

    # Per-row normaliser: total weight across all models.
    sum_weights = np.zeros(len(use_df))
    for model_index in weights:
        sum_weights = sum_weights + np.array(weights[model_index])

    weighted_preds = [np.zeros(len(target_day)) for _ in range(len(use_df))]
    for row in range(len(df)):
        for model_index in weights:
            share = weights[model_index][row] / sum_weights[row]
            # Same evaluation order as (prediction * weight) / total.
            weighted_preds[row] += (np.array(predictions[model_index][row])
                                    * weights[model_index][row]
                                    / sum_weights[row]) if False else (
                np.array(predictions[model_index][row])
                * weights[model_index][row] / sum_weights[row])
            del share

    # Report the relative contribution of each model.
    if verbose:
        print('--- Model Contributions ---')
        model_weight_counter = Counter()
        n_used = len(use_df)
        for model_index in weights:
            total_share = sum(weights[model_index][row] / sum_weights[row]
                              for row in range(n_used))
            model_weight_counter[model_index] = total_share / n_used
        for model_index, weight in model_weight_counter.most_common():
            label = str(methods[model_index])
            print(label + ': ' + str(weight))

    df[output_key] = weighted_preds
    return df
def fit_and_predict_ensemble(
        df,
        target_day: np.ndarray = np.array([1]),
        outcome: str = 'deaths',
        methods: list = [exponential, shared_exponential, demographics],
        mode: str = 'predict_future',
        output_key: str = None):
    """Combine several models into one weighted ensemble forecast.

    Each model spec in ``methods`` is fitted with ``fit_and_predict`` and
    its forecasts are read from column ``y_preds_{index}``.  Weights come
    from ``pmdl_weight.compute_pmdl_weight`` and are normalised per row so
    they sum to one before the forecasts are averaged.

    Args:
        df: input frame; also receives the ensemble column.
        target_day: horizons (days ahead) to forecast.
        outcome: name of the column holding the observed series.
        methods: list of model specs; each needs ``'model_type'`` and may
            carry ``'demographic_vars'``.
        mode: forwarded to ``fit_and_predict``.  Unless it is
            ``'predict_future'``, weights are computed on
            ``leave_t_day_out(df, target_day[-1])`` instead of ``df``.
        output_key: destination column; defaults to
            ``predicted_{outcome}_ensemble_{target_day[-1]}``.

    Returns:
        ``df`` with the ensemble forecast stored under ``output_key``.
    """
    if output_key is None:
        output_key = f'predicted_{outcome}_ensemble_{target_day[-1]}'

    # Per-model forecasts, keyed by the model's position in `methods`.
    predictions = {}
    for idx, model in enumerate(methods):
        demographic_vars = (model['demographic_vars']
                            if 'demographic_vars' in model else [])
        pred_col = f'y_preds_{idx}'
        fitted = fit_and_predict(df,
                                 outcome=outcome,
                                 method=model['model_type'],
                                 mode=mode,
                                 target_day=target_day,
                                 output_key=pred_col,
                                 demographic_vars=demographic_vars)
        predictions[idx] = fitted[pred_col].values

    if mode != 'predict_future':
        use_df = exponential_modeling.leave_t_day_out(df, target_day[-1])
    else:
        use_df = df

    weights = pmdl_weight.compute_pmdl_weight(use_df,
                                              methods=methods,
                                              outcome=outcome)

    # Per-row normaliser: total weight across all models.
    sum_weights = np.zeros(len(use_df))
    for model_index in weights:
        sum_weights = sum_weights + np.array(weights[model_index])

    n_horizons = len(target_day)
    weighted_preds = [np.zeros(n_horizons) for _ in range(len(use_df))]
    for row in range(len(df)):
        for model_index in weights:
            model_pred = np.array(predictions[model_index][row])
            weighted_preds[row] += (model_pred * weights[model_index][row]
                                    / sum_weights[row])

    df[output_key] = weighted_preds
    return df
def fit_and_predict(train_df,
                    test_df,
                    outcome,
                    method,
                    mode,
                    target_day=np.array([1]),
                    demographic_vars=[]):
    """Fit ``method`` and write its forecasts into a new column of ``test_df``.

    Args:
        train_df, test_df: frames with county-level deaths and cases.
        outcome: name of the column holding the series to forecast.
        method: one of ``'exponential'``, ``'shared_exponential'`` or
            ``'ensemble'``.  ``'AR'`` is deprecated and always raises.
        mode: either ``'predict_future'`` or ``'eval_mode'``.
            ``predict_future`` forecasts FUTURE days, so
            ``target_day=np.array([1])`` means tomorrow.
            ``eval_mode`` is for evaluating performance:
            ``target_day=np.array([k])`` predicts the current day's count
            using information from k days ago, and
            ``target_day=np.array([1, 2, ..., k])`` predicts today,
            yesterday, ... (k-1 days ago) using information from k days ago.
        target_day: days ahead to predict, e.g. ``np.array([1, 2, ..., n])``
            or just ``np.array([3])``.
        demographic_vars: extra covariates for the shared model.  The
            default list is not modified by this function.

    Returns:
        ``test_df`` with the forecasts stored under
        ``predicted_{outcome}_{method}_{target_day[-1]}``.  For
        ``'shared_exponential'`` with a non-empty ``demographic_vars`` the
        column name gets the suffix ``_demographics``.

    Raises:
        AssertionError: if ``mode`` is not one of the two known modes.
        NotImplementedError: if ``method == 'AR'``.
        ValueError: if ``method`` is not recognised.
    """
    assert mode == 'predict_future' or mode == 'eval_mode', 'unknown mode'

    if method == 'AR':
        # The autoregressive baseline is deprecated; the statements that
        # used to follow this raise could never execute and were removed.
        print('currently deprecated')
        raise NotImplementedError('currently deprecated')

    if method == 'exponential':
        preds = exponential_modeling.exponential_fit(test_df[outcome].values,
                                                     mode=mode,
                                                     target_day=target_day)
        test_df[f'predicted_{outcome}_{method}_{target_day[-1]}'] = preds
        return test_df

    if method == 'shared_exponential':
        # Fit a poisson GLM with shared parameters across counties. Input to
        # the poisson GLM is demographic_vars and log(previous_days_deaths+1)
        cur_day_predictions = (
            exponential_modeling.fit_and_predict_shared_exponential(
                train_df,
                test_df,
                mode,
                outcome=outcome,
                demographic_vars=demographic_vars,
                target_day=target_day))
        save_name = f'predicted_{outcome}_{method}_{target_day[-1]}'
        if len(demographic_vars) > 0:
            save_name += '_demographics'
        test_df[save_name] = cur_day_predictions
        return test_df

    if method == 'ensemble':
        shared_preds = exponential_modeling.fit_and_predict_shared_exponential(
            train_df,
            test_df,
            mode=mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day)
        exp_preds = exponential_modeling.exponential_fit(
            test_df[outcome].values, mode=mode, target_day=target_day)

        # In eval mode the weights must not see the days being predicted.
        if mode == 'predict_future':
            use_df = test_df
        else:
            use_df = exponential_modeling.leave_t_day_out(
                test_df, target_day[-1])

        weights = pmdl_weight.compute_pmdl_weight(
            use_df,
            methods=['exponential', 'shared_exponential'],
            outcome=outcome)
        # Coerce to arrays so `+` is element-wise even if the weights come
        # back as plain lists (where `+` would concatenate).
        weights_sum = (np.asarray(weights['exponential'])
                       + np.asarray(weights['shared_exponential']))

        preds = [
            exp_preds[i] * weights['exponential'][i] / weights_sum[i] +
            np.array(shared_preds[i]) * weights['shared_exponential'][i] /
            weights_sum[i] for i in range(len(test_df))
        ]
        test_df[f'predicted_{outcome}_{method}_{target_day[-1]}'] = preds
        return test_df

    print('Unknown method')
    raise ValueError(f'Unknown method: {method!r}')