def cluster_viz(geocode, clusters):
    """
    Build a holoviews layout (incidence heatmap + overlaid incidence curves)
    for the cluster containing `geocode`.

    :param geocode: geocode of a city belonging to the cluster
    :param clusters: cluster definition forwarded to get_cluster_data
    :return: holoviews Layout with the HeatMap stacked above the curve Overlay
    """
    data, group = get_cluster_data(geocode=geocode, clusters=clusters,
                                   data_types=DATA_TYPES, cols=['casos'])
    city_names = dict(get_city_names(group))
    # Long-format frame (week, city, incidence) for the heatmap.
    df_hm = data.reset_index().rename(columns={'index': 'week'})
    df_hm = pd.melt(df_hm, id_vars=['week'], var_name='city',
                    value_name='incidence')
    # Column names look like 'casos_<geocode>'; map the geocode to a name.
    df_hm['city'] = [int(re.sub('casos_', '', i)) for i in df_hm.city]
    df_hm['city'] = [city_names[i] for i in df_hm.city]
    # return df_hm
    curve_opts = dict(line_width=10, line_alpha=0.4, tools=[])
    overlay_opts = dict(width=900, height=200, tools=[])
    hm_opts = dict(width=900, height=500, tools=[], logz=True,
                   invert_yaxis=False, xrotation=90, labelled=[],
                   toolbar=None, xaxis=None)
    heatmap = hv.HeatMap(df_hm)
    heatmap.toolbar_location = None
    graphs = [hv.Curve((data.index, data[i]), 'Time', 'Incidence')
              for i in data.columns]
    # Multiply the curves together to build a single Overlay.
    final = graphs[0]
    for i in graphs[1:]:
        final = final * i
    opts = {'HeatMap': {'plot': hm_opts},
            'Overlay': {'plot': overlay_opts},
            'Curve': {'plot': curve_opts,
                      'style': dict(color='blue', line_alpha=0.2)}}
    return (heatmap + final).opts(opts).cols(1)
def qf_prediction(city, state, horizon, lookback):
    """
    Train one quantile random forest per forecast step for `city`, using its
    cluster series as predictors; saves each model and the metrics, plots the
    2.5/50/97.5 percentile predictions.

    :param city: geocode of the target city
    :param state: state abbreviation (locates the cluster file)
    :param horizon: number of weeks ahead to predict
    :param lookback: number of lagged steps used as predictors
    :return: (model, preds, preds25, preds975, X_train, targets, data_lag,
        X_data.columns) — `model` is the model trained for the last step.
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS,
                                   doenca=DISEASE)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    # casos_columns = ['casos_{}'.format(i) for i in group]
    # data = data_full.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # NOTE(review): the result of dropna() is discarded (no assignment, not
    # inplace), so NaN rows introduced by lagging remain — confirm whether
    # `data_lag = data_lag.dropna()` was intended.
    data_lag.dropna()
    # One target series per forecast step d, shifted back d-1 weeks.
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # Chronological 70/30 split (shuffle=False keeps time order).
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    preds25 = np.empty((len(data_lag), horizon))
    preds975 = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        tgt = targets[d][:len(X_train)]
        tgtt = targets[d][len(X_train):]
        model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
        # Persist the per-step model.
        dump(model, 'saved_models/quantile_forest/{}/{}_city_model_{}W.joblib'.format(state, city, d))
        pred25 = model.predict(X_data[:len(targets[d])], quantile=2.5)
        pred = model.predict(X_data[:len(targets[d])], quantile=50)
        pred975 = model.predict(X_data[:len(targets[d])], quantile=97.5)
        # Pad with NaN so each column matches the full data_lag length.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
            pred25 = list(pred25) + ([np.nan] * dif)
            pred975 = list(pred975) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        preds25[:, (d - 1)] = pred25
        preds975[:, (d - 1)] = pred975
        # Out-of-sample median prediction for the metrics.
        pred_m = model.predict(X_test[(d - 1):], quantile=50)
        metrics[d] = calculate_metrics(pred_m, tgtt)
    metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest', state, city))
    plot_prediction(preds, preds25, preds975, targets[1], city_name,
                    len(X_train))
    return model, preds, preds25, preds975, X_train, targets, data_lag, X_data.columns
def single_prediction(city, state, predictors, predict_n, look_back, hidden,
                      epochs, predict=False):
    """
    Fit an LSTM model to generate predictions for a city,
    Using its cluster as regressors.
    :param city: geocode of the target city
    :param state: State containing the city
    :param predictors: data columns used as regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs of training
    :param predict: when True, train on the whole series (ratio=1) instead
        of a 70/30 split.
    :return: (predicted, indice, X_test, Y_test, Y_train, factor)
    """
    with open("../../analysis/clusters_{}.pkl".format(state), "rb") as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors)
    # Convert the index to plain dates for plotting.
    indice = list(data.index)
    indice = [i.date() for i in indice]
    city_name = get_city_names([city, 0])[0][1]
    if predict:
        ratio = 1
    else:
        ratio = 0.7
    predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
        city, data, predict_n, look_back, hidden, epochs, ratio=ratio,
        load=False)
    plot_predicted_vs_data(
        predicted,
        np.concatenate((Y_train, Y_test), axis=0),
        indice[:],
        label="{}".format(city_name),
        pred_window=predict_n,
        factor=factor,
        split_point=len(Y_train),
    )
    return predicted, indice, X_test, Y_test, Y_train, factor
def state_prediction(state, predictors, predict_n, look_back, hidden, epochs,
                     predict=False):
    """
    Train/evaluate an LSTM for every city of every cluster in `state`,
    plotting predictions against data. Cities that already have a saved
    prediction file are skipped.

    :param state: state abbreviation (locates the cluster file)
    :param predictors: data columns used as regressors
    :param predict_n: how many weeks ahead to predict
    :param look_back: look-back time window length used by the model
    :param hidden: number of hidden layers in each LSTM unit
    :param epochs: number of epochs of training
    :param predict: when True, train on the whole series (ratio=1)
    :return: None
    """
    clusters = pd.read_pickle("../../analysis/clusters_{}.pkl".format(state))
    for cluster in clusters:
        # One shared feature table per cluster.
        data, group = get_cluster_data(
            geocode=cluster[0],
            clusters=clusters,
            data_types=DATA_TYPES,
            cols=predictors,
        )
        for city in cluster:
            # Resume support: skip cities already processed.
            if os.path.exists(
                    "../saved_models/LSTM/{}/predicted_lstm_{}.pkl".format(
                        state, city)):
                continue
            indice = list(data.index)
            indice = [i.date() for i in indice]
            city_name = get_city_names([city, 0])[0][1]
            if predict:
                ratio = 1
            else:
                ratio = 0.7
            predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
                city, data, predict_n, look_back, hidden, epochs, ratio=ratio)
            plot_predicted_vs_data(
                predicted,
                np.concatenate((Y_train, Y_test), axis=0),
                indice[:],
                label=city_name,
                pred_window=predict_n,
                factor=factor,
                split_point=len(Y_train),
            )
            print("{} done".format(city))
    return None
def qf_prediction(city, state, horizon, lookback, doenca='chik'):
    """
    Produce out-of-sample quantile-forest predictions for `city` by loading
    a previously trained model from RESULT_PATH.

    :param city: geocode of the target city
    :param state: state abbreviation (locates cluster file and saved model)
    :param horizon: forecast horizon (weeks) encoded in the model file name
    :param lookback: number of lagged steps used to build the feature matrix
    :param doenca: disease name forwarded to get_cluster_data
    :return: (model, pred, pred25, pred975, X_data, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS,
                                   doenca=doenca)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    data_lag = build_lagged_features(data, lookback)
    # BUG FIX: the original called dropna() without keeping the result
    # (a no-op); assign it so the NaN rows introduced by lagging are removed.
    data_lag = data_lag.dropna()
    data_lag = data_lag['2016-01-01':]
    # One target series per forecast step d, shifted back d-1 weeks.
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    city_name = get_city_names([city, 0])[0][1]
    # Load the trained model. BUG FIX: os.path.join takes path components as
    # separate arguments — the original passed a single list, which raises
    # TypeError.
    model = joblib.load(
        os.path.join(
            RESULT_PATH,
            '{}/{}_city_model_{}W.joblib'.format(state, city, horizon)))
    pred25 = model.predict(X_data, quantile=2.5)
    pred = model.predict(X_data, quantile=50)
    pred975 = model.predict(X_data, quantile=97.5)
    return model, pred, pred25, pred975, X_data, targets, data_lag
def ensemble_tpot(city, state, target, horizon, lookback):
    """
    Run a TPOT automated-ML search to predict `target` `horizon` weeks ahead
    for `city`, exporting the winning pipeline and plotting its predictions.

    :param city: geocode of the target city
    :param state: state abbreviation (locates the cluster file)
    :param target: name of the target column in the lagged feature table
    :param horizon: number of weeks ahead to predict
    :param lookback: number of lagged steps used as predictors
    :return: value returned by plot_prediction
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    casos_columns = ['casos_{}'.format(i) for i in group]
    data = data.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # NOTE(review): dropna() result is discarded (no assignment, not
    # inplace) — confirm whether `data_lag = data_lag.dropna()` was intended.
    data_lag.dropna()
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # Chronological 70/30 split (shuffle=False keeps time order).
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)
    # Target shifted back horizon-1 weeks, split at the same point.
    tgt_full = data_lag[target].shift(-(horizon - 1))[:-(horizon - 1)]
    tgt = tgt_full[:len(X_train)]
    tgtt = tgt_full[len(X_train):]
    model = TPOTRegressor(generations=20, population_size=100, verbosity=2,
                          n_jobs=32)
    # NOTE(review): fit is called with keyword `target=` — confirm this
    # matches the installed TPOT version's parameter name.
    model.fit(X_train, target=tgt)
    model.export('tpot_{}_pipeline.py'.format(city))
    print(model.score(X_test[:len(tgtt)], tgtt))
    pred = plot_prediction(X_data[:len(tgt_full)], tgt_full, model,
                           'Out_of_Sample_{}_{}'.format(horizon, city),
                           horizon)
    plt.show()
    return pred
def qf_state_prediction(state, lookback, horizon, predictors):
    """
    RQF prediction based on cluster of cities
    :param state: state abbreviation (locates the cluster file)
    :param lookback: number of lagged steps used as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor columns
    :return: None (models, metrics and plots are written to disk)
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        # One shared feature table per cluster; each member city is modelled
        # against it.
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Resume support: skip cities whose metrics already exist.
            if os.path.isfile(
                    './saved_models/{}/qf_metrics_{}.pkl'.format(
                        state, city)):
                print('done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            # casos_columns = ['casos_{}'.format(i) for i in group]
            # data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data_full, lookback)
            # NOTE(review): dropna() result is discarded — confirm whether
            # `data_lag = data_lag.dropna()` was intended.
            data_lag.dropna()
            # One target series per forecast step d, shifted back d-1 weeks.
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # Chronological 70/30 split.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            preds25 = np.empty((len(data_lag), horizon))
            preds975 = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                model = rolling_forecasts(X_train, target=tgt,
                                          horizon=horizon)
                pred = model.predict(X_data[:len(targets[d])], quantile=50)
                pred25 = model.predict(X_data[:len(targets[d])], quantile=2.5)
                pred975 = model.predict(X_data[:len(targets[d])],
                                        quantile=97.5)
                # Pad with NaN so each column matches data_lag's length.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                    pred25 = list(pred25) + ([np.nan] * dif)
                    pred975 = list(pred975) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                preds25[:, (d - 1)] = pred25
                preds975[:, (d - 1)] = pred975
                # NOTE(review): this predict() has no quantile argument,
                # unlike every other call — confirm the intended default.
                pred_m = model.predict(X_test[(d - 1):])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest', state, city))
            # NOTE(review): only the model of the last forecast step is
            # dumped (outside the per-step loop) — confirm intent.
            dump(model, 'saved_models/quantile_forest/{}_{}_state_model.joblib'.format(state, city))
            plot_prediction(preds, preds25, preds975, targets[1], city_name,
                            len(X_train))
def build_model(data, lags):
    """Build a pyflux VAR model over `data` with `lags` autoregressive lags."""
    model = pf.VAR(data=data, lags=lags)
    return model


if __name__ == "__main__":
    prediction_window = 5  # weeks
    city = 3304557
    state = 'RJ'
    with open('clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    # data = get_alerta_table(3304557)  # Nova Iguaçu: 3303500
    data = get_cluster_data(city, clusters)
    # Cast every case-count column to float before fitting the VAR.
    for col in list(filter(lambda x: 'casos' in x, data.columns)):
        data[col] = data[col].astype('float')
    print(data.values)
    # data = data[['casos', 'nivel']]
    # print(data.info())
    # data.casos.plot(title="series")
    # +1 offset applied to the whole frame before fitting — presumably to
    # avoid zeros; TODO confirm.
    model = build_model(data+1, lags=12)
    fit = model.fit()  # 'BBVI',iterations=1000, optimizer='RMSProp')
    print(fit.summary())
    model.plot_fit()
    plt.savefig('VAR_in_sample.png')
    model.plot_predict(h=5, past_values=104)
def cluster_prediction(geocode, state, predictors, predict_n, look_back,
                       hidden, epochs):
    """
    Fit an LSTM model to generate predictions for all cities from a cluster,
    using the cluster series as regressors, and save a grid of plots.

    :param geocode: geocode of the reference city of the cluster
    :param state: state containing the city
    :param predictors: data columns used as regressors
    :param predict_n: how many weeks ahead to predict
    :param look_back: look-back time window length used by the model
    :param hidden: number of hidden layers in each LSTM unit
    :param epochs: number of epochs of training
    :return: None
    """
    clusters = pd.read_pickle("../../analysis/clusters_{}.pkl".format(state))
    # NOTE(review): existence is checked on '.csv' but the data is read from
    # '.csv.gz' — confirm which file get_cluster_data(save=True) writes.
    if os.path.exists('{}_cluster.csv'.format(geocode)):
        # BUG FIX: the original never formatted `geocode` into these file
        # names and passed a path string straight to pickle.load (which
        # expects an open file object), so this branch always crashed.
        data = pd.read_csv('{}_cluster.csv.gz'.format(geocode))
        with open('{}_cluster.pkl'.format(geocode), 'rb') as fp:
            cluster = pickle.load(fp)
    else:
        data, cluster = get_cluster_data(geocode=geocode, clusters=clusters,
                                         data_types=DATA_TYPES,
                                         cols=predictors, save=True)
    # NOTE(review): when loaded from CSV the index may not be datetimes,
    # in which case .date() below would fail — verify the CSV's index.
    indice = list(data.index)
    indice = [i.date() for i in indice]
    fig, axs = P.subplots(nrows=2, ncols=2, figsize=(50, 45))
    targets = zip(cluster, axs.flatten())
    for (city, ax) in targets:
        print(city)
        city_name = get_city_names([city, 0])[0][1]
        predicted, X_test, Y_test, Y_train, factor = train_evaluate_model(
            city, data, predict_n, look_back, hidden, epochs)
        ## plot
        Ydata = np.concatenate((Y_train, Y_test), axis=0)
        split_point = len(Y_train)
        df_predicted = pd.DataFrame(predicted).T
        ymax = max(predicted.max() * factor, Ydata.max() * factor)
        # Mark the train/test boundary.
        ax.vlines(indice[split_point], 0, ymax, "g", "dashdot", lw=2)
        ax.text(indice[split_point + 1], 0.6 * ymax,
                "Out of sample Predictions")
        for n in range(df_predicted.shape[1] - predict_n):
            ax.plot(indice[n:n + predict_n],
                    pd.DataFrame(Ydata.T)[n] * factor, "k-")
            ax.plot(indice[n:n + predict_n], df_predicted[n] * factor, "r-")
            ax.vlines(
                indice[n:n + predict_n],
                np.zeros(predict_n),
                df_predicted[n] * factor,
                "b",
                alpha=0.2,
            )
        ax.grid()
        ax.set_title("Predictions for {}".format(city_name), fontsize=13)
        ax.legend(["data", "predicted"])
    P.tight_layout()
    P.savefig("{}/cluster_{}.pdf".format(FIG_PATH, geocode))  # , bbox_inches='tight')
    # P.show()
    return None
**kwargs) return model if __name__ == "__main__": prediction_window = 3 # weeks city = 3304557 state = 'RJ' # data = get_alerta_table(3304557) # Nova Iguaçu: 3303609 with open('clusters_{}.pkl'.format(state), 'rb') as fp: clusters = pickle.load(fp) data, cluster_n = get_cluster_data(city, clusters) label = 'casos_{}'.format(city) features = list(data.columns) features.remove(label) # data[label].plot() # Graph data autocorrelation fig, axes = plt.subplots(1, 2, figsize=(15, 4)) fig = sm.graphics.tsa.plot_acf(data.ix[1:, label], lags=52, ax=axes[0]) fig = sm.graphics.tsa.plot_pacf(data.ix[1:, label], lags=52, ax=axes[1]) # model = build_model(data, 'casos', ['p_rt1']) model = build_model(data, label, features)
def rf_state_prediction(state, lookback, horizon, predictors):
    """
    make predictions for all cities of a state using the cluster series
    :param state: State Symbol
    :param lookback: number of steps of history to use as predictors
    :param horizon: number of steps to predict
    :param predictors: list of predictors to use
    :return: None (metrics and plots are written to disk)
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        # One shared feature table per cluster.
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Resume support: skip cities with saved metrics.
            if os.path.isfile('./saved_models/{}/rf_metrics_{}.pkl'.format(
                    state, city)):
                print('done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            # casos_columns = ['casos_{}'.format(i) for i in group]
            # data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data_full, lookback)
            # NOTE(review): dropna() result is discarded — confirm whether
            # `data_lag = data_lag.dropna()` was intended.
            data_lag.dropna()
            # One target series per forecast step d, shifted back d-1 weeks.
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # Chronological 70/30 split.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                model = rolling_forecasts(X_train, target=tgt,
                                          horizon=horizon)
                pred = model.predict(X_data[:len(targets[d])])
                # Pad with NaN so the column matches data_lag's length.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[(d - 1):])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/rf_metrics_{}.pkl'.format(
                'saved_models/random_forest', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
def rf_prediction(city, state, horizon, lookback):
    """
    make predictions for a given city using the cluster series
    :param city: geocode of the target city
    :param state: State symbol
    :param horizon: number of steps ahead to predict
    :param lookback: number steps of history to use as predictors
    :return: (preds, X_train, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    # casos_columns = ['casos_{}'.format(i) for i in group]
    # data = data_full.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # NOTE(review): dropna() result is discarded — confirm whether
    # `data_lag = data_lag.dropna()` was intended.
    data_lag.dropna()
    # One target series per forecast step d, shifted back d-1 weeks.
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # Chronological 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        tgt = targets[d][:len(X_train)]
        tgtt = targets[d][len(X_train):]
        model = rolling_forecasts(X_train, target=tgt, horizon=horizon)
        pred = model.predict(X_data[:len(targets[d])])
        # Pad with NaN so the column matches data_lag's length.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        pred_m = model.predict(X_test[(d - 1):])
        metrics[d] = calculate_metrics(pred_m, tgtt)
    metrics.to_pickle('{}/{}/rf_metrics_{}.pkl'.format(
        'saved_models/random_forest', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return preds, X_train, targets, data_lag
def lasso_single_prediction(city, state, lookback, horizon, predictors):
    """
    Train one LassoLarsCV model per forecast step for `city`, using its
    cluster series as predictors; saves metrics and plots the predictions.

    :param city: geocode of the target city
    :param state: state abbreviation (locates the cluster file)
    :param lookback: number of lagged steps used as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor columns
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    # casos_columns = ['casos_{}'.format(i) for i in group]
    # data = data_full.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # NOTE(review): dropna() result is discarded — confirm whether
    # `data_lag = data_lag.dropna()` was intended.
    data_lag.dropna()
    # One target series per forecast step d, shifted back d-1 weeks.
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    # Chronological 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        data_lag[target],
                                                        train_size=0.7,
                                                        test_size=0.3,
                                                        shuffle=False)
    # A city with no cases in the training window cannot be fitted.
    if sum(y_train) == 0:
        print('aaaah', city)
        return None
    city_name = get_city_names([city, 0])[0][1]
    preds = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    for d in range(1, horizon + 1):
        model = LassoLarsCV(max_iter=5, n_jobs=-1, normalize=False)
        tgt = targets[d][:len(X_train)]
        tgtt = targets[d][len(X_train):]
        try:
            model.fit(X_train, tgt)
            print(city, 'done')
        except ValueError as err:
            print('-----------------------------------------------------')
            print(city, 'ERRO')
            print('-----------------------------------------------------')
            # NOTE(review): after a failed fit the function still pickles
            # metrics and plots `preds`, whose remaining columns hold
            # uninitialised np.empty values — confirm intent.
            break
        pred = model.predict(X_data[:len(targets[d])])
        # Pad with NaN so the column matches data_lag's length.
        dif = len(data_lag) - len(pred)
        if dif > 0:
            pred = list(pred) + ([np.nan] * dif)
        preds[:, (d - 1)] = pred
        pred_m = model.predict(X_test[:(len(tgtt))])
        metrics[d] = calculate_metrics(pred_m, tgtt)
    metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format(
        'saved_models/lasso', state, city))
    plot_prediction(preds, targets[1], city_name, len(X_train))
    return None
def calculate_mape(state, lookback, horizon):
    """
    Append a 'mean_absolute_scaled_error' row to the saved lasso metrics of
    every city in every cluster of `state`, rewriting each metrics pickle.

    :param state: state abbreviation (locates the cluster file)
    :param lookback: number of lagged steps (must match the saved models)
    :param horizon: number of forecast steps stored in the metrics
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=['alerta'],
                                            cols=['casos_est', 'casos'])
        for city in cluster:
            print(city)
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]
            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # NOTE(review): dropna() result is discarded — confirm whether
            # `data_lag = data_lag.dropna()` was intended.
            data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # Split only to recover the train length used when the metrics
            # were produced; no model is refitted here.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)
            try:
                metrics = pd.read_pickle(
                    '~/Documentos/resultados_infodengue/lasso/{}/lasso_metrics_{}.pkl'
                    .format(state, city))
            except EOFError:
                # NOTE(review): there is no `continue` after this handler —
                # on a truncated file `metrics` still holds the previous
                # city's frame (or is undefined on the first iteration).
                # Confirm intent.
                print('---------------------------------')
                print('ERROR', 'eof', city)
                print('----------------------------------')
            if metrics.shape[1] != 4:
                print('---------------------------------')
                print('ERROR', 'shape', city)
                print('----------------------------------')
                continue
            values = []
            for d in range(1, horizon + 1):
                mae = metrics[d]['mean_absolute_error']
                tgtt = targets[d][len(X_train):]
                # Scaling factor built from consecutive absolute differences
                # of the out-of-sample target (naive-forecast error).
                # NOTE(review): standard MASE divides the sum by (n-1);
                # here the sum is *multiplied* by n/(n-1) — confirm formula.
                factor = (len(tgtt) / (len(tgtt) - 1)) * sum(
                    [abs(i - (tgtt[pos])) for pos, i in enumerate(tgtt[1:])])
                if factor == 0:
                    values.append(np.nan)
                else:
                    values.append(mae / factor)
            metrics.loc['mean_absolute_scaled_error'] = values
            metrics.to_pickle(
                '~/Documentos/resultados_infodengue/lasso/{}/lasso_metrics_{}.pkl'
                .format(state, city))
    return None
def rgf_state_prediction(state, lookback, horizon, predictors):
    """
    Train Regularized Greedy Forest models for every city of every cluster
    in `state`, one model per forecast step; saves metrics and plots.

    :param state: state abbreviation (locates the cluster file)
    :param lookback: number of lagged steps used as predictors
    :param horizon: number of weeks ahead to predict
    :param predictors: list of predictor columns
    :return: None
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        # One shared feature table per cluster.
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # Resume support: skip cities with saved metrics.
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]
            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # NOTE(review): dropna() result is discarded — confirm whether
            # `data_lag = data_lag.dropna()` was intended.
            data_lag.dropna()
            # One target series per forecast step d, shifted back d-1 weeks.
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # Chronological 70/30 split.
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)
                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                try:
                    model.fit(X_train, tgt)
                except ValueError as err:
                    print(
                        '-----------------------------------------------------'
                    )
                    print(city, 'ERRO')
                    print(
                        '-----------------------------------------------------'
                    )
                    # NOTE(review): after a failed fit, metrics are still
                    # pickled and `preds` (uninitialised np.empty values in
                    # the unfilled columns) is still plotted — confirm
                    # intent.
                    break
                pred = model.predict(X_data[:len(targets[d])])
                # Pad with NaN so the column matches data_lag's length.
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
def single_prediction(city, state, predictors, predict_n, look_back, hidden,
                      epochs, predict=True, doenca='chick'):
    """
    Fit an LSTM model to generate predictions for a city,
    Using its cluster as regressors.
    :param city: geocode of the target city
    :param state: State containing the city
    :param predictors: data columns used as regressors
    :param predict_n: How many weeks ahead to predict
    :param look_back: Look-back time window length used by the model
    :param hidden: Number of hidden layers in each LSTM unit
    :param epochs: Number of epochs of training
    :param predict: Only generate predictions (train on the full series)
    :param doenca: disease name forwarded to get_cluster_data
    :return: (predicted, indice, X_test, Y_test, Y_train, factor)
    """
    with open("../../analysis/clusters_{}.pkl".format(state), "rb") as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(geocode=city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=predictors,
                                   doenca=doenca)
    data = data['2016-01-01':]
    # x axis for the plot: dates shifted forward by the prediction window.
    x = data.index.shift(predict_n, freq='W')
    x = [i.date() for i in x]
    indice = list(data.index)
    indice = [i.date() for i in indice]
    city_name = get_city_names([city, 0])[0][1]
    if predict:
        ratio = 1
    else:
        ratio = 0.7
    # BUG FIX: the original tested an undefined name (`if cluster:`), which
    # raised NameError. The evident intent: when cluster data is loaded the
    # target column carries the city geocode suffix; otherwise fall back to
    # the plain 'casos_est' column.
    suffixed = "casos_est_{}".format(city)
    if suffixed in data.columns:
        target_col = list(data.columns).index(suffixed)
    else:
        target_col = list(data.columns).index("casos_est")
    norm_data, max_features = normalize_data(data)
    # Scale factor to map normalized predictions back to case counts.
    factor = max_features[target_col]
    ## split test and train
    X_train, Y_train, X_test, Y_test = split_data(
        norm_data,
        look_back=look_back,
        ratio=ratio,
        predict_n=predict_n,
        Y_column=target_col,
    )
    model = load_model("../saved_models/LSTM/{}/lstm_{}_epochs_{}.h5".format(
        state, city, epochs))
    # 100 stochastic forward passes to build an empirical predictive
    # distribution; percentiles below give the median and the 95% band.
    # NOTE(review): predictions are made on X_train — confirm this is the
    # intended input for the "predict" mode.
    predicted = np.stack(
        [model.predict(X_train, batch_size=1, verbose=1) for i in range(100)],
        axis=2)
    df_predicted = pd.DataFrame(np.percentile(predicted, 50, axis=2))
    df_predicted25 = pd.DataFrame(np.percentile(predicted, 2.5, axis=2))
    df_predicted975 = pd.DataFrame(np.percentile(predicted, 97.5, axis=2))
    plot_prediction(pred=df_predicted,
                    pred25=df_predicted25,
                    pred975=df_predicted975,
                    x=x,
                    ydata=Y_train,
                    factor=factor,
                    horizon=predict_n,
                    title="{}".format(city_name),
                    doenca=doenca)
    return predicted, indice, X_test, Y_test, Y_train, factor
def qf_prediction(city, state, horizon, lookback, doenca='chik'):
    """
    Load a pre-trained quantile-forest model for `city` and produce
    median / 2.5% / 97.5% predictions over the lagged cluster features,
    plotting and returning them.

    :param city: geocode of the target city
    :param state: state abbreviation (locates cluster file and saved model)
    :param horizon: forecast horizon encoded in the saved model file name
    :param lookback: number of lagged steps used to build the features
    :param doenca: disease name forwarded to get_cluster_data
    :return: (model, pred, pred25, pred975, X_data, targets, data_lag)
    """
    with open('../analysis/clusters_{}.pkl'.format(state), 'rb') as fp:
        clusters = pickle.load(fp)
    data, group = get_cluster_data(city, clusters=clusters,
                                   data_types=DATA_TYPES, cols=PREDICTORS,
                                   doenca=doenca)
    target = 'casos_est_{}'.format(city)
    casos_est_columns = ['casos_est_{}'.format(i) for i in group]
    # casos_columns = ['casos_{}'.format(i) for i in group]
    # data = data_full.drop(casos_columns, axis=1)
    data_lag = build_lagged_features(data, lookback)
    # NOTE(review): dropna() result is discarded — confirm whether
    # `data_lag = data_lag.dropna()` was intended.
    data_lag.dropna()
    data_lag = data_lag['2016-01-01':]
    # One target series per forecast step d, shifted back d-1 weeks.
    targets = {}
    for d in range(1, horizon + 1):
        if d == 1:
            targets[d] = data_lag[target].shift(-(d - 1))
        else:
            targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
    X_data = data_lag.drop(casos_est_columns, axis=1)
    city_name = get_city_names([city, 0])[0][1]
    # NOTE(review): preds/preds25/preds975 and metrics are allocated but no
    # longer filled — the per-step loop below is commented out.
    preds = np.empty((len(data_lag), horizon))
    preds25 = np.empty((len(data_lag), horizon))
    preds975 = np.empty((len(data_lag), horizon))
    metrics = pd.DataFrame(index=('mean_absolute_error',
                                  'explained_variance_score',
                                  'mean_squared_error',
                                  'mean_squared_log_error',
                                  'median_absolute_error', 'r2_score'))
    # for d in range(1, horizon + 1):
    #     tgtt = targets[d][len(X_data):]
    #
    # Load dengue model
    model = joblib.load(
        'saved_models/quantile_forest/{}/{}_city_model_{}W.joblib'.format(
            state, city, horizon))
    pred25 = model.predict(X_data, quantile=2.5)
    pred = model.predict(X_data, quantile=50)
    pred975 = model.predict(X_data, quantile=97.5)
    # dif = len(data_lag) - len(pred)
    # if dif > 0:
    #     pred = list(pred) + ([np.nan] * dif)
    #     pred25 = list(pred25) + ([np.nan] * dif)
    #     pred975 = list(pred975) + ([np.nan] * dif)
    # preds[:, (d - 1)] = pred
    # preds25[:, (d - 1)] = pred25
    # preds975[:, (d - 1)] = pred975
    # metrics[d] = calculate_metrics(preds, tgtt)
    # print(metrics)
    # metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest', state, city))
    plot_prediction(pred, pred25, pred975, targets[1], horizon, city_name,
                    save=True, doenca=doenca)
    return model, pred, pred25, pred975, X_data, targets, data_lag