import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split

# NOTE: the project-level helpers used below (combined_data, get_cities_from_state,
# get_city_names, build_lagged_features, rolling_forecasts, calculate_metrics,
# plot_prediction) and the constants DATA_TYPES and TMP_PATH are assumed to be
# provided by the surrounding InfoDenguePredict modules; their import lines are
# not shown in this excerpt.


def qf_single_state_prediction(state, lookback, horizon, predictors):
    """
    RQF without cluster series.

    :param state: 2-letter code for the state
    :param lookback: number of steps of history to use
    :param horizon: number of weeks ahead to predict
    :param predictors: predictor variables
    :return:
    """
    if state == "CE":
        s = 'Ceará'
    else:
        s = state
    cities = list(get_cities_from_state(s))

    for city in cities:
        # Skip cities that already have saved metrics (path made relative so it
        # matches the to_pickle() call below).
        if os.path.isfile('saved_models/quantile_forest_no_cluster/{}/qf_metrics_{}.pkl'.format(state, city)):
            print(city, 'done')
            continue

        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)
        target = 'casos_est'

        data_lag = build_lagged_features(data, lookback)
        data_lag = data_lag.dropna()  # dropna() is not in place; reassign

        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X_data, data_lag[target],
                                                            train_size=0.7, test_size=0.3,
                                                            shuffle=False)
        city_name = get_city_names([city, 0])[0][1]

        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error', 'explained_variance_score',
                                      'mean_squared_error', 'mean_squared_log_error',
                                      'median_absolute_error', 'r2_score'))
        for d in range(1, horizon + 1):
            tgt = targets[d][:len(X_train)]
            tgtt = targets[d][len(X_train):]
            model = rolling_forecasts(X_train, target=tgt, horizon=horizon)

            pred = model.predict(X_data[:len(targets[d])], quantile=50)
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred

            # Align the test-set predictions with the truncated test targets.
            pred_m = model.predict(X_test[:len(tgtt)])
            metrics[d] = calculate_metrics(pred_m, tgtt)

        metrics.to_pickle('{}/{}/qf_metrics_{}.pkl'.format('saved_models/quantile_forest_no_cluster',
                                                           state, city))
        plot_prediction(preds, targets[1], city_name, len(X_train))
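# Illustrative call (commented out). The predictor list and the lookback/horizon
# values are examples only -- any columns returned by combined_data() for the
# state can be used, as long as 'casos' and 'casos_est' are among them:
# qf_single_state_prediction(state='CE', lookback=4, horizon=5,
#                            predictors=['casos', 'casos_est', 'p_rt1',
#                                        'p_inc100k', 'numero'])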
def alocate_data(state):
    """Cache combined data for every city of a state, skipping cities that fail."""
    cities_list = list(get_cities_from_state(state))
    bad_cities = []
    for city in cities_list:
        try:
            full_city = combined_data(city, data_types=DATA_TYPES)
            full_city.to_pickle('{}/city_{}.pkl'.format(TMP_PATH, city))
        except TypeError:
            print("Skipping: ", city)
            bad_cities.append(city)
            continue
    for c in bad_cities:
        cities_list.remove(c)
    return cities_list
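# Illustrative call (commented out): cache every city of a state under TMP_PATH
# (the directory is assumed to already exist) and keep only the cities whose
# data could be combined.
# usable_cities = alocate_data('Ceará')
# print(len(usable_cities), 'cities cached in', TMP_PATH)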
def lasso_single_state_prediction(state, lookback, horizon, predictors):
    """LASSO without cluster series."""
    # get_cities_from_state expects the full state name (cf. qf_single_state_prediction).
    if state == "CE":
        s = 'Ceará'
    else:
        s = state
    cities = list(get_cities_from_state(s))

    for city in cities:
        # Skip cities that already have saved metrics (path made relative so it
        # matches the to_pickle() call below).
        if os.path.isfile('saved_models/lasso_no_cluster/{}/lasso_metrics_{}.pkl'.format(state, city)):
            print(city, 'done')
            continue

        data = combined_data(city, DATA_TYPES)
        data = data[predictors]
        data.drop('casos', axis=1, inplace=True)
        target = 'casos_est'

        data_lag = build_lagged_features(data, lookback)
        data_lag = data_lag.dropna()  # dropna() is not in place; reassign

        targets = {}
        for d in range(1, horizon + 1):
            if d == 1:
                targets[d] = data_lag[target].shift(-(d - 1))
            else:
                targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

        X_data = data_lag.drop(target, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X_data, data_lag[target],
                                                            train_size=0.7, test_size=0.3,
                                                            shuffle=False)
        city_name = get_city_names([city, 0])[0][1]

        preds = np.empty((len(data_lag), horizon))
        metrics = pd.DataFrame(index=('mean_absolute_error', 'explained_variance_score',
                                      'mean_squared_error', 'mean_squared_log_error',
                                      'median_absolute_error', 'r2_score'))
        for d in range(1, horizon + 1):
            model = LassoLarsCV(max_iter=15, n_jobs=-1, normalize=False)
            tgt = targets[d][:len(X_train)]
            tgtt = targets[d][len(X_train):]
            try:
                model.fit(X_train, tgt)
            except ValueError:
                print('-----------------------------------------------------')
                print(city, 'ERROR')
                print('-----------------------------------------------------')
                break

            pred = model.predict(X_data[:len(targets[d])])
            dif = len(data_lag) - len(pred)
            if dif > 0:
                pred = list(pred) + ([np.nan] * dif)
            preds[:, (d - 1)] = pred

            pred_m = model.predict(X_test[:len(tgtt)])
            metrics[d] = calculate_metrics(pred_m, tgtt)

        metrics.to_pickle('{}/{}/lasso_metrics_{}.pkl'.format('saved_models/lasso_no_cluster',
                                                              state, city))
        plot_prediction(preds, targets[1], city_name, len(X_train), path='lasso_no_cluster')
        # plt.show()
    return None
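# Illustrative call (commented out; same example predictors as for the quantile
# forest above):
# lasso_single_state_prediction(state='CE', lookback=4, horizon=5,
#                               predictors=['casos', 'casos_est', 'p_rt1',
#                                           'p_inc100k', 'numero'])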
import matplotlib.pyplot as plt
import pyflux as pf

from infodenguepredict.data.infodengue import combined_data


def build_model(data, ar, sc, family=pf.families.Poisson, formula=None):
    # Signature reconstructed from the call in __main__ below; the Poisson
    # family default is an assumption.
    if formula is None:
        formula = "casos~1"
    model = pf.GASX(data=data, ar=ar, sc=sc, family=family(), formula=formula)
    return model


if __name__ == "__main__":
    city = 3304557
    prediction_window = 5  # weeks

    # data = get_alerta_table(city)  # Nova Iguaçu: 3303609
    # Fetching exogenous vars
    # T = get_temperature_data(city)  # (3303500)
    # T = T[~T.index.duplicated()]
    # Tw = get_tweet_data(city)
    # Tw = Tw[~Tw.index.duplicated()]
    Full = combined_data(city)  # data.join(T.resample('W-SUN').mean()).join(Tw.resample('W-SUN').sum()).dropna()
    # print(data.info())
    # data.casos.plot()
    # print(Full.info())
    # print(Full.describe())
    # Full.to_csv('data.csv.gz', compression='gzip')

    model = build_model(Full.dropna(), ar=4, sc=6, formula='casos~1+numero')
    fit = model.fit('PML')  # 'BBVI', iterations=1000, optimizer='RMSProp')
    print(fit.summary())

    model.plot_fit()
    plt.savefig('GASX_in_sample.png')
    model.plot_parameters()
    model.plot_predict(h=5, past_values=12)
    plt.savefig('GASX_prediction.png')
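    # Alternative fit suggested by the inline note above: approximate Bayesian
    # inference with BBVI instead of PML (commented out, illustrative only).
    # fit_bbvi = model.fit('BBVI', iterations=1000, optimizer='RMSProp')
    # print(fit_bbvi.summary())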
from datetime import datetime

import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.regime_switching.markov_autoregression import MarkovAutoregression

from infodenguepredict.data.infodengue import get_alerta_table, combined_data


def build_model(data):
    model = MarkovAutoregression(endog=data.casos, k_regimes=2,
                                 exog=data[['p_rt1', 'p_inc100k', 'numero']],
                                 order=2)
    return model


if __name__ == "__main__":
    prediction_window = 5  # weeks

    data = combined_data(3304557)  # Nova Iguaçu: 3303609
    data.casos.plot()

    # Graph data autocorrelation
    fig, axes = plt.subplots(1, 2, figsize=(15, 4))
    fig = plot_acf(data['casos'].iloc[1:], lags=52, ax=axes[0])
    fig = plot_pacf(data['casos'].iloc[1:], lags=52, ax=axes[1])

    model = build_model(data)
    fit = model.fit()  # 'BBVI', iterations=1000, optimizer='RMSProp')
    print(fit.summary())
    # todo: fix model, not fitting

    plt.figure()
    predict = fit.predict(start='2017-01-01', end='2017-03-01')
    predict_ci = predict.conf_int()
    predictdy = fit.predict(start='2017-01-01', end='2017-02-26')
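    # Sketch (assumption: the fitted Markov-switching results object exposes
    # smoothed_marginal_probabilities, as in the statsmodels regime-switching
    # examples) -- plot the smoothed probability of one regime:
    # fit.smoothed_marginal_probabilities[1].plot(
    #     title='Smoothed probability of regime 1', figsize=(12, 3))
    # plt.savefig('markov_regime_probabilities.png')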