def test_eval_measures():
    """Regression tests pinning the numeric output of the evaluation
    helpers (iqr, mse, rmse, maxabs, meanabs, medianabs, bias,
    medianbias, vare) for a fixed 4x5 input grid."""
    # 4x5 integer grid compared against a matrix of ones.
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    # interquartile range of the errors: columns, rows, flattened
    assert_equal(iqr(x, y), 5 * np.ones(5))
    assert_equal(iqr(x, y, axis=1), 2 * np.ones(4))
    assert_equal(iqr(x, y, axis=None), 9)

    assert_equal(mse(x, y), np.array([73.5, 87.5, 103.5, 121.5, 141.5]))
    assert_equal(mse(x, y, axis=1), np.array([3.0, 38.0, 123.0, 258.0]))

    # rmse values are irrational -> compare approximately
    assert_almost_equal(
        rmse(x, y),
        np.array([8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]),
    )
    assert_almost_equal(
        rmse(x, y, axis=1),
        np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
    )

    assert_equal(maxabs(x, y), np.array([14.0, 15.0, 16.0, 17.0, 18.0]))
    assert_equal(maxabs(x, y, axis=1), np.array([3.0, 8.0, 13.0, 18.0]))

    assert_equal(meanabs(x, y), np.array([7.0, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(meanabs(x, y, axis=1), np.array([1.4, 6.0, 11.0, 16.0]))
    assert_equal(meanabs(x, y, axis=0), np.array([7.0, 7.5, 8.5, 9.5, 10.5]))

    assert_equal(medianabs(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianabs(x, y, axis=1), np.array([1.0, 6.0, 11.0, 16.0]))

    assert_equal(bias(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(bias(x, y, axis=1), np.array([1.0, 6.0, 11.0, 16.0]))

    assert_equal(medianbias(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianbias(x, y, axis=1), np.array([1.0, 6.0, 11.0, 16.0]))

    assert_equal(vare(x, y), np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    assert_equal(vare(x, y, axis=1), np.array([2.0, 2.0, 2.0, 2.0]))
def test_eval_measures(): #mainly regression tests
    # Purpose: pin the numeric output of the eval-measures helpers
    # (iqr, mse, rmse, maxabs, meanabs, medianabs, bias, medianbias, vare)
    # against known values for a fixed 4x5 grid of errors.
    x = np.arange(20).reshape(4,5)
    y = np.ones((4,5))
    # default axis=0 -> per-column, axis=1 -> per-row, axis=None -> flattened
    assert_equal(iqr(x, y), 5*np.ones(5))
    assert_equal(iqr(x, y, axis=1), 2*np.ones(4))
    assert_equal(iqr(x, y, axis=None), 9)
    assert_equal(mse(x, y), np.array([ 73.5, 87.5, 103.5, 121.5, 141.5]))
    assert_equal(mse(x, y, axis=1), np.array([ 3., 38., 123., 258.]))
    # rmse values are irrational -> approximate comparison
    assert_almost_equal(rmse(x, y), np.array([ 8.5732141 , 9.35414347, 10.17349497, 11.02270384, 11.89537725]))
    assert_almost_equal(rmse(x, y, axis=1), np.array([ 1.73205081, 6.164414, 11.09053651, 16.0623784 ]))
    assert_equal(maxabs(x, y), np.array([ 14., 15., 16., 17., 18.]))
    assert_equal(maxabs(x, y, axis=1), np.array([ 3., 8., 13., 18.]))
    assert_equal(meanabs(x, y), np.array([ 7. , 7.5, 8.5, 9.5, 10.5]))
    assert_equal(meanabs(x, y, axis=1), np.array([ 1.4, 6. , 11. , 16. ]))
    # axis=0 explicitly should match the default
    assert_equal(meanabs(x, y, axis=0), np.array([ 7. , 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianabs(x, y), np.array([ 6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianabs(x, y, axis=1), np.array([ 1., 6., 11., 16.]))
    assert_equal(bias(x, y), np.array([ 6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(bias(x, y, axis=1), np.array([ 1., 6., 11., 16.]))
    assert_equal(medianbias(x, y), np.array([ 6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianbias(x, y, axis=1), np.array([ 1., 6., 11., 16.]))
    assert_equal(vare(x, y), np.array([ 31.25, 31.25, 31.25, 31.25, 31.25]))
    assert_equal(vare(x, y, axis=1), np.array([ 2., 2., 2., 2.]))
def test_eval_measures():
    """Regression tests for the evaluation measures, including rmspe's
    handling of zero reference values."""
    grid = np.arange(20).reshape(4, 5)
    ones = np.ones((4, 5))

    assert_equal(iqr(grid, ones), 5 * np.ones(5))
    assert_equal(iqr(grid, ones, axis=1), 2 * np.ones(4))
    assert_equal(iqr(grid, ones, axis=None), 9)

    assert_equal(mse(grid, ones), np.array([73.5, 87.5, 103.5, 121.5, 141.5]))
    assert_equal(mse(grid, ones, axis=1), np.array([3.0, 38.0, 123.0, 258.0]))

    assert_almost_equal(
        rmse(grid, ones),
        np.array([8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]),
    )
    assert_almost_equal(
        rmse(grid, ones, axis=1),
        np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
    )

    # rmspe reference: relative errors, NaN where the reference is zero.
    rel_err = grid - ones
    nonzero = np.where(grid != 0)
    rel_err[nonzero] /= grid[nonzero]
    rel_err[np.where(grid == 0)] = np.nan
    expected = np.sqrt(np.nanmean(rel_err ** 2, 0) * 100)
    assert_almost_equal(rmspe(grid, ones), expected)
    # with zeros=0 the undefined entries count as zero error
    rel_err[np.where(np.isnan(rel_err))] = 0.0
    expected = np.sqrt(np.nanmean(rel_err ** 2, 0) * 100)
    assert_almost_equal(rmspe(grid, ones, zeros=0), expected)

    assert_equal(maxabs(grid, ones), np.array([14.0, 15.0, 16.0, 17.0, 18.0]))
    assert_equal(maxabs(grid, ones, axis=1), np.array([3.0, 8.0, 13.0, 18.0]))

    assert_equal(meanabs(grid, ones), np.array([7.0, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(meanabs(grid, ones, axis=1), np.array([1.4, 6.0, 11.0, 16.0]))
    assert_equal(meanabs(grid, ones, axis=0), np.array([7.0, 7.5, 8.5, 9.5, 10.5]))

    assert_equal(medianabs(grid, ones), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianabs(grid, ones, axis=1), np.array([1.0, 6.0, 11.0, 16.0]))

    assert_equal(bias(grid, ones), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(bias(grid, ones, axis=1), np.array([1.0, 6.0, 11.0, 16.0]))

    assert_equal(medianbias(grid, ones), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianbias(grid, ones, axis=1), np.array([1.0, 6.0, 11.0, 16.0]))

    assert_equal(vare(grid, ones), np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    assert_equal(vare(grid, ones, axis=1), np.array([2.0, 2.0, 2.0, 2.0]))
def get_best_model(train, test, model_formula):
    """Grid-search the negative binomial dispersion parameter and refit.

    Fits one NB GLM per candidate alpha on `train`, scores each by mean
    absolute error of its (integer-truncated) predictions on `test`, then
    refits the winning alpha on train+test combined.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames usable by patsy via `model_formula`; `test` must contain a
        `total_cases` column to score against.
    model_formula : str
        Patsy formula passed to smf.glm.

    Returns
    -------
    Fitted GLM results object (refit on the concatenated data).
    """
    # Step 1: candidate dispersion parameters, log-spaced 1e-8 .. 1e-4.
    grid = 10**np.arange(-8, -3, dtype=np.float64)

    # FIX: was best_alpha = [] / best_score = 1000 -- if every candidate
    # scored >= 1000 the final refit received alpha=[] and crashed.
    # np.inf guarantees the first candidate is always accepted.
    best_alpha = None
    best_score = np.inf

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))
        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)
        if score < best_score:
            best_alpha = alpha
            best_score = score

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset with the winning alpha
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))
    fitted_model = model.fit()
    return fitted_model
def negative_binomial_model_eval(self, x_train, x_test, y_train, y_test):
    """Select the best NB dispersion alpha for these features.

    Builds the formula `total_cases ~ <all feature columns>`, fits one
    negative binomial GLM per candidate alpha on the training split and
    returns the alpha with the lowest MAE on the test split.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame feature frames.
    y_train, y_test : targets, assigned as a 'total_cases' column.

    Returns
    -------
    float : the best-scoring alpha.
    """
    train = x_train.copy()
    test = x_test.copy()
    train['total_cases'] = y_train
    test['total_cases'] = y_test

    # Step 1: build "total_cases ~ f1 + f2 + ..." from the column names
    # (the target column is last, so it is excluded from the RHS).
    model_formula = train.columns[0]
    for i in range(1, len(train.columns)-1):
        model_formula = model_formula+" + "+train.columns[i]
    model_formula = train.columns[-1] + ' ~ ' + model_formula

    grid = 10 ** np.arange(-8, -3, dtype=np.float64)

    # FIX: was best_alpha = [] / best_score = 1000 -- if every candidate
    # scored >= 1000 the function returned the empty list.  np.inf makes
    # the first candidate always win initially.
    best_alpha = None
    best_score = np.inf

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))
        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)
        if score < best_score:
            best_alpha = alpha
            best_score = score

    # st.write('best alpha = ', best_alpha)
    # st.write('best score = ', best_score)
    return best_alpha
def get_best_model(train, test):
    """Fit a negative binomial GLM with a fixed climate-feature formula.

    Scans a log-spaced grid of dispersion alphas, keeps the one with the
    lowest MAE on `test`, prints it, and returns a model refit on the
    concatenation of `train` and `test`.
    """
    # Step 1: fixed feature formula (implicit string concatenation).
    model_formula = (
        "total_cases ~ 1 + "
        "reanalysis_specific_humidity_g_per_kg + "
        "reanalysis_dew_point_temp_k + "
        "reanalysis_min_air_temp_k + "
        "station_min_temp_c + "
        "station_max_temp_c + "
        "station_avg_temp_c + "
        "reanalysis_air_temp_k"
    )

    # Step 2: pick the alpha minimising MAE on the held-out frame.
    best_alpha = []
    best_score = 1000
    for alpha in 10 ** np.arange(-8, -3, dtype=np.float64):
        fit = smf.glm(formula=model_formula,
                      data=train,
                      family=sm.families.NegativeBinomial(alpha=alpha)).fit()
        preds = fit.predict(test).astype(int)
        mae = eval_measures.meanabs(preds, test.total_cases)
        if mae < best_score:
            best_alpha, best_score = alpha, mae

    print('Alpha = ', best_alpha)
    print('Score = ', best_score)

    # Step 3: refit on the combined data with the chosen alpha.
    full_dataset = pd.concat([train, test])
    return smf.glm(formula=model_formula,
                   data=full_dataset,
                   family=sm.families.NegativeBinomial(alpha=best_alpha)).fit()
def bestrandfrorest(train, test):
    # Grid-search a RandomForestRegressor over n_estimators and
    # random_state (both 2..98, step 2), scoring by MAE of the
    # integer-truncated predictions on `test`; returns the refit best
    # model together with its MAE.
    n_estimate = np.arange(2, 100, 2, dtype=np.float64)
    rand_state = np.arange(2, 100, 2, dtype=np.float64)
    best_estimate = []
    best_rand_state = []
    best_score = 1000
    # tolerance: a score within 1e-6 of the current best also wins,
    # so later (larger) parameter values are preferred on near-ties
    eee = 10 ** -6
    # split off features / target and drop non-numeric columns
    train_X = train.copy()
    train_Y = train_X.total_cases
    train_X.drop('total_cases', axis=1, inplace=True)
    train_X.drop('city', axis=1, inplace=True)
    train_X.drop('week_start_date', axis=1, inplace=True)
    test_X = test.copy()
    test_Y = test_X.total_cases
    test_X.drop('total_cases', axis=1, inplace=True)
    test_X.drop('city', axis=1, inplace=True)
    test_X.drop('week_start_date', axis=1, inplace=True)
    for n in n_estimate:
        for r in rand_state:
            # the grids are float arrays; sklearn needs ints
            nest = int(n)
            rnd = int(r)
            randForestModel = RandomForestRegressor(n_estimators=nest, random_state=rnd)
            randForestModel.fit(train_X, train_Y)
            predictions = randForestModel.predict(test_X).astype(int)
            acc = eval_measures.meanabs(predictions, test_Y)
            if acc < best_score + eee:
                best_score = acc
                best_estimate = nest
                best_rand_state = rnd
        # NOTE(review): indentation reconstructed from a collapsed source;
        # these progress prints are assumed to run once per outer-loop
        # iteration -- confirm against the original file.
        print(n)
        print(best_estimate)
        print(best_rand_state)
    # refit with the winning parameters and report the final MAE
    randForestModel = RandomForestRegressor(n_estimators=best_estimate, random_state=best_rand_state)
    randForestModel.fit(train_X, train_Y)
    predictions = randForestModel.predict(test_X).astype(int)
    acc = eval_measures.meanabs(predictions, test_Y)
    return randForestModel, acc
def printErrors(test, pred, model):
    '''
    Objective: print the error metrics of a model
    Inputs:
        test: test values (array-like)
        pred: predicted values (array-like)
        model: name of the model, used to label the output
    Outputs:
        Prints mean absolute error, mean squared error and
        root mean squared error (nothing is returned)
    '''
    # Same three lines as before, driven by a (label, metric) table.
    for label, metric in (('MAE', meanabs), ('MSE', mse), ('RMSE', rmse)):
        print(label + ' of ' + model + ': {:.4}'.format(metric(test, pred, axis=0)))
def train_negative_binomial_model(self, x_train, y_train, test_size):
    '''generate and train the negative binomial model

    :param x_train: matrix with the features, pandas or numpy
    :param y_train: array with the targets, pandas or numpy
    :param test_size: from 0 to 1, fraction of the training data held out
        (without shuffling, preserving time order) to select the NB
        dispersion parameter alpha
    :return: negative binomial model fitted on all of x_train / y_train
    '''
    x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(
        x_train, y_train, shuffle=False, test_size=test_size)
    train = x_train_train.copy()
    test = x_train_test.copy()
    train['target'] = y_train_train
    test['target'] = y_train_test

    # Step 1: build "target ~ f1 + f2 + ..." from the column names
    # (the target column is last, so it is excluded from the RHS).
    model_formula = train.columns[0]
    for i in range(1, len(train.columns) - 1):
        model_formula = model_formula + " + " + train.columns[i]
    model_formula = train.columns[-1] + ' ~ ' + model_formula

    grid = 10**np.arange(-8, -3, dtype=np.float64)

    # FIX: was best_alpha = [] / best_score = 1000, which could leave
    # best_alpha unset; np.inf guarantees the first candidate is kept.
    best_alpha = None
    best_score = np.inf

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))
        results = model.fit()
        predictions = results.predict(test).astype(int)
        # BUG FIX: the held-out target column is named 'target', not
        # 'total_cases' -- `test.total_cases` raised AttributeError.
        score = eval_measures.meanabs(predictions, test.target)
        if score < best_score:
            best_alpha = alpha
            best_score = score

    # fit the final model on the full training data
    data = x_train.copy()
    data['target'] = y_train
    model_formula = data.columns[0]
    for i in range(1, len(data.columns) - 1):
        model_formula = model_formula + " + " + data.columns[i]
    model_formula = data.columns[-1] + ' ~ ' + model_formula

    # Step 4: refit on entire dataset with the chosen alpha
    model = smf.glm(formula=model_formula,
                    data=data,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))
    fitted_model = model.fit()
    return fitted_model
def gradient_boosting(train_data, val_data):
    """Fit a GradientBoostingRegressor on 'total_cases'.

    Trains on `train_data`, computes (but does not print) the training
    and validation MAE, and returns the fitted regressor.
    """
    params = {
        'n_estimators': 800,
        'max_depth': 5,
        'min_samples_split': 3,
        'learning_rate': 0.01,
        'loss': 'ls',
    }
    clf = ensemble.GradientBoostingRegressor(**params)

    # Fit on the training frame (target column split off).
    y_tr = train_data['total_cases']
    X_tr = train_data.drop('total_cases', axis=1)
    clf.fit(X_tr, y_tr)
    mae = eval_measures.meanabs(clf.predict(X_tr), y_tr)
    #print("Training MAE: %.4f" % mae)

    # Score on the validation frame.
    y_val = val_data['total_cases']
    X_val = val_data.drop('total_cases', axis=1)
    mae = eval_measures.meanabs(clf.predict(X_val), y_val)
    #print("Validation MAE: %.4f" % mae)
    return clf
def evaluate(self):
    """
    Calculates the MAE between the predicted and test ratings.

    Pairs for which no prediction exists are skipped; the MAE is taken
    over the remaining (real, predicted) rating pairs.
    """
    predicted = self.get_full_rating_matrix()
    real_ratings = []
    predicted_ratings = []
    # FIX: dict.iteritems() is Python-2 only; .items() behaves the same
    # and also works on Python 3.
    for user, movie_ratings in self.test_set.items():
        # Hoisted out of the inner loop: if the user has no predictions
        # at all, every movie would be skipped anyway.
        predicted_user = predicted.get(user, None)
        if not predicted_user:
            continue
        for movie_id, rating in movie_ratings.items():
            predicted_rating = predicted_user.get(movie_id, None)
            if not predicted_rating:
                # NOTE(review): this also skips an explicit 0.0
                # prediction -- preserved from the original; confirm
                # that 0 means "no prediction" here.
                continue
            predicted_ratings.append(predicted_rating)
            real_ratings.append(rating)
    return meanabs(real_ratings, predicted_ratings)
def eval_metrics(forecast, observed):
    '''Return forecast evaluation metrics.

    Parameters
    ----------
    forecast : pd.Series
        Forecasted values.
    observed : pd.Series
        Observed values.

    Returns
    -------
    mae : float
        Mean Absolute Error.
    rmserr : float
        Root Mean Squared Error.  Named rmserr to avoid conflicting with
        the statsmodels rmse function.
    mape : float
        Mean Absolute Percentage Error, in percent.
    '''
    mae = meanabs(forecast, observed)
    rmserr = rmse(forecast, observed)
    mape = ((forecast - observed).abs() / observed).mean() * 100
    return mae, rmserr, mape
def forecast_arima(df: pd.DataFrame, cols: list, with_graph: bool = True):
    """Fit an ARIMA(0, 1, 0) per column and evaluate a 50-step forecast.

    For each column the model is fit on all but the last `steps` points,
    a forecast summary with confidence intervals is printed along with
    RMSE/MAE against the held-out tail, and (optionally) the forecast is
    plotted against the actual values.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; each column in `cols` is modelled independently.
    cols : list
        Column names to forecast.
    with_graph : bool, default True
        When True, plot each forecast against the held-out actuals.
    """
    lag = 0
    order = 1
    moving_avg_model = 0
    steps = 50
    for col in cols:
        # Fit on everything except the last `steps` observations.
        model = ARIMA(df[col].iloc[:-steps],
                      order=(lag, order, moving_avg_model))
        model_fit = model.fit()
        # NOTE(review): alpha is normally an argument of conf_int() /
        # summary_frame() rather than get_forecast() -- confirm it is
        # accepted by the statsmodels version in use.
        model_for = model_fit.get_forecast(steps=steps, alpha=0.05)
        print('\t==== Summary of forecast ARIMA(%d, %d, %d) ====\n'
              % (lag, order, moving_avg_model))
        print(model_for.summary_frame(), model_for.conf_int(), sep='\n')
        # FIX: use `steps` instead of the hard-coded 50 so the metrics
        # stay consistent if `steps` is ever changed.
        print('RMSE: %f\nMAE: %f'
              % (rmse(df[col][-steps:], model_for.predicted_mean),
                 meanabs(df[col][-steps:], model_for.predicted_mean)))
        print()
        if with_graph is True:
            plt.figure(figsize=(12, 5))
            plt.xlabel(col)
            plt.title('Forecast for %s using ARIMA(%d, %d, %d)'
                      % (col, lag, order, moving_avg_model))
            # FIX: legend labels were swapped -- the model forecast is
            # the estimate, the dataframe tail is the actual series.
            ax1 = model_for.predicted_mean.plot(color='blue', grid=True,
                                                label='Estimated')
            ax2 = df[col][-steps:].plot(color='red', grid=True,
                                        secondary_y=True, label='Actual')
            h1, l1 = ax1.get_legend_handles_labels()
            h2, l2 = ax2.get_legend_handles_labels()
            plt.legend(h1 + h2, l1 + l2, loc=2)
            plt.show()
def TrainModel(self, DATA, args=None):
    """Train a bagged ensemble of RandomForest models.

    Fits `NumberOfModels` forests on bootstrap samples of DATA (last
    column is the target), scores each on its out-of-bag rows by MAE,
    keeps the `NumberOfOutModels` best in self.Models / self.Accuracy,
    and returns the mean MAE of the retained models.

    :param DATA: pd.DataFrame, features with the target as last column
    :param args: optional dict of option overrides for UpdateOptions
    :return: mean out-of-bag MAE of the retained models
    """
    # FIX: mutable default argument (args={}) replaced by the None
    # sentinel idiom; behaviour for all existing callers is unchanged.
    if args is None:
        args = {}
    np.random.seed(1)  # deterministic bootstrap draws
    self, options = UpdateOptions(self, args)
    self.Models = {}
    self.Accuracy = []

    DATA_X = DATA.iloc[:, :-1]
    DATA_Y = DATA.iloc[:, -1]

    models = {}
    acc = []
    for i in range(0, self.NumberOfModels):
        # Bootstrap sample; rows never drawn act as the validation set.
        newData = np.random.randint(DATA.shape[0], size=DATA.shape[0])
        newTest = np.delete(np.arange(0, DATA.shape[0]), pd.unique(newData))
        data = DATA.iloc[newData, :].reset_index(drop=True)
        models[i] = RandomForest()
        models[i].TrainModel(data)
        tst_X = DATA_X.iloc[newTest, :].reset_index(drop=True)
        tst_Y = DATA_Y.iloc[newTest].reset_index(drop=True)
        predictions = models[i].Predict(tst_X).astype(int)
        acc.append(eval_measures.meanabs(predictions, tst_Y))

    acc = np.asarray(acc)
    # Repeatedly take the current minimum; overwrite the taken slot with
    # the maximum so the same model cannot be selected twice.
    for i in range(0, self.NumberOfOutModels):
        index = acc.argmin()
        self.Accuracy.append(acc[index])
        acc[index] = acc.max()
        self.Models[i] = models[index]

    self.Accuracy = np.asarray(self.Accuracy)
    # print(str(self.Accuracy.mean()))
    return self.Accuracy.mean()
def MultipleRandFrorest(idf, K=50, nest=100, rnd=20):
    """Train K random forests on bootstrap samples and rank them by MAE.

    Each forest is fit on a bootstrap sample of `idf` (target column
    'total_cases'); rows never drawn form its out-of-bag test set.  The
    mean of the selected models' MAEs is printed.

    :param idf: pd.DataFrame including a 'total_cases' target column
    :param K: number of bootstrap models to train
    :param nest: n_estimators for each forest
    :param rnd: random_state for each forest
    :return: dict of models keyed 0..K-1, in ascending order of OOB MAE
    """
    df = idf.copy()
    train_Y = df.total_cases
    df.drop('total_cases', axis=1, inplace=True)
    models = {}
    acc = []
    for i in range(0, K):
        # Bootstrap sample; the undrawn rows are the out-of-bag set.
        newData = np.random.randint(df.shape[0], size=df.shape[0])
        newTest = np.delete(np.arange(0, df.shape[0]), np.unique(newData))
        data_X = df.iloc[newData, :].reset_index(drop=True)
        data_Y = train_Y.iloc[newData].reset_index(drop=True)
        models[i] = RandomForestRegressor(n_estimators=nest, random_state=rnd)
        models[i].fit(data_X, data_Y)
        tst_X = df.iloc[newTest, :].reset_index(drop=True)
        tst_Y = train_Y.iloc[newTest].reset_index(drop=True)
        predictions = models[i].predict(tst_X).astype(int)
        acc.append(eval_measures.meanabs(predictions, tst_Y))

    acc = np.asarray(acc)
    out_models = {}
    out_ACC = []
    for i in range(0, round(K)):
        indx = acc.argmin()
        out_ACC.append(acc[indx])
        # FIX: mark the selected slot with the current maximum (as in
        # TrainModel) instead of the literal 100 -- an MAE above 100
        # would have allowed the same model to be selected twice.
        acc[indx] = acc.max()
        out_models[i] = models[indx]

    out_ACC = np.asarray(out_ACC)
    print(str(out_ACC.mean()))
    return out_models
def main():
    # Purpose: load and pre-process the dengue training data, fit one
    # negative binomial model per city (San Juan / Iquitos), estimate the
    # MAE with 12-fold cross validation, plot fitted vs. actual cases and
    # write time-shifted test predictions to ./data/benchmark_shift.csv.
    # NOTE(review): this function was reconstructed from a collapsed
    # (one-line) source; indentation and two broken literals (a split
    # comment and the suptitle string) were restored by inspection.

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')
    train_features = pd.read_csv(train_features_path, index_col=[0, 1, 2])
    train_labels = pd.read_csv(train_labels_path, index_col=[0, 1, 2])

    # Seperate data for San Juan
    sj_train_features = train_features.loc['sj']
    sj_train_labels = train_labels.loc['sj']

    # Separate data for Iquitos
    iq_train_features = train_features.loc['iq']
    iq_train_labels = train_labels.loc['iq']

    # Remove 'week_start_date' string.
    sj_train_features.drop('week_start_date', axis=1, inplace=True)
    iq_train_features.drop('week_start_date', axis=1, inplace=True)

    #find NaN in data be unsatisfying and eliminate those ddata
    # (forward-fill missing values)
    sj_train_features.fillna(method='ffill', inplace=True)
    iq_train_features.fillna(method='ffill', inplace=True)

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    '''
    sj_train_subtrain = sj_train.head(800)
    sj_train_subtest = sj_train.tail(sj_train.shape[0] - 800)
    iq_train_subtrain = iq_train.head(400)
    iq_train_subtest = iq_train.tail(iq_train.shape[0] - 400)
    '''
    # Random 800-row (sj) / 400-row (iq) train subsets; the remainder is
    # used as the held-out set for alpha selection.
    # NOTE(review): .ix is removed in modern pandas -- this code requires
    # an old pandas version (or migration to .iloc/.loc).
    choose = rand.sample(range(0, sj_train.shape[0] - 1), 800)
    val = [i for i in range(sj_train.shape[0]) if i not in choose]
    sj_train_subtrain = sj_train.ix[choose]
    sj_train_subtest = sj_train.ix[val]
    choose = rand.sample(range(0, iq_train.shape[0] - 1), 400)
    val = [i for i in range(iq_train.shape[0]) if i not in choose]
    iq_train_subtrain = iq_train.ix[choose]
    iq_train_subtest = iq_train.ix[val]

    sj_best_model = get_best_model(sj_train_subtrain, sj_train_subtest, 'sj')
    iq_best_model = get_best_model(iq_train_subtrain, iq_train_subtest, 'iq')

    #Use K-fold to create cross validation data
    kf = KFold(n_splits=12)

    sj_score = []
    for train_index, test_index in kf.split(sj_train):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = sj_train.ix[train_index], sj_train.ix[test_index]
        predictions = sj_best_model.predict(X_test).astype(int)
        # shift sj predictions back by 4 steps (empirical lag correction)
        for i in range(predictions.shape[0] - 1, 3, -1):
            predictions.ix[i] = predictions.ix[i - 4]
        sj_score.append(eval_measures.meanabs(predictions, X_test.total_cases))
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))
    print(sj_score)

    iq_score = []
    for train_index, test_index in kf.split(iq_train):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = iq_train.ix[train_index], iq_train.ix[test_index]
        predictions = iq_best_model.predict(X_test).astype(int)
        #print(predictions)
        # shift iq predictions back by 1 step
        for i in range(predictions.shape[0] - 1, 0, -1):
            predictions.ix[i] = predictions.ix[i - 1]
        #print(predictions)
        iq_score.append(eval_measures.meanabs(predictions, X_test.total_cases))
    print(iq_score)
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    figs, axes = plt.subplots(nrows=2, ncols=1)
    # plot sj
    sj_train['fitted'] = sj_best_model.fittedvalues
    sj_train.fitted.plot(ax=axes[0], label="Predictions")
    sj_train.total_cases.plot(ax=axes[0], label="Actual")
    # plot iq
    iq_train['fitted'] = iq_best_model.fittedvalues
    iq_train.fitted.plot(ax=axes[1], label="Predictions")
    iq_train.total_cases.plot(ax=axes[1], label="Actual")
    # NOTE(review): title string reconstructed from a split literal.
    plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
    plt.legend()
    plt.show()

    # Predict on the competition test set, apply the same shifts, and
    # write the submission file.
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)
    sj_predictions = sj_best_model.predict(sj_test).astype(int)
    for i in range(sj_predictions.shape[0] - 1, 3, -1):
        sj_predictions.ix[i] = sj_predictions.ix[i - 4]
    iq_predictions = iq_best_model.predict(iq_test).astype(int)
    for i in range(iq_predictions.shape[0] - 1, 0, -1):
        iq_predictions.ix[i] = iq_predictions.ix[i - 1]

    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/benchmark_shift.csv")
def CalculatePerformance(self, result, target):
    """Return the mean absolute error between `result` and `target`,
    or None when no negative binomial model has been trained yet."""
    if self.NegativeBinomial == {}:
        return None
    return eval_measures.meanabs(result, target)
# print('{} {} {}'.format( # valid_data[column].index.values[IDX], valid_data[column].iloc[IDX], forecast_data[column].iloc[IDX])) ax1.plot(valid_data[column].index, forecast_data[column], linewidth=0.5) # log the MSE and MAE logging.info( 'Mean square error for {} forecast for data order {} for column {} : {}' .format(forecast_type[forecast], order_type[order], column, mse(valid_data[column], forecast_data[column]))) logging.info( 'Absolute square error for {} forecast for data order {} for column {} : {}' .format(forecast_type[forecast], order_type[order], column, meanabs(valid_data[column], forecast_data[column]))) forecast_cnt = len(valid_data.index.values) correct_forecast_direction = 0 for data_idx in range(forecast_cnt): if (valid_data[column][data_idx] > 0) == (forecast_data[column][data_idx] > 0): correct_forecast_direction = correct_forecast_direction + 1 logging.info( 'Percent of valid data that is incerease from previous day {}'. format(valid_data[column].gt(0).sum() / forecast_cnt)) logging.info( 'Percent of forecast that matches valid direction {}'.format( correct_forecast_direction / forecast_cnt)) # write out the results to csv files for post processing output_file = open(
def evaluation(validation_data: pd.Series, forecast_ets: pd.Series,
               forecast_arima: pd.Series, forecast_xgboost: pd.Series,
               forecast_snaive: pd.Series, forecast_drift: pd.Series,
               forecast_average: pd.Series, verbose: bool):
    """Score six forecasting methods against the validation data.

    Computes RMSE, MAE, seasonal MASE (period 5) and sMAPE for each
    forecast; when `verbose` is True a formatted table is printed.

    Parameters
    ----------
    validation_data : pd.Series
        Held-out actual values.
    forecast_* : pd.Series
        Forecasts from the respective methods.
    verbose : bool
        Print the per-method metric rows (the header always prints).

    Returns
    -------
    DataFrame
        Rows seasonal_naive/drift/average/xgboost/ets/arima, columns
        RMSE/MAE/MASE/sMAPE.
    """

    def _score(forecast):
        # One metric row: RMSE, MAE, seasonal MASE (period 5), sMAPE.
        # `eval` is a project module here (it shadows the builtin).
        actual = [float(x) for x in validation_data.values]
        return [rmse(actual, forecast),
                meanabs(actual, forecast),
                eval.mase(validation_data.values, forecast, 5),
                eval.smape(validation_data.values, forecast)]

    print("\tPrediction model\t\t|\tRMSE\t\t\t\t|\tMAE\t\t\t\t\t|\tSeasonal MASE\t\t|\tSMAPE")
    print("\t-------------------------------------------------------------------------------------------------")

    # (printed label, tab padding, forecast) per method -- same order as
    # the rows of the returned DataFrame.  Replaces six copy-pasted
    # compute-and-print blocks; output is byte-identical.
    methods = [
        ("\tSeasonal Naive", "\t\t\t|\t", forecast_snaive),
        ("\tDrift", "\t\t\t\t\t|\t", forecast_drift),
        ("\tAverage", "\t\t\t\t\t|\t", forecast_average),
        ("\tXGBoost Regression", "\t\t|\t", forecast_xgboost),
        ("\tExponentialSmoothing", "\t|\t", forecast_ets),
        ("\tARIMA", "\t\t\t\t\t|\t", forecast_arima),
    ]

    rows = []
    for label, pad, forecast in methods:
        row = _score(forecast)
        rows.append(row)
        if verbose:
            print(label, end=pad)
            print(row[0], end="\t|\t")
            print(row[1], end="\t|\t")
            print(row[2], end="\t|\t")
            print(row[3])

    eval_results = DataFrame(
        rows,
        index=['seasonal_naive', 'drift', 'average', 'xgboost', 'ets', 'arima'],
        columns=['RMSE', 'MAE', 'MASE', 'sMAPE'])
    return eval_results
def CalculatePerformance(self, result, target):
    """Return the mean absolute error between `result` and `target`,
    or None when no models have been trained yet."""
    if self.Models == {}:
        return None
    return eval_measures.meanabs(result, target)
# Notebook-style analysis cells: Granger causality checks, toy error
# metrics, and seasonal plots.  Bare expressions (e.g. df.head()) only
# display output when run in a notebook.

# Does 'a' (resp. 'b') Granger-cause 'd'?  Tests lags 1..5.
# (df3 is defined elsewhere in the notebook.)
from statsmodels.tsa.stattools import grangercausalitytests

grangercausalitytests(df3[['a', 'd']], maxlag=5)
grangercausalitytests(df3[['b', 'd']], maxlag=5)

# Reproducible toy frame of "test" vs "predictions" values in [20, 30).
np.random.seed(42)
df = pd.DataFrame(np.random.randint(20, 30, (50, 2)),
                  columns=['test', 'predictions'])
df.head()
df.plot(figsize=(12, 8))

# Error metrics between the two columns.
from statsmodels.tools.eval_measures import mse, rmse, meanabs

mse(df['test'], df['predictions'])
rmse(df['test'], df['predictions'])
meanabs(df['test'], df['predictions'])

# Seasonal diagnostics on the passenger series.
# (df1 with a 'Pass_K' column is defined elsewhere in the notebook.)
df1.head()
df1.index
from statsmodels.graphics.tsaplots import month_plot, quarter_plot

month_plot(df1['Pass_K'])
# Resample to quarterly totals for the quarter plot.
df1q = df1['Pass_K'].resample(rule='Q').sum()
quarter_plot(df1q)
def get_best_model_sj(train, test):
    """Fit the San Juan negative binomial GLM.

    Uses a fixed SJ feature formula, scans a log-spaced grid of
    dispersion alphas, keeps the one with the lowest MAE on `test`,
    prints it, and returns a model refit on train+test combined.
    """
    # Step 1: specify the form of the model
    #CHANGE HERE ---- SJ FEATURES
    model_formula = (
        "total_cases ~ 1 + "
        "reanalysis_specific_humidity_g_per_kg + "
        "reanalysis_dew_point_temp_k + "
        "station_avg_temp_c + "
        "station_max_temp_c + "
        "reanalysis_air_temp_k + "
        "reanalysis_relative_humidity_percent + "
        "reanalysis_relative_humidity_percent_2 + "
        "reanalysis_relative_humidity_percent_3 + "
        "reanalysis_precip_amt_kg_per_m2_2 + "
        "reanalysis_precip_amt_kg_per_m2_3 + "
        "reanalysis_specific_humidity_g_per_kg_2 + "
        "reanalysis_specific_humidity_g_per_kg_3 + "
        "reanalysis_dew_point_temp_k_2 + "
        "reanalysis_dew_point_temp_k_3 + "
        "reanalysis_dew_point_temp_k_4 + "
        "reanalysis_air_temp_k_2 + "
        "reanalysis_air_temp_k_4 + "
        "reanalysis_air_temp_k_5 + "
        "reanalysis_air_temp_k_6 + "
        "reanalysis_air_temp_k_7 + "
        "reanalysis_air_temp_k_8 + "
        "station_max_temp_c_3 + "
        "station_max_temp_c_4 + "
        "station_max_temp_c_5 + "
        "station_max_temp_c_6 + "
        "station_max_temp_c_2 + "
        "reanalysis_sat_precip_amt_mm_2 + "
        "precipitation_amt_mm_2 + "
        "precipitation_amt_mm_3"
    )

    # Step 2: pick the alpha minimising MAE on the held-out frame.
    best_alpha = []
    best_score = 1000
    for alpha in 10**np.arange(-8, -3, dtype=np.float64):
        fit = smf.glm(formula=model_formula,
                      data=train,
                      family=sm.families.NegativeBinomial(alpha=alpha)).fit()
        preds = fit.predict(test).astype(int)
        mae = eval_measures.meanabs(preds, test.total_cases)
        if mae < best_score:
            best_alpha, best_score = alpha, mae

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on the combined data with the chosen alpha.
    full_dataset = pd.concat([train, test])
    return smf.glm(formula=model_formula,
                   data=full_dataset,
                   family=sm.families.NegativeBinomial(alpha=best_alpha)).fit()
'2': np.round(sdarbict1, 4) }, ] mstable = pd.DataFrame(msdata) print('') print('== ARIMA Model Selection ==') print('') print(mstable) print('') ########################################## # 3.8. ARIMA Models Forecasting Accuracy # 3.8.1. Multi-Steps Forecast rwdmae1 = fa.meanabs(rwdf1, spyf) rwdrmse1 = fa.rmse(rwdf1, spyf) darmae1 = fa.meanabs(darf1, spyf) darrmse1 = fa.rmse(darf1, spyf) srwdmae1 = fa.meanabs(srwdf1, spyf) srwdrmse1 = fa.rmse(srwdf1, spyf) sdarmae1 = fa.meanabs(sdarf1, spyf) sdarrmse1 = fa.rmse(sdarf1, spyf) fadata1 = [ { '0': '', '1': 'MAE', '2': 'RMSE' }, {
def main(): ### parsing and Data pre-processing # load the provided data train_features_path = os.path.join(data_path, 'dengue_features_train.csv') train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv') ### pre-processing data sj_train, iq_train = preprocess_data(train_features_path, labels_path=train_labels_path) #print(sj_train.describe()) #print(iq_train.describe()) ###Define the xgb parameters xgb_params = { 'eta': 0.05, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': 1 } num_boost_rounds = 1000 ##Use K-fold to create cross validation data kf = KFold(n_splits=6) ##Do the stacking by adding 5 dataframes 'negbi', 'gb', 'xgb','adaboost','extratree' ,'bagging'which store the training prediction sj_train = sj_train.assign(negbi=0) sj_train = sj_train.assign(gb=0) sj_train = sj_train.assign(xgb=0) sj_train = sj_train.assign(abr=0) sj_train = sj_train.assign(etr=0) sj_train = sj_train.assign(br=0) loop = 1 for train_index, val_index in kf.split( sj_train ): #The index will be split into [train_index] and [val_index] X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index] ###(1)neg_binomial method sj_neg_model = get_best_model(X_train, X_val, 'sj') predictions_neg = sj_neg_model.predict(X_val).astype(int) #Shift the prediction manually for i in range(predictions_neg.shape[0] - 1, 3, -1): predictions_neg.ix[i] = predictions_neg.ix[i - 4] ###(2)gradient boosting method sj_gb_model = gradient_boosting( X_train.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_val.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) predictions_gb = sj_gb_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ###(3)xgboost method dtrain = xgb.DMatrix( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) dval = xgb.DMatrix( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 
'abr', 'etr', 'br'], axis=1)) sj_xgb_model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds) predictions_xgb = sj_xgb_model.predict(dval).astype(int) ###(4)Adaboost regressor method sj_abr_model = ABR(n_estimators=800, learning_rate=0.08, loss='linear', random_state=0) sj_abr_model.fit( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) predictions_abr = sj_abr_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) ###(5)Extra tree regressor method sj_etr_model = ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1) sj_etr_model.fit( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) predictions_etr = sj_etr_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) ###(6) Bagging Regressor method sj_br_model = BR(n_estimators=800, oob_score=False, n_jobs=5, random_state=0, verbose=1) sj_br_model.fit( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) predictions_br = sj_br_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) ###Store the result in sj_train predictions_neg -> 'negbi', predictions_gb -> 'gb' print( "Adding the result of the predictions to sj training data({}/{})". 
format(loop, 6)) for idx, index in enumerate(val_index): sj_train['negbi'].ix[index] = predictions_neg.ix[idx] sj_train['gb'].ix[index] = predictions_gb[idx] sj_train['xgb'].ix[index] = predictions_xgb[idx] sj_train['abr'].ix[index] = predictions_abr[idx] sj_train['etr'].ix[index] = predictions_etr[idx] sj_train['br'].ix[index] = predictions_br[idx] loop += 1 iq_train = iq_train.assign(negbi=0) iq_train = iq_train.assign(gb=0) iq_train = iq_train.assign(xgb=0) iq_train = iq_train.assign(abr=0) iq_train = iq_train.assign(etr=0) iq_train = iq_train.assign(br=0) loop = 1 for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] ###(1)neg_binomial method iq_neg_model = get_best_model(X_train, X_val, 'iq') predictions_neg = iq_neg_model.predict(X_val).astype(int) #Shift the prediction manually for i in range(predictions_neg.shape[0] - 1, 0, -1): predictions_neg.ix[i] = predictions_neg.ix[i - 1] ###(2)gradient boosting method iq_gb_model = gradient_boosting( X_train.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_val.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) predictions_gb = iq_gb_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ###(3)xgb method dtrain = xgb.DMatrix( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) dval = xgb.DMatrix( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) iq_xgb_model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds) predictions_xgb = iq_xgb_model.predict(dval).astype(int) ###(4)Adaboost regressor method iq_abr_model = ABR(n_estimators=800, learning_rate=0.08, loss='linear', random_state=0) iq_abr_model.fit( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) predictions_abr = iq_abr_model.predict( X_val.drop( ['total_cases', 
'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) ###(5)Extra tree regressor method iq_etr_model = ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1) iq_etr_model.fit( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) predictions_etr = iq_etr_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) ###(6) Bagging Regressor method iq_br_model = BR(n_estimators=800, oob_score=False, n_jobs=5, random_state=0, verbose=1) iq_br_model.fit( X_train.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1), X_train['total_cases']) predictions_br = iq_br_model.predict( X_val.drop( ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) ###Store the result in iq_train predictions_neg -> 'negbi', predictions_gb -> 'gb' print( "Adding the result of the predictions to iq training data({}/{})". format(loop, 6)) for idx, index in enumerate(val_index): iq_train['negbi'].ix[index] = predictions_neg.ix[idx] iq_train['gb'].ix[index] = predictions_gb[idx] iq_train['xgb'].ix[index] = predictions_xgb[idx] iq_train['abr'].ix[index] = predictions_abr[idx] iq_train['etr'].ix[index] = predictions_etr[idx] iq_train['br'].ix[index] = predictions_br[idx] loop += 1 ###Now the training data looks like [feature, total_cases, negbi, gb, xgb] ##Accessing testing data test_features_path = os.path.join(data_path, 'dengue_features_test.csv') sj_test, iq_test = preprocess_data(test_features_path) ##Like training, add 'negbi' and 'gb' to the testing dataframe sj_test = sj_test.assign(negbi=0) sj_test = sj_test.assign(gb=0) sj_test = sj_test.assign(xgb=0) sj_test = sj_test.assign(abr=0) sj_test = sj_test.assign(etr=0) sj_test = sj_test.assign(br=0) ##(1)neg_binomial prediction sj_predictions_neg = sj_neg_model.predict(sj_test).astype(int) for i in range(sj_predictions_neg.shape[0] - 1, 3, -1): sj_predictions_neg.ix[i] = sj_predictions_neg.ix[i - 4] ##(2)gradient boosting 
prediction sj_predictions_gb = sj_gb_model.predict( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ##(3)xgb prediction dtest = xgb.DMatrix( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) sj_predictions_xgb = sj_xgb_model.predict(dtest).astype(int) ###(4)Adaboost regressor method sj_predictions_abr = sj_br_model.predict( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ###(5)extra tree regressor method sj_predictions_etr = sj_etr_model.predict( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ###(6)bagging regressor method sj_predictions_br = sj_br_model.predict( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) print("Adding predictions as features to sj testing data...") for i in range(len(sj_test['negbi']) ): #Add the prediction to the corresponding column sj_test['negbi'].ix[i] = sj_predictions_neg.ix[i] sj_test['gb'].ix[i] = sj_predictions_gb[i] sj_test['xgb'].ix[i] = sj_predictions_xgb[i] sj_test['abr'].ix[i] = sj_predictions_abr[i] sj_test['etr'].ix[i] = sj_predictions_etr[i] sj_test['br'].ix[i] = sj_predictions_br[i] ##Same process as city sj iq_test = iq_test.assign(negbi=0) iq_test = iq_test.assign(gb=0) iq_test = iq_test.assign(xgb=0) iq_test = iq_test.assign(abr=0) iq_test = iq_test.assign(etr=0) iq_test = iq_test.assign(br=0) ###(1)neg_binomial prediction iq_predictions_neg = iq_neg_model.predict(iq_test).astype(int) for i in range(iq_predictions_neg.shape[0] - 1, 0, -1): iq_predictions_neg.ix[i] = iq_predictions_neg.ix[i - 1] ##(2)gradient boosting prediction iq_predictions_gb = iq_gb_model.predict( iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ##(3)xgb prediction dtest = xgb.DMatrix( iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)) iq_predictions_xgb = iq_xgb_model.predict(dtest).astype(int) ###(4)Adaboost regressor method iq_predictions_abr = iq_abr_model.predict( 
sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ###(5)extra tree regressor method iq_predictions_etr = iq_etr_model.predict( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) ###(6)bagging regressor method iq_predictions_br = iq_br_model.predict( sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1)).astype(int) print("Adding predictions as features to iq testing data...") for i in range(len(iq_test['negbi'])): iq_test['negbi'].ix[i] = iq_predictions_neg.ix[i] iq_test['gb'].ix[i] = iq_predictions_gb[i] iq_test['xgb'].ix[i] = iq_predictions_xgb[i] iq_test['abr'].ix[i] = iq_predictions_abr[i] iq_test['etr'].ix[i] = iq_predictions_etr[i] iq_test['br'].ix[i] = iq_predictions_br[i] ##use new information to run a linear regression print("Building linear regression model...") #Now the linear regression model uses (X = [features, negbi, gb, xgb], y = total_cases )to train(fit) sj_lr = LR() sj_lr.fit(sj_train.drop('total_cases', axis=1), sj_train['total_cases']) iq_lr = LR() iq_lr.fit(iq_train.drop('total_cases', axis=1), iq_train['total_cases']) #Calculate the k-fold validation error sj_score = [] for train_index, val_index in kf.split(sj_train): X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index] train_predict = np.array( sj_lr.predict(X_val.drop('total_cases', axis=1))).astype(int) sj_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of sj_score is {} (+/- {})".format( kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score))) iq_score = [] for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] train_predict = np.array( iq_lr.predict(X_val.drop('total_cases', axis=1))).astype(int) iq_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of iq_score is {} (+/- {})".format( kf.get_n_splits(iq_train), np.mean(iq_score), 
np.std(iq_score))) ##Use the model sj_lr and iq_lr trained before to predict the testing data print("Predicting testing data...") sj_predictions = sj_lr.predict(sj_test) iq_predictions = iq_lr.predict(iq_test) sj_predictions = np.array(sj_predictions).astype(int) iq_predictions = np.array(iq_predictions).astype(int) print("Creating submit file...") ##Use submission_format as template to write the answer sample_path = os.path.join(data_path, 'submission_format.csv') submission = pd.read_csv(sample_path, index_col=[0, 1, 2]) submission.total_cases = np.concatenate([sj_predictions, iq_predictions]) submission.to_csv("./data/stacking_6_less_feature.csv") '''
# Fix S% issue with "--" and multiply S% by 2 if d-man.
# BUG FIX: the original appended BOTH the normal value and the doubled
# value for defensemen, so shootingp grew longer than the number of rows
# and every later entry was misaligned with regression_data.
# NOTE(review): assumes `i` and `shootingp` were initialised (to 0 and [])
# just above this chunk — confirm against the full script.
while i < regression_data['TOI/GP'].size:
    if regression_data['S%'][i] == "--":
        # missing shooting percentage -> treat as 0
        shootingp.append(0)
    elif regression_data['Pos'][i] == "D":
        # defensemen get their shooting percentage doubled
        shootingp.append(float(regression_data['S%'][i]) * 2)
    else:
        shootingp.append(float(regression_data['S%'][i]))
    i = i + 1

shootingp_array = np.array(shootingp)
regression_data['S%'] = pd.Series(shootingp_array)

# regress salary (AAV) on goals, assists, ice time and power-play points
variables = regression_data[['G', 'A', 'TOI/GP', 'PPP']].values
salary = regression_data['AAV'].values
var_train, var_test, sal_train, sal_test = train_test_split(
    variables, salary, test_size=0.2, random_state=5)

lin_model = sm.OLS(sal_train, var_train)
result = lin_model.fit()
sal_pred = result.predict(var_test)

print(result.summary())
print("Mean Absolute Error: " + str(meanabs(sal_test, sal_pred, axis=0)))
def main(): ### parsing and Data pre-processing # load the provided data train_features_path = os.path.join(data_path, 'dengue_features_train.csv') train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv') ### pre-processing data sj_train, iq_train = preprocess_data(train_features_path, labels_path=train_labels_path) #print(sj_train.describe()) #print(iq_train.describe()) choose = rand.sample(range(0, sj_train.shape[0] - 1), 800) val = [i for i in range(sj_train.shape[0]) if i not in choose] sj_train_subtrain = sj_train.ix[choose] sj_train_subtest = sj_train.ix[val] sj_etr = ETR(n_estimators=2000, max_depth=3, criterion='mae', verbose=1) sj_etr.fit(sj_train_subtrain.drop('total_cases', axis=1), sj_train_subtrain['total_cases']) ##The model generate by neg_binomial with best alpha on val_set chosen before kf = KFold(n_splits=12) sj_model_list = [] sj_err_list = [] loop = 1 for train_index, val_index in kf.split( sj_train ): #The index will be split into [train_index] and [val_index] X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index] #sj_etr = ETR(n_estimators = 2000, max_depth = 3,criterion = 'mae',verbose = 1) #sj_etr.fit(X_train.drop(['station_avg_temp_c','total_cases'],axis = 1),X_train['total_cases']) predictions = sj_etr.predict(X_val.drop('total_cases', axis=1)) sj_err_list.append( eval_measures.meanabs(predictions, X_val.total_cases)) #sj_model_list.append(sj_etr) loop += 1 print(sj_err_list) #argmax = sorted(range(len(sj_err_list)), key=lambda x: sj_err_list[x])[0] #print(argmax) #sj_best_model = sj_model_list[argmax] sj_best_model = sj_etr #print(sj_best_model.feature_importances_) choose = rand.sample(range(0, iq_train.shape[0] - 1), 400) val = [i for i in range(iq_train.shape[0]) if i not in choose] iq_train_subtrain = iq_train.ix[choose] iq_train_subtest = iq_train.ix[val] iq_etr = ETR(n_estimators=2000, max_depth=3, criterion='mae', verbose=1) iq_etr.fit(iq_train_subtrain.drop('total_cases', axis=1), 
iq_train_subtrain['total_cases']) iq_model_list = [] iq_err_list = [] loop = 1 for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] #iq_etr = ETR(n_estimators = 2000, max_depth = 3,criterion = 'mae',verbose = 1) #iq_etr.fit(X_train.drop(['station_min_temp_c','total_cases'],axis = 1),X_train['total_cases']) predictions = iq_etr.predict(X_val.drop('total_cases', axis=1)) iq_err_list.append( eval_measures.meanabs(predictions, X_val.total_cases)) #iq_model_list.append(iq_etr) loop += 1 print(iq_err_list) #argmax = sorted(range(len(iq_err_list)), key=lambda x: iq_err_list[x])[0] #print(argmax) #iq_best_model = iq_model_list[argmax] iq_best_model = iq_etr #print(iq_best_model.feature_importances_) ##Accessing testing data test_features_path = os.path.join(data_path, 'dengue_features_test.csv') sj_test, iq_test = preprocess_data(test_features_path) #Calculate the k-fold validation error sj_score = [] for train_index, val_index in kf.split(sj_train): X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index] train_predict = np.array( sj_best_model.predict(X_val.drop('total_cases', axis=1))).astype(int) sj_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of sj_score is {} (+/- {})".format( kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score))) iq_score = [] for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] train_predict = np.array( iq_best_model.predict(X_val.drop('total_cases', axis=1))).astype(int) iq_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of iq_score is {} (+/- {})".format( kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score))) ##Use the model sj_lr and iq_lr trained before to predict the testing data print("Predicting testing data...") sj_predictions = sj_best_model.predict(sj_test) iq_predictions 
= iq_best_model.predict(iq_test) sj_predictions = np.round(sj_predictions).astype(int) iq_predictions = np.round(iq_predictions).astype(int) print("Creating submit file...") ##Use submission_format as template to write the answer sample_path = os.path.join(data_path, 'submission_format.csv') submission = pd.read_csv(sample_path, index_col=[0, 1, 2]) submission.total_cases = np.concatenate([[28], [25], [34], sj_predictions, [8], [6], [10], iq_predictions]) submission.to_csv("./data/ext_final_new.csv") '''
def main(): ### parsing and Data pre-processing # load the provided data train_features_path = os.path.join(data_path, 'dengue_features_train.csv') train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv') ### pre-processing data sj_train, iq_train = preprocess_data(train_features_path, labels_path=train_labels_path) #print(sj_train.describe()) #print(iq_train.describe()) kf = KFold(n_splits=6) sj_model_list = [] sj_err_list = [] loop = 1 for train_index, val_index in kf.split( sj_train ): #The index will be split into [train_index] and [val_index] X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index] sj_etr = ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1) sj_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases']) predictions = sj_etr.predict(X_val.drop('total_cases', axis=1)) sj_err_list.append( eval_measures.meanabs(predictions, X_val.total_cases)) sj_model_list.append(sj_etr) loop += 1 print(sj_err_list) argmax = sorted(range(len(sj_err_list)), key=lambda x: sj_err_list[x])[0] print(argmax) sj_best_model = sj_model_list[argmax] iq_model_list = [] iq_err_list = [] loop = 1 for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] iq_etr = ETR(n_estimators=400, max_depth=4, random_state=0) iq_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases']) predictions = iq_etr.predict(X_val.drop('total_cases', axis=1)) iq_err_list.append( eval_measures.meanabs(predictions, X_val.total_cases)) iq_model_list.append(iq_etr) loop += 1 print(iq_err_list) argmax = sorted(range(len(iq_err_list)), key=lambda x: iq_err_list[x])[0] print(argmax) iq_best_model = iq_model_list[argmax] ##Accessing testing data test_features_path = os.path.join(data_path, 'dengue_features_test.csv') sj_test, iq_test = preprocess_data(test_features_path) #Calculate the k-fold validation error sj_score = [] for train_index, val_index in kf.split(sj_train): X_train, X_val = 
sj_train.ix[train_index], sj_train.ix[val_index] train_predict = np.array( sj_best_model.predict(X_val.drop('total_cases', axis=1))).astype(int) sj_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of sj_score is {} (+/- {})".format( kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score))) iq_score = [] for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] train_predict = np.array( iq_best_model.predict(X_val.drop('total_cases', axis=1))).astype(int) iq_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of iq_score is {} (+/- {})".format( kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score))) ##Use the model sj_lr and iq_lr trained before to predict the testing data print("Predicting testing data...") sj_predictions = sj_best_model.predict(sj_test) iq_predictions = iq_best_model.predict(iq_test) sj_predictions = np.array(sj_predictions).astype(int) iq_predictions = np.array(iq_predictions).astype(int) print("Creating submit file...") ##Use submission_format as template to write the answer sample_path = os.path.join(data_path, 'submission_format.csv') submission = pd.read_csv(sample_path, index_col=[0, 1, 2]) submission.total_cases = np.concatenate([sj_predictions, iq_predictions]) submission.to_csv("./data/ext_new.csv") '''