def model_cv_df(model_cv, X_train, X_test, y_train, y_test, cv_mapping, model_name):
    """Print train/test metrics for a fitted CV regressor and record them.

    The selected alpha, the train and test R-squared, and the test-set
    MAE / MSE / RMSE / MAPE are printed and then stored, in that order, as a
    list under ``cv_mapping[model_name]``.

    Parameters
    ----------
    model_cv
        Fitted estimator exposing ``alpha_``, ``predict`` and ``score``.
    X_train, y_train
        Training data, only used for the training R-squared.
    X_test, y_test
        Held-out data used for every test metric. ``y_test`` is the MAPE
        denominator.  NOTE(review): zeros in ``y_test`` would make the MAPE
        infinite or NaN -- confirm the target is never zero.
    cv_mapping
        Mapping that receives the metric list; mutated in place.
    model_name
        Key under which the metrics are stored.

    Returns
    -------
    None
    """
    from sklearn.metrics import mean_absolute_error
    import numpy as np
    from statsmodels.tools.eval_measures import mse, rmse

    # We are making predictions here
    y_preds_test = model_cv.predict(X_test)

    # Compute every metric exactly once; the same values feed both the
    # printed report and the stored list.
    best_alpha = model_cv.alpha_
    r2_train = model_cv.score(X_train, y_train)
    r2_test = model_cv.score(X_test, y_test)
    mae_test = mean_absolute_error(y_test, y_preds_test)
    mse_test = mse(y_test, y_preds_test)
    rmse_test = rmse(y_test, y_preds_test)
    mape_test = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

    print("\t-----TRAIN SET-----")
    print("\tBest alpha: {}".format(best_alpha))
    print("\tR-squared: {}\n".format(r2_train))
    print("\t-----TEST SET-----")
    print("\tR-squared: {}".format(r2_test))
    print("\tMean absolute error: {}".format(mae_test))
    print("\tMean squared error: {}".format(mse_test))
    print("\tRoot mean squared error: {}".format(rmse_test))
    print("\tMean absolute percentage error: {}".format(mape_test))

    cv_mapping[model_name] = [
        best_alpha,
        r2_train,
        r2_test,
        mae_test,
        mse_test,
        rmse_test,
        mape_test,
    ]
예제 #2
0
def overall_single_regressions(x_train, x_test, y_train, y_test):
    """
    Fit one single-variable OLS regression per column of ``x_train``.

    For every column this saves a scatter plot of the column against
    ``y_train``, a histogram of the column, and a text file holding the
    regression summary followed by the training and testing MSE. All
    artefacts are written under ``results/single-regressions/``.
    x and y should be Pandas dataframes.
    """
    for column in x_train.columns:
        axis_label = column.replace("-", " ").title()

        # Scatter of the regressor against the training target.
        plt.scatter(x_train[column], y_train)
        plt.ylabel("Days to first infection")
        plt.xlabel(axis_label)
        plt.savefig(f"results/single-regressions/{column}-scatter.png")
        plt.clf()

        # Distribution of the regressor on its own.
        plt.hist(x_train[column])
        plt.ylabel(axis_label)
        plt.savefig(f"results/single-regressions/{column}-histogram.png")
        plt.clf()

        # Use StatsModels to create the Linear Model and Output R-squared
        fitted = sm.OLS(y_train, x_train[column]).fit()

        summary_path = f"results/single-regressions/{column}-result-summary.txt"
        with open(summary_path, "w+") as rs:
            rs.write(fitted.summary().as_text())
            # Predictions are wrapped in a DataFrame so the MSE comes back as
            # a one-element result that is indexed with [0] below.
            train_pred = pd.DataFrame(fitted.predict(x_train[column]))
            training_mse = eval_measures.mse(y_train, train_pred)
            test_pred = pd.DataFrame(fitted.predict(x_test[column]))
            testing_mse = eval_measures.mse(y_test, test_pred)
            rs.write("\n training MSE: " + str(training_mse[0]))
            rs.write("\n testing MSE: " + str(testing_mse[0]))
예제 #3
0
def MLR(train, test, train_label, test_label):
    """Fit an OLS model with a constant term and return ``(train MSE, test MSE)``."""
    design_train = sm.add_constant(train)
    fitted = sm.OLS(train_label, design_train).fit()
    fitted_train = fitted.predict(design_train)

    design_test = sm.add_constant(test)
    fitted_test = fitted.predict(design_test)

    test_error = eval_measures.mse(test_label, fitted_test)
    train_error = eval_measures.mse(train_label, fitted_train)
    return train_error, test_error
예제 #4
0
def test_eval_measures():
    """Regression-test the eval_measures helpers on a fixed 4x5 example."""
    # mainly regression tests
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    # Each row: (assertion helper, measure, extra keyword arguments, expected).
    cases = [
        (assert_equal, iqr, {}, 5 * np.ones(5)),
        (assert_equal, iqr, {"axis": 1}, 2 * np.ones(4)),
        (assert_equal, iqr, {"axis": None}, 9),
        (assert_equal, mse, {}, np.array([73.5, 87.5, 103.5, 121.5, 141.5])),
        (assert_equal, mse, {"axis": 1}, np.array([3.0, 38.0, 123.0, 258.0])),
        (
            assert_almost_equal,
            rmse,
            {},
            np.array(
                [8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]
            ),
        ),
        (
            assert_almost_equal,
            rmse,
            {"axis": 1},
            np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
        ),
        (assert_equal, maxabs, {}, np.array([14.0, 15.0, 16.0, 17.0, 18.0])),
        (assert_equal, maxabs, {"axis": 1}, np.array([3.0, 8.0, 13.0, 18.0])),
        (assert_equal, meanabs, {}, np.array([7.0, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, meanabs, {"axis": 1}, np.array([1.4, 6.0, 11.0, 16.0])),
        (assert_equal, meanabs, {"axis": 0}, np.array([7.0, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, bias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, bias, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, medianbias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianbias, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, vare, {}, np.array([31.25, 31.25, 31.25, 31.25, 31.25])),
        (assert_equal, vare, {"axis": 1}, np.array([2.0, 2.0, 2.0, 2.0])),
    ]

    for check, measure, kwargs, expected in cases:
        check(measure(x, y, **kwargs), expected)
예제 #5
0
def test_eval_measures():
    """Regression-test the eval_measures helpers on a fixed 4x5 example."""
    # mainly regression tests
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    # Each row: (assertion helper, measure, extra keyword arguments, expected).
    cases = [
        (assert_equal, iqr, {}, 5 * np.ones(5)),
        (assert_equal, iqr, {"axis": 1}, 2 * np.ones(4)),
        (assert_equal, iqr, {"axis": None}, 9),
        (assert_equal, mse, {}, np.array([73.5, 87.5, 103.5, 121.5, 141.5])),
        (assert_equal, mse, {"axis": 1}, np.array([3.0, 38.0, 123.0, 258.0])),
        (
            assert_almost_equal,
            rmse,
            {},
            np.array(
                [8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]
            ),
        ),
        (
            assert_almost_equal,
            rmse,
            {"axis": 1},
            np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
        ),
        (assert_equal, maxabs, {}, np.array([14.0, 15.0, 16.0, 17.0, 18.0])),
        (assert_equal, maxabs, {"axis": 1}, np.array([3.0, 8.0, 13.0, 18.0])),
        (assert_equal, meanabs, {}, np.array([7.0, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, meanabs, {"axis": 1}, np.array([1.4, 6.0, 11.0, 16.0])),
        (assert_equal, meanabs, {"axis": 0}, np.array([7.0, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, bias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, bias, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, medianbias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianbias, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, vare, {}, np.array([31.25, 31.25, 31.25, 31.25, 31.25])),
        (assert_equal, vare, {"axis": 1}, np.array([2.0, 2.0, 2.0, 2.0])),
    ]

    for check, measure, kwargs, expected in cases:
        check(measure(x, y, **kwargs), expected)
def test_eval_measures():
    """Regression-test the eval_measures helpers, including ``rmspe``."""
    # mainly regression tests
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    def run(cases):
        # Each case: (assertion helper, measure, keyword arguments, expected).
        for check, measure, kwargs, expected in cases:
            check(measure(x, y, **kwargs), expected)

    run([
        (assert_equal, iqr, {}, 5 * np.ones(5)),
        (assert_equal, iqr, {"axis": 1}, 2 * np.ones(4)),
        (assert_equal, iqr, {"axis": None}, 9),
        (assert_equal, mse, {}, np.array([73.5, 87.5, 103.5, 121.5, 141.5])),
        (assert_equal, mse, {"axis": 1}, np.array([3.0, 38.0, 123.0, 258.0])),
        (
            assert_almost_equal,
            rmse,
            {},
            np.array(
                [8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]
            ),
        ),
        (
            assert_almost_equal,
            rmse,
            {"axis": 1},
            np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
        ),
    ])

    # rmspe: build the expected value by hand. Entries where x == 0 are first
    # treated as NaN (default call), then as 0.0 (``zeros=0`` call).
    rel_err = x - y
    nonzero = np.where(x != 0)
    rel_err[nonzero] /= x[nonzero]
    rel_err[np.where(x == 0)] = np.nan
    expected_nan = np.sqrt(np.nanmean(rel_err ** 2, 0) * 100)
    assert_almost_equal(rmspe(x, y), expected_nan)
    rel_err[np.where(np.isnan(rel_err))] = 0.0
    expected_zero = np.sqrt(np.nanmean(rel_err ** 2, 0) * 100)
    assert_almost_equal(rmspe(x, y, zeros=0), expected_zero)

    run([
        (assert_equal, maxabs, {}, np.array([14.0, 15.0, 16.0, 17.0, 18.0])),
        (assert_equal, maxabs, {"axis": 1}, np.array([3.0, 8.0, 13.0, 18.0])),
        (assert_equal, meanabs, {}, np.array([7.0, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, meanabs, {"axis": 1}, np.array([1.4, 6.0, 11.0, 16.0])),
        (assert_equal, meanabs, {"axis": 0}, np.array([7.0, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, bias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, bias, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, medianbias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianbias, {"axis": 1}, np.array([1.0, 6.0, 11.0, 16.0])),
        (assert_equal, vare, {}, np.array([31.25, 31.25, 31.25, 31.25, 31.25])),
        (assert_equal, vare, {"axis": 1}, np.array([2.0, 2.0, 2.0, 2.0])),
    ])
예제 #7
0
def perfstats(col, Y):
    """Fit a standardized OLS model per feature set and tabulate its metrics.

    ``col`` is iterated; each element is used as the feature set ``X`` of one
    model, numbered from 1. Every model is fit on an 80/20 split
    (``random_state=42``) of ``X`` and ``Y`` after standard scaling, with a
    constant term added. Returns a DataFrame with one row per model holding
    the in-sample fit statistics and the test-set MAE, MSE, RMSE and MAPE.

    Side effect: sets the global pandas ``display.float_format`` option to
    three decimals.
    """
    ##rsq,mae,mse,rmse,mape
    column_names = [
        'model', 'rsq', 'rsq_adj', 'f_value', 'aic', 'bic', 'mae', 'mse',
        'rmse', 'mape'
    ]
    pf = pd.DataFrame(columns=column_names)
    pd.options.display.float_format = '{:.3f}'.format

    for num, X in enumerate(col, 1):
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42)

        # Scale with statistics learned from the training part only.
        scaler = StandardScaler()
        design_train = sm.add_constant(scaler.fit_transform(x_train))
        design_test = sm.add_constant(scaler.transform(x_test))

        results = sm.OLS(y_train, design_train).fit()
        y_pred = results.predict(design_test)

        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        pf.loc[num] = (
            'model_' + str(num),
            results.rsquared,
            results.rsquared_adj,
            results.fvalue,
            results.aic,
            results.bic,
            mean_absolute_error(y_test, y_pred),
            mse(y_test, y_pred),
            rmse(y_test, y_pred),
            mape,
        )
    return pf
예제 #8
0
def regstats(col, Y):  #linear
    """Fit a standardized ``LinearRegression`` per feature set and tabulate it.

    ``col`` is iterated; each element is used as the feature set ``X`` of one
    model, numbered from 1 (the same convention as ``perfstats``). Every
    model is fit on an 80/20 split (``random_state=42``) of ``X`` and ``Y``
    after standard scaling.

    The previous body referenced ``X`` and ``num`` without ever defining them
    and ignored ``col``; the per-feature-set loop is restored here.

    Returns
    -------
    pandas.DataFrame
        One row per model with train/test R-squared, their difference, and
        the test-set MAE, MSE, RMSE and MAPE.
    """
    ##rsq,mae,mse,rmse,mape

    pf = pd.DataFrame(columns=[
        'model', 'rsq_train', 'rsq_test', 'subt_rsq', 'mae_test', 'mse_test',
        'rmse_test', 'mape_test'
    ])

    for num, X in enumerate(col, 1):
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=42)

        # Scale with statistics learned from the training part only.
        standardscaler = StandardScaler()
        x_train = standardscaler.fit_transform(x_train)
        x_test = standardscaler.transform(x_test)

        results = LinearRegression().fit(x_train, y_train)
        y_pred = results.predict(x_test)

        # Score each split once and reuse the values for the difference.
        rsq_train = results.score(x_train, y_train)
        rsq_test = results.score(x_test, y_test)

        pf.loc[num] = ('model_' + str(num), rsq_train, rsq_test,
                       rsq_train - rsq_test,
                       mean_absolute_error(y_test, y_pred),
                       mse(y_test, y_pred), rmse(y_test, y_pred),
                       (np.mean(np.abs((y_test - y_pred) / y_test)) * 100))
    return pf
예제 #9
0
    def fit(self, folds=3, thetas=(-2, -1, 0, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2)):
        """Select and fit the best theta model for every series in ``self.data``.

        Based on Kevin Sheppard's theta-model code. For each column, every
        candidate theta is scored by its mean out-of-sample MSE over the
        ``TimeSeriesSplit`` folds. The theta with the lowest mean is stored in
        ``self.best_theta[series]`` and the model re-estimated on the whole
        training slice is stored in ``self.fitted[series]``.

        Parameters
        ----------
        folds : int
            Number of ``TimeSeriesSplit`` splits used for validation.
        thetas : tuple of float
            Candidate theta values to evaluate.

        Returns
        ----------
        None
        """

        # Initialise the time-series cross-validation splitter
        kf = TimeSeriesSplit(n_splits=folds)

        for i, series in enumerate(self.data.columns):
            # Training slice of this series, up to the label train_ix - 1.
            # NOTE(review): .loc slicing is label-based; presumably the index
            # is a 0-based integer range -- confirm.
            x = self.data.loc[:self.train_ix[series] - 1, series]

            # One (folds, 1) array of fold errors per candidate theta.
            mspes = {t: np.empty((folds, 1)) for t in thetas}
            p = pd.DataFrame(None, index=["a0", "b0"], dtype=np.double)
            # NOTE(review): every fold key maps to the SAME DataFrame object
            # ``p``, so parameters written for one fold overwrite the previous
            # fold's. They are read back immediately below, so the forecasts
            # are unaffected, but ``params`` does not keep per-fold history.
            params = {i: p for i in range(folds)}

            fold_ix = 0
            for tr_ix, te_ix in kf.split(x):
                # Set up data
                x_tr, x_te = x.iloc[tr_ix], x.iloc[te_ix]

                # t = training length, k = forecast horizon for this fold
                t = x_tr.shape[0]
                k = x_te.shape[0]

                for theta in thetas:
                    # Estimate the different theta models
                    params[fold_ix][theta] = self.estimate(x_tr, theta)
                    # Forecast for different theta models:
                    b0 = params[fold_ix][theta]["b0"]
                    # New RHS for forecasting
                    # (rhs_oos is built here but not used further in this method)
                    rhs_oos = np.ones((k, 2))
                    rhs_oos[:, 1] = np.arange(k) + t + 1
                    # Exp. Smoothing term
                    fit_args = {"disp": False, "iprint": -1, "low_memory": True}
                    ses = ExponentialSmoothing(x_tr).fit(**fit_args)
                    alpha = ses.params.smoothing_level
                    # Actual forecasting
                    ses_forecast = ses.forecast(k)
                    # Trend adjustment added to the smoothing forecast, scaled
                    # by half of the estimated slope b0.
                    trend = (np.arange(k) + 1 / alpha - ((1 -alpha) ** t) / alpha)
                    trend *= 0.5 * b0
                    forecast = np.array(ses_forecast + trend)
                    mspes[theta][fold_ix] = mse(x_te, forecast)

                fold_ix += 1

            # Collapse the per-fold errors to one mean error per theta
            # (only values are replaced, so mutating during iteration is safe;
            # note that ``k`` is reused here as the theta key).
            for k, v in mspes.items():
                mspes[k] = np.mean(v)

            # Keep the theta with the smallest mean error, then refit on the
            # full training slice.
            self.best_theta[series] = min(mspes, key=mspes.get)
            self.fitted[series] = self.estimate(x, self.best_theta[series])
            self.fit_success = True
def print_evaluation_metrics(true, predicted):
    """Print MAE, MSE, RMSE and MAPE of ``predicted`` against ``true``."""
    mae_value = mean_absolute_error(true, predicted)
    print("Mean absolute error of the prediction is: {}".format(mae_value))

    mse_value = mse(true, predicted)
    print("Mean squared error of the prediction is: {}".format(mse_value))

    rmse_value = rmse(true, predicted)
    print("Root mean squared error of the prediction is: {}".format(rmse_value))

    # Percentage error is relative to the true values.
    mape_value = np.mean(np.abs((true - predicted) / true)) * 100
    print("Mean absolute percentage error of the prediction is: {}".format(
        mape_value))
예제 #11
0
def overall_multiregression(x_train, x_test, y_train, y_test):
    """
    Run a multiple regression over the whole timeline with all provided variables.

    A constant term is added to both feature sets, an OLS model is fit on the
    training data, and the summary plus training/testing MSE are written to
    ``results/multiregression-result-summary.txt``.
    x and y should be Pandas dataframes.

    Returns a tuple ``(R-squared, training MSE, testing MSE)``.
    """
    # Use StatsModels to create the Linear Model and Output R-squared
    design_train = sm.add_constant(x_train)
    design_test = sm.add_constant(x_test)
    fitted = sm.OLS(y_train, design_train).fit()

    with open("results/multiregression-result-summary.txt", "w+") as rs:
        rs.write(fitted.summary().as_text())
        # TODO (original author): "Something is messed up here"
        # Predictions are wrapped in a DataFrame so the MSE comes back as a
        # one-element result that is indexed with [0] below.
        train_pred = pd.DataFrame(fitted.predict(design_train))
        training_mse = eval_measures.mse(y_train, train_pred)
        test_pred = pd.DataFrame(fitted.predict(design_test))
        testing_mse = eval_measures.mse(y_test, test_pred)
        rs.write("\n training MSE: " + str(training_mse[0]))
        rs.write("\n testing MSE: " + str(testing_mse[0]))

    return (fitted.rsquared, training_mse[0], testing_mse[0])
예제 #12
0
def printErrors(test, pred, model):
    '''
    Print the error measures of a model's predictions.

    Inputs:
    test: test dataframe
    pred: predictions
    model: name of the model, used as a label in the printed lines
    Outputs:
    Prints mean absolute error, mean squared error and root mean squared
    error (each computed along axis 0, formatted to 4 significant digits).
    '''
    measures = (('MAE', meanabs), ('MSE', mse), ('RMSE', rmse))
    for label, measure in measures:
        value = measure(test, pred, axis=0)
        print(label + ' of ' + model + ': {:.4}'.format(value))
예제 #13
0
    def _evaluate_arima_model(X: Union[pd.Series, pd.DataFrame], arima_order: Tuple[int, int, int],
                              train_size: Union[float, int, None], freq: str) -> Tuple[float, dict]:
        """Fit an ARIMA model on the head of ``X`` and score it on the tail.

        ``train_size`` may be ``None`` (use 75% of the observations), a float
        (fraction of the observations) or an int (used as given). Returns the
        MSE of the forecast over the held-out tail and the fitted model.
        """
        # Resolve the split point to an absolute number of observations.
        if train_size is None:
            train_size = int(len(X) * 0.75)
        elif isinstance(train_size, float):
            train_size = int(len(X) * train_size)

        train = X[:train_size].astype(float)
        test = X[train_size:].astype(float)

        model_fit = ARIMA(train, order=arima_order, freq=freq).fit(
            disp=False, method='css', trend='nc')

        # calculate test error; element 0 of the forecast result is used as
        # the point forecast
        yhat = model_fit.forecast(len(test))[0]
        return mse(test, yhat), model_fit
예제 #14
0
def model_evaluation(y_test, X_test, fitted_model, training_time, start_time):
    """
    Score a fitted model on the test set and report timing information.

    :param y_test: target variable of the test set (its ``.values`` is used)
    :param X_test: predictor variables of the test set (its ``.values`` is used)
    :param fitted_model: a trained model implementing a predict() method
    :param training_time: the duration of training
    :param start_time: the timestamp at which training started
    :return dict: keys ``mse_test``, ``training_time`` and ``prediction_time``
    """
    actual = y_test.values
    predicted = fitted_model.predict(X_test.values)
    test_mse = mse(actual, predicted)

    # Everything elapsed since start_time that was not training is counted
    # as prediction time.
    elapsed = time.time() - start_time
    metrics = {
        'mse_test': test_mse,
        'training_time': training_time,
        'prediction_time': elapsed - training_time,
    }
    return metrics
def main():
    """Cross-validate a model of ``IR_TF`` on ``indVars`` in two ways and print both.

    Method 1 uses ``cross_val_score`` with a ``LogisticRegression``.
    Method 2 loops over the same folds with ``statsmodels`` ``Logit`` and
    collects, per fold: the test MSE, the test mean absolute error, and the
    relative difference between the train-fold and test-fold pseudo
    R-squared.

    Changes from the previous version:
      * indentation is normalized (tabs and spaces were mixed inside the loop);
      * the train/test pseudo R-squared values are appended to the lists
        whose names match them (they were swapped);
      * the "MAE" is a mean absolute error (it was the signed difference of
        the summed predictions and summed targets);
      * the test prediction is computed once per fold.

    NOTE(review): the ``KFold(n, 10)`` constructor, iterating ``kf`` directly,
    the ``'mean_squared_error'`` scoring name and ``.ix`` indexing are the
    APIs the original code used; they are kept as-is here -- confirm which
    library versions this script targets before modernizing them.
    """
    X = loansData[indVars]
    y = loansData['IR_TF']

    # Calculating via cross_val_score function
    kf = KFold(len(y), 10)
    lr = LogisticRegression()
    msescores = cross_val_score(lr, X, y, scoring='mean_squared_error', cv=kf, n_jobs=1)
    r2score = cross_val_score(lr, X, y, scoring='r2', cv=kf, n_jobs=1)
    maescore = cross_val_score(lr, X, y, scoring='mean_absolute_error', cv=kf, n_jobs=1)

    # Below is an alternative means to calculating the cross validation stats.
    # Seemed to give a more intuitive answer, however not fully certain it is
    # correct (original author's remark).
    mselist = []
    maelist = []
    r2list = []
    r2listtrain = []
    r2listtest = []
    for train, test in kf:
        # setting up train and test models
        logit = sm.Logit(y.ix[train], X.ix[train]).fit()
        logittest = sm.Logit(y.ix[test], X.ix[test]).fit()
        trainR = logit.prsquared
        testR = logittest.prsquared

        # Predict the held-out fold once with the train-fold model.
        test_pred = np.asarray(logit.predict(X.ix[test]))
        test_true = np.asarray(y.ix[test])

        # calculate MSE
        mselist.append(ste.mse(test_pred, test_true, axis=0))
        # calculate MAE
        maelist.append(np.mean(np.abs(test_pred - test_true)))
        # calculate % difference in R2 of each train-test pair
        r2list.append(abs((trainR - testR) / trainR))
        r2listtrain.append(trainR)
        r2listtest.append(testR)

    # printing results for both methods
    print('First Method Using cross_val_score function:')
    print('MSE: ' + str(np.mean(msescores)) + ' ,' + 'RSquared: ' + str(np.mean(r2score)) + ' ,' +
          'MAE: ' + str(np.mean(maescore)))
    print('Second Method Using for loop (answers seemed more intuitive):')
    print('Mean of MSE is ' + str(np.mean(mselist)))
    print('MAE is ' + str(np.mean(maelist)))
    print('The mean of R^2 percentage difference in each training and test samples is ' + "{:.0%}".format(np.mean(r2list)))
    print('The overall percentage difference of the total means of R^2 for training and test samples is ' + "{:.0%}".format(
        abs((np.mean(r2listtrain) - np.mean(r2listtest)) / np.mean(r2listtrain))))
def regression_stats(model):
    """Return a two-column DataFrame of train/test statistics for ``model``.

    Relies on the module-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. The ``stat`` column names each statistic and the ``model``
    column holds its value.
    """
    y_preds_test = model.predict(X_test)

    r2_train = model.score(X_train, y_train)
    r2_test = model.score(X_test, y_test)
    mae_test = mean_absolute_error(y_test, y_preds_test)
    mse_test = mse(y_test, y_preds_test)
    rmse_test = rmse(y_test, y_preds_test)
    mape_test = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

    # create df for model results
    stats_df = pd.DataFrame.from_dict({
        "stat": ["train R^2", "test R^2", "MAE", "MSE", "RMSE", "MAPE"],
        "model": [r2_train, r2_test, mae_test, mse_test, rmse_test, mape_test],
    })
    return stats_df
예제 #17
0
def daily_analysis(file_path):
    """Run a day-by-day analysis of the performance of the viral pressure metric.

    For every date, an OLS regression of the per-country ``y`` value on the
    per-country viral pressure is fit; its training MSE and R-squared are
    then plotted against the date and saved to
    ``results/MSE_daily_scatter.png`` and
    ``results/R_Squared_daily_scatter.png``.

    Fix: the training MSE is computed from predictions made on the regressor
    (``x_pressure``); the previous code passed the target ``y_days`` to
    ``predict``.

    :param file_path: forwarded to the project's data-loading helpers
    """
    # X maps date -> country -> viral pressure; y yields (country, date, value).
    X = get_viral_pressure_data(get_connectedness_data(file_path), file_path)
    y = process_y(get_case_data(file_path))

    # Pair each pressure value with its target value in place.
    for (country, date, val) in y:
        X[date][country] = (X[date][country], val)

    daily_results = {}
    for (date, data) in X.items():
        # NOTE(review): this assumes every country entry for the date was
        # paired with a target above -- confirm y covers all of X.
        [x_pressure, y_days] = zip(*data.values())
        results = sm.OLS(y_days, x_pressure).fit()
        training_mse = eval_measures.mse(y_days, results.predict(x_pressure))
        daily_results[date] = (training_mse, results.rsquared)

    # Named *_values so the local does not shadow any module-level ``mse``.
    [mse_values, rsquared_values] = zip(*daily_results.values())

    dates = [
        pd.to_datetime(d, format="%Y-%m-%d") for d in daily_results.keys()
    ]

    # Same plot recipe for both metrics: date axis as month-day, one point
    # per day, then save and clear the figure.
    plots = (
        (mse_values, "MSE", "results/MSE_daily_scatter.png"),
        (rsquared_values, "R-Squared", "results/R_Squared_daily_scatter.png"),
    )
    for values, ylabel, out_path in plots:
        ax = plt.gca()
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%m-%d"))
        plt.plot_date(dates, values)
        plt.ylabel(ylabel)
        plt.xlabel("Date")
        plt.savefig(out_path)
        plt.clf()
예제 #18
0
    def __evaluate_arima_model(self, ts, order, seasonal_order, test_ratio):
        """
        Train a model on the head of ``ts`` and score its forecast on the tail.

        :param ts: time series
        :param order: (p,d,q)
        :param seasonal_order: (P,D,Q,m)
        :param test_ratio: fraction of ``ts`` held out for testing
        :return: error (out-of-sample MSE), model, model_fit
        """
        # prepare training dataset
        split_at = int(len(ts) * (1 - test_ratio))
        train = ts[0:split_at]
        test = ts[split_at:]

        model, model_fit = self.__train_model(train, order, seasonal_order)

        # The seasonal result's forecast is used directly; for the
        # non-seasonal one, element 0 of the forecast result is used.
        if self.__seasonal:
            yhat = model_fit.forecast(len(test))
        else:
            yhat = model_fit.forecast(len(test))[0]

        # calculate out of sample error
        return mse(test, yhat), model, model_fit
예제 #19
0
def model_evaluation(pred_data, test_date_index, test_airport_index):
    """Print MAE, RMSE, WAE and RWSE of ``pred_data`` against the delay ratios.

    The reference values are read from ``DelayRatio.csv`` (missing values
    replaced by 0) and restricted to the given date rows and airport columns.
    The flight-count CSVs are also loaded and a count-weighted prediction is
    computed, but the weighted metrics that would use them are currently
    commented out. Returns None.
    """
    DelayRatio = pd.read_csv("DelayRatio.csv", index_col=0)
    actual = DelayRatio.fillna(0).iloc[test_date_index, test_airport_index]

    mae_score = np.mean(mae(actual, pred_data, axis=0))
    print('mae metric: ', mae_score)

    # Root of the mean of the per-column MSE values.
    rmse_score = np.mean(mse(actual, pred_data, axis=0))**0.5
    print('rmse metric: ', rmse_score)

    wae_score = wae_eval(pred_data, test_date_index, test_airport_index)
    print('wae metric: ', wae_score)

    rwse_score = rwse(pred_data, test_date_index, test_airport_index)
    print('rwse metric: ', rwse_score)

    # Arrival + departure counts, for the (disabled) weighted metrics below.
    arr_delayed = pd.read_csv("ArrDelayFlights.csv", index_col=0)
    dep_delayed = pd.read_csv("DepDelayFlights.csv", index_col=0)
    DelayFlights = arr_delayed + dep_delayed
    arr_total = pd.read_csv("ArrTotalFlights.csv", index_col=0)
    dep_total = pd.read_csv("DepTotalFlights.csv", index_col=0)
    TotalFlights = arr_total + dep_total

    w_pre_data = TotalFlights.iloc[test_date_index,
                                   test_airport_index] * pred_data
    #display(w_pre_data)
    #     w_mae_score = np.mean(mae(DelayFlights.iloc[test_date_index, test_airport_index],w_pre_data,axis=0))
    #     print ('w_mae metric: ',w_mae_score)

    #     w_rmse_score = np.mean(mse(DelayFlights.iloc[test_date_index, test_airport_index],w_pre_data,axis=0))**0.5
    #     print ('w_rmse metric: ',w_rmse_score)
예제 #20
0
def regframe(X, Y, mod, idx):
    """Fit ``mod`` on an unshuffled 80/20 split and return its metrics.

    Returns a ``pandas.Series`` named ``idx`` with train/test R-squared,
    their difference, test MAE / MSE / RMSE / MAPE, and the mean 10-fold
    (unshuffled) cross-validation score on the full data and on the training
    part.
    """
    ##rsq,mae,mse,rmse,mape
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=False)

    model = mod.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    k_fold = KFold(n_splits=10, shuffle=False)

    rsq_train = model.score(x_train, y_train)
    rsq_test = model.score(x_test, y_test)
    mape_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    metrics = {
        'rsq_train': rsq_train,
        'rsq_test': rsq_test,
        'subt_rsq': rsq_train - rsq_test,
        'mae_test': mean_absolute_error(y_test, y_pred),
        'mse_test': mse(y_test, y_pred),
        'rmse_test': rmse(y_test, y_pred),
        'mape_test': mape_test,
        'cross-score': cross_val_score(
            estimator=mod, X=X, y=Y, cv=k_fold).mean(),
        'cross-train': cross_val_score(
            estimator=mod, X=x_train, y=y_train, cv=k_fold).mean(),
    }
    df = pd.Series(metrics, name=idx)
    return df
예제 #21
0
        ],
                         axis=1)
        y = data_df['is_winner']
        return X, y

    X, y = load_file("tennis.csv")
    print(X)
    X = sm.add_constant(X)
    x_train, x_test, y_train, y_test = train_test_split(X.values, y, p)
    model = sm.OLS(y_train, x_train)
    results = model.fit()
    print(results.summary())

    train_y_cap = results.predict(x_train)
    y_cap = results.predict(x_test)
    training_MSE = eval_measures.mse(y_train, train_y_cap)
    testing_MSE = eval_measures.mse(y_test, y_cap)
    print('training r-squared: ' + str(results.rsquared))
    print('training MSE: ' + str(training_MSE))
    print('testing MSE: ' + str(testing_MSE))

    ##################################################################################
    # TODO: use train test split to split data into x_train, x_test, y_train, y_test #
    #################################################################################

    ##################################################################################
    # TODO: Use StatsModels to create the Linear Model and Output R-squared
    #################################################################################

    # Prints out the Report
    # TODO: print R-squared, test MSE & train MSE
예제 #22
0
파일: weather.py 프로젝트: zameeq/weather
# Finish and display the figure started above (axes labelled YEAR vs DEC).
ax.set_xlabel('YEAR')
ax.set_ylabel('DEC')
plt.show()

from sklearn import linear_model, feature_selection, preprocessing
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm
from statsmodels.tools.eval_measures import mse
from statsmodels.tools import add_constant
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for validation; the last column of ``df`` is the
# target and every other column is a feature.
X = df.values.copy()
X_train, X_valid, y_train, y_valid = train_test_split(X[:, :-1],
                                                      X[:, -1],
                                                      train_size=0.80)

# Fit OLS with an intercept once (the fit and summary were previously
# repeated verbatim).
# NOTE(review): ``OLS`` is accessed through ``statsmodels.formula.api`` as in
# the original -- confirm the installed statsmodels still exposes it there.
result = sm.OLS(y_train, add_constant(X_train)).fit()
result.summary()

# Validation error and predicted-vs-actual scatter.
ypred = result.predict(add_constant(X_valid))
print(mse(ypred, y_valid))
fig, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

# In[ ]:

# In[ ]:

# In[ ]:

# In[ ]:
예제 #23
0
# Split the data in half for training and testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5,
                                                    random_state=1)
###############################
# run OLS
from statsmodels.tools.eval_measures import mse, rmse

X_train_const = sm.add_constant(X_train)
#X_train_const['carat_2'] =  np.pow(X_train_const['carat'],2)
# The model is fit with a constant column, so the test design matrix needs
# the same column before predicting (it was previously predicted from the
# raw X_test).
X_test_const = sm.add_constant(X_test)

res = sm.OLS(y_train, X_train_const).fit()
print(res.summary())

ols_test_pred = res.predict(X_test_const)

#np.sqrt(mse(ols_test_pred, y_test))

mse(ols_test_pred, y_test)

plot_pred_vs_actual(ols_test_pred, y_test)

res.params.plot(kind='bar')

# NOTE(review): ``lasso`` is only created further down in this script, so
# this line fails on a fresh top-to-bottom run; presumably it relied on
# earlier interactive state -- confirm and move it after the LASSO fit.
pd.Series(lasso.coef_, index=X.columns).plot(kind='bar')

################################
# run LASSO
# Pick alpha by 10-fold cross-validation, then configure a plain Lasso with it.
lassocv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize=True)
lassocv.fit(X_train, y_train)
print("Alpha=", lassocv.alpha_)

lasso = Lasso()
lasso.set_params(alpha=lassocv.alpha_)
예제 #24
0
 def get_mse(self):
     """Return ``mse(self.y_test, self.y_predictor)`` for this instance."""
     return mse(self.y_test, self.y_predictor)
예제 #25
0
def cross_validation(all_variables, labels, ind_variables, title):
    """Compare four models by 5-fold cross-validated MSE and plot the result.

    For every fold a neural network (``build_model``), a multiple linear
    regression (``MLR``), a depth-5 decision tree and a mean baseline are
    evaluated on the train and test parts. The per-fold MSEs are drawn as a
    grouped bar plot, which is saved as SVG and then shown.

    Changes from the previous version:
      * the figure is saved before ``plt.show()`` (it was saved afterwards);
      * the fold count lives in one place (``n_splits``) instead of being
        repeated as the literal 5 in the table construction and averages;
      * the output path uses explicit ``\\\\`` escapes (same path as before);
      * train-side running sums that were never read are removed.

    Parameters
    ----------
    all_variables : DataFrame holding at least the ``ind_variables`` columns
    labels : object with ``to_numpy()``; the regression target
    ind_variables : column selection applied to ``all_variables``
    title : plot title, also used as the output file name

    Returns
    -------
    tuple
        ``(mlr, nn, dt, base, res)``: the mean test MSE of each model over
        the folds, and the flat list of all per-fold MSEs in the order
        MLR train/test, DT train/test, NN train/test, baseline train/test.
    """
    n_splits = 5

    features = all_variables[ind_variables].to_numpy()
    labels = labels.to_numpy()

    k_fold = KFold(n_splits=n_splits, random_state=0, shuffle=True)
    nn_test_mse_sum = mlr_test_mse_sum = dtr_test_mse_sum = base_test_mse_sum = 0
    res = []
    for train_indices, test_indices in k_fold.split(features):
        train = features[train_indices]
        train_label = labels[train_indices]
        test = features[test_indices]
        test_label = labels[test_indices]

        # neural network
        epoch = 200
        model = build_model(features.shape[1])
        history = model.fit(
            train, train_label,
            epochs=epoch, verbose=0)
        # plot_history(history)
        # evaluate() yields (loss, mae, mse) here; only the MSE is used.
        _, _, nn_test_mse = model.evaluate(test, test_label, verbose=2)
        # Training MSE is the value recorded for the last epoch.
        nn_train_mse = history.history['mse'][epoch-1]
        print('nn test mse: ' + str(nn_test_mse))
        print('nn train mse: ' + str(nn_train_mse))
        nn_test_mse_sum += nn_test_mse

        # multiple linear regression
        mlr_train_mse, mlr_test_mse = MLR(train, test, train_label, test_label)
        print('mlr test MSE: ' + str(mlr_test_mse))
        print('mlr train mse: ' + str(mlr_train_mse))
        mlr_test_mse_sum += mlr_test_mse

        # decision tree regression
        regr = DecisionTreeRegressor(max_depth=5)
        regr.fit(train, train_label)
        target_train = regr.predict(train)
        target_test = regr.predict(test)
        regression_train_mse = eval_measures.mse(train_label, target_train)
        print("Decision Tree Regression train MSE: ", regression_train_mse)
        regression_test_mse = eval_measures.mse(test_label, target_test)
        print("Decision Tree Regression test MSE: ", regression_test_mse)
        dtr_test_mse_sum += regression_test_mse

        # baseline: predict the mean of all labels of this fold's train and
        # test parts together (i.e. the overall label mean)
        mean = np.mean(list(train_label) + list(test_label))
        base_test_mse = sum([(label - mean)**2 for label in test_label])/len(test_label)
        print('baseline (mean) test mse: ' + str(base_test_mse))
        base_train_mse = sum([(label - mean)**2 for label in train_label])/len(train_label)
        print('baseline (mean) train mse: ' + str(base_train_mse))
        base_test_mse_sum += base_test_mse

        res.extend([mlr_train_mse, mlr_test_mse, regression_train_mse, regression_test_mse, nn_train_mse, nn_test_mse, base_train_mse, base_test_mse])

    # One (model, split) label pair per entry of ``res``, repeated per fold.
    df = pd.DataFrame([])
    df['Model'] = ['MLR', 'MLR', 'DT', 'DT', 'NN', 'NN', 'Baseline', 'Baseline'] * n_splits
    df['data'] = ['Train', 'Test', 'Train', 'Test', 'Train', 'Test', 'Train', 'Test'] * n_splits
    df['MSE'] = res
    sns.barplot(x="Model", y="MSE", hue="data", data=df, palette="Paired")
    plt.title(title)
    plt.legend(loc=[1, 1])
    plt.tight_layout()
    #plt.box(False)
    # Save first: after show() returns the figure may already be closed, so
    # saving afterwards can write an empty image.
    plt.savefig(fname="D:\\Brown_MS\\spring2020\\cs1951a\\cs1951a_final_project\\pictures\\models\\"+title,format="svg")
    plt.show()
    mlr = mlr_test_mse_sum / n_splits
    nn = nn_test_mse_sum / n_splits
    dt = dtr_test_mse_sum / n_splits
    base = base_test_mse_sum / n_splits
    return mlr, nn, dt, base, res
 130,
 {'1%': -3.4816817173418295,
  '5%': -2.8840418343195267,
  '10%': -2.578770059171598},
 996.6929308390189)
'''

# Show the documentation of the augmented Dickey-Fuller test helper.
help(adfuller)



# Run the test on the passenger series of df1 and keep the result.
dftest = adfuller(df1['Thousands of Passengers'])

from statsmodels.tools.eval_measures import mse, rmse, meanabs

# MSE between the 'test' and 'predictions' columns of df.
mse(df['test'],df['predictions']) # value noted by the original author: 17.02

# Reload the airline data with the 'Month' column parsed as a datetime index.
df4 = pd.read_csv('airline_passengers.csv', index_col = 'Month',parse_dates= True )

# Declare the index frequency as 'MS'.
df4.index.freq = 'MS' 

from statsmodels.graphics.tsaplots import month_plot, quarter_plot

# Seasonal plot of the monthly series.
month_plot(df4['Thousands of Passengers']);


# Resample to rule 'Q', taking the mean of each period, and plot it by quarter.
dfq = df4['Thousands of Passengers'].resample(rule = 'Q').mean()

quarter_plot(dfq)

예제 #27
0
파일: stats.py 프로젝트: ndtallant/tallant
def MSE(y_pred, y_true):
    """Return the result of ``mse(y_pred, y_true)``.

    Thin wrapper: both arguments are forwarded unchanged, in the same order,
    to the ``mse`` helper that is in scope in this module (presumably the
    statsmodels mean-squared-error function -- confirm against the imports).
    """
    error = mse(y_pred, y_true)
    return error
예제 #28
0
        np.logical_and(y >= LOW_THRESHOLD, y <= HIGH_THRESHOLD)).tolist()
    data_groups["HIGH"] = np.argwhere(y > HIGH_THRESHOLD).tolist()
    data_groups["ALL"] = list(range(len(y)))

    # Loops through each testing group to perform the regression:
    for label, indices_list in data_groups.items():

        # Indices list is plain list of numbers (had to do this because np.argwhere returns list of lists)
        indices = indices_list
        if label != "ALL":
            indices = [item for sublist in indices_list for item in sublist]

        print('Current Testing Group: ' + label)
        print('Variables used: ' + ", ".join(variables))
        print('Number of countries: ' + str(len(indices)))

        cur_X = X[indices]
        cur_y = y[indices]

        cur_X = sm.add_constant(cur_X)

        model = sm.OLS(cur_y, cur_X)
        results = model.fit()

        mse = eval_measures.mse(cur_y, results.predict(cur_X))

        print(results.summary())
        print('R-squared = ' + str(results.rsquared))
        print('MSE = ' + str(mse))
        print("-----------------------------------------------")
            if 0 <= datetime.weekday() <= 4:
                one_hot_date[i][0] = 1

        X = np.append(X, one_hot_weather, axis=1)
        X = np.append(X, one_hot_borough, axis=1)
        X = np.append(X, one_hot_time, axis=1)
        X = np.append(X, one_hot_date, axis=1)
        return (X, y)

    X, y = load_file("fullDeliverable.db")
    x_train, x_test, y_train, y_test = train_test_split(X, y, p)
    X = sm.add_constant(x_train)
    model = sm.OLS(y_train, X)
    results = model.fit()
    print(results.summary())

    average = np.mean(y_train)
    train_predict = results.predict(X)
    train_MSE = eval_measures.mse(y_train, train_predict)
    regular_train_MSE = eval_measures.mse(y_train, average)
    regular_test_MSE = eval_measures.mse(y_test, average)
    X1 = sm.add_constant(x_test)
    test_predict = results.predict(X1)
    test_MSE = eval_measures.mse(y_test, test_predict)

    print("Baseline Train MSE: " + str(regular_train_MSE))
    print("Baseline Test MSE: " + str(regular_test_MSE))
    print("Regression Train MSE: " + str(train_MSE))
    print("Regression Test MSE: " + str(test_MSE))
    def ModelLinearRegression1(self):
        """Fit and compare linear regressions on the CSV found at ``self.path``.

        The data is read through ``ReadCSV.Read_CSV().Read``; the frame, its
        ``describe()`` output and its correlation matrix are printed.  The rows
        are then split 80/20 into training and validation sets, using every
        column but the last as features and the last column as the target.
        Three models are fitted on the training part and scored on the
        validation part:

        * a statsmodels OLS on all features,
        * a statsmodels OLS on the feature at column index 2 only,
        * a scikit-learn ``LinearRegression`` on all features (also
          cross-validated on the training part with ``scoring='r2'``).

        All results are reported with ``print``; the method returns ``None``.

        Every ``print`` is written as a single-argument call so the method
        parses on Python 3 and still prints the same text on Python 2.
        """
        print('+++++++++++++++++++++++++ MULTI-DIM LINEAR REGRESSION 1 +++++++++++++++++++++++++')

        # Read csv file: first get a reader handle, then load self.path.
        comLRHandle = ReadCSV.Read_CSV()
        data = comLRHandle.Read(self.path)

        # Print the data and its summary statistics.
        print(data)
        print(data.describe())

        # The data is multi-dimensional (more than 2 dims), so a single scatter
        # plot cannot describe it.  Print the correlation matrix to get a
        # first idea of how the columns relate to each other.
        print('---------------------------- CORRELATION MATRIX ---------------------------- ')
        print(data.corr())
        print('-----------------------------------------------------------------------------')

        # (Optional) scatter each feature column against avg_points_scored to
        # inspect the relationships visually.
        #
        # The data is highly deviated against avg_points_scored, so it will be
        # difficult for the model to analyze and predict. Let's see what happens.

        # Break the data in 2 pieces (training data (80%) and test data (20%)).
        # NOTE(review): `cross_validation` was removed from scikit-learn in
        # 0.20 in favour of `model_selection`; the import lives outside this
        # method, so it is left untouched here.
        dataX = data.values.copy()
        TdataX, VdataX, TdataY, VdataY = cross_validation.train_test_split(dataX[:,:-1],
                                                                           dataX[:, -1],
                                                                           train_size= 0.80)
        #  height|weight|success_field_goals|success_free_throws|avg_points_scored
        # |<--------------------------------------------------->|<--------------->|
        # |<-----------------TdataX/VdataX--------------------->|<-TdataY/VdataY->|
        # |<------------Independent Variables ----------------->|<-Dep Variable-->|

        # Ordinary Least Squares (OLS) regression on all features.
        ols = sm.OLS(TdataY, add_constant(TdataX)).fit()
        print(ols.summary())

        # OLS regression on success_field_goals only (column index 2).
        olsSFG = sm.OLS(TdataY, add_constant(TdataX[:,2])).fit()
        print(olsSFG.summary())

        # MSE of the two models above on the validation set.
        TPredictedOLSY = ols.predict(add_constant(VdataX))
        print(mse(TPredictedOLSY, VdataY))
        TPredictedOLSSFGY = olsSFG.predict(add_constant(VdataX[:,2]))
        print(mse(TPredictedOLSSFGY, VdataY))

        # Now the same regression with the sklearn package.

        # Create the Linear Regression model handle.
        lm = linear_model.LinearRegression()

        # Train the model.
        lm.fit(TdataX, TdataY)

        # Intercept and weights.  zip() stops at the shorter input, so the
        # trailing target column name is never paired with a coefficient.
        # list() makes the pairs a concrete sequence on Python 3, where zip()
        # returns an iterator.
        print('Intercept is %f' % lm.intercept_)
        print(pd.DataFrame(list(zip(data.columns, lm.coef_)), columns = ['features','estimatedCoefficients']))

        # Cross validate on the training part.
        CV = cross_validation.cross_val_score(lm, TdataX, TdataY, scoring='r2')
        print(CV)

        # See how the model predicts on the validation set and what the MSE is.
        TPredictedLMY = lm.predict(VdataX)
        print(mean_squared_error(TPredictedLMY, VdataY))
예제 #31
0
# Notebook-style walkthrough: Granger causality tests, forecast error metrics
# and seasonal plots.
# NOTE(review): `df3`, `df1`, `np` and `pd` are not defined in this section;
# they are presumably created earlier in the script -- confirm.
from statsmodels.tsa.stattools import grangercausalitytests

# Run the Granger causality tests for lags 1..5 on two column pairs.  Per the
# statsmodels documentation the test asks whether the second column
# Granger-causes the first, i.e. here whether 'd' helps predict 'a' / 'b'.
grangercausalitytests(df3[['a', 'd']], maxlag=5)
grangercausalitytests(df3[['b', 'd']], maxlag=5)

# Fix the seed so the random frame below is reproducible.
np.random.seed(42)

# 50 rows x 2 columns of random integers drawn from [20, 30), used as a toy
# pair of "actual" and "predicted" series.
df = pd.DataFrame(np.random.randint(20, 30, (50, 2)),
                  columns=['test', 'predictions'])

df.head()

df.plot(figsize=(12, 8))

from statsmodels.tools.eval_measures import mse, rmse, meanabs

# Mean squared error, root mean squared error and mean absolute error between
# the two columns.  The values are not stored; they are only displayed when
# run interactively.
mse(df['test'], df['predictions'])
rmse(df['test'], df['predictions'])
meanabs(df['test'], df['predictions'])

df1.head()
df1.index

from statsmodels.graphics.tsaplots import month_plot, quarter_plot

# Seasonal plot by month of the 'Pass_K' column.
month_plot(df1['Pass_K'])

# Resample with rule 'Q', summing each bin, then plot with quarter_plot.
df1q = df1['Pass_K'].resample(rule='Q').sum()
quarter_plot(df1q)
# Report two OLS fits and compare their validation error.
# Every print is a single-argument call, so the script parses on Python 3 and
# still prints the same text on Python 2.
# NOTE(review): `result`, `X_train`, `y_train`, `X_valid`, `y_valid`, `sm`,
# `add_constant`, `mse` and `plt` are expected to be defined earlier in the
# script.
print("Printing OLS fit summary: ")
print(result.summary())
print("\n")

# use different combination of variables: fit on column index 2 only
print("\n")
print("Printing alternative OLS fit summary: ")
result_alt = sm.OLS(y_train, add_constant(X_train[:, 2])).fit()
print(result_alt.summary())
print("\n")

# apply the full model to the validation set
ypred = result.predict(add_constant(X_valid))
print("Printing mean squared error: ")
# mean-square error
print(mse(ypred, y_valid))
print("\n")

# predict the validation set with the single-column model
ypred_alt = result_alt.predict(add_constant(X_valid[:, 2]))
print("Printing mean squared error of alternate model: ")
print(mse(ypred_alt, y_valid))
print("\n")

# actual vs. predicted scatter for the full model
fig7, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")

# alternate model
fig8, ax = plt.subplots(1, 1)
예제 #33
0
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.tools.eval_measures as ste

# Compare a linear and a quadratic OLS fit on toy data by their held-out MSE.

# Set seed for reproducible results
np.random.seed(414)

# Gen toy data: a sine wave on top of a linear trend, with Gaussian noise
X = np.linspace(0, 15, 1000)
y = 3 * np.sin(X) + np.random.normal(1 + X, 0.2, 1000)

# Split into disjoint train / test parts.  The test slice must start where the
# training slice ends; starting it earlier would put training points in the
# test set and make the reported test MSE optimistic.
n_train = 700
train_X, train_y = X[:n_train], y[:n_train]
test_X, test_y = X[n_train:], y[n_train:]

train_df = pd.DataFrame({"X": train_X, "y": train_y})
test_df = pd.DataFrame({"X": test_X, "y": test_y})

# Linear Fit
poly_1 = smf.ols(formula="y ~ 1 + X", data=train_df).fit()
print(ste.mse(poly_1.predict(test_df), test_y, axis=0))

# Quadratic Fit
poly_2 = smf.ols(formula="y ~ 1 + X + I(X**2)", data=train_df).fit()
print(ste.mse(poly_2.predict(test_df), test_y, axis=0))
예제 #34
0
# Since the 3rd variable is significant and the others aren't, based on the
# p-value, we recreate the model using only that variable.
# NOTE(review): `result`, `X_train`, `y_train`, `X_valid`, `y_valid`, `sm`,
# `add_constant`, `mse` and `plt` are expected to come from earlier cells.

# <codecell>

result_alternate = sm.OLS(y_train, add_constant(X_train[:, 2])).fit()
result_alternate.summary()

# <markdowncell>

# Let's predict on the test data and see how large the error is.

# <codecell>

# Single-argument print calls parse on Python 3 and print the same text on
# Python 2.
ypred = result.predict(add_constant(X_valid))
print(mse(ypred, y_valid))

ypred_alternate = result_alternate.predict(add_constant(X_valid[:, 2]))
print(mse(ypred_alternate, y_valid))

# <markdowncell>

# Let's see the actual vs predicted values for the 1st model.

# <codecell>

fig, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
plt.show()
    ##################################################################################
    # TODO: Use StatsModels to create the Linear Model and Output R-squared
    #################################################################################
    x_train = sm.add_constant(x_train)
    model = sm.OLS(y_train, x_train)
    results = model.fit()

    # Prints out the Report
    # TODO: print R-squared, test MSE & train MSE
    print('r-squared')
    print(results.rsquared)

    x_test = sm.add_constant(x_test)
    test_pred = results.predict(x_test)
    print('MSE test')
    print(eval_measures.mse(y_test, test_pred))

    train_pred = results.predict(x_train)
    print('MSE train')
    print(eval_measures.mse(y_train, train_pred))

    print('Summary')
    print(results.summary())

#     r-squared
# 0.4610741855183632
# MSE test
# 0.12307815170253002
# MSE train
# 0.12598380510417417