def setUp(self):
    # Test fixture: builds a mix of valid and (apparently) deliberately
    # invalid Model/Regression instances for the individual tests to use.
    self.model1 = Model.Model("./data/simple", ".csv")
    self.files = self.model1.set_files_in_directory()
    self.model2 = Model.Model("./data/student", ".csv")
    # NOTE(review): "./datfff" looks like a nonexistent directory and ".txt"
    # a non-matching extension -- presumably negative-test fixtures; confirm.
    self.model3 = Model.Model("./datfff", ".csv")
    self.model4 = Model.Model("./data/student", ".txt")
    self.model1out = self.model1.set_dataframes(self.files)
    # NOTE(review): model2 is fed model1's file list here -- verify this
    # cross-wiring is intentional in the tests that use model2out.
    self.model2out = self.model2.set_dataframes(self.files)
    self.regression1 = Regression.Regression(self.model1out)
    # NOTE(review): regression2 receives the Model object itself rather than
    # a set_dataframes() result -- confirm against the tests that use it.
    self.regression2 = Regression.Regression(self.model2)
def main():
    """Load the ./data/simple dataset, fit a regression, and report results.

    Dispatches on the fitted model's class name: univariate models are
    evaluated against the independent variable and their parameter history
    plotted; multivariate models are evaluated against the dependent
    variable and their cost history plotted.
    """
    model = Model.Model("./data/simple", ".csv")
    file_list = model.set_files_in_directory()
    frames = model.set_dataframes(file_list)
    regression = rg.Regression(frames)

    training_data = regression.split_data()[0]
    names = regression.get_columnNames(training_data)
    ind, dep = regression.get_data(columns_names=names, training_data=training_data)

    lr = regression.run(training_data)
    model_kind = lr.__class__.__name__

    if model_kind == "UnivariateLR":
        m, b = lr.run()
        print(m, b)
        y_hat = lr.predict(m, b)
        print(lr.evaluate_model(ind, y_hat))
        m, b = lr.get_params_history()
        lr.plot_history_m(m)
    elif model_kind == "MultivariateLR":
        B, cost_history = lr.run()
        y_hat = lr.predict(B)
        print(lr.evaluate_model(dep, y_hat))
        lr.plot_cost(cost_history)
def analyze_regression(x1, x2, y, method='ols', n_folds=5, data_name='data'):
    """Sweep polynomial degree (and lambda for regularized methods) and save
    train/test error metrics to '<filename>.csv'.

    Parameters
    ----------
    x1, x2 : array-like
        Input coordinates passed to create_design_matrix.
    y : array-like
        Response values.
    method : str
        Regression method; 'ols' disables the lambda sweep.
    n_folds : int
        Number of CV folds; if <= 1, fit/evaluate on the full data instead
        (test-set metrics are then None).
    data_name : str
        Tag used in the output CSV filename.
    """
    max_degree = 20
    n_lambdas = 9
    lambdas = np.logspace(-3, 3, n_lambdas)
    if method == 'ols':
        lambdas = [0]  # OLS has no regularization parameter to sweep

    filename = 'error_scores_' + data_name + '_' + method
    if n_folds > 1:
        filename += '_cv'

    columns = ['degree', 'lambda', 'MSE_train', 'MSE_test', 'R2_train',
               'R2_test', 'bias_train', 'bias_test', 'var_train', 'var_test']
    # Collect rows in a list and build the DataFrame once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and appending inside the loop was O(n^2) anyway.
    rows = []

    for lambda_ in lambdas:
        for deg in range(1, max_degree + 1):
            X = create_design_matrix(x1, x2, deg=deg)
            if n_folds > 1:
                (mse_train, mse_test, r2_train, r2_test, bias_train,
                 bias_test, var_train, var_test) = cross_validation(
                    X, y, n_folds, method, lambda_)
            else:
                # No CV: fit and evaluate on the full data set; there is no
                # held-out split, so the test metrics are undefined.
                model = Regression(method, lambda_=lambda_)
                model.fit(X, y)
                model.predict(X)
                mse_train = mean_squared_error(model.y, model.y_pred)
                r2_train = r2_score(model.y, model.y_pred)
                bias_train = bias(model.y, model.y_pred)
                var_train = np.var(model.y_pred)
                mse_test = r2_test = bias_test = var_test = None
            rows.append({'degree': deg, 'lambda': lambda_,
                         'MSE_train': mse_train, 'MSE_test': mse_test,
                         'R2_train': r2_train, 'R2_test': r2_test,
                         'bias_train': bias_train, 'bias_test': bias_test,
                         'var_train': var_train, 'var_test': var_test})

    error_scores = pd.DataFrame(rows, columns=columns)
    print(error_scores)
    error_scores.to_csv(filename + '.csv')
def build(self):
    """Create the root ScreenManager and register every application screen."""
    manager = ScreenManager()
    manager.transition = SwapTransition()
    # Main menu first, then each numerical-method screen; every screen gets
    # a back-reference to the manager plus its navigation name.
    manager.add_widget(MainMenu())
    manager.add_widget(bm.BracketMethods(screenManager=manager, name='bracket_methods'))
    manager.add_widget(om.OpenMethods(screenManager=manager, name='open_methods'))
    manager.add_widget(soe.SystemOfEquations(screenManager=manager, name='system_equations'))
    manager.add_widget(ip.Interpolation(screenManager=manager, name='interpolation'))
    manager.add_widget(rg.Regression(screenManager=manager, name='regression'))
    return manager
def cross_validation(X, y, n_folds, method='ols', lambda_=0.01):
    """Run k-fold cross-validation and return fold-averaged metrics.

    Returns the mean train/test MSE, R2, bias, and prediction variance
    (eight scalars, train value before test value for each metric).
    """
    if len(y.shape) > 1:
        y = np.ravel(y)

    splitter = KFold(n_splits=n_folds, random_state=0, shuffle=True)

    # scores[fold] = (train_value, test_value) for each metric
    mse = np.zeros((n_folds, 2))
    r2 = np.zeros((n_folds, 2))
    b = np.zeros((n_folds, 2))
    var = np.zeros((n_folds, 2))

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X)):
        model = Regression(method, lambda_)
        model.fit(X[train_idx], y[train_idx])

        model.predict(X[train_idx])
        pred_train = model.y_pred
        model.predict(X[val_idx])
        pred_val = model.y_pred

        mse[fold] = (mean_squared_error(y[train_idx], pred_train),
                     mean_squared_error(y[val_idx], pred_val))
        r2[fold] = (r2_score(y[train_idx], pred_train),
                    r2_score(y[val_idx], pred_val))
        b[fold] = (bias(y[train_idx], pred_train),
                   bias(y[val_idx], pred_val))
        var[fold] = (np.var(pred_train), np.var(pred_val))

    # Average each metric over the folds (column 0 = train, column 1 = test).
    mse_train, mse_test = np.mean(mse, axis=0)
    r2_train, r2_test = np.mean(r2, axis=0)
    b_train, b_test = np.mean(b, axis=0)
    var_train, var_test = np.mean(var, axis=0)

    return mse_train, mse_test, r2_train, r2_test, b_train, b_test, var_train, var_test
########DEFINE COLLECTION FIELDS########## print('DATA_COLLECTION_BEGIN') inputPeriods = [5, 10, 20, 50, 100, 200] pastReturnPeriods = [1, 2, 5, 10, 20, 50, 100] retPeriods = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50] adjClose = ifld.AdjClose() longVolume = ifld.SMA(100, ifld.AdjVolume()) collectionFields = [] #import random #randomSymbols = random.sample(list(stockData.index.get_level_values('Symbol').unique()),2) #stockData = stockData[stockData.index.get_level_values('Symbol').isin(randomSymbols)] linRegressions = [] for period in inputPeriods: linearReg = reg.Regression(period, adjClose) linRegressions.append(linearReg) collectionFields.extend(linearReg.getRegFieldsList()) sdPeriod = ifld.SD(period, ifld.PcntChange(1, False, adjClose), 'SD_PCNT_' + str(period)) rollingMin = ifld.RollingMin(period, adjClose) rollingMax = ifld.RollingMax(period, adjClose) minDuration = ifld.ExtremeDuration(period, adjClose, False, 'Min_Duration_' + str(period)) maxDuration = ifld.ExtremeDuration(period, adjClose, True, 'Max_Duration_' + str(period)) minDurationLag = ifld.Lag(minDuration, 1, 'Min_Duration_' + str(period) + '_Lag') maxDurationLag = ifld.Lag(maxDuration, 1, 'Max_Duration_' + str(period) + '_Lag') retracedFromHigh = ifld.Divide(ifld.RetracementPcnt(period, True),
meanY = float(Y.mean().values) X = dataset.drop(columns=['ERP', 'PRP', 'vendor name', 'model name']) # # Separation between train dataset and test dataset with train_frac index_separation = int(data_lenght * train_frac) Xtrain = X.iloc[:index_separation] Ytrain = Y.iloc[:index_separation] Xtest = X.iloc[index_separation:] Ytest = Y.iloc[index_separation:] return Xtrain, Ytrain, Xtest, Ytest, meanY # Preparing the values and initiating the regression class # ---------------------------------------------------------- X, Y, Xtest, Ytest, meanY = prepareValues(verbose=True) Regression = rd.Regression(X, Y, verbose=True, unified=False) # # Training the model with X and Y sets # # ------------------------------------- print(tc.WARNING + "--> Training our regression model..." + tc.ENDC) Regression.train_model() print(tc.OKGREEN + " Training phase of the model finished!" + tc.ENDC) print(tc.OKGREEN + " Output model of the training (beta) :" + tc.ENDC) print(Regression.beta) # # Training the model with X and Y sets # # ------------------------------------- print(tc.WARNING + "--> Testing the model with the last 20% of the dataset!" + tc.ENDC) average_error = Regression.test_model(Xtest, Ytest) print(tc.OKGREEN + " Average error :" + tc.ENDC, average_error)
import Graph as g
import File as f
import Regression as r
import numpy as np


def func(l, g):
    """Model function: angular frequency as a function of length and g."""
    return np.power(g, 0.5) * np.power(l, -0.5)


# Read the measured pendulum data, fit the model, and plot the result.
data_file = f.File("data.csv")
x, y, yerr = data_file.read_x_y_values_from_file()

fit = r.Regression(x, y, yerr, func)
fit.fit_data()

graph = g.Graph(x, y, yerr, func, fit.popt[0])
graph.title = r'Calculating $g$ through measuring the period of a pendulum'
graph.x_caption = r'Length of pendulum $l/\textrm{m}$'
graph.y_caption = r'Angular frequency of pendulum $\omega /\textrm{s}^{-1}$'
graph.text = r'$g = ' + str(graph.g) + r' \pm 0.06$'
graph.text_x = 0.35
graph.text_y = 7.5
graph.show_graph()
from Regression import *
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Compare linear, ridge, and lasso regression on the iris data, batch by
# batch, recording MSE and R2 for each model at every step.
irisdata = datasets.load_iris()
X_train, X_test, Target_train, Target_test = train_test_split(
    irisdata.data, irisdata.target, test_size=.4)

Batch_size = 10
epoch_num = int(len(X_train) / Batch_size)

MeanSquareError = np.zeros((3, epoch_num))
R2Score = np.zeros((3, epoch_num))

for epoch in range(epoch_num):
    # BUG FIX: the original sliced X_train[epoch:epoch + Batch_size], which
    # advances by only one row per iteration, so consecutive "batches"
    # overlapped by Batch_size - 1 rows and most of the training data past
    # index epoch_num + Batch_size was never used.  Step by whole batches.
    start = epoch * Batch_size
    X_batch = X_train[start:start + Batch_size, :]
    Y_batch = Target_train[start:start + Batch_size]

    Reg = Regression(X_batch, Y_batch)
    LinReg, _ = Reg.LinearRegression(X_batch)
    RigReg, _ = Reg.RidgeRegression(X_batch, alpha=0.1)
    LasReg, _ = Reg.LassoRegression(X_batch, alpha=0.1)

    LinReg_Eval = Evaluation(LinReg, Y_batch)
    RigReg_Eval = Evaluation(RigReg, Y_batch)
    LasReg_Eval = Evaluation(LasReg, Y_batch)

    MeanSquareError[0, epoch] = LinReg_Eval.MeanSquarErr()
    MeanSquareError[1, epoch] = RigReg_Eval.MeanSquarErr()
    MeanSquareError[2, epoch] = LasReg_Eval.MeanSquarErr()
    R2Score[0, epoch] = LinReg_Eval.R2Square()
    R2Score[1, epoch] = RigReg_Eval.R2Square()
    R2Score[2, epoch] = LasReg_Eval.R2Square()

fig = plt.figure()
# Command-line driven stock-prediction pipeline: load price data, split it
# into train/validation sets, fit a regression, then evaluate and plot.
args = parser.parse_args()
stock_code = args.code    # stock symbol to analyse
expect_tag = args.label   # target column / label to predict
method = args.method      # regression method identifier
stockcodes_list = ['000001']
filenames_list = ["5min/000001.csv"]
expect_day = '2018-01-18'  # date whose value is being predicted
his_num = 5  # NOTE(review): presumably the history-window length -- confirm
# print(stock_code)
fast_data_searcher = FastResearchData(stock_code, stockcodes_list, filenames_list)
stock_data = fast_data_searcher.run()
# calculator = CalCorrMatrix()
data_preparer = PreProcessor(stock_data, expect_day, expect_tag, his_num)
valid_set, train_set, valid_tag, train_tag = data_preparer.run()
regress = Regression(valid_set, train_set, valid_tag, train_tag, method)
pred_result = regress.run()
print(pred_result)
evaluator = Evaluate(valid_set, valid_tag, pred_result, expect_tag, method)
evaluator.run()
drawer = PicDrawer(method, valid_tag, pred_result)
drawer.picDrawer()
def test_Regression_fit(method='ols'):
    """Smoke-test Regression fit/predict on noisy polynomial data.

    Fits a degree-5 polynomial design matrix to y = 5x^2 + noise and checks
    that the model produces finite coefficients and predictions.  The noise
    is seeded so the test is reproducible.
    """
    # Data generation
    N = 100  # data size
    p = 5    # polynomial degree
    np.random.seed(0)
    x = np.random.rand(N, 1)
    y = 5 * x * x + 0.1 * np.random.randn(N, 1)

    # Design matrix X: columns are x^0 .. x^p
    X = np.ones((N, p + 1))
    for i in range(1, p + 1):
        X[:, i] = x[:, 0] ** i

    test_model = Regression(method=method, lambda_=0.01)
    test_model.fit(X, y)
    beta = test_model.beta
    test_model.predict(X)
    y_pred = test_model.y_pred
    r2 = r2_score(test_model.y, test_model.y_pred)
    mse = mean_squared_error(test_model.y, test_model.y_pred)

    # The original computed beta/mse/r2 but asserted nothing and carried a
    # scikit-learn comparison block commented out AND duplicated verbatim;
    # the dead copies were removed.  These sanity checks make the test
    # non-vacuous without depending on the unfinished skl_fit comparison:
    assert np.all(np.isfinite(beta))
    assert np.all(np.isfinite(y_pred))
    assert mse >= 0.0
    assert r2 <= 1.0
    # TODO: re-enable an exact comparison against scikit-learn, e.g.
    # assert mean_squared_error(beta, beta_skl) < 1e-15, once skl_fit works.
from pylab import *
from numpy import *
from Regression import *

Reg = Regression()

"""Load in data and calculate the split ratio"""
data = loadtxt('Q1.data')
p = 13

"""Shuffle Data"""
data = data.reshape(-1, p + 1)
# BUG FIX: in Python 3, range() returns an immutable lazy sequence, so
# random.shuffle(order) raised TypeError ("does not support item
# assignment").  Materialize the indices as a list before shuffling.
order = list(range(shape(data)[0]))
random.shuffle(order)
data = data[order, :]

split = int(len(data) * .66)  # 66% / 34% train/test split

# Scale every column by its standard deviation
covX = cov(transpose(data))
sdX = sqrt(diag(covX))
for i in range(p + 1):
    data[:, i] = data[:, i] / sdX[i]

traindata = data[0:split, :]
testdata = data[split:len(data), :]

"""Response splitting"""
# Last column (index p) is the response; keep it as a column matrix.
ytrain = traindata[:, p]
ytrain = transpose(matrix(ytrain))
N = len(ytrain)
ytest = testdata[:, p]
ytest = transpose(matrix(ytest))
Ntest = len(ytest)

"""Add Squared Terms"""
#X = concatenate((X,pow(data[:,0:p],2)), axis = 1)