def score_model(model, alpha=False):
    '''
    Fits a model using the training set, predicts using the test set, and then
    calculates and reports goodness-of-fit metrics, plus alpha if specified and
    available. All of the model parameters are also reported, which I find
    extremely useful. I wanted to include all of the available regression
    metrics to see how they compare and comove. I ran into a ValueError when
    trying to include MSLE (mean squared log error): MSLE is undefined whenever
    the targets or predictions contain negative values, since the log term
    cannot be taken.
    '''
    model.fit(Xtrain, ytrain)
    yhat = model.predict(Xtest)
    r2 = r2_score(ytest, yhat)
    me = mse(ytest, yhat)
    ae = mae(ytest, yhat)
    mede = medae(ytest, yhat)
    ev = evs(ytest, yhat)
    if alpha:
        print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f} \nalpha={:0.3f}"
              .format(model, r2, me, ae, mede, ev, model.alpha_))
    else:
        print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f}"
              .format(model, r2, me, ae, mede, ev))
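# A minimal sketch reproducing the ValueError mentioned in the docstring above,
# assuming the usual scikit-learn alias for the metric. The sample values here
# are illustrative, not from the original source: MSLE rejects negative values.
import numpy as np
from sklearn.metrics import mean_squared_log_error as msle

y_true = np.array([3.0, 5.0, 2.5])
y_pred = np.array([2.5, -0.5, 2.0])  # one negative prediction
try:
    msle(y_true, y_pred)
except ValueError as e:
    print("MSLE failed:", e)  # negative values make the log term undefined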
def eval_reg(y_test, predictions):
    '''
    Function: Evaluates a regression model through its main metrics
    '''
    print("### MEASURES OF REGRESSION MODEL ###")
    print("------------------------------------\n")
    print("R2 = {0:.4f}\n".format(r2_score(y_test, predictions)))  # R2
    print("RMSE = {0:.4f}\n".format(mse(y_test, predictions, squared=False)))  # Root Mean Squared Error
    print("MSE = {0:.4f}\n".format(mse(y_test, predictions, squared=True)))  # Mean Squared Error
    if len(predictions[predictions < 0]) > 0:
        print("MSLE cannot be applied: predictions contain negative values.\n")
    else:
        print("MSLE = {0:.4f}\n".format(msle(y_test, predictions)))  # Mean Squared Log Error
    print("MAE = {0:.4f}\n".format(mae(y_test, predictions)))  # Mean Absolute Error
    print("EVS = {0:.4%}\n".format(evs(y_test, predictions)))  # Explained Variance Score
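# A hedged usage sketch for eval_reg, assuming the sklearn aliases the function
# relies on (r2_score, mse, msle, mae, evs) are imported as shown. The fitted
# LinearRegression and synthetic data are illustrative, not from the source.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (r2_score, mean_squared_error as mse,
                             mean_squared_log_error as msle,
                             mean_absolute_error as mae,
                             explained_variance_score as evs)

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = 10 + X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)
model = LinearRegression().fit(X[:80], y[:80])
eval_reg(y[80:], model.predict(X[80:]))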
def val(mdl, crit, ldr):
    '''
    This routine handles the validation loop

    Arguments:
        mdl : the model to be validated         // nn.Module
        crit : the criterion (loss) function    // nn loss function
        ldr : the dataloader for validation     // dataloader

    Parameters:
        val_los : tracks the validation loss                 // float
        val_acc : tracks the validation accuracy using evs   // float
        X : the feature set for the current batch            // tensor
        y : the target for the current batch                 // tensor
        y_pred : the predictions returned by the model       // tensor
        los : the calculated loss for the current batch      // nn loss object

    Returns:
        val_los_avg : the averaged validation loss       // float
        val_acc_avg : the averaged validation accuracy   // float
    '''
    mdl.eval()  # set model to evaluation mode for validation
    val_los, val_acc = 0, 0  # initialise validation loss and accuracy metrics
    for X, y in ldr:
        with torch.no_grad():  # disable gradient tracking for validation
            X, y = X.to(device), y.to(device)
            y_pred = mdl(X)  # return model predictions
            los = crit(y_pred, y.unsqueeze(1))  # return loss from criterion function
            val_los += los.item() * X.size(0)  # .item() keeps the running total a plain float
            val_acc += evs(y.unsqueeze(1).cpu().numpy(),
                           y_pred.cpu().numpy()) * X.size(0)  # use explained_variance_score for accuracy
    return val_los / len(ldr.dataset), val_acc / len(ldr.dataset)  # take averages for the loss and accuracy
def modela_kvalitate(y_test, resultats):
    # Quality above 0.6 is OK
    print(cl('Explained Variance Score (variance): {}'.format(evs(y_test, resultats)),
             attrs=['bold']))
    print(cl('R-Squared (squared deviation): {}'.format(r2(y_test, resultats)),
             attrs=['bold']))
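# `cl` in the snippet above is not defined here; given the attrs=['bold']
# keyword it is presumably termcolor's colored(), i.e. (an assumption):
from termcolor import colored as cl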
def score_model(model):
    """
    Fits a model using the training set, predicts using the test set, and then
    calculates and reports goodness of fit metrics and alpha.
    """
    model.fit(Xtrain, ytrain)
    yhat = model.predict(Xtest)
    r2 = r2_score(ytest, yhat)
    me = mse(ytest, yhat)
    ae = mae(ytest, yhat)
    mede = medae(ytest, yhat)
    ev = evs(ytest, yhat)
    print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f} \nalpha={:0.3f}"
          .format(model, r2, me, ae, mede, ev, model.alpha_))
def trn(mdl, opti, crit, ldr):
    '''
    This routine handles the training loop

    Arguments:
        mdl : the model to be trained           // nn.Module
        opti : the optimiser object             // optim
        crit : the criterion (loss) function    // nn loss function
        ldr : the dataloader for training       // dataloader

    Parameters:
        trn_los : tracks the training loss                   // float
        trn_acc : tracks the training accuracy using evs     // float
        X : the feature set for the current batch            // tensor
        y : the target for the current batch                 // tensor
        y_pred : the predictions returned by the model       // tensor
        los : the calculated loss for the current batch      // nn loss object

    Returns:
        trn_los_avg : the averaged training loss       // float
        trn_acc_avg : the averaged training accuracy   // float
    '''
    mdl.train()  # set model to training mode
    trn_los, trn_acc = 0, 0  # initialise training loss and accuracy metrics
    for X, y in ldr:
        X, y = X.to(device), y.to(device)
        opti.zero_grad()  # zero the gradients before the backward pass
        y_pred = mdl(X)  # return model predictions
        los = crit(y_pred, y.unsqueeze(1))  # return loss from criterion function
        los.backward()  # implement backpropagation
        trn_los += los.item() * X.size(0)  # .item() detaches the loss from the graph
        trn_acc += evs(y.cpu().numpy(),
                       y_pred.detach().cpu().numpy()) * X.size(0)  # use explained_variance_score for accuracy
        opti.step()  # step the optimiser
    return trn_los / len(ldr.dataset), trn_acc / len(ldr.dataset)  # take averages for the loss and accuracy
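# A minimal epoch-driver sketch tying trn and val together. The model,
# dataloaders, and hyperparameters below are illustrative assumptions, not
# part of the original source.
import torch
from torch import nn, optim
from sklearn.metrics import explained_variance_score as evs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mdl = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, 1)).to(device)
opti = optim.Adam(mdl.parameters(), lr=1e-3)
crit = nn.MSELoss()

# toy regression data: target is the feature sum plus noise
X = torch.randn(256, 8)
y = X.sum(dim=1) + 0.1 * torch.randn(256)
ds = torch.utils.data.TensorDataset(X, y)
trn_ldr = torch.utils.data.DataLoader(ds, batch_size=32, shuffle=True)
val_ldr = torch.utils.data.DataLoader(ds, batch_size=32)

for epoch in range(5):
    trn_los, trn_acc = trn(mdl, opti, crit, trn_ldr)
    val_los, val_acc = val(mdl, crit, val_ldr)
    print(f"epoch {epoch}: trn loss {trn_los:.4f} evs {trn_acc:.4f} | "
          f"val loss {val_los:.4f} evs {val_acc:.4f}")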
def model_significance(model, actual, predicted):
    '''
    Takes in a model, the actual y values, and the predicted y values.
    Calculates the significance of the model by comparing its p-value to an
    alpha of 0.05. Prints out whether or not the model is significant.
    Returns the Explained Variance Score.
    '''
    p = model.f_pvalue
    alpha = 0.05
    EVS = evs(actual, predicted)
    if p < alpha:
        print(f'p: {p} is less than alpha: {alpha}, therefore our model is significant.')
    else:
        print(f'p: {p} is more than alpha: {alpha}, therefore our model is not significant.')
    print('\n')
    print(f'EVS: {EVS}')
    return EVS
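# model_significance reads model.f_pvalue, which suggests a statsmodels
# regression results object. A hedged usage sketch under that assumption,
# with synthetic data that is illustrative only:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import explained_variance_score as evs

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 2))
y = x @ np.array([2.0, -1.0]) + rng.normal(scale=0.5, size=100)
ols = sm.OLS(y, sm.add_constant(x)).fit()  # results object exposes f_pvalue
model_significance(ols, y, ols.predict(sm.add_constant(x)))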
def accuracy(data1, data2):
    poi_rem = [5, 6, 7, 8, 11, 12, 13, 15, 17]
    poi_ori = [l for l in range(18)]
    poi_new = [l for l in poi_ori if l not in poi_rem]
    poi_list = ['Nose', 'Neck', 'Right Shoulder', 'Right Elbow', 'Right Wrist',
                'Right Knee', 'Right Ankle', 'Right Eye', 'Right Ear']
    col_x, col_y, col_c = [], [], []
    for i in poi_new:
        col_x.append('x_' + str(i))
        col_y.append('y_' + str(i))
        col_c.append('c_' + str(i))
    col = col_x + col_y + col_c
    d2 = {}
    for i in col:
        if len(data1) > len(data2):
            # pad the user's dataframe: m rows at the top, n rows at the bottom
            # (for an even difference ceil and floor give the same halves)
            m = int(np.ceil((len(data1) - len(data2)) / 2))
            n = int(np.floor((len(data1) - len(data2)) / 2))
            l = list(data2[i])
            l_n = [np.mean(l)] * m          # mean values of the column at the top
            l_n = l_n + l                   # the user's original column
            l_n = l_n + [np.mean(l)] * n    # mean values of the column at the bottom
            d2[i] = l_n
        else:
            # trim the user's dataframe: m rows from the top, n rows from the bottom
            m = int(np.ceil((len(data2) - len(data1)) / 2))
            n = int(np.floor((len(data2) - len(data1)) / 2))
            l = list(data2[i])
            l1 = l[m:]                      # remove the first m elements
            l_n = l1[:-n] if n else l1      # remove the last n elements (l1[:-0] would empty the list)
            d2[i] = l_n
    data2 = pd.DataFrame(d2, columns=col)   # convert dictionary to dataframe
    d1 = minMaxNorm(data1)
    d2 = minMaxNorm(data2)
    evs_x, evs_y = [], []
    for i, j in zip(col_x, col_y):
        evs_x.append(evs(d1[i], d2[i]))  # accuracy for each joint in the x dimension
        evs_y.append(evs(d1[j], d2[j]))  # accuracy for each joint in the y dimension
    evs_t = [np.mean([i, j]) for i, j in zip(evs_x, evs_y)]  # mean accuracy across x and y
    evs_f = []
    for i in range(len(poi_new)):  # categorise the accuracy
        if evs_t[i] < 0:
            evs_f.append(poi_list[i] + ' movement is entirely incorrect')
        elif evs_t[i] < 0.2:
            evs_f.append(poi_list[i] + ' movement is majorly incorrect')
        elif evs_t[i] < 0.4:
            evs_f.append(poi_list[i] + ' movement is minorly incorrect')
        elif evs_t[i] < 0.6:
            evs_f.append(poi_list[i] + ' movement is minorly correct')
        elif evs_t[i] < 0.8:
            evs_f.append(poi_list[i] + ' movement is majorly correct')
        else:
            evs_f.append(poi_list[i] + ' movement is entirely correct')
    return evs_t, evs_f
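# minMaxNorm is referenced above but not defined in this snippet. A minimal
# sketch of the column-wise min-max normaliser it presumably resembles (an
# assumption, not the original implementation):
def minMaxNorm(df):
    # scale every column to the [0, 1] range; constant columns map to 0
    rng = (df.max() - df.min()).replace(0, 1)
    return (df - df.min()) / rng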
rmse_m.append(rmse(test_y, mlr_p))
rmse_m.append(rmse(test_y, svr_p))
rmse_m.append(rmse(test_y, dt_p))
rmse_m.append(rmse(test_y, pr_p))
# MAE for models
mae_m.append(mae(test_y, mlr_p))
mae_m.append(mae(test_y, svr_p))
mae_m.append(mae(test_y, dt_p))
mae_m.append(mae(test_y, pr_p))
# MDAE for models
mdae_m.append(mdae(test_y, mlr_p))
mdae_m.append(mdae(test_y, svr_p))
mdae_m.append(mdae(test_y, dt_p))
mdae_m.append(mdae(test_y, pr_p))
# EVS for models
evs_m.append(evs(test_y, mlr_p))
evs_m.append(evs(test_y, svr_p))
evs_m.append(evs(test_y, dt_p))
evs_m.append(evs(test_y, pr_p))
# R2 for models
r2_m.append(r2(test_y, mlr_p))
r2_m.append(r2(test_y, svr_p))
r2_m.append(r2(test_y, dt_p))
r2_m.append(r2(test_y, pr_p))
c += 1
# Converting the results dict to a dataframe
d = {}
d['Method'] = meth
d['MSE'] = mse_m
d['RMSE'] = rmse_m
d['MAE'] = mae_m
dt_yhat = dt.predict(x_test)

# Modelling: Linear Regression
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_yhat = lr.predict(x_test)

# Modelling: Bayesian
bayesian = BayesianRidge()
bayesian.fit(x_train, y_train)
bayesian_yhat = bayesian.predict(x_test)  # predict on the test set, not x_train

# Determining accuracy
rf_accuracy = rf.score(x_test, y_test)
rf_evs = evs(y_test, rf_yhat)
print("Random Forest Training Accuracy:", rf.score(x_train, y_train))
print("Random Forest Testing Accuracy:", rf_accuracy)
print("Random Forest Explained Variance Score:", rf_evs)

dt_accuracy = dt.score(x_test, y_test)
dt_evs = evs(y_test, dt_yhat)
print("Decision Tree Training Accuracy:", dt.score(x_train, y_train))
print("Decision Tree Testing Accuracy:", dt_accuracy)
print("Decision Tree Explained Variance Score:", dt_evs)

lr_accuracy = lr.score(x_test, y_test)
lr_evs = evs(y_test, lr_yhat)
test_x = test.drop(columns=['District', 'Index', 'Rainfall'])
test_y = test['Rainfall']
for j in deg:
    print(i, c, j)
    poly = pf(degree=j)
    poly_tr = poly.fit_transform(train_x)
    poly_ts = poly.transform(test_x)  # transform the test set with the expansion fitted on train
    pr.fit(poly_tr, train_y)
    pr_p = pr.predict(poly_ts)
    # Error values
    d.append(j)
    mse_d.append(mse(test_y, pr_p))
    rmse_d.append(rmse(test_y, pr_p))
    mae_d.append(mae(test_y, pr_p))
    mdae_d.append(mdae(test_y, pr_p))
    evs_d.append(evs(test_y, pr_p))
    r2_d.append(r2(test_y, pr_p))
    c += 1
t = {}
t['Degree'] = d
t['MSE'] = mse_d
t['RMSE'] = rmse_d
t['MAE'] = mae_d
t['MDAE'] = mdae_d
t['EVS'] = evs_d
t['R2'] = r2_d
tf = pd.DataFrame(t, columns=['Degree', 'MSE', 'RMSE', 'MAE', 'MDAE', 'EVS', 'R2'])
for j in deg:
    temp = tf[tf['Degree'] == j]
    dl.append(i)
def better_than_baseline(RMSE, RMSE_baseline) -> None:
    '''
    Takes in the RMSE for a model as well as the RMSE for the baseline and
    compares the two. Prints out if the model is better or worse than the
    baseline.
    '''
    if RMSE < RMSE_baseline:
        print('Model is better than baseline')
    else:
        print('Model sucks, should have done full stack dev instead.')


def model_significance(model, actual, predicted):
    '''
    Takes in a model, the actual y values, and the predicted y values.
    Calculates the significance of the model by comparing its p-value to an
    alpha of 0.05. Prints out whether or not the model is significant.
    Returns the Explained Variance Score.
    '''
    p = model.f_pvalue
    alpha = 0.05
    EVS = evs(actual, predicted)
    if p < alpha:
        print(f'p: {p} is less than alpha: {alpha}, therefore our model is significant.')
    else:
        print(f'p: {p} is greater than alpha: {alpha}, therefore our model is not significant.')
    print('\n')
    print(f"EVS: {EVS}")
    return EVS
def train(self):
    from sklearn.metrics import (mean_squared_log_error as msle,
                                 max_error,
                                 mean_absolute_error as mae,
                                 mean_squared_error as mse,
                                 explained_variance_score as evs,
                                 r2_score as r2,
                                 mean_tweedie_deviance as tweedie)
    for round in range(1):
        try:
            os.mkdir('%s/%d' % (self.path, round))
        except FileExistsError:
            pass
        # get data split of one fold
        train_idx, test_idx, label_ind, unlab_ind = self.alibox.get_split(round)
        # get intermediate results saver for one fold experiment
        saver = self.alibox.get_stateio(round)
        # set initial performance point
        model = self.model
        net = NN.NeuralNetworkRegressor(model=model, batch_size=1,
                                        device_ids=[0], epochs=50)
        net.lr_fc = 0.01
        net.initiate(self.dataset[label_ind.index], self.labels[label_ind.index])
        net.predict(self.testset)
        pred = net.preds
        # evaluation
        all = len(label_ind) + len(unlab_ind)
        lab_init = len(label_ind)
        self.mse.append(mse(self.testlab, pred))
        self.mae.append(mae(self.testlab, pred))
        self.max.append(max_error(self.testlab, pred))  # aliasing max_error as max would shadow the builtin
        self.evs.append(evs(self.testlab, pred))
        self.r2.append(r2(self.testlab, pred))
        self.sample.append(len(label_ind.index))
        saver.set_initial_point(mse(self.testlab, pred))
        iteration = 0
        while not self.stopping_criterion.is_stop():
            # select subsets of Uind samples according to query strategy
            iteration += 1
            lr_fc = net.lr_fc * (1 - len(label_ind.index) / (all * 1.001))
            for p in net.optimizer.param_groups:
                p['lr'] = lr_fc
            print('learning rate is',
                  net.optimizer.state_dict()['param_groups'][0]['lr'])
            if self.phase == 'active':
                if self.measure != 'residue':
                    net.predict(self.dataset[unlab_ind.index])
                else:
                    net.predict(self.dataset[label_ind])
                pred = net.preds
                if self.measure == 'distance':
                    if iteration == 1:
                        self._update_previous_prediction(pred)
                    else:
                        self._update_previous_prediction(pred, select_ind, unlab_ind_save)
                    previous = self._get_previous_prediction()
                else:
                    previous = None
                if len(label_ind) < all * 0.6:
                    if iteration % 10:
                        select_ind = self.query_strategy.select_by_prediction(
                            unlabel_index=unlab_ind,
                            predict=pred,
                            labels=self.labels[label_ind.index],
                            batch_size=int(lab_init * 1),
                            X_lab=self.dataset[label_ind.index],
                            X_unlab=self.dataset[unlab_ind.index],
                            previous=previous)
                    else:
                        select_ind = self.random.select(label_ind, unlab_ind,
                                                        batch_size=int(lab_init * 1))
                else:
                    select_ind = self.query_strategy.select_by_prediction(
                        unlabel_index=unlab_ind,
                        predict=pred,
                        labels=self.labels[label_ind.index],
                        batch_size=int(len(label_ind) * 0.3),
                        X_lab=self.dataset[label_ind.index],
                        X_unlab=self.dataset[unlab_ind.index],
                        previous=previous)
            elif self.phase == 'passive':
                if len(label_ind) < all * 0.6:
                    select_ind = self.random.select(label_ind, unlab_ind,
                                                    batch_size=int(lab_init * 1))
                    # select_ind = self.random.select(label_ind, unlab_ind, batch_size=1)
                else:
                    select_ind = self.random.select(label_ind, unlab_ind,
                                                    batch_size=int(len(label_ind) * 0.3))
                    # select_ind = self.random.select(label_ind, unlab_ind, batch_size=1)
            # update the datasets and previous prediction
            unlab_ind_save = unlab_ind.index
            label_ind.update(select_ind)
            unlab_ind.difference_update(select_ind)
            # update model and calc performance according to the updated model
            loss = net.train(self.dataset[label_ind.index], self.labels[label_ind.index])
            # if not iteration%2:
            net.predict(self.testset)
            pred = net.preds
            # evaluation
            self.mse.append(mse(self.testlab, pred))
            self.mae.append(mae(self.testlab, pred))
            self.max.append(max_error(self.testlab, pred))
            self.evs.append(evs(self.testlab, pred))
            self.r2.append(r2(self.testlab, pred))
            self.sample.append(len(label_ind.index))
            self.loss.append(loss)
            # save the results
            st = self.alibox.State(select_ind, mse(self.testlab, pred))
            saver.add_state(st)
            saver.save()
            self.stopping_criterion.update_information(saver)
            torch.save(self.model, './%s/%d/model%d' % (self.path, round, iteration))
        self.stopping_criterion.reset()
        self.unc_result.append(copy.deepcopy(saver))
        joblib.dump(self.mse, './%s/%d/mse' % (self.path, round))
        joblib.dump(self.mae, './%s/%d/mae' % (self.path, round))
        joblib.dump(self.max, './%s/%d/max' % (self.path, round))
        joblib.dump(self.evs, './%s/%d/evs' % (self.path, round))
        joblib.dump(self.r2, './%s/%d/r2' % (self.path, round))
        joblib.dump(self.sample, './%s/%d/sample' % (self.path, round))
        joblib.dump(self.loss, './%s/%d/loss' % (self.path, round))
        joblib.dump(self.testlab, './%s/%d/testlab' % (self.path, round))
        joblib.dump(pred, './%s/%d/pred' % (self.path, round))
    self.analyser = self.alibox.get_experiment_analyser(x_axis='num_of_queries')
    self.analyser.add_method(method_name='QueryInstanceDistribution-distance',
                             method_results=self.unc_result)
    print(self.analyser)
mse_clus.append(mse(test_clus_y, clus_p))
mse_gen.append(mse(test_gen_y, gen_p))
# RMSE for models
rmse_ds.append(rmse(test_ds_y, ds_p))
rmse_clus.append(rmse(test_clus_y, clus_p))
rmse_gen.append(rmse(test_gen_y, gen_p))
# MAE for models
mae_ds.append(mae(test_ds_y, ds_p))
mae_clus.append(mae(test_clus_y, clus_p))
mae_gen.append(mae(test_gen_y, gen_p))
# MDAE for models
mdae_ds.append(mdae(test_ds_y, ds_p))
mdae_clus.append(mdae(test_clus_y, clus_p))
mdae_gen.append(mdae(test_gen_y, gen_p))
# EVS for models
evs_ds.append(evs(test_ds_y, ds_p))
evs_clus.append(evs(test_clus_y, clus_p))
evs_gen.append(evs(test_gen_y, gen_p))
# R2 for models
r2_ds.append(r2(test_ds_y, ds_p))
r2_clus.append(r2(test_clus_y, clus_p))
r2_gen.append(r2(test_gen_y, gen_p))
# Mean of MSE for models
mse_ts_ds.append(np.mean(mse_ds))
mse_ts_clus.append(np.mean(mse_clus))
mse_ts_gen.append(np.mean(mse_gen))
# Mean of RMSE for models
rmse_ts_ds.append(np.mean(rmse_ds))
rmse_ts_clus.append(np.mean(rmse_clus))
rmse_ts_gen.append(np.mean(rmse_gen))
# Mean of MAE for models
# principal component analysis
'''from sklearn.decomposition import PCA
pca = PCA(n_components=65)
x_train = pca.fit_transform(x_train)
variance = pca.explained_variance_ratio_'''

# train/test split
from sklearn.model_selection import train_test_split
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_train, y_train,
                                                        test_size=0.2,
                                                        random_state=0)

# model creation
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train1, y_train1)
y_pred = regressor.predict(x_test1)

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import r2_score
mse_val = mse(y_test1, y_pred)  # store under new names to avoid rebinding the imported functions
evs_val = evs(y_test1, y_pred)
r2_val = r2_score(y_test1, y_pred)

# more advanced evaluation: K-fold cross-validation
# model for different dataset splits
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator=regressor, X=x_train1, y=y_train1, cv=10)
accuracies.std()
accuracies.mean()
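# A hedged follow-up sketch reusing the names above: cross_val_score defaults
# to the estimator's scorer (R2 for regressors), so passing
# scoring='explained_variance' returns EVS per fold instead.
evs_scores = cvs(estimator=regressor, X=x_train1, y=y_train1, cv=10,
                 scoring='explained_variance')
print(evs_scores.mean(), evs_scores.std())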
mse_f = []
rmse_f = []
mae_f = []
mdae_f = []
evs_f = []
r2_f = []
y = data['Actual']
for i in comb_names:
    print(i)
    df = data[i]
    p = df.mean(axis=1)
    mse_f.append(mse(y, p))
    rmse_f.append(rmse(y, p))
    mae_f.append(mae(y, p))
    mdae_f.append(mdae(y, p))
    evs_f.append(evs(y, p))
    r2_f.append(r2(y, p))
d = {}
d['Combinations'] = comb_names
d['MSE'] = mse_f
d['RMSE'] = rmse_f
d['MAE'] = mae_f
d['MDAE'] = mdae_f
d['EVS'] = evs_f
d['R2'] = r2_f
df = pd.DataFrame(d, columns=['Combinations', 'MSE', 'RMSE', 'MAE', 'MDAE', 'EVS', 'R2'])
print(df)
df.to_csv('C:\\Users\\Preetham G\\Documents\\Research Projects\\Ensemble Rainfall\\Results\\Main - SA Final.csv',
          index=False)
train_y = train['Rainfall']
test_x = test.drop(columns=['District', 'Index', 'Rainfall', 'Minimum Temperature'])
test_y = test['Rainfall']
for j in ker:
    print(i, c, j)
    svr = SVR(kernel=j, C=1, epsilon=0.1)
    svr.fit(train_x, train_y)
    svr_p = svr.predict(test_x)
    # Error values
    k.append(j)
    mse_k.append(mse(test_y, svr_p))
    rmse_k.append(rmse(test_y, svr_p))
    mae_k.append(mae(test_y, svr_p))
    mdae_k.append(mdae(test_y, svr_p))
    evs_k.append(evs(test_y, svr_p))
    r2_k.append(r2(test_y, svr_p))
    c += 1
t = {}
t['Kernel'] = k
t['MSE'] = mse_k
t['RMSE'] = rmse_k
t['MAE'] = mae_k
t['MDAE'] = mdae_k
t['EVS'] = evs_k
t['R2'] = r2_k
tf = pd.DataFrame(t, columns=['Kernel', 'MSE', 'RMSE', 'MAE', 'MDAE', 'EVS', 'R2'])
for j in ker:
    temp = tf[tf['Kernel'] == j]
    dl.append(i)
training_objects_dir = 'training_objects/'
testing_file = 'small_subset_test.csv'
model_file = 'small_subset_model.obj'
vectorizer_file = 'small_subset_vectorizer.obj'

print(f"Reading testing data from {input_files_dir + testing_file}...")
data = pandas.read_csv(input_files_dir + testing_file, header=None, encoding='latin-1')

print(f"Loading model from {training_objects_dir + model_file} and "
      f"vectorizer from {training_objects_dir + vectorizer_file}...")
with open(training_objects_dir + vectorizer_file, 'rb') as f:
    vectorizer = pickle.load(f)
with open(training_objects_dir + model_file, 'rb') as f:
    clf = pickle.load(f)

x_test = vectorizer.transform(data.iloc[:, 5].values)
x = hstack([x_test])

# predict on test data
print("Predicting test data...")
results = clf.predict(x)
# for tweet, prediction, correct in zip(data.iloc[:, 5].values, results, data.iloc[:, 0].values):
#     print(tweet, "prediction:", prediction, ", correct:", correct)
print("Explained variance score (1.0 is best):",
      evs(data.iloc[:, 0].values, results))
mdae_t = []
evs_t = []
r2_t = []
for c in range(1, 101):
    print(i, c)
    data = pd.read_csv('C:\\Users\\Preetham G\\Documents\\Research Projects\\Ensemble Rainfall\\Results\\Adaboost\\Main Results\\R'
                       + str(c) + '.csv')
    y = data['True']
    data = data[i]
    p = data.mean(axis=1)
    mse_t.append(mse(y, p))
    rmse_t.append(rmse(y, p))
    mae_t.append(mae(y, p))
    mdae_t.append(mdae(y, p))
    evs_t.append(evs(y, p))
    r2_t.append(r2(y, p))
mse_f.append(np.mean(mse_t))
rmse_f.append(np.mean(rmse_t))
mae_f.append(np.mean(mae_t))
mdae_f.append(np.mean(mdae_t))
evs_f.append(np.mean(evs_t))
r2_f.append(np.mean(r2_t))
d = {}
d['Combinations'] = comb_names
d['MSE'] = mse_f
d['RMSE'] = rmse_f
d['MAE'] = mae_f
d['MDAE'] = mdae_f
d['EVS'] = evs_f
d['R2'] = r2_f
lasso.fit(x_train, y_train)
lasso_result = lasso.predict(x_test)
with open("data/lasso.model", 'wb') as lasso_file:
    pickle.dump(lasso, lasso_file)

bayesian = BayesianRidge()
bayesian.fit(x_train, y_train)
bayesian_result = bayesian.predict(x_test)
with open("data/bayesian.model", 'wb') as bayesian_file:
    pickle.dump(bayesian, bayesian_file)

elastic = ElasticNet(alpha=0.01)
elastic.fit(x_train, y_train)
elastic_result = elastic.predict(x_test)
with open("data/elastic.model", 'wb') as elastic_file:
    pickle.dump(elastic, elastic_file)

return y_test, [model_1_result, ridge_result, elastic_result, lasso_result,
                bayesian_result]

y_test, results = model()
for result in results:
    score = evs(y_test, result)  # signature is evs(y_true, y_pred); the arguments are not interchangeable
    print(score)
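# A quick hedged check of why the argument order above matters: explained
# variance is computed relative to the variance of y_true, so swapping the
# arguments changes the score (illustrative values, not from the source).
import numpy as np
from sklearn.metrics import explained_variance_score as evs

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.1, 1.9, 3.2, 3.8])
print(evs(y_true, y_pred))  # predictions scored against the truth
print(evs(y_pred, y_true))  # a different number: denominator is Var(first argument)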
Y = X["imdb_score"] X.drop(["imdb_score"], axis=1,inplace=True) X.drop(["movie_title"], axis=1,inplace=True) #clean and scale X = X.fillna(0) X["gross"] = X["gross"]/10000 X["num_voted_users"] = X["num_voted_users"]/10000 X["cast_total_facebook_likes"] = X["cast_total_facebook_likes"]/1000 X["movie_facebook_likes"] = X["movie_facebook_likes"]/1000 X["budget"] = X["budget"]/10000 X["actor_1_facebook_likes"] = X["actor_1_facebook_likes"]/1000 X["actor_2_facebook_likes"] = X["actor_2_facebook_likes"]/1000 X["actor_3_facebook_likes"] = X["actor_3_facebook_likes"]/1000 #split into train and test sets X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, test_size=0.2) #create regressor reg = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', linear_model.LinearRegression(fit_intercept=False))]) #train reg.fit(X_train, Y_train) #predict Y_pred = reg.predict(X_test) #metrics print('Variance:' + str(evs(Y_test, Y_pred))) print('Mean square error:' + str(mse(Y_test, Y_pred)))
r2_t = []
for tr_i, ts_i in rkf.split(data):
    print(i, c)
    train, test = data.iloc[tr_i], data.iloc[ts_i]
    train_x = train.drop(columns=['Rainfall'])
    train_y = train['Rainfall']
    test_x = test.drop(columns=['Rainfall'])
    test_y = test['Rainfall']
    # note: criterion='mse' is named 'squared_error' in newer scikit-learn releases
    model = DecisionTreeRegressor(criterion='mse', splitter='best',
                                  max_depth=i)  # , min_samples_leaf=j, min_samples_split=k)
    model.fit(train_x, train_y)
    ts_p = model.predict(test_x)
    mse_t.append(mse(test_y, ts_p))
    rmse_t.append(rmse(test_y, ts_p))
    mae_t.append(mae(test_y, ts_p))
    mdae_t.append(mdae(test_y, ts_p))
    evs_t.append(evs(test_y, ts_p))
    r2_t.append(r2(test_y, ts_p))
    c += 1
dep_f.append(i)
# saml_f.append(j)
# sams_f.append(k)
mse_f.append(np.mean(mse_t))
rmse_f.append(np.mean(rmse_t))
mae_f.append(np.mean(mae_t))
mdae_f.append(np.mean(mdae_t))
evs_f.append(np.mean(evs_t))
r2_f.append(np.mean(r2_t))
d = {}
d['Max Depth'] = dep_f
# d['Min Samples Leaf'] = saml_f
# d['Min Samples Split'] = sams_f
ols = LinearRegression()
ols.fit(X_train, y_train)
ols_yhat = ols.predict(X_test)
# print(X_test)
# print(ols_yhat)
df_ols = pd.DataFrame(data=ols_yhat)
df_ols.columns = ['ols predicted']
print(df_ols)
# print(cl(df_ols.dtypes, attrs=['bold']))
print(cl('Explained Variance Score of OLS model is {}'.format(evs(y_test, ols_yhat)),
         attrs=['bold']))
print(cl('R-Squared of OLS model is {}'.format(r2(y_test, ols_yhat)),
         attrs=['bold']))

# 2. Ridge
ridge = Ridge(alpha=0.5)
ridge.fit(X_train, y_train)
ridge_yhat = ridge.predict(X_test)
# print(ridge_yhat)
df_ridge = pd.DataFrame(data=ridge_yhat)
df_ridge.columns = ['ridge predicted']
train_y = train['Rainfall']
test_x = test.drop(columns=['Rainfall'])
test_y = test['Rainfall']
model = AdaBoostRegressor(n_estimators=i)
model.fit(train_x, train_y)
tr_p = model.predict(train_x)
ts_p = model.predict(test_x)
mse_tr_t.append(mse(train_y, tr_p))
mse_ts_t.append(mse(test_y, ts_p))
rmse_tr_t.append(rmse(train_y, tr_p))
rmse_ts_t.append(rmse(test_y, ts_p))
mae_tr_t.append(mae(train_y, tr_p))
mae_ts_t.append(mae(test_y, ts_p))
mdae_tr_t.append(mdae(train_y, tr_p))
mdae_ts_t.append(mdae(test_y, ts_p))
evs_tr_t.append(evs(train_y, tr_p))
evs_ts_t.append(evs(test_y, ts_p))
r2_tr_t.append(r2(train_y, tr_p))
r2_ts_t.append(r2(test_y, ts_p))
mse_tr_f.append(np.mean(mse_tr_t))
mse_ts_f.append(np.mean(mse_ts_t))
rmse_tr_f.append(np.mean(rmse_tr_t))
rmse_ts_f.append(np.mean(rmse_ts_t))
mae_tr_f.append(np.mean(mae_tr_t))
mae_ts_f.append(np.mean(mae_ts_t))
mdae_tr_f.append(np.mean(mdae_tr_t))
mdae_ts_f.append(np.mean(mdae_ts_t))
evs_tr_f.append(np.mean(evs_tr_t))
evs_ts_f.append(np.mean(evs_ts_t))
r2_tr_f.append(np.mean(r2_tr_t))
r2_ts_f.append(np.mean(r2_ts_t))
train_x = train.drop(columns=['District', 'Index', 'Rainfall'])
train_y = train['Rainfall']
test_x = test.drop(columns=['District', 'Index', 'Rainfall'])
test_y = test['Rainfall']
for j in dep:
    print(i, c, j)
    dt = dtr(max_depth=j)
    dt.fit(train_x, train_y)
    dt_p = dt.predict(test_x)
    # Error values
    d.append(j)
    mse_d.append(mse(test_y, dt_p))
    rmse_d.append(rmse(test_y, dt_p))
    mae_d.append(mae(test_y, dt_p))
    mdae_d.append(mdae(test_y, dt_p))
    evs_d.append(evs(test_y, dt_p))
    r2_d.append(r2(test_y, dt_p))
    c += 1
t = {}
t['Depth'] = d
t['MSE'] = mse_d
t['RMSE'] = rmse_d
t['MAE'] = mae_d
t['MDAE'] = mdae_d
t['EVS'] = evs_d
t['R2'] = r2_d
tf = pd.DataFrame(t, columns=['Depth', 'MSE', 'RMSE', 'MAE', 'MDAE', 'EVS', 'R2'])
for j in dep:
    temp = tf[tf['Depth'] == j]
    dl.append(i)