def process_marg(npos, i, pos, ftarget, fm, fv, fi, fd, fref, fref_trunc,
                 ftarget_trunc, lock):
    """Correlate the 20 largest target marginals against each model's
    marginals and append the results to the per-position r20 output file.

    Relies on module globals `synth_nat` (dataset flavour) and `output_dir`.
    """
    # Indices of the 20 largest target frequencies.
    top20d = np.argsort(ftarget)[-20:]
    if "nat" in synth_nat:
        # "nat" data compares against a truncated target/reference pair,
        # which has its own top-20 ordering.
        top20d_trunc = np.argsort(ftarget_trunc)[-20:]

    with lock:
        print("npos {:3d}, set {:3d}, n_unique={:10d}".format(
            npos, i, ftarget.shape[0]))
        with open(output_dir + "/r20_{}".format(npos), "at") as f:
            # Reference ("black line") correlation: truncated pair for
            # nat data, plain target vs reference otherwise.
            if "nat" in synth_nat:
                ref_corr = pr(ftarget_trunc[top20d_trunc],
                              fref_trunc[top20d_trunc])
            else:
                ref_corr = pr(ftarget[top20d], fref[top20d])
            print(pr(ftarget[top20d], fm[top20d]),  # target vs mi3
                  pr(ftarget[top20d], fv[top20d]),  # target vs vae
                  pr(ftarget[top20d], fi[top20d]),  # target vs indep
                  pr(ftarget[top20d], fd[top20d]),  # target vs deep
                  ref_corr,
                  file=f)
def fit(self, X, y, var_smoothing=1e-9):
    """Fit per-class Gaussian parameters and pairwise statistics, then build
    one maximum-weight spanning tree per class (Chow-Liu style) with
    Kruskal's algorithm.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    y : array-like of class labels, length n_samples
    var_smoothing : float
        Added to every feature variance for numerical stability.
    """
    self.priors = {}   # class -> prior probability P(c)
    self.params = {}   # class -> {'mean', 'variance'} per feature
    self.mu_info = {}  # class -> {'i,j' -> mutual information}
    self.corr = {}     # 'i,j' -> Pearson r (overwritten per class; the
                       # values of the last class remain, as before)
    self.classes = list(sorted(set(y)))
    for c in self.classes:
        # Condition on the label.
        x = X[y == c]
        self.params[c] = {
            'mean': x.mean(axis=0),
            'variance': x.var(axis=0) + var_smoothing
        }
        self.mu_info[c] = {}
        self.priors[c] = len(y[y == c]) / len(y)
        for i in range(X.shape[1]):
            for j in range(i + 1, X.shape[1]):
                # Compute pearsonr once per pair (the original called it
                # twice per pair).
                rho = pr(x[:, i], x[:, j])[0]
                key = str(i) + ',' + str(j)
                # Mutual information of a bivariate Gaussian is
                # -0.5*log(1 - rho**2).  The original used (1 - rho),
                # which is wrong for negative rho and mis-scales weights.
                self.mu_info[c][key] = -0.5 * np.log(1 - rho ** 2)
                self.corr[key] = rho
    # Replace NaN values (e.g. from constant features) with 0.
    self.corr = {k: 0 if np.isnan(v) else v for k, v in self.corr.items()}
    self.mu_info = {k1: {k2: 0 if np.isnan(v2) else v2
                         for k2, v2 in v1.items()}
                    for k1, v1 in self.mu_info.items()}
    # One tree per class, with mutual information as edge weights.
    self.Trees = []
    for c in self.classes:
        # As many nodes as there are features.
        g = Graph(X.shape[1])
        for k, w in self.mu_info[c].items():
            u, v = int(k.split(",")[0]), int(k.split(",")[1])
            g.add_edge(u, v, w)
        self.Trees.append(g.kruskal_algo())
    self.fitted = True
def compute_pearson_and_spearman_r(A, B, n_pool, n_test):
    """Correlate two square matrices over the pool block.

    Compares the first ``n_pool`` diagonal entries of A and B, and the
    (pool rows x test columns) off-diagonal block, returning
    (pearson_diag, pearson_offdiag, spearman_diag, spearman_offdiag)
    followed by the four corresponding p-values.
    """
    assert A.shape[0] == n_pool + n_test
    diag_a = np.diag(A)[:n_pool].tolist()
    diag_b = np.diag(B)[:n_pool].tolist()
    # Pool rows against test columns, flattened to 1-D.
    off_a = np.reshape(A[:n_pool][:, n_pool:], -1).tolist()
    off_b = np.reshape(B[:n_pool][:, n_pool:], -1).tolist()
    pearson_diag, pearson_diag_p = pr(diag_a, diag_b)
    pearson_off, pearson_off_p = pr(off_a, off_b)
    spearman_diag, spearman_diag_p = spr(diag_a, diag_b)
    spearman_off, spearman_off_p = spr(off_a, off_b)
    return (pearson_diag, pearson_off, spearman_diag, spearman_off,
            pearson_diag_p, pearson_off_p, spearman_diag_p, spearman_off_p)
def run_model(self):
    """Cross-validated training of a boosted-tree model on the residual of a
    linear baseline, collecting RMSE / R^2 / Pearson-r statistics for the
    train and test folds (and, when ``self.CT_RT`` is set, the same
    statistics on rupture time CT_RT recovered from the LMP).
    """
    if self.grid_search:
        # Refresh hyper-parameters from grid search before fitting.
        self.run_grid_search()
        self.parameters = self.best_params
    '''
    Xtr, Xts, ytr, yts, CT_RT_tr, CT_RT_ts = train_test_split(self.X,
        self.y, self.CT_RT, test_size=self.test_size)
    '''
    # Cross-validation splitter: leave-one-out or shuffled k-fold.
    if self.cv == 'loo':
        cv = LeaveOneOut()
    else:
        cv = KFold(n_splits=self.cv, shuffle=True)
    # Per-fold metric accumulators.
    self.rmse_cv_train = []
    self.r2_cv_train = []
    self.rmse_cv_test = []
    self.r2_cv_test = []
    self.pr_cv_train = []
    self.pr_cv_test = []
    if self.CT_RT is not None:
        self.rmse_CT_RT_cv_train = []
        self.r2_CT_RT_cv_train = []
        self.rmse_CT_RT_cv_test = []
        self.r2_CT_RT_cv_test = []
        self.pr_CT_RT_cv_train = []
        self.pr_CT_RT_cv_test = []
    # Supported boosting back-ends, selected via self.package.
    est = {'lightgbm': lgb.LGBMRegressor,
           'catboost': catboost.CatBoostRegressor,
           'xgboost': xgboost.XGBRegressor}
    self.model = []
    # Linear baseline; the boosted model is trained on its residual.
    self.lin_model = LinearRegression().fit(self.X_lin, self.y)
    self.y_lin = self.lin_model.predict(self.X_lin)
    self.y_res = self.y - self.y_lin
    model = est[self.package](**self.parameters)
    for n, (tr_id, ts_id) in enumerate(cv.split(self.y)):
        print('Running Validation {} of {}'.format(n, self.cv))
        # Fit on the residual with early stopping on the held-out fold;
        # keyword sets differ slightly per package.
        if self.package == 'lightgbm':
            self.model.append(model.fit(
                self.X[tr_id], self.y_res[tr_id],
                eval_set=[(self.X[ts_id], self.y_res[ts_id])],
                eval_metric='rmse', early_stopping_rounds=20,
                feature_name=self.feature_names))
        elif self.package == 'xgboost':
            self.model.append(model.fit(
                self.X[tr_id], self.y_res[tr_id],
                eval_set=[(self.X[ts_id], self.y_res[ts_id])],
                eval_metric='rmse', early_stopping_rounds=20))
        else:
            self.model.append(model.fit(
                self.X[tr_id], self.y_res[tr_id],
                eval_set=[(self.X[ts_id], self.y_res[ts_id])],
                early_stopping_rounds=20))
        # Prediction = linear baseline + boosted residual.
        if self.package == 'lightgbm':
            self.y_cv_tr_pred = self.y_lin[tr_id] + self.model[-1].predict(
                self.X[tr_id], num_iteration=self.model[-1].best_iteration_)
            self.y_cv_ts_pred = self.y_lin[ts_id] + self.model[-1].predict(
                self.X[ts_id], num_iteration=self.model[-1].best_iteration_)
            if self.model_scheme == 'LMP':
                # Invert the Larson-Miller parameter back to rupture time.
                self.CT_RT_cv_tr_pred = np.exp((
                    self.y_cv_tr_pred*1000/self.CT_Temp[tr_id])
                    - self.C[tr_id])
                self.CT_RT_cv_ts_pred = np.exp((
                    self.y_cv_ts_pred*1000/self.CT_Temp[ts_id])
                    - self.C[ts_id])
        else:
            self.y_cv_tr_pred = self.y_lin[tr_id] + self.model[-1].predict(
                self.X[tr_id])
            self.y_cv_ts_pred = self.y_lin[ts_id] + self.model[-1].predict(
                self.X[ts_id])
            if self.model_scheme == 'LMP':
                self.CT_RT_cv_tr_pred = np.exp((
                    self.y_cv_tr_pred*1000/self.CT_Temp[tr_id])
                    - self.C[tr_id])
                self.CT_RT_cv_ts_pred = np.exp((
                    self.y_cv_ts_pred*1000/self.CT_Temp[ts_id])
                    - self.C[ts_id])
        self.y_cv_tr = self.y[tr_id]
        self.y_cv_ts = self.y[ts_id]
        if self.CT_RT is not None:
            self.CT_RT_cv_tr = self.CT_RT[tr_id]
            self.CT_RT_cv_ts = self.CT_RT[ts_id]
        # Fold metrics on the primary target.
        self.rmse_cv_train.append(np.sqrt(mean_squared_error(
            self.y_cv_tr_pred, self.y[tr_id])))
        self.rmse_cv_test.append(np.sqrt(mean_squared_error(
            self.y_cv_ts_pred, self.y[ts_id])))
        self.r2_cv_train.append(linregress(self.y_cv_tr_pred,
                                           self.y[tr_id])[2]**2)
        self.r2_cv_test.append(linregress(self.y_cv_ts_pred,
                                          self.y[ts_id])[2]**2)
        self.pr_cv_train.append(pr(self.y_cv_tr_pred, self.y[tr_id]))
        self.pr_cv_test.append(pr(self.y_cv_ts_pred, self.y[ts_id]))
        if self.CT_RT is not None:
            # Same metrics on rupture time.
            self.rmse_CT_RT_cv_train.append(np.sqrt(mean_squared_error(
                self.CT_RT_cv_tr_pred, self.CT_RT[tr_id])))
            self.rmse_CT_RT_cv_test.append(np.sqrt(mean_squared_error(
                self.CT_RT_cv_ts_pred, self.CT_RT[ts_id])))
            self.r2_CT_RT_cv_train.append(linregress(
                self.CT_RT_cv_tr_pred, self.CT_RT[tr_id])[2]**2)
            self.r2_CT_RT_cv_test.append(linregress(
                self.CT_RT_cv_ts_pred, self.CT_RT[ts_id])[2]**2)
            self.pr_CT_RT_cv_train.append(pr(self.CT_RT_cv_tr_pred,
                                             self.CT_RT[tr_id]))
            self.pr_CT_RT_cv_test.append(pr(self.CT_RT_cv_ts_pred,
                                            self.CT_RT[ts_id]))
    # Summary statistics over all folds.
    self.N_dp = len(self.y)
    self.rmse_mean_train = np.mean(self.rmse_cv_train)
    self.rmse_std_train = np.std(self.rmse_cv_train)
    self.rmse_mean_test = np.mean(self.rmse_cv_test)
    self.rmse_std_test = np.std(self.rmse_cv_test)
    self.r2_mean_train = np.mean(self.r2_cv_train)
    self.r2_std_train = np.std(self.r2_cv_train)
    self.r2_mean_test = np.mean(self.r2_cv_test)
    self.r2_std_test = np.std(self.r2_cv_test)
    # pearsonr returns (r, p); keep only r for the summaries.
    self.pr_mean_train = np.mean([i[0] for i in self.pr_cv_train])
    self.pr_std_train = np.std([i[0] for i in self.pr_cv_train])
    self.pr_mean_test = np.mean([i[0] for i in self.pr_cv_test])
    self.pr_std_test = np.std([i[0] for i in self.pr_cv_test])
    if self.CT_RT is not None:
        self.rmse_CT_RT_mean_train = np.mean(self.rmse_CT_RT_cv_train)
        self.rmse_CT_RT_std_train = np.std(self.rmse_CT_RT_cv_train)
        self.rmse_CT_RT_mean_test = np.mean(self.rmse_CT_RT_cv_test)
        self.rmse_CT_RT_std_test = np.std(self.rmse_CT_RT_cv_test)
        self.r2_CT_RT_mean_train = np.mean(self.r2_CT_RT_cv_train)
        self.r2_CT_RT_std_train = np.std(self.r2_CT_RT_cv_train)
        self.r2_CT_RT_mean_test = np.mean(self.r2_CT_RT_cv_test)
        self.r2_CT_RT_std_test = np.std(self.r2_CT_RT_cv_test)
        self.pr_CT_RT_mean_train = np.mean([i[0] for i in
                                            self.pr_CT_RT_cv_train])
        self.pr_CT_RT_std_train = np.std([i[0] for i in
                                          self.pr_CT_RT_cv_train])
        self.pr_CT_RT_mean_test = np.mean([i[0] for i in
                                           self.pr_CT_RT_cv_test])
        self.pr_CT_RT_std_test = np.std([i[0] for i in
                                         self.pr_CT_RT_cv_test])
    # NOTE(review): the triple quote below opens a commented-out section
    # that continues beyond this chunk of the file.
    '''
# NOTE(review): `ax`, `np`, `plt`, and `pr` come from earlier in the file
# (a 3-D axes object and the usual numpy/matplotlib/pearsonr imports).
ax.set_xlabel('Time (s)'), ax.set_ylabel('Real part'), ax.set_zlabel(
    'Imag part')
ax.set_title('Complex sine wave in all its 3D glory')
plt.show()

# two vectors
v1 = [1, 7, 5, 1, 4, 0, 6, 8, 1, 8]
v2 = [10, 85, 35, 15, 55, 5, 72, 81, 13, 92]

# compute the dot product (scaled down by 1000)
dp = sum(np.multiply(v1, v2)) / 1000
print('The dot product is', dp)

# pearson correlation
# NOTE(review): this correlates v2 with itself, which is always 1.0 --
# v1 vs v2 may have been intended; confirm.
corr, dp1 = pr(v2, v2)
corr  # bare expression: only displays in a notebook cell

# dot products of sine waves
# general simulation parameters
srate = 500  # sampling rate in Hz
time = np.arange(0., 2., 1. / srate)  # time in seconds (shadows stdlib `time`)

# sine wave parameters
freq1 = 5  # frequency in Hz
freq2 = 5  # frequency in Hz
def split_on_best_feature(self, dataX, dataY):
    """Recursively build a decision-tree ndarray by splitting on the feature
    with the highest absolute Pearson correlation with dataY.

    Each row of the returned array is
    ``[feature_index (-1 for a leaf), split_value_or_leaf_label,
       left_offset, right_offset]``; offsets are relative row counts.

    Rules (Primary Resource No. 2):
      * the split value is the median of the chosen feature;
      * if the best feature cannot split the data into two non-empty
        parts, the next-best feature is tried;
      * if no feature can split the data, a leaf is returned, labelled
        with the most common dataY value.
    """
    # Base case: few enough samples -> leaf labelled with the mode of dataY.
    if dataX.shape[0] <= self.leaf_size:
        return np.array([-1, cntr(dataY).most_common(1)[0][0],
                         np.nan, np.nan])

    availble_features = range(dataX.shape[1])
    availble_LIST_of_features = list(availble_features)

    # (feature index, |pearson r with dataY|); NaN correlations (e.g. a
    # constant column) are treated as 0.0.
    feature_correlations = []
    for ftr_itr in range(dataX.shape[1]):
        absolute_correlation_value = abs(pr(dataX[:, ftr_itr], dataY)[0])
        if np.isnan(absolute_correlation_value):
            absolute_correlation_value = 0.0
        feature_correlations.append((ftr_itr, absolute_correlation_value))

    # BUG FIX: the original called sorted() on the still-EMPTY list before
    # the fill loop above, so candidates were effectively tried in index
    # order.  Sort descending so the best-correlated feature is tried first.
    feature_correlations.sort(key=lambda fc: fc[1], reverse=True)

    feature_Correlation_temp = 0
    if len(availble_LIST_of_features) == 0:
        # No features at all -> leaf.
        return np.array([-1, cntr(dataY).most_common(1)[0][0],
                         np.nan, np.nan])
    else:
        # Try candidates best-first until one actually splits the data.
        while len(availble_LIST_of_features) - 1 >= 0:
            best_feature_itr = feature_correlations[
                feature_Correlation_temp][0]
            y = best_feature_itr
            # Split at the median of the candidate feature.
            split_val = np.median(dataX[:, y])
            left_index = dataX[:, y] <= split_val
            right_index = dataX[:, y] > split_val
            # Both sides non-empty -> usable split; stop searching.
            if len(np.unique(left_index)) != 1:
                break
            # Candidate failed to split; discard it and try the next one.
            availble_LIST_of_features.remove(y)
            feature_Correlation_temp = feature_Correlation_temp + 1
    # Every feature was tried and none could split the data -> leaf.
    if len(availble_LIST_of_features) == 0:
        return np.array([-1, cntr(dataY).most_common(1)[0][0],
                         np.nan, np.nan])

    # Recurse into the left partition first.
    lefttree = self.split_on_best_feature(dataX[left_index],
                                          dataY[left_index])
    # Row offset from the root to the start of the right subtree.
    if lefttree.ndim == 1:
        # Left subtree is a single leaf row.
        righttree_start = 2
    elif lefttree.ndim >= 2:
        righttree_start = lefttree.shape[0] + 1
    root = np.array([best_feature_itr, split_val, 1, righttree_start])
    return np.vstack((root, lefttree,
                      self.split_on_best_feature(dataX[right_index],
                                                 dataY[right_index])))
def calc_pearson(pred, true):
    """Return the Pearson correlation between two sequences.

    Falls back to -1.0 when ``pearsonr`` rejects the inputs with a
    ValueError (e.g. mismatched lengths).
    """
    try:
        coeff, _ = pr(np.asarray(pred), np.asarray(true))
    except ValueError:
        return -1.0
    return coeff
#!/usr/bin/env python # coding: utf-8 # In[173]: import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.stats import pearsonr as pr data = pd.read_csv('../input/train.csv') pr(data.Fare, data.Pclass) plt.style.use('bmh') plt.xlabel('Age') plt.ylabel('Survived') plt.title('Age vs Survival') plt.hist(data.Age[(np.isnan(data.Age) == False)], bins=15, alpha=0.4, color='r', label='Before') plt.hist(data.Age[(np.isnan(data.Age) == False) & (data.Survived == 1)], bins=15, alpha=0.4, color='b', label='After') #plt.hist(data.Age[data.Age != np.NaN]) plt.legend(loc='upper right') plt.show() # In[181]:
def run_model(self):
    """Cross-validated training of a boosted-tree regressor whose training
    folds are augmented with synthetic alloys sampled from an autoencoder
    fitted on that fold's training data.  The last column of ``self.X`` is
    treated as the (scaled) target.
    """
    # Cross-validation splitter: leave-one-out or shuffled k-fold.
    if self.cv == 'loo':
        cv = LeaveOneOut()
    else:
        cv = KFold(n_splits=self.cv, shuffle=True)
    # Per-fold metric accumulators.
    self.rmse_cv_train = []
    self.r2_cv_train = []
    self.rmse_cv_test = []
    self.r2_cv_test = []
    self.pr_cv_train = []
    self.pr_cv_test = []
    est = {'lightgbm': lgb.LGBMRegressor,
           'catboost': catboost.CatBoostRegressor,
           'xgboost': xgboost.XGBRegressor}
    self.model = []
    self.gen_sample = []
    model = est[self.package](**self.parameters)
    for n, (tr_id, ts_id) in enumerate(cv.split(self.X)):
        print('Running Validation {} of {}'.format(n, self.cv))
        # Fit the autoencoder on the training fold only, so generated
        # samples never leak test-fold information.
        ae = AutoEncoder(arch=self.vae_arch, X=self.X[tr_id], loss='xent',
                         epochs=2000)
        ae.build_model()
        # Keep sampling until a generated batch passes validation
        # (validate_xgen presumably returns None on rejection -- confirm).
        X_gen = None
        while X_gen is None:
            if self.gen_n is not None:
                X_gen = self.validate_xgen(generated_X=
                    ae.get_random_alloy(n_samples=self.gen_n))
            if self.gen_per_direction is not None:
                X_gen = self.validate_xgen(generated_X=
                    ae.get_linspace_alloy(
                        n_range=(-3, 3),
                        n_sample_per_direction=self.gen_per_direction))
        self.gen_sample.append(self.scale.inverse_transform(X_gen))
        # Recover unscaled targets from the last column.
        y_gen = self.scale.inverse_transform(X_gen)[:, -1]
        y_orig = self.scale.inverse_transform(self.X[tr_id])[:, -1]
        y_ts = self.scale.inverse_transform(self.X[ts_id])[:, -1]
        # Strip the target column and stack real + generated rows.
        X_gen = X_gen[:, :-1]
        X_orig = self.X[tr_id][:, :-1]
        X_tr = np.vstack([X_orig, X_gen])
        y_tr = np.concatenate([y_orig, y_gen])
        # Fit with early stopping on the held-out fold.
        if self.package == 'lightgbm':
            self.model.append(model.fit(X_tr, y_tr,
                eval_set=[(self.X[ts_id][:, :-1], y_ts)],
                eval_metric='rmse', early_stopping_rounds=20,
                feature_name=self.feature_names))
        elif self.package == 'xgboost':
            self.model.append(model.fit(X_tr, y_tr,
                eval_set=[(self.X[ts_id][:, :-1], y_ts)],
                eval_metric='rmse', early_stopping_rounds=20))
        else:
            self.model.append(model.fit(X_tr, y_tr,
                eval_set=[(self.X[ts_id][:, :-1], y_ts)],
                early_stopping_rounds=20))
        if self.package == 'lightgbm':
            # NOTE(review): this predicts on the AUGMENTED X_tr while
            # y_cv_tr below holds only the original rows, so the train
            # RMSE call would compare arrays of different lengths for
            # lightgbm -- verify against actual usage.
            self.y_cv_tr_pred = self.model[-1].predict(X_tr,
                num_iteration=self.model[-1].best_iteration_)
            self.y_cv_ts_pred = self.model[-1].predict(
                self.X[ts_id][:, :-1],
                num_iteration=self.model[-1].best_iteration_)
        else:
            # Non-lightgbm branch predicts on the original (non-augmented)
            # training rows.
            self.y_cv_tr_pred = self.model[-1].predict(
                self.X[tr_id][:, :-1])
            self.y_cv_ts_pred = self.model[-1].predict(
                self.X[ts_id][:, :-1])
        self.y_cv_tr = y_orig
        self.y_cv_ts = y_ts
        # Fold metrics.
        self.rmse_cv_train.append(np.sqrt(mean_squared_error(
            self.y_cv_tr_pred, self.y_cv_tr)))
        self.rmse_cv_test.append(np.sqrt(mean_squared_error(
            self.y_cv_ts_pred, self.y_cv_ts)))
        self.r2_cv_train.append(linregress(self.y_cv_tr_pred,
                                           self.y_cv_tr)[2]**2)
        self.r2_cv_test.append(linregress(self.y_cv_ts_pred,
                                          self.y_cv_ts)[2]**2)
        self.pr_cv_train.append(pr(self.y_cv_tr_pred, self.y_cv_tr))
        self.pr_cv_test.append(pr(self.y_cv_ts_pred, self.y_cv_ts))
    # Summary statistics over all folds.
    self.N_dp = len(self.X[:, -1])
    self.rmse_mean_train = np.mean(self.rmse_cv_train)
    self.rmse_std_train = np.std(self.rmse_cv_train)
    self.rmse_mean_test = np.mean(self.rmse_cv_test)
    self.rmse_std_test = np.std(self.rmse_cv_test)
    self.r2_mean_train = np.mean(self.r2_cv_train)
    self.r2_std_train = np.std(self.r2_cv_train)
    self.r2_mean_test = np.mean(self.r2_cv_test)
    self.r2_std_test = np.std(self.r2_cv_test)
    # pearsonr returns (r, p); keep only r for the summaries.
    self.pr_mean_train = np.mean([i[0] for i in self.pr_cv_train])
    self.pr_std_train = np.std([i[0] for i in self.pr_cv_train])
    self.pr_mean_test = np.mean([i[0] for i in self.pr_cv_test])
    self.pr_std_test = np.std([i[0] for i in self.pr_cv_test])
def run_reg(self):
    """Fit the selected sklearn regressor on the full data as the master
    model, then validate it with leave-one-out or k-fold CV, collecting
    RMSE / MAE / R^2 / Pearson-r statistics for train and test folds
    (optionally also on rupture time CT_RT recovered from the LMP).
    """
    # Resolve the estimator class lazily by short name.
    if self.estimator == 'MLP':
        est = import_module('sklearn.neural_network')
        estimator = getattr(est, 'MLPRegressor')
    if self.estimator == 'LR':
        est = import_module('sklearn.linear_model')
        estimator = getattr(est, 'LinearRegression')
    if self.estimator == 'RF':
        est = import_module('sklearn.ensemble')
        estimator = getattr(est, 'RandomForestRegressor')
    if not self.estimator_param:
        estimator = estimator()
    else:
        estimator = estimator(**self.estimator_param)
    print('Fitting the master model. Hang tight!')
    self.model = estimator.fit(self.X, self.y)
    # Model Validation
    print('Initializing validation.')
    if self.validation == 'leave_one_out':
        val = getattr(import_module('sklearn.model_selection'),
                      'LeaveOneOut')()
    else:
        # e.g. a value like '5-fold' -> KFold(n_splits=5).
        val = getattr(import_module('sklearn.model_selection'),
                      'KFold')(n_splits=int(self.validation.split('-')[0]))
    # Per-fold metric accumulators and pooled predictions.
    self.rmse_train = []
    self.rmse_test = []
    self.mae_train = []
    self.mae_test = []
    self.r2_train = []
    self.r2_test = []
    self.pr_train = []
    self.pr_test = []
    self.y_true_train = []
    self.y_pred_train = []
    self.y_true_test = []
    self.y_pred_test = []
    if self.CT_RT is not None:
        self.rmse_CT_RT_train = []
        self.rmse_CT_RT_test = []
        self.mae_CT_RT_train = []
        self.mae_CT_RT_test = []
        self.r2_CT_RT_train = []
        self.r2_CT_RT_test = []
        self.pr_CT_RT_train = []
        self.pr_CT_RT_test = []
    for n, (tr_id, ts_id) in enumerate(val.split(self.y)):
        print('Running validation model no. {}'.format(n + 1))
        XTR, XTS, YTR = self.X[tr_id], self.X[ts_id], self.y[tr_id]
        # NOTE(review): refits the same estimator instance used for the
        # master model above; self.model ends up holding the last fold's
        # fit -- confirm this is intended.
        temp_model = estimator.fit(XTR, YTR)
        y_true = self.y[ts_id]
        y_pred = temp_model.predict(XTS)
        y_pred_train = temp_model.predict(XTR)
        # Pool predictions across folds for later plotting/analysis.
        self.y_true_train.extend(YTR)
        self.y_pred_train.extend(y_pred_train)
        self.y_true_test.extend(y_true)
        self.y_pred_test.extend(y_pred)
        # Fold metrics on the primary target.
        self.rmse_train.append(
            np.sqrt(mean_squared_error(y_pred_train, YTR)))
        self.rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_true)))
        self.mae_train.append(mean_absolute_error(y_pred_train, YTR))
        self.mae_test.append(mean_absolute_error(y_pred, y_true))
        self.r2_train.append(linregress(y_pred_train, YTR)[2]**2)
        self.r2_test.append(linregress(y_pred, y_true)[2]**2)
        self.pr_train.append(pr(y_pred_train.reshape(-1),
                                YTR.reshape(-1)))
        self.pr_test.append(pr(y_pred.reshape(-1), y_true.reshape(-1)))
        if self.CT_RT is not None:
            # Invert the Larson-Miller parameter back to rupture time and
            # score against the measured CT_RT.
            CT_RT_train_pred = np.exp((y_pred_train * 1000 /
                                       self.CT_Temp[tr_id]) -
                                      self.C[tr_id])
            CT_RT_train_true = self.CT_RT[tr_id]
            CT_RT_test_pred = np.exp((y_pred * 1000 /
                                      self.CT_Temp[ts_id]) -
                                     self.C[ts_id])
            CT_RT_test_true = self.CT_RT[ts_id]
            self.rmse_CT_RT_train.append(
                np.sqrt(
                    mean_squared_error(CT_RT_train_pred, CT_RT_train_true)))
            self.rmse_CT_RT_test.append(
                np.sqrt(
                    mean_squared_error(CT_RT_test_pred, CT_RT_test_true)))
            self.mae_CT_RT_train.append(
                mean_absolute_error(CT_RT_train_pred, CT_RT_train_true))
            self.mae_CT_RT_test.append(
                mean_absolute_error(CT_RT_test_pred, CT_RT_test_true))
            self.r2_CT_RT_train.append(
                linregress(CT_RT_train_pred, CT_RT_train_true)[2]**2)
            self.r2_CT_RT_test.append(
                linregress(CT_RT_test_pred, CT_RT_test_true)[2]**2)
            self.pr_CT_RT_train.append(
                pr(CT_RT_train_pred.reshape(-1),
                   CT_RT_train_true.reshape(-1)))
            self.pr_CT_RT_test.append(
                pr(CT_RT_test_pred.reshape(-1),
                   CT_RT_test_true.reshape(-1)))
    # Summary statistics over all folds.
    self.rmse_train_mean = np.mean(self.rmse_train)
    self.rmse_train_std = np.std(self.rmse_train)
    self.mae_train_mean = np.mean(self.mae_train)
    self.mae_train_std = np.std(self.mae_train)
    self.r2_train_mean = np.mean(self.r2_train)
    self.r2_train_std = np.std(self.r2_train)
    # NOTE(review): pr_* lists hold (r, p) tuples; the means/stds below
    # are therefore over both components -- confirm against consumers.
    self.pr_train_mean = np.mean(self.pr_train)
    self.pr_train_std = np.std(self.pr_train)
    if self.CT_RT is not None:
        self.rmse_CT_RT_train_mean = np.mean(self.rmse_CT_RT_train)
        self.rmse_CT_RT_train_std = np.std(self.rmse_CT_RT_train)
        self.mae_CT_RT_train_mean = np.mean(self.mae_CT_RT_train)
        self.mae_CT_RT_train_std = np.std(self.mae_CT_RT_train)
        self.r2_CT_RT_train_mean = np.mean(self.r2_CT_RT_train)
        self.r2_CT_RT_train_std = np.std(self.r2_CT_RT_train)
        self.pr_CT_RT_train_mean = np.mean(self.pr_CT_RT_train)
        self.pr_CT_RT_train_std = np.std(self.pr_CT_RT_train)
    self.rmse_test_mean = np.mean(self.rmse_test)
    self.rmse_test_std = np.std(self.rmse_test)
    self.mae_test_mean = np.mean(self.mae_test)
    self.mae_test_std = np.std(self.mae_test)
    self.r2_test_mean = np.mean(self.r2_test)
    self.r2_test_std = np.std(self.r2_test)
    self.pr_test_mean = np.mean(self.pr_test)
    self.pr_test_std = np.std(self.pr_test)
    if self.CT_RT is not None:
        self.rmse_CT_RT_test_mean = np.mean(self.rmse_CT_RT_test)
        self.rmse_CT_RT_test_std = np.std(self.rmse_CT_RT_test)
        self.mae_CT_RT_test_mean = np.mean(self.mae_CT_RT_test)
        self.mae_CT_RT_test_std = np.std(self.mae_CT_RT_test)
        self.r2_CT_RT_test_mean = np.mean(self.r2_CT_RT_test)
        self.r2_CT_RT_test_std = np.std(self.r2_CT_RT_test)
        self.pr_CT_RT_test_mean = np.mean(self.pr_CT_RT_test)
        self.pr_CT_RT_test_std = np.std(self.pr_CT_RT_test)
import csv
import math
from scipy.stats import pearsonr as pr


def _read_components(path):
    """Read a 3-column CSV of x/y/z values and return (x, y, z, magnitude)
    lists, where magnitude is the per-row Euclidean norm."""
    xs, ys, zs, mags = [], [], [], []
    with open(path) as csf:
        for row in csv.reader(csf, delimiter=','):
            x_val, y_val, z_val = float(row[0]), float(row[1]), float(row[2])
            xs.append(x_val)
            ys.append(y_val)
            zs.append(z_val)
            mags.append(math.sqrt(x_val**2 + y_val**2 + z_val**2))
    return xs, ys, zs, mags


# The original duplicated the whole reading loop for both files; the helper
# above removes that duplication while producing the same lists.
x, y, z, total = _read_components('prasanna to last cub alvina 5.csv')
x1, y1, z1, total1 = _read_components('prasanna to last cub alvina 3.csv')

# Component-wise and magnitude (r, p-value) correlations between the two
# recordings.
print(pr(x, x1))
print(pr(y, y1))
print(pr(z, z1))
print(pr(total, total1))
import statistics as stcs  # NOTE(review): imported but unused in this chunk
from scipy.stats import pearsonr as pr
import pandas as pd

# S&P 500 daily data, indexed by date.
stock = pd.read_csv("^GSPC.csv", index_col="Date")

# Summary statistics of the closing price.
print(stock["Close"].mean())
print(stock["Close"].std())
print(stock["Close"].skew())
# Pearson (r, p-value) between closing price and traded volume.
print(pr(stock["Close"], stock["Volume"]))
# NOTE(review): these appends appear to close a loop over `key` that starts
# before this chunk -- indentation reconstructed from context; confirm.
alldata['TM'].append(tmdic[key])
alldata['QA'].append(qadic[key])

df = pd.DataFrame(alldata)
# 3x6 grid of per-target scatter plots with shared axes.
fig, axs = plt.subplots(3, 6, sharex=True, sharey=True)
plt.xticks(np.arange(0, 1.2, 0.2))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlim(0, 1)
plt.ylim(0, 0.7)
row = col = 0
for target in targets:
    tdf = df.loc[df['TARGET'] == target]
    # Least-squares line and Pearson correlation of QA vs TM score.
    m, b = np.polyfit(list(tdf['TM']), list(tdf['QA']), 1)
    pcc = pr(list(tdf['TM']), list(tdf['QA']))
    x = np.arange(0, 1, 0.01)
    axs[row][col].plot(x, m * x + b)
    sb.scatterplot(x='TM', y='QA', data=tdf, s=5, ax=axs[row][col])
    axs[row][col].set_ylabel('')
    axs[row][col].set_xlabel('')
    axs[row][col].set_title('{t} - PCC:{p}'.format(t=target,
                                                   p=round(pcc[0], 3)),
                            fontsize=8)
    # Advance across the 6 columns, wrapping to the next row.
    if col < 5:
        col += 1
    else:
        col = 0
        row += 1
fig.text(0.5, 0.04, 'TM score', ha='center', fontsize=12)
def run_model(self):
    """Repeated random train/test split evaluation of a boosted-tree model,
    collecting RMSE / R^2 / Pearson-r statistics over ``self.nrun`` splits
    (optionally also on rupture time CT_RT recovered from the LMP).
    """
    # Supported boosting back-ends, selected via self.package.
    est = {
        'lightgbm': lgb.LGBMRegressor,
        'catboost': catboost.CatBoostRegressor,
        'xgboost': xgboost.XGBRegressor
    }
    # Per-run metric accumulators.
    self.rmse_cv_train = []
    self.r2_cv_train = []
    self.rmse_cv_test = []
    self.r2_cv_test = []
    self.pr_cv_train = []
    self.pr_cv_test = []
    if self.CT_RT is not None:
        self.rmse_CT_RT_cv_train = []
        self.r2_CT_RT_cv_train = []
        self.rmse_CT_RT_cv_test = []
        self.r2_CT_RT_cv_test = []
        self.pr_CT_RT_cv_train = []
        self.pr_CT_RT_cv_test = []
    for i in np.arange(self.nrun):
        if self.CT_RT is not None:
            # Split features, target, rupture time, temperature and the
            # LMP constant consistently.
            data = train_test_split(self.X, self.y, self.CT_RT,
                                    self.CT_Temp, self.C,
                                    test_size=self.test_size)
            Xtr, Xts, ytr, yts = data[0], data[1], data[2], data[3]
            CT_RTtr, CT_RTts, CT_Temptr = data[4], data[5], data[6]
            CT_Tempts, Ctr, Cts = data[7], data[8], data[9]
            del data
        else:
            Xtr, Xts, ytr, yts = train_test_split(
                self.X, self.y, test_size=self.test_size)
        # Fresh model per run, with early stopping on the test split.
        model = est[self.package](**self.parameters)
        if self.package == 'lightgbm':
            model.fit(Xtr, ytr,
                      eval_set=[(Xts, yts)],
                      eval_metric='rmse',
                      early_stopping_rounds=20,
                      feature_name=self.feature_names)
        elif self.package == 'xgboost':
            model.fit(Xtr, ytr,
                      eval_set=[(Xts, yts)],
                      eval_metric='rmse',
                      early_stopping_rounds=20)
        else:
            model.fit(Xtr, ytr,
                      eval_set=[(Xts, yts)],
                      early_stopping_rounds=20)
        if self.package == 'lightgbm':
            self.y_cv_tr_pred = model.predict(
                Xtr, num_iteration=model.best_iteration_)
            self.y_cv_ts_pred = model.predict(
                Xts, num_iteration=model.best_iteration_)
            if self.model_scheme == 'LMP':
                # Invert the Larson-Miller parameter to rupture time.
                # NOTE(review): CT_Temptr/Ctr are only defined when
                # self.CT_RT is not None -- LMP with CT_RT=None would
                # raise NameError here; confirm intended usage.
                self.CT_RT_cv_tr_pred = np.exp((self.y_cv_tr_pred * 1000 /
                                                CT_Temptr) - Ctr)
                self.CT_RT_cv_ts_pred = np.exp((self.y_cv_ts_pred * 1000 /
                                                CT_Tempts) - Cts)
        else:
            self.y_cv_tr_pred = model.predict(Xtr)
            self.y_cv_ts_pred = model.predict(Xts)
            if self.model_scheme == 'LMP':
                self.CT_RT_cv_tr_pred = np.exp((self.y_cv_tr_pred * 1000 /
                                                CT_Temptr) - Ctr)
                self.CT_RT_cv_ts_pred = np.exp((self.y_cv_ts_pred * 1000 /
                                                CT_Tempts) - Cts)
        self.y_cv_tr = ytr
        self.y_cv_ts = yts
        if self.CT_RT is not None:
            self.CT_RT_cv_tr = CT_RTtr
            self.CT_RT_cv_ts = CT_RTts
        # Per-run metrics on the primary target.
        self.rmse_cv_train.append(
            np.sqrt(mean_squared_error(self.y_cv_tr_pred, ytr)))
        self.rmse_cv_test.append(
            np.sqrt(mean_squared_error(self.y_cv_ts_pred, yts)))
        self.r2_cv_train.append(linregress(self.y_cv_tr_pred, ytr)[2]**2)
        self.r2_cv_test.append(linregress(self.y_cv_ts_pred, yts)[2]**2)
        self.pr_cv_train.append(pr(self.y_cv_tr_pred, ytr))
        self.pr_cv_test.append(pr(self.y_cv_ts_pred, yts))
        if self.CT_RT is not None:
            # Same metrics on rupture time.
            self.rmse_CT_RT_cv_train.append(
                np.sqrt(mean_squared_error(self.CT_RT_cv_tr_pred,
                                           CT_RTtr)))
            self.rmse_CT_RT_cv_test.append(
                np.sqrt(mean_squared_error(self.CT_RT_cv_ts_pred,
                                           CT_RTts)))
            self.r2_CT_RT_cv_train.append(
                linregress(self.CT_RT_cv_tr_pred, CT_RTtr)[2]**2)
            self.r2_CT_RT_cv_test.append(
                linregress(self.CT_RT_cv_ts_pred, CT_RTts)[2]**2)
            self.pr_CT_RT_cv_train.append(
                pr(self.CT_RT_cv_tr_pred, CT_RTtr))
            self.pr_CT_RT_cv_test.append(pr(self.CT_RT_cv_ts_pred,
                                            CT_RTts))
    # Sample counts (train/test sizes of the last split).
    self.N_dp = len(self.y)
    self.N_dp_train = len(ytr)
    self.N_dp_test = len(yts)
    # Summary statistics over all runs.
    self.rmse_mean_train = np.mean(self.rmse_cv_train)
    self.rmse_std_train = np.std(self.rmse_cv_train)
    self.rmse_mean_test = np.mean(self.rmse_cv_test)
    self.rmse_std_test = np.std(self.rmse_cv_test)
    self.r2_mean_train = np.mean(self.r2_cv_train)
    self.r2_std_train = np.std(self.r2_cv_train)
    self.r2_mean_test = np.mean(self.r2_cv_test)
    self.r2_std_test = np.std(self.r2_cv_test)
    # pearsonr returns (r, p); keep only r for the summaries.
    self.pr_mean_train = np.mean([i[0] for i in self.pr_cv_train])
    self.pr_std_train = np.std([i[0] for i in self.pr_cv_train])
    self.pr_mean_test = np.mean([i[0] for i in self.pr_cv_test])
    self.pr_std_test = np.std([i[0] for i in self.pr_cv_test])
    if self.CT_RT is not None:
        self.rmse_CT_RT_mean_train = np.mean(self.rmse_CT_RT_cv_train)
        self.rmse_CT_RT_std_train = np.std(self.rmse_CT_RT_cv_train)
        self.rmse_CT_RT_mean_test = np.mean(self.rmse_CT_RT_cv_test)
        self.rmse_CT_RT_std_test = np.std(self.rmse_CT_RT_cv_test)
        self.r2_CT_RT_mean_train = np.mean(self.r2_CT_RT_cv_train)
        self.r2_CT_RT_std_train = np.std(self.r2_CT_RT_cv_train)
        self.r2_CT_RT_mean_test = np.mean(self.r2_CT_RT_cv_test)
        self.r2_CT_RT_std_test = np.std(self.r2_CT_RT_cv_test)
        self.pr_CT_RT_mean_train = np.mean(
            [i[0] for i in self.pr_CT_RT_cv_train])
        self.pr_CT_RT_std_train = np.std(
            [i[0] for i in self.pr_CT_RT_cv_train])
        self.pr_CT_RT_mean_test = np.mean(
            [i[0] for i in self.pr_CT_RT_cv_test])
        self.pr_CT_RT_std_test = np.std(
            [i[0] for i in self.pr_CT_RT_cv_test])
# NOTE(review): these appends appear to close a loop over `key` that starts
# before this chunk -- indentation reconstructed from context; confirm.
alldata['PQA'].append(pqadic[key])
alldata['GQA'].append(gqadic[key])

df = pd.DataFrame(alldata)
# 3x6 grid of per-target scatter plots with shared axes.
fig, axs = plt.subplots(3, 6, sharex=True, sharey=True)
plt.xticks(np.arange(0, 1.2, 0.2))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlim(0, 1)
plt.ylim(0, 0.7)
row = col = 0
for target in targets:
    tdf = df.loc[df['TARGET'] == target]
    # Fit line + Pearson correlation for PQA vs TM score.
    m, b = np.polyfit(list(tdf['TM']), list(tdf['PQA']), 1)
    pcc = pr(list(tdf['TM']), list(tdf['PQA']))
    x = np.arange(0, 1, 0.01)
    axs[row][col].plot(x, m * x + b)
    sb.scatterplot(x='TM', y='PQA', data=tdf, s=5, ax=axs[row][col])
    # Fit line + Pearson correlation for GQA vs TM score.
    # NOTE(review): pcc is overwritten here, so the subplot title below
    # reports only the GQA correlation -- confirm this is intended.
    m, b = np.polyfit(list(tdf['TM']), list(tdf['GQA']), 1)
    pcc = pr(list(tdf['TM']), list(tdf['GQA']))
    x = np.arange(0, 1, 0.01)
    axs[row][col].plot(x, m * x + b)
    sb.scatterplot(x='TM', y='GQA', data=tdf, s=5, ax=axs[row][col])
    axs[row][col].set_ylabel('')
    axs[row][col].set_xlabel('')
    axs[row][col].set_title('{t} - PCC:{p}'.format(t=target,
                                                   p=round(pcc[0], 3)),
                            fontsize=8)