def PLSCrossValidation(n_components, trainSet, validationSet):
    pls = PLSRegression(n_components=n_components)
    pls.fit(trainSet[predictorList], trainSet['Apps'])
    predictPls = pls.predict(validationSet[predictorList])
    different = predictPls.flatten() - validationSet['Apps']
    error_rate = np.mean(different ** 2)
    return error_rate
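A minimal usage sketch for the helper above, assuming `trainSet`, `validationSet`, and `predictorList` are already defined by the surrounding script; it sweeps the number of PLS components and keeps the one with the lowest validation MSE.

# Hypothetical sweep over component counts using PLSCrossValidation above;
# trainSet, validationSet and predictorList are assumed to exist already.
errors = {k: PLSCrossValidation(k, trainSet, validationSet) for k in range(1, 11)}
best_k = min(errors, key=errors.get)
print("best n_components:", best_k, "validation MSE:", errors[best_k])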
def fit(self, predictors, predictands, locations, log=False, **kwargs):
    self.locations = locations
    self.models = []
    self.n = predictors['n']
    id = 0
    for location in locations:
        X = extract_n_by_n(predictors, location, **kwargs)
        Y = predictands[:, id]
        if log:
            Y = np.log(Y)
        # pca = PCA(n_components='mle', whiten=True)
        model = PLSRegression(n_components=2)
        model = model.fit(X, Y)
        # components = pca.components_
        # pca.components_ = components
        self.models.append(model)
        print("pls: ", location, model.score(X, Y), model.x_loadings_.shape,
              np.argmax(model.x_loadings_, axis=0))
        id += 1
def build_model(X, y):
    # gbr = GradientBoostingRegressor(learning_rate=0.03, n_estimators=2000, max_depth=8, subsample=0.9)
    # rf = RandomForestRegressor(n_estimators=200)
    # lr = LinearRegression(fit_intercept=True)
    # knr = KNeighborsRegressor(n_neighbors=10, weights='uniform')
    # svr = SVR(C=5.0, kernel='linear')
    pls = PLSRegression(n_components=35)
    return pls.fit(X, y)
def Training(df, seed, yratio, xratio, index=1):
    snp_matrix = np.array(df.values)
    xdim, ydim = snp_matrix.shape
    ydimlist = list(range(0, ydim))
    xdimlist = list(range(0, xdim))
    random.seed(seed)
    random.shuffle(ydimlist)  # shuffle the individuals
    random.shuffle(xdimlist)  # shuffle the SNPs
    accuracy = 0
    snp_matrix_shuffle = np.copy(snp_matrix[:, ydimlist])
    snp_matrix_shuffle = np.copy(snp_matrix_shuffle[xdimlist, :])
    snp_matrix_train = snp_matrix_shuffle[:, 0:int(ydim * yratio)]
    snp_matrix_test = snp_matrix_shuffle[:, int(ydim * yratio):]
    snp_matrix_train_x = snp_matrix_train[0:int(xdim * xratio), :]
    snp_matrix_test_x = snp_matrix_test[0:int(xdim * xratio), :]
    for i in range(int(xdim * xratio), xdim):
        snp_matrix_train_y = snp_matrix_train[i, :]
        snp_matrix_test_y = snp_matrix_test[i, :]
        if index != 7:
            if index == 1:
                clf = AdaBoostClassifier(n_estimators=100)
            elif index == 2:
                clf = RandomForestClassifier(n_estimators=100)
            elif index == 3:
                clf = linear_model.LogisticRegression(C=1e5)
            elif index == 4:
                clf = svm.SVC(kernel='rbf')
            elif index == 5:
                clf = svm.SVC(kernel='poly')
            else:
                clf = svm.SVC(kernel='linear')
            clf = clf.fit(snp_matrix_train_x.T, snp_matrix_train_y)
            Y_pred = clf.predict(snp_matrix_test_x.T)
            prediction = snp_matrix_test_y - Y_pred
            wrong = np.count_nonzero(prediction)
            tmp = 1 - (wrong + 0.0) / len(prediction)
            print(tmp)
            accuracy += tmp
    accuracy = accuracy / (xdim - int(xdim * xratio))
    if index == 7:
        pls2 = PLSRegression(n_components=50, scale=False, max_iter=1000)
        snp_matrix_train_y = snp_matrix_train[int(xdim * xratio):, :]
        pls2.fit(snp_matrix_train_x.T, snp_matrix_train_y.T)
        snp_matrix_test_x = snp_matrix_test[0:int(xdim * xratio), :]
        snp_matrix_test_y = snp_matrix_test[int(xdim * xratio):, :]
        Y_pred = transform(pls2.predict(snp_matrix_test_x.T))
        prediction = snp_matrix_test_y - Y_pred.T
        xdim, ydim = prediction.shape
        wrong = np.count_nonzero(prediction)
        accuracy = 1 - wrong / (xdim * ydim + 0.0)
    return accuracy
def fit(predictors, predictands, log=False, **kwargs):
    model = PLSRegression(n_components=2)
    try:
        model.fit(predictors, predictands)
    except Exception:
        return None
    return model
def get_correlations(param, spec, wave):
    '''Returns correlations between spec and params by wavelengths'''
    # using PLS
    pls = PLSRegression(10)
    pls.fit(spec, param)
    # get correlations
    nparam = param.shape[1]
    cor = pls.coef_ * np.asarray([pls.x_std_] * nparam).T
    cor /= np.tile(pls.y_std_, (cor.shape[0], 1))
    return cor
class PLSPredictor:
    def __init__(self):
        self.pls2 = PLSRegression(n_components=2,
                                  scale=True,
                                  max_iter=500,
                                  tol=1e-06,
                                  copy=True)

    def predict(self, values):
        return self.pls2.predict(values)

    def train(self, measured_values, screen_points):
        self.pls2.fit(measured_values, screen_points)
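A brief usage sketch for the wrapper above, with hypothetical random data standing in for real measurements; it assumes predict returns the model output as written above.

# Hypothetical data: map 3 measured sensor values to 2-D screen coordinates.
measured_values = np.random.rand(100, 3)
screen_points = np.random.rand(100, 2)
predictor = PLSPredictor()
predictor.train(measured_values, screen_points)
print(predictor.predict(measured_values[:5]))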
def do_pls(X, Y):
    pls2 = PLSRegression(n_components=2)
    pls2.fit(X, Y)
    out = pls2.transform(X)
    print(out)
    print(out.shape)
    plt.title("PLS2")
    plt.xlabel("PL1")
    plt.ylabel("PL2")
    plt.grid()
    plt.scatter(out[:, 0], out[:, 1], c=Y, cmap='viridis')
    plt.savefig('pls.png', dpi=125)
def pls_approach():
    from sklearn.cross_decomposition import PLSRegression
    (X, Y), cities = pull_xy_data()
    pls = PLSRegression()
    pls.fit(X, Y)
    plsX, plsY = pls.transform(X, Y)
    plot(plsX, cities, ["Lat01", "Lat02", "Lat03"], ellipse_sigma=1)
    return "OK What Now?"
def __one_pls(self, cat):
    np.seterr(all='raise')
    lcat = np.zeros(self.train_set['labels'].size)
    lcat[self.train_set['labels'] != cat] = -1
    lcat[self.train_set['labels'] == cat] = +1
    pls = PLSRegression(n_components=2, scale=False)
    pls.fit(self.train_set['data'], lcat)
    return pls
def fit_base_model(classifiers, fully, dummyY, trainx, testx):
    """ Takes a list of classifiers and/or PLS regression and does dimension
    reduction by returning the predictions of the classifiers or first two
    scores of the PLS regression on bootstrapped subsamples of the data."""
    trainProbs = []
    testProbs = []
    iterations = 0
    for clf in classifiers:
        for i in range(clf[1]):
            iterations += 1
            print(iterations)
            print(clf[0])
            train_rows = np.random.choice(trainx.shape[0], round(trainx.shape[0] * base_prop), True)
            oob_rows = list(set(range(trainx.shape[0])) - set(train_rows))
            print(len(train_rows))
            print(len(oob_rows))
            x = trainx[train_rows, :]
            if clf[0] == 'PLS':
                y = dummyY[train_rows, :]
                mod = PLSRegression().fit(x, y)
                trainscores = mod.transform(trainx)
                testscores = mod.transform(testx)
                trainProbs.append(trainscores[:, 0])
                trainProbs.append(trainscores[:, 1])
                testProbs.append(testscores[:, 0])
                testProbs.append(testscores[:, 1])
            else:
                y = fully[train_rows]
                print('\t Fitting model...')
                mod = clf[0].fit(x, y)
                print('\t Predicting training results...')
                tpreds = mod.predict_proba(trainx)
                trainProbs.append(list(tpreds[:, 1]))
                print('\t Predicting test results...')
                testProbs.append(list(mod.predict_proba(testx)[:, 1]))
                print('\t OOB score: ' + str(log_loss(fully[oob_rows], tpreds[oob_rows, :])))
    return trainProbs, testProbs
def pls_regr(x, y):
    from sklearn.cross_decomposition import PLSRegression
    n = len(x[0])
    if n < 2:
        raise TypeError
    score = -999999999999
    pls = None
    '''
    for i in range(3, n):
        pls2 = PLSRegression(n_components=i)
        pls2.fit(x, y)
        cscore = pls2.score(x, y)
        #print i, cscore
        if cscore > score:
            pls = pls2
            score = cscore
    '''
    pls = PLSRegression(n_components=5)
    pls.fit(x, y)
    return pls
def train_PLSR(x_filename, y_filename, model_filename, n):
    """
    Train a PLSR model and save it to the model_filename.
    X and Y matrices are read from x_filename and y_filename.
    The no. of PLSR components is given by n.
    """
    X = loadMatrix(x_filename)[0].todense()
    Y = loadMatrix(y_filename)[0].todense()
    if X.shape[0] != Y.shape[0]:
        sys.stderr.write("X and Y must have equal number of rows!\n")
        raise ValueError
    sys.stderr.write("Learning PLSR...")
    startTime = time.time()
    pls2 = PLSRegression(copy=True, max_iter=10000, n_components=n, scale=True, tol=1e-06)
    pls2.fit(X, Y)
    model = open(model_filename, 'wb')  # pickle requires a binary file handle
    pickle.dump(pls2, model, 1)
    model.close()
    endTime = time.time()
    sys.stderr.write(" took %ss\n" % str(round(endTime - startTime, 2)))
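A possible counterpart for loading the model saved by train_PLSR; the function name predict_PLSR is illustrative, and the pickle is opened in binary mode to match the writer.

def predict_PLSR(x_filename, model_filename):
    # Illustrative loader: reads the pickled PLSRegression model written by
    # train_PLSR and applies it to a new X matrix.
    X = loadMatrix(x_filename)[0].todense()
    with open(model_filename, 'rb') as model_file:
        pls2 = pickle.load(model_file)
    return pls2.predict(X)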
def lex_function_learning(class_name, hyper_vec):
    # pls2 = KernelRidge(kernel="rbf", gamma=100)
    # pls2 = KernelRidge()
    pls2 = PLSRegression(n_components=50, max_iter=5000)
    X = extract_postive_features(train_dataset[class_name][0], train_dataset[class_name][1])
    Y = []
    for hypo_vec in X:
        sub = hyper_vec - hypo_vec
        Y.append(sub)  # Target = difference vector (Hypernym_vector - Hyponym_vector)
        # Y.append(hyper_vec)  # Target = Hypernym vector
    pls2.fit(X, Y)
    train_acc = pls2.score(X, Y)
    print("class =", class_name, "train len =", len(X))
    return pls2, train_acc, len(X)
def reduce_PLS(dataframe):
    PLS_file = "data/pls_structure.pickle"
    selectedcolumn = [x for x in dataframe.columns
                      if x not in ["id", "click", "device_id", "device_ip"]]
    X = np.array(dataframe[selectedcolumn])
    y = np.array(dataframe["click"])
    if os.path.exists(PLS_file):
        stand_PLS = pickle.load(open(PLS_file, 'rb'))
        print("PLS structure is loaded.")
    else:
        stand_PLS = PLSRegression(n_components=10, scale=True)
        stand_PLS.fit(X, y[:, np.newaxis])
        stand_PLS.y_scores_ = None
        stand_PLS.x_scores_ = None
        pickle.dump(stand_PLS, open(PLS_file, "wb"))
        print("PLS transform structure is stored.")
    T = stand_PLS.transform(X)
    print("PLS transformation is performed.")
    return T
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
X_atual = sc_X.transform(X_atual)

sc_y = StandardScaler()
y = y.reshape(-1, 1)
sc_y.fit(y)
y_train = y_train.reshape(-1, 1)
y_train = sc_y.transform(y_train)
y_test = y_test.reshape(-1, 1)
y_test = sc_y.transform(y_test)

# Linear regression (PLS)
from sklearn.cross_decomposition import PLSRegression
pls2 = PLSRegression(n_components=3, scale=False)
pls2.fit(X_train, y_train)

# Predicting the Test set results
y_pred = pls2.predict(X_test)
y_pred_atual = pls2.predict(X_atual)

# coefficients / error metrics
mean_squared_error(y_test, y_pred)
r2_score(y_test, y_pred)

df_atual = df[df['atletas.rodada_id'] == CURRENT_ROUND]
df_atual = df_atual[['atletas.apelido', 'atletas.clube.id.full.name', 'atletas.nome',
                     'atletas.posicao_id', 'atletas.rodada_id', 'atletas.preco_num',
                     'atletas.pontos_num_sum_last5', 'atletas.media_num']]
df_atual['pred_score'] = y_pred_atual
df_atual.to_csv('predictions/predict-PLS.csv', encoding='utf-8')
# plt.show()

# Partial Least Squares Regression
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import scale

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

# Performing cross-validation for PLS
mse = []
n = len(X_train_scaled)
kf_10 = cross_validation.KFold(n, n_folds=10, shuffle=True, random_state=0)
for i in np.arange(1, 17):
    plsregr = PLSRegression(n_components=i, scale=False)
    plsregr.fit(X_train_scaled, y_train)
    score = -1 * cross_validation.cross_val_score(plsregr, X_train_scaled, y_train,
                                                  cv=kf_10, scoring='mean_squared_error').mean()
    mse.append(score)

plt.plot(np.arange(1, 17), np.array(mse), '-v')
plt.title("PLS: MSE vs. Principal Components")
plt.xlabel('Number of principal components in PLS regression')
plt.ylabel('MSE')
plt.xlim((-0.2, 17.2))

# Based off of the plot, 12 principal components minimized MSE
plsregr_test = PLSRegression(n_components=12, scale=False)
plsregr_test.fit(X_train_scaled, y_train)
MSE_PLS = np.mean((plsregr_test.predict(X_test_scaled) - y_test) ** 2)
# print("Mean Squared Error: ", MSE_PLS)
def bestpls(vipMatrix, X, Y, V): ########################### #bestR2 = -10000 #lv_best = 1 #position = 1 ########################### bestR2 = vipMatrix[0][1] lv_best = vipMatrix[0][3] position = 0 ########################### #for i in range (len(vipMatrix)): # print vipMatrix[i] for entries in range (len(vipMatrix)): #print vipMatrix[entries][1], "=?=", bestR2 ############# if vipMatrix[entries][1] > bestR2: position = entries bestR2 = vipMatrix[entries][1] lv_best = vipMatrix[entries][3] #################################################################################################qq variables = [] for i in range (1, position): # not position + 1, as the vipMatrix[position] holds the next variable to be removed variables.append(vipMatrix[i][0]) #print "VAR TO BE REMOVED: ", variables V_new_Indices = [] for i in variables: # removed variable names in random order V_new_Indices.append(V.index(i)) #if V == sorted(V): # print "\nV ok!\n" # keep names == separate V_new = deepcopy(V) for i in variables: V_new.remove(i) X_new = [] for i in range (len(X)): X_new.append([]) variables_sent = [] #### for i in range (len(X)): for j in range (len(V)): if j not in V_new_Indices: #if V[j] not in variables_sent: #### # variables_sent.append(V[j])#### X_new[i].append(X[i][j]) # epic test if not V_new == sorted(V_new): return base64.b64encode("tobulo"), [], [], 0 #else: # print "v_new ok!" #validity tests #for i in range (len (variables_sent)): # if variables_sent[i] == V_new[i]: # print "ok", i #print "var: ", len(V), "selected: ", len(V_new), "data (var) init length: ", len(X[0]), "data (var) now length: ", len(X_new[0]) """ # PREVIOUS variables = [] for i in range (1, position): variables.append(vipMatrix[i][0]) V_new = deepcopy(V) for i in variables: V_new.remove(i) ################ remove by index??? CHECK!!!! X_new = [] for i in range (len(X)): X_new.append([]) for i in range (len(X)): for j in range (len(V_new)): ####### HERE ALSO X_new[i].append(X[i][j]) """ #################################################################################################qq #print V_new, "OOOO\n\n" #var names == cool #print "\n\nNumber of variables ", len(V_new), " and latent: ", lv_best #best_pls = PLSCanonical(n_components = lv_best) best_pls = PLSRegression(n_components = lv_best) best_pls.fit(X_new, Y) saveas = pickle.dumps(best_pls) encoded = base64.b64encode(saveas) return encoded, X_new, V_new, lv_best
General Linear Model -- Elastic Net
'''
clf = linear_model.ElasticNet(alpha=0.2, l1_ratio=0.01)
clf.fit(x_scaled, y_scaled)
print(clf.coef_)
yvalid_scaled = clf.predict(xvalid_scaled)
err1 = MAPE(y, scalery.inverse_transform(clf.predict(x_scaled)).reshape(-1, 1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1, 1))

'''
Partial Least Squares Regression
'''
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=20)
pls.fit(x_scaled, y_scaled)
print(pls.coef_)
yvalid_scaled = pls.predict(xvalid_scaled)
err1 = MAPE(y, scalery.inverse_transform(pls.predict(x_scaled)).reshape(-1, 1))
err = MAPE(yvalid, scalery.inverse_transform(yvalid_scaled).reshape(-1, 1))

from sklearn.decomposition import PCA
reduced_data = PCA(n_components=2).fit_transform(xtrain_minmax)
pca = PCA(n_components=2)
pca.fit(xtrain_minmax)
print(pca.explained_variance_ratio_)
def partial_least_squares(X_train, y_train, X_pred, store_settings, mod_params=None, metric=None):
    lr = PLSRegression(n_components=1, max_iter=1000, tol=1e-04)
    return lr.fit(X_train, y_train).predict(X_pred)
def plot_pcr_vs_pls():
    rng = np.random.RandomState(0)
    n_samples = 500
    cov = [[3, 3], [3, 4]]
    X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
    pca = PCA(n_components=2).fit(X)

    plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')
    for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
        comp = comp * var  # scale component by its variance explanation power
        plt.plot([0, comp[0]], [0, comp[1]], label=f"Component {i}",
                 linewidth=5, color=f"C{i + 2}")
    plt.gca().set(aspect='equal',
                  title="2-dimensional dataset with principal components",
                  xlabel='first feature', ylabel='second feature')
    plt.legend()
    plt.show()

    y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)
    axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y')
    axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)
    axes[1].set(xlabel='Projected data onto second PCA component', ylabel='y')
    plt.tight_layout()
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pcr.fit(X_train, y_train)
    pca = pcr.named_steps['pca']  # retrieve the PCA step of the pipeline
    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)

    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth')
    axes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3, label='predictions')
    axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y', title='PCR / PCA')
    axes[0].legend()
    axes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth')
    axes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3, label='predictions')
    axes[1].set(xlabel='Projected data onto first PLS component', ylabel='y', title='PLS')
    axes[1].legend()
    plt.tight_layout()
    plt.show()

    print(f"PCR r-squared {pcr.score(X_test, y_test):.3f}")
    print(f"PLS r-squared {pls.score(X_test, y_test):.3f}")

    pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
    pca_2.fit(X_train, y_train)
    print(f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}")
fold_number = min(fold_number, len(autoscaled_y_train))
autoscaled_y_train = pd.Series(autoscaled_y_train)
y_test = pd.Series(y_test)
autoscaled_x_train = pd.DataFrame(autoscaled_x_train)
autoscaled_x_test = pd.DataFrame(autoscaled_x_test)
plt.rcParams['font.size'] = 18  # font size for axis labels and tick text

for method in regression_methods:
    print(method)
    if method == 'pls':  # Partial Least Squares
        pls_components = np.arange(1, min(np.linalg.matrix_rank(autoscaled_x_train) + 1,
                                          max_pls_component_number + 1), 1)
        r2all = list()
        r2cvall = list()
        for pls_component in pls_components:
            pls_model_in_cv = PLSRegression(n_components=pls_component)
            pls_model_in_cv.fit(autoscaled_x_train, autoscaled_y_train)
            calculated_y_in_cv = np.ndarray.flatten(pls_model_in_cv.predict(autoscaled_x_train))
            estimated_y_in_cv = np.ndarray.flatten(
                model_selection.cross_val_predict(pls_model_in_cv, autoscaled_x_train,
                                                  autoscaled_y_train, cv=fold_number))
            """
            plt.figure(figsize=figure.figaspect(1))
            plt.scatter(y, estimated_y_in_cv)
            plt.xlabel("Actual Y")
            plt.ylabel("Calculated Y")
            plt.show()
            """
            r2all.append(float(1 - sum((autoscaled_y_train - calculated_y_in_cv) ** 2) /
                               sum(autoscaled_y_train ** 2)))
            r2cvall.append(float(1 - sum((autoscaled_y_train - estimated_y_in_cv) ** 2) /
                                 sum(autoscaled_y_train ** 2)))
        plt.plot(pls_components, r2all, 'bo-')
for j in range(len(random_split)):
    test = random_split[j]
    training_list = random_split[0:j] + random_split[j + 1:len(random_split)]
    training = pd.concat(training_list)
    X_train = training.drop(training.columns[dsloader.RESPONSE_COLUMN_INDEX_NO_PCA], axis=1)
    Y_train = training.iloc[:, dsloader.RESPONSE_COLUMN_INDEX_NO_PCA].values
    X_test = test.drop(test.columns[dsloader.RESPONSE_COLUMN_INDEX_NO_PCA], axis=1)
    Y_test = label_encoder.fit_transform(
        test.iloc[:, dsloader.RESPONSE_COLUMN_INDEX_NO_PCA].apply(transform_numeric_to_y))
    model = PLSRegression(n_components=f_num_candidate)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    predictions = label_encoder.fit_transform(
        np.array([transform_numeric_to_y(prediction) for prediction in predictions]))
    current_auc += roc_auc_score(Y_test, predictions)
current_auc /= len(random_split)
print("step {}: {} - {}".format(i + 1, f_num_candidate, current_auc))
results_df.iloc[0, i] = f_num_candidate
results_df.iloc[1, i] = current_auc
i += 1
class VM_Process2_시뮬레이터: metric = 0 def __init__(self, A, d, C, F, p_lambda, p_VM, p_ACT, seed): self.pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True) np.random.seed(seed) self.A = A self.d = d self.C = C self.F = F self.p_lambda = p_lambda self.p_VM = p_VM self.p_ACT = p_ACT self.real_ACT = [] # Process-1을 반영하는 실제 Actual 값 def sampling_up(self): # u1 = np.random.normal(0.4, np.sqrt(0.2)) # u2 = np.random.normal(0.6, np.sqrt(0.2)) u1 = np.random.normal(0.2, np.sqrt(0.1)) u2 = np.random.normal(0.1, np.sqrt(0.05)) u = np.array([u1, u2]) return u def sampling_vp(self): v1 = np.random.normal(-0.4, np.sqrt(0.2)) v2 = 2 * v1 v3 = np.random.uniform(0.2, 0.6) v4 = 3 * v3 v5 = np.random.uniform(0, 0.4) v = np.array([v1, v2, v3, v4, v5]) return v def sampling_ep(self): e1 = np.random.normal(0, np.sqrt(0.05)) e2 = np.random.normal(0, np.sqrt(0.1)) e = np.array([e1, e2]) return e def sampling(self, k, uk=np.array([0, 0]), vp=np.array([0, 0, 0, 0, 0]), ep=np.array([0, 0]), p_VM=np.array([0, 0]), p_ACT=np.array([0, 0]), isInit=True): u1 = uk[0] u2 = uk[1] u = uk v1 = vp[0] v2 = vp[1] v3 = vp[2] v4 = vp[3] v5 = vp[4] v = vp e = ep k1 = k % 150 k2 = k eta_k = np.array([[k1], [k2]]) if isInit == True: e = np.array([0, 0]) #DoE는 Sampling Actual이기 때문에 e가 없다. fp = p_ACT # DoE에서는 Process-1 Act 값 사용 else: fp = p_VM # VM에서는 Process-1 VM 값 사용 psi = np.array([u1, u2, v1, v2, v3, v4, v5, k1, k2]) if fp is not None: # Process-1의 입력값이 있다면 # 이건 왜 해놓은지 모르겠다.. R2R에서 결과를 도출하기 위해서인지, y를 Paremeter로 학습목적인지.. 추후 검증필요 if k % 10 == 0: f = p_ACT else: f = p_VM if isInit == True: f = p_ACT psi = np.r_[psi, f] # VM이든 DoE든 계산한다. y = u.dot(self.A) + v.dot(self.C) + np.sum( eta_k * self.d, axis=0) + f.dot(self.F) + e if isInit == False: #VM이 아닌 실제 ACT값을 별도로 계산한다. #print('f.dot(self.F) : ', f.dot(self.F), 'p_ACT.dot(self.F) : ', p_ACT.dot(self.F)) temp = u.dot(self.A) + v.dot(self.C) + np.sum( eta_k * self.d, axis=0) + p_ACT.dot(self.F) + e self.real_ACT.append(np.array([temp[0], temp[1]])) else: # Process-1의 입력값이 없고, 향후 Process-2 VM만 하고 싶을 때 y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + e rows = np.r_[psi, y] idx_end = len(rows) idx_start = idx_end - 2 return idx_start, idx_end, rows def pls_update(self, V, Y): self.pls.fit(V, Y) return self.pls def setDoE_Mean(self, DoE_Mean): self.DoE_Mean = DoE_Mean def getDoE_Mean(self): return self.DoE_Mean def setPlsWindow(self, PlsWindow): self.PlsWindow = PlsWindow def getPlsWindow(self): return self.PlsWindow def DoE_Run(self, lamda_PLS, Z, M, f): N = Z * M DoE_Queue = [] for k in range(1, N + 1): # range(101) = [1, 2, ..., 120]) if f is not None: fp = f[k - 1, 0:2] else: fp = None idx_start, idx_end, result = self.sampling(k, self.sampling_up(), self.sampling_vp(), self.sampling_ep(), None, fp, True) DoE_Queue.append(result) initplsWindow = DoE_Queue.copy() npPlsWindow = np.array(initplsWindow) plsWindow = [] # Process-1의 lamda_PLS는 이미 반영되어서 넘어오기 때문에, 중복되어 lamda_PLS를 반영할 필요가 없다. 
for z in np.arange(0, Z): if f is not None: npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start - 2] = lamda_PLS * npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start - 2] npPlsWindow[z * M:(z + 1) * M - 1, idx_start - 2:idx_start] = self.p_lambda * npPlsWindow[ z * M:(z + 1) * M - 1, idx_start - 2:idx_start] npPlsWindow[z * M:(z + 1) * M - 1, idx_start:idx_end] = lamda_PLS * (npPlsWindow[ z * M:(z + 1) * M - 1, idx_start:idx_end]) else: npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start] = lamda_PLS * npPlsWindow[ z * M:(z + 1) * M - 1, 0:idx_start] npPlsWindow[z * M:(z + 1) * M - 1, idx_start:idx_end] = lamda_PLS * (npPlsWindow[ z * M:(z + 1) * M - 1, idx_start:idx_end]) for i in range(len(npPlsWindow)): plsWindow.append(npPlsWindow[i]) npDoE_Queue = np.array(plsWindow) DoE_Mean = np.mean(npDoE_Queue, axis=0) plsModelData = npDoE_Queue - DoE_Mean V0 = plsModelData[:, 0:idx_start] Y0 = plsModelData[:, idx_start:idx_end] pls = self.pls_update(V0, Y0) y_prd = pls.predict(V0) + DoE_Mean[idx_start:idx_end] y_act = npDoE_Queue[:, idx_start:idx_end] #print("Init DoE VM Mean squared error: %.4f" % metrics.mean_squared_error(y_act[:,1:2], y_prd[:,1:2])) #print("Init DoE VM r2 score: %.4f" % metrics.r2_score(y_act[:,1:2], y_prd[:,1:2])) #print("pls : ", pls.coef_) self.setDoE_Mean(DoE_Mean) self.setPlsWindow(plsWindow) # self.plt_show1(N, y_act[:,0:1], y_prd[:,0:1]) def VM_Run(self, lamda_PLS, Z, M): N = Z * M ## V0, Y0 Mean Center DoE_Mean = self.getDoE_Mean() idx_end = len(DoE_Mean) idx_start = idx_end - 2 meanVz = DoE_Mean[0:idx_start] meanYz = DoE_Mean[idx_start:idx_end] M_Queue = [] ez_Queue = [] mape_Queue = [] ez_Queue.append([0, 0]) y_act = [] y_prd = [] VM_Output = [] ACT_Output = [] plsWindow = self.getPlsWindow() for z in np.arange(0, Z): for k in np.arange(z * M + 1, ((z + 1) * M) + 1): if self.p_VM[k - 1] is not None: idx_start, idx_end, result = self.sampling( k, self.sampling_up(), self.sampling_vp(), self.sampling_ep(), self.p_VM[k - 1], self.p_ACT[k - 1], False) else: idx_start, idx_end, result = self.sampling( k, self.sampling_up(), self.sampling_vp(), self.sampling_ep(), None, None, False) psiK = result[0:idx_start] psiKStar = psiK - meanVz y_predK = self.pls.predict(psiKStar.reshape( 1, idx_start)) + meanYz rows = np.r_[result, y_predK.reshape(2, )] M_Queue.append(rows) y_prd.append(rows[idx_end:idx_end + 2]) y_act.append(rows[idx_start:idx_end]) del plsWindow[0:M] ez = M_Queue[M - 1][idx_start:idx_end] - M_Queue[ M - 1][idx_end:idx_end + 2] ez_Queue.append(ez) if z == 0: ez = np.array([0, 0]) npVM_Queue = np.array(M_Queue) npACT_Queue = np.array(M_Queue) # for i in range(M): # VM_Output 구한다. lamda_pls 가중치를 반영하지 않는다. # if i == M - 1: # temp = npM_Queue[i:i + 1, idx_start:idx_end] # else: # temp = npM_Queue[i:i + 1, idx_end:idx_end + 2] # VM_Output.append(np.array([temp[0, 0], temp[0, 1]])) # Process-1의 lamda_PLS는 이미 반영되어서 넘어오기 때문에, 중복되어 lamda_PLS를 반영할 필요가 없다. 
if self.p_VM[z - 1] is not None: npVM_Queue[0:M - 1, 0:idx_start - 2] = lamda_PLS * npVM_Queue[0:M - 1, 0:idx_start - 2] npVM_Queue[0:M - 1, idx_start - 2:idx_start] = self.p_lambda * npVM_Queue[ 0:M - 1, idx_start - 2:idx_start] npVM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * ( npVM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez ) # + 0.5 * ez npVM_Queue = npVM_Queue[:, 0:idx_end] npACT_Queue[0:M - 1, 0:idx_start - 2] = lamda_PLS * npACT_Queue[0:M - 1, 0:idx_start - 2] npACT_Queue[0:M - 1, idx_start - 2:idx_start] = lamda_PLS * npACT_Queue[0:M - 1, idx_start - 2:idx_start] npACT_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * npACT_Queue[ 0:M - 1, idx_start:idx_end] npACT_Queue = npACT_Queue[:, 0: idx_end] ##idx_start ~ end 까지 VM 값 정리 else: npVM_Queue[0:M - 1, 0:idx_start] = lamda_PLS * npVM_Queue[0:M - 1, 0:idx_start] npVM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * ( npVM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez ) # + 0.5 * ez npVM_Queue = npVM_Queue[:, 0:idx_end] npACT_Queue[0:M - 1, 0:idx_start] = lamda_PLS * npACT_Queue[0:M - 1, 0:idx_start] npACT_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * npACT_Queue[ 0:M - 1, idx_start:idx_end] npACT_Queue = npACT_Queue[:, 0: idx_end] ##idx_start ~ end 까지 VM 값 정리 for i in range( M): #VM_Output 구한다. lamda_pls 가중치를 반영하여 다음 계산시 편리하게 한다. if i == M - 1: temp = npACT_Queue[i:i + 1, idx_start:idx_end] else: temp = npVM_Queue[i:i + 1, idx_start:idx_end] VM_Output.append(np.array([temp[0, 0], temp[0, 1]])) temp = npACT_Queue[i:i + 1, idx_start:idx_end] ACT_Output.append(np.array([temp[0, 0], temp[0, 1]])) for i in range(M): plsWindow.append(npVM_Queue[i]) M_Mean = np.mean(plsWindow, axis=0) meanVz = M_Mean[0:idx_start] meanYz = M_Mean[idx_start:idx_end] plsModelData = plsWindow - M_Mean V = plsModelData[:, 0:idx_start] Y = plsModelData[:, idx_start:idx_end] self.pls_update(V, Y) del M_Queue[0:M] #y_act = np.array(y_act) y_act = np.array(self.real_ACT) y_prd = np.array(y_prd) ez_all_run = y_act - y_prd self.metric = metrics.explained_variance_score(y_act[:, 1:2], y_prd[:, 1:2]) print("VM Mean squared error: %.4f" % metrics.mean_squared_error(y_act[:, 1:2], y_prd[:, 1:2])) print("explained_variance_score: %.4f" % self.metric) print("VM r2 score: %.4f" % metrics.r2_score(y_act[:, 1:2], y_prd[:, 1:2])) #print("pls : ", self.pls.coef_) ez_run = np.array(ez_Queue) VM_Output = np.array(VM_Output) ACT_Output = np.array(ACT_Output) return VM_Output, ACT_Output, ez_run, y_act, y_prd, ez_all_run
import pickle
import sys
import time

import pandas as pd
import matplotlib.pyplot as plt

n = sys.argv[1]
start = time.time()
with open(f"{n}.pickle", "rb") as f:
    datas = pickle.load(f)

models = {}
models["LASSO"] = LassoCV(max_iter=10000, cv=5, n_jobs=-1)
models["RIDGE"] = RidgeCV(cv=5)
models["EN"] = ElasticNetCV(max_iter=10000, cv=5, n_jobs=-1)
models["PLS20"] = PLSRegression(n_components=20, scale=False)

results = {}
for key in models.keys():
    model = models[key]
    results[key] = {"metrics": {}, "data": {}}
    for i in range(len(datas)):
        data = datas[i]
        q2 = cross_val_score(model, data['train_X'], data['train_Y'],
                             cv=5, scoring='r2').mean()
        model.fit(data["train_X"], data["train_Y"])
        predict_Y = model.predict(data["test_X"])
def pls(self, x, y, param_info):
    pls = PLSRegression(n_components=param_info.pls_compnum)
    # pls = PLSRegression(n_components=param_info.pls_compnum, max_iter=1000000)
    # pls = PLSSVD(n_components=param_info.pls_compnum)
    self.learned_pls = pls.fit(x, y)
# scale all samples according to training set
scaler = preprocessing.MinMaxScaler().fit(train_hydrogens)
train_hydrogens_normalized = scaler.transform(train_hydrogens)
test_hydrogens_normalized = scaler.transform(test_hydrogens)

# one hot encode training labels for plsda
train_labels_one_hot = []
for i in np.ravel(train_labels):
    if i == 0:
        train_labels_one_hot.append([1, 0])
    else:
        train_labels_one_hot.append([0, 1])
train_labels_one_hot = np.array(train_labels_one_hot)

plsda = PLSRegression(n_components=30, scale=False)
plsda.fit(train_hydrogens_normalized, train_labels_one_hot)
test_pred_ = plsda.predict(test_hydrogens_normalized)
test_pred = np.array([np.argmax(x) for x in test_pred_]).reshape(-1, 1)

cm = confusion_matrix(test_labels, test_pred)
auroc = roc_auc_score(test_labels, test_pred_[:, 1])
auroc_folds.append(auroc)
precision, recall, thresh = precision_recall_curve(test_labels, test_pred_[:, 1])
aupr = auc(recall, precision)
aupr_folds.append(aupr)
def stacklearning(self): class sparseNorm(BaseEstimator, TransformerMixin): def __init__(self): pass def fit(self, X, y=None): return self def transform(self, X): from sklearn import preprocessing Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values)) return Y fm = sgd.FMRegression( n_iter=4743, init_stdev=0.1, rank=100, l2_reg_w=0, l2_reg_V=0, step_size=0.1, ) fm = sgd.FMRegression( n_iter=9943, init_stdev=0.1, rank=219, l2_reg_w=0, l2_reg_V=0.06454, step_size=0.1, ) pipe = make_pipeline(sparseNorm(), fm) calcACC(pipe, X=X2) xgb = xgboost.XGBRegressor( n_estimators=100, max_depth=7, gamma=0, colsample_bytree=0.1 ) lgbm = LGBMRegressor( boosting_type='gbdt', num_leaves=367, learning_rate=0.06,feature_fraction=0.14, max_depth=28, min_data_in_leaf=8 ) rgf = RGFRegressor( max_leaf=1211, algorithm="RGF", test_interval=100, loss="LS", verbose=False, l2=0.93, min_samples_leaf=2 ) rf = RandomForestRegressor( max_depth=20, random_state=0, n_estimators=56,min_samples_split=2, max_features=0.21 ) rf = RandomForestRegressor() ext = ExtraTreesRegressor( n_estimators=384,max_features= 2228, min_samples_split= 0.01,max_depth= 856, min_samples_leaf= 1 ) svr = SVR( gamma=9.5367431640625e-07, epsilon=0.0009765625, C= 2048.0 ) #test combination desNew = make_pipeline(extdescriptorNew(),rf) morNew = make_pipeline(extMorganNew(),rf) kotNew = make_pipeline(extklekotaTothNew(),rf) macNew = make_pipeline(extMACCSNew(),rf) desMac = make_pipeline(extDescriptorMACCS(),rf) morMac = make_pipeline(extMorganMACCS(),rf) kotMac = make_pipeline(extKlekotaTothMACCS(),rf) morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf) des = make_pipeline(extOnlyDescriptor(),rf) mor = make_pipeline(extOnlyMorgan(),rf) kot = make_pipeline(extOnlyklekotaToth(),rf) mac = make_pipeline(extOnlyMACCS(),rf) all = make_pipeline(extAll(),rf) allwithoutNew = make_pipeline(extAllwithoutNew(),rf) allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf) allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf) testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,"MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,"Kot+MACCS":kotMac,"mor+kot+New":morKotNew, "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,"All":all,"All without " "new":allwithoutNew, "All without MACCS":allwithoutMaccs,"All without Des":allwithoutDes} #10fold cv = KFold(n_splits=10, shuffle=True, random_state=0) #Fingerprinttest resultDic={} resultDic2={} for name,model in testDic.items(): #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1) #calcACC(model,X=X,y=y2,name=name) Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc) RMSETmp = Scores['test_RMSE'].mean() CORRTmP = Scores['test_Correlation coefficient'].mean() resultDic.update({name:[RMSETmp,CORRTmP]}) print(name,RMSETmp,CORRTmP) #stacking alldata = make_pipeline(extAll()) # random forest #1.1546 0.70905 stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1) # Light Gradient boosting # 1.160732 0.703776 testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1) # XGboost # 1.1839805 0.689571 testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1) # Regularized greedily forest # 1.17050 0.6992 testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1) #pls 22.808047774809697 0.6410026452910016 i=4 for i in np.arange(3,11,1): pls = PLSRegression(n_components=i) testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0) calcACC(testmodel) pls = 
PLSRegression(n_components=4) #SVR svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592, epsilon=0.0009765625,) svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0) testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1) calcACC(svr) #Extratree 1.157420824123527 0.7061010221224269 testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1) calcACC(testmodel) #k-NN nbrs = KNeighborsRegressor(3) ##Linear regressions #Stochastic Gradient Descenta sgd = SGDRegressor(max_iter=1000) # Ridge for i in [1,10,100,1000]: ridge = Ridge(alpha=i) calcACC(ridge) ridge = Ridge(alpha=45.50940042350705) calcACC(ridge) # multiple linear lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1)) calcACC(lin) #stacking #0.69 testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1) #1.1532 0.70926 testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf, verbose=1) #1.16420 0.7041 testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1) #1.16379 0.7044 stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1) testmodel = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1) #1.1535496740699531 0.7108839199109559 pcaFeature = make_pipeline(extPCA()) testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf] ,meta_regressor=rf,verbose=1) #1.181801005432221 0.6889745579620922 testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf] ,meta_regressor=lgbm,verbose=1) #0.70613 testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext] ,meta_regressor=xgb,verbose=1) #0.71641717 testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext] ,meta_regressor=rf,verbose=1) #0.7146922 testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext] ,meta_regressor=rf,verbose=1) #new features pcaFeature = make_pipeline(extPCA()) #old pipe1 = make_pipeline(extMACCS(), rf) pipe2 = make_pipeline(extMorgan(), rf) pipe3 = make_pipeline(extDescriptor(), rf) pipe4 = make_pipeline(extPCA(), rgf) pipe7 =make_pipeline(extDescriptor(), rgf) pipe8 =make_pipeline(extDescriptor(), rgf) xgb = xgboost.XGBRegressor() nbrs = KNeighborsRegressor(2) svr = SVR(gamma='auto',kernel='linear') pls = PLSRegression(n_components=4) extMACCSdata = make_pipeline(extMACCS()) nbrsPipe = make_pipeline(extMorgan(), nbrs) pipe6 = make_pipeline(extMACCS(), rgf) alldata = make_pipeline(extAll()) ave = extAverage() withoutdesc = make_pipeline(extMACCS()) meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400) #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1) #0.70 stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1) #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1) #0.69###################### stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1) #0.70 stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1) #0.71 stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1) ########################### ########################### stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1) 
stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1) stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1) ########################### #stackingwithknn stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1) stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1) #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1) cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0) cv = KFold(n_splits=10, shuffle=True, random_state=0) St1Scores = cross_validate(stack1,X,y,cv=cv) St1Scores['test_score'].mean()**(1/2) St2Scores = cross_validate(stack2,X,y,cv=cv) St2Scores['test_score'].mean()**(1/2) St3Scores = cross_validate(stack3,X,y,cv=cv) St3Scores['test_score'].mean()**(1/2) stackScore = cross_validate(stack, X, y, cv=cv) stackScore['test_score'].mean()**(1/2) lgbmScores =cross_validate(lgbm,X,y,cv=cv) lgbmScores['test_score'].mean()**(1/2) rgfScores = cross_validate(rgf,X,y,cv=cv) rgfScores['test_score'].mean()**(1/2) RFScores = cross_validate(rf,X,y,cv=cv) RFScores['test_score'].mean()**(1/2) scores = cross_validate(stack2,X,y,cv=cv) scores['test_score'].mean()**(1/2) print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking')) stack3.fit(X, y) y_pred = stack3.predict(X_train) y_val = stack3.predict(X_test) #stack3.score(X_train, y_train) exX = preprocess(extractDf, changeList) valy = (10 **(stack3.predict(exX))).tolist() print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train)) print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test)) print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train)) print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test)) stack1.fit(X, y) valy = (10 **(stack1.predict(exX))).tolist() sgd.fit(X,y) valy = (10 **(sgd.predict(exX))).tolist() rgfpipe = make_pipeline(extMACCS(), rf) rgf.fit(X,y) valy = (10 **(rgf.predict(exX))).tolist() nbrs.fit(X,y) valy = (10 **(nbrs.predict(exX))).tolist() pipe = make_pipeline(extMACCS(), rf) pipe.fit(X,y) valy = (10 **(pipe.predict(exX))).tolist() rf.fit(X, y) y_pred = rf.predict(X_train) y_val = rf.predict(X_test) exX = preprocess(extractDf, changeList) valy = (10 **(rf.predict(exX))).tolist() print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train)) print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test)) print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train)) print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test)) lgbm.fit(X, y) #y_pred = pipe1.predict(X_train) #y_val = pipe1.predict(X_test) exX = preprocess(extractDf, changeList) valy = (10 **(lgbm.predict(exX))).tolist() print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train)) print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test)) print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train)) print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
def pls_thing(scenario_data, xcols, ycols, titlestr): #PLS Summary Stats pls = PLSRegression(n_components=3) pls.fit(scenario_data[xcols], scenario_data[ycols]) k = 0 transformed_x_full = pls.transform(scenario_data[xcols]) y = scenario_data[ycols] results = pd.DataFrame(columns=('Case Label', 'Explained Variance Ratio', 'RegressionCoefs', 'Regression R^2', 'SpearmanCorr', 'SpearmanPvalue', 'Loadings', 'X Weights', 'X Loadings', 'X Scores')) if type(titlestr) == type([]): titlestr = ' '.join(titlestr) #Linear fits for each individual component for c in range(np.shape(pls.x_weights_)[1]): x_transformed_1pc = transformed_x_full[:, k].reshape(-1, 1) lr = linear_model.LinearRegression(fit_intercept=True, normalize=True) lr.fit(x_transformed_1pc, y) print('Regression Coefs', lr.coef_) print('R^2', lr.score(x_transformed_1pc, y)) print('Spearman: ', scipy.stats.spearmanr(x_transformed_1pc, y)) print('Component: ', c) results.loc[len(results)] = np.nan results.loc[len(results) - 1, 'Case Label'] = titlestr + ' Component ' + str(k) # results.loc[len(results)-1,'Explained Variance Ratio'] = pls.explained_variance_ratio_[k] results.set_value(len(results) - 1, 'RegressionCoefs', lr.coef_) results.loc[len(results) - 1, 'Regression R^2'] = lr.score(x_transformed_1pc, y) results.loc[len(results) - 1, 'SpearmanCorr'] = scipy.stats.spearmanr( x_transformed_1pc, y)[0] results.loc[len(results) - 1, 'SpearmanPvalue'] = scipy.stats.spearmanr( x_transformed_1pc, y)[1] results.set_value(len(results) - 1, 'X Weights', pls.x_weights_[:, k]) results.set_value( len(results) - 1, 'X Loadings', pls.x_loadings_[:, k]) results.set_value(len(results) - 1, 'X Scores', pls.x_scores_[:, k]) plt.plot(x_transformed_1pc, y, '*') plt.xlabel('Component ' + str(k)) plt.ylabel('Performance') plt.title('PLS ' + titlestr) plt.show() k += 1 print(results) fig = plt.figure() ax = fig.add_subplot(111) ax.set_title("PLS PC0 vs PC1 vs Performance " + ' '.join(cs), fontsize=14) ax.set_xlabel("PC0", fontsize=12) ax.set_ylabel("PC1", fontsize=12) ax.scatter(transformed_x_full[:, 0], transformed_x_full[:, 1], s=100, c=y, marker='*', cmap=cm.bwr) plt.show() fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(transformed_x_full[:, 0], transformed_x_full[:, 1], transformed_x_full[:, 2], s=100, c=y, marker='*', cmap=cm.bwr) ax.set_title("PLS PC0 vs PC1 vs PC2 vs Performance " + ' '.join(cs), fontsize=14) ax.set_xlabel("PC0", fontsize=12) ax.set_ylabel("PC1", fontsize=12) ax.set_zlabel("PC2", fontsize=12) plt.show() print(results) return results
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for c, i, target_name in zip("rb", target_names, target_names):
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')
ax.set_xlim([-1000, 4000])
ax.set_ylim([-1000, 4000])
ax.set_zlim([-1000, 4000])
plt.show()

# part b
PLS1 = PLS(n_components=3)
number_map = {"M": 0, "B": 1}
numeric_y = np.array([number_map[label] for label in y])
result = PLS1.fit_transform(x, numeric_y)
X_r = result[0]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for c, i, target_name in zip("rb", target_names, target_names):
    ax.scatter(X_r[y == i, 0], X_r[y == i, 1], X_r[y == i, 2], c=c)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.axis('equal')
plt.show()
def make_plots(m,data,colors,names,groundtruth=None,waves=None,sample_size=10,ux=0, remove_mean=False,log_x=False,ylim=(0.3,1),res_out='',title=None): inds_sup_train = np.random.choice(data['X'].shape[0],size=sample_size) inds_sup_valid = np.random.choice(data['X_valid'].shape[0],size=sample_size) inds_train_x = np.random.choice(data['X_'].shape[0],size=sample_size) inds_train_y = np.random.choice(data['_y'].shape[0],size=sample_size) y = np.hstack([data['y'],1-data['y'].sum(axis=1,keepdims=True)]) y_valid = np.hstack([data['y_valid'],1-data['y_valid'].sum(axis=1,keepdims=True)]) y_corners = np.vstack((np.eye(data['y'].shape[1]),np.zeros(data['y'].shape[1]))).astype('float32') simplex = [] for point in product(*([np.linspace(0,1,50)]*y.shape[1])): if np.sum(point) == 1: simplex += [point] simplex = np.asarray(simplex).astype('float32') simplex = simplex[:,:-1] if waves is None: waves = np.arange(data['X'].shape[1]) if remove_mean: _ux = ux else: _ux = 0 if log_x: f = lambda x: np.exp(x) else: f = lambda x: x if ylim is not None: force_ylim = True pls_XY = PLSRegression(n_components=8,scale=False) pls_XY.fit(data['X'],y) pred_train_pls = pls_XY.predict(data['X']) pred_train_pls = (pred_train_pls.T/np.sum(pred_train_pls,axis=1)).T pred_valid_pls = pls_XY.predict(data['X_valid']) pred_valid_pls = (pred_valid_pls.T/np.sum(pred_valid_pls,axis=1)).T score_pred_train_pls = KL(pred_train_pls,y) score_pred_valid_pls = KL(pred_valid_pls,y_valid) pls_YX = PLSRegression(n_components=min(8,y.shape[1]),scale=False) pls_YX.fit(y,data['X']) gen_train_pls = pls_YX.predict(y) gen_valid_pls = pls_YX.predict(y_valid) score_gen_train_pls = L2(gen_train_pls,data['X']) score_gen_valid_pls = L2(gen_valid_pls,data['X_valid']) pred_train = m.predict(x=data['X'],deterministic=True) pred_train = np.hstack([pred_train,1-pred_train.sum(axis=1,keepdims=True)]) score_pred_train = KL(pred_train,y) pred_valid = m.predict(x=data['X_valid'],deterministic=True) pred_valid = np.hstack([pred_valid,1-pred_valid.sum(axis=1,keepdims=True)]) score_pred_valid = KL(pred_valid,y_valid) if m.model_type in [1,2]: z2_train = m.getZ2(x=data['X'],y=data['y'],deterministic=True) z2_valid = m.getZ2(x=data['X_valid'],y=data['y_valid'],deterministic=True) z2_train_mean = z2_train.mean(axis=0) z2_valid_mean = z2_valid.mean(axis=0) z2_gen_train = z2_train_mean*np.ones_like(z2_train).astype('float32') z2_gen_valid = z2_valid_mean*np.ones_like(z2_valid).astype('float32') z2_gen_manifold = z2_valid_mean*np.ones((simplex.shape[0],z2_valid.shape[1])).astype('float32') z2_gen_endmembers = z2_train_mean*np.ones((y_corners.shape[0],z2_train.shape[1])).astype('float32') gen_train = f(_ux + m.generate(y=data['y'][inds_sup_train],z2=z2_gen_train[inds_sup_train],deterministic=True)) # true by default for non-variational, variational default is False gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],z2=z2_gen_valid[inds_sup_valid],deterministic=True)) manifold = f(_ux + m.generate(y=simplex,z2=z2_gen_manifold,deterministic=True)) endmembers = f(_ux + m.generate(y=y_corners,z2=z2_gen_endmembers,deterministic=True)) if m.variational: endmembers_dists = [] for idx_c, c in enumerate(y_corners): endmembers_dist = [f(_ux + m.generate(y=np.atleast_2d(c),z2=z2_gen_endmembers[idx_c:idx_c+1],deterministic=False)).squeeze() for i in range(sample_size)] endmembers_dists += [np.asarray(endmembers_dist)] endmembers_dists = endmembers_dists else: gen_train = f(_ux + m.generate(y=data['y'][inds_sup_train],deterministic=True)) # true by default for 
non-variational, variational default is False gen_valid = f(_ux + m.generate(y=data['y_valid'][inds_sup_valid],deterministic=True)) manifold = f(_ux + m.generate(y=simplex,deterministic=True)) endmembers = f(_ux + m.generate(y=y_corners,deterministic=True)) if m.variational: endmembers_dists = [] for idx_c, c in enumerate(y_corners): endmembers_dist = [f(_ux + m.generate(y=np.atleast_2d(c),deterministic=False)).squeeze() for i in range(sample_size)] endmembers_dists += [np.asarray(endmembers_dist)] endmembers_dists = endmembers_dists recon_train = f(_ux + m.generate(x=data['X_'][inds_train_x],deterministic=True)) recon_sup_valid = f(_ux + m.generate(x=data['X_valid'][inds_sup_valid],deterministic=True)) fs = 24 fs_tick = 18 # change xticks to be names p = 100 plt.plot(p*y[inds_sup_train][0],'k',lw=2,label='Ground Truth') ssdgm_label = 'SSDGM ({:.3f})'.format(score_pred_train) plt.plot(p*pred_train[inds_sup_train][0],'r-.',lw=2,label=ssdgm_label) pls_label = 'PLS ({:.3f})'.format(score_pred_train_pls) plt.plot(p*pred_train_pls[inds_sup_train][0],'b-.',lw=2,label=pls_label) plt.plot(p*y[inds_sup_train].T,'k',lw=2) plt.plot(p*pred_train[inds_sup_train].T,'r-.',lw=2) plt.plot(p*pred_train_pls[inds_sup_train].T,'b-.',lw=2) plt.title('Predicting Composition - Training Error', fontsize=fs) plt.ylabel('Composition (%)', fontsize=fs) ax = plt.gca() ax.set_ylim((0,1*p)) ax.set_xticks(np.arange(y.shape[1])) ax.set_xticklabels(names, fontsize=fs) ax.tick_params(axis='x',direction='out',top='off',length=10,labelsize=fs_tick) lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5)) ax = plt.gca() plt.savefig(res_out+'/comp_train.png',additional_artists=[lgd],bbox_inches='tight') plt.close() plt.plot(p*y_valid[inds_sup_valid][0],'k',lw=2,label='Ground Truth') ssdgm_label = 'SSDGM ({:.3f})'.format(score_pred_valid) plt.plot(p*pred_valid[inds_sup_valid][0],'r-.',lw=2,label=ssdgm_label) pls_label = 'PLS ({:.3f})'.format(score_pred_valid_pls) plt.plot(p*pred_valid_pls[inds_sup_valid][0],'b-.',lw=2,label=pls_label) plt.plot(p*y_valid[inds_sup_valid].T,'k',lw=2) plt.plot(p*pred_valid[inds_sup_valid].T,'r-.',lw=2) plt.plot(p*pred_valid_pls[inds_sup_valid].T,'b-.',lw=2) plt.title('Predicting Composition - Validation Error', fontsize=fs) plt.ylabel('Composition (%)', fontsize=fs) ax = plt.gca() ax.set_ylim((0,1*p)) ax.set_xticks(np.arange(y.shape[1])) ax.set_xticklabels(names, fontsize=fs) ax.tick_params(axis='x',direction='out',top='off',length=10,labelsize=fs_tick) lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5)) ax = plt.gca() plt.savefig(res_out+'/comp_valid.png',additional_artists=[lgd],bbox_inches='tight') plt.close() plt.plot(waves,f(_ux+data['X'][inds_sup_train]).T,'k') plt.plot(waves,gen_train.T,'r-.') plt.title('Generating Spectra - Training Error', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/genspectra_train.png') plt.close() plt.plot(waves,f(_ux+data['X_valid'][inds_sup_valid]).T,'k') plt.plot(waves,gen_valid.T,'r-.') plt.title('Generating Spectra - Validation Error', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/genspectra_valid.png') plt.close() if m.variational: for endmember, color, name in zip(endmembers,colors,names): 
plt.plot(waves,endmember,color=color,lw=2,label=name) for endmember_dist, color in zip(endmembers_dists,colors): plt.plot(waves,endmember_dist.T,'-.',color=color,lw=1) plt.title('Generating Endmembers with Distributions', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5)) ax = plt.gca() if force_ylim: ax.set_ylim(ylim) plt.savefig(res_out+'/endmembers_dist.png',additional_artists=[lgd],bbox_inches='tight') plt.close() for endmember, color, name in zip(endmembers,colors,names): plt.plot(waves,endmember,color=color,lw=2,label=name) plt.title('Generating Endmembers', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5)) if m.variational: plt.gca().set_ylim(ax.get_ylim()) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/endmembers_means.png',additional_artists=[lgd],bbox_inches='tight') plt.close() for endmember, color, name in zip(endmembers,colors,names): plt.plot(waves,endmember,color=color,lw=2,label=name) for endmember, color, name in zip(groundtruth,colors,names): plt.plot(waves,endmember[:len(waves)],color=color,lw=6,alpha=0.4) score_gen_endmembers = L2(endmembers,groundtruth[:,:len(waves)]) if title is None: plt.title('Generating Endmembers with Ground Truth ({:.3f})'.format(score_gen_endmembers), fontsize=fs) else: plt.title(title+' ({:.3f})'.format(score_gen_endmembers), fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) lgd = plt.legend(loc='lower right', fontsize=fs) # lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5)) if m.variational: plt.gca().set_ylim(ax.get_ylim()) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/endmembers_means_with_groundtruth.png',additional_artists=[lgd],bbox_inches='tight') plt.close() plt.plot(waves,manifold.T,color='lightgray',lw=1,alpha=0.1) for endmember, color, name in zip(groundtruth,colors,names): plt.plot(waves,endmember[:len(waves)],color=color,lw=6,alpha=1.0) plt.title('Spectral Manifold', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) lgd = plt.legend(loc='center left',bbox_to_anchor=(1, 0.5)) if m.variational: plt.gca().set_ylim(ax.get_ylim()) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/manifold.png',bbox_inches='tight') plt.close() plt.plot(waves,f(_ux+data['X_'][inds_train_x]).T,'k') plt.plot(waves,recon_train.T,'r-.') plt.title('Reconstructing Spectra - Training Error', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/recon_train.png') plt.close() plt.plot(waves,f(_ux+data['X_valid'][inds_sup_valid]).T,'k') plt.plot(waves,recon_sup_valid.T,'r-.') plt.title('Reconstructing Spectra - Validation Error', fontsize=fs) plt.xlabel('Channels', fontsize=fs) plt.ylabel('Intensities', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) if force_ylim: plt.gca().set_ylim(ylim) plt.savefig(res_out+'/recon_valid.png') plt.close() if m.model_type in [1,2]: # need to use vertical lines to denote edges of 
datasets # write dataset i in middle of range on xlabel for i in range(z2_train.shape[1]): plt.plot(z2_train[:,i],'r-.') plt.title('Nuisance Variable '+str(i)+' - Training', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) plt.savefig(res_out+'/nuisance_train_'+str(i)+'.png') plt.close() plt.plot(z2_valid[:,i],'r-.') ax = plt.gca() ylim = ax.get_ylim() # should make this general if possible plt.plot([1866,1866],[-5,5],'k--') plt.plot([1866+1742,1866+1742],[-5,5],'k--') # plt.plot([1866+1742+1746,1866+1742+1746],[-5,5],'k--') ax.set_ylim(ylim) plt.title('Nuisance Variable '+str(i)+' - Validation', fontsize=fs) plt.tick_params(axis='both', which='major', labelsize=fs_tick) plt.savefig(res_out+'/nuisance_valid_'+str(i)+'.png') plt.close()
(Xtrain, ytrain) = loadData(xtrainpath, ytrainpath)
(Xtest, ytest) = loadData(xtestpath, ytestpath)

# trim off background and scale
ytrain = ytrain[:, 1:]
# ytrain = scale(ytrain)
Xtrain = standardize(Xtrain)

# trim off background and scale
ytest = ytest[:, 1:]
# ytest = scale(ytest)
Xtest = standardize(Xtest)

pls = PLSRegression(n_components=10)
pls.fit(Xtrain, ytrain)
y_pls = pls.predict(Xtest)
print(1 + pls.score(Xtest, ytest))

pls_rmse = []
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 0], y_pls[:, 0])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 1], y_pls[:, 1])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 2], y_pls[:, 2])))
pls_rmse.append(sqrt(mean_squared_error(ytest[:, 3], y_pls[:, 3])))

fig = plt.figure(figsize=(20, 10))
ax1 = fig.add_subplot(241)
ax1.plot(y_pls[:, 0], c='r', label='PLS Fit')
import os

import numpy as np
from matplotlib import pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn import metrics

os.chdir("D:/01. CLASS/Machine Learning/")

pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True)
init_lamda_PLS = 1
lamda_PLS = 1

Tgt = np.array([0, 50])
A_p1 = np.array([[0.5, -0.2], [0.25, 0.15]])
d_p1 = np.array([[0.1, 0], [0.05, 0]])
C_p1 = np.transpose(np.array([[0, 0.5, 0.05, 0, 0.15, 0],
                              [0.085, 0, 0.025, 0.2, 0, 0]]))

sample_init_EP = []
sample_vm_EP = []
sample_init_VP = []
sample_vm_VP = []

np.random.seed(1000000)  # 4
I = np.identity(2)
# L1_SC = 0.55
# L2_SC = 0.75
L1_SC = 0.45
L2_SC = 0.35
def __init__(self, dataset):
    pitchRoll = dataset[:, [2, 3]]
    motorPos = dataset[:, [0, 1]]
    self.polyModelLeft = PLSRegression(n_components=2)
    self.polyModelLeft.fit(pitchRoll, motorPos)
dms = pd.get_dummies(data[['League', 'Division', 'NewLeague']])

# prepare the data
y = data['Salary']
x_ = data.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
x = pd.concat([x_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# regression model and parameter grid
pls_model_setup = PLSRegression(scale=True, max_iter=5000, n_components=5)
param_grid = {'n_components': range(1, 20)}

# tune n_components with GridSearchCV and fit the model
gsearch = GridSearchCV(pls_model_setup, param_grid)
pls_model = gsearch.fit(x_train, y_train)

# print the coefficients
print('Partial Least Squares Regression coefficients:', pls_model.best_estimator_.coef_)

# predict on the test set
pls_prediction = pls_model.predict(x_test)

# compute R2 and mean squared error
pls_r2 = r2_score(y_test, pls_prediction)
def VIP(X, Y, H, NumDes): from sklearn.cross_decomposition import PLSRegression import numpy as np from sklearn.cross_validation import KFold import PCM_workflow as PW print('############## VIP is being processed ###############') M = list(X.keys()) H_VIP, X_VIP, Y_VIP, HArray = {}, {}, {}, {} NumDesVIP = np.zeros((13, 6), dtype=int) for kk in M: Xtrain, Ytrain = X[kk], Y kf = KFold(len(Ytrain), 10, indices=True, shuffle=True, random_state=1) HH = H[kk] nrow, ncol = np.shape(Xtrain) ArrayYpredCV, Q2, RMSE_CV, OptimalPC = PW.CV_Processing( Xtrain, Ytrain, kf) plsmodel = PLSRegression(n_components=OptimalPC) plsmodel.fit(Xtrain, Ytrain) x_scores = plsmodel.x_scores_ x_weighted = plsmodel.x_weights_ m, p = nrow, ncol m, h = np.shape(x_scores) p, h = np.shape(x_weighted) X_S, X_W = x_scores, x_weighted co = [] for i in range(h): corr = np.corrcoef(np.squeeze(Ytrain), X_S[:, i]) co.append(corr[0][1]**2) s = sum(co) vip = [] for j in range(p): d = [] for k in range(h): d.append(co[k] * X_W[j, k]**2) q = sum(d) vip.append(np.sqrt(p * q / s)) idx_keep = [idx for idx, val in enumerate(vip) if vip[idx] >= 1] idxDes = NumDes[int(kk[6:]) - 1, :] L, P, LxP, LxL, PxP = [], [], [], [], [] for idx in idx_keep: if idx >= 0 and idx < np.sum(idxDes[0:1]): L.append(idx) elif idx >= np.sum(idxDes[0:1]) and idx < np.sum(idxDes[0:2]): P.append(idx) elif idx >= np.sum(idxDes[0:2]) and idx < np.sum(idxDes[0:3]): LxP.append(idx) elif idx >= np.sum(idxDes[0:3]) and idx < np.sum(idxDes[0:4]): LxL.append(idx) elif idx >= np.sum(idxDes[0:4]) and idx < np.sum(idxDes): PxP.append(idx) NVIP = np.array( [len(L), len(P), len(LxP), len(LxL), len(PxP), len(idx_keep)]) NumDesVIP[int(kk[6:]) - 1, :] = NumDesVIP[int(kk[6:]) - 1, :] + NVIP hvip = np.array(HH)[idx_keep] vvip = np.array(vip)[idx_keep] H_VIP[kk] = hvip X_VIP[kk] = Xtrain[:, idx_keep] Y_VIP = Ytrain hvip = np.reshape(hvip, (len(hvip), 1)) vvip = np.reshape(vvip, (len(vvip), 1)) HArray[kk] = np.append(hvip, vvip, axis=1) return X_VIP, Y_VIP, H_VIP, HArray, NumDesVIP
#%% PCA, SVD, PLS
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA, TruncatedSVD

pca = PCA(n_components=8)
pca_feats = [3, 5, 10, 14, 18, 19, 22, 23, 25, 26, 27]
train_pca_df = pd.DataFrame([])
test_pca_df = pd.DataFrame([])
for feat in pca_feats:
    feat_label = "F" + str(feat)
    train_pca_df[feat_label] = train_features[feat_label]
    test_pca_df[feat_label] = test_features[feat_label]

pls = PLSRegression(n_components=8)  # This works well for the log reg model
pls.fit(train_pca_df, train_y)
train_feats_pls = pd.DataFrame(pls.transform(train_pca_df), index=train_features.index)
test_feats_pls = pd.DataFrame(pls.transform(test_pca_df), index=test_features.index)

#%% Replace pca feats with new feats
for feat in pca_feats:
    feat_label = "F" + str(feat)
    train_features = train_features.drop([feat_label], axis=1)
    test_features = test_features.drop([feat_label], axis=1)
train_features = pd.concat([train_features, train_feats_pls], axis=1)
test_features = pd.concat([test_features, test_feats_pls], axis=1)

#%% Logistic Regression on the initial features
#correct not accurate
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.cross_decomposition import PLSCanonical

df = pd.read_csv('newdata.csv')
x = df.drop(['tag'], axis=1)
y = df.drop(['kx', 'ky', 'kz', 'wa', 'wb', 'wc', 'wd', 'we', 'wf'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=5)

plsr = PLSRegression()
plsr.fit(X_train, Y_train)
plsc = PLSCanonical()
plsc.fit(X_train, Y_train)
print(plsr.score(X_test, Y_test))
print(plsc.score(X_test, Y_test))
def generate(self, input=None):
    dso = input
    _experiment_test = self.config.get('experiment_test')
    _experiment_control = self.config.get('experiment_control')
    data = dso.data
    plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                         scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))
    Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0]])
    plsr.fit(data, Y)  # Transpose it, as vars need to along the top

    # Build scores into a dso no_of_samples x no_of_principal_components
    scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]
    for n, s in enumerate(plsr.x_scores_.T):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Latent Variable %d' % (n + 1)  #, plsr.y_weights_[0][n])

    # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
    cw_x = defaultdict(list)
    cw_y = defaultdict(list)
    for c in list(cw_x.keys()):
        # Calculate mean point
        cx = np.mean(cw_x[c])
        cy = np.mean(cw_y[c])
        # Calculate 95% CI
        rx = np.std(cw_x[c]) * 2  # 2sd = 95%  #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
        ry = np.std(cw_y[c]) * 2  #1.95 * ( / srn)
        figure_regions.append((c, cx, cy, rx, ry))

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
    dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    weightsd = DataSet(size=plsr.x_weights_.T.shape)
    weightsd.data = plsr.x_weights_.T
    weightsd.scales[1] = input.scales[1]

    dso_lv = {}
    for n in range(0, plsr.x_weights_.shape[1]):
        lvd = DataSet(size=(1, input.shape[1]))
        lvd.entities[1] = input.entities[1]
        lvd.labels[1] = input.labels[1]
        lvd.scales[1] = input.scales[1]
        lvd.data = plsr.x_weights_[:, n:n + 1].T
        dso_lv['lv%s' % (n + 1)] = lvd
        weightsd.labels[0][n] = "Weights on LV %s" % (n + 1)
        weightsd.classes[0][n] = "LV %s" % (n + 1)

    return dict(list({
        'dso': dso,
        'scores': scored,
        'weights': weightsd,
        #'figure_data': figure_data,
        #'figure_regions': figure_regions,
        'y_weights': plsr.y_weights_,
        'x_weights': plsr.x_weights_,
    }.items()) + list(dso_lv.items()))
        'param': {
            'n_estimators': range_t
        }
    },
    'SVR': {
        'name': 'SVR',
        'model': SVR(),
        'param': {
            'gamma': range_g,
            'C': range_c,
            'epsilon': range_e
        }
    },
    'PLS': {
        'name': 'PLS',
        'model': PLSRegression(),
        'param': {
            'n_components': range_p
        }
    },
    'GPR': {
        'name': 'GPR',
        'model': GaussianProcessRegressor(kernel=kernel),
        'param': {
            'n_restarts_optimizer': range_o
        }
    },
}

key = 'RF'  # 'RR' 'EN', 'LASSO', 'kNN', 'RF', 'GB', 'SVR', 'PLS', 'GPR'
name = model_param[key]['name']
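# A minimal sketch of how a dict like model_param is typically wired into
# scikit-learn's GridSearchCV; the cv and scoring values, and the X_train/y_train
# names, are illustrative assumptions and not taken from the original script.
from sklearn.model_selection import GridSearchCV

entry = model_param[key]
search = GridSearchCV(entry['model'], entry['param'], cv=5, scoring='r2')
search.fit(X_train, y_train)  # X_train / y_train assumed to exist upstream
print(entry['name'], search.best_params_, search.best_score_)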
def run_full_caltarget_test(cal_spectra, cal_labels, cal_names, transformed_test,
                            pls_comps=[10]):
    samples, comps = ct.load_data(norm=3, masked=True)
    org_samples = np.copy(samples)
    org_comps = np.copy(comps)
    elements = ['SiO2', 'TiO2', 'Al2O3', 'FeOT', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O']
    for n_comps in pls_comps:
        mars_preds = []
        ct_preds = []
        gt_preds = []
        for e, elem in enumerate(elements):
            if verbose:
                print '-----------------------'
                print elem
            for transformer in transformed_test.keys():
                print transformer
                targets = transformed_test[transformer]
                for t, target in enumerate(targets):
                    # Remove caltargets
                    if (comps['Name'] == cal_names[t]).any():
                        ind = np.argwhere(comps['Name'] == cal_names[t])[0, 0]
                        comps = np.delete(org_comps, ind, 0)
                        samples = np.delete(org_samples, ind, 0)
                    model = PLSRegression(n_components=n_comps, scale=False)
                    model.fit(samples, comps[elem])
                    lab_pred = model.predict(cal_spectra[0][t][None])
                    mars_pred = model.predict(cal_spectra[1][t][None])
                    trans_pred = model.predict(target)
                    gt = cal_labels[t, e]
                    score = (norm(mars_pred - gt, ord=1) -
                             norm(trans_pred - gt, ord=1))
                    mars_preds.append(mars_pred[0][0])
                    ct_preds.append(trans_pred[0][0])
                    gt_preds.append(gt)
                    if verbose:
                        print cal_names[t]
                        print 'Ground truth: %.4f' % gt
                        print 'Lab target: %.4f' % lab_pred
                        print 'Mars target: %.4f' % mars_pred
                        print 'Transformed Mars: %.4f' % trans_pred
                        print 'Score: %.4f' % score
                        print
        pred_shape = (len(elements), len(targets))
        ct_preds = np.array(ct_preds).reshape(pred_shape)
        gt_preds = np.array(gt_preds).reshape(pred_shape)
        mars_preds = np.array(mars_preds).reshape(pred_shape)
        print '-----------------------'
        print "Element\tMars\t\tCalTran\t\t%Gain/lost"
        for i, e in enumerate(elements):
            mars_rmsep = rmse(gt_preds[i, :], mars_preds[i, :])
            ct_rmsep = rmse(gt_preds[i, :], ct_preds[i, :])
            print e,
            print "\t%f" % round(mars_rmsep, 4),
            print "\t%f" % round(ct_rmsep, 4),
            print "\t%f" % round((mars_rmsep - ct_rmsep) * 100 / mars_rmsep, 4)
        print '-----------------------'
        print "Sample\tMars\t\tCalTran\t\t%Gain/lost"
        for i, n in enumerate(names):
            mars_rmsep = rmse(gt_preds[:, i], mars_preds[:, i])
            ct_rmsep = rmse(gt_preds[:, i], ct_preds[:, i])
            print n,
            print "\t%f" % round(mars_rmsep, 4),
            print "\t%f" % round(ct_rmsep, 4),
            print "\t%f" % round((mars_rmsep - ct_rmsep) * 100 / mars_rmsep, 4)
def plsvip(X, Y, V, lat_var):
    attributes = len(X[0])
    if not lat_var:
        latent_variables = attributes
    else:
        latent_variables = lat_var
    num_instances = len(X)
    attributes_gone = []
    min_att = -1
    #start_time = time.time()
    #attr_time = time.time()
    #time_counter = 0
    while attributes > 0:
        #if (attributes + 9) % 10 == 0:
        #    print "total time: ", time.time() - start_time
        #    print "attr time: ", time.time() - attr_time
        #    attr_time = time.time()
        if (latent_variables == 0) or (latent_variables > attributes):
            latent_variables = attributes
        lv_best = best_latent_variable(X, Y, latent_variables, num_instances)
        #print "current best lv: ", lv_best, "num. attr. ", attributes
        #fin_pls = PLSCanonical(n_components=lv_best)
        fin_pls = PLSRegression(n_components=lv_best)
        fin_pls.fit(X, Y)
        currentR2 = fin_pls.score(X, Y)
        # alternative r2
        """
        meanY4r2 = numpy.mean(Y)
        predY = fin_pls.predict(X)
        RSS = 0
        for i in range(len(Y)):
            RSS += numpy.power(Y[i] - predY[i], 2)
        TSS = 0
        for i in range(len(Y)):
            TSS += numpy.power(Y[i] - meanY4r2, 2)
        alterR2 = 1 - (RSS / TSS)
        #print currentR2, "vs", alterR2
        """
        min_vip = 1000
        if min_att == -1:
            attributes_gone.append(["None", currentR2, attributes, lv_best])
        # threaded version
        """
        myThreads = []
        VIPcurrent = []
        for i in range(0, attributes):
            myThreads.append(enthread(target=get_vip,
                                      args=(fin_pls, lv_best, i, attributes_gone, attributes)))
        for i in range(0, attributes):
            VIPcurrent.append(myThreads[i].get())
        min_vip = min(VIPcurrent)
        min_att = VIPcurrent.index(min_vip)
        """
        # Working version
        #"""
        for i in range(0, attributes):
            VIPcurrent = get_vip(fin_pls, lv_best, i, attributes_gone, attributes)
            if VIPcurrent < min_vip:
                min_vip = VIPcurrent
                min_att = i
        #"""
        if min_att > -1:
            attributes_gone.append([V[min_att], currentR2, attributes, lv_best])
            # CURRENT: to BE popped, NOT already popped
            V.pop(min_att)
            for i in range(num_instances):
                X[i].pop(min_att)
            attributes -= 1
        #print attributes_gone
        #time_counter += 1
    return attributes_gone
n_components = 75  # X_train[0].shape[1]
for vid, Xt, yt in zip(subjId_val, X_val, y_val):
    levelOneTest = []
    levelOneTrain = []
    X_levelOne = []
    y_levelOne = []
    level0Classifier = []
    for tid, Xp, yp in zip(subjId_train, X_train, y_train):
        print "Predicting subject ", vid, "from subject ", tid
        y0 = np.zeros(yp.shape)
        y1 = np.ones(Xt.shape[0])
        X = np.vstack([Xp, Xt])
        yd = np.concatenate([y0, y1])
        pls = PLSRegression(n_components)
        Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(), yp.copy(), train_size=0.9)
        yp_t = yp_t.astype(bool)
        yp_t_not = np.vstack((yp_t, ~yp_t)).T
        #print "yp_t_not ", yp_t_not.shape
        pls.fit(Xp_t, yp_t_not.astype(int))
        yp_new = pls.predict(Xp_t, copy=True)
        yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
        yp_t = yp_t.astype(int)
        #print y_new, y_pred, y_t
        error = ((yp_t - yp_pred) ** 2).sum()
        print "PLS Training error ", float(error) / yp_t.shape[0]
        yp_new = pls.predict(Xp_v, copy=True)
        yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
        #print y_new, y_pred, y_v
        #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
plt.xticks(())
plt.yticks(())
plt.show()

# #############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
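# The univariate (PLS1) example above breaks off right after constructing pls1;
# a minimal continuation in the same spirit would be:
pls1.fit(X, y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))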
def main(): args = parser.parse_args() # 模型选择及输入参数 model_name = { 1: 'PLSR', 2: 'LS-SVR', 3: 'GPR', 4: 'FCN', 5: 'LSTM', 6: 'GCN', 7: 'MC-GCN', 8: 'GC-LSTM' } print(model_name) model_select = list(input('Select models:')) # 初始化结果 results = { 'adj': [], 'r2': [], 'rmse': [], 'loss_hist': [], 'prediction': [] } # os.mkdir('Results') f = open('Results/params.txt', 'w+') f.write('Parameters setting:\n{}\n\n'.format(args.__dict__)) # 导入数据 data = pd.read_excel('8号机磨煤机C_正常.xlsx', index_col=0, header=1, nrows=args.length + 5001) data = data.iloc[5001:, :] # 数据划分 predict_variable = [3, 12, 15, 20, 23] y = data.iloc[:, predict_variable] X = data.drop(columns=y.columns) X_train, y_train = X.iloc[:int(args.length * args.train_size )], y.iloc[:int(args.length * args.train_size)] X_test, y_test = X.iloc[int(args.length * args.train_size ):], y.iloc[int(args.length * args.train_size):] # 导出数据 # X_train.to_csv('Results/X_train.csv', header=False, index=False) # X_test.to_csv('Results/X_test.csv', header=False, index=False) # y_train.to_csv('Results/y_train.csv', header=False, index=False) # y_test.to_csv('Results/y_test.csv', header=False, index=False) # 设定种子 np.random.seed(args.seed) torch.manual_seed(args.seed) # 多次实验 for exp in range(args.n_exp): print('=====Experiment({}/{})====='.format(exp + 1, args.n_exp)) f.write('=====Experiment({}/{})=====\n'.format(exp + 1, args.n_exp)) results['adj'].append({}) results['r2'].append({}) results['rmse'].append({}) results['loss_hist'].append({}) results['prediction'].append({}) # PLSR if '1' in model_select: flag = 1 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = PLSRegression(args.n_components).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # LS-SVR if '2' in model_select: flag = 2 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = LssvrModel(args.c, args.sigma).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) 
results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # GPR if '3' in model_select: flag = 3 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() kernel = DotProduct() * RBF(args.length_scale, (args.length_scale, args.length_scale)) reg = GaussianProcessRegressor(kernel=kernel, alpha=args.alpha).fit( X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # FCN if '4' in model_select: flag = 4 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = FcnModel(X_train.shape[1], y_train.shape[1], (1024, 256, 256, 256), args.n_epoch, args.batch_size, args.lr, args.weight_decay, args.step_size, args.gamma).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # LSTM if '5' in model_select: flag = 5 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = LstmModel(X_train.shape[1], y_train.shape[1], (1024, ), (256, 256, 256), args.seq_len, args.n_epoch, args.batch_size, args.lr, args.weight_decay, args.step_size, args.gamma).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s 
Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # GCN if '6' in model_select: flag = 6 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = GcnModel(X_train.shape[1], y_train.shape[1], (1024, ), (256, 256, 256), args.graph_reg, args.self_con, args.n_epoch, args.batch_size, args.lr, args.weight_decay, args.step_size, args.gamma).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # MC-GCN if '7' in model_select: flag = 7 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = McgcnModel(X_train.shape[1], (1024, ), (256, ), (256, 256), y_train.shape[1], args.graph_reg, args.self_con, args.n_epoch, args.batch_size, args.lr, args.weight_decay, args.step_size, args.gamma).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['adj'][-1].update({model_name[flag]: reg.adj}) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # GC-LSTM if '8' in model_select: flag = 8 print('====={}====='.format(model_name[flag])) f.write('====={}=====\n'.format(model_name[flag])) # 训练&测试 t1 = time.time() reg = GclstmModel(X_train.shape[1], (1024, ), (256, ), (256, 256), 
y_train.shape[1], args.seq_len, args.graph_reg, args.self_con, args.n_epoch, args.batch_size, args.lr, args.weight_decay, args.step_size, args.gamma).fit(X_train, y_train) t2 = time.time() y_pred = reg.predict(X_test) t3 = time.time() y_fit = reg.predict(X_train) print(reg.get_params()) print('Time:\nFit: {:.3f}s Pred: {:.3f}s'.format(t2 - t1, t3 - t2)) print('R2:\nFit: {} Pred: {}'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 写入文件 f.write(str(reg.get_params()) + '\n') f.write('Time:\nFit: {:.3f}s Pred: {:.3f}s\n'.format( t2 - t1, t3 - t2)) f.write('R2:\nFit: {} Pred: {}\n'.format( r2_score(y_train, y_fit, multioutput='raw_values'), r2_score(y_test, y_pred, multioutput='raw_values'))) # 存储结果和模型 index = r2_rmse(y_test, y_pred, y.columns, f) results['adj'][-1].update({model_name[flag]: reg.adj}) results['r2'][-1].update({model_name[flag]: index[0]}) results['rmse'][-1].update({model_name[flag]: index[1]}) results['prediction'][-1].update({model_name[flag]: y_pred}) results['loss_hist'][-1].update({model_name[flag]: reg.loss_hist}) joblib.dump( reg, 'Results/{}-{}.model'.format(model_name[flag], exp + 1)) # 存储结果 np.save('Results/results.npy', results) f.close()
if method_name[0:3] == 'jit':
    nn_model = NearestNeighbors(metric='euclidean')  # declare the k-NN model used for sample selection

# Autoscaling
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

# Hyperparameter optimization and modeling
if method_name == 'pls' or method_name == 'mwpls':
    # Optimize the number of components by cross-validation
    components = []  # empty list to which each candidate number of components is appended
    r2_in_cv_all = []  # empty list to which the cross-validated r2 for each number of components is appended
    for component in range(1, min(np.linalg.matrix_rank(autoscaled_x_train),
                                  max_number_of_principal_components) + 1):
        # PLS
        model = PLSRegression(n_components=component)  # declare the PLS model
        estimated_y_in_cv = pd.DataFrame(cross_val_predict(
            model, autoscaled_x_train, autoscaled_y_train,
            cv=fold_number))  # compute the cross-validated estimates and convert to a DataFrame
        estimated_y_in_cv = estimated_y_in_cv * y_train.std() + y_train.mean()  # rescale back to the original units
        r2_in_cv = metrics.r2_score(y_train, estimated_y_in_cv)  # compute r2
        print(component, r2_in_cv)  # print the number of components and r2
        r2_in_cv_all.append(r2_in_cv)  # append r2
        components.append(component)  # append the number of components
    optimal_component_number = components[r2_in_cv_all.index(max(r2_in_cv_all))]  # optimal number of components
    # PLS
    model = PLSRegression(n_components=optimal_component_number)  # declare the model
    model.fit(autoscaled_x_train, autoscaled_y_train)  # build the model
elif method_name == 'svr' or method_name == 'mwsvr' or method_name == 'jitsvr':
    # Optimize gamma by maximizing the variance of the Gram matrix
    variance_of_gram_matrix = list()
    for svr_gamma in svr_gammas:
        mean_absolute_error(y_test, y_pred_test)
    ]
    output.append(temp)

output = pd.DataFrame(output, columns=[
    'alpha', 'Train_R2', 'Train_MSE', 'Train_MAE', 'Test_R2', 'Test_MSE', 'Test_MAE'
])
output.to_csv('ElasticNet.csv', index=False)
plots(y_test, y_pred_test)

# model 2: PLS
from sklearn.cross_decomposition import PLSRegression

output = []
for i in range(1, 12, 2):
    pls = PLSRegression(n_components=i, max_iter=10000)
    pls.fit(X_train_std, y_train)
    y_pred_train = pls.predict(X_train_std)
    y_pred_test = pls.predict(X_test_std)
    temp = [
        i,
        pls.score(X_train_std, y_train),
        mean_squared_error(y_train, y_pred_train),
        mean_absolute_error(y_train, y_pred_train),
        pls.score(X_test_std, y_test),
        mean_squared_error(y_test, y_pred_test),
        mean_absolute_error(y_test, y_pred_test)
    ]
    output.append(temp)

output = pd.DataFrame(output, columns=[
def pls_ds(A, B, n_components=1):
    # Fit a PLS model that maps spectra B onto A (direct-standardization style transfer)
    model = PLSRegression(n_components=n_components, scale=False).fit(B, A)
    # coef_ holds the regression coefficients (the attribute was called `coefs`
    # in very old scikit-learn releases)
    return model.coef_, model.predict(B)
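# A minimal usage sketch for pls_ds, assuming A and B are aligned
# (n_samples, n_wavelengths) spectra from a master and a slave instrument;
# the array shapes and names below are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(20, 50))                     # master-instrument spectra (illustrative)
B = A + rng.normal(scale=0.05, size=A.shape)      # slave-instrument spectra (illustrative)

transfer_coef, B_mapped = pls_ds(A, B, n_components=5)
print(transfer_coef.shape, B_mapped.shape)        # coefficient matrix and mapped slave spectra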
for i in range(5):
    plt.plot(nComponents, plsCanScores[i, :], lw=3)
plt.xlim(1, np.amax(nComponents))
plt.title('PLS Canonical accuracy')
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):
    #%% PLS Regression
    nComponents = np.arange(1, nClasses + 1)
    plsRegScores = np.zeros((5, np.alen(nComponents)))
    for i, n in enumerate(nComponents):
        plsReg = PLSRegression(n_components=n)
        plsReg.fit(Xtrain, Ytrain)
        XtrainT = plsReg.transform(Xtrain)
        XtestT = plsReg.transform(Xtest)
        plsRegScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)

plsReg = PLSRegression(n_components=2)
plsReg.fit(Xtrain, Ytrain)
xt = plsReg.transform(Xtrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)
plt.title('First 2 components of projected data')

#%% Plot accuracies for PLSSVD
from sklearn.cross_decomposition import PLSRegression
import pandas as pd
import numpy as np

_experiment_test = config['experiment_test']
_experiment_control = config['experiment_control']

plsr = PLSRegression(n_components=config['number_of_components'],
                     scale=config['autoscale'])  #, algorithm=self.config.get('algorithm'))

# We need classes to do the classification; should check and raise an error
class_idx = input_data.index.names.index('Class')
classes = list(input_data.index.levels[class_idx])
Y = input_data.index.labels[class_idx]

plsr.fit(input_data.values, Y)

# Build scores into a dso no_of_samples x no_of_principal_components
scores = pd.DataFrame(plsr.x_scores_)
scores.index = input_data.index
scoresl = []
for n, s in enumerate(plsr.x_scores_.T):
    scoresl.append('Latent Variable %d' % (n + 1))  #, plsr.y_weights_[0][n])
scores.columns = scoresl

weights = pd.DataFrame(plsr.x_weights_.T)
weights.columns = input_data.columns
pcr_opt.fit(college_train_x, college_train_y)
reduced_college_test_x = pcr_opt.transform(college_test_x)
reduced_college_train_x = pcr_opt.transform(college_train_x)
lrm = LinearRegression()
lrm.fit(reduced_college_train_x, college_train_y)
print "\nPCR RMSE (M = " + str(opt_m) + ")"
print rmse(lrm, reduced_college_test_x, college_test_y)

#%% PLS
from sklearn.cross_decomposition import PLSRegression

pls_components = range(1, 18)
cv_pls = np.array([])
for m in pls_components:
    pls = PLSRegression(n_components=m)
    foo = np.transpose(college_train_x.get_values())
    transformed_college_train_x = pls.fit_transform(college_train_x, college_train_y)[0]
    lrm = LinearRegression()
    pls_this_rmse = rmse_cv(LinearRegression(), transformed_college_train_x,
                            college_train_y).mean()
    cv_pls = np.append(cv_pls, pls_this_rmse)
min_m = pls_components[np.argmin(cv_pls)]
cv_pls = pd.Series(cv_pls, index=pls_components)
cv_pls.plot(title="PLSRegression Cross Validation")
plt.xlabel("Number of Components (M)")
plt.ylabel("Root Mean Square Error")
if show_plots_flag:
    plt.show()
train = pd.read_csv('train.csv', index_col='id')
targets = pd.get_dummies(train.target)
train.drop('target', axis=1, inplace=True)
train = train.apply(np.log1p)
test = pd.read_csv('test.csv', index_col='id')
test = test.apply(np.log1p)

Xt, Xv, yt, yv = train_test_split(train, targets, test_size=0.2, random_state=27)

best = 10.
for n in range(5, 16):
    clf = PLSRegression(n_components=n)
    clf.fit(Xt, yt)
    y_pred = clf.predict(Xv)
    loss = multiclass_log_loss(np.argmax(y_pred, axis=1), y_pred)
    if loss < best:
        n_best = n
        best = loss
        postfix = '(*)'
    else:
        postfix = ''
    print('comps: {:02d}\tLoss:{:5.4f} {}'.format(n, loss, postfix))

clf = PLSRegression(n_components=n_best)
clf.fit(train, targets)
y_pred = clf.predict(test)
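# multiclass_log_loss is not defined in the snippet above; a minimal sketch of the
# usual Kaggle-style helper it presumably refers to, taking integer class labels
# (normally the true labels) and an (n_samples, n_classes) probability matrix:
import numpy as np

def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    # Clip and renormalise the predicted probabilities, then average the
    # negative log-probability assigned to each labelled class.
    proba = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    proba /= proba.sum(axis=1, keepdims=True)
    rows = np.arange(len(y_true))
    return -np.mean(np.log(proba[rows, np.asarray(y_true, dtype=int)]))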
class PLS_DA(object):
    ''' '''

    def __init__(self, n_comps=3, yIsDummyMatrix=False, scaleData=False):
        '''
        data contains n_samples, n_features
        Y - response
        '''
        self.comps = n_comps
        self.yIsDummyMatrix = yIsDummyMatrix
        self.plsr = PLSRegression(n_components=n_comps, scale=scaleData)

    def fit(self, X, Y):
        if self.yIsDummyMatrix:
            self.Ym = Y
        else:
            self.Ym = self.create_dummy_y(Y)
        if self.evaluate_data(X, self.Ym):
            self.plsr.fit(X, self.Ym)

    def get_scores(self, block='x'):
        ''' '''
        if block == 'x':
            return self.plsr.x_scores_
        elif block == 'y':
            return self.plsr.y_scores_

    def get_weights(self, block='x'):
        ''' '''
        if block == 'x':
            return self.plsr.x_weights_

    def get_loadings(self, block='x'):
        if block == 'x':
            return self.plsr.x_loadings_

    def get_squared_r(self, X, Y):
        ''' '''
        return self.plsr.score(X, Y)

    def get_classes(self):
        ''' '''

    def get_dummy_Y(self):
        ''' '''
        return self.Ym

    def evaluate_data(self, X, Y):
        ''' '''
        if X.shape[0] != Y.shape[0]:
            print("Number of rows in X does not equal number of rows in Y")
            return False
        else:
            return True

    def create_dummy_y(self, Y):
        ''' '''
        uniqueVals = np.unique(Y)
        nClasses = uniqueVals.size
        Ydummy = np.zeros((Y.shape[0], nClasses))
        for n, target in enumerate(Y):
            col = np.where(uniqueVals == target)
            Ydummy[n, col] = 1
        self.classOrder = uniqueVals.tolist()
        return Ydummy
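# A minimal usage sketch for the PLS_DA wrapper above, on a small synthetic
# two-class problem; the data and class labels here are illustrative only.
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(40, 10))
labels = np.array(['a'] * 20 + ['b'] * 20)
X[labels == 'b'] += 1.0                      # separate the two classes a little

plsda = PLS_DA(n_comps=2)
plsda.fit(X, labels)                         # labels are turned into a dummy matrix internally
print(plsda.get_scores().shape)              # (40, 2) latent-variable scores
print(plsda.get_squared_r(X, plsda.get_dummy_Y()))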
    # sfs_plot(sfs.get_metric_dict(), kind='std_dev')
    return sfs_fit


def find_maxfit(data):
    max_score = 100
    print(data["training scores"])
    return max(data["test scores"]), max(data["training scores"])


if __name__ == "__main__":
    x, y = get_data.get_data("mango", "as7262", int_time=150,
                             position=2, led_current="25 mA")
    pls = PLSRegression(n_components=6)
    y = y['Total Chlorophyll (µg/cm2)']
    # x, y = get_data.get_data("mango", "as7262", int_time=150,
    #                          position=2, led="b'White'",
    #                          led_current="25 mA")
    # print(x.shape)
    # # pls_screen_as726x(x, y, n_comps=10)
    # print(type(x))
    poly = PolynomialFeatures(degree=1)
    x_trans = poly.fit_transform(x)
    # pls.fit(x_trans, y)
    # y_predict = pls.predict(x_trans)
    # print(mean_absolute_error(y, y_predict))
    # ham
    # n_comps = 6
    # regr = PLSRegression(n_components=n_comps)
class Local_FWC_Simulator: def __init__(self, A, d, C, seed): self.pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True) np.random.seed(seed) self.A = A self.d = d self.C = C def sampling_up(self): u1 = np.random.normal(0.4, np.sqrt(0.2)) u2 = np.random.normal(0.6, np.sqrt(0.2)) u = np.array([u1, u2]) return u def sampling_vp(self): v1 = np.random.normal(1, np.sqrt(0.2)) v2 = 2 * v1 v3 = np.random.uniform(0.2, 1.2) v4 = 3 * v3 v5 = np.random.uniform(0, 0.4) v6 = np.random.normal(-0.6, np.sqrt(0.2)) v = np.array([v1, v2, v3, v4, v5, v6]) return v def sampling_ep(self): e1 = np.random.normal(0, np.sqrt(0.1)) e2 = np.random.normal(0, np.sqrt(0.2)) e = np.array([e1, e2]) return e def sampling(self, k, uk=np.array([0, 0]), vp=np.array([0, 0, 0, 0, 0, 0]), ep=np.array([0, 0]), isInit=True): u1 = uk[0] u2 = uk[1] u = uk v1 = vp[0] v2 = vp[1] v3 = vp[2] v4 = vp[3] v5 = vp[4] v6 = vp[5] v = vp e = ep if isInit == True: k1 = k % 100 k2 = k % 200 e = np.array([0, 0]) else: k1 = k % 100 # n = 100 일 때 #1 entity maintenance event k2 = k % 200 # n = 200 일 때 #1 entity maintenance event eta_k = np.array([[k1], [k2]]) psi = np.array([u1, u2, v1, v2, v3, v4, v5, v6, k1, k2]) y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + e rows = np.r_[psi, y] idx_end = len(rows) idx_start = idx_end - 2 return idx_start, idx_end, rows def pls_update(self, V, Y): self.pls.fit(V, Y) return self.pls def setDoE_Mean(self, DoE_Mean): self.DoE_Mean = DoE_Mean def getDoE_Mean(self): return self.DoE_Mean def setPlsWindow(self, PlsWindow): self.PlsWindow = PlsWindow def getPlsWindow(self): return self.PlsWindow def plt_show1(self, n, y_act, y_prd): plt.plot(np.arange(n), y_act, 'rx--', y_prd, 'bx--', lw=2, ms=5, mew=2) plt.xticks(np.arange(0, n + 1, 50)) plt.xlabel('Run No.') plt.ylabel('Actual and Predicted Response (y1)') def plt_show2(self, n, y1, y2): plt.figure() plt.plot(np.arange(n), y1, 'bx-', y2, 'gx--', lw=2, ms=5, mew=2) plt.xticks(np.arange(0, n + 1, 5)) plt.yticks(np.arange(-1.2, 1.3, 0.2)) plt.xlabel('Metrology Run No.(z)') plt.ylabel('e(z)') def DoE_Run(self, Z, M): N = Z * M DoE_Queue = [] for k in range(1, N + 1): # range(101) = [0, 1, 2, ..., 100]) idx_start, idx_end, result = self.sampling(k, self.sampling_up(), self.sampling_vp(), self.sampling_ep(), True) DoE_Queue.append(result) initplsWindow = DoE_Queue.copy() npPlsWindow = np.array(initplsWindow) plsWindow = [] for i in range(len(npPlsWindow)): plsWindow.append(npPlsWindow[i]) npDoE_Queue = np.array(plsWindow) DoE_Mean = np.mean(npDoE_Queue, axis=0) plsModelData = npDoE_Queue - DoE_Mean V0 = plsModelData[:, 0:idx_start] Y0 = plsModelData[:, idx_start:idx_end] pls = self.pls_update(V0, Y0) print('Init VM Coefficients: \n', pls.coef_) y_prd = pls.predict(V0) + DoE_Mean[idx_start:idx_end] y_act = npDoE_Queue[:, idx_start:idx_end] print("Init DoE VM Mean squared error: %.3f" % metrics.mean_squared_error(y_act[:, 0:1], y_prd[:, 0:1])) print("Init DoE VM r2 score: %.3f" % metrics.r2_score(y_act[:, 0:1], y_prd[:, 0:1])) self.setDoE_Mean(DoE_Mean) self.setPlsWindow(plsWindow) # self.plt_show1(N, y_act[:,0:1], y_prd[:,0:1]) def VM_Run(self, lamda_PLS, Z, M): N = Z * M ## V0, Y0 Mean Center DoE_Mean = self.getDoE_Mean() idx_end = len(DoE_Mean) idx_start = idx_end - 2 meanVz = DoE_Mean[0:idx_start] meanYz = DoE_Mean[idx_start:idx_end] M_Queue = [] ez_Queue = [] ez_Queue.append([0, 0]) y_act = [] y_prd = [] plsWindow = self.getPlsWindow() for z in np.arange(0, Z): for k in np.arange(z * M + 1, ((z + 1) * M) + 1): idx_start, 
idx_end, result = self.sampling( k, self.sampling_up(), self.sampling_vp(), self.sampling_ep(), False) psiK = result[0:idx_start] psiKStar = psiK - meanVz y_predK = self.pls.predict(psiKStar.reshape( 1, idx_start)) + meanYz rows = np.r_[result, y_predK.reshape(2, )] M_Queue.append(rows) y_prd.append(rows[idx_end:idx_end + 2]) y_act.append(rows[idx_start:idx_end]) del plsWindow[0:M] ez = M_Queue[M - 1][idx_start:idx_end] - M_Queue[ M - 1][idx_end:idx_end + 2] print("ez : ", ez) ez_Queue.append(ez) if z == 0: ez = np.array([0, 0]) npM_Queue = np.array(M_Queue) npM_Queue[0:M - 1, 0:idx_start] = lamda_PLS * npM_Queue[0:M - 1, 0:idx_start] npM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * ( npM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez) npM_Queue = npM_Queue[:, 0:idx_end] for i in range(M): plsWindow.append(npM_Queue[i]) M_Mean = np.mean(plsWindow, axis=0) meanVz = M_Mean[0:idx_start] meanYz = M_Mean[idx_start:idx_end] plsModelData = plsWindow - M_Mean V = plsModelData[:, 0:idx_start] Y = plsModelData[:, idx_start:idx_end] self.pls_update(V, Y) del M_Queue[0:M] y_act = np.array(y_act) y_prd = np.array(y_prd) self.plt_show1(N, y_act[:, 0:1], y_prd[:, 0:1]) print("VM Mean squared error: %.3f" % metrics.mean_squared_error(y_act[:, 0:1], y_prd[:, 0:1])) print("VM r2 score: %.3f" % metrics.r2_score(y_act[:, 0:1], y_prd[:, 0:1])) ez_run = np.array(ez_Queue) self.plt_show2(Z + 1, ez_run[:, 0:1], ez_run[:, 1:2])
    plt.bar(np.arange(np.shape(X_train_prepro)[1]), pca_wild_b.components_[i])
    if i == 0:
        plt.ylabel('1st component')
    elif i == 1:
        plt.ylabel('2nd component')
    else:
        plt.ylabel('3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
    axis_c.set_xticks(axis_c.get_xticks() + 0.5)
    print "dentro del bucleeeeeeeeeee"

# Select the number of components using CV
#%%
## PLSR
pls_wild_b = PLSRegression(n_components=3)
pls_wild_b.fit(X_train_prepro, Y_train)
X_train_pls_proj = pls_wild_b.transform(X_train_prepro)
print("loadings")
for i in range(pls_wild_b.n_components):
    plt.figure()
    plt.bar(np.arange(np.shape(X_train_prepro)[1]), pls_wild_b.x_loadings_[:, i])
    if i == 0:
        plt.ylabel('PLS 1st component')
    elif i == 1:
        plt.ylabel('PLS 2nd component')
    else:
        plt.ylabel('PLS 3rd component')
    axis_c = plt.gca()
    axis_c.set_xticklabels(wild_boar_ddbb['header'][3:], fontsize=7)
class Global_FWC_P3_Simulator: def __init__(self, Tgt, A, d, C, F, seed): self.pls = PLSRegression(n_components=6, scale=False, max_iter=50000, copy=True) np.random.seed(seed) self.Tgt = Tgt self.A = A self.d = d self.C = C self.F = F def sampling_vp(self): v1 = np.random.normal(-0.4, np.sqrt(0.2)) v2 = 2 * v1 v3 = np.random.uniform(0.2, 0.6) v4 = 3 * v3 v5 = np.random.uniform(0, 0.4) v = np.array([v1, v2, v3, v4, v5]) return v def sampling_ep(self): e1 = np.random.normal(0, np.sqrt(0.05)) e2 = np.random.normal(0, np.sqrt(0.1)) e = np.array([e1, e2]) return e def sampling(self, k, uk=np.array([0, 0]), vp=np.array([0, 0, 0, 0, 0]), ep=np.array([0, 0]), fp=np.array([0, 0]), isInit=True): u1 = uk[0] u2 = uk[1] u = uk v1 = vp[0] v2 = vp[1] v3 = vp[2] v4 = vp[3] v5 = vp[4] v = vp e = ep k1 = k k2 = k eta_k = np.array([[k1], [k2]]) psi = np.array([u1, u2, v1, v2, v3, v4, v5, k1, k2]) if fp is not None: psi = np.r_[psi, fp] f = fp y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + f.dot(self.F) + e else: y = u.dot(self.A) + v.dot(self.C) + np.sum(eta_k * self.d, axis=0) + e rows = np.r_[psi, y] idx_end = len(rows) idx_start = idx_end - 2 return idx_start, idx_end, rows #y값의 시작과 끝 정보, 전체 값 정보 def pls_update(self, V, Y): self.pls.fit(V, Y) return self.pls def setDoE_Mean(self, DoE_Mean): self.DoE_Mean = DoE_Mean def getDoE_Mean(self): return self.DoE_Mean def setPlsWindow(self, PlsWindow): self.PlsWindow = PlsWindow def getPlsWindow(self): return self.PlsWindow def plt_show1(self, n, y_act): plt.figure() plt.plot(np.arange(1, n + 1), y_act, 'bx--', lw=2, ms=10, mew=2) plt.xticks(np.arange(0, n + 1, 20)) plt.xlabel('Run No.') plt.ylabel('Actual Response (y2)') def plt_show2(self, n, y_act): plt.plot(np.arange(1, n + 1), y_act, 'ro-', lw=2, ms=5, mew=2) plt.xticks(np.arange(0, n + 1, 20)) plt.xlabel('Run No.') plt.ylabel('Actual Response (y2)') def DoE_Run(self, lamda_PLS, dEWMA_Wgt1, dEWMA_Wgt2, Z, M, f, isR2R): N = Z * M I = np.identity(2) dEWMA_Wgt1 = dEWMA_Wgt1 * I dEWMA_Wgt2 = dEWMA_Wgt2 * I DoE_Queue = [] sample_init_VP = [] sample_init_EP = [] for k in range(0, N + 1): sample_init_VP.append(self.sampling_vp()) sample_init_EP.append(self.sampling_ep()) vp_next = sample_init_VP[0] ep_next = sample_init_EP[0] ep_next = np.array([0, 0]) for k in range(1, N + 1): # range(101) = [0, 1, 2, ..., 100]) if f is not None: fp = f[k - 1,0:2] p1_lamda_PLS = f[k - 1, 2:3] fp = p1_lamda_PLS * fp if k == 1: uk_next = np.array([-51, -102]) # 계산 공식에 의해 Dk_prev = np.array([-0.24, 26.4]) Kd_prev = np.array([0.024, 0.07]) #Dk_prev = np.array([-0.2, 20]) #Kd_prev = np.array([-0.02, 1]) else: fp = None if k == 1: uk_next = np.array([0, 0]) # 계산 공식에 의해 Dk_prev = np.array([0, 0]) Kd_prev = np.array([0, 0]) idx_start, idx_end, result = self.sampling(k, uk_next, vp_next, ep_next, fp, True) npResult = np.array(result) #================================== initVM-R2R Control ===================================== uk = npResult[0:2] yk = npResult[idx_start:idx_end] Dk = (yk - uk.dot(self.A)).dot(dEWMA_Wgt1) + Dk_prev.dot(I - dEWMA_Wgt1) Kd = (yk - uk.dot(self.A) - Dk_prev).dot(dEWMA_Wgt2) + Kd_prev.dot(I - dEWMA_Wgt2) # Dk = (yk - uk.dot(self.A) - fp.dot(self.F)).dot(dEWMA_Wgt1) + Dk_prev.dot(I - dEWMA_Wgt1) # Kd = (yk - uk.dot(self.A) - fp.dot(self.F) - Dk_prev).dot(dEWMA_Wgt2) + Kd_prev.dot(I - dEWMA_Wgt2) Kd_prev = Kd Dk_prev = Dk if isR2R == True: uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A)) vp_next = sample_init_VP[k] else: if k % M == 0: uk_next = (self.Tgt - Dk - 
Kd).dot(np.linalg.inv(self.A)) vp_next = sample_init_VP[k] ep_next = sample_init_EP[k] ep_next = np.array([0, 0]) DoE_Queue.append(result) initplsWindow = DoE_Queue.copy() npPlsWindow = np.array(initplsWindow) plsWindow = [] #np.savetxt("output/npPlsWindow1.csv", npPlsWindow, delimiter=",", fmt="%s") if f is not None: for k in range(0, N): # range(101) = [0, 1, 2, ..., 100]) p1_lamda_PLS = f[k, 2:3] if (k + 1) % M != 0: npPlsWindow[k, idx_start - 2:idx_start] = p1_lamda_PLS * npPlsWindow[k, idx_start - 2:idx_start] for z in np.arange(0, Z): npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start] = lamda_PLS * npPlsWindow[z * M:(z + 1) * M - 1, 0:idx_start] npPlsWindow[z * M:(z + 1) * M - 1, idx_start:idx_end] = lamda_PLS * (npPlsWindow[z * M:(z + 1) * M - 1, idx_start:idx_end]) for i in range(len(npPlsWindow)): plsWindow.append(npPlsWindow[i]) #np.savetxt("output/npPlsWindow2.csv", npPlsWindow, delimiter=",", fmt="%s") npDoE_Queue = np.array(plsWindow) DoE_Mean = np.mean(npDoE_Queue, axis=0) plsModelData = npDoE_Queue - DoE_Mean V0 = plsModelData[:, 0:idx_start] Y0 = plsModelData[:, idx_start:idx_end] pls = self.pls_update(V0, Y0) # print('Init VM Coefficients: \n', pls.coef_) y_pred = pls.predict(V0) + DoE_Mean[idx_start:idx_end] y_act = npDoE_Queue[:, idx_start:idx_end] self.setDoE_Mean(DoE_Mean) self.setPlsWindow(plsWindow) # self.plt_show2(N, y_act[:, 1:2]) # self.plt_show1(N, y_pred[:, 1:2]) def VM_Run(self, lamda_PLS, dEWMA_Wgt1, dEWMA_Wgt2, Z, M, f, isR2R): N = Z * M I = np.identity(2) dEWMA_Wgt1 = dEWMA_Wgt1 * I dEWMA_Wgt2 = dEWMA_Wgt2 * I ## V0, Y0 Mean Center DoE_Mean = self.getDoE_Mean() idx_end = len(DoE_Mean) idx_start = idx_end - 2 meanVz = DoE_Mean[0:idx_start] meanYz = DoE_Mean[idx_start:idx_end] yk = np.array([0, 0]) Dk_prev = np.array([-0.24, 26.4]) #10번째 run시 값 Kd_prev = np.array([0.024, 0.07]) #10번째 run시 값 # Dk = np.array([0, 0]) # Kd = np.array([0, 0]) uk_next = np.array([-51, -102]) #계산 공식에 의해 M_Queue = [] ez_Queue = [] ez_Queue.append([0, 0]) y_act = [] y_pred = [] VM_Output = [] plsWindow = self.getPlsWindow() sample_vm_VP = [] sample_vm_EP = [] for k in range(0, N + 1): sample_vm_VP.append(self.sampling_vp()) sample_vm_EP.append(self.sampling_ep()) vp_next = sample_vm_VP[0] ep_next = sample_vm_EP[0] for z in np.arange(0, Z): for k in np.arange(z * M + 1, ((z + 1) * M) + 1): if f is not None: fp = f[k - 1, 0:2] else: fp = None if k == 1: uk_next = np.array([0, 0]) # 계산 공식에 의해 Dk_prev = np.array([0, 0]) Kd_prev = np.array([0, 0]) # y값의 시작과 끝 정보, 전체 값 정보 idx_start, idx_end, result = self.sampling(k, uk_next, vp_next, ep_next, fp, False) psiK = result[0:idx_start] # 파라미터 값들 psiKStar = psiK - meanVz # 파라미터 값들 평균 마이너스 y_predK = self.pls.predict(psiKStar.reshape(1, idx_start)) + meanYz # 예측값 + 평균 rows = np.r_[result, y_predK.reshape(2, )] #실제값 + 2개 예측값을 rows로, run 10일때가 actual, vm 차이 비교 y_pred.append(rows[idx_end:idx_end + 2]) #예측 값 ==> 10개의 VM 값인데.. y_act.append(rows[idx_start:idx_end]) #실제 값 ==> 시뮬레이션의 실제 값 인데.. 
# ================================== VM + R2R Control ===================================== if k % M != 0: #예측 값 yk = rows[idx_end:idx_end + 2] else: yk = rows[idx_start:idx_end] #실제 값 e1 = np.absolute(rows[idx_start + 1:idx_end] - rows[idx_end + 1:idx_end + 2]) uk = psiK[0:2] Dk = (yk - uk.dot(self.A)).dot(dEWMA_Wgt1) + Dk_prev.dot(I - dEWMA_Wgt1) Kd = (yk - uk.dot(self.A) - Dk_prev).dot(dEWMA_Wgt2) + Kd_prev.dot(I - dEWMA_Wgt2) Kd_prev = Kd Dk_prev = Dk if isR2R == True: uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A)) vp_next = sample_vm_VP[k] uk_next = uk_next.reshape(2, ) ep_next = sample_vm_EP[k] M_Queue.append(rows) # M_Queue에 rows의 정보 del plsWindow[0:M] #Queue의 가장 처음 Run 10이 없어진다. if isR2R == False: uk_next = (self.Tgt - Dk - Kd).dot(np.linalg.inv(self.A)) vp_next = sample_vm_VP[k] # 여기서 부터는 모델 업데이트를 위한 과정이다. 이미 VM은 rows 정보에 있지만, 가중치를 반영해 준다. if z == 0: ez = 0 npM_Queue = np.array(M_Queue) #parameter + 실제값 + 2개 예측값을 rows로, run 10일 때가 actual, vm 차이 비교 # M은 Run 주기이며, 10, M-1은 run = 10을 제외한 VM들이겠지 # idx_start 까지는 파라미터 값들로 lamda_PLS 0.1을 반영하겠다는 의미지.. for i in range(M): #VM_Output 구한다. lamda_pls 가중치를 반영하지 않는다. if i == M - 1: temp = npM_Queue[i:i + 1, idx_start:idx_end] else: temp = npM_Queue[i:i + 1, idx_end:idx_end + 2] VM_Output.append(np.array([temp[0, 0], temp[0, 1]])) # emax = 5 # lamda_PLS = 1 - e1/emax # if lamda_PLS <= 0: # lamda_PLS = 0.1 # # print("e1 : ", e1, "P2 lamda_PLS : ", lamda_PLS) if f is not None: p1_lamda_PLS = f[k - 1, 2:3] npM_Queue[0:M - 1, idx_start - 2:idx_start] = p1_lamda_PLS * npM_Queue[0:M - 1, idx_start - 2:idx_start] #np.savetxt("output/npM_Queue2.csv", npM_Queue, delimiter=",", fmt="%s") npM_Queue[0:M - 1, 0:idx_start] = lamda_PLS * npM_Queue[0:M - 1, 0:idx_start] # idx_start:idx_end는 실제 값에 VM 값들의 조정을 통해 모델을 위해 VM의 정보를 업데이트 한다. npM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (npM_Queue[0:M - 1, idx_end:idx_end + 2] + 0.5 * ez) #npM_Queue[0:M - 1, idx_start:idx_end] = lamda_PLS * (npM_Queue[0:M - 1, idx_end:idx_end + 2]) # 0.5 * ez 반영안할시 npM_Queue = npM_Queue[:, 0:idx_end] #여기에는 VM + Actual 실제값들이 저장되어 있다. # for i in range(M): #VM_Output 구한다. lamda_pls 가중치를 반영하지 않는다. # temp = npM_Queue[i:i + 1, idx_start:idx_end] # VM_Output.append(np.array([temp[0, 0], temp[0, 1]])) for i in range(M): plsWindow.append(npM_Queue[i]) #전체 Queue에 넣는다. M_Mean = np.mean(plsWindow, axis=0) #Queue의 평균을 구한다. meanVz = M_Mean[0:idx_start] #파라미터 평균 meanYz = M_Mean[idx_start:idx_end] #y값 run시마다 vm 9개(lamda_pla 0.1) 실제 1개(lamda_pls 1) 평균 plsModelData = plsWindow - M_Mean #Queue의 평균 제외 V = plsModelData[:, 0:idx_start] #모델을 위한 파라미터 Y = plsModelData[:, idx_start:idx_end] #모델을 위한 y값 self.pls_update(V, Y) ez = M_Queue[M - 1][idx_start:idx_end] - M_Queue[M - 1][idx_end:idx_end + 2] ez_Queue.append(ez) # print("ez : ", ez) del M_Queue[0:M] y_act = np.array(y_act) y_pred = np.array(y_pred) # print("VM Mean squared error: %.3f" % metrics.mean_squared_error(y_act[:,1:2], y_pred[:,1:2])) # print("VM r2 score: %.3f" % metrics.r2_score(y_act[:,1:2], y_pred[:,1:2])) return y_act[:, 1:2]
X = dataset["data"] y = dataset["target"] # Center each feature and scale the variance to be unitary X = preprocessing.scale(X) # Compute the variance for each column print(numpy.var(X, 0).sum()) # Now use PCA using 3 components pca = PCA(3) X2 = pca.fit_transform(X) print(numpy.var(X2, 0).sum()) pls = PLSRegression(3) pls.fit(X, y) X2 = pls.transform(X) print(numpy.var(X2, 0).sum()) # Make predictions using an SVM with PCA and PLS pca_error = 0 pls_error = 0 n_folds = 10 svc = LinearSVC() for train_inds, test_inds in KFold(X.shape[0], n_folds=n_folds): X_train, X_test = X[train_inds], X[test_inds] y_train, y_test = y[train_inds], y[test_inds]
def get_best_estimator(): """Hyperparameter optimization""" df = load_data() Y = df['Fitness'] X = df[['Variants']] features = FeatureUnion([ #('one_hot_encoder', OneHotEncoder()), #('one_hot_pair_encoder', OneHotPairEncoder()), #('pybiomed_encoder', PyBioMedEncoder()), ('aaindex_encoder', AAIndexEncoder()) ]) print('*' * 40) print('Extracting features...') print('*' * 40) start = timer() X = features.transform(X) end = timer() print('Finished in: {}'.format(end - start)) num_rows, num_cols = X.shape assert num_rows == len(df) print('Got {} features'.format(num_cols)) # TODO include this in pipeilne imp = SimpleImputer(missing_values=np.nan, strategy='mean') imp.fit(X) X = imp.transform(X) assert not pd.DataFrame(X).isna().any().any() X = FFTEncoder().fit_transform(X) import ipdb ipdb.set_trace() n_features_options = [int(num_cols * ratio) for ratio in N_FEATURES_RATIOS] print('n_features_options:', n_features_options) feature_reduction_grid = [ { 'reduce': [ #PCA(), NMF() ], 'reduce__n_components': n_features_options, }, #{ # 'reduce': [SelectKBest()], # 'reduce__score_func': [ # f_regression, # mutual_info_regression # ], # 'reduce__k': n_features_options, #}, ] # Random forest features # Number of trees n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)] # Number of features to consider at every split max_features = ['auto', 'sqrt'] # Maximum number of levels in tree max_depth = [int(x) for x in np.linspace(10, 110, num=11)] max_depth.append(None) # Minimum number of samples required to split a node min_samples_split = [2, 5, 10] # Minimum number of samples required at each leaf node min_samples_leaf = [1, 2, 4] # Method of selecting samples for training each tree bootstrap = [True, False] # TODO: search over more params regression_grid = [ #{ # 'regress': [ # #KNeighborsRegressor(), # #linear_model.ARDRegression(), # #linear_model.BayesianRidge(), # #linear_model.ElasticNet(), # #linear_model.LassoLars(), # #linear_model.LinearRegression(), # #linear_model.Ridge(), # #linear_model.SGDRegressor(), # #tree.DecisionTreeRegressor(), # #ensemble.AdaBoostRegressor(), # #ensemble.BaggingRegressor(), # #ensemble.GradientBoostingRegressor(), # ] #}, #{ # 'regress': [ensemble.RandomForestRegressor()], # #'regress__n_estimators': n_estimators, # #'regress__max_features': max_features, # #'regress__max_depth': max_depth, # #'regress__min_samples_split': min_samples_split, # #'regress__min_samples_leaf': min_samples_leaf, # #'regress__bootstrap': bootstrap #}, { 'regress': [PLSRegression()] } #{ # 'regress': [svm.NuSVR()], # 'regress__C': [1, 10, 100, 1000], # 'regress__kernel': ['rbf', 'linear', 'poly'], #}, #{ # 'regress': [svm.LinearSVR()], # 'regress__C': [1, 10, 100, 1000] #} #{ # 'regress': [neural_network.MLPRegressor()], # 'regress__hidden_layer_sizes': [(100,)] #}, ] pipeline = Pipeline( [ #('fft', FFTEncoder()), #('reduce', DummyEstimator()), ('regress', DummyEstimator()) ], #memory=memory ) grid_steps = [ #feature_reduction_grid, regression_grid ] combined_grids = get_combined_grids(grid_steps) print('combined_grids:') pprint(combined_grids) kfold = KFold(n_splits=NUM_FOLDS or num_rows, random_state=0) search = GridSearchCV(pipeline, combined_grids, error_score=np.nan, verbose=5, n_jobs=-1, cv=kfold) print('*' * 40) print('Searching') print('*' * 40) start = timer() search.fit(X, Y) end = timer() print('Finished in: {}'.format(end - start)) best_estimator = search.best_estimator_ best_params = search.best_params_ best_score = search.best_score_ best_index = 
search.best_index_ best_std = search.cv_results_['std_test_score'][best_index] print('best_estimator:', best_estimator) print('best_params:', best_params) print('best_score:', best_score) print('best_std:', best_std) return Pipeline([('features', features), ('estimator', best_estimator)])