def pls_train(self, X, Y, verbose=True):
    """Fit a default-configured ``PLSRegression`` and return it.

    ``X`` is passed through ``simple_normalize`` before fitting; ``Y`` is
    handed to the estimator unchanged.  When ``verbose`` is truthy a
    progress line is printed just before the fit.
    """
    # NOTE(review): the progress text says "canonical" but the estimator
    # constructed here is PLSRegression -- confirm which one is intended.
    normalized = simple_normalize(X)
    model = PLSRegression()
    if verbose:
        print('fitting canonical pls...')
    model.fit(normalized, Y)
    return model
def plss(X, y, cv, n_components=1):
    """Return the out-of-fold sum of squared PLS prediction errors.

    For each ``(train, test)`` index pair yielded by ``cv``, the training
    rows of ``X`` and ``y`` are centred on their own column means, a
    ``PLSRegression`` with ``n_components`` components is fitted on them,
    and the squared residuals on the test rows (shifted by the same
    training means) are summed over rows.  The per-fold sums are
    accumulated into one total per column of ``y``; ``y`` must be 2-D
    because ``y.shape[1]`` is read.
    """
    model = PLSRegression(n_components=n_components)
    total = np.zeros(y.shape[1])
    for fit_idx, held_out in cv:
        x_fit, x_out = X[fit_idx], X[held_out]
        y_fit, y_out = y[fit_idx], y[held_out]
        # Centring statistics come from the training rows only.
        y_mean = y_fit.mean(0)
        x_mean = x_fit.mean(0)
        model.fit(x_fit - x_mean, y_fit - y_mean)
        residual = y_out - y_mean - model.predict(x_out - x_mean)
        total += np.sum(residual ** 2, 0)
    return total
def plss(X, y, cv, n_components=1):
    """Sum squared test-fold errors of a PLS regression over ``cv`` splits.

    ``cv`` yields ``(train, test)`` indexers into the rows of ``X`` and
    ``y``.  Each fold is centred with the training-row means, fitted with a
    ``PLSRegression`` of ``n_components`` components, and scored on the
    test rows.  The result holds one accumulated squared error per column
    of ``y`` (``y.shape[1]`` entries).
    """
    estimator = PLSRegression(n_components=n_components)

    def fold_sse(train, test):
        # Squared error of one split, summed over test rows (axis 0).
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        y_offset = y_train.mean(0)
        X_offset = X_train.mean(0)
        estimator.fit(X_train - X_offset, y_train - y_offset)
        predicted = estimator.predict(X_test - X_offset)
        return np.sum((y_test - y_offset - predicted) ** 2, 0)

    sse = np.zeros(y.shape[1])
    for train, test in cv:
        sse += fold_sse(train, test)
    return sse
def pls_kfold(sample_set, kfold_group_count, max_components, preprocess):
    """Cross-validate PLS predictions for 1..``max_components`` components.

    Parameters
    ----------
    sample_set : str
        Inserted into the input file name ``'linre_big' + sample_set + '.npz'``.
    kfold_group_count : callable
        Called with ``len(disa)``; its result is used as the number of folds.
    max_components : int
        Models with 1, 2, ..., ``max_components`` components are evaluated.
    preprocess : str
        Used both as a truth value (enables the clip + fourth-root
        transform) and as a prefix of the output file name.

    Nothing is returned; the filtered ``X``, ``Y``, ``expa`` and the matrix
    of out-of-fold predictions (one column per component count) are written
    to ``'out23/' + preprocess + 'pred.npz'``.
    """
    print("load...")
    archive = AttrDict(load('linre_big' + sample_set + '.npz'))
    disa = archive.disa
    expa = archive.expa
    Y = disa[:, None]
    X = archive.flum.T
    X, Y, expa = shuffle(X, Y, expa, random_state=1)

    print("fix...")
    peak_err, X = find_peaks(X, archive.exa)

    # Outlier screen: keep samples whose first PLS score is above
    # minus two standard deviations of that score.
    screen = PLSRegression(scale=False, algorithm='svd')
    screen.fit(X=X, Y=Y)
    scores = screen.transform(X.copy())
    first_score = scores[:, 0]
    keep = first_score > -first_score.std() * 2
    X, Y, expa = X[keep, :], Y[keep, :], expa[keep]

    # NOTE(review): the original indentation was lost; both transform
    # statements are taken to belong to the `if` -- confirm.
    if preprocess:
        X[X < 0.5] = 0.5
        X = X ** 0.25
    # save?

    print("cross-validation...")
    # NOTE(review): the fold count is derived from len(disa), i.e. the
    # sample count before the outlier screen, while KFold below is built
    # over len(Y) -- confirm this is intended.
    n_folds = kfold_group_count(len(disa))
    Ypred4n_components = empty((len(Y), max_components))
    for column, n_components in enumerate(arange(max_components) + 1):
        fold_pred = empty_like(Y)
        folds = KFold(n=len(Y), k=n_folds, indices=False)
        for fit_mask, test_mask in folds:
            model = PLSRegression(scale=False, algorithm='svd',
                                  n_components=n_components)
            model.fit(X=X[fit_mask].copy(), Y=Y[fit_mask].copy())
            fold_pred[test_mask] = model.predict(X[test_mask].copy())
        Ypred4n_components[:, column] = fold_pred[:, 0]
        print("done for " + str(n_components) + " components")

    savez('out23/' + preprocess + 'pred.npz',
          X=X, Y=Y, expa=expa,
          Ypred4n_components=Ypred4n_components)
# Fragment of a larger routine: `l`, `disa`, `flum`, `preprocess` and
# `samples_in_testing_set` are defined outside the visible code.
exa = l['exa']
expa = l['expa']
# Response as a column (assumes disa is 1-D -- TODO confirm); predictors
# are the transpose of flum.
Y = disa[:,None]
X = flum.T
# Fixed random_state, so the permutation is the same on every run.
X, Y, expa = shuffle(X, Y, expa, random_state=1)
print "fix peaks..."
# find_peaks returns a pair; only the second element (the new X) is used
# in the visible code.
X_err, X = find_peaks(X,exa)
print "fix outliers..."
# Fit a PLS model on all samples and look at the first two score columns.
pls = PLSRegression( scale=False, algorithm='svd' )
pls.fit(X=X,Y=Y)
PC = pls.transform(X.copy())
PC1, PC2 = PC[:,0], PC[:,1]
# Keep samples whose first score is above minus two standard deviations.
good = PC1 > -PC1.std()*2
plot_scores(fn='_bad_1', expa=expa, x=PC1,y=PC2, xl='T1',yl='T2', title=', bad')
# Show the expa entries of the rejected samples, then drop those samples.
print expa[logical_not(good)]
X, Y, expa = X[good,:], Y[good,:], expa[good]
print "preprocess with power..."
# NOTE(review): original indentation was lost; both statements are taken
# to belong to the `if` -- confirm.
if preprocess:
    # Clip values below 0.5 up to 0.5, then take the fourth root.
    X[X<0.5]=0.5
    X = X**0.25
print "fit..."
# Boolean mask that is True for row positions >= samples_in_testing_set
# (presumably the rows used for fitting -- verify against later code).
a4fit = arange(len(X)) >= samples_in_testing_set
pl.yticks(())
pl.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n, q, p = 1000, 3, 10
X = np.random.normal(size=(n, p))
# Each response column is Yj = 1*X1 + 2*X2 + noise, so every column of B
# is [1, 2, 0, ..., 0].
B = np.repeat([[1, 2] + [0] * (p - 2)], q, axis=0).T
Y = np.dot(X, B) + np.random.normal(size=(n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# Compare pls2.coefs with B.
print("Estimated B")
print(np.round(pls2.coefs, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n, p = 1000, 10
X = np.random.normal(size=(n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n) + 5
pls1 = PLSRegression(n_components=3)
pl.legend()
pl.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2

n = 1000
q = 3
p = 10
X = np.random.normal(size=(n, p))
# B stacks the same coefficient vector [1, 2, 0, ..., 0] for all q
# responses: each Yj = 1*X1 + 2*X2 + noise.
B = np.tile([1, 2] + [0] * (p - 2), (q, 1)).T
Y = X.dot(B) + np.random.normal(size=(n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# The estimate below should be compared with B printed above.
print("Estimated B")
print(np.round(pls2.coefs, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=(n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n) + 5
pls1 = PLSRegression(n_components=3)
# Fragment from inside one or more loops whose headers are not visible
# (ncomp, the train/test splits and the mse_* / y* lists come from outside).
# NOTE(review): the original indentation was lost, so everything is shown
# at one level; the closing y*.append(...) lines presumably sit one loop
# level further out than the mse_*.append(...) lines -- confirm.
pca = RandomizedPCA(n_components=ncomp, whiten=True)
# fMRI only: PCA is fitted on the training rows, then applied to the test rows.
clf = LinearRegression().fit(pca.fit_transform(X_fmri_train), y_train)
mse_fmri.append(mean_squared_error(clf.predict(pca.transform(X_fmri_test)), y_test))
# MEG only: the same `pca` object is refitted on the MEG training rows.
clf = LinearRegression().fit(pca.fit_transform(X_meg_train), y_train)
mse_meg.append(mean_squared_error(clf.predict(pca.transform(X_meg_test)), y_test))
# Both modalities concatenated column-wise (MEG first, then fMRI).
both_train = np.hstack([X_meg_train, X_fmri_train])
both_test = np.hstack([X_meg_test, X_fmri_test])
clf = LinearRegression().fit(pca.fit_transform(both_train), y_train)
mse_pca.append(mean_squared_error(clf.predict(pca.transform(both_test)), y_test))
# PLS canonical scores of the two modalities, fitted on the training rows.
plsca.fit(X_meg_train, X_fmri_train)
X_mc_train, X_fc_train = plsca.transform(X_meg_train, X_fmri_train)
X_mc_test, X_fc_test = plsca.transform(X_meg_test, X_fmri_test)
# NOTE(review): the regression is fitted on the MEG scores only and then
# also used to predict from the fMRI scores; no model is fitted on
# X_fc_train here -- confirm this is intentional.
clf = LinearRegression().fit(X_mc_train, y_train)
mse_plsm.append(mean_squared_error(clf.predict(X_mc_test), y_test))
mse_plsf.append(mean_squared_error(clf.predict(X_fc_test), y_test))
# dumb.fit(X_fmri_train, X_meg_train)
# dumb_pred = dumb.predict(X_fmri_test)
# dumb_mae += mean_absolute_error(X_meg_test,dumb_pred)
# Root of the mean of the collected MSE values, one entry per method.
yf.append(np.sqrt(np.mean(mse_fmri)))
ym.append(np.sqrt(np.mean(mse_meg)))
ypca.append(np.sqrt(np.mean(mse_pca)))
yplsm.append(np.sqrt(np.mean(mse_plsm)))
yplsf.append(np.sqrt(np.mean(mse_plsf)))
# Build the dataset, fit a default PLSRegression on the first half and
# evaluate it on the second half.
params = {"LAMBDA": 0.4, "dimension": 4096}
c = DatasetCreator(dtk_params=params, encoder_params=[4096, 3])
D = c.get_d()
n = len(D[0])
print(D[1])
# D[0] holds the inputs and D[1] the targets (both are sliced identically).
# Floor division keeps the split index an int: with true division
# (Python 3) `n / 2` is a float and cannot be used as a slice bound.
train_X, train_Y = D[0][:n // 2], D[1][:n // 2]
test_X, test_Y = D[0][n // 2:], D[1][n // 2:]
pls2 = PLSRegression()
pls2.fit(train_X, train_Y)
#print(pls2.coefs)
pred = pls2.predict(test_X)
# Mean squared error over every element of the held-out predictions.
mean_err = np.mean((pred - test_Y)**2)
print(mean_err)
mean_cos = 0
mean_cos_original = 0
# Accumulate the cosine similarity between each prediction and its target.
# Only the running sum is formed in the visible code (presumably divided
# by the number of rows later -- verify).
for i, j in zip(pred, test_Y):
    mean_cos = mean_cos + np.dot(i, j) / np.sqrt(np.dot(i, i) * np.dot(j, j))
# Running sums of the per-fold mean absolute errors; `cv`, `X_fmri`,
# `X_meg`, `y`, `pls`, `dumb` and `within` are defined outside this fragment.
mae = 0
dumb_mae = 0
meg_mae, fmri_mae = 0, 0
# `oidx` (the fold number) is not used in the visible part of the loop.
for oidx, (train, test) in enumerate(cv):
    X_fmri_train = X_fmri[train]
    X_fmri_test = X_fmri[test]
    X_meg_train = X_meg[train]
    X_meg_test = X_meg[test]
    y_train = y[train]
    y_test = y[test]
    # Combined design matrix: fMRI columns followed by MEG columns.
    X_train = np.hstack([X_fmri_train,X_meg_train])
    X_test = np.hstack([X_fmri_test,X_meg_test])
    pls.fit(X_train, y_train)
    pred = pls.predict(X_test)
    mae += mean_absolute_error(y_test, pred)
    # Second estimator on the same split (presumably a trivial baseline,
    # judging by its name -- verify where `dumb` is created).
    dumb.fit(X_train, y_train)
    dumb_pred = dumb.predict(X_test)
    dumb_mae += mean_absolute_error(y_test,dumb_pred)
    # Optional per-modality fits; the same `pls` object is refitted each time.
    if within:
        pls.fit(X_fmri_train, y_train)
        pred = pls.predict(X_fmri_test)
        fmri_mae += mean_absolute_error(y_test, pred)
        pls.fit(X_meg_train, y_train)
        pred = pls.predict(X_meg_test)
X[X<0.5]=0.5 #0.7 X = X**0.25 group_count = 11 n_components_list = range(9,20) for n_components in n_components_list: Ypred = empty_like(Y) loo = KFold( n=len(Y), k=group_count, indices=False ) for fit, test in loo: pls = PLSRegression( scale=False, algorithm='svd', n_components=n_components ) pls.fit( X=X[fit].copy(), Y=Y[fit].copy() ) Ypred[test] = pls.predict(X[test].copy()) print n_components, RMSEP(Y[:,0],Ypred[:,0]) #n, bins, patches = plt.hist(X.flatten(),40,range=(0,2)) #plt.show() """ stuff: print [v for v in X.flatten() if v<-0.4] #only 3 numbers from X <-0.4 X = (1-X)**2/X*2 #Kubelka-Munk function """ """- 5 0.407984567727 6 0.354843551016 7 0.340217332243
def test_predictions():
    """Compare the local PLS estimators with scikit-learn on Linnerud data.

    All models use two components, no scaling, tolerance 5e-12 and at most
    1000 iterations.  The checks are:

    * ``PLSR`` predictions equal ``PLSRegression`` predictions to five
      decimals, and their residual sums of squares differ by < 5e-5;
    * ``PLSC`` and ``PLSCanonical`` residual sums of squares differ by
      < 5e-5;
    * ``O2PLS`` (``num_comp=[2, 1, 0]``) has a strictly smaller residual
      sum of squares than ``PLSCanonical``.
    """
    linnerud = load_linnerud()
    X, Y = linnerud.data, linnerud.target
    Xorig, Yorig = X.copy(), Y.copy()

    num_comp = 2
    sklearn_kwargs = dict(n_components=num_comp, scale=False, tol=5e-12,
                          max_iter=1000, copy=True)
    local_kwargs = dict(center=True, scale=False, tolerance=5e-12,
                        max_iter=1000)

    def residual_ss(prediction):
        # Sum of squared differences to the untouched copy of Y.
        return np.sum((Yorig - prediction) ** 2)

    # sklearn PLSRegression versus PLSR.
    sk_regression = PLSRegression(**sklearn_kwargs)
    sk_regression.fit(Xorig, Yorig)
    Yhat1 = sk_regression.predict(Xorig)
    SSYdiff1 = residual_ss(Yhat1)

    local_regression = PLSR(num_comp=num_comp, **local_kwargs)
    local_regression.fit(X, Y)
    Yhat3 = local_regression.predict(X)
    assert_array_almost_equal(Yhat1, Yhat3, decimal=5,
                              err_msg="PLSR gives wrong prediction")
    SSYdiff3 = residual_ss(Yhat3)
    assert abs(SSYdiff1 - SSYdiff3) < 0.00005

    # sklearn PLSCanonical versus PLSC and O2PLS.
    sk_canonical = PLSCanonical(**sklearn_kwargs)
    sk_canonical.fit(Xorig, Yorig)
    SSYdiff2 = residual_ss(sk_canonical.predict(Xorig))

    local_canonical = PLSC(num_comp=num_comp, **local_kwargs)
    local_canonical.fit(X, Y)
    SSYdiff4 = residual_ss(local_canonical.predict(X))

    o2pls = O2PLS(num_comp=[num_comp, 1, 0], **local_kwargs)
    o2pls.fit(X, Y)
    SSYdiff5 = residual_ss(o2pls.predict(X))

    assert abs(SSYdiff2 - SSYdiff4) < 0.00005
    assert SSYdiff2 > SSYdiff5
# Fragment: `reader`, `convert`, `good_cols`, `labels`, `format`, `artist`
# and `plt` are defined outside the visible code.
# Keep rows that contain 'evd', pick the wanted columns, and drop rows with
# an empty, 'NA' or '?' field.  list() around filter() makes the result a
# real list on Python 3 as well (there filter() returns an iterator).
data = np.array(list(filter(
    lambda row: '' not in row and 'NA' not in row and '?' not in row,
    [[convert(row[i]) for i in good_cols] for row in reader if 'evd' in row]
))).astype(float)
# Drop any row that still contains a NaN.
data = data[~np.isnan(data).any(axis=1)]
loc = data[:,9]
tumor = data[:,4]
#data = (data- data.min(axis=0))/(data.max(axis=0)-data.min(axis=0))
# Column 7 is the response; all remaining columns are predictors.
y = data[:,7]
x = np.delete(data,7,1)
# Min-max scale each predictor column.
# NOTE(review): a constant column gives 0/0 here -- confirm none occur.
x = (x-x.min(axis=0))/(x.max(axis=0)-x.min(axis=0))
print(x.shape)
pls1 = PLSRegression(n_components = x.shape[1])
pls1.fit(x,y)
# Log of the coefficients; NaNs (log of a negative value) become 0 and
# infinities become large finite numbers via nan_to_num.
cfs = np.nan_to_num(np.log(pls1.coefs))
fig, (coeffs,dist) = plt.subplots(nrows=1,ncols=2)
# The loop variable is deliberately not called `x`: on Python 2 a list
# comprehension leaks its variable and would overwrite the design matrix.
bar_colors = ['r' if _cf<0 else 'g' for _cf in cfs]
coeffs.barh(range(len(cfs)),cfs, edgecolor='k', color = bar_colors, linewidth=1)
artist.adjust_spines(coeffs)
# Reference lines at zero and at +/-5.3.
coeffs.axvline(x=0,color='k',linestyle='--',linewidth=2)
coeffs.axvline(x=-5.3,color='r',linestyle='--',linewidth=1)
coeffs.axvline(x=5.3,color='r',linestyle='--',linewidth=1)
coeffs.set_xlabel(r'\Large \textbf{Importance} $\left(\log \beta\right)$')
coeffs.set_yticks(range(len(labels)))
# list() so the tick labels are a concrete sequence on Python 3 too.
coeffs.set_yticklabels(list(map(format,labels)))