def plss(X, y, cv, n_components=1):
    """Per-response sum of squared errors of a PLS regression,
    accumulated over the cross-validation splits in `cv`."""
    pls = PLSRegression(n_components=n_components)
    sse = np.zeros(y.shape[1])
    for train, test in cv:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        # Center on the training fold only, then fit and score on the test fold.
        y0 = y_train.mean(0)
        X0 = X_train.mean(0)
        pls.fit(X_train - X0, y_train - y0)
        sse += np.sum((y_test - y0 - pls.predict(X_test - X0)) ** 2, 0)
    return sse

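# A minimal usage sketch for plss (assumptions: a modern scikit-learn KFold
# and synthetic data; plss itself only needs an iterable of (train, test)
# index pairs).
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(50, 8))
y_demo = rng.normal(size=(50, 2))
splits = KFold(n_splits=5).split(X_demo)
print(plss(X_demo, y_demo, splits, n_components=2))  # SSE per response column
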
def pls_kfold(sample_set, kfold_group_count, max_components, preprocess):
    print("load...")
    l = AttrDict(load('linre_big' + sample_set + '.npz'))
    disa = l.disa
    expa = l.expa
    Y = disa[:, None]
    X = l.flum.T
    X, Y, expa = shuffle(X, Y, expa, random_state=1)

    print("fix...")
    X_err, X = find_peaks(X, l.exa)
    # Drop outliers along the first PLS component.
    pls = PLSRegression(scale=False, algorithm='svd')  # legacy scikit-learn parameter
    pls.fit(X=X, Y=Y)
    PC = pls.transform(X.copy())
    PC1 = PC[:, 0]
    good = PC1 > -PC1.std() * 2
    X, Y, expa = X[good, :], Y[good, :], expa[good]
    if preprocess:
        X[X < 0.5] = 0.5
        X = X**0.25
    #save?

    print("cross-validation...")
    group_count = kfold_group_count(len(disa))
    Ypred4n_components = empty((len(Y), max_components))
    for n_components in arange(max_components) + 1:
        Ypred = empty_like(Y)
        # Legacy KFold signature; indices=False yields boolean masks.
        loo = KFold(n=len(Y), k=group_count, indices=False)
        for fit, test in loo:
            pls = PLSRegression(scale=False, algorithm='svd',
                                n_components=n_components)
            pls.fit(X=X[fit].copy(), Y=Y[fit].copy())
            Ypred[test] = pls.predict(X[test].copy())
        Ypred4n_components[:, n_components - 1] = Ypred[:, 0]
        print("done for " + str(n_components) + " components")
    # `preprocess` doubles as a truthy flag and a filename prefix here.
    savez('out23/' + preprocess + 'pred.npz',
          X=X, Y=Y, expa=expa, Ypred4n_components=Ypred4n_components)

def dict2mean(X, params):  # renamed from `dict` to avoid shadowing the builtin
    """Rebuild a PLSRegression from stored attributes and predict on X.

    Appears to rely on an older scikit-learn API whose predict() consumes
    x_mean_, y_mean_ and coefs directly.
    """
    plsca = PLSRegression(n_components=np.shape(params['coefs'])[0])
    plsca.x_mean_ = params['x_mean']
    plsca.y_mean_ = params['y_mean']
    plsca.coefs = params['coefs']
    return plsca.predict(X)

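# A minimal usage sketch for dict2mean (assumptions: the same older
# scikit-learn release as above, and a hypothetical already-fitted model
# `pls_fitted` whose attributes were stored earlier).
params = {'x_mean': pls_fitted.x_mean_,
          'y_mean': pls_fitted.y_mean_,
          'coefs': pls_fitted.coefs}
Y_new = dict2mean(X_new, params)
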
if preprocess:
    X[X < 0.5] = 0.5
    X = X**0.25

print("fit...")
# Split by position: the first samples_in_testing_set rows form the test set.
a4fit = arange(len(X)) >= samples_in_testing_set
a4test = logical_not(a4fit)
X4fit, Y4fit, expa4fit = X[a4fit, :], Y[a4fit, :], expa[a4fit]
X4test, Y4test, expa4test = X[a4test, :], Y[a4test, :], expa[a4test]
pls = PLSRegression(n_components=n_components, algorithm='svd', scale=False)
pls.fit(X=X4fit, Y=Y4fit)

print("predict...")
Y_pred = pls.predict(X4test.copy())
dis4test = Y4test[:, 0]
dis_pred = Y_pred[:, 0]
dis_max = max(disa)
#dis_pred = where(dis_pred<dis_max+1,where(dis_pred<-1,-1,dis_pred),dis_max+1)

persons = Persons(expa4test)
#print ia4test, ia4fit, logical_not(a4fit & good_std)
#print expa4test.shape, dis4test.shape, dis_pred.shape, Y.shape, Y4test.shape, Y_pred.shape
plt.plot([0, dis_max], [0, dis_max], 'g-')  # identity line: perfect prediction
persons.plot(plt, dis4test, dis_pred)
plt.savefig(out_pre + "pred.png")
plt.cla()

n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coefs with B
print("Estimated B")
print(np.round(pls2.coefs, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5

pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coefs, 1))

###############################################################################

comp_scores = []
dumb_scores = []
for ncomp in max_comps:
    print('Trying %d components' % ncomp)
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')
    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]

        # PLS mapping from fMRI features to MEG features
        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)
        mae += mean_absolute_error(X_meg_test, pred)

        # mean-prediction baseline for comparison
        dumb.fit(X_fmri_train, X_meg_train)
        dumb_pred = dumb.predict(X_fmri_test)
        dumb_mae += mean_absolute_error(X_meg_test, dumb_pred)
    comp_scores.append(mae / nfolds)
    dumb_scores.append(dumb_mae / nfolds)

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.plot(max_comps, comp_scores, max_comps, dumb_scores)
t_str = seed + str(band)

c = DatasetCreator(dtk_params=params, encoder_params=[4096, 3])
D = c.get_d()
n = len(D[0])
print(D[1])
# Integer division so the slice bounds stay valid indices.
train_X, train_Y = D[0][:n // 2], D[1][:n // 2]
test_X, test_Y = D[0][n // 2:], D[1][n // 2:]

pls2 = PLSRegression()
pls2.fit(train_X, train_Y)
#print(pls2.coefs)
pred = pls2.predict(test_X)
mean_err = np.mean((pred - test_Y)**2)
print(mean_err)

# Mean cosine similarity between predicted and true vectors,
# averaged over the test set (not over all n samples).
mean_cos = 0
for i, j in zip(pred, test_Y):
    mean_cos += np.dot(i, j) / np.sqrt(np.dot(i, i) * np.dot(j, j))
print(mean_cos / len(pred))

mae = 0
dumb_mae = 0
meg_mae, fmri_mae = 0, 0
for oidx, (train, test) in enumerate(cv):
    X_fmri_train = X_fmri[train]
    X_fmri_test = X_fmri[test]
    X_meg_train = X_meg[train]
    X_meg_test = X_meg[test]
    y_train = y[train]
    y_test = y[test]

    # Combined fMRI + MEG feature matrix
    X_train = np.hstack([X_fmri_train, X_meg_train])
    X_test = np.hstack([X_fmri_test, X_meg_test])
    pls.fit(X_train, y_train)
    pred = pls.predict(X_test)
    mae += mean_absolute_error(y_test, pred)

    dumb.fit(X_train, y_train)
    dumb_pred = dumb.predict(X_test)
    dumb_mae += mean_absolute_error(y_test, dumb_pred)

    if within:
        # Single-modality fits for comparison
        pls.fit(X_fmri_train, y_train)
        pred = pls.predict(X_fmri_test)
        fmri_mae += mean_absolute_error(y_test, pred)

        pls.fit(X_meg_train, y_train)
        pred = pls.predict(X_meg_test)
        meg_mae += mean_absolute_error(y_test, pred)

X = X**0.25

group_count = 11
n_components_list = range(9, 20)
for n_components in n_components_list:
    Ypred = empty_like(Y)
    # Legacy KFold signature; indices=False yields boolean masks.
    loo = KFold(n=len(Y), k=group_count, indices=False)
    for fit, test in loo:
        pls = PLSRegression(scale=False, algorithm='svd',
                            n_components=n_components)
        pls.fit(X=X[fit].copy(), Y=Y[fit].copy())
        Ypred[test] = pls.predict(X[test].copy())
    print(n_components, RMSEP(Y[:, 0], Ypred[:, 0]))

#n, bins, patches = plt.hist(X.flatten(), 40, range=(0, 2))
#plt.show()

"""
stuff:
print([v for v in X.flatten() if v < -0.4])  # only 3 numbers from X < -0.4
X = (1-X)**2/X*2  # Kubelka-Munk function
"""

# RMSEP per number of components (earlier run):
#  5  0.407984567727
#  6  0.354843551016
#  7  0.340217332243
#  8  0.328329231934

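# RMSEP is not defined in this snippet; a minimal sketch of the usual
# definition it appears to rely on (root mean squared error of prediction):
import numpy as np

def RMSEP(y_true, y_pred):
    """Root mean squared error of prediction."""
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))
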
def test_predictions():
    d = load_linnerud()
    X = d.data
    Y = d.target

    tol = 5e-12
    miter = 1000
    num_comp = 2
    Xorig = X.copy()
    Yorig = Y.copy()
    # SSY = np.sum(Yorig**2)
    # center = True
    scale = False

    pls1 = PLSRegression(n_components=num_comp, scale=scale,
                         tol=tol, max_iter=miter, copy=True)
    pls1.fit(Xorig, Yorig)
    Yhat1 = pls1.predict(Xorig)
    SSYdiff1 = np.sum((Yorig - Yhat1)**2)
    # print "PLSRegression: R2Yhat = %.4f" % (1 - (SSYdiff1 / SSY))

    # Compare PLSR and sklearn.PLSRegression
    pls3 = PLSR(num_comp=num_comp, center=True, scale=scale,
                tolerance=tol, max_iter=miter)
    pls3.fit(X, Y)
    Yhat3 = pls3.predict(X)
    assert_array_almost_equal(Yhat1, Yhat3, decimal=5,
                              err_msg="PLSR gives wrong prediction")
    SSYdiff3 = np.sum((Yorig - Yhat3)**2)
    # print "PLSR : R2Yhat = %.4f" % (1 - (SSYdiff3 / SSY))
    assert abs(SSYdiff1 - SSYdiff3) < 0.00005

    pls2 = PLSCanonical(n_components=num_comp, scale=scale,
                        tol=tol, max_iter=miter, copy=True)
    pls2.fit(Xorig, Yorig)
    Yhat2 = pls2.predict(Xorig)
    SSYdiff2 = np.sum((Yorig - Yhat2)**2)
    # print "PLSCanonical : R2Yhat = %.4f" % (1 - (SSYdiff2 / SSY))

    # Compare PLSC and sklearn.PLSCanonical
    pls4 = PLSC(num_comp=num_comp, center=True, scale=scale,
                tolerance=tol, max_iter=miter)
    pls4.fit(X, Y)
    Yhat4 = pls4.predict(X)
    SSYdiff4 = np.sum((Yorig - Yhat4)**2)
    # print "PLSC : R2Yhat = %.4f" % (1 - (SSYdiff4 / SSY))

    # Compare O2PLS and sklearn.PLSCanonical
    pls5 = O2PLS(num_comp=[num_comp, 1, 0], center=True, scale=scale,
                 tolerance=tol, max_iter=miter)
    pls5.fit(X, Y)
    Yhat5 = pls5.predict(X)
    SSYdiff5 = np.sum((Yorig - Yhat5)**2)
    # print "O2PLS : R2Yhat = %.4f" % (1 - (SSYdiff5 / SSY))

    assert abs(SSYdiff2 - SSYdiff4) < 0.00005
    assert SSYdiff2 > SSYdiff5