def pls_kfold( sample_set, kfold_group_count, max_components, preprocess ): print "load..."; l = AttrDict(load('linre_big'+sample_set+'.npz')) disa = l.disa expa = l.expa Y = disa[:,None] X = l.flum.T X, Y, expa = shuffle(X, Y, expa, random_state=1) print "fix..."; X_err, X = find_peaks(X,l.exa) pls = PLSRegression( scale=False, algorithm='svd' ) pls.fit(X=X,Y=Y) PC = pls.transform(X.copy()) PC1 = PC[:,0] good = PC1 > -PC1.std()*2 X, Y, expa = X[good,:], Y[good,:], expa[good] if preprocess: X[X<0.5]=0.5 X = X**0.25 #save? print "cross-validation..."; group_count = kfold_group_count(len(disa)) Ypred4n_components = empty((len(Y),max_components)) for n_components in arange(max_components)+1: Ypred = empty_like(Y) loo = KFold( n=len(Y), k=group_count, indices=False ) for fit, test in loo: pls = PLSRegression( scale=False, algorithm='svd', n_components=n_components ) pls.fit( X=X[fit].copy(), Y=Y[fit].copy() ) Ypred[test] = pls.predict(X[test].copy()) Ypred4n_components[:,n_components-1] = Ypred[:,0] print "done for "+str(n_components)+" components" savez('out23/'+preprocess+'pred.npz', X=X, Y=Y, expa=expa, Ypred4n_components=Ypred4n_components )
# NOTE(review): this is a one-line COLLAPSED fragment of a larger script or
# function — the enclosing scope (where `l`, `disa`, `flum`, `exa`,
# `preprocess`, `samples_in_testing_set`, `plot_scores` are defined) is not
# visible here, and the original line breaks/indentation were lost (e.g. it
# cannot be recovered whether `X = X**0.25` sits inside `if preprocess:`).
# The code is therefore kept byte-identical.
# What the fragment visibly does: builds X (transposed `flum`) and Y from
# the loaded data, shuffles with a fixed seed, repairs peaks via
# find_peaks, fits a one-off PLS model and drops samples whose first score
# falls below -2*std (plotting and printing the rejected `expa` entries),
# optionally clips X at 0.5 and takes the fourth root, then builds
# complementary boolean masks `a4fit`/`a4test` splitting samples by index
# against `samples_in_testing_set` — presumably consumed after this
# fragment; TODO confirm against the full file.
expa = l['expa'] Y = disa[:,None] X = flum.T X, Y, expa = shuffle(X, Y, expa, random_state=1) print "fix peaks..." X_err, X = find_peaks(X,exa) print "fix outliers..." pls = PLSRegression( scale=False, algorithm='svd' ) pls.fit(X=X,Y=Y) PC = pls.transform(X.copy()) PC1, PC2 = PC[:,0], PC[:,1] good = PC1 > -PC1.std()*2 plot_scores(fn='_bad_1', expa=expa, x=PC1,y=PC2, xl='T1',yl='T2', title=', bad') print expa[logical_not(good)] X, Y, expa = X[good,:], Y[good,:], expa[good] print "preprocess with power..." if preprocess: X[X<0.5]=0.5 X = X**0.25 print "fit..." a4fit = arange(len(X)) >= samples_in_testing_set a4test = logical_not(a4fit)
# NOTE(review): one-line COLLAPSED interior of a cross-validation loop —
# the fold loop header and the definitions of `ncomp`, `X_fmri_*`,
# `X_meg_*`, `y_*`, the `mse_*` lists and `plsca` are outside this view,
# and the lost indentation makes the nesting unrecoverable: the `mse_*`
# appends presumably run once per fold while the `yf`/`ym`/`ypca`/`yplsm`/
# `yplsf` appends (RMSE over the fold MSEs) run after the fold loop — TODO
# confirm. Code kept byte-identical.
# What the fragment visibly does: for one setting it scores four
# regressors by test MSE — (1) RandomizedPCA + LinearRegression on fMRI
# features, (2) same on MEG features, (3) same on the hstack of both, and
# (4) PLS canonical components: `plsca` is fit on (MEG, fMRI) pairs, a
# LinearRegression is trained on the MEG-side components and evaluated on
# both the MEG-side (mse_plsm) and fMRI-side (mse_plsf) test components.
# NOTE(review): mse_plsf reuses the MEG-trained `clf` on fMRI components —
# looks intentional (cross-modal transfer), but verify.
# The commented-out `dumb` baseline is dead code; in this collapsed form
# the first `#` also comments out everything after it, which is further
# evidence the original was multi-line.
pca = RandomizedPCA(n_components=ncomp, whiten=True) clf = LinearRegression().fit(pca.fit_transform(X_fmri_train), y_train) mse_fmri.append(mean_squared_error(clf.predict(pca.transform(X_fmri_test)), y_test)) clf = LinearRegression().fit(pca.fit_transform(X_meg_train), y_train) mse_meg.append(mean_squared_error(clf.predict(pca.transform(X_meg_test)), y_test)) both_train = np.hstack([X_meg_train, X_fmri_train]) both_test = np.hstack([X_meg_test, X_fmri_test]) clf = LinearRegression().fit(pca.fit_transform(both_train), y_train) mse_pca.append(mean_squared_error(clf.predict(pca.transform(both_test)), y_test)) plsca.fit(X_meg_train, X_fmri_train) X_mc_train, X_fc_train = plsca.transform(X_meg_train, X_fmri_train) X_mc_test, X_fc_test = plsca.transform(X_meg_test, X_fmri_test) clf = LinearRegression().fit(X_mc_train, y_train) mse_plsm.append(mean_squared_error(clf.predict(X_mc_test), y_test)) mse_plsf.append(mean_squared_error(clf.predict(X_fc_test), y_test)) # dumb.fit(X_fmri_train, X_meg_train) # dumb_pred = dumb.predict(X_fmri_test) # dumb_mae += mean_absolute_error(X_meg_test,dumb_pred) yf.append(np.sqrt(np.mean(mse_fmri))) ym.append(np.sqrt(np.mean(mse_meg))) ypca.append(np.sqrt(np.mean(mse_pca))) yplsm.append(np.sqrt(np.mean(mse_plsm))) yplsf.append(np.sqrt(np.mean(mse_plsf))) # dumb_scores.append(dumb_mae/nfolds)