Exemplo n.º 1
0
def pls_kfold( sample_set, kfold_group_count, max_components, preprocess ):
    """Cross-validate PLS regression for every component count up to max_components.

    Steps performed:
      1. load 'linre_big<sample_set>.npz' and shuffle the samples
         (random_state=1);
      2. pass the spectra through find_peaks and keep its second result;
      3. fit a PLS model on all samples and drop those whose first score is
         not above -2 standard deviations of that score;
      4. if preprocess is truthy, clip X from below at 0.5 and take the
         4th root;
      5. for n_components = 1..max_components predict Y fold by fold;
      6. save X, Y, expa and the prediction table to
         'out23/<preprocess>pred.npz'.

    sample_set        -- string suffix of the input file name
    kfold_group_count -- callable: number of samples -> number of folds
    max_components    -- largest number of PLS components to evaluate
    preprocess        -- truthy to enable step 4; it is also concatenated
                         into the output file name, so it must be a string

    Returns nothing; the result is the written .npz file.
    """
    print("load...")
    data = AttrDict(load('linre_big'+sample_set+'.npz'))
    expa = data.expa
    Y = data.disa[:,None]
    X = data.flum.T
    X, Y, expa = shuffle(X, Y, expa, random_state=1)

    print("fix...")
    # Only the second value returned by find_peaks is used here.
    _, X = find_peaks(X,data.exa)

    # Outlier removal based on the first score of a PLS model fitted on all samples.
    pls = PLSRegression( scale=False, algorithm='svd' )
    pls.fit(X=X,Y=Y)
    PC1 = pls.transform(X.copy())[:,0]
    good = PC1 > -PC1.std()*2
    X, Y, expa = X[good,:], Y[good,:], expa[good]

    if preprocess:
        X[X<0.5]=0.5
        X = X**0.25

    print("cross-validation...")
    # The fold count must be derived from the samples that survived the
    # outlier filter: KFold below is built with n=len(Y), and a callable such
    # as leave-one-out (k == n) would otherwise ask for more folds than
    # there are samples.
    group_count = kfold_group_count(len(Y))
    Ypred4n_components = empty((len(Y),max_components))
    for n_components in arange(max_components)+1:
        Ypred = empty_like(Y)
        folds = KFold( n=len(Y), k=group_count, indices=False )
        for fit, test in folds:
            pls = PLSRegression(
                scale=False,
                algorithm='svd',
                n_components=n_components
            )
            # Copies are passed on every call, as in the original code
            # (presumably because the estimator may modify its inputs).
            pls.fit( X=X[fit].copy(), Y=Y[fit].copy() )
            Ypred[test] = pls.predict(X[test].copy())
        Ypred4n_components[:,n_components-1] = Ypred[:,0]
        print("done for "+str(n_components)+" components")
    savez('out23/'+preprocess+'pred.npz',
        X=X, Y=Y, expa=expa, Ypred4n_components=Ypred4n_components
    )
Exemplo n.º 2
0
# Script excerpt: load a data set, shuffle it, run find_peaks and remove
# outliers using PLS scores. `sample_set` and `preprocess` are defined outside
# this excerpt.
print "load..."
l = load('linre_big'+sample_set+'.npz')
flum = l['flum']
disa = l['disa']
exa = l['exa']
expa = l['expa']

# Y is disa as a single-column 2-D array; X is the transposed flum array.
Y = disa[:,None]
X = flum.T

# Shuffle X, Y and expa with a fixed random_state
# (presumably sklearn.utils.shuffle -- confirm against the imports).
X, Y, expa = shuffle(X, Y, expa, random_state=1)

print "fix peaks..."

# find_peaks returns a pair; X is replaced by its second element.
X_err, X = find_peaks(X,exa)

print "fix outliers..."

# Fit PLS on all samples and take the first two score columns.
pls = PLSRegression( scale=False, algorithm='svd' )
pls.fit(X=X,Y=Y)
PC = pls.transform(X.copy())
PC1, PC2 = PC[:,0], PC[:,1]
# A sample is kept when its first score is above -2 standard deviations of PC1.
good = PC1 > -PC1.std()*2
plot_scores(fn='_bad_1', expa=expa, x=PC1,y=PC2, xl='T1',yl='T2', title=', bad')
# Print the expa entries of the rejected samples, then drop those samples.
print expa[logical_not(good)]
X, Y, expa = X[good,:], Y[good,:], expa[good]

print "preprocess with power..."

if preprocess:
Exemplo n.º 3
0
"""
# Script excerpt: principal-component regression of disa on the spectra.
# samples_in_testing_set: how many leading samples are excluded from fitting.
# n_components: number of principal components used for the regression.
samples_in_testing_set = 5
n_components = 14

from numpy import load, arange, where
from linre_tools import find_peaks, PCA
from scipy.linalg import lstsq 

l = load('linre_big.npz')
flum = l['flum']
disa = l['disa']
exa = l['exa']
expa = l['expa']

# One row of X_orig per sample (transposed flum); find_peaks returns a pair
# and X_orig is replaced by its second element.
X_orig = flum.T
X_err, X_orig = find_peaks(X_orig,exa)

## exclude outliers
# A sample counts as good when its first PCA score is below one standard
# deviation of that score.
PC = PCA(n_components=2).fit_transform(X_orig.copy()) #mean inside
PC1 = PC[:,0]
good_std = PC1 < PC1.std()

# Boolean mask that is False for the first samples_in_testing_set samples.
a4fit = arange(len(X_orig)) >= samples_in_testing_set
# Indices of the samples used for fitting: not held out and not an outlier.
ia4fit, = where( a4fit & good_std )
X4fit = X_orig[ia4fit,:]
disa4fit = disa[ia4fit]
pca = PCA(n_components=n_components)
PC = pca.fit_transform(X4fit.copy())
dis_mean = disa4fit.mean()
#print PC.shape,(disa4fit-dis_mean).shape
# Least-squares fit of the mean-centred disa values on the PCA scores;
# `a` holds the regression coefficients.
(a,residues,rank,s) = lstsq(PC,disa4fit-dis_mean)
Exemplo n.º 4
0
def pca_kfold(sample_set,test_by_good_only,kfold_group_count):
    """Cross-validate principal-component regression and plot RMSEP per component count.

    Reads 'linre_big<sample_set>.npz', passes the spectra through find_peaks,
    shuffles the samples (random_state=0) and flags as outliers the samples
    whose first PCA score is not below one standard deviation of that score.
    Outliers are never used for fitting; when test_by_good_only is true they
    are removed from the data set altogether, so they are not predicted either.

    For every component count from 1 to the module-level max_components the
    target is predicted fold by fold. The RMSEP of each count is printed and
    plotted, and the predictions are written below 'out18/'.

    sample_set        -- string suffix of the input file name, also part of
                         the output file names
    test_by_good_only -- when true, outliers are dropped before testing
    kfold_group_count -- callable: number of samples -> number of folds
    """
    data = load('linre_big'+sample_set+'.npz')
    spectra = data['flum'].T
    disa = data['disa']
    expa = data['expa']
    # Only the second value returned by find_peaks is used.
    _, spectra = find_peaks(spectra, data['exa'])

    spectra, disa, expa = shuffle(spectra, disa, expa, random_state=0)

    # Outlier flag from the first score of a two-component PCA
    # (the original note says the mean is handled inside PCA).
    first_score = PCA(n_components=2).fit_transform(spectra.copy())[:,0]
    usable = first_score < first_score.std()
    print(expa[~usable])
    if test_by_good_only:
        kept = where(usable)[0]
        usable = usable[kept]
        spectra = spectra[kept,:]
        disa = disa[kept]
        expa = expa[kept]

    pca_cache = {}

    def fitted_pca(train_idx):
        # One PCA fit per distinct training index set; the fit is reused for
        # every component count.
        key = tuple(train_idx)
        if key not in pca_cache:
            model = PCA(n_components=max_components)
            scores = model.fit_transform(spectra[train_idx,:].copy())
            pca_cache[key] = (model, scores)
        return pca_cache[key]

    def predict(n_components, train_idx, test_idx):
        # Regress the mean-centred target on the leading scores, then apply
        # the coefficients to the scores of the test samples.
        model, train_scores = fitted_pca(train_idx)
        target = disa[train_idx]
        offset = target.mean()
        coef = lstsq(train_scores[:,:n_components].copy(), target-offset)[0]
        test_scores = model.transform(spectra[test_idx,:].copy())[:,:n_components]
        return test_scores.dot(coef[:,None])[:,0] + offset

    sample_count = len(disa)
    group_count = kfold_group_count(sample_count)

    # As many groups as samples means leave-one-out.
    if group_count == sample_count:
        title_method = 'LOO'
    else:
        title_method = 'K-Fold '+str(group_count)+' groups'
    plot_title = title_method+', '+str(sample_count)+' samples'
    if not test_by_good_only:
        plot_title += ' (inc. outliers)'

    pred_table = empty((max_components,sample_count))
    counts, errors = [], []
    for n_components in arange(max_components)+1:
        fold_pred = empty_like(disa)
        for train, test in KFold( n=sample_count, k=group_count, indices=False ):
            test_idx = where(test)[0]
            if len(test_idx) == 0:
                continue
            # Outliers are excluded from the training part of every fold.
            train_idx = where(train & usable)[0]
            fold_pred[test_idx] = predict(n_components, train_idx, test_idx)
        pred_table[n_components-1] = fold_pred
        rmsep = sqrt( power(fold_pred-disa, 2).sum(axis=0) / sample_count )
        print('%s %s' % (n_components, rmsep))
        counts.append(n_components)
        errors.append(rmsep)

    print('plot start')
    plt.grid(True)
    plt.title(plot_title)
    plt.xlabel('PC Count')
    plt.ylabel('RMSEP, mg/L')
    plt.plot(counts,errors)

    res_dir = "out18"
    res_name = "ts"+sample_set+"g"+str(test_by_good_only)+"k"+str(group_count)
    savez(res_dir+'/'+res_name+".npz",
        disa = disa,
        disa_pred4n_components = pred_table,
        expa = expa
    )
    plt.savefig(res_dir+'/png/'+res_name+".png")
    plt.savefig(res_dir+'/pdf/'+res_name+".pdf")
    plt.cla()
    print('plot finish')
Exemplo n.º 5
0
peaks of +-60 are observable, with the next values ~0;
this is most likely noise, so we've marked it;
it's possible to browse samples sorted by dis after pressing '/';
no facts were found from this;
"""

# Script excerpt: load one sample set and prepare the state read by the
# explorer callbacks defined below.
sample_set = '2'

from numpy import load
l = load('linre_big'+sample_set+'.npz')
ema, exa, flum, expa, disa = l['ema'], l['exa'], l['flum'], l['expa'], l['disa']
# One row of X per sample (transposed flum).
X = flum.T

#mark errors
from linre_tools import find_peaks
# X_err is used for the error markers; the second result is kept separately
# and X itself stays unchanged (the replacement below is disabled).
X_err, X_wo_peaks = find_peaks(X,exa)
#X = X_wo_peaks

from linre_explorer import freq3d_explore
# One label per sample: "<expa> <disa>".
pga = [e+' '+str(d) for e, d in zip(expa,disa)]
# One-element list used as a mutable flag that the key handler can toggle.
is_sorted_by_dis = [False]
# (original index, (expa, disa)) pairs, ordered by the first character of
# expa and then by disa.
dis_idxa = list(enumerate(zip(expa,disa)))
dis_idxa.sort(key=lambda p:(p[1][0][0],p[1][1]))

def pg_indexer(i):
    """Translate position i into a sample index.

    While the sorted-by-dis flag is set, the position is looked up in
    dis_idxa and the original index stored there is returned; otherwise
    i is returned unchanged.
    """
    if is_sorted_by_dis[0]:
        return dis_idxa[i][0]
    return i
def on_key_inner(k):
    """Toggle the sorted-by-dis flag when the '/' key is pressed; ignore other keys."""
    if k != '/':
        return
    is_sorted_by_dis[0] = not is_sorted_by_dis[0]
def mplot_inner(ax,gi):
    erra = X_err[gi]
    ax.plot(exa[erra],ema[erra],0,'ro')
    ax.set_zlabel ('Fluorescence')