Exemplo n.º 1
0
def plot_profiles(prots, eluts, sp='Hs', plot_sums=True, shape=None,
        min_count=1):
    """
    Plot the profiles of prots in every elution that contains them, one
    subplot per elution, and put a legend listing all prots in the last cell
    of the subplot grid.

    prots: protein names; translated to ids via seqs.GTrans().name2id.
    eluts: [el.NormElut(f, sp, norm_rows=False, norm_cols=False) for f in
    fs]
    sp: passed through to elutions_containing_prots.
    plot_sums: if true, overlay the log2 column totals of each elution,
    rescaled against the biggest protein peak.
    shape: (m,n) = m rows, n columns.  Defaults to
    ut.sqrt_shape(len(use_eluts)+1); the extra cell is for the legend.
    min_count: passed through to elutions_containing_prots.
    """
    import plotting as pl
    gt = seqs.GTrans()
    use_eluts = elutions_containing_prots(eluts, sp, seqs.names2ids(prots),
            min_count)
    shape = shape if shape else ut.sqrt_shape(len(use_eluts)+1)
    nrows, ncols = shape[0], shape[1]
    pl.figure()
    # The id list does not depend on the elution, so build it once.
    pids = [gt.name2id[p] for p in prots]
    for i,e in enumerate(use_eluts):
        pl.subplot(nrows, ncols, i+1)
        pl.title(ut.shortname(e.filename))
        # Highest value over every row that belongs to one of the proteins.
        protsmax = max(np.max(e.normarr[r]) for p in pids
                if p in e.baseid2inds for r in e.baseid2inds[p])
        plot_prots(e, pids, e.baseid2inds, protsmax)
        if plot_sums:
            # plot total spectral counts normalized to match biggest peak
            # NOTE(review): the 2-D indexing below presumably relies on
            # e.normarr being an np.matrix, so the sum stays 2-D -- confirm.
            sums = np.sum(e.normarr,axis=0)
            fmax = np.max(sums)
            pl.plot(range(sums.shape[1]),
                    np.log2(sums[0,:]).T*np.log2(protsmax)*len(pids)/np.log2(fmax),
                    color='k', linestyle='-', linewidth=.5)
    # make legend with all prots
    # Subplot indices are 1-based.  Index 0 used to be accepted by old
    # matplotlib releases as an alias for the last cell and is rejected by
    # current ones, so address the last cell explicitly.
    pl.subplot(nrows, ncols, nrows*ncols)
    for p in prots: pl.plot(0,label=p)
    pl.legend()
Exemplo n.º 2
0
def plot_sums(fs, shape=None):
    """
    Load each elution file in fs and plot its column totals in its own
    subplot, titled with the file's short name.

    fs: elution filenames, each loaded with el.load_elution.
    shape: (rows, columns) of the subplot grid; when not given it is taken
    from ut.sqrt_shape(len(fs)).
    """
    import plotting as pl
    if not shape:
        shape = ut.sqrt_shape(len(fs))
    nrows, ncols = shape[0], shape[1]
    for cell, fname in enumerate(fs, 1):
        elut = el.load_elution(fname)
        pl.subplot(nrows, ncols, cell)
        pl.title(ut.shortname(fname))
        # Sum over rows -> one total per column.
        # NOTE(review): the 2-D indexing below presumably relies on elut.mat
        # being an np.matrix, so the sum stays 2-D -- confirm.
        col_totals = np.sum(elut.mat, axis=0)
        xs = range(col_totals.shape[1])
        ys = col_totals[0, :].T
        pl.plot(xs, ys)
Exemplo n.º 3
0
def plot_bigprofiles(prots, pids, unnorm_eluts, sp='Hs', min_count=1,
        remove_multi_base=False, gt=None, eluts_per_plot=10,
        do_cluster=True, label_trans=None, do_plot_tree=False,
        rename_fracs=None, colors=None, **kwargs):
    """
    Plot the profiles of a set of proteins across many elutions, stacking
    eluts_per_plot elutions side by side in each of several vertically
    arranged subplots.  Returns the number of subplots drawn.

    supply EITHER prots OR pids, set other to None
    prots: protein names; translated to ids through gt.name2id, so gt must
    be given when prots is.
    pids: protein ids.
    unnorm_eluts: [el.NormElut(f, sp=sp, norm_cols=False, norm_rows=False) for f in fs]
    sp, min_count: passed through to elutions_containing_prots (sp also to
    cluster_ids).
    remove_multi_base: accepted but not used by this function.
    gt: translation object providing name2id / id2name; when None, rows are
    labeled with ids.
    eluts_per_plot: maximum number of elutions drawn in one subplot.
    do_cluster: if true, re-order the ids with cluster_ids (extra **kwargs
    are forwarded to it).
    label_trans: optional dict mapping a label to the label to display.
    do_plot_tree: forwarded to cluster_ids as do_plot.
    rename_fracs: optional dict mapping an elution short name to the name to
    display.
    colors: optional list of colors forwarded to plot_big_single.

    The caller's pids and colors lists are not modified.
    """
    import plotting as pl
    if prots is not None:
        pids = [gt.name2id[p] for p in prots]
    if do_cluster:
        print("clustering")
        pids = cluster_ids(pids, unnorm_eluts, sp, gt=gt, do_plot=do_plot_tree,
                **kwargs)
    if gt is not None:
        # Re-order the names to match the ids.  Fall back to the id itself
        # when no name is known, so that labels stay aligned with the rows.
        prots = [gt.id2name[pid] if pid in gt.id2name else pid
                for pid in pids]
    else:
        # Copy, so labels and ids are separate lists that can each be
        # reversed once below.
        prots = list(pids)
        print("No gene names provided--labeling with ids.")
    if label_trans:
        print("Translating names for display.")
        # Translate displayed names from base ids according to provided dict
        prots = [label_trans.get(p,p) for p in prots]
    # Put them top to bottom.  Reversed copies are used instead of in-place
    # reversal so that the caller's lists are left untouched.
    prots = list(reversed(prots))
    pids = list(reversed(pids))
    if colors is not None:
        colors = list(reversed(colors))
    print("%s proteins" % len(pids))
    use_eluts = elutions_containing_prots(unnorm_eluts, sp, pids, min_count)
    # Force true division: with integer (floor) division the ceil would have
    # no effect and the last, partially filled, group would be dropped.
    nplots = int(np.ceil(len(use_eluts) / float(eluts_per_plot)))
    maxfracs = 0
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        plot_eluts = use_eluts[iplot*eluts_per_plot: (iplot+1)*eluts_per_plot]
        frac_names = [ut.shortname(e.filename) for e in plot_eluts]
        if rename_fracs:
            frac_names = [rename_fracs.get(n,n) for n in frac_names]
        # startcols[k] is the x offset at which the k-th elution starts.
        startcols = [0]
        for e in plot_eluts:
            freqarr = ut.normalize_fracs(e.normarr, norm_rows=False)
            protsmax = max(np.max(freqarr[r]) for p in pids
                    if p in e.baseid2inds for r in e.baseid2inds[p])
            plot_big_single(freqarr, pids, e.baseid2inds, protsmax,
                    startcols[-1], colors=colors)
            startcols.append(startcols[-1]+freqarr.shape[1])
        label_ys(prots)
        label_xs(startcols, frac_names)
        pl.grid(False)
        maxfracs = max(maxfracs, startcols[-1])
    # Give every subplot the same x range so the columns line up.
    for iplot in range(nplots):
        pl.subplot(nplots, 1, iplot+1)
        pl.xlim(0,maxfracs)
    if prots:
        # True division: integer division truncated the spacing to 0 for
        # more than five proteins.
        pl.subplots_adjust(hspace=5.0/len(prots))
    return nplots
Exemplo n.º 4
0
def main(argv):
    # defaults

    window_length = 50
    overlap = window_length / 2
    featdim = 10
    #data_115818,sgmdata_115818 = load_dataset(window_length,overlap)
    training_data, training_sgmdata = load_dataset(window_length, overlap)

    training_featdata, header = build_dataset_features(training_sgmdata)
    cl.rnn_test(training_featdata)
    return
    data_120250, sgmdata_120250 = load_dataset(
        window_length,
        overlap,
        median_filter=True,
        alldatafile=
        '../../acquisizione20062014/acquisizione_20062014/Data_120250.txt')

    # questi dati son completamente diversi dagli altri tre
    # data_120611,sgmdata_120611 = load_dataset(window_length,overlap,median_filter=True,alldatafile='../../acquisizione20062014/acquisizione_20062014/Data_120611.txt')
    """
	data_120922,sgmdata_120922 = load_dataset(window_length,overlap,median_filter=True,alldatafile='../../acquisizione20062014/acquisizione_20062014/Data_120922.txt')

	all_data = [(data_115818,"115818"),(data_120250,"120250"),(data_120611,"120611"),(data_120922,"120922")]
	sgm_data = [sgmdata_115818,sgmdata_120250,sgmdata_120611,sgmdata_120922]
	cols = ['b','r','g','m']
	for (data,title),c in zip(all_data,cols):
		print "Acquisizione", title
		plt.plot_in_subplots(data,0,1,c)
		return
	"""
    return

    training_data, training_sgmdata = load_dataset(window_length, overlap)

    training_featdata, header = build_dataset_features(training_sgmdata)
    training_targets = fm.assign_target(training_featdata)
    """
	data1,sgmdata1 = load_dataset(window_length,overlap,alldatafile='/home/ilaria/Scrivania/marsupio/acquisizione20062014/acquisizione_20062014/Data_120250.txt')
	featdata1,_ = build_dataset_features(sgmdata1)
	targets1 = fm.assign_target(featdata1)
	"""

    #write_feature_data_to_file(featdata,header)
    #print featdata[0,idxs]
    #plt.plot_in_subplots(featdata,idxs)
    #plt.plot_all(featdata1[:,idxs])

    #X_r=preprocessing.scale(featdata)
    #pca = PCA(n_components=featdim)

    #kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=0.1)
    #X_r = kpca.fit_transform(X_r)
    #X_r = pca.fit(X_r).transform(X_r)

    X_r = training_featdata
    targets = training_targets
    pca = PCA(n_components=2)
    X_r = preprocessing.scale(X_r)
    X_r = pca.fit(X_r).transform(X_r)
    kmeans = KMeans(n_clusters=10)
    kmeans.fit(X_r)
    plt.plot_clustering_and_targets(X_r, kmeans, 0, 1, targets)
    return
    pars = [{
        'clf__kernel': ['rbf'],
        'clf__gamma': [1e-3, 1e-5, 1e-2, 1e-1, 1e-4],
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'pca__n_components': [5, 10, 20, 50, 80]
    }, {
        'clf__kernel': ['linear'],
        'clf__C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100],
        'pca__n_components': [5, 10, 20, 50, 80]
    }]

    #evaluation set
    cl.cross_model_selection(X_r, targets, pars, save=True)
    c = cl.load_model('model.pkl')
    print c
    return

    #print X_train.shape, X_test.shape
    clf = svm.SVC(kernel='rbf', gamma=0.7, C=0.8)
    pca = PCA(n_components=featdim)
    pca_svm = Pipeline([
        ('pca', pca),
        ('svm', clf),
    ])
    scores = cross_validation.cross_val_score(clf,
                                              X_r,
                                              targets,
                                              cv=5,
                                              scoring='acc')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    #pca_svm.fit(X_train, y_train)
    #print pca_svm.score(X_test,y_test)
    return
    #X_r = pca.fit(sint).transform(sint)

    #X_r = preprocessing
    pca = PCA(n_components=featdim)

    #kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=0.1)
    #X_r = kpca.fit_transform(X_r)
    X_r = pca.fit(X_r).transform(X_r)
    ncluster = 10
    """
	from sklearn.cluster import DBSCAN
	dbscan = DBSCAN()
	
	plt.plot_DBSCAN_clustering_result(X_r,dbscan,0,1)
	return
	"""
    #X_r = preprocessing.scale(X_r)
    kmeans = KMeans(n_clusters=ncluster)
    #print X_r
    kmeans.fit(X_r)
    plt.plot_clustering_and_targets(X_r, kmeans, 0, 1, target)

    return
    """
	test = open('./test.csv','w')
	for dt in sint:
		for ft in dt:
			test.write(str(ft)+',')
		
		test.write('\n')
	"""
    #colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    #colors = np.hstack([colors] * 20)

    featdim = 10

    Y = randomtargets(sint)
    clf = svm.SVC(kernel='rbf', gamma=0.7)
    pca = PCA(n_components=featdim)
    pca_svm = Pipeline([
        ('pca', pca),
        ('svm', clf),
    ])

    pca_svm.fit(sint, Y)
    X_r = pca.fit(sint).transform(sint)
    cX_r = pca.fit(sint).transform(cint)
    #th1 = [l[1] for l in sint]
    #accx1 = [l[2] for l in sint]
    #print(th1)
    #plt.scatter(th1, accx1, 50,c=Y)
    #plt.show()

    features = []
    for i in range(0, featdim):
        features.append([l[i] for l in cX_r])
    Yp = [int(i) for i in pca_svm.predict(cint)]
    print Yp
    s = 411
    for f in features[1:5]:
        #	plt.subplot(s)
        #	plt.scatter(features[0], f, 50,c=Yp)
        i += 1
        s += 1

    #plt.show()
    s = 511
    for f in features[5:10]:
        #	plt.subplot(s)
        #	plt.scatter(features[0], f, color=colors[Yp].tolist())
        i += 1
        s += 1

    #plt.show()
    print clf.support_vectors_
    #	plt.scatter(clf.support_vectors_,range(0,3), color=colors[range(0,3)].tolist())
    # create a mesh to plot in
    sint = np.array(sint)
    Y = (np.array(Y))

    x_min, x_max = sint[:, 2].min() - 1, sint[:, 2].max() + 1
    y_min, y_max = Y.min() - 1, Y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                         np.arange(y_min, y_max, .02))
    #print len(Y), yy.shape
    #Z = Y.reshape(yy.shape)
    pl.contourf(xx, yy, Y, cmap=pl.cm.Paired)
    pl.axis('off')

    # Plot also the training points
    pl.scatter(X[:, 1], X[:, 2], c=Y, cmap=pl.cm.Paired)
    pl.show()
    return
    #intervalslist=scale(intervalslist)
    #print intervalslist
    featdim = 5
    ncluster = 8
    clusters = range(1, ncluster + 1)

    pca = PCA(n_components=featdim)
    X_r = pca.fit(intervalslist).transform(intervalslist)
    features = []
    for i in range(0, featdim):
        features.append([l[i] for l in X_r])

    #return
    kmeans = KMeans()
    #print X_r
    pca_clustering = Pipeline([('pca', pca),
                               ('minmaxnorm', preprocessing.Normalizer()),
                               ('kmeans', kmeans)])
    clustering = Pipeline([('kmeans', kmeans)])
    print pca_clustering.fit(intervalslist)
    #return
    pca_clusters = pca_clustering.predict(intervalslist)

    clustering.fit(intervalslist)
    nopca_clusters = clustering.predict(intervalslist)
    clustered = []
    i = 0
    s = 411
    for f in features[1:]:
        plt.subplot(s)
        plt.scatter(features[0], f, color=colors[pca_clusters].tolist())
        i += 1
        s += 1

    plt.show()
    """