示例#1
0
	def run_tsne(self,R):
		"""Embed the samples with t-SNE and draw them in one dimension plot.

		R is the run object; R.args, R.progress and R.data are read.  The
		matrix returned by R.data.matrix('log') is passed to the reducer's
		run_tsne and the result is plotted against R.data.samples.
		"""
		reducer = rage_DR.DR(R.args,R.progress)
		pdf_path = R.args.prefix+'_transformsamples_TSNE'+'tsne.pdf'
		R.progress.start_minor('TSNE')
		samples = R.data.samples
		log_matrix = R.data.matrix('log')

		embedding = reducer.run_tsne(log_matrix)

		# The same path is supplied both as the 'out' option and to finish().
		plot_opts = {'title':'TSNE','out': pdf_path}
		tsne_plot = dplot.dimplot(2,2,R.args,R.progress).add_data(samples,[embedding],plot_opts)
		tsne_plot.finish(pdf_path)
		R.progress.end()
示例#2
0
    def run_condense_pca(self, R):
        """Plot condensed and full samples together in one PCA space.

        Features are matched by name between ``R.condensed_data`` and
        ``R.data``.  A PCA is run on the condensed counts and its
        coefficients are written with ``self.write_coeffs``; a second PCA is
        then requested for the full counts with ``SET_TRANSFORM=True``.  The
        points of both runs are concatenated (condensed first) and drawn in
        a single plot using the axes of the condensed run.

        Args:
            R: run object; ``R.args``, ``R.progress``, ``R.data`` and
                ``R.condensed_data`` are read.

        Note:
            This method does not return: it ends with ``sys.exit()``.
        """
        D = R.data

        # Name lookups go through sets so the matching is linear in the
        # number of features instead of rebuilding a name list per feature.
        data_names = set(f.name for f in D.features)
        cF = [f for f in R.condensed_data.features if f.name in data_names]
        shared_names = set(f.name for f in cF)
        rF = [f for f in D.features if f.name in shared_names]
        # NOTE(review): cF and rF each keep their own dataset's feature
        # order; confirm both datasets list the shared features in the same
        # order if the second PCA depends on row alignment with the first.

        # One row per shared feature, one column per sample.
        Yc = [[s.cnts[f.idx] for s in R.condensed_data.samples] for f in cF]
        Y = [[s.cnts[f.idx] for s in D.samples] for f in rF]

        dr = rage_DR.DR(R.args, R.progress)
        # NOTE(review): outputs use the '_transformsamples_' prefix even
        # though this is the condensed-PCA routine; kept as-is so existing
        # file names do not change.
        out_name = R.args.prefix + '_transformsamples_'
        R.progress.start_minor('PCA')

        condense_run = dr.set_y_matrix(Yc, LOG_TRANSFORM=True).pca(req='FULL')
        self.write_coeffs(condense_run['coefs'], cF, out_name + 'coefs.out')

        pca_run = dr.set_y_matrix(Y, LOG_TRANSFORM=True,
                                  SET_TRANSFORM=True).pca(req='FULL')

        dimplot = dplot.dimplot(1, 1, R.args, R.progress)

        shared_run = {
            'pts': condense_run['pts'] + pca_run['pts'],
            'axes': condense_run['axes'],
        }
        # Sample order must mirror the order of the concatenated points.
        shared_samples = list(R.condensed_data.samples) + list(D.samples)

        dimplot.add_data(shared_samples, [shared_run], {
            'title': 'SHARED_PCA',
            'out': out_name + 'pca.pdf'
        })
        dimplot.finish(out_name + 'pca')
        R.progress.end()

        sys.exit()
示例#3
0
	def run_iterative_pca(self,R,GROUPS=2):
		"""Run PCA on the count matrix in both orientations; write tables and plots.

		Y is built from R.data with one row per feature and one column per
		sample.  A first PCA uses the transform flags found in
		self.options.notes; a second one is requested with TRANSPOSE=True and
		SCALE=True.  After each run self.write_pca_data is called and two plots
		are produced: one from the run's own points, and one after the run's
		'pts' entry has been replaced by values looked up in the key returned
		by write_pca_data.

		Args:
			R: run object; R.args, R.progress and R.data are read.
			GROUPS: currently has no effect, see the NOTE(review) below.

		Side effects: sets self.R, self.D, self.coeff_key and self.s_key.
		"""
		self.R = R
		D, F, S = R.data , R.data.features, R.data.samples 
		# One row per feature, one column per sample.
		Y = [[s.cnts[f.idx] for s in D.samples] for f in D.features]


		
		self.D = R.data 

		### PARAMS ### 


		# NOTE(review): this assignment discards the GROUPS argument, so the
		# 'GROUPS == 200' branch below can never run as written.
		GROUPS = 2 
		# NOTE(review): SELECTION_FRACTION is not read anywhere in this method;
		# the iterative_run calls below pass their own literal values.
		SELECTION_FRACTION = 0.24	


		dr = rage_DR.DR(R.args,R.progress)
		# NOTE(review): out_name is only referenced by commented-out lines below.
		out_name = R.args.prefix+'_transformsamples_'
		# NOTE(review): no matching R.progress.end() call appears in this method.
		R.progress.start_minor('PCA') 

		# Transform flags requested through the option notes.
		LT = ('log' in self.options.notes)
		SC = ('scale' in self.options.notes)
		STD = ('std' in self.options.notes)

	

		if GROUPS == 200: 
			# Unreachable at present (GROUPS is forced to 2 above).  Chains four
			# self.iterative_run calls, each fed the matrix and names returned by
			# the previous one, and adds every result to an iterplot.


			iterplot = dplot.iterplot(self.R.data.samples,self.R.data.features,R.args,R.progress,6)
			#sc = MinMaxScaler()
			#Y = [[x[0] for x in sc.fit_transform(np.array(y).reshape(-1,1))] for y in Y]
#			dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(R.args.prefix+'_transformsamples_sample_pca')

			PCA_1, Y_1, S_1 = self.iterative_run(Y,[s.name for s in S],iteration=1,SELECTION_FRACTION=0.02,MAX_SIZE=2,DIMS=1) #LOWEST=True) 
			# Print every returned name that contains an '@' separator.
			for s in [s for s in S_1 if len(s.split('@')) > 1]:	print 1,s  


			iterplot.add_data(PCA_1, [s.name for s in self.R.data.samples]) 
			


			PCA_2, Y_2, S_2 = self.iterative_run(Y_1,S_1,iteration=2,SELECTION_FRACTION=0.05,MAX_SIZE=2,DIMS=1,LOWER=True) 
			for s in [s for s  in S_2 if len(s.split('@')) > 1]:	print 2,s  
 
			iterplot.add_data(PCA_2, S_1) 

			# NOTE(review): this and the next call also pass iteration=2; confirm
			# whether 3 and 4 were intended.
			PCA_3, Y_3, S_3 = self.iterative_run(Y_2,S_2,iteration=2,SELECTION_FRACTION=0.025,MAX_SIZE=3,DIMS=1,LOWEST=True) 
			for s in [s for s  in S_3 if len(s.split('@')) > 1]:	print 3,s  
 
			iterplot.add_data(PCA_3, S_2) 
			
			PCA_4, Y_4, S_4 = self.iterative_run(Y_3,S_3,iteration=2,SELECTION_FRACTION=0.50,MAX_SIZE=2,DIMS=1,TAIL=False) 
			for s in [s for s  in S_4 if len(s.split('@')) > 1]:	print 4,s  
			iterplot.add_data(PCA_4, S_3) 

#			PCA_5, Y_5, S_5 = self.iterative_run(Y_4,S_4,iteration=2,SELECTION_FRACTION=0.12,MAX_SIZE=3,DIMS=1,TAIL=True,LOWER=True) 
#			for s in [s for s  in S_5 if len(s.split('@')) > 1]:	print 5,s  
#			iterplot.add_data(PCA_5, S_4) 

			
#			PCA_6, Y_6, S_6 = self.iterative_run(Y_5,S_5,iteration=2,SELECTION_FRACTION=0.40,MAX_SIZE=4,DIMS=2,TAIL=False) 
#			for s in [s for s  in S_6 if len(s.split('@')) > 1]:	print 6,s  
#			iterplot.add_data(PCA_6, S_5) 
			

			plt.savefig('PCA_ITERATIVE.pdf') 


				
			
						#print cn, pn, np.linalg.norm(c_pts-p_pts)


			
		#dimplot = dplot.dimplot(1,1,R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3)],NAMES=True).finish(out_name+'named.pca')
		#dimplot = dplot.dimplot(1,1,R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(1,2),(3,4),(5,6)],NAMES=True).finish(out_name+'named.pca')
		#dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(out_name+'Cpca')

	
		# --- First PCA: Y as built above, with the flags from the option notes.
		dr.set_y_matrix(Y, LOG_TRANSFORM=LT,SCALE=SC,STD_SCALE=STD) 
		pca_run = dr.pca(req='FULL') 


		self.coeff_key, sample_data  = self.write_pca_data(pca_run, S, F, R.args.prefix+'_sampletransform_')		
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples, [pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(R.args.prefix+'_transformsamples_sample_pca')
		# Overwrite the run's points in place with one row per feature (second
		# element of each coeff_key entry) so the same run dict can be plotted
		# against the features.
		s_pts = [[p[1] for p in self.coeff_key[f.name]] for f in R.data.features]
		pca_run['pts'] = s_pts 
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.features,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=True).finish(R.args.prefix+'_transformsamples_feature_pca')




		# --- Second PCA: same Y, requested with TRANSPOSE=True and SCALE=True
		# (the LT/SC/STD flags are not used here).
		dr.set_y_matrix(Y, TRANSPOSE = True, SCALE=True) 
		pca_tran = dr.pca(req='FULL') 

		# Note the swapped F, S argument order relative to the first call.
		self.s_key, feature_data = self.write_pca_data(pca_tran,  F,S, R.args.prefix+'_featuretransform_')
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.features,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=True).finish(R.args.prefix+'_transformfeatures_feature_pca')

		# Same in-place replacement as above, this time with one row per sample.
		s_pts = [[p[1] for p in self.s_key[s.name]] for s in R.data.samples]
		pca_tran['pts'] = s_pts 
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=False).finish(R.args.prefix+'_transformfeatures_samples_pca')
示例#4
0
	def run_pca(self,R):
		"""Run PCA, then t-SNE, on the samples of R.data; write tables and plots.

		Transform flags come from self.options.notes: 'log', 'scale' and 'std'
		('std' is only considered when 'scale' is absent).  They are encoded,
		together with the colour and marker options, in the output prefix.
		When self.options.coeffs is set, the work is handed to
		self.precomp_pca and nothing else is done here.

		Args:
			R: run object; R.args, R.progress and R.data are read.

		Files written (each name starts with self.out_name):
			pca_coefs.out  one line per feature holding (position, value)
			               pairs, one pair per entry of pca_run['coefs']
			pca_pts.out    the first five PCA coordinates of every sample
			tsne_pts.out   two t-SNE coordinates of every sample

		Side effects: sets self.LT, self.SC, self.STD, self.out_name,
		self.plt_name, self.D, self.S and self.Y.
		"""
		self.LT, self.SC, self.STD, ttype, cstr = False, False, False, '_', '_'

		if self.options.color:
			cstr += '-'.join(self.options.color)
		if self.options.marker is not None:
			cstr += '-'+self.options.marker

		notes = self.options.notes
		if 'log' in notes:
			self.LT = True
			ttype += 'LOG_'
		if 'scale' in notes:
			self.SC = True
			ttype += 'SCALE_'
		elif 'std' in notes:
			self.STD = True
			ttype += 'STD_'

		# Nothing was appended above: no transform was requested.
		if len(ttype) < 3: ttype = '_RAW'

		self.out_name = R.args.prefix+'_'+cstr+'_transformsamples'+ttype
		self.plt_name = self.out_name
		self.D = R.data
		self.S = self.D.samples
		# One row per feature, one column per sample.
		self.Y = [[s.cnts[f.idx] for s in self.D.samples] for f in self.D.features]

		if self.options.coeffs:
			self.precomp_pca(R,self.options.coeffs)
			return

		dr = rage_DR.DR(R.args,R.progress)
		R.progress.start_minor('PCA')
		dr.set_y_matrix(self.Y, LOG_TRANSFORM=self.LT,SCALE=self.SC,STD_SCALE=self.STD)
		pca_run = dr.pca(req='FULL')

		# Regroup the coefficients by feature name: for each entry of
		# pca_run['coefs'], remember the feature's position in that entry and
		# the second value of its triple.
		F_key = dd(list)
		for C in pca_run['coefs']:
			for j,(vs,vl,vi) in enumerate(C):
				F_key[self.D.features[vi].name].append((j,vl))

		# 'with' guarantees the handles are closed even if a write fails.
		with open(self.out_name+'pca_coefs.out','w') as w:
			w.write("%-50s %5s %10s %5s %10s %5s %10s\n" % ('---','R1','V1','R2','V2','R3','V3'))
			for k,C in F_key.items():
				w.write("%-50s" % (k,))
				for pos,val in C: w.write(" %5d %10f" % (pos,val))
				w.write('\n')

		with open(self.out_name+'pca_pts.out','w') as w:
			# Header previously used '%10ss', which printed a stray 's' after PC3.
			w.write("%-50s %10s %10s %10s %10s %10s \n" % ('---','PC1','PC2','PC3','PC4','PC5'))
			# Indexing p[4] means at least five components are expected.
			for p,s in zip(pca_run['pts'],R.data.samples):
				w.write("%-50s %10f %10f %10f %10f %10f\n" % (s.name,p[0],p[1],p[2],p[3],p[4]))

		dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(self.plt_name+'_sample_pca')
		dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(8,9),(10,11),(12,13),(14,15)],NAMES=False).finish(self.plt_name+'_sample_hipca')

		R.progress.end()

		tsne_run = dr.tsne()
		with open(self.out_name+'tsne_pts.out','w') as w:
			w.write("%-50s %10s %10s \n" % ('---','TS1','TS2'))
			for p,s in zip(tsne_run['pts'],R.data.samples):
				w.write("%-50s %10f %10f \n" % (s.name,p[0],p[1]))

		dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[tsne_run],dim_comps=[(0,1)],NAMES=False).finish(self.plt_name+'_sample_tsne')

		# NOTE(review): second end() without a second start_minor(); kept
		# because the progress object's contract is not visible here.
		R.progress.end()
示例#5
0
	def precomp_pca(self,R,coeffs,PLEN=1500,MAX_COEFS=8):
		"""Project the current samples onto coefficients read from `coeffs`.

		For every sample four vectors are computed, one value per component:
			LOGDOT  sum over the sample's features of coefficient * log2(count)
			RAWDOT  sum over the sample's features of coefficient * count
			LOGPRJ  like LOGDOT, restricted to the prj_len highest counts
			RAWPRJ  like RAWDOT, restricted to the prj_len highest counts
		Each set is written to a '*_pca_proj.pts' table and plotted, then fed
		to t-SNE, whose results are written to '*_tsne_proj.pts' and plotted.

		Args:
			R: run object; R.args and R.progress are used.
			coeffs: iterable of whitespace-separated text lines.  Field 0 is a
				feature name and fields 2, 4, 6, ... are read as float
				coefficients of components 0, 1, 2, ...; reading stops after
				field 40.  Lines whose first field is '---' are skipped.
			PLEN: prj_len is the larger of PLEN and the smallest number of
				counted features found in any sample.
			MAX_COEFS: accepted for interface compatibility; not used.

		Requires self.D, self.S and self.plt_name to be set beforehand, and
		appends '_projected_<prj_len>' to self.plt_name.  The point tables
		index five coordinates, so at least five components are expected.

		This method does not return: it ends with sys.exit().
		"""
		# component index -> {feature name: coefficient}
		coeff_key = dd(dict)
		for line in coeffs:
			fields = line.split()
			# Blank lines carry no data; '---' marks a header row.
			if not fields or fields[0] == '---': continue

			for i in range(2,len(fields),2):
				# Floor division keeps the index an int on Python 2 and 3 alike.
				coeff_key[(i//2)-1][fields[0]] = float(fields[i])
				if i >= 40: break

		# Features absent from the coefficient table contribute nothing.
		for f in self.D.features:
			if f.name not in coeff_key[0]:
				for comp in coeff_key.values(): comp[f.name] = 0.0

		n_comps = len(coeff_key)

		# Per sample: [count, feature name, [coefficient per component]],
		# ordered from the highest count down.
		projection_key = {}
		for s in self.S:
			rows = []
			for i,c in s.cnts.items():
				name = self.D.features[i].name
				rows.append([c,name,[coeff_key[n][name] for n in range(n_comps)]])
			rows.sort(reverse=True)
			projection_key[s] = rows

		prj_len =  max(min([len(X) for X in projection_key.values()]),PLEN)
		self.plt_name+='_projected_'+str(prj_len)
		pca_key = dd(list)

		for s in self.S:
			ranked = projection_key[s]
			LK_DATA = [[pk[2][n]*log(pk[0],2) for pk in ranked] for n in range(n_comps)]
			# The raw variant weights each coefficient by the untransformed
			# count; computing it with log2 as well made RAW* equal to LOG*.
			RK_DATA = [[pk[2][n]*pk[0] for pk in ranked] for n in range(n_comps)]

			pca_key['LOGDOT'].append([sum(lk) for lk in LK_DATA])
			pca_key['RAWDOT'].append([sum(rk) for rk in RK_DATA])

			pca_key['LOGPRJ'].append([sum(lk[0:prj_len]) for lk in LK_DATA])
			pca_key['RAWPRJ'].append([sum(rk[0:prj_len]) for rk in RK_DATA])

		rawdot = {'pts': pca_key['RAWDOT'], 'axes': ['PC'+str(x+1)+'-RAWDOT' for x in range(n_comps)]}
		logdot = {'pts': pca_key['LOGDOT'], 'axes': ['PC'+str(x+1)+'-LOGDOT' for x in range(n_comps)]}
		rawprj = {'pts': pca_key['RAWPRJ'], 'axes': ['PC'+str(x+1)+'-RAWPRJ' for x in range(n_comps)]}
		logprj = {'pts': pca_key['LOGPRJ'], 'axes': ['PC'+str(x+1)+'-LOGPRJ' for x in range(n_comps)]}

		for kp,kpts in pca_key.items():
			# 'with' guarantees the handle is closed even if a write fails.
			with open(self.plt_name+'_'+kp+'_pca_proj.pts','w') as w:
				# Header previously used '%10ss', which printed a stray 's' after PC3.
				w.write("%-50s %10s %10s %10s %10s %10s \n" % ('---','PC1','PC2','PC3','PC4','PC5'))
				# Points were appended in self.S order, so the two line up.
				for s,p in zip(self.S,kpts):
					w.write("%-50s %10f %10f %10f %10f %10f\n" % (s.name,p[0],p[1],p[2],p[3],p[4]))

		for dc in [(0,1),(2,3)]:
			p_name = self.plt_name+'_'+'-'.join([str(ss) for ss in dc])+'_'
			dplot.dimplot(R.args,R.progress).add_data(self.S,[rawdot,logdot,rawprj,logprj],dim_comps=[dc,dc,dc,dc],NAMES=False).finish(p_name+'pca')
			dplot.dimplot(R.args,R.progress).add_data(self.S,[rawdot,logdot,rawprj,logprj],dim_comps=[dc,dc,dc,dc],NAMEOUTLIERS=True).finish(p_name+'exnamed_pca')

		dr = rage_DR.DR(R.args,R.progress)
		tsne_key = {k: dr.tsne(pca_pts=vals,axes_prefix=k) for k,vals in pca_key.items()}
		t_runs = [tsne_key['RAWDOT'],tsne_key['LOGDOT'],tsne_key['RAWPRJ'],tsne_key['LOGPRJ']]

		for kp,kpts in tsne_key.items():
			with open(self.plt_name+'_'+kp+'_tsne_proj.pts','w') as w:
				w.write("%-50s %10s %10s\n" % ('---','TSNE1','TSNE2'))
				for s,p in zip(self.S,kpts['pts']):
					w.write("%-50s %10f %10f\n" % (s.name,p[0],p[1]))

		dplot.dimplot(R.args,R.progress).add_data(self.S,t_runs,dim_comps=[(0,1),(0,1),(0,1),(0,1)],NAMES=False).finish(self.plt_name+'_tsne')
		dplot.dimplot(R.args,R.progress).add_data(self.S,t_runs,dim_comps=[(0,1),(0,1),(0,1),(0,1)],NAMEOUTLIERS=True).finish(self.plt_name+'_exnamed_tsne')
		R.progress.end()
		sys.exit()