示例#1
0
def make_dr_plots(R, choice='samples'):
    """Run PCA, t-SNE and KCA and pass each result to a new ``dplot.dimplot``.

    ``choice == 'samples'`` uses ``R.data.samples`` with
    ``R.data.matrix('log')``; any other value uses ``R.data.features`` with
    the transposed log matrix.  Output names start with ``R.args.prefix``
    followed by ``'_samples_'`` or ``'_features_'``.

    Returns the tuple ``(pca_run, tsne_run, kca_run)``.
    """
    by_sample = (choice == 'samples')
    if by_sample:
        members = R.data.samples
        matrix = R.data.matrix('log')
    else:
        members = R.data.features
        matrix = R.data.matrix('log').getT()
    base = R.args.prefix + ('_samples_' if by_sample else '_features_')

    def plot_run(run, title, suffix, **extra):
        # One dimplot (built with 2, 2, args, progress) per reduction.
        settings = {'title': title, 'out': base + suffix}
        settings.update(extra)
        dplot.dimplot(2, 2, R.args, R.progress).add_data(members, run,
                                                         settings)

    reducer = rage_DR.DR(R.args, R.progress)

    pca_run = reducer.run_pca(matrix)
    plot_run(pca_run, 'PCA', 'pca.pdf')

    # t-SNE is called without a matrix argument, unlike PCA and KCA.
    tsne_run = reducer.run_tsne()
    plot_run(tsne_run, 'TSNE', 'tsne.pdf')

    kca_run = reducer.run_kca(matrix)
    plot_run(kca_run, 'KCA', 'kca.pdf', zoom=True)

    return pca_run, tsne_run, kca_run
示例#2
0
    def evaluate_model2(self):
        """Fit the regression model per distribution and run brief PCAs.

        For every ``dist`` in ``self.options.dist`` this:

        * fits ``rt.RegModel`` on ``self.X`` and aggregates it,
        * fetches residuals with ``get_resids(COVAR=True)``,
        * writes the model through ``rage_regression_outputs.eval_output``,
        * fits a second model on ``self.Xc``,
        * runs ``pca(req='brief')`` on ``self.Y`` and on the model residuals.

        The leftover ``print 'yo'`` debug statements were removed; nothing
        else about the call sequence changed.  Several results (``C_resids``,
        ``M_out``, ``Mc``, ``sims``, ``pca_init``, ``pca_resid``) are bound
        but not used further inside this method.
        """
        self.progress.start_minor('Running Model Regressions',
                                  len(self.D.features), False)
        for dist in self.options.dist:
            M = rt.RegModel(self.X, dist, self.options, self.progress,
                            True).run(self.Y,
                                      self.feature_names).aggregate(True)

            M_resids, C_resids = M.get_resids(COVAR=True)

            M_out = rage_regression_outputs.eval_output(self.options).write(
                M, self.feature_names)
            Mc = rt.RegModel(self.Xc, dist, self.options).run(
                self.Y, self.feature_names).aggregate(True)

            sims = dd(list)
            self.progress.start_minor('Running Model PCA',
                                      len(self.D.features), False)

            dim = rage_DR.DR(self.options, self.progress)
            # Log-transform unless the distribution name ends in 'LOG'.
            log_transform = dist[-3:] != 'LOG'
            pca_init = dim.set_y_matrix(
                self.Y, LOG_TRANSFORM=log_transform).pca(req='brief')
            pca_resid = dim.set_y_matrix(
                M_resids, LOG_TRANSFORM=log_transform).pca(req='brief')
示例#3
0
	def run_tsne(self,R):
		"""Run t-SNE on the log sample matrix and finish a TSNE plot.

		The plot path is ``R.args.prefix + '_transformsamples_TSNE' + 'tsne.pdf'``;
		it is passed both as the ``'out'`` option and to ``finish``.
		"""
		args, progress = R.args, R.progress
		reducer = rage_DR.DR(args,progress)
		pdf_path = (args.prefix+'_transformsamples_TSNE')+'tsne.pdf'
		progress.start_minor('TSNE')

		samples = R.data.samples
		log_matrix = R.data.matrix('log')
		embedding = reducer.run_tsne(log_matrix)

		plot_options = {'title':'TSNE','out': pdf_path}
		plot = dplot.dimplot(2,2,args,progress).add_data(samples,[embedding],plot_options)
		# finish() is invoked on whatever add_data returned.
		plot.finish(pdf_path)
		progress.end()
示例#4
0
    def run_condense_pca(self, R):
        """PCA the condensed and full data on their shared features, then exit.

        Features are matched by ``name`` between ``R.condensed_data`` and
        ``R.data``.  The condensed matrix is fitted first and its coefficients
        are written with ``self.write_coeffs``; the full matrix is then passed
        through ``set_y_matrix(..., SET_TRANSFORM=True)`` on the same ``DR``
        object.  Both point sets are concatenated and drawn in one plot.

        Output names start with ``R.args.prefix + '_transformsamples_'``.
        The earlier ``'_condensepca_'`` assignment was overwritten before any
        use, so it was a dead store and has been dropped.

        The method ends with ``sys.exit()``, so it never returns.
        """
        D = R.data
        condensed = R.condensed_data

        # Sets give O(1) membership; the original rebuilt a list of names for
        # every feature it tested.
        full_names = set(f.name for f in D.features)
        condensed_names = set(f.name for f in condensed.features)

        # Source order is preserved within each data set.
        cF = [f for f in condensed.features if f.name in full_names]
        rF = [f for f in D.features if f.name in condensed_names]

        # One row per shared feature, one column per sample.
        Yc = [[s.cnts[f.idx] for s in condensed.samples] for f in cF]
        Y = [[s.cnts[f.idx] for s in D.samples] for f in rF]

        dr = rage_DR.DR(R.args, R.progress)
        out_name = R.args.prefix + '_transformsamples_'
        R.progress.start_minor('PCA')

        condense_run = dr.set_y_matrix(Yc, LOG_TRANSFORM=True).pca(req='FULL')
        self.write_coeffs(condense_run['coefs'], cF, out_name + 'coefs.out')

        pca_run = dr.set_y_matrix(Y, LOG_TRANSFORM=True,
                                  SET_TRANSFORM=True).pca(req='FULL')

        dimplot = dplot.dimplot(1, 1, R.args, R.progress)

        # Condensed points come first, matching the sample order below.
        shared_run = {
            'pts': condense_run['pts'] + pca_run['pts'],
            'axes': condense_run['axes']
        }
        shared_samples = list(condensed.samples) + list(D.samples)

        dimplot.add_data(shared_samples, [shared_run], {
            'title': 'SHARED_PCA',
            'out': out_name + 'pca.pdf'
        })
        dimplot.finish(out_name + 'pca')
        R.progress.end()

        sys.exit()
示例#5
0
def make_pca_and_tsne_plots(self):
    """Run PCA, KCA, t-SNE and ICA on the log data matrix and save one figure.

    Marker sizes come from ``scale_vals`` applied to the number of ``cnts``
    keys per sample, with bounds 20 and 55.  Panels are added in the order
    PCA, KCA, ICA, t-SNE and the figure is saved as
    ``<prefix>_dimred.png``.
    """
    seaborn.set(rc={
        'axes.facecolor': 'black',
        'figure.facecolor': 'cornflowerblue'
    })
    key_counts = [len(s.cnts.keys()) for s in self.input.samples]
    my_sizes = scale_vals(key_counts, 20, 55)

    self.progress.start_subtopic('Calculating PCA/TSNE', '', 0)
    data_matrix = self.input.data_matrix('log')
    dr = rage_DR.DR(self.args, self.progress).set_matrix(data_matrix)
    dr.run_pca().run_kca().run_tsne().run_ica()

    subplot = rage_subplots.subplot(2, 2, self.args)
    subplot.add_legend(self.color_key.keys(), self.color_key.values())

    # (points, panel-specific options); colors and sizes are shared.
    panels = [
        (dr.pca_pts, {'vars': dr.pca_vars, 'title': 'PCA'}),
        (dr.kca_pts, {'type': 'kca', 'title': 'KCA', 'zoom': True}),
        (dr.ica_pts, {'type': 'ica', 'title': 'ICA'}),
        (dr.tsne_pts, {'type': 'tsne'}),
    ]
    for points, panel_opts in panels:
        panel_opts['colors'] = self.color_labels
        panel_opts['sizes'] = my_sizes
        subplot.add_pca_data(points, panel_opts).update({'clear_axes': True})

    subplot.save(self.args.prefix + '_dimred.png', {})
    self.progress.finish_subtopic()
示例#6
0
	def iterative_run(self,Y,S_NAMES,iteration=1,GROUPS=2,SELECTION_FRACTION=0.24,MAX_SIZE=4,DIMS=3,TAIL=False,LOWER=False,LOWEST=False): 
		"""Run two PCAs on ``Y``, print pairwise sample diagnostics, then exit.

		``Y`` is indexed ``Y[i][j]``: ``i`` is looked up in
		``self.R.data.features`` and ``j`` in ``S_NAMES``.

		Work is only done when ``GROUPS == 2``; for any other value the
		method falls through and returns ``None``.

		NOTE(review):
		* ``DIMS`` is overwritten with 2 below, so the argument is ignored.
		* ``LT``, ``SC`` and ``STD`` are computed but never used.
		* ``sys.exit()`` is called right after the diagnostic printout, so
		  the pairing/merging code after it and the final ``return`` are
		  unreachable as written.  ``SELECTION_FRACTION``, ``MAX_SIZE``,
		  ``TAIL``, ``LOWER`` and ``LOWEST`` are only read in that
		  unreachable part; ``iteration`` is not read at all.

		Returns (unreachable): ``(pca_run, NEW_Y, NEW_SAMPLE_NAMES)``.
		"""


		LT = ('log' in self.options.notes)
		SC = ('scale' in self.options.notes)
		STD = ('std' in self.options.notes)
		# NOTE(review): replaces whatever the caller passed for DIMS.
		DIMS=2
	
		if GROUPS == 2: 


			# First PCA on Y as given (no centering, scaled).
			dr = rage_DR.DR(self.R.args,self.R.progress)
			#dr.set_y_matrix(Y, LOG_TRANSFORM=LT,SCALE=SC,STD_SCALE=STD) 
			dr.set_y_matrix(Y, CENTER=False,SCALE=True) 
			pca_run = dr.pca(req='FULL') 
			pca_pts = pca_run['pts'] 
			# feature name -> list of (position j, vl) taken from the (vs, vl, vi)
			# triples in each coefs entry; vi indexes self.R.data.features.
			coeff_key = dd(list) 
			for i,C in enumerate(pca_run['coefs']):
				for j,(vs,vl,vi) in enumerate(C): coeff_key[self.R.data.features[vi].name].append((j,vl))



			# Per sample j: (value, feature name) pairs for values > 0, largest first.
			S_CNTS = [sorted([(Y[i][j],self.R.data.features[i].name) for i in range(len(Y)) if Y[i][j] > 0],reverse=True) for j in range(len(S_NAMES))]


			# Number of positive-valued features per sample, plus median / 25th percentile.
			S_OBS =  {S_NAMES[j]: len(S_CNTS[j]) for j in range(len(S_NAMES))}
			S_MED =  np.median(S_OBS.values())
			S_25 =  np.percentile(S_OBS.values(),25)

			# Shortest per-sample list length, but never less than 100.
			P_LEN  =  max(min([len(sc) for sc in S_CNTS]),100)
			# sample name -> position j -> list of value * coefficient products.
			S_PROJECTIONS = dd(lambda: dd(list)) 
			for i,C in enumerate(S_CNTS): 
				for v,f in C[0:P_LEN]: 
					for j in range(len(coeff_key[f])):
						S_PROJECTIONS[S_NAMES[i]][j].append(v*coeff_key[f][j][1])
			
			# Collapse each list to its mean for the first DIMS positions.
			S_PROJECTIONS =  {s_name: [np.mean(S_PROJECTIONS[s_name][j]) for j in range(DIMS)] for s_name in S_PROJECTIONS.keys()}
			
			# Second PCA on the transposed matrix; here vi indexes S_NAMES.
			dr = rage_DR.DR(self.R.args,self.R.progress)
			#dr.set_y_matrix(Y, TRANSPOSE = True, SCALE=True) 
			dr.set_y_matrix(Y, TRANSPOSE=True,CENTER=False,SCALE=True) 
			pca_tran = dr.pca(req='FULL') 
			s_key = dd(list) 
			for i,C in enumerate(pca_tran['coefs']): 
				for j,(vs,vl,vi) in enumerate(C): s_key[S_NAMES[vi]].append((j,vl))

			print DIMS

#			print '--- name2 idx1 idx2 obs1 obs2 | aCoef bCoef'
			# Header, then one line per unordered sample pair (i < j).
			print '--- idx1 obs1 | aCoef aPCA aProj',
			print '| idx2 obs2 | bCoef bPCA bProj'
			for i in range(len(S_NAMES)):
				aName = S_NAMES[i] 
				aCoefs,aPCA, aProj,aObs = [x[1] for x in s_key[aName]][0:DIMS], pca_pts[i][0:DIMS], S_PROJECTIONS[aName], S_OBS[aName]


				for j in range(i+1,len(S_NAMES)): 

					print aName,i,aObs,'|',	
					print ",".join([str(round(x,3)) for x in aCoefs]),
					print ",".join([str(round(x,3)) for x in aPCA]),
					print ",".join([str(round(x,3)) for x in aProj]),'|',


					bName = S_NAMES[j] 
					bCoefs,bPCA, bProj,bObs = [x[1] for x in s_key[bName]][0:DIMS], pca_pts[j][0:DIMS], S_PROJECTIONS[bName], S_OBS[bName]

					print bName,j,bObs,'|',	
					print ",".join([str(round(x,3)) for x in bCoefs]),
					print ",".join([str(round(x,3)) for x in bPCA]),
					print ",".join([str(round(x,3)) for x in bProj]),

					# Euclidean distances between the two samples in each of the three spaces.
					coefD = np.linalg.norm(np.array(aCoefs) - np.array(bCoefs))
					coefP = np.linalg.norm(np.array(aPCA) - np.array(bPCA))
					coefPr = np.linalg.norm(np.array(aProj) - np.array(bProj))
				

					print '|', coefD, coefP,coefPr








			# NOTE(review): the process terminates here; everything below in this
			# branch is unreachable as written.
			sys.exit() 



			# Order samples by the value of their first s_key entry.
			if TAIL: s_order =  sorted([(s_key[s][0][1],s) for s in s_key.keys()])
			else: 	 s_order =  sorted([(s_key[s][0][1],s) for s in s_key.keys()],reverse=True)


			# Pair each eligible sample with the closest (by projection) of up to
			# SCAN following candidates; stop once ADDED exceeds S_STOP.
			PAIRS, S_STOP, FOUND,SCAN, ADDED = [], len(s_order) * SELECTION_FRACTION, dd(bool) , 10, 0 
			for i in range(len(s_order)): 
				i_val, i_name = s_order[i]
				if LOWER and S_OBS[i_name] > S_MED: continue 
				elif LOWEST and S_OBS[i_name] > S_25: continue 
			 
				# Group size is the number of '@'-separated parts in the name.
				i_size, i_loc,i_dists, j  = len(i_name.split('@')), np.array(S_PROJECTIONS[i_name] ),[], i + 1
				if FOUND[i] or i_size >= MAX_SIZE: continue 
				while j < len(s_order): 
					if not FOUND[j] and (i_size+len(s_order[j][1].split('@'))) <= MAX_SIZE: 
						j_val, j_name = s_order[j]
						i_dists.append((np.linalg.norm(i_loc - np.array(S_PROJECTIONS[j_name])),j_name,j))
					if len(i_dists) >= SCAN: break  
					j+=1
				if len(i_dists) > 0:
					dist,j_name,j_idx = sorted(i_dists)[0] 
					PAIRS.append((i_name,j_name))
					# FOUND is keyed both by s_order index (partner only) and by name (both).
					FOUND[j_idx], FOUND[i_name], FOUND[j_name] = True, True, True 
					
					ADDED +=1
				if ADDED > S_STOP: break 

	


			# Merged names are "a@b"; names absent from FOUND keep their single column.
			NEW_Y, NEW_IDX, IDX_KEY = [], {}, {s: i for i,s in enumerate(S_NAMES)} 
			for a,b in PAIRS: NEW_IDX[a+"@"+b] = [IDX_KEY[a],IDX_KEY[b]]
			for s,i in [(s,i) for (s,i) in IDX_KEY.items() if s not in FOUND]: NEW_IDX[s] = [i] 

			NEW_SAMPLE_NAMES = NEW_IDX.keys() 
			#for y in Y: NEW_Y.append([max([y[j] for j in NEW_IDX[ns]]) for ns in NEW_SAMPLE_NAMES]) 
			#for y in Y: NEW_Y.append([sum([y[j] for j in NEW_IDX[ns]]) for ns in NEW_SAMPLE_NAMES]) 
			# Each new column is the mean of the original columns it merges.
			for y in Y: NEW_Y.append([np.mean([y[j] for j in NEW_IDX[ns]]) for ns in NEW_SAMPLE_NAMES]) 
	

			return pca_run, NEW_Y, NEW_SAMPLE_NAMES
示例#7
0
	def run_iterative_pca(self,R,GROUPS=2):
		"""Run PCA on the count matrix and on its transpose, writing data and plots.

		Stores ``R`` on ``self.R`` and ``R.data`` on ``self.D``.  Two PCAs are
		run on ``Y`` (one row per feature, one column per sample): one with the
		transform flags taken from ``self.options.notes`` and one with
		``TRANSPOSE=True, SCALE=True``.  Each run is written with
		``self.write_pca_data`` and plotted twice, once per member type.

		NOTE(review): ``GROUPS`` is reassigned to 2 below, so the argument is
		ignored and the ``GROUPS == 200`` branch (iterative merging through
		``self.iterative_run``) never executes as written.
		"""
		self.R = R
		D, F, S = R.data , R.data.features, R.data.samples 
		# One row per feature, one column per sample.
		Y = [[s.cnts[f.idx] for s in D.samples] for f in D.features]


		
		self.D = R.data 

		### PARAMS ### 


		# NOTE(review): overrides the GROUPS argument; SELECTION_FRACTION is not read afterwards.
		GROUPS = 2 
		SELECTION_FRACTION = 0.24	


		dr = rage_DR.DR(R.args,R.progress)
		# out_name is only referenced by the commented-out plot calls below.
		out_name = R.args.prefix+'_transformsamples_'
		R.progress.start_minor('PCA') 

		# Transform flags are driven by substrings/members of options.notes.
		LT = ('log' in self.options.notes)
		SC = ('scale' in self.options.notes)
		STD = ('std' in self.options.notes)

	

		# Dead branch while GROUPS is forced to 2 above.
		if GROUPS == 200: 


			iterplot = dplot.iterplot(self.R.data.samples,self.R.data.features,R.args,R.progress,6)
			#sc = MinMaxScaler()
			#Y = [[x[0] for x in sc.fit_transform(np.array(y).reshape(-1,1))] for y in Y]
#			dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(R.args.prefix+'_transformsamples_sample_pca')

			# Each round feeds the previous round's matrix and names back in and
			# prints the names that contain '@' (merged samples).
			PCA_1, Y_1, S_1 = self.iterative_run(Y,[s.name for s in S],iteration=1,SELECTION_FRACTION=0.02,MAX_SIZE=2,DIMS=1) #LOWEST=True) 
			for s in [s for s in S_1 if len(s.split('@')) > 1]:	print 1,s  


			iterplot.add_data(PCA_1, [s.name for s in self.R.data.samples]) 
			


			PCA_2, Y_2, S_2 = self.iterative_run(Y_1,S_1,iteration=2,SELECTION_FRACTION=0.05,MAX_SIZE=2,DIMS=1,LOWER=True) 
			for s in [s for s  in S_2 if len(s.split('@')) > 1]:	print 2,s  
 
			iterplot.add_data(PCA_2, S_1) 

			PCA_3, Y_3, S_3 = self.iterative_run(Y_2,S_2,iteration=2,SELECTION_FRACTION=0.025,MAX_SIZE=3,DIMS=1,LOWEST=True) 
			for s in [s for s  in S_3 if len(s.split('@')) > 1]:	print 3,s  
 
			iterplot.add_data(PCA_3, S_2) 
			
			PCA_4, Y_4, S_4 = self.iterative_run(Y_3,S_3,iteration=2,SELECTION_FRACTION=0.50,MAX_SIZE=2,DIMS=1,TAIL=False) 
			for s in [s for s  in S_4 if len(s.split('@')) > 1]:	print 4,s  
			iterplot.add_data(PCA_4, S_3) 

#			PCA_5, Y_5, S_5 = self.iterative_run(Y_4,S_4,iteration=2,SELECTION_FRACTION=0.12,MAX_SIZE=3,DIMS=1,TAIL=True,LOWER=True) 
#			for s in [s for s  in S_5 if len(s.split('@')) > 1]:	print 5,s  
#			iterplot.add_data(PCA_5, S_4) 

			
#			PCA_6, Y_6, S_6 = self.iterative_run(Y_5,S_5,iteration=2,SELECTION_FRACTION=0.40,MAX_SIZE=4,DIMS=2,TAIL=False) 
#			for s in [s for s  in S_6 if len(s.split('@')) > 1]:	print 6,s  
#			iterplot.add_data(PCA_6, S_5) 
			

			plt.savefig('PCA_ITERATIVE.pdf') 


				
			
						#print cn, pn, np.linalg.norm(c_pts-p_pts)


			
		#dimplot = dplot.dimplot(1,1,R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3)],NAMES=True).finish(out_name+'named.pca')
		#dimplot = dplot.dimplot(1,1,R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(1,2),(3,4),(5,6)],NAMES=True).finish(out_name+'named.pca')
		#dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(out_name+'Cpca')

	
		# First PCA: Y as built above, with the flags from options.notes.
		dr.set_y_matrix(Y, LOG_TRANSFORM=LT,SCALE=SC,STD_SCALE=STD) 
		pca_run = dr.pca(req='FULL') 


		self.coeff_key, sample_data  = self.write_pca_data(pca_run, S, F, R.args.prefix+'_sampletransform_')		
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples, [pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(R.args.prefix+'_transformsamples_sample_pca')
		# Replace the points in place with the second element of each
		# coeff_key entry, one row per feature, then plot the features.
		s_pts = [[p[1] for p in self.coeff_key[f.name]] for f in R.data.features]
		pca_run['pts'] = s_pts 
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.features,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=True).finish(R.args.prefix+'_transformsamples_feature_pca')




		# Second PCA: transposed matrix; note the F/S argument order is swapped
		# in the write_pca_data call relative to the first run.
		dr.set_y_matrix(Y, TRANSPOSE = True, SCALE=True) 
		pca_tran = dr.pca(req='FULL') 

		self.s_key, feature_data = self.write_pca_data(pca_tran,  F,S, R.args.prefix+'_featuretransform_')
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.features,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=True).finish(R.args.prefix+'_transformfeatures_feature_pca')

		# Same in-place replacement, this time one row per sample from s_key.
		s_pts = [[p[1] for p in self.s_key[s.name]] for s in R.data.samples]
		pca_tran['pts'] = s_pts 
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=False).finish(R.args.prefix+'_transformfeatures_samples_pca')
示例#8
0
	def run_pca(self,R):
		"""Run PCA and t-SNE on the sample count matrix; write tables and plots.

		Sets ``self.LT`` / ``self.SC`` / ``self.STD`` from ``self.options.notes``
		('log', then 'scale' or else 'std') and builds ``self.out_name`` and
		``self.plt_name`` from ``R.args.prefix``, the colour/marker options and
		the transform tag ('_RAW' when no transform is selected).

		When ``self.options.coeffs`` is truthy the work is delegated to
		``self.precomp_pca`` and this method returns immediately.

		Files written (all prefixed with ``self.out_name``):
		``pca_coefs.out``, ``pca_pts.out`` and ``tsne_pts.out``.  The points
		table reads the first five components of every point.

		Changes from the previous version: the ``%10ss`` typo in the points
		header (which emitted ``PC3s``) is fixed, files are closed through
		``with`` even if a write fails, and the marker test uses
		``is not None``.
		"""
		self.LT, self.SC, self.STD, ttype, cstr = False, False, False, '_', '_'

		if len(self.options.color)>0: cstr += '-'.join(self.options.color)
		if self.options.marker is not None: cstr += '-'+self.options.marker

		if 'log' in self.options.notes:
			self.LT = True
			ttype += 'LOG_'
		# 'scale' takes precedence over 'std'; they are mutually exclusive here.
		if 'scale' in self.options.notes:
			self.SC = True
			ttype += 'SCALE_'
		elif 'std' in self.options.notes:
			self.STD = True
			ttype += 'STD_'

		# ttype is still the bare '_' when no transform was requested.
		if len(ttype) < 3: ttype = '_RAW'

		self.out_name = R.args.prefix+'_'+cstr+'_transformsamples'+ttype
		self.plt_name = R.args.prefix+'_'+cstr+'_transformsamples'+ttype
		self.D = R.data
		self.S = self.D.samples
		# One row per feature, one column per sample.
		self.Y = [[s.cnts[f.idx] for s in self.D.samples] for f in self.D.features]

		if self.options.coeffs:
			self.precomp_pca(R,self.options.coeffs)
			return

		dr = rage_DR.DR(R.args,R.progress)
		R.progress.start_minor('PCA')
		dr.set_y_matrix(self.Y, LOG_TRANSFORM=self.LT,SCALE=self.SC,STD_SCALE=self.STD)
		pca_run = dr.pca(req='FULL')

		# feature name -> [(position within its coefs entry, vl), ...]
		F_key = dd(list)
		for C in pca_run['coefs']:
			for j,(vs,vl,vi) in enumerate(C):
				F_key[self.D.features[vi].name].append((j,vl))

		with open(self.out_name+'pca_coefs.out','w') as w:
			w.write("%-50s %5s %10s %5s %10s %5s %10s\n" % ('---','R1','V1','R2','V2','R3','V3'))
			for k,C in F_key.items():
				w.write("%-50s" % (k,))
				for rank,val in C: w.write(" %5d %10f" % (rank,val))
				w.write('\n')

		with open(self.out_name+'pca_pts.out','w') as w:
			w.write("%-50s %10s %10s %10s %10s %10s \n" % ('---','PC1','PC2','PC3','PC4','PC5'))
			for p,s in zip(pca_run['pts'],R.data.samples):
				w.write("%-50s %10f %10f %10f %10f %10f\n" % (s.name,p[0],p[1],p[2],p[3],p[4]))

		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(self.plt_name+'_sample_pca')
		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(8,9),(10,11),(12,13),(14,15)],NAMES=False).finish(self.plt_name+'_sample_hipca')

		R.progress.end()

		tsne_run = dr.tsne()
		with open(self.out_name+'tsne_pts.out','w') as w:
			w.write("%-50s %10s %10s \n" % ('---','TS1','TS2'))
			for p,s in zip(tsne_run['pts'],R.data.samples):
				w.write("%-50s %10f %10f \n" % (s.name,p[0],p[1]))

		dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[tsne_run],dim_comps=[(0,1)],NAMES=False).finish(self.plt_name+'_sample_tsne')

		R.progress.end()
示例#9
0
	def precomp_pca(self,R,coeffs,PLEN=1500,MAX_COEFS=8):
		"""Project samples onto precomputed coefficients; write, plot, then exit.

		``coeffs`` is iterated line by line.  Each line is split on
		whitespace: the first field is a feature name and every second field
		from index 2 on is parsed as a float coefficient (reading stops once
		index 40 has been stored).  Lines whose first field is ``'---'`` and
		blank lines are skipped.  Features of ``self.D`` missing from the
		table get coefficient 0.0 for every component.

		Four point sets are produced per sample: RAWDOT / LOGDOT sum over all
		of the sample's features, RAWPRJ / LOGPRJ over the first ``prj_len``
		features when sorted by count, largest first.  The LOG variants weight
		by ``log(count, 2)``, the RAW variants by the count itself.  Each set
		is written to ``<plt_name>_<kind>_pca_proj.pts``, plotted, passed
		through ``DR.tsne`` and written/plotted again.

		``self.plt_name`` is extended with ``'_projected_<prj_len>'``.
		The method ends with ``sys.exit()``, so it never returns.

		NOTE(review): ``MAX_COEFS`` is accepted but not used.  ``prj_len`` is
		``max(shortest sample list, PLEN)``, so ``PLEN`` acts as a lower
		bound -- confirm that is intended.

		Fixes over the previous version:
		* RAW data was computed with the same ``log`` formula as LOG data, so
		  RAWDOT/RAWPRJ duplicated LOGDOT/LOGPRJ; RAW now uses the raw count.
		* the ``%10ss`` typo in the points header is corrected;
		* ``//`` makes the component index an int on any Python version;
		* blank lines in ``coeffs`` no longer raise ``IndexError``;
		* output files are closed through ``with``.
		"""
		# component index -> {feature name: coefficient}
		coeff_key = dd(dict)
		for line in coeffs:
			fields = line.split()
			if not fields or fields[0] == '---': continue
			for i in range(2,len(fields),2):
				coeff_key[(i//2)-1][fields[0]] = float(fields[i])
				if i >= 40: break

		for f in self.D.features:
			if f.name not in coeff_key[0]:
				for i in coeff_key.keys(): coeff_key[i][f.name] = 0.0

		# Invariants hoisted out of the per-sample comprehensions.
		comps = range(len(coeff_key))
		features = self.D.features

		# sample -> [[count, feature name, [coef per component]], ...], largest count first
		projection_key = {}
		for s in self.S:
			projection_key[s] = sorted([[c,features[i].name,[coeff_key[n][features[i].name] for n in comps]] for i,c in s.cnts.items()],reverse=True)

		prj_len = max(min([len(X) for X in projection_key.values()]),PLEN)
		self.plt_name += '_projected_'+str(prj_len)
		pca_key = dd(list)

		for s in self.S:
			ranked = projection_key[s]
			LK_DATA = [[pk[2][n]*log(pk[0],2) for pk in ranked] for n in comps]
			# Raw variant: coefficient times the untransformed count.
			RK_DATA = [[pk[2][n]*pk[0] for pk in ranked] for n in comps]

			pca_key['LOGDOT'].append([sum(lk) for lk in LK_DATA])
			pca_key['RAWDOT'].append([sum(rk) for rk in RK_DATA])
			pca_key['LOGPRJ'].append([sum(lk[0:prj_len]) for lk in LK_DATA])
			pca_key['RAWPRJ'].append([sum(rk[0:prj_len]) for rk in RK_DATA])

		rawdot = {'pts': pca_key['RAWDOT'], 'axes': ['PC'+str(x+1)+'-RAWDOT' for x in comps]}
		logdot = {'pts': pca_key['LOGDOT'], 'axes': ['PC'+str(x+1)+'-LOGDOT' for x in comps]}
		rawprj = {'pts': pca_key['RAWPRJ'], 'axes': ['PC'+str(x+1)+'-RAWPRJ' for x in comps]}
		logprj = {'pts': pca_key['LOGPRJ'], 'axes': ['PC'+str(x+1)+'-LOGPRJ' for x in comps]}

		# The table reads the first five components of every point.
		for kp,kpts in pca_key.items():
			with open(self.plt_name+'_'+kp+'_pca_proj.pts','w') as w:
				w.write("%-50s %10s %10s %10s %10s %10s \n" % ('---','PC1','PC2','PC3','PC4','PC5'))
				for si,p in enumerate(kpts):
					s = self.S[si]
					w.write("%-50s %10f %10f %10f %10f %10f\n" % (s.name,p[0],p[1],p[2],p[3],p[4]))

		for dc in [(0,1),(2,3)]:
			p_name = self.plt_name+'_'+'-'.join([str(ss) for ss in dc])+'_'
			dimplot = dplot.dimplot(R.args,R.progress).add_data(self.S,[rawdot,logdot,rawprj,logprj],dim_comps=[dc,dc,dc,dc],NAMES=False).finish(p_name+'pca')
			dimplot = dplot.dimplot(R.args,R.progress).add_data(self.S,[rawdot,logdot,rawprj,logprj],dim_comps=[dc,dc,dc,dc],NAMEOUTLIERS=True).finish(p_name+'exnamed_pca')

		dr = rage_DR.DR(R.args,R.progress)
		tsne_key = {k: dr.tsne(pca_pts=vals,axes_prefix=k) for k,vals in pca_key.items()}
		t_runs = [tsne_key['RAWDOT'],tsne_key['LOGDOT'],tsne_key['RAWPRJ'],tsne_key['LOGPRJ']]

		for kp,kpts in tsne_key.items():
			with open(self.plt_name+'_'+kp+'_tsne_proj.pts','w') as w:
				w.write("%-50s %10s %10s\n" % ('---','TSNE1','TSNE2'))
				for si,p in enumerate(kpts['pts']):
					s = self.S[si]
					w.write("%-50s %10f %10f\n" % (s.name,p[0],p[1]))

		dimplot = dplot.dimplot(R.args,R.progress).add_data(self.S,t_runs,dim_comps=[(0,1),(0,1),(0,1),(0,1)],NAMES=False).finish(self.plt_name+'_tsne')
		dimplot = dplot.dimplot(R.args,R.progress).add_data(self.S,t_runs,dim_comps=[(0,1),(0,1),(0,1),(0,1)],NAMEOUTLIERS=True).finish(self.plt_name+'_exnamed_tsne')
		R.progress.end()
		sys.exit()
示例#10
0
    def eval_predictors(self):
        """Evaluate each predictor with a regression plus permutation runs.

        For every ``p`` in ``self.rage.args.predictors`` this deep-copies the
        data, filters/normalises it for ``p`` and ``self.options.covariates``,
        fits ``rrm.RegModel``, runs brief PCAs on the data matrix and on the
        model residuals, and then fits ``self.options.simulations`` models
        with ``p`` permuted.  Per feature index it records:

        * ``gt_key[i][k]`` -- how many simulations beat the real model
          (larger ``rsq``/``rsa``, or a smaller value in ``pv_dict``);
        * ``best_key[i][k]`` -- the largest simulated ``rsq``/``rsa`` and the
          smallest simulated ``pv_dict`` value.

        Results go to ``rage_outputs.predictor_output`` and to one row of the
        predictor plot, which is saved at the end.  ``self.D``, ``self.V``,
        ``self.Y``, ``self.X`` and ``self.options.color`` are overwritten on
        each iteration.  The method ends with ``sys.exit()``.

        Fix: the smallest-value tracker used to start from the
        ``defaultdict(float)`` default of 0.0, so ``spV < best`` could never
        hold for non-negative values and the entry stayed at 0.0.  The first
        simulated value now seeds the minimum.
        """
        dim = rage_DR.DR(self.options, self.progress)
        predictor_plot = rage_regression_plots.predictor_plot(
            self.options, len(self.rage.args.predictors))

        for p in self.rage.args.predictors:

            # Work on a private copy so filtering does not touch the source data.
            self.D = copy.deepcopy(self.rage.data)
            self.D.rage.progress.reset()

            self.D.filter_samples_by_attributes(
                [p], self.options.covariates).normalize()
            self.V = self.D.set_sample_variables([p], self.options.covariates)

            # One row per feature, one column per sample.
            self.Y = [[s.cnts[f.idx] for s in self.D.samples]
                      for f in self.D.features]
            self.X = self.V.select_variables(self.V.variables)

            self.options.color = [p]
            self.D.samples.create_plot_labels(self.options)

            self.progress.start_minor('Running Predictor Regression: ' + p,
                                      len(self.D.features), False)
            self.progress.mark()
            M = rrm.RegModel(self.options, self.X,
                             True).run(self.Y).aggregate(True)
            pca_init = dim.pca(self.D.matrix(), req='brief')
            pca_resid = dim.pca(np.matrix(M.out['resids']).getT(),
                                req='brief')

            sims = dd(list)
            SUMMARIZE = True

            gt_key = dd(lambda: dd(int))
            best_key = dd(lambda: dd(float))
            for n in range(self.options.simulations):
                self.progress.start_minor('Running Simulation ' + str(n + 1),
                                          False)
                Ms = rrm.RegModel(
                    self.options,
                    self.V.select_variables(self.V.variables,
                                            permute=[p])).run(
                                                self.Y).aggregate(True)
                sims['var'].append(
                    dim.pca(np.matrix(Ms.out['resids']).getT(),
                            req='brief')['total_var'])
                sims['pv'].append(Ms.pv_cnt)
                sims['rs'].append(Ms.rs_cnt)

                if SUMMARIZE:
                    # zip() bounds the index by the shorter of features / Y.
                    for i, _ in enumerate(zip(self.D.features, self.Y)):
                        wins, best = gt_key[i], best_key[i]
                        rsq, rsa = Ms.out['rsq'][i], Ms.out['rsa'][i]
                        if rsq > M.out['rsq'][i]: wins['rsq'] += 1
                        if rsa > M.out['rsa'][i]: wins['rsa'] += 1
                        if rsa > best['rsa']: best['rsa'] = rsa
                        if rsq > best['rsq']: best['rsq'] = rsq
                        for sp in M.pv_dict.keys():
                            spV = Ms.pv_dict[sp][i]
                            if spV < M.pv_dict[sp][i]: wins[sp] += 1
                            # Seed with the first simulated value, then keep the minimum.
                            if sp not in best or spV < best[sp]:
                                best[sp] = spV

            pred_out = rage_outputs.predictor_output(self.options, p, M,
                                                     self.D.features, self.Y)
            pred_out.add_sim_keys(gt_key, best_key, self.options.simulations)

            predictor_plot.add_predictor_row(p, self.D.samples, pca_init,
                                             pca_resid, M, sims)
            self.progress.end()
        predictor_plot.save(self.options.prefix + "-predictorplot-" +
                            "_".join(self.rage.args.predictors) + '-cov-' +
                            '_'.join(self.rage.args.covariates))
        sys.exit()
示例#11
0
    def evaluate_model(self):
        """Fit, simulate and plot the regression model for every distribution.

        For each ``dist`` in ``self.dists``: fit the full model on ``self.X``
        and a second model on ``self.Xc``, run brief PCAs on ``self.Y`` and on
        the covariate and model residuals, fit ``self.options.simulations``
        models with the predictors permuted (writing each one out), then build
        and save the model plot and write the simulation summary.
        """
        self.progress.start_minor('Running Model Regressions',
                                  len(self.D.features), False)
        for dist in self.dists:
            M = rrm.RegModel(self.X, dist, self.options, self.progress, True)
            M = M.run(self.Y, self.feature_names).aggregate(True)
            M_resids, C_resids = M.get_resids()
            Mc = rrm.RegModel(self.Xc, dist, self.options)
            Mc = Mc.run(self.Y, self.feature_names).aggregate(True)

            sims = dd(list)
            self.progress.start_minor('Running Model PCA',
                                      len(self.D.features), False)

            dim = rage_DR.DR(self.options, self.progress)
            # Residuals are log-transformed unless the name ends in 'LOG'.
            resid_log = dist[-3::] != 'LOG'
            pca_panels = []
            for matrix, use_log, title in (
                    (self.Y, True, 'PCA Initial Values'),
                    (C_resids, resid_log, 'PCA Covariate Residuals'),
                    (M_resids, resid_log, 'PCA Model Residuals')):
                reduced = dim.set_y_matrix(matrix,
                                           LOG_TRANSFORM=use_log,
                                           SCALE=True).pca(req='brief')
                pca_panels.append((reduced, title))

            for n in range(self.options.simulations):
                self.progress.start_major('Running Simulation ' + str(n + 1),
                                          False)
                Xs = self.V.select_variables(self.V.variables,
                                             permute=self.V.predictors)
                Xs.zp = self.X.zp
                Ms = rrm.RegModel(Xs, dist, self.options, self.progress)
                Ms = Ms.run(self.Y, self.feature_names).aggregate()
                rage_regression_outputs.eval_output(self.options).write(
                    Ms, self.feature_names, n + 1)
                sims['pv'].append(Ms.pv_cnt)
                sims['rs'].append(Ms.rs_cnt)
                sims['v_exp'].append(np.mean(Ms.out['v_exp']))

            self.progress.start_minor('Plotting Results  ', 100, False)
            plot_keys = {'p_key': M.pv_key, 'r_key': M.rs_key}
            mplot = rage_regression_plots.model_plot(self.D.samples, self.X,
                                                     self.options, 3, 2,
                                                     plot_keys)

            explained = [
                np.mean(Mc.out['v_exp']),
                np.mean(M.out['v_exp']),
                np.mean(sims['v_exp']),
            ]
            mplot.add_model_table(M, total_var=explained).update()
            mplot.add_predictor_table(M, self.X, self.options,
                                      {'sim_pvs': sims['pv']}).update()

            # Raw strings hold the same backslash-space sequences as before.
            rs_bars = mplot.add_rs_bars(M.rs_cnt, self.options, sims['rs'])
            rs_bars.update({
                'title':
                '$' + r"\ ".join(self.V.predictors) + '$ ' + r'$\  R^2\ Values$'
            })
            pv_bars = mplot.add_pv_bars(M.pv_cnt, self.options, sims['pv'])
            pv_bars.update({
                'title':
                '$' + r"\ ".join(self.V.predictors) + '$ ' + r'$\  P\ \ Values$'
            })

            for reduced, title in pca_panels:
                mplot.add_pca_pts(reduced, {'colspan': 2}).update({
                    'title': title,
                    'yadd': 2,
                    'colspan': 2
                })

            mplot.save(dist, self.options.predictors, self.options.covariates)
            rage_regression_outputs.reg_simulate(self.options).write(
                M.pv_cnt, sims['pv'], self.options.simulations,
                self.V.predictors)
            self.progress.end()
示例#12
0
    def run(self):
        """Dispatch on ``R.args.command``: 'samples', 'features' or 'ratios'.

        'samples' and 'features' create plot labels and, when ``R.args.pca``
        or ``R.args.tsne`` is set, run PCA (plus t-SNE if requested) on the
        log matrix -- transposed for 'features' -- save the scatter plot and
        write the coefficient and point tables.  They then compute and write
        summary statistics and call ``summary_trends``.

        'ratios' builds the feature ratios and calls
        ``predict_known_ratio_values``.

        Fixes in the 'features' branch:
        * the PCA points are written with ``R.data.features`` as row labels;
          the same run is plotted against the features, but the table used
          ``R.data.samples``;
        * the summary-stats progress counter uses the feature count, matching
          the collection that is summarised.

        NOTE(review): the 'features' branch still labels ``pca['coefs']``
        with ``R.data.features`` under the ``PCAcoeffs.features.out`` suffix.
        For a transposed fit the coefficients may be indexed by sample --
        confirm against ``column_coefs.write``.
        """
        R = self.rage

        # R.data.filter_samples_by_attributes().normalize()

        if R.args.command == 'samples':
            R.progress.start_major('SampleSummary')

            R.data.samples.create_plot_labels(R.args)

            if R.args.pca or R.args.tsne:
                R.progress.start_minor('Performing Dimensional Reduction',
                                       len(R.data.samples))
                dim = rage_DR.DR(R.args, R.progress).set_fit_matrix(
                    R.data.matrix('log'))
                pca = dim.pca()
                if R.args.tsne:
                    # Two panels: PCA next to t-SNE.
                    dim_plot = rage_scatterplots.DimR(
                        R.args, R.progress, 1,
                        2).add_dim_run(pca, R.data.samples).add_dim_run(
                            dim.tsne(), R.data.samples).save()
                else:
                    dim_plot = rage_scatterplots.DimR(
                        R.args, R.progress).add_dim_run(pca,
                                                        R.data.samples).save()

                rage_outputs.column_coefs(R.args).write(
                    pca['coefs'], R.data.features, {
                        'suffix': 'PCAcoeffs.features.out',
                        'width': 15
                    })
                rage_outputs.dr_pts(R.args).write(pca['pts'], R.data.samples,
                                                  {'suffix': 'pca.pts.out'})

            R.progress.start_minor('Calculating Summary Stats',
                                   len(R.data.samples))
            sample_stats = summary_hists(R.data.samples, R.data.features,
                                         R.args, R.progress)
            rage_outputs.column_stats(R.args).write(
                sample_stats, R.data.samples, {
                    'suffix': 'samplestats',
                    'width': 15
                })

            sample_trends = summary_trends(R.data.samples, R.data.features,
                                           R.args, R.progress)

        elif R.args.command == 'features':
            R.progress.start_major('FeatureSummary')

            R.data.features.create_plot_labels(R.args)
            if R.args.pca or R.args.tsne:
                R.progress.start_minor('Performing Dimensional Reduction',
                                       len(R.data.features))
                dim = rage_DR.DR(R.args, R.progress).set_fit_matrix(
                    R.data.matrix('log', TRANSPOSE=True))
                pca = dim.pca()
                if R.args.tsne:
                    dim_plot = rage_scatterplots.DimR(
                        R.args, R.progress, 1,
                        2).add_dim_run(pca, R.data.features).add_dim_run(
                            dim.tsne(), R.data.features).save()
                else:
                    dim_plot = rage_scatterplots.DimR(
                        R.args,
                        R.progress).add_dim_run(pca, R.data.features).save()

                rage_outputs.column_coefs(R.args).write(
                    pca['coefs'], R.data.features, {
                        'suffix': 'PCAcoeffs.features.out',
                        'width': 15
                    })
                # Points here are one per feature, so label them with features.
                rage_outputs.dr_pts(R.args).write(pca['pts'], R.data.features,
                                                  {'suffix': 'pca.pts.out'})

            R.progress.start_minor('Calculating Summary Stats',
                                   len(R.data.features))

            feature_stats = summary_hists(R.data.features, R.data.samples,
                                          R.args, R.progress)
            rage_outputs.column_stats(R.args).write(
                feature_stats, R.data.features, {
                    'suffix': 'featurestats.out',
                    'width': 15
                })

            feature_trends = summary_trends(R.data.features, R.data.samples,
                                            R.args, R.progress)

        elif R.args.command == 'ratios':

            feature_comps = rage_comps.features(self.rage).get_f_ratios()

            HOUSEKEEPING, r_key = feature_comps.HOUSEKEEPING, feature_comps.r_key

            feature_comps.predict_known_ratio_values()
示例#13
0
def summary_dists(X, Y, options, progress, X_NAME='SAMPLES'):
    """Plot KDE density curves for the members of X, paired by k-means label.

    Steps, in order:
      1. Draw a "Global Distribution" curve from the log of
         (1 + total counts) of every member of Y.
      2. For every member of X build two scaled log-count vectors (one from
         the observed counts plus a single 0.0, one left-padded with zeros),
         store them in ``x.notes['iter']`` and resample both with
         ``simsample_items``.
      3. Run PCA and k-means on the resampled first vectors, plot the PCA
         points, and highlight the members labelled 0 in yellow.
      4. Walk the label-0 and label-1 members in lockstep and draw one
         panel per member (black = padded vector, cyan = observed vector).
      5. Save the figure to ``options.prefix + 'fig_dists<f_num>.png'``.

    Parameters:
      X        -- indexable sequence of members exposing ``cnts``, ``notes``
                  and ``name``.
      Y        -- iterable of members exposing ``cnts``; must also expose a
                  ``len`` attribute (used to size the zero padding).
      options  -- passed to the subplot / DR helpers; ``options.prefix`` is
                  the output file prefix.
      progress -- progress reporter (``start_major`` / ``mark`` / ``end``).
      X_NAME   -- not used by this function.

    Side effects: changes the global seaborn style, writes
    ``notes['iter']`` and ``notes['km']`` on members of X, and writes an
    image file. Returns None.
    """
    seaborn.set(rc={
        'axes.facecolor': 'lightpink',
        'figure.facecolor': 'lightgray'
    })
    progress.start_major('Plotting Distribution Densities', len(X))
    kde = rage_KDE.samples(0.3)
    f_num = 1
    subplot = rage_subplots.subplot(6, 2, options)
    dr = rage_DR.DR(options, progress)

    # Panel 1: density of the per-member total counts of Y.
    totals = scale_vals([log((1.0 + sum(y.cnts.values()))) for y in Y])
    global_x, global_y = kde.run(totals)
    subplot.add_lines(global_x, global_y, None, None, 'black').update(
        {'title': 'Global Distribution'})

    # Build the two value vectors per member; only the resampled
    # observed-value vector feeds the PCA / k-means matrix.
    sampled_rows = []
    for member in X:
        progress.mark()
        logged = [log(cnt + 1.0) for cnt in member.cnts.values()]
        non_zeros = scale_vals(logged + [0.0])
        zero_pad = [0 for _ in range(Y.len - (1 + len(non_zeros)))]
        all_vals = scale_vals(zero_pad + logged)
        sampled_non_zeros = simsample_items(non_zeros)
        # The resampled padded vector is not used afterwards; the call is
        # kept so the helper is invoked exactly as often as before.
        simsample_items(all_vals)
        member.notes['iter'] = [non_zeros, all_vals]
        sampled_rows.append(sampled_non_zeros)

    r_matrix = np.matrix(sampled_rows)
    pca_run = dr.run_pca(r_matrix)
    kmean_run = dr.run_kmeans(r_matrix)
    subplot.add_pca_data(pca_run['pts'],
                         {'title': 'PCA on binned distribution values'})

    # Record each member's k-means label and highlight label 0 on the PCA.
    for idx, label in enumerate(kmean_run['labels'][0]):
        X[idx].notes['km'] = label
        if label == 0:
            point = pca_run['pts'][idx]
            subplot.ax.scatter(point[0], point[1], color='yellow')
    subplot.update({'clear_axes': True})

    def draw_panel(member):
        # One panel: padded-vector density in black, observed in cyan.
        observed, padded = member.notes['iter']
        pad_x, pad_y = kde.run(padded)
        obs_x, obs_y = kde.run(observed)
        subplot.add_lines(pad_x, pad_y, None, None, 'black')
        subplot.add_lines(obs_x, obs_y, None, None, 'cyan')
        subplot.update({'clear_axes': True, 'title': member.name})

    label_zero = [m for m in X if m.notes['km'] == 0]
    label_one = [m for m in X if m.notes['km'] == 1]

    for first, second in zip(label_zero, label_one):
        # Pairs whose label-1 member is one of these names are skipped.
        if second.name in ('EB321', 'EB1015'):
            continue
        draw_panel(first)
        draw_panel(second)
        f_num += 1
        # NOTE(review): `subplot.update` is referenced here, not called; if
        # it is an ordinary bound method this test is always False and only
        # the f_num limit stops the loop -- confirm the intent.
        if not subplot.update or f_num > 15:
            break

    plt.subplots_adjust(left=0.07,
                        bottom=0.01,
                        right=0.93,
                        top=0.95,
                        wspace=0.2,
                        hspace=0.6)
    subplot.save(options.prefix + 'fig_dists' + str(f_num) + '.png',
                 {'title': 'Dual Dists: '})
    progress.end()
示例#14
0
def make_dr_plots2(R, choice='samples'):
    """Run PCA plus a sweep of ``run_kca`` kernels and plot every result.

    The matrix comes from ``R.data.matrix('log')``; when ``choice`` is
    anything other than ``'samples'`` it is transposed and the plotted
    members are ``R.data.features`` instead of ``R.data.samples``.

    Each run is handed to ``dplot.dimplot(2, 2, ...).add_data`` with an
    ``'out'`` path of the form ``<prefix>_<samples|features>_<suffix>``.
    The RBF kernel is run for gamma = 0.01, 0.005, 0.05, 0.1, 1 and 10,
    followed by the linear, poly, sigmoid and cosine kernels. The last three
    are best-effort: a ``LinAlgError`` raised while running or plotting one
    of them skips that kernel.

    Parameters:
      R      -- run context exposing ``data``, ``args`` (with ``prefix``)
                and ``progress``.
      choice -- ``'samples'`` (default) or any other value for features.

    Returns:
      ``(pca_run, kca_run, kca_lin)`` where ``kca_run`` is the result of the
      last RBF run (gamma=10) and ``kca_lin`` the linear-kernel result.
    """
    if choice == 'samples':
        r_members = R.data.samples
        r_matrix = R.data.matrix('log')
        out_name = R.args.prefix + '_samples_'
    else:
        r_members = R.data.features
        r_matrix = R.data.matrix('log').getT()
        out_name = R.args.prefix + '_features_'

    dr = rage_DR.DR(R.args, R.progress)

    def plot_run(run, plot_opts):
        # Every plot uses a fresh 2x2 dimplot, as in the original stanzas.
        dplot.dimplot(2, 2, R.args, R.progress).add_data(
            r_members, run, plot_opts)

    def kca_opts(title, suffix):
        return {'title': title, 'out': out_name + suffix, 'zoom': True}

    pca_run = dr.run_pca(r_matrix)
    plot_run(pca_run, {'title': 'PCA', 'out': out_name + 'pca.pdf'})

    # (gamma, output suffix) in the order the runs have always been made;
    # the last entry is the run that gets returned.
    rbf_sweep = (
        (0.01, 'kca-01-rbf.pdf'),
        (0.005, 'kca-005-rbf.pdf'),
        (0.05, 'kca-05-rbf.pdf'),
        (0.1, 'kca-p1-rbf.pdf'),
        (1, 'kca-i1-rbf.pdf'),
        (10, 'kca-i10-rbf.pdf'),
    )
    kca_run = None
    for gamma, suffix in rbf_sweep:
        # Plot the run that was just computed. Previously the gamma=0.005
        # result was stored under another name and the gamma=0.01 result
        # was plotted into 'kca-005-rbf.pdf' by mistake.
        kca_run = dr.run_kca(r_matrix, kernel='rbf', gamma=gamma)
        plot_run(kca_run, kca_opts('KCA-rbf', suffix))

    kca_lin = dr.run_kca(r_matrix, kernel='linear')
    plot_run(kca_lin, kca_opts('KCA-linear', 'kca-line.pdf'))

    # Kernels that are allowed to fail numerically: (kernel, title, suffix).
    optional_kernels = (
        ('poly', 'KCA-poly', 'kca-poly.pdf'),
        ('sigmoid', 'KCA-sig', 'kca-sig.pdf'),
        ('cosine', 'KCA-cosine', 'kca-cos.pdf'),
    )
    for kernel, title, suffix in optional_kernels:
        try:
            optional_run = dr.run_kca(r_matrix, kernel=kernel)
            plot_run(optional_run, kca_opts(title, suffix))
        except np.linalg.LinAlgError:
            # Deliberate best-effort: skip this kernel and carry on.
            continue

    return pca_run, kca_run, kca_lin