def run_tsne(self, R):
    """Embed the samples with t-SNE and draw them on a single plot.

    ``R`` provides ``args`` (its ``prefix`` names the output), ``progress``
    and ``data``.  The log matrix from ``R.data.matrix('log')`` is handed to
    ``rage_DR.DR.run_tsne`` and the result is plotted for ``R.data.samples``.
    The output name is ``<prefix>_transformsamples_TSNEtsne.pdf``.
    """
    reducer = rage_DR.DR(R.args, R.progress)
    pdf_name = R.args.prefix + '_transformsamples_TSNE' + 'tsne.pdf'

    R.progress.start_minor('TSNE')
    samples = R.data.samples
    embedding = reducer.run_tsne(R.data.matrix('log'))

    plot_opts = {'title': 'TSNE', 'out': pdf_name}
    canvas = dplot.dimplot(2, 2, R.args, R.progress)
    # add_data's return value is what gets finished, as in the chained form.
    canvas = canvas.add_data(samples, [embedding], plot_opts)
    canvas.finish(pdf_name)
    R.progress.end()
def run_condense_pca(self, R):
    """Fit a PCA on the condensed data, project the full data, plot both.

    Only features whose names occur in both ``R.condensed_data`` and
    ``R.data`` are used.  The PCA is fitted on the condensed counts
    (log-transformed), its coefficients are written with
    ``self.write_coeffs`` to ``<prefix>_transformsamples_coefs.out``, and the
    full data is then run through ``set_y_matrix(..., SET_TRANSFORM=True)``
    (presumably re-using the fitted transform -- confirm in ``rage_DR``).
    Both point sets are drawn on one plot.

    This method does not return: it ends with ``sys.exit()``.
    """
    full = R.data
    condensed = R.condensed_data
    out_name = R.args.prefix + '_transformsamples_'

    # Sets give constant-time membership; the previous version rebuilt a
    # list of every feature name for each condensed feature.
    full_names = set(f.name for f in full.features)
    shared_names = set(
        f.name for f in condensed.features if f.name in full_names)

    cF = [f for f in condensed.features if f.name in shared_names]
    rF = [f for f in full.features if f.name in shared_names]
    # NOTE(review): cF and rF each keep the feature order of their own data
    # set.  If the two orders can differ, the rows of Y and Yc are not
    # aligned feature-by-feature -- confirm against the data loaders.

    # One row per feature, one column per sample.
    Yc = [[s.cnts[f.idx] for s in condensed.samples] for f in cF]
    Y = [[s.cnts[f.idx] for s in full.samples] for f in rF]

    dr = rage_DR.DR(R.args, R.progress)
    R.progress.start_minor('PCA')

    condense_run = dr.set_y_matrix(Yc, LOG_TRANSFORM=True).pca(req='FULL')
    self.write_coeffs(condense_run['coefs'], cF, out_name + 'coefs.out')
    pca_run = dr.set_y_matrix(
        Y, LOG_TRANSFORM=True, SET_TRANSFORM=True).pca(req='FULL')

    # Condensed points first, then the full-data points, on shared axes.
    shared_run = {
        'pts': condense_run['pts'] + pca_run['pts'],
        'axes': condense_run['axes'],
    }
    shared_samples = list(condensed.samples) + list(full.samples)

    dimplot = dplot.dimplot(1, 1, R.args, R.progress)
    dimplot.add_data(shared_samples, [shared_run], {
        'title': 'SHARED_PCA',
        'out': out_name + 'pca.pdf',
    })
    dimplot.finish(out_name + 'pca')
    R.progress.end()
    sys.exit()
def run_iterative_pca(self, R, GROUPS=2):
    """Run sample-wise and feature-wise PCA and plot each against both axes.

    Stores ``R`` and ``R.data`` on ``self`` (``self.R``, ``self.D``), reads
    the ``log`` / ``scale`` / ``std`` flags from ``self.options.notes`` and
    produces four plots:

    * samples in the sample PCA      (``_transformsamples_sample_pca``)
    * features in the sample PCA     (``_transformsamples_feature_pca``)
    * features in the transposed PCA (``_transformfeatures_feature_pca``)
    * samples in the transposed PCA  (``_transformfeatures_samples_pca``)

    ``self.coeff_key`` and ``self.s_key`` are set from the first value
    returned by ``self.write_pca_data``.
    """
    self.R = R
    D, F, S = R.data, R.data.features, R.data.samples
    # One row per feature, one column per sample.
    Y = [[smp.cnts[feat.idx] for smp in D.samples] for feat in D.features]
    self.D = R.data

    # NOTE(review): the GROUPS argument is overwritten here, so the
    # iterative branch below (GROUPS == 200) can never run as written.
    GROUPS = 2

    dr = rage_DR.DR(R.args, R.progress)
    prefix = R.args.prefix
    R.progress.start_minor('PCA')

    notes = self.options.notes
    LT = 'log' in notes
    SC = 'scale' in notes
    STD = 'std' in notes

    if GROUPS == 200:
        iterplot = dplot.iterplot(
            self.R.data.samples, self.R.data.features, R.args, R.progress, 6)

        # (label, keyword arguments) for each chained iterative_run call.
        # Two further stages (5 and 6) were disabled in the original code.
        stages = [
            (1, dict(iteration=1, SELECTION_FRACTION=0.02,
                     MAX_SIZE=2, DIMS=1)),
            (2, dict(iteration=2, SELECTION_FRACTION=0.05,
                     MAX_SIZE=2, DIMS=1, LOWER=True)),
            (3, dict(iteration=2, SELECTION_FRACTION=0.025,
                     MAX_SIZE=3, DIMS=1, LOWEST=True)),
            (4, dict(iteration=2, SELECTION_FRACTION=0.50,
                     MAX_SIZE=2, DIMS=1, TAIL=False)),
        ]

        stage_Y = Y
        stage_names = [smp.name for smp in S]
        # The first stage is plotted against a separately built name list.
        plot_names = [smp.name for smp in self.R.data.samples]
        for label, kwargs in stages:
            stage_pca, next_Y, next_names = self.iterative_run(
                stage_Y, stage_names, **kwargs)
            # Report every name that contains an '@'.
            for name in next_names:
                if len(name.split('@')) > 1:
                    print('%d %s' % (label, name))
            iterplot.add_data(stage_pca, plot_names)
            stage_Y = next_Y
            stage_names = plot_names = next_names

        plt.savefig('PCA_ITERATIVE.pdf')

    def draw(members, run, show_names, out_path):
        # A fresh dim_comps list is built for every plot.
        pairs = [(0, 1), (2, 3), (4, 5), (6, 7)]
        canvas = dplot.dimplot(R.args, R.progress)
        canvas = canvas.add_data(
            members, [run], dim_comps=pairs, NAMES=show_names)
        return canvas.finish(out_path)

    # --- PCA with features as rows --------------------------------------
    dr.set_y_matrix(Y, LOG_TRANSFORM=LT, SCALE=SC, STD_SCALE=STD)
    pca_run = dr.pca(req='FULL')
    self.coeff_key, _ = self.write_pca_data(
        pca_run, S, F, prefix + '_sampletransform_')
    draw(R.data.samples, pca_run, False,
         prefix + '_transformsamples_sample_pca')

    # Re-use the same run dict with per-feature points taken from coeff_key.
    pca_run['pts'] = [[entry[1] for entry in self.coeff_key[feat.name]]
                      for feat in R.data.features]
    draw(R.data.features, pca_run, True,
         prefix + '_transformsamples_feature_pca')

    # --- PCA on the transposed matrix -----------------------------------
    dr.set_y_matrix(Y, TRANSPOSE=True, SCALE=True)
    pca_tran = dr.pca(req='FULL')
    self.s_key, _ = self.write_pca_data(
        pca_tran, F, S, prefix + '_featuretransform_')
    draw(R.data.features, pca_tran, True,
         prefix + '_transformfeatures_feature_pca')

    pca_tran['pts'] = [[entry[1] for entry in self.s_key[smp.name]]
                       for smp in R.data.samples]
    draw(R.data.samples, pca_tran, False,
         prefix + '_transformfeatures_samples_pca')
def run_pca(self, R):
    """Run PCA and t-SNE on the samples; write coefficients, points, plots.

    Sets ``self.LT`` / ``self.SC`` / ``self.STD`` from ``self.options.notes``
    (``scale`` wins over ``std``), builds ``self.out_name`` / ``self.plt_name``
    from the prefix, the colour/marker options and the transform tags, and
    stores ``self.D``, ``self.S`` and the count matrix ``self.Y`` (one row
    per feature, one column per sample).

    If ``self.options.coeffs`` is truthy the work is delegated to
    ``self.precomp_pca`` and nothing else is done here.  Otherwise these
    files are written, followed by the sample PCA and t-SNE plots:

    * ``<out_name>pca_coefs.out``  -- per feature, (rank, value) pairs
    * ``<out_name>pca_pts.out``    -- first five PCA coordinates per sample
    * ``<out_name>tsne_pts.out``   -- two t-SNE coordinates per sample
    """
    notes = self.options.notes
    self.LT, self.SC, self.STD = False, False, False
    ttype, cstr = '_', '_'

    if len(self.options.color) > 0:
        cstr += '-'.join(self.options.color)
    if self.options.marker is not None:
        cstr += '-' + self.options.marker

    if 'log' in notes:
        self.LT = True
        ttype += 'LOG_'
    if 'scale' in notes:
        self.SC = True
        ttype += 'SCALE_'
    elif 'std' in notes:
        self.STD = True
        ttype += 'STD_'
    if len(ttype) < 3:
        # No transform was requested.
        ttype = '_RAW'

    self.out_name = R.args.prefix + '_' + cstr + '_transformsamples' + ttype
    self.plt_name = R.args.prefix + '_' + cstr + '_transformsamples' + ttype
    self.D = R.data
    self.S = self.D.samples
    self.Y = [[s.cnts[f.idx] for s in self.D.samples]
              for f in self.D.features]

    if self.options.coeffs:
        self.precomp_pca(R, self.options.coeffs)
        return

    dr = rage_DR.DR(R.args, R.progress)
    R.progress.start_minor('PCA')
    dr.set_y_matrix(self.Y, LOG_TRANSFORM=self.LT, SCALE=self.SC,
                    STD_SCALE=self.STD)
    pca_run = dr.pca(req='FULL')

    # Feature name -> [(position within the component, value), ...], one
    # pair per component.  Each coefficient entry is a 3-tuple whose second
    # element is the stored value and whose third indexes self.D.features.
    F_key = dd(list)
    for component in pca_run['coefs']:
        for rank, (_, value, feat_idx) in enumerate(component):
            F_key[self.D.features[feat_idx].name].append((rank, value))

    # 'with' closes the file even if a write raises.
    with open(self.out_name + 'pca_coefs.out', 'w') as w:
        w.write("%-50s %5s %10s %5s %10s %5s %10s\n"
                % ('---', 'R1', 'V1', 'R2', 'V2', 'R3', 'V3'))
        for name, pairs in F_key.items():
            w.write("%-50s" % (name))
            for rank, value in pairs:
                w.write(" %5d %10f" % (rank, value))
            w.write('\n')

    with open(self.out_name + 'pca_pts.out', 'w') as w:
        # Header previously used "%10ss", which printed "PC3s".
        w.write("%-50s %10s %10s %10s %10s %10s \n"
                % ('---', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5'))
        for p, s in zip(pca_run['pts'], R.data.samples):
            w.write("%-50s %10f %10f %10f %10f %10f\n"
                    % (s.name, p[0], p[1], p[2], p[3], p[4]))

    dplot.dimplot(R.args, R.progress).add_data(
        R.data.samples, [pca_run],
        dim_comps=[(0, 1), (2, 3), (4, 5), (6, 7)],
        NAMES=False).finish(self.plt_name + '_sample_pca')
    dplot.dimplot(R.args, R.progress).add_data(
        R.data.samples, [pca_run],
        dim_comps=[(8, 9), (10, 11), (12, 13), (14, 15)],
        NAMES=False).finish(self.plt_name + '_sample_hipca')
    R.progress.end()

    tsne_run = dr.tsne()
    with open(self.out_name + 'tsne_pts.out', 'w') as w:
        w.write("%-50s %10s %10s \n" % ('---', 'TS1', 'TS2'))
        for p, s in zip(tsne_run['pts'], R.data.samples):
            w.write("%-50s %10f %10f \n" % (s.name, p[0], p[1]))

    dplot.dimplot(R.args, R.progress).add_data(
        R.data.samples, [tsne_run], dim_comps=[(0, 1)],
        NAMES=False).finish(self.plt_name + '_sample_tsne')
    # NOTE(review): second end() with only one start_minor() above; kept
    # as in the original -- confirm against the progress API.
    R.progress.end()
def precomp_pca(self, R, coeffs, PLEN=1500, MAX_COEFS=8):
    """Project samples onto precomputed PCA coefficients, plot, then exit.

    ``coeffs`` is an iterable of whitespace-separated lines: a feature name
    followed by (rank, value) pairs, one pair per component.  Lines whose
    first field is ``---`` (the header) and blank lines are skipped.  Only
    values up to column 40 are read.  Features of ``self.D`` missing from
    the first component get a coefficient of 0.0 in every component.

    For every sample in ``self.S`` four projections are computed per
    component -- the sum of ``coefficient * log2(count)`` (LOG) or
    ``coefficient * count`` (RAW), over all of the sample's counts (DOT) or
    over its ``prj_len`` largest counts (PRJ).  Points are written to
    ``<plt_name>_<KIND>_pca_proj.pts``, plotted, run through t-SNE, written
    to ``<plt_name>_<KIND>_tsne_proj.pts`` and plotted again.

    ``PLEN`` is the lower bound of ``prj_len``, which is
    ``max(smallest per-sample count table, PLEN)``.  ``MAX_COEFS`` is
    accepted for compatibility but not used.

    Side effects: appends ``_projected_<prj_len>`` to ``self.plt_name`` and
    terminates the process with ``sys.exit()``.
    """
    features = self.D.features

    # component index -> {feature name: coefficient}
    coeff_key = dd(dict)
    for line in coeffs:
        fields = line.split()
        if not fields or fields[0] == '---':
            continue
        # Values sit in the even columns 2, 4, ...; floor division keeps the
        # component keys integral under both Python 2 and 3.
        for col in range(2, len(fields), 2):
            coeff_key[col // 2 - 1][fields[0]] = float(fields[col])
            if col >= 40:
                break

    for f in features:
        if f.name not in coeff_key[0]:
            for table in coeff_key.values():
                table[f.name] = 0.0
    n_comps = len(coeff_key)

    # sample -> [[count, feature name, [coefficient per component]], ...],
    # largest count first.
    projection_key = {}
    for s in self.S:
        rows = [[c, features[i].name,
                 [coeff_key[n][features[i].name] for n in range(n_comps)]]
                for i, c in s.cnts.items()]
        projection_key[s] = sorted(rows, reverse=True)

    # NOTE(review): max() makes PLEN a lower bound, not a cap -- confirm
    # that this is the intended projection length.
    prj_len = max(min([len(X) for X in projection_key.values()]), PLEN)
    self.plt_name += '_projected_' + str(prj_len)

    pca_key = dd(list)
    for s in self.S:
        rows = projection_key[s]
        LK_DATA = [[pk[2][n] * log(pk[0], 2) for pk in rows]
                   for n in range(n_comps)]
        # RAW uses the untransformed count.  This line used to be a copy of
        # the LOG expression, which made RAW and LOG outputs identical.
        RK_DATA = [[pk[2][n] * pk[0] for pk in rows]
                   for n in range(n_comps)]
        pca_key['LOGDOT'].append([sum(lk) for lk in LK_DATA])
        pca_key['RAWDOT'].append([sum(rk) for rk in RK_DATA])
        pca_key['LOGPRJ'].append([sum(lk[0:prj_len]) for lk in LK_DATA])
        pca_key['RAWPRJ'].append([sum(rk[0:prj_len]) for rk in RK_DATA])

    def as_run(kind):
        # Package one projection the way the plotting code expects it.
        return {'pts': pca_key[kind],
                'axes': ['PC' + str(x + 1) + '-' + kind
                         for x in range(n_comps)]}

    rawdot, logdot = as_run('RAWDOT'), as_run('LOGDOT')
    rawprj, logprj = as_run('RAWPRJ'), as_run('LOGPRJ')

    for kp, kpts in pca_key.items():
        # 'with' closes the file even if a write raises.
        with open(self.plt_name + '_' + kp + '_pca_proj.pts', 'w') as w:
            # Header previously used "%10ss", which printed "PC3s".
            w.write("%-50s %10s %10s %10s %10s %10s \n"
                    % ('---', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5'))
            for s, p in zip(self.S, kpts):
                w.write("%-50s %10f %10f %10f %10f %10f\n"
                        % (s.name, p[0], p[1], p[2], p[3], p[4]))

    for dc in [(0, 1), (2, 3)]:
        p_name = self.plt_name + '_' + '-'.join([str(ss) for ss in dc]) + '_'
        dplot.dimplot(R.args, R.progress).add_data(
            self.S, [rawdot, logdot, rawprj, logprj],
            dim_comps=[dc, dc, dc, dc],
            NAMES=False).finish(p_name + 'pca')
        dplot.dimplot(R.args, R.progress).add_data(
            self.S, [rawdot, logdot, rawprj, logprj],
            dim_comps=[dc, dc, dc, dc],
            NAMEOUTLIERS=True).finish(p_name + 'exnamed_pca')

    dr = rage_DR.DR(R.args, R.progress)
    tsne_key = {k: dr.tsne(pca_pts=vals, axes_prefix=k)
                for k, vals in pca_key.items()}
    t_runs = [tsne_key['RAWDOT'], tsne_key['LOGDOT'],
              tsne_key['RAWPRJ'], tsne_key['LOGPRJ']]

    for kp, run in tsne_key.items():
        with open(self.plt_name + '_' + kp + '_tsne_proj.pts', 'w') as w:
            w.write("%-50s %10s %10s\n" % ('---', 'TSNE1', 'TSNE2'))
            for s, p in zip(self.S, run['pts']):
                w.write("%-50s %10f %10f\n" % (s.name, p[0], p[1]))

    dplot.dimplot(R.args, R.progress).add_data(
        self.S, t_runs, dim_comps=[(0, 1), (0, 1), (0, 1), (0, 1)],
        NAMES=False).finish(self.plt_name + '_tsne')
    dplot.dimplot(R.args, R.progress).add_data(
        self.S, t_runs, dim_comps=[(0, 1), (0, 1), (0, 1), (0, 1)],
        NAMEOUTLIERS=True).finish(self.plt_name + '_exnamed_tsne')
    R.progress.end()
    sys.exit()