def get_tf_ss(cluster=0, trgnames=None, basic=False):
    """Return the names of the TFs associated with a set of targets.

    Two modes are supported:

    * ``basic=True`` -- walk the first 50 keys of the target dict returned
      by ``nu.parse_net()`` and collect every entry of their ``'tfs'``
      lists, without duplicates, in first-seen order.  ``trgnames`` is not
      consulted.
    * ``basic=False`` (default) -- binarize ``nu.net_affinity()`` at ``> 0``,
      keep the rows belonging to ``trgnames`` (row numbers come from
      ``nu.net_trg_keyidxs()``), average over those rows and return the
      name of every column whose mean exceeds the module-level
      ``min_tf_perc``.

    Parameters:
        cluster: accepted for call compatibility; not read by this function.
        trgnames: iterable of target names; required unless ``basic`` is set.
        basic: select the simple "first 50 targets" mode.

    Returns:
        list of TF names.

    Raises:
        TypeError: if ``trgnames`` is None in the default mode.
    """
    if basic:
        trgs, _ = nu.parse_net()
        tfl = []
        seen = set()  # O(1) duplicate check; tfl keeps first-seen order
        for key in list(trgs.keys())[:50]:
            for tf_name in trgs[key]['tfs']:
                if tf_name not in seen:
                    seen.add(tf_name)
                    tfl.append(tf_name)
        return tfl

    if trgnames is None:
        # Previously surfaced as "'NoneType' object is not iterable".
        raise TypeError('trgnames is required unless basic=True')

    min_regs = min_tf_perc
    nat = greater(nu.net_affinity(), 0)
    tgkeys = nu.net_trg_keyidxs()
    tg_sub = nat[[tgkeys[k] for k in trgnames], :]
    # Per column: fraction of the selected targets with a non-zero entry.
    mem_means = mean(array(tg_sub, float), 0)

    # Invert name -> column index so columns can be mapped back to names.
    tfkeys = nu.net_tf_keyidxs()
    names_by_idx = [''] * len(tfkeys)
    for tf_name, idx in tfkeys.items():
        names_by_idx[idx] = tf_name

    return [names_by_idx[n] for n in nonzero(greater(mem_means, min_regs))[0]]
def non_normal_cluster_expr(trg_ssnames, tf_ssnames, ctype=False, random_tfs=False):
    """Load expression for targets and TFs and z-score each gene's profile.

    For every name, the profile is ``nu.load_TS()[name]``; when ``ctype`` is
    set, ``nu.load_CL()[name]`` is combined with it using ``+``.  Profiles
    are stacked one per column, then each column has its mean subtracted
    and is divided by its standard deviation.

    Parameters:
        trg_ssnames: target names.
        tf_ssnames: TF names; replaced by a random draw if ``random_tfs``.
        ctype: also include the CL series.
        random_tfs: draw ``len(tf_ssnames)`` TF names uniformly, with
            replacement, from the keys of ``nu.net_tf_keyidxs()``.

    Returns:
        ``[tg_vals, tf_vals]``, each with one column per name.

    Note:
        A constant profile has zero standard deviation and will produce
        inf/NaN in the output; no guard is applied here.
    """
    if random_tfs:
        tf_names = list(nu.net_tf_keyidxs().keys())
        # randint excludes the upper bound.  The previous random_integers
        # call included it and could therefore index one past the end.
        picks = np.random.randint(0, len(tf_names), len(tf_ssnames))
        print('Randomizing TFs')
        tf_ssnames = [tf_names[i] for i in picks]

    # Return value unused; the call is kept in case parse_net has side
    # effects the loaders rely on -- TODO confirm and drop.
    nu.parse_net()
    ts = nu.load_TS()
    if ctype:
        cl = nu.load_CL()
        tf_vals = array([ts[k] + cl[k] for k in tf_ssnames]).T
        tg_vals = array([ts[k] + cl[k] for k in trg_ssnames]).T
    else:
        tf_vals = array([ts[k] for k in tf_ssnames]).T
        tg_vals = array([ts[k] for k in trg_ssnames]).T

    # Column-wise standardization (axis 0 runs along each gene's profile).
    for vals in (tf_vals, tg_vals):
        vals -= np.mean(vals, 0)
        vals /= np.std(vals, 0)
    return [tg_vals, tf_vals]
def all_siblings(name=default_name, reset=0):
    """Return the boolean target-by-target "sibling" matrix, cached by name.

    Unless ``reset`` is truthy, the matrix is first requested from
    ``nw.rn2(name, ...)``, which yields ``(value, success)``.  On a miss (or
    when resetting) it is rebuilt: ``nu.net_affinity()`` is binarized at
    ``> min_thr``, every row is scaled to unit length (all-zero rows are
    left as zeros), the row-by-row dot products are taken, and entries
    greater than ``sib_thr`` are marked True.  The result is written back
    with ``nw.wn2(name, ...)``.

    Parameters:
        name: cache key.  This used to be ignored in favour of
            ``default_name`` for both the read and the write.
        reset: truthy forces a rebuild; ``reset % 2`` is forwarded as the
            ``reset`` argument of ``nu.parse_net`` and ``nu.net_affinity``.

    Returns:
        2-D boolean array.
    """
    use_np = True
    hardcopy = True

    out, cached_ok = None, False
    if not reset:
        out, cached_ok = nw.rn2(name, np=use_np, hardcopy=hardcopy)

    if reset or not cached_ok:
        # Result unused; called for whatever the reset flag triggers.
        nu.parse_net(reset=mod(reset, 2))
        na = nu.net_affinity(reset=mod(reset, 2))

        linked = array(greater(na, min_thr), float)
        norms = sqrt(linked.sum(1))[:, newaxis]  # entries are 0/1, so x**2 == x
        norms[equal(norms, 0)] = 1  # avoid 0/0 for rows with no links
        unit_rows = linked / norms

        similarity = dot(unit_rows, unit_rows.T)
        out = array(greater(similarity, sib_thr), bool)
        nw.wn2(name, out, np=use_np, hardcopy=hardcopy)
    return out
def coreg_keys(t0=None, do_plot=False):
    """Return targets that share a large part of ``t0``'s regulators.

    Candidates are the keys of the target blanket of ``t0`` (from
    ``trg_markov_blanket``) whose weight exceeds 0.3, excluding ``t0``
    itself.  For each candidate, its row of ``nu.net_affinity()`` and the
    row of ``t0`` are binarized at ``> 0.3``, scaled to unit length and
    multiplied; candidates whose dot product exceeds 0.4 are returned.

    Parameters:
        t0: name of the reference target.
        do_plot: also call ``show_m`` on the blanket and ``plot_shared`` on
            the similarity vector.

    Returns:
        list of target names.  (Earlier versions computed this list but
        fell off the end of the function and returned None.)

    Raises:
        ValueError: from ``list.remove`` if ``t0`` is not among the
            thresholded blanket keys.
    """
    min_wt = .3
    min_sharing = .4

    btgs, btfs = trg_markov_blanket(t0)
    if do_plot:
        show_m(btgs, btfs, t0)

    keys_thr = [key for key, weight in btgs.items() if weight > min_wt]
    keys_thr.remove(t0)

    na = nu.net_affinity()
    ktg = nu.net_trg_keyidxs()

    def unit_profile(key):
        # Fresh float copy, so the in-place scaling never touches `na`.
        row = array(greater(na[ktg[key]], min_wt), float)
        length = sqrt(sum(power(row, 2)))
        if length != 0:
            row /= length
        return row

    ref = unit_profile(t0)  # loop-invariant, computed once
    shared = array([sum(ref * unit_profile(key)) for key in keys_thr])

    selected = [keys_thr[i] for i in nonzero(greater(shared, min_sharing))[0]]
    if do_plot:
        plot_shared(shared)
    return selected
def trg_markov_blanket(tkey, tgs=True, do_tgs=None):
    """Return the weighted TF and target neighbourhood of one target.

    ``b_tfs`` maps each entry of ``trgs[tkey]['tfs']`` to the matching
    entry of ``trgs[tkey]['weights']``.

    ``b_tgs`` (only when the target side is requested) maps every target
    reachable through one of those TFs -- plus ``tkey`` itself with weight
    1.0 -- to ``1 - prod(1 - w_i)``, where each ``w_i`` is the product of
    the ``tkey``-to-TF weight and the TF-to-target weight of one path.

    Parameters:
        tkey: key into the target dict returned by ``nu.parse_net()``.
        tgs: whether to build the target side.
        do_tgs: alias for ``tgs``; when not None it takes precedence.  The
            body used to read ``do_tgs`` without it being a parameter, which
            raised NameError unless such a global happened to exist.

    Returns:
        ``(b_tgs, b_tfs)``; ``b_tgs`` is ``{}`` when the target side is off.
    """
    if do_tgs is None:
        do_tgs = tgs

    trgs, tfs = nu.parse_net()
    entry = trgs[tkey]
    tf_names, tf_weights = entry['tfs'], entry['weights']
    b_tfs = dict(zip(tf_names, tf_weights))

    if not do_tgs:
        return {}, b_tfs

    # Accumulate prod(1 - w) per target; tkey starts at 1 - 1.0 = 0.
    miss = {tkey: 0.0}
    for tf_name, w0 in zip(tf_names, tf_weights):
        tf_entry = tfs[tf_name]
        for target, w in zip(tf_entry['targets'], tf_entry['weights']):
            miss[target] = miss.get(target, 1.0) * (1.0 - w * w0)

    b_tgs = dict((target, 1.0 - value) for target, value in miss.items())
    return b_tgs, b_tfs
def get_trg_ss(cluster=0):
    """Return the target names belonging to one sibling cluster.

    Row ``cluster`` of ``s3.sib_lists()`` is read as a mask; the positions
    of its non-zero entries are used to pick names out of the key list of
    ``nu.net_trg_keyidxs()``.

    Parameters:
        cluster: row of the sibling array to read.

    Returns:
        non-empty list of target names.

    Raises:
        Exception: if the selected row has no non-zero entries.
    """
    # Return value unused; kept in case parse_net has side effects the
    # calls below depend on -- TODO confirm and drop.
    nu.parse_net()

    sib_arr = s3.sib_lists()
    members = nonzero(sib_arr[cluster])[0]

    kidxs = nu.net_trg_keyidxs()
    # NOTE(review): names are picked by *position* in keys(), whereas
    # get_tf_ss inverts the name -> index mapping explicitly.  These agree
    # only if key order matches the stored indices -- confirm.
    trgl = list(array(list(kidxs.keys()))[members])

    if not trgl:
        raise Exception('cluster %r has no member targets' % (cluster,))
    return trgl
def predict(reset=0):
    """Plot sorted in- and out-degree curves of the network, then abort.

    The in-degree of a target is the length of its ``'tfs'`` list and the
    out-degree of a TF is the length of its ``'targets'`` list, both taken
    from ``nu.parse_net(reset=reset % 2)``.  The two curves are drawn in
    ascending order on subplots 211 and 212 of figure 1.

    Raises:
        Exception: unconditionally, once both curves have been drawn.
    """
    targets, factors = nu.parse_net(reset=mod(reset, 2))

    in_degree = array(
        [len(targets[name]['tfs']) for name in targets.keys()], float)
    out_degree = array(
        [len(factors[name]['targets']) for name in factors.keys()], float)

    figure = plt.figure(1)
    figure.clear()
    for position, degrees in ((211, in_degree), (212, out_degree)):
        panel = figure.add_subplot(position)
        panel.plot(degrees[argsort(degrees)])

    raise Exception()
def viewclusters(cands, fig=5):
    """Plot the standardized TF expression of the first candidate cluster.

    Figures ``fig`` and ``fig + 1`` are cleared and each given one subplot;
    only the first is drawn on.  For the first element of ``cands`` the
    cluster's targets and TFs are looked up, the TF profiles from
    ``nu.load_TS()`` are centred and scaled first per row and then per
    column, and the result is plotted.  The loop stops after one candidate.

    Raises:
        Exception: ``'only one tf...'`` if any row has zero spread after
            centring.
    """
    main_figure = plt.figure(fig)
    main_figure.clear()
    main_axes = main_figure.add_subplot(111)

    # The second figure is prepared but nothing is drawn on it.
    side_figure = plt.figure(fig + 1)
    side_figure.clear()
    side_axes = side_figure.add_subplot(111)

    for candidate in cands:
        target_names = get_trg_ss(cluster=candidate)
        factor_names = get_tf_ss(trgnames=target_names)

        net = nu.parse_net()
        series = nu.load_TS()
        expr = array([series[name] for name in factor_names]).T
        # Built exactly as before although it is not used afterwards.
        target_expr = array([series[name] for name in target_names]).T

        # Pass 1: centre and scale every row.
        row_mean = np.mean(expr, 1)
        expr -= row_mean[:, newaxis]
        row_std = np.std(expr, 1)
        flat_rows = nonzero(equal(row_std, 0))[0]
        if len(flat_rows):
            raise Exception('only one tf...')
        expr /= row_std[:, newaxis]

        # Pass 2: centre and scale every column in place (columns are views).
        for column in expr.T:
            column -= mean(column)
            column /= std(column)

        main_axes.plot(expr)
        break
def normalize_cluster_expr(trg_ssnames, tf_ssnames, ctype=False, random_tfs=False):
    """Load expression for targets and TFs and standardize rows, then columns.

    For every name, the profile is ``nu.load_TS()[name]``; when ``ctype`` is
    set, ``nu.load_CL()[name]`` is combined with it using ``+``.  Profiles
    are stacked one per column.  Each matrix is then centred and scaled
    along axis 1 (across the names in one row) and, after that, along
    axis 0 (each name's own column).

    Parameters:
        trg_ssnames: target names.
        tf_ssnames: TF names; replaced by a random draw if ``random_tfs``.
        ctype: also include the CL series.
        random_tfs: draw ``len(tf_ssnames)`` TF names uniformly, with
            replacement, from the keys of ``nu.net_tf_keyidxs()``.

    Returns:
        ``[tg_vals, tf_vals]``.

    Raises:
        Exception: ``'only one tf...'`` if any row has zero spread across
            names.  This always happens when a group has a single name.
    """
    if random_tfs:
        tf_names = list(nu.net_tf_keyidxs().keys())
        # randint excludes the upper bound.  The previous random_integers
        # call included it and could therefore index one past the end.
        picks = np.random.randint(0, len(tf_names), len(tf_ssnames))
        print('Randomizing TFs')
        tf_ssnames = [tf_names[i] for i in picks]

    # Return value unused; the call is kept in case parse_net has side
    # effects the loaders rely on -- TODO confirm and drop.
    nu.parse_net()
    ts = nu.load_TS()
    if ctype:
        cl = nu.load_CL()
        tf_vals = array([ts[k] + cl[k] for k in tf_ssnames]).T
        tg_vals = array([ts[k] + cl[k] for k in trg_ssnames]).T
    else:
        tf_vals = array([ts[k] for k in tf_ssnames]).T
        tg_vals = array([ts[k] for k in trg_ssnames]).T

    all_exprs = []
    for vals in (tg_vals, tf_vals):
        # Row pass: compare names against each other within one sample.
        vals -= np.mean(vals, 1)[:, newaxis]
        row_std = np.std(vals, 1)
        if len(nonzero(equal(row_std, 0))[0]):
            raise Exception('only one tf...')
        vals /= row_std[:, newaxis]

        # Column pass: each column is a view, so this edits vals in place.
        for col in vals.T:
            col -= mean(col)
            col /= std(col)
        all_exprs.append(vals)
    return all_exprs
def viewmany(all_means, all_clusters, fig=12):
    """Project cluster means onto TFs and plot a summary per clustering.

    For every clustering ``i`` the ``(k, dim)`` mean matrix is turned into a
    ``(k, ntf)`` matrix of absolute TF loadings:

    * ``dim > 8000`` (gene space): means are multiplied with the TF rows of
      the transposed square affinity matrix;
    * ``500 < dim <= 8000`` (TF space): the TF columns are read off
      directly.

    Each mean's loadings are normalized to sum to one, summed over means,
    and the ``ntf_each`` (20) highest-scoring TFs are flagged.  The union of
    flagged TFs over all clusterings is plotted on the lower axes, one
    curve per clustering.

    Parameters:
        all_means: sequence of 2-D arrays, one per clustering; all are
            assumed to have the same number of rows as the first.
        all_clusters: not read by this function.
        fig: matplotlib figure number; the figure is cleared first.

    Returns:
        ``TFprojs``, array of shape ``(n, k, ntf)``.

    Raises:
        ValueError: if a mean matrix has 500 or fewer columns.
    """
    n = len(all_means)
    f = plt.figure(fig)
    f.clear()
    print('Running viewmany.py For now, viewmany assumes that k is equal '
          'across clustering instances this is not really important but has '
          'to do with how TF projections are stored. ')
    k = len(all_means[0])

    ax1 = f.add_axes([.05, .05, .95, .4])
    f.add_axes([.05, .55, .95, .4])  # upper axes: created, not drawn on

    sqa = nu.net_square_affinity()[0]
    # tf_sqidxs[i] is the coordinate of the i'th tf in sqa space.
    sqidxs = nu.net_sq_keyidxs()
    n_tfidxs = nu.net_tf_keyidxs()
    trgs, tfs = nu.parse_net()
    tf_sqidxs = [sqidxs[key] for key in tfs.keys()]
    tfidxs = list(n_tfidxs.values())
    ntf = len(tfidxs)
    tfweights = zeros(ntf, int)

    # Each clustering may nominate at most this many TFs.
    ntf_each = 20
    print('...Computing representative TFs for each clustering. In the '
          'current formulation, we project each mean on to associated tf '
          'and then normalize each projection so that each mean has equal '
          'weight in TF selection. Not that we have handled the case where '
          'we have clusted in TF space explicitly (e.g, dim = 541) and '
          'where we are in gene space explicitly, (e.g., dim = 8321, GG '
          'matrix or svdU). svdV is emphatically not handled. Neither would '
          'svdU of TF-TF which is actually the the exact same thing.')

    TFprojs = zeros((n, k, ntf))
    tf_rows = None  # TF rows of sqa.T, built on first use
    for i in range(n):
        m = all_means[i]
        dim = shape(m)[1]
        # The larger threshold must be tested first: with `dim > 500`
        # leading, the gene-space branch could never run.
        if dim > 8000:
            if tf_rows is None:
                tf_rows = sqa.T[tf_sqidxs, :]
            # sum_g tf_rows[t, g] * m[c, g], without the (k, ntf, dim)
            # temporary of the broadcast form.
            loadings = np.abs(np.dot(m, tf_rows.T))
        elif dim > 500:
            loadings = np.abs(m[:, tfidxs])
        else:
            raise ValueError(
                'viewmany cannot project means with dim = %d' % dim)
        TFprojs[i, :, :] = loadings

        # Give each mean the same total weight, then pool over means.
        per_mean = loadings / np.sum(loadings, 1)[:, newaxis]
        score = np.sum(per_mean, 0)
        best = argsort(score)[::-1]
        tfweights[best[0:ntf_each]] = 1
    print('Finished computing representative TFs ')

    tfs_of_interest = nonzero(tfweights)[0]
    ntf = len(tfs_of_interest)
    avg_shared = 1. - float(ntf) / (n * ntf_each)
    # The original quoting printed "'+str(ntf_each) + '" literally.
    print('Allowing for each cluster to choose ' + str(ntf_each)
          + ' tfs, we got ' + str(ntf)
          + ' tfs of interest. or a mean sharing ratio of '
          + str(round(avg_shared, 3)) + '.')

    # One colour per clustering.
    ct = mycolors.getct(n)
    xax = linspace(0, 1, ntf)
    for i in range(n):
        # An integer and an array index separated by a slice: NumPy moves
        # the indexed (TF) axis to the front, so `chosen` is (ntf, k).
        chosen = TFprojs[i, :, tfs_of_interest]
        ax1.plot(xax, np.sum(chosen, 1) / np.max(chosen, 1), color=ct[i])
    return TFprojs
def _order_by_peak(img, by_rows):
    """Reorder rows (or columns) of ``img`` by where each one peaks."""
    if by_rows:
        return img[np.argsort(argmax(img, 1)), :]
    return img[:, np.argsort(argmax(img, 0))]


def _membership_image(mems, k, ng):
    """Return a (k, ng) 0/1 image with a 1 at (mems[g], g) for each g."""
    img = zeros((k, ng))
    for g in range(ng):
        img[mems[g], g] = 1
    return img


def one(all_means, all_mems, tfp, axis='tf', idxs=(0, 1), fig=5,
        choice_ax='x', nrml='axis', sorting='axis'):
    """Overlay two clusterings as a blue/red image and mark their maxima.

    The clusterings ``idxs[0]`` (blue) and ``idxs[1]`` (red) are rendered
    either from their TF projections ``tfp[idx]`` (``axis='tf'``) or as
    cluster-membership images (``axis='gene'``).  Both images are scaled,
    combined into one RGB image and shown on figure ``fig``; the largest
    entry of each row or column is then marked with scatter points.

    Parameters:
        all_means: sequence of 2-D mean matrices, one per clustering.
        all_mems: sequence of membership vectors, one per clustering.
        tfp: array indexed as ``tfp[clustering, :, :]``.
        axis: ``'tf'`` or ``'gene'``.
        idxs: the two clusterings to compare.
        fig: matplotlib figure number; the figure is cleared first.
        choice_ax: ``'x'`` marks the largest row in every column, ``'y'``
            the largest column in every row.
        nrml, sorting: accepted but currently overridden by hard-coded
            values (``'axis'`` and ``'other'``) inside the function.

    Raises:
        Exception: ``'bad axis'`` for an unknown ``axis`` or ``choice_ax``.

    Note:
        A row/column whose maximum is zero produces NaN during scaling; no
        guard is applied.
    """
    if axis not in ('tf', 'gene') or choice_ax not in ('x', 'y'):
        # Previously an unknown `axis` surfaced as a NameError on `img`.
        raise Exception('bad axis')

    m = all_means[idxs[0]]
    c = all_mems[idxs[0]]
    proj = abs(tfp[idxs[0], :, :])
    m2 = all_means[idxs[1]]
    c2 = all_mems[idxs[1]]
    proj2 = abs(tfp[idxs[1], :, :])

    trgs, tfs = nu.parse_net()
    k = len(m)
    ng = len(trgs)

    print('Getting ready to plot clusters mapped on to tf components. '
          '--note-- In its current incarnation, netutils orders tfs by '
          'their out degree and genes by their in degree. Thus viewmany() '
          'orders projects by TF out degree. Left unsorted, this is the '
          'order of the TF x axis.')

    # How to normalize the image?
    #   'axis': equal peak for each tf over all clusters.
    #   other : equal peak for each cluster in img.
    # NOTE(review): both settings shadow the parameters of the same name.
    # Honouring the parameters would change default behaviour (sorting
    # defaults to 'axis' but 'other' is what runs), so this is left as is.
    nrml = 'axis'
    sorting = 'other'
    print(axis)

    d0 = shape(m)[1]
    d2 = shape(m2)[1]
    show_membership = True

    if axis == 'tf':
        by_rows = (sorting == 'axis')
        img = _order_by_peak(proj, by_rows)
        img2 = _order_by_peak(proj2, by_rows)
    else:  # axis == 'gene'
        maxgene = 200
        gsort = argsort(c)
        if d0 == 8321 and not show_membership:
            img = m[:, gsort][:, :maxgene]
        else:
            img = _membership_image(c, k, ng)
        if d2 == 8321 and not show_membership:
            img2 = m2[:, gsort][:, :maxgene]
        else:
            img2 = _membership_image(c2, k, ng)

    # Normalize to generate an image.
    if nrml == 'axis':
        img2 = img2 / np.max(img2, 0)[newaxis, :]
        img = img / np.max(img, 0)[newaxis, :]
    else:
        img2 = img2 / np.max(img2, 1)[:, newaxis]
        img = img / np.max(img, 1)[:, newaxis]
    img = img / np.max(img)
    img2 = img2 / np.max(img2)  # was divided by np.max(img) by mistake

    # First clustering in the blue channel, second in the red channel.
    img_show = img[:, :, newaxis] * [0, 0, 1] + img2[:, :, newaxis] * [1, 0, 0]
    f = plt.figure(fig)
    f.clear()
    ax = f.add_axes([.05, .05, .9, .9])
    ax.imshow(img_show[:, :, :], aspect='auto')

    nchoice = 1
    if choice_ax == 'y':
        dim = shape(img)[0]
        # Reverse along axis 1, the axis that was sorted.  Reversing axis 0
        # flipped the row order and selected the *smallest* column.
        maxes = [argsort(im, 1)[:, ::-1][:, :nchoice] for im in (img, img2)]
    else:
        dim = shape(img)[1]
        maxes = [argsort(im, 0)[::-1][:nchoice, :] for im in (img, img2)]

    ct = mycolors.getct(len(maxes))
    xs, ys, rs, cs = [], [], [], []
    for j, picks in enumerate(maxes):
        for i in range(dim):
            for rank in range(nchoice):
                if choice_ax == 'x':
                    ys.append(picks[rank][i])
                    xs.append(i)
                else:
                    xs.append(picks[i][rank])
                    ys.append(i)
                # First clustering gets the larger marker.
                rs.append(20 + 30 * (1 - j))
                cs.append(ct[j])
    xs, ys, rs, cs = np.array(xs), np.array(ys), np.array(rs), np.array(cs)

    # White discs underneath so the coloured markers stand out.
    ax.scatter(xs, ys, 200, '1', edgecolor='none')
    ax.scatter(xs, ys, rs, cs, alpha=.8, edgecolor='none')
def run(method='identity', index=0, reset=0, nxmax=100,
        binary_x=False, binary_y=False, expression='time',
        cluster_idx=0, lrn='tree', showall=False, tgonly=False,
        randomize_tfs=False, ctfs=5, ctgs=5, cofs=1,
        do_normalize_cluster=True, cluster_tfs=True,
        verbose_expr_labels=False, ctype=False):
    """Train a learner to predict one target of a cluster from TF expression.

    Steps: pick a candidate cluster, collect its target and TF names, load
    and normalize their expression, fold it into a sample matrix, split the
    samples into train/test index lists, fit the learner named by ``lrn``,
    and draw and score the predictions.

    Parameters:
        method: forwarded to ``get_membership``.
        index, reset, nxmax, expression: accepted but not read.
        binary_x, binary_y: forwarded to ``learn_svm`` (``lrn='svm'`` only).
        cluster_idx: position, within the cluster's target list, of the
            target to predict.
        lrn: ``'svm'``, ``'knn'``, ``'tree'``, ``'forest'`` or ``'nn'``.
        showall: accepted, but forced to True inside the function.
        tgonly: train only on the chosen target's own leading samples.
        randomize_tfs: replace the TF list by a uniform random draw (with
            replacement) of the same length.
        ctfs, ctgs: forwarded to ``get_candidates``.
        cofs: which candidate cluster to use.
        do_normalize_cluster: use ``normalize_cluster_expr`` rather than
            ``non_normal_cluster_expr``.
        cluster_tfs: use the cluster-wide TF list from ``get_tf_ss``;
            otherwise use only the chosen target's own ``'tfs'``.
        verbose_expr_labels: pass TF names to ``draw_svm``.
        ctype: include the CL series and test on each block's last four
            samples.

    Returns:
        whatever ``draw_prediction`` returns, or None for ``lrn='nn'``,
        which runs a self-contained synthetic-data demo and returns early.

    Raises:
        ValueError: for an unrecognized ``lrn``.
    """
    # Data assembly:
    #
    # 1: Grab a list of genes of interest and corresponding expression
    #    vectors.
    trg_kidxs = nu.net_trg_keyidxs()
    tf_kidxs = nu.net_tf_keyidxs()

    # --CLUSTERS USED--
    cands = get_candidates(10, ctfs, ctgs)
    cidx = cands[cofs]
    trg_ssnames = get_trg_ss(cluster=cidx)
    if cluster_tfs:
        # Previously computed twice in a row; once is enough.
        tf_ssnames = get_tf_ss(cluster=cidx, trgnames=trg_ssnames)
    else:
        tgs, tfs = nu.parse_net()
        tf_ssnames = tgs[trg_ssnames[cluster_idx]]['tfs']

    if randomize_tfs:
        tf_pool = list(tf_kidxs.keys())
        # randint excludes the upper bound.  The previous random_integers
        # call included it and could therefore index one past the end.
        picks = np.random.randint(0, len(tf_pool), len(tf_ssnames))
        print('Randomizing TFs')
        tf_ssnames = [tf_pool[i] for i in picks]

    # Not used further down; kept because the lookups fail loudly on a
    # name that is missing from the key indices.
    trg_ssidxs = array([trg_kidxs[name] for name in trg_ssnames])
    tf_ssidxs = array([tf_kidxs[name] for name in tf_ssnames])

    # 2: Project expression data onto membership vectors.
    #
    # --EXPR CLUSTERING--
    # 4: Grab a list of 'membership vectors' which translate genes to x and
    #    y in the machine learning problem.  Data merging has not yet been
    #    implemented but should be quite simple.
    # Results are currently unused; calls kept in case get_membership has
    # side effects -- TODO confirm.
    x_memberships = get_membership(tf_ssnames, method=method)
    y_memberships = get_membership(trg_ssnames, method=method)

    if do_normalize_cluster:
        all_expr = normalize_cluster_expr(trg_ssnames, tf_ssnames, ctype=ctype)
    else:
        all_expr = non_normal_cluster_expr(trg_ssnames, tf_ssnames, ctype=ctype)
    tg_expr, tf_expr = all_expr
    x_expr = array(tf_expr).T
    y_expr = array(tg_expr).T
    show_clustered_expr(y_expr, x_expr, trg_ssnames, tf_ssnames, fig=8)

    nx, npertg = shape(x_expr)
    x_all, y_all = fold_expr(x_expr, y_expr)
    nx, nt_folded = shape(x_all)

    # Train/test split.  The index arithmetic treats the folded sample
    # axis as consecutive blocks of `npertg` samples, with block
    # `cluster_idx` belonging to the target being predicted -- presumably
    # one block per target; confirm against fold_expr.
    n_tail = 4  # the last four samples of every block are set apart
    block_start = cluster_idx * npertg
    tginds = range(block_start, block_start + npertg)
    own_block = set(tginds)
    own_head = set(tginds[:-n_tail])

    train_idxs, test_idxs = [], []
    for i in range(nt_folded):
        in_tail = (i % npertg) >= npertg - n_tail
        if ctype:
            if in_tail and i in own_block:
                test_idxs.append(i)
        elif i in own_head:
            test_idxs.append(i)

        if tgonly:
            if i in own_head:
                train_idxs.append(i)
        elif i not in own_block and not in_tail:
            train_idxs.append(i)
    print('N_TRAIN %d' % len(train_idxs))

    expr_fig = 0
    draw_expr(x_expr, y_expr, expr_fig=expr_fig)

    if lrn == 'svm':
        model = learn_svm(x_all, y_all,
                          train_idxs=train_idxs,
                          test_idxs=test_idxs,
                          binary_x=binary_x,
                          binary_y=binary_y)
        predictions = run_svm((x_all.T)[test_idxs].T, y_all[test_idxs], model)
    elif lrn in ('knn', 'tree', 'forest'):
        all_ex = myrf.get_ex(x_all, y_all)
        train_ex = all_ex.getitems([int(x) for x in train_idxs])
        test_ex = all_ex.getitems([int(x) for x in test_idxs])
        model = myrf.OLearn(lrn, train_ex, test_ex=test_ex)
        predictions = model.predictions(test_ex)
    elif lrn == 'nn':
        # Stand-alone demo on synthetic data: nothing assembled above is
        # used, and the function returns as soon as it has been drawn.
        nhc = 2
        ntg = 2
        ntf_s = 2
        max_tfu = 2
        gf = sf.genfann(nhc, ntg, ntf_s, [max_tfu for _ in range(ntg)])
        xs, ys = sf.synth_data(ntg, max_tfu, ntf_s)
        g, ga = gf.sample_genome()
        gf.init_net()
        gf.make_cxns_from_genome(g)
        net = gf.mynn.net

        f = plt.figure(0)
        f.clear()
        ax = f.add_subplot(121)
        myplots.draw_pb(ax, net)
        myplots.hideaxes(ax)
        myplots.maketitle(ax, 'GANN')

        gf.set_data(xs.T, ys.T)
        gf.set_trainer()
        gf.train()

        ax2 = f.add_subplot(122)
        myplots.draw_pb(ax2, net)
        myplots.hideaxes(ax2)
        myplots.maketitle(ax2, 'GANN')
        return None
    else:
        # Previously fell through to a NameError on `predictions`.
        raise ValueError('unknown learner: %r' % (lrn,))

    actual = y_all[test_idxs]

    # NOTE(review): this overrides the `showall` argument, so the drawing
    # below always runs.  Left as is to keep current behaviour.
    showall = True
    if showall:
        names = tf_ssnames if verbose_expr_labels else None
        draw_svm(x_all[:, test_idxs], actual, predictions,
                 f=expr_fig, names=names)
    print(predictions)
    print(actual)

    forstring = 'CL Data' if ctype else 'TS Data'
    namestr = trg_ssnames[cluster_idx]
    # The cluster/unclustered titles lacked the space before `forstring`
    # that the random-TF title has, gluing "Predictions" onto the label.
    if randomize_tfs:
        title = 'Random TF Predictions ' + forstring + ', ' + namestr
        fnum = 5
    else:
        if cluster_tfs:
            title = 'Network Cluster TF Predictions ' + forstring + ', ' + namestr
        else:
            title = 'Network UnClustered TF Predictions ' + forstring + ', ' + namestr
        fnum = 6

    # NOTE(review): a 'TFs: '-prefixed subtitle used to be built here but
    # was never passed on; the plain joined names are what is sent.
    msecov = draw_prediction(predictions, actual, fig=fnum,
                             title=title, subt=','.join(tf_ssnames))
    print(msecov)
    return msecov