Пример #1
0
def show_clustered_expr(tge,tfe,tgnames, tfnames, nrml = True,fig = 8):
    """Plot target and TF expression series on two separate figures.

    Figure ``fig`` gets one line per entry of ``tge`` and figure ``fig + 1``
    one line per entry of ``tfe``.  Line colours come from colour tables of
    ``len(tgnames)`` and ``len(tfnames)`` entries respectively, and a colour
    legend keyed by the corresponding names is attached to each axes.

    When ``nrml`` is true, '(Normalized)' is appended to both titles.
    """
    target_fig = plt.figure(fig)
    tf_fig = plt.figure(fig + 1)
    for figure in (target_fig, tf_fig):
        figure.clear()

    target_ax = target_fig.add_subplot(111)
    tf_ax = tf_fig.add_subplot(111)

    target_colors = colors.getct(len(tgnames))
    tf_colors = colors.getct(len(tfnames))

    # Title suffix shared by both panels; empty when data is not normalized.
    suffix = '(Normalized)' if nrml else ''

    for row, series in enumerate(tge):
        target_ax.plot(series, color = target_colors[row])
    myplots.color_legend(target_fig, target_colors, tgnames,
                         ax = target_ax, pos = 4)
    myplots.maketitle(target_ax, 'Target Expression Levels' + suffix)

    for row, series in enumerate(tfe):
        tf_ax.plot(series, color = tf_colors[row])
    myplots.color_legend(tf_fig, tf_colors, tfnames,
                         ax = tf_ax, pos = 4)
    myplots.maketitle(tf_ax, 'TF Expression Levels' + suffix)
Пример #2
0
def run(domain_name='X', projection_name='Y8'):
    """Compare per-domain cluster ids with tissue categories from prob2.mat.

    Loads ``prob2.mat``, scatter-plots the first two columns of the
    clustering domain and of the projection domain (both coloured by the
    cluster id of ``domain_name``), saves the figure to
    ``figs/cluster_projectsions.tiff``, prints pair-count statistics and
    returns ``hypergeom(len(tcpairs), len(cpairs), max_pairs)``.

    Args:
        domain_name: one of 'X', 'XV', 'Y', 'Y8', 'Y12'; selects both the
            cluster ids ('ids_<name>') and the first scatter panel.
        projection_name: domain plotted in the second panel.

    Raises:
        ValueError: if either name is not in the list of known domains.
    """
    prob2 = sio.loadmat('prob2.mat')

    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']

    domains = [prob2.get(d) for d in domain_names]
    # This lookup had been commented out, which left `domain_clusters`
    # undefined and made the function fail with NameError.
    domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')

    cluster_idx = domain_names.index(domain_name)
    clusters = domain_clusters[cluster_idx]
    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[cluster_idx]

    f = plt.figure(1)
    f.clear()
    random.seed(1)
    ct = array(mc.getct(218))

    # NOTE(review): the original read shape(dom)[1] with `dom` undefined.
    # `cdom` is assumed to be the intended array; confirm that axis 1 is the
    # right length for the row indices used below.
    inds = arange(shape(cdom)[1])

    # Ids in the .mat file are shifted down by one before being used as indices.
    c_inds = array(clusters).flatten() - 1
    tc_inds = tissue_clusters.flatten() - 1

    colors = ct[c_inds, :]

    ax = f.add_subplot(211, title=
                       'Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=colors)
    ax = f.add_subplot(212, title=
                       'Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=colors)

    def same_label_pairs(labels):
        # All index pairs (i < j) whose labels are equal.
        count = len(labels)
        return set((i, j)
                   for i in range(count)
                   for j in range(i + 1, count)
                   if labels[i] == labels[j])

    cpairs = same_label_pairs(c_inds)
    tcpairs = same_label_pairs(tc_inds)
    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    # n * (n - 1) is always even, so floor division is exact here.
    max_pairs = (len(tc_inds) * (len(tc_inds) - 1)) // 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    print('using affinity propagation with affinites over domain {0}'.format(
        domain_name))
    print('found')
    print(' max pairs: {0}'.format(max_pairs))
    print(' total pairs: {0}'.format(total_pairs))
    print(' tissue pairs: {0}'.format(len(tcpairs)))
    print(' cluster pairs: {0}'.format(len(cpairs)))
    print(' shared pairs: {0}'.format(shared_pairs))

    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
Пример #3
0
def plot_clusters(inds,
                  embeddings,
                  plot3d = False,
                  title = '',
                  ax_in = None,
                  save = False,
                  colors = None):
    """Scatter-plot each embedding with points coloured by cluster exemplar.

    Args:
        inds: per-point exemplar index.  ``inds[p]`` is used both as the
            cluster label of point ``p`` and as a row index into each
            embedding (the coordinates of that point's exemplar).
        embeddings: dict mapping a name to a 2-D array with one row per
            point; columns 0-1 (0-2 when ``plot3d``) are plotted and the
            first three columns are used for marker sizing.
        plot3d: draw 3-D scatter plots instead of 2-D ones.
        title: text inserted into every axes title and the saved file name.
        ax_in: axes to draw on.  When None the current figure is cleared
            and one subplot per embedding is stacked vertically.
        save: when true, save the figure as PostScript under
            ``cfg.dataPath('cs874/figs/subopt_embeddings/...')``.
        colors: optional mapping from exemplar to colour; by default a
            table from ``mycolors.getct`` is used.
    """
    exemplars = list(set(inds))
    if colors is None:
        cluster_colors = dict([(exemplars[i], col)
                               for i, col in enumerate(mycolors.getct(len(exemplars)))])
    else:
        cluster_colors = colors

    cols = [cluster_colors[e] for e in inds]

    if ax_in is None:
        # The original guard tested an undefined name inside a swallow-all
        # try, so the figure was never cleared.  Clearing stays best-effort
        # because a failure here should not prevent plotting.
        try:
            plt.clf()
        except Exception:
            pass
        f = plt.gcf()
    else:
        # Needed so that save=True also works with a caller-supplied axes.
        f = ax_in.get_figure()

    n_panels = len(embeddings)
    for i, k in enumerate(embeddings.keys()):
        embedding = embeddings[k]
        emb_sig = embedding[:,0:3]

        # Variance of each cluster's points, looked up per point; zero
        # variances are replaced by 1 to avoid dividing by zero below.
        var_by_exemplar = dict([(j, var(emb_sig[nonzero(equal(inds, j))[0]]))
                                for j in exemplars])
        indexed_vars = array([var_by_exemplar[j] for j in inds])
        indexed_vars[equal(indexed_vars,0)] = 1

        # Marker size decays with squared distance to the point's exemplar,
        # scaled by the variance of its cluster.
        sq_dist = np.sum((emb_sig - emb_sig[inds,:])**2,1)
        sizes = 10 * exp(-1 * (sq_dist / indexed_vars))

        if ax_in is None:
            # Integer (rows, cols, index) form also works beyond 9 panels.
            subplot_kw = {'projection': '3d'} if plot3d else {}
            ax = f.add_subplot(n_panels, 1, i + 1, **subplot_kw)
        else:
            ax = ax_in

        if plot3d:
            ax.scatter(array(embedding[:,0],float),
                       array(embedding[:,1],float),
                       array(embedding[:,2],float),
                       s = sizes,
                       color = cols)
            ax.set_xticks([])
            ax.set_yticks([])
            # Hide the tick labels on all three 3-D axes.
            for tl in list(it.chain(ax.w_xaxis.get_ticklabels(),
                                    ax.w_yaxis.get_ticklabels(),
                                    ax.w_zaxis.get_ticklabels())):
                tl.set_visible(False)
                tl.set_rotation(30)
        else:
            ax.scatter(array(embedding[:,0],float),
                       array(embedding[:,1],float),
                       s = sizes,
                       color = cols)
        ax.set_title('{0} for subopts in {1}'.format(k, title))

    if save:
        f.savefig(cfg.dataPath('cs874/figs/subopt_embeddings/{0}.ps').format(title))
Пример #4
0
def get_reinitz_data(**kwargs):
    """Return per-key coordinates and values for one block of nuclei.

    Keyword arguments read here:
        ofs: which block of ``nuc_count`` consecutive rows to take (default 0).
        plot_coords: when true, scatter the two coordinate columns of every
            key and save the figure.
        plot_vals: when true, scatter the last column against the first
            coordinate column of every key and save the figure.

    All keyword arguments are also forwarded to ``datafiles`` via ``mem.rc``.

    Returns:
        (coords, values): dicts keyed like the loaded data; ``coords[k]`` is
        columns 1:3 and ``values[k]`` the last column of the selected rows.
    """
    ofs = kwargs.get('ofs',0)
    do_plot_coords = kwargs.get('plot_coords',False)
    do_plot_vals = kwargs.get('plot_vals',False)

    id_map()  # return value is not used below
    df = datafiles(**mem.rc(kwargs))

    # Column 0 holds the number attached to each nucleus.  The count of
    # distinct numbers in one (arbitrarily chosen) table is taken as the
    # number of nuclei per block.
    first_column = {}
    for name, table in df.iteritems():
        first_column[name] = table[:,0]
    nuc_count = len(set(first_column.values()[2]))

    row_lo = nuc_count * ofs
    row_hi = nuc_count * (ofs + 1)
    values = {}
    coords = {}
    for name, table in df.iteritems():
        values[name] = table[row_lo:row_hi, -1]
        coords[name] = table[row_lo:row_hi, 1:3]

    def scatter_all(y_of, file_stem):
        # One scatter series per key on a fresh 8x8 figure, then save.
        figure = myplots.fignum(1,(8,8))
        axes = figure.add_subplot(111)
        palette = mycolors.getct(len(values))
        for idx, name in enumerate(values.keys()):
            axes.scatter(coords[name][:,0], y_of(name), 10,
                         edgecolor = 'none', alpha = .25, c = palette[idx],
                         label = name)
        figure.savefig(myplots.figpath(file_stem.format(ofs)))

    # The plots are consistency checks on the loaded data; only the block
    # selected by `ofs` is shown.
    if do_plot_coords:
        scatter_all(lambda name: coords[name][:,1],
                    'reinitz_exprdata_coords_nuc_offset={0}')
    if do_plot_vals:
        scatter_all(lambda name: values[name],
                    'reinitz_exprdata_ap_vals_nuc_offset={0}')

    return coords, values
Пример #5
0
def run( domain_name = 'X', projection_name = 'Y8'  ):
  """Compare per-domain cluster ids with tissue categories from prob2.mat.

  Loads ``prob2.mat``, scatter-plots the first two columns of the
  clustering domain and of the projection domain (both coloured by the
  cluster id of ``domain_name``), saves the figure to
  ``figs/cluster_projectsions.tiff``, prints pair-count statistics and
  returns ``hypergeom(len(tcpairs), len(cpairs), max_pairs)``.

  Args:
    domain_name: one of 'X', 'XV', 'Y', 'Y8', 'Y12'; selects both the
      cluster ids ('ids_<name>') and the first scatter panel.
    projection_name: domain plotted in the second panel.

  Raises:
    ValueError: if either name is not in the list of known domains.
  """
  prob2 = sio.loadmat('prob2.mat')

  domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']

  domains = [prob2.get(d) for d in domain_names]
  # This lookup had been commented out, which left `domain_clusters`
  # undefined and made the function fail with NameError.
  domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
  tissue_clusters = prob2.get('tissue_category')

  cluster_idx = domain_names.index(domain_name)
  clusters = domain_clusters[cluster_idx]
  pdom = domains[domain_names.index(projection_name)]
  cdom = domains[cluster_idx]

  f = plt.figure(1)
  f.clear()
  random.seed(1)
  ct = array(mc.getct(218))

  # NOTE(review): the original read shape(dom)[1] with `dom` undefined.
  # `cdom` is assumed to be the intended array; confirm that axis 1 is the
  # right length for the row indices used below.
  inds = arange(shape(cdom)[1])

  # Ids in the .mat file are shifted down by one before being used as indices.
  c_inds = array(clusters).flatten() - 1
  tc_inds = tissue_clusters.flatten() - 1

  colors = ct[c_inds,:]

  ax = f.add_subplot(211, title =
                     'Clusters from genespace affinity. Projection to first two elements')
  ax.scatter(*cdom[inds,0:2].T, s = 100, c = colors)
  ax = f.add_subplot(212, title =
                     'Clusters from genespace affinity. Projection to MVE')
  ax.scatter(*pdom[inds,0:2].T, s = 100, c = colors)

  def same_label_pairs(labels):
    # All index pairs (i < j) whose labels are equal.
    count = len(labels)
    return set((i, j)
               for i in range(count)
               for j in range(i + 1, count)
               if labels[i] == labels[j])

  cpairs = same_label_pairs(c_inds)
  tcpairs = same_label_pairs(tc_inds)
  f.savefig('figs/cluster_projectsions.tiff',format = 'tiff')

  # n * (n - 1) is always even, so floor division is exact here.
  max_pairs = (len(tc_inds) * (len(tc_inds) - 1)) // 2
  total_pairs = len(cpairs.union(tcpairs))
  shared_pairs = len(cpairs.intersection(tcpairs))

  print('using affinity propagation with affinites over domain {0}'.format(domain_name))
  print('found')
  print(' max pairs: {0}'.format(max_pairs))
  print(' total pairs: {0}'.format(total_pairs))
  print(' tissue pairs: {0}'.format(len(tcpairs)))
  print(' cluster pairs: {0}'.format(len(cpairs)))
  print(' shared pairs: {0}'.format(shared_pairs))

  hg = hypergeom( len(tcpairs), len(cpairs), max_pairs )
  return hg
Пример #6
0
def family_exemplar_structs(rfid,
                            refseq_method = None,
                            sp_method = None,
                            aff_type = None):
    """Work-in-progress routine for drawing exemplar structures of a family.

    NOTE(review): as written this function cannot run to completion.  It
    reads `spairs`, `ungapped_ref` and `sp` before any assignment inside the
    function (NameError unless they exist as module globals), and two
    unconditional `raise Exception()` statements stop execution before the
    `return`.  `exemplar_structs` in the return statement is only assigned
    in a commented-out line.

    Args:
        rfid: passed to `rutils.family_suboptimals`.
        refseq_method: not used in the body.
        sp_method: not used in the body.
        aff_type: forwarded to `rutils.affinity_matrix` (in code that sits
            after the first unconditional raise).
    """

    # Result is not used anywhere below.
    suboptimals = rutils.family_suboptimals(rfid)
    # `spairs` / `ungapped_ref` are not defined in this function.
    c2 = rutils.cluster_2(spairs, ungapped_ref)

    # `sp` is not defined at this point either (it is only bound in the
    # loop further down).
    arr = rutils.rna_draw(ungapped_ref.seq, 
                          rutils.pairs_stk(sp,len(ungapped_ref)),
                          'name' )

    # Unconditional stop: everything below is unreachable as written.
    raise Exception()
    affinities, ss = rutils.affinity_matrix(spairs, aff_type = aff_type)
    aff_shape, ss_shape = rutils.affinity_matrix(spairs, aff_type = 'easy', ss_multiplier = .5)
    

    # PCA projections of both affinity matrices; clusters are computed from
    # the 'easy' affinities.
    pca_vecs = mlab.PCA(affinities).project(affinities)  
    pca_vecs_shape = mlab.PCA(aff_shape).project(aff_shape)  
    inds = compute_clusters(aff_shape, ss_shape)
    exemplars = list(set(inds))
    
    import compbio.utils.colors as mycolors
    ct = mycolors.getct(len(exemplars))
    import matplotlib.pyplot as plt
    f = plt.gcf()
    plt.clf()
    
    # One stacked subplot per projection.
    for idx0, embeddings in enumerate([pca_vecs, pca_vecs_shape]):
            ax = f.add_subplot('21{0}'.format(idx0 +1))

            # Axis limits: min/max of the first two columns, widened by
            # half of each range on both sides.
            lims =[ [min(embeddings[:,0]),max(embeddings[:,0])],
                         [min(embeddings[:,1]),max(embeddings[:,1])] ]
            lims += [-.5,.5] *squeeze(diff(lims,1))[:,newaxis]
            

            ax.set_xlim(lims[0])
            ax.set_ylim(lims[1])
    
            print sum(embeddings)
            for idx, embedding in enumerate(embeddings):
              # mod(idx, 1) is always 0, so no row is skipped here.
              if mod(idx,1) != 0: continue
              sp = spairs[idx]
              arr = rutils.rna_draw(ungapped_ref.seq, 
                              rutils.pairs_stk(sp,len(ungapped_ref)),
                              'name' )
              # Computed but not used below.
              struct_emb = arr + embedding[0:2]
              #plt.plot(*struct_emb.T)
              
              # Rows whose index appears in `inds` get a thick opaque line;
              # all other rows are drawn thin and faint.
              pkw = {'color':ct[exemplars.index(inds[idx])],
                     'lw':8 if idx in inds else 1,
                     'alpha': 1 if idx in inds else .2}
              
              lc = rplots.show_rna(embedding, arr, pkw = pkw)
    #exemplar_structs = [spairs[e] for e in set(inds)]  
    # Second unconditional stop; the return below is never reached.
    raise Exception()

    return pca_vecs, exemplar_structs
Пример #7
0
def cluster_2_show(clusters, polys): 
    """Draw ``polys`` on a grid, ordered and coloured by cluster label.

    ``clusters`` and ``polys`` are indexed with the argsort of ``clusters``,
    so both must support array-style fancy indexing.
    """
    order = argsort(clusters)

    # One colour per distinct label, assigned in set-iteration order.
    labels = list(set(clusters))
    palette = mycolors.getct(len(labels))
    color_of = {}
    for position, label in enumerate(labels):
        color_of[label] = palette[position]

    myplots.fignum(8,(10,10))

    ordered_colors = [color_of[label] for label in clusters[order]]
    rplots.grid_rnas(polys[order],
                     colors = ordered_colors,
                     size = (5,5), dims = [180,50])
Пример #8
0
def p_m_correlation():
  """Plot protein vs. mRNA levels for every gene present in both datasets.

  One panel per matched gene is laid out on a square grid.  Each panel
  scatters the protein values (x) against the mRNA values (y), one colour
  per column of the 'vals' array, and is annotated with the gene symbol and
  the correlation coefficient of the flattened arrays.  The figure is saved
  as a TIFF under ``cfg.dataPath('figs/network/mrna_protein_levels.tiff')``.
  """
  prots = nio.getBDTNP(protein = True)
  mrnas = nio.getBDTNP()

  matched = set(mrnas.keys()).intersection(set(prots.keys()))
  pairs = [(prots[k] , mrnas[k], k) for k in matched]

  f = plt.figure(0)
  f.clear()
  f.suptitle('mRNA and Protein Levels from BDTNP at six times in ~6000 cells', fontsize = 22)

  # Integer grid dimensions.  They are passed to add_subplot as separate
  # arguments because a concatenated 'RCI' string stops being a valid
  # three-digit spec once there are more than nine panels.
  nx = ny = int(ceil(sqrt(len(pairs))))

  shp = shape(next(iter(mrnas.values()))['vals'])
  colors = mycolors.getct(shp[1])
  shr = None
  for i, p in enumerate(pairs):

    ax = f.add_subplot(nx, ny, i + 1,
                       sharex = shr, sharey = shr)
    # Every later panel shares both axes with the first one.
    if shr is None: shr = ax
    fbid = p[-1]
    ax.grid(True, alpha = .2)
    ax.annotate(nu.gene_symbol(fbid),xy = [.02,.98],
                xycoords = 'axes fraction', size = 25, va = 'top')
    mu = corrcoef(p[0]['vals'].flatten(), p[1]['vals'].flatten())
    ax.annotate(r'$\mu = {0:.2g}$'.format(mu[0,1]),xy = [.98,.98],
                xycoords = 'axes fraction', size = 25,ha = 'right', va = 'top')

    # Only the first column keeps y tick labels and a y label.
    if i % nx > 0:
      plt.setp( ax.get_yticklabels(), visible=False)
    else:
      ax.set_ylabel('mrna expression level')
    # Only the last grid row keeps x tick labels and an x label.
    if i // nx < (ny - 1):
      plt.setp( ax.get_xticklabels(), visible=False)
    else:
      ax.set_xlabel('protein expression level')

    for j in range(shp[1]):
      ax.scatter(p[0]['vals'][:,j], p[1]['vals'][:,j],
                 s = 20, alpha = .2, color = colors[j])

  f.savefig(cfg.dataPath('figs/network/mrna_protein_levels.tiff',
                            ),format = 'tiff')
Пример #9
0
def expr_getonoff(expr_in):
    """Cluster expression values into three k-means groups and plot them.

    The values are padded with a zero second coordinate, clustered with
    ``mlpy.Kmeans(3)``, and scattered on figure 1 coloured by cluster.  A
    vertical bar of mean +/- one standard deviation is drawn at x = 0 for
    each cluster.  Nothing is returned.
    """
    expr = array(expr_in)
    k = 3
    km = mlpy.Kmeans(k)
    n = len(expr)

    # Kmeans is fed 2-D points, so each value becomes (value, 0).
    expr_2d = array([array([value, 0]) for value in expr])
    comp = km.compute(expr_2d)

    # Cluster labels ordered by the first coordinate of their k-means centre.
    compsort = arange(k)[argsort([center[0] for center in km.means])]

    xax = argsort(expr)

    # Per-cluster mean and spread of the original values.
    means = zeros(k)
    stds = zeros(k)
    for label in range(k):
        members = expr[nonzero(equal(comp, label))[0]]
        means[label] = mean(members)
        stds[label] = std(members)

    f = plt.figure(1)
    f.clear()
    ax = f.add_axes([0,0,1,1])

    ct = mycolors.getct(k)
    point_colors = [ct[compsort[comp[i]]] for i in range(n)]
    point_sizes = [100] * n
    ax.scatter(xax, expr, point_sizes, color = point_colors)

    bar_x = 0
    for label in range(k):
        low = means[label] - stds[label]
        high = means[label] + stds[label]
        ax.plot([bar_x, bar_x], [low, high],
                linewidth = 5, color = ct[compsort[label]])
Пример #10
0
def show_binary(idx = 0):
    """Scatter TF series against a target series, coloured by binary state.

    Walks the keys of the binary time-series table starting at position
    ``idx`` and stops after the first key that has regulators in the
    network.  For each regulator with a non-empty series, the target series
    (x) is plotted against the regulator series (y); points are red where
    the regulator's binary value is 0 and green where it is 1, and the
    second element of the regulator entry is used as alpha.
    """
    tsb = nu.expr_TS_binary(reset = 0)
    tsvals = nu.load_TS()

    tgs = nu2.get_net()[1]

    f = plt.figure(0)
    f.clear()
    ax = f.add_subplot(111)

    red, green = [1,0,0], [0,1,0]

    for target in tsb.keys()[idx:]:
        regulators = tgs.get(target,[])
        # Result unused; the call is kept in the same position as before.
        mycolors.getct(len(regulators))
        target_series = tsvals[target]

        if not regulators:
            continue

        for entry in regulators:
            tf_series = tsvals.get(entry[0])
            if not tf_series:
                continue
            binary = tsb.get(entry[0])

            is_off = equal(binary,0)[:,newaxis]
            is_on = equal(binary,1)[:,newaxis]
            cmap = is_off*red + is_on*green

            print(entry[1])
            ax.scatter(target_series, tf_series, 500,
                       color = cmap,
                       alpha = entry[1],
                       edgecolor = '0')
        # Only the first key with regulators is drawn.
        break
    return
Пример #11
0
def tree_similarity(dist1, dist2, run_id,criterion = 'knn', k = 6):
    """Compare two distance matrices through their k nearest neighbours.

    Only ``criterion == 'knn'`` does anything; other values return None
    immediately.  For each row the k nearest neighbours (excluding the
    first argsort entry) are taken from both matrices; the union of the two
    neighbour sets gives pairs of (dist1, dist2) distances, of which the k
    with the smallest minimum are kept.  The result is drawn on figure 4
    (a seismic plot of absolute differences plus a scatter of the distance
    pairs annotated with R-squared and the mean Jaccard index) and saved to
    a path built from ``run_id`` and ``k``.
    """
    if criterion != 'knn':
        return

    nq = len(dist1)
    nb1 = argsort(dist1, 1)[:,1:k+1]
    nb2 = argsort(dist2, 1)[:,1:k+1]

    all_nbs = []
    nb_intersection = []
    for n1, n2 in zip(nb1, nb2):
        first, second = set(n1), set(n2)
        all_nbs.append(first.union(second))
        nb_intersection.append(first.intersection(second))

    # Per query: distance pairs over the neighbour union, keeping the k
    # pairs whose smaller distance is lowest.
    kept = []
    for i, nbs in enumerate(all_nbs):
        pair_rows = array([[dist1[i, n], dist2[i, n]] for n in nbs])
        kept.append(sorted(pair_rows, key = lambda x: min(x))[:k])
    nb_dists = array(kept)

    abs_diffs = [abs(diff(elt, 1).flatten()) for elt in nb_dists]

    ct = mycolors.getct(nq)
    f = myplots.fignum(4, (10,8))
    ax = f.add_axes([.05,.08,.25,.87])
    seismic.seismic(abs_diffs, ax = ax, colors = ct)

    ratios = [float(len(nb_intersection[i])) / float(len(all_nbs[i]))
              for i in range(nq)]
    jaccard = mean(ratios)

    ax2 = f.add_axes([.34,.08,.6,.87])
    for i, d in enumerate(nb_dists):
        ax2.scatter(d[:,0], d[:,1], 20, alpha = .5, color = ct[i])

    lin = linregress(nb_dists[:,:,0].flatten(), nb_dists[:,:,1].flatten())
    rsquared = lin[2]**2

    ax2.annotate('NN dists for multi/struct-aligned trees.\nK = {0}'.format(k),
                 [0,1], xycoords = 'axes fraction', va = 'top',
                 xytext = [10,-10], textcoords = 'offset pixels')
    ax2.annotate('R-Squared: {0:3.3}\nJaccard Index: {1:3.3}'.format(rsquared, mean(jaccard)),
                 [1,0], xycoords = 'axes fraction', ha = 'right',
                 xytext = [-10,10], textcoords = 'offset pixels')
    ax2.set_xlabel('Muscle aligned tree distances')
    ax2.set_ylabel('Struct algined tree distances')

    datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_dists_{0}_k{1}.tiff'.format(run_id, k))
    f.savefig(datafile)
Пример #12
0
    def draw2d(self):
        """Scatter the first two coordinates of the data and of the means.

        Data points are drawn with size 25 in the colour of their label;
        the means are drawn with size 100 in black.  The figure number is
        taken from ``self.fig`` and the figure is cleared first.
        """
        f = plt.figure(self.fig)
        f.clear()
        ax = f.add_axes([.05, .05, .9, .9])
        x_col, y_col = 0, 1

        palette = mycolors.getct(self.nm)

        points = [self.data[i] for i in range(self.nd)]
        point_colors = [palette[self.labels[i]] for i in range(self.nd)]
        centers = [self.means[j] for j in range(self.nm)]

        # Data points first, then the cluster means, in one scatter call.
        xs = [p[x_col] for p in points] + [c[x_col] for c in centers]
        ys = [p[y_col] for p in points] + [c[y_col] for c in centers]
        rs = [25] * self.nd + [100] * self.nm
        cs = point_colors + [[0, 0, 0] for _ in range(self.nm)]

        ax.scatter(xs, ys, rs, cs)
Пример #13
0
def show_multi(timepoint = -1):
  """Show the ten most variable mRNAs at one timepoint in a shared 3-D axes.

  Entries are ranked by the standard deviation of column ``timepoint`` of
  their 'vals' array.  Each of the top ten is drawn with ``show_3d`` at the
  column whose sum is largest, with a random offset and its own colour.
  """
  mrnas = nio.getBDTNP()
  nio.getBDTNP(misc = True)  # return value is not used below

  records = mrnas.values()
  names = mrnas.keys()

  # Spread of every entry at the chosen timepoint only.
  spreads = [std(rec['vals'][:,timepoint]) for rec in records]

  f = plt.figure(0)
  try:
    f.clear()
  except Exception:
    print('hi')
  ax = f.add_subplot(111, projection = '3d')
  ranked = argsort(spreads)[::-1]

  n = 10
  palette = mycolors.getct(n)
  for rank in range(n):
    pick = ranked[rank]
    step = argmax(np.sum(records[pick]['vals'],0))
    show_3d(names[pick],
            step = step, skip = 20, ax = ax, ofs = 10*random.rand(3),
            color = palette[rank])
Пример #14
0
def draw_xy(xset, yset):
    """Scatter every feature of ``xset`` (scaled by its maximum) against ``yset``.

    ``xset`` is indexed as ``xset[feature][time]`` and ``yset`` as
    ``yset[time]``.  Each feature gets its own colour; all points have
    size 20.  The plot is drawn on figure 1, which is cleared and shown.
    """
    n_features = shape(xset)[0]
    n_times = shape(xset)[1]
    palette = colors.getct(n_features)

    f2 = plt.figure(1)
    f2.clear()
    ax2 = f2.add_axes([0,0,1,1])

    xs, ys, cs = [], [], []
    for row in range(n_features):
        feature = xset[row]
        peak = max(feature)
        xs.extend(feature[t]/peak for t in range(n_times))
        ys.extend(yset[t] for t in range(n_times))
        cs.extend([palette[row]] * n_times)
    rs = [20] * len(xs)

    ax2.scatter(xs,ys,rs,cs)

    f2.show()
Пример #15
0
    def makePlots(self, name="No Name"):
        """Plot actual vs. predicted outputs for the test and training sets.

        Test predictions are drawn on figure 1 and training predictions on
        figure 2, one ``lplots.plotPredictions`` call per series, each in
        its own colour.  Both figures are cleared first.

        Args:
            name: title passed to ``myplots.maketitle`` for both axes.
        """
        _, ytrain = self.xyTrain()
        _, ytest = self.xyTest()
        ytrain_predicted = self.predictTraining()
        ytest_predicted = self.predictTest()

        f = plt.figure(1)
        f.clear()
        ax0 = f.add_subplot(211)

        f1 = plt.figure(2)
        f1.clear()
        ax1 = f1.add_subplot(211)

        # The table was sized by the training set only, so a larger test
        # set could index past its end.  Sizing by the larger set leaves
        # the table unchanged whenever the old code worked.
        # NOTE(review): assumes getct(n) returns n colours - confirm.
        ct = mycolors.getct(max(len(ytrain), len(ytest)))

        panels = [
            (ytest, ytest_predicted, ax0, "test predictions"),
            (ytrain, ytrain_predicted, ax1, "training predictions"),
        ]
        for actual, predicted, ax, subtitle in panels:
            for i in range(len(actual)):
                lplots.plotPredictions(actual[i], predicted[i], ax, color=ct[i])
            # Title once per axes rather than once per plotted series.
            myplots.maketitle(ax, name, subtitle=subtitle)
Пример #16
0
def view3():
    """Debug view of the model terms stored in the first usable mcmc file.

    Lists the files containing "mcmc" under ``cfg.dataPath("batch/tmp")``,
    keeps those whose input record has ``out_iter_num`` greater than 2,
    and for the first of them scatters the model's factors with the x
    positions reordered by hierarchical clustering.  The function always
    ends by raising ``Exception``.
    """
    files = [l for l in os.listdir(cfg.dataPath("batch/tmp")) if "mcmc" in l]
    fpaths = [os.path.join(cfg.dataPath("batch/tmp"), name) for name in files]
    ids = [name[0:10] for name in files]

    inps = [butils.load_data(i, "input") for i in ids]
    iter_nums = [elt.get("out_iter_num") for elt in inps]
    idxs_good = nonzero(greater(iter_nums, 2))[0]
    inps = [inps[i] for i in idxs_good]
    fpaths = [fpaths[i] for i in idxs_good]

    fig = myplots.fignum(3, (35, 15))
    fig.add_axes([0, 0, 1, 1])

    for path, inp in zip(fpaths, inps):
        if inp["out_iter_num"] == 2:
            continue
        print(inp["filename"])

        data = sio.loadmat(path)

        import compbio.utils.colors as mycolors

        ct = mycolors.getct(len(data["gene_names"]))

        term_list = [list(it.chain(*mod)) for mod in data["model"]]
        fac_list = [list(it.chain(*t)) for t in term_list]

        # Every factor of a term contributes len(term) points: same x and
        # y, the term's sorted colours, and radii shrinking from largest
        # to smallest.
        xvals, yvals, colors, rads = [], [], [], []
        for i, terms in enumerate(term_list):
            for term in terms:
                for fact in term:
                    width = len(term)
                    xvals.extend([i] * width)
                    yvals.extend([fact] * width)
                    colors.extend([ct[c] for c in sorted(term)])
                    rads.extend(((arange(1, width + 1) ** 2) * 50)[::-1])

        # Binary membership matrix: row i marks the factors used by model i.
        vecs = zeros((len(fac_list), len(fac_list)))
        for i, factors in enumerate(fac_list):
            for fac in factors:
                vecs[i, fac] = 1

        import hcluster

        clusters = hcluster.fclusterdata(vecs, 1.1, criterion="inconsistent", method="complete")

        # Rank of every model in cluster order, used as its new x position.
        csrt = argsort(argsort(clusters))
        xvals2 = [csrt[x] for x in xvals]

        plt.scatter(xvals2, yvals, rads, color=colors)
        raise Exception()

    raise Exception()
Пример #17
0
def viewmany(all_means, all_clusters, fig = 12):
    """Project cluster means onto TFs and plot the most representative ones.

    For every clustering in ``all_means`` the cluster means are projected
    onto the TFs of the network.  The ``ntf_each`` highest-weighted TFs of
    each clustering are pooled into a set of "TFs of interest", and one
    curve per clustering is drawn over that set on the lower axes of
    figure ``fig``.

    Args:
        all_means: sequence of 2-D arrays, one per clustering, each of
            shape (k, dim).  ``k`` is assumed equal across clusterings.
            ``dim > 8000`` is treated as gene space and mapped to TFs with
            the transposed square affinity matrix; ``500 < dim <= 8000`` is
            treated as TF space and read off directly.
        all_clusters: not used in the body.
        fig: matplotlib figure number.

    Returns:
        Array of shape (n, k, ntf) holding the absolute TF projection of
        every mean of every clustering.

    Raises:
        ValueError: if a clustering has ``dim <= 500``, which is not handled.
    """
    n = len(all_means)
    f = plt.figure(fig)
    f.clear()
    print('''Running viewmany.py

For now, viewmany assumes that k is equal across clustering instances
this is not really important but has to do with how TF projections are
stored.
''')
    k = len(all_means[0])

    ax1 = f.add_axes([.05,.05,.95,.4])
    # Second axes is created but nothing is drawn on it here.
    ax2 = f.add_axes([.05,.55,.95,.4])

    sqa = nu.net_square_affinity()[0]

    # tf_sqidxs has one entry per TF, giving the coordinate of that TF in
    # the square-affinity space.
    sqidxs = nu.net_sq_keyidxs()
    n_tfidxs = nu.net_tf_keyidxs()
    trgs,tfs = nu.parse_net()
    tf_sqidxs = [sqidxs[key] for key in tfs.keys()]
    tfidxs = list(n_tfidxs.values())
    ntf = len(tfidxs)

    tfweights = zeros(ntf,int)
    # Number of top TFs taken from each clustering.
    ntf_each = 20

    print('''...Computing representative TFs for each clustering.

In the current formulation, we project each mean on to associated tf
and then normalize each projection so that each mean has equal weight
in TF selection.

Not that we have handled the case where we have clusted in TF space
explicitly (e.g, dim = 541) and where we are in gene space explicitly,
(e.g., dim = 8321, GG matrix or svdU). svdV is emphatically not handled.
Neither would svdU of TF-TF which is actually the the exact same thing.''')

    tf_rows = None  # rows of sqa.T for the TFs, built on first use

    TFprojs= zeros((n,k,ntf))
    for i in range(n):
        m = all_means[i]
        dim = shape(m)[1]

        # The gene-space test must come first: it used to sit in an `elif`
        # after `dim > 500` and could therefore never be reached.
        if dim > 8000:
            # Rows of the transposed matrix correspond to the target space.
            if tf_rows is None:
                tf_rows = sqa.T[tf_sqidxs,:]
            projection = np.abs(np.sum(tf_rows[newaxis,:,:]*m[:,newaxis,:],2))
        elif dim > 500:
            # Already in TF space: read the TF columns directly.
            projection = np.abs(m[:,tfidxs])
        else:
            raise ValueError(
                'viewmany cannot project means of dimension {0}'.format(dim))

        TFprojs[i,:,:] = projection
        # Normalize so every mean has the same weight, then sum over means
        # to score each TF for this clustering.
        this_tf_sum = np.sum(projection / np.sum(projection,1)[:,newaxis], 0)

        best = argsort(this_tf_sum)[::-1]
        tfweights[best[0:ntf_each]]=1
    print('''Finished computing representative TFs
''')

    tfs_of_interest = nonzero(tfweights)[0]
    ntf = len(tfs_of_interest)
    avg_shared = 1. - float(ntf)/(n * ntf_each)
    # The count used to be printed as literal source text because the
    # concatenation sat inside the triple-quoted string.
    print('Allowing for each cluster to choose ' + str(ntf_each) + ' tfs,\n'
          'we got ' + str(ntf) + ' tfs of interest.\n'
          'or a mean sharing ratio of ' + str(round(avg_shared,3)) + '.')

    # One colour per clustering.
    ct = mycolors.getct(n)

    xax = linspace(0,1,ntf)
    for i in range(n):
        selected = TFprojs[i,:,tfs_of_interest]
        ax1.plot(xax, np.sum(selected,1)/np.max(selected,1), color = ct[i])

    return TFprojs
Пример #18
0
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    """Compare trees built from muscle and from structural alignments.

    Alignments of both kinds are obtained through ``mem.getOrSet``.  For
    every pair of alignments a tree is built with ``phyml.tree``, the
    pairwise terminal distances of both trees are compared with
    ``tree_similarity`` (once with all other terminals as neighbours, once
    with k = 6), and both trees are drawn as graphs on figure 4, which is
    saved as PostScript.

    Args:
        seqs, profiles: forwarded to ``setAlignments``.
        run_id: used in cache register names, tree runs and output paths.
        reset: forwarded to ``mem.rc`` for both alignment lookups.
    """
    import networkx

    print('computing alignments...')
    print('  ...using muscle')
    malis, mrefs, mpairs =\
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles,
                                  run_id = run_id, ali_type = 'muscle',
                                  reset = reset,
                                  on_fail = 'compute',
                                  register = 'tuali_musc_{0}'.format(run_id)))
    print('  ...using cmalign.')
    salis, srefs, spairs  =\
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali__struct_{0}'.format(run_id)))

    print('  ...making trees.')

    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree  = phyml.tree(m,run_id, bionj = True)
        stree  = phyml.tree(s,run_id, bionj = True)

        # Row/column of every sequence id in the distance matrices.
        maps = dict([(elt.id,i) for i, elt in enumerate(m)])
        mdists = zeros((len(maps),len(maps)))
        sdists = zeros((len(maps),len(maps)))
        for tree, dists in ((mtree, mdists), (stree, sdists)):
            terminals = tree.get_terminals()
            for n1 in terminals:
                for n2 in terminals:
                    dists[maps[n1.name],maps[n2.name]] = \
                        tree.distance(n1,n2)

        # tree_similarity labels its first matrix "Muscle" and its second
        # "Struct", so the muscle distances go first.  The first call uses
        # every other terminal as a neighbour; it previously evaluated
        # len(sdists - 1), which is just len(sdists).
        sim_id = '{0}_struct_{1}'.format(run_id,idx)
        tree_similarity(mdists, sdists, sim_id, k = len(sdists) - 1)
        tree_similarity(mdists, sdists, sim_id, k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        for t, sp, ttype in zip([mtree, stree], [211,212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            # Only nodes whose name is a known sequence id get a visible
            # marker; other nodes are drawn with size 0 and take the last
            # colour of the table.
            networkx.draw(G, posn, labels = dict([(n, '') for n in G.nodes()]),
                      node_size = [100 if  n.name in maps else 0 for n in G.nodes()],
                      width = 1, edge_color = 'black',
                      ax = a,
                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] )

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,0],textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
Пример #19
0
def view4():
    """Export nonlinear regression weights of every mcmc run as .sif files.

    Lists the files containing "mcmc" under ``cfg.dataPath("batch/tmp")``,
    keeps those whose input record has ``out_iter_num`` greater than -1 and
    different from 2, and for each of them writes two tab-separated files
    (all factors of every term / single-factor terms only) while collecting
    plotting data.

    Returns:
        Tuple ``(cnames, xvals, gvals, yvals, colors, rads, l_info, tfs,
        coefs)``.  ``coefs`` is the coefficient table of the LAST processed
        file, or None when no file was processed.
    """
    tmp_dir = cfg.dataPath("batch/tmp")
    files = [l for l in os.listdir(tmp_dir) if "mcmc" in l]
    fpaths = [os.path.join(tmp_dir, f) for f in files]
    ids = [l[0:10] for l in files]
    inps = [butils.load_data(i, "input") for i in ids]

    idxs_good = nonzero(greater([elt.get("out_iter_num") for elt in inps], -1))[0]
    inps = [inps[i] for i in idxs_good]
    fpaths = [fpaths[i] for i in idxs_good]

    cnames, xvals, gvals, yvals, colors, rads, tfs, all_coefs = [], [], [], [], [], [], [], []
    l_info = {}
    # Bound up front so the return below cannot hit an unbound name when
    # every file is skipped.
    coefs = None

    for l, (f, inp) in enumerate(zip(fpaths, inps)):
        if inp["out_iter_num"] == 2:
            continue
        print(inp["filename"])
        clustname = re.search(r"_([^_]+)\.mat", inp["filename"]).group(1)
        cnames.append(clustname)
        l_info[l] = {}
        l_info[l]["cname"] = clustname
        l_info[l]["filename"] = inp["filename"]

        data = sio.loadmat(f)
        l_info[l]["stay_same"] = data["stay_same"]
        l_info[l]["improve_ratio"] = data["improve_ratio"]
        l_info[l]["error_test"] = data["error_test"]

        import compbio.utils.colors as mycolors

        ct = mycolors.getct(len(data["gene_names"]))

        term_list = [list(it.chain(*mod)) for mod in data["model"]]

        all_coefs.append(data["coefs_dic_nonlinear"])
        coefs = data["coefs_dic_nonlinear"]

        all_path = cfg.dataPath(
            "network/network_predmodel/regressionwts/nonlinear_all/nw_{0}.sif".format(l))
        sing_path = cfg.dataPath(
            "network/network_predmodel/regressionwts/nonlinear_sing/nw_{0}.sif".format(l))

        tfnames = data["tf_names"]
        tgnames = data["gene_names"]

        # `with` closes both files even when the loop body raises.
        with open(all_path, "w") as nlcof_all, open(sing_path, "w") as nlcof_sing:
            for i, terms in enumerate(term_list):
                # Debugging stop kept from the original for specific models.
                if i in (5, 49, 53, 30, 17, 8, 38):
                    if sum(terms) > 0:
                        raise Exception()
                # Indices in the file are shifted down by one before use.
                terms = [t - 1 for t in terms]
                for j, term in enumerate(terms):
                    if len(term) == 1:
                        wt = coefs[i][0][0][j]
                        # NOTE(review): this line indexes tgnames[i][0]
                        # while the write below uses tgnames[i][0][0];
                        # confirm which one is intended.
                        nlcof_sing.write("{0}\t{1}\t{2}\n".format(tfnames[term][0][0], tgnames[i][0], wt))

                    for fact in list(set(term)):
                        wt = coefs[i][0][0][j]
                        nlcof_all.write("{0}\t{1}\t{2}\n".format(tfnames[fact][0][0], tgnames[i][0][0], wt))

                        gvals.append([i] * (len(term) + 1))
                        yvals.append([fact] * (len(term) + 1))
                        colors.append([ct[c] for c in sorted(term)] + [1, 1, 1])
                        tfs.append([c for c in sorted(term)])
                        rads.append(((arange(1, len(term) + 2) ** 2) * 50)[::-1])
                        xvals.append([l] * (len(term) + 1))

    return cnames, xvals, gvals, yvals, colors, rads, l_info, tfs, coefs
Пример #20
0
def show_output(outputs,
                show = 'conservation',
                save = True):
    '''Draw one of the result views for a run on the current figure.

    outputs: dict; the keys read here are 'all_vecs' (which must hold
             'all_time', 'all_mut' and 'fiftyfifty'), 'run_id',
             'exemplar_structs', 'reference_seq', 'thermo_pairs',
             'thermo_ex_inds' and 'title'.
    show:    'embeddings'   -> embed the thermo pairs twice (aff_type
                               'pairs' and 'easy') and hand both to
                               rplots.plot_clusters.
             'conservation' -> three stacked seismic panels: projected
                               exemplar structures, per-structure sums and
                               per-clade sums of the conservation vectors.
    save:    passed to rplots.plot_clusters for 'embeddings'; for
             'conservation' decides whether the figure is written under
             cs874/figs/cons_profiles/.

    Returns None.  Raises Exception for any other value of show, and
    AttributeError if the run title contains no 'RF...' token.
    '''
    mvecs = outputs['all_vecs']['all_time']
    tvecs = outputs['all_vecs']['all_mut']
    fvecs = outputs['all_vecs']['fiftyfifty']

    run_id = outputs['run_id']
    structs = outputs['exemplar_structs']
    ref = outputs['reference_seq']

    thermo_pairs = outputs['thermo_pairs']
    thermo_inds = outputs['thermo_ex_inds']

    run_title = outputs['title']
    fam_name = re.compile(r'RF\d*').search(run_title).group()

    # The original clears the current figure twice in a row before drawing.
    # NOTE(review): presumably the second attempt works around a failure of
    # the first on 3d axes -- both attempts are kept.
    for _attempt in range(2):
        fig = plt.gcf()
        try:
            fig.clear()
        except Exception:
            print('wonky 3d bug')
    fig.canvas.draw()

    # One colour per distinct exemplar index.
    exemplar_inds = sorted(set(thermo_inds))
    struct_colors = dict(zip(exemplar_inds,
                             mycolors.getct(len(exemplar_inds))))

    if show == 'embeddings':
        pair_embedding = compute_embedding(thermo_pairs,
                                           aff_type = 'pairs',
                                           do_mve = False,
                                           ss_multiplier = None)
        shape_embedding = compute_embedding(thermo_pairs,
                                            aff_type = 'easy',
                                            do_mve = False,
                                            ss_multiplier = None)
        show_3d = True
        #shape_embedding[0] is pca
        rplots.plot_clusters(thermo_inds,
                             {'shape': shape_embedding[0],
                              'pairs': pair_embedding[0]},
                             plot3d = show_3d,
                             title = 'projection ({0}) '.format(run_id),
                             save = save,
                             colors = struct_colors)

    elif show == 'conservation':
        ax0 = fig.add_subplot('311')
        lstructs = [project_lstruct(p, len(ref)) for p in structs]
        seismic.seismic([abs(l) for l in lstructs],
                        colors = struct_colors.values(),
                        ax = ax0)
        myplots.maketitle(ax0, 'Predicted conservation patterns for {0}'.format(fam_name))

        # Keep only the vectors whose second dimension is non-empty.
        shapes = array([shape(m) for m in mvecs])
        igood = nonzero(greater(shapes[:,1],0))[0]
        clade_colors = mycolors.getct(len(igood))
        mvg, tvg, fvg = [[vecs[i] for i in igood]
                         for vecs in [mvecs, tvecs, fvecs]]
        # NOTE(review): the third slot repeats tvg and fvg is never used;
        # this looks like it may have been meant to be fvg -- confirm before
        # changing, since it alters the plotted sums.
        cons_types = array([mvg, tvg, tvg])

        if shape(cons_types)[1] == 0:
            print('No good vectors!')
            return

        # Normalise each conservation type to unit total.  The original
        # computed the zero guard but then divided by the raw sum anyway,
        # which divides by zero for an all-zero profile.
        for c in cons_types:
            nrm = sum(c.flatten())
            if nrm == 0:
                nrm = 1
            c /= nrm

        # Collapse the last axis, then the conservation-type axis; the two
        # panels below show the same table in its two orientations.
        type_sums = np.sum(np.sum(cons_types,3),0)
        mtype_sums = type_sums
        stype_sums = type_sums.T

        ax1 = fig.add_subplot('312')
        seismic.seismic(stype_sums,
                        colors = struct_colors.values(),
                        ax = ax1)
        myplots.maketitle(ax1,
                          'Observed conservation (struct v. clade) patterns for {0}'
                          .format(fam_name))

        ax2 = fig.add_subplot('313')
        seismic.seismic(mtype_sums,
                        ax = ax2, colors = clade_colors, stacked = True,
                        label_y = False)

        # The title of the bottom panel is drawn as an annotation under the
        # axes rather than through myplots.maketitle.
        ax2.annotate('Observed conservation (clade v. struct) patterns for {0}'
                     .format(run_title),
                     [.5,0], xycoords = 'axes fraction', ha = 'center', va = 'top',
                     size = 'x-large')

        if save:
            fig.savefig(cfg.dataPath('cs874/figs/cons_profiles/{0}.ps'.format(run_title)))

    else:
        raise Exception('show type not implemented: {0}'.format(show))
Пример #21
0
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
        draw_distances = draw_all_easy,
        draw_clusters = draw_all_easy,
        draw_single_cluster = draw_all_hard):
    '''Split the sequences of one Rfam family into distance-based clusters.

    The family alignment and tree are fetched with rfam.get_fam(rfid), a
    distance matrix is obtained (cached through mem.getOrSet with
    setDistances) and cut into at most five groups by
    maxclust_dists(..., k = 5, method = 'complete').

    kwds:
      rfid            Rfam family id.
      reset           forwarded to mem.getOrSet for the distance matrix.
      tree            accepted for backward compatibility but not consulted:
                      the tree object returned by rfam.get_fam is what gets
                      passed on to setDistances.
      draw_distances  if true, scatter tree distances against Levenshtein
                      distances and save the figure.
      draw_clusters   if true, scatter the sequences in the first two
                      principal components coloured by cluster and save it.
      draw_single_cluster
                      accepted but not used in this function.

    Returns a list with one entry per cluster, each entry being the list of
    alignment records (ali[idx]) that fell in that cluster.
    '''
    # Unpack under a separate name so the boolean `tree` keyword is not
    # silently replaced by the family tree object.
    ali, fam_tree, infos = rfam.get_fam(rfid)

    if draw_distances:
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')

        ax.scatter(dtf, dlf, 100)

        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = fam_tree, run_id = rfid,
                         register = rfid,
                         on_fail = 'compute',
                         reset = reset)

    # Labels are shifted down by one so they can index the colour table.
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    clusters -= 1

    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')

        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)

        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    # Group sequence indices by cluster label.  The pairs are sorted by
    # label first because groupby only merges adjacent equal keys; the sort
    # is stable, so indices stay ascending inside each group.
    by_label = sorted(enumerate(clusters), key = lambda pair: pair[1])
    cgrps = dict((label, [pair[0] for pair in grp])
                 for label, grp in it.groupby(by_label,
                                              key = lambda pair: pair[1]))

    clusters_final = list(cgrps.values())
    seqs_final = [[ali[idx] for idx in clust] for clust in clusters_final]
    return seqs_final
Пример #22
0
def cluster_tissues(nx = 20,ny = 500, timepoint = -1,
                    step = 4,
                    sim = 'neg_dist',
                    imshow_sims = False,
                    scatter_sims = False,
                    hist_sims = False,
                    do_cluster= True,
                    do_show = True, cstep = -1):
    '''Cluster ny nuclei by the values of the nx mRNAs with highest
    variance. Uses the medoids method with number of clusters set
    by exemplar self similarity as outlined in 6.874 and implemented
    at http://www.psi.toronto.edu/affinitypropagation

    imaging:
    imshow_sims   show the training similarity matrix as an image
    scatter_sims  scatter plot built from the training similarity matrix
    hist_sims     histogram of log10(1 + (max - similarity))
    do_show       scatter the nuclei coloured by assigned exemplar and
                  save the frame under figs/bdtnp/

    numerics:
    nx:           number of genes to cluster upon
    ny:           number of cells in the clustering
    timepoint:    which time to use for cluster computation
    step:         stride over cells when showing results
    sim:          similarity method name handed to similarity()

    do_cluster and cstep are accepted but not used in this function.
    Returns None.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)
    mrna_list = list(mrnas.values())

    #choose to look only at one timepoint
    stds = [std(m['vals'][:,timepoint]) for m in mrna_list]
    vsort = argsort(stds)[::-1]
    xinds = vsort[:nx]

    #Choose the most variable factors and use them as the
    #underlying variables from which to construct a similarity
    nuclei = array([mrna_list[idx]['vals'][:,timepoint]
                    for idx in xinds]).T

    t = [mean(nuclei, 0), std(nuclei, 0)]
    # NOTE(review): this writes 0 over entries that are already 0, so it has
    # no effect; it may have been meant to be 1 to protect a later division
    # by the std -- confirm against similarity() before changing.
    t[1][equal(t[1],0)] = 0
    sims = similarity(nuclei, transform = t, method = sim)
    cluster_inds = array(floor(linspace(0,len(nuclei)-1, ny)), int)
    cluster_training = sims[cluster_inds,:][:,cluster_inds]

    f = plt.figure(0)

    # The three diagnostic plots below used to reference an undefined name
    # (cluster_sims) and raised NameError; cluster_training is the only
    # sub-sampled similarity matrix built above.
    if scatter_sims:
        ax = f.add_subplot(111)
        scatterx = [cluster_training[i] for i in range(ny) for j in range(ny)]
        scattery = [cluster_training[j] for i in range(ny) for j in range(ny)]
        ax.scatter(scatterx, scattery, s =3, alpha = .1)

    if imshow_sims:
        ax = f.add_subplot(111)
        cmap = mycolors.blackbody()
        ax.imshow(cluster_training, cmap = cmap, interpolation = 'nearest')

    if hist_sims:
        ax = f.add_subplot(111)
        csf = cluster_training.flatten()
        csf -= max(csf)
        csf *= -1
        h = histogram(log10(1+csf), bins = 100)
        ax.plot(h[1][:-1],h[0])

    cluster(cluster_training, ss.scoreatpercentile(cluster_training,.2) )

    # The exemplar index of every training nucleus is read back from disk,
    # one integer per line.  NOTE(review): presumably this file is written
    # by cluster() above -- confirm.
    with open(cfg.dataPath('bdtnp/clustering/nuclei/idxs')) as fopen:
        c = [int(l.strip()) for l in fopen.readlines()]
    c_training_exemplars = set(c)
    exemplar_inds = [cluster_inds[i] for i in c_training_exemplars]
    #I am being a bit lazy with subscripting here because I just assume
    #that the similarity is symmetric... I suppose I could let it be
    #asymmetric if I liked

    exemplars = nuclei[exemplar_inds,:]
    all_sims = similarity(nuclei,  exemplars,
                          transform = t,
                          transform_exemplars = True,
                          method = sim)
    assignments = np.argmax(all_sims,1)

    colors = array(mycolors.getct(len(c)))

    if do_show:
        # The [-1:] slice restricts the loop to the last timepoint only.
        for tp in range(shape(mrna_list[0]['vals'])[1])[-1:]:
            try:
                f.clear()
            except Exception:
                print('Weird 3d plotting error. Alas')
            nuclei = array([mrna_list[idx]['vals'][:,tp]
                            for idx in xinds]).T
            all_sims = similarity(nuclei,  exemplars,
                                  transform = t, transform_exemplars = True,
                                  method = sim)
            assignments = np.argmax(all_sims,1)

            ax = f.add_subplot(111)
            xs = misc['x']['vals'][::step,0]
            zs = misc['z']['vals'][::step,0]
            ax.scatter(xs, zs,s= 50, color =colors[assignments[::step]])

            f.savefig(cfg.dataPath('figs/bdtnp/cluster_movie{0:02d}.tiff'.format(tp)), format = 'tiff')
Пример #23
0
def cluster_exprs(all_members, ct_data,
                  do_plot = False,
                  cluster_type = '4d',
                  cluster_id = 4):
    '''Export per-tissue, per-timepoint expression values for one clustering.

    all_members:  indexable; all_members[cluster_id] holds one cluster label
                  per row of ct_data.
    ct_data:      array of index pairs; column 1 is compared against the
                  timepoints 0..5 and each row is used to index the 'vals'
                  arrays from nio.getBDTNP().
    do_plot:      if true, draw per-tissue timepoint counts on figure 1.
    cluster_type, cluster_id:
                  only used (besides selecting the labels) to build the
                  output file names under soheil/.

    For every (tissue, time) combination a .mat file with the expression
    values and a figure with the X-Z / Y-Z positions are written.

    NOTE(review): as written the function cannot return normally -- it ends
    in an unconditional raise Exception(); see the notes at the bottom.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)

    c = all_members[cluster_id]
    c_unq = set(list(c))

    # One pseudo "tissue" per distinct cluster label.
    tissues = dict([('t_{0}'.format(i), dict(cts = ct_data[equal(c, elt)]))
                    for i, elt in enumerate(c_unq)])

    nt = 6
    counts = array([[sum(equal(tiss['cts'][:,1], tpoint))
                     for tpoint in range(nt)]
                    for tiss in tissues.values()])

    if do_plot:
        f = plt.figure(1)
        f.clear()

        ax1 = f.add_subplot('121')
        ax2 = f.add_subplot('122')
        seismic.seismic(counts , ax = ax1,stacked = True,colors = mycolors.getct(len(counts)))
        ax2.hist(np.sum(counts,1))

    all_exprs = {}
    for t, v in tissues.items():
        ct_all = v['cts']

        for time in set([row[1] for row in ct_all]):
            ct = [row for row in ct_all if row[1] == time]
            # Index with an explicit tuple of coordinate sequences; a bare
            # list of sequences only works through NumPy's legacy handling.
            cell_idx = tuple(zip(*ct))

            exprs = dict([(k, elt['vals'][cell_idx]) for k, elt in mrnas.items()])
            ys = misc['y']['vals'][cell_idx]
            zs = misc['z']['vals'][cell_idx]
            xs = misc['x']['vals'][cell_idx]

            f = plt.figure(1)
            f.clear()
            ax1 = f.add_subplot('121', title = 'X-Z axis view for tissue {0}'.\
                                  format(t))
            ax2 = f.add_subplot('122',title = 'Y-Z axis view for tissue {0}'.\
                                  format(t))
            ax1.scatter(xs, zs)
            ax2.scatter(ys, zs)

            v['exprs'] = exprs
            all_exprs['tiss_{0}_time_{1}'.format(t,time)]=exprs

            # Both outputs are binary, so open in 'wb' and close the handles
            # deterministically (the original leaked text-mode handles).
            mat_path = cfg.dataPath('soheil/expression_c{0}_n{1}_tissue{2}_time{3}.mat'.\
                                      format(cluster_type,cluster_id,t,time))
            with open(mat_path, 'wb') as mat_file:
                sio.savemat(mat_file, exprs)

            # NOTE(review): savefig receives a file object without a format
            # argument, so the extension is not available to pick the image
            # type -- confirm the files really are TIFF.
            fig_path = cfg.dataPath('soheil/expression_c{0}_n{1}_tissue{2}_time{3}.tiff'.\
                                      format(cluster_type,cluster_id,t,time))
            with open(fig_path, 'wb') as fig_file:
                f.savefig(fig_file)

    # NOTE(review): all_exprs[k] maps gene name -> values, so `sub` is a
    # value array and sub[k] indexes it with the 'tiss_.._time_..' string;
    # this looks wrong (mean(sub) may have been intended) -- confirm.
    exprs_out = dict([( k, [ mean(sub[k]) for sub in all_exprs[k].values() ])
                      for k in all_exprs.keys() ])

    out_path = cfg.dataPath('soheil/expression_c{0}_n{1}_intercluster.mat'.\
                              format(cluster_type,cluster_id))
    with open(out_path, 'wb') as out_file:
        sio.savemat(out_file, exprs_out)

    # NOTE(review): unconditional raise kept from the original; presumably a
    # debugging stop -- confirm whether callers rely on it.
    raise Exception()
Пример #24
0
def c2( launcher = None, ncluster =2000, host = 'tin',
        reset = 0, step = 10, exemp_time = 'all',
        doplot = False ,**kwargs):
    '''Two-stage exemplar clustering of all (cell, timepoint) samples.

    Stage 1 (launcher is None): pick the genes that are in the top 20 by
    both minimum and median variance, draw ncluster random samples, build
    their similarity matrix and wrap one job per self-similarity percentile
    in a bcl.launcher.  The (cached) launcher is returned and nothing else
    is done.

    Stage 2 (launcher given): read launcher.output(), and for each result
    assign every sample to its most similar exemplar.

    launcher:    None for stage 1, otherwise an object with .output().
    ncluster:    number of random samples used to find exemplars.
    host:        forwarded to bcl.launcher.
    reset:       forwarded to the mem cache calls.
    step:        stride over points when plotting.
    exemp_time:  'all', or a timepoint index restricting the samples that
                 exemplars are drawn from.
    doplot:      if true, draw a grid of scatter plots (one row per result,
                 one column per timepoint) on figure 0.

    Returns the launcher in stage 1; in stage 2 returns
    (all_members, ct_data) where all_members holds one assignment array per
    result and ct_data the (cell, timepoint) index pair of every sample.

    Note: reseeds NumPy's global random generator with 1.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)

    vals = array([v['vals'] for v in mrnas.values()])
    gvars = var(vals, 1)
    gminvars = np.min(gvars,1)
    gmedvars = median(gvars,1)

    min20 = argsort(gminvars)[::-1][:20]
    med20 = argsort(gmedvars)[::-1][:20]

    # Genes that rank in the top 20 under both variance summaries.
    xgenes = array(list(set(min20).intersection(set(med20))))

    cell_data = vals[xgenes].transpose(1,2,0)
    scd = shape(cell_data)
    # xycoords[i, j] == [i, j]; flattened below in step with cell_data so
    # each row of xy_data names the (cell, timepoint) of that sample.
    xycoords = (arange(scd[0])[:,newaxis,newaxis]*[1,0] +
                arange(scd[1])[newaxis,:,newaxis]*[0,1])
    cell_data = reshape(cell_data, (prod(scd[0:2]), scd[2]))
    xy_data = reshape(xycoords, (prod(scd[0:2]), 2))

    if exemp_time == 'all':
        inds = arange(len(cell_data))
    else:
        inds = arange(len(cell_data))[nonzero(equal(xy_data[:,1],exemp_time))[0]]

    np.random.seed(1)
    np.random.shuffle(inds)
    rand_thousand = inds[0:ncluster]

    sim_data = cell_data[rand_thousand]
    t = [mean(sim_data, 0), std(sim_data, 0)]
    # NOTE(review): writes 0 over entries that are already 0 (no effect);
    # may have been meant to be 1 -- confirm against similarity().
    t[1][equal(t[1],0)] = 0
    metric = 'neg_dist'
    sims = similarity(sim_data, transform = t, method = metric)

    name = 'll_{0}_{1}_{2}'.format(metric,ncluster,exemp_time)

    def setLauncher(**kwargs):
        sims = kwargs.get('sims')
        metric = kwargs.get('metric')
        name = kwargs.get('name')
        # One job per self-similarity percentile.
        d_in = [dict(similarities = sims,
                     self_similarity = ss.scoreatpercentile(sims, p),
                     metric = metric)
                for p in logspace(.1,1.5,8)]
        return bcl.launcher(d_in, host = host, name = name)

    if launcher is None:
        output = mem.getOrSet(setLauncher,
                              **mem.rc(dict(sims = sims, metric = metric,
                                            name = name,
                                            hardcopy = True,
                                            reset = reset,
                                            hard_reset = False,)))
        return output

    def setC2(launcher = launcher, **kwargs):
        if launcher is None:
            raise Exception()
        return launcher.output()
        #It appears that the bsub process failed for the first output.
        #No big deal. Debug later.

    # 'hardcopy' was misspelled 'harcopy' here, so the option was not passed
    # under the name used by the first mem.rc call above.
    output = mem.getOrSet(setC2,
                          **mem.rc(dict(hardcopy = True,
                                        launcher = launcher,
                                        reset = reset,
                                        on_fail = 'compute',
                                        hard_reset = False,
                                        name =  'c2'+ name )))
    all_inds = array([squeeze(o['inds']) for o in output[:]])

    # Explicit tuple of coordinate arrays instead of a list of tuples, which
    # only indexes element-wise through NumPy's legacy list handling.
    cell_idx = tuple(xy_data.T)
    xs = misc['x']['vals'][cell_idx]
    ys = misc['y']['vals'][cell_idx]
    zs = misc['z']['vals'][cell_idx]

    colors = array(mycolors.getct(shape(all_inds)[1]))
    f = plt.figure(0)
    f.clear()

    all_tps = range(scd[1])
    nc = len(all_inds)
    nt = len(all_tps)

    all_members = []
    for i, inds in enumerate(all_inds):
        # Assign every sample to the most similar of this run's exemplars.
        exemplars = sim_data[list(set(list(inds)))]
        sim = similarity(cell_data,
                         exemplars,
                         transform = t,
                         method = metric)
        closest = argmax(sim, 1)
        all_members.append(closest)

        if doplot:
            for j, tp in enumerate(all_tps):
                # Tile the figure: column j is the timepoint, row i the run.
                ax = f.add_axes([float(j)/nt, float(i)/nc, 1./nt, 1./nc])
                ax.set_yticks([])
                ax.set_xticks([])
                i_sub = nonzero(equal(xy_data[:,1], j) * greater(ys,0))[0]
                cs = colors[closest[i_sub]]
                x = xs[i_sub]
                z = zs[i_sub]
                ax.scatter(x[::step],z[::step], 40,alpha = .75, c = cs[::step], edgecolor = 'none')

    ct_data = xy_data
    return all_members, ct_data
Пример #25
0
def one(all_means, all_mems,
        tfp,
        axis = 'tf',
        idxs = [0,1], fig = 5
        ,choice_ax = 'x'
        ,nrml = 'axis'
        ,sorting = 'axis'):
    '''Overlay two clusterings as a blue/red image and mark their maxima.

    all_means, all_mems, tfp:
                indexable by clustering number; tfp[i] must be a 2-d array
                (its absolute value is the image when axis == 'tf').
    axis:       'tf'   -> the images are |tfp[idxs[0]]| and |tfp[idxs[1]]|,
                          re-ordered by the position of their maxima.
                'gene' -> the images are 0/1 cluster-membership matrices
                          built from all_mems.
    idxs:       the two clusterings to compare (first -> blue channel,
                second -> red channel).
    fig:        matplotlib figure number to draw on.
    choice_ax:  'x' marks the largest row in every column, 'y' the largest
                column in every row.
    nrml, sorting:
                currently ignored -- see the note in the body.

    Returns None.  Raises Exception('bad axis') for an unknown axis or
    choice_ax.
    '''
    m = all_means[idxs[0]]
    c = all_mems[idxs[0]]
    proj = abs(tfp[idxs[0],:,:])

    m2 = all_means[idxs[1]]
    c2 = all_mems[idxs[1]]
    proj2 = abs(tfp[idxs[1],:,:])

    sqidxs = nu.net_sq_keyidxs()
    trgs, tfs = nu.parse_net()
    gene_sqidxs = [sqidxs[key] for key in trgs.keys()]

    k = len(m)
    ng = len(gene_sqidxs)

    print('''Getting ready to plot clusters mapped on to tf components.

--note--
In its current incarnation, netutils orders tfs by their out degree
and genes by their in degree.

Thus viewmany() orders projects by TF out degree. Left unsorted, this
is the order of the TF x axis.''')

    #how to normalize the image?
    #axis: equal sum for each tf over all clusters.
    #other: equal sums for each cluster in img

    # NOTE(review): the nrml and sorting arguments are overwritten here, so
    # whatever the caller passes is ignored.  Kept as in the original
    # because dropping the override would change the result of calls that
    # rely on the defaults (sorting defaults to 'axis' but 'other' is used).
    nrml = 'axis'
    sorting = 'other'

    print(axis)

    d0 = shape(m)[1]
    d2 = shape(m2)[1]

    show_membership = True

    if axis == 'tf':
        if sorting == 'axis':
            # Order rows by the column index of their maximum.
            img = proj[np.argsort(argmax(proj,1)),:]
            img2 = proj2[np.argsort(argmax(proj2,1)),:]
        else:
            # Order columns by the row index of their maximum.
            img = proj[:,np.argsort(argmax(proj,0))]
            img2 = proj2[:,np.argsort(argmax(proj2,0))]
    elif axis == 'gene':
        maxgene = 200
        gsort = argsort(c)

        if d0 == 8321 and not show_membership:
            img = m[:,gsort][:,:maxgene]
        else:
            img = zeros((k,ng))
            for i in range(ng):
                img[c[i],i] = 1
        if d2 == 8321 and not show_membership:
            img2 = m2[:,gsort][:,:maxgene]
        else:
            img2 = zeros((k,ng))
            for i in range(ng):
                img2[c2[i],i] = 1
    else:
        # Previously an unknown axis surfaced later as a NameError on img.
        raise Exception('bad axis')

    #normalize to generate an image
    if nrml == 'axis':
        img2 = img2/np.max(img2,0)[newaxis,:]
        img = img/np.max(img,0)[newaxis,:]
    else:
        img2 = img2/np.max(img2,1)[:,newaxis]
        img = img/np.max(img,1)[:,newaxis]

    # Scale each image by its own maximum (img2 used to be divided by the
    # maximum of img).
    img /= np.max(img)
    img2 /= np.max(img2)

    # First clustering in the blue channel, second in the red channel.
    img_show = img[:,:,newaxis] *[0,0,1] + img2[:,:,newaxis]*[1,0,0]

    f = plt.figure(fig)
    f.clear()

    ax = f.add_axes([.05,.05,.9,.9])
    ax.imshow(img_show[:,:,:], aspect = 'auto')

    xs, ys, rs, cs = [[] for i in range(4)]

    nchoice = 1
    if choice_ax == 'y':
        dim = shape(img)[0]
        maxes = [argsort(img,1)[::-1][:,:nchoice],
                 argsort(img2,1)[::-1][:,:nchoice]]
    elif choice_ax == 'x':
        dim = shape(img)[1]
        maxes = [argsort(img,0)[::-1][:nchoice,:],
                 argsort(img2,0)[::-1][:nchoice,:]]
    else:
        raise Exception('bad axis')

    ct = mycolors.getct(len(maxes))
    for j in range(len(maxes)):
        for i in range(dim):
            for pick in range(nchoice):
                if choice_ax == 'x':
                    ys.append(maxes[j][pick][i])
                    xs.append(i)
                else:
                    xs.append(maxes[j][i][pick])
                    ys.append(i)

                # The first image gets the larger marker so the second
                # stays visible on top of it.
                rs.append(20 + 30*(1-j))
                cs.append(ct[j])

    xs, ys, rs, cs = np.array(xs),np.array(ys),np.array(rs),np.array(cs)

    # White backing discs first, then the coloured markers.
    ax.scatter(xs,ys,200,'1',edgecolor = 'none')
    ax.scatter(xs,ys,rs,cs,alpha = .8, edgecolor = 'none')
Пример #26
0
def sig_grid(num = 1 ,  method = 'tree', reset = False,
             plot_kcs = True,
             bp_means = False,
             bp_zeros = True, zero_ofs = 1e-6,
             bp_logs = True,
             show_kos = False,
             filter_rows_and_cols = False):
  '''Draw the significance heatmaps, boxplots and summary plots.

  Figures produced:
    figure 1  one heatmap axes per entry of exps, laid out left to right,
              each overlaid with a line of per-experiment means; saved as
              daniel/figs/{method}_net{num}_heatmaps.tiff
    figure 3  boxplot of the values per experiment class plus the knockout
              values; saved under daniel/figs/, then cleared and reused for
              a seismic plot of the normalised histograms
    figure 4  percentage of nonzero entries and mean of nonzero entries
              per class

  kwds:
    num, method           only used in titles and output file names here
    plot_kcs              circle the knockout cells on the heatmaps
    bp_means              boxplot per-experiment means instead of all cells
    bp_zeros              keep zero entries in the boxplot data
    zero_ofs              offset added before taking logs
    bp_logs               boxplot log(value + zero_ofs)
    filter_rows_and_cols  only affects the output sub-directory name

  reset and show_kos are accepted but not used in this body.

  NOTE(review): exps, grid, knockout_cells, knockout_vals and knockout_tfs
  are read but never assigned in this function, so they must be provided by
  an enclosing/module scope -- confirm where they come from.
  '''



  #Make and annotate the heatmap figure
  f = plt.figure(1, facecolor = 'w')
  f.clear()
  axdims= .9
  ax_box = array([.05,.05,axdims,axdims])
  sg_big_hm_annotations(f, ax_box)

  #Set up the sizes of each group axis in the heatmap figure
  # kwts is the total number of experiments; msize is the gap left between
  # neighbouring groups, expressed in the same units.
  kwts = float(sum([len(v) for  v in exps.values()]))
  mwidth = .015
  msize = mwidth*kwts
  kw_total = kwts +  ( msize * (len(exps)-1))
  ofs = 0


  # Zero out the cells listed in knockout_cells (always, since the flag is
  # hard-coded to False).
  allow_tf_kn = False
  if not allow_tf_kn: grid[zip(*knockout_cells)] = 0

  #Some more heatmap cfguration.
  # Colour scale limits: 10th and 90th percentile of the positive entries.
  saturation = [np.percentile(grid[nonzero(greater(grid,0))],10),
                np.percentile(grid[nonzero(greater(grid,0))],90)]
  tf_srt = argsort(np.mean(grid,1))
  all_bps = []
  expsums = [np.mean( grid.T[v,:], 1) for v in exps.values()]
  max_sum = np.max((list(it.chain(*expsums))))

  #For each experiment class, plot a heatmap and overlay per exp sums
  for k , v in exps.iteritems():
    #Axes positioning
    wid = len(v)
    ax_ofs =  array([ofs/kw_total, 0, (wid) / kw_total,1.])
    ax_box = array([.05,.05,0.,0.])
    ax_ofs = (ax_ofs * axdims) + ax_box

    #Make heatmap axes.
    ax = f.add_axes(ax_ofs, frameon = False)
    sums = np.mean(grid.T[v,:],1)
    exp_srt = argsort(sums)[::-1]
    hm.heatMap( grid.T[v[exp_srt],:][:,tf_srt], axes = ax,
                vmin = saturation[0],
                vmax = saturation[1])

    #Make overlay axes.
    ax2 = f.add_axes(array(ax_ofs) +  array([0,0,0,0]),
                     frameon = True,
                     axisbg = 'none',
                     xticks = [],
                     yticks = [])
    
    #Make the axes look the way I like em
    for a in ax2.spines.values():
      a.set_linewidth(2)
      a.set_alpha(.5)
    # Knockout cells whose second entry is one of this group's experiments.
    these_knockouts = nonzero([c [1]in v for c in knockout_cells])
    kc = knockout_cells[these_knockouts]
    kv = knockout_vals[these_knockouts]
    
    #If plot kcs is selected, plot the cells corresponding to TF deletion/OE
    if plot_kcs:  
      if len(kc) > 0:
        ax.scatter(*zip(*[( list(v).index(x[1]),x[0]) for x in kc]), s =50, 
                  color = 'none', edgecolor = 'black', linewidth = 3)
    color = 'blue'
    ax2.plot(sums[exp_srt],
            linewidth = 4, color = color)

    # Collect this group's boxplot data: either the per-experiment means or
    # every cell, optionally without zeros.
    if bp_means: bpelts = sums
    else: bpelts = grid.T[v,:].flatten()
    if not( bp_zeros ): bpelts = bpelts[nonzero(bpelts)]
    all_bps.append(bpelts)

    ax2.set_xlim([0,wid])
    ax2.set_ylim([0,max_sum])
    ax.set_xlim([0,wid])
    ax.set_ylim([0,shape(grid)[0]])
    ax2.set_xticks([])

    #Annotate each axios
    tbb = matplotlib.transforms.Bbox(ax2.bbox).translated(0,-20)
    t = ax2.text(-2,0, k, 
                 va = 'bottom', ha = 'right',
                 rotation = 90, color = 'black',
                 size = 'x-large', family = 'serif')
    # Advance past this group and the gap that follows it.
    ofs +=  wid + msize


  #Make the boxplot figure
  f2 = plt.figure(3)
  plt.clf()

  if bp_means:  bp_kos =  array([  mean(grid.T[g[0],:],0) 
                             for g in it.groupby(sorted(\
        [ko[1] for ko in knockout_cells]))
                             ])
  else: bp_kos = array(knockout_vals)
  if not bp_zeros: bp_kos = bp_kos[nonzero(bp_kos)]

  # The knockout values become the last box.
  all_bps = all_bps +  [bp_kos]

  ax3 = f2.add_subplot('111')
  if bp_logs: all_bps = [log(b + zero_ofs) for b in all_bps]
  # log of the offset alone, i.e. the value a zero entry maps to; used
  # below as the threshold for "nonzero".
  bp_lzero = log(zero_ofs)

  boxplots = ax3.boxplot([bp for bp in all_bps], widths= .5)
  for p in boxplots.values():
      for e in p: e.set_linewidth(4)    

  #Annotate the boxplot figure
  # NOTE(review): the legend is hard-coded to 8 entries (the keys of exps
  # plus the knockout class) -- confirm exps always has 7 keys.
  ann_str = ''
  for i in range(8):
    ann_str += '{0}: {1}\n'.format(i+1, (exps.keys() + ['TF Knockout/OE'])[i])
  ax3.annotate(ann_str, [0,1],xycoords = 'axes fraction',
               xytext = [10,-10], textcoords = 'offset pixels',
               va = 'top', ha = 'left')
  ax3.set_title('''Boxplot of significances per experiment type for {3} learning method, Net {4} 

Filtered out were {0} cells corresponding to {1} TFs Knocked out or OverExpressed.
{2} of these cells have nonzero importance and are plotted at x=9,

Showing Means: {5}, Showing zeros: {6}, Plotting logs {7}'''.\
                  format(len(knockout_cells), len(knockout_tfs),
                         len(nonzero(knockout_vals)[0]), 
                         method, num,
                         bp_means, bp_zeros, bp_logs))
  ax3.set_ylabel('significance')
  ax3.set_xlabel('experiment class')
  
  f.savefig(cfg.dataPath('daniel/figs/{0}_net{1}_heatmaps.tiff'.format(method, num)),
            format = 'tiff')

    
  # Output sub-directory chosen from the flag combination by an and/or
  # chain.  When no clause matches, the chain evaluates to a falsy value
  # that is formatted into the path as-is.
  plam = lambda: filter_rows_and_cols and 'nonzero_exps_and_tfs_cells_log/'\
      or bp_zeros and not bp_logs and bp_means and 'zeros_means_nolog/'\
      or not bp_zeros and bp_means and not bp_logs and 'nozeros_means_nolog/'\
      or not bp_zeros and bp_means and bp_logs and 'nozeros_means_log/'\
      or bp_zeros and not bp_means and not bp_logs and 'zeros_cells_nolog/'\
      or not bp_zeros and not bp_means and not bp_logs and 'nozeros_cells_nolog/'\
      or not bp_zeros and not bp_means and bp_logs and 'nozeros_cells_log/'

  dataDir = cfg.dataPath('daniel/figs/{2}{0}_net{1}_boxplots.tiff'.\
                              format(method, num,plam()))
  print 'saving {0}'.format(dataDir)
  # Create the target directory if needed and replace any existing file.
  if not os.path.isdir(os.path.dirname(dataDir)): os.mkdir(os.path.dirname(dataDir))
  if os.path.isfile(dataDir): os.remove(dataDir)
  f2.savefig(dataDir,    format = 'tiff')
  
  
  

  mean_xvals = [ mean(all_bps[i][nonzero(greater(all_bps[i],bp_lzero))]) for i in range(len(all_bps))]
  pdfs, xvals = zip(*[histogram(x, bins=50, range=[-15,8], normed=False) for x in all_bps])
  import compbio.utils.colors as colors
  c = colors.getct(len(pdfs))
  # Figure 3 was already saved above; it is cleared and reused here.
  f3 = plt.figure(3)
  f3.clear()
                                   
  sax = f3.add_subplot('111')
  seismic.seismic([array(x,float)/ sum(x) for x in pdfs], xax = xvals[0][:-1],stacked = False, colors = c, xmarkpts = mean_xvals, ax = sax)
  

  f4 = plt.figure(4)
  f4.clear()
  ax = f4.add_subplot('121')
  ax.set_title('(log base 10) of Percentage Nonzero for Experiment Classes')
  percs = log10(array([100*float(len(nonzero(greater(x,bp_lzero))[0])) / len(x) for x in all_bps]))
  ax.plot(percs,linewidth = 6)
  ax.set_yticks(percs)

  names = exps.keys() + ['TF Knockout/OE']
  ax.set_yticklabels(['{1}\n{0}'.format('%2.2f' % (10**p), names[idx]) for idx,p in enumerate(percs)])
  
  ax2 = f4.add_subplot('122')
  ax2.set_title('Mean of Nonzero Experiments for Experiment Classes')
  means = array([mean(bp[nonzero(greater(bp,bp_lzero))]) for bp in all_bps])
  # NOTE(review): x positions 1..8 are hard-coded, matching the 8 classes
  # assumed by the legend above.
  ax2.plot(arange(1,9), means,linewidth = 6)
  ax2.boxplot( [bp[nonzero(greater(bp,bp_lzero))] for bp in all_bps], widths = .5)
  ax2.set_yticks(means)

  names = exps.keys() + ['TF Knockout/OE']
  ax2.set_yticklabels(['{1}\n{0}'.format('%2.2f' % (p), names[idx]) for idx,p in enumerate(means)])
Пример #27
0
def expr_gmm_onoff(expr_in,
                   log_expr = False, 
                   fig = 1,
                   draw = False):
    '''Fit a two-state 1-d Gaussian mixture to an expression vector.

    expr_in:   sequence of expression values.
    log_expr:  if true, fit log(expr + std(expr)) instead of the raw values.
    fig:       figure number used when draw is true.
    draw:      if true, scatter every sample once per mixture state (marker
               size derived from the state's value in the mixture table) and
               once more coloured by its best state, or black when the best
               state does not beat the runner-up by the required margin.

    Returns the per-sample mixture table from the fitted model with its
    columns re-ordered by ascending state mean.
    '''
    values = array(expr_in)
    spread = std(values)
    if log_expr:
        values = log(values + spread)

    # Re-pack as an (n, 1) column, one sample per row.
    n = len(values)
    column = zeros((n,1))
    for row, val in enumerate(values):
        column[row] = val
    expr = column

    from scikits.learn import gmm

    #demand separation of max from alternate hypotheses by e/2
    cmin_diff = log(e*1.5)
    k = 2

    model = gmm.GMM(n_states = k, n_dim = 1)
    model.fit(expr)
    probs, clusters = model.decode(expr)
    probs, mixtures = model.eval(expr)

    # Permute the columns in place so they follow ascending state mean.
    mean_as = argsort(model.means,0)
    col_order = squeeze(mean_as)
    mixtures[:,:] = mixtures[:,col_order]

    if not draw:
        return mixtures

    n = len(expr)
    xax = arange(n)[argsort(expr,0)]
    f = plt.figure(fig)
    f.clear()
    ax = f.add_axes([0,0,1,1])

    ct = mycolors.getct(k)

    # Per-sample colour of the decoded state (only consumed by the
    # commented-out scatter at the bottom).
    cs = [ct[mean_as[clusters[i]]] for i in range(n)]
    rs = [100] * n

    # One hollow marker per (sample, state) pair.
    pairs = [(i, j) for i in range(n) for j in range(k)]
    x2s = [i for i, j in pairs]
    y2s = [expr[i] for i, j in pairs]
    c2s = [ct[j] for i, j in pairs]
    r2s = [pow(exp(mixtures[i,j]),2)*100 for i, j in pairs]

    # One filled marker per sample: best state's colour if it is reliable,
    # black otherwise.
    x3s, y3s, c3s, r3s = [], [], [], []
    for i in range(n):
        row = mixtures[i,:]
        ranked = argsort(row)[::-1]
        best, runner_up = row[ranked[0]], row[ranked[1]]
        reliable = log(best) - log(runner_up) > cmin_diff

        x3s.append(i)
        y3s.append(expr[i])
        r3s.append(200)
        c3s.append(ct[ranked[0]] if reliable else [0,0,0])

    #ax.scatter(xax,expr,rs, color = cs)
    ax.scatter(x2s,y2s,r2s,edgecolor=c2s,facecolor = 'none')
    ax.scatter(x3s,y3s,r3s,c3s)
    return mixtures
Пример #28
0
def dsi_boxplot(num = 1 ,  method = 'tree', reset = False,
                plot_kcs = True,
                bp_means = False,
                bp_zeros = True, zero_ofs = 1e-6,
                bp_logs = True,
                show_kos = True,
                log_scale = True,
                filter_rows_and_cols = True,
                boxplot = True):
  '''Summarise and plot the grid returned by parseNet, per experiment class.

  Steps, in order:
    1. Load ``grid`` and ``descriptions`` with
       ``parseNet(num=num, method=method, reset=reset)``.
    2. If ``filter_rows_and_cols``: sort rows by descending row maximum,
       drop rows and columns whose maximum is zero, and rewrite the numbers
       inside every description field whose key contains 'Genes' so that
       they refer to the new row positions.
    3. Assign columns to classes with the predicates from ``sg_choosers()``;
       the 'general' class is discarded.
    4. Compute per-class statistics in the nested ``getBPS`` (called through
       ``mem.getOrSet``) plus a 'ko' pseudo-class made of the cells whose row
       is named in 'DeletedGenes' / 'OverexpressedGenes' of their column.
    5. Write a tab-separated summary under
       ``cfg.dataPath('daniel/txt/net{num}_{method}')``.
    6. Draw on figure 0 and save it under ``cfg.dataPath('daniel/figs/...')``.

  Parameters:
    num, method, reset   -- forwarded to parseNet; ``reset`` is also passed
                            to mem.getOrSet. ``num`` and ``method`` are used
                            in the output file names.
    log_scale            -- use a logarithmic y axis; also selects the
                            'log'/'lin' tag in the figure file name.
    filter_rows_and_cols -- enable step 2.
    boxplot              -- draw a box plot (True) or a bar chart of the
                            per-class medians (False).
    show_kos             -- only read by the 'twoplots' branch, which is
                            unreachable while ``plot_type`` is hard-coded to
                            'dsi_final'.
    plot_kcs, bp_means, bp_zeros, zero_ofs, bp_logs
                         -- accepted for interface compatibility; not read.

  Returns None.

  NOTE: this is Python 2 code. It relies on ``dict.iteritems`` and on
  indexing arrays with the list produced by ``zip(*cells)``.
  '''

  grid, descriptions = parseNet(num= num, method = method, reset = reset)
  grid = array(grid)
  descriptions = dict(descriptions)
  new_descriptions = {}

  if filter_rows_and_cols:
    #Columns (experiments) with a nonzero maximum.
    good_exps = nonzero(np.max(grid,0))[0]

    #Rows (TFs) ordered by descending maximum; tf_new_idxs[new] == old.
    tf_new_idxs = list(argsort(np.max(grid,1))[::-1])
    new_grid = grid[tf_new_idxs]
    good_tfs = nonzero(np.max(new_grid,1))[0]

    #Relabel the descriptions to take the row reordering into account.
    #NOTE(review): the numbers in the gene labels are looked up as 0-based
    #row indices; if the labels are actually 1-based this mapping is off by
    #one -- confirm against parseNet.
    digits = re.compile(r'(\d+)')
    tf_rank = dict((int(old), new) for new, old in enumerate(tf_new_idxs))

    def relabel(match):
      #Numbers that are not a known row index are left untouched.
      return str(tf_rank.get(int(match.group()), match.group()))

    for k, value in descriptions.iteritems():
      if 'Genes' in k:
        relabelled = [digits.sub(relabel, g) for g in value]
      else:
        relabelled = value
      new_descriptions[k] = list(array(relabelled)[good_exps])

    new_grid = new_grid[good_tfs, :]
    new_grid = new_grid[ :,good_exps]

    grid = new_grid
    descriptions = new_descriptions


  #Make lambdas to split experiments into categories
  col_choosers = sg_choosers()

  #One {field: value} record per experiment column; the same records are
  #handed to every chooser, so build them once.
  vs = [ dict(zip(descriptions.keys() , elt))
         for elt in  zip(*descriptions.values()) ]

  #Split experiments
  exps = {}
  for k, v in col_choosers.iteritems():
    exps[k] = nonzero( [v(e) for e in vs ])[0]

  #Remove 'general' as the values wind up being all zeros.
  exps.pop('general')

  #Mark experiments that knock out (or overexpress) TFs
  def ko_hits(field, t):
    #Columns whose description field names gene t. The trailing comma is
    #appended so that the last entry of a comma separated list matches too.
    tag = 'G{0},'.format(t)
    return nonzero([ tag in x+',' for x in descriptions[field] ])[0]

  tf_kn_matches = [ sorted(list(it.chain(ko_hits('DeletedGenes', t),
                                         ko_hits('OverexpressedGenes', t))))
                    for t in range(shape(grid)[0]) ]
  #(row, column) pairs of all perturbed cells.
  knockout_cells = array([ (i, exp)
                           for i, hits in enumerate(tf_kn_matches)
                           for exp in hits ])

  #Cells collected for the 'ko' pseudo-class; filled in by getBPS.
  kn_exps = {}
  kn_exps['ko'] = []

  def getBPS(**kwargs):
    '''Compute per-class statistics; the result is cached by mem.getOrSet.

    Returns (xlabels, nz_frac_std, nz_val_std, nz_frac_mean, nz_val_mean,
    nz_colvals), with one entry per class in ``exps`` followed by one entry
    per key of ``kn_exps``.
    '''
    xlabels = []
    nz_frac_std  = []
    nz_frac_mean = []
    nz_val_std   = []
    nz_val_mean  = []

    nz_colvals = []

    for k, ecols in exps.iteritems():
      these_knockouts = array([c for c in knockout_cells if c[1] in ecols])
      exp_cells = array([(i,j) for j in ecols for i in arange(shape(grid)[0])])

      #Explicit length check: comparing an ndarray with [] is ambiguous.
      if len(these_knockouts) > 0:
        #Cells of this class that coincide with a perturbed (row, column).
        kn_exps['ko'] += [c for c in exp_cells
                          if  np.sum(greater( np.product(c==these_knockouts,1),0),0)]

      #One array of grid values per column of this class.
      cexp = [grid[zip(*exp_cells[\
              nonzero(equal(exp_cells[:,1],col))[0]])] \
                         for col in ecols]

      if not cexp:
        #Class without any column: record zeros and an empty sample.
        for arr in [nz_frac_std, nz_frac_mean,
                    nz_val_std, nz_val_mean]:
          arr.append(0.)
        nz_colvals.append([])
        xlabels.append(k)
        continue

      #Per column: fraction of positive entries and mean of the positive
      #entries (0 when a column has no positive entry).
      colwise_fracs = [mean(1.*greater(col,0)) for col in cexp]
      colwise_exprs = [mean(col[nonzero(greater(col,0))]) for col in cexp]
      colwise_exprs = [c if not isnan(c) else 0 for c in colwise_exprs]

      nz_colvals.append(colwise_exprs)

      #Standard errors across the columns of the class.
      nz_frac_std.append(std(colwise_fracs)/sqrt(len(colwise_fracs)))
      nz_frac_mean.append(mean(colwise_fracs))
      nz_val_std.append(std(colwise_exprs)/sqrt(len(colwise_exprs)))
      nz_val_mean.append(mean(colwise_exprs))

      if isnan(nz_val_mean[-1]): raise Exception()

      xlabels.append(k)

    for k, ecells in kn_exps.iteritems():
      ecells = array(ecells)
      nz_frac_std.append(0)
      nz_val_std.append(0)
      if len(ecells) == 0:
        for arr in [nz_frac_mean, nz_val_mean]:
          arr.append(0.)
        nz_colvals.append([])
      else:
        positive = greater(grid[zip(*ecells)],0)
        positive_vals = grid[zip(*ecells[positive])]
        nz_frac_mean.append(mean(positive))
        nz_val_mean.append(mean(positive_vals))
        nz_colvals.append(positive_vals)
      xlabels.append(k)

    return xlabels, array(nz_frac_std),array(nz_val_std),array(nz_frac_mean), array(nz_val_mean), [array(cv) for cv in nz_colvals]
  xlabels, nz_frac_std,nz_val_std,nz_frac_mean, nz_val_mean, nz_colvals = mem.getOrSet(getBPS,on_fail = 'compute', reset = reset)

  #Fixed display order; classes that are absent are skipped.
  args = [xlabels.index(x) for x in
          ['general_ts', 'drug', 'drug_ts',
           'genetic', 'genetic_ts', 'drug_genetic', 'drug_genetic_ts', 'ko']
          if x in xlabels]
  #All five per-class sequences are reordered together (the original bound
  #the reordered standard deviations to a misspelt name, leaving nz_val_std
  #in its old order).
  xlabels, nz_frac_std,nz_val_std,nz_frac_mean,nz_val_mean =\
      array(xlabels)[args],nz_frac_std[args],nz_val_std[args],nz_frac_mean[args],nz_val_mean[args]
  nz_colvals = [nz_colvals[a] for a in args]

  f = plt.figure(0)
  f.clear()

  #Tab-separated per-class summary. 'with' closes the file even if a write
  #fails.
  with open(cfg.dataPath('daniel/txt/net{0}_{1}'.format(num,method )),'w') as topen:
    topen.write('\t'.join(['exp_class','mean_influence','std_influence','stderr_influence'])+'\n')
    for idx, exp_class in enumerate(xlabels):
      vals = nz_colvals[idx]
      n_vals = len(vals)
      spread = std(vals)
      #Standard error is std / sqrt(n), matching getBPS; nan for an empty
      #class.
      stderr = spread / sqrt(n_vals) if n_vals else float('nan')
      topen.write('{0}\t{1}\t{2}\t{3}\n'.format(exp_class,mean(vals),spread,stderr))

  plot_type = 'dsi_final'
  if plot_type == 'dsi_final':
    margin = .05
    wid0 = .75
    cs = mycolors.getct(len(nz_colvals))

    ax0 = f.add_axes([margin,margin, wid0 , 1. - 2* margin], title =  'Experminent mean significances: blue (red) lines denote quartiles (media).')
    if log_scale: ax0.set_yscale('log')

    #The last class ('ko' when present) is not drawn as a box/bar; in the
    #box plot its mean is shown as a dotted red reference line instead.
    if boxplot:
      ax0.boxplot(nz_colvals[0:-1], widths = [.5] * (len(nz_colvals )-1))
      ax0.hlines([mean(nz_colvals[-1])],-100, 100,color = 'red',linestyle = ':',linewidth = 1)
    else:
      ax0.bar(.2 + arange(len(nz_colvals[0:-1])), [median(c) for c in nz_colvals[0:-1]],
              color = cs[:-1])

    ax0.set_xticklabels(xlabels[:-1])

    f.savefig(cfg.dataPath('daniel/figs/final_bp_net{0}_{1}_{2}.ps'.\
                                format(num, method,
                                       'log' if log_scale else 'lin')),
              dpi = 10)

    return
  elif plot_type == 'twoplots':
    #Unreachable while plot_type is hard-coded above; kept as an
    #alternative two-panel bar chart with error bars.
    nkeys = len(xlabels)
    if show_kos: xi = arange(nkeys)
    else: xi = arange(nkeys -1)

    y1 = nz_val_mean[xi]
    s1 =  nz_val_std[xi]
    y2 = nz_frac_mean[xi]
    s2 =  nz_frac_std[xi]

    a1 = f.add_subplot(211, ylim =[0, max(y1)+max(s1)], title = 'mean value of nonzero influences\n standard error across experiments')
    a2 = f.add_subplot(212, ylim =[0,max(y2)+ max(s2)], title = 'mean values of fraction nonzero influences\n standard error across experiments' )

    colors = mycolors.getct(nkeys)
    wofs = .15
    b1 = a1.bar(xi+wofs,y1,1.-wofs*2, linewidth = 3,color = colors,  ecolor = 'black')
    b2 = a2.bar(xi+wofs,y2,1.-wofs*2, linewidth = 3,color = colors,  ecolor = 'black' )
    p1,c1,b1 = a1.errorbar(xi+.5, y1, yerr = s1,capsize = 15, elinewidth = 4, color = 'black',linewidth = 0, ecolor = 'black')
    p2,c2,b2 = a2.errorbar(xi+.5, y2, yerr = s2,capsize = 15, elinewidth = 4, color = 'black',linewidth =0, ecolor = 'black')
    for c in c1:c.set_alpha(1.)
    for c in c2:c.set_color('black')
    for c in a2.get_children() + a1.get_children():
        #Best effort: not every child artist has a line width.
        try:
          if not c in [p1,p2]: c.set_linewidth(4)
        except Exception: pass
    a2.set_xticklabels([])
    for i in xi:
      a2.text( float(i) + .5,0,xlabels[i] , rotation = '-15',size = '16', ha = 'left',va='top')
    f.savefig(cfg.dataPath('daniel/figs/latest/{1:03d}_{0}_{2}.tiff'.\
                                format('no_kos' if not show_kos else 'kos',
                                       num ,
                                       'log' if log_scale else 'lin')),format = 'tiff')

  return