示例#1
0
def init(**kwargs):
  '''
  Load the 16s tree of life through the mem cache.

  When the cache has to compute the value, the newick file at
  sequences/16s.newick is parsed with Phylo.read, every terminal and
  internal node is given an empty metadata dict as node.m, and
  db_metadata() is run on the tree.
  NOTE(review): db_metadata is defined elsewhere; presumably it fills
  node.m with entries such as taxnode / gbacc / gbid -- confirm.

  inputs:
    name  ['default_tree']   name handed on to mem.rc
    every other keyword is passed to mem.rc unchanged

  output:
    the value returned by mem.getOrSet for setTree; setTree itself
    builds a single biopython tree.

  usage:
    tree = init()
  '''

  print('testing...')

  def setTree(**kwargs):
    tree = Phylo.read(config.dataPath('sequences/16s.newick'), "newick")
    # Leaves and internal nodes alike start with empty metadata.
    for node in it.chain(tree.get_terminals(), tree.get_nonterminals()):
      node.m = {}
    db_metadata(tree)
    print("SETTING TREE!!!")
    return tree

  cache_args = mem.rc(kwargs,
                      name = kwargs.get('name', 'default_tree'),
                      on_fail = 'compute',
                      register = 'init')
  return mem.getOrSet(setTree, **cache_args)
示例#2
0
文件: io.py 项目: bh0085/compbio
def getBDTNP(protein = False,misc = False, **kwargs):
  '''
  Fetch BDTNP expression columns through the mem cache.

  inputs:
    protein [False]  return protein columns keyed by flybase id
    misc    [False]  return the miscellaneous columns as parsed
                     (takes precedence over protein)
    with neither flag set, mRNA columns keyed by flybase id are returned.

  output:
    dict of column dicts.  Every gene and misc column gets a 'vals'
    entry: an array of shape (len(rows), number of distinct time steps)
    that is zero wherever the column has no data for a step.
    Columns whose symbol cannot be found in flybase/gene_map.tsv are
    left out of the protein / mRNA results.
  '''
  def setBDTNP( protein = False, misc = False, **kwargs):
    gene_cols, misc_cols, rows, row_nns = bdtnp.parser.read()

    # Symbol -> flybase id table; the file is closed as soon as it is read.
    map_rows = []
    with open(config.dataPath('flybase/gene_map.tsv')) as mapfile:
      for l in mapfile:
        l = l.replace('\n','')
        if l != '' and l[0] != '#': map_rows.append(l.split('\t'))
    syms = [x[0] for x in map_rows]
    fbids = [x[1] for x in map_rows]

    times = set(it.chain(*[x['steps'] for x in gene_cols.values()]))
    for g in list(gene_cols.values()) + list(misc_cols.values()):
      gene_rows = zeros((len(rows), len(times)))
      for i, t in enumerate(times):
        if t in g['steps']: row = rows[:, g['idxs'][g['steps'].index(t)]]
        else: row = zeros(len(rows))
        gene_rows[:, i] = row
      g['vals'] = gene_rows

    if misc:
      return misc_cols

    # First occurrence of each symbol, matching what list.index would find.
    first_idx = {}
    for i, s in enumerate(syms):
      first_idx.setdefault(s, i)

    #things that are wonky include:
    # 1) Protein data (where column names do not match flybase symbols)
    # 2) Weird elements such as Traf1 that are not present in the network anyway
    # 3) FBgn0031375 / CG31670 which is apparently absent from the map and
    #    is looked up under 'erm' instead.
    def mrna_index(k):
      if k in first_idx: return first_idx[k]
      if k == 'CG31670': return syms.index('erm')
      return -1

    def protein_index(k):
      # Protein column names carry one trailing character beyond the symbol.
      return first_idx.get(k[:-1], -1)

    if protein:
      wanted, index_of = 'protein', protein_index
    else:
      wanted, index_of = 'mRNA', mrna_index

    cols = dict((k, val) for k, val in gene_cols.items()
                if val['info']['type'] == wanted)
    out = {}
    for k, val in cols.items():
      idx = index_of(k)
      if idx != -1:
        out[fbids[idx]] = val
    return out

  return mem.getOrSet(setBDTNP,
                      **mem.rc(kwargs,
                               register ='protein' if protein else \
                                 'misc' if misc else 'mrna',
                               protein = protein,
                               misc = misc,
                               on_fail = 'compute'))
示例#3
0
文件: io.py 项目: bh0085/compbio
def getBNet(**kwargs):
  '''Get the saved network from the knowledge based network, redFly.

  Edges are read from network/network_predmodel/inputnetworks/bRN.txt as
  "tf target" pairs; the file carries no weights, so every edge weighs 1.0.
  Only edges whose tf is a key of getTC() and whose target is a key of
  getCL() are kept.

  output: tuple (trg_d, tf_d) of dicts keyed by gene / tf names

          trg_d: {gname:  {'color':   array([count, 0, 0]),
                           'tfs':     ['tfname', ...],
                           'weights': [1.0, ...]}, ...}
          tf_d:  {tfname: {'targets': ['gname', ...],
                           'weights': [1.0, ...]}, ...}'''
  def setBNet(**kwargs):
    fpath = config.dataPath('network/network_predmodel/inputnetworks/bRN.txt')
    TC = getTC( reset = mod(kwargs.get('reset',0),2))
    CL = getCL( reset = mod(kwargs.get('reset',0),2))
    with open(fpath) as nwfile:
      nwdata = nwfile.read()

    trgfun = lambda x: x[1]

    edge_re = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)', re.M)
    matches = list(edge_re.finditer(nwdata))
    #Unsorted lists of tfs and targets
    targets = [m.group('target') for m in matches]
    tfs = [m.group('tf') for m in matches]
    weights = [1.0] * len(tfs)

    #Concat the data, ordered by tf, keeping only known tfs and targets.
    #np.argsort is kept so the edge order matches earlier cached results.
    cat = []
    for i in np.argsort(tfs):
      if tfs[i] in TC and targets[i] in CL:
        cat.append([tfs[i], targets[i], weights[i]])

    #Extract a dictionary with information for each target.
    trg_d = {}
    count = 0.0
    for k, g in it.groupby(sorted(cat, key = trgfun), key = trgfun):
      l = list(g)
      count += 1.0
      trg_d[k] = {'color': np.array([count, 0, 0]),
                  'tfs': [e[0] for e in l],
                  'weights': [float(e[2]) for e in l]}

    #Extract a dictionary with information for each TF
    #(cat is already ordered by tf, so groupby needs no extra sort).
    tf_d = {}
    for k, g in it.groupby(cat, key = lambda x: x[0]):
      l = list(g)
      tf_d[k] = {'targets': [e[1] for e in l],
                 'weights': [float(e[2]) for e in l]}

    return (trg_d, tf_d)
  return mem.getOrSet(setBNet, **mem.rc({},on_fail = 'compute',**kwargs))
示例#4
0
文件: reinitz.py 项目: bh0085/compbio
def datafiles(**kwargs):
    '''
    Load the numeric table behind every entry of id_map(), via the mem cache.

    For each key of id_map() the file named by that entry's 'file' field is
    read; only lines whose first character is a digit are kept, and each is
    split on runs of whitespace and converted to floats.

    output: dict mapping each id_map key to the array built from those rows.
    '''
    def set_datafiles(**kwargs):
        splitter = re.compile(r'\s+')  # compiled once, not once per line
        out = {}
        idmap = id_map(**mem.sr(kwargs))
        for k, v in idmap.items():
            # Close each data file as soon as its rows are parsed.
            with open(v['file']) as fh:
                rows = [[float(e) for e in splitter.split(l.strip())]
                        for l in fh if l[0] in '0123456789']
            out[k] = array(rows)
        return out
    return mem.getOrSet(set_datafiles, **mem.rc(kwargs,
                                                on_fail = 'compute'))
示例#5
0
文件: ali.py 项目: bh0085/compbio
def get_seqs(dbname, **kwargs):
  '''
  Return every Sequence row of the cbdb database called dbname, going
  through the mem cache (hardcopy disabled, cache name = dbname).
  '''
  def set_seqs(**kwargs):
    # The database name is read from the cache kwargs, not the closure.
    db = compbio.projects.cbdb.getName(kwargs['dbname'])
    return db.S.q(db.Sequence).all()

  cache_args = mem.rc(kwargs,
                      hardcopy = False,
                      name = dbname,
                      on_fail = 'compute',
                      dbname = dbname)
  return mem.getOrSet(set_seqs, **cache_args)
示例#6
0
文件: btol.py 项目: bh0085/compbio
def getBTOL(**kwargs):
  '''
  Return a BTOL instance through the mem cache (register 'BTOL').

  When the value is computed and the new instance reports that its tree
  is not initialized, the tree is initialized and saved before the
  instance is returned.
  '''
  def setBTOL(**kwargs):
    btol = BTOL(**mem.sr(kwargs))
    if btol.treeInitialized():
      return btol
    print('Underlying tree structure apparently uninitialized: initializing\n...')
    btol.initTree()
    print('...\nDone\nSaving\n...')
    btol.saveTree()
    print('...\nDone')
    return btol

  cache_args = mem.rc(kwargs, register = 'BTOL')
  return mem.getOrSet(setBTOL, **cache_args)
示例#7
0
文件: exp.py 项目: bh0085/compbio
def recall_c2(**kwargs):
  '''
  Kludgy caching wrapper around c2, so clustering results can be stored
  without touching c2 itself.

  c2 is called twice: once with the keywords alone, and again with the
  first call's result passed as its first positional argument (c2's
  `launcher` parameter).  The second result is what gets cached under
  the name 'default_c2_settings'.
  '''
  def setC2(**kwargs):
    launcher = c2(**mem.sr(kwargs))
    return c2(launcher, **mem.sr(kwargs))

  cache_args = mem.rc(kwargs,
                      name = 'default_c2_settings',
                      on_fail = 'compute')
  return mem.getOrSet(setC2, **cache_args)
示例#8
0
文件: btol.py 项目: bh0085/compbio
 def leafNodes(self,**kwargs):
   '''
   Return one entry per terminal of self.t, through the mem cache
   (register 'leaf_nodes', hardcopy disabled).

   Each entry is ncbi.get_node(taxid, taxdmp db) for leaves whose
   metadata dict has a 'taxid' key, and None for the others.
   '''
   def setLeafNodes(**kwargs):
     leaves = self.t.get_terminals()
     taxdb = cbdb.getName('taxdmp')
     found = []
     for leaf in leaves:
       if 'taxid' in leaf.m:
         found.append(ncbi.get_node(leaf.m['taxid'], taxdb))
       else:
         found.append(None)
     return found

   cache_args = mem.rc(kwargs,
                       hardcopy = False,
                       on_fail = 'compute',
                       register = 'leaf_nodes')
   return mem.getOrSet(setLeafNodes, **cache_args)
示例#9
0
文件: ali.py 项目: bh0085/compbio
def get_taxnodes(dbname, **kwargs):
  '''
  For every sequence returned by get_seqs(dbname), return the ncbi node
  of its source_taxon, or None when the sequence has no (truthy)
  source_taxon.  Cached through mem under register = dbname.
  '''
  def set_taxnodes(**kwargs):
    seqs = get_seqs(dbname, **mem.sr(kwargs))

    # Pass 1: collect taxa, collapsing every falsy value to None.
    taxa = []
    for seq in seqs:
      taxa.append(seq.source_taxon or None)

    # Pass 2: resolve the taxa that are present.
    nodes = []
    for taxon in taxa:
      if taxon is None:
        nodes.append(None)
      else:
        nodes.append(ncbi.get_node(taxon))
    return nodes

  cache_args = mem.rc(kwargs,
                      on_fail = 'compute',
                      hardcopy = False,
                      register = dbname)
  return mem.getOrSet(set_taxnodes, **cache_args)
示例#10
0
文件: ali.py 项目: bh0085/compbio
def get_taxon_forsome(nodes,rank,set_name = 'default_setname',**kwargs):
  '''
  Map each node in nodes to ncbi.get_taxon(node, rank = rank), keeping
  None for falsy nodes.  Cached through mem under the register
  set_name + rank, with hardcopy disabled.
  '''
  def set_taxon_forsome(nodes = None, rank = None,**kwargs):
    assert nodes != None and rank != None
    taxa = []
    for node in nodes:
      taxa.append(ncbi.get_taxon(node, rank = rank) if node else None)
    return taxa

  cache_args = mem.rc(kwargs,
                      nodes = nodes,
                      rank = rank,
                      on_fail = 'compute',
                      hardcopy = False,
                      register= set_name + rank)
  return mem.getOrSet(set_taxon_forsome, **cache_args)
示例#11
0
文件: ncbi.py 项目: bh0085/compbio
def taxon_with_name( rank, name, **kwargs ):
  '''
  Return the single node of the given rank whose sciname() equals name.
  Cached through mem under the register rank + '_' + name, with
  hardcopy disabled.

  The assertion fails unless exactly one node matches; note that this
  covers zero matches as well as several.
  '''
  def set_taxon_with_name(name = None, rank = None, **kwargs):
    assert name != None and rank != None
    matches = [node for node in get_rank(rank) if sciname(node) == name]
    assert len(matches) == 1, 'Ambiguous phylum match?'
    return matches[0]

  cache_args = dict(rank = rank,
                    name = name,
                    register = rank+'_'+name,
                    hardcopy = False,
                    on_fail = 'compute',
                    **kwargs)
  return mem.getOrSet(set_taxon_with_name, **cache_args)
示例#12
0
文件: btol.py 项目: bh0085/compbio
 def getTaxon(self,rank = rank,
              **kwargs):
   '''
   Return, for every entry of self.leafNodes(), its taxon at the given
   rank via ncbi.get_taxon, or None where the leaf node is falsy.
   Cached through mem under register = rank, with hardcopy disabled.

   NOTE(review): the cache keywords are built with mem.sr here, whereas
   the sibling wrappers use mem.rc for this purpose -- confirm that
   this is intended.
   '''
   def setTaxon(BTInstance = None, rank = None, **kwargs):
     assert rank
     assert BTInstance
     leaves = BTInstance.leafNodes(**mem.sr(kwargs))
     taxa = []
     for node in leaves:
       taxa.append(ncbi.get_taxon(node, rank=rank) if node else None)
     return taxa

   cache_args = mem.sr(kwargs,
                       rank = rank,
                       BTInstance = self,
                       on_fail = 'compute',
                       hardcopy = False,
                       register = rank)
   return mem.getOrSet(setTaxon, **cache_args)
示例#13
0
文件: ali.py 项目: bh0085/compbio
def get_taxon_forall(aliname,
                     rank = None,
                     **kwargs):
  '''
  Return, for every node of get_taxnodes(aliname), its taxon at the
  given rank via ncbi.get_taxon, or None where the node is falsy.
  Cached through mem under the register aliname + rank, with hardcopy
  disabled.

  rank defaults to None in the signature but must be supplied: it is
  concatenated with aliname to form the register, and the computation
  asserts that it is not None.
  '''
  def setTaxon(aliname = None, rank = None,**kwargs):
    assert aliname != None and rank != None
    taxa = []
    for node in get_taxnodes(aliname, **mem.sr(kwargs)):
      taxa.append(ncbi.get_taxon(node, rank=rank) if node else None)
    return taxa

  cache_args = mem.rc(kwargs,
                      aliname = aliname,
                      rank = rank,
                      on_fail = 'compute',
                      hardcopy = False,
                      register = aliname + rank)
  return mem.getOrSet(setTaxon, **cache_args)
示例#14
0
文件: reinitz.py 项目: bh0085/compbio
def id_map(**kwargs):
    '''
    Map identifiers from flybase/gene_association.fb to the per-gene
    reinitz data files, through the mem cache.

    Every file in the byGenes directory names a gene (file stem,
    lowercased).  Stems and association symbols are compared after
    dropping every character outside a-z.  For each gene the ids are
    gathered from the second tab-separated field of every association
    line whose tenth field matches; when several distinct ids are found
    the alphabetically first one is used.

    output:
      dict  id -> {'file': path of the gene's data file,
                   'name': lowercased file stem}

    raises:
      IndexError when a gene file has no matching association line
      (or an association line has fewer than ten fields).
    '''
    def set_id_map(**kwargs):
        gene_dir = cfg.dataPath('reinitz/28-7-2011-1-56-6-30-0/txt/byGenes')
        assoc_path = cfg.dataPath('flybase/gene_association.fb')
        non_alpha = re.compile('[^a-z]')  # compiled once for all lines

        # One directory listing keeps stems and paths aligned.
        gfiles = {}
        gname_map = {}
        for f in os.listdir(gene_dir):
            stem = os.path.splitext(f)[0].lower()
            gfiles[stem] = os.path.join(gene_dir, f)
            gname_map[non_alpha.sub('', stem)] = stem

        # Distinct ids seen per normalized gene name.
        ids_by_gene = dict((k, set()) for k in gname_map)
        with open(assoc_path) as gassoc:
            for line in gassoc:
                if line[0] == '!': continue   # header / comment line
                fields = line.split('\t')
                key = non_alpha.sub('', fields[9].lower().strip())
                # Direct dict lookup instead of scanning every gene.
                if key in ids_by_gene:
                    ids_by_gene[key].add(fields[1].strip())

        ids = {}
        for k, found in ids_by_gene.items():
            #just hacking here... for sloppy paired I use the first id...
            ids[k] = sorted(found)[0]

        return dict((idval, {'file': gfiles[gname_map[k]],
                             'name': gname_map[k]})
                    for k, idval in ids.items())
    return mem.getOrSet(set_id_map,**mem.rc(kwargs,on_fail = 'compute'))
示例#15
0
def parseNet(num =  1,method = 'tree', reset = False):
  '''
  Get one of daniel's nets. Allowable numbers are 1-3 and allowable
  types are 'tree', 'svm'

  output: tuple (grid, description)
    grid         array of shape (max tf index + 1, number of experiments)
                 holding the weight of every (tf, experiment) entry of the
                 informativeness file, zero elsewhere.  Experiment indices
                 in the file are 1-based and are shifted to 0-based.
    description  dict mapping each header column of the chip features
                 file (plus 'Exp_Index') to the list of that column's
                 values, one per experiment row.
  '''

  def setNet(**kwargs):
    method = kwargs.get('method', 'tree')
    num = kwargs.get('num', 1)

    description_path = cfg.dataPath('::daniel/net%s_chip_features.tsv') % num
    data_path = cfg.dataPath('::daniel/informativeness/%s%s.txt') %(method,num)
    split_re = re.compile(r'\s')

    with open(description_path) as desc_open:
      description_cols = split_re.split(desc_open.readline().strip()) + ['Exp_Index']
      description_vals = [split_re.split(l.strip()) for l in desc_open.readlines()]
    # The running row number becomes the 'Exp_Index' column.
    for idx, d in enumerate(description_vals): d.append(idx)

    with open(data_path) as data_open:
      triples = [array(split_re.split(l.strip()), float)
                 for l in data_open.readlines()]
    weight, tf, exps = zip(*triples)

    # The file is parsed as floats; array shapes and indices must be
    # integers (recent numpy rejects floats in both places).
    tf_idx = [int(t) for t in tf]
    exp_idx = [int(e - 1) for e in exps]

    description = {}
    for i, col in enumerate(description_cols):
      description[col] = [d[i] for d in description_vals]

    ntf = int(np.max(tf) + 1)
    nexp = len(description_vals)   # every description column has this length

    grid = zeros((ntf,nexp))
    for w, t, e in zip(weight, tf_idx, exp_idx): grid[t, e] = float(w)

    return grid, description
  return mem.getOrSet(setNet,
    reset = reset,
    register = method,
    name = '%s%s' %(method,num),
    method = method,
    num = num)
示例#16
0
文件: io.py 项目: bh0085/compbio
def getCL(**kwargs):
  '''Cell line data

  Reads network/CL.geneexp: on every non-blank line the first
  whitespace-separated token is the gene name and the remaining tokens
  are parsed as floats.  A name that occurs twice keeps its last line.

  output: dict keyed by gene names, each value an array of floats'''
  def setCL(**kwargs):
    token_re = re.compile(r'([^\s]+)')  # compiled once for all lines
    with open(config.dataPath('network/CL.geneexp')) as fh:
      text = fh.read()

    seqdict = {}
    for line in text.split('\n'):
      tokens = token_re.findall(line)
      if not tokens: continue
      seqdict[tokens[0]] = array([float(t) for t in tokens[1:]])
    return seqdict
  return mem.getOrSet(setCL, **kwargs)
示例#17
0
文件: io.py 项目: bh0085/compbio
def getSush(**kwargs):
  '''Get sushmita's regression weights and biases

  Files in network/network_predmodel/regressionwts/fRN whose name
  contains 'bias' supply "gene level" lines; files whose name contains
  'nw' supply "gene tf level" lines.  Blank lines are skipped.

  output: dict keyed by gene name:
            {'bias': level, 'tfs': [tfname, ...], 'weights': [level, ...]}
          'tfs' and 'weights' are present only for genes that appear in
          a weight file.  Levels are kept as the strings read from disk.

  raises: KeyError when a weight line names a gene without a bias line;
          AttributeError when a non-blank line does not match the format.'''
  def setSush(**kwargs):
    path = config.dataPath('network/network_predmodel/regressionwts/fRN')
    listing = os.listdir(path)
    bias_files = [ os.path.join( path, f) for f in listing if 'bias' in f ]
    nw_files = [ os.path.join( path, f) for f in listing if 'nw' in f ]

    bias_re = re.compile(r'(?P<gname>\S+)\s+(?P<level>\S+)')
    weight_re = re.compile(r'(?P<gname>\S+)\s+(?P<tfname>\S+)\s+(?P<level>\S+)')
    genes = {}
    for b in bias_files:
      with open(b) as fh:
        for l in fh:
          if not l.strip(): continue   # tolerate blank / trailing lines
          match = bias_re.search(l)
          genes[match.group('gname')] = dict(bias = match.group('level'))
    for n in nw_files:
      with open(n) as fh:
        for l in fh:
          if not l.strip(): continue
          match = weight_re.search(l)
          g = genes[match.group('gname')]
          g.setdefault('tfs', []).append(match.group('tfname'))
          g.setdefault('weights', []).append(match.group('level'))
    return genes
  return mem.getOrSet(setSush, **mem.rc(kwargs,
                                        hardcopy = True))
示例#18
0
def show_conservation(fidx = 0, reset = False):
    '''
    Plot structure conservation for one Rfam family and save the figures.

    inputs:
      fidx   [0]      index into the module-level list flist, which
                      supplies the Rfam family number
      reset  [False]  forwarded to the mem cache calls

    Relies on module-level names that are not visible in this function:
    flist, figsize, figfile, do_make_subopts, setFamData, setTree.

    output: None; figures are written through figfile.format(...).
    '''
    fnum = flist[fidx]
    # Rfam ids are 'RF' followed by the family number zero-padded to 5 digits.
    rfid = 'RF{0:05}'.format(fnum)
    print rfid
    # Family 50 is handled as a riboswitch; every other family as 'all'.
    if fnum ==50: ftype = 'riboswitch'
    else: ftype = 'all'
    
    
    # NOTE(review): `out` is never read below; the call appears to be kept
    # only so the family data is computed / cached -- confirm.
    out = mem.getOrSet(setFamData,
                              **mem.rc({}, reset =reset,
                                       on_fail = 'compute',
                                       hardcopy = False,
                                       register = 'fdat'+rfid,
                                       ftype = ftype,
                                       rfid = rfid))

    
    mvals, tvals, structs = mem.getOrSet(setTree,
                                         **mem.rc({},reset = reset,
                                                  on_fail = 'compute',
                                                  hardcopy = True,
                                                  register = 'st'+rfid,
                                                  rfid = rfid,
                                                  ftype = ftype))
    
    # idxs selects structures of interest and tidx indexes into
    # mvals / tvals (see how they are used below).
    idxs, tidx  = sutils.show_paired_v_energy(rfid,rfid,mvals,tvals,structs,ftype)
    
    all_pairs = structs['structs']
    # all_energies and the four *ints lists below are not used again here.
    all_energies = structs['energies']
    
    pints,eints, mints, tints = [structs['structs'][i] for i in idxs],\
        [ structs['energies'][i] for i in idxs],\
        [ mvals[tidx][i] for i in idxs],\
        [ tvals[tidx][i] for i in idxs]
    seq = structs['seq']
    
    # Optional grid plot of 400 suboptimal structures, gated by a module flag.
    if do_make_subopts:
        subopts = rutils.suboptimals(seq, n = 400)
        verts = rutils.struct_verts(subopts, seq, rfid)
        f = myplots.fignum(4,figsize)
        rplots.grid_rnas(verts, dims = [40])
        f.savefig(figfile.format('{0}_grid_rnas'.\
                                     format(rfid)))

    
                

    # Project all structures with a 3-component PCA built from their
    # affinity matrix; only the first two components are plotted.
    aff = rutils.struct_affinity_matrix(all_pairs, len(seq))
    pca = rutils.project_structs(all_pairs,
                          ptype ='pca',
                          affinities = aff,
                          n_comp = 3) 

    # Only the 'n_comp' metric is currently enabled.
    for metric in ['n_comp']:# ['frac_silent','frac_paired','n_comp']:
      scolors = []
      for i in range(len(tvals[tidx])):
          m_silent, pidxs, frac_good = sutils.metric(
              mvals[tidx][i],tvals[tidx][i],
              mtype = metric)
          
          scolors.append(mean(m_silent))
      # Rescale the per-structure means to [0, 1] and use them as the red channel.
      scolors = myplots.rescale(scolors, [0.,1.])[:,newaxis] * array([1.,0.,0.])
      
      
      # Figure number 4 is reused for every plot in this function.
      f = myplots.fignum(4,figsize)
      ax = f.add_subplot(111)
      xvals, yvals = pca[:,:2].T
      myplots.padded_limits(ax, xvals, yvals)
      
      ax.scatter(xvals,yvals,300,linewidth = 1,
                 edgecolor = 'black', color = scolors)

      # Highlight the selected structures: a black disc, a slightly smaller
      # white disc on top of it (leaving a ring), then the metric colour.
      ax.scatter(pca[idxs,0],pca[idxs,1], 2100 ,alpha = 1, 
                 color = 'black')
      ax.scatter(pca[idxs,0],pca[idxs,1], 2000 ,alpha = 1, 
                 color = 'white')
      ax.scatter(pca[idxs,0],pca[idxs,1], 400 ,alpha = 1, 
                 color = scolors[idxs],
                 )


      ax.annotate('''Conservation metric: {0}
Projected onto C=2 Principal Components'''.format(metric),
                  [0,1],xycoords = 'axes fraction', va = 'top',
                  xytext = [10,-10],textcoords='offset points')
      
      f.savefig(figfile.format('{0}_pca_{1}'.\
                                 format(rfid, metric)))
示例#19
0
def modules(reset=False):
    """Return the value of setModules through the mem cache.

    The result is cached with hardcopy enabled and computed on failure;
    reset is passed on to mem.rc unchanged.
    """
    cache_args = mem.rc({},
                        reset=reset,
                        hardcopy=True,
                        on_fail="compute")
    return mem.getOrSet(setModules, **cache_args)
示例#20
0
文件: io.py 项目: bh0085/compbio
def getNet(**kwargs):
  '''Get the saved network from patrick's files.

  kwargs:
    net_name ['unsup']  'unsup'    -> network/patrick/unsup_patrick.txt
                        'logistic' -> network/patrick/logistic_0.6.txt
                        anything else raises Exception.

  Edges are read as "tf target weight" lines.  Only edges whose tf is a
  key of getTC() and whose target is a key of getCL() are kept.

  output: tuple (trg_d, tf_d) of dicts keyed by gene / tf names

          trg_d: {gname:  {'color':   array([count, 0, 0]),
                           'tfs':     ['tfname', ...],
                           'weights': [float, ...]}, ...}
          tf_d:  {tfname: {'targets': ['gname', ...],
                           'weights': [float, ...]}, ...}'''
  def setNet(**kwargs):
    net_name = kwargs.get('net_name', 'unsup')
    if net_name == 'unsup':
      netfile = 'unsup_patrick.txt'
    elif net_name == 'logistic':
      netfile = 'logistic_0.6.txt'
    else:
      raise Exception('Unknown net_name: {0!r}'.format(net_name))

    fpath = config.dataPath('network/patrick/{0}'.format(netfile))
    TC = getTC( reset = mod(kwargs.get('reset',0),2))
    CL = getCL( reset = mod(kwargs.get('reset',0),2))
    with open(fpath) as nwfile:
      nwdata = nwfile.read()

    trgfun = lambda x: x[1]

    edge_re = re.compile(
      r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)\s+(?P<weight>\S+)', re.M)
    matches = list(edge_re.finditer(nwdata))
    #Unsorted lists of tfs, targets and (still textual) weights
    targets = [m.group('target') for m in matches]
    tfs = [m.group('tf') for m in matches]
    weights = [m.group('weight') for m in matches]

    #Concat the data, ordered by tf, keeping only known tfs and targets.
    #np.argsort is kept so the edge order matches earlier cached results.
    cat = []
    for i in np.argsort(tfs):
      if tfs[i] in TC and targets[i] in CL:
        cat.append([tfs[i], targets[i], weights[i]])

    #Extract a dictionary with information for each target.
    trg_d = {}
    count = 0.0
    for k, g in it.groupby(sorted(cat, key = trgfun), key = trgfun):
      l = list(g)
      count += 1.0
      trg_d[k] = {'color': np.array([count, 0, 0]),
                  'tfs': [e[0] for e in l],
                  'weights': [float(e[2]) for e in l]}

    #Extract a dictionary with information for each TF
    #(cat is already ordered by tf, so groupby needs no extra sort).
    tf_d = {}
    for k, g in it.groupby(cat, key = lambda x: x[0]):
      l = list(g)
      tf_d[k] = {'targets': [e[1] for e in l],
                 'weights': [float(e[2]) for e in l]}

    return (trg_d, tf_d)
  return mem.getOrSet(setNet,  **mem.rc(kwargs,
                                        hardcopy = True,
                                        on_fail = 'compute',
                                        register = kwargs.get('net_name',
                                                              'unsup')))
示例#21
0
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
        draw_distances = draw_all_easy,
        draw_clusters = draw_all_easy,
        draw_single_cluster = draw_all_hard):
    '''
Cluster the sequences of an Rfam family and return them grouped by cluster.

1) Fetch the family alignment with rfam.get_fam(rfid).
2) Obtain a pairwise distance matrix through the mem cache (setDistances).
3) Cut the distances into clusters with maxclust_dists(k = 5,
   method = 'complete').
4) Return, for every cluster, the list of its alignment entries.

   kwds:
     rfid                 ['RF00167']  Rfam family id.
     reset                [True]       Passed to the distance cache call.
     tree                 [True]       NOTE(review): this value is
                                       overwritten by the tree returned
                                       from rfam.get_fam before it is
                                       ever read, so it has no effect.
     draw_distances                    Plot tree vs. levenshtein
                                       distances and save the figure.
     draw_clusters                     Plot the clusters in the first
                                       two PCs and save the figure.
     draw_single_cluster               Not read anywhere in this function.

   output:
     list of lists; seqs_final[c] holds ali[idx] for every index idx
     assigned to cluster c.
'''
    # Local alias; not used again in this function.
    rutils = utils

    # NOTE: rebinding `tree` here discards the keyword argument of that name.
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        # Compare tree-derived and levenshtein distances over all pairs.
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        # lin[2] is the correlation coefficient r of the regression.
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')

        ax.scatter(dtf, dlf, 100)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)
        
    # Distances used for the clustering itself, cached per family id.
    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid,
                         register = rfid, 
                         on_fail = 'compute',
                         reset = reset)
    
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # presumably shifts 1-based cluster labels to 0-based -- TODO confirm
    clusters -= 1

    if draw_clusters:

        # One colour per distinct cluster label.
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists) 
        
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')

        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)        

    #now take the largest cluster and do the analysis.
    
    # cluster label -> list of (sequence index, label) pairs; the input is
    # sorted by label first because groupby only merges adjacent items.
    cgrps = dict([ (k, list(g)) 
              for k , g  in it.groupby(\
                sorted( list(enumerate(clusters)),key = lambda x: x[1]),
                key = lambda x: x[1])])
    # cbig, cluster_seqs, csize and seqs only feed the disabled block below.
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ] 
    csize = len(cluster_seqs)
    seqs =[ali[c] for c in cluster_seqs]

    
    
    # Disabled diagnostic plot for the largest cluster (never executed).
    if 0:

        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists) 
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))] 
        
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        # NOTE(review): the format string has no {1} field, so the second
        # argument (n - csize) is never shown.
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n  - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')


        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2),  color = colors, alpha = .2)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)        
        
    # Sequence indices per cluster, then the alignment entries themselves.
    clusters_final  = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
示例#22
0
文件: colors.py 项目: bh0085/compbio
def blackbody(reset = False, **kwargs):
  '''Generate a colormap according to a logarithm of the blackbody spectrum. Colormap is transformed by an arctangent to place whites at .5 in a band with width determined by the [contrast] and then by taking the max with a gaussian peaked at 0,1 with width determined by [width]

The spectrum is read from blackbody.tab, located next to this source
file; only lines containing '2deg' are used.

kwargs: 
 reset
 contrast   [.64]   adjusts arctangent slope near zero.
 width      [.01]   adjust gaussian thresholding near endpoints.
 flip       [False] Hot areas are blue if flip is false.
 flip_ends  [False] Gaussian threshold swaps high and low colors.

output: a matplotlib LinearSegmentedColormap named 'bb' with 512 nodes.

For a usage example, see compbio/fun/ocean.py'''
  def setBB( **kwargs  ): 
    dstr = '2deg'
    import inspect
    import os
    import re

    contrast = kwargs.get('contrast', .64)
    width = kwargs.get('width', .01)
    flip_ends = kwargs.get('flip_ends', False)
    flip = kwargs.get('flip',False)

    # Directory of the file that defines this function.
    thisdir= os.path.dirname(inspect.stack()[0][1])
    splitter = re.compile(r'\s+')
    # Read the table and release the handle before any numeric work.
    with open(os.path.join(thisdir,'blackbody.tab')) as tab:
      bbcols = [splitter.split(l) for l in tab if dstr in l]

    # Fields 7..9 of each selected row are parsed as the r, g, b floats.
    rgb = array([array(b[7:10],float) for b in bbcols])
    npts = 512
    ntot = len(rgb)

    # Arctangent ramp in (-1, 1); contrast sets its slope around zero.
    xvals =  arctan(linspace(-contrast,contrast,npts))/pi*2 
    # Signed gaussian bumps that peak at the two ends of the range.
    scaling = exp( - ( 1-abs(linspace(-1,1,npts))) **2 / width)
    scaling *= array([-1 if x <0 else 1 for x in linspace(-1,1,npts)])
    if flip_ends:
      scaling *= -1

    # At every point keep whichever curve has the larger magnitude.
    d2 = [xvals[:], 
         scaling[:]]
    inds = argmax(np.abs(d2),0)
    xvals = array([d2[inds[i]][i] for i in range(len(inds))])

    # Map from (-1, 1) onto (0, 1).
    xvals = (xvals * .5) + .5
    # f0: fractional position of the table row whose r, g, b values have the
    # smallest variance; k then shapes the log-spaced abscissa below.
    f0 = float(argmin(var(rgb,1))) / ntot
    k = -2.0 * (2. * f0  - 1) / (f0**2)  /2
    lspace = log(linspace(1, 1 + k,ntot)) / log(1 + k)

    # Interpolate each colour channel at the warped sample positions.
    vals =array([ interp(xvals,
                         lspace,
                         [e[i] for e in rgb])
                  for i in range(3)])

    if flip: vals = vals[:,::-1]
    xs=linspace(0,1,npts)
    cdict = dict(
        red = [  (xs[i],  vals[0,i], vals[0,i]) for i in range(npts)],
        green = [ ( xs[i], vals[1,i], vals[1,i]) for i in range(npts)],
        blue = [  (xs[i], vals[2,i], vals[2,i]) for i in range(npts)])

    cmap = matplotlib.colors.LinearSegmentedColormap('bb',cdict) 
    return cmap
  out = mem.getOrSet(setBB, reset = reset, **kwargs)
  return out
示例#23
0
def eval_seq_group(gap_seqs, rfid, run_id, inp_run_id, reset = True,
                   draw_alis = draw_all_easy,
                   clade_alignment_method = clade_alignment_method,
                   max_structs = 5):
    '''
    Evaluate structure conservation for one group of gapped sequences.

    The structures stored under inp_run_id are loaded, the max_structs
    entries ranking first when energies are sorted in descending order
    are kept, and for each of them an alignment, a tree and an
    ancestor-reconstructed tree are built before mutations are counted
    with tree_conservation.count_struct.

    inputs:
      gap_seqs    gapped sequences of the group
      rfid        Rfam family id; names the cached profiles / alignments
      run_id      prefix of the per-structure run ids
      inp_run_id  run id handed to butils.load_data(..., 'output')
      reset       [True] forwarded to the cache calls
      draw_alis   draw the cm / muscle congruency plots when truthy
      clade_alignment_method  only 'cm' is implemented
      max_structs [5] number of structures to evaluate

    Also reads the module-level settings clade_tree_method and
    clade_ancestor_method.

    output:
      dict with 'seqs' (gap_seqs) and 'structs', a list holding one dict
      per structure with keys ref, pairs, ali, muts, times, gaps and
      irresolvables.

    raises:
      Exception when clade_alignment_method is not 'cm'.
    '''
    rutils = utils
    data = butils.load_data(inp_run_id, 'output')
    structs = data['structs']
    energies = data['energies']
    # Descending energy order; keep the first max_structs entries.
    s_inds = argsort(energies)[::-1][:max_structs]
    structs = [structs[i] for i in s_inds]

    refseq = data['seq']

    nq = len(gap_seqs)
    names = ['N{1:04}'.format(rfid, idx) for idx in range(nq)]
    seqs = [rutils.ungapped_seq(gap_seqs[i], names[i]) for i in range(nq)]

    profiles = mem.getOrSet(setProfiles, 
                            **mem.rc({},
                                     seq = refseq, structs = structs, run_id = rfid,
                                     reset = reset,
                                     on_fail = 'compute', 
                                     register = 'tuprof_{0}'.format(rfid)))

    if draw_alis: 
        draw_cm_muscle_congruencies(seqs, profiles, 
                                    run_id, reset = reset)

    if clade_alignment_method == 'cm':
        alis, refs, all_pairs  =\
            mem.getOrSet(setAlignments, 
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles, 
                                  run_id = rfid, ali_type = 'struct',
                                  reset = reset,
                                  on_fail = 'compute', 
                                  register = 'tuali_struct_{0}'.format(rfid)))
    else:
        raise Exception('No methods besides cm are yet implemented')

    seq_group_data = {}
    seq_group_data['seqs'] = gap_seqs
    seq_group_data['structs'] = []
    for i, struct in enumerate(structs):
        ali = alis[i]
        ref = refs[i]
        pairs = all_pairs[i]

        #NOTE THAT DUE TO AN AWKWARD SYNTAX DECISION,
        #I AM ALLOWING FOR THE POSSIBILITY THAT EACH
        #ALI ELT HAS DIFFERENT PAIRS.
        #
        #ALL OF MY ROUTINES SO FAR ONLY USE A SINGLE 
        #PAIR SET AND SO I USE PAIRS[0] EXCLUSIVELY
        struct_data = dict(ref = ref[0],
                           pairs = pairs[0],
                           ali = ali)

        rid = '{0}_{1}'.format(run_id, i)

        # NOTE(review): the phase branch is given run_id rather than the
        # per-structure rid -- confirm that this is intended.
        if clade_tree_method ==  'bionj': 
            tree = phyml.tree(ali, run_id = rid, bionj = True)
        else: tree = get_phase_tree(ali, pairs[0], run_id)

        # Attach each leaf's aligned sequence with uniform probabilities.
        # The loop variable must not be called `i`: that used to overwrite
        # the structure index used in the run ids below.
        for ct in tree.get_terminals():
            seq = [rec for rec in ali if rec.id == ct.name][0]
            ct.m = {'seq':seq,
                    'probs':array([1 for j in range(len(seq))])}

        if clade_ancestor_method == 'independent':
            ml_tree = get_ml_ancestor_tree(tree, ali, 
                                           '{0}_paml{1}'.format(run_id, i))
        else:
            ml_tree = get_structure_ancestor_tree(\
                tree, ali,'{0}_stree{1}'.format(run_id, i))

        muts, times, gaps, irresolvables = tree_conservation.count_struct(ml_tree, pairs[0])

        struct_data.update(muts = muts, times = times, 
                        gaps = gaps, irresolvables = irresolvables)
        seq_group_data['structs'].append(struct_data)

    return seq_group_data
示例#24
0
文件: exp.py 项目: bh0085/compbio
def c2( launcher = None, ncluster =2000, host = 'tin', 
        reset = 0, step = 10, exemp_time = 'all',
        doplot = False ,**kwargs):
  '''
  Two stage exemplar clustering of BDTNP expression data.

  The function is meant to be called twice:
    1) with launcher = None it builds (or fetches from the mem cache) a
       bcl launcher holding one clustering job per self-similarity
       percentile and returns that launcher.
    2) with the launcher from step 1 it reads launcher.output(), assigns
       every row of the flattened expression data to its most similar
       exemplar and optionally plots the assignments.

  Genes are selected as those that are in the top 20 by both the minimum
  and the median (over axis 1 of the per gene variances) of var(vals, 1).
  NOTE(review): from the indexing below vals looks like
  (gene, cell, timepoint) -- confirm against nio.getBDTNP.

  inputs:
    launcher    [None]   launcher returned by a previous call, or None
    ncluster    [2000]   number of randomly drawn rows used to build the
                         similarity matrix (numpy seed fixed to 1)
    host        ['tin']  passed through to bcl.launcher
    reset       [0]      passed through to mem.rc for both cached calls
    step        [10]     stride used to thin the points that are plotted
    exemp_time  ['all']  'all', or a value matched against column 1 of the
                         (axis0, axis1) coordinate table to restrict the
                         rows that can be drawn
    doplot      [False]  draw one axes per (output row, timepoint)
    kwargs               accepted but not read

  output:
    launcher                when called with launcher = None
    all_members, ct_data    otherwise; all_members holds one argmax array
                            per entry of the launcher output (index into
                            the de-duplicated exemplar set of that entry),
                            ct_data is the coordinate table of every row.
  '''
  mrnas = nio.getBDTNP()
  misc = nio.getBDTNP(misc = True)
  
  vals = array([v['vals'] for v in mrnas.values()])
  gvars = var(vals, 1)
  gminvars = np.min(gvars,1)
  gmedvars = median(gvars,1)

  #indices of the 20 largest entries under each criterion
  min20 = argsort(gminvars)[::-1][:20]
  med20 = argsort(gmedvars)[::-1][:20]

  #keep only the genes that make both top-20 lists
  int20 = set(min20).intersection(set(med20))
  xgenes = array(list(int20))

  #move the gene axis last so that each (axis0, axis1) cell is a gene vector
  cell_data = vals[xgenes].transpose(1,2,0)
  scd = shape(cell_data)
  #times = reshape(zeros(shape(cell_data[0:2]))[:,:,newaxis , arange(shape(cell_data[1]))
  #                    , (prod(shape(cell_data)[0:2])))
  #xycoords[a, b] == [a, b]: the (axis0, axis1) index of every cell
  xycoords = (arange(scd[0])[:,newaxis,newaxis]*[1,0] +\
                arange(scd[1])[newaxis,:,newaxis]*[0,1])
  #flatten the first two axes; xy_data keeps the original indices of each row
  cell_data = reshape(cell_data, (prod(shape(cell_data)[0:2]), shape(cell_data)[2] ))
  xy_data = reshape(xycoords, (prod(scd[0:2]),2 ))
    
  if exemp_time == 'all':
    inds = arange(len(cell_data))
  else:
    inds = arange(len(cell_data))[nonzero(equal(xy_data[:,1],exemp_time))[0]]
  
  #fixed seed so that the same rows are drawn on both calls
  np.random.seed(1)
  np.random.shuffle(inds)
  rand_thousand = inds[0:ncluster]
  
  sim_data = cell_data[rand_thousand]
  sim_xy = xy_data[rand_thousand]
  t = [ mean(sim_data, 0), std(sim_data,0)]
  #NOTE(review): this assignment writes 0 where the value is already 0 and
  #so changes nothing. Possibly 1 was intended to guard a division by the
  #std inside similarity() -- confirm before changing.
  t[1][equal(t[1],0)] = 0
  metric = 'neg_dist'
  sims = similarity(sim_data, transform = t, method = metric)

  name = 'll_{0}_{1}_{2}'.format(metric,ncluster,exemp_time)
  def setLauncher(**kwargs):
    #Build one job per self similarity percentile; host comes from the
    #enclosing call, everything else from the kwargs handed over by mem.
    sims= kwargs.get('sims')
    metric = kwargs.get('metric')
    name = kwargs.get('name')
    d_in = []
    percs = logspace(.1,1.5,8)
    for p in percs:
      d_in.append(dict(similarities = sims,
                       self_similarity = ss.scoreatpercentile(sims, p),
                       metric = metric
                       ))

    launcher = bcl.launcher(d_in, host = host, name = name)
    return launcher  
  if launcher is None:
    #First stage: hand back the launcher and stop here.
    output = mem.getOrSet(setLauncher,
                          **mem.rc(dict(sims = sims, metric = metric,
                                        name = name,
                                        hardcopy = True,
                                        reset = reset,
                                        hard_reset = False,)))  
    return output



  def setC2(launcher = launcher, **kwargs):
    if launcher is None:
      raise Exception()
    else:
      output = launcher.output()
    return output
    #It appears that the bsub process failed for the first output.
    #No big deal. Debug later.
  
  #'hardcopy' was misspelled 'harcopy' here, unlike the call above.
  output = mem.getOrSet(setC2,
                        **mem.rc(dict(hardcopy = True,
                                      launcher = launcher,
                                      reset = reset,
                                      on_fail = 'compute',
                                      hard_reset = False,
                                      name =  'c2'+ name )))
  all_inds = array([  squeeze(o['inds']) for o in output[:] ])
  

  #NOTE(review): indexing with a list of index sequences relies on the
  #python 2 / legacy numpy behaviour of treating it as a tuple index.
  xs = misc['x']['vals'][zip(*xy_data)] #zip(*sim_xy)]
  ys = misc['y']['vals'][zip(*xy_data)] #zip(*sim_xy)]
  zs = misc['z']['vals'][zip(*xy_data)] #zip(*sim_xy)]
  
  colors =array( mycolors.getct(shape(all_inds)[1]) )
  f = plt.figure(0)
  f.clear()
  
  all_tps = range(scd[1])
  nc = len(all_inds)
  nt = len(all_tps)

  all_members = []
  for i, inds in enumerate(all_inds):
    #compute similarity matrices 1000 at a time:
    #de-duplicate the exemplar indices before looking the rows up
    exemplars = sim_data[list(set(list(inds)))]
    sim = similarity(cell_data, 
                   exemplars, 
                   transform = t,
                   method = metric)
    #every row is assigned to the exemplar it is most similar to
    closest = argmax(sim, 1)
    all_members.append(closest)
    
    
    if doplot:
      for j, tp in enumerate(all_tps):
        #one axes per (output row i, timepoint j), tiled over the figure
        ax = f.add_axes( [float(j)/nt,float(i) /nc,1./nt, 1. /nc] )
        ax.set_yticks([])
        ax.set_xticks([])
        i_sub = nonzero(equal(xy_data[:,1], j) * greater(ys,0))[0]
        cs = colors[closest[i_sub]]
        x = xs[i_sub]
        z = zs[i_sub]
        plt.scatter(x[::step],z[::step], 40,alpha = .75, c = cs[::step], edgecolor = 'none')
    
  ct_data = xy_data
  return all_members, ct_data
示例#25
0
def dsi_boxplot(num = 1 ,  method = 'tree', reset = False,
                plot_kcs = True,
                bp_means = False,
                bp_zeros = True, zero_ofs = 1e-6,
                bp_logs = True,
                show_kos = True,
                log_scale = True,
                filter_rows_and_cols = True,
                boxplot = True):
  '''
  Summarise the grid returned by parseNet per experiment class, write the
  summary to a text file and save a figure of the per class values.

  For every experiment class produced by sg_choosers() (except 'general')
  and for the cells whose row index appears as G<row> in the
  'DeletedGenes' / 'OverexpressedGenes' descriptions of their column
  (class 'ko'), the mean of the nonzero grid values is collected column by
  column. Those values are written to
  daniel/txt/net<num>_<method> and plotted to
  daniel/figs/final_bp_net<num>_<method>_<log|lin>.ps.

  inputs:
    num                   [1]       passed to parseNet, used in file names
    method                ['tree']  passed to parseNet, used in file names
    reset                 [False]   passed to parseNet and mem.getOrSet
    log_scale             [True]    log scale on the y axis
    filter_rows_and_cols  [True]    sort rows by their maximum and drop
                                    rows/columns whose maximum is zero
    boxplot               [True]    boxplot if True, bars of medians if not
    show_kos              [True]    only read by the 'twoplots' branch,
                                    which is unreachable while plot_type is
                                    hard coded to 'dsi_final'
    plot_kcs, bp_means, bp_zeros, zero_ofs, bp_logs
                                    accepted but not read

  output:
    None. Side effects are the text file and the saved figure.
  '''

  grid, descriptions = parseNet(num= num, method = method, reset = reset)
  grid = array(grid)
  descriptions = dict(descriptions)
  new_descriptions = {}

  if filter_rows_and_cols:
    #Filter out bad rows and columns
    good_exps = nonzero(np.max(grid,0))[0]

    #rows are reordered by decreasing maximum before the zero rows are cut
    tf_new_idxs = list(argsort(np.max(grid,1))[::-1])
    new_grid = grid[tf_new_idxs]
    good_tfs = nonzero(np.max(new_grid,1))[0]

    
    #Relabel the descriptions to take filtration into account
    #NOTE(review): the original remark here said one is subtracted from the
    #matched number to undo one based indexing, but no subtraction is done.
    digits = re.compile(r'(\d+)')
    def relabel(match):
      #map an old row number to its position after sorting, if it has one
      old = int(match.group())
      return str(tf_new_idxs.index(old)) if old in tf_new_idxs else match.group()

    for k, value in descriptions.iteritems():
      if 'Genes' in k:
        new_descriptions[k] = [digits.sub(relabel, g) for g in value]
      else:
        new_descriptions[k] = value
      new_descriptions[k] = list(array(new_descriptions[k])[good_exps])
      
    new_grid = new_grid[good_tfs, :]
    new_grid = new_grid[ :,good_exps]
    
    grid = new_grid
    descriptions = new_descriptions


  #Make lambdas to split experiments into categories
  col_choosers = sg_choosers()
  #Split experiments
  exps = {}
  for k, v in col_choosers.iteritems():
    #one dict per experiment column, keyed like descriptions
    vs = [ dict(zip(descriptions.keys() , elt))
          for elt in  zip(*descriptions.values()) ]    
    exps[k] = nonzero( [v(e) for e in vs ])[0]

  #Remove 'general' as the values wind up being all zeros.
  exps.pop('general')
  
  #Mark experiments that knock out TFS
  #tf_kn_matches[t] lists the columns whose deleted or overexpressed gene
  #lists contain 'G<t>'; the trailing comma keeps G1 from matching G10.
  tf_kn_matches =[ sorted(list(it.chain(\
          nonzero([ 'G{0},'.format(t) in x+',' 
                    for x in  descriptions['DeletedGenes'] ])[0],
          nonzero([ 'G{0},'.format(t) in x+',' 
                    for x in  descriptions['OverexpressedGenes'] ])[0])))
                   for t in range(shape(grid)[0])]
  knockout_tfs = nonzero([len(k) for k in tf_kn_matches])[0]
  knockout_cells = array(list(it.chain(*[ [(i, exp) for exp in tf_kn_matches[i] ] 
                               for i in range(len(tf_kn_matches))])))
  #NOTE(review): indexing with a list of index sequences relies on the
  #python 2 / legacy numpy behaviour of treating it as a tuple index.
  knockout_vals = grid[zip(*knockout_cells)]
  
  do_final_bps = True
  kn_exps = {}

  split_ko_ts = False
  
  kn_exps['ko'] = []
  

  
  def getBPS(**kwargs):
    #Collect, per experiment class and then per knockout class, the label,
    #the mean / standard error of the nonzero fraction and of the nonzero
    #values, and the raw per column values. Appends to kn_exps['ko'].
    xlabels = []
    nz_frac_std  = []
    nz_frac_mean = []
    nz_val_std   = []
    nz_val_mean  = []
    
    nz_colvals = []

    for k, ecols in exps.iteritems():
      these_knockouts = array([c for c in knockout_cells if c[1] in ecols])
      exp_cells = array([(i,j) for j in ecols for i in arange(shape(grid)[0])])
      #length check instead of comparing an array against a list
      if len(these_knockouts) > 0:
        kns_found = [c for c in exp_cells 
                     if  np.sum(greater( np.product(c==these_knockouts,1),0),0)]
        kn_exps['ko'] += kns_found

        nokns_found = [c for c in exp_cells 
                       if not np.sum(greater( np.product(c==these_knockouts,1),0),0)]
      else:
        nokns_found = exp_cells

      #grid values of each column of this class, one array per column
      cexp = [grid[zip(*exp_cells[\
              nonzero(equal(exp_cells[:,1],col))[0]])] \
                         for col in ecols] 

      if cexp == []:
        #class without columns: record zeros so the outputs stay aligned
        for arr in [nz_frac_std, nz_frac_mean,
                    nz_val_std, nz_val_mean]:
          arr.append(0.)
        nz_colvals.append([])
        xlabels.append(k)
        continue
      
      colwise_fracs = [mean(1.*greater(col,0)) for col in cexp]
      colwise_exprs = [mean(col[nonzero(greater(col,0))]) for col in cexp]
      #a column without positive entries has a nan mean; count it as 0
      colwise_exprs = [c if not isnan(c) else 0 for c in colwise_exprs]

      nz_colvals.append(colwise_exprs)

      nz_frac_std.append(std(colwise_fracs)/sqrt(len(colwise_fracs)))
      nz_frac_mean.append(mean(colwise_fracs))
      nz_val_std.append(std(colwise_exprs)/sqrt(len(colwise_exprs)))
      nz_val_mean.append(mean(colwise_exprs))
      
      if isnan(nz_val_mean[-1]): raise Exception()
      
      xlabels.append(k)

    for k, ecells in kn_exps.iteritems():
      ecells = array(ecells)
      nz_frac_std.append(0)
      nz_val_std.append(0)
      if len(ecells) == 0:
        for arr in [nz_frac_mean, nz_val_mean]:
          arr.append(0.)
        nz_colvals.append([])
      else:
        nz_frac_mean.append(mean(greater(grid[zip(*ecells)],0)))
        nz_val_mean.append(mean(grid[zip(*ecells[greater(grid[zip(*ecells)],0)])]))
        nz_colvals.append(grid[zip(*ecells[greater(grid[zip(*ecells)],0)])])
      xlabels.append(k)
      
    return xlabels, array(nz_frac_std),array(nz_val_std),array(nz_frac_mean), array(nz_val_mean), [array(cv) for cv in nz_colvals]
  xlabels, nz_frac_std,nz_val_std,nz_frac_mean, nz_val_mean, nz_colvals = mem.getOrSet(getBPS,on_fail = 'compute', reset = reset)
  
  #fixed display order; classes that were not produced are skipped
  args = [xlabels.index(x) for x in 
          ['general_ts', 'drug', 'drug_ts', 
           'genetic', 'genetic_ts', 'drug_genetic', 'drug_genetic_ts', 'ko']
          if x in xlabels]
  #nz_val_std was previously assigned to a misspelled name (nz_cal_std) and
  #so was left in the old order while every other array was reordered.
  xlabels, nz_frac_std,nz_val_std,nz_frac_mean,nz_val_mean =\
      array(xlabels)[args],nz_frac_std[args],nz_val_std[args],nz_frac_mean[args],nz_val_mean[args]
  nz_colvals = [nz_colvals[a] for a in args]

  f = plt.figure(0)
  f.clear()

  #the with block closes the file even if formatting a row fails
  with open(cfg.dataPath('daniel/txt/net{0}_{1}'.format(num,method )),'w') as topen:
    topen.write('\t'.join(['exp_class','mean_influence','std_influence','stderr_influence'])+'\n')
    for idx, exp_class in enumerate(xlabels):
      #standard error is std / sqrt(n), as in getBPS; it used to be std / n
      topen.write('{0}\t{1}\t{2}\t{3}\n'.format(exp_class,mean(nz_colvals[idx]),std(nz_colvals[idx]),\
                                                  std(nz_colvals[idx])/ sqrt(len(nz_colvals[idx]))))

  plot_type = 'dsi_final'
  if plot_type == 'dsi_final':
    margin = .05
    wid0 = .75
    cs = mycolors.getct(len(nz_colvals))
    
    ax0 = f.add_axes([margin,margin, wid0 , 1. - 2* margin], title =  'Experminent mean significances: blue (red) lines denote quartiles (media).')
    if log_scale: ax0.set_yscale('log')
    #ax0.set_autoscaley_on(False)
    #the last class is drawn as a dotted reference line (boxplot) or left
    #out (bars) rather than getting a box of its own
    if boxplot:
      ax0.boxplot(nz_colvals[0:-1], widths = [.5] * (len(nz_colvals )-1))
      ax0.hlines([mean(nz_colvals[-1])],-100, 100,color = 'red',linestyle = ':',linewidth = 1)
    else:
      ax0.bar(.2 + arange(len(nz_colvals[0:-1])), [median(c) for c in nz_colvals[0:-1]],
              color = cs[:-1])
  
    ax0.set_xticklabels(xlabels[:-1])
    

    if boxplot:
      pass
      #ax0.set_ylim([min(nz_colvals[:-1]), max(nz_colvals[:-1])/10])

    #ax1 = f.add_axes([2*margin +wid0, margin, (1 - margin) - (2 * margin + wid0), 1- 2* margin],sharey = ax0, title = 'TF knockout/OE')
    #if boxplot:
    #  ax1.boxplot(nz_colvals[-1:],widths = .5)
    #else:
    #  ax1.bar([.2],[mean(c) for c in nz_colvals[-1:]],
    #          color = cs[-1:])

    #ax1.set_xticklabels(xlabels[-1:])
    
    if boxplot:
      pass
      #ax1.set_ylim([np.min([min(c) for c in nz_colvals[:-1]]), np.max([max(c) for c in nz_colvals[:-1]])])

    f.savefig(cfg.dataPath('daniel/figs/final_bp_net{0}_{1}_{2}.ps'.\
                                format(num, method,
                                       'log' if log_scale else 'lin')),
              dpi = 10)
  
    return
  elif plot_type == 'twoplots':
    #unreachable while plot_type is hard coded above; kept for reference
    nkeys = len(xlabels)
    if show_kos: xi = arange(nkeys)
    else: xi = arange(nkeys -1)
    
    y1 = nz_val_mean[xi]
    s1 =  nz_val_std[xi]
    y2 = nz_frac_mean[xi]
    s2 =  nz_frac_std[xi]
    
    a1 = f.add_subplot(211, ylim =[0, max(y1)+max(s1)], title = 'mean value of nonzero influences\n standard error across experiments')
    a2 = f.add_subplot(212, ylim =[0,max(y2)+ max(s2)], title = 'mean values of fraction nonzero influences\n standard error across experiments' )
    
    colors = mycolors.getct(nkeys)
    wofs = .15
    b1 = a1.bar(xi+wofs,y1,1.-wofs*2, linewidth = 3,color = colors,  ecolor = 'black')
    b2 = a2.bar(xi+wofs,y2,1.-wofs*2, linewidth = 3,color = colors,  ecolor = 'black' )
    p1,c1,b1 = a1.errorbar(xi+.5, y1, yerr = s1,capsize = 15, elinewidth = 4, color = 'black',linewidth = 0, ecolor = 'black')
    p2,c2,b2 = a2.errorbar(xi+.5, y2, yerr = s2,capsize = 15, elinewidth = 4, color = 'black',linewidth =0, ecolor = 'black')
    for c in c1:c.set_alpha(1.)
    for c in c2:c.set_color('black')
    for c in a2.get_children() + a1.get_children():
        #best effort: children that cannot take a line width are skipped
        try: 
          if not c in [p1,p2]: c.set_linewidth(4)
        except Exception: pass
        continue
    a2.set_xticklabels([])
    for i in xi:
      a2.text( float(i) + .5,0,xlabels[i] , rotation = '-15',size = '16', ha = 'left',va='top')
    f.savefig(cfg.dataPath('daniel/figs/latest/{1:03d}_{0}_{2}.tiff'.\
                                format('no_kos' if not show_kos else 'kos', 
                                       num ,
                                       'log' if log_scale else 'lin')),format = 'tiff')
             
  return
示例#26
0
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    '''
    Compare trees built from muscle alignments with trees built from
    cmalign ('struct') alignments of the same sequences.

    Both sets of alignments are computed (or fetched from the mem cache)
    with setAlignments. For each pair of alignments a bionj tree is built
    with phyml, the pairwise terminal distances of both trees are handed
    to tree_similarity, and both trees are drawn as graphs into one
    figure that is saved as
    figs/gpm2/pt2_mus_cm_tree_embeddings_<run_id>_struct_<idx>.ps.

    inputs:
      seqs      passed through to setAlignments
      profiles  passed through to setAlignments
      run_id    used for cache registers, phyml runs and output names
      reset     [True]  passed through to mem.rc for both alignment calls

    output:
      None is returned by the code shown here; results are the saved
      figures and whatever tree_similarity produces.
    '''
    #single argument print(...) gives the same output under python 2
    print('computing alignments...')
    print('  ...using muscle')
    malis, mrefs, mpairs =\
            mem.getOrSet(setAlignments, 
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles, 
                                  run_id = run_id, ali_type = 'muscle',
                                  reset = reset,
                                  on_fail = 'compute', 
                                  register = 'tuali_musc_{0}'.format(run_id))) 
    print('  ...using cmalign.')
    #NOTE(review): this register has a double underscore, unlike the
    #'tuali_musc_' register above. Left as is because renaming it would
    #change the cache key -- confirm whether that is intended.
    salis, srefs, spairs  =\
        mem.getOrSet(setAlignments, 
                     **mem.rc({},
                              seqs = seqs, profiles = profiles, 
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute', 
                              register = 'tuali__struct_{0}'.format(run_id)))
 
    print('  ...making trees.')
    
    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        #NOTE(review): both trees are built under the same run_id; confirm
        #that phyml.tree does not reuse files between the two calls.
        mtree  = phyml.tree(m,run_id, bionj = True)
        stree  = phyml.tree(s,run_id, bionj = True)
        
        #row/column of a sequence = its position in the muscle alignment
        maps = dict([(elt.id,i) for i, elt in enumerate(m)])
        mdists = zeros((len(maps),len(maps)))
        sdists = zeros((len(maps),len(maps)))
        for n1 in mtree.get_terminals():
            for n2 in mtree.get_terminals():
                mdists[maps[n1.name],maps[n2.name]] = \
                    mtree.distance(n1,n2)
        
        for n1 in stree.get_terminals():
            for n2 in stree.get_terminals():
                sdists[maps[n1.name],maps[n2.name]] = \
                    stree.distance(n1,n2)
        #k = number of sequences minus one. The parenthesis used to close
        #after the subtraction, which made k equal to len(sdists).
        #NOTE(review): both calls are given the same label.
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = len(sdists) - 1)
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        import networkx

        for t, sp, ttype in zip([mtree, stree], [211,212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            #the layout is computed on integer labels and mapped back to nodes
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)


            #only nodes whose name is a sequence id get a visible marker
            networkx.draw(G, posn, labels = dict([(n, '') for n in G.nodes()]),
                      node_size = [100 if  n.name in maps else 0 for n in G.nodes()],
                      width = 1, edge_color = 'black',
                      ax = a,
                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] )
        

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,0],textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')            

        #phylo.draw_graphviz(  mtree,  label_func = lambda x: '', 
        #                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] +\
        #                          [ct[0] for n in mtree.get_nonterminals()], axes = ax)

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')