def init(**kwargs):
    '''
    Fetch the 16s tree of life through mem.getOrSet.

    When the tree has to be computed it is read from the newick file
    'sequences/16s.newick', every terminal and non-terminal node is
    given an empty metadata dict (node.m) and db_metadata() is then
    called on the tree.  Earlier notes list taxnode, gbacc and gbid as
    the metadata of terminal nodes; that is filled in by db_metadata
    and cannot be confirmed from this function.

    kwargs:
      name  ['default_tree']  forwarded to mem.getOrSet
      every other keyword is handed to mem.rc unchanged

    output:
      the single value mem.getOrSet returns for setTree (no separate
      halobacteria clade is returned)

    usage:
      tree = init()
    '''
    print('testing...')

    def setTree(**kwargs):
        newick_path = config.dataPath('sequences/16s.newick')
        tree = Phylo.read(newick_path, "newick")
        every_node = it.chain(tree.get_terminals(),
                              tree.get_nonterminals())
        for node in every_node:
            node.m = {}
        db_metadata(tree)
        print("SETTING TREE!!!")
        return tree

    cache_name = kwargs.get('name', 'default_tree')
    settings = mem.rc(kwargs,
                      name=cache_name,
                      on_fail='compute',
                      register='init')
    return mem.getOrSet(setTree, **settings)
def getBDTNP(protein = False,misc = False, **kwargs):
    '''
    Get BDTNP expression columns through mem.getOrSet.

    inputs:
      protein [False]  return the protein columns instead of mRNA
      misc    [False]  return the misc columns; wins over protein
      remaining kwargs are handed to mem.rc

    output (as built by setBDTNP):
      misc:     misc_cols as returned by bdtnp.parser.read()
      protein:  {flybase id: column} for protein columns whose name,
                minus its last character, is a symbol in gene_map.tsv
      default:  {flybase id: column} for mRNA columns whose name is a
                symbol in gene_map.tsv ('CG31670' is mapped through
                the symbol 'erm')
      Every gene and misc column gains a 'vals' array of shape
      (len(rows), number of distinct 'steps' values over gene_cols);
      steps a column does not have stay zero.
    '''
    def setBDTNP(protein = False, misc = False, **kwargs):
        gene_cols, misc_cols, rows, row_nns = bdtnp.parser.read()

        # Symbol -> flybase id table; blank lines and '#' lines are skipped.
        map_rows = []
        with open(config.dataPath('flybase/gene_map.tsv')) as mapfile:
            for line in mapfile:
                line = line.replace('\n', '')
                if line != '' and line[0] != '#':
                    map_rows.append(line.split('\t'))
        syms = [r[0] for r in map_rows]
        fbids = [r[1] for r in map_rows]

        # Freeze the set into a list once so that every column is laid
        # out with the same step -> column order.
        times = list(set(it.chain(*[c['steps'] for c in gene_cols.values()])))
        for g in list(gene_cols.values()) + list(misc_cols.values()):
            gene_rows = zeros((len(rows), len(times)))
            for i, t in enumerate(times):
                if t in g['steps']:
                    gene_rows[:, i] = rows[:, g['idxs'][g['steps'].index(t)]]
            g['vals'] = gene_rows

        if misc:
            return misc_cols

        def cols_of_type(kind):
            return dict([(k, v) for k, v in gene_cols.items()
                         if v['info']['type'] == kind])

        #things that are wonky include:
        # 1) Protein data (where column names do not match flybase symbols)
        # 2) Weird elements such as Traf1 that are not present in the network anyway
        # 3) FBgn0031375 / CG31670 which is apparently absent from the map and I fix.
        out = {}
        if protein:
            for name, col in cols_of_type('protein').items():
                sym = name[:-1]
                if sym in syms:
                    out[fbids[syms.index(sym)]] = col
        else:
            for name, col in cols_of_type('mRNA').items():
                if name in syms:
                    idx = syms.index(name)
                elif name == 'CG31670':
                    idx = syms.index('erm')
                else:
                    continue
                out[fbids[idx]] = col
        return out

    if protein:
        register = 'protein'
    elif misc:
        register = 'misc'
    else:
        register = 'mrna'
    return mem.getOrSet(setBDTNP,
                        **mem.rc(kwargs,
                                 register = register,
                                 protein = protein,
                                 misc = misc,
                                 on_fail = 'compute'))
def getBNet(**kwargs):
    '''Get the saved knowledge based network (redFly, bRN.txt).

    Each line of the file contributes one (tf, target) edge with weight
    1.0.  Only edges whose tf is a key of getTC() and whose target is a
    key of getCL() are kept.

    output: tuple (trgs, tfs) of dicts keyed by gene/tf names
      trgs: {gname:  {'color': array([n, 0, 0]),
                      'tfs': ['tfname', ...],
                      'weights': [1.0, ...]} ...}
            n counts targets 1.0, 2.0, ... in sorted target order
      tfs : {tfname: {'targets': ['gname', ...],
                      'weights': [1.0, ...]} ...}
    '''
    def setBNet(**kwargs):
        fpath = config.dataPath('network/network_predmodel/inputnetworks/bRN.txt')
        TC = getTC( reset = mod(kwargs.get('reset',0),2))
        CL = getCL( reset = mod(kwargs.get('reset',0),2))
        with open(fpath) as netfile:
            nwdata = netfile.read()

        def tf_of(edge):
            return edge[0]

        def target_of(edge):
            return edge[1]

        def weight_of(edge):
            return float(edge[2])

        edge_re = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)', re.M)
        matches = list(edge_re.finditer(nwdata))

        #Unsorted lists of tfs and targets
        targets = [m.group('target') for m in matches]
        tfs = [m.group('tf') for m in matches]
        weights = [1.0] * len(tfs)

        #Concat the data for easier sorting (edges ordered by tf name)
        cat = []
        for i in np.argsort(tfs):
            if tfs[i] in TC and targets[i] in CL:
                cat.append([tfs[i], targets[i], weights[i]])

        #Extract a dictionary with information for each target.
        trg_d = {}
        count = 0.0
        for k, g in it.groupby(sorted(cat, key = target_of), key = target_of):
            edges = list(g)
            count += 1.0
            trg_d[k] = {'color': np.array([count, 0, 0]),
                        'tfs': [tf_of(e) for e in edges],
                        'weights': [weight_of(e) for e in edges]}

        #Extract a dictionary with information for each TF
        # (cat is already ordered by tf, as groupby requires)
        tf_d = {}
        for k, g in it.groupby(cat, key = tf_of):
            edges = list(g)
            tf_d[k] = {'targets': [target_of(e) for e in edges],
                       'weights': [weight_of(e) for e in edges]}
        return (trg_d, tf_d)

    return mem.getOrSet(setBNet, **mem.rc({},on_fail = 'compute',**kwargs))
def datafiles(**kwargs):
    '''
    Load the numeric table behind every entry of id_map().

    output (on compute): {id: array}, one array per id_map entry, built
    from the lines of entry['file'] whose first character is a digit;
    each such line is split on whitespace and converted to floats.
    '''
    def set_datafiles(**kwargs):
        whitespace = re.compile(r'\s+')
        out = {}
        idmap = id_map(**mem.sr(kwargs))
        for k, v in idmap.items():
            # the context manager closes each data file as soon as it is read
            with open(v['file']) as datafile:
                out[k] = array([[float(e) for e in whitespace.split(l.strip())]
                                for l in datafile.readlines()
                                if l[0] in '0123456789'])
        return out

    return mem.getOrSet(set_datafiles, **mem.rc(kwargs, on_fail = 'compute'))
def get_seqs(dbname, **kwargs):
    '''
    Return every Sequence row of the database called dbname.

    The query runs through mem.getOrSet with hardcopy = False and the
    database name as the cache name.
    '''
    def set_seqs(**kwargs):
        database = compbio.projects.cbdb.getName(kwargs['dbname'])
        return database.S.q(database.Sequence).all()

    settings = mem.rc(kwargs,
                      hardcopy = False,
                      name = dbname,
                      on_fail = 'compute',
                      dbname = dbname)
    return mem.getOrSet(set_seqs, **settings)
def getBTOL(**kwargs):
    '''
    Get a BTOL instance through mem.getOrSet (register 'BTOL').

    When the instance reports that its tree is not initialized, the
    tree is initialized and saved before the instance is returned.
    '''
    def setBTOL(**kwargs):
        btol = BTOL(**mem.sr(kwargs))
        if B_ready(btol):
            return btol
        print('Underlying tree structure apparently uninitialized: initializing\n...')
        btol.initTree()
        print('...\nDone\nSaving\n...')
        btol.saveTree()
        print('...\nDone')
        return btol

    def B_ready(instance):
        # thin wrapper so the guard clause above reads naturally
        return instance.treeInitialized()

    settings = mem.rc(kwargs, register = 'BTOL')
    return mem.getOrSet(setBTOL, **settings)
def recall_c2(**kwargs):
    '''
    A kludgy wrapper to store the clustering results for later without
    modifying the original mess of a program, c2...

    c2 is called twice: first without a launcher, then again with the
    value the first call returned as its launcher argument.  The result
    of the second call is what mem.getOrSet stores under the name
    'default_c2_settings'.
    '''
    def setC2(**kwargs):
        launcher = c2(**mem.sr(kwargs))
        return c2(launcher, **mem.sr(kwargs))

    settings = mem.rc(kwargs,
                      name = 'default_c2_settings',
                      on_fail = 'compute')
    return mem.getOrSet(setC2, **settings)
def leafNodes(self,**kwargs):
    '''
    Return one ncbi node per terminal of self.t, in terminal order.

    Terminals whose metadata dict (leaf.m) has no 'taxid' entry yield
    None.  The list is obtained through mem.getOrSet (register
    'leaf_nodes', hardcopy = False).
    '''
    def setLeafNodes(**kwargs):
        leaves = self.t.get_terminals()
        taxdb = cbdb.getName('taxdmp')
        found = []
        for leaf in leaves:
            if 'taxid' in leaf.m:
                found.append(ncbi.get_node(leaf.m['taxid'], taxdb))
            else:
                found.append(None)
        return found

    settings = mem.rc(kwargs,
                      hardcopy = False,
                      on_fail = 'compute',
                      register = 'leaf_nodes')
    return mem.getOrSet(setLeafNodes, **settings)
def get_taxnodes(dbname, **kwargs):
    '''
    Return one ncbi node per sequence of the database called dbname.

    Sequences come from get_seqs(dbname); a sequence whose source_taxon
    is falsy yields None.  Cached through mem.getOrSet with the
    database name as register and hardcopy = False.
    '''
    def set_taxnodes(**kwargs):
        sequences = get_seqs(dbname, **mem.sr(kwargs))
        taxa = []
        for seq in sequences:
            taxon = seq.source_taxon
            taxa.append(taxon if taxon else None)
        return [None if taxon is None else ncbi.get_node(taxon)
                for taxon in taxa]

    settings = mem.rc(kwargs,
                      on_fail = 'compute',
                      hardcopy = False,
                      register = dbname)
    return mem.getOrSet(set_taxnodes, **settings)
def get_taxon_forsome(nodes,rank,set_name = 'default_setname',**kwargs):
    '''
    Map each node in nodes to its taxon at the given rank.

    Falsy entries of nodes yield None.  The result is cached through
    mem.getOrSet under register set_name + rank (hardcopy = False).
    '''
    def set_taxon_forsome(nodes = None, rank = None,**kwargs):
        assert nodes != None
        assert rank != None
        taxa = []
        for node in nodes:
            taxa.append(ncbi.get_taxon(node, rank = rank) if node else None)
        return taxa

    settings = mem.rc(kwargs,
                      nodes = nodes,
                      rank = rank,
                      on_fail = 'compute',
                      hardcopy = False,
                      register = set_name + rank)
    return mem.getOrSet(set_taxon_forsome, **settings)
def taxon_with_name( rank, name, **kwargs ):
    '''
    Return the single node of the given rank whose sciname() equals name.

    The candidates come from get_rank(rank).  An AssertionError
    ('Ambiguous phylum match?') is raised unless exactly one candidate
    matches -- that includes the case of no match at all.  Cached
    through mem.getOrSet under register rank + '_' + name.
    '''
    def set_taxon_with_name(name = None, rank = None, **kwargs):
        assert name != None
        assert rank != None
        hits = []
        for candidate in get_rank(rank):
            if sciname(candidate) == name:
                hits.append(candidate)
        assert len(hits) == 1, 'Ambiguous phylum match?'
        return hits[0]

    settings = dict(rank = rank,
                    name = name,
                    register = rank+'_'+name,
                    hardcopy = False,
                    on_fail = 'compute',
                    **kwargs)
    return mem.getOrSet(set_taxon_with_name, **settings)
def getTaxon(self,rank = rank, **kwargs):
    '''
    Return, for every leaf node of this instance, its taxon at rank.

    Leaf nodes come from self.leafNodes(); entries that are falsy yield
    None.  Cached through mem.getOrSet with rank as the register.

    NOTE(review): the settings are built with mem.sr here, whereas the
    sibling wrappers use mem.rc for this purpose -- confirm that this
    is intended.
    '''
    def setTaxon(BTInstance = None, rank = None, **kwargs):
        assert rank
        assert BTInstance
        taxa = []
        for node in BTInstance.leafNodes(**mem.sr(kwargs)):
            if node:
                taxa.append(ncbi.get_taxon(node, rank=rank))
            else:
                taxa.append(None)
        return taxa

    settings = mem.sr(kwargs,
                      rank = rank,
                      BTInstance = self,
                      on_fail = 'compute',
                      hardcopy = False,
                      register = rank)
    return mem.getOrSet(setTaxon, **settings)
def get_taxon_forall(aliname, rank = None, **kwargs):
    '''
    Return the taxon at rank for every sequence node of aliname.

    Nodes come from get_taxnodes(aliname); falsy nodes yield None.
    Cached through mem.getOrSet under register aliname + rank, so rank
    has to be given even though it defaults to None.
    '''
    def setTaxon(aliname = None, rank = None,**kwargs):
        assert aliname != None
        assert rank != None
        taxa = []
        for node in get_taxnodes(aliname, **mem.sr(kwargs)):
            taxa.append(ncbi.get_taxon(node, rank=rank) if node else None)
        return taxa

    settings = mem.rc(kwargs,
                      aliname = aliname,
                      rank = rank,
                      on_fail = 'compute',
                      hardcopy = False,
                      register = aliname + rank)
    return mem.getOrSet(setTaxon, **settings)
def id_map(**kwargs):
    '''
    Map ids from flybase/gene_association.fb to the per-gene data files.

    A data file's gene name is its file name without extension, lower
    cased; for matching, everything but a-z is removed from both that
    name and from column 10 of each association line (lines starting
    with '!' are skipped).  The id is column 2 of the matching lines.

    output (on compute):
      {id: {'file': path of the data file, 'name': gene name}}

    raises:
      IndexError when a data file matches no association line.
    '''
    def set_id_map(**kwargs):
        fname = cfg.dataPath('reinitz/28-7-2011-1-56-6-30-0/txt/byGenes')
        non_alpha = re.compile('[^a-z]')

        data_files = os.listdir(fname)
        gname_orig = [os.path.splitext(f)[0].lower() for f in data_files]
        gfiles = dict([(gname_orig[i], os.path.join(fname, f))
                       for i, f in enumerate(data_files)])
        gname_map = dict([(non_alpha.sub('', g), g) for g in gname_orig])
        glines = dict([(k.lower(), []) for k in gname_map.keys()])

        # Collect, per gene, the association lines that mention it.
        lines_kept = {}
        with open(cfg.dataPath('flybase/gene_association.fb')) as gassoc:
            for i, line in enumerate(gassoc):
                if line[0] == '!':
                    continue
                g = non_alpha.sub('', line.lower().split('\t')[9].strip())
                if g in glines:
                    glines[g].append((i, g))
                    lines_kept[i] = line

        ids = {}
        for k, v in glines.items():
            these_ids = [lines_kept[i].split('\t')[1].strip() for i, _ in v]
            #just hacking here... for sloppy paired I use the first id...
            #alas...
            ids[k] = sorted(set(these_ids))[0]

        return dict([(idval, {'file': gfiles[gname_map[k]],
                              'name': gname_map[k]})
                     for k, idval in ids.items()])

    return mem.getOrSet(set_id_map,**mem.rc(kwargs,on_fail = 'compute'))
def parseNet(num = 1,method = 'tree', reset = False):
    '''
    Get one of daniel's nets.  Allowable numbers are 1-3 and allowable
    types are 'tree', 'svm'.

    output (on compute): tuple (grid, description)
      grid         array of shape (max tf index + 1, number of
                   experiments); grid[tf, exp - 1] holds the weight of
                   each 'weight tf exp' line of the data file, all
                   other cells are zero
      description  {column name: [value per experiment]} from the
                   chip features file, plus 'Exp_Index' holding the
                   row number of each experiment
    '''
    def setNet(**kwargs):
        method =kwargs.get('method', 'tree')
        num = kwargs.get('num', 1)
        description_path = cfg.dataPath('::daniel/net%s_chip_features.tsv') % num
        data_path = cfg.dataPath('::daniel/informativeness/%s%s.txt') %(method,num)
        split_re = re.compile(r'\s')

        with open(description_path) as desc_open:
            description_cols = split_re.split(desc_open.readline().strip()) + ['Exp_Index']
            description_vals = [split_re.split(l.strip())
                                for l in desc_open.readlines()]
        for idx, d in enumerate(description_vals):
            d.append(idx)

        with open(data_path) as data_open:
            records = [array(split_re.split(l.strip()), float)
                       for l in data_open.readlines()]
        weight, tf, exp = zip(*records)

        description = {}
        for i, col in enumerate(description_cols):
            description[col] = [d[i] for d in description_vals]

        # The file is parsed as floats; array shapes and indices have to
        # be integers, so convert explicitly.  Experiments are one based
        # in the file.
        tf_idx = [int(t) for t in tf]
        exp_idx = [int(e - 1) for e in exp]

        ntf = max(tf_idx) + 1
        nexp = len(description_vals)
        grid = zeros((ntf, nexp))
        for w, t, e in zip(weight, tf_idx, exp_idx):
            grid[t, e] = float(w)
        return grid, description

    return mem.getOrSet(setNet,
                        reset = reset,
                        register = method,
                        name = '%s%s' %(method,num),
                        method = method,
                        num = num)
def getCL(**kwargs):
    '''Cell line data

    Reads network/CL.geneexp: on every non-blank line the first
    whitespace separated token is the gene name, the remaining tokens
    are converted to floats.  A name that occurs more than once keeps
    the values of its last line.

    output: dict keyed by gene names, values are arrays'''
    def setCL(**kwargs):
        with open(config.dataPath('network/CL.geneexp')) as datafile:
            elts = datafile.read().split('\n')
        seqdict = {}
        for e in elts:
            tokens = e.split()
            if not tokens:
                continue
            seqdict[tokens[0]] = array([float(tok) for tok in tokens[1:]])
        return seqdict

    return mem.getOrSet(setCL, **kwargs)
def getSush(**kwargs):
    '''Get sushmita's regression weights and biases

    Files in network/network_predmodel/regressionwts/fRN whose name
    contains 'bias' hold 'gene level' lines; files whose name contains
    'nw' hold 'gene tf level' lines.  Blank lines are ignored.

    output (on compute):
      {gname: {'bias': level, 'tfs': [tfname, ...], 'weights': [level, ...]}}
      levels are kept as the strings found in the files; 'tfs' and
      'weights' exist only for genes that appear in an 'nw' file.

    raises:
      KeyError when an 'nw' file names a gene without a bias entry.
    '''
    def setSush(**kwargs):
        path = config.dataPath('network/network_predmodel/regressionwts/fRN')
        fnames = os.listdir(path)
        bias_files = [ os.path.join( path, f) for f in fnames if 'bias' in f ]
        nw_files = [ os.path.join( path, f) for f in fnames if 'nw' in f ]

        bias_re = re.compile(r'(?P<gname>\S+)\s+(?P<level>\S+)')
        weight_re = re.compile(r'(?P<gname>\S+)\s+(?P<tfname>\S+)\s+(?P<level>\S+)')

        genes = {}
        for b in bias_files:
            with open(b) as bias_file:
                for l in bias_file:
                    if not l.strip():
                        continue
                    match = bias_re.search(l)
                    genes[match.group('gname')] = dict(bias = match.group('level'))

        for n in nw_files:
            with open(n) as nw_file:
                for l in nw_file:
                    if not l.strip():
                        continue
                    match = weight_re.search(l)
                    g = genes[match.group('gname')]
                    g.setdefault('tfs', []).append(match.group('tfname'))
                    g.setdefault('weights', []).append(match.group('level'))
        return genes

    return mem.getOrSet(setSush, **mem.rc(kwargs, hardcopy = True))
def show_conservation(fidx = 0, reset = False):
    '''
    Plot conservation of the structures of one family.

    The family number is flist[fidx]; its id is 'RF' followed by the
    zero padded, five digit number.  Family 50 is handled with ftype
    'riboswitch', every other family with ftype 'all'.

    Figures are written with f.savefig(figfile.format(...)):
      '<rfid>_grid_rnas'      only when do_make_subopts is true
      '<rfid>_pca_<metric>'   a scatter of the first two columns of the
                              structure projection, colored by the mean
                              of sutils.metric per structure

    inputs:
      fidx  [0]      index into flist
      reset [False]  forwarded to both mem.getOrSet calls

    Nothing is returned.
    '''
    # NOTE(review): the block nesting below was reconstructed from a copy
    # of this file that had lost its indentation -- confirm.
    fnum = flist[fidx]
    rfid = 'RF{0:05}'.format(fnum)
    print rfid
    if fnum ==50:
        ftype = 'riboswitch'
    else:
        ftype = 'all'

    # 'out' is not used below; the call is kept as is (presumably for
    # its caching side effect -- confirm).
    out = mem.getOrSet(setFamData,
                       **mem.rc({}, reset =reset,
                                on_fail = 'compute',
                                hardcopy = False,
                                register = 'fdat'+rfid,
                                ftype = ftype,
                                rfid = rfid))
    mvals, tvals, structs = mem.getOrSet(setTree,
                                         **mem.rc({},reset = reset,
                                                  on_fail = 'compute',
                                                  hardcopy = True,
                                                  register = 'st'+rfid,
                                                  rfid = rfid,
                                                  ftype = ftype))
    # idxs: indices of the structures that get highlighted in the plots
    # below; tidx: index used to select a row of mvals / tvals.
    idxs, tidx = sutils.show_paired_v_energy(rfid,rfid,mvals,tvals,structs,ftype)

    all_pairs = structs['structs']
    all_energies = structs['energies']

    # per-highlighted-structure subsets; not referenced again in this function
    pints,eints, mints, tints = [structs['structs'][i] for i in idxs],\
        [ structs['energies'][i] for i in idxs],\
        [ mvals[tidx][i] for i in idxs],\
        [ tvals[tidx][i] for i in idxs]
    seq = structs['seq']

    if do_make_subopts:
        subopts = rutils.suboptimals(seq, n = 400)
        verts = rutils.struct_verts(subopts, seq, rfid)
        f = myplots.fignum(4,figsize)
        rplots.grid_rnas(verts, dims = [40])
        f.savefig(figfile.format('{0}_grid_rnas'.\
                                     format(rfid)))

    aff = rutils.struct_affinity_matrix(all_pairs, len(seq))
    pca = rutils.project_structs(all_pairs,
                                 ptype ='pca',
                                 affinities = aff,
                                 n_comp = 3)

    for metric in ['n_comp']:# ['frac_silent','frac_paired','n_comp']:
        # one color value per structure: the mean of the metric
        scolors = []
        for i in range(len(tvals[tidx])):
            m_silent, pidxs, frac_good = sutils.metric(
                mvals[tidx][i],tvals[tidx][i],
                mtype = metric)
            scolors.append(mean(m_silent))
        # rescaled values become the red channel of an rgb triple
        scolors = myplots.rescale(scolors, [0.,1.])[:,newaxis] * array([1.,0.,0.])

        f = myplots.fignum(4,figsize)
        ax = f.add_subplot(111)
        xvals, yvals = pca[:,:2].T
        myplots.padded_limits(ax, xvals, yvals)

        ax.scatter(xvals,yvals,300,linewidth = 1,
                   edgecolor = 'black', color = scolors)
        # highlighted structures: black disc, slightly smaller white disc,
        # then the colored marker on top
        ax.scatter(pca[idxs,0],pca[idxs,1], 2100 ,alpha = 1,
                   color = 'black')
        ax.scatter(pca[idxs,0],pca[idxs,1], 2000 ,alpha = 1,
                   color = 'white')
        ax.scatter(pca[idxs,0],pca[idxs,1], 400 ,alpha = 1,
                   color = scolors[idxs],
                   )
        ax.annotate('''Conservation metric: {0} Projected onto C=2 Principal 
Components'''.format(metric),
                    [0,1],xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords='offset points')
        f.savefig(figfile.format('{0}_pca_{1}'.\
                                     format(rfid, metric)))
def modules(reset=False):
    '''
    Return the value mem.getOrSet yields for setModules.

    inputs:
      reset [False]  forwarded to mem.getOrSet
    '''
    settings = mem.rc({},
                      reset=reset,
                      hardcopy=True,
                      on_fail="compute")
    return mem.getOrSet(setModules, **settings)
def getNet(**kwargs):
    '''Get the saved network from patrick's files.

    kwargs:
      net_name ['unsup']  'unsup' reads unsup_patrick.txt, 'logistic'
                          reads logistic_0.6.txt; also used as the
                          mem.getOrSet register
      reset               reset mod 2 is forwarded to getTC and getCL

    Each line of the file contributes one 'tf target weight' edge.
    Only edges whose tf is a key of getTC() and whose target is a key
    of getCL() are kept.

    output: tuple (trgs, tfs) of dicts keyed by gene/tf names
      trgs: {gname:  {'color': array([n, 0, 0]),
                      'tfs': ['tfname', ...],
                      'weights': [0.0, ...]} ...}
            n counts targets 1.0, 2.0, ... in sorted target order
      tfs : {tfname: {'targets': ['gname', ...],
                      'weights': [0.0, ...]} ...}

    raises:
      Exception for any other net_name.
    '''
    def setNet(**kwargs):
        net_name = kwargs.get('net_name', 'unsup')
        if net_name == 'unsup':
            netfile = 'unsup_patrick.txt'
        elif net_name == 'logistic':
            netfile = 'logistic_0.6.txt'
        else:
            raise Exception('unknown net_name: {0!r}'.format(net_name))

        fpath = config.dataPath('network/patrick/{0}'.format(netfile))
        TC = getTC( reset = mod(kwargs.get('reset',0),2))
        CL = getCL( reset = mod(kwargs.get('reset',0),2))
        with open(fpath) as nethandle:
            nwdata = nethandle.read()

        def tf_of(edge):
            return edge[0]

        def target_of(edge):
            return edge[1]

        def weight_of(edge):
            return float(edge[2])

        edge_re = re.compile(
            r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)\s+(?P<weight>\S+)', re.M)
        matches = list(edge_re.finditer(nwdata))

        #Unsorted lists of tfs and targets
        targets = [m.group('target') for m in matches]
        tfs = [m.group('tf') for m in matches]
        weights = [m.group('weight') for m in matches]

        #Concat the data for easier sorting (edges ordered by tf name)
        cat = []
        for i in np.argsort(tfs):
            if tfs[i] in TC and targets[i] in CL:
                cat.append([tfs[i], targets[i], weights[i]])

        #Extract a dictionary with information for each target.
        trg_d = {}
        count = 0.0
        for k, g in it.groupby(sorted(cat, key = target_of), key = target_of):
            edges = list(g)
            count += 1.0
            trg_d[k] = {'color': np.array([count, 0, 0]),
                        'tfs': [tf_of(e) for e in edges],
                        'weights': [weight_of(e) for e in edges]}

        #Extract a dictionary with information for each TF
        # (cat is already ordered by tf, as groupby requires)
        tf_d = {}
        for k, g in it.groupby(cat, key = tf_of):
            edges = list(g)
            tf_d[k] = {'targets': [target_of(e) for e in edges],
                       'weights': [weight_of(e) for e in edges]}
        return (trg_d, tf_d)

    return mem.getOrSet(setNet,
                        **mem.rc(kwargs,
                                 hardcopy = True,
                                 on_fail = 'compute',
                                 register = kwargs.get('net_name', 'unsup')))
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
                   draw_distances = draw_all_easy,
                   draw_clusters = draw_all_easy,
                   draw_single_cluster = draw_all_hard):
    '''
    Cluster the sequences of an rfam family and return them by cluster.

    1) Compute clusters from the distances produced by setDistances
       (cached through mem.getOrSet under register rfid), using
       maxclust_dists with k = 5 and method 'complete'.

    2) Group the alignment entries by cluster label.

    inputs:
      rfid                 ['RF00167'] family passed to rfam.get_fam
      reset                [True] forwarded to mem.getOrSet
      tree                 [True] NOTE(review): this argument is
                           overwritten by the tree returned from
                           rfam.get_fam before it is ever read
      draw_distances       when true, save a scatter of the flattened
                           seq_dists(tree = True) against
                           seq_dists(tree = False) with its R-squared
      draw_clusters        when true, save a scatter of the first two
                           PCA components colored by cluster
      draw_single_cluster  not referenced in this function

    output:
      list with one entry per cluster, each a list of the alignment
      entries (ali[idx]) that belong to that cluster

    NOTE(review): an earlier version of this text described a
    struct_align keyword; no such parameter exists here.
    '''
    # NOTE(review): the block nesting below was reconstructed from a copy
    # of this file that had lost its indentation -- confirm.
    rutils = utils
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        # lin[2] is squared and reported as R-squared
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')
        ax.scatter(dtf, dlf, 100)
        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid,
                         register = rfid,
                         on_fail = 'compute',
                         reset = reset)

    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # shift the labels down by one (presumably from one based to zero
    # based -- confirm); they index the color table below
    clusters -= 1

    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')
        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)
        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    #now take the largest cluster and do the analysis.
    # cgrps: {cluster label: [(sequence index, label), ...]}
    cgrps = dict([ (k, list(g))
              for k , g  in it.groupby(\
                sorted( list(enumerate(clusters)),key = lambda x: x[1]),
                key = lambda x: x[1])])
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ]
    csize = len(cluster_seqs)
    seqs =[ali[c] for c in cluster_seqs]

    # Disabled plot: the 'if 0' guard means this branch never runs.
    if 0:
        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))]

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n  - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')
        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2),  color = colors, alpha = .2)
        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)

    # sequence indices per cluster, then the alignment entries themselves
    clusters_final = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
def blackbody(reset = False, **kwargs):
    '''Generate a colormap according to a logarithm of the blackbody spectrum.

    Colormap is transformed by an arctangent to place whites at .5 in a
    band with width determined by the [contrast] and then by taking the
    max with a gaussian peaked at 0,1 with width determined by [width]

    The spectrum is read from 'blackbody.tab', located next to this
    source file; only its lines containing '2deg' are used.

    kwargs:
      reset
      contrast  [.64]   adjusts arctangent slope near zero.
      width     [.01]   adjust gaussian thresholding near endpoints.
      flip      [False] Hot areas are blue if flip is false.
      flip_ends [False] Gaussian threshold swaps high and low colors.

    output (on compute):
      a matplotlib LinearSegmentedColormap named 'bb' with 512 nodes

    For a usage example, see compbio/fun/ocean.py'''
    def setBB( **kwargs ):
        dstr = '2deg'
        import inspect
        import os
        import re

        contrast = kwargs.get('contrast', .64)
        width = kwargs.get('width', .01)
        flip_ends = kwargs.get('flip_ends', False)
        flip = kwargs.get('flip',False)

        # directory of the file this function is defined in
        thisdir= os.path.dirname(inspect.stack()[0][1])
        whitespace = re.compile(r'\s+')
        with open(os.path.join(thisdir,'blackbody.tab')) as tabfile:
            bbcols = [re.split(whitespace, l)
                      for l in tabfile.readlines() if dstr in l]

        # fields 7..9 of each selected line are taken as the rgb triple
        rgb = array([array(b[7:10],float) for b in bbcols])

        npts = 512
        ntot = len(rgb)

        xvals = arctan(linspace(-contrast,contrast,npts))/pi*2

        # signed gaussian bumps at both ends of the range
        scaling = exp( - ( 1-abs(linspace(-1,1,npts))) **2 / width)
        scaling *= array([-1 if x <0 else 1 for x in linspace(-1,1,npts)])
        if flip_ends:
            scaling *= -1

        # per point, keep whichever of the two curves is larger in magnitude
        d2 = [xvals[:], scaling[:]]
        inds = argmax(np.abs(d2),0)
        xvals = array([d2[which][i] for i, which in enumerate(inds)])
        xvals = (xvals * .5) + .5

        # f0: relative position of the rgb row with the smallest variance
        f0 = float(argmin(var(rgb,1))) / ntot
        k = -2.0 * (2. * f0 - 1) / (f0**2) /2
        lspace = log(linspace(1, 1 + k,ntot)) / log(1 + k)

        vals =array([ interp(xvals, lspace, [e[i] for e in rgb]) for i in range(3)])
        if flip:
            vals = vals[:,::-1]

        xs=linspace(0,1,npts)
        cdict = dict(
            red =   [ (xs[i], vals[0,i], vals[0,i]) for i in range(npts)],
            green = [ (xs[i], vals[1,i], vals[1,i]) for i in range(npts)],
            blue =  [ (xs[i], vals[2,i], vals[2,i]) for i in range(npts)])
        cmap = matplotlib.colors.LinearSegmentedColormap('bb',cdict)
        return cmap

    out = mem.getOrSet(setBB, reset = reset, **kwargs)
    return out
def eval_seq_group(gap_seqs, rfid, run_id, inp_run_id, reset = True,
                   draw_alis = draw_all_easy,
                   clade_alignment_method = clade_alignment_method,
                   max_structs = 5):
    '''
    Evaluate conservation of candidate structures over a sequence group.

    The structures and energies are loaded with
    butils.load_data(inp_run_id, 'output'); the max_structs entries
    with the largest energy values are used.  For each of them the
    sequences are aligned, a tree is built, ancestors are reconstructed
    and tree_conservation.count_struct is run.

    inputs:
      gap_seqs    gapped sequences of the group
      rfid        family id; used as run id / register of the cached
                  profiles and alignments
      run_id      prefix of the per-structure run ids
      inp_run_id  run whose output supplies structs, energies and seq
      reset       [True] forwarded to mem.getOrSet and the drawing call
      draw_alis   when true, call draw_cm_muscle_congruencies
      clade_alignment_method  only 'cm' is implemented
      max_structs [5] number of structures to evaluate

    output:
      {'seqs': gap_seqs,
       'structs': [{'ref', 'pairs', 'ali', 'muts', 'times', 'gaps',
                    'irresolvables'}, ...]}  one dict per structure

    raises:
      Exception when clade_alignment_method is not 'cm'.
    '''
    rutils = utils
    data = butils.load_data(inp_run_id, 'output')
    structs = data['structs']
    energies = data['energies']

    # keep the max_structs entries with the largest energy values
    esrt = argsort(energies)[::-1]
    s_inds = esrt[:max_structs]
    structs, energies = [structs[i] for i in s_inds], [energies[i] for i in s_inds]

    refseq = data['seq']

    nq = len(gap_seqs)
    names = ['N{1:04}'.format(rfid, idx) for idx in range(nq)]
    seqs = [rutils.ungapped_seq(gap_seqs[i], names[i]) for i in range(nq)]

    profiles = mem.getOrSet(setProfiles,
                            **mem.rc({},
                                     seq = refseq, structs = structs, run_id = rfid,
                                     reset = reset,
                                     on_fail = 'compute',
                                     register = 'tuprof_{0}'.format(rfid)))

    if draw_alis:
        draw_cm_muscle_congruencies(seqs, profiles,
                                    run_id, reset = reset)

    if clade_alignment_method == 'cm':
        alis, refs, all_pairs =\
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles,
                                  run_id = rfid, ali_type = 'struct',
                                  reset = reset,
                                  on_fail = 'compute',
                                  register = 'tuali_struct_{0}'.format(rfid)))
    else:
        raise Exception('No methods besides cm are yet implemented')

    seq_group_data = {}
    seq_group_data['seqs'] = gap_seqs
    seq_group_data['structs'] = []
    for i, struct in enumerate(structs):
        struct_data = {}
        ali = alis[i]
        ref = refs[i]
        pairs = all_pairs[i]

        #NOTE THAT DUE TO AN AWKWARD SYNTAX DECISION,
        #I AM ALLOWING FOR THE POSSIBILITY THAT EACH
        #ALI ELT HAS DIFFERENT PAIRS.
        #
        #ALL OF MY ROUTINES SO FAR ONLY USE A SINGLE
        #PAIR SET AND SO I USE PAIRS[0] EXCLUSIVELY
        struct_data.update(ref = ref[0],
                           pairs = pairs[0],
                           ali = ali)

        rid = '{0}_{1}'.format(run_id, i)

        if clade_tree_method == 'bionj':
            tree = phyml.tree(ali, run_id = rid, bionj = True)
        else:
            # NOTE(review): this branch passes run_id, not the
            # per-structure rid -- confirm that is intended.
            tree = get_phase_tree(ali, pairs[0], run_id)

        # Attach the aligned sequence to every terminal.  The loop
        # variable must not be called i: the structure index i is still
        # needed for the run ids below.
        for ct in tree.get_terminals():
            seq = [rec for rec in ali if rec.id == ct.name][0]
            ct.m = {'seq':seq,
                    'probs':array([1] * len(seq))}

        if clade_ancestor_method == 'independent':
            ml_tree = get_ml_ancestor_tree(tree, ali,
                                           '{0}_paml{1}'.format(run_id, i))
        else:
            ml_tree = get_structure_ancestor_tree(\
                tree, ali,'{0}_stree{1}'.format(run_id, i))

        muts, times, gaps, irresolvables = tree_conservation.count_struct(ml_tree, pairs[0])

        struct_data.update(muts = muts, times = times,
                           gaps = gaps, irresolvables = irresolvables)
        seq_group_data['structs'].append(struct_data)

    return seq_group_data
def c2( launcher = None, ncluster =2000, host = 'tin', reset = 0, step = 10, exemp_time = 'all', doplot = False ,**kwargs):
    '''
    Cluster BDTNP cells in two phases.

    Phase one (launcher is None): pick genes, draw ncluster random
    cells, compute their similarity matrix and return a bcl launcher
    built for eight self-similarity percentiles.

    Phase two (launcher given): read launcher.output(), and for every
    output assign each cell to the most similar exemplar.

    inputs:
      launcher   [None]  the launcher returned by phase one
      ncluster   [2000]  number of randomly drawn cells
      host       ['tin'] forwarded to bcl.launcher
      reset      [0]     forwarded to mem.getOrSet
      step       [10]    plot every step-th cell when doplot is true
      exemp_time ['all'] restrict the drawn cells to this time index
      doplot     [False] scatter-plot the assignments

    output:
      phase one: the launcher
      phase two: (all_members, ct_data) -- per output the index of the
      closest exemplar for every cell, and the (cell, time) index pair
      of every cell
    '''
    # NOTE(review): the block nesting below was reconstructed from a copy
    # of this file that had lost its indentation -- confirm.
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)

    vals = array([v['vals'] for v in mrnas.values()])
    gvars = var(vals, 1)
    gminvars = np.min(gvars,1)
    gmedvars = median(gvars,1)

    # genes that are in the top 20 by both minimum and median variance
    min20 = argsort(gminvars)[::-1][:20]
    med20 = argsort(gmedvars)[::-1][:20]
    int20 = set(min20).intersection(set(med20))
    xgenes = array(list(int20))

    cell_data = vals[xgenes].transpose(1,2,0)
    scd = shape(cell_data)
    #times = reshape(zeros(shape(cell_data[0:2]))[:,:,newaxis , arange(shape(cell_data[1]))
    #                , (prod(shape(cell_data)[0:2])))

    # index pair (first axis, second axis) of every entry, flattened the
    # same way as cell_data
    xycoords = (arange(scd[0])[:,newaxis,newaxis]*[1,0] +\
                    arange(scd[1])[newaxis,:,newaxis]*[0,1])
    cell_data = reshape(cell_data, (prod(shape(cell_data)[0:2]), shape(cell_data)[2] ))
    xy_data = reshape(xycoords, (prod(scd[0:2]),2 ))

    if exemp_time == 'all':
        inds = arange(len(cell_data))
    else:
        inds = arange(len(cell_data))[nonzero(equal(xy_data[:,1],exemp_time))[0]]

    # fixed seed: the same cells are drawn on every call
    np.random.seed(1)
    np.random.shuffle(inds)
    rand_thousand = inds[0:ncluster]

    sim_data = cell_data[rand_thousand]
    sim_xy = xy_data[rand_thousand]

    t = [ mean(sim_data, 0), std(sim_data,0)]
    # NOTE(review): this writes 0 where the value already is 0, i.e. it
    # changes nothing -- confirm what was intended.
    t[1][equal(t[1],0)] = 0

    metric = 'neg_dist'
    sims = similarity(sim_data, transform = t, method = metric)
    name = 'll_{0}_{1}_{2}'.format(metric,ncluster,exemp_time)

    def setLauncher(**kwargs):
        sims= kwargs.get('sims')
        metric = kwargs.get('metric')
        name = kwargs.get('name')
        d_in = []
        # one job per self-similarity percentile
        percs = logspace(.1,1.5,8)
        for p in percs:
            d_in.append(dict(similarities = sims,
                             self_similarity = ss.scoreatpercentile(sims, p),
                             metric = metric
                             ))
        launcher = bcl.launcher(d_in, host = host, name = name)
        return launcher

    if launcher == None:
        output = mem.getOrSet(setLauncher,
                              **mem.rc(dict(sims = sims, metric = metric,
                                            name = name,
                                            hardcopy = True,
                                            reset = reset,
                                            hard_reset = False,)))
        return output

    def setC2(launcher = launcher, **kwargs):
        if launcher == None:
            raise Exception()
        else:
            output = launcher.output()
        return output

    #It appears that the bsub process failed for the first output.
    #No big deal. Debug later.
    # NOTE(review): the key below is spelled 'harcopy', unlike the
    # 'hardcopy' key used in the call above -- confirm.
    output = mem.getOrSet(setC2,
                          **mem.rc(dict(harcopy = True,
                                        launcher = launcher,
                                        reset = reset,
                                        on_fail = 'compute',
                                        hard_reset = False,
                                        name = 'c2'+ name
                                        )))

    all_inds = array([ squeeze(o['inds']) for o in output[:] ])

    xs = misc['x']['vals'][zip(*xy_data)] #zip(*sim_xy)]
    ys = misc['y']['vals'][zip(*xy_data)] #zip(*sim_xy)]
    zs = misc['z']['vals'][zip(*xy_data)] #zip(*sim_xy)]

    colors =array( mycolors.getct(shape(all_inds)[1]) )

    f = plt.figure(0)
    f.clear()

    all_tps = range(scd[1])
    nc = len(all_inds)
    nt = len(all_tps)

    all_members = []
    for i, inds in enumerate(all_inds):
        #compute similarity matrices 1000 at a time:
        exemplars = sim_data[list(set(list(inds)))]
        sim = similarity(cell_data, exemplars,
                         transform = t,
                         method = metric)
        closest = argmax(sim, 1)
        all_members.append(closest)

        if doplot:
            # one small axes per (output, time point) pair
            for j, tp in enumerate(all_tps):
                ax = f.add_axes( [float(j)/nt,float(i) /nc,1./nt, 1. /nc] )
                ax.set_yticks([])
                ax.set_xticks([])
                i_sub = nonzero(equal(xy_data[:,1], j) * greater(ys,0))[0]
                cs = colors[closest[i_sub]]
                x = xs[i_sub]
                z = zs[i_sub]
                plt.scatter(x[::step],z[::step], 40,alpha = .75, c = cs[::step], edgecolor = 'none')

    ct_data = xy_data
    return all_members, ct_data
def dsi_boxplot(num = 1, method = 'tree', reset = False,
                plot_kcs = True, bp_means = False, bp_zeros = True,
                zero_ofs = 1e-6, bp_logs = True, show_kos = True,
                log_scale = True, filter_rows_and_cols = True,
                boxplot = True):
    '''
Summarize the influence grid of a parsed network per experiment class,
write the summary to a text file and save a box plot (or bar plot).

The grid returned by parseNet is indexed (tf, experiment). Experiments
are split into classes by the predicates from sg_choosers(); cells whose
tf is listed in the experiment's 'DeletedGenes' or 'OverexpressedGenes'
description are collected into an extra class named 'ko'.

inputs:
  num, method, reset     forwarded to parseNet; reset is also forwarded
                         to the mem cache
  filter_rows_and_cols   [True] sort tfs by decreasing maximum, drop rows
                         and columns whose maximum is zero, and relabel
                         the '*Genes*' descriptions accordingly
  log_scale              [True] log y axis; also part of the file name
  boxplot                [True] box plot; otherwise a bar plot of medians
  show_kos               only read by the 'twoplots' branch, which cannot
                         run while plot_type is fixed to 'dsi_final'
  plot_kcs, bp_means, bp_zeros, zero_ofs, bp_logs
                         accepted but not used

output:
  None. Writes daniel/txt/net{num}_{method} and a figure under
  daniel/figs/ (paths resolved through cfg.dataPath).
'''
    grid, descriptions = parseNet(num = num, method = method, reset = reset)
    grid = array(grid)
    descriptions = dict(descriptions)
    new_descriptions = {}

    if filter_rows_and_cols:
        #Filter out bad rows and columns
        good_exps = nonzero(np.max(grid, 0))[0]
        tf_new_idxs = list(argsort(np.max(grid, 1))[::-1])
        new_grid = grid[tf_new_idxs]
        good_tfs = nonzero(np.max(new_grid, 1))[0]

        #Relabel the descriptions to take filtration into account
        # NOTE(review): an earlier comment said one is subtracted from the
        # matched number to undo one-based labels, but the matched number
        # is used as-is -- confirm which is intended.
        digits = re.compile(r'(\d+)')

        def relabel(match):
            # Map an old tf index to its position after sorting; numbers
            # that are not tf indices are left untouched.
            old = int(match.group())
            if old in tf_new_idxs:
                return str(tf_new_idxs.index(old))
            return match.group()

        for k, value in descriptions.iteritems():
            if 'Genes' in k:
                new_descriptions[k] = [re.sub(digits, relabel, g)
                                       for g in value]
            else:
                new_descriptions[k] = value
            # Keep only the entries of surviving experiments.
            new_descriptions[k] = list(array(new_descriptions[k])[good_exps])

        new_grid = new_grid[good_tfs, :]
        new_grid = new_grid[:, good_exps]
        grid = new_grid
        descriptions = new_descriptions

    #Make lambdas to split experiments into categories
    col_choosers = sg_choosers()

    #Split experiments
    # One dict per experiment, keyed like descriptions. It does not depend
    # on the chooser, so it is built once.
    vs = [dict(zip(descriptions.keys(), elt))
          for elt in zip(*descriptions.values())]
    exps = {}
    for k, v in col_choosers.iteritems():
        exps[k] = nonzero([v(e) for e in vs])[0]

    #Remove 'general' as the values wind up being all zeros.
    exps.pop('general')

    #Mark experiments that knock out TFS
    # tf_kn_matches[t] lists the experiments whose deleted or
    # overexpressed gene list mentions 'G<t>'.
    tf_kn_matches = []
    for t in range(shape(grid)[0]):
        tag = 'G{0},'.format(t)
        deleted = nonzero([tag in x + ','
                           for x in descriptions['DeletedGenes']])[0]
        overexpressed = nonzero([tag in x + ','
                                 for x in descriptions['OverexpressedGenes']])[0]
        tf_kn_matches.append(sorted(it.chain(deleted, overexpressed)))

    knockout_cells = array(list(it.chain(*[
        [(i, exp) for exp in tf_kn_matches[i]]
        for i in range(len(tf_kn_matches))])))

    kn_exps = {}
    kn_exps['ko'] = []

    def getBPS(**kwargs):
        xlabels = []
        nz_frac_std = []
        nz_frac_mean = []
        nz_val_std = []
        nz_val_mean = []
        nz_colvals = []

        for k, ecols in exps.iteritems():
            these_knockouts = array([c for c in knockout_cells
                                     if c[1] in ecols])
            exp_cells = array([(i, j) for j in ecols
                               for i in arange(shape(grid)[0])])

            # len() instead of comparing an array with [], which depends
            # on how numpy handles the mismatched shapes.
            if len(these_knockouts) > 0:
                # Cells of this class that coincide with a knockout cell.
                kns_found = [c for c in exp_cells
                             if np.sum(greater(
                                 np.product(c == these_knockouts, 1), 0), 0)]
                kn_exps['ko'] += kns_found

            # One array of grid values per experiment column of the class.
            cexp = [grid[zip(*exp_cells[
                        nonzero(equal(exp_cells[:, 1], col))[0]])]
                    for col in ecols]

            if cexp == []:
                # Empty class: record zeros so all lists stay aligned.
                for arr in [nz_frac_std, nz_frac_mean,
                            nz_val_std, nz_val_mean]:
                    arr.append(0.)
                nz_colvals.append([])
                xlabels.append(k)
                continue

            colwise_fracs = [mean(1. * greater(col, 0)) for col in cexp]
            colwise_exprs = [mean(col[nonzero(greater(col, 0))])
                             for col in cexp]
            # A column without positive entries has a nan mean; use 0.
            colwise_exprs = [c if not isnan(c) else 0 for c in colwise_exprs]

            nz_colvals.append(colwise_exprs)
            # Standard errors across the columns of the class.
            nz_frac_std.append(std(colwise_fracs) / sqrt(len(colwise_fracs)))
            nz_frac_mean.append(mean(colwise_fracs))
            nz_val_std.append(std(colwise_exprs) / sqrt(len(colwise_exprs)))
            nz_val_mean.append(mean(colwise_exprs))
            if isnan(nz_val_mean[-1]):
                raise Exception()
            xlabels.append(k)

        for k, ecells in kn_exps.iteritems():
            ecells = array(ecells)
            nz_frac_std.append(0)
            nz_val_std.append(0)
            if len(ecells) == 0:
                for arr in [nz_frac_mean, nz_val_mean]:
                    arr.append(0.)
                nz_colvals.append([])
            else:
                nz_frac_mean.append(mean(greater(grid[zip(*ecells)], 0)))
                nz_val_mean.append(mean(
                    grid[zip(*ecells[greater(grid[zip(*ecells)], 0)])]))
                nz_colvals.append(
                    grid[zip(*ecells[greater(grid[zip(*ecells)], 0)])])
            xlabels.append(k)

        return (xlabels, array(nz_frac_std), array(nz_val_std),
                array(nz_frac_mean), array(nz_val_mean),
                [array(cv) for cv in nz_colvals])

    xlabels, nz_frac_std, nz_val_std, nz_frac_mean, nz_val_mean, nz_colvals = \
        mem.getOrSet(getBPS, on_fail = 'compute', reset = reset)

    # Fixed display order; classes that were not produced are skipped.
    args = [xlabels.index(x)
            for x in ['general_ts', 'drug', 'drug_ts', 'genetic',
                      'genetic_ts', 'drug_genetic', 'drug_genetic_ts', 'ko']
            if x in xlabels]
    # The third target was previously misspelled 'nz_cal_std', which left
    # nz_val_std in its original order.
    xlabels, nz_frac_std, nz_val_std, nz_frac_mean, nz_val_mean = \
        array(xlabels)[args], nz_frac_std[args], nz_val_std[args], \
        nz_frac_mean[args], nz_val_mean[args]
    nz_colvals = [nz_colvals[a] for a in args]

    f = plt.figure(0)
    f.clear()

    # 'with' closes the file even if a write fails.
    with open(cfg.dataPath('daniel/txt/net{0}_{1}'.format(num, method)),
              'w') as topen:
        topen.write('\t'.join(['exp_class', 'mean_influence',
                               'std_influence', 'stderr_influence']) + '\n')
        for idx, exp_class in enumerate(xlabels):
            class_vals = nz_colvals[idx]
            # Standard error is std / sqrt(n), as in getBPS above; this
            # column previously divided by n.
            topen.write('{0}\t{1}\t{2}\t{3}\n'.format(
                exp_class, mean(class_vals), std(class_vals),
                std(class_vals) / sqrt(len(class_vals))))

    plot_type = 'dsi_final'
    if plot_type == 'dsi_final':
        margin = .05
        wid0 = .75
        cs = mycolors.getct(len(nz_colvals))
        ax0 = f.add_axes([margin, margin, wid0, 1. - 2 * margin],
                         title = 'Experminent mean significances: blue (red) lines denote quartiles (media).')
        if log_scale:
            ax0.set_yscale('log')

        # The last class is drawn as a reference line / left out of the
        # bars; the display order above puts 'ko' last when it is present.
        if boxplot:
            ax0.boxplot(nz_colvals[0:-1],
                        widths = [.5] * (len(nz_colvals) - 1))
            ax0.hlines([mean(nz_colvals[-1])], -100, 100,
                       color = 'red', linestyle = ':', linewidth = 1)
        else:
            ax0.bar(.2 + arange(len(nz_colvals[0:-1])),
                    [median(c) for c in nz_colvals[0:-1]],
                    color = cs[:-1])
        # NOTE(review): applied to both plot kinds; the collapsed original
        # does not show whether this belonged to the bar branch only.
        ax0.set_xticklabels(xlabels[:-1])

        f.savefig(cfg.dataPath('daniel/figs/final_bp_net{0}_{1}_{2}.ps'.
                               format(num, method,
                                      'log' if log_scale else 'lin')),
                  dpi = 10)
        return

    elif plot_type == 'twoplots':
        nkeys = len(xlabels)
        if show_kos:
            xi = arange(nkeys)
        else:
            xi = arange(nkeys - 1)

        y1 = nz_val_mean[xi]
        s1 = nz_val_std[xi]
        y2 = nz_frac_mean[xi]
        s2 = nz_frac_std[xi]

        a1 = f.add_subplot(211, ylim = [0, max(y1) + max(s1)],
                           title = 'mean value of nonzero influences\n standard error across experiments')
        a2 = f.add_subplot(212, ylim = [0, max(y2) + max(s2)],
                           title = 'mean values of fraction nonzero influences\n standard error across experiments')
        colors = mycolors.getct(nkeys)
        wofs = .15
        a1.bar(xi + wofs, y1, 1. - wofs * 2, linewidth = 3,
               color = colors, ecolor = 'black')
        a2.bar(xi + wofs, y2, 1. - wofs * 2, linewidth = 3,
               color = colors, ecolor = 'black')

        p1, caps1, bars1 = a1.errorbar(xi + .5, y1, yerr = s1, capsize = 15,
                                       elinewidth = 4, color = 'black',
                                       linewidth = 0, ecolor = 'black')
        p2, caps2, bars2 = a2.errorbar(xi + .5, y2, yerr = s2, capsize = 15,
                                       elinewidth = 4, color = 'black',
                                       linewidth = 0, ecolor = 'black')
        for c in caps1:
            c.set_alpha(1.)
        for c in caps2:
            c.set_color('black')

        for c in a2.get_children() + a1.get_children():
            # Best effort: thicken whatever accepts a line width.
            try:
                if not c in [p1, p2]:
                    c.set_linewidth(4)
            except Exception:
                pass

        a2.set_xticklabels([])
        for i in xi:
            a2.text(float(i) + .5, 0, xlabels[i], rotation = '-15',
                    size = '16', ha = 'left', va = 'top')

        f.savefig(cfg.dataPath('daniel/figs/latest/{1:03d}_{0}_{2}.tiff'.
                               format('no_kos' if not show_kos else 'kos',
                                      num,
                                      'log' if log_scale else 'lin')),
                  format = 'tiff')
        return
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    '''
Compare trees built from muscle alignments with trees built from
structural (cmalign) alignments, and draw both embedded as graphs.

For every pair of alignments (one per ali_type, same position in the
lists returned by setAlignments) this builds a phyml tree from each,
fills the pairwise terminal distance matrices, passes them to
tree_similarity, and saves a two-panel figure of the laid-out trees.

inputs:
  seqs, profiles   forwarded to setAlignments
  run_id           forwarded to setAlignments and phyml.tree, and used
                   in cache register names and output file names
  reset    [True]  forwarded to the mem cache calls

output:
  None. One .ps figure per alignment pair is written under figs/gpm2/
  (path resolved through cfg.dataPath).
'''
    print('computing alignments...')
    print(' ...using muscle')
    malis, mrefs, mpairs = \
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'muscle',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali_musc_{0}'.format(run_id)))
    print(' ...using cmalign.')
    salis, srefs, spairs = \
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali__struct_{0}'.format(run_id)))

    print(' ...making trees.')
    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree = phyml.tree(m, run_id, bionj = True)
        stree = phyml.tree(s, run_id, bionj = True)

        # Matrix row/column of every record id of the muscle alignment;
        # the terminals of both trees are looked up by name in this map.
        maps = dict([(elt.id, i) for i, elt in enumerate(m)])
        mdists = zeros((len(maps), len(maps)))
        sdists = zeros((len(maps), len(maps)))

        mterms = mtree.get_terminals()
        for n1 in mterms:
            for n2 in mterms:
                mdists[maps[n1.name], maps[n2.name]] = \
                    mtree.distance(n1, n2)

        sterms = stree.get_terminals()
        for n1 in sterms:
            for n2 in sterms:
                sdists[maps[n1.name], maps[n2.name]] = \
                    stree.distance(n1, n2)

        # k = n - 1. The original read len(sdists - 1), which subtracts
        # one from every distance before taking the length and so is n.
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = len(sdists) - 1)
        # NOTE(review): same name as the call above -- confirm the two
        # results are not meant to be stored separately.
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = 6)

        f = myplots.fignum(4, (8, 10))
        ct = mycolors.getct(len(mterms))

        # Imported here, as in the original, so the dependency is only
        # needed once there is something to draw.
        import networkx
        for t, sp, ttype in zip([mtree, stree],
                                [211, 212],
                                ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # The layout is computed on an integer-labelled copy and then
            # mapped back onto the original node objects.
            Gi = networkx.convert_node_labels_to_integers(
                G, discard_old_labels = False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            # Only nodes named in maps get a visible size; nodes missing
            # from maps fall back to the last color.
            networkx.draw(G, posn,
                          labels = dict([(n, '') for n in G.nodes()]),
                          node_size = [100 if n.name in maps else 0
                                       for n in G.nodes()],
                          width = 1, edge_color = 'black',
                          ax = a,
                          node_color = [ct[maps.get(n.name, -1)]
                                        for n in G.nodes()])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0, 1], xycoords = 'axes fraction', va = 'top',
                       xytext = [10, 0], textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(
                           t.total_branch_length()),
                       [1, 0], xycoords = 'axes fraction', ha = 'right',
                       xytext = [-10, 10], textcoords = 'offset pixels')

        datafile = cfg.dataPath(
            'figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(
                run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')