def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep.

    The file is parsed as tab-separated text.  The first row is a header
    whose columns from the third onward are taken as the species names.
    Every following row holds a file name, an integer gene size and then one
    float length per species.

    filename   -- path of the matrix file to read
    minlen     -- lengths smaller than this are raised to this value
    maxlen     -- kept for interface compatibility; not used by this function
    nooutliers -- if True, keep only the rows whose summed length is less
                  than five times the mean summed length over all rows

    Returns the tuple (species, lens, gene_sizes, files).
    """
    from rasmus import util

    # Read inside a 'with' block so the file handle is closed as soon as the
    # rows are loaded rather than whenever it happens to be collected.
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    rows = dat[1:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(rows, 1)]
    files = util.cget(rows, 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        # NOTE(review): 'mean' is not defined in this function; it is
        # expected to be available at module level -- confirm.
        m = mean(treelens)
        ind = util.find(lambda x: x < 5 * m, treelens)
        files, gene_sizes, lens, treelens = [
            util.mget(x, ind) for x in (files, gene_sizes, lens, treelens)]

    # Raise very small lengths up to the allowed minimum (in place).
    for row in lens:
        for i, length in enumerate(row):
            if length < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
def read_length_matrix(filename, minlen=.0001, maxlen=1.0, nooutliers=True):
    """Read a length matrix made by spidir-prep.

    The file is parsed as tab-separated text.  The first row is a header
    whose columns from the third onward are taken as the species names.
    Every following row holds a file name, an integer gene size and then one
    float length per species.

    filename   -- path of the matrix file to read
    minlen     -- lengths smaller than this are raised to this value
    maxlen     -- kept for interface compatibility; not used by this function
    nooutliers -- if True, keep only the rows whose summed length is less
                  than five times the mean summed length over all rows

    Returns the tuple (species, lens, gene_sizes, files).
    """
    from rasmus import util

    # Read inside a 'with' block so the file handle is closed as soon as the
    # rows are loaded rather than whenever it happens to be collected.
    with open(filename) as infile:
        dat = [line.rstrip().split("\t") for line in infile]

    species = dat[0][2:]
    rows = dat[1:]
    lens = util.map2(float, util.submatrix(dat, range(1, len(dat)),
                                           range(2, len(dat[0]))))
    gene_sizes = [int(size) for size in util.cget(rows, 1)]
    files = util.cget(rows, 0)

    if nooutliers:
        treelens = [sum(row) for row in lens]
        # NOTE(review): 'mean' is not defined in this function; it is
        # expected to be available at module level -- confirm.
        m = mean(treelens)
        ind = util.find(lambda x: x < 5 * m, treelens)
        files, gene_sizes, lens, treelens = [
            util.mget(x, ind) for x in (files, gene_sizes, lens, treelens)]

    # Raise very small lengths up to the allowed minimum (in place).
    for row in lens:
        for i, length in enumerate(row):
            if length < minlen:
                row[i] = minlen

    return species, lens, gene_sizes, files
def _test_ml(self):
    """Run the HKY branch-length fit one step at a time and plot progress.

    The fit is called 40 times with maxiter=1 on the same tree.  After each
    call the branch lengths of all nodes and the value returned by the fit
    are recorded, then both series are written out as PDF plots.
    """
    # model parameters
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    # input data
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")
    align = fasta.readFasta("test/data/flies.nt/0/0.align")

    # node order is fixed once, using the branch lengths read from the file
    nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)
    likes = []
    dists = []

    util.tic("find ML")
    for _ in range(40):
        logl = spidir.find_ml_branch_lengths_hky(
            tree, util.mget(align, tree.leafNames()), bgfreq, kappa,
            parsinit=False, maxiter=1)
        dists.append([node.dist for node in nodes])
        likes.append(logl)
    util.toc()

    print(likes)

    prep_dir("test/output/ml/")

    # branch lengths: first node sets up the plot, then one line per node
    util.rplot_start("test/output/ml/ml_branches.pdf")
    util.rplot("plot", util.cget(dists, 0),
               ylim=[0, max(dists[0])],
               t="l",
               main="branch length convergence",
               xlab="iterations",
               ylab="branch lengths (sub/site)")
    for series in zip(*dists):
        util.rplot("lines", series)
    util.rplot_end(True)

    print(util.cget(dists, 4))

    # values returned by the fit, one per iteration
    util.rplot_start("test/output/ml/ml_likelihood.pdf")
    util.rplot("plot", likes,
               t="l",
               xlab="iterations",
               ylab="log likelihood",
               main="likelihood convergence")
    util.rplot_end(True)
def _test_ml(self):
    """Run the HKY branch-length fit one step at a time and plot progress.

    The fit is called 40 times with maxiter=1 on the same tree.  After each
    call the branch lengths of all nodes and the value returned by the fit
    are recorded, then both series are written out as PDF plots.
    """
    # model parameters
    bgfreq = [.258, .267, .266, .209]
    kappa = 1.59

    # input data
    tree = treelib.readTree("test/data/flies.nt/0/0.tree")
    align = fasta.readFasta("test/data/flies.nt/0/0.align")

    # node order is fixed once, using the branch lengths read from the file
    nodes = sorted(tree.nodes.values(), key=lambda node: node.dist)
    likes = []
    dists = []

    util.tic("find ML")
    for _ in range(40):
        logl = spidir.find_ml_branch_lengths_hky(
            tree, util.mget(align, tree.leafNames()), bgfreq, kappa,
            parsinit=False, maxiter=1)
        dists.append([node.dist for node in nodes])
        likes.append(logl)
    util.toc()

    print(likes)

    prep_dir("test/output/ml/")

    # branch lengths: first node sets up the plot, then one line per node
    util.rplot_start("test/output/ml/ml_branches.pdf")
    util.rplot("plot", util.cget(dists, 0),
               ylim=[0, max(dists[0])],
               t="l",
               main="branch length convergence",
               xlab="iterations",
               ylab="branch lengths (sub/site)")
    for series in zip(*dists):
        util.rplot("lines", series)
    util.rplot_end(True)

    print(util.cget(dists, 4))

    # values returned by the fit, one per iteration
    util.rplot_start("test/output/ml/ml_likelihood.pdf")
    util.rplot("plot", likes,
               t="l",
               xlab="iterations",
               ylab="log likelihood",
               main="likelihood convergence")
    util.rplot_end(True)
def join_tables(*args, **kwargs):
    """Join several tables into a single new table.

    Each positional argument is a tuple (table_i, key_i, cols_i), where
    key_i is either a column name or a function that maps a table row to a
    unique key.

    Only keys found in every table are kept, in the order in which they
    occur in the first table.  For each kept key one row is built by
    updating a dict with util.subdict(matching_row, cols_i) for every table
    in argument order.

    Keyword arguments:
    headers -- headers for the new table; when omitted they are the
               concatenation of all cols_i

    Returns the new Table (an empty Table() when called without arguments).
    """
    if not args:
        return Table()

    def index_table(table, key):
        """Return (keys in row order, key -> row lookup) for one table."""
        if isinstance(key, str):
            return table.cget(key), table.lookup(key)
        ordered = [key(row) for row in table]
        by_key = dict((key(row), row) for row in table)
        return ordered, by_key

    # the first table fixes the order of the output rows
    first_table, first_key, _ = args[0]
    keys, lookup = index_table(first_table, first_key)
    lookups = [lookup]

    # narrow the key set down to keys shared by all tables
    keyset = set(keys)
    for table, key, _ in args[1:]:
        table_keys, lookup = index_table(table, key)
        keyset = keyset & set(table_keys)
        lookups.append(lookup)

    keys = [k for k in keys if k in keyset]

    # build new table
    if "headers" in kwargs:
        headers = kwargs["headers"]
    else:
        headers = util.concat(*util.cget(args, 2))
    joined = Table(headers=headers)

    for k in keys:
        row = {}
        for (_, _, cols), lookup in zip(args, lookups):
            row.update(util.subdict(lookup[k], cols))
        joined.append(row)

    return joined
def test_local_trees(self):
    """Compare the blocks from iter_local_trees with iter_recomb_blocks.

    An ARG is sampled, both iterators are run over the region 200..1200,
    and the first element of every item yielded by iter_local_trees must
    equal the list produced by iter_recomb_blocks.
    """
    rho = 1.5e-8       # recomb/site/gen
    length = 10000     # length of locus
    k = 10             # number of lineages
    n = 2 * 1e4        # effective popsize

    arg = arglib.sample_arg(k, n, rho, 0, length)

    start, end = 200, 1200
    tree_blocks = util.cget(arglib.iter_local_trees(arg, start, end), 0)
    recomb_blocks = list(arglib.iter_recomb_blocks(arg, start, end))

    self.assertEqual(tree_blocks, recomb_blocks)
def is_contig(db, genes):
    """Returns True if genes are contiguous along chromosome

    db    -- object providing 'regions' (supports membership tests) and
             get_region_pos_full(gene); items 1 and 2 of its result are used
             as the chromosome and the position index respectively
    genes -- sequence of gene identifiers

    Genes that are not in db.regions are ignored.  Zero or one gene is
    always reported as contiguous, and so is the case where none of the
    given genes is known to the database.
    """
    # zero or one gene is trivially contiguous
    if len(genes) <= 1:
        return True

    pos = [db.get_region_pos_full(gene) for gene in genes
           if gene in db.regions]

    # ensure hits are on same chromosome
    if not util.equal(*util.cget(pos, 1)):
        return False

    ind = sorted(util.cget(pos, 2))

    # None of the genes were found in db.regions: there is no position to
    # start from, so report "contiguous" instead of raising IndexError.
    if not ind:
        return True

    # check that each position is present: neighbours must differ by one
    return all(nxt == cur + 1 for cur, nxt in zip(ind, ind[1:]))
def makeFamilyGeneNames(self):
    """Tries to name and describe a family using its genes

    Reads famid, common_name and description for every row of the Genes
    table and groups the rows by famid.  For each family the non-empty
    common names have their digits and "-" characters removed, are passed
    through util.unique, sorted and joined with commas; the descriptions are
    handed to self.getFamDescription.

    Returns a dict mapping famid -> (joined names, description).
    """
    self.cur.execute("""SELECT g.famid, g.common_name, g.description FROM Genes g """)

    def strip_numbering(name):
        """Drop digit and '-' characters from a gene name."""
        return "".join(ch for ch in name
                       if not (ch.isdigit() or ch == "-"))

    families = util.groupby(lambda row: row[0], self.cur)

    familyGeneNames = {}
    for famid, rows in families.iteritems():
        common_names = [name for name in util.cget(rows, 1) if name != ""]
        names = util.unique([strip_numbering(name) for name in common_names])
        names.sort()

        description = self.getFamDescription(util.cget(rows, 2))
        familyGeneNames[famid] = (",".join(names), description)

    return familyGeneNames
def is_contig(db, genes):
    """Returns True if genes are contiguous along chromosome

    db    -- object providing 'regions' (supports membership tests) and
             get_region_pos_full(gene); items 1 and 2 of its result are used
             as the chromosome and the position index respectively
    genes -- sequence of gene identifiers

    Genes that are not in db.regions are ignored.  Zero or one gene is
    always reported as contiguous, and so is the case where none of the
    given genes is known to the database.
    """
    # zero or one gene is trivially contiguous
    if len(genes) <= 1:
        return True

    pos = [db.get_region_pos_full(gene) for gene in genes
           if gene in db.regions]

    # ensure hits are on same chromosome
    if not util.equal(*util.cget(pos, 1)):
        return False

    ind = sorted(util.cget(pos, 2))

    # None of the genes were found in db.regions: there is no position to
    # start from, so report "contiguous" instead of raising IndexError.
    if not ind:
        return True

    # check that each position is present: neighbours must differ by one
    return all(nxt == cur + 1 for cur, nxt in zip(ind, ind[1:]))
def makeFamilyGeneNames(self):
    """Tries to name and describe a family using its genes

    Reads famid, common_name and description for every row of the Genes
    table and groups the rows by famid.  For each family the non-empty
    common names have their digits and "-" characters removed, are passed
    through util.unique, sorted and joined with commas; the descriptions are
    handed to self.getFamDescription.

    Returns a dict mapping famid -> (joined names, description).
    """
    self.cur.execute("""SELECT g.famid, g.common_name, g.description FROM Genes g """)

    def strip_numbering(name):
        """Drop digit and '-' characters from a gene name."""
        return "".join(ch for ch in name
                       if not (ch.isdigit() or ch == "-"))

    families = util.groupby(lambda row: row[0], self.cur)

    familyGeneNames = {}
    for famid, rows in families.iteritems():
        common_names = [name for name in util.cget(rows, 1) if name != ""]
        names = util.unique([strip_numbering(name) for name in common_names])
        names.sort()

        description = self.getFamDescription(util.cget(rows, 2))
        familyGeneNames[famid] = (",".join(names), description)

    return familyGeneNames
def test_sample_coal_recomb(self):
    """Check the observed frequency of each event type from sampling.

    sample_coal_recomb is called 10000 times; the first element of every
    sample is tallied and each relative frequency must match the expected
    value to two decimal places.
    """
    rho = 1.5e-8         # recomb/site/gen
    length = 2000        # length of locus
    k = 10               # number of lineages
    n = 2 * 10000        # effective popsize
    r = rho * length     # recomb/locus/gen
    nsamples = 10000

    samples = [arglib.sample_coal_recomb(k, n, r) for _ in range(nsamples)]

    # turn raw counts per event into relative frequencies
    counts = util.hist_dict(util.cget(samples, 0))
    events = {}
    for event, count in counts.items():
        events[event] = count / float(nsamples)

    expected = {'coal': 0.88146, 'recomb': 0.11854}

    for event, freq in events.items():
        self.assertAlmostEqual(freq, expected[event], places=2)
def test_sample_coal_recomb(self):
    """Check the observed frequency of each event type from sampling.

    sample_coal_recomb is called 10000 times; the first element of every
    sample is tallied and each relative frequency must match the expected
    value to two decimal places.
    """
    rho = 1.5e-8         # recomb/site/gen
    length = 2000        # length of locus
    k = 10               # number of lineages
    n = 2 * 10000        # effective popsize
    r = rho * length     # recomb/locus/gen
    nsamples = 10000

    samples = [arglib.sample_coal_recomb(k, n, r) for _ in range(nsamples)]

    # turn raw counts per event into relative frequencies
    counts = util.hist_dict(util.cget(samples, 0))
    events = {}
    for event, count in counts.items():
        events[event] = count / float(nsamples)

    expected = {'coal': 0.88146, 'recomb': 0.11854}

    for event, freq in events.items():
        self.assertAlmostEqual(freq, expected[event], places=2)
def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment

    For every column the count of the most frequent non-gap character is
    divided by the number of sequences in the alignment, so each entry is a
    fraction rather than a value out of 100.  Columns that contain only gaps
    ("-") score 0.0.  An empty alignment gives an empty list.

    aln -- mapping whose values() are the aligned sequences; the number of
           columns is taken from the first sequence (assumes every sequence
           has at least that length -- TODO confirm)
    """
    seqs = list(aln.values())
    if not seqs:
        # no sequences means no columns to score
        return []

    nseqs = float(len(aln))
    percids = []

    for i in xrange(len(seqs[0])):
        # presumably a character -> count mapping for column i
        chars = util.hist_dict(util.cget(seqs, i))

        # gaps never count as a matching character
        if "-" in chars:
            del chars["-"]

        if chars:
            percids.append(max(chars.values()) / nseqs)
        else:
            percids.append(0.0)

    return percids
def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment

    For every column the count of the most frequent non-gap character is
    divided by the number of sequences in the alignment, so each entry is a
    fraction rather than a value out of 100.  Columns that contain only gaps
    ("-") score 0.0.  An empty alignment gives an empty list.

    aln -- mapping whose values() are the aligned sequences; the number of
           columns is taken from the first sequence (assumes every sequence
           has at least that length -- TODO confirm)
    """
    seqs = list(aln.values())
    if not seqs:
        # no sequences means no columns to score
        return []

    nseqs = float(len(aln))
    percids = []

    for i in xrange(len(seqs[0])):
        # presumably a character -> count mapping for column i
        chars = util.hist_dict(util.cget(seqs, i))

        # gaps never count as a matching character
        if "-" in chars:
            del chars["-"]

        if chars:
            percids.append(max(chars.values()) / nseqs)
        else:
            percids.append(0.0)

    return percids
def drawTreeLogl(tree, out=None, events=None, baserate=1.0):
    """Draw a tree with a likelihood label on every node.

    Each label shows the node name, its branch length and node.data["logl"]
    ("*" when absent), followed by the flags "E" (node.data has "extra") and
    "U" (node.data has "unfold").  When node.data has "params", a second
    line gives a mean and sdev: columns 0 and 1 of the params, each
    multiplied element-wise by the averaged node.data["fracs"], summed and
    scaled by the base rate.  Summary values from tree.data are sent to
    debug() before the tree is drawn with treelib.drawTree.

    tree     -- tree whose nodes provide 'name', 'dist' and 'data'
    out      -- output passed to treelib.drawTree; defaults to DEBUG
    events   -- optional mapping node -> text appended to that node's label
    baserate -- scale for mean/sdev; replaced by tree.data["baserate"] when
                that key is present
    """
    if out is None:
        out = DEBUG
    if events is None:
        events = {}

    if "baserate" in tree.data:
        baserate = tree.data["baserate"]

    labels = {}
    for node in tree.nodes.values():
        notes = ""
        if "extra" in node.data:
            notes += "E"
        if "unfold" in node.data:
            notes += "U"

        if "logl" in node.data:
            if isinstance(node.data["logl"], float):
                labels[node.name] = "[%s]\n%.3f (%.3f) %s" % \
                    (node.name, node.dist, node.data["logl"], notes)
            else:
                labels[node.name] = "[%s]\n%.3f (%s) %s" % \
                    (node.name, node.dist, str(node.data["logl"]), notes)
        else:
            labels[node.name] = "[%s]\n%.3f (*) %s" % \
                (node.name, node.dist, notes)

        if "params" in node.data:
            # Reset per node so the handler below never reports a value left
            # over from an earlier node, nor fails on an unbound name.
            fracs = None
            try:
                fracs = [stats.mean(col)
                         for col in zip(*node.data["fracs"])]
                params = node.data["params"]
                mu = sum(util.vmul(util.cget(params, 0), fracs))
                sdev = sum(util.vmul(util.cget(params, 1), fracs))

                mu *= baserate
                sdev *= baserate

                labels[node.name] += "\n%.3f %.3f" % (mu, sdev)
            except Exception:
                # Best effort: a node whose parameters cannot be summarised
                # is reported and drawn without the extra label line.
                print("%s %s" % (fracs, node.data['params']))

        if node in events:
            labels[node.name] += " %s" % events[node]

    if "logl" in tree.data:
        debug("logl: %f" % tree.data["logl"])
        debug("eventlogl: %f" % tree.data["eventlogl"])
        debug("errorlogl: %f" % tree.data["errorlogl"])
    debug("baserate: %f" % baserate)
    debug("treelen: %f" % sum(x.dist for x in tree.nodes.values()))
    if "error" in tree.data:
        debug("error: %f" % tree.data["error"])

    treelib.drawTree(tree, minlen=20, maxlen=100, labels=labels, spacing=4,
                     labelOffset=-3, out=out)