def draw_events(canvas, tree, coords, events, losses,
                lossColor=(0, 0, 1), dupColor=(1, 0, 0), size=4):
    # draw duplications
    for node in tree:
        x, y = coords[node]
        if events[node] == "dup":
            canvas.rect(x - size/2.0, y - size/2.0, size, size,
                        fillColor=dupColor, strokeColor=(0, 0, 0, 0))

    # draw losses
    losses_per_branch = util.hist_dict([node for node, schild in losses])

    for node, nlosses in losses_per_branch.iteritems():
        if node.parent == None:
            continue

        x1 = coords[node.parent][0]
        x2, y1 = coords[node]
        step = (x2 - x1) / float(nlosses + 1)

        for x in util.frange(x1 + step, x2 - (step/2.0), step):
            canvas.line(x, y1 - size, x, y1 + size, color=lossColor)

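# Note on the shared helper: every snippet in this listing relies on
# util.hist_dict from the rasmus package, which is assumed here to behave
# like a plain histogram (item -> count), similar to collections.Counter.
# A minimal stand-in for trying the snippets outside rasmus might look like
# this (a sketch, not the library's implementation):
def hist_dict_sketch(items):
    """Count occurrences of each item, returning a plain dict."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return counts
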
def find_orthologs(gtree, stree, recon, counts=True):
    """Find all ortholog pairs within a gene tree"""

    events = label_events(gtree, recon)
    orths = []

    for node, event in events.items():
        if event == "spec":
            leavesmat = [x.leaves() for x in node.children]
            sp_counts = [util.hist_dict(util.mget(recon, row))
                         for row in leavesmat]

            for i in range(len(leavesmat)):
                for j in range(i+1, len(leavesmat)):
                    for gene1 in leavesmat[i]:
                        for gene2 in leavesmat[j]:
                            if gene1.name > gene2.name:
                                g1, g2 = gene2, gene1
                                a, b = j, i
                            else:
                                g1, g2 = gene1, gene2
                                a, b = i, j

                            if not counts:
                                orths.append((g1.name, g2.name))
                            else:
                                orths.append((g1.name, g2.name,
                                              sp_counts[a][recon[g1]],
                                              sp_counts[b][recon[g2]]))

    return orths

def draw_events(self):
    # draw duplications
    dups = [color(*self.dup_color)]
    for node in self.tree:
        if self.events[node] == "dup":
            dups.append(
                zoom_clamp(
                    shapes.box(node.x - .5, node.y - .5,
                               node.x + .5, node.y + .5),
                    link=True, link_type="smaller",
                    maxx=8, minx=1, maxy=8, miny=1,
                    origin=(node.x, node.y),
                    prezoom=(self.xscale, 1.0)))

    # draw losses
    losses_per_branch = util.hist_dict(
        [node for node, schild in self.losses])

    losses = [color(*self.loss_color)]
    for node, nlosses in losses_per_branch.iteritems():
        if node.parent == None:
            continue

        x1 = node.parent.x
        x2 = node.x
        step = (x2 - x1) / float(nlosses + 1)

        for x in util.frange(x1 + step, x2 - (step/2.0), step):
            losses.append(lines(x, node.y - .2, x, node.y + .2))

    return group(group(*dups), group(*losses))

def histtab(items, headers=None, item="item", count="count",
            percent="percent", cols=None):
    """Make a histogram table."""

    if cols is not None:
        # items is a Table
        items = items.as_tuples(cols=cols)
        if headers is None:
            headers = cols + [count, percent]

    if headers is None:
        headers = [item, count, percent]

    h = util.hist_dict(items)
    tab = Table(headers=headers)
    tot = float(sum(h.itervalues()))
    hist_items = h.items()

    if cols is not None:
        for key, val in hist_items:
            row = dict(zip(cols, key))
            row[count] = val
            tab.append(row)
    else:
        for key, val in hist_items:
            tab.append({item: key, count: val})

    if percent is not None:
        for i, (key, val) in enumerate(hist_items):
            tab[i][percent] = val / tot

    tab.sort(col=count, reverse=True)

    return tab

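# Usage sketch for histtab (assumes the surrounding table module's Table
# class; the row layout below is an illustration of the expected shape,
# not verified output):
#
#   tab = histtab(["a", "a", "a", "b"])
#   # expected rows, sorted by count descending:
#   #   {"item": "a", "count": 3, "percent": 0.75}
#   #   {"item": "b", "count": 1, "percent": 0.25}
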
def mode(vals):
    """Computes the mode of a list of numbers"""
    top = 0
    topkey = None
    for key, val in util.hist_dict(vals).iteritems():
        if val > top:
            top = val
            topkey = key
    return topkey

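# Usage sketch for mode() (assumes rasmus.util is importable and a Python 2
# interpreter, because of iteritems). Ties are broken by dict iteration
# order, since only a strictly larger count replaces the current mode:
values = [1, 2, 2, 3, 3, 3]
assert mode(values) == 3  # 3 occurs most often
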
def make_pep_colors(prop2color=prop2color):
    pep_colors = util.Dict(default=color(.5, .5, .5))

    AA = 'ARNDCEQGHILKMFPSTWYVU*'
    pep_per_prop = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))

    prop_counts = util.Dict(default=0)
    for char in AA:
        prop = seqlib.AA_PROPERTY[char]
        tint = prop_counts[prop] / float(pep_per_prop[prop])
        pep_colors[char] = prop2color(prop, tint * .5)
        prop_counts[prop] += 1

    return pep_colors

def test_sample_coal_recomb(self):
    rho = 1.5e-8   # recomb/site/gen
    l = 2000       # length of locus
    k = 10         # number of lineages
    n = 2 * 10000  # effective popsize
    r = rho * l    # recomb/locus/gen

    nsamples = 10000
    samples = [arglib.sample_coal_recomb(k, n, r)
               for i in range(nsamples)]
    events = dict(
        (event, count / float(nsamples))
        for event, count in util.hist_dict(util.cget(samples, 0)).items())
    expected = {'coal': 0.88146, 'recomb': 0.11854}
    for key, value in events.items():
        self.assertAlmostEqual(value, expected[key], places=2)

def find_four_fold(aln):
    """Returns index of all columns in alignment that are completely
    fourfold degenerate

    Assumes that columns are already filtered for aligned codons
    """

    # create peptide alignment
    pepAln = mapalign(aln, valfunc=translate)

    # find peptide conservation
    pepcons = []
    pep = []
    for i in xrange(pepAln.alignlen()):
        # get a column from the peptide alignment
        col = [seq[i] for seq in pepAln.itervalues()]

        # compute the histogram of the column.
        # ignore gaps '-' and non-translated 'X'
        hist = util.hist_dict(col)
        if "-" in hist:
            del hist["-"]
        if "X" in hist:
            del hist["X"]

        # column is conserved if only one AA appears
        if len(hist) == 1:
            pepcons.append(True)
            pep.append(hist.keys()[0])
        else:
            pepcons.append(False)
            pep.append("X")

    # find four-fold sites in conserved peptides
    ind = []
    for i in range(0, len(aln.values()[0]), 3):
        # process only those columns that are conserved at the peptide level
        if pepcons[i//3]:
            degen = AA_DEGEN[pep[i//3]]
            for j in range(3):
                if degen[j] == 4:
                    ind.append(i+j)

    return ind

def debug_test3():
    stree = treelib.read_tree('examples/nbin.stree')  # run from ../ of this directory
    for node in stree:
        node.dist *= 1e7  # gen per myr
    popsize = 2e7
    freq = 1e0
    dr = .0000012 / 1e7  # .0012/1e7
    lr = .0000011 / 1e7  # .0006/1e7
    freqdup = freqloss = .05
    forcetime = 1e7

    for node in stree:
        print node.name, node.dist, len(node.children)
    print

    locus_tree, locus_extras = sim_DLILS_gene_tree(stree, popsize, freq,
                                                   dr, lr,
                                                   freqdup, freqloss,
                                                   forcetime)

    for node in locus_tree:
        print node.name, node.dist, len(node.children)
    print

    logged_locus_tree, logged_extras = locus_to_logged_tree(locus_tree, popsize)
    daughters = logged_extras[0]
    pops = logged_extras[1]

    coal_tree, coal_recon = dlcoal.sample_locus_coal_tree(
        logged_locus_tree, n=pops, daughters=daughters,
        namefunc=lambda x: logged_extras[2][x] + '_' + str(x))

    # begin debug
    print coal_tree.leaf_names()
    try:
        # print set(coal_tree) - set(coal_tree.postorder())
        treelib.assert_tree(coal_tree)
    except AssertionError:
        print 'assertion error thrown on coal_tree being a proper tree'
        from rasmus import util
        hd = util.hist_dict(x.name for x in coal_tree.postorder())
        for key in hd.keys():
            print key if hd[key] > 1 else '',
        print
        print len(coal_tree.nodes) - len(list(coal_tree.postorder()))

def find_xenologs(gtree, stree, recon, events, trans,
                  counts=True, species_branch=False):
    """Find all xenolog pairs within a gene tree

    NOTE: THIS HAS NOT BEEN TESTED!!!
    """
    xenos = []
    for node, event in events.items():
        if event == "trans":
            assert len(node.children) == 2
            if trans[node] == node.children[0]:
                children = (node.children[1], node.children[0])
            else:
                children = node.children

            leavesmat = [x.leaves() for x in children]
            sp_counts = [util.hist_dict(util.mget(recon, row))
                         for row in leavesmat]

            for i in range(len(leavesmat)):
                for j in range(i + 1, len(leavesmat)):
                    for gene1 in leavesmat[i]:
                        for gene2 in leavesmat[j]:
                            g1, g2 = gene1, gene2
                            a, b = i, j

                            xeno = [g1.name, g2.name]
                            if counts:
                                xeno.extend([sp_counts[a][recon[g1]],
                                             sp_counts[b][recon[g2]]])
                            if species_branch:
                                xeno.append(recon[node])
                            # append this pair (not the accumulated list)
                            xenos.append(tuple(xeno))

    return xenos

def calc_conservation(aln):
    """Returns a list of percent matching in each column of an alignment"""

    length = len(aln.values()[0])
    seqs = aln.values()
    percids = []

    # find identity positions
    for i in xrange(length):
        chars = util.hist_dict(util.cget(seqs, i))
        if "-" in chars:
            del chars["-"]

        if len(chars) == 0:
            percids.append(0.0)
        else:
            pid = max(chars.values()) / float(len(aln))
            percids.append(pid)
    return percids

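# Usage sketch for calc_conservation with a plain dict standing in for the
# alignment object (assumes util.cget(seqs, i) returns column i of the
# sequences, as its use in the other snippets suggests):
aln = {"seq1": "ACGT",
       "seq2": "ACGA"}
assert calc_conservation(aln) == [1.0, 1.0, 1.0, 0.5]
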
def getFamDescription(self, descriptions):
    # TODO: remove this hardcoding
    rmdesc = set(["",
                  "Predicted ORF from Assembly 19",
                  "Predicted ORF in Assemblies 19 and 20",
                  "ORF Predicted by Annotation Working Group",
                  "possibly spurious ORF (Annotation Working Group prediction)"])

    descs = []
    for d in descriptions:
        descs.extend(d.split("; "))
    descs = filter(lambda x: x not in rmdesc, descs)

    items = util.hist_dict(descs).items()
    items.sort(key=lambda x: x[1], reverse=True)

    desc = "; ".join(["%s[%d]" % item for item in items])

    return desc

def histtab(items, headers=["item", "count", "percent"]):
    h = util.hist_dict(items)
    tab = Table(headers=headers)
    tot = float(sum(h.itervalues()))

    if len(headers) == 2:
        for key, val in h.items():
            tab.append({headers[0]: key,
                        headers[1]: val})
    elif len(headers) == 3:
        for key, val in h.items():
            tab.append({headers[0]: key,
                        headers[1]: val,
                        headers[2]: val / tot})
    else:
        raise Exception("Wrong number of headers (2 or 3 only)")

    tab.sort(col=headers[1], reverse=True)

    return tab

def histtab(items, headers=["item", "count", "percent"]):
    h = util.hist_dict(items)
    tab = Table(headers=headers)
    tot = float(len(items))

    if len(headers) == 2:
        for key, val in h.items():
            tab.append({headers[0]: key,
                        headers[1]: val})
    elif len(headers) == 3:
        for key, val in h.items():
            tab.append({headers[0]: key,
                        headers[1]: val,
                        headers[2]: val / tot})
    else:
        raise Exception("Wrong number of headers (2 or 3 only)")

    tab.sort(col=headers[1], reverse=True)

    return tab

def num_redundant_topology(node, gene2species, leaves=None, all_leaves=False):
    """Returns the number of 'redundant' topologies"""

    if leaves is None:
        leaves = node.leaves()
    leaves = set(leaves)
    colors = {}
    nmirrors = [0]

    def walk(node):
        if node in leaves:
            colors[node] = phylo.hash_tree(node, gene2species)
        else:
            # recurse
            for child in node.children:
                walk(child)

            childHashes = util.mget(colors, node.children)
            if len(childHashes) > 1 and util.equal(*childHashes):
                nmirrors[0] += 1

            childHashes.sort()
            colors[node] = phylo.hash_tree_compose(childHashes)
    walk(node)

    colorsizes = util.hist_dict(util.mget(colors, leaves)).values()

    if all_leaves:
        val = stats.factorial(len(leaves))
    else:
        val = 1
        for s in colorsizes:
            if s > 1:
                val *= stats.factorial(s)
    # print "py val=", val, "nmirrors=", nmirrors[0]
    return val / (2**nmirrors[0])

def isOne2one(part, gene2species):
    counts = util.hist_dict(map(gene2species, part))
    return (max(counts.values()) == 1)

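# Usage sketch for isOne2one with a hypothetical gene2species mapping
# (gene names of the form "<species>_<id>" are an assumption for this
# example, not from the source):
gene2species = lambda gene: gene.split("_")[0]
assert isOne2one(["human_g1", "mouse_g1", "dog_g1"], gene2species)
assert not isOne2one(["human_g1", "human_g2", "mouse_g1"], gene2species)
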
def gcContent(seq):
    hist = util.hist_dict(seq)
    # assumes all four bases occur at least once in seq
    total = hist["A"] + hist["C"] + hist["T"] + hist["G"]
    return (hist["C"] + hist["G"]) / float(total)

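# Usage sketch for gcContent; note the hist[...] lookups raise KeyError if
# any of the four bases is missing from the sequence:
assert abs(gcContent("ACGTGGCC") - 0.75) < 1e-9  # 6 of 8 bases are G or C
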