def read_dlcoal_recon(filename, stree,
                      exts={"coal_tree": ".coal.tree",
                            "coal_recon": ".coal.recon",
                            "locus_tree": ".locus.tree",
                            "locus_recon": ".locus.recon",
                            "daughters": ".daughters"},
                      filenames={}):
    """Reads a reconciled gene tree from files

    filename  -- common path prefix of the reconciliation files
    stree     -- species tree passed to the locus reconciliation reader
    exts      -- extension appended to `filename` for each component
    filenames -- optional explicit path per component; when a key is
                 present it is used instead of `filename + exts[key]`

    Returns (coal_tree, extra), where `extra` maps "locus_tree",
    "coal_recon", "locus_recon", "locus_events" and "daughters" to the
    objects read from the corresponding files.

    The dict defaults are only read, never modified.
    """
    def path(key):
        # explicit entry in `filenames` wins over prefix + extension
        return filenames.get(key, filename + exts[key])

    extra = {}

    # trees
    coal_tree = treelib.read_tree(path("coal_tree"))
    extra["locus_tree"] = treelib.read_tree(path("locus_tree"))

    # recons
    # only the mapping is kept for the coalescent tree; its events are
    # discarded
    extra["coal_recon"], _ = phylo.read_recon_events(
        path("coal_recon"), coal_tree, extra["locus_tree"])

    # Both the mapping and the events are unpacked here, so this must be
    # read with read_recon_events as well (it yields a pair, as above).
    extra["locus_recon"], extra["locus_events"] = phylo.read_recon_events(
        path("locus_recon"), extra["locus_tree"], stree)

    # each string in the daughters file is looked up as a locus tree node
    extra["daughters"] = set(
        extra["locus_tree"].nodes[x]
        for x in util.read_strings(path("daughters")))

    return coal_tree, extra
def read(self, filename, stree,
         exts={"coal_tree": ".coal.tree",
               "coal_recon": ".coal.recon",
               "locus_tree": ".locus.tree",
               "locus_recon": ".locus.recon",
               "daughters": ".daughters"},
         filenames={}, check=True):
    """Reads a reconciled gene tree from files

    Each component is loaded from `filenames[key]` when given, otherwise
    from `filename + exts[key]`.  The locus tree, both reconciliations,
    the locus events and the daughters set are stored on `self`.  When
    `check` is true the result is asserted valid via self.is_valid().

    Returns (coal_tree, self.get_dict()).
    """
    def _path(key):
        return filenames.get(key, filename + exts[key])

    # trees
    coal_tree = treelib.read_tree(_path("coal_tree"))
    self.locus_tree = treelib.read_tree(_path("locus_tree"))

    # recons (events of the coalescent reconciliation are not kept)
    self.coal_recon, _unused_events = phylo.read_recon_events(
        _path("coal_recon"), coal_tree, self.locus_tree)
    self.locus_recon, self.locus_events = phylo.read_recon_events(
        _path("locus_recon"), self.locus_tree, stree)

    # daughters: locus tree nodes named in the daughters file
    daughter_names = util.read_strings(_path("daughters"))
    self.daughters = set(self.locus_tree.nodes[x] for x in daughter_names)

    assert not check or self.is_valid(coal_tree)

    return coal_tree, self.get_dict()
def read_dlcoal_recon(filename, stree,
                      exts={"coal_tree": ".coal.tree",
                            "coal_recon": ".coal.recon",
                            "locus_tree": ".locus.tree",
                            "locus_recon": ".locus.recon",
                            "daughters": ".daughters"},
                      filenames={}):
    """Reads a reconciled gene tree from files

    Every component comes from `filenames[key]` if present, otherwise
    from `filename + exts[key]`.

    Returns (coal_tree, extra); `extra` holds the locus tree, the two
    reconciliations, the locus events and the set of daughter nodes.
    """
    def locate(part):
        return filenames.get(part, filename + exts[part])

    # trees
    coal_tree = treelib.read_tree(locate("coal_tree"))
    locus_tree = treelib.read_tree(locate("locus_tree"))

    # recons; the coalescent events are read but not returned
    coal_recon, _coal_events = phylo.read_recon_events(
        locate("coal_recon"), coal_tree, locus_tree)
    locus_recon, locus_events = phylo.read_recon_events(
        locate("locus_recon"), locus_tree, stree)

    # daughters file entries are looked up as locus tree node names
    daughters = set(locus_tree.nodes[name]
                    for name in util.read_strings(locate("daughters")))

    extra = {}
    extra["locus_tree"] = locus_tree
    extra["coal_recon"] = coal_recon
    extra["locus_recon"] = locus_recon
    extra["locus_events"] = locus_events
    extra["daughters"] = daughters
    return coal_tree, extra
def test_nonbinary_trees(self):
    """The MulRF cost of the nonBinaryAll example gene tree is 6."""
    model = MulRFModel(extra=None)

    # load the example inputs (mapping, species tree, gene tree)
    mapping = phylo.read_gene2species(
        "../../../examples/test/nonBinaryAll.smap")
    species_tree = treelib.read_tree(
        '../../../examples/test/nonBinaryAll.stree')
    gene_tree = treelib.read_tree(
        '../../../examples/test/nonBinaryAll.gtree')

    model.stree = species_tree
    model.gene2species = mapping

    cost = model.compute_cost(gene_tree)
    self.assertEqual(cost, 6)
def test_smap_error(self):
    """optimize_model raises for the 24Hits trees."""
    model = MulRFModel(extra=None)

    # NOTE(review): this mapping is read but never handed to the model;
    # confirm whether that is intended.
    gene2species = phylo.read_gene2species(
        "../../../examples/test/nonBinaryAll.smap")

    species_tree = treelib.read_tree('../../../examples/test/24Hits.stree')
    gene_tree = treelib.read_tree('../../../examples/test/24Hits.gtree')

    with self.assertRaises(Exception):
        model.optimize_model(gene_tree, species_tree, None)
def test_null_trees(self):
    """compute_cost raises AttributeError for the EmptyTree input."""
    model = MulRFModel(extra=None)

    # both trees are read from the same EmptyTree.stree file
    empty_path = '../../../examples/test/EmptyTree.stree'
    species_tree = treelib.read_tree(empty_path)
    gene_tree = treelib.read_tree(empty_path)
    mapping = phylo.read_gene2species("../../../examples/test/24Hits.smap")

    model.stree = species_tree
    model.gene2species = mapping

    with self.assertRaises(AttributeError):
        model.compute_cost(gene_tree)
def test_deep(self):
    """The deep coalescence cost of the test1 example gene tree is 2."""
    model = DeepCoalescenceModel(extra=None)

    # load the example inputs (mapping, species tree, gene tree)
    mapping = phylo.read_gene2species("../../../examples/test/24Hits.smap")
    species_tree = treelib.read_tree('../../../examples/test/test1.stree')
    gene_tree = treelib.read_tree('../../../examples/test/test1.gtree')

    model.stree = species_tree
    model.gene2species = mapping

    cost = model.compute_cost(gene_tree)
    self.assertEqual(cost, 2)
def addFamilies(self, eventsfile, discard=[]):
    """Insert one row per family of `eventsfile` into the Families table.

    eventsfile -- table file with (at least) the columns "partid",
                  "famrate", "dup", "loss" and "genes"
    discard    -- family ids to leave out (only read, never modified)

    The Families table is created first if it does not exist.  For each
    kept family the tree and FASTA files are read to compute the total
    branch length and the median sequence length (stored times 3).

    Values are bound as statement parameters rather than formatted into
    the SQL text, so quotes inside ids or names cannot break or alter the
    statement.
    NOTE(review): assumes the DB-API driver behind self.cur uses the
    "?" (qmark) parameter style, as sqlite3/pysqlite2 do -- confirm.
    """
    if not tableExists(self.cur, "Families"):
        self.makeFamiliesTable()

    util.tic("add families")

    events_tab = tablelib.read_table(eventsfile)
    familyGeneNames = self.makeFamilyGeneNames()
    discard = set(discard)

    insert_sql = "INSERT INTO Families VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);"

    for row in events_tab:
        famid = row["partid"]
        if famid in discard:
            util.logger("discarding '%s'" % famid)
            continue

        tree = treelib.read_tree(self.getTreeFile(famid))
        treelen = sum(x.dist for x in tree)
        seqs = fasta.read_fasta(self.getFastaFile(famid))
        seqlen = stats.median([len(seq) for seq in seqs.values()])

        # families without a known name get two empty strings
        names = familyGeneNames.get(famid, ("", ""))

        # same conversions the old %s / %f / %d format codes applied
        self.cur.execute(insert_sql, (
            "%s" % famid,
            "%s" % names[0],
            float(row["famrate"]),
            float(treelen),
            float(seqlen * 3),
            int(row["dup"]),
            int(row["loss"]),
            int(row["genes"]),
            "%s" % names[1]))

    util.toc()
def test_reorder(self):
    """Test reordering of tree children."""
    tree = read_tree(StringIO("((a,b),(c,d));"))
    tree2 = read_tree(StringIO("((d,c),(b,a));"))

    # the two trees serialize differently before reordering
    before1 = tree.get_one_line_newick()
    before2 = tree2.get_one_line_newick()
    self.assertTrue(before1 != before2)

    reorder_tree(tree, tree2)

    # ... and identically afterwards
    after1 = tree.get_one_line_newick()
    after2 = tree2.get_one_line_newick()
    self.assertEqual(after1, after2)
def trainTree(conf, stree, gene2species):
    """Learn SPIDIR parameters from the tree files named in conf["REST"].

    Each argument is expanded with util.shellparser, every resulting tree
    file is read, and the two branches below each root are set to half of
    their combined length.  The learned parameters are written to
    conf["param"].
    """
    treefiles = []
    for arg in conf["REST"]:
        treefiles.extend(util.shellparser(arg))

    util.tic("reading trees")
    trees = []
    prog = progress.ProgressBar(len(treefiles))
    for treefile in treefiles:
        prog.update()
        tree = treelib.read_tree(treefile)
        trees.append(tree)

        # even out top two branches
        top = tree.root.children
        totlen = top[0].dist + top[1].dist
        top[0].dist = totlen / 2.0
        top[1].dist = totlen / 2.0
    util.toc()

    params = Spidir.learnModel(trees, stree, gene2species,
                               conf["trainstats"], filenames=treefiles)
    Spidir.writeParams(conf["param"], params)
def bionj(aln=None, labels=None, distmat=None, seqtype="pep", verbose=True):
    """Build a tree with the external `bionj` program.

    aln     -- alignment; used to compute distances when `distmat` is not
               given, and as the source of labels when `labels` is None
    labels  -- names for the rows of `distmat`
    distmat -- precomputed distance matrix (optional)
    seqtype -- "pep" selects phylip.protdist, anything else
               phylip.dnadist
    verbose -- passed through to the distance program

    Returns the tree read from bionj's output, renamed with the labels.
    The two temporary files are removed even when a step fails.
    """
    # make temp files
    distfile = util.tempfile(".", "bionj-in", ".dist")
    treefile = util.tempfile(".", "bionj-out", ".tree")

    try:
        # find distances and then NJ tree
        if distmat is not None:
            phylip.write_dist_matrix(distmat, out=distfile)

            if labels is None:
                labels = aln.keys()
        else:
            if seqtype == "pep":
                labels = phylip.protdist(aln, distfile, verbose=verbose)
            else:
                labels = phylip.dnadist(aln, distfile, verbose=verbose)

        # bionj is fed the input and output file names on stdin
        os.system("echo -n '%s\n%s' | bionj > /dev/null"
                  % (distfile, treefile))
        tree = treelib.read_tree(treefile)
        phylip.rename_tree_with_names(tree, labels)
        return tree
    finally:
        # clean up, whether or not the steps above succeeded
        for tmp in (distfile, treefile):
            if os.path.exists(tmp):
                os.remove(tmp)
def test2(self):
    """Draw tree 2 with its branch reconciliation into tree.svg."""
    outdir = 'test/tmp/test_vistrans/Vis_test2/'
    make_clean_dir(outdir)

    species_tree = treelib.parse_newick(stree_newick)
    gene_tree = treelib.read_tree(treefile2)
    reconciliation = phylo.read_brecon(breconfile2, gene_tree, species_tree)

    svg_path = outdir + "tree.svg"
    transsvg.draw_tree(gene_tree, reconciliation, species_tree,
                       filename=svg_path)
def test3(self):
    """Add implied speciation nodes to tree 3, then write and draw it.

    The reconciliation is written to '<outdir>brecon' and the drawing to
    '<outdir>tree.svg'.
    """
    outdir = 'test/tmp/test_vistrans/Vis_test3/'
    make_clean_dir(outdir)

    stree = treelib.parse_newick(stree_newick)
    tree = treelib.read_tree(treefile3)
    brecon = phylo.read_brecon(breconfile3, tree, stree)

    phylo.add_implied_spec_nodes_brecon(tree, brecon)

    # close the output file deterministically instead of leaving the
    # handle to the garbage collector
    with open(outdir + 'brecon', 'w') as out:
        phylo.write_brecon(out, brecon)

    transsvg.draw_tree(tree, brecon, stree, filename=outdir + "tree.svg")
def debug_test1():
    """Simulate one gene tree on the flies species tree (debug helper).

    Branch lengths of the species tree are multiplied by 1e7 before the
    simulation.

    Returns (stree, ltree, ex): the rescaled species tree and the two
    values returned by sim_DLILS_gene_tree.
    """
    stree = treelib.read_tree('../examples/flies.stree')
    for node in stree:
        node.dist *= 1e7  # gen per myr

    popsize = 2e7
    freq = 1e0
    dr = .0012/1e7
    lr = .0006/1e7
    freqdup = freqloss = .05
    forcetime = 1e7

    ltree, ex = sim_DLILS_gene_tree(stree, popsize, freq, dr, lr,
                                    freqdup, freqloss, forcetime)

    # the simulated tree is bound to `ltree`; returning the undefined
    # name `gtree` raised NameError
    return stree, ltree, ex
def test_read_tree(self):
    """Test reading tree structure."""
    tree = treelib.read_tree(StringIO(fungi2))

    # map every node name to its parent's name (None for the root)
    ptree = {}
    for node in tree:
        ptree[node.name] = node.parent.name if node.parent else None

    ptree_expected = {
        # internal nodes
        1: None, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 2, 9: 8,
        10: 'xx', 11: 10, 12: 10, 13: 'xx', 14: 13, 'xx': 1,
        # leaves
        'scer': 7, 'spar': 7, 'smik': 6, 'sbay': 5, 'cgla': 4,
        'scas': 3, 'kwal': 8, 'klac': 9, 'agos': 9, 'calb': 11,
        'ctro': 11, 'cpar': 12, 'lelo': 12, 'dhan': 14, 'cgui': 14,
        'clus': 13,
    }
    self.assertEqual(ptree, ptree_expected)

    # writing the tree back reproduces the input text
    self.assertEqual(
        tree.get_one_line_newick(writeData=treelib.write_nhx_data),
        fungi2)
def test_nhx_big(self):
    """Test parsing of big NHX comments."""
    text = """(CFTR_GASAC:0.028272[&&NHX:S=GASAC:O=ENSGACT00000011967.1:T=69293:G=ENSGACG00000009039],((((((((((((((((((CFTR_HUMAN:0.002013[&&NHX:S=HUMAN:O=ENST00000003084.5:T=9606:G=ENSG00000001626],CFTR_PANTR:0.001342[&&NHX:S=PANTR:O=ENSPTRT00000036339.2:T=9598:G=ENSPTRG00000019619]):0.001545,CFTR_PONPY:0.006514[&&NHX:S=PONPY:O=ENSPPYT00000020909.1:T=9600:G=ENSPPYG00000017940]):0.003539,CFTR_MACMU:0.008416[&&NHX:S=MACMU:O=ENSMMUT00000015762.2:T=9544:G=ENSMMUG00000011269]):0.022751,CFTR_TUPGB:0.110613[&&NHX:S=TUPGB:O=ENSTBET00000011046.1:T=37347:G=ENSTBEG00000010974]):0.006474,((CFTR_OTOGA:0.035577[&&NHX:S=OTOGA:O=ENSOGAT00000001759.1:T=30611:G=ENSOGAG00000001756],CFTR_MICMU:0.026588[&&NHX:S=MICMU:O=ENSMICT00000005779.1:T=30608:G=ENSMICG00000005761]):0.010514,CFTR_MYOLU:0.06919[&&NHX:S=MYOLU:O=ENSMLUT00000012267.1:T=59463:G=ENSMLUG00000012244]):0.00395):0.001879,(CFTR_ECHTE:0.065629[&&NHX:S=ECHTE:O=ENSETET00000000538.1:T=9371:G=ENSETEG00000000537],CFTR_LOXAF:0.050347[&&NHX:S=LOXAF:O=ENSLAFT00000005758.1:T=9785:G=ENSLAFG00000005753]):0.016592):0.002471,((CFTR_SORAR:0.056771[&&NHX:S=SORAR:O=ENSSART00000012124.1:T=42254:G=ENSSARG00000012121],CFTR_ERIEU:0.043527[&&NHX:S=ERIEU:O=ENSEEUT00000006570.1:T=9365:G=ENSEEUG00000006484]):0.015585,CFTR_DASNO:0.047157[&&NHX:S=DASNO:O=ENSDNOT00000016544.1:T=9361:G=ENSDNOG00000016541]):0.00431):0.005677,(CFTR_F2_HORSE:0.016035[&&NHX:S=HORSE:O=ENSECAT00000010738.1:T=9796:G=ENSECAG00000009139],((CFTR_CANFA:0.047251[&&NHX:S=CANFA:O=ENSCAFT00000005518.2:T=9615:G=ENSCAFG00000003429],Q9N1D7_FELCA:0.025264[&&NHX:S=FELCA:O=ENSFCAT00000014959.2:T=9685:G=ENSFCAG00000014955]):0.022297,CFTR_BOVIN:0.062409[&&NHX:S=BOVIN:O=ENSBTAT00000053450.1:T=9913:G=ENSBTAG00000006589]):0.00767):0.004191):0.006209,(CFTR_F2_CAVPO:0.136979[&&NHX:S=CAVPO:O=ENSCPOT00000012891.1:T=10141:G=ENSCPOG00000012767],CFTR_SPETR:0.026944[&&NHX:S=SPETR:O=ENSSTOT00000005733.1:T=43179:G=ENSSTOG00000005707]):0.009628):0
.007329,(Q29399_RABIT:0.027324[&&NHX:S=RABIT:O=ENSOCUT00000010738.1:T=9986:G=ENSOCUG00000010733],CFTR_OCHPR:0.050953[&&NHX:S=OCHPR:O=ENSOPRT00000014760.1:T=9978:G=ENSOPRG00000014721]):0.017472):0.011797,(Cftr_MOUSE:0.035769[&&NHX:S=MOUSE:O=ENSMUST00000045706.4:T=10090:G=ENSMUSG00000041301],Cftr_RAT:0.049345[&&NHX:S=RAT:O=ENSRNOT00000010981.4:T=10116:G=ENSRNOG00000008284]):0.158692):0.033423,Q2QL94_MONDO:0.08197[&&NHX:S=MONDO:O=ENSMODT00000020031.2:T=13616:G=ENSMODG00000015771]):0.026265,CFTR_ORNAN:0.094961[&&NHX:S=ORNAN:O=ENSOANT00000013974.1:T=9258:G=ENSOANG00000008767]):0.03792,A0M8U4_CHICK:0.119618[&&NHX:S=CHICK:O=ENSGALT00000015182.3:T=9031:G=ENSGALG00000009324]):0.033083,CFTR_XENTR:0.130489[&&NHX:S=XENTR:O=ENSXETT00000047145.1:T=8364:G=ENSXETG00000021796]):0.352249,si_dkey-270i2_F3_BRARE:0.203525[&&NHX:S=BRARE:O=ENSDART00000100729.1:T=7955:G=ENSDARG00000041107]):0.063334,CFTR_ORYLA:0.123603[&&NHX:S=ORYLA:O=ENSORLT00000024332.1:T=8090:G=ENSORLG00000019555]):0.034773,CFTR_TETNG:0.049086[&&NHX:S=TETNG:O=ENSTNIT00000019381.1:T=99883:G=ENSTNIG00000016063]):0.028272)[&&NHX:Loglk=-24078.827174:RatioCons=0.000000;:LoglkSpec=0.000000];"""  # nopep8
    tree = read_tree(StringIO(text))

    # nodes 29 and 30 carry no data; the leaf carries its four NHX tags
    macmu_data = {'O': 'ENSMMUT00000015762.2', 'S': 'MACMU',
                  'T': '9544', 'G': 'ENSMMUG00000011269'}
    expected = {29: {}, 30: {}, 'CFTR_MACMU': macmu_data}

    for name in expected:
        self.assertEqual(tree[name].data, expected[name])
def read(
        self, filename, stree,
        exts={
            "coal_tree": ".coal.tree",
            "coal_recon": ".coal.recon",
            "locus_tree": ".locus.tree",
            "locus_recon": ".locus.recon",
            "daughters": ".daughters"
        },
        filenames={}, check=True):
    """Reads a reconciled gene tree from files

    A component path is `filenames[key]` when that key is given and
    `filename + exts[key]` otherwise.  The locus tree, reconciliations,
    locus events and daughters are stored as attributes; with `check`
    set, validity is asserted through self.is_valid().

    Returns the coalescent tree and self.get_dict().
    """
    def source(component):
        return filenames.get(component, filename + exts[component])

    # trees
    coal_tree = treelib.read_tree(source("coal_tree"))
    self.locus_tree = treelib.read_tree(source("locus_tree"))

    # recons
    coal_result = phylo.read_recon_events(
        source("coal_recon"), coal_tree, self.locus_tree)
    # two-element result; the events half is dropped
    self.coal_recon, _ignored = coal_result

    self.locus_recon, self.locus_events = phylo.read_recon_events(
        source("locus_recon"), self.locus_tree, stree)

    # daughters
    self.daughters = set(
        self.locus_tree.nodes[x]
        for x in util.read_strings(source("daughters")))

    assert not check or self.is_valid(coal_tree)
    return coal_tree, self.get_dict()
def debug_test2():
    """Sample one coalescent tree on the flies species tree and draw it.

    Debug helper; branch lengths of the species tree are multiplied by
    1e7 before sampling.
    """
    # run from ../ of this directory
    stree = treelib.read_tree('examples/flies.stree')
    for node in stree:
        node.dist *= 1e7  # gen per myr

    popsize = 2e7
    freq = 1e0
    dr = .0012/1e7
    lr = .0006/1e7
    freqdup = freqloss = .05
    forcetime = 1e7

    # ltree, ex = sim_DLILS_gene_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime)
    sampled = sample_dlcoal_no_ifix(
        stree=stree, n=popsize, freq=freq,
        duprate=dr, lossrate=lr,
        freqdup=freqdup, freqloss=freqloss,
        forcetime=forcetime)
    coal_tree, ex = sampled

    treelib.draw_tree(coal_tree, scale=.00000005)
def test_nhx_big(self):
    """Test parsing of big NHX comments."""
    text = """(CFTR_GASAC:0.028272[&&NHX:S=GASAC:O=ENSGACT00000011967.1:T=69293:G=ENSGACG00000009039],((((((((((((((((((CFTR_HUMAN:0.002013[&&NHX:S=HUMAN:O=ENST00000003084.5:T=9606:G=ENSG00000001626],CFTR_PANTR:0.001342[&&NHX:S=PANTR:O=ENSPTRT00000036339.2:T=9598:G=ENSPTRG00000019619]):0.001545,CFTR_PONPY:0.006514[&&NHX:S=PONPY:O=ENSPPYT00000020909.1:T=9600:G=ENSPPYG00000017940]):0.003539,CFTR_MACMU:0.008416[&&NHX:S=MACMU:O=ENSMMUT00000015762.2:T=9544:G=ENSMMUG00000011269]):0.022751,CFTR_TUPGB:0.110613[&&NHX:S=TUPGB:O=ENSTBET00000011046.1:T=37347:G=ENSTBEG00000010974]):0.006474,((CFTR_OTOGA:0.035577[&&NHX:S=OTOGA:O=ENSOGAT00000001759.1:T=30611:G=ENSOGAG00000001756],CFTR_MICMU:0.026588[&&NHX:S=MICMU:O=ENSMICT00000005779.1:T=30608:G=ENSMICG00000005761]):0.010514,CFTR_MYOLU:0.06919[&&NHX:S=MYOLU:O=ENSMLUT00000012267.1:T=59463:G=ENSMLUG00000012244]):0.00395):0.001879,(CFTR_ECHTE:0.065629[&&NHX:S=ECHTE:O=ENSETET00000000538.1:T=9371:G=ENSETEG00000000537],CFTR_LOXAF:0.050347[&&NHX:S=LOXAF:O=ENSLAFT00000005758.1:T=9785:G=ENSLAFG00000005753]):0.016592):0.002471,((CFTR_SORAR:0.056771[&&NHX:S=SORAR:O=ENSSART00000012124.1:T=42254:G=ENSSARG00000012121],CFTR_ERIEU:0.043527[&&NHX:S=ERIEU:O=ENSEEUT00000006570.1:T=9365:G=ENSEEUG00000006484]):0.015585,CFTR_DASNO:0.047157[&&NHX:S=DASNO:O=ENSDNOT00000016544.1:T=9361:G=ENSDNOG00000016541]):0.00431):0.005677,(CFTR_F2_HORSE:0.016035[&&NHX:S=HORSE:O=ENSECAT00000010738.1:T=9796:G=ENSECAG00000009139],((CFTR_CANFA:0.047251[&&NHX:S=CANFA:O=ENSCAFT00000005518.2:T=9615:G=ENSCAFG00000003429],Q9N1D7_FELCA:0.025264[&&NHX:S=FELCA:O=ENSFCAT00000014959.2:T=9685:G=ENSFCAG00000014955]):0.022297,CFTR_BOVIN:0.062409[&&NHX:S=BOVIN:O=ENSBTAT00000053450.1:T=9913:G=ENSBTAG00000006589]):0.00767):0.004191):0.006209,(CFTR_F2_CAVPO:0.136979[&&NHX:S=CAVPO:O=ENSCPOT00000012891.1:T=10141:G=ENSCPOG00000012767],CFTR_SPETR:0.026944[&&NHX:S=SPETR:O=ENSSTOT00000005733.1:T=43179:G=ENSSTOG00000005707]):0.009628):0
.007329,(Q29399_RABIT:0.027324[&&NHX:S=RABIT:O=ENSOCUT00000010738.1:T=9986:G=ENSOCUG00000010733],CFTR_OCHPR:0.050953[&&NHX:S=OCHPR:O=ENSOPRT00000014760.1:T=9978:G=ENSOPRG00000014721]):0.017472):0.011797,(Cftr_MOUSE:0.035769[&&NHX:S=MOUSE:O=ENSMUST00000045706.4:T=10090:G=ENSMUSG00000041301],Cftr_RAT:0.049345[&&NHX:S=RAT:O=ENSRNOT00000010981.4:T=10116:G=ENSRNOG00000008284]):0.158692):0.033423,Q2QL94_MONDO:0.08197[&&NHX:S=MONDO:O=ENSMODT00000020031.2:T=13616:G=ENSMODG00000015771]):0.026265,CFTR_ORNAN:0.094961[&&NHX:S=ORNAN:O=ENSOANT00000013974.1:T=9258:G=ENSOANG00000008767]):0.03792,A0M8U4_CHICK:0.119618[&&NHX:S=CHICK:O=ENSGALT00000015182.3:T=9031:G=ENSGALG00000009324]):0.033083,CFTR_XENTR:0.130489[&&NHX:S=XENTR:O=ENSXETT00000047145.1:T=8364:G=ENSXETG00000021796]):0.352249,si_dkey-270i2_F3_BRARE:0.203525[&&NHX:S=BRARE:O=ENSDART00000100729.1:T=7955:G=ENSDARG00000041107]):0.063334,CFTR_ORYLA:0.123603[&&NHX:S=ORYLA:O=ENSORLT00000024332.1:T=8090:G=ENSORLG00000019555]):0.034773,CFTR_TETNG:0.049086[&&NHX:S=TETNG:O=ENSTNIT00000019381.1:T=99883:G=ENSTNIG00000016063]):0.028272)[&&NHX:Loglk=-24078.827174:RatioCons=0.000000;:LoglkSpec=0.000000];"""  # nopep8
    parsed = read_tree(StringIO(text))

    # expected NHX data per node name: two internal nodes without any
    # data and one leaf with its four tags
    expected = {}
    expected[29] = {}
    expected[30] = {}
    expected['CFTR_MACMU'] = {
        'O': 'ENSMMUT00000015762.2',
        'S': 'MACMU',
        'T': '9544',
        'G': 'ENSMMUG00000011269'
    }

    for name, data in expected.items():
        node = parsed[name]
        self.assertEqual(node.data, data)
def __init__(self, dbfile=None, famfile=None, smapfile=None,
             genenamefile=None, streefile=None, baseDir=None,
             treeFileExt=None, fastaFileExt=None):
    """Load the family, mapping, gene-name and species-tree inputs and
    open the database.

    The gene-name table is also indexed by its "id" column.  The
    connection to `dbfile` is opened with isolation_level="DEFERRED" and
    one cursor is created from it.
    """
    # input data
    self.fams = genecluster.FamilyDb(famfile)
    self.gene2species = phylo.read_gene2species(smapfile)
    names_table = tablelib.read_table(genenamefile)
    self.genenames_tab = names_table
    self.gene2name = names_table.lookup("id")
    self.stree = treelib.read_tree(streefile)

    # file layout settings
    self.baseDir, self.treeFileExt, self.fastaFileExt = (
        baseDir, treeFileExt, fastaFileExt)

    # open database
    connection = sqlite.connect(dbfile, isolation_level="DEFERRED")
    self.con = connection
    self.cur = connection.cursor()
def debug_test3(): stree = treelib.read_tree('examples/nbin.stree') # run from ../ of this directory for node in stree: node.dist *= 1e7 # gen per myr popsize = 2e7 freq = 1e0 dr = .0000012 / 1e7 #.0012/1e7 lr = .0000011 / 1e7 #.0006/1e7 freqdup = freqloss = .05 forcetime = 1e7 for node in stree: print node.name, node.dist, len(node.children) print locus_tree, locus_extras = sim_DLILS_gene_tree(stree, popsize, freq, \ dr, lr, \ freqdup, freqloss, \ forcetime) for node in locus_tree: print node.name, node.dist, len(node.children) print logged_locus_tree, logged_extras = locus_to_logged_tree(locus_tree, popsize) daughters = logged_extras[0] pops = logged_extras[1] coal_tree, coal_recon = dlcoal.sample_locus_coal_tree(logged_locus_tree, n=pops, daughters=daughters, namefunc=lambda x: logged_extras[2][x] + '_' + str(x)) #begin debug print coal_tree.leaf_names() try: # print set(coal_tree) - set(coal_tree.postorder()) treelib.assert_tree(coal_tree) except AssertionError: print 'assertion error thrown on coal_tree being a proper tree' from rasmus import util hd= util.hist_dict(x.name for x in coal_tree.postorder()) for key in hd.keys(): print key if hd[key]>1 else '', print print len(coal_tree.nodes) - len(list(coal_tree.postorder()))
def consense_from_file(intrees, verbose=True, args="y"):
    """Run PHYLIP `consense` on the trees found in `intrees`.

    intrees -- file name or stream accepted by util.open_stream, one tree
               per line
    verbose -- passed through to exec_phylip
    args    -- answers passed to the program

    Returns (tree, ntrees): the tree read from the program's "outtree"
    file and the number of input lines.

    The temporary directory is cleaned up even if the program or the
    parsing of its output fails.
    NOTE(review): presumably create_temp_dir() switches into a scratch
    directory and returns the previous one -- confirm.
    """
    # read all trees (before moving to the temp dir, so that a relative
    # `intrees` path is resolved against the caller's directory)
    trees = util.open_stream(intrees).readlines()
    ntrees = len(trees)

    cwd = create_temp_dir()
    try:
        with open("intree", "w") as out:
            out.writelines(trees)

        exec_phylip("consense", args, verbose)

        tree = treelib.read_tree("outtree")
    finally:
        cleanup_temp_dir(cwd)

    return tree, ntrees
def test_read_tree(self):
    """Test reading tree structure."""
    tree = treelib.read_tree(StringIO(fungi2))

    def parent_name(node):
        # the root has no parent
        if node.parent:
            return node.parent.name
        return None

    ptree = dict((node.name, parent_name(node)) for node in tree)

    # child name -> parent name
    internal_expected = {
        1: None, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6,
        8: 2, 9: 8, 10: 'xx', 11: 10, 12: 10, 13: 'xx', 14: 13,
        'xx': 1,
    }
    leaf_expected = {
        'sbay': 5, 'scer': 7, 'ctro': 11, 'scas': 3,
        'agos': 9, 'kwal': 8, 'dhan': 14, 'smik': 6,
        'cgla': 4, 'spar': 7, 'calb': 11, 'lelo': 12,
        'cpar': 12, 'klac': 9, 'clus': 13, 'cgui': 14,
    }
    ptree_expected = dict(internal_expected)
    ptree_expected.update(leaf_expected)

    self.assertEqual(ptree, ptree_expected)

    # round trip: writing with NHX data gives back the input text
    newick = tree.get_one_line_newick(writeData=treelib.write_nhx_data)
    self.assertEqual(newick, fungi2)
def test_tree_namefunc(self):
    """Test reading/writing tree with namefunc."""
    # one-element list so the nested function can update the counter
    counter = [0]

    def numbered_name(name):
        counter[0] += 1
        return 'name%d' % counter[0]

    tree = treelib.read_tree(StringIO(fungi2), namefunc=numbered_name)

    # names were replaced while reading
    expected_newick = '(((((((name1:7.061760,name2:7.061760):4.999680,name3:12.061440):5.970600,name4:18.032040):52.682400,name5:70.714260):7.220700,name6:77.934960):23.181480,((name7:78.553260,name8:78.553260):10.434960,name9:88.988220):12.128400):78.883560,(((name10:41.275620,name11:41.275980):29.632860,(name12:52.323120,name13:52.323120):18.585720):31.149540,((name14:75.615840,name15:75.615840):14.006880,name16:89.622720):12.435660)xx:77.941620);'  # nopep8
    self.assertEqual(tree.get_one_line_newick(), expected_newick)

    # names can be transformed again while writing
    def prefixed_name(name):
        return 'prefix_' + name

    expected_newick2 = '(((((((prefix_name1:7.061760,prefix_name2:7.061760):4.999680,prefix_name3:12.061440):5.970600,prefix_name4:18.032040):52.682400,prefix_name5:70.714260):7.220700,prefix_name6:77.934960):23.181480,((prefix_name7:78.553260,prefix_name8:78.553260):10.434960,prefix_name9:88.988220):12.128400):78.883560,(((prefix_name10:41.275620,prefix_name11:41.275980):29.632860,(prefix_name12:52.323120,prefix_name13:52.323120):18.585720):31.149540,((prefix_name14:75.615840,prefix_name15:75.615840):14.006880,prefix_name16:89.622720):12.435660)xx:77.941620);'  # nopep8
    self.assertEqual(tree.get_one_line_newick(namefunc=prefixed_name),
                     expected_newick2)
def test_write_tree(self):
    """Test tree writing, including writing of the root data."""
    # NOTE(review): the literal is kept exactly as it appears in this
    # copy of the file; confirm its whitespace matches what tree.write
    # emits.
    newick = '''( ( a:1.000000, b:2.000000 )x:3.000000, ( c:4.000000, d:5.000000 )y:6.000000 )rra:0.000000; '''
    tree = read_tree(StringIO(newick))

    # write the tree back, root data included, and compare with the input
    buf = StringIO()
    tree.write(buf, rootData=True)
    written = buf.getvalue()
    self.assertEqual(newick, written)
def read(self, filename, stree,
         exts={"tree" : ".tree",
               "recon" : ".recon",
               "order" : ".order"},
         filenames={}):
    """Read the reconciliation from a file

    filename  -- common path prefix of the three input files
    stree     -- species tree whose nodes the gene tree nodes map to
    exts      -- extension appended to `filename` for each component
    filenames -- optional explicit path per component, used instead of
                 `filename + exts[key]`

    Sets self.species_map (gene node -> species node), self.locus_map
    (gene node -> integer locus) and self.order (species node -> locus ->
    list of gene nodes).

    Returns (gtree, self.get_dict()).
    """
    def to_name(text):
        # all-digit names are looked up as integers
        return int(text) if text.isdigit() else text

    gtree = treelib.read_tree(
        filenames.get("tree", filename + exts["tree"]))

    # recon file: gene node name, species node name, locus
    self.species_map = {}
    self.locus_map = {}
    for name, sname, locus in util.read_delim(
            filenames.get("recon", filename + exts["recon"])):
        assert locus.isdigit()

        node = gtree.nodes[to_name(name)]
        self.species_map[node] = stree.nodes[to_name(sname)]
        self.locus_map[node] = int(locus)

    # order file: species node name, locus, comma-separated gene node
    # names; the defaultdict creates the per-species dict on first use
    self.order = collections.defaultdict(dict)
    for toks in util.read_delim(
            filenames.get("order", filename + exts["order"])):
        sname, locus, lst = toks[0], toks[1], toks[2].split(',')
        assert locus.isdigit()

        snode = stree.nodes[to_name(sname)]
        # a real list (not a lazy map object) so the stored order can be
        # indexed and traversed repeatedly
        nodes = [gtree.nodes[to_name(x)] for x in lst]
        self.order[snode][int(locus)] = nodes
    self.order = dict(self.order)

    return gtree, self.get_dict()
def test_nhx(self):
    """Test parsing of NHX comments."""
    text = """(((ADH2:0.1[&&NHX:S=human:E=1.1.1.1], ADH1:0.11[&&NHX:S=human:E=1.1.1.1]):0.05[&&NHX:S=Primates:E=1.1.1.1:D=Y:B=100], ADHY:0.1[&&NHX:S=nematode:E=1.1.1.1],ADHX:0.12[&&NHX:S=insect:E=1.1.1.1]):0.1[&&NHX:S=Metazoa:E=1.1.1.1:D=N], (ADH4:0.09[&&NHX:S=yeast:E=1.1.1.1],ADH3:0.13[&&NHX:S=yeast:E=1.1.1.1], ADH2:0.12[&&NHX:S=yeast:E=1.1.1.1],ADH1:0.11[&&NHX:S=yeast:E=1.1.1.1]):0.1 [&&NHX:S=Fungi])[&&NHX:E=1.1.1.1:D=N];"""  # nopep8
    tree = read_tree(StringIO(text))

    # expected NHX data per node name
    expected = {
        # internal nodes
        1: {'E': '1.1.1.1', 'D': 'N'},
        2: {'S': 'Metazoa', 'E': '1.1.1.1', 'D': 'N'},
        3: {'S': 'Primates', 'B': '100', 'E': '1.1.1.1', 'D': 'Y'},
        4: {'S': 'Fungi'},
        # leaves; the repeated names ADH1/ADH2 show up with a _1 suffix
        'ADH1': {'S': 'human', 'E': '1.1.1.1'},
        'ADH2': {'S': 'human', 'E': '1.1.1.1'},
        'ADHX': {'S': 'insect', 'E': '1.1.1.1'},
        'ADHY': {'S': 'nematode', 'E': '1.1.1.1'},
        'ADH1_1': {'S': 'yeast', 'E': '1.1.1.1'},
        'ADH2_1': {'S': 'yeast', 'E': '1.1.1.1'},
        'ADH3': {'S': 'yeast', 'E': '1.1.1.1'},
        'ADH4': {'S': 'yeast', 'E': '1.1.1.1'},
    }

    actual = {}
    for node in tree:
        actual[node.name] = node.data

    for key in expected:
        self.assertEqual(actual[key], expected[key])
def get_branch_lens(trees, stree, gene2species=gene2species):
    """Tabulate gene tree branch lengths by reconciled species node.

    trees        -- gene trees, or file names of gene trees (a str is
                    read with treelib.read_tree)
    stree        -- species tree
    gene2species -- gene name to species name mapping function

    Returns a tablelib.Table with one column per species-tree node except
    the root, and one row per gene tree.  Each row maps the name of the
    species node a gene node reconciles to onto that gene node's branch
    length.

    Every tree must be free of duplications; this is asserted, not
    filtered.
    """
    # determine species names (all species-tree nodes but the root);
    # built as a real list so that .remove() is available
    species = [str(name) for name in stree.nodes.keys()]
    species.remove(str(stree.root.name))

    # make rates table
    rates = tablelib.Table(headers=species)

    # loop through trees
    for tree in trees:
        if isinstance(tree, str):
            tree = treelib.read_tree(tree)

        recon = reconcile(tree, stree, gene2species)
        events = label_events(tree, recon)

        # trees with duplications are not supported here
        assert "dup" not in events.values()

        row = {}
        for node in tree.nodes.values():
            row[str(recon[node].name)] = node.dist
        rates.append(row)

    return rates
def gene2species(name):
    """Return the first character of `name`, upper-cased, as the species."""
    return name[:1].upper()

# per-species parameter pairs handed to treeLogLikelihood and
# stats.normalPdf
params = {"A": [4, 2], "B": [3, 1]}
conf = {"debug": 0, "dupprob": .5, "lossprob": 1.0}

# two-species tree
stree = treelib.read_tree(StringIO.StringIO("(A, B);"))

# test 1
# one gene per species: the log likelihood is compared with the log of the
# product of the two per-branch normalPdf values
print "\n\nTest 1"
tree = treelib.read_tree(StringIO.StringIO("(a:3, b:2);"))
logl = treeLogLikelihood(conf, tree, stree, gene2species, params, baserate=1)
treelib.draw_tree_lens(tree,scale=5)
floateq(logl, log(stats.normalPdf(3, params["A"]) * stats.normalPdf(2, params["B"])))

# test 2
# two genes named a1/a2 map to species A
print "\n\nTest 2"
tree = treelib.read_tree(StringIO.StringIO("((a1:2.5, a2:2):1, b:2);"))
def spidir(conf, distmat, labels, stree, gene2species, params):
    """Main function for the SPIDIR algorithm

    Runs every search named in conf["search"], then scores the trees in
    conf["tree"] and the topologies in conf["tops"].  All scored trees are
    collected in `visited`; the one selected by util.argmax over
    data["logl"] is returned together with that value.

    Raises SindirError for an unknown search name, or when no tree was
    visited at all.

    NOTE(review): block nesting was reconstructed from a copy without
    indentation -- confirm against the original layout, in particular the
    placement of the calls that follow each search.
    """

    setDebug(conf["debug"])

    if isDebug(DEBUG_HIGH) and pyspidir:
        pyspidir.set_log(3, "")

    if "out" in conf:
        # create debug table
        conf["debugtab_file"] = file(conf["out"] + ".debug.tab", "w")

        debugtab = tablelib.Table(headers=["correct", "logl", "treelen", "baserate", "error", "errorlogl", "eventlogl", "tree", "topology", "species_hash"], types={"correct": bool, "logl": float, "treelen": float, "baserate": float, "error": float, "errorlogl": float, "eventlogl": float, "tree": str, "topology": str, "species_hash": str})
        debugtab.writeHeader(conf["debugtab_file"])
        conf["debugtab"] = debugtab
    else:
        # NOTE(review): this key ("debugfile") differs from the
        # "debugtab_file" key set above -- confirm which one readers use
        conf["debugfile"] = None

    trees = []
    logls = []
    tree = None
    # filled by the searches and by evalUserTree; evalUserTree stores
    # [logl, tree copy, count] under the tree's hash
    visited = {}

    util.tic("SPIDIR")

    # do auto searches
    for search in conf["search"]:
        util.tic("Search by %s" % search)

        if search == "greedy":
            tree, logl = Search.searchGreedy(conf, distmat, labels, stree, gene2species, params, visited=visited)

        elif search == "mcmc":
            tree, logl = Search.searchMCMC(conf, distmat, labels, stree, gene2species, params, initTree=tree, visited=visited)

        elif search == "regraft":
            tree, logl = Search.searchRegraft(conf, distmat, labels, stree, gene2species, params, initTree=tree, visited=visited, proposeFunc=Search.proposeTree3)

        elif search == "exhaustive":
            # without a tree from an earlier search, start from a
            # neighbor-joining tree rooted against the species tree
            if tree == None:
                tree = phylo.neighborjoin(distmat, labels)
                tree = phylo.recon_root(tree, stree, gene2species)

            tree, logl = Search.searchExhaustive(conf, distmat, labels, tree, stree, gene2species, params, depth=conf["depth"], visited=visited)

        elif search == "hillclimb":
            tree, logl = Search.searchHillClimb(conf, distmat, labels, stree, gene2species, params, initTree=tree, visited=visited)

        elif search == "none":
            break
        else:
            raise SindirError("unknown search '%s'" % search)

        util.toc()

        Search.printMCMC(conf, "N/A", tree, stree, gene2species, visited)

    printVisitedTrees(visited)

    def evalUserTree(tree):
        # score a given tree and record it in `visited`
        setTreeDistances(conf, tree, distmat, labels)
        logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

        thash = phylo.hash_tree(tree)
        if thash in visited:
            a, b, count = visited[thash]
        else:
            count = 0
        visited[thash] = [logl, tree.copy(), count+1]

        if isDebug(DEBUG_LOW):
            debug("\nuser given tree:")
            recon = phylo.reconcile(tree, stree, gene2species)
            events = phylo.label_events(tree, recon)
            drawTreeLogl(tree, events=events)

    # eval the user given trees
    for treefile in conf["tree"]:
        tree = treelib.read_tree(treefile)
        evalUserTree(tree)

    for topfile in conf["tops"]:
        infile = file(topfile)
        strees = []

        # read trees from the file until read_tree raises; the bare
        # except treats any error as the end of the file
        while True:
            try:
                strees.append(treelib.read_tree(infile))
            except:
                break

        print len(strees)

        for top in strees:
            tree = phylo.stree2gtree(top, labels, gene2species)
            evalUserTree(tree)

    if len(conf["tops"]) > 0:
        printVisitedTrees(visited)

    # eval correcttree for debug only
    if "correcttree" in conf:
        tree = conf["correcttree"]
        setTreeDistances(conf, tree, distmat, labels)
        logl = treeLogLikelihood(conf, tree, stree, gene2species, params)

        if isDebug(DEBUG_LOW):
            debug("\ncorrect tree:")
            recon = phylo.reconcile(tree, stree, gene2species)
            events = phylo.label_events(tree, recon)
            drawTreeLogl(tree, events=events)

    util.toc()

    if len(visited) == 0:
        raise SindirError("No search or tree topologies given")

    if "correcthash" in conf:
        if conf["correcthash"] in visited:
            debug("SEARCH: visited correct tree")
        else:
            debug("SEARCH: NEVER saw correct tree")

    # return ML tree
    # second element of each visited entry is the stored tree
    trees = [x[1] for x in visited.itervalues()]
    i = util.argmax([x.data["logl"] for x in trees])
    return trees[i], trees[i].data["logl"]
stree.add_child(parent, child) child.dist = newdist callagain = True break if callagain: remove_single_child_nodes() # main code sim_walk(stree.root, freq) remove_single_child_nodes() return stree # poor nomenclature; this will be fixed in v2.1 if __name__ == "__main__": stree = treelib.read_tree('simple.stree') popsize = 1e4 freq = 1e0 dr = 2.1 lr = 2.0 freqdup = .05 freqloss = .05 forcetime = 1e0 tree = sim_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime) if tree: treelib.draw_tree(tree, scale=1) ### VERSION 1 CODE (for reference) #
def boxPlot(dataPath = '/home/muddcs15/research/work/hemiplasy/results/', prob1 = '0.001', prob2 = '0.05', prob3 = '0.1', prob4 = '0.5', spectree = '/home/muddcs15/research/work/hemiplasy/data/config/fungi.stree'):
    """
    A function that will output boxplots of probability of hemiplasy
    and probability of hemiplasy over duploss vs. initial allele frequency

    dataPath    -- directory holding the 'probabilities-<prob>.txt' files
    prob1..4    -- the four initial frequencies, as they appear in the
                   file names
    spectree    -- species tree file

    Only one figure is drawn: a boxplot of the five `pair` lists, titled
    'Hemiplasy by Pairs'.

    NOTE(review): loop nesting was reconstructed from a copy without
    indentation -- confirm against the original layout, in particular the
    position of `break` and of the per-family statistics.
    NOTE(review): the events file path is hard-coded and ignores
    `dataPath` -- confirm that is intended.
    """
    stree = treelib.read_tree(spectree) # species tree
    species = stree.leaf_names()

    # collect the two children of every node that has exactly two leaves
    species1 = []
    species2 = []
    for node in stree:
        if len(node.leaves()) == 2:
            species1.append(node.children[0].name)
            species2.append(node.children[1].name)

    # identify the files for each of the different initial frequencies
    probs1 = os.path.join(dataPath, 'probabilities-' + prob1 + '.txt')
    probs2 = os.path.join(dataPath, 'probabilities-' + prob2 + '.txt')
    probs3 = os.path.join(dataPath, 'probabilities-' + prob3 + '.txt')
    probs4 = os.path.join(dataPath, 'probabilities-' + prob4 + '.txt')
    probsList = [probs1, probs2, probs3, probs4]

    totalPerList = [] # probability of hemiplasy compared to duploss
    totalAList = [] # probability of hemiplasy occurring
    totalPairs = []
    # h and d are never reset, so they accumulate over all families and
    # all probability files
    h = 0 # number of trials where hemiplasy was more likely
    d = 0 # number of trials where duploss was at least as likely
    pair1 = []
    pair2 = []
    pair3 = []
    pair4 = []
    pair5 = []

    # open each probability file
    for probFilename in probsList:
        events = open('/home/muddcs15/research/work/hemiplasy/results/hemiplasy-loss.txt', 'r')
        # hList is created once per file, so `ave` below is a running
        # mean over the families read so far from this file
        hList = [] # list of probability of hemiplasy
        perList = [] # list of percentage with prob hemiplasy > prob duploss
        aveList = [] # list of average probability of hemiplasy per fam id
        countTrue = 0

        # look at each famid for that initial frequency
        probFile = open(probFilename, "r")
        for line in probFile:
            sepProbs = line.split()
            fam = sepProbs.pop(0)
            # drop the first six characters of the first field
            famid = fam[6:]

            # get the probability of duploss and hemiplasy for each trial in each famid
            for pair in sepProbs:
                duploss, hemiplasy = map(float, pair.split(','))
                hList.append(hemiplasy)

                # check whether hemiplasy is more likely or duploss
                if hemiplasy > duploss:
                    h += 1
                else:
                    d += 1

            # calculate the percent that likely occurred by hemiplasy
            percent = float(h)/float(h+d)

            # get the average probability of hemiplasy for each famid
            ave = stats.mean(hList)

            # append percent by hemiplasy to perList and average for the famid to aveList
            perList.append(percent)
            aveList.append(ave)

            # NOTE(review): `events` is opened once per probability file,
            # so lines consumed here are not seen again for later
            # families -- confirm the files are ordered consistently
            for line in events:
                ev_famid, locus, spcs, gns, snode, lca = line.rstrip().split('\t')
                if famid == ev_famid:
                    countTrue += 1
                    # find the pair with exactly one member present in spcs
                    for sp1, sp2 in zip(species1, species2):
                        if (sp1 in spcs and sp2 not in spcs):
                            spec_check = sp1
                            specPos = species1.index(sp1)
                        elif (sp2 in spcs and sp1 not in spcs):
                            spec_check = sp2
                            specPos = species2.index(sp2)
                    break

            # file the family's average under its pair position
            if specPos == 0:
                pair1.append(ave)
            if specPos == 1:
                pair2.append(ave)
            if specPos == 2:
                pair3.append(ave)
            if specPos == 3:
                pair4.append(ave)
            if specPos == 4:
                pair5.append(ave)

        events.close()

        # append the lists through each famid to the large lists for each list of values
        totalPerList.append(perList)
        totalAList.append(aveList)

        # close file
        probFile.close()

    totalPairs.append(pair1)
    totalPairs.append(pair2)
    totalPairs.append(pair3)
    totalPairs.append(pair4)
    totalPairs.append(pair5)

    plt.boxplot(totalPairs)
    plt.title('Hemiplasy by Pairs')
    plt.xlabel('Pair')
    plt.ylabel('Probability')

    # print the plots
    plt.show()
tr = raxml.new_tree() cmd = "raxmlHPC -t %s -s %s %s" % (treefile, seqfile, options.extra) raxml.init_program(adef, tr, cmd.split(" ")) util.tic("Optimizing model...") raxml.optimize_model(adef, tr) util.toc() # draw_raxml_tree(tr, adef) util.tic("Getting parameters for LH...") bestVector, bestLH, weightSum = raxml.compute_best_LH(tr) util.log("bestLH: %.3f" % bestLH) util.toc() tree = treelib.read_tree(treefile) for node in tree: node.dist = 0 if "boot" in node.data: del node.data["boot"] treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True)) treehashes = set([treehash]) for i in xrange(options.niter): while treehash in treehashes: util.log("random spr") node1, node2 = phylo.propose_random_spr(tree) phylo.perform_spr(tree, node1, node2) treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True)) treehashes.add(treehash)
def boxPlot(dataPath = '/home/muddcs15/research/work/hemiplasy/results/',
            prob1 = '0.001', prob2 = '0.05', prob3 = '0.1', prob4 = '0.5',
            spectree = '/home/muddcs15/research/work/hemiplasy/data/config/fungi.stree',
            eventsFile = '/home/muddcs15/research/work/hemiplasy/results/hemiplasy-loss.txt'):
    """
    Draw one box plot per sister-species pair showing the hemiplasy
    probabilities grouped by duplication location.

    Arguments:
    dataPath   -- directory holding the 'probabilities-<prob>.txt' files
    prob1..4   -- initial-frequency labels (strings) used in those file names
    spectree   -- species tree file; every node with exactly two leaves
                  defines one species pair
    eventsFile -- tab-separated file with six columns per line
                  (family id, locus, species, genes, duplication location,
                  lca); defaults to the path that used to be hard-coded

    Each line of a probabilities file is a family label (its first six
    characters are dropped to get the family id) followed by
    whitespace-separated 'duploss,hemiplasy' pairs.

    The figure is a 2x3 grid; only the first five panels are filled.
    Nothing is returned; the figure is displayed with plt.show().
    """
    stree = treelib.read_tree(spectree)  # species tree

    # species1[i] / species2[i] are the two children of the i-th node that
    # has exactly two leaves
    species1 = []
    species2 = []
    for node in stree:
        if len(node.leaves()) == 2:
            species1.append(node.children[0].name)
            species2.append(node.children[1].name)

    # define number of plots to be output
    fig, axes = plt.subplots(nrows=2, ncols=3)

    # one probabilities file per initial frequency
    probsList = [os.path.join(dataPath, 'probabilities-' + prob + '.txt')
                 for prob in (prob1, prob2, prob3, prob4)]

    # pairList[i] collects (duplication location, average probability)
    # tuples for the i-th species pair, pooled over all probability files
    pairList = [[] for _ in range(5)]

    for probFilename in probsList:
        hList = []    # hemiplasy probabilities read so far from this file
        aveList = []  # one running average per family line
        PDList = []   # one (pair index, duplication location) per family line

        # context managers guarantee both files are closed even on error
        with open(eventsFile, 'r') as events, \
             open(probFilename, 'r') as probFile:
            for line in probFile:
                sepProbs = line.split()
                famid = sepProbs.pop(0)[6:]

                for pair in sepProbs:
                    duploss, hemiplasy = map(float, pair.split(','))
                    hList.append(hemiplasy)

                # NOTE(review): hList is not reset between families, so this
                # is the mean over every family read so far in this file,
                # not a per-family mean -- confirm that this is intended.
                aveList.append(stats.mean(hList))

                # NOTE(review): 'events' is consumed as a forward-only
                # stream, so both files must list families in the same
                # order.  If no line matches, the iterator is exhausted and
                # specPos/dup keep the values from an earlier family (or are
                # unbound for the first family).  Behaviour kept as-is.
                for evLine in events:
                    ev_famid, locus, spcs, gns, dup, lca = \
                        evLine.rstrip().split('\t')
                    if famid == ev_famid:
                        # spcs is the raw comma-joined field, so these are
                        # substring tests
                        for sp1, sp2 in zip(species1, species2):
                            if sp1 in spcs and sp2 not in spcs:
                                specPos = species1.index(sp1)
                            elif sp2 in spcs and sp1 not in spcs:
                                specPos = species2.index(sp2)
                        break
                PDList.append((specPos, dup))

        # file each family's average under the pair it was assigned to;
        # pair indices beyond the five plotted pairs are ignored
        for (pos, dpl), ave in zip(PDList, aveList):
            if 0 <= pos < len(pairList):
                pairList[pos].append((int(dpl), ave))

    # Regroup every pair's values by duplication location and draw one panel
    # per pair.  Locations 1..13 are plotted in order (presumably the
    # species-tree node ids -- TODO confirm); a location with no data gives
    # an empty box.
    for idx, pairNum in enumerate(pairList):
        byLocation = collections.defaultdict(list)
        for dupLoc, prob in pairNum:
            byLocation[dupLoc].append(prob)

        ax = axes[idx // 3, idx % 3]
        ax.boxplot([byLocation[dupLoc] for dupLoc in range(1, 14)])
        ax.set_title('Pair%d' % (idx + 1))
        ax.set_xlabel('Duplication Location')
        ax.set_ylabel('Probability')
        ax.set_ylim(0, 0.25)

    # show the plots
    plt.show()
def test_nhx(self):
    """Check that NHX comments end up in the data dict of each node."""
    newick = """(((ADH2:0.1[&&NHX:S=human:E=1.1.1.1], ADH1:0.11[&&NHX:S=human:E=1.1.1.1]):0.05[&&NHX:S=Primates:E=1.1.1.1:D=Y:B=100], ADHY:0.1[&&NHX:S=nematode:E=1.1.1.1],ADHX:0.12[&&NHX:S=insect:E=1.1.1.1]):0.1[&&NHX:S=Metazoa:E=1.1.1.1:D=N], (ADH4:0.09[&&NHX:S=yeast:E=1.1.1.1],ADH3:0.13[&&NHX:S=yeast:E=1.1.1.1], ADH2:0.12[&&NHX:S=yeast:E=1.1.1.1],ADH1:0.11[&&NHX:S=yeast:E=1.1.1.1]):0.1 [&&NHX:S=Fungi])[&&NHX:E=1.1.1.1:D=N];"""  # nopep8
    parsed = read_tree(StringIO(newick))

    # Expected annotations keyed by node name.  Integer keys are the names
    # of internal nodes; the '_1' keys are the second ADH1/ADH2 leaves
    # (presumably renamed by the parser to keep names unique).
    expected = {}
    # internal nodes
    expected[1] = {'E': '1.1.1.1', 'D': 'N'}
    expected[2] = {'S': 'Metazoa', 'E': '1.1.1.1', 'D': 'N'}
    expected[3] = {'S': 'Primates', 'B': '100', 'E': '1.1.1.1', 'D': 'Y'}
    expected[4] = {'S': 'Fungi'}
    # leaves
    expected['ADH2'] = {'S': 'human', 'E': '1.1.1.1'}
    expected['ADH1'] = {'S': 'human', 'E': '1.1.1.1'}
    expected['ADHY'] = {'S': 'nematode', 'E': '1.1.1.1'}
    expected['ADHX'] = {'S': 'insect', 'E': '1.1.1.1'}
    expected['ADH4'] = {'S': 'yeast', 'E': '1.1.1.1'}
    expected['ADH3'] = {'S': 'yeast', 'E': '1.1.1.1'}
    expected['ADH2_1'] = {'S': 'yeast', 'E': '1.1.1.1'}
    expected['ADH1_1'] = {'S': 'yeast', 'E': '1.1.1.1'}

    # Collect what the parser actually stored on every node.
    observed = {}
    for node in parsed:
        observed[node.name] = node.data

    for name in expected:
        self.assertEqual(observed[name], expected[name])
def hemiplasyConditions(numFamilies = 5351,
                        dataPath = '/home/muddcs15/research/work/hemiplasy/data/real-fungi/',
                        outputFile = '/home/muddcs15/research/work/hemiplasy/results/hemiplasy-loss.txt',
                        spectree = '/home/muddcs15/research/work/hemiplasy/data/config/fungi.stree'):
    """
    Find loci whose species distribution is compatible with hemiplasy and
    write one line per such locus to outputFile.

    A locus (any locus other than "1") qualifies when, for some species pair
    taken from a two-leaf node of the species tree, it occurs in exactly one
    species of the pair and also in at least one species outside the pair.

    Arguments:
    numFamilies -- family ids 0 .. numFamilies-1 are processed
    dataPath    -- directory containing one sub-directory per family id
    outputFile  -- tab-separated output file (overwritten)
    spectree    -- species tree file

    Each output line holds: family id, locus, comma-joined species of the
    leaf genes, comma-joined leaf genes, the species-tree location recorded
    for the locus in the '.dup.rel.txt' file, and the name of the LCA of the
    leaf genes in the locus tree.

    The number of families with at least one qualifying locus is printed.
    """
    # all species, plus the two members of every sister-species pair
    stree = treelib.read_tree(spectree)  # species tree
    species = stree.leaf_names()
    species1 = []
    species2 = []
    for node in stree:
        if len(node.leaves()) == 2:
            species1.append(node.children[0].name)
            species2.append(node.children[1].name)

    count = 0  # families with at least one qualifying locus

    # the context manager closes the output file even if a family fails
    with open(outputFile, 'w') as output:
        for famid in range(numFamilies):
            # os.path.join works whether or not dataPath ends with a slash
            famDir = os.path.join(dataPath, str(famid))
            famFilename = os.path.join(
                famDir, '%d-dup.dlcoal.dlcpar.recon' % famid)

            # nothing to do for a family whose file is empty
            if os.stat(famFilename).st_size == 0:
                continue

            # read the locus tree and the reconciliation file
            tree = treelib.read_tree(
                os.path.join(famDir, '%d.dlcoal.locus.tree' % famid))
            # NOTE(review): recon/events are not used below; the call is kept
            # so that a missing or unreadable recon file presumably still
            # raises here as it did before.
            recon, events = phylo.read_recon_events(
                os.path.join(famDir, '%d.dlcoal.locus.recon' % famid),
                tree, stree)

            # locus -> species tree location where the locus was created
            locus_sname = {}
            dupFilename = os.path.join(
                famDir, '%d.dlcoal.dlcpar.dup.rel.txt' % famid)
            for line in util.open_stream(dupFilename):
                locus, gns1, gns2, sname = line.rstrip().split('\t')
                locus_sname[locus] = sname

            # locus -> list of (gene, species); locus "1" is skipped
            locus_dict = collections.defaultdict(list)
            for line in util.open_stream(famFilename):
                gn, sp, locus = line.rstrip().split('\t')
                if locus == "1":
                    continue
                locus_dict[locus].append((gn, sp))

            family_hit = False
            for locus, lst in locus_dict.items():
                sps = set(sp for (gn, sp) in lst)

                for sp1, sp2 in zip(species1, species2):
                    # The criterion is evaluated afresh for every locus.
                    # Previously a single per-family flag stayed set after
                    # the first hit, so later loci of the same family were
                    # written without being checked against the
                    # outside-the-pair condition.
                    if not _is_hemiplasy_candidate(sps, sp1, sp2, species):
                        continue
                    family_hit = True

                    # entries whose gene name is all digits are skipped
                    # (presumably internal nodes -- TODO confirm)
                    leaves = [(gn, sp) for (gn, sp) in lst
                              if not gn.isdigit()]
                    leaf_gns = [gn for (gn, sp) in leaves]
                    leaf_sps = [sp for (gn, sp) in leaves]
                    lca = treelib.lca([tree.nodes[name] for name in leaf_gns])

                    output.write('\t'.join([str(famid), locus,
                                            ','.join(leaf_sps),
                                            ','.join(leaf_gns),
                                            locus_sname[locus],
                                            lca.name]))
                    output.write('\n')
                    # one line per locus is enough; stop at the first pair
                    break

            # if it is a true case, add to count
            if family_hit:
                count += 1

    # same text under Python 2 and Python 3
    print("Total number of true cases = %d" % count)


def _is_hemiplasy_candidate(sps, sp1, sp2, species):
    """Return True if exactly one of sp1/sp2 is in sps and so is some species outside the pair."""
    if (sp1 in sps) == (sp2 in sps):
        return False
    return any(sp in sps for sp in species if sp != sp1 and sp != sp2)
sample_coal_cond_counts = coal.sample_coal_cond_counts if __name__ == "__main__": #======================================== # test cases for prob_locus_gene_species_alignment_recon # sim-flies, N = 1e6, g = 0.1, R = 1x, L = 100bp, mu = 5e-9 import os import numpy from compbio import fasta import dlcoal import StringIO path = "/home/muddcs15/research/work/coestimation/" stree = treelib.read_tree( os.path.join(path, "simulation/config/flies.stree")) prob_raxml = [] prob_treefix = [] prob_dlca = [] for i in range(100, 200): # read raxml recon coal_tree_raxml, extra_raxml = dlcoal.read_dlcoal_recon( os.path.join(path, "simulation/data/1000/5e-9/1e6-1x/", str(i), str(i) + ".raxml.dlcoal"), stree) locus_tree_raxml = extra_raxml["locus_tree"] locus_recon_raxml = extra_raxml["locus_recon"] coal_recon_raxml = extra_raxml["coal_recon"] daughters_raxml = extra_raxml["daughters"]