def _stash_rewrite_and_call(self, fname, test_cases):
    """Round-trip a phyloXML file in place and re-run tests against it.

    Steps, in order:

    1. Parse ``fname`` with ``PhyloXMLIO.read``.
    2. Move the original aside to ``fname + "~"`` (deleting any file
       already at that backup path first).
    3. Write the parsed object back to ``fname``.
    4. For every ``(cls, tests)`` pair in ``test_cases``, build an
       instance with ``cls("setUp")`` and call each named method on it.
    5. Always move the backup back over ``fname``, even when a test or
       the write raised.

    Python 2.4 support: this would read better as a context manager that
    only handles the rename and the final restore of the original.
    """
    parsed = PhyloXMLIO.read(fname)
    backup = fname + "~"
    # Drop any stale backup before moving the original aside.
    if os.path.exists(backup):
        os.remove(backup)
    os.rename(fname, backup)
    try:
        PhyloXMLIO.write(parsed, fname)
        for case_cls, method_names in test_cases:
            case = case_cls("setUp")
            for method_name in method_names:
                getattr(case, method_name)()
    finally:
        # XXX not safe!
        # Remove the rewritten copy (if it was created) and put the
        # original back under its own name.
        if os.path.exists(fname):
            os.remove(fname)
        os.rename(backup, fname)
def _rewrite_and_call(self, orig_fname, test_cases):
    """Parse, rewrite and retest a phyloXML example file.

    The file at ``orig_fname`` is parsed in binary mode, the parsed object
    is written to the module-level ``DUMMY`` path, and then every test
    listed in ``test_cases`` is run.

    ``test_cases`` is an iterable of ``(cls, tests)`` pairs; ``cls`` is
    instantiated as ``cls("setUp")`` and each name in ``tests`` is looked
    up on that instance and called with no arguments.

    Both file handles are managed by ``with`` so they are closed even when
    parsing or writing raises.
    """
    with open(orig_fname, "rb") as infile:
        phx = PhyloXMLIO.read(infile)
    with open(DUMMY, "w+b") as outfile:
        PhyloXMLIO.write(phx, outfile)
    # Run the tests only after the rewritten file has been closed.
    for cls, tests in test_cases:
        inst = cls("setUp")
        for test in tests:
            getattr(inst, test)()
def _rewrite_and_call(self, orig_fname, test_cases):
    """Parse, rewrite and retest a phyloXML example file.

    Reads ``orig_fname``, writes the parsed object to the module-level
    ``DUMMY`` path, then runs the given tests. ``test_cases`` is an
    iterable of ``(cls, tests)`` pairs: each ``cls`` is instantiated with
    ``'setUp'`` and every method named in ``tests`` is called on it.

    The handles are opened with ``with`` so that a failure in
    ``PhyloXMLIO.read`` or ``PhyloXMLIO.write`` no longer leaks them.
    """
    # NOTE(review): text modes ('r' / 'w') are kept exactly as written;
    # confirm the PhyloXMLIO version in use accepts text-mode handles.
    with open(orig_fname, 'r') as infile:
        phx = PhyloXMLIO.read(infile)
    with open(DUMMY, 'w') as outfile:
        PhyloXMLIO.write(phx, outfile)
    for cls, tests in test_cases:
        inst = cls('setUp')
        for test in tests:
            getattr(inst, test)()
def prepare_species_tree(FILE_TREE_IN,FILE_TREE_OUT):
    """Convert node names of a phyloXML species tree into taxonomy entries.

    Reads the phyloXML file ``FILE_TREE_IN``, takes its first tree, marks
    it rooted, and for every named clade:

    * uses the clade name as the taxonomy id, after stripping a leading
      ``'INT'`` prefix when present;
    * attaches a ``PhyloXML.Taxonomy`` whose id has provider
      ``'ncbi_taxonomy'`` and whose scientific name comes from
      ``find_tax_name`` (``'(NA)'`` when that lookup raises ``KeyError``);
    * clears the clade name.

    Unnamed clades are left untouched. The result is written to
    ``FILE_TREE_OUT``.
    """
    # Close the input handle deterministically instead of leaking it.
    with open(FILE_TREE_IN, 'r') as handle:
        treexml = PhyloXMLIO.read(handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
def prepare_species_tree(FILE_TREE_IN, FILE_TREE_OUT):
    """Rewrite a phyloXML species tree so named clades carry taxonomy data.

    Args:
        FILE_TREE_IN: path of the phyloXML file to read.
        FILE_TREE_OUT: destination handed to ``PhyloXMLIO.write``.

    The first tree in the input is flagged as rooted. Each clade that has
    a name gets a ``PhyloXML.Taxonomy`` built from that name (a leading
    ``'INT'`` is removed first) with provider ``'ncbi_taxonomy'``; the
    scientific name is taken from ``find_tax_name`` and falls back to
    ``'(NA)'`` on ``KeyError``. The clade name is then reset to ``None``.
    Clades without a name are skipped.
    """
    # Use a context manager so the input file is closed even if parsing
    # fails; the original left the handle open.
    with open(FILE_TREE_IN, 'r') as tree_handle:
        treexml = PhyloXMLIO.read(tree_handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation',
                           None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(
            id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
# reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db)
#
# NOTE(review): this definition has lost its line breaks and indentation
# (everything after the first '#' on the line is now comment text to the
# interpreter). The nesting of several if/else branches cannot be recovered
# with certainty from this text, so the code is deliberately left untouched.
# Restore the original layout from version control before editing.
#
# What the visible statements do, in order of appearance:
#   * If rec_tag + 'ids.pickle' exists and pplacer_flag == 1, the pickle is
#     loaded and its 'existing_genes' entry is compared with the ids in
#     pplacer_queries[pfam_id]; when no new ids remain, two "skip" messages
#     are printed and the function returns (presumably an early exit).
#   * If the pickle or reconciled_file + '.gz' is missing, or
#     pplacer_flag == 1: the gene tree %s/%s.nw.gz (tree_folder, pfam_id) is
#     gunzipped to a randomly suffixed temp file, loaded with ete2.PhyloTree,
#     polytomies are resolved, and the tree is converted newick -> phyloxml.
#   * Each terminal clade's name (text before the first '/') is mapped to a
#     tax id / name via find_tax_id_unip, with fallbacks through
#     merged_taxid, best_taxid_map and find_best_taxid; best_taxid_map is
#     updated in place. A PhyloXML.Taxonomy is attached to every leaf.
#   * The annotated tree is written to rec_tag + 'txid.xml', temp files are
#     removed, and a dict with 'pfam_id', 'existing_ids', 'existing_genes'
#     is pickled to rec_tag + 'ids.pickle'.
#   * The forester gsdi Java tool is run via os.system; if it produced
#     reconciled_file, that file is gzipped. Side files are then removed.
#
# Names read here but not assigned in this block: pplacer_flag,
# pplacer_queries, tree_folder, all_species_txids, merged_taxid,
# best_taxid_map, lib_path, species_tree_data_path.
#
# NOTE(review): 'Sequnces' and 'p_ids' are assigned but not read again in the
# visible code. Files passed to pickle.load / PhyloXMLIO.read /
# PhyloXMLIO.write / pickle.dump are opened inline and never explicitly
# closed. Shell commands are built by string formatting (shell=True,
# os.system) -- confirm the paths are trusted. Uses Python 2 print statements.
def reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db): if (os.path.isfile(rec_tag + 'ids.pickle')) and (pplacer_flag == 1): id_information = pickle.load(open(rec_tag + 'ids.pickle', 'rb')) existing_genes = id_information['existing_genes'] Sequnces = [] p_ids = [] new_genes = set([w['id'] for w in pplacer_queries[pfam_id]]) if not (new_genes - set(existing_genes)): print "All %s Genes for family %s have already been placed in the reconciled tree." % ( len(new_genes), pfam_id) print "Skip Reconciliation for %s" % pfam_id return txid_file = rec_tag + 'txid.xml' if not (os.path.isfile(rec_tag + 'ids.pickle')) or not ( os.path.isfile(reconciled_file + '.gz')) or (pplacer_flag == 1): print "Running Reconciliation for: %s" % pfam_id rand_id = random.randint(1000000, 9999999) subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d" % (tree_folder, pfam_id, gene_tree_file, rand_id), shell=True) tree = ete2.PhyloTree('%s.%d' % (gene_tree_file, rand_id), format=0) tree.resolve_polytomy() tree.write(format=0, outfile=txid_file + '.tmp.nw') if os.path.exists('%s.%d' % (gene_tree_file, rand_id)): subprocess.check_call("rm %s.%d" % (gene_tree_file, rand_id), shell=True) Phylo.convert(txid_file + '.tmp.nw', 'newick', txid_file + '.tmp.xml', 'phyloxml') treexml = PhyloXMLIO.read(open(txid_file + '.tmp.xml', 'r')) tree = treexml[0] treexml.attributes.pop('schemaLocation', None) # not supported by Forester tree.rooted = True my_ids = set([]) my_query_by_taxid = {} for leaf in tree.clade.find_clades(terminal=True): up_name = leaf.name.split('/')[0] tax_id, tax_name = find_tax_id_unip(up_name, db) if tax_id not in all_species_txids: if tax_id in merged_taxid.keys(): tax_id = merged_taxid[tax_id] tax_name = find_tax_name(tax_id, db) if tax_id in best_taxid_map.keys(): tax_id = best_taxid_map[tax_id] tax_name = find_tax_name(tax_id, db) else: tax_id0 = tax_id tax_id, tax_name = find_best_taxid(tax_id, db) if tax_id > 0: best_taxid_map[tax_id0] = tax_id if tax_id < 0: if 
(-tax_id) in merged_taxid.keys(): tax_id = merged_taxid[-tax_id] tax_name = find_tax_name(tax_id, db) if tax_id in my_query_by_taxid: my_query_by_taxid[tax_id].append(up_name) else: my_query_by_taxid[tax_id] = [up_name] my_ids.add(tax_id) my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy') taxon = PhyloXML.Taxonomy(id=my_tax_id) taxon.scientific_name = tax_name leaf._set_taxonomy(taxon) PhyloXMLIO.write(treexml, open(txid_file, 'w')) os.system('rm ' + txid_file + '.tmp.nw') os.system('rm ' + txid_file + '.tmp.xml') print "Taxid file done for: %s" % pfam_id existing_ids = list(set(my_ids) & set(all_species_txids)) existing_genes = [ g for txid in my_query_by_taxid.keys() for g in my_query_by_taxid[txid] if txid in existing_ids ] pickle.dump( { 'pfam_id': pfam_id, 'existing_ids': existing_ids, 'existing_genes': existing_genes }, open(rec_tag + 'ids.pickle', 'wb')) print "Pickle file done for: %s" % pfam_id if os.path.exists(reconciled_file): os.system('rm ' + reconciled_file) os.system( "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s" % (lib_path, txid_file, species_tree_data_path, reconciled_file)) if os.path.exists(reconciled_file): if os.path.exists(reconciled_file + '.gz'): subprocess.check_call("rm %s.gz" % (reconciled_file), shell=True) subprocess.check_call("gzip %s" % (reconciled_file), shell=True) os.system('rm ' + rec_tag + 'reconciled_species_tree_used.xml') os.system('rm ' + rec_tag + 'reconciled_gsdi_log.txt') os.system('rm ' + txid_file) print "Reconciliation file done for: %s" % pfam_id
# reconcile_tree(gene_tree_file,reconciled_file,rec_tag,pfam_id,db)
#
# Purpose (from the visible statements): annotate the leaves of a gene tree
# with NCBI taxonomy ids, write the result as phyloXML, and run the forester
# "gsdi" Java application on it, gzipping the reconciled output if produced.
#
# NOTE(review): the function body below has been flattened -- newlines and
# indentation are gone, and the inline '#' comment now swallows the rest of
# its physical line. Which statements belong to which if/else/for block is
# therefore a guess; the code is kept byte-for-byte until the original
# layout is restored.
#
# Inputs used:
#   gene_tree_file  - prefix for a temporary gunzipped newick file
#                     ('%s.%d' with a random 7-digit suffix).
#   reconciled_file - output path passed to gsdi; later gzipped.
#   rec_tag         - prefix for 'ids.pickle', 'txid.xml' and the gsdi side
#                     files that get removed at the end.
#   pfam_id         - used in messages, in '%s/%s.nw.gz' and as a key into
#                     pplacer_queries.
#   db              - forwarded to find_tax_id_unip / find_tax_name /
#                     find_best_taxid.
#
# Also depends on names not assigned in this block: pplacer_flag,
# pplacer_queries, tree_folder, all_species_txids, merged_taxid,
# best_taxid_map (mutated via item assignment), lib_path,
# species_tree_data_path.
#
# NOTE(review): candidates for a later cleanup once nesting is confirmed --
# unused locals 'Sequnces' and 'p_ids'; open(...) handles that are never
# closed; 'rm'/'gzip'/'gunzip' invoked through the shell with formatted
# strings; pickle.load on a file from disk (only safe if that file is
# trusted); Python 2 print statements.
def reconcile_tree(gene_tree_file,reconciled_file,rec_tag,pfam_id,db): if (os.path.isfile(rec_tag+'ids.pickle')) and (pplacer_flag==1): id_information = pickle.load(open(rec_tag+'ids.pickle', 'rb')) existing_genes=id_information['existing_genes'] Sequnces=[] p_ids=[] new_genes=set([w['id'] for w in pplacer_queries[pfam_id]]) if not (new_genes-set(existing_genes)): print "All %s Genes for family %s have already been placed in the reconciled tree."%(len(new_genes),pfam_id) print "Skip Reconciliation for %s"%pfam_id return txid_file=rec_tag+'txid.xml' if not(os.path.isfile(rec_tag+'ids.pickle')) or not(os.path.isfile(reconciled_file+'.gz')) or (pplacer_flag==1): print "Running Reconciliation for: %s"%pfam_id rand_id=random.randint(1000000,9999999) subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d"%(tree_folder,pfam_id,gene_tree_file,rand_id),shell=True) tree = ete2.PhyloTree('%s.%d'%(gene_tree_file,rand_id), format=0) tree.resolve_polytomy() tree.write(format=0, outfile=txid_file+'.tmp.nw') if os.path.exists('%s.%d'%(gene_tree_file,rand_id)): subprocess.check_call("rm %s.%d"%(gene_tree_file,rand_id),shell=True) Phylo.convert(txid_file+'.tmp.nw', 'newick', txid_file+'.tmp.xml', 'phyloxml') treexml = PhyloXMLIO.read(open(txid_file+'.tmp.xml','r')) tree = treexml[0] treexml.attributes.pop('schemaLocation', None) # not supported by Forester tree.rooted = True my_ids=set([]) my_query_by_taxid={} for leaf in tree.clade.find_clades(terminal=True): up_name = leaf.name.split('/')[0] tax_id,tax_name=find_tax_id_unip(up_name,db) if tax_id not in all_species_txids: if tax_id in merged_taxid.keys(): tax_id=merged_taxid[tax_id] tax_name=find_tax_name(tax_id,db) if tax_id in best_taxid_map.keys(): tax_id=best_taxid_map[tax_id] tax_name=find_tax_name(tax_id,db) else: tax_id0=tax_id tax_id,tax_name=find_best_taxid(tax_id,db) if tax_id>0: best_taxid_map[tax_id0]=tax_id if tax_id<0: if (-tax_id) in merged_taxid.keys(): tax_id=merged_taxid[-tax_id] tax_name=find_tax_name(tax_id,db) if 
tax_id in my_query_by_taxid: my_query_by_taxid[tax_id].append(up_name) else: my_query_by_taxid[tax_id]=[up_name] my_ids.add(tax_id) my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy') taxon=PhyloXML.Taxonomy(id=my_tax_id) taxon.scientific_name = tax_name leaf._set_taxonomy(taxon) PhyloXMLIO.write(treexml, open(txid_file,'w')) os.system('rm '+txid_file+'.tmp.nw') os.system('rm '+txid_file+'.tmp.xml') print "Taxid file done for: %s"%pfam_id existing_ids=list(set(my_ids)&set(all_species_txids)) existing_genes=[g for txid in my_query_by_taxid.keys() for g in my_query_by_taxid[txid] if txid in existing_ids] pickle.dump({'pfam_id':pfam_id,'existing_ids':existing_ids,'existing_genes':existing_genes}, open(rec_tag+'ids.pickle', 'wb')) print "Pickle file done for: %s"%pfam_id if os.path.exists(reconciled_file): os.system('rm '+reconciled_file) os.system("java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"%(lib_path, txid_file, species_tree_data_path, reconciled_file)) if os.path.exists(reconciled_file): if os.path.exists(reconciled_file+'.gz'): subprocess.check_call("rm %s.gz"%(reconciled_file),shell=True) subprocess.check_call("gzip %s"%(reconciled_file),shell=True) os.system('rm '+rec_tag+'reconciled_species_tree_used.xml') os.system('rm '+rec_tag+'reconciled_gsdi_log.txt') os.system('rm '+txid_file) print "Reconciliation file done for: %s"%pfam_id
print(aln)

# The distance matrix is read back from a pickle cache instead of being
# recomputed on every run. The commented-out recipe shows how the cache
# can be regenerated from the alignment:
#     calculator = DistanceCalculator('identity')
#     dm = calculator.get_distance(aln)
#     with open("data/project/dist_mat.p", "wb") as f:
#         pickle.dump(dm, f)
with open("data/project/dist_mat.p", "rb") as f:
    dm = pickle.load(f)

# Build a UPGMA tree from the distances and save it as phyloXML.
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
tree_xml = tree.as_phyloxml()
PhyloXMLIO.write(tree_xml, "data/project/tree1.xml")


def labels(c):
    """Return the drawing label for clade ``c``.

    Terminal clades are labelled with the first two whitespace-separated
    words of their name; every other clade gets an empty label.
    """
    if c.is_terminal():
        return " ".join(c.name.split()[:2])
    return ""


# Render the tree to an SVG file without opening an interactive window,
# then print an ASCII rendering to stdout.
fig = plt.figure(figsize=(25, 25), dpi=100)
axes = fig.add_subplot(1, 1, 1)
Phylo.draw(tree, axes=axes, do_show=False, label_func=labels)
plt.savefig("plots/tree1.svg", format="svg")
Phylo.draw_ascii(tree)