Пример #1
0
    def _stash_rewrite_and_call(self, fname, test_cases):
        """Rewrite a phyloXML file in place, test the result, restore the original.

        Steps, in order:

        1. Parse ``fname`` with ``PhyloXMLIO.read``.
        2. Move the source file aside to ``fname + "~"`` (a stale backup
           with that name is deleted first).
        3. Write the parsed object back to ``fname``.
        4. For every ``(cls, tests)`` pair in ``test_cases``, build
           ``cls("setUp")`` and call each method named in ``tests`` on it.
        5. Whatever happened in steps 3-4, delete the rewritten file (if it
           exists) and rename the backup back to ``fname``.

        Python 2.4 support: this would make more sense as a context manager
        that simply handles the renaming and the final restore.
        """
        phx = PhyloXMLIO.read(fname)
        backup = fname + "~"

        # Make room for the backup, then stash the original under that name.
        if os.path.exists(backup):
            os.remove(backup)
        os.rename(fname, backup)

        try:
            PhyloXMLIO.write(phx, fname)
            for case_class, method_names in test_cases:
                # "setUp" is only used as the method name handed to the
                # constructor; the test methods are invoked by hand below.
                case = case_class("setUp")
                for method_name in method_names:
                    run_test = getattr(case, method_name)
                    run_test()
        finally:
            # XXX not safe! (original author's warning; presumably because
            # the remove + rename pair is not atomic -- confirm)
            if os.path.exists(fname):
                os.remove(fname)
            os.rename(backup, fname)
Пример #2
0
 def _rewrite_and_call(self, orig_fname, test_cases):
     """Parse, rewrite and retest a phyloXML example file.

     The file at ``orig_fname`` is parsed, the parsed object is written to
     the module-level ``DUMMY`` path, and then every test named in
     ``test_cases`` is run.

     ``test_cases`` is an iterable of ``(cls, tests)`` pairs; ``cls`` is
     instantiated as ``cls("setUp")`` and each name in ``tests`` is looked up
     on that instance and called without arguments.

     Both files are opened in binary mode and are closed even when parsing
     or writing raises, so a failing example cannot leak file handles.
     """
     with open(orig_fname, "rb") as infile:
         phx = PhyloXMLIO.read(infile)
     with open(DUMMY, "w+b") as outfile:
         PhyloXMLIO.write(phx, outfile)
     for cls, tests in test_cases:
         # "setUp" is just the method name passed to the constructor; the
         # actual test methods are invoked explicitly below.
         inst = cls("setUp")
         for test in tests:
             getattr(inst, test)()
Пример #3
0
 def _rewrite_and_call(self, orig_fname, test_cases):
     """Parse, rewrite and retest a phyloXML example file.

     The file at ``orig_fname`` is parsed, the parsed object is written to
     the module-level ``DUMMY`` path, and then every test named in
     ``test_cases`` is run.

     ``test_cases`` is an iterable of ``(cls, tests)`` pairs; ``cls`` is
     instantiated as ``cls('setUp')`` and each name in ``tests`` is looked up
     on that instance and called without arguments.

     Both files are closed even when parsing or writing raises, so a
     failing example cannot leak file handles.
     """
     # NOTE(review): text modes are kept exactly as in the original; confirm
     # they are what PhyloXMLIO expects on the Python version in use.
     with open(orig_fname, 'r') as infile:
         phx = PhyloXMLIO.read(infile)
     with open(DUMMY, 'w') as outfile:
         PhyloXMLIO.write(phx, outfile)
     for cls, tests in test_cases:
         # 'setUp' is just the method name passed to the constructor; the
         # actual test methods are invoked explicitly below.
         inst = cls('setUp')
         for test in tests:
             getattr(inst, test)()
def prepare_species_tree(FILE_TREE_IN, FILE_TREE_OUT):
    """Turn clade names of a phyloXML species tree into taxonomy annotations.

    Reads the phyloXML document at ``FILE_TREE_IN``, marks its first tree as
    rooted and drops the document's ``schemaLocation`` attribute. Every clade
    that has a name is then rewritten: the name (minus a leading ``'INT'``
    prefix, if present) becomes the id of a new ``PhyloXML.Taxonomy`` with
    provider ``'ncbi_taxonomy'``, the scientific name is looked up with
    ``find_tax_name`` (``'(NA)'`` when that raises ``KeyError``), and the
    clade's own name is cleared. Unnamed clades are left untouched.

    The modified document is written to ``FILE_TREE_OUT``.

    Args:
        FILE_TREE_IN: Path of the phyloXML file to read.
        FILE_TREE_OUT: Destination handed to ``PhyloXMLIO.write``.
    """
    # Close the input handle as soon as the document has been parsed.
    with open(FILE_TREE_IN, 'r') as handle:
        treexml = PhyloXMLIO.read(handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(
            id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        # The identifier now lives in the taxonomy element, not the name.
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
Пример #5
0
def prepare_species_tree(FILE_TREE_IN, FILE_TREE_OUT):
    """Turn clade names of a phyloXML species tree into taxonomy annotations.

    Reads the phyloXML document at ``FILE_TREE_IN``, marks its first tree as
    rooted and drops the document's ``schemaLocation`` attribute. Every clade
    that has a name is then rewritten: the name (minus a leading ``'INT'``
    prefix, if present) becomes the id of a new ``PhyloXML.Taxonomy`` with
    provider ``'ncbi_taxonomy'``, the scientific name is looked up with
    ``find_tax_name`` (``'(NA)'`` when that raises ``KeyError``), and the
    clade's own name is cleared. Unnamed clades are left untouched.

    The modified document is written to ``FILE_TREE_OUT``.

    Args:
        FILE_TREE_IN: Path of the phyloXML file to read.
        FILE_TREE_OUT: Destination handed to ``PhyloXMLIO.write``.
    """
    # Close the input handle as soon as the document has been parsed.
    with open(FILE_TREE_IN, 'r') as handle:
        treexml = PhyloXMLIO.read(handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(
            id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        # The identifier now lives in the taxonomy element, not the name.
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
Пример #6
0
def reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db):
    """Annotate a family's gene tree with taxonomy ids and run Forester GSDI on it.

    Written for Python 2 (uses ``print`` statements).

    Besides its arguments the function reads these module-level names:
    ``pplacer_flag``, ``pplacer_queries``, ``tree_folder``,
    ``all_species_txids``, ``merged_taxid``, ``best_taxid_map`` (also
    updated here), ``lib_path`` and ``species_tree_data_path``, and calls
    the helpers ``find_tax_id_unip``, ``find_tax_name`` and
    ``find_best_taxid``.

    Args:
        gene_tree_file: Path prefix for the temporary, decompressed Newick
            tree; a random numeric suffix is appended.
        reconciled_file: Path the GSDI output is written to; if produced it
            is gzipped, yielding ``reconciled_file + '.gz'``.
        rec_tag: String prefix for the per-family working files
            (``ids.pickle``, ``txid.xml`` and the GSDI by-products).
        pfam_id: Family identifier; selects ``<tree_folder>/<pfam_id>.nw.gz``
            and the entry of ``pplacer_queries``.
        db: Passed through unchanged to the taxonomy lookup helpers.

    Returns:
        None. Returns early, without running GSDI, when ``pplacer_flag`` is
        1 and every pplacer query gene is already listed in the pickle.
    """
    # Fast exit: with pplacer enabled and a previous id pickle on disk, skip
    # the family if none of its query genes is new.
    if (os.path.isfile(rec_tag + 'ids.pickle')) and (pplacer_flag == 1):
        id_information = pickle.load(open(rec_tag + 'ids.pickle', 'rb'))
        existing_genes = id_information['existing_genes']
        # NOTE(review): Sequnces and p_ids are never read in this function.
        Sequnces = []
        p_ids = []
        new_genes = set([w['id'] for w in pplacer_queries[pfam_id]])
        if not (new_genes - set(existing_genes)):
            print "All %s Genes for family %s have already been placed in the reconciled tree." % (
                len(new_genes), pfam_id)
            print "Skip Reconciliation for %s" % pfam_id
            return

    txid_file = rec_tag + 'txid.xml'
    # (Re)build the taxonomy-annotated tree when the pickle or the gzipped
    # reconciliation result is missing, or whenever pplacer is enabled.
    if not (os.path.isfile(rec_tag + 'ids.pickle')) or not (
            os.path.isfile(reconciled_file + '.gz')) or (pplacer_flag == 1):
        print "Running Reconciliation for: %s" % pfam_id

        # Decompress the gene tree to a randomly suffixed temporary file.
        # NOTE(review): shell commands in this function are built by string
        # interpolation; paths containing spaces or shell metacharacters
        # would break them.
        rand_id = random.randint(1000000, 9999999)
        subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d" %
                              (tree_folder, pfam_id, gene_tree_file, rand_id),
                              shell=True)
        # Resolve polytomies with ete2 and write the result as Newick.
        tree = ete2.PhyloTree('%s.%d' % (gene_tree_file, rand_id), format=0)
        tree.resolve_polytomy()
        tree.write(format=0, outfile=txid_file + '.tmp.nw')
        if os.path.exists('%s.%d' % (gene_tree_file, rand_id)):
            subprocess.check_call("rm  %s.%d" % (gene_tree_file, rand_id),
                                  shell=True)

        # Convert Newick -> phyloXML and load the first tree of the document.
        Phylo.convert(txid_file + '.tmp.nw', 'newick', txid_file + '.tmp.xml',
                      'phyloxml')
        treexml = PhyloXMLIO.read(open(txid_file + '.tmp.xml', 'r'))
        tree = treexml[0]
        treexml.attributes.pop('schemaLocation',
                               None)  # not supported by Forester
        tree.rooted = True
        my_ids = set([])  # every taxonomy id assigned to a leaf
        my_query_by_taxid = {}  # taxonomy id -> list of leaf identifiers
        for leaf in tree.clade.find_clades(terminal=True):
            # The leaf identifier is the part of the name before the first '/'.
            up_name = leaf.name.split('/')[0]
            tax_id, tax_name = find_tax_id_unip(up_name, db)
            if tax_id not in all_species_txids:
                # Id unknown to the species tree: first follow merged ids,
                # then use the cached best match or compute (and cache) one.
                if tax_id in merged_taxid.keys():
                    tax_id = merged_taxid[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                if tax_id in best_taxid_map.keys():
                    tax_id = best_taxid_map[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                else:
                    tax_id0 = tax_id
                    tax_id, tax_name = find_best_taxid(tax_id, db)
                    # Only positive results are cached.
                    if tax_id > 0:
                        best_taxid_map[tax_id0] = tax_id
            # NOTE(review): a negative id presumably marks a failed lookup
            # whose absolute value may still be a merged id -- confirm.
            if tax_id < 0:
                if (-tax_id) in merged_taxid.keys():
                    tax_id = merged_taxid[-tax_id]
                    tax_name = find_tax_name(tax_id, db)
            if tax_id in my_query_by_taxid:
                my_query_by_taxid[tax_id].append(up_name)
            else:
                my_query_by_taxid[tax_id] = [up_name]
            my_ids.add(tax_id)
            # Attach the taxonomy (id + scientific name) to the leaf.
            my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy')
            taxon = PhyloXML.Taxonomy(id=my_tax_id)
            taxon.scientific_name = tax_name
            leaf._set_taxonomy(taxon)
        # NOTE(review): the handles opened inline here and for the pickle
        # below are never closed explicitly.
        PhyloXMLIO.write(treexml, open(txid_file, 'w'))
        os.system('rm ' + txid_file + '.tmp.nw')
        os.system('rm ' + txid_file + '.tmp.xml')
        print "Taxid file done for: %s" % pfam_id
        # Record which ids are in the species tree and which genes map to them.
        existing_ids = list(set(my_ids) & set(all_species_txids))
        existing_genes = [
            g for txid in my_query_by_taxid.keys()
            for g in my_query_by_taxid[txid] if txid in existing_ids
        ]
        pickle.dump(
            {
                'pfam_id': pfam_id,
                'existing_ids': existing_ids,
                'existing_genes': existing_genes
            }, open(rec_tag + 'ids.pickle', 'wb'))
        print "Pickle file done for: %s" % pfam_id

    # Run Forester GSDI on the annotated gene tree against the species tree.
    # NOTE(review): when the rebuild branch above is skipped, txid_file is
    # not (re)created in this call -- confirm it exists in that case.
    if os.path.exists(reconciled_file):
        os.system('rm ' + reconciled_file)
    os.system(
        "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"
        % (lib_path, txid_file, species_tree_data_path, reconciled_file))
    # Compress the result, replacing any previous archive.
    if os.path.exists(reconciled_file):
        if os.path.exists(reconciled_file + '.gz'):
            subprocess.check_call("rm  %s.gz" % (reconciled_file), shell=True)
        subprocess.check_call("gzip %s" % (reconciled_file), shell=True)
    # Remove GSDI by-products and the intermediate taxonomy-annotated tree.
    os.system('rm ' + rec_tag + 'reconciled_species_tree_used.xml')
    os.system('rm ' + rec_tag + 'reconciled_gsdi_log.txt')
    os.system('rm ' + txid_file)
    print "Reconciliation file done for: %s" % pfam_id
def reconcile_tree(gene_tree_file,reconciled_file,rec_tag,pfam_id,db):
    """Annotate a family's gene tree with taxonomy ids and run Forester GSDI on it.

    Written for Python 2 (uses ``print`` statements).

    Besides its arguments the function reads these module-level names:
    ``pplacer_flag``, ``pplacer_queries``, ``tree_folder``,
    ``all_species_txids``, ``merged_taxid``, ``best_taxid_map`` (also
    updated here), ``lib_path`` and ``species_tree_data_path``, and calls
    the helpers ``find_tax_id_unip``, ``find_tax_name`` and
    ``find_best_taxid``.

    Args:
        gene_tree_file: Path prefix for the temporary, decompressed Newick
            tree; a random numeric suffix is appended.
        reconciled_file: Path the GSDI output is written to; if produced it
            is gzipped, yielding ``reconciled_file + '.gz'``.
        rec_tag: String prefix for the per-family working files
            (``ids.pickle``, ``txid.xml`` and the GSDI by-products).
        pfam_id: Family identifier; selects ``<tree_folder>/<pfam_id>.nw.gz``
            and the entry of ``pplacer_queries``.
        db: Passed through unchanged to the taxonomy lookup helpers.

    Returns:
        None. Returns early, without running GSDI, when ``pplacer_flag`` is
        1 and every pplacer query gene is already listed in the pickle.
    """
    # Fast exit: with pplacer enabled and a previous id pickle on disk, skip
    # the family if none of its query genes is new.
    if (os.path.isfile(rec_tag+'ids.pickle')) and  (pplacer_flag==1): 
        id_information = pickle.load(open(rec_tag+'ids.pickle', 'rb'))      
        existing_genes=id_information['existing_genes']
        # NOTE(review): Sequnces and p_ids are never read in this function.
        Sequnces=[]
        p_ids=[]
        new_genes=set([w['id'] for w in pplacer_queries[pfam_id]])
        if not (new_genes-set(existing_genes)):
            print "All %s Genes for family %s have already been placed in the reconciled tree."%(len(new_genes),pfam_id)
            print "Skip Reconciliation for %s"%pfam_id
            return

    txid_file=rec_tag+'txid.xml'       
    # (Re)build the taxonomy-annotated tree when the pickle or the gzipped
    # reconciliation result is missing, or whenever pplacer is enabled.
    if not(os.path.isfile(rec_tag+'ids.pickle')) or not(os.path.isfile(reconciled_file+'.gz')) or  (pplacer_flag==1): 
        print "Running Reconciliation for: %s"%pfam_id
        
        # Decompress the gene tree to a randomly suffixed temporary file.
        # NOTE(review): shell commands in this function are built by string
        # interpolation; paths containing spaces or shell metacharacters
        # would break them.
        rand_id=random.randint(1000000,9999999)        
        subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d"%(tree_folder,pfam_id,gene_tree_file,rand_id),shell=True)
        # Resolve polytomies with ete2 and write the result as Newick.
        tree = ete2.PhyloTree('%s.%d'%(gene_tree_file,rand_id), format=0)
        tree.resolve_polytomy()
        tree.write(format=0, outfile=txid_file+'.tmp.nw')
        if os.path.exists('%s.%d'%(gene_tree_file,rand_id)):
            subprocess.check_call("rm  %s.%d"%(gene_tree_file,rand_id),shell=True)

        # Convert Newick -> phyloXML and load the first tree of the document.
        Phylo.convert(txid_file+'.tmp.nw', 'newick', txid_file+'.tmp.xml', 'phyloxml')
        treexml = PhyloXMLIO.read(open(txid_file+'.tmp.xml','r'))
        tree = treexml[0]
        treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
        tree.rooted = True
        my_ids=set([])  # every taxonomy id assigned to a leaf
        my_query_by_taxid={}  # taxonomy id -> list of leaf identifiers
        for leaf in tree.clade.find_clades(terminal=True):
            # The leaf identifier is the part of the name before the first '/'.
            up_name = leaf.name.split('/')[0]
            tax_id,tax_name=find_tax_id_unip(up_name,db)
            if tax_id not in all_species_txids:
                # Id unknown to the species tree: first follow merged ids,
                # then use the cached best match or compute (and cache) one.
                if tax_id in merged_taxid.keys():
                    tax_id=merged_taxid[tax_id]
                    tax_name=find_tax_name(tax_id,db)
                if tax_id in best_taxid_map.keys():
                    tax_id=best_taxid_map[tax_id]
                    tax_name=find_tax_name(tax_id,db)
                else:
                    tax_id0=tax_id
                    tax_id,tax_name=find_best_taxid(tax_id,db)
                    # Only positive results are cached.
                    if tax_id>0:
                        best_taxid_map[tax_id0]=tax_id
            # NOTE(review): a negative id presumably marks a failed lookup
            # whose absolute value may still be a merged id -- confirm.
            if tax_id<0:
                if (-tax_id) in merged_taxid.keys():
                    tax_id=merged_taxid[-tax_id]
                    tax_name=find_tax_name(tax_id,db)
            if tax_id in my_query_by_taxid:
               my_query_by_taxid[tax_id].append(up_name)
            else:
               my_query_by_taxid[tax_id]=[up_name]
            my_ids.add(tax_id)
            # Attach the taxonomy (id + scientific name) to the leaf.
            my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy')
            taxon=PhyloXML.Taxonomy(id=my_tax_id)
            taxon.scientific_name = tax_name
            leaf._set_taxonomy(taxon)
        # NOTE(review): the handles opened inline here and for the pickle
        # below are never closed explicitly.
        PhyloXMLIO.write(treexml, open(txid_file,'w'))    
        os.system('rm '+txid_file+'.tmp.nw')
        os.system('rm '+txid_file+'.tmp.xml')
        print "Taxid file done for: %s"%pfam_id
        # Record which ids are in the species tree and which genes map to them.
        existing_ids=list(set(my_ids)&set(all_species_txids))
        existing_genes=[g for txid in my_query_by_taxid.keys() for g in my_query_by_taxid[txid] if txid in existing_ids]        
        pickle.dump({'pfam_id':pfam_id,'existing_ids':existing_ids,'existing_genes':existing_genes}, open(rec_tag+'ids.pickle', 'wb'))      
        print "Pickle file done for: %s"%pfam_id
        
       
    # Run Forester GSDI on the annotated gene tree against the species tree.
    # NOTE(review): when the rebuild branch above is skipped, txid_file is
    # not (re)created in this call -- confirm it exists in that case.
    if os.path.exists(reconciled_file):
        os.system('rm '+reconciled_file)
    os.system("java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"%(lib_path, txid_file, species_tree_data_path, reconciled_file))
    # Compress the result, replacing any previous archive.
    if os.path.exists(reconciled_file):
        if os.path.exists(reconciled_file+'.gz'):
            subprocess.check_call("rm  %s.gz"%(reconciled_file),shell=True)
        subprocess.check_call("gzip %s"%(reconciled_file),shell=True)
    # Remove GSDI by-products and the intermediate taxonomy-annotated tree.
    os.system('rm '+rec_tag+'reconciled_species_tree_used.xml')
    os.system('rm '+rec_tag+'reconciled_gsdi_log.txt')
    os.system('rm '+txid_file)
    print "Reconciliation file done for: %s"%pfam_id
Пример #8
0
# Show the alignment; `aln` is expected to be defined earlier in the script.
print(aln)

# Distance-matrix computation, currently disabled. It builds the matrix with
# the 'identity' model and pickles it to the same path that is loaded below
# (presumably run once to create that cache -- confirm before deleting).
# calculator = DistanceCalculator('identity')
# dm = calculator.get_distance(aln)
#
# with open("data/project/dist_mat.p", "wb") as f:
#     pickle.dump(dm, f)

# Load the previously pickled distance matrix instead of recomputing it.
with open("data/project/dist_mat.p", "rb") as f:
    dm = pickle.load(f)
# Construct the phylogenetic tree using the UPGMA algorithm, then export it
# as phyloXML.
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
tree_xml = tree.as_phyloxml()
PhyloXMLIO.write(tree_xml, "data/project/tree1.xml")


def labels(c):
    """Return the text to draw next to clade *c*.

    Internal (non-terminal) clades get an empty label. Terminal clades are
    labelled with the first two whitespace-separated words of their name.
    A terminal clade without a name (``None`` or empty) also gets an empty
    label instead of raising ``AttributeError``.
    """
    if not c.is_terminal() or not c.name:
        return ""
    return " ".join(c.name.split()[:2])


# Draw the tree on a 25x25 inch figure at 100 dpi, labelling clades via
# `labels`.
fig = plt.figure(figsize=(25, 25), dpi=100)
axes = fig.add_subplot(1, 1, 1)
# do_show=False keeps Phylo.draw from displaying the figure so that it can
# be saved to disk below.
Phylo.draw(tree, axes=axes, do_show=False, label_func=labels)
plt.savefig("plots/tree1.svg", format="svg")
# Also print a plain-text rendering of the tree.
Phylo.draw_ascii(tree)