示例#1
0
 def _parse_clade(self, parent):
     """Parse a Clade node and its children, recursively (PRIVATE).

     A ``PX.Clade`` is built from the attributes of ``parent``; then
     ``(event, elem)`` pairs are consumed from ``self.context`` until the
     "end" event of a ``clade`` tag is seen.

     Nested ``clade``, ``taxonomy`` and ``sequence`` children are delegated
     to their dedicated ``_parse_*`` methods as soon as their "start" event
     arrives. Every other tracked child is handled on its "end" event.

     :param parent: element whose attributes initialise the clade.
     :returns: the populated ``PX.Clade``.
     :raises PhyloXMLError: if ``branch_length`` is supplied both as an
         attribute and as a child element, or if a tracked tag in the
         phyloXML namespace has no handler.
     """
     clade = PX.Clade(**parent.attrib)
     if clade.branch_length is not None:
         clade.branch_length = float(clade.branch_length)
     # NB: Only evaluate nodes at the current level
     tag_stack = []
     for event, elem in self.context:
         namespace, tag = _split_namespace(elem.tag)
         if event == "start":
             if tag == "clade":
                 clade.clades.append(self._parse_clade(elem))
                 continue
             if tag == "taxonomy":
                 clade.taxonomies.append(self._parse_taxonomy(elem))
                 continue
             if tag == "sequence":
                 clade.sequences.append(self._parse_sequence(elem))
                 continue
             if tag in self._clade_tracked_tags:
                 tag_stack.append(tag)
         elif event == "end":
             if tag == "clade":
                 elem.clear()
                 break
             # Skip end events that do not close the innermost tracked tag.
             # The emptiness check matters: an untracked child seen while
             # nothing is being tracked would otherwise raise IndexError.
             if not tag_stack or tag != tag_stack[-1]:
                 continue
             tag_stack.pop()
             # Handle the other non-recursive children
             if tag in self._clade_list_types:
                 getattr(clade, self._clade_list_types[tag]).append(
                     getattr(self, tag)(elem)
                 )
             elif tag in self._clade_complex_types:
                 setattr(clade, tag, getattr(self, tag)(elem))
             elif tag == "branch_length":
                 # NB: possible collision with the attribute
                 if clade.branch_length is not None:
                     raise PhyloXMLError(
                         "Attribute branch_length was already set for this Clade."
                     )
                 clade.branch_length = _float(elem.text)
             elif tag == "width":
                 clade.width = _float(elem.text)
             elif tag == "name":
                 clade.name = _collapse_wspace(elem.text)
             elif tag == "node_id":
                 clade.node_id = PX.Id(
                     elem.text.strip(), elem.attrib.get("provider")
                 )
             elif namespace != NAMESPACES["phy"]:
                 clade.other.append(self.other(elem, namespace, tag))
                 elem.clear()
             else:
                 raise PhyloXMLError("Misidentified tag: " + tag)
     return clade
def prepare_species_tree(FILE_TREE_IN, FILE_TREE_OUT):
    """Annotate the named nodes of a phyloXML tree with taxonomy records.

    The tree is read from ``FILE_TREE_IN``, marked as rooted, and every
    clade that has a name gets a ``PhyloXML.Taxonomy`` whose id is that name
    (with a leading ``'INT'`` stripped) under the ``'ncbi_taxonomy'``
    provider. The scientific name comes from ``find_tax_name``; when that
    lookup raises ``KeyError`` the placeholder ``'(NA)'`` is used. The node
    name is then cleared and the result is written to ``FILE_TREE_OUT``.

    :param FILE_TREE_IN: path of the phyloXML file to read.
    :param FILE_TREE_OUT: destination handed to ``PhyloXMLIO.write``.
    """
    # Use a context manager so the input handle is closed even if parsing
    # fails (the original left it open for the garbage collector).
    with open(FILE_TREE_IN, 'r') as tree_handle:
        treexml = PhyloXMLIO.read(tree_handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            # Unnamed nodes carry no taxon identifier; leave them untouched.
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(
            id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
示例#3
0
    def _parse_clade(self, parent):
        """Parse a Clade node and its children, recursively.

        A ``BPrecPhyloXML.Clade`` is built from the attributes of
        ``parent``; then ``(event, elem)`` pairs are consumed from
        ``self.context`` until the "end" event of a ``clade`` tag is seen.

        Nested ``clade``, ``taxonomy``, ``sequence`` and ``EVENTSRECTAG``
        children are delegated to their dedicated ``_parse_*`` methods as
        soon as their "start" event arrives. Every other tracked child is
        handled on its "end" event.

        :param parent: element whose attributes initialise the clade.
        :returns: the populated clade object.
        :raises PhyloXMLIO.PhyloXMLError: if ``branch_length`` is supplied
            both as an attribute and as a child element, or if a tracked
            tag in the phyloXML namespace has no handler.
        """
        clade = BPrecPhyloXML.Clade(**parent.attrib)
        if clade.branch_length is not None:
            clade.branch_length = float(clade.branch_length)
        # NB: Only evaluate nodes at the current level
        tag_stack = []
        for event, elem in self.context:
            namespace, tag = PhyloXMLIO._split_namespace(elem.tag)
            if event == 'start':
                if tag == 'clade':
                    clade.clades.append(self._parse_clade(elem))
                    continue
                if tag == 'taxonomy':
                    clade.taxonomies.append(self._parse_taxonomy(elem))
                    continue
                if tag == 'sequence':
                    clade.sequences.append(self._parse_sequence(elem))
                    continue
                if tag == EVENTSRECTAG:  # list of reconciliation events
                    clade.eventsRec = self._parse_eventsRec(elem)
                    continue
                if tag in self._clade_tracked_tags:
                    tag_stack.append(tag)
            elif event == 'end':
                if tag == 'clade':
                    elem.clear()
                    break
                # Skip end events that do not close the innermost tracked
                # tag. The emptiness check matters: an untracked child seen
                # while nothing is being tracked would otherwise raise
                # IndexError.
                if not tag_stack or tag != tag_stack[-1]:
                    continue
                tag_stack.pop()
                # Handle the other non-recursive children
                if tag in self._clade_list_types:
                    getattr(clade, self._clade_list_types[tag]).append(
                        getattr(self, tag)(elem))
                elif tag in self._clade_complex_types:
                    setattr(clade, tag, getattr(self, tag)(elem))
                elif tag == 'branch_length':
                    # NB: possible collision with the attribute
                    if clade.branch_length is not None:
                        raise PhyloXMLIO.PhyloXMLError(
                            'Attribute branch_length was already set '
                            'for this Clade.')
                    clade.branch_length = PhyloXMLIO._float(elem.text)
                elif tag == 'width':
                    clade.width = PhyloXMLIO._float(elem.text)
                elif tag == 'name':
                    clade.name = PhyloXMLIO._collapse_wspace(elem.text)
                elif tag == 'node_id':
                    clade.node_id = PX.Id(elem.text.strip(),
                                          elem.attrib.get('provider'))
                elif namespace != PhyloXMLIO.NAMESPACES['phy']:
                    clade.other.append(self.other(elem, namespace, tag))
                    elem.clear()
                elif tag in self._clade_recPhyloXML_list_type:
                    # recPhyloXML list tags are deliberately ignored here
                    # rather than reported as misidentified.
                    continue
                else:
                    raise PhyloXMLIO.PhyloXMLError('Misidentified tag: ' + tag)
        return clade
示例#4
0
 def id(self, elem):
     """Build a ``PX.Id`` from an element's text and provider.

     The provider is read from the ``provider`` attribute, falling back to
     the ``type`` attribute when the former is missing or empty.
     """
     provider = elem.get('provider')
     if not provider:
         provider = elem.get('type')
     identifier = elem.text.strip()
     return PX.Id(identifier, provider)
示例#5
0
 def id(self, elem):
     """Create identifier object.

     The element's stripped text becomes the identifier value; the provider
     is the ``provider`` attribute or, when that is missing or empty, the
     ``type`` attribute.
     """
     source = elem.get("provider")
     if not source:
         source = elem.get("type")
     value = elem.text.strip()
     return PX.Id(value, source)
示例#6
0
def _remove_quietly(path):
    """Delete ``path`` if possible, ignoring failures such as a missing file."""
    try:
        os.remove(path)
    except OSError:
        pass


def reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db):
    """Reconcile the gene tree of one family against the species tree.

    Steps, as performed below:

    1. If ``<rec_tag>ids.pickle`` exists and ``pplacer_flag == 1``, return
       early when every gene queued in ``pplacer_queries[pfam_id]`` is
       already listed in the pickle's ``'existing_genes'``.
    2. If the pickle or ``<reconciled_file>.gz`` is missing, or
       ``pplacer_flag == 1``: unzip the family's Newick tree, resolve its
       polytomies, convert it to phyloXML, attach a taxonomy (taxon id and
       name) to every leaf, write the result to ``<rec_tag>txid.xml`` and
       refresh the pickle.
    3. Run Forester's ``gsdi`` on the taxid file, gzip the reconciled
       output and remove the intermediate files.

    Reads the module-level names ``pplacer_flag``, ``pplacer_queries``,
    ``tree_folder``, ``all_species_txids``, ``merged_taxid``, ``lib_path``
    and ``species_tree_data_path``; adds entries to ``best_taxid_map``.

    :param gene_tree_file: path prefix for the temporary unzipped tree.
    :param reconciled_file: path of the reconciled tree written by gsdi.
    :param rec_tag: path prefix for the per-family working files.
    :param pfam_id: family identifier (also the basename of the gzipped tree).
    :param db: handle passed through to the taxonomy lookup helpers.
    :returns: None.
    """
    ids_pickle = rec_tag + 'ids.pickle'
    if os.path.isfile(ids_pickle) and (pplacer_flag == 1):
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe as long as the pickle is one this pipeline wrote itself.
        with open(ids_pickle, 'rb') as pickle_in:
            id_information = pickle.load(pickle_in)
        existing_genes = id_information['existing_genes']
        new_genes = set([w['id'] for w in pplacer_queries[pfam_id]])
        if not (new_genes - set(existing_genes)):
            print("All %s Genes for family %s have already been placed in the reconciled tree." % (
                len(new_genes), pfam_id))
            print("Skip Reconciliation for %s" % pfam_id)
            return

    txid_file = rec_tag + 'txid.xml'
    if not (os.path.isfile(ids_pickle)) or not (
            os.path.isfile(reconciled_file + '.gz')) or (pplacer_flag == 1):
        print("Running Reconciliation for: %s" % pfam_id)

        # A random suffix keeps concurrent runs from sharing the temp tree.
        rand_id = random.randint(1000000, 9999999)
        unzipped_tree = '%s.%d' % (gene_tree_file, rand_id)
        subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d" %
                              (tree_folder, pfam_id, gene_tree_file, rand_id),
                              shell=True)
        tree = ete2.PhyloTree(unzipped_tree, format=0)
        tree.resolve_polytomy()
        tree.write(format=0, outfile=txid_file + '.tmp.nw')
        if os.path.exists(unzipped_tree):
            subprocess.check_call("rm  %s.%d" % (gene_tree_file, rand_id),
                                  shell=True)

        Phylo.convert(txid_file + '.tmp.nw', 'newick', txid_file + '.tmp.xml',
                      'phyloxml')
        with open(txid_file + '.tmp.xml', 'r') as xml_in:
            treexml = PhyloXMLIO.read(xml_in)
        tree = treexml[0]
        treexml.attributes.pop('schemaLocation',
                               None)  # not supported by Forester
        tree.rooted = True
        my_ids = set([])
        my_query_by_taxid = {}
        for leaf in tree.clade.find_clades(terminal=True):
            up_name = leaf.name.split('/')[0]
            tax_id, tax_name = find_tax_id_unip(up_name, db)
            if tax_id not in all_species_txids:
                if tax_id in merged_taxid:
                    tax_id = merged_taxid[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                if tax_id in best_taxid_map:
                    tax_id = best_taxid_map[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                else:
                    tax_id0 = tax_id
                    tax_id, tax_name = find_best_taxid(tax_id, db)
                    if tax_id > 0:
                        # Remember the mapping for later leaves and calls.
                        best_taxid_map[tax_id0] = tax_id
            if tax_id < 0:
                if (-tax_id) in merged_taxid:
                    tax_id = merged_taxid[-tax_id]
                    tax_name = find_tax_name(tax_id, db)
            my_query_by_taxid.setdefault(tax_id, []).append(up_name)
            my_ids.add(tax_id)
            my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy')
            taxon = PhyloXML.Taxonomy(id=my_tax_id)
            taxon.scientific_name = tax_name
            leaf._set_taxonomy(taxon)
        # Close the handle explicitly so the file is fully flushed before
        # the external Java process reads it.
        with open(txid_file, 'w') as xml_out:
            PhyloXMLIO.write(treexml, xml_out)
        _remove_quietly(txid_file + '.tmp.nw')
        _remove_quietly(txid_file + '.tmp.xml')
        print("Taxid file done for: %s" % pfam_id)
        existing_id_set = my_ids & set(all_species_txids)
        existing_ids = list(existing_id_set)
        existing_genes = [
            g for txid in my_query_by_taxid
            for g in my_query_by_taxid[txid] if txid in existing_id_set
        ]
        with open(ids_pickle, 'wb') as pickle_out:
            pickle.dump(
                {
                    'pfam_id': pfam_id,
                    'existing_ids': existing_ids,
                    'existing_genes': existing_genes
                }, pickle_out)
        print("Pickle file done for: %s" % pfam_id)

    # Drop any stale output so the existence check below reflects this run.
    _remove_quietly(reconciled_file)
    os.system(
        "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"
        % (lib_path, txid_file, species_tree_data_path, reconciled_file))
    if os.path.exists(reconciled_file):
        if os.path.exists(reconciled_file + '.gz'):
            subprocess.check_call("rm  %s.gz" % (reconciled_file), shell=True)
        subprocess.check_call("gzip %s" % (reconciled_file), shell=True)
    _remove_quietly(rec_tag + 'reconciled_species_tree_used.xml')
    _remove_quietly(rec_tag + 'reconciled_gsdi_log.txt')
    _remove_quietly(txid_file)
    print("Reconciliation file done for: %s" % pfam_id)