def to_orthoxml_str(self, pthr_version: str, database_version: str, organism_dat: str = None):
    """Render this collection's genes and groups as an OrthoXML string.

    The document is stamped with origin ``"PANTHER"``, format version 0.3 and
    ``pthr_version`` as the origin version. Each entry of
    ``self.genes.species`` becomes a species element holding one ``"UniProt"``
    database (versioned with ``database_version``) that lists that species'
    genes. Iterating ``self`` yields the groups: ``ParalogGroup`` instances
    are added as paralog groups, everything else as ortholog groups.

    Args:
        pthr_version: value stored as the document's origin version.
        database_version: version given to every database element.
        organism_dat: optional argument handed to
            ``OrganismDatFile.parse_organism_dat``; when falsy no lookup is
            loaded and every species is created with ``NCBITaxId=None``.

    Returns:
        The exported XML text after it has been passed through
        ``sanitize_xml_str``.
    """
    # Species-code -> taxon-id lookup; stays empty without an organism file.
    taxid_by_oscode = (
        OrganismDatFile.parse_organism_dat(organism_dat) if organism_dat else {}
    )

    document = orthoxml.orthoXML()
    document.set_version(0.3)
    document.set_origin("PANTHER")
    document.set_originVersion(pthr_version)

    # Species section: one species -> one database -> its genes.
    for species_code, members in self.genes.species.items():
        species_el = orthoxml.species(
            name=species_code, NCBITaxId=taxid_by_oscode.get(species_code))
        database_el = orthoxml.database(name="UniProt", version=database_version)
        genes_el = orthoxml.genes()
        for member in members:
            gene_el = orthoxml.gene(protId=member.gene_id, id=member.orthoxml_id)
            genes_el.add_gene(gene_el)
        database_el.set_genes(genes_el)
        species_el.add_database(database_el)
        document.add_species(species_el)

    # Groups section: the container is attached first, then filled.
    group_container = all_groups()
    document.set_groups(group_container)
    for group in self:
        if isinstance(group, ParalogGroup):
            attach = group_container.add_paralogGroup
        else:
            attach = group_container.add_orthologGroup
        attach(group.to_orthoxml_obj())

    # Export into an in-memory buffer so the XML can be post-processed.
    buffer = io.StringIO()
    document.export(
        buffer,
        level=0,
        namespace_="",
        namespacedef_="xmlns=\"http://orthoXML.org/2011/\"",
    )
    return sanitize_xml_str(buffer.getvalue())
def export_as_orthoXML(t, database, evoltype_attr="evoltype"):
    """Print the speciation/duplication events of an ETE PhyloTree as OrthoXML.

    Every leaf is declared as a gene (``protId`` is the leaf name, ``id`` is
    its index in ``t.iter_leaves()``) under the species returned by
    ``extract_spname(leaf.name)``. The tree is then cut at its top-most
    speciation nodes; each of them (or a bare leaf) becomes one top-level
    ortholog group whose nested groups mirror the subtree below it.

    Note that this function can be enriched or customized to include more
    orthoXML features using the orthoxml.py module.

    Args:
        t: tree to export.
        database: name given to the database element of every species.
        evoltype_attr: node attribute holding the event type; ``"S"`` is read
            as speciation and ``"D"`` as duplication.

    Raises:
        AttributeError: an internal child node lacks ``evoltype_attr``.
    """
    document = orthoxml.orthoXML()

    # --- sequence information: species -> database -> genes -----------------
    gene_id_of = {}
    genes_by_species = {}
    for gene_id, leaf in enumerate(t.iter_leaves()):
        species_name = extract_spname(leaf.name)
        if species_name in genes_by_species:
            species_genes = genes_by_species[species_name]
        else:
            # First gene of this species: create and register its containers.
            species_el = orthoxml.species(species_name)
            database_el = orthoxml.database(name=database)
            species_genes = orthoxml.genes()
            species_el.add_database(database_el)
            database_el.set_genes(species_genes)
            genes_by_species[species_name] = species_genes
            document.add_species(species_el)

        gene_el = orthoxml.gene(protId=leaf.name, id=gene_id)
        gene_id_of[leaf] = gene_id
        species_genes.add_gene(gene_el)

    # --- group container ----------------------------------------------------
    group_container = orthoxml.groups()
    document.set_groups(group_container)

    # OrthoXML cannot hold a duplication at the root, so the tree is split at
    # its top-most speciation nodes and each one is exported as its own group.
    def starts_group(candidate):
        return getattr(candidate, evoltype_attr, "") == "S" or not candidate.children

    for group_root in t.iter_leaves(is_leaf_fn=starts_group):
        root_group = orthoxml.group()
        event_of = {group_root: root_group}
        group_container.add_orthologGroup(root_group)

        # A bare leaf is exported as an orphan sequence inside its own group.
        if group_root.is_leaf():
            root_group.add_geneRef(orthoxml.geneRef(gene_id_of[group_root]))

        # Preorder guarantees a node's group exists before its children are seen.
        for node in group_root.traverse("preorder"):
            if node.is_leaf():
                continue
            enclosing = event_of[node]
            for child in node.children:
                if child.is_leaf():
                    enclosing.add_geneRef(orthoxml.geneRef(gene_id_of[child]))
                    continue

                child_group = orthoxml.group()
                event_of[child] = child_group
                try:
                    event_kind = getattr(child, evoltype_attr)
                except AttributeError:
                    if not child.is_leaf():
                        print(child.features)
                        raise AttributeError(
                            "\n\nUnknown evolutionary event. Please use [S] for speciation and [D] for duplication: %s"
                            % child.get_ascii())

                # Any other event type leaves the group unattached.
                if event_kind == "D":
                    enclosing.add_paralogGroup(child_group)
                elif event_kind == "S":
                    enclosing.add_orthologGroup(child_group)

    document.export(sys.stdout, 0, namespace_="")
def parse(xml_path: str):
    """Load one orthoXML file, or every entry of a directory, into one GroupCollection.

    Each file is read line by line through ``sanitize_xml_str`` and parsed
    with a recovering lxml parser. A root element that is not ``orthoXML`` is
    wrapped in a synthetic ``orthoXML`` element (assumed to be divideHTtrees
    output). Genes are collected from ``species`` children and groups from
    ``groups`` / ``orthologGroup`` / ``paralogGroup`` children. Groups whose
    ``len()`` is below 2 are removed. The genes of every file then receive
    fresh consecutive ``orthoxml_id`` values (as strings, starting at "1"
    and continuing across files) before the file is merged into the result.

    Files that cannot be parsed into any root element are skipped.

    Args:
        xml_path: path of a single file, or of a directory whose entries are
            all treated as input files.

    Returns:
        The merged ``GroupCollection`` (backed by one shared ``GeneCollection``).
    """
    if os.path.isdir(xml_path):
        # os.listdir() order is arbitrary; sort it so the re-minted gene ids
        # below are reproducible from run to run.
        xml_files = [
            os.path.join(xml_path, xf_basename)
            for xf_basename in sorted(os.listdir(xml_path))
        ]
    else:
        # Maybe don't allow single files cuz what's the point?
        xml_files = [xml_path]

    # Parse into Genes+Groups DS
    gene_id_index = 1
    all_genes = GeneCollection()
    all_groups = GroupCollection(genes=all_genes)
    for xf in xml_files:
        # Gotta fix ete3.orthoxml's bytes-encoding quirk (I think it's a python2 thing):
        with open(xf) as xml_f:
            xml_string = "".join(sanitize_xml_str(line) for line in xml_f)

        file_genes = GeneCollection()
        file_groups = GroupCollection(genes=file_genes)
        try:
            tree = etree.fromstring(xml_string, parser=etree.XMLParser(recover=True))
        except etree.XMLSyntaxError:
            # Some input files generated by divideHTtrees can be empty
            # TODO: Log out exception and xf (filename)
            continue
        if tree is None:
            # With recover=True the parser may give up without raising and
            # hand back no root element; treat that like an empty file.
            continue

        if tree.tag != "orthoXML":
            # This is not standard orthoXML, assume it's output from divideHTtrees
            orthoxml_root = etree.Element("orthoXML")
            orthoxml_root.append(tree)
            tree = orthoxml_root

        children = list(tree)
        # Genes first: group parsing is given a collection that already holds them.
        for node in children:
            if node.tag == "species":
                file_genes.add_genes_from_species_element(node)
        for node in children:
            if node.tag == "groups":
                file_groups.add_groups_from_groups_element(node)
            elif node.tag in ["orthologGroup", "paralogGroup"]:
                # This is coming from divideHTtrees output orthoXML
                file_groups.add_group(
                    file_groups.group_from_group_element(node))

        # Extra filter to remove singleton groups produced by etree2orthoxml.py
        file_groups.remove_groups([g for g in file_groups if len(g) < 2])

        # Remint orthoxml_ids to avoid collisions across files
        for orthoxml_id in sorted(file_genes.genes.keys(), key=int):
            gene = file_genes.genes[orthoxml_id]
            gene.orthoxml_id = str(gene_id_index)
            gene_id_index += 1
            all_genes.add_gene(gene)
        all_groups.merge_collection(file_groups)

    return all_groups
def exportToOrthoXML(t, database='customdb', handle=sys.stdout):
    """Write the speciation and duplication events of a TreeClass tree as OrthoXML.

    Every leaf is declared as a gene (``protId`` is the leaf name, ``id`` is
    its index in ``t.iter_leaves()``) under the species ``leaf.species``.
    The tree is then cut at its top-most speciation nodes; each of them (or a
    bare leaf) becomes one top-level ortholog group. Below such a root,
    children whose ``type`` equals ``TreeClass.SPEC`` are nested as ortholog
    groups and children with ``type > 0`` as paralog groups (presumably the
    duplication codes of TreeClass -- confirm against its definition).

    Args:
        t: tree to export.
        database: name given to the database element of every species.
        handle: writable text stream receiving the XML. The default is the
            ``sys.stdout`` object that existed when this function was defined.

    Raises:
        AttributeError: an internal child node has neither a ``type`` nor a
            ``dup`` feature, or its ``type`` is neither ``TreeClass.SPEC``
            nor positive (a loss).
    """
    # Creates an empty orthoXML object
    O = orthoxml.orthoXML()

    # Generate the structure containing sequence information
    leaf2id = {}
    sp2genes = {}
    for genid, leaf in enumerate(t.iter_leaves()):
        spname = leaf.species
        if spname not in sp2genes:
            sp = orthoxml.species(spname)
            db = orthoxml.database(name=database)
            genes = orthoxml.genes()
            sp.add_database(db)
            db.set_genes(genes)
            sp2genes[spname] = genes
            # add info to the orthoXML document
            O.add_species(sp)
        else:
            genes = sp2genes[spname]
        gn = orthoxml.gene(protId=leaf.name, id=genid)
        leaf2id[leaf] = genid
        genes.add_gene(gn)

    # Add an ortho group container to the orthoXML document
    ortho_groups = orthoxml.groups()
    O.set_groups(ortho_groups)

    # OrthoXML does not support duplication events to be at the root
    # of the tree, so we search for the top most speciation events in
    # the tree and export them as separate ortholog groups
    def is_speciation(n):
        # Node types are compared against TreeClass.SPEC further down, so
        # recognise that constant here as well as the legacy "S" marker.
        return getattr(n, 'type', "") in (TreeClass.SPEC, "S") or not n.children

    for speciation_root in t.iter_leaves(is_leaf_fn=is_speciation):
        # Creates an orthogroup in which all events will be added
        root_event = orthoxml.group()
        node2event = {speciation_root: root_event}
        ortho_groups.add_orthologGroup(root_event)

        # if root node is a leaf, just export an orphan sequence within the
        # group
        if speciation_root.is_leaf():
            root_event.add_geneRef(orthoxml.geneRef(leaf2id[speciation_root]))

        # otherwise, descend the tree and export orthology structure
        for node in speciation_root.traverse("preorder"):
            if node.is_leaf():
                continue
            parent_event = node2event[node]
            for ch in node.children:
                if ch.is_leaf():
                    parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch]))
                    continue

                node2event[ch] = orthoxml.group()
                if not (ch.has_feature('type') or ch.has_feature('dup')):
                    raise AttributeError(
                        "\n\nUnknown evolutionary event. %s" % ch.get_ascii())

                # NOTE(review): a node carrying only 'dup' still needs a
                # readable `type` attribute below -- confirm TreeClass
                # always provides one.
                if ch.type == TreeClass.SPEC:
                    # Speciation -> orthologGroup (orthoXML semantics).
                    parent_event.add_orthologGroup(node2event[ch])
                elif ch.type > 0:
                    # Remaining positive types -> paralogGroup.
                    parent_event.add_paralogGroup(node2event[ch])
                else:
                    raise AttributeError(
                        "\n\nInternals nodes labeled by losses are not expected in the orthoXML format"
                    )

    O.export(handle, 0, namespace_="")
def exportToOrthoXML(t, database='customdb', handle=sys.stdout):
    """Write the speciation and duplication events of a TreeClass tree as OrthoXML.

    Every leaf is declared as a gene (``protId`` is the leaf name, ``id`` is
    its index in ``t.iter_leaves()``) under the species ``leaf.species``.
    The tree is then cut at its top-most speciation nodes; each of them (or a
    bare leaf) becomes one top-level ortholog group. Below such a root,
    children whose ``type`` equals ``TreeClass.SPEC`` are nested as ortholog
    groups and children with ``type > 0`` as paralog groups (presumably the
    duplication codes of TreeClass -- confirm against its definition).

    Args:
        t: tree to export.
        database: name given to the database element of every species.
        handle: writable text stream receiving the XML. The default is the
            ``sys.stdout`` object that existed when this function was defined.

    Raises:
        AttributeError: an internal child node has neither a ``type`` nor a
            ``dup`` feature, or its ``type`` is neither ``TreeClass.SPEC``
            nor positive (a loss).
    """
    # Creates an empty orthoXML object
    O = orthoxml.orthoXML()

    # Generate the structure containing sequence information
    leaf2id = {}
    sp2genes = {}
    for genid, leaf in enumerate(t.iter_leaves()):
        spname = leaf.species
        if spname not in sp2genes:
            sp = orthoxml.species(spname)
            db = orthoxml.database(name=database)
            genes = orthoxml.genes()
            sp.add_database(db)
            db.set_genes(genes)
            sp2genes[spname] = genes
            # add info to the orthoXML document
            O.add_species(sp)
        else:
            genes = sp2genes[spname]
        gn = orthoxml.gene(protId=leaf.name, id=genid)
        leaf2id[leaf] = genid
        genes.add_gene(gn)

    # Add an ortho group container to the orthoXML document
    ortho_groups = orthoxml.groups()
    O.set_groups(ortho_groups)

    # OrthoXML does not support duplication events at the root
    # of the tree, so we search for the top most speciation events in
    # the tree and export them as separate ortholog groups
    def is_group_root(n):
        # Node types are compared against TreeClass.SPEC further down, so
        # recognise that constant here as well as the legacy "S" marker.
        return getattr(n, 'type', "") in (TreeClass.SPEC, "S") or not n.children

    for speciation_root in t.iter_leaves(is_leaf_fn=is_group_root):
        # Creates an orthogroup in which all events will be added
        root_event = orthoxml.group()
        node2event = {speciation_root: root_event}
        ortho_groups.add_orthologGroup(root_event)

        # if root node is a leaf, just export an orphan sequence within the
        # group
        if speciation_root.is_leaf():
            root_event.add_geneRef(orthoxml.geneRef(leaf2id[speciation_root]))

        # otherwise, descend the tree and export orthology structure
        for node in speciation_root.traverse("preorder"):
            if node.is_leaf():
                continue
            parent_event = node2event[node]
            for ch in node.children:
                if ch.is_leaf():
                    parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch]))
                    continue

                node2event[ch] = orthoxml.group()
                if not (ch.has_feature('type') or ch.has_feature('dup')):
                    raise AttributeError(
                        "\n\nUnknown evolutionary event. %s" % ch.get_ascii())

                # NOTE(review): a node carrying only 'dup' still needs a
                # readable `type` attribute below -- confirm TreeClass
                # always provides one.
                if ch.type == TreeClass.SPEC:
                    # Speciation -> orthologGroup (orthoXML semantics).
                    parent_event.add_orthologGroup(node2event[ch])
                elif ch.type > 0:
                    # Remaining positive types -> paralogGroup.
                    parent_event.add_paralogGroup(node2event[ch])
                else:
                    raise AttributeError(
                        "\n\nInternals nodes labeled by losses are not expected in the orthoXML format")

    O.export(handle, 0, namespace_="")