Пример #1
0
    def to_orthoxml_str(self,
                        pthr_version: str,
                        database_version: str,
                        organism_dat: str = None):
        """Serialize this collection to an OrthoXML document string.

        Builds an ``orthoxml.orthoXML`` document with origin "PANTHER", adds
        one species element per entry of ``self.genes.species`` (its genes
        placed under a "UniProt" database element), adds every group obtained
        by iterating ``self``, exports the document into an in-memory buffer
        and returns that text after passing it through ``sanitize_xml_str``.

        Args:
            pthr_version: Stored as the document's origin version.
            database_version: Version set on each species' database element.
            organism_dat: Optional value handed to
                ``OrganismDatFile.parse_organism_dat`` to obtain the lookup
                used for each species' ``NCBITaxId``. When falsy, the lookup
                stays empty and every species gets ``NCBITaxId=None``.

        Returns:
            The sanitized OrthoXML text.
        """
        # Lookup keyed by the same codes used as keys of self.genes.species.
        taxid_by_oscode = (
            OrganismDatFile.parse_organism_dat(organism_dat)
            if organism_dat else {}
        )

        # Document header
        document = orthoxml.orthoXML()
        document.set_version(0.3)
        document.set_origin("PANTHER")
        document.set_originVersion(pthr_version)

        # Species section: species -> database -> genes -> gene
        for oscode, species_genes in self.genes.species.items():
            species_el = orthoxml.species(
                name=oscode, NCBITaxId=taxid_by_oscode.get(oscode))
            database_el = orthoxml.database(
                name="UniProt", version=database_version)
            genes_el = orthoxml.genes()
            for member in species_genes:
                gene_el = orthoxml.gene(
                    protId=member.gene_id, id=member.orthoxml_id)
                genes_el.add_gene(gene_el)
            database_el.set_genes(genes_el)
            species_el.add_database(database_el)
            document.add_species(species_el)

        # Groups section: paralog groups and ortholog groups go to
        # different adders on the same container.
        groups_el = all_groups()
        document.set_groups(groups_el)
        for group in self:
            attach = (groups_el.add_paralogGroup
                      if isinstance(group, ParalogGroup)
                      else groups_el.add_orthologGroup)
            attach(group.to_orthoxml_obj())

        # Export into memory (nothing is printed) and sanitize the text.
        buffer = io.StringIO()
        document.export(
            buffer,
            level=0,
            namespace_="",
            namespacedef_="xmlns=\"http://orthoXML.org/2011/\"",
        )
        return sanitize_xml_str(buffer.getvalue())
Пример #2
0
def export_as_orthoXML(t, database, evoltype_attr="evoltype"):
    """Write the speciation and duplication events of a tree as OrthoXML.

    Takes an ETE PhyloTree instance and prints an OrthoXML document to
    ``sys.stdout``. Every leaf becomes a gene (its position in
    ``t.iter_leaves()`` is used as the gene id) registered under the species
    returned by ``extract_spname(leaf.name)``.

    Note that this function can be enriched or customized to include
    more orthoXML features using the orthoxml.py module.

    Args:
        t: Tree to export.
        database: Name given to every species' database element.
        evoltype_attr: Name of the node attribute holding the event type;
            "S" marks a speciation and "D" a duplication.

    Raises:
        AttributeError: If an internal node below an exported root lacks
            ``evoltype_attr`` or carries a value other than "S" or "D".
            Previously a node with an unrecognized value was silently left
            out of the document together with its whole subtree.
    """
    # Creates an empty orthoXML object
    O = orthoxml.orthoXML()

    # Generate the structure containing sequence information
    leaf2id = {}
    sp2genes = {}
    for genid, leaf in enumerate(t.iter_leaves()):
        spname = extract_spname(leaf.name)
        if spname not in sp2genes:
            sp = orthoxml.species(spname)
            db = orthoxml.database(name=database)
            genes = orthoxml.genes()
            sp.add_database(db)
            db.set_genes(genes)
            sp2genes[spname] = genes
            # add info to the orthoXML document
            O.add_species(sp)
        else:
            genes = sp2genes[spname]

        genes.add_gene(orthoxml.gene(protId=leaf.name, id=genid))
        leaf2id[leaf] = genid

    # Add an ortho group container to the orthoXML document
    ortho_groups = orthoxml.groups()
    O.set_groups(ortho_groups)

    # OrthoXML does not support duplication events to be at the root
    # of the tree, so we search for the top most speciation events in
    # the tree and export them as separate ortholog groups
    def is_speciation(n):
        return getattr(n, evoltype_attr, "") == "S" or not n.children

    for speciation_root in t.iter_leaves(is_leaf_fn=is_speciation):
        # Creates an orthogroup in which all events will be added
        root_event = orthoxml.group()
        node2event = {speciation_root: root_event}
        ortho_groups.add_orthologGroup(root_event)

        # if root node is a leaf, just export an orphan sequence within the group
        if speciation_root.is_leaf():
            root_event.add_geneRef(orthoxml.geneRef(leaf2id[speciation_root]))

        # otherwise, descend the tree and export orthology structure
        for node in speciation_root.traverse("preorder"):
            if node.is_leaf():
                continue
            parent_event = node2event[node]
            for ch in node.children:
                if ch.is_leaf():
                    parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch]))
                    continue

                child_event = orthoxml.group()
                node2event[ch] = child_event

                evoltype = getattr(ch, evoltype_attr, None)
                if evoltype == "D":
                    parent_event.add_paralogGroup(child_event)
                elif evoltype == "S":
                    parent_event.add_orthologGroup(child_event)
                else:
                    # Missing or unrecognized label: fail loudly instead of
                    # leaving the subtree detached from the document.
                    # Diagnostics go to stderr because the XML itself is
                    # written to stdout.
                    print(ch.features, file=sys.stderr)
                    raise AttributeError(
                        "\n\nUnknown evolutionary event. Please use [S] for speciation and [D] for duplication: %s"
                        % ch.get_ascii())
    O.export(sys.stdout, 0, namespace_="")
Пример #3
0
    def parse(xml_path: str):
        """Parse OrthoXML file(s) into a single merged group collection.

        Args:
            xml_path: Path to one XML file, or to a directory in which case
                every entry of the directory is parsed (in sorted name order).

        Returns:
            A ``GroupCollection`` holding the groups of all parsed files,
            backed by a ``GeneCollection`` whose genes were given fresh,
            consecutive ``orthoxml_id`` values (starting at "1") so that IDs
            from different files cannot collide.

        Files that cannot be parsed as XML are skipped.
        """
        if os.path.isdir(xml_path):
            # os.listdir returns entries in arbitrary order; since gene IDs are
            # re-minted in file order below, sort to keep the result
            # reproducible across runs and platforms.
            xml_files = [
                os.path.join(xml_path, xf_basename)
                for xf_basename in sorted(os.listdir(xml_path))
            ]
        else:
            # Maybe don't allow single files cuz what's the point?
            xml_files = [xml_path]

        # Parse into Genes+Groups DS
        gene_id_index = 1
        all_genes = GeneCollection()
        all_groups = GroupCollection(genes=all_genes)
        for xf in xml_files:
            # Each line is sanitized before parsing to work around
            # ete3.orthoxml's bytes-encoding quirk (possibly a python2 relic).
            with open(xf) as xml_f:
                xml_string = "".join(sanitize_xml_str(line) for line in xml_f)

            file_genes = GeneCollection()
            file_groups = GroupCollection(genes=file_genes)
            try:
                tree = etree.fromstring(xml_string,
                                        parser=etree.XMLParser(recover=True))
            except etree.XMLSyntaxError:
                # Some input files generated by divideHTtrees can be empty
                # TODO: Log out exception and xf (filename)
                continue
            if tree is None:
                # A recovering parser may yield no root element at all for
                # unusable input; treat it like an empty file.
                continue
            if tree.tag != "orthoXML":
                # This is not standard orthoXML, assume it's output from divideHTtrees
                orthoxml_root = etree.Element("orthoXML")
                orthoxml_root.append(tree)
                tree = orthoxml_root

            # Iterating an element yields its direct children (replaces the
            # deprecated getchildren()).
            children = list(tree)
            # First pass: species elements, so that all genes of the file are
            # registered before any group is built.
            for node in children:
                if node.tag == "species":
                    file_genes.add_genes_from_species_element(node)
            # Second pass: groups.
            for node in children:
                if node.tag == "groups":
                    file_groups.add_groups_from_groups_element(node)
                elif node.tag in ["orthologGroup", "paralogGroup"]:
                    # This is coming from divideHTtrees output orthoXML
                    file_groups.add_group(
                        file_groups.group_from_group_element(node))

            # Extra filter to remove singleton groups produced by etree2orthoxml.py
            groups_to_remove = [g for g in file_groups if len(g) < 2]
            file_groups.remove_groups(groups_to_remove)

            # Remint orthoxml_ids to avoid collisions across files; genes are
            # visited in numeric order of their original per-file ID.
            for orthoxml_id in sorted(file_genes.genes.keys(), key=int):
                gene = file_genes.genes[orthoxml_id]
                gene.orthoxml_id = str(gene_id_index)
                gene_id_index += 1
                all_genes.add_gene(gene)

            all_groups.merge_collection(file_groups)

        return all_groups
Пример #4
0
def exportToOrthoXML(t, database='customdb', handle=sys.stdout):
    """Export the speciation and duplication events of a tree as OrthoXML.

    Takes a TreeClass instance. Every leaf becomes a gene (its position in
    ``t.iter_leaves()`` is used as the gene id) registered under the species
    given by ``leaf.species``; the topmost speciation nodes become top-level
    ortholog groups whose nested events follow the tree structure.

    Args:
        t: Tree to export.
        database: Name given to every species' database element.
        handle: Writable object the document is exported to.

    Raises:
        AttributeError: If an internal node has neither a 'type' nor a 'dup'
            feature, or if its type is neither ``TreeClass.SPEC`` nor a
            positive value (such nodes are reported as loss-labeled).
    """
    # Creates an empty orthoXML object
    O = orthoxml.orthoXML()

    # Generate the structure containing sequence information
    leaf2id = {}
    sp2genes = {}
    for genid, leaf in enumerate(t.iter_leaves()):
        spname = leaf.species
        if spname not in sp2genes:
            sp = orthoxml.species(spname)
            db = orthoxml.database(name=database)
            genes = orthoxml.genes()
            sp.add_database(db)
            db.set_genes(genes)
            sp2genes[spname] = genes
            # add info to the orthoXML document
            O.add_species(sp)
        else:
            genes = sp2genes[spname]

        genes.add_gene(orthoxml.gene(protId=leaf.name, id=genid))
        leaf2id[leaf] = genid

    # Add an ortho group container to the orthoXML document
    ortho_groups = orthoxml.groups()
    O.set_groups(ortho_groups)

    # OrthoXML does not support duplication events to be at the root
    # of the tree, so we search for the top most speciation events in
    # the tree and export them as separate ortholog groups.
    def is_speciation(n):
        # Node types are compared against TreeClass.SPEC (and numerically)
        # further down, so recognise that constant here as well; the "S"
        # label is still accepted for backward compatibility.
        return (getattr(n, 'type', None) in ("S", TreeClass.SPEC)
                or not n.children)

    for speciation_root in t.iter_leaves(is_leaf_fn=is_speciation):
        # Creates an orthogroup in which all events will be added
        root_event = orthoxml.group()
        node2event = {speciation_root: root_event}
        ortho_groups.add_orthologGroup(root_event)

        # if root node is a leaf, just export an orphan sequence within the
        # group
        if speciation_root.is_leaf():
            root_event.add_geneRef(orthoxml.geneRef(leaf2id[speciation_root]))

        # otherwise, descend the tree and export orthology structure
        for node in speciation_root.traverse("preorder"):
            if node.is_leaf():
                continue
            parent_event = node2event[node]
            for ch in node.children:
                if ch.is_leaf():
                    parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch]))
                    continue

                child_event = orthoxml.group()
                node2event[ch] = child_event

                if not (ch.has_feature('type') or ch.has_feature('dup')):
                    raise AttributeError(
                        "\n\nUnknown evolutionary event. %s" %
                        ch.get_ascii())

                # In OrthoXML an orthologGroup stands for a speciation and a
                # paralogGroup for a duplication (the two were swapped here
                # before). Positive types are treated as duplications.
                if ch.type == TreeClass.SPEC:
                    parent_event.add_orthologGroup(child_event)
                elif ch.type > 0:
                    parent_event.add_paralogGroup(child_event)
                else:
                    raise AttributeError(
                        "\n\nInternal nodes labeled by losses are not expected in the orthoXML format"
                    )

    O.export(handle, 0, namespace_="")
Пример #5
0
def exportToOrthoXML(t, database='customdb', handle=sys.stdout):
    """Export the speciation and duplication events of a tree as OrthoXML.

    Takes a TreeClass instance. Each leaf is registered as a gene, with its
    index in ``t.iter_leaves()`` as gene id, under the species named by
    ``leaf.species``. The topmost speciation nodes are exported as top-level
    ortholog groups, and the events below them are nested accordingly.

    Args:
        t: Tree to export.
        database: Name given to every species' database element.
        handle: Writable object the document is exported to.

    Raises:
        AttributeError: If an internal node has neither a 'type' nor a 'dup'
            feature, or if its type is neither ``TreeClass.SPEC`` nor a
            positive value (such nodes are reported as loss-labeled).
    """
    # Creates an empty orthoXML object
    O = orthoxml.orthoXML()

    # Generate the structure containing sequence information
    leaf2id = {}
    sp2genes = {}
    for genid, leaf in enumerate(t.iter_leaves()):
        spname = leaf.species
        if spname not in sp2genes:
            sp = orthoxml.species(spname)
            db = orthoxml.database(name=database)
            genes = orthoxml.genes()
            sp.add_database(db)
            db.set_genes(genes)
            sp2genes[spname] = genes
            # add info to the orthoXML document
            O.add_species(sp)
        else:
            genes = sp2genes[spname]

        gn = orthoxml.gene(protId=leaf.name, id=genid)
        leaf2id[leaf] = genid
        genes.add_gene(gn)

    # Add an ortho group container to the orthoXML document
    ortho_groups = orthoxml.groups()
    O.set_groups(ortho_groups)

    # The rest of this function identifies event types through
    # TreeClass.SPEC and numeric comparison, so the root search has to
    # recognise that constant too; "S" stays accepted for compatibility.
    speciation_labels = ("S", TreeClass.SPEC)

    def stops_descent(n):
        return getattr(n, 'type', None) in speciation_labels or not n.children

    # OrthoXML does not support duplication events at the root
    # of the tree, so we search for the top most speciation events in
    # the tree and export them as separate ortholog groups
    for speciation_root in t.iter_leaves(is_leaf_fn=stops_descent):
        # Creates an orthogroup in which all events will be added
        node2event = {}
        node2event[speciation_root] = orthoxml.group()
        ortho_groups.add_orthologGroup(node2event[speciation_root])

        # if root node is a leaf, just export an orphan sequence within the
        # group
        if speciation_root.is_leaf():
            node2event[speciation_root].add_geneRef(
                orthoxml.geneRef(leaf2id[speciation_root]))

        # otherwise, descend the tree and export orthology structure
        for node in speciation_root.traverse("preorder"):
            if node.is_leaf():
                continue
            parent_event = node2event[node]
            for ch in node.children:
                if ch.is_leaf():
                    parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch]))
                else:
                    node2event[ch] = orthoxml.group()

                    if not (ch.has_feature('type') or ch.has_feature('dup')):
                        raise AttributeError(
                            "\n\nUnknown evolutionary event. %s" % ch.get_ascii())

                    # OrthoXML semantics: speciation -> orthologGroup,
                    # duplication -> paralogGroup. The original code had the
                    # two adders swapped. Positive types are treated as
                    # duplications.
                    if ch.type == TreeClass.SPEC:
                        parent_event.add_orthologGroup(node2event[ch])
                    elif ch.type > 0:
                        parent_event.add_paralogGroup(node2event[ch])
                    else:
                        raise AttributeError(
                            "\n\nInternal nodes labeled by losses are not expected in the orthoXML format")

    O.export(handle, 0, namespace_="")