Exemplo n.º 1
0
def get_evol_events_from_leaf(node, sos_thr=0.0):
    """Return the duplication and speciation events involving ``node``.

    Method: the algorithm walks from the given node up to the root of
    its tree. At every ancestor, the species collected so far below it
    are compared with the species found on the sister branches, and a
    species overlap score (shared species / all species) is computed:

    * score > ``sos_thr`` and the summed distance from the current node
      to the sister leaves is greater than 0: duplication ("D").
    * score <= ``sos_thr``: speciation ("S").
    * score > ``sos_thr`` with a summed distance of 0: no event is
      recorded for that ancestor.

    Every ancestor that yields an event is tagged through
    ``add_feature("evoltype", "D" | "S")``. Any "evoltype" feature left
    by a previous analysis is removed from all nodes of the tree first.

    The method is described in more detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.

    :param node: starting node; its ``species`` is the reference species
        and its ``name`` is stored as the seed of every event.
    :param sos_thr: species overlap score threshold (default 0.0).
    :returns: list of EvolEvent objects, in the order they are found
        while ascending from ``node`` towards the root.
    :raises TypeError: if the tree root does not have exactly two
        children.
    """
    # Get the tree's root
    root = node.get_tree_root()

    # A rooted tree must split into exactly two branches at the root
    outgroups = root.get_children()
    if len(outgroups) != 2:
        raise TypeError("Tree is not rooted")

    # Keep the root branch with fewer distinct leaf names as the tree
    # outgroup (the first branch wins ties)
    o1 = set(n.name for n in outgroups[0].get_leaves())
    o2 = set(n.name for n in outgroups[1].get_leaves())
    if len(o2) < len(o1):
        smaller_outg = outgroups[1]
    else:
        smaller_outg = outgroups[0]

    # Prepare to browse the tree from the seed node to the root
    all_events = []
    current = node
    ref_spcs = node.species
    browsed_spcs = set([current.species])
    browsed_leaves = set([current])
    # Family size: number of leaves of the tree from the reference species
    fSize = len([n for n in root.get_leaves() if n.species == ref_spcs])

    # Clean previous analysis
    for n in root.get_descendants() + [root]:
        n.del_feature("evoltype")

    while current.up:
        # Collect the sister leaves and the summed distance to them
        # (used below to skip duplications with 0.0 total distance)
        d = 0
        sister_leaves = set()
        for s in current.get_sisters():
            for leaf in s.get_leaves():
                d += current.get_distance(leaf)
                sister_leaves.add(leaf)
        # Process the sister branches only if they bring any new sequence
        # (prevents duplications caused by repeated names)
        sister_leaves -= browsed_leaves
        if not sister_leaves:
            current = current.up
            continue

        # Species at both sides of the event and their overlap score
        sister_spcs = set(n.species for n in sister_leaves)
        overlaped_spcs = browsed_spcs & sister_spcs
        all_spcs = browsed_spcs | sister_spcs
        score = float(len(overlaped_spcs)) / len(all_spcs)

        # Create a new evolutionary event
        event = EvolEvent()
        event.fam_size = fSize
        event.seed = node.name
        event.sos = score
        event.outgroup = smaller_outg.name
        event.in_seqs = set(n.name for n in browsed_leaves)
        event.out_seqs = set(n.name for n in sister_leaves)
        event.inparalogs = set(
            n.name for n in browsed_leaves if n.species == ref_spcs)

        if score > sos_thr and d > 0.0:
            # Species overlap above the threshold: duplication
            event.node = current.up
            event.etype = "D"
            event.outparalogs = set(
                n.name for n in sister_leaves if n.species == ref_spcs)
            event.orthologs = set([])
            current.up.add_feature("evoltype", "D")
            all_events.append(event)
        elif score <= sos_thr:
            # Overlap at or below the threshold: speciation. Using "<="
            # (instead of "==") keeps nodes scoring below a non-zero
            # threshold from being silently dropped.
            event.node = current.up
            event.etype = "S"
            event.orthologs = set(
                n.name for n in sister_leaves if n.species != ref_spcs)
            event.outparalogs = set([])
            current.up.add_feature("evoltype", "S")
            all_events.append(event)
        # Otherwise (overlap above the threshold but zero total
        # distance) no event is added.

        # Update what has been browsed so far and keep ascending
        browsed_spcs |= sister_spcs
        browsed_leaves |= sister_leaves
        current = current.up
    return all_events
Exemplo n.º 2
0
def get_evol_events_from_root(node, sos_thr):
    """Return **all** duplication and speciation events of a tree.

    The traversal always starts at the root of the tree ``node``
    belongs to and visits the nodes breadth-first. For every internal
    node, the species found under its two children are compared and a
    species overlap score (shared species / all species) is computed:

    * score > ``sos_thr``: duplication ("D").
    * otherwise: speciation ("S").

    Every internal node is tagged through
    ``add_feature("evoltype", "D" | "S")``. Any "evoltype" feature left
    by a previous analysis is removed from all nodes of the tree first.

    The method is described in more detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.

    :param node: any node of the tree to analyse.
    :param sos_thr: species overlap score threshold.
    :returns: list of EvolEvent objects, one per internal node, in
        breadth-first order starting at the root.
    :raises TypeError: if the root does not have exactly two children,
        or if any internal node does not have exactly two children.
    """
    # Local import: the queue is only needed here and the file-level
    # import block is left untouched.
    from collections import deque

    # Get the tree's root
    root = node.get_tree_root()

    # A rooted tree must split into exactly two branches at the root
    outgroups = root.get_children()
    if len(outgroups) != 2:
        raise TypeError("Tree is not rooted")

    # Keep the root branch with fewer distinct leaf names as the tree
    # outgroup (the first branch wins ties)
    o1 = set(n.name for n in outgroups[0].get_leaves())
    o2 = set(n.name for n in outgroups[1].get_leaves())
    if len(o2) < len(o1):
        smaller_outg = outgroups[1]
    else:
        smaller_outg = outgroups[0]

    # Family size: total number of leaves in the tree
    fSize = len(list(root.get_leaves()))

    # Clean data from previous analyses
    for n in root.get_descendants() + [root]:
        n.del_feature("evoltype")

    # Browse the tree from the root to the leaves (breadth-first)
    all_events = []
    to_visit = deque([root])
    while to_visit:
        current = to_visit.popleft()
        childs = current.get_children()
        if len(childs) == 0:
            continue  # leaf: nothing to classify
        if len(childs) != 2:
            # Also covers single-child nodes, which would otherwise
            # fail with an IndexError further down.
            raise TypeError("nodes are expected to have two childs.")
        to_visit.extend(childs)

        # Leaves and species at both sides of the event
        sideA_leaves = set(childs[0].get_leaves())
        sideB_leaves = set(childs[1].get_leaves())
        sideA_spcs = set(n.species for n in sideA_leaves)
        sideB_spcs = set(n.species for n in sideB_leaves)
        # Species overlap score
        overlaped_spcs = sideA_spcs & sideB_spcs
        all_spcs = sideA_spcs | sideB_spcs
        score = float(len(overlaped_spcs)) / len(all_spcs)

        # Create a new evolutionary event
        event = EvolEvent()
        event.fam_size = fSize
        event.branch_supports = [
            current.support, childs[0].support, childs[1].support]
        event.sos = score
        event.outgroup_spcs = smaller_outg.get_species()
        event.in_seqs = set(n.name for n in sideA_leaves)
        event.out_seqs = set(n.name for n in sideB_leaves)
        event.inparalogs = set(n.name for n in sideA_leaves)
        event.node = current
        if score > sos_thr:
            # Species overlap above the threshold: duplication
            event.etype = "D"
            event.outparalogs = set(n.name for n in sideB_leaves)
            event.orthologs = set([])
        else:
            # No (or not enough) species overlap: speciation
            event.etype = "S"
            event.orthologs = set(n.name for n in sideB_leaves)
            event.outparalogs = set([])
        current.add_feature("evoltype", event.etype)
        all_events.append(event)
    return all_events
Exemplo n.º 3
0
def get_evol_events_from_leaf(node, sos_thr=0.0):
    """Return the duplication and speciation events involving ``node``.

    Method: the algorithm walks from the given node up to the root of
    its tree. At every ancestor, the species collected so far below it
    are compared with the species found on the sister branches, and a
    species overlap score (shared species / all species) is computed:

    * score > ``sos_thr``: duplication ("D").
    * score <= ``sos_thr``: speciation ("S").

    Branch lengths are not taken into account. Every ancestor that
    yields an event is tagged through
    ``add_feature("evoltype", "D" | "S")``. Any "evoltype" feature left
    by a previous analysis is removed from all nodes of the tree first.

    The method is described in more detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.

    :param node: starting node; its ``species`` is the reference species
        and its ``name`` is stored as the seed of every event.
    :param sos_thr: species overlap score threshold (default 0.0).
    :returns: list of EvolEvent objects, in the order they are found
        while ascending from ``node`` towards the root.
    :raises TypeError: if the tree root does not have exactly two
        children.
    """
    # Get the tree's root
    root = node.get_tree_root()

    # A rooted tree must split into exactly two branches at the root
    outgroups = root.get_children()
    if len(outgroups) != 2:
        raise TypeError("Tree is not rooted")

    # Keep the root branch with fewer distinct leaf names as the tree
    # outgroup (the first branch wins ties)
    o1 = set(n.name for n in outgroups[0].get_leaves())
    o2 = set(n.name for n in outgroups[1].get_leaves())
    if len(o2) < len(o1):
        smaller_outg = outgroups[1]
    else:
        smaller_outg = outgroups[0]

    # Prepare to browse the tree from the seed node to the root
    all_events = []
    current = node
    ref_spcs = node.species
    browsed_spcs = set([current.species])
    browsed_leaves = set([current])
    # Family size: number of leaves of the tree from the reference species
    fSize = len([n for n in root.get_leaves() if n.species == ref_spcs])

    # Clean previous analysis
    for n in root.get_descendants() + [root]:
        n.del_feature("evoltype")

    while current.up:
        # Gather the leaves hanging from the sister branches. The
        # per-leaf distance sum that used to be computed here is gone:
        # the branch-length control that consumed it is disabled.
        sister_leaves = set()
        for s in current.get_sisters():
            sister_leaves.update(s.get_leaves())
        # Process the sister branches only if they bring any new sequence
        # (prevents duplications caused by repeated names)
        sister_leaves -= browsed_leaves
        if not sister_leaves:
            current = current.up
            continue

        # Species at both sides of the event and their overlap score
        sister_spcs = set(n.species for n in sister_leaves)
        overlaped_spcs = browsed_spcs & sister_spcs
        all_spcs = browsed_spcs | sister_spcs
        score = float(len(overlaped_spcs)) / len(all_spcs)

        # Create a new evolutionary event
        event = EvolEvent()
        event.fam_size = fSize
        event.seed = node.name
        event.sos = score
        event.outgroup = smaller_outg.name
        event.in_seqs = set(n.name for n in browsed_leaves)
        event.out_seqs = set(n.name for n in sister_leaves)
        event.inparalogs = set(
            n.name for n in browsed_leaves if n.species == ref_spcs)
        event.node = current.up

        if score > sos_thr:
            # Species overlap above the threshold: duplication
            event.etype = "D"
            event.outparalogs = set(
                n.name for n in sister_leaves if n.species == ref_spcs)
            event.orthologs = set([])
        else:
            # Overlap at or below the threshold: speciation. The former
            # "score == sos_thr" test silently dropped nodes scoring
            # below a non-zero threshold.
            event.etype = "S"
            event.orthologs = set(
                n.name for n in sister_leaves if n.species != ref_spcs)
            event.outparalogs = set([])
        current.up.add_feature("evoltype", event.etype)
        all_events.append(event)

        # Update what has been browsed so far and keep ascending
        browsed_spcs |= sister_spcs
        browsed_leaves |= sister_leaves
        current = current.up
    return all_events
Exemplo n.º 4
0
def get_evol_events_from_root(node, sos_thr):
    """Return **all** duplication and speciation events of a tree.

    The traversal always starts at the root of the tree ``node``
    belongs to and visits the nodes breadth-first. For every internal
    node, the species found under its two children are compared and a
    species overlap score (shared species / all species) is computed:

    * score > ``sos_thr``: duplication ("D").
    * otherwise: speciation ("S").

    Every internal node is tagged through
    ``add_feature("evoltype", "D" | "S")``. Any "evoltype" feature left
    by a previous analysis is removed from all nodes of the tree first.

    The method is described in more detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.

    :param node: any node of the tree to analyse.
    :param sos_thr: species overlap score threshold.
    :returns: list of EvolEvent objects, one per internal node, in
        breadth-first order starting at the root.
    :raises TypeError: if the root does not have exactly two children,
        or if any internal node does not have exactly two children.
    """
    # Local import: the queue is only needed here and the file-level
    # import block is left untouched.
    from collections import deque

    # Get the tree's root
    root = node.get_tree_root()

    # A rooted tree must split into exactly two branches at the root
    outgroups = root.get_children()
    if len(outgroups) != 2:
        raise TypeError("Tree is not rooted")

    # Keep the root branch with fewer distinct leaf names as the tree
    # outgroup (the first branch wins ties)
    o1 = set(n.name for n in outgroups[0].get_leaves())
    o2 = set(n.name for n in outgroups[1].get_leaves())
    if len(o2) < len(o1):
        smaller_outg = outgroups[1]
    else:
        smaller_outg = outgroups[0]

    # Family size: total number of leaves in the tree
    fSize = len(list(root.get_leaves()))

    # Clean data from previous analyses
    for n in root.get_descendants() + [root]:
        n.del_feature("evoltype")

    # Browse the tree from the root to the leaves (breadth-first);
    # a deque gives O(1) pops from the front of the queue.
    all_events = []
    to_visit = deque([root])
    while to_visit:
        current = to_visit.popleft()
        childs = current.get_children()
        if len(childs) == 0:
            continue  # leaf: nothing to classify
        if len(childs) != 2:
            # Also covers single-child nodes, which would otherwise
            # fail with an IndexError further down.
            raise TypeError("nodes are expected to have two childs.")
        to_visit.extend(childs)

        # Leaves and species at both sides of the event
        sideA_leaves = set(childs[0].get_leaves())
        sideB_leaves = set(childs[1].get_leaves())
        sideA_spcs = set(n.species for n in sideA_leaves)
        sideB_spcs = set(n.species for n in sideB_leaves)
        # Species overlap score
        overlaped_spcs = sideA_spcs & sideB_spcs
        all_spcs = sideA_spcs | sideB_spcs
        score = float(len(overlaped_spcs)) / len(all_spcs)

        # Create a new evolutionary event
        event = EvolEvent()
        event.fam_size = fSize
        event.branch_supports = [
            current.support, childs[0].support, childs[1].support]
        event.sos = score
        event.outgroup_spcs = smaller_outg.get_species()
        event.in_seqs = set(n.name for n in sideA_leaves)
        event.out_seqs = set(n.name for n in sideB_leaves)
        event.inparalogs = set(n.name for n in sideA_leaves)
        event.node = current
        if score > sos_thr:
            # Species overlap above the threshold: duplication
            event.etype = "D"
            event.outparalogs = set(n.name for n in sideB_leaves)
            event.orthologs = set([])
        else:
            # No (or not enough) species overlap: speciation
            event.etype = "S"
            event.orthologs = set(n.name for n in sideB_leaves)
            event.outparalogs = set([])
        current.add_feature("evoltype", event.etype)
        all_events.append(event)
    return all_events
Exemplo n.º 5
0
def get_reconciled_tree(node, sptree, events):
    """ Returns the gene tree rooted at ``node`` reconciled with the
    provided species topology, together with the ``events`` list.

    The gene tree is visited recursively in post-order. Leaves are
    returned as deep copies. For a node with two children, the species
    sets of the two reconciled children are compared: any shared
    species makes the node a duplication ("D"), otherwise it is a
    speciation ("S"). In both cases ``node`` is tagged with an
    "evoltype" feature and an EvolEvent is appended to ``events``.

    Raises ValueError when a node has a number of children other than
    zero or two. """
    child_count = len(node.children)
    if child_count == 0:
        # Leaf: hand back an independent copy
        return copy.deepcopy(node), events
    if child_count != 2:
        raise ValueError("Algorithm can only work with binary trees.")

    # Children go first (post-order); their reconciled topology is
    # trusted as it is.
    reconciled = []
    for child in node.children:
        reconciled.append(get_reconciled_tree(child, sptree, events)[0])
    left, right = reconciled

    left_species = left.get_species()
    right_species = right.get_species()
    species_union = right_species | left_species

    if len(left_species & right_species) == 0:
        # No shared species: speciation. Both sides are placed, one
        # after the other, on the topology expected for the observed
        # species.
        merged = _get_expected_topology(sptree, species_union)
        merged, _ = _replace_on_template(merged, left)
        merged, _ = _replace_on_template(merged, right)
        merged.add_feature("evoltype", "S")
        node.add_feature("evoltype", "S")

        speciation = EvolEvent()
        speciation.etype = "S"
        speciation.inparalogs = node.children[0].get_leaf_names()
        speciation.orthologs = node.children[1].get_leaf_names()
        speciation.in_seqs = node.children[0].get_leaf_names()
        speciation.out_seqs = node.children[1].get_leaf_names()
        events.append(speciation)
        return merged, events

    # Shared species: duplication. Each side is placed separately on the
    # expected topology and both results hang from a detached, childless
    # copy of the node.
    dup_node = copy.deepcopy(node)
    dup_node.up = None
    dup_node.children = []
    expected = _get_expected_topology(sptree, species_union)
    grafted_left, _ = _replace_on_template(expected, left)
    grafted_right, _ = _replace_on_template(expected, right)
    dup_node.add_child(grafted_left)
    dup_node.add_child(grafted_right)
    dup_node.add_feature("evoltype", "D")
    node.add_feature("evoltype", "D")

    duplication = EvolEvent()
    duplication.etype = "D"
    duplication.inparalogs = node.children[0].get_leaf_names()
    duplication.outparalogs = node.children[1].get_leaf_names()
    duplication.in_seqs = node.children[0].get_leaf_names()
    duplication.out_seqs = node.children[1].get_leaf_names()
    events.append(duplication)
    return dup_node, events
Exemplo n.º 6
0
def get_reconciled_tree(node, sptree, events):
    """ Returns the gene tree rooted at ``node`` reconciled with the
    provided species topology, together with the ``events`` list.

    Nodes are processed recursively in post-order. A leaf comes back as
    a deep copy. A node with two children is classified by comparing
    the species of its two reconciled children: a duplication ("D")
    when they share at least one species, a speciation ("S") otherwise.
    The node is tagged with an "evoltype" feature and one EvolEvent is
    appended to ``events``.

    Raises ValueError for nodes whose number of children is neither
    zero nor two. """
    arity = len(node.children)
    if arity not in (0, 2):
        raise ValueError("Algorithm can only work with binary trees.")
    if arity == 0:
        return copy.deepcopy(node), events

    # Reconcile both children first (recursive post-order visit)
    morphed = [get_reconciled_tree(ch, sptree, events)[0]
               for ch in node.children]

    first_spcs = morphed[0].get_species()
    second_spcs = morphed[1].get_species()
    observed_spcs = second_spcs | first_spcs
    # Species present at both sides mean a duplication
    is_duplication = len(first_spcs & second_spcs) > 0

    if is_duplication:
        etype = "D"
        # Detached, childless copy of the node that will hold one
        # template-based partition per child.
        result = copy.deepcopy(node)
        result.up = None
        result.children = []
        template = _get_expected_topology(sptree, observed_spcs)
        part0, _ = _replace_on_template(template, morphed[0])
        part1, _ = _replace_on_template(template, morphed[1])
        result.add_child(part0)
        result.add_child(part1)
    else:
        etype = "S"
        # Both sides are merged, in order, into the topology expected
        # for the observed species.
        result = _get_expected_topology(sptree, observed_spcs)
        for part in morphed:
            result, _ = _replace_on_template(result, part)

    result.add_feature("evoltype", etype)
    node.add_feature("evoltype", etype)

    # Record the event using the leaf names of the original children
    e = EvolEvent()
    e.etype = etype
    e.inparalogs = node.children[0].get_leaf_names()
    if is_duplication:
        e.outparalogs = node.children[1].get_leaf_names()
    else:
        e.orthologs = node.children[1].get_leaf_names()
    e.in_seqs = node.children[0].get_leaf_names()
    e.out_seqs = node.children[1].get_leaf_names()
    events.append(e)
    return result, events