예제 #1
0
def get_evol_events_from_leaf(node, sos_thr=0.0):
    """ Returns a list of duplication and speciation events in
    which the current node has been involved. Scanned nodes are
    also labeled internally as dup=True|False. You can access this
    labels using the 'node.dup' sintaxis.

    Method: the algorithm scans all nodes from the given leafName to
    the root. Nodes are assumed to be duplications when a species
    overlap is found between its child linages. Method is described
    more detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.
    """
    # Get the tree's root
    root = node.get_tree_root()

    # Checks that is actually rooted
    outgroups = root.get_children()
    if len(outgroups) != 2:
        raise TypeError, "Tree is not rooted"

    # Cautch the smaller outgroup (will be stored as the tree
    # outgroup)
    o1 = set([n.name for n in outgroups[0].get_leaves()])
    o2 = set([n.name for n in outgroups[1].get_leaves()])

    if len(o2)<len(o1):
        smaller_outg = outgroups[1]
    else:
        smaller_outg = outgroups[0]


    # Prepare to browse tree from leaf to root
    all_events = []
    current  = node
    ref_spcs = node.species
    sister_leaves  = set([])
    browsed_spcs   = set([current.species])
    browsed_leaves = set([current])
    # get family Size
    fSize =  len([n for n in root.get_leaves() if n.species == ref_spcs])

    # Clean previous analysis
    for n in root.get_descendants()+[root]:
        n.del_feature("evoltype")

    while current.up:
        # distances control (0.0 distance check)
        d = 0
        for s in current.get_sisters():
            for leaf in s.get_leaves():
                d += current.get_distance(leaf)
                sister_leaves.add(leaf)
        # Process sister node only if there is any new sequence.
        # (previene dupliaciones por nombres repetidos)
        sister_leaves = sister_leaves.difference(browsed_leaves)
        if len(sister_leaves)==0:
            current = current.up
            continue
        # Gets species at both sides of event
        sister_spcs     = set([n.species for n in sister_leaves])
        overlaped_spces = browsed_spcs & sister_spcs
        all_spcs        = browsed_spcs | sister_spcs
        score = float(len(overlaped_spces))/len(all_spcs)
        # Creates a new evolEvent
        event = EvolEvent()
        event.fam_size   = fSize
        event.seed      = node.name
        # event.e_newick  = current.up.get_newick()  # high mem usage!!
        event.sos = score
        event.outgroup  = smaller_outg.name
        # event.allseqs   = set(current.up.get_leaf_names())
        event.in_seqs = set([n.name for n in browsed_leaves])
        event.out_seqs = set([n.name for n in sister_leaves])
        event.inparalogs  = set([n.name for n in browsed_leaves if n.species == ref_spcs])

        # If species overlap: duplication
        if score >sos_thr and d > 0.0:
            event.node = current.up
            event.etype = "D"
            event.outparalogs = set([n.name for n in sister_leaves  if n.species == ref_spcs])
            event.orthologs   = set([])
            current.up.add_feature("evoltype","D")
            all_events.append(event)

        # If NO species overlap: speciation
        elif score == sos_thr:
            event.node = current.up
            event.etype = "S"
            event.orthologs = set([n.name for n in sister_leaves if n.species != ref_spcs])
            event.outparalogs = set([])
            current.up.add_feature("evoltype","S")
            all_events.append(event)
        else:
            pass # do not add event if distances == 0

        # Updates browsed species
        browsed_spcs   |= sister_spcs
        browsed_leaves |= sister_leaves
        sister_leaves  = set([])
        # And keep ascending
        current = current.up
    return all_events
예제 #2
0
파일: spoverlap.py 프로젝트: tarah28/ete
def get_evol_events_from_leaf(node, sos_thr=0.0):
    """ Returns a list of duplication and speciation events in
    which the current node has been involved. Scanned nodes are
    also labeled internally as dup=True|False. You can access this
    labels using the 'node.dup' sintaxis.

    Method: the algorithm scans all nodes from the given leafName to
    the root. Nodes are assumed to be duplications when a species
    overlap is found between its child linages. Method is described
    more detail in:

    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
    T. Genome Biol. 2007;8(6):R109.
    """
    # Get the tree's root
    root = node.get_tree_root()

    # Checks that is actually rooted
    outgroups = root.get_children()
    if len(outgroups) != 2:
        raise TypeError, "Tree is not rooted"

    # Cautch the smaller outgroup (will be stored as the tree
    # outgroup)
    o1 = set([n.name for n in outgroups[0].get_leaves()])
    o2 = set([n.name for n in outgroups[1].get_leaves()])

    if len(o2) < len(o1):
        smaller_outg = outgroups[1]
    else:
        smaller_outg = outgroups[0]

    # Prepare to browse tree from leaf to root
    all_events = []
    current = node
    ref_spcs = node.species
    sister_leaves = set([])
    browsed_spcs = set([current.species])
    browsed_leaves = set([current])
    # get family Size
    fSize = len([n for n in root.get_leaves() if n.species == ref_spcs])

    # Clean previous analysis
    for n in root.get_descendants() + [root]:
        n.del_feature("evoltype")

    while current.up:
        # distances control (0.0 distance check)
        d = 0
        for s in current.get_sisters():
            for leaf in s.get_leaves():
                d += current.get_distance(leaf)
                sister_leaves.add(leaf)
        # Process sister node only if there is any new sequence.
        # (previene dupliaciones por nombres repetidos)
        sister_leaves = sister_leaves.difference(browsed_leaves)
        if len(sister_leaves) == 0:
            current = current.up
            continue
        # Gets species at both sides of event
        sister_spcs = set([n.species for n in sister_leaves])
        overlaped_spces = browsed_spcs & sister_spcs
        all_spcs = browsed_spcs | sister_spcs
        score = float(len(overlaped_spces)) / len(all_spcs)
        # Creates a new evolEvent
        event = EvolEvent()
        event.fam_size = fSize
        event.seed = node.name
        # event.e_newick  = current.up.get_newick()  # high mem usage!!
        event.sos = score
        event.outgroup = smaller_outg.name
        # event.allseqs   = set(current.up.get_leaf_names())
        event.in_seqs = set([n.name for n in browsed_leaves])
        event.out_seqs = set([n.name for n in sister_leaves])
        event.inparalogs = set(
            [n.name for n in browsed_leaves if n.species == ref_spcs])

        # If species overlap: duplication
        if score > sos_thr:  # and d > 0.0: Removed branch control.
            event.node = current.up
            event.etype = "D"
            event.outparalogs = set(
                [n.name for n in sister_leaves if n.species == ref_spcs])
            event.orthologs = set([])
            current.up.add_feature("evoltype", "D")
            all_events.append(event)

        # If NO species overlap: speciation
        elif score == sos_thr:
            event.node = current.up
            event.etype = "S"
            event.orthologs = set(
                [n.name for n in sister_leaves if n.species != ref_spcs])
            event.outparalogs = set([])
            current.up.add_feature("evoltype", "S")
            all_events.append(event)
        else:
            pass  # do not add event if distances == 0

        # Updates browsed species
        browsed_spcs |= sister_spcs
        browsed_leaves |= sister_leaves
        sister_leaves = set([])
        # And keep ascending
        current = current.up
    return all_events