示例#1
0
def swap_tree(tree):
    """
    Exchange two randomly chosen, non-overlapping nodes of a Newick tree.

    Parameters
    ----------
    tree : str
        Newick string. If it contains no double quote, it is first passed
        through ``nwk.safe_newick_string``.

    Returns
    -------
    str
        The output of ``nwk.sort_tree`` for the modified string, with all
        double quotes removed. If no pair of interchangeable nodes can be
        found, the tree is returned in the same normalized form, but without
        any swap.
    """
    # make safe tree
    if '"' not in tree:
        tree = nwk.safe_newick_string(tree)

    # candidate nodes in random order; the first item is skipped
    # (presumably it is the full tree itself -- TODO confirm against nwk)
    nodes = list(nwk.nodes_in_tree(tree))[1:]
    random.shuffle(nodes)

    # Pick the first node and look for a partner that neither contains it nor
    # is contained in it. The partner stays None when no such node exists, so
    # that a nested node is never swapped (this would drop parts of the tree).
    nodeB = None
    if nodes:
        nodeA = nodes[0]
        for candidate in nodes[1:]:
            if candidate not in nodeA and nodeA not in candidate:
                nodeB = candidate
                break

    if nodeB is not None:
        # replace both nodes by placeholders first, so that the second
        # replacement cannot touch text inserted by the first one
        tree = tree.replace(nodeA + ',', '#dummyA#,')
        tree = tree.replace(nodeA + ')', '#dummyA#)')
        tree = tree.replace(nodeB + ',', '#dummyB#,')
        tree = tree.replace(nodeB + ')', '#dummyB#)')

        tree = tree.replace('#dummyA#', nodeB)
        tree = tree.replace('#dummyB#', nodeA)

    return nwk.sort_tree(tree).replace('"', '')
示例#2
0
def swap_tree(tree):
    """
    Exchange two randomly chosen, non-overlapping nodes of a Newick tree.

    Parameters
    ----------
    tree : str
        Newick string. If it contains no double quote, it is first passed
        through ``nwk.safe_newick_string``.

    Returns
    -------
    str
        The output of ``nwk.sort_tree`` for the modified string, with all
        double quotes removed. If no pair of interchangeable nodes can be
        found, the tree is returned in the same normalized form, but without
        any swap.
    """
    # make safe tree
    if '"' not in tree:
        tree = nwk.safe_newick_string(tree)

    # candidate nodes in random order; the first item is skipped
    # (presumably it is the full tree itself -- TODO confirm against nwk)
    nodes = list(nwk.nodes_in_tree(tree))[1:]
    random.shuffle(nodes)

    # Pick the first node and look for a partner that neither contains it nor
    # is contained in it. The partner stays None when no such node exists, so
    # that a nested node is never swapped (this would drop parts of the tree).
    nodeB = None
    if nodes:
        nodeA = nodes[0]
        for candidate in nodes[1:]:
            if candidate not in nodeA and nodeA not in candidate:
                nodeB = candidate
                break

    if nodeB is not None:
        # replace both nodes by placeholders first, so that the second
        # replacement cannot touch text inserted by the first one
        tree = tree.replace(nodeA + ',', '#dummyA#,')
        tree = tree.replace(nodeA + ')', '#dummyA#)')
        tree = tree.replace(nodeB + ',', '#dummyB#,')
        tree = tree.replace(nodeB + ')', '#dummyB#)')

        tree = tree.replace('#dummyA#', nodeB)
        tree = tree.replace('#dummyB#', nodeA)

    return nwk.sort_tree(tree).replace('"', '')
示例#3
0
def best_tree_brute_force(
        patterns,
        taxa,
        transitions,
        characters,
        proto_forms=False,
        verbose=False
        ):
    """
    Score every rooted binary tree over the taxa and keep the cheapest ones.

    Each tree produced by ``all_rooted_binary_trees`` is scored by summing,
    over the zipped ``patterns``, ``transitions`` and ``characters``, a weight
    taken from the root entry of ``sankoff_parsimony_up``: the minimal root
    weight, or, if ``proto_forms`` is given, the root weight of the proto-form
    with the same index as the pattern.

    Returns
    -------
    tuple
        A list with the sorted Newick strings of all trees sharing the lowest
        score, and that score.
    """
    best_score = 1000000000
    best_trees = []

    for number, newick in enumerate(all_rooted_binary_trees(*taxa), 1):
        candidate = nwk.LingPyTree(newick)
        if verbose:
            print('[{0}] {1}...'.format(number, candidate.newick))

        total = 0
        columns = zip(patterns, transitions, characters)
        for position, (pattern, matrix, chars) in enumerate(columns):
            root_weights = sankoff_parsimony_up(
                pattern, taxa, candidate, matrix, chars)[candidate.root]
            if proto_forms:
                total += root_weights[proto_forms[position]]
            else:
                total += min(root_weights.values())

            # no need to go on once the tree is worse than the best one
            if total > best_score:
                break

        if total < best_score:
            best_score = total
            best_trees = [nwk.sort_tree(candidate.newick)]
        elif total == best_score:
            best_trees.append(nwk.sort_tree(candidate.newick))

    return best_trees, best_score
示例#4
0
def best_tree_brute_force(patterns,
                          taxa,
                          transitions,
                          characters,
                          proto_forms=False,
                          verbose=False):
    """
    Exhaustively search all rooted binary trees for the lowest parsimony score.

    For every tree yielded by ``all_rooted_binary_trees``, the weights returned
    by ``sankoff_parsimony_up`` for the root are collected pattern by pattern.
    Without ``proto_forms`` the minimal root weight is added to the score of
    the tree, otherwise the root weight of ``proto_forms[i]`` for the i-th
    pattern is used.

    Returns
    -------
    tuple
        ``(trees, score)``, where ``trees`` lists the sorted Newick strings of
        all trees that reach the lowest score ``score``.
    """
    lowest = 1000000000
    winners = []

    for count, tree in enumerate(all_rooted_binary_trees(*taxa), 1):
        lp_tree = nwk.LingPyTree(tree)
        if verbose:
            print('[{0}] {1}...'.format(count, lp_tree.newick))

        current = 0
        for i, (pattern, matrix, chars) in enumerate(
                zip(patterns, transitions, characters)):
            at_root = sankoff_parsimony_up(
                pattern, taxa, lp_tree, matrix, chars)[lp_tree.root]
            current += (
                at_root[proto_forms[i]] if proto_forms
                else min(at_root.values()))

            # stop scoring as soon as the tree cannot be among the best
            if current > lowest:
                break

        if current < lowest:
            lowest = current
            winners = [nwk.sort_tree(lp_tree.newick)]
        elif current == lowest:
            winners.append(nwk.sort_tree(lp_tree.newick))

    return winners, lowest
示例#5
0
def all_rooted_binary_trees(*taxa):
    """
    Compute all rooted trees.

    Parameters
    ----------
    *taxa : str
        The names of the taxa.

    Yields
    ------
    str
        One Newick string per tree. For two or fewer taxa, a single string
        with a trailing semicolon is yielded; for more taxa, the strings are
        yielded without a trailing semicolon.

    Notes
    -----

    This procedure yields all rooted binary trees for a given set of taxa, as
    described in :bib:`Felsenstein1978`. It implements a depth-first search.
    The order in which nodes are expanded and taxa are added is randomized.
    """
    if len(taxa) <= 2:
        yield '(' + ','.join(taxa) + ');'
        return

    # stack of partial trees, each paired with the taxa still to be added;
    # every entry on the stack has at least one taxon left
    stack = [('(' + ','.join(taxa[:2]) + ')', list(taxa[2:]))]

    while stack:

        # add next taxon
        tree, rest = stack.pop()
        next_taxon = rest.pop()

        nodes = list(nwk.nodes_in_tree(tree))
        random.shuffle(nodes)
        for node in nodes:
            # NOTE(review): str.replace substitutes every occurrence of the
            # node string -- confirm that node strings are unique in the tree
            new_tree = tree.replace(node,
                                    '(' + next_taxon + ',' + node + ')')

            if rest:
                # each branch receives its own shuffled copy of the rest
                remaining = list(rest)
                random.shuffle(remaining)
                stack.append((new_tree, remaining))
            else:
                # all taxa are placed, the tree is complete
                yield new_tree
示例#6
0
def all_rooted_binary_trees(*taxa):
    """
    Compute all rooted trees.

    Parameters
    ----------
    *taxa : str
        The names of the taxa.

    Yields
    ------
    str
        One Newick string per tree. For two or fewer taxa, a single string
        with a trailing semicolon is yielded; for more taxa, the strings are
        yielded without a trailing semicolon.

    Notes
    -----

    This procedure yields all rooted binary trees for a given set of taxa, as
    described in :bib:`Felsenstein1978`. It implements a depth-first search.
    The order in which nodes are expanded and taxa are added is randomized.
    """
    if len(taxa) <= 2:
        yield '(' + ','.join(taxa) + ');'
        return

    # stack of partial trees, each paired with the taxa still to be added;
    # every entry on the stack has at least one taxon left
    stack = [('(' + ','.join(taxa[:2]) + ')', list(taxa[2:]))]

    while stack:

        # add next taxon
        tree, rest = stack.pop()
        next_taxon = rest.pop()

        nodes = list(nwk.nodes_in_tree(tree))
        random.shuffle(nodes)
        for node in nodes:
            # NOTE(review): str.replace substitutes every occurrence of the
            # node string -- confirm that node strings are unique in the tree
            new_tree = tree.replace(node, '(' + next_taxon + ',' + node + ')')

            if rest:
                # each branch receives its own shuffled copy of the rest
                remaining = list(rest)
                random.shuffle(remaining)
                stack.append((new_tree, remaining))
            else:
                # all taxa are placed, the tree is complete
                yield new_tree
示例#7
0
def heuristic_parsimony(
        taxa,
        patterns,
        transitions,
        characters,
        guide_tree=False,
        verbose=True,
        lower_bound=False,
        iterations=300,
        sample_steps=100,
        log=False,
        stop_iteration=False
        ):
    """
    Try to make a heuristic parsimony calculation.

    Parameters
    ----------
    taxa : sequence of str
        The names of the taxa.
    patterns, transitions, characters : iterable
        Parallel collections which are zipped and passed item by item to
        ``sankoff_parsimony_up``, together with the taxa and the tree.
    guide_tree : str (default=False)
        The tree from which the search starts. If no tree is passed, a tree is
        created with ``random_tree``.
    verbose : bool (default=True)
        If True, print the score of the guide tree and progress messages.
    lower_bound : (default=False)
        Currently ignored: the initial bound is always computed from the guide
        tree.
    iterations : int (default=300)
        Once more trees than this number have been visited, the search stops
        or asks whether it should go on (see ``stop_iteration``).
    sample_steps : int (default=100)
        A progress message is printed whenever the number of visited trees is
        a multiple of this number.
    log : bool (default=False)
        If True, write one line with score and Newick string per evaluated
        tree to the file ``log.log`` (overwriting an existing file).
    stop_iteration : bool (default=False)
        If True, stop without asking once ``iterations`` is exceeded,
        otherwise prompt the user on standard input.

    Returns
    -------
    tuple
        The list of trees with the best score found and that score. Note that
        for two or fewer taxa, a single Newick string is returned instead.

    Note
    ----
    This calculation uses the following heuristic to quickly walk through the
    tree space:

    1. Start from a guide tree or a random tree provided by the user.
    2. In each iteration step, create new trees and use them, if they are not
       already visited:
       1. Create a certain amount of trees by swapping the trees which currently
          have the best scores.
       2. Create a certain amount of trees by swapping the trees which
          have very good scores (one fourth of the best trees of each run) from
          the queue.
       3. Create a certain amount of random trees to search also in different
          regions of the tree space and increase the chances of finding another
          maximum.
       4. Take a certain amount of random samples from a tree-generator which
          will successively produce all possible trees in a randomized order.
    3. In each iteration, a larger range of trees (around 100, depending on the
       number of taxonomic units) is created and investigated. Once this is done,
       the best 25% of the results are appended to the queue. A specific array
       stores the trees with minimal scores and updates them successively. The
       queue is re-ordered in each iteration step, according to the score of the
       trees in the queue. The proportion of trees which are harvested from the
       four different procedures will change depending on the number of trees
       with currently minimal scores. If there are very many trees with minimal
       score, the algorithm will increase the number of random trees in order
       to broaden the search. If the number of optimal trees is small, the
       algorithm tries to stick to those few trees in order to find their
       optimal neighbors.

    This procedure allows to search the tree space rather efficiently, and for
    smaller datasets, it optimizes rather satisfyingly. Unless one takes as
    many iterations as there are possible trees in the data, however, this
    procedure is never guaranteed to find the best tree or the best trees.

    """
    # opening in write mode empties an existing log file; the try/finally
    # makes sure the handle is closed on every way out of the search
    logfile = open('log.log', 'w') if log else None
    try:
        return _heuristic_parsimony_search(
            taxa, patterns, transitions, characters, guide_tree, verbose,
            iterations, sample_steps, stop_iteration, logfile)
    finally:
        if logfile is not None:
            logfile.close()


def _total_parsimony_weight(lp_tree, taxa, patterns, transitions, characters):
    """Sum the weights of ``sankoff_parsimony_up`` over all patterns."""
    return sum(
        sankoff_parsimony_up(p, taxa, lp_tree, t, c, weight_only=True)
        for p, t, c in zip(patterns, transitions, characters))


def _heuristic_parsimony_search(taxa, patterns, transitions, characters,
                                guide_tree, verbose, iterations, sample_steps,
                                stop_iteration, logfile):
    """Run the search of ``heuristic_parsimony``, logging to an open file."""
    if not guide_tree:
        guide_tree = random_tree(taxa)

    lower_bound = _total_parsimony_weight(
        nwk.LingPyTree(guide_tree), taxa, patterns, transitions, characters)
    if verbose:
        print("[i] Lower Bound (in guide tree):", lower_bound)

    # we start doing the same as in the case of the calculation of all rooted
    # trees
    if len(taxa) <= 2:
        return '(' + ','.join(taxa) + ');'

    tree = nwk.sort_tree(guide_tree)
    queue = [(tree, lower_bound)]
    # a set gives fast membership tests; its size equals the number of
    # distinct trees investigated so far
    visited = {tree}
    trees = [tree]

    # create a generator for all rooted binary trees
    gen = all_rooted_binary_trees(*taxa)
    previous = 0

    def consider(candidate, forest):
        """Add an unvisited tree to the forest and report the progress."""
        nonlocal previous
        if candidate not in visited:
            forest.append(candidate)
            visited.add(candidate)
        if previous < len(visited) and len(visited) % sample_steps == 0:
            if verbose:
                print("[i] Investigated {0} trees so far, currently holding {1} trees with best score of {2}.".format(len(visited), len(trees), lower_bound))
            previous = len(visited)

    while queue:

        # trees with the lowest scores are taken from the queue first
        queue.sort(key=lambda x: x[1])

        forest = []

        # determine proportions, depending on the number of optimal trees
        if len(trees) < 50:
            props = [4, 1, 0.5, 0.25]
        elif len(trees) < 100:
            props = [2, 1, 0.5, 0.5]
        else:
            props = [1, 1, 1, 1]

        # try and get the derivations from the best trees
        for _ in range(int(props[0] * len(taxa)) or 5):
            consider(swap_tree(random.choice(trees)), forest)

        # derive trees from the queue, taking a new tree from the queue at a
        # certain number of steps
        step = len(taxa) // 4 or 2
        for i in range(int(props[1] * len(taxa)) or 5):
            if i % step == 0:
                try:
                    tree, _ = queue.pop(0)
                except IndexError:
                    # queue is exhausted, go on with the last tree taken
                    pass
                else:
                    if tree.endswith(';'):
                        tree = tree[:-1]
            consider(swap_tree(tree), forest)

        # add random trees
        for _ in range(int(props[2] * len(taxa)) or 5):
            consider(nwk.sort_tree(random_tree(taxa)), forest)

        # add trees from the generator, which may be exhausted at any point
        for _ in range(int(props[3] * len(taxa)) or 5):
            try:
                candidate = nwk.sort_tree(next(gen))
            except StopIteration:
                break
            consider(candidate, forest)

        # check whether forest is empty, if this is the case, take trees from
        # the generator until an unvisited one is found or the generator is
        # exhausted, to make sure that the exact number of possible trees as
        # wished by the user is also tested
        if not forest:
            for generated in gen:
                candidate = nwk.sort_tree(generated)
                if candidate not in visited:
                    visited.add(candidate)
                    forest.append(candidate)
                    break

        best_scores = []
        for candidate in forest:
            lp_tree = nwk.LingPyTree(candidate)
            score = _total_parsimony_weight(
                lp_tree, taxa, patterns, transitions, characters)
            if logfile is not None:
                logfile.write(str(score) + '\t' + lp_tree.newick + ';\n')
            best_scores.append((candidate, score))

        # important to include at least one of the trees to the queue,
        # otherwise the program terminates at points where we don't want it to
        # terminate
        best_scores.sort(key=lambda x: x[1])
        for candidate, score in best_scores[:len(best_scores) // 4 or 1]:
            queue.append((candidate, score))

            if score < lower_bound:
                trees = [candidate]
                lower_bound = score
            elif score == lower_bound:
                trees.append(candidate)

        # check before terminating whether more iterations should be carried
        # out (in case scores are not satisfying)
        if len(visited) > iterations:
            if stop_iteration:
                break
            answer = input("[?] Number of chosen iterations is reached, do you want to go on with the analysis? y/n ").strip().lower()
            if answer != 'y':
                break
            # ask again until a valid integer is given
            while True:
                number = input("[?] How many iterations? ")
                try:
                    iterations += int(number)
                    break
                except ValueError:
                    pass

    return trees, lower_bound
示例#8
0
def heuristic_parsimony(taxa,
                        patterns,
                        transitions,
                        characters,
                        guide_tree=False,
                        verbose=True,
                        lower_bound=False,
                        iterations=300,
                        sample_steps=100,
                        log=False,
                        stop_iteration=False):
    """
    Try to make a heuristic parsimony calculation.

    Parameters
    ----------
    taxa : sequence of str
        The names of the taxa.
    patterns, transitions, characters : iterable
        Parallel collections which are zipped and passed item by item to
        ``sankoff_parsimony_up``, together with the taxa and the tree.
    guide_tree : str (default=False)
        The tree from which the search starts. If no tree is passed, a tree is
        created with ``random_tree``.
    verbose : bool (default=True)
        If True, print the score of the guide tree and progress messages.
    lower_bound : (default=False)
        Currently ignored: the initial bound is always computed from the guide
        tree.
    iterations : int (default=300)
        Once more trees than this number have been visited, the search stops
        or asks whether it should go on (see ``stop_iteration``).
    sample_steps : int (default=100)
        A progress message is printed whenever the number of visited trees is
        a multiple of this number.
    log : bool (default=False)
        If True, write one line with score and Newick string per evaluated
        tree to the file ``log.log`` (overwriting an existing file).
    stop_iteration : bool (default=False)
        If True, stop without asking once ``iterations`` is exceeded,
        otherwise prompt the user on standard input.

    Returns
    -------
    tuple
        The list of trees with the best score found and that score. Note that
        for two or fewer taxa, a single Newick string is returned instead.

    Note
    ----
    This calculation uses the following heuristic to quickly walk through the
    tree space:

    1. Start from a guide tree or a random tree provided by the user.
    2. In each iteration step, create new trees and use them, if they are not
       already visited:
       1. Create a certain amount of trees by swapping the trees which currently
          have the best scores.
       2. Create a certain amount of trees by swapping the trees which
          have very good scores (one fourth of the best trees of each run) from
          the queue.
       3. Create a certain amount of random trees to search also in different
          regions of the tree space and increase the chances of finding another
          maximum.
       4. Take a certain amount of random samples from a tree-generator which
          will successively produce all possible trees in a randomized order.
    3. In each iteration, a larger range of trees (around 100, depending on the
       number of taxonomic units) is created and investigated. Once this is done,
       the best 25% of the results are appended to the queue. A specific array
       stores the trees with minimal scores and updates them successively. The
       queue is re-ordered in each iteration step, according to the score of the
       trees in the queue. The proportion of trees which are harvested from the
       four different procedures will change depending on the number of trees
       with currently minimal scores. If there are very many trees with minimal
       score, the algorithm will increase the number of random trees in order
       to broaden the search. If the number of optimal trees is small, the
       algorithm tries to stick to those few trees in order to find their
       optimal neighbors.

    This procedure allows to search the tree space rather efficiently, and for
    smaller datasets, it optimizes rather satisfyingly. Unless one takes as
    many iterations as there are possible trees in the data, however, this
    procedure is never guaranteed to find the best tree or the best trees.

    """
    # opening in write mode empties an existing log file; the try/finally
    # makes sure the handle is closed on every way out of the search
    logfile = open('log.log', 'w') if log else None
    try:
        return _heuristic_parsimony_search(
            taxa, patterns, transitions, characters, guide_tree, verbose,
            iterations, sample_steps, stop_iteration, logfile)
    finally:
        if logfile is not None:
            logfile.close()


def _total_parsimony_weight(lp_tree, taxa, patterns, transitions, characters):
    """Sum the weights of ``sankoff_parsimony_up`` over all patterns."""
    return sum(
        sankoff_parsimony_up(p, taxa, lp_tree, t, c, weight_only=True)
        for p, t, c in zip(patterns, transitions, characters))


def _heuristic_parsimony_search(taxa, patterns, transitions, characters,
                                guide_tree, verbose, iterations, sample_steps,
                                stop_iteration, logfile):
    """Run the search of ``heuristic_parsimony``, logging to an open file."""
    if not guide_tree:
        guide_tree = random_tree(taxa)

    lower_bound = _total_parsimony_weight(
        nwk.LingPyTree(guide_tree), taxa, patterns, transitions, characters)
    if verbose:
        print("[i] Lower Bound (in guide tree):", lower_bound)

    # we start doing the same as in the case of the calculation of all rooted
    # trees
    if len(taxa) <= 2:
        return '(' + ','.join(taxa) + ');'

    tree = nwk.sort_tree(guide_tree)
    queue = [(tree, lower_bound)]
    # a set gives fast membership tests; its size equals the number of
    # distinct trees investigated so far
    visited = {tree}
    trees = [tree]

    # create a generator for all rooted binary trees
    gen = all_rooted_binary_trees(*taxa)
    previous = 0

    def consider(candidate, forest):
        """Add an unvisited tree to the forest and report the progress."""
        nonlocal previous
        if candidate not in visited:
            forest.append(candidate)
            visited.add(candidate)
        if previous < len(visited) and len(visited) % sample_steps == 0:
            if verbose:
                print(
                    "[i] Investigated {0} trees so far, currently holding {1} trees with best score of {2}."
                    .format(len(visited), len(trees), lower_bound))
            previous = len(visited)

    while queue:

        # trees with the lowest scores are taken from the queue first
        queue.sort(key=lambda x: x[1])

        forest = []

        # determine proportions, depending on the number of optimal trees
        if len(trees) < 50:
            props = [4, 1, 0.5, 0.25]
        elif len(trees) < 100:
            props = [2, 1, 0.5, 0.5]
        else:
            props = [1, 1, 1, 1]

        # try and get the derivations from the best trees
        for _ in range(int(props[0] * len(taxa)) or 5):
            consider(swap_tree(random.choice(trees)), forest)

        # derive trees from the queue, taking a new tree from the queue at a
        # certain number of steps
        step = len(taxa) // 4 or 2
        for i in range(int(props[1] * len(taxa)) or 5):
            if i % step == 0:
                try:
                    tree, _ = queue.pop(0)
                except IndexError:
                    # queue is exhausted, go on with the last tree taken
                    pass
                else:
                    if tree.endswith(';'):
                        tree = tree[:-1]
            consider(swap_tree(tree), forest)

        # add random trees
        for _ in range(int(props[2] * len(taxa)) or 5):
            consider(nwk.sort_tree(random_tree(taxa)), forest)

        # add trees from the generator, which may be exhausted at any point
        for _ in range(int(props[3] * len(taxa)) or 5):
            try:
                candidate = nwk.sort_tree(next(gen))
            except StopIteration:
                break
            consider(candidate, forest)

        # check whether forest is empty, if this is the case, take trees from
        # the generator until an unvisited one is found or the generator is
        # exhausted, to make sure that the exact number of possible trees as
        # wished by the user is also tested
        if not forest:
            for generated in gen:
                candidate = nwk.sort_tree(generated)
                if candidate not in visited:
                    visited.add(candidate)
                    forest.append(candidate)
                    break

        best_scores = []
        for candidate in forest:
            lp_tree = nwk.LingPyTree(candidate)
            score = _total_parsimony_weight(
                lp_tree, taxa, patterns, transitions, characters)
            if logfile is not None:
                logfile.write(str(score) + '\t' + lp_tree.newick + ';\n')
            best_scores.append((candidate, score))

        # important to include at least one of the trees to the queue,
        # otherwise the program terminates at points where we don't want it to
        # terminate
        best_scores.sort(key=lambda x: x[1])
        for candidate, score in best_scores[:len(best_scores) // 4 or 1]:
            queue.append((candidate, score))

            if score < lower_bound:
                trees = [candidate]
                lower_bound = score
            elif score == lower_bound:
                trees.append(candidate)

        # check before terminating whether more iterations should be carried
        # out (in case scores are not satisfying)
        if len(visited) > iterations:
            if stop_iteration:
                break
            answer = input(
                "[?] Number of chosen iterations is reached, do you want to go on with the analysis? y/n "
            ).strip().lower()
            if answer != 'y':
                break
            # ask again until a valid integer is given
            while True:
                number = input("[?] How many iterations? ")
                try:
                    iterations += int(number)
                    break
                except ValueError:
                    pass

    return trees, lower_bound