def swap_tree(tree):
    """
    Return a copy of ``tree`` in which two non-overlapping nodes are swapped.

    Parameters
    ----------
    tree : str
        Newick string. If it contains no double quotes it is first passed
        through ``nwk.safe_newick_string``.

    Returns
    -------
    str
        The result of ``nwk.sort_tree`` on the modified tree, with all double
        quotes removed. If no pair of non-overlapping nodes can be found, the
        input tree is returned (sorted, quotes removed) without any swap.
    """
    # make safe tree
    if '"' not in tree:
        tree = nwk.safe_newick_string(tree)

    # candidate nodes: everything but the first node reported for the tree
    # NOTE(review): the first node is presumably the root -- confirm against
    # nwk.nodes_in_tree
    nodes = list(nwk.nodes_in_tree(tree))[1:]
    random.shuffle(nodes)

    # choose the first node to be swapped; nothing to do without candidates
    if not nodes:
        return nwk.sort_tree(tree).replace('"', '')
    nodeA = nodes.pop(0)

    # get another node that can be interchanged, i.e. one which neither
    # contains nodeA nor is contained in it (substring test on node strings)
    nodeB = next(
        (node for node in nodes if node not in nodeA and nodeA not in node),
        None)
    if nodeB is None:
        # no disjoint partner: swapping overlapping nodes would corrupt the
        # string, so the tree is handed back unchanged
        return nwk.sort_tree(tree).replace('"', '')

    # replace both nodes by placeholders first, so that inserting one node
    # cannot be mistaken for an occurrence of the other; a node only counts
    # when it is directly followed by ',' or ')'
    for node, dummy in ((nodeA, '#dummyA#'), (nodeB, '#dummyB#')):
        for delimiter in (',', ')'):
            tree = tree.replace(node + delimiter, dummy + delimiter)

    tree = tree.replace('#dummyA#', nodeB)
    tree = tree.replace('#dummyB#', nodeA)

    return nwk.sort_tree(tree).replace('"', '')
def swap_tree(tree):
    """
    Return a copy of ``tree`` in which two non-overlapping nodes are swapped.

    Parameters
    ----------
    tree : str
        Newick string. If it contains no double quotes it is first passed
        through ``nwk.safe_newick_string``.

    Returns
    -------
    str
        The result of ``nwk.sort_tree`` on the modified tree, with all double
        quotes removed. If no pair of non-overlapping nodes can be found, the
        input tree is returned (sorted, quotes removed) without any swap.
    """
    # make safe tree
    if '"' not in tree:
        tree = nwk.safe_newick_string(tree)

    # candidate nodes: everything but the first node reported for the tree
    # NOTE(review): the first node is presumably the root -- confirm against
    # nwk.nodes_in_tree
    nodes = list(nwk.nodes_in_tree(tree))[1:]
    random.shuffle(nodes)

    # choose the first node to be swapped; nothing to do without candidates
    if not nodes:
        return nwk.sort_tree(tree).replace('"', '')
    nodeA = nodes.pop(0)

    # get another node that can be interchanged, i.e. one which neither
    # contains nodeA nor is contained in it (substring test on node strings)
    nodeB = next(
        (node for node in nodes if node not in nodeA and nodeA not in node),
        None)
    if nodeB is None:
        # no disjoint partner: swapping overlapping nodes would corrupt the
        # string, so the tree is handed back unchanged
        return nwk.sort_tree(tree).replace('"', '')

    # replace both nodes by placeholders first, so that inserting one node
    # cannot be mistaken for an occurrence of the other; a node only counts
    # when it is directly followed by ',' or ')'
    for node, dummy in ((nodeA, '#dummyA#'), (nodeB, '#dummyB#')):
        for delimiter in (',', ')'):
            tree = tree.replace(node + delimiter, dummy + delimiter)

    tree = tree.replace('#dummyA#', nodeB)
    tree = tree.replace('#dummyB#', nodeA)

    return nwk.sort_tree(tree).replace('"', '')
def best_tree_brute_force(
        patterns,
        taxa,
        transitions,
        characters,
        proto_forms=False,
        verbose=False
        ):
    """
    This is an experimental parsimony version that allows for ordered
    character states.

    Every tree produced by ``all_rooted_binary_trees(*taxa)`` is scored by
    summing, over all patterns, a weight taken from the root entry of the
    result of ``sankoff_parsimony_up``.

    Parameters
    ----------
    patterns, transitions, characters : sequences
        Iterated in parallel; each triple is passed to
        ``sankoff_parsimony_up`` together with the taxa and the tree.
    taxa : sequence
        Taxon labels, unpacked into ``all_rooted_binary_trees``.
    proto_forms : sequence or False
        If given, ``proto_forms[i]`` selects which root weight is used for
        pattern ``i``; otherwise the minimal root weight is used.
    verbose : bool
        If True, print the index and Newick string of each tree.

    Returns
    -------
    tuple
        ``(bestTree, minScore)``: the list of sorted Newick strings of all
        trees sharing the lowest score, and that score.
    """
    # start from infinity rather than from a large magic number, so that
    # trees are still collected when every score is huge
    minScore = float('inf')
    bestTree = []

    for idx, tree in enumerate(all_rooted_binary_trees(*taxa), start=1):
        t = nwk.LingPyTree(tree)
        if verbose:
            print('[{0}] {1}...'.format(idx, t.newick))

        score = 0
        for i, (p, m, c) in enumerate(zip(patterns, transitions, characters)):
            weights = sankoff_parsimony_up(p, taxa, t, m, c)
            if not proto_forms:
                score += min(weights[t.root].values())
            else:
                score += weights[t.root][proto_forms[i]]

            # this tree is already worse than the best one: stop scoring it
            if score > minScore:
                break

        if score == minScore:
            bestTree.append(nwk.sort_tree(t.newick))
        elif score < minScore:
            minScore = score
            bestTree = [nwk.sort_tree(t.newick)]

    return bestTree, minScore
def best_tree_brute_force(
        patterns,
        taxa,
        transitions,
        characters,
        proto_forms=False,
        verbose=False
        ):
    """
    This is an experimental parsimony version that allows for ordered
    character states.

    Every tree produced by ``all_rooted_binary_trees(*taxa)`` is scored by
    summing, over all patterns, a weight taken from the root entry of the
    result of ``sankoff_parsimony_up``.

    Parameters
    ----------
    patterns, transitions, characters : sequences
        Iterated in parallel; each triple is passed to
        ``sankoff_parsimony_up`` together with the taxa and the tree.
    taxa : sequence
        Taxon labels, unpacked into ``all_rooted_binary_trees``.
    proto_forms : sequence or False
        If given, ``proto_forms[i]`` selects which root weight is used for
        pattern ``i``; otherwise the minimal root weight is used.
    verbose : bool
        If True, print the index and Newick string of each tree.

    Returns
    -------
    tuple
        ``(bestTree, minScore)``: the list of sorted Newick strings of all
        trees sharing the lowest score, and that score.
    """
    # start from infinity rather than from a large magic number, so that
    # trees are still collected when every score is huge
    minScore = float('inf')
    bestTree = []

    for idx, tree in enumerate(all_rooted_binary_trees(*taxa), start=1):
        t = nwk.LingPyTree(tree)
        if verbose:
            print('[{0}] {1}...'.format(idx, t.newick))

        score = 0
        for i, (p, m, c) in enumerate(zip(patterns, transitions, characters)):
            weights = sankoff_parsimony_up(p, taxa, t, m, c)
            if not proto_forms:
                score += min(weights[t.root].values())
            else:
                score += weights[t.root][proto_forms[i]]

            # this tree is already worse than the best one: stop scoring it
            if score > minScore:
                break

        if score == minScore:
            bestTree.append(nwk.sort_tree(t.newick))
        elif score < minScore:
            minScore = score
            bestTree = [nwk.sort_tree(t.newick)]

    return bestTree, minScore
def all_rooted_binary_trees(*taxa):
    """
    Compute all rooted trees.

    Notes
    -----
    This procedure yields all rooted binary trees for a given set of taxa, as
    described in :bib:`Felsenstein1978`. It implements a depth-first search.

    Parameters
    ----------
    *taxa : str
        The taxon labels.

    Yields
    ------
    str
        For up to two taxa a single string of the form ``'(a,b);'``. For three
        or more taxa, one string per tree, built by successive insertion of
        the remaining taxa and yielded without a terminating semicolon.
    """
    if len(taxa) <= 2:
        yield '(' + ','.join(taxa) + ');'
        return

    # stack of partial trees, each paired with the taxa still to be inserted
    stack = [('(' + ','.join(taxa[:2]) + ')', list(taxa[2:]))]

    while stack:
        tree, rest = stack.pop()

        # add next taxon next to every node of the partial tree
        next_taxon = rest.pop()
        nodes = list(nwk.nodes_in_tree(tree))
        random.shuffle(nodes)
        for node in nodes:
            # NOTE(review): str.replace substitutes every occurrence of the
            # node string; this presumably relies on node strings occurring
            # only once in the tree -- confirm against nwk.nodes_in_tree
            new_tree = tree.replace(node, '(' + next_taxon + ',' + node + ')')

            if rest:
                # taxa are left: continue the search from this partial tree,
                # inserting the remaining taxa in a random order
                remaining = list(rest)
                random.shuffle(remaining)
                stack.append((new_tree, remaining))
            else:
                # all taxa are placed: the tree is complete
                yield new_tree
def all_rooted_binary_trees(*taxa):
    """
    Compute all rooted trees.

    Notes
    -----
    This procedure yields all rooted binary trees for a given set of taxa, as
    described in :bib:`Felsenstein1978`. It implements a depth-first search.

    Parameters
    ----------
    *taxa : str
        The taxon labels.

    Yields
    ------
    str
        For up to two taxa a single string of the form ``'(a,b);'``. For three
        or more taxa, one string per tree, built by successive insertion of
        the remaining taxa and yielded without a terminating semicolon.
    """
    if len(taxa) <= 2:
        yield '(' + ','.join(taxa) + ');'
        return

    # stack of partial trees, each paired with the taxa still to be inserted
    stack = [('(' + ','.join(taxa[:2]) + ')', list(taxa[2:]))]

    while stack:
        tree, rest = stack.pop()

        # add next taxon next to every node of the partial tree
        next_taxon = rest.pop()
        nodes = list(nwk.nodes_in_tree(tree))
        random.shuffle(nodes)
        for node in nodes:
            # NOTE(review): str.replace substitutes every occurrence of the
            # node string; this presumably relies on node strings occurring
            # only once in the tree -- confirm against nwk.nodes_in_tree
            new_tree = tree.replace(node, '(' + next_taxon + ',' + node + ')')

            if rest:
                # taxa are left: continue the search from this partial tree,
                # inserting the remaining taxa in a random order
                remaining = list(rest)
                random.shuffle(remaining)
                stack.append((new_tree, remaining))
            else:
                # all taxa are placed: the tree is complete
                yield new_tree
def _heuristic_parsimony_search(
        taxa,
        patterns,
        transitions,
        characters,
        guide_tree,
        iterations,
        sample_steps,
        stop_iteration,
        logfile
        ):
    """Run the tree search for heuristic_parsimony, logging to an open file or None."""
    if not guide_tree:
        guide_tree = random_tree(taxa)

    def parsimony_score(lp_tree):
        # total weight of the tree, summed over all patterns
        return sum(
            sankoff_parsimony_up(p, taxa, lp_tree, t, c, weight_only=True)
            for p, t, c in zip(patterns, transitions, characters))

    lower_bound = parsimony_score(nwk.LingPyTree(guide_tree))
    print("[i] Lower Bound (in guide tree):", lower_bound)

    # we start doing the same as in the case of the calculation of all rooted
    # trees
    if len(taxa) <= 2:
        return '(' + ','.join(taxa) + ');'

    # make queue of (tree, score) pairs still to be used as starting points
    tree = nwk.sort_tree(guide_tree)
    queue = [(tree, lower_bound)]
    # a set, since membership is tested for every candidate tree
    visited = {tree}
    trees = [tree]

    # create a generator for all rooted binary trees
    gen = all_rooted_binary_trees(*taxa)

    n_taxa = len(taxa)
    previous = 0
    forest = []

    def consider(candidate):
        # register a candidate tree if it is new, then report progress
        nonlocal previous
        if candidate not in visited:
            forest.append(candidate)
            visited.add(candidate)
        # `previous` prevents printing the same count more than once
        if previous < len(visited) and len(visited) % sample_steps == 0:
            print("[i] Investigated {0} trees so far, currently holding {1} trees with best score of {2}.".format(len(visited), len(trees), lower_bound))
            previous = len(visited)

    while queue:
        # modify queue: best-scoring trees come first
        queue.sort(key=lambda x: x[1])

        forest = []

        # determine proportions, depending on the number of optimal trees
        if len(trees) < 50:
            props = [4, 1, 0.5, 0.25]
        elif len(trees) < 100:
            props = [2, 1, 0.5, 0.5]
        else:
            props = [1, 1, 1, 1]

        # try and get the derivations from the best trees
        for i in range(int(props[0] * n_taxa) or 5):
            consider(swap_tree(random.choice(trees)))

        # derive trees from the queue; we change the tree to be swapped at a
        # certain number of steps
        step = n_taxa // 4 or 2
        for i in range(int(props[1] * n_taxa) or 5):
            # if the queue is exhausted, keep swapping the current tree
            if i % step == 0 and queue:
                tree = queue.pop(0)[0]
                if tree.endswith(';'):
                    tree = tree[:-1]
            consider(swap_tree(tree))

        # go on with random trees
        for i in range(int(props[2] * n_taxa) or 5):
            consider(nwk.sort_tree(random_tree(taxa)))

        # sample from the generator; be careful with stop of iteration here,
        # since the generator may already be exhausted
        for i in range(int(props[3] * n_taxa) or 5):
            try:
                candidate = nwk.sort_tree(next(gen))
            except StopIteration:
                break
            consider(candidate)

        # check whether forest is empty, if this is the case, try to fill it
        # by adding new items from the iteration process, and do this, until
        # the iterator is exhausted, to make sure that the exact number of
        # possible trees as wished by the user is also tested
        if not forest:
            for generated in gen:
                candidate = nwk.sort_tree(generated)
                if candidate not in visited:
                    visited.add(candidate)
                    forest.append(candidate)
                    break

        best_scores = []
        for tree in forest:
            lp_tree = nwk.LingPyTree(tree)
            score = parsimony_score(lp_tree)
            if logfile is not None:
                logfile.write(str(score) + '\t' + lp_tree.newick + ';\n')
            best_scores.append((tree, score))

        # important to include at least one of the trees to the queue,
        # otherwise the program terminates at points where we don't want it to
        # terminate
        ranked = sorted(best_scores, key=lambda x: x[1])
        for tree, score in ranked[:len(best_scores) // 4 or 1]:
            queue.append((tree, score))
            if score < lower_bound:
                trees = [tree]
                lower_bound = score
            elif score == lower_bound:
                trees.append(tree)

        # check before terminating whether more iterations should be carried
        # out (in case scores are not satisfying)
        if len(visited) > iterations:
            if stop_iteration:
                break
            answer = input("[?] Number of chosen iterations is reached, do you want to go on with the analysis? y/n ").strip().lower()
            if answer != 'y':
                break
            # ask again until the user enters a valid integer
            while True:
                number = input("[?] How many iterations? ")
                try:
                    iterations += int(number)
                    break
                except ValueError:
                    pass

    return trees, lower_bound


def heuristic_parsimony(
        taxa,
        patterns,
        transitions,
        characters,
        guide_tree=False,
        verbose=True,
        lower_bound=False,
        iterations=300,
        sample_steps=100,
        log=False,
        stop_iteration=False
        ):
    """
    Try to make a heuristic parsimony calculation.

    Note
    ----
    This calculation uses the following heuristic to quickly walk through the
    tree space:

    1. Start from a guide tree provided by the user, or from a random tree.
    2. In each iteration step, create new trees and use them, if they are not
       already visited:

       1. Create a certain amount of trees by swapping the trees which
          currently have the best scores.
       2. Create a certain amount of trees by swapping the trees which have
          very good scores (one fourth of the best trees of each run) from
          the queue.
       3. Create a certain amount of random trees to search also in different
          regions of the tree space and increase the chances of finding
          another maximum.
       4. Take a certain amount of random samples from a tree-generator which
          will successively produce all possible trees in a randomized order.

    3. In each iteration, a larger range of trees (around 100, depending on
       the number of taxonomic units) is created and investigated. Once this
       is done, the best 25% of the results are appended to the queue. A
       specific array stores the trees with minimal scores and updates them
       successively. The queue is re-ordered in each iteration step,
       according to the score of the trees in the queue. The proportion of
       trees which are harvested from the four different procedures will
       change depending on the number of trees with currently minimal scores.
       If there are very many trees with minimal score, the algorithm will
       increase the number of random trees in order to broaden the search. If
       the number of optimal trees is small, the algorithm tries to stick to
       those few trees in order to find their optimal neighbors.

    This procedure allows to search the tree space rather efficiently, and
    for smaller datasets, it optimizes rather satisfyingly. Unless one takes
    as many iterations as there are possible trees in the data, however, this
    procedure is never guaranteed to find the best tree or the best trees.

    Parameters
    ----------
    taxa : sequence
        The taxon labels.
    patterns, transitions, characters : sequences
        Iterated in parallel and passed to ``sankoff_parsimony_up``.
    guide_tree : str or False
        Tree to start from; a tree from ``random_tree(taxa)`` is used if this
        is not given.
    verbose : bool
        Accepted for interface compatibility; currently not consulted,
        progress messages are always printed.
    lower_bound : number or False
        Accepted for interface compatibility; the initial bound is always
        recomputed from the guide tree.
    iterations : int
        Once more trees than this have been visited, the search stops or
        asks the user whether to go on.
    sample_steps : int
        A progress message is printed whenever the number of visited trees
        is a multiple of this value.
    log : bool
        If True, write score and tree of every investigated tree to
        ``log.log`` in the working directory (the file is overwritten).
    stop_iteration : bool
        If True, stop without prompting once ``iterations`` is exceeded.

    Returns
    -------
    tuple or str
        ``(trees, lower_bound)``: the trees with the lowest score found and
        that score. For two or fewer taxa, the single Newick string
        ``'(a,b);'`` is returned instead.
    """
    if not log:
        return _heuristic_parsimony_search(
            taxa, patterns, transitions, characters, guide_tree, iterations,
            sample_steps, stop_iteration, None)

    # the context manager guarantees the log is flushed and closed, also when
    # the search returns early or raises
    with open('log.log', 'w') as logfile:
        return _heuristic_parsimony_search(
            taxa, patterns, transitions, characters, guide_tree, iterations,
            sample_steps, stop_iteration, logfile)
def _heuristic_parsimony_search(
        taxa,
        patterns,
        transitions,
        characters,
        guide_tree,
        iterations,
        sample_steps,
        stop_iteration,
        logfile
        ):
    """Run the tree search for heuristic_parsimony, logging to an open file or None."""
    if not guide_tree:
        guide_tree = random_tree(taxa)

    def parsimony_score(lp_tree):
        # total weight of the tree, summed over all patterns
        return sum(
            sankoff_parsimony_up(p, taxa, lp_tree, t, c, weight_only=True)
            for p, t, c in zip(patterns, transitions, characters))

    lower_bound = parsimony_score(nwk.LingPyTree(guide_tree))
    print("[i] Lower Bound (in guide tree):", lower_bound)

    # we start doing the same as in the case of the calculation of all rooted
    # trees
    if len(taxa) <= 2:
        return '(' + ','.join(taxa) + ');'

    # make queue of (tree, score) pairs still to be used as starting points
    tree = nwk.sort_tree(guide_tree)
    queue = [(tree, lower_bound)]
    # a set, since membership is tested for every candidate tree
    visited = {tree}
    trees = [tree]

    # create a generator for all rooted binary trees
    gen = all_rooted_binary_trees(*taxa)

    n_taxa = len(taxa)
    previous = 0
    forest = []

    def consider(candidate):
        # register a candidate tree if it is new, then report progress
        nonlocal previous
        if candidate not in visited:
            forest.append(candidate)
            visited.add(candidate)
        # `previous` prevents printing the same count more than once
        if previous < len(visited) and len(visited) % sample_steps == 0:
            print("[i] Investigated {0} trees so far, currently holding {1} trees with best score of {2}.".format(len(visited), len(trees), lower_bound))
            previous = len(visited)

    while queue:
        # modify queue: best-scoring trees come first
        queue.sort(key=lambda x: x[1])

        forest = []

        # determine proportions, depending on the number of optimal trees
        if len(trees) < 50:
            props = [4, 1, 0.5, 0.25]
        elif len(trees) < 100:
            props = [2, 1, 0.5, 0.5]
        else:
            props = [1, 1, 1, 1]

        # try and get the derivations from the best trees
        for i in range(int(props[0] * n_taxa) or 5):
            consider(swap_tree(random.choice(trees)))

        # derive trees from the queue; we change the tree to be swapped at a
        # certain number of steps
        step = n_taxa // 4 or 2
        for i in range(int(props[1] * n_taxa) or 5):
            # if the queue is exhausted, keep swapping the current tree
            if i % step == 0 and queue:
                tree = queue.pop(0)[0]
                if tree.endswith(';'):
                    tree = tree[:-1]
            consider(swap_tree(tree))

        # go on with random trees
        for i in range(int(props[2] * n_taxa) or 5):
            consider(nwk.sort_tree(random_tree(taxa)))

        # sample from the generator; be careful with stop of iteration here,
        # since the generator may already be exhausted
        for i in range(int(props[3] * n_taxa) or 5):
            try:
                candidate = nwk.sort_tree(next(gen))
            except StopIteration:
                break
            consider(candidate)

        # check whether forest is empty, if this is the case, try to fill it
        # by adding new items from the iteration process, and do this, until
        # the iterator is exhausted, to make sure that the exact number of
        # possible trees as wished by the user is also tested
        if not forest:
            for generated in gen:
                candidate = nwk.sort_tree(generated)
                if candidate not in visited:
                    visited.add(candidate)
                    forest.append(candidate)
                    break

        best_scores = []
        for tree in forest:
            lp_tree = nwk.LingPyTree(tree)
            score = parsimony_score(lp_tree)
            if logfile is not None:
                logfile.write(str(score) + '\t' + lp_tree.newick + ';\n')
            best_scores.append((tree, score))

        # important to include at least one of the trees to the queue,
        # otherwise the program terminates at points where we don't want it to
        # terminate
        ranked = sorted(best_scores, key=lambda x: x[1])
        for tree, score in ranked[:len(best_scores) // 4 or 1]:
            queue.append((tree, score))
            if score < lower_bound:
                trees = [tree]
                lower_bound = score
            elif score == lower_bound:
                trees.append(tree)

        # check before terminating whether more iterations should be carried
        # out (in case scores are not satisfying)
        if len(visited) > iterations:
            if stop_iteration:
                break
            answer = input("[?] Number of chosen iterations is reached, do you want to go on with the analysis? y/n ").strip().lower()
            if answer != 'y':
                break
            # ask again until the user enters a valid integer
            while True:
                number = input("[?] How many iterations? ")
                try:
                    iterations += int(number)
                    break
                except ValueError:
                    pass

    return trees, lower_bound


def heuristic_parsimony(
        taxa,
        patterns,
        transitions,
        characters,
        guide_tree=False,
        verbose=True,
        lower_bound=False,
        iterations=300,
        sample_steps=100,
        log=False,
        stop_iteration=False
        ):
    """
    Try to make a heuristic parsimony calculation.

    Note
    ----
    This calculation uses the following heuristic to quickly walk through the
    tree space:

    1. Start from a guide tree provided by the user, or from a random tree.
    2. In each iteration step, create new trees and use them, if they are not
       already visited:

       1. Create a certain amount of trees by swapping the trees which
          currently have the best scores.
       2. Create a certain amount of trees by swapping the trees which have
          very good scores (one fourth of the best trees of each run) from
          the queue.
       3. Create a certain amount of random trees to search also in different
          regions of the tree space and increase the chances of finding
          another maximum.
       4. Take a certain amount of random samples from a tree-generator which
          will successively produce all possible trees in a randomized order.

    3. In each iteration, a larger range of trees (around 100, depending on
       the number of taxonomic units) is created and investigated. Once this
       is done, the best 25% of the results are appended to the queue. A
       specific array stores the trees with minimal scores and updates them
       successively. The queue is re-ordered in each iteration step,
       according to the score of the trees in the queue. The proportion of
       trees which are harvested from the four different procedures will
       change depending on the number of trees with currently minimal scores.
       If there are very many trees with minimal score, the algorithm will
       increase the number of random trees in order to broaden the search. If
       the number of optimal trees is small, the algorithm tries to stick to
       those few trees in order to find their optimal neighbors.

    This procedure allows to search the tree space rather efficiently, and
    for smaller datasets, it optimizes rather satisfyingly. Unless one takes
    as many iterations as there are possible trees in the data, however, this
    procedure is never guaranteed to find the best tree or the best trees.

    Parameters
    ----------
    taxa : sequence
        The taxon labels.
    patterns, transitions, characters : sequences
        Iterated in parallel and passed to ``sankoff_parsimony_up``.
    guide_tree : str or False
        Tree to start from; a tree from ``random_tree(taxa)`` is used if this
        is not given.
    verbose : bool
        Accepted for interface compatibility; currently not consulted,
        progress messages are always printed.
    lower_bound : number or False
        Accepted for interface compatibility; the initial bound is always
        recomputed from the guide tree.
    iterations : int
        Once more trees than this have been visited, the search stops or
        asks the user whether to go on.
    sample_steps : int
        A progress message is printed whenever the number of visited trees
        is a multiple of this value.
    log : bool
        If True, write score and tree of every investigated tree to
        ``log.log`` in the working directory (the file is overwritten).
    stop_iteration : bool
        If True, stop without prompting once ``iterations`` is exceeded.

    Returns
    -------
    tuple or str
        ``(trees, lower_bound)``: the trees with the lowest score found and
        that score. For two or fewer taxa, the single Newick string
        ``'(a,b);'`` is returned instead.
    """
    if not log:
        return _heuristic_parsimony_search(
            taxa, patterns, transitions, characters, guide_tree, iterations,
            sample_steps, stop_iteration, None)

    # the context manager guarantees the log is flushed and closed, also when
    # the search returns early or raises
    with open('log.log', 'w') as logfile:
        return _heuristic_parsimony_search(
            taxa, patterns, transitions, characters, guide_tree, iterations,
            sample_steps, stop_iteration, logfile)