def calcDistance(self):
    """Compute distances between the two selected tree files and show them.

    Nothing happens unless both ``self.path1`` and ``self.path2`` are
    non-empty.  Each file's extension (without the leading dot) is handed to
    DendroPy as the schema name, and both trees are read into one shared
    taxon namespace.  The symmetric difference, false positives/negatives,
    Euclidean distance and Robinson-Foulds distance are stored on ``self``;
    only the Euclidean and Robinson-Foulds values are written to the
    ``res1`` / ``res2`` widgets.
    """
    if self.path1 == '' or self.path2 == '':
        return

    # The file extension doubles as the schema name passed to DendroPy.
    self.fileEx1 = os.path.splitext(self.path1)[1][1:]
    self.fileEx2 = os.path.splitext(self.path2)[1][1:]

    # Both trees are loaded against the same namespace object.
    shared_taxa = dendropy.TaxonNamespace()
    self.tree1 = dendropy.Tree.get_from_path(
        self.path1, self.fileEx1, taxon_namespace=shared_taxa)
    self.tree2 = dendropy.Tree.get_from_path(
        self.path2, self.fileEx2, taxon_namespace=shared_taxa)
    for loaded in (self.tree1, self.tree2):
        loaded.encode_bipartitions()

    first, second = self.tree1, self.tree2
    print(treecompare.false_positives_and_negatives(first, second))

    # Calculate the distances.
    self.symDist = treecompare.symmetric_difference(first, second)
    self.fpnDist = treecompare.false_positives_and_negatives(first, second)
    self.eucDist = treecompare.euclidean_distance(first, second)
    self.rfDist = treecompare.robinson_foulds_distance(first, second)

    # Show the results.
    self.res1.setText(str(self.eucDist))  # Euclidean distance
    self.res2.setText(str(self.rfDist))  # Robinson-Foulds distance
def compareDendropyTrees(tr1, tr2):
    """Compare two DendroPy trees on the leaves they have in common.

    Both trees are modified in place: when their leaf label sets differ they
    are pruned to the shared labels and migrated to a fresh taxon namespace,
    and their bipartitions are always refreshed.

    Parameters
    ----------
    tr1 : dendropy tree object
    tr2 : dendropy tree object

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, rf)`` where ``nl`` is the number of shared
        leaf labels, ``ei1`` / ``ei2`` are the internal edge counts (seed
        edge excluded) of each tree, ``fp`` / ``fn`` are the two counts
        returned by ``false_positives_and_negatives(tr1, tr2)`` in that
        order, and ``rf`` is ``(fp + fn) / (ei1 + ei2)``.  ``rf`` is 0.0
        when ``fp + fn`` is zero, so identical trees without internal edges
        no longer raise ``ZeroDivisionError``.
    """
    from dendropy.calculate.treecompare \
        import false_positives_and_negatives

    lb1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    lb2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    com = lb1.intersection(lb2)

    # Restrict both trees to the shared leaf set only when they differ.
    if com != lb1 or com != lb2:
        shared = list(com)
        tns = dendropy.TaxonNamespace(shared)
        tr1.retain_taxa_with_labels(shared)
        tr1.migrate_taxon_namespace(tns)
        tr2.retain_taxa_with_labels(shared)
        tr2.migrate_taxon_namespace(tns)

    tr1.update_bipartitions()
    tr2.update_bipartitions()

    nl = len(com)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    fp, fn = false_positives_and_negatives(tr1, tr2)

    # With no differing bipartitions the rate is zero by definition; this
    # avoids 0/0 when neither tree has an internal edge.
    errors = fp + fn
    rf = float(errors) / (ei1 + ei2) if errors else 0.0

    return (nl, ei1, ei2, fp, fn, rf)
def false_positives_and_negatives(reference_tree, test_tree):
    """Deprecated alias for ``treecompare.false_positives_and_negatives``.

    Emits a DendroPy deprecation warning that points at the new location,
    then forwards the call, passing ``test_tree`` as ``comparison_tree``.
    """
    notice = "Deprecated since DendroPy 4: The 'dendropy.treecalc.false_positives_and_negatives()' function has moved to 'dendropy.calculate.treecompare.false_positives_and_negatives()'."
    legacy_usage = "from dendropy import treecalc\nd = treecalc.false_positives_and_negatives(...)"
    current_usage = "from dendropy.calculate import treecompare\nd = treecompare.false_positives_and_negatives(...)"
    deprecate.dendropy_deprecation_warning(
        preamble=notice,
        old_construct=legacy_usage,
        new_construct=current_usage,
    )
    counts = treecompare.false_positives_and_negatives(
        reference_tree=reference_tree,
        comparison_tree=test_tree,
    )
    return counts
def compare_trees(tr1, tr2):
    """Compare two DendroPy trees on the leaves they have in common.

    Both trees are modified in place: when their leaf label sets differ they
    are pruned to the shared labels and migrated to a fresh taxon namespace,
    and their bipartitions are always refreshed.

    Parameters
    ----------
    tr1 : dendropy tree object
    tr2 : dendropy tree object

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, sd, rf)`` where ``nl`` is the number of
        shared leaf labels, ``ei1`` / ``ei2`` are the internal edge counts
        (seed edge excluded), ``fp`` / ``fn`` are the two counts returned by
        ``false_positives_and_negatives(tr1, tr2)`` in that order, ``sd`` is
        ``(fp + fn) / (ei1 + ei2)`` and ``rf`` is
        ``(fp + fn) / (2 * nl - 6)``.  Both rates are 0.0 when ``fp + fn``
        is zero, so identical trees with a zero denominator no longer raise
        ``ZeroDivisionError``.
    """
    from dendropy.calculate.treecompare \
        import false_positives_and_negatives

    lb1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    lb2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    com = lb1.intersection(lb2)

    # Restrict both trees to the shared leaf set only when they differ.
    if com != lb1 or com != lb2:
        shared = list(com)
        tns = dendropy.TaxonNamespace(shared)
        tr1.retain_taxa_with_labels(shared)
        tr1.migrate_taxon_namespace(tns)
        tr2.retain_taxa_with_labels(shared)
        tr2.migrate_taxon_namespace(tns)

    tr1.update_bipartitions()
    tr2.update_bipartitions()

    nl = len(com)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    fp, fn = false_positives_and_negatives(tr1, tr2)
    errors = fp + fn

    # Note that the normalized symmetric difference equals the normalized RF
    # distance when both trees are fully resolved, i.e., binary.
    if errors:
        sd = float(errors) / (ei1 + ei2)
        rf = float(errors) / (2 * nl - 6)
    else:
        # No differing bipartitions: both rates are zero, and the
        # denominators (which can be 0 for tiny or star trees) are skipped.
        sd = 0.0
        rf = 0.0

    return (nl, ei1, ei2, fp, fn, sd, rf)
def compareRes(tree, taxa, anch, sp, outpath):
    """Count bipartition mismatches between two Newick tree files.

    ``sp`` and ``tree`` are file paths; both are read as unrooted Newick
    trees into one shared taxon namespace, with the tree from ``sp`` passed
    first to ``treecompare.false_positives_and_negatives``.  ``taxa``,
    ``anch`` and ``outpath`` are accepted but not used here.

    Returns whatever ``false_positives_and_negatives`` returns.
    """
    shared_ns = dendropy.TaxonNamespace()
    load_options = dict(taxon_namespace=shared_ns, rooting="force-unrooted")
    reference = dendropy.Tree.get_from_path(sp, "newick", **load_options)
    candidate = dendropy.Tree.get_from_path(tree, "newick", **load_options)
    return treecompare.false_positives_and_negatives(reference, candidate)
def false_positives_and_negatives(reference_tree, test_tree):
    """Backward-compatible shim around the relocated treecompare function.

    A DendroPy deprecation warning describing the old and new import paths
    is issued first; the comparison itself is delegated to
    ``treecompare.false_positives_and_negatives`` with ``test_tree`` supplied
    as the ``comparison_tree`` argument.
    """
    warning_fields = dict(
        preamble="Deprecated since DendroPy 4: The 'dendropy.treecalc.false_positives_and_negatives()' function has moved to 'dendropy.calculate.treecompare.false_positives_and_negatives()'.",
        old_construct="from dendropy import treecalc\nd = treecalc.false_positives_and_negatives(...)",
        new_construct="from dendropy.calculate import treecompare\nd = treecompare.false_positives_and_negatives(...)",
    )
    deprecate.dendropy_deprecation_warning(**warning_fields)
    return treecompare.false_positives_and_negatives(
        reference_tree=reference_tree,
        comparison_tree=test_tree,
    )
def get_fnrate(reftreepath,outtreepath):
    """Return a false-negative rate between two Newick tree files.

    Both files are read into one shared taxon namespace.  The rate is the
    second value returned by
    ``treecompare.false_positives_and_negatives(rtree, otree)`` divided by
    ``len(tns) - 3``, where ``tns`` is that shared namespace.

    Parameters
    ----------
    reftreepath : path of the Newick file passed as the first (reference) tree
    outtreepath : path of the Newick file passed as the second tree

    Returns
    -------
    float
        0.0 when the false-negative count is zero (this also avoids a
        ``ZeroDivisionError`` for namespaces of exactly three taxa);
        otherwise the count divided by ``len(tns) - 3``.
    """
    tns = dendropy.TaxonNamespace()
    rtree = dendropy.Tree.get(path=reftreepath,schema='newick', taxon_namespace=tns)
    otree = dendropy.Tree.get(path=outtreepath,schema='newick', taxon_namespace=tns)
    rtree.encode_bipartitions()
    otree.encode_bipartitions()

    fn_count = treecompare.false_positives_and_negatives(rtree, otree)[1]
    if fn_count == 0:
        # Nothing is missing, so the rate is zero whatever the denominator.
        return 0.0

    fn_rate = fn_count / float(len(tns) - 3)
    return fn_rate
def calcDistance(self):
    """Load both selected trees, compute their distances and display two.

    Returns immediately when either ``self.path1`` or ``self.path2`` is an
    empty string.  Otherwise the file extensions are used as DendroPy schema
    names, the trees are read into a common taxon namespace, four distance
    measures are stored on ``self`` and the Euclidean and Robinson-Foulds
    values are pushed to ``self.res1`` and ``self.res2``.
    """
    both_selected = self.path1 != '' and self.path2 != ''
    if not both_selected:
        return

    # Extension without the dot, used as the schema argument below.
    self.fileEx1 = (os.path.splitext(self.path1)[1])[1:]
    self.fileEx2 = (os.path.splitext(self.path2)[1])[1:]

    namespace = dendropy.TaxonNamespace()
    self.tree1 = dendropy.Tree.get_from_path(
        self.path1, self.fileEx1, taxon_namespace=namespace)
    self.tree2 = dendropy.Tree.get_from_path(
        self.path2, self.fileEx2, taxon_namespace=namespace)
    self.tree1.encode_bipartitions()
    self.tree2.encode_bipartitions()

    pair = (self.tree1, self.tree2)
    print(treecompare.false_positives_and_negatives(*pair))

    # Calculate the distances.
    self.symDist = treecompare.symmetric_difference(*pair)
    self.fpnDist = treecompare.false_positives_and_negatives(*pair)
    self.eucDist = treecompare.euclidean_distance(*pair)
    self.rfDist = treecompare.robinson_foulds_distance(*pair)

    # Show the results.
    euclidean_text = str(self.eucDist)
    robinson_foulds_text = str(self.rfDist)
    self.res1.setText(euclidean_text)
    self.res2.setText(robinson_foulds_text)
def calculateDistance(self):
    """Compute four tree distances for the selected files and display them.

    Nothing happens unless both ``self.path1`` and ``self.path2`` are
    non-empty.  The file extensions are used as DendroPy schema names, both
    trees are read into one shared taxon namespace, and the Euclidean,
    Robinson-Foulds, symmetric-difference and false-positive/negative
    results are written to ``dist1Value`` .. ``dist4Value`` in that order.
    """
    if self.path1 == '' or self.path2 == '':
        return

    # Get the file extensions (without the leading dot).
    self.fileExtension1 = os.path.splitext(self.path1)[1][1:]
    self.fileExtension2 = os.path.splitext(self.path2)[1][1:]

    # Open the tree files against a common namespace.
    shared_taxa = dendropy.TaxonNamespace()
    self.tree1 = dendropy.Tree.get_from_path(
        self.path1, self.fileExtension1, taxon_namespace=shared_taxa)
    self.tree2 = dendropy.Tree.get_from_path(
        self.path2, self.fileExtension2, taxon_namespace=shared_taxa)
    for loaded in (self.tree1, self.tree2):
        loaded.encode_bipartitions()

    first, second = self.tree1, self.tree2
    print(treecompare.false_positives_and_negatives(first, second))

    # Calculate the distances.
    self.symDist = treecompare.symmetric_difference(first, second)
    self.fpnDist = treecompare.false_positives_and_negatives(first, second)
    self.eucDist = treecompare.euclidean_distance(first, second)
    self.rfDist = treecompare.robinson_foulds_distance(first, second)

    # Show the distances.
    widgets_and_values = (
        (self.dist1Value, self.eucDist),
        (self.dist2Value, self.rfDist),
        (self.dist3Value, self.symDist),
        (self.dist4Value, self.fpnDist),
    )
    for widget, value in widgets_and_values:
        widget.setText(str(value))
def distance(file_path, file_format, file_path2):
    """Print four distance measures between two tree files.

    Both files are read with the same ``file_format`` schema into one shared
    taxon namespace.  The symmetric difference, Robinson-Foulds distance,
    false positives/negatives and Euclidean distance are printed, one per
    line; nothing is returned.
    """
    namespace = dendropy.TaxonNamespace()
    first = dendropy.Tree.get_from_path(
        file_path, file_format, taxon_namespace=namespace)
    second = dendropy.Tree.get_from_path(
        file_path2, file_format, taxon_namespace=namespace)

    # Computed in the same order as before; printed in report order below.
    symmetric = treecompare.symmetric_difference(first, second)
    euclidean = treecompare.euclidean_distance(first, second)
    fp_and_fn = treecompare.false_positives_and_negatives(first, second)
    robinson_foulds = treecompare.robinson_foulds_distance(first, second)

    report = (
        ("Symetric difference: ", symmetric),
        ("Robinson Foulds distance: ", robinson_foulds),
        ("False positives and negatives: ", fp_and_fn),
        ("Euclidean distance: ", euclidean),
    )
    for label, value in report:
        print(label, value)
def are_two_trees_incompatible(tree1, tree2):
    """Report whether two trees disagree on the taxa they share.

    When four or more labels are shared, both trees are modified in place:
    each is pruned to the shared labels, migrated to a new common taxon
    namespace, marked unrooted, has its basal bifurcation collapsed and its
    bipartitions refreshed.

    Parameters
    ----------
    tree1 : dendropy tree object
    tree2 : dendropy tree object

    Returns
    -------
    violates : bool
        True, if trees are NOT compatible
        False, if trees are compatible (always the case when fewer than
        four labels are shared)
    """
    first_leaves = get_leaf_set(tree1)
    second_leaves = get_leaf_set(tree2)
    common = list(first_leaves.intersection(second_leaves))
    # Flagged "CRITICAL!!!" by the original author; built before the size
    # check, exactly as before.
    common_ns = dendropy.TaxonNamespace(common)

    # Fewer than four shared leaves carry no topological information.
    if len(common) < 4:
        return False

    # Move each tree onto the shared leaf set (tree1 fully, then tree2).
    for current in (tree1, tree2):
        current.retain_taxa_with_labels(common)
        current.migrate_taxon_namespace(common_ns)
        current.is_rooted = False
        current.collapse_basal_bifurcation()
        current.update_bipartitions()

    # Any mismatch in either direction means the trees conflict.
    fp, fn = false_positives_and_negatives(tree1, tree2)
    return fp > 0 or fn > 0
def compareAnchoredRes(tree, taxa, achs, sp, outpath, trueAnch):
    """Compare two Newick trees after pruning them to ``taxa`` minus ``achs``.

    Each input tree is read as unrooted, cloned, pruned to the retained
    labels, derooted and written to a new temporary ``.nwk`` file created
    with ``dir=outpath`` (the files are not deleted here).  The two pruned
    files are then re-read into a fresh shared taxon namespace and compared.

    Parameters
    ----------
    tree : path of the second Newick file; also used as the prefix of its
        temporary file name
    taxa : iterable of taxon labels to keep
    achs : iterable of labels removed from ``taxa`` before pruning
    sp : path of the first Newick file (passed first to the comparison)
    outpath : directory argument given to ``tempfile.mkstemp``
    trueAnch : indexable; items 0 and 1 are embedded in the first temporary
        file's name

    Returns
    -------
    The result of ``treecompare.false_positives_and_negatives`` for the
    pruned ``sp`` tree versus the pruned ``tree`` tree.
    """
    import os  # local import: only needed to close mkstemp's descriptors

    taxa = set(taxa) - set(achs)
    anch = trueAnch
    tns = dendropy.TaxonNamespace()

    # First tree: prune a clone and write it out.
    tree1 = dendropy.Tree.get_from_path(sp, "newick", taxon_namespace=tns, rooting="force-unrooted")
    inferedTree = tree1.clone(2)
    inferedTree.retain_taxa_with_labels(taxa, update_bipartitions=True)
    inferedTree.deroot()
    ftmp1 = tempfile.mkstemp(
        suffix=".nwk", prefix="sp.nwk-" + str(anch[0]) + "-" + str(anch[1]), dir=outpath, text=None
    )
    # mkstemp returns an open OS-level descriptor; close it so it does not
    # leak, since the file is written by path below.
    os.close(ftmp1[0])
    inferedTree.write(path=ftmp1[1], schema="newick", suppress_rooting=True)

    # Second tree: same treatment.
    tree2 = dendropy.Tree.get_from_path(tree, "newick", taxon_namespace=tns, rooting="force-unrooted")
    inferedTree = tree2.clone(2)
    inferedTree.retain_taxa_with_labels(taxa, update_bipartitions=True)
    inferedTree.deroot()
    ftmp2 = tempfile.mkstemp(suffix=".nwk", prefix=tree + ".retained", dir=outpath, text=None)
    os.close(ftmp2[0])
    inferedTree.write(path=ftmp2[1], schema="newick", suppress_rooting=True)

    # Re-read both pruned trees into a fresh shared namespace.  The schema
    # is now given for the first file too; it was missing before.
    tns = dendropy.TaxonNamespace()
    tree1 = dendropy.Tree.get_from_path(ftmp1[1], "newick", taxon_namespace=tns, rooting="force-unrooted")
    tree2 = dendropy.Tree.get_from_path(ftmp2[1], "newick", taxon_namespace=tns, rooting="force-unrooted")

    res = treecompare.false_positives_and_negatives(tree1, tree2)
    return res
def compare_trees(tr1, tr2):
    """Compare two DendroPy trees on the leaves they have in common.

    Both trees are modified in place: when their leaf label sets differ they
    are pruned to the shared labels and migrated to a fresh taxon namespace,
    and their bipartitions are always refreshed.

    Parameters
    ----------
    tr1 : dendropy tree object
    tr2 : dendropy tree object

    Returns
    -------
    tuple
        ``(nl, ei1, ei2, fp, fn, sd, rf)`` where ``nl`` is the number of
        shared leaf labels, ``ei1`` / ``ei2`` are the internal edge counts
        (seed edge excluded), ``fp`` / ``fn`` are the two counts returned by
        ``false_positives_and_negatives(tr1, tr2)`` in that order, ``sd`` is
        ``(fp + fn) / (ei1 + ei2)`` and ``rf`` is
        ``(fp + fn) / (2 * nl - 6)``.  Both rates are 0.0 when ``fp + fn``
        is zero, so identical trees with a zero denominator no longer raise
        ``ZeroDivisionError``.
    """
    # Find leaf labels that are in both trees
    lb1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    lb2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}
    com = lb1.intersection(lb2)

    # Restrict trees to shared leaf set
    if com != lb1 or com != lb2:
        shared = list(com)
        tns = dendropy.TaxonNamespace(shared)
        tr1.retain_taxa_with_labels(shared)
        tr1.migrate_taxon_namespace(tns)
        tr2.retain_taxa_with_labels(shared)
        tr2.migrate_taxon_namespace(tns)

    # Update tree bipartitions
    tr1.update_bipartitions()
    tr2.update_bipartitions()

    # Compute number of leaves and number of internal edges
    nl = len(com)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    # Compute number of false positives and false negatives
    fp, fn = false_positives_and_negatives(tr1, tr2)
    errors = fp + fn

    if errors:
        # Compute symmetric difference rate
        sd = float(errors) / (ei1 + ei2)
        # Compute Robinson-Foulds error rate
        rf = float(errors) / (2 * nl - 6)
    else:
        # No differing bipartitions: both rates are zero, and the
        # denominators (which can be 0 for tiny or star trees) are skipped.
        sd = 0.0
        rf = 0.0

    return (nl, ei1, ei2, fp, fn, sd, rf)
#!/usr/bin/env python import sys import argparse import dendropy from dendropy.calculate import treecompare distance_functions = { "euclidean": treecompare.euclidean_distance, "bipartition": lambda t1, t2: sum( treecompare.false_positives_and_negatives(t1, t2)), "wrf": treecompare.weighted_robinson_foulds_distance, "weighted_robinson_foulds": treecompare.weighted_robinson_foulds_distance, "rf": treecompare.unweighted_robinson_foulds_distance, "unweighted_robinson_foulds": treecompare.unweighted_robinson_foulds_distance, } if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "original", type=argparse.FileType('r'), help="File with original trees") parser.add_argument( "reconstructed", type=argparse.FileType('r'), help="File with reconstructed trees") parser.add_argument(
def remove_outliers(treeList, strategy, outpath, e, summary):
    """Drop outlier trees from ``treeList`` in place and return the list.

    Lists with fewer than 10 trees are returned untouched.  Otherwise the
    behaviour depends on ``strategy``:

    * ``"consensus3"`` / ``"consensus10"``: a reference tree is obtained via
      ``findMRL(treeList, e, outpath, summary)``, appended to ``treeList``,
      and every tree is scored by the second value of
      ``false_positives_and_negatives(ref_tree, tree)``.  ``consensus3``
      deletes trees scoring above mean + 2 standard deviations;
      ``consensus10`` deletes the ``int(len / 4)`` highest-scoring trees.
    * ``"pairwise1"`` / ``"pairwise2"`` / ``"pairwise3"``: a pairwise matrix
      ``D`` is filled from ``false_positives_and_negatives`` for every pair.
      ``pairwise1`` ranks trees by a Mahalanobis score and deletes the top
      15%; ``pairwise3`` deletes the ``int(len / 5)`` trees with the largest
      column mean; ``pairwise2`` deletes trees whose column mean exceeds
      mean + 1.5 standard deviations.
    * any other value: the list is returned unchanged.

    Progress and diagnostics are printed along the way (Python 2 print
    statements).
    """
    print "the strategy is: " + strategy
    if len(treeList) < 10:
        print "number of trees is " + str(len(treeList)) + ". This is not enough for outlier removal!"
        return treeList
    if strategy == "consensus10" or strategy == "consensus3":
        ftmp = findMRL(treeList, e, outpath, summary)
        ref_tree = dendropy.Tree.get(path=ftmp, schema="newick")
        # The reference tree becomes the last element of treeList and is
        # scored against itself in the loop below.
        # NOTE(review): it is also still in the list that gets returned --
        # confirm that callers expect the consensus tree in the result.
        treeList.append(ref_tree)
        d = list()
        for tree in treeList:
            tree.encode_bipartitions()
            ref_tree.encode_bipartitions()
            res = treecompare.false_positives_and_negatives(ref_tree, tree)
            d.append(res[1])
        if strategy == "consensus3":
            mean = np.mean(d)
            # mean = mstats.mode(d)
            # mean = mean[0]
            print "the mean distance to consensus tree was: " + str(mean)
            st = np.std(d)
            print "the std of distances to consensus tree was: " + str(st)
            # Walk from the end so deletions do not shift pending indices.
            # NOTE(review): the range stops before index 0, so the first
            # tree is never tested -- confirm this is intended.
            for i in range(len(d) - 1, 0, -1):
                if d[i] > mean + 2.0 * st:
                    print "deleting " + str(i) + "th tree!"
                    print "d[i] to delete: " + str(d[i])
                    del treeList[i]
        else:
            sortIdx = np.argsort(d, 0)
            print len(sortIdx)
            print sortIdx
            m = int(len(sortIdx) / 4.0)
            print "deleting " + str(m) + " of the trees"
            # Indices of the m largest scores, highest index first so that
            # deleting one does not invalidate the remaining indices.
            idx = sorted([x for x in sortIdx[len(sortIdx) - m : len(sortIdx)]], reverse=True)
            print idx
            print d
            for i in idx:
                print "deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(d[i])
                del treeList[i]
    elif strategy == "pairwise1" or strategy == "pairwise2" or strategy == "pairwise3":
        D = np.ndarray(shape=(len(treeList), len(treeList)), dtype=float)
        # Every cell is assigned below: the diagonal to 0.0, D[i][j] to the
        # second returned count and D[j][i] to the first.
        for i in range(0, len(treeList)):
            D[i][i] = 0.0
            for j in range(i + 1, len(treeList)):
                tree1 = treeList[i]
                tree2 = treeList[j]
                tree1.encode_bipartitions()
                tree2.encode_bipartitions()
                res1 = treecompare.false_positives_and_negatives(tree1, tree2)
                D[i][j] = res1[1]
                D[j][i] = res1[0]
        if strategy == "pairwise1":
            d = np.mean(D, 1)
            C = np.cov(D)
            # NOTE(review): scipy's mahalanobis documents its third argument
            # as the INVERSE covariance matrix; C is passed uninverted --
            # confirm whether this is intended.
            v = [distance.mahalanobis(D[:, i], d, C) for i in range(0, len(treeList))]
            print v
            sortIdx = np.argsort(v, 0)
            m = int(len(sortIdx) * 0.15)
            idx = sorted([x for x in sortIdx[len(sortIdx) - m : len(sortIdx)]], reverse=True)
            for i in idx:
                print "deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(v[i])
                del treeList[i]
        elif strategy == "pairwise3":
            d = np.mean(D, 0)
            sortIdx = np.argsort(d, 0)
            print len(sortIdx)
            print sortIdx
            m = int(len(sortIdx) / 5.0)
            print "deleting " + str(m) + " of the trees"
            idx = sorted([x for x in sortIdx[len(sortIdx) - m : len(sortIdx)]], reverse=True)
            print idx
            print d
            for i in idx:
                print "deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(d[i])
                del treeList[i]
        else:
            d = np.mean(D, 0)
            print d
            mean = np.mean(d)
            st = np.std(d)
            idx = list()  # not used afterwards
            # NOTE(review): as in the consensus3 branch, index 0 is never
            # tested -- confirm this is intended.
            for k in range(len(d) - 1, 0, -1):
                if d[k] > mean + 1.5 * st:
                    print "deleting the tree " + str(k) + "the. The distance to consensus tree was: " + str(d[k])
                    del treeList[k]
    return treeList
def compare_trees(tr1, tr2):
    """
    Compares two trees

    Both trees are modified in place: they are unrooted, restricted to the
    shared leaf set when their leaf sets differ, and their bipartitions are
    refreshed.

    Parameters
    ----------
    tr1 : dendropy tree object
          First tree (typically the model tree)
    tr2 : dendropy tree object
          Second tree (typically the estimated tree)

    Returns
    -------
    nl : int
         Size of the shared leaf set, i.e., the number of leaves in both trees
    ei1 : int
          Number of internal edges in first tree
          (after restricting it to the shared leaf set)
    ei2 : int
          Number of internal edges in second tree
          (after restricting it to the shared leaf set)
    fn : int
         Number of edges in the first tree that are not in the second tree
    fp : int
         Number of edges in the second tree that are not in the first tree
    rf : float
         Normalized Robinson-Foulds (RF) distance
         between the first and second trees; 0.0 whenever fn + fp is zero,
         including the cases where 2*nl - 6 is not positive

    Example
    -------
    If tree 1 corresponds to "(((A,B,C),D),E);"
    and tree 2 corresponds to "((((A,B),C),D),E);",
    then the output is "5 1 2 0 1 0.25". In this example,
      + first and second trees share 5 leaves (A, B, C, D, E).
      + first tree has one internal edge "A,B,C|D,E"
      + second tree has two internal edges "A,B|C,D,E" and "A,B,C|D,E"
      + no edge in the first tree is missing from the second tree (FN = 0)
      + one edge "A,B|C,D,E" in the second tree is missing from the
        first tree (FP = 1)
      + normalized RF distance is (FP+FN)/(2*NL-6) = (1+0)/(2*5-6) = 0.25
    """

    # Unroot the two trees!
    tr1.is_rooted = False
    tr1.collapse_basal_bifurcation(set_as_unrooted_tree=True)

    tr2.is_rooted = False
    tr2.collapse_basal_bifurcation(set_as_unrooted_tree=True)

    # Restrict the two trees to the same leaf set if necessary!
    lb1 = {leaf.taxon.label for leaf in tr1.leaf_nodes()}
    lb2 = {leaf.taxon.label for leaf in tr2.leaf_nodes()}

    com = lb1.intersection(lb2)
    if com != lb1 or com != lb2:
        shared = list(com)
        tns = dendropy.TaxonNamespace(shared)

        tr1.retain_taxa_with_labels(shared)
        tr1.migrate_taxon_namespace(tns)

        tr2.retain_taxa_with_labels(shared)
        tr2.migrate_taxon_namespace(tns)

    # Compare trees!
    tr1.update_bipartitions()
    tr2.update_bipartitions()

    nl = len(com)
    ei1 = len(tr1.internal_edges(exclude_seed_edge=True))
    ei2 = len(tr2.internal_edges(exclude_seed_edge=True))

    # NOTE(review): the (fn, fp) unpacking order is kept as written; it
    # relies on how the installed DendroPy orders the returned pair --
    # confirm against the DendroPy version in use.
    [fn, fp] = false_positives_and_negatives(tr1, tr2)

    # With no differing edges the distance is zero by definition; this also
    # avoids dividing by zero when only three leaves are shared.
    errors = fn + fp
    rf = errors / (2.0 * nl - 6.0) if errors else 0.0

    return (nl, ei1, ei2, fn, fp, rf)
f.write('\nMinimum (non-zero) RF Distance: {}\n'.format( rf_pair_matrix[np.nonzero(rf_pair_matrix)].min())) f.write('Maximum RF Distance: {}\n'.format(rf_pair_matrix.max())) f.write('Mean RF Distance: {}\n'.format( rf_pair_matrix[np.nonzero(rf_pair_matrix)].mean())) f.write('Std. Dev. of RF Distance: {}\n'.format( np.sqrt(rf_pair_matrix[np.nonzero(rf_pair_matrix)].var()))) # Consensus Tree Methods print('Calculating consensus trees...') f.write('\n#### Strict Consensus Tree ####\n') strict_con_tree = tlist.consensus(min_freq=1.0) f.write('{}\n'.format(strict_con_tree.as_string('newick'))) strict_stats = np.zeros((N, 3)) for i in range(N): fp, fn = treecompare.false_positives_and_negatives(strict_con_tree, dp_trees[i]) strict_stats[i] = [fp, fn, fp + fn] pd_strict_stats = pd.DataFrame( data=strict_stats, index=combined_iters, columns=['False Positive', 'False Negative', 'RF Distance']) f.write('{}\n'.format(str(pd_strict_stats))) f.write('\nMinimum RF Distance: {}\n'.format(strict_stats[:, 2].min())) f.write('Maximum RF Distance: {}\n'.format(strict_stats[:, 2].max())) f.write('Mean RF Distance: {}\n'.format(strict_stats[:, 2].mean())) f.write('Std. Dev. of RF Distance: {}\n'.format( np.sqrt(strict_stats[:, 2].var()))) f.write('\n#### Majority Rule Consensus Tree ####\n') maj_con_tree = tlist.consensus(min_freq=0.5) f.write('{}\n'.format(maj_con_tree.as_string('newick')))
#result_file.write(method + '\n') for i in range(20): truth = '../../{}/{}/R{}/rose.tt'.format(data, data, i) predicted_tree_file = (data + '/' + method + '/R' + str(i) + '/out_tree.nwk') if (not os.path.isfile(predicted_tree_file) or os.stat(predicted_tree_file).st_size == 0): result_file.write(method + ',R' + str(i) + ',err,err\n') continue true_tree_file = (truth) tree1 = Tree.get_from_path(predicted_tree_file, "newick", taxon_namespace=tns) tree2 = Tree.get_from_path(true_tree_file, "newick", taxon_namespace=tns) tree1.encode_bipartitions() tree2.encode_bipartitions() print('R' + str(i), treecompare.false_positives_and_negatives(tree1, tree2)) result_file.write(method + ',R' + str(i) + ',' + ','.join([ str(x) for x in treecompare.false_positives_and_negatives( tree1, tree2) ])) result_file.write('\n') result_file.close()
# Score each method's predicted tree for replicates R0..R19 against the
# corresponding true tree and write one CSV-style row per (method, replicate)
# to 'result_<data>_nj.txt'.  Missing or empty predictions get 'err,err'.
# The context manager guarantees the file is closed even if a tree fails to
# load or compare.
with open('result_{}_nj.txt'.format(data), 'w') as result_file:
    for method in distance_methods:
        for i in range(20):
            truth = '../../{}/{}/R{}/rose.tt'.format(data, data, i)
            predicted_tree_file = (data + '/' + method + '/R' + str(i)
                                   + '/out_tree.nwk')
            if (not os.path.isfile(predicted_tree_file)
                    or os.stat(predicted_tree_file).st_size == 0):
                result_file.write(method + ',R' + str(i) + ',err,err\n')
                continue
            true_tree_file = truth
            tree1 = Tree.get_from_path(
                predicted_tree_file, "newick", taxon_namespace=tns)
            tree2 = Tree.get_from_path(
                true_tree_file, "newick", taxon_namespace=tns)
            tree1.encode_bipartitions()
            tree2.encode_bipartitions()
            # Compare once and reuse the pair for both the console and the
            # result file.
            fp_fn = treecompare.false_positives_and_negatives(tree1, tree2)
            print('R' + str(i), fp_fn)
            result_file.write(method + ',R' + str(i) + ','
                              + ','.join([str(x) for x in fp_fn]))
            result_file.write('\n')
def main():
    """Command-line entry point: summarise the best trees found in a log file.

    Expects exactly one positional argument, the log file path.  The log is
    scanned twice: first for the lowest ``search: cost`` value, then to
    collect the tree strings recorded at that cost ("best") and those within
    ``--near`` percent of it ("near").  The trees are written out, followed
    by pairwise symmetric-difference statistics, strict and majority-rule
    consensus trees with per-tree false positive/negative counts, and, when
    ``--true`` is given, the same counts against the supplied true tree.

    Output goes to ``--out`` (or stdout); with ``--separate_trees`` the tree
    listing goes to ``<out>.trees`` instead.  Exits with status 1 on bad
    arguments, an unreadable true tree, or ``--separate_trees`` without
    ``--out``.
    """
    parser = optparse.OptionParser(usage='ttp-parse-log [options] <log file>')
    parser.add_option('--out',
                      dest='out_path',
                      default=None,
                      help='Path for output')
    parser.add_option(
        '--near',
        dest='near_percent',
        default=20,
        help='Trees within <--near>% of the optimal cost will be captured')
    parser.add_option(
        '--true',
        dest='true_tree_path',
        default=None,
        help='Can provide a true tree to compare multiple optimal trees with')
    parser.add_option(
        '--include_near',
        action='store_true',
        dest='include_near',
        default=False,
        help='Include nearby optimal trees in summary statistics')
    parser.add_option(
        '--separate_trees',
        action='store_true',
        dest='separate_trees',
        default=False,
        help='Create two output files separating trees and statistics')
    options, args = parser.parse_args()

    # Exactly one positional argument (the log file) is required.
    if len(args) == 0 or len(args) > 1:
        parser.print_help()
        sys.exit(1)

    print('### {} Version {} ###'.format(NAME, VERSION))

    file_path = args[0]
    out_path = options.out_path
    near_percent = options.near_percent
    true_tree_path = options.true_tree_path
    include_near = options.include_near
    separate_trees = options.separate_trees

    print('Logfile: {}'.format(file_path))
    if true_tree_path is not None:
        try:
            true_tree = dp.Tree.get(path=true_tree_path, schema='newick')
        # NOTE(review): a bare except also traps KeyboardInterrupt and
        # SystemExit -- consider narrowing it.
        except:
            print('True tree path is not a valid tree file')
            sys.exit(1)
    else:
        # False doubles as the "no true tree supplied" marker tested below.
        true_tree = False

    # f receives the statistics, g the tree listing; they are the same
    # stream unless --separate_trees is given.
    if out_path:
        f = open(out_path, 'w+')
    else:
        f = sys.stdout

    if separate_trees:
        if not out_path:
            print('Cannot separate trees if using stdout')
            sys.exit(1)
        g = open('{}.trees'.format(out_path), 'w+')
    else:
        g = f

    final = False
    current_iter = 0
    best_cost = float("inf")
    accepted_iters = []  # not used in this function
    best_iters = []
    best_trees = []
    near_iters = []
    near_trees = []

    print('Parsing log file...')
    # First pass: find the lowest cost reported on any 'search: cost' line.
    with open(file_path, 'r') as h:
        for line in h:
            if line.strip().startswith('search: cost'):
                current_cost = int(re.search(r'\d+$', line.strip()).group())
                if current_cost < best_cost:
                    best_cost = current_cost

    # Upper bound for "near-optimal" costs; near_percent may arrive as a
    # string from the command line, hence the float() conversion.
    near_cost = float(best_cost * (1 + (float(near_percent) / 100)))

    # Second pass: record which iterations reached the best / near-best cost
    # and capture the tree string logged for each of them.
    with open(file_path, 'r') as h:
        for line in h:
            line = line.strip()
            if final and line.startswith('search: changed') and line.endswith(
                    'no'):
                break
            if line.startswith('search: iter'):
                current_iter = int(re.search(r'\d+$', line).group())
            elif line.startswith('search: final') and not line.startswith(
                    'search: final cost'):
                current_iter = 'Final'
                final = True
            elif line.startswith('search: cost') or line.startswith(
                    'search: final cost'):
                current_cost = int(re.search(r'\d+$', line).group())
                if current_cost == best_cost:
                    best_iters.append(current_iter)
                elif current_cost <= near_cost:
                    near_iters.append(current_iter)
            elif current_iter in best_iters and line.startswith(
                    'tree:') and line.endswith(';'):
                tree_string = re.search('tree: (.*)', line).group(1)
                new_tree = tree_string
                best_trees.append(new_tree)
                if final:
                    break
            elif current_iter in near_iters and line.startswith(
                    'tree:') and line.endswith(';'):
                tree_string = re.search('tree: (.*)', line).group(1)
                new_tree = tree_string
                near_trees.append(new_tree)

    assert len(best_iters) == len(best_trees)

    # Best and Near Trees
    g.write('#### Best Trees ####\n')
    for i in range(len(best_iters)):
        g.write('>{}\n'.format(str(best_iters[i])))
        g.write('{}\n'.format(best_trees[i]))
    g.write('\n#### Near Best Trees ####\n')
    for i in range(len(near_iters)):
        g.write('>{}\n'.format(str(near_iters[i])))
        g.write('{}\n'.format(near_trees[i]))

    # Choose which trees feed the statistics below.
    if include_near:
        combined_iters = best_iters + near_iters
        combined_trees = best_trees + near_trees
    else:
        combined_iters = best_iters
        combined_trees = best_trees
    N = len(combined_trees)

    f.write('\n#### Summary ####\n')
    f.write('There are {} trees with cost {}\n'.format(len(best_iters),
                                                       best_cost))
    f.write(
        'There are {} more trees within {}% of the best cost ({} < cost <= {})\n'
        .format(len(near_iters), near_percent, best_cost, near_cost))
    # NOTE(review): combined_trees[0] raises IndexError when no tree was
    # captured from the log -- confirm that cannot happen.
    test_t = dp.Tree.get_from_string(combined_trees[0], 'newick')
    f.write('Number of taxa is {}\n'.format(len(test_t.leaf_nodes())))

    # Pairwise Robinson Foulds
    print('Calculating pairwise Robinson-Foulds distances...')
    f.write('\n#### Pairwise Robinson-Foulds Distances ####\n')
    dp_trees = [dp.Tree.get_from_string(i, 'newick') for i in combined_trees]
    # NOTE(review): each tree above is parsed separately; presumably building
    # this TreeList is what brings them into one taxon namespace so the
    # comparisons below are valid -- confirm before reordering.
    tlist = dp.TreeList(dp_trees)
    rf_pair_matrix = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            if i != j:
                rf_pair_matrix[i, j] = treecompare.symmetric_difference(
                    dp_trees[i], dp_trees[j])
    pd.set_option('display.max_columns', None)
    pd_pair_matrix = pd.DataFrame(data=rf_pair_matrix,
                                  index=combined_iters,
                                  columns=combined_iters)
    f.write('{}\n'.format(str(pd_pair_matrix)))
    # Summary statistics are only written when there is more than one tree.
    if len(combined_iters) > 1:
        f.write('\nMinimum (non-zero) RF Distance: {}\n'.format(
            rf_pair_matrix[np.nonzero(rf_pair_matrix)].min()))
        f.write('Maximum RF Distance: {}\n'.format(rf_pair_matrix.max()))
        f.write('Mean RF Distance: {}\n'.format(
            rf_pair_matrix[np.nonzero(rf_pair_matrix)].mean()))
        f.write('Std. Dev. of RF Distance: {}\n'.format(
            np.sqrt(rf_pair_matrix[np.nonzero(rf_pair_matrix)].var())))

    # Consensus Tree Methods
    print('Calculating consensus trees...')
    f.write('\n#### Strict Consensus Tree ####\n')
    strict_con_tree = tlist.consensus(min_freq=1.0)
    f.write('{}\n'.format(strict_con_tree.as_string('newick')))
    # One row per tree: the two returned counts and their sum.
    strict_stats = np.zeros((N, 3))
    for i in range(N):
        fp, fn = treecompare.false_positives_and_negatives(
            strict_con_tree, dp_trees[i])
        strict_stats[i] = [fp, fn, fp + fn]
    pd_strict_stats = pd.DataFrame(
        data=strict_stats,
        index=combined_iters,
        columns=['False Positive', 'False Negative', 'RF Distance'])
    f.write('{}\n'.format(str(pd_strict_stats)))
    f.write('\nMinimum RF Distance: {}\n'.format(strict_stats[:, 2].min()))
    f.write('Maximum RF Distance: {}\n'.format(strict_stats[:, 2].max()))
    f.write('Mean RF Distance: {}\n'.format(strict_stats[:, 2].mean()))
    f.write('Std. Dev. of RF Distance: {}\n'.format(
        np.sqrt(strict_stats[:, 2].var())))

    f.write('\n#### Majority Rule Consensus Tree ####\n')
    maj_con_tree = tlist.consensus(min_freq=0.5)
    f.write('{}\n'.format(maj_con_tree.as_string('newick')))
    maj_stats = np.zeros((N, 3))
    for i in range(N):
        fp, fn = treecompare.false_positives_and_negatives(
            maj_con_tree, dp_trees[i])
        maj_stats[i] = [fp, fn, fp + fn]
    pd_maj_stats = pd.DataFrame(
        data=maj_stats,
        index=combined_iters,
        columns=['False Positive', 'False Negative', 'RF Distance'])
    f.write('{}\n'.format(str(pd_maj_stats)))
    f.write('\nMinimum RF Distance: {}\n'.format(maj_stats[:, 2].min()))
    f.write('Maximum RF Distance: {}\n'.format(maj_stats[:, 2].max()))
    f.write('Mean RF Distance: {}\n'.format(maj_stats[:, 2].mean()))
    f.write('Std. Dev. of RF Distance: {}\n'.format(
        np.sqrt(maj_stats[:, 2].var())))

    # Comparison with True Tree
    if true_tree:
        print('Calculating Robinson-Foulds distances to the true tree...')
        f.write('\n#### Comparison to True Tree ####\n')
        # N captured trees plus the two consensus trees are scored.
        true_matrix = np.zeros((N + 2, 3))
        true_plus_all = dp_trees + [strict_con_tree, maj_con_tree, true_tree]
        # NOTE(review): truelist is never read; presumably constructing it is
        # what places true_tree in the same taxon namespace as the other
        # trees -- confirm before removing.
        truelist = dp.TreeList(true_plus_all)
        for i in range(N + 2):
            fp, fn = treecompare.false_positives_and_negatives(
                true_tree, true_plus_all[i])
            true_matrix[i] = [fp, fn, fp + fn]
        combined_consensus_iters = combined_iters + [
            'Strict Consensus', 'Majority Consensus'
        ]
        # Mixed list: tree strings for captured trees, tree objects for the
        # two consensus trees.
        combined_consensus_trees = combined_trees + [
            strict_con_tree, maj_con_tree
        ]
        pd_true_matrix = pd.DataFrame(
            data=true_matrix,
            index=combined_consensus_iters,
            columns=['False Positive', 'False Negative', 'RF Distance'])
        f.write('{}\n'.format(str(pd_true_matrix)))
        f.write('\nMinimum RF Distance: {}\n'.format(true_matrix[:, 2].min()))
        f.write('Maximum RF Distance: {}\n'.format(true_matrix[:, 2].max()))
        f.write('Mean RF Distance: {}\n'.format(true_matrix[:, 2].mean()))
        f.write('Std. Dev. of RF Distance: {}\n'.format(
            np.sqrt(true_matrix[:, 2].var())))
        f.write('Tree {} is closest to the true tree\n'.format(
            combined_consensus_iters[np.argmin(true_matrix[:, 2])]))
        f.write('{}\n'.format(combined_consensus_trees[np.argmin(
            true_matrix[:, 2])]))

    # When no --out was given, f is sys.stdout and is closed here too.
    f.close()
    if out_path and not separate_trees:
        print(
            'Optimal and Near-Optimal Trees and Summary Statistics written to {}'
            .format(out_path))
    elif out_path and separate_trees:
        print('Optimal and Near-Optimal Trees written to {}.trees'.format(
            out_path))
        print('Summary Statistics written to {}'.format(out_path))
    # g may be the same object as f; closing it a second time is harmless.
    g.close()
true_tree_file = (truth) try: tree1 = Tree.get_from_path( predicted_tree_file, "newick", taxon_namespace=tns) tree2 = Tree.get_from_path( true_tree_file, "newick", taxon_namespace=tns) tree1.encode_bipartitions() tree2.encode_bipartitions() print("try") print('R'+str(i),treecompare.false_positives_and_negatives(tree1, tree2)) effective_samples = effective_samples+1 agg_false_positives = agg_false_negatives+treecompare.false_positives_and_negatives(tree1, tree2)[0] print(treecompare.false_positives_and_negatives(tree1, tree2)[0],treecompare.false_positives_and_negatives(tree1, tree2)[1]) agg_false_negatives = agg_false_negatives+treecompare.false_positives_and_negatives(tree1, tree2)[1] result_file.write('R'+str(i)+str(treecompare.false_positives_and_negatives(tree1, tree2))) except Exception as e: print("exception") print(e) result_file.write('R'+str(i)+"(err,err)\n") ave_false_positive = agg_false_positives/effective_samples ave_false_negative = agg_false_negatives/effective_samples result_file.write("total effective{}, averge false positive{},average false negative{}".format(effective_samples,ave_false_positive,ave_false_negative)) result_file.write('\n') result_file.close()