def tree_prop(tree, tree_file_name):
    """Return the fraction of trees in a Newick file that match ``tree``.

    Each non-blank line of ``tree_file_name`` is parsed with ``Tree`` and
    compared to ``tree``; two trees count as the same when the normalized
    Robinson-Foulds distance reported by ``Tree.compare`` is exactly 0.0.

    Args:
        tree: the tree to look for (passed as first argument to
            ``Tree.compare``).
        tree_file_name: path to a text file with one tree per line.

    Returns:
        float: number of matching trees divided by the number of trees read,
        or 0.0 when the file contains no trees at all.
    """
    matching = 0
    total = 0
    # The context manager closes the file even if parsing or comparing raises.
    with open(tree_file_name, 'r') as tree_file:
        for line in tree_file:
            if not line.strip():
                # Blank (e.g. trailing) lines are not trees; do not count them.
                continue
            total += 1
            # Robinson-Foulds distance of zero is used to identify equal trees.
            if Tree.compare(tree, Tree(line))['norm_rf'] == 0.0:
                matching += 1
    if total == 0:
        # Nothing to compare against: avoid dividing by zero.
        return 0.0
    return float(matching) / total
def compare_main(args):
    """Compare input tree topologies against a reference tree.

    Args:
        args.tree: input tree(s), in Newick format (iterated one by one)
        args.ref (str): reference tree, in Newick format

    Prints:
        ``args`` and any warnings go to stderr. For every input tree whose
        matrix has the same shape as the reference matrix, one tab-separated
        row goes to stdout with:

        tree
        result['norm_rf']: normalized robinson-foulds distance (from 0 to 1)
        result['ref_edges_in_source']: compatibility score of the target tree
            with respect to the source tree (how many edges in reference are
            found in the source)
        result['source_edges_in_ref']: compatibility score of the source tree
            with respect to the reference tree (how many edges in source are
            found in the reference)
        dstat: sum of absolute differences between the two matrices over the
            positive reference entries, divided by the number of such entries
        rstat: geometric mean, over the same entries, of the pairwise ratios
            (each ratio inverted when above 1 so that it is at most 1)
    """
    print(args, file=sys.stderr)

    reference = Tree(args.ref)
    ref_names = [leaf.name for leaf in reference.get_leaves()]
    ref_name_set = set(ref_names)
    # One integer per leaf name, shared by every tree so matrices line up.
    leaf_idx = dict((name, pos) for pos, name in enumerate(ref_names))
    # Matrix of "distances" (node counts) for the reference tree.
    ref_matrix = tree2adjacency(reference, leaf_idx)

    for f in args.tree:
        candidate = Tree(f)
        candidate_names = [leaf.name for leaf in candidate.get_leaves()]
        if set(candidate_names) != ref_name_set:
            # Only a warning: processing of this tree continues below.
            print('leaf names are not the same', file=sys.stderr)

        # Matrix of "distances" for the tree being compared.
        matrix = tree2adjacency(candidate, leaf_idx)
        if ref_matrix.shape != matrix.shape:
            print('%s incompatible with %s' % (f, args.ref), file=sys.stderr)
            continue

        positive = ref_matrix > 0
        n_positive = positive.sum()

        abs_diff = np.abs(ref_matrix - matrix)
        dstat = abs_diff[positive].sum() / n_positive

        ratio = matrix[positive] / ref_matrix[positive]
        above_one = ratio > 1
        ratio[above_one] = 1.0 / ratio[above_one]
        rstat = np.power(ratio.prod(), 1.0 / n_positive)

        # Topological comparison computed by the tree library.
        result = reference.compare(candidate, unrooted=True)

        # <tree>,<norm_rf>,<ref_edge_in_tree>,<tree_edge_in_ref>,<diff_adj>,<ratio_adj>
        row = (f,
               result['norm_rf'],
               result['ref_edges_in_source'],
               result['source_edges_in_ref'],
               dstat,
               rstat)
        print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)
def compare_main(args):
    """Compare input tree topologies against a reference tree.

    Args:
        args.tree: input tree(s), in Newick format (iterated one by one)
        args.ref (str): reference tree, in Newick format

    Prints:
        ``args`` and any warnings go to stderr. For every input tree whose
        matrix has the same shape as the reference matrix, one tab-separated
        row goes to stdout with:

        tree
        result['norm_rf']: normalized robinson-foulds distance (from 0 to 1)
        result['ref_edges_in_source']: compatibility score of the target tree
            with respect to the source tree (how many edges in reference are
            found in the source)
        result['source_edges_in_ref']: compatibility score of the source tree
            with respect to the reference tree (how many edges in source are
            found in the reference)
        dstat: sum of absolute differences between the two matrices over the
            positive reference entries, divided by the number of such entries
        rstat: geometric mean, over the same entries, of the pairwise ratios
            (each ratio inverted when above 1 so that it is at most 1)
    """
    print(args, file=sys.stderr)

    reference = Tree(args.ref)
    ref_names = [leaf.name for leaf in reference.get_leaves()]
    ref_name_set = set(ref_names)
    # One integer per leaf name, shared by every tree so matrices line up.
    leaf_idx = dict((name, pos) for pos, name in enumerate(ref_names))
    # Matrix of "distances" (node counts) for the reference tree.
    ref_matrix = tree2adjacency(reference, leaf_idx)

    for f in args.tree:
        candidate = Tree(f)
        candidate_names = [leaf.name for leaf in candidate.get_leaves()]
        if set(candidate_names) != ref_name_set:
            # Only a warning: processing of this tree continues below.
            print('leaf names are not the same', file=sys.stderr)

        # Matrix of "distances" for the tree being compared.
        matrix = tree2adjacency(candidate, leaf_idx)
        if ref_matrix.shape != matrix.shape:
            print('%s incompatible with %s' % (f, args.ref), file=sys.stderr)
            continue

        positive = ref_matrix > 0
        n_positive = positive.sum()

        abs_diff = np.abs(ref_matrix - matrix)
        dstat = abs_diff[positive].sum() / n_positive

        ratio = matrix[positive] / ref_matrix[positive]
        above_one = ratio > 1
        ratio[above_one] = 1.0 / ratio[above_one]
        rstat = np.power(ratio.prod(), 1.0 / n_positive)

        # Topological comparison computed by the tree library.
        result = reference.compare(candidate, unrooted=True)

        # <tree>,<norm_rf>,<ref_edge_in_tree>,<tree_edge_in_ref>,<diff_adj>,<ratio_adj>
        row = (f,
               result['norm_rf'],
               result['ref_edges_in_source'],
               result['source_edges_in_ref'],
               dstat,
               rstat)
        print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)
def compare_main(args):
    """Compare each tree in ``args.tree`` with the reference ``args.ref``.

    ``args`` is echoed to stderr. Every tree is turned into a matrix with
    ``tree2adjacency``; a tree whose matrix shape differs from the reference
    matrix is reported on stderr and skipped. For the others one tab-separated
    row is printed to stdout: the tree, ``norm_rf``, ``ref_edges_in_source``
    and ``source_edges_in_ref`` from ``ref_tree.compare(tree, unrooted=True)``,
    then ``dstat`` (sum of absolute matrix differences over the positive
    reference entries divided by the number of those entries) and ``rstat``
    (geometric mean of the entry ratios, each folded to be at most 1).
    """
    print(args, file=sys.stderr)

    reference = Tree(args.ref)
    ref_matrix = tree2adjacency(reference)

    for f in args.tree:
        candidate = Tree(f)
        matrix = tree2adjacency(candidate)

        if ref_matrix.shape != matrix.shape:
            print('%s incompatible with %s' % (f, args.ref), file=sys.stderr)
            continue

        # Only entries that are positive in the reference matrix take part.
        positive = ref_matrix > 0
        n_positive = positive.sum()

        abs_diff = np.abs(ref_matrix - matrix)
        dstat = abs_diff[positive].sum() / n_positive

        ratio = matrix[positive] / ref_matrix[positive]
        above_one = ratio > 1
        # Invert ratios above 1 so over- and under-estimates are treated alike.
        ratio[above_one] = 1.0 / ratio[above_one]
        rstat = np.power(ratio.prod(), 1.0 / n_positive)

        result = reference.compare(candidate, unrooted=True)

        # <tree>,<norm_rf>,<ref_edge_in_tree>,<tree_edge_in_ref>,<diff_adj>,<ratio_adj>
        row = (f,
               result['norm_rf'],
               result['ref_edges_in_source'],
               result['source_edges_in_ref'],
               dstat,
               rstat)
        print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % row)
for grn in range(len(groupNames)): GSIind = gsi(t, indNames[grn]) GSIvalues.append(GSIind) GSIlists[grn].append(float(GSIind)) # write output treeNameP = treeName.rstrip() # remove \n from a string GSIvaluesP = '\t'.join(str(e) for e in GSIvalues) outputFilet.write("%s\t%s\n" % (treeNameP, GSIvaluesP)) # count topologies if not topologies: topologies.append(t) treeProb = tree_prop(t, args.tree) treePlist.append(treeProb) elif not any(Tree.compare(t, tt)['norm_rf'] == 0.0 for tt in topologies): topologies.append(t) treeProb = tree_prop(t, args.tree) treePlist.append(treeProb) # track the progress: counter += 1 if counter % 100 == 0: print str(counter), "trees processed" # calculate the GSI Total (formula 5 in Cummings et al. 2008) # print topologies # for debugging for top, p in zip(topologies, treePlist): for grnt in range(len(groupNames)): gsit = gsi(top, indNames[grnt])
def _leaf_attr_for_compare(tree, attr_name, parser):
    """Return the leaf attribute name to compare on, parsing it first if asked.

    When ``parser`` (a regular expression) is truthy, the first capture group
    of its match against each leaf's ``attr_name`` value is stored on the leaf
    as the ``_tempattr`` feature and ``'_tempattr'`` is returned. Otherwise
    ``attr_name`` is returned and the tree is left untouched.

    A leaf whose value does not match ``parser`` raises ``AttributeError``
    because ``re.search`` returns ``None``.
    """
    if not parser:
        return attr_name
    for leaf in tree:
        parsed = re.search(parser, getattr(leaf, attr_name)).groups()[0]
        leaf.add_feature('_tempattr', parsed)
    return '_tempattr'


def run(args):
    """Compare every source tree with every reference tree and print results.

    Attributes read from ``args``:
        src_tree_iterator, ref_trees: iterables of trees handed to ``Tree``.
        src_tree_attr, ref_tree_attr: leaf attribute used for matching.
        src_attr_parser, ref_attr_parser: optional regular expressions; when
            set, the first capture group of the leaf attribute is used.
        min_support_src, min_support_ref, unrooted: forwarded to ``compare``.
        show_mismatches, show_matches, show_edges: when any is set, edges are
            printed (one ``src:`` line and one ``ref:`` line per pair) instead
            of the summary row.
        taboutput: print tab-separated text instead of a fixed-width table.

    Output is written to stdout with single-argument ``print(...)`` calls,
    which behave the same as a print statement and as the print function.
    """
    from ete2 import Tree
    from ete2.utils import print_table

    col_sizes = [25, 25] + [8] * 8
    header = ['source', 'ref', 'eff.size', 'nRF', 'RF', 'maxRF',
              "%src_branches", "%ref_branches", "subtrees", "treekoD"]

    # Edge listing replaces the summary table, so no table header is needed.
    edge_mode = args.show_mismatches or args.show_matches or args.show_edges

    if args.taboutput:
        print('# ' + '\t'.join(header))
    elif not edge_mode:
        print_table([header, ["=========================="] * 10],
                    fix_col_width=col_sizes, wrap_style="cut")

    for stree_name in args.src_tree_iterator:
        stree = Tree(stree_name)
        # Parses attrs if necessary
        src_tree_attr = _leaf_attr_for_compare(
            stree, args.src_tree_attr, args.src_attr_parser)

        for rtree_name in args.ref_trees:
            rtree = Tree(rtree_name)
            # Parses attrs if necessary
            ref_tree_attr = _leaf_attr_for_compare(
                rtree, args.ref_tree_attr, args.ref_attr_parser)

            r = stree.compare(rtree,
                              ref_tree_attr=ref_tree_attr,
                              source_tree_attr=src_tree_attr,
                              min_support_ref=args.min_support_ref,
                              min_support_source=args.min_support_src,
                              unrooted=args.unrooted,
                              has_duplications=False)

            if edge_mode:
                if args.show_mismatches:
                    src = r['source_edges'] - r['ref_edges']
                    ref = r['ref_edges'] - r['source_edges']
                elif args.show_matches:
                    src = r['source_edges'] & r['ref_edges']
                    ref = r['ref_edges'] & r['source_edges']
                else:
                    # Only show_edges can be set here: print every edge.
                    src = r['source_edges']
                    ref = r['ref_edges']

                if args.unrooted:
                    # Unrooted edges are pairs of leaf groups: "a,b|c,d".
                    def edge_text(edge):
                        return '%s|%s' % (','.join(edge[0]), ','.join(edge[1]))
                else:
                    edge_text = ','.join

                for tag, part in [("src: %s" % stree_name, src),
                                  ("ref: %s" % rtree_name, ref)]:
                    print("%s\t%s" % (
                        tag, '\t'.join([edge_text(edge) for edge in part])))
            else:
                data = [shorten_str(stree_name, 25),
                        shorten_str(rtree_name, 25),
                        r['effective_tree_size'],
                        r['norm_rf'],
                        r['rf'], r['max_rf'],
                        r["source_edges_in_ref"],
                        r["ref_edges_in_source"],
                        r['source_subtrees'],
                        r['treeko_dist']]
                if args.taboutput:
                    print('\t'.join(map(str, data)))
                else:
                    # A real list (not a lazy map) so the table can be indexed.
                    print_table([[as_str(value) for value in data]],
                                fix_col_width=col_sizes, wrap_style='cut')