Пример #1
0
def tree_prop(tree, tree_file_name):
    """Return the fraction of trees in a Newick file that match ``tree``.

    Every non-blank line of ``tree_file_name`` is parsed with ``Tree`` and
    compared against ``tree``; a line counts as a match when the normalized
    Robinson-Foulds distance reported by ``Tree.compare`` is exactly 0.

    Args:
        tree: the tree to look for (passed as first argument to
            ``Tree.compare``).
        tree_file_name: path to a text file with one Newick tree per line.

    Returns:
        float: matches / number of trees read, or 0.0 when the file holds
        no trees at all (previously this raised ZeroDivisionError).
    """
    matches = 0
    total = 0
    # The context manager closes the file even if parsing/comparison raises.
    with open(tree_file_name, 'r') as tree_file:
        for line in tree_file:
            # Blank lines (e.g. a trailing newline) are not trees; skip them
            # instead of handing them to the Newick parser.
            if not line.strip():
                continue
            total += 1
            # I use Robinson-Foulds metric to find the same trees.
            if Tree.compare(tree, Tree(line))['norm_rf'] == 0.0:
                matches += 1
    if total == 0:
        return 0.0
    return float(matches) / total
Пример #2
0
def compare_main(args):
    """Compare every input tree with a reference tree; print one row per tree.

    Args:
        args.tree: iterable of input trees, each handed to ``Tree``
        args.ref: reference tree, handed to ``Tree``

    Prints (tab-separated, to stdout, for each tree whose matrix has the
    same shape as the reference matrix):
        tree: the input tree as given in ``args.tree``
        result['norm_rf'], result['ref_edges_in_source'],
        result['source_edges_in_ref']: values returned by
            ``ref_tree.compare(tree, unrooted=True)``
        dstat: sum of |ref - tree| over the matrix entries where the
            reference matrix is positive, divided by the number of such
            entries
        rstat: geometric mean, over the same entries, of the tree/ref ratio
            after ratios above 1 are replaced by their reciprocal

    Diagnostics (the parsed ``args``, leaf-name mismatches, shape
    mismatches) go to stderr.

    NOTE(review): ``compare`` is called on the reference tree, so the
    library's "source" is presumably the reference here -- confirm that the
    two edge columns are labelled the intended way round.
    """
    print(args, file=sys.stderr)

    reference = Tree(args.ref)
    ref_names = [leaf.name for leaf in reference.get_leaves()]
    ref_name_set = set(ref_names)
    # One shared name -> index mapping so both matrices use the same layout.
    leaf_idx = dict((name, pos) for pos, name in enumerate(ref_names))

    # Matrix of "distances" for the reference tree.
    ref_matrix = tree2adjacency(reference, leaf_idx)

    row_format = '%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
    for source in args.tree:
        candidate = Tree(source)
        candidate_names = set(leaf.name for leaf in candidate.get_leaves())
        if candidate_names != ref_name_set:
            # Only a warning: processing of this tree continues regardless.
            print('leaf names are not the same', file=sys.stderr)

        # Matrix of "distances" for the tree being compared.
        cand_matrix = tree2adjacency(candidate, leaf_idx)
        if ref_matrix.shape != cand_matrix.shape:
            print('%s incompatible with %s' % (source, args.ref), file=sys.stderr)
            continue

        # Restrict both statistics to entries that are positive in the reference.
        positive = ref_matrix > 0
        n_positive = positive.sum()

        dstat = np.abs(ref_matrix - cand_matrix)[positive].sum() / n_positive

        ratio = cand_matrix[positive] / ref_matrix[positive]
        above_one = ratio > 1
        ratio[above_one] = 1.0 / ratio[above_one]
        rstat = np.power(ratio.prod(), 1.0 / n_positive)

        # Topology comparison is delegated to the tree library.
        result = reference.compare(candidate, unrooted=True)

        # <tree>,<norm_rf>,<ref_edge_in_tree>,<tree_edge_in_ref>,<diff_adj>,<ratio_adj>
        print(row_format % (source,
                            result['norm_rf'],
                            result['ref_edges_in_source'],
                            result['source_edges_in_ref'],
                            dstat,
                            rstat))
Пример #3
0
def compare_main(args):
    """Compare every input tree with a reference tree; print one row per tree.

    Args:
        args.tree: iterable of input trees, each handed to ``Tree``
        args.ref: reference tree, handed to ``Tree``

    Prints (tab-separated, to stdout, for each tree whose matrix has the
    same shape as the reference matrix):
        tree: the input tree as given in ``args.tree``
        result['norm_rf'], result['ref_edges_in_source'],
        result['source_edges_in_ref']: values returned by
            ``ref_tree.compare(tree, unrooted=True)``
        dstat: sum of |ref - tree| over the matrix entries where the
            reference matrix is positive, divided by the number of such
            entries
        rstat: geometric mean, over the same entries, of the tree/ref ratio
            after ratios above 1 are replaced by their reciprocal

    Diagnostics (the parsed ``args``, leaf-name mismatches, shape
    mismatches) go to stderr.

    NOTE(review): ``compare`` is called on the reference tree, so the
    library's "source" is presumably the reference here -- confirm that the
    two edge columns are labelled the intended way round.
    """
    print(args, file=sys.stderr)

    reference = Tree(args.ref)
    ref_names = [leaf.name for leaf in reference.get_leaves()]
    ref_name_set = set(ref_names)
    # One shared name -> index mapping so both matrices use the same layout.
    leaf_idx = dict((name, pos) for pos, name in enumerate(ref_names))

    # Matrix of "distances" for the reference tree.
    ref_matrix = tree2adjacency(reference, leaf_idx)

    row_format = '%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
    for source in args.tree:
        candidate = Tree(source)
        candidate_names = set(leaf.name for leaf in candidate.get_leaves())
        if candidate_names != ref_name_set:
            # Only a warning: processing of this tree continues regardless.
            print('leaf names are not the same', file=sys.stderr)

        # Matrix of "distances" for the tree being compared.
        cand_matrix = tree2adjacency(candidate, leaf_idx)
        if ref_matrix.shape != cand_matrix.shape:
            print('%s incompatible with %s' % (source, args.ref), file=sys.stderr)
            continue

        # Restrict both statistics to entries that are positive in the reference.
        positive = ref_matrix > 0
        n_positive = positive.sum()

        dstat = np.abs(ref_matrix - cand_matrix)[positive].sum() / n_positive

        ratio = cand_matrix[positive] / ref_matrix[positive]
        above_one = ratio > 1
        ratio[above_one] = 1.0 / ratio[above_one]
        rstat = np.power(ratio.prod(), 1.0 / n_positive)

        # Topology comparison is delegated to the tree library.
        result = reference.compare(candidate, unrooted=True)

        # <tree>,<norm_rf>,<ref_edge_in_tree>,<tree_edge_in_ref>,<diff_adj>,<ratio_adj>
        print(row_format % (source,
                            result['norm_rf'],
                            result['ref_edges_in_source'],
                            result['source_edges_in_ref'],
                            dstat,
                            rstat))
Пример #4
0
def compare_main(args):
    """Compare every tree in ``args.tree`` with ``args.ref``; print one row each.

    The parsed ``args`` are echoed to stderr first.  For each input tree a
    matrix is built with ``tree2adjacency``; when its shape differs from the
    reference matrix a message is written to stderr and the tree is
    skipped.  Otherwise a tab-separated row is printed to stdout holding
    the tree, three values from ``ref_tree.compare(tree, unrooted=True)``
    and two matrix statistics:

        dstat: sum of |ref - tree| over entries where the reference matrix
            is positive, divided by the number of such entries
        rstat: geometric mean, over the same entries, of the tree/ref ratio
            after ratios above 1 are replaced by their reciprocal
    """
    print(args, file=sys.stderr)

    reference = Tree(args.ref)
    ref_matrix = tree2adjacency(reference)

    row_format = '%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
    for source in args.tree:
        candidate = Tree(source)
        cand_matrix = tree2adjacency(candidate)
        if ref_matrix.shape != cand_matrix.shape:
            print('%s incompatible with %s' % (source, args.ref), file=sys.stderr)
            continue

        # Restrict both statistics to entries that are positive in the reference.
        positive = ref_matrix > 0
        n_positive = positive.sum()

        dstat = np.abs(ref_matrix - cand_matrix)[positive].sum() / n_positive

        ratio = cand_matrix[positive] / ref_matrix[positive]
        above_one = ratio > 1
        ratio[above_one] = 1.0 / ratio[above_one]
        rstat = np.power(ratio.prod(), 1.0 / n_positive)

        result = reference.compare(candidate, unrooted=True)

        # <tree>,<norm_rf>,<ref_edge_in_tree>,<tree_edge_in_ref>,<diff_adj>,<ratio_adj>
        print(row_format % (source,
                            result['norm_rf'],
                            result['ref_edges_in_source'],
                            result['source_edges_in_ref'],
                            dstat,
                            rstat))
Пример #5
0
    # Compute the GSI of the current tree `t` once per group; keep the value
    # both in this tree's row (GSIvalues) and in the per-group list (GSIlists).
    for grn in range(len(groupNames)):
        GSIind = gsi(t, indNames[grn])
        GSIvalues.append(GSIind)
        GSIlists[grn].append(float(GSIind))

    # write output: one tab-separated row (tree name followed by its GSI values)
    treeNameP = treeName.rstrip()  # strip trailing whitespace, including the newline
    GSIvaluesP = '\t'.join(str(e) for e in GSIvalues)
    outputFilet.write("%s\t%s\n" % (treeNameP, GSIvaluesP))

    # count topologies
    # `t` is stored as a new topology only when no stored topology has a
    # normalized Robinson-Foulds distance of 0 to it (the first tree is always
    # stored). The value returned by tree_prop(t, args.tree) is appended to
    # treePlist at the same position -- presumably the share of trees in the
    # input file with this topology; verify against tree_prop.
    if not topologies:
        topologies.append(t)
        treeProb = tree_prop(t, args.tree)
        treePlist.append(treeProb)
    elif not any(Tree.compare(t, tt)['norm_rf'] == 0.0 for tt in topologies):
        topologies.append(t)
        treeProb = tree_prop(t, args.tree)
        treePlist.append(treeProb)

    # track the progress: report after every 100th tree
    counter += 1
    if counter % 100 == 0:
        print str(counter), "trees processed"

# calculate the GSI Total (formula 5 in Cummings et al. 2008)

# print topologies # for debugging
# Walk the stored topologies together with their tree_prop values.
for top, p in zip(topologies, treePlist):
    for grnt in range(len(groupNames)):
        # GSI of this topology for group number grnt
        gsit = gsi(top, indNames[grnt])
Пример #6
0
def _parse_leaf_attr(tree, attr_name, pattern):
    """Store the first regex group of each node's ``attr_name`` as ``_tempattr``.

    Iterates ``tree`` directly, applies ``re.search(pattern, value)`` to the
    node's ``attr_name`` attribute and saves the first captured group with
    ``add_feature('_tempattr', ...)``.

    Returns:
        str: ``'_tempattr'``, the attribute name to compare on afterwards.

    Raises:
        ValueError: when ``pattern`` does not match a node's value (this
            used to surface as an AttributeError on ``None``).
    """
    for leaf in tree:
        value = getattr(leaf, attr_name)
        match = re.search(pattern, value)
        if match is None:
            raise ValueError('pattern %r does not match %s value %r'
                             % (pattern, attr_name, value))
        leaf.add_feature('_tempattr', match.groups()[0])
    return '_tempattr'


def run(args):
    """Compare every source tree with every reference tree and print the result.

    Reads from ``args``: ``src_tree_iterator``, ``ref_trees``,
    ``src_tree_attr``, ``ref_tree_attr``, ``src_attr_parser``,
    ``ref_attr_parser``, ``min_support_src``, ``min_support_ref``,
    ``unrooted``, ``taboutput``, ``show_mismatches``, ``show_matches`` and
    ``show_edges``.

    Output modes:
        * one of the ``show_*`` flags set: print the selected edge sets of
          the source and the reference tree, one tab-separated line each;
        * ``taboutput``: a ``#``-prefixed header line followed by one
          tab-separated row per tree pair;
        * otherwise: a fixed-width table rendered with ``print_table``.

    All ``print`` calls take a single argument, so they produce the same
    output whether ``print`` is the Python 2 statement or the function.
    """
    from ete2 import Tree
    from ete2.utils import print_table

    col_sizes = [25, 25] + [8] * 8

    header = ['source', 'ref', 'eff.size', 'nRF',
              'RF', 'maxRF', "%src_branches",
              "%ref_branches", "subtrees", "treekoD"]

    # Any of these flags replaces the summary row by a dump of edge sets.
    show_edge_sets = (args.show_mismatches or args.show_matches
                      or args.show_edges)

    if args.taboutput:
        print('# ' + '\t'.join(header))
    elif not show_edge_sets:
        print_table([header,
                     ["=========================="] * 10],
                    fix_col_width=col_sizes, wrap_style="cut")

    for stree_name in args.src_tree_iterator:
        stree = Tree(stree_name)

        # Parses attrs if necessary
        src_tree_attr = args.src_tree_attr
        if args.src_attr_parser:
            src_tree_attr = _parse_leaf_attr(
                stree, args.src_tree_attr, args.src_attr_parser)

        for rtree_name in args.ref_trees:
            # The reference tree is re-read for every source tree, so its
            # attributes have to be parsed again each time.
            rtree = Tree(rtree_name)

            # Parses attrs if necessary
            ref_tree_attr = args.ref_tree_attr
            if args.ref_attr_parser:
                ref_tree_attr = _parse_leaf_attr(
                    rtree, args.ref_tree_attr, args.ref_attr_parser)

            r = stree.compare(rtree,
                              ref_tree_attr=ref_tree_attr,
                              source_tree_attr=src_tree_attr,
                              min_support_ref=args.min_support_ref,
                              min_support_source=args.min_support_src,
                              unrooted=args.unrooted,
                              has_duplications=False)

            if show_edge_sets:
                # Flag precedence: mismatches, then matches, then all edges.
                if args.show_mismatches:
                    src = r['source_edges'] - r['ref_edges']
                    ref = r['ref_edges'] - r['source_edges']
                elif args.show_matches:
                    src = r['source_edges'] & r['ref_edges']
                    ref = r['ref_edges'] & r['source_edges']
                elif args.show_edges:
                    src = r['source_edges']
                    ref = r['ref_edges']

                tagged = [("src: %s" % stree_name, src),
                          ("ref: %s" % rtree_name, ref)]
                for tag, part in tagged:
                    if args.unrooted:
                        # Each edge is indexed as a pair of name collections,
                        # printed as "a,b|c,d".
                        edges = ['%s|%s' % (','.join(x[0]), ','.join(x[1]))
                                 for x in part]
                    else:
                        edges = [','.join(p) for p in part]
                    print("%s\t%s" % (tag, '\t'.join(edges)))
            else:
                data = [shorten_str(stree_name, 25),
                        shorten_str(rtree_name, 25),
                        r['effective_tree_size'],
                        r['norm_rf'],
                        r['rf'], r['max_rf'],
                        r["source_edges_in_ref"],
                        r["ref_edges_in_source"],
                        r['source_subtrees'],
                        r['treeko_dist']]
                if args.taboutput:
                    print('\t'.join([str(value) for value in data]))
                else:
                    print_table([[as_str(value) for value in data]],
                                fix_col_width=col_sizes, wrap_style='cut')