示例#1
0
 def test_upgma(self):
     """Check that ``do_tree`` with ``method="upgma"`` matches the reference tree.

     A tree is built from 10 pairwise distances over 5 labels and compared
     with the expected Newick tree; both edge-overlap values reported by
     ``compare`` must be equal to 1 within a small tolerance.
     """
     dists = [1.45, 1.51, 1.57, 2.98, 2.94, 3.04, 7.51, 7.55, 7.39, 7.10]
     labels = ["H", "C", "G", "O", "R"]
     tree = do_tree(dists, labels, method="upgma")
     newick_tree = "((((H,C):0.73,G):0.77,O):1.49,R):3.69;"
     expected_tree = Tree(newick_tree)
     result = expected_tree.compare(tree)
     # abs() is essential: without it any value below 1 gives a negative
     # difference, which is trivially < 0.00001, so the check could never fail.
     assert abs(result["source_edges_in_ref"] - 1) < 0.00001
     assert abs(result["ref_edges_in_source"] - 1) < 0.00001
示例#2
0
 def test_nj(self):
     """Check that ``do_tree`` with ``method="nj"`` matches the reference tree.

     A tree is built from 10 pairwise distances over 5 labels and compared
     (as unrooted trees) with the expected Newick tree; both edge-overlap
     values reported by ``compare`` must be equal to 1 within a small
     tolerance.
     """
     dists = [5, 9, 10, 9, 10, 8, 8, 9, 7, 3]
     labels = ["A", "B", "C", "D", "E"]
     tree = do_tree(dists, labels, method="nj")
     newick_tree = "(C:2,((A:2,B:3):3,(D:2,E:1):2):2);"
     expected_tree = Tree(newick_tree)
     result = expected_tree.compare(tree, unrooted=True)
     # abs() is essential: without it any value below 1 gives a negative
     # difference, which is trivially < 0.00001, so the check could never fail.
     assert abs(result["source_edges_in_ref"] - 1) < 0.00001
     assert abs(result["ref_edges_in_source"] - 1) < 0.00001
示例#3
0
def ete_compare(usr_tree_str, ref_tree_str, outgroup_id=None):
    """Compare a user tree with a reference tree via ``Tree.compare``.

    Args:
        usr_tree_str: Value passed to ``Tree(...)`` to build the user tree.
        ref_tree_str: Value passed to ``Tree(...)`` to build the reference
            tree.
        outgroup_id: Optional outgroup handed to ``set_outgroup`` on both
            trees. When truthy the comparison is rooted
            (``unrooted=False``); otherwise the trees are compared as
            unrooted.

    Returns:
        The tuple ``(qt, rt, norm_rf, rf, max_rf, source_edges_in_ref,
        ref_edges_in_source, treeko_dist)`` where ``qt``/``rt`` are the
        user/reference ``Tree`` objects and the remaining items are the
        corresponding entries of the comparison result.
    """
    qt = Tree(usr_tree_str)
    rt = Tree(ref_tree_str)
    if outgroup_id:
        qt.set_outgroup(outgroup_id)
        rt.set_outgroup(outgroup_id)

    # Rooted comparison only makes sense once both trees share an outgroup.
    res = qt.compare(rt, unrooted=not outgroup_id)
    return (
        qt,
        rt,
        res["norm_rf"],
        res["rf"],
        res["max_rf"],
        res["source_edges_in_ref"],
        res["ref_edges_in_source"],
        res["treeko_dist"],
    )
示例#4
0
def compare_trees(args):
    """Compare every tree in ``args['all']`` with ``args['ref']`` and write a TSV.

    Args:
        args: Mapping with the keys
            ``'ref'``    - value passed to ``Tree(...)`` for the reference tree,
            ``'all'``    - iterable of values passed to ``Tree(...)``; each one
                           is also written (as a string) in the first column,
            ``'output'`` - path of the tab-separated file to create.

    All comparisons (unrooted) are computed before the output file is opened.
    Each data row has exactly as many tab-separated fields as the header;
    no trailing tab is written, so the columns stay aligned for TSV readers.
    """
    tree_ref = Tree(args['ref'])
    comparisons = [
        (name, tree_ref.compare(Tree(name), unrooted=True))
        for name in args['all']
    ]

    # Result keys in the same order as the header columns after "Tree".
    columns = (
        'effective_tree_size',
        'norm_rf',
        'rf',
        'max_rf',
        'ref_edges_in_source',
        'source_edges_in_ref',
    )

    with open(args['output'], "w") as f_out:
        f_out.write("Tree\tEffective_Tree_Size\tRF-normalized\tRF\tRF-max\t% edges in Src\t% edges in Ref\n")
        for name, comparison in comparisons:
            fields = [name] + [str(comparison[key]) for key in columns]
            f_out.write("\t".join(fields) + "\n")
示例#5
0
def run_ete_compare_py(src,
                       ref,
                       format=0,
                       use_collateral=False,
                       min_support_source=0.0,
                       min_support_ref=0.0,
                       has_duplications=False,
                       expand_polytomies=False,
                       unrooted=False,
                       max_treeko_splits_to_be_artifact=1000,
                       ref_tree_attr='name',
                       source_tree_attr='name'):
    """Compare two Newick trees with ``Tree.compare`` and return the scalar results.

    Args:
        src: Newick input for the source tree (passed as ``Tree(newick=...)``).
        ref: Newick input for the reference tree.
        format: Newick format code used to parse both trees.
        use_collateral, min_support_source, min_support_ref,
        has_duplications, expand_polytomies, unrooted,
        max_treeko_splits_to_be_artifact, ref_tree_attr, source_tree_attr:
            Forwarded unchanged to ``Tree.compare``. The defaults here are
            the values that were previously hard-coded in the call, so
            callers relying on defaults see the same behaviour.

    Returns:
        dict with the keys ``'rf'``, ``'max_rf'``, ``'ref_edges_in_source'``,
        ``'source_edges_in_ref'``, ``'effective_tree_size'``, ``'norm_rf'``,
        ``'treeko_dist'`` and ``'source_subtrees'`` copied from the
        comparison result.
    """
    src_tree = Tree(newick=src, format=format)
    ref_tree = Tree(newick=ref, format=format)
    compare_result = Tree.compare(
        src_tree,
        ref_tree,
        use_collateral=use_collateral,
        min_support_source=min_support_source,
        min_support_ref=min_support_ref,
        has_duplications=has_duplications,
        expand_polytomies=expand_polytomies,
        unrooted=unrooted,
        max_treeko_splits_to_be_artifact=max_treeko_splits_to_be_artifact,
        ref_tree_attr=ref_tree_attr,
        source_tree_attr=source_tree_attr)

    # The result also carries set-valued edge entries, e.g.
    #   'common_edges': {('a', 'b', 'c'), ('a', 'b', 'c', 'g')},
    #   'source_edges': {('a', 'b'), ('a', 'b', 'c'), ('a', 'b', 'c', 'g')},
    #   'ref_edges': {('a', 'c'), ('a', 'b', 'c'), ('a', 'b', 'c', 'g')}
    # Those are deliberately left out of the returned dict.
    wanted_keys = (
        'rf', 'max_rf', 'ref_edges_in_source', 'source_edges_in_ref',
        'effective_tree_size', 'norm_rf', 'treeko_dist', 'source_subtrees',
    )
    return {key: compare_result[key] for key in wanted_keys}
示例#6
0
# Write FBA_tree to disk through its write() method using format=1.
# NOTE(review): the output location is a hard-coded absolute path.
FBA_tree.write(format=1, outfile="/home/acabbia/Documents/Muscle_Model/GSMM-distance/FBA_tree.nw")

#%%
'''

# dictionary to translate between model_taxonomy.index (GK and JD trees) and NCBI_id (NCBI tree)

idx_str = [str(i) for i in list(models_taxonomy.index)]
NCBI_str = [str(i) for i in NCBI_ID]

tr = dict(zip(idx_str, NCBI_str))

#Annotate GK and JD trees with NCBI id's
for leaf in GK_tree:
    leaf.name = tr[leaf.name]

for leaf in JD_tree:
    leaf.name = tr[leaf.name]

# Write
GK_tree.write(
    format=1,
    outfile="/home/acabbia/Documents/Muscle_Model/GSMM-distance/GK_tree.nw")
JD_tree.write(
    format=1,
    outfile="/home/acabbia/Documents/Muscle_Model/GSMM-distance/JD_tree.nw")

### Compare trees with reference taxonomy (NCBI)
resultGK = GK_tree.compare(NCBI_tree, unrooted=True)
resultJD = JD_tree.compare(NCBI_tree, unrooted=True)
                node_tree1.delete()
        for node_tree2 in tree2.get_descendants():
            #print(node2.dist)
            #if not node2.is_leaf() and round(node2.dist,4) <= 1:
            if not node_tree2.is_leaf(
            ) and node_tree2.dist <= 1.00000050002909e-06:
                node_tree2.delete()
        print("Trees read successfully.")
    except:
        print("Trees couldn't be loaded.")
        raise

    # Calculate Robinson-Foulds distance between two trees REF and test
    try:
        print("Calculating robinson-foulds distance...")
        results = tree1.compare(tree2)
        # Reached only when compare() returned normally, so report success
        # (this line used to print a failure message on the success path).
        print("Robinson-foulds distance calculated successfully.")
    except:
        # Log for the user, then re-raise so the caller still sees the error.
        print("Robinson-foulds distance calculation failed.")
        raise

    true_positives = len(results["common_edges"])
    false_positives = len(results["source_edges"] - results["ref_edges"])
    false_negatives = len(results["ref_edges"] - results["source_edges"])
    sensitivity = calculate_sensitivity(true_positives, false_negatives)
    precision = calculate_precision(true_positives, false_positives)

    #Create ids dictionary with challenges_ids and sample ids from tree leaves.
    try:

        metrics.update(challenges_ids=arguments.challenges_ids)
示例#8
0
def main():
    """Report Robinson-Foulds distances between reconstructed and simulated trees.

    For every step ``i`` in ``range(0, 150)`` the ``trees_BL`` directory is
    searched for a file ending in ``_<i>.fasta``; the second and third
    dot-separated parts of its name identify the rooted tree file to load.
    Each such tree is compared with ``guestTrees_step_<i>``: the first item
    of ``robinson_foulds`` and the ``"norm_rf"`` entry of ``compare`` are
    collected. The average and variance of both series are printed and a
    histogram of each is shown.
    """
    output = {}
    directory1 = "/Users/williamlin/Desktop/IW/IW/phyloSim-master/trees_BL/"
    # The directory listing does not depend on the step, so read it once.
    file_names = os.listdir(directory1)
    for i in range(0, 150):
        suffix = "_" + str(i) + ".fasta"
        for fileName in file_names:
            if fileName.endswith(suffix):
                parts = fileName.split(".")
                # If several files match a step, the last one listed wins.
                output[i] = parts[1] + "." + parts[2]

    directory2 = "/Users/williamlin/Desktop/IW/IW/phyloSim-master/trees_rooted/RAxML_nodeLabelledRootedTree."
    directory3 = "/Users/williamlin/Desktop/IW/IW/phyloSim-master/simulated_guestTrees/"

    rf_values = []
    compare = []
    for step, joined in output.items():
        print(directory2 + joined)
        t1 = Tree(directory2 + joined, format=1)
        t2 = Tree(directory3 + "guestTrees_step_" + str(step), format=1)
        rf = t1.robinson_foulds(t2)
        rfN = t1.compare(t2)["norm_rf"]
        rf_values.append(rf[0])
        compare.append(rfN)

    print("Average Distance: " + str(np.average(rf_values)))
    print("Variance of Distance " + str(np.var(rf_values)))
    print("Average Distance: " + str(np.average(compare)))
    print("Variance of Distance " + str(np.var(compare)))

    plt.hist(rf_values, bins=[0, 20, 40, 60, 80, 100, 120, 140])
    plt.title('Distribution of Robinson-Foulds Distances')
    plt.xlabel('Robinson-Foulds Distances')
    plt.ylabel('Number of Reconstructed Trees')
    plt.xlim((0, 150))
    plt.ylim((0, 60))
    plt.show()

    plt.hist(compare, bins=[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    plt.title('Distribution of Normalized Robinson-Foulds Distances')
    plt.xlabel('Robinson-Foulds Distances')
    plt.ylabel('Number of Reconstructed Trees')
    plt.show()
    try:
        print("Reading public participants trees...")
        # Keep only the participants whose canonical tree file exists.
        for public_participant in participants:
            tree_file = public_participant + "_canonical.nwk"
            part_fullpath = os.path.join(arguments.benchmark_trees_path, tree_file)
            if os.path.isfile(part_fullpath):
                tree_files.append(part_fullpath)

        # Load each tree, re-root and collapse it, then record its
        # normalized RF value against tree_test.
        for participant in tree_files:
            print("Reading public participant tree :" + participant)
            tree = Tree(participant)
            print("Setting root on midpoint...")
            tree = midpoint_root(tree)
            print("Collapsing nodes with branch distance = 0...")
            tree = collapse_nodes(tree, 1.00000050002909e-06)
            results = tree_test.compare(tree)
            participant_row.append(results["norm_rf"])

        print("Public participants trees read and analyzed successfully.")
    except:
        # Log for the user, then re-raise. Nothing may follow the bare
        # raise: a statement placed after it can never execute.
        print("Public participants trees couldn't be analyzed.")
        raise

    #Create ids dictionary with challenges_ids and sample ids from tree leaves.
    try:
        print("Updating benchmark data with new participant..")
        for row in benchmark_data:
            row.append(0)
        benchmark_data.append(participant_row)
        data.update(participants=participants)
示例#10
0
def rf_dist(t1, t2):
    """Return the ``'norm_rf'`` value from comparing two trees.

    Both arguments are handed to ``Tree(...)``; the tree built from ``t1``
    is then compared with the one built from ``t2`` using the default
    ``compare`` settings.
    """
    comparison = Tree(t1).compare(Tree(t2))
    return comparison['norm_rf']
示例#11
0
shared_edge_support_values = []
ref_only_edge_support_values = []
for tree_file in tree_file_list:
    pct_mask_match = re.search(r"mask(\d+)", tree_file)
    if pct_mask_match is not None:
        pct_mask = int(pct_mask_match.groups()[0])
    else:
        pct_mask = 0
    print("Adding {} bootstrap values to tree from {}...".format(
        tree_file, ref_tree_file))
    pct_masks.append(pct_mask)
    tree = Tree(tree_file, format=1)
    add_support_and_subtypes(tree)
    tree.set_outgroup(outgroup)
    comparison = ref_tree.compare(tree)
    print("{}/{} common/total edges, normRF {:0.2f} for {} vs {}".format(
        len(comparison["common_edges"]), len(comparison["source_edges"]),
        comparison["norm_rf"], tree_file, ref_tree_file))
    for common_edge in comparison["common_edges"]:
        tree_node = tree.get_common_ancestor(common_edge)
        if hasattr(tree_node, "bootstrap"):
            ref_tree_node = ref_tree.get_common_ancestor(common_edge)
            ref_tree_node.barchart_values[pct_mask] = tree_node.bootstrap
            ref_tree_node.barchart_values[0] = ref_tree_node.bootstrap
            ref_tree_node.suport_symbol = get_support_symbol(
                ref_tree_node.bootstrap, tree_node.bootstrap)
            shared_edge_support_values.append({
                ref_bs_label: ref_tree_node.bootstrap,
                tree_bs_label: tree_node.bootstrap,
                "Percent Mask": pct_mask,
示例#12
0
def compare_tree(tree1, tree2):
    """Return the ``'rf'`` entry of an unrooted ``Tree.compare`` of two trees."""
    comparison = Tree.compare(tree1, tree2, unrooted=True)
    return comparison['rf']