def cherry_count_for_tree(tree):
    """Count the cherries (pairs of sibling leaves) of ``tree``.

    A leaf is tallied when at least one of its siblings is also a leaf.
    Each cherry is therefore seen once from each of its two leaves, so the
    tally is halved before it is returned.

    All children of the parent are inspected, so the result does not depend
    on the order in which a node lists its children. Leaves without a parent
    and leaves that are an only child are skipped instead of raising.

    Args:
        tree: Object exposing ``leaf_nodes()``. Every returned node must
            expose ``parent_node``; parents must expose ``child_nodes()``
            and children ``is_leaf()``.

    Returns:
        float: Number of leaves that have a leaf sibling, divided by two.
        A node with three leaf children contributes 1.5.
    """
    leaves_in_cherries = 0
    for leaf in tree.leaf_nodes():
        parent = leaf.parent_node
        if parent is None:
            # A tree consisting of a single node has no siblings to pair with.
            continue
        leaf_children = sum(
            1 for child in parent.child_nodes() if child.is_leaf())
        if leaf_children >= 2:
            leaves_in_cherries += 1
    return leaves_in_cherries / 2


# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
num_taxa = 20
num_reps = 10
N_vec = np.arange(100, 400, 50)
#reference_tree = utils.balanced_binary(num_taxa)
jc = generation.Jukes_Cantor(num_classes=2)
mutation_rate = jc.p2t(0.9)

# Reconstruction methods. ``nj`` is built but is not part of ``methods``.
snj = reconstruct_tree.SpectralNeighborJoining(
    reconstruct_tree.JC_similarity_matrix)
nj = reconstruct_tree.NeighborJoining(reconstruct_tree.JC_similarity_matrix)
treesvd = reconstruct_tree.TreeSVD()
methods = [snj, treesvd]
#results = compare_methods.experiment([reference_tree], jc, N_vec, methods=methods,\
#     mutation_rates = [mutation_rate], reps_per_tree=num_reps)

# Empty results table with one column per recorded quantity.
df = pd.DataFrame(
    columns=['method', 'runtime', 'RF', 'n', 'cherries_ref', 'cherries_res'])

# NOTE(review): the original formatting of this script was collapsed onto one
# line; the tree draw is assumed to belong to the loop body (one reference
# tree per repetition) - confirm. The loop body appears to continue beyond
# what is visible here.
for i in np.arange(num_reps):
    print(i)
    reference_tree = utils.unrooted_pure_kingman_tree(num_taxa)
def run_method(method, size, run_num, tree, m=300, kappa=2,
               mutation_rate=0.05, threshold=None, verbose=False,
               output_root="/gpfs/ysm/scratch60/morgan_levine/mw957/"
                           "tree_merge_test"):
    """Simulate sequences on ``tree``, rebuild it with STDR+RAxML, save outputs.

    A fresh run folder ``<output_root>/seqlen_<m>_<method>_<threshold>_<run_num>/``
    is created (an existing folder of that name is deleted first) and filled
    with:

    * ``true_tree.txt``    - ``tree`` in newick format,
    * ``subtree_%s.txt``   - filename template handed to the reconstruction,
    * ``STDR_tree.txt``    - the reconstructed tree in newick format,
    * ``taxa.txt``         - one taxon label per line,
    * ``HKY_distance.txt`` - ``size`` on the first line followed by the
      tab-separated matrix returned by ``reconstruct_tree.JC_distance_matrix``.

    Args:
        method: Label used in the run-folder name and in the returned row.
            The reconstruction itself always uses STDR with RAxML.
        size: Value written as the first line of ``HKY_distance.txt``
            (presumably the number of taxa - confirm against the consumer).
        run_num: Run index used in the run-folder name.
        tree: Tree model to simulate on and to compare against; must provide
            ``write(path=..., schema=...)``.
        m: Sequence length passed to ``generation.simulate_sequences``.
        kappa: Unused; kept for interface compatibility.
        mutation_rate: Mutation rate passed to the simulation.
        threshold: Forwarded to the reconstruction as ``threshhold``.
        verbose: Forwarded to the reconstruction.
        output_root: Directory under which the run folder is created.
            Defaults to the original hard-coded location; missing parent
            directories are created.

    Returns:
        list: ``[method, str(threshold), runtime, RF, F1]`` where ``runtime``
        is the wall-clock time of the reconstruction call only.
    """
    run_name = "seqlen_%s_%s_%s_%s" % (m, method, threshold, run_num)
    # The trailing empty component keeps the trailing separator that the
    # file names below are appended to.
    subtree_folder = os.path.join(output_root, run_name, "")
    if os.path.exists(subtree_folder):
        shutil.rmtree(subtree_folder)
    # makedirs (not mkdir) so a not-yet-existing output_root does not crash.
    os.makedirs(subtree_folder)

    tree.write(path=subtree_folder + "true_tree.txt", schema="newick")
    subtree_filename = subtree_folder + "subtree_%s.txt"

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    spectral_method = reconstruct_tree.STDR(
        reconstruct_tree.RAxML, reconstruct_tree.JC_similarity_matrix)
    start_time = time.time()
    tree_rec = spectral_method.deep_spectral_tree_reconstruction(
        observations,
        reconstruct_tree.JC_similarity_matrix,
        taxa_metadata=taxa_meta,
        threshhold=threshold,
        raxml_args="-T 2 --HKY85 -c 1",
        min_split=5,
        verbose=verbose,
        subtree_filename=subtree_filename)
    runtime = time.time() - start_time
    tree_rec.write(path=subtree_folder + "STDR_tree.txt", schema="newick")

    distance = reconstruct_tree.JC_distance_matrix(observations, taxa_meta)
    distance_pd = pd.DataFrame(distance)
    taxa_list = [x.label for x in taxa_meta]
    with open(subtree_folder + 'taxa.txt', 'w') as f:
        for item in taxa_list:
            f.write("%s\n" % item)

    distance_pd.index = taxa_list
    distance_path = subtree_folder + "HKY_distance.txt"
    distance_pd.to_csv(distance_path, sep="\t", header=False)
    # Re-read the matrix and rewrite it with ``size`` prepended as a header
    # line.
    with open(distance_path, 'r') as original:
        data = original.read()
    with open(distance_path, 'w') as modified:
        modified.write(str(size) + "\n" + data)

    # accuracy of the STDR method
    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return ([method, str(threshold), runtime, RF, F1])
def run_method(method, tree, m=300, kappa=2, mutation_rate=0.05,
               threshold=None, verbose=False):
    """Simulate sequences on ``tree`` and score one reconstruction method.

    Args:
        method: One of ``"RaXML"``, ``"SNJ"``, ``"NJ"``, ``"STR+NJ"``,
            ``"STR+SNJ"`` or ``"STR+RaXML"``. The ``STR+`` variants wrap the
            named base method in ``reconstruct_tree.STDR``.
        tree: Tree model to simulate on and to compare the result against.
        m: Sequence length passed to ``generation.simulate_sequences``.
        kappa: Unused; kept for interface compatibility.
        mutation_rate: Mutation rate passed to the simulation.
        threshold: Forwarded as ``threshhold`` to the ``STR+`` variants.
        verbose: Forwarded to the ``STR+`` variants.

    Returns:
        list: ``[method, str(threshold), runtime, RF, F1]`` where ``runtime``
        is the wall-clock time of the reconstruction call only (building the
        method object is excluded).

    Raises:
        ValueError: If ``method`` is not one of the supported names. This is
            checked before the simulation so a typo fails immediately.
    """
    direct_methods = ("RaXML", "SNJ", "NJ")
    stdr_methods = ("STR+NJ", "STR+SNJ", "STR+RaXML")
    if method not in direct_methods + stdr_methods:
        raise ValueError(
            "Unknown method %r; expected one of %s"
            % (method, ", ".join(direct_methods + stdr_methods)))

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    raxml_args = "-T 2 --HKY85 -c 1"
    if method == "RaXML":
        reconstructor = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = reconstructor(
            observations, taxa_meta, raxml_args=raxml_args)
    elif method == "SNJ":
        reconstructor = reconstruct_tree.SpectralNeighborJoining(
            reconstruct_tree.JC_similarity_matrix)
        start_time = time.time()
        tree_rec = reconstructor(observations, taxa_meta)
    elif method == "NJ":
        reconstructor = reconstruct_tree.NeighborJoining(
            reconstruct_tree.JC_similarity_matrix)
        start_time = time.time()
        tree_rec = reconstructor(observations, taxa_meta)
    else:
        # STDR variants differ only in the base method used on the subtrees.
        if method == "STR+NJ":
            base_method = reconstruct_tree.NeighborJoining
        elif method == "STR+SNJ":
            base_method = reconstruct_tree.SpectralNeighborJoining
        else:
            base_method = reconstruct_tree.RAxML
        # Only the RAxML-backed variant forwards RAxML command-line options.
        extra_args = {"raxml_args": raxml_args} if method == "STR+RaXML" else {}
        spectral_method = reconstruct_tree.STDR(
            base_method, reconstruct_tree.JC_similarity_matrix)
        start_time = time.time()
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations,
            reconstruct_tree.JC_similarity_matrix,
            taxa_metadata=taxa_meta,
            threshhold=threshold,
            min_split=5,
            verbose=verbose,
            **extra_args)
    runtime = time.time() - start_time

    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return ([method, str(threshold), runtime, RF, F1])
import seaborn as sns
import matplotlib.pylab as plt
import dendropy
import copy
import sys
import os

# Make the local ``spectraltree`` package importable both when the script is
# started from inside its folder and from the folder above it.
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'spectraltree'))
sys.path.append(os.path.join(sys.path[0], 'spectraltree'))

import utils
import generation
import reconstruct_tree

N = 1000
num_taxa = 256
jc = generation.Jukes_Cantor()
mutation_rate = [jc.p2t(0.95)]

# Construct data of type 'tree' from class dendropy
namespace = utils.default_namespace(num_taxa)
tree = utils.unrooted_birth_death_tree(num_taxa, birth_rate=1)
# Give every edge the same length.
for x in tree.preorder_edge_iter():
    x.length = 1
#tree = utils.lopsided_tree(num_taxa=num_taxa, namespace=namespace)
#tree = utils.balanced_binary(num_taxa, namespace=namespace)
tree.is_rooted = True

# Collect the taxa below each of the first two edges leaving the seed node.
# The bipartition map is accessed first, as before, and the seed node's child
# edges are looked up once instead of on every iteration.
edge_map = tree.bipartition_edge_map
root_edges = tree.seed_node.child_edges()
for i in edge_map:
    edge = edge_map[i]
    if edge == root_edges[0]:
        taxa_half1 = set(i.leafset_taxa(tree.taxon_namespace))
    if edge == root_edges[1]:
        taxa_half2 = set(i.leafset_taxa(tree.taxon_namespace))
from dendropy.calculate.treecompare import symmetric_difference

num_taxa = 32
N = 1000
reference_tree = utils.balanced_binary(num_taxa)

# %%
###########################################################################
##                   TEST WITH DENDROPY DATA
###########################################################################
print("test dendropy data")

# Time the sequence simulation (the mutation-rate conversion is included in
# the timed region).
time_s = time.time()
data = simulate_discrete_chars(
    N,
    reference_tree,
    Jc69(),
    mutation_rate=generation.Jukes_Cantor().p2t(0.95),
)
print("")
generation_elapsed = time.time() - time_s
print("Time for data generation", generation_elapsed)

# Time the RAxML reconstruction (building the RAxML object is included).
time_s = time.time()
raxml = reconstruct_tree.RAxML()
tree = raxml(data)
runtime = time.time() - time_s

# Report both tree-distance measures and the reconstruction time.
print("Data in DNAcharacterMatrix:")
print("symmetric_difference: ", symmetric_difference(reference_tree, tree))
RF, F1 = reconstruct_tree.compare_trees(reference_tree, tree)
print("raxml: ")
print("RF = ", RF)
print("F1% = ", F1)
print("runtime = ", runtime)