def cherry_count_for_tree(tree):
    """Count the cherries of ``tree``.

    A leaf is tallied when the first two children of its parent are both
    leaves. The tally is halved before returning, since the two leaves of
    such a pair are each tallied once.

    Args:
        tree: object exposing ``leaf_nodes()``; every node it yields must
            provide ``parent_node``, and parents must provide
            ``child_nodes()`` whose items provide ``is_leaf()``.

    Returns:
        The halved tally, computed with true division (so a float).
    """

    def _first_two_siblings_are_leaves(leaf):
        # Only the first two entries of the parent's child list are looked
        # at; any further children are ignored.
        parent = leaf.parent_node
        return (parent.child_nodes()[0].is_leaf()
                and parent.child_nodes()[1].is_leaf())

    paired_leaves = sum(
        1 for leaf in tree.leaf_nodes()
        if _first_two_siblings_are_leaves(leaf))
    return paired_leaves / 2


# Experiment configuration.
# num_taxa is passed to utils.unrooted_pure_kingman_tree in the loop below;
# num_reps is the number of loop iterations.
num_taxa = 20
num_reps = 10

# Values 100, 150, ..., 350. The only visible use is the commented-out
# compare_methods.experiment call (presumably the sample sizes to sweep --
# TODO confirm against the rest of the script).
N_vec = np.arange(100, 400, 50)
#reference_tree = utils.balanced_binary(num_taxa)
jc = generation.Jukes_Cantor(num_classes=2)

# NOTE(review): p2t is defined elsewhere; it presumably maps a transition
# probability to a rate/time value -- confirm before changing 0.9.
mutation_rate = jc.p2t(0.9)

# Reconstruction methods. snj and nj are both built on JC_similarity_matrix.
snj = reconstruct_tree.SpectralNeighborJoining(
    reconstruct_tree.JC_similarity_matrix)
nj = reconstruct_tree.NeighborJoining(reconstruct_tree.JC_similarity_matrix)
treesvd = reconstruct_tree.TreeSVD()
# nj is constructed above but is not part of the list of methods.
methods = [snj, treesvd]
#results = compare_methods.experiment([reference_tree], jc, N_vec, methods=methods,\
#     mutation_rates = [mutation_rate], reps_per_tree=num_reps)
# Empty results table; the columns name the quantities recorded per run.
df = pd.DataFrame(
    columns=['method', 'runtime', 'RF', 'n', 'cherries_ref', 'cherries_res'])
for i in np.arange(num_reps):
    # Print the repetition index as a progress indicator.
    print(i)
    # A new reference tree is generated on every repetition.
    reference_tree = utils.unrooted_pure_kingman_tree(num_taxa)
# Example no. 2
# 0
def run_method(method,
               size,
               run_num,
               tree,
               m=300,
               kappa=2,
               mutation_rate=0.05,
               threshold=None,
               verbose=False):
    """Simulate sequences on ``tree``, rebuild it with STDR + RAxML, score it.

    A per-run output folder is (re)created and filled with the true tree,
    the reconstructed tree, the taxon labels and a distance matrix whose
    first line is ``size``.

    Args:
        method: label used in the output folder name and echoed in the
            result. It does not select the reconstruction algorithm here;
            STDR with RAxML is always used.
        size: value written as the first line of the distance file
            (presumably the number of taxa -- confirm with the caller).
        run_num: run identifier, used only in the output folder name.
        tree: tree the sequences are simulated on and the reconstruction is
            compared against; must support ``write(path=..., schema=...)``.
        m: first argument of ``generation.simulate_sequences``; also part of
            the folder name (``seqlen_<m>``).
        kappa: unused; kept so existing callers keep working.
        mutation_rate: forwarded to ``generation.simulate_sequences``.
        threshold: forwarded to the STDR reconstruction as ``threshhold``
            and included in the folder name and the result.
        verbose: forwarded to the STDR reconstruction.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]``, where ``runtime`` is
        the wall-clock duration of the reconstruction call only and
        ``RF``/``F1`` come from ``reconstruct_tree.compare_trees``.
    """
    # The trailing "/" is kept so every derived path is identical to the
    # layout produced by earlier runs.
    subtree_folder = (
        "/gpfs/ysm/scratch60/morgan_levine/mw957/tree_merge_test/"
        "seqlen_%s_%s_%s_%s/" % (m, method, threshold, run_num))

    # Start from an empty folder. makedirs (rather than mkdir) also creates
    # missing parent directories instead of failing on a fresh scratch area.
    if os.path.exists(subtree_folder):
        shutil.rmtree(subtree_folder)
    os.makedirs(subtree_folder)

    tree.write(path=os.path.join(subtree_folder, "true_tree.txt"),
               schema="newick")
    # "%s" is left unexpanded: the template is handed to the STDR call.
    subtree_filename = os.path.join(subtree_folder, "subtree_%s.txt")

    sim_start = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    print("Simulation took %s seconds" % (time.time() - sim_start))

    spectral_method = reconstruct_tree.STDR(
        reconstruct_tree.RAxML, reconstruct_tree.JC_similarity_matrix)
    # Only the reconstruction itself is timed; file output is excluded.
    start_time = time.time()
    tree_rec = spectral_method.deep_spectral_tree_reconstruction(
        observations,
        reconstruct_tree.JC_similarity_matrix,
        taxa_metadata=taxa_meta,
        threshhold=threshold,  # keyword spelling as used by the callee
        raxml_args="-T 2 --HKY85 -c 1",
        min_split=5,
        verbose=verbose,
        subtree_filename=subtree_filename)
    runtime = time.time() - start_time
    tree_rec.write(path=os.path.join(subtree_folder, "STDR_tree.txt"),
                   schema="newick")

    taxa_list = [taxon.label for taxon in taxa_meta]
    with open(os.path.join(subtree_folder, "taxa.txt"), "w") as taxa_file:
        for label in taxa_list:
            taxa_file.write("%s\n" % label)

    # NOTE(review): the file is named "HKY_distance" but the matrix comes
    # from JC_distance_matrix; the name is kept because downstream tooling
    # may look for it.
    distance_pd = pd.DataFrame(
        reconstruct_tree.JC_distance_matrix(observations, taxa_meta))
    distance_pd.index = taxa_list
    distance_path = os.path.join(subtree_folder, "HKY_distance.txt")
    distance_pd.to_csv(distance_path, sep="\t", header=False)
    # Prepend ``size`` as the first line by rewriting the file; this keeps
    # the output byte-identical to what earlier runs produced.
    with open(distance_path, "r") as original:
        data = original.read()
    with open(distance_path, "w") as modified:
        modified.write(str(size) + "\n" + data)

    # accuracy of the STDR method
    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return [method, str(threshold), runtime, RF, F1]
def run_method(method,
               tree,
               m=300,
               kappa=2,
               mutation_rate=0.05,
               threshold=None,
               verbose=False):
    """Simulate sequences on ``tree``, reconstruct it with ``method``, score it.

    Args:
        method: one of ``"RaXML"``, ``"SNJ"``, ``"NJ"``, ``"STR+NJ"``,
            ``"STR+SNJ"`` or ``"STR+RaXML"``.
        tree: tree the sequences are simulated on and the reconstruction is
            compared against.
        m: first argument of ``generation.simulate_sequences``.
        kappa: unused; kept so existing callers keep working.
        mutation_rate: forwarded to ``generation.simulate_sequences``.
        threshold: forwarded as ``threshhold`` to the ``STR+*`` methods;
            ignored by the others.
        verbose: forwarded to the ``STR+*`` methods; ignored by the others.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]``, where ``runtime`` is
        the wall-clock duration of the reconstruction call only and
        ``RF``/``F1`` come from ``reconstruct_tree.compare_trees``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    known_methods = ("RaXML", "SNJ", "NJ", "STR+NJ", "STR+SNJ", "STR+RaXML")
    # Fail before the (potentially slow) simulation. Previously an unknown
    # name ran the simulation and then crashed on an unbound ``tree_rec``.
    if method not in known_methods:
        raise ValueError("Unknown method %r; expected one of: %s" %
                         (method, ", ".join(known_methods)))

    raxml_args = "-T 2 --HKY85 -c 1"

    sim_start = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    print("Simulation took %s seconds" % (time.time() - sim_start))

    # In every branch the timer starts after the reconstruction object has
    # been built, so only the reconstruction call itself is measured.
    if method == "RaXML":
        raxml_HKY = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = raxml_HKY(observations, taxa_meta, raxml_args=raxml_args)
    elif method == "SNJ":
        snj = reconstruct_tree.SpectralNeighborJoining(
            reconstruct_tree.JC_similarity_matrix)
        start_time = time.time()
        tree_rec = snj(observations, taxa_meta)
    elif method == "NJ":
        nj = reconstruct_tree.NeighborJoining(
            reconstruct_tree.JC_similarity_matrix)
        start_time = time.time()
        tree_rec = nj(observations, taxa_meta)
    else:
        # The three "STR+<base>" variants differ only in the base
        # reconstruction class, plus RAxML arguments for the RAxML base.
        stdr_bases = {
            "STR+NJ": reconstruct_tree.NeighborJoining,
            "STR+SNJ": reconstruct_tree.SpectralNeighborJoining,
            "STR+RaXML": reconstruct_tree.RAxML,
        }
        extra_kwargs = {}
        if method == "STR+RaXML":
            extra_kwargs["raxml_args"] = raxml_args
        spectral_method = reconstruct_tree.STDR(
            stdr_bases[method], reconstruct_tree.JC_similarity_matrix)
        start_time = time.time()
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations,
            reconstruct_tree.JC_similarity_matrix,
            taxa_metadata=taxa_meta,
            threshhold=threshold,  # keyword spelling as used by the callee
            min_split=5,
            verbose=verbose,
            **extra_kwargs)
    runtime = time.time() - start_time

    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return [method, str(threshold), runtime, RF, F1]
import seaborn as sns
import matplotlib.pylab as plt
import dendropy
import copy

import sys, os
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'spectraltree'))
sys.path.append(os.path.join(sys.path[0], 'spectraltree'))

import utils
import generation
import reconstruct_tree

# Experiment configuration. N and namespace are not used by any active
# statement in the visible part of this script; namespace appears only in
# the commented-out tree alternatives below.
N = 1000
num_taxa = 256
jc = generation.Jukes_Cantor()
# Single-element list holding the value of jc.p2t(0.95).
mutation_rate = [jc.p2t(0.95)]
# Build the tree used by this script (a birth-death tree from utils).
namespace = utils.default_namespace(num_taxa)
tree = utils.unrooted_birth_death_tree(num_taxa, birth_rate=1)
# Overwrite every edge length with 1.
for x in tree.preorder_edge_iter():
    x.length = 1
#tree = utils.lopsided_tree(num_taxa=num_taxa, namespace=namespace)
#tree = utils.balanced_binary(num_taxa, namespace=namespace)
tree.is_rooted = True

# Find the bipartitions mapped to the first and second child edges of the
# seed node and collect the leaf taxa of each into a set.
# NOTE(review): taxa_half1 / taxa_half2 stay undefined if no bipartition maps
# to the corresponding edge.
for i in tree.bipartition_edge_map:
    if tree.bipartition_edge_map[i] == tree.seed_node.child_edges()[0]:
        taxa_half1 = set(i.leafset_taxa(tree.taxon_namespace))
    if tree.bipartition_edge_map[i] == tree.seed_node.child_edges()[1]:
        taxa_half2 = set(i.leafset_taxa(tree.taxon_namespace))
# Example no. 5
# 0
from dendropy.calculate.treecompare import symmetric_difference

# Test configuration: N is the first argument of simulate_discrete_chars and
# num_taxa the argument of utils.balanced_binary.
# NOTE(review): time, simulate_discrete_chars, Jc69, utils, generation and
# reconstruct_tree are not imported in the visible lines; they must come from
# elsewhere in the script.
num_taxa = 32
N = 1000
reference_tree = utils.balanced_binary(num_taxa)
# %%
###########################################################################
##                   TEST WITH DENDROPY DATA
###########################################################################
print("test dendropy data")
# Time the data generation step.
time_s = time.time()
data = simulate_discrete_chars(
    N,
    reference_tree,
    Jc69(),
    mutation_rate=generation.Jukes_Cantor().p2t(0.95),
)
print("")
print("Time for data generation", time.time() - time_s)
# Time the RAxML reconstruction only (constructing the wrapper is included).
time_s = time.time()
raxml = reconstruct_tree.RAxML()
tree = raxml(data)
runtime = time.time() - time_s

# Compare the reconstruction with the reference tree in two ways:
# dendropy's symmetric_difference and the project's compare_trees.
print("Data in DNAcharacterMatrix:")
print("symmetric_difference: ", symmetric_difference(reference_tree, tree))
RF, F1 = reconstruct_tree.compare_trees(reference_tree, tree)
print("raxml: ")
print("RF = ", RF)
print("F1% = ", F1)
print("runtime = ", runtime)