예제 #1
0
def run_method(method, tree, m = 300, kappa = 2, mutation_rate=0.05, threshold = None, verbose = False):
    """Simulate HKY sequences on ``tree``, reconstruct the tree, and score it.

    Parameters
    ----------
    method : str
        Reconstruction method; one of "RaXML", "SNJ", "NJ", "STR+NJ",
        "STR+SNJ" or "STR+RaXML".
    tree
        Reference tree. Used as ``tree_model`` for the simulation and as the
        ground truth passed to ``reconstruct_tree.compare_trees``.
    m
        First positional argument of ``generation.simulate_sequences``
        (presumably the number of simulated sites -- confirm there).
    kappa
        ``kappa`` argument of the ``generation.HKY`` sequence model.
    mutation_rate
        Forwarded to ``generation.simulate_sequences``.
    threshold
        Forwarded as ``threshhold`` to ``deep_spectral_tree_reconstruction``;
        only the "STR+..." methods use it.
    verbose : bool
        Forwarded to ``deep_spectral_tree_reconstruction`` ("STR+..." only).

    Returns
    -------
    list
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time of the reconstruction call only (the simulation and
        the construction of the method object are excluded).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. The check runs
        before the simulation so a typo fails immediately instead of
        surfacing later as an unbound ``tree_rec``.
    """
    supported = ("RaXML", "SNJ", "NJ", "STR+NJ", "STR+SNJ", "STR+RaXML")
    if method not in supported:
        raise ValueError("Unknown method %r; expected one of: %s"
                         % (method, ", ".join(supported)))

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(m, tree_model=tree, seq_model=generation.HKY(kappa = kappa), mutation_rate=mutation_rate, alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    raxml_args = "-T 2 --HKY85 -c 1"
    similarity = reconstruct_tree.HKY_similarity_matrix

    # In every branch the timer is (re)started only after the method object
    # has been built, so `runtime` covers the reconstruction call alone.
    if method == "RaXML":
        raxml_HKY = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = raxml_HKY(observations, taxa_meta, raxml_args=raxml_args)
    elif method == "SNJ":
        snj = reconstruct_tree.SpectralNeighborJoining(similarity)
        start_time = time.time()
        tree_rec = snj(observations, taxa_meta)
    elif method == "NJ":
        nj = reconstruct_tree.NeighborJoining(similarity)
        start_time = time.time()
        tree_rec = nj(observations, taxa_meta)
    else:
        # The three "STR+<base>" variants differ only in the base
        # reconstruction class and in whether RAxML arguments are forwarded.
        extra_kwargs = {}
        if method == "STR+NJ":
            base_method = reconstruct_tree.NeighborJoining
        elif method == "STR+SNJ":
            base_method = reconstruct_tree.SpectralNeighborJoining
        else:  # "STR+RaXML"
            base_method = reconstruct_tree.RAxML
            extra_kwargs["raxml_args"] = raxml_args
        spectral_method = reconstruct_tree.STDR(base_method, similarity)
        start_time = time.time()
        # `threshhold` (sic) is the keyword spelling the callee is given in
        # the original code; it is kept as-is.
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations, similarity,
            taxa_metadata = taxa_meta,
            threshhold = threshold, min_split = 5, verbose = verbose,
            **extra_kwargs)
    runtime = time.time() - start_time

    RF,F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None: print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ",RF)
    print("F1% = ",F1) 
    return([method, str(threshold), runtime, RF, F1])
    reconstruct_tree.JC_similarity_matrix)
# Reconstruction methods under comparison. `snj` is defined above this
# excerpt. NOTE(review): `nj` is constructed here but is not included in
# `methods` below -- confirm whether that is intentional.
nj = reconstruct_tree.NeighborJoining(reconstruct_tree.JC_similarity_matrix)
treesvd = reconstruct_tree.TreeSVD()
methods = [snj, treesvd]
# Disabled: batch run through compare_methods.experiment.
#results = compare_methods.experiment([reference_tree], jc, N_vec, methods=methods,\
#     mutation_rates = [mutation_rate], reps_per_tree=num_reps)
# Empty results table with the listed columns.
df = pd.DataFrame(
    columns=['method', 'runtime', 'RF', 'n', 'cherries_ref', 'cherries_res'])
for i in np.arange(num_reps):
    print(i)
    reference_tree = utils.unrooted_pure_kingman_tree(num_taxa)
    ch_ref = cherry_count_for_tree(reference_tree)
    for n in N_vec:
        observations, taxa_meta = generation.simulate_sequences(
            n,
            tree_model=reference_tree,
            seq_model=jc,
            mutation_rate=mutation_rate,
            alphabet="DNA")

        # Tree svd
        t_s = time.time()
        tree_svd_rec = treesvd(observations, taxa_meta)
        runtime_treesvd = time.time() - t_s
        RF_svd, F1 = reconstruct_tree.compare_trees(tree_svd_rec,
                                                    reference_tree)
        ch_svd = cherry_count_for_tree(tree_svd_rec)
        df = df.append(
            {
                'method': 'treesvd',
                'runtime': runtime_treesvd,
                'RF': RF_svd,
# Script fragment: simulate HKY sequences on a balanced binary tree and set up
# a spectral (STR) reconstruction that uses RAxML as its base method.
# `num_taxa` and `N` are defined outside this excerpt.
threshold = 32  # passed as `threshold` to the STR constructor below
# jc = generation.Jukes_Cantor()
hky = generation.HKY(kappa=2)
mutation_rate = 0.05
# mutation_rate = [jc.p2t(0.95)]
np.random.seed(0)
# reference_tree = utils.unrooted_birth_death_tree(num_taxa, birth_rate=1)
# reference_tree = utils.lopsided_tree(num_taxa)
reference_tree = utils.balanced_binary(num_taxa)
# for x in reference_tree.preorder_edge_iter():
#     x.length = 1
# The global NumPy RNG is re-seeded immediately before simulating; it is handed
# to simulate_sequences via `rng=np.random` (presumably so the simulated
# sequences do not depend on how the tree was built -- confirm).
np.random.seed(0)
t0 = time.time()
observations, meta = generation.simulate_sequences(N,
                                                   tree_model=reference_tree,
                                                   seq_model=hky,
                                                   mutation_rate=mutation_rate,
                                                   rng=np.random,
                                                   alphabet='DNA')
print("gen time: ", time.time() - t0)
# NOTE(review): `str` here must be a project module bound outside this excerpt
# (the builtin `str` type has no `STR` attribute); the name shadows the builtin.
spectral_method = str.STR(reconstruct_tree.RAxML,
                          reconstruct_tree.HKY_similarity_matrix,
                          threshold=threshold,
                          merge_method="least_square",
                          num_gaps=1,
                          min_split=5,
                          verbose=False)

# Timer restarted for the reconstruction step; the profiled call below is
# currently commented out.
t0 = time.time()
#cProfile.run("""tree_rec = spectral_method.deep_spectral_tree_reconstruction(observations, reconstruct_tree.JC_similarity_matrix,
#                                    taxa_metadata= meta,
#                                    threshhold = 8, min_split = 3 ,verbose=False)""", filename="temp1.prof")
예제 #4
0
def run_method(method,
               size,
               run_num,
               tree,
               m=300,
               kappa=2,
               mutation_rate=0.05,
               threshold=None,
               verbose=False):
    """Run STDR with RAxML on simulated data and dump all artifacts to disk.

    Sequences are simulated on ``tree`` with a Jukes-Cantor model, the tree is
    reconstructed with ``reconstruct_tree.STDR`` using ``reconstruct_tree.RAxML``
    as the base method, and the true tree, the reconstructed tree, the taxon
    labels and the pairwise distance matrix are written to a per-run folder.

    Parameters
    ----------
    method : str
        Label used in the output folder name and echoed in the result. It
        does NOT select the algorithm; STDR + RAxML is always used.
    size
        Written verbatim as the first line of the distance file.
    run_num
        Run identifier, used in the output folder name.
    tree
        Reference tree; must provide ``write(path=..., schema=...)``.
    m
        First positional argument of ``generation.simulate_sequences``; also
        part of the output folder name.
    kappa
        Unused in this function (the sequence model is Jukes-Cantor); kept
        for interface compatibility.
    mutation_rate
        Forwarded to ``generation.simulate_sequences``.
    threshold
        Forwarded as ``threshhold`` to ``deep_spectral_tree_reconstruction``;
        also part of the output folder name.
    verbose : bool
        Forwarded to ``deep_spectral_tree_reconstruction``.

    Returns
    -------
    list
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time of the reconstruction call.
    """
    subtree_folder = "/gpfs/ysm/scratch60/morgan_levine/mw957/tree_merge_test/seqlen_" + str(
        m) + "_" + method + "_" + str(threshold) + "_" + str(run_num) + "/"
    # Always start from an empty output folder. makedirs (rather than mkdir)
    # also creates missing parent directories, so a fresh scratch area works.
    if os.path.exists(subtree_folder):
        shutil.rmtree(subtree_folder)
    os.makedirs(subtree_folder)
    tree.write(path=subtree_folder + "true_tree.txt", schema="newick")
    # Template with a "%s" placeholder, handed to the reconstruction call.
    subtree_filename = subtree_folder + "subtree_%s.txt"

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    spectral_method = reconstruct_tree.STDR(
        reconstruct_tree.RAxML, reconstruct_tree.JC_similarity_matrix)
    start_time = time.time()
    # NOTE(review): the data are simulated under Jukes-Cantor while RAxML is
    # asked for "--HKY85"; left unchanged, confirm this is intended.
    tree_rec = spectral_method.deep_spectral_tree_reconstruction(
        observations,
        reconstruct_tree.JC_similarity_matrix,
        taxa_metadata=taxa_meta,
        threshhold=threshold,
        raxml_args="-T 2 --HKY85 -c 1",
        min_split=5,
        verbose=verbose,
        subtree_filename=subtree_filename)
    runtime = time.time() - start_time
    tree_rec.write(path=subtree_folder + "STDR_tree.txt", schema="newick")

    distance = reconstruct_tree.JC_distance_matrix(observations, taxa_meta)
    distance_pd = pd.DataFrame(distance)
    taxa_list = [x.label for x in taxa_meta]

    with open(subtree_folder + 'taxa.txt', 'w') as f:
        for item in taxa_list:
            f.write("%s\n" % item)

    # Distance file layout: a first line holding `size`, then one
    # tab-separated row per taxon, starting with the taxon label.
    # The file name says "HKY" although the matrix is a JC distance; the name
    # is kept because other tooling may read this path.
    distance_pd.index = taxa_list
    distance_path = subtree_folder + "HKY_distance.txt"
    # to_csv() without a path returns the CSV text, so the header line and
    # the matrix are written in one pass instead of write / re-read / rewrite.
    distance_rows = distance_pd.to_csv(sep="\t", header=False)
    with open(distance_path, 'w') as modified:
        modified.write(str(size) + "\n" + distance_rows)

    # accuracy of the STDR method
    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)

    if threshold is not None: print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return ([method, str(threshold), runtime, RF, F1])
# Script fragment: compare the 'least_square' and 'angle' merge methods of the
# spectral tree reconstruction on lopsided trees, collecting the two values
# returned by compare_trees for each run. `num_taxa` and `N` are defined
# outside this excerpt.
jc = generation.Jukes_Cantor()
# NOTE(review): mutation_rate is a one-element list here, whereas other
# snippets pass a scalar -- confirm simulate_sequences accepts both.
mutation_rate = [jc.p2t(0.95)]
num_itr = 2  #0
# reference_tree = utils.unrooted_birth_death_tree(num_taxa, birth_rate=1)
# for x in reference_tree.preorder_edge_iter():
#     x.length = 1
merging_method_list = ['least_square', 'angle']
# Per-merge-method lists of scores, one entry appended per iteration.
RF = {'least_square': [], 'angle': []}
F1 = {'least_square': [], 'angle': []}
for merge_method in merging_method_list:
    for i in range(num_itr):
        # A new reference tree and a new set of sequences are generated on
        # every iteration.
        #reference_tree = utils.balanced_binary(num_taxa)
        reference_tree = utils.lopsided_tree(num_taxa)
        observations, taxa_meta = generation.simulate_sequences(
            N,
            tree_model=reference_tree,
            seq_model=jc,
            mutation_rate=mutation_rate)
        spectral_method = reconstruct_tree.SpectralTreeReconstruction(
            reconstruct_tree.NeighborJoining,
            reconstruct_tree.JC_similarity_matrix)
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations,
            reconstruct_tree.JC_similarity_matrix,
            taxa_metadata=taxa_meta,
            threshhold=16,
            merge_method=merge_method)
        # Score the reconstruction against the tree it was simulated from.
        RF_i, F1_i = reconstruct_tree.compare_trees(tree_rec, reference_tree)
        RF[merge_method].append(RF_i)
        F1[merge_method].append(F1_i)