Exemplo n.º 1
0
def run_method(method, tree, m = 300, kappa = 2, mutation_rate=0.05, threshold = None, verbose = False):
    """Simulate HKY sequences on ``tree`` and time one reconstruction method.

    Args:
        method: One of "RaXML", "SNJ", "NJ", "STR+NJ", "STR+SNJ" or
            "STR+RaXML".
        tree: Reference tree; used as ``tree_model`` for the simulation and
            as the ground truth in ``reconstruct_tree.compare_trees``.
        m: First positional argument of ``generation.simulate_sequences``
            (presumably the sequence length -- confirm against that API).
        kappa: ``kappa`` argument of the ``generation.HKY`` sequence model.
        mutation_rate: Forwarded to ``generation.simulate_sequences``.
        threshold: Forwarded as ``threshhold`` (spelling used by the callee)
            to the STDR-based methods; ignored by the direct methods.
        verbose: Forwarded to the STDR-based methods.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time in seconds of the reconstruction call only, and
        ``RF``/``F1`` are the values returned by ``compare_trees``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    direct_methods = ("RaXML", "SNJ", "NJ")
    deep_methods = ("STR+NJ", "STR+SNJ", "STR+RaXML")
    # Fail fast: previously an unknown name ran the whole simulation and then
    # crashed with a NameError on the unbound ``tree_rec``.
    if method not in direct_methods + deep_methods:
        raise ValueError("Unknown method %r; expected one of: %s"
                         % (method, ", ".join(direct_methods + deep_methods)))

    start_time = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.HKY(kappa=kappa),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    runtime = time.time() - start_time
    print("Simulation took %s seconds" % runtime)

    similarity = reconstruct_tree.HKY_similarity_matrix
    raxml_args = "-T 2 --HKY85 -c 1"

    # In every branch the reconstructor is built *before* the clock starts so
    # that only the reconstruction call itself is timed.
    if method == "RaXML":
        reconstructor = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = reconstructor(observations, taxa_meta, raxml_args=raxml_args)
    elif method == "SNJ":
        reconstructor = reconstruct_tree.SpectralNeighborJoining(similarity)
        start_time = time.time()
        tree_rec = reconstructor(observations, taxa_meta)
    elif method == "NJ":
        reconstructor = reconstruct_tree.NeighborJoining(similarity)
        start_time = time.time()
        tree_rec = reconstructor(observations, taxa_meta)
    else:
        # STDR variants differ only in the inner reconstruction class and in
        # the extra RAxML arguments.
        inner_by_method = {
            "STR+NJ": reconstruct_tree.NeighborJoining,
            "STR+SNJ": reconstruct_tree.SpectralNeighborJoining,
            "STR+RaXML": reconstruct_tree.RAxML,
        }
        spectral_method = reconstruct_tree.STDR(inner_by_method[method], similarity)
        extra_kwargs = {"raxml_args": raxml_args} if method == "STR+RaXML" else {}
        start_time = time.time()
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations,
            similarity,
            taxa_metadata=taxa_meta,
            threshhold=threshold,
            min_split=5,
            verbose=verbose,
            **extra_kwargs)
    runtime = time.time() - start_time

    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return [method, str(threshold), runtime, RF, F1]
    print(i)
    reference_tree = utils.unrooted_pure_kingman_tree(num_taxa)
    ch_ref = cherry_count_for_tree(reference_tree)
    for n in N_vec:
        observations, taxa_meta = generation.simulate_sequences(
            n,
            tree_model=reference_tree,
            seq_model=jc,
            mutation_rate=mutation_rate,
            alphabet="DNA")

        # Tree svd
        t_s = time.time()
        tree_svd_rec = treesvd(observations, taxa_meta)
        runtime_treesvd = time.time() - t_s
        RF_svd, F1 = reconstruct_tree.compare_trees(tree_svd_rec,
                                                    reference_tree)
        ch_svd = cherry_count_for_tree(tree_svd_rec)
        df = df.append(
            {
                'method': 'treesvd',
                'runtime': runtime_treesvd,
                'RF': RF_svd,
                'n': n,
                'cherries_ref': ch_ref,
                'cherries_res': ch_svd
            },
            ignore_index=True)

        # SNJ
        t_s = time.time()
        tree_snj = snj(observations, taxa_meta)
# Simulate N characters per taxon on H3N2_tree under an HKY85 model (kappa = 2).
data_HKY = simulate_discrete_chars(N, H3N2_tree, Hky85(kappa = 2), mutation_rate=0.1)
# Flatten the simulated character matrix into an array of state symbols,
# one row per taxon in the matrix's taxon namespace.
ch_list = list()
for t in data_HKY.taxon_namespace:
    ch_list.append([x.symbol for x in data_HKY[t]])
ch_arr = np.array(ch_list)
# Mean site-wise agreement for every ordered pair of rows (self-pairs included).
# NOTE(review): `identical` is not read anywhere in the visible code, and
# `product` is presumably itertools.product -- confirm.
identical = np.array([np.mean(a == b) for a, b in product(ch_arr, repeat = 2)])

# Disabled profiling of the similarity-matrix computation.
#start_time = time.time()
#cProfile.run('S = HKY_similarity_matrix(ch_arr)')
#compute_s_time = time.time() - start_time
#print("--- %s seconds ---" % compute_s_time)
threshold = 128
t1 = time.time()
# STDR with RAxML as the inner reconstruction method.  Note that the timer
# started above also covers construction of the STDR object.
spectral_method = reconstruct_tree.STDR(reconstruct_tree.RAxML,
                                                              reconstruct_tree.HKY_similarity_matrix)
tree_rec = spectral_method.deep_spectral_tree_reconstruction(ch_arr, reconstruct_tree.HKY_similarity_matrix, 
                                                            taxon_namespace = H3N2_tree.taxon_namespace, 
                                                            threshhold = threshold,min_split = 30)
runtime = time.time()-t1


# Compare the reconstruction with the reference tree and report the metrics.
# NOTE(review): the label printed below says "SNJ" and the variables are named
# "Deep_nj", but the tree was built with STDR + RAxML above -- confirm which
# label is intended.
Deep_nj_RF, Deep_nj_RF_F1 = reconstruct_tree.compare_trees(tree_rec, H3N2_tree)
print("SNJ: ")
print("RF = ", Deep_nj_RF)
print("F1% = ", Deep_nj_RF_F1)
print("runtime = ", runtime)
print("")



print(tree_path)
Exemplo n.º 4
0
        T_left = copy.deepcopy(H3N2_tree).extract_tree_with_taxa_labels(labels = left_namespace)
        T_left.purge_taxon_namespace()
        s = T_left.as_string(schema = "newick")
        T_left = dendropy.Tree.get(data=s, schema="newick", taxon_namespace = left_taxa)
        right_namespace = list(taxon_namespace_label[np.where(np.logical_not(partition))[0]])
        right_taxa = dendropy.TaxonNamespace([taxon for taxon in H3N2_tree.taxon_namespace
            if taxon.label in right_namespace])
        T_right = copy.deepcopy(H3N2_tree).extract_tree_with_taxa_labels(labels = right_namespace)
        T_right.purge_taxon_namespace()
        s = T_right.as_string(schema = "newick")
        T_right = dendropy.Tree.get(data=s,
        schema="newick", taxon_namespace = right_taxa)
        
        start_time = time.time()
        joined_tree = reconstruct_tree.join_trees_with_spectral_root_finding_ls(
            HKY_sim, T_left, T_right, taxon_namespace = H3N2_tree.taxon_namespace)
        runtime = time.time() - start_time
        
        RF,F1 = reconstruct_tree.compare_trees(joined_tree, H3N2_tree)
        
        Ns.append(n)
        par1s.append(par1_size)
        par2s.append(par2_size)
        RFs.append(RF)
        F1s.append(F1)
        rts.append(runtime)
        
# Assemble the per-run lists into one table (one row per run) and write it
# to CSV.
# NOTE(review): the output location is a hard-coded absolute cluster path.
perf_metrics = pd.DataFrame({'seqlength': Ns, 'par1_size': par1s, 'par2_size': par2s, 
                             'RF': RFs, "F1": F1s, "runtime": rts})
perf_metrics.to_csv("/gpfs/ysm/project/kleinstein/mw957/repos/spec_tree/script/rooting_metrics_ls_2.csv")
Exemplo n.º 5
0
def run_method(method, tree, threshold=None):
    """Simulate 1000 HKY85 characters on ``tree`` and time one reconstruction.

    Args:
        method: One of "RaXML", "SNJ", "NJ", "STR + NJ", "STR + SNJ" or
            "STR + RaXML".
        tree: Reference tree; used for the simulation, supplies
            ``tree.taxon_namespace`` to the reconstructors, and is the ground
            truth for ``reconstruct_tree.compare_trees``.
        threshold: Forwarded as ``threshhold`` (spelling used by the callee)
            to the spectral ("STR + ...") methods; ignored otherwise.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time in seconds of the reconstruction call only.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    direct_methods = ("RaXML", "SNJ", "NJ")
    deep_methods = ("STR + NJ", "STR + SNJ", "STR + RaXML")
    # Fail fast: previously an unknown name ran the simulation and then
    # crashed on the unbound ``start_time``.
    if method not in direct_methods + deep_methods:
        raise ValueError("Unknown method %r; expected one of: %s"
                         % (method, ", ".join(direct_methods + deep_methods)))

    data_HKY = simulate_discrete_chars(1000,
                                       tree,
                                       Hky85(kappa=2),
                                       mutation_rate=0.1)
    # One row of state symbols per taxon in the simulated matrix.
    ch_arr = np.array([[x.symbol for x in data_HKY[t]]
                       for t in data_HKY.taxon_namespace])

    similarity = reconstruct_tree.HKY_similarity_matrix
    raxml_args = "-T 2 --HKY85 -c 1"

    # Reconstructors are built before the clock starts so that only the
    # reconstruction call itself is timed.
    if method == "RaXML":
        # RAxML consumes the character matrix directly, not the symbol array.
        raxml_HKY = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = raxml_HKY(data_HKY, raxml_args=raxml_args)
    elif method == "SNJ":
        snj = reconstruct_tree.SpectralNeighborJoining(similarity)
        start_time = time.time()
        tree_rec = snj(ch_arr, tree.taxon_namespace)
    elif method == "NJ":
        nj = reconstruct_tree.NeighborJoining(similarity)
        start_time = time.time()
        tree_rec = nj(ch_arr, tree.taxon_namespace)
    else:
        inner_by_method = {
            "STR + NJ": reconstruct_tree.NeighborJoining,
            "STR + SNJ": reconstruct_tree.SpectralNeighborJoining,
            "STR + RaXML": reconstruct_tree.RAxML,
        }
        spectral_method = reconstruct_tree.SpectralTreeReconstruction(
            inner_by_method[method], similarity)
        extra_kwargs = {"raxml_args": raxml_args} if method == "STR + RaXML" else {}
        start_time = time.time()
        # NOTE(review): "reonstruction" is kept exactly as the original call
        # spelled it; confirm it matches the installed reconstruct_tree API.
        tree_rec = spectral_method.deep_spectral_tree_reonstruction(
            ch_arr,
            similarity,
            taxon_namespace=tree.taxon_namespace,
            threshhold=threshold,
            min_split=5,
            **extra_kwargs)
    runtime = time.time() - start_time

    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return [method, str(threshold), runtime, RF, F1]
Exemplo n.º 6
0
def run_method(method,
               size,
               run_num,
               tree,
               m=300,
               kappa=2,
               mutation_rate=0.05,
               threshold=None,
               verbose=False):
    """Run STDR (with RAxML) on Jukes-Cantor data and dump artefacts to disk.

    A fresh per-run output folder is created (any existing one is deleted).
    Written into it: the true tree, the reconstructed tree, the taxon labels
    and a tab-separated distance matrix whose first line is ``size``.
    ``subtree_filename`` is also handed to STDR as a "%s" path template.

    ``method`` only affects the folder name and the printed label, and
    ``kappa`` is not used by this function.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time in seconds of the STDR reconstruction call.
    """
    run_tag = "_".join([str(m), method, str(threshold), str(run_num)])
    out_dir = ("/gpfs/ysm/scratch60/morgan_levine/mw957/tree_merge_test/seqlen_"
               + run_tag + "/")
    # Always start from an empty output directory.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
    tree.write(path=out_dir + "true_tree.txt", schema="newick")
    subtree_template = out_dir + "subtree_%s.txt"

    sim_started = time.time()
    observations, taxa_meta = generation.simulate_sequences(
        m,
        tree_model=tree,
        seq_model=generation.Jukes_Cantor(),
        mutation_rate=mutation_rate,
        alphabet="DNA")
    runtime = time.time() - sim_started
    print("Simulation took %s seconds" % runtime)

    similarity = reconstruct_tree.JC_similarity_matrix
    stdr = reconstruct_tree.STDR(reconstruct_tree.RAxML, similarity)
    rec_started = time.time()
    tree_rec = stdr.deep_spectral_tree_reconstruction(
        observations,
        similarity,
        taxa_metadata=taxa_meta,
        threshhold=threshold,
        raxml_args="-T 2 --HKY85 -c 1",
        min_split=5,
        verbose=verbose,
        subtree_filename=subtree_template)
    runtime = time.time() - rec_started
    tree_rec.write(path=out_dir + "STDR_tree.txt", schema="newick")

    distance_pd = pd.DataFrame(
        reconstruct_tree.JC_distance_matrix(observations, taxa_meta))
    taxa_list = [taxon.label for taxon in taxa_meta]

    # One taxon label per line.
    with open(out_dir + 'taxa.txt', 'w') as labels_file:
        labels_file.writelines("%s\n" % label for label in taxa_list)

    distance_pd.index = taxa_list
    distance_path = out_dir + "HKY_distance.txt"
    distance_pd.to_csv(distance_path, sep="\t", header=False)
    # Prepend ``size`` as the first line by re-writing the file.
    with open(distance_path, 'r') as written:
        matrix_text = written.read()
    with open(distance_path, 'w') as rewritten:
        rewritten.write(str(size) + "\n" + matrix_text)

    # accuracy of the STDR method
    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)

    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return [method, str(threshold), runtime, RF, F1]
Exemplo n.º 7
0
def run_method(method, tree, seqs, threshold=None):
    """Time one reconstruction method on the supplied sequences.

    Args:
        method: One of "RaXML", "SNJ", "NJ", "STR + NJ", "STR + SNJ" or
            "STR + RaXML".
        tree: Reference tree; its ``taxon_namespace`` fixes the row order and
            it is the ground truth for ``reconstruct_tree.compare_trees``.
        seqs: Mapping indexable by taxon label whose values iterate over
            objects exposing ``.symbol``.
        threshold: Forwarded as ``threshhold`` (spelling used by the callee)
            to the spectral ("STR + ...") methods; ignored otherwise.

    Returns:
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is the
        wall-clock time in seconds of the reconstruction call only.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    direct_methods = ("RaXML", "SNJ", "NJ")
    deep_methods = ("STR + NJ", "STR + SNJ", "STR + RaXML")
    # Fail fast: previously an unknown name did all the preprocessing and
    # then crashed on the unbound ``start_time``.
    if method not in direct_methods + deep_methods:
        raise ValueError("Unknown method %r; expected one of: %s"
                         % (method, ", ".join(direct_methods + deep_methods)))

    # Symbol array with one row per taxon, in taxon-namespace order.
    taxons = [x.label for x in tree.taxon_namespace]
    ch_arr = np.array([[x.symbol for x in seqs[label]] for label in taxons])
    # Replace every "U" symbol with "T" before building the DNA matrix.
    ch_arr[ch_arr == "U"] = "T"

    # The same data as a dendropy DNA matrix; only the RaXML branch uses it.
    ch_dendro = dendropy.DnaCharacterMatrix()
    ch_dendro.taxon_namespace = tree.taxon_namespace
    for idx, taxon in enumerate(tree.taxon_namespace):
        ch_dendro.new_sequence(taxon, ch_arr[idx, :].tolist())

    similarity = reconstruct_tree.HKY_similarity_matrix_missing_data
    raxml_args = "-T 2 --HKY85 -c 1"

    # Reconstructors are built before the clock starts so that only the
    # reconstruction call itself is timed.
    if method == "RaXML":
        raxml_HKY = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = raxml_HKY(ch_dendro, raxml_args=raxml_args)
    elif method == "SNJ":
        snj = reconstruct_tree.SpectralNeighborJoining(similarity)
        start_time = time.time()
        tree_rec = snj(ch_arr, tree.taxon_namespace)
    elif method == "NJ":
        nj = reconstruct_tree.NeighborJoining(similarity)
        start_time = time.time()
        tree_rec = nj(ch_arr, tree.taxon_namespace)
    else:
        inner_by_method = {
            "STR + NJ": reconstruct_tree.NeighborJoining,
            "STR + SNJ": reconstruct_tree.SpectralNeighborJoining,
            "STR + RaXML": reconstruct_tree.RAxML,
        }
        spectral_method = reconstruct_tree.SpectralTreeReconstruction(
            inner_by_method[method], similarity)
        extra_kwargs = {"raxml_args": raxml_args} if method == "STR + RaXML" else {}
        start_time = time.time()
        # NOTE(review): "reonstruction" is kept exactly as the original call
        # spelled it; confirm it matches the installed reconstruct_tree API.
        tree_rec = spectral_method.deep_spectral_tree_reonstruction(
            ch_arr,
            similarity,
            taxon_namespace=tree.taxon_namespace,
            threshhold=threshold,
            min_split=5,
            **extra_kwargs)
    runtime = time.time() - start_time

    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)
    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return [method, str(threshold), runtime, RF, F1]
# Simulate DNA sequences on the reference tree; `t0` must have been set
# before this point for the "gen time" print below to be meaningful.
observations, meta = generation.simulate_sequences(N,
                                                   tree_model=reference_tree,
                                                   seq_model=hky,
                                                   mutation_rate=mutation_rate,
                                                   rng=np.random,
                                                   alphabet='DNA')
print("gen time: ", time.time() - t0)
# NOTE(review): `str` here must be a module bound elsewhere in the file (the
# builtin `str` has no `STR` attribute), so it shadows the builtin -- consider
# renaming the alias.
spectral_method = str.STR(reconstruct_tree.RAxML,
                          reconstruct_tree.HKY_similarity_matrix,
                          threshold=threshold,
                          merge_method="least_square",
                          num_gaps=1,
                          min_split=5,
                          verbose=False)

# Restart the clock so that `t` measures only the reconstruction call.
t0 = time.time()
#cProfile.run("""tree_rec = spectral_method.deep_spectral_tree_reconstruction(observations, reconstruct_tree.JC_similarity_matrix,
#                                    taxa_metadata= meta,
#                                    threshhold = 8, min_split = 3 ,verbose=False)""", filename="temp1.prof")
# # To view with a nice GUI, run: snakeviz .\temp.prof
tree_rec = spectral_method(observations, taxa_metadata=meta)
t = time.time() - t0

# Compare the reconstruction with the reference tree and report the metrics.
RF, F1 = reconstruct_tree.compare_trees(tree_rec, reference_tree)
print("Spectral: ")
print("time = ", t)

print("RF = ", RF)
print("F1% = ", F1)
print("")
# namespace1 = namespace
# T1 = utils.lopsided_tree(num_taxa=int(num_taxa/2), namespace=dendropy.TaxonNamespace(namespace[:int(num_taxa/2)]))
# T2 = utils.lopsided_tree(num_taxa=int(num_taxa/2), namespace=dendropy.TaxonNamespace(namespace[int(num_taxa/2):]))
# T1.print_plot
# TT1 = T1.extract_tree_with_taxa(namespace)
# TT2 = T2.extract_tree_with_taxa(namespace)
# tree.seed_node.set_child_nodes([TT1.seed_node,TT2.seed_node])
# create sequences for each node, not sure what is th

# Simulate sequences on `tree`; `tree`, `tree1`, `tree2`, `jc`, `N` and
# `mutation_rate` are defined outside this snippet.
observations = generation.simulate_sequences_ordered(
    N, tree_model=tree, seq_model=jc, mutation_rate=mutation_rate)

# t1 -> t2 times the similarity matrix; t2 -> t3 times the joining of the two
# subtrees.
t1 = time.time()
S = reconstruct_tree.JC_similarity_matrix(observations)
t2 = time.time()
TT = reconstruct_tree.join_trees_with_spectral_root_finding(
    S, tree1, tree2, namespace=tree.taxon_namespace)
# TT.print_plot(width = 80)
# T.print_plot(width = 80)
t3 = time.time()

# Compare the joined tree with the original tree and report metrics/timings.
RF, F1 = reconstruct_tree.compare_trees(tree, TT)
print("Spectral: ")
print("RF = ", RF)
print("F1% = ", F1)
print("")
print("time:")
print("    Build Sim.: ", t2 - t1)
print("    Find roots: ", t3 - t2)
# Compare the two merge strategies: for each one, run `num_itr` independent
# simulate-and-reconstruct rounds and collect the RF / F1 scores per strategy.
merging_method_list = ['least_square', 'angle']
RF = {'least_square': [], 'angle': []}
F1 = {'least_square': [], 'angle': []}
for merge_method in merging_method_list:
    for i in range(num_itr):
        #reference_tree = utils.balanced_binary(num_taxa)
        # A fresh reference tree and data set are generated on every round.
        reference_tree = utils.lopsided_tree(num_taxa)
        observations, taxa_meta = generation.simulate_sequences(
            N,
            tree_model=reference_tree,
            seq_model=jc,
            mutation_rate=mutation_rate)
        spectral_method = reconstruct_tree.SpectralTreeReconstruction(
            reconstruct_tree.NeighborJoining,
            reconstruct_tree.JC_similarity_matrix)
        tree_rec = spectral_method.deep_spectral_tree_reconstruction(
            observations,
            reconstruct_tree.JC_similarity_matrix,
            taxa_metadata=taxa_meta,
            threshhold=16,
            merge_method=merge_method)
        RF_i, F1_i = reconstruct_tree.compare_trees(tree_rec, reference_tree)
        RF[merge_method].append(RF_i)
        F1[merge_method].append(F1_i)

# Only the mean RF of the 'angle' strategy is reported.
# NOTE(review): `runtime` is never assigned in this snippet; unless it is
# defined elsewhere in the file this line raises NameError -- confirm.
print("Angle RF: ", np.mean(RF['angle']), "Runtime: ", runtime)

#print("LS: ",np.mean(RF['least_square']))
#print("tensor: ",np.mean(RF['tensor']))
print("")
# Benchmark NJ and RAxML over several tree sizes, `n_itr` repetitions each.
# One result row per run is collected in `df`.
df = pd.DataFrame(columns=['method', 'runtime', 'RF', 'm'])
for m in num_taxa:
    # Use a dedicated loop index.  The original `for n_itr in range(n_itr)`
    # overwrote the repetition count with the last index, so every later
    # tree size ran one repetition fewer than the previous one.
    for itr_idx in range(n_itr):
        reference_tree = utils.unrooted_pure_kingman_tree(m)
        observations, taxa_meta = generation.simulate_sequences(
            N,
            tree_model=reference_tree,
            seq_model=jc,
            mutation_rate=mutation_rate,
            alphabet="DNA")

        # NJ
        time_s = time.time()
        tree_rec = nj(observations, taxa_meta)
        runtime = time.time() - time_s
        RF, F1 = reconstruct_tree.compare_trees(reference_tree, tree_rec)
        print('NJ iteration: ', itr_idx, ' num_taxa: ', m, ' time: ', runtime)

        # NOTE(review): DataFrame.append was removed in pandas 2.0; switch to
        # pd.concat if this script must run on a current pandas.
        df = df.append({
            'method': 'nj',
            'runtime': runtime,
            'RF': RF,
            'm': m
        },
                       ignore_index=True)

        #RAXML
        time_s = time.time()
        tree_rec = raxml(observations, taxa_meta)
        runtime = time.time() - time_s
        RF, F1 = reconstruct_tree.compare_trees(reference_tree, tree_rec)
Exemplo n.º 12
0
        # run deep spectral
        print("Spectral deep")
        t_s = time.time()
        tree_spectral = spectral_method.deep_spectral_tree_reconstruction(
            observations,
            reconstruct_tree.JC_similarity_matrix,
            taxa_metadata=taxa_meta,
            threshhold=20,
            min_split=5,
            merge_method="least_square",
            verbose=False)
        runtime = time.time() - t_s
        print(runtime)
        tree_spectral.write(path="temp.tre", schema="newick")
        RF, F1 = reconstruct_tree.compare_trees(tree_spectral, reference_tree)
        df = df.append(
            {
                'method': 'STR+RAXML',
                'runtime': runtime,
                'RF': RF,
                'F1': F1,
                'n': n
            },
            ignore_index=True)

        # run raxml
        print("RAXML")
        t_s = time.time()
        tree_raxml = raxml(observations, taxa_meta)
        runtime = time.time() - t_s