Exemplo n.º 1
0
    def test_replacing_hgt(self):
        """Check the observable tree of an HGT-only, always-replacing simulation.

        A species tree with ``num_species`` leaves is simulated, then a gene
        tree with duplication and loss rates of zero, an HGT rate of 1.0,
        ``replace_prob=1.0`` and extinction prohibited per species.  The
        observable gene tree must have exactly ``num_species`` leaves
        carrying ``num_species`` distinct colors.
        """
        num_species = 20

        species_tree = te.simulate_species_tree(num_species,
                                                model='innovation')

        # True gene tree: transfers are the only event type with a
        # non-zero rate, and replace_prob is set to 1.0.
        true_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            hgt_rate=1.0,
            replace_prob=1.0,
            dupl_rate=0.0,
            loss_rate=0.0,
            prohibit_extinction='per_species',
        )

        # Observable gene tree derived from the true one.
        observable = te.observable_tree(true_gene_tree)

        observed_leaves = list(observable.leaves())
        observed_colors = set()
        for leaf in observed_leaves:
            observed_colors.add(leaf.color)

        one_gene_per_color = (len(observed_colors) == num_species
                              and len(observed_leaves) == num_species)
        self.assertTrue(one_gene_per_color)
Exemplo n.º 2
0
    def test_rs_edges(self):
        """Check that the rs transfer edges are a subset of the true ones.

        Simulates a species tree (size 10) and a gene tree with
        duplications, losses and HGT, then compares the edge sets returned
        by ``analysis.true_transfer_edges`` and
        ``analysis.rs_transfer_edges`` on the observable gene tree.
        """
        species_tree = te.simulate_species_tree(10)
        event_rates = dict(dupl_rate=1.0, loss_rate=0.5, hgt_rate=0.5)
        true_gene_tree = te.simulate_dated_gene_tree(species_tree,
                                                     **event_rates)
        observable = te.observable_tree(true_gene_tree)

        true_edges = analysis.true_transfer_edges(observable)
        rs_edges = analysis.rs_transfer_edges(observable, species_tree)

        # Every edge reported by the rs method must also be a true edge.
        self.assertTrue(true_edges.issuperset(rs_edges))
Exemplo n.º 3
0
def generate_solutions_unique_species(n, i_p=0.5, d_p=0.5):
    """Solve 100 perturbed LDT graphs with ``n`` nodes via ``LDTEditor``.

    Species/gene tree pairs are simulated repeatedly.  Whenever the LDT
    graph of a pair has exactly ``n`` nodes, the graph is perturbed, handed
    to ``LDTEditor`` and, if the returned graph passes the three validity
    checks, saved through ``solver._save_ILP_data``.

    Parameters
    ----------
    n : int
        Required number of nodes of the LDT graph.
    i_p : float, optional
        First argument passed to ``InvestigateGraph.perturb_graph``.
    d_p : float, optional
        Second argument passed to ``InvestigateGraph.perturb_graph``.

    Notes
    -----
    Only graphs with exactly ``n`` nodes advance the counter, so the loop
    keeps simulating until 100 such graphs have been processed (valid
    solution or not).
    """
    processed = 0
    while processed < 100:
        species_tree = te.simulate_species_tree(10, model='innovation')
        true_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            dupl_rate=0.5,
            loss_rate=0.5,
            hgt_rate=0.5,
            prohibit_extinction="per_family",
            replace_prob=0.0,
        )
        observable = te.observable_tree(true_gene_tree)
        ldt = ldt_graph(observable, species_tree)

        # Graphs of any other size are discarded without being counted.
        if len(ldt.nodes()) != n:
            continue

        investigator = InvestigateGraph(ldt)
        investigator.perturb_graph(i_p, d_p)
        perturbed = investigator._G_perturbed

        solver = LDTEditor(perturbed)
        solver.build_model()
        solver.optimize(time_limit=None)

        solution, _ = solver.get_solution()

        # All three checks are evaluated (no short-circuiting).
        solution_is_valid = all([
            is_properly_colored(solution),
            is_cograph(solution),
            is_compatible(solution),
        ])

        edit_dist = gt.symmetric_diff(perturbed, solution)
        print("Runtime: {}".format(solver.get_solve_time()))

        if not solution_is_valid:
            print("No solution found!")
        else:
            print("Saving data...")
            solver._save_ILP_data(
                perturbed,
                solution,
                solver.get_solve_time(),
                edit_dist,
                only_add=False,
                only_delete=False,
                filename="{}nodes/LDTEdit_exact_solution".format(n))

        processed += 1
Exemplo n.º 4
0
def generate_trees(n=100,
                   m=10,
                   model='innovation',
                   dupl_rate=0.5,
                   loss_rate=0.5,
                   hgt_rate=0.5,
                   prohibit_extinction="per_family",
                   replace_prob=0.0,
                   size=10):
    """Simulate and store ``n`` species/gene tree pairs of a given LDT size.

    Tree pairs are simulated until ``n`` of them yield an LDT graph with
    exactly ``size`` nodes.  Each accepted pair is serialized into
    ``exact_solutions/trees/<size>trees/`` with a running ID in the file
    name.

    Parameters
    ----------
    n : int
        Number of tree pairs to store.
    m : int
        Size argument passed to ``te.simulate_species_tree``.
    model : str
        Species tree model passed to ``te.simulate_species_tree``.
    dupl_rate, loss_rate, hgt_rate : float
        Event rates passed to ``te.simulate_dated_gene_tree``.
    prohibit_extinction : str
        Passed through to ``te.simulate_dated_gene_tree``.
    replace_prob : float
        Passed through to ``te.simulate_dated_gene_tree``.
    size : int
        Required number of nodes of the LDT graph.
    """
    tree_dir = 'exact_solutions/trees/{}trees'.format(size)

    # A fresh folder starts at ID 0; an existing one continues at whatever
    # find_next_ID reports for it.
    if os.path.exists(tree_dir):
        ID = find_next_ID('exact_solutions/trees/{}trees/'.format(size))
    else:
        os.makedirs(tree_dir)
        ID = 0

    stored = 0
    while stored < n:
        species_tree = te.simulate_species_tree(m, model=model)
        true_gene_tree = te.simulate_dated_gene_tree(
            species_tree,
            dupl_rate=dupl_rate,
            loss_rate=loss_rate,
            hgt_rate=hgt_rate,
            prohibit_extinction=prohibit_extinction,
            replace_prob=replace_prob)

        observable = te.observable_tree(true_gene_tree)
        ldt = ldt_graph(observable, species_tree)

        # Pairs whose LDT graph has the wrong size are thrown away.
        if len(ldt.nodes()) != size:
            continue

        species_path = 'exact_solutions/trees/{}trees/species_{}_{}_{}.json'.format(
            size, m, model, ID)
        gene_path = 'exact_solutions/trees/{}trees/gene_{}_{}_{}_{}_{}_{}.json'.format(
            size, dupl_rate, loss_rate, hgt_rate, prohibit_extinction,
            replace_prob, ID)
        species_tree.serialize(species_path)
        true_gene_tree.serialize(gene_path)

        ID += 1
        stored += 1
Exemplo n.º 5
0
    def test_ldt_fitch(self):
        """Check the LDT graph against the undirected Fitch graph.

        For a simulated scenario the test asserts that the LDT graph is a
        subgraph of the undirected Fitch graph (per ``gt.is_subgraph``) and
        that ``to_cotree`` returns a truthy result for the LDT graph.
        """
        species_tree = te.simulate_species_tree(20, model='innovation')

        # True gene tree with duplications, losses and transfers.
        event_rates = dict(dupl_rate=1.0, loss_rate=0.5, hgt_rate=0.2)
        true_gene_tree = te.simulate_dated_gene_tree(species_tree,
                                                     **event_rates)

        # Observable gene tree derived from the true one.
        observable = te.observable_tree(true_gene_tree)

        # LDT graph and Fitch graph extracted from the observable scenario.
        ldt = analysis.ldt_graph(observable, species_tree)
        rs_edges = analysis.rs_transfer_edges(observable, species_tree)
        fitch = analysis.undirected_fitch(observable, rs_edges)

        cotree = to_cotree(ldt)

        ldt_within_fitch = gt.is_subgraph(ldt, fitch)
        self.assertTrue(ldt_within_fitch and cotree)
Exemplo n.º 6
0
    def simulate_gene_trees(self, N, **kwargs):
        """Simulate ``N`` gene families along the species tree ``self.S``.

        Stores the simulated trees in ``self.true_gene_trees`` and their
        observable counterparts in ``self.observable_gene_trees``.  An
        existing ``self.sequence_dicts`` is cleared, and if ``self.outdir``
        is set, every true gene tree is serialized as JSON.

        Parameters
        ----------
        N : int
            Number of gene families to simulate.
        **kwargs
            Forwarded unchanged to ``te.simulate_gene_trees``.
        """
        self.number_of_families = N

        simulated = te.simulate_gene_trees(self.S, N=N, **kwargs)
        # For N == 1 the result is wrapped so that the attribute is always
        # a list.
        self.true_gene_trees = [simulated] if N == 1 else simulated

        self.observable_gene_trees = list(
            map(te.observable_tree, self.true_gene_trees))

        # Sequences from an earlier run no longer match the new trees.
        if hasattr(self, 'sequence_dicts'):
            self.sequence_dicts.clear()

        if not self.outdir:
            return

        for index in range(N):
            target = self._path('true_gene_trees',
                                'gene_tree{}.json'.format(index))
            self.true_gene_trees[index].serialize(target, mode='json')
Exemplo n.º 7
0
# -*- coding: utf-8 -*-
    
import tralda.tools.GraphTools as gt

import asymmetree.treeevolve as te
from asymmetree.analysis import (undirected_fitch,
                                 rs_transfer_edges,
                                 below_equal_above,
                                 ldt_graph,
                                 RsScenarioConstructor,)
from asymmetree.tools.PhyloTreeTools import (to_newick,)

# Example script: simulate a species tree and a gene tree, print them in
# Newick format, derive several relation graphs from the observable gene
# tree and run the RsScenarioConstructor on the LDT graph.

# Species tree; 10 is the requested size (presumably the number of leaves).
S = te.simulate_species_tree(10)
# True gene tree simulated along S with duplication, loss and HGT events.
TGT = te.simulate_dated_gene_tree(S, dupl_rate=1.0, loss_rate=0.5,
                                  hgt_rate=0.5)
# Observable gene tree derived from the true gene tree.
OGT = te.observable_tree(TGT)

# Species tree twice (default output, then without distances and inner
# labels), followed by the observable gene tree.
print('--- S ---\n', to_newick(S))
print(to_newick(S, distance=False, label_inner=False))
print('--- OGT ---\n', to_newick(OGT))

# NOTE(review): the result is unpacked as (ldt, above, equal) although the
# function is called below_equal_above -- confirm the actual return order,
# otherwise the '=' and '>' counts printed below are swapped.
ldt, above, equal = below_equal_above(OGT, S)
# Undirected Fitch graph built from the rs transfer edges; not used again in
# the visible part of this script.
fitch = undirected_fitch(OGT, rs_transfer_edges(OGT, S))
# order() is reported as the number of genes; n * (n - 1) / 2 is the number
# of unordered gene pairs.
n = ldt.order()
print('Genes:', n, 'Total relations:', int(n * (n-1) / 2))
# size() of each relation graph, labelled '<', '=' and '>'.
print('< {}\n= {}\n> {}'.format(ldt.size(), equal.size(), above.size()))

# Run the scenario construction on the LDT graph; `result` is tested for
# truthiness by the code that follows.
rs_scen_constr = RsScenarioConstructor(ldt)
result = rs_scen_constr.run()

if result:
Exemplo n.º 8
0
# Build loop: simulate one species tree and one true gene tree per parameter
# row of `parameter_Df` and serialize both below `wk_dir / '01_Data'`.
# `parameter_Df` and `wk_dir` are defined outside this section.

# Bare expression: shows the number of parameter rows in an interactive
# session and has no effect when run as a script.
len(parameter_Df.index)

# NOTE(review): range(len(...) - 1) leaves out the last row -- confirm that
# this is intended.
# NOTE(review): .loc[ind, ...] is label-based, so this presumably relies on
# the index labels being 0, 1, 2, ... -- verify against how parameter_Df is
# built.
for ind in range(len(parameter_Df.index)-1):
    # species tree of type 'PhyloTree'
    s = te.simulate_species_tree(int(parameter_Df.loc[ind, 'num_of_leaves']), 
                                 model = parameter_Df.loc[ind, 'model'],
                                 non_binary_prob = parameter_Df.loc[ind, 'non_binary_prob'],
                                 planted = parameter_Df.loc[ind, 'planted'],
                                 remove_extinct = parameter_Df.loc[ind, 'remove_extinct'],
                                 rescale_to_height = parameter_Df.loc[ind, 'rescale_to_height']
                                 )
    
    # true gene tree (contains losses) of type 'PhyloTree'
    tgt = te.simulate_dated_gene_tree(s,
                                      dupl_rate = parameter_Df.loc[ind, 'dupl_rate'],
                                      loss_rate = parameter_Df.loc[ind, 'loss_rate'],
                                      hgt_rate = parameter_Df.loc[ind, 'hgt_rate'],
                                      dupl_polytomy = 0.0,
                                      prohibit_extinction= parameter_Df.loc[ind, 'prohibit_extinction'],
                                      replace_prob = parameter_Df.loc[ind, 'replace_prob']
                                      )
    
    
    # serialization
    # The file name is the row's 'ID' value plus a suffix; the concatenation
    # happens before str() is applied, so 'ID' presumably already holds
    # strings -- TODO confirm.
    s.serialize(wk_dir / '01_Data' / str(parameter_Df.loc[ind, 'ID'] + '_species_tree.pickle'))
    tgt.serialize(wk_dir / '01_Data' / str(parameter_Df.loc[ind, 'ID'] + '_gene_tree.pickle'))
    print('Simulating Tree :', ind)
# Runs once after the loop, so only the gene tree of the last processed row
# is turned into an observable tree.
ogt = te.observable_tree(tgt)
Exemplo n.º 9
0
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark cograph and triples editing on stored tree pairs.

    Loads the species/gene tree files found below ``<filename>/<n>trees``,
    perturbs an ``InvestigateGraph`` for every tree pair and runs cograph
    editing plus three variants of triples editing on it.  Only the
    frequencies reported by ``get_freq`` for the four ``InvestigateGraph``
    objects are returned; the per-run return values of the editing
    functions are not used.

    Parameters
    ----------
    n : int
        Selects the ``<n>trees`` sub-folder of ``filename``.
    p1, p2 : float
        Arguments passed to ``InvestigateGraph.perturb_graph_terminate``.
    filename : str, optional
        Folder that contains the ``<n>trees`` sub-folder.

    Returns
    -------
    list
        ``[cograph_freq, triples1_freq, triples2_freq, triples3_freq]`` for
        cograph editing and triples editing with insertions and deletions,
        deletions only and insertions only.

    Raises
    ------
    ValueError
        If no file starting with ``'species'`` is found.
    """
    name = filename + '/{}trees'.format(n)

    tree_files = []
    for _, _, files in os.walk(name):
        tree_files.extend(files)

    # Species and gene files are paired by position.  os.walk yields files in
    # arbitrary order, so both lists are sorted to make the pairing
    # deterministic.  This assumes that the two naming schemes sort in the
    # same order (e.g. a shared trailing ID) -- TODO confirm.
    species_trees = sorted(f for f in tree_files if f.startswith('species'))
    gene_trees = sorted(f for f in tree_files if not f.startswith('species'))

    if not species_trees:
        raise ValueError(
            "no species tree files found in '{}'".format(name))

    IG1 = IG2 = IG3 = IG4 = None
    for i, species_file in enumerate(species_trees):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the InvestigateGraph objects are created from the
        # first tree pair only; the LDT graphs of all later pairs are
        # computed but never passed on -- confirm that this is intended.
        if not IG1:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)
            IG4 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)
        IG4.perturb_graph_terminate(p1, p2)

        cograph_editing(IG1)                  # cograph editing
        triples_editing(IG2, n=100)           # insertions and deletions
        triples_editing(IG3, deletion=True)   # deletions only
        triples_editing(IG4, insertion=True)  # insertions only

    _, cograph_freq, _ = get_freq(IG1)
    triples1_freq, _, _ = get_freq(IG2)
    triples2_freq, _, _ = get_freq(IG3)
    triples3_freq, _, _ = get_freq(IG4)

    return [cograph_freq, triples1_freq, triples2_freq, triples3_freq]
Exemplo n.º 10
0
def benchmark_fromTrees(n, p1, p2, filename='exact_solutions/trees'):
    """Benchmark three variants of LDT editing on stored tree pairs.

    Loads the species/gene tree files found below ``<filename>/<n>trees``,
    perturbs an ``InvestigateGraph`` for every tree pair and runs
    ``LDT_editing`` in three configurations on it.  Only the frequencies
    reported by ``get_freq`` for the three ``InvestigateGraph`` objects are
    returned; the per-run return values of ``LDT_editing`` are not used.

    Parameters
    ----------
    n : int
        Selects the ``<n>trees`` sub-folder of ``filename``.
    p1, p2 : float
        Arguments passed to ``InvestigateGraph.perturb_graph_terminate``.
    filename : str, optional
        Folder that contains the ``<n>trees`` sub-folder.

    Returns
    -------
    list
        ``[ldt1_freq, ldt2_freq, ldt3_freq]`` for LDT editing with
        ``n=100``, with ``deletion=True`` and with ``insertion=True``.

    Raises
    ------
    ValueError
        If no file starting with ``'species'`` is found.
    """
    name = filename + '/{}trees'.format(n)

    tree_files = []
    for _, _, files in os.walk(name):
        tree_files.extend(files)

    # Species and gene files are paired by position.  os.walk yields files in
    # arbitrary order, so both lists are sorted to make the pairing
    # deterministic.  This assumes that the two naming schemes sort in the
    # same order (e.g. a shared trailing ID) -- TODO confirm.
    species_trees = sorted(f for f in tree_files if f.startswith('species'))
    gene_trees = sorted(f for f in tree_files if not f.startswith('species'))

    if not species_trees:
        raise ValueError(
            "no species tree files found in '{}'".format(name))

    IG1 = IG2 = IG3 = None
    for i, species_file in enumerate(species_trees):
        print("Tree pair {}".format(i))
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        # NOTE(review): the InvestigateGraph objects are created from the
        # first tree pair only; the LDT graphs of all later pairs are
        # computed but never passed on -- confirm that this is intended.
        if not IG1:
            IG1 = InvestigateGraph(ldt)
            IG2 = copy.deepcopy(IG1)
            IG3 = copy.deepcopy(IG1)

        IG1.perturb_graph_terminate(p1, p2)
        IG2.perturb_graph_terminate(p1, p2)
        IG3.perturb_graph_terminate(p1, p2)

        LDT_editing(IG1, n=100)           # insertions and deletions, n = 100
        LDT_editing(IG2, deletion=True)   # deletions only
        LDT_editing(IG3, insertion=True)  # insertions only

    _, _, ldt1_freq = get_freq(IG1)
    _, _, ldt2_freq = get_freq(IG2)
    _, _, ldt3_freq = get_freq(IG3)

    return [ldt1_freq, ldt2_freq, ldt3_freq]
Exemplo n.º 11
0
def generate_solutions_fromTrees(n, filename):
    """Compute and store exact LDT editing solutions for stored tree pairs.

    For every species/gene tree pair found below ``<filename>/<n>trees`` and
    every combination of the perturbation probabilities 0.15, 0.3 and 0.5,
    the LDT graph is perturbed and solved three times with ``LDTEditor``:
    without restriction, with ``only_delete=True`` and with
    ``only_add=True``.  Each solution that passes the three validity checks
    is stored through ``_save_ILP_data``.

    Parameters
    ----------
    n : int
        Selects the ``<n>trees`` sub-folder of ``filename`` and is part of
        the output folder name.
    filename : str
        Folder that contains the ``<n>trees`` sub-folder.
    """
    name = filename + '/{}trees'.format(n)
    probabilities = [0.15, 0.30, 0.50]
    folderName = 'exact_solutions/{}_{}_{}nodes{}/'

    # One entry per solver run: (extra LDTEditor keyword arguments,
    # folder suffix, message on success, message on failure).
    variants = [
        ({}, '',
         "Saving data...",
         "No solution found!"),
        ({'only_delete': True}, '_deletion',
         "Saving data (deletion)...",
         "No solution found for deletion only!"),
        ({'only_add': True}, '_insertion',
         "Saving data (insertion)...",
         "No solution found for insertion only!"),
    ]

    def _label(probability):
        # Digits of the probability without the dot, padded with one '0' if
        # shorter than three characters (0.15 -> '015', 0.3 -> '030').
        digits = str(probability).replace('.', '')
        if len(digits) < 3:
            digits = digits + '0'
        return digits

    tree_files = []
    for _, _, files in os.walk(name):
        tree_files.extend(files)

    # Species and gene files are paired by position.  os.walk yields files in
    # arbitrary order, so both lists are sorted to make the pairing
    # deterministic.  This assumes that the two naming schemes sort in the
    # same order (e.g. a shared trailing ID) -- TODO confirm.
    species_trees = sorted(f for f in tree_files if f.startswith('species'))
    gene_trees = sorted(f for f in tree_files if not f.startswith('species'))

    ID = 0
    for i, species_file in enumerate(species_trees):
        S = PhyloTree.load(name + '/{}'.format(species_file))
        TGT = PhyloTree.load(name + '/{}'.format(gene_trees[i]))
        OGT = te.observable_tree(TGT)
        ldt = ldt_graph(OGT, S)

        for p1 in probabilities:
            for p2 in probabilities:
                p_i = _label(p1)
                p_d = _label(p2)

                IG = InvestigateGraph(ldt)
                if not IG.perturb_graph_terminate(p1, p2):
                    print("failed")
                    continue
                perturbed = IG._G_perturbed

                # Solve all variants first, then validate, then save -- the
                # same order of operations as three hand-written runs.
                solvers = []
                for editor_kwargs, _, _, _ in variants:
                    solver = LDTEditor(perturbed, **editor_kwargs)
                    solver.build_model()
                    solver.optimize(time_limit=None)
                    solvers.append(solver)

                solutions = [solver.get_solution() for solver in solvers]

                # All three checks are evaluated for every solution.
                valid = [
                    all([is_properly_colored(graph),
                         is_cograph(graph),
                         is_compatible(graph)])
                    for graph, _ in solutions
                ]

                for variant, solver, solution, is_valid in zip(
                        variants, solvers, solutions, valid):
                    editor_kwargs, suffix, saving_msg, failure_msg = variant
                    sol_graph, sol_distance = solution
                    if not is_valid:
                        print(failure_msg)
                        continue
                    print(saving_msg)
                    solver._save_ILP_data(
                        perturbed,
                        sol_graph,
                        solver.get_solve_time(),
                        sol_distance,
                        i_p=p1,
                        d_p=p2,
                        only_add=editor_kwargs.get('only_add', False),
                        only_delete=editor_kwargs.get('only_delete', False),
                        saveFolder=folderName.format(p_i, p_d, n, suffix),
                        ID=ID)

                # The ID advances once per successfully perturbed graph, so
                # the three variants of one graph share an ID.
                ID += 1