示例#1
0
 def test_identification_of_nondivergent_species(self):
     """Check that species joined only by zero-length branches are at k-mer distance 0.

     The Newick template has seven branch-length slots: 0-4 are the leaf
     branches of A-E and 5-6 are the two internal branches.  For every case,
     the listed slots get length 0 and all remaining slots get length 1.
     One gene lineage is embedded per leaf, ``coalesce(0)`` is run on the
     embedded forest, sequences are simulated along the resulting gene tree
     with the 'JC' model, and for each k in 1..5 the distance between the
     species named by the first two slots of the case is asserted to be 0.
     """
     newick_template = "((({7:s}:{0:f}, {8:s}:{1:f}):{5:f},{9:s}:{2:f}):0, ({10:s}:{3:f},{11:s}:{4:f}):{6:f}):0"
     leaf_names = ['A', 'B', 'C', 'D', 'E']
     size_param = 100
     kmer_sizes = (1, 2, 3, 4, 5)
     zero_branch_cases = [(0, 1), (0, 2, 5), (1, 2, 5), (2, 3, 6), (2, 4, 6),
                          (3, 4)]
     for zero_branches in zero_branch_cases:
         # 0 for the branches listed in this case, 1 for every other branch.
         branch_lengths = [
             0 if slot in zero_branches else 1 for slot in range(7)
         ]
         newick = newick_template.format(*(branch_lengths + leaf_names))
         species_tree = Phylo.read(StringIO(newick), "newick")
         leaves = sorted(species_tree.get_terminals(),
                         key=lambda leaf: leaf.name)
         # Exactly one fresh gene lineage per leaf, in name order.
         embedding = dict((leaf, [GeneLineage()]) for leaf in leaves)
         forest = EmbeddedGeneForest(species_tree, embedding)
         forest.coalesce(0)
         simulated = mutate_indelible(forest.genetree(),
                                      size_param,
                                      indelible_model='JC')
         # The first two slots of each case are always leaf slots, so they
         # index directly into the leaf names.
         first = leaf_names[zero_branches[0]]
         second = leaf_names[zero_branches[1]]
         for size in kmer_sizes:
             matrix = kmer_distance_matrix(simulated,
                                           size,
                                           normalized_kmer_distance,
                                           grouping=embedding)
             self.assertEqual(matrix[first, second], 0)
示例#2
0
    def test_convergence_of_average_kmer_distance(self):
        """Print a formula-based distance matrix next to a simulated average.

        Builds a tree with ``huelsenbeck_tree``, embeds one gene lineage per
        leaf, and repeats N times: coalesce with ``theta``, simulate
        sequences along the gene tree with the 'JC' model, and add the
        resulting k-mer distance matrix (k=1) to a running total.  The total
        is divided element-wise by ``finite_counts_matrix`` and printed
        together with ``edm`` (``2 * f_ttheta(d, theta, k)`` of the pairwise
        tree distances) and their difference.

        This method makes no assertions; it only prints the three matrices.

        NOTE: an earlier variant of this test drew seven random branch
        lengths for a hand-written Newick tree.  For the vector
        [0.6884381370851735, 0.06944367272556096, 0.8481260207499146,
        0.02095137803332825, 0.5434220735107143, 0.06683442096191772,
        0.7457004343685913] tree reconstruction failed with the default
        optimization method of scipy.minimize and succeeded with 'SLSQP'.
        """
        tree = huelsenbeck_tree(-3 / 4.0 * log(0.5), -3 / 4.0 * log(0.8), 5)
        k = 1
        theta = 0.01
        m = 100
        specieses = sorted(tree.get_terminals(),
                           key=lambda species: species.name)
        names = [species.name for species in specieses]
        genes = [GeneLineage() for _ in range(len(specieses))]
        base_embedding = dict(zip(specieses, [[gene] for gene in genes]))
        # Lower-triangular rows (row i holds columns 0..i, diagonal = 0.0)
        # of pairwise distances measured on the species tree.
        distances = [[tree.distance(names[i], names[j])
                      for j in range(i)] + [0.0]
                     for i in range(len(specieses))]
        dm = DistanceMatrix(names, distances)
        # Same layout, with each tree distance mapped through f_ttheta and
        # doubled; presumably the expected k-mer distance (not verified here).
        edm = DistanceMatrix(names, [[
            f_ttheta(tree.distance(names[i], names[j]), theta, k) * 2
            for j in range(i)
        ] + [0.0] for i in range(len(specieses))])
        kdm_total = zero_distance_matrix(names)
        finite_counts_matrix = zero_distance_matrix(names)
        N = 1000
        for _ in range(N):
            coalescent = EmbeddedGeneForest(tree, base_embedding)
            coalescent.coalesce(theta)
            genetree = coalescent.genetree()
            with TemporaryDirectory() as tmpdir:
                sequences = mutate_indelible(genetree,
                                             m,
                                             tmpdir,
                                             indelible_model='JC')
            kdm = kmer_distance_matrix(sequences,
                                       k,
                                       normalized_kmer_distance,
                                       grouping=base_embedding)
            kdm_total += kdm
            # NOTE(review): the count comes from the tree-distance matrix
            # `dm`, which does not change between iterations, rather than
            # from this iteration's `kdm` -- confirm this is intended.
            finite_counts_matrix += dm.isfinite()
        adm = kdm_total / finite_counts_matrix
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is valid syntax on Python 3.
        print(edm)
        print(adm)
        print(edm - adm)
示例#3
0
 def test_identity_of_nondivergent_sequences(self):
     """Check that leaves joined only by zero-length branches get equal sequences.

     The Newick template has seven branch-length slots: 0-4 are the leaf
     branches of A-E and 5-6 are the two internal branches.  For every case,
     the listed slots get length 0 and all remaining slots get length 1.
     Sequences are simulated directly along that tree with the 'JC' model.
     The sequences at the positions given by the first two slots of the case
     must be equal as strings, and for each k in 1..5 the k-mer distance
     between the correspondingly named leaves must be 0.
     """
     newick_template = "((({7:s}:{0:f}, {8:s}:{1:f}):{5:f},{9:s}:{2:f}):0, ({10:s}:{3:f},{11:s}:{4:f}):{6:f}):0"
     leaf_names = ['A', 'B', 'C', 'D', 'E']
     size_param = 100
     kmer_sizes = (1, 2, 3, 4, 5)
     zero_branch_cases = [(0, 1), (0, 2, 5), (1, 2, 5), (2, 3, 6), (2, 4, 6),
                          (3, 4)]
     for zero_branches in zero_branch_cases:
         # 0 for the branches listed in this case, 1 for every other branch.
         branch_lengths = [
             0 if slot in zero_branches else 1 for slot in range(7)
         ]
         newick = newick_template.format(*(branch_lengths + leaf_names))
         species_tree = Phylo.read(StringIO(newick), "newick")
         simulated = mutate_indelible(species_tree,
                                      size_param,
                                      indelible_model='JC')
         # The first two slots of each case are always leaf slots; they are
         # used both as positions in `simulated` and as indices into the
         # leaf names.
         left, right = zero_branches[0], zero_branches[1]
         self.assertEqual(str(simulated[left].seq),
                          str(simulated[right].seq))
         for size in kmer_sizes:
             matrix = kmer_distance_matrix(simulated, size,
                                           normalized_kmer_distance)
             self.assertEqual(matrix[leaf_names[left], leaf_names[right]], 0)
示例#4
0
 def test_basic_simulation_and_reconstruction_procedure(self):
     """Run the simulate -> k-mer distances -> reconstruct pipeline end to end.

     Steps: build a species tree with ``huelsenbeck_tree``, embed one gene
     lineage per leaf, coalesce with parameter 0.0, simulate sequences along
     the gene tree, align them with 'clustalo', compute k-mer distance
     matrices for k in 1..5, reconstruct a tree with ``reconstruct_tree``
     (from k = 1 and 3 only) and with ``reconstruct_tree2`` (from all k),
     wrap the species tree and both reconstructions in ``XTree`` and call
     ``displays`` on each reconstruction.

     The method contains no assertions; it fails only if a step raises.
     """
     from math import log

     def as_xtree(phylo_tree):
         # Label each leaf with the singleton set containing its own name.
         leaf_labels = {
             leaf: {leaf.name}
             for leaf in phylo_tree.get_terminals()
         }
         return XTree(phylo_tree, leaf_labels)

     # Species tree plus one gene lineage embedded at every leaf.
     tree = huelsenbeck_tree(-3 / 4.0 * log(0.5), -3 / 4.0 * log(0.8), 5)
     leaves = tree.get_terminals()
     lineages = [GeneLineage() for _ in leaves]
     base_embedding = dict(zip(leaves, [[lineage] for lineage in lineages]))

     # Coalescent simulation.
     forest = EmbeddedGeneForest(tree, base_embedding)
     forest.coalesce(0.0)
     gene_tree = forest.genetree()

     # Sequences along the gene tree; the alignment result is not used below.
     sequences = mutate_indelible(gene_tree, 10, 'JC', aligned=True)
     align_sequences(sequences, 'clustalo')

     # k-mer distance matrices, keyed by k.
     all_k = (1, 2, 3, 4, 5)
     nj_k = (1, 3)  # for NJ method
     matrices_by_k = {
         k_i: kmer_distance_matrix(sequences,
                                   k_i,
                                   normalized_kmer_distance,
                                   grouping=base_embedding)
         for k_i in all_k
     }

     # Tree reconstruction with both procedures.
     treehat1 = reconstruct_tree({k_i: matrices_by_k[k_i] for k_i in nj_k})
     treehat2 = reconstruct_tree2(matrices_by_k)

     reference = as_xtree(tree)
     estimate1 = as_xtree(treehat1)
     estimate2 = as_xtree(treehat2)
     # NOTE(review): the return values of displays() are discarded, so this
     # only checks that the calls complete -- confirm whether an assertion
     # was intended.
     estimate1.displays(reference)
     estimate2.displays(reference)
示例#5
0
 def test_reconstruct_tree_raxml(self):
     """Run simulation, alignment and ``reconstruct_tree_raxml`` as a smoke test.

     Builds a tree with ``huelsenbeck_tree``, embeds one gene lineage per
     leaf, coalesces with ``theta``, simulates sequences along the gene tree
     with the 'JC' model, aligns them with 'clustalo', appends each sample's
     aligned record to an initially empty ``SeqRecord`` keyed by the sample
     name, and passes the resulting records to ``reconstruct_tree_raxml``.

     The method contains no assertions; it fails only if a step raises.
     """
     tree = huelsenbeck_tree(-3 / 4.0 * log(0.5), -3 / 4.0 * log(0.8), 5)
     theta = 0.01
     m = 100
     specieses = sorted(tree.get_terminals(),
                        key=lambda species: species.name)
     genes = [GeneLineage() for _ in range(len(specieses))]
     base_embedding = dict(zip(specieses, [[gene] for gene in genes]))
     coalescent = EmbeddedGeneForest(tree, base_embedding)
     coalescent.coalesce(theta)
     genetree = coalescent.genetree()
     # Names of all gene lineages embedded at the leaves.
     sample_ids = [
         sample.name for sample in itertools.chain.from_iterable(
             base_embedding.values())
     ]
     # One empty record per sample; aligned records are appended below.
     concatenated_sequences = dict((sample_id,
                                    SeqRecord(Seq('', DNAAlphabet()),
                                              id=sample_id,
                                              name=sample_id,
                                              description=sample_id))
                                   for sample_id in sample_ids)
     with TemporaryDirectory() as tmpdir:
         sequences = mutate_indelible(genetree,
                                      m,
                                      tmpdir,
                                      indelible_model='JC')
         alignment = align_sequences(sequences,
                                     'clustalo',
                                     tmpdir,
                                     keep_fasta=True)
         # Index the alignment once instead of rebuilding the lookup
         # dictionary for every sample.
         aligned_by_id = SeqIO.to_dict(alignment)
         for sample_id in sample_ids:
             concatenated_sequences[sample_id] += aligned_by_id[sample_id]
         # Must run inside the `with` block: tmpdir is passed to the call.
         reconstruct_tree_raxml(concatenated_sequences.values(),
                                base_embedding, tmpdir)
示例#6
0
    def test_kmer_distance_storage_and_retrieval(self):
        """Round-trip k-mer distance matrices through a temporary SQLite database.

        Creates a SQLite database file inside a temporary directory, creates
        the schema from ``Base.metadata``, simulates sequences on a random
        five-leaf tree, stores a ``Simulation`` row, and for every k in 1..5
        stores the k-mer distance matrix, commits, reads it back through
        ``sim.kmer_distance_matrices`` and asserts that the retrieved matrix
        equals the computed one.

        SQLAlchemy engine logging is redirected to ``/tmp/sqla.log`` for the
        duration of the test.  The log handler, the logger's ``propagate``
        flag, the session and the engine are all released or restored in a
        ``finally`` block, before the temporary directory that holds the
        database file is removed.
        """
        import logging
        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker

        with TemporaryDirectory() as tmpdir:
            sqla_logger = logging.getLogger('sqlalchemy.engine.base.Engine')
            previous_propagate = sqla_logger.propagate
            log_handler = logging.FileHandler('/tmp/sqla.log')
            sqla_logger.propagate = False
            sqla_logger.addHandler(log_handler)

            engine = None
            session = None
            try:
                engine = create_engine('sqlite:///{:s}'.format(
                    os.path.join(tmpdir, 'db.sql')),
                                       echo=True,
                                       convert_unicode=True)
                Session = sessionmaker(bind=engine)
                session = Session()
                Base.metadata.create_all(engine)

                labels = ['A', 'B', 'C', 'D', 'E']
                m = 100
                k = (1, 2, 3, 4, 5)
                indelible_model = 'JC'
                nr_genes = 100
                kmer_formula = 'ARS2015'
                alignment_method = None
                # Seven random branch lengths: five leaf branches and two
                # internal branches of the Newick template below.
                a = [np.random.uniform() for _ in range(7)]
                treedata = "((({7:s}:{0:f}, {8:s}:{1:f}):{5:f},{9:s}:{2:f}):0, ({10:s}:{3:f},{11:s}:{4:f}):{6:f}):0".format(
                    *(a + labels))
                handle = StringIO(treedata)
                tree = Phylo.read(handle, "newick")
                specieses = sorted(tree.get_terminals(),
                                   key=lambda species: species.name)
                genes = [GeneLineage() for _ in range(len(specieses))]
                base_embedding = dict(
                    zip(specieses, [[gene] for gene in genes]))
                coalescent = EmbeddedGeneForest(tree, base_embedding)
                coalescent.coalesce(0)
                genetree = coalescent.genetree()
                sequences = mutate_indelible(genetree,
                                             m,
                                             indelible_model=indelible_model)
                # Keep the Newick text up to and including its last ")",
                # dropping whatever the formatter emits after it.
                tree_newick = ")".join(
                    tree.format('newick').split(")")[:-1]) + ")"
                sim = Simulation(tree=tree_newick,
                                 theta=0,
                                 indelible_model=indelible_model,
                                 genes=nr_genes,
                                 m=m)
                session.add(sim)
                for k_i in k:
                    dm = kmer_distance_matrix(sequences,
                                              k_i,
                                              normalized_kmer_distance,
                                              grouping=base_embedding)
                    kdm = kmer_distance_matrix_from_dm(dm, sim, kmer_formula,
                                                       alignment_method, k_i)
                    session.add(kdm)
                    session.commit()
                    # Read the stored matrix back through the relationship.
                    # The loop variable is deliberately not named `kdm`:
                    # on Python 2 a list-comprehension variable rebinds the
                    # name in the enclosing scope.
                    kdm2 = [
                        stored for stored in sim.kmer_distance_matrices
                        if stored.k == k_i
                    ][0]
                    self.assertEqual(dm.matrix, kdm2.to_dm().matrix)
            finally:
                # Release database resources while the SQLite file still
                # exists, then undo the changes made to the shared logger.
                if session is not None:
                    session.close()
                if engine is not None:
                    engine.dispose()
                sqla_logger.removeHandler(log_handler)
                log_handler.close()
                sqla_logger.propagate = previous_propagate