예제 #1
0
    def test_global_pairwise_align_nucleotide(self):
        """Exercise ``global_pairwise_align_nucleotide`` end to end.

        Covers plain ``DNA`` inputs at two gap-open penalties, ``DNA``
        inputs carrying metadata, a ``TabularMSA`` aligned against a
        ``DNA``, and rejection of an integer argument with ``TypeError``.
        """
        # Scoring arguments shared by every alignment below; only the
        # gap-open penalty differs between cases.
        scoring = dict(gap_extend_penalty=0.5, match_score=5,
                       mismatch_score=-4)
        # Start/end coordinates expected from every case in this test.
        whole_span = [(0, 16), (0, 14)]

        # Gap-open penalty of 5
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA("GACCTTGACCAGGTACC"),
            DNA("GAACTTTGACGTAAC"),
            gap_open_penalty=5.,
            **scoring)
        expected = TabularMSA([DNA("G-ACCTTGACCAGGTACC"),
                               DNA("GAACTTTGAC---GTAAC")])
        self.assertEqual(msa, expected)
        self.assertEqual(score, 41.0)
        self.assertEqual(start_end, whole_span)

        # Gap-open penalty of 10
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA("GACCTTGACCAGGTACC"),
            DNA("GAACTTTGACGTAAC"),
            gap_open_penalty=10.,
            **scoring)
        expected = TabularMSA([DNA("-GACCTTGACCAGGTACC"),
                               DNA("GAACTTTGAC---GTAAC")])
        self.assertEqual(msa, expected)
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, whole_span)

        # DNA sequences with metadata
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
            DNA("GAACTTTGACGTAAC", metadata={'id': "s2"}),
            gap_open_penalty=10.,
            **scoring)
        expected = TabularMSA([
            DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
            DNA("GAACTTTGAC---GTAAC", metadata={'id': "s2"})
        ])
        self.assertEqual(msa, expected)
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, whole_span)

        # Align one DNA sequence and one TabularMSA, score computed manually
        two_seq_msa = TabularMSA([
            DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
            DNA("GACCATGACCAGGTACC", metadata={'id': "s2"})
        ])
        msa, score, start_end = global_pairwise_align_nucleotide(
            two_seq_msa,
            DNA("GAACTTTGACGTAAC", metadata={'id': "s3"}),
            gap_open_penalty=10.,
            **scoring)
        expected = TabularMSA([
            DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
            DNA("-GACCATGACCAGGTACC", metadata={'id': "s2"}),
            DNA("GAACTTTGAC---GTAAC", metadata={'id': "s3"})
        ])
        self.assertEqual(msa, expected)
        self.assertEqual(score, 27.5)
        self.assertEqual(start_end, whole_span)

        # TypeError on invalid input, in either argument position
        for bad_args in (42, DNA("ACGT")), (DNA("ACGT"), 42):
            self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                              *bad_args)
예제 #2
0
 def test_global_pairwise_align_invalid_type(self):
     """A plain ``Sequence`` argument must be rejected with ``TypeError``.

     The error message is expected to mention ``GrammaredSequence``,
     ``TabularMSA`` and the offending ``'Sequence'`` type, in that order.
     """
     expected_message = "GrammaredSequence.*TabularMSA.*'Sequence'"
     with self.assertRaisesRegex(TypeError, expected_message):
         grammared = DNA('ACGT')
         ungrammared = Sequence('ACGT')
         global_pairwise_align(grammared, ungrammared, 1.0, 1.0, {})
예제 #3
0
 def test_global_pairwise_align_protein_invalid_dtype(self):
     """A ``TabularMSA`` holding ``DNA`` must be rejected with ``TypeError``.

     The error message is expected to name the required Protein dtype and
     the ``'DNA'`` dtype that was actually supplied.
     """
     expected_message = "TabularMSA with Protein dtype.*dtype 'DNA'"
     with self.assertRaisesRegex(TypeError, expected_message):
         protein_msa = TabularMSA([Protein('PAW')])
         dna_msa = TabularMSA([DNA('ACGT')])
         global_pairwise_align_protein(protein_msa, dna_msa)
예제 #4
0
 def test_reverse_transcribe_does_not_modify_input(self):
     """``reverse_transcribe`` returns DNA and leaves the RNA unchanged."""
     rna = RNA('AUAU')
     transcribed = rna.reverse_transcribe()
     self.assertEqual(transcribed, DNA('ATAT'))
     # The source sequence must still compare equal to a fresh copy.
     self.assertEqual(rna, RNA('AUAU'))
예제 #5
0
# Cluster the k-mers with average linkage, then cut the tree at a distance
# threshold equal to the allowed degenerate bases divided by the k-mer length.
print("Building kmer tree using average linkage with an average number of allowed based of: {} {}".format(
    degen_base_num, time.asctime()))
Z = fastcluster.average(kmerdist)
kmer_length = final.shape[1]
maxdist = round(degen_base_num / kmer_length, 2)
clusters = fcluster(Z, maxdist, criterion='distance')

# For every cluster label, collect the positional row indices assigned to it.
myclusters = {label: [] for label in set(clusters)}
for row_position, label in enumerate(clusters):
    myclusters[label].append(row_position)

# One sub-frame of `final` per cluster, in order of first appearance of the
# cluster label in `clusters`.
clustergroups = [final.iloc[myclusters[label]] for label in Counter(clusters)]

print("Building alignments for kmer motifs. {}".format(time.asctime()))
# Group resulting clusters into de facto alignment objects: each row is
# mapped through `numeric_to_dna` and joined into one DNA sequence.
alignments = []
for members in clustergroups:
    member_seqs = [
        DNA(''.join(members.loc[row_label].map(numeric_to_dna)))
        for row_label in members.index
    ]
    alignments.append(skbio.alignment.TabularMSA(member_seqs))

# Find the representative IUPAC base of the observed positional variance.
oligos = []
for n, msa in enumerate(alignments):
    # Each alignment column is reduced to the sorted tuple of distinct
    # characters seen there, which is the lookup key into `basekey`.
    column_keys = (tuple(sorted(set(str(column))))
                   for column in msa.iter_positions())
    degenseq = ''.join(basekey[key] for key in column_keys)
    record_id = pickle_file.split(sep='_')[0] + str(n)
    oligos.append(SeqRecord(Seq(degenseq, IUPAC.ambiguous_dna),
                            id=record_id, description=''))

print("Writing {} degenerate kmers as fasta file. {}".format(
    len(oligos), time.asctime()))
fasta_path = (pickle_file.split(sep='.')[0] + str(degen_base_num)
              + '_degenerate_primers.fasta')
SeqIO.write(oligos, fasta_path, 'fasta')

예제 #6
0
 def test_translate_genetic_code_object(self):
     """``translate`` accepts a ``GeneticCode`` instance for RNA and DNA."""
     # A code built from 64 'M' characters, so four codons give 'MMMM'.
     all_m_code = GeneticCode('M' * 64, '-' * 64)
     nucleotide_seqs = (RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT'))
     for nucleotide_seq in nucleotide_seqs:
         translated = nucleotide_seq.translate(all_m_code)
         self.assertEqual(translated, Protein('MMMM'))
예제 #7
0
 def test_translate_six_frames_invalid_id(self):
     """An unknown table id (42) must raise ``ValueError`` for RNA and DNA."""
     invalid_table_id = 42
     nucleotide_seqs = (RNA('AUG'), DNA('ATG'))
     for nucleotide_seq in nucleotide_seqs:
         # The message is expected to mention both 'table_id' and the value.
         with six.assertRaisesRegex(self, ValueError, 'table_id.*42'):
             nucleotide_seq.translate_six_frames(invalid_table_id)
예제 #8
0
 def time_object_creation_validate(self):
     """Benchmark body: construct a ``DNA`` from the ``dna_bytes`` fixture.

     ``dna_bytes`` is defined outside this method. The constructor is called
     with its default arguments.
     NOTE(review): presumably validation is enabled by default, which is
     what the method name refers to -- confirm against the DNA constructor.
     """
     DNA(dna_bytes)
예제 #9
0
 def time_object_creation(self):
     """Benchmark body: construct a ``DNA`` from ``dna_bytes`` unvalidated.

     ``dna_bytes`` is defined outside this method. ``validate=False`` is
     passed explicitly so that only object construction is exercised.
     """
     DNA(dna_bytes, validate=False)
예제 #10
0
import numpy as np

# Length of each full-size benchmark array, and the number of times a
# four-character template has to be repeated to reach that length.
num_bases = 1000000
size = num_bases // 4
short_len = 100

# Ungapped DNA: character codes of the template, repeated into a uint8 array.
dna_template_bytes = list(map(ord, 'ACGT'))
dna_bytes = np.array(dna_template_bytes * size, dtype=np.uint8)
# The short fixture is a slice of the first `short_len` entries.
dna_bytes_short = dna_bytes[:short_len]

# DNA template containing the '-' and '.' characters.
dna_template_bytes_gapped = list(map(ord, 'AC-.'))
dna_bytes_gapped = np.array(dna_template_bytes_gapped * size, dtype=np.uint8)

# RNA template.
rna_template_bytes = list(map(ord, 'ACGU'))
rna_bytes = np.array(rna_template_bytes * size, dtype=np.uint8)

# Sequence objects shared by the benchmarks, built once from the byte arrays
# in the order full DNA, short DNA, gapped DNA, then RNA.
dna_seq, dna_seq_short, dna_gapped = (
    DNA(raw_bytes)
    for raw_bytes in (dna_bytes, dna_bytes_short, dna_bytes_gapped)
)
rna_seq = RNA(rna_bytes)

# Motif used by the benchmarks, plus the same motif wrapped in parentheses
# (a single capturing group when used as a regular expression).
motif_1 = "GGTGCAAGCCGGTGGAAACA"
motif_1_regex = ''.join(('(', motif_1, ')'))


def consume_iterator(iterator):
    """Exhaust *iterator*, discarding every item it yields.

    Returns ``None``. A plain ``for`` loop is used so that the per-item
    overhead added on top of the iterator itself stays minimal.
    """
    for _discarded in iterator:
        continue


class BenchmarkSuite:
예제 #11
0
    def test_majority_consensus(self):
        """Check ``majority_consensus`` on empty, uniform, clear and tied input.

        Uses the ``self.empty`` and ``self.no_positions`` fixtures, which
        are created outside this method.
        """
        # empty cases
        self.assertEqual(self.empty.majority_consensus(), Sequence(''))
        self.assertEqual(self.no_positions.majority_consensus(), RNA(''))

        # alignment where all sequences are the same
        aln = Alignment(
            [DNA('AG', metadata={'id': 'a'}),
             DNA('AG', metadata={'id': 'b'})])
        self.assertEqual(aln.majority_consensus(), DNA('AG'))

        # no ties
        d1 = DNA('TTT', metadata={'id': "d1"})
        d2 = DNA('TT-', metadata={'id': "d2"})
        d3 = DNA('TC-', metadata={'id': "d3"})
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNA('TT-'))

        # ties: either character is an acceptable consensus. assertIn is
        # used so that a failure reports the unexpected consensus and the
        # accepted candidates instead of a bare "False is not true".
        d1 = DNA('T', metadata={'id': "d1"})
        d2 = DNA('A', metadata={'id': "d2"})
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(), [DNA('T'), DNA('A')])
예제 #12
0
 def test_init_not_equal_lengths(self):
     """``Alignment`` must raise ``AlignmentError`` for these sequences.

     NOTE(review): the extra sequence presumably differs in length from
     the ``self.d1``..``self.d3`` fixtures -- they are defined elsewhere.
     """
     mismatched = [self.d1, self.d2, self.d3]
     mismatched.append(DNA('.-ACC-GTGC--', metadata={'id': "i2"}))
     with self.assertRaises(AlignmentError):
         Alignment(mismatched)