def test_global_pairwise_align_nucleotide(self):
    """Exercise ``global_pairwise_align_nucleotide`` on DNA inputs.

    Covers two gap-open penalties, sequences carrying metadata, a
    ``TabularMSA`` as the first operand, and rejection of non-sequence
    arguments with ``TypeError``.
    """
    # Scoring parameters shared by every alignment below; only the
    # gap-open penalty differs between cases.
    scoring = dict(gap_extend_penalty=0.5, match_score=5, mismatch_score=-4)
    expected_start_end = [(0, 16), (0, 14)]

    # Gap-open penalty of 5
    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"),
        DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=5.,
        **scoring)
    expected_msa = TabularMSA([
        DNA("G-ACCTTGACCAGGTACC"),
        DNA("GAACTTTGAC---GTAAC"),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 41.0)
    self.assertEqual(start_end, expected_start_end)

    # Gap-open penalty of 10
    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"),
        DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=10.,
        **scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC"),
        DNA("GAACTTTGAC---GTAAC"),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, expected_start_end)

    # DNA sequences with metadata
    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGACGTAAC", metadata={'id': "s2"}),
        gap_open_penalty=10.,
        **scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s2"}),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, expected_start_end)

    # Align one DNA sequence and one TabularMSA, score computed manually
    two_seq_msa = TabularMSA([
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GACCATGACCAGGTACC", metadata={'id': "s2"}),
    ])
    msa, score, start_end = global_pairwise_align_nucleotide(
        two_seq_msa,
        DNA("GAACTTTGACGTAAC", metadata={'id': "s3"}),
        gap_open_penalty=10.,
        **scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("-GACCATGACCAGGTACC", metadata={'id': "s2"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s3"}),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 27.5)
    self.assertEqual(start_end, expected_start_end)

    # TypeError on invalid input, in either argument position
    with self.assertRaises(TypeError):
        global_pairwise_align_nucleotide(42, DNA("ACGT"))
    with self.assertRaises(TypeError):
        global_pairwise_align_nucleotide(DNA("ACGT"), 42)
def test_global_pairwise_align_invalid_type(self):
    """Passing a plain ``Sequence`` operand must raise ``TypeError``."""
    # The message is expected to mention the accepted types and then the
    # offending type name.
    expected_message = ("GrammaredSequence.*"
                        "TabularMSA.*'Sequence'")
    with self.assertRaisesRegex(TypeError, expected_message):
        global_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
def test_global_pairwise_align_protein_invalid_dtype(self):
    """A DNA-typed ``TabularMSA`` operand must raise ``TypeError``."""
    protein_msa = TabularMSA([Protein('PAW')])
    dna_msa = TabularMSA([DNA('ACGT')])
    expected_message = ("TabularMSA with Protein dtype.*dtype "
                        "'DNA'")
    with self.assertRaisesRegex(TypeError, expected_message):
        global_pairwise_align_protein(protein_msa, dna_msa)
def test_reverse_transcribe_does_not_modify_input(self):
    """Reverse transcription returns DNA and leaves the RNA untouched."""
    rna = RNA('AUAU')
    transcribed = rna.reverse_transcribe()
    self.assertEqual(transcribed, DNA('ATAT'))
    # The source sequence must still compare equal to a fresh copy.
    self.assertEqual(rna, RNA('AUAU'))
# --- Cluster kmers by average linkage and cut the tree by distance -------
print("Building kmer tree using average linkage with an average number of allowed based of: {} {}".format(degen_base_num, time.asctime()))
Z = fastcluster.average(kmerdist)
kmer_length = final.shape[1]
# Distance threshold: allowed degenerate bases per kmer position, rounded
# to two decimals.
maxdist = round(degen_base_num / kmer_length, 2)
clusters = fcluster(Z, maxdist, criterion='distance')

# Map every cluster label to the positional indices of its members.
myclusters = {key: [] for key in set(clusters)}
for index, clust in enumerate(clusters):
    myclusters[clust].append(index)

# One slice of `final` per cluster, in order of first appearance of the
# cluster label.
clustergroups = [final.iloc[myclusters[amp]] for amp in Counter(clusters)]

print("Building alignments for kmer motifs. {}".format(time.asctime()))
# Group resulting clusters into de facto alignment objects: each row of a
# cluster is decoded through `numeric_to_dna` into a DNA sequence.
alignments = [
    skbio.alignment.TabularMSA(
        [DNA(''.join(c.loc[i].map(numeric_to_dna))) for i in c.index])
    for c in clustergroups
]

# Find the representative IUPAC base for the observed variance at each
# alignment position, producing one degenerate oligo per alignment.
oligos = []
for n, a in enumerate(alignments):
    degenseq = ''.join(
        basekey[tuple(sorted(set(str(x))))] for x in a.iter_positions())
    record_id = pickle_file.split(sep='_')[0] + str(n)
    oligos.append(
        SeqRecord(Seq(degenseq, IUPAC.ambiguous_dna),
                  id=record_id,
                  description=''))

print("Writing {} degenerate kmers as fasta file. {}".format(len(oligos), time.asctime()))
# NOTE(review): split('.')[0] cuts at the FIRST dot, so a path such as
# './x.pkl' yields an empty stem -- confirm inputs are bare file names.
fasta_path = (pickle_file.split(sep='.')[0] + str(degen_base_num)
              + '_degenerate_primers.fasta')
SeqIO.write(oligos, fasta_path, 'fasta')
def test_translate_genetic_code_object(self):
    """``translate`` accepts a ``GeneticCode`` instance for RNA and DNA."""
    # A code built from 64 'M' amino acids and 64 '-' start markers.
    all_m_code = GeneticCode('M' * 64, '-' * 64)
    expected = Protein('MMMM')
    for nucleotides in (RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT')):
        translated = nucleotides.translate(all_m_code)
        self.assertEqual(translated, expected)
def test_translate_six_frames_invalid_id(self):
    """An unknown table id makes ``translate_six_frames`` raise."""
    bad_table_id = 42
    for nucleotides in (RNA('AUG'), DNA('ATG')):
        # The error message must name the parameter and the bad value.
        with six.assertRaisesRegex(self, ValueError, 'table_id.*42'):
            nucleotides.translate_six_frames(bad_table_id)
def time_object_creation_validate(self):
    """Construct a ``DNA`` from ``dna_bytes`` without passing ``validate``."""
    # Only the construction matters here; the object is discarded.
    _ = DNA(dna_bytes)
def time_object_creation(self):
    """Construct a ``DNA`` from ``dna_bytes`` with ``validate=False``."""
    # Only the construction matters here; the object is discarded.
    _ = DNA(dna_bytes, validate=False)
import numpy as np

# Length of each full-size fixture array built below.
num_bases = 1000000
# Number of times a 4-character template is repeated to reach `num_bases`.
size = int(num_bases / 4)
# Length of the truncated DNA fixture.
short_len = 100

# Character codes of the repeating 4-character templates.
dna_template_bytes = [ord(x) for x in 'ACGT']
dna_template_bytes_gapped = [ord(x) for x in 'AC-.']
rna_template_bytes = [ord(x) for x in 'ACGU']

# uint8 arrays made by repeating each template `size` times.
dna_bytes = np.array(dna_template_bytes * size, dtype=np.uint8)
dna_bytes_short = dna_bytes[:short_len]
dna_bytes_gapped = np.array(dna_template_bytes_gapped * size, dtype=np.uint8)
rna_bytes = np.array(rna_template_bytes * size, dtype=np.uint8)

# Sequence objects built once at import time from the arrays above.
dna_seq = DNA(dna_bytes)
dna_seq_short = DNA(dna_bytes_short)
dna_gapped = DNA(dna_bytes_gapped)
rna_seq = RNA(rna_bytes)

# A fixed motif and the same motif wrapped in a regex capture group.
motif_1 = "GGTGCAAGCCGGTGGAAACA"
motif_1_regex = '(' + motif_1 + ')'


def consume_iterator(iterator):
    """Exhaust *iterator*, discarding every item it yields."""
    for _ in iterator:
        pass


# NOTE(review): the body of this class lies beyond the visible source.
class BenchmarkSuite:
def test_majority_consensus(self):
    """Check ``majority_consensus`` on empty, uniform, clear and tied input."""
    # empty cases
    self.assertEqual(self.empty.majority_consensus(), Sequence(''))
    self.assertEqual(self.no_positions.majority_consensus(), RNA(''))

    # alignment where all sequences are the same
    identical = Alignment([
        DNA('AG', metadata={'id': 'a'}),
        DNA('AG', metadata={'id': 'b'}),
    ])
    self.assertEqual(identical.majority_consensus(), DNA('AG'))

    # no ties
    no_ties = Alignment([
        DNA('TTT', metadata={'id': "d1"}),
        DNA('TT-', metadata={'id': "d2"}),
        DNA('TC-', metadata={'id': "d3"}),
    ])
    self.assertEqual(no_ties.majority_consensus(), DNA('TT-'))

    # ties: either candidate is an acceptable consensus
    tied = Alignment([
        DNA('T', metadata={'id': "d1"}),
        DNA('A', metadata={'id': "d2"}),
    ])
    candidates = [DNA('T'), DNA('A')]
    self.assertIn(tied.majority_consensus(), candidates)
def test_init_not_equal_lengths(self):
    """``Alignment`` must raise ``AlignmentError`` for this sequence list."""
    # The three fixture sequences plus one extra 12-character sequence.
    extra = DNA('.-ACC-GTGC--', metadata={'id': "i2"})
    invalid_seqs = [self.d1, self.d2, self.d3, extra]
    with self.assertRaises(AlignmentError):
        Alignment(invalid_seqs)