def test_reverse_transcribe_preserves_all_metadata(self):
    # Reverse-transcribing an annotated RNA is expected to give a DNA that
    # carries the same metadata, positional metadata and interval metadata.
    intervals = IntervalMetadata(4)
    intervals.add([(0, 2)], metadata={'gene': 'p53'})

    def annotations():
        # Fresh dict/range objects on every call; only the interval
        # metadata object is shared between the two sequences.
        return dict(metadata={'foo': 'bar'},
                    positional_metadata={'foo': range(4)},
                    interval_metadata=intervals)

    rna = RNA('AGUU', **annotations())
    expected = DNA('AGTT', **annotations())
    observed = rna.reverse_transcribe()
    self.assertEqual(observed, expected)
def test_translate_six_frames(self):
    # Translate one RNA in all six reading frames with the genetic code
    # stored on ``self.sgc``.
    rna = RNA('AUGCUAACAUAAA')  # rc = UUUAUGUUAGCAU

    # default behavior
    default_frames = ['MLT*', 'C*HK', 'ANI', 'FMLA', 'LC*H', 'YVS']
    expected = [Protein(frame) for frame in default_frames]
    observed = list(self.sgc.translate_six_frames(rna))
    self.assertEqual(observed, expected)

    # start/stop must be respected when they are passed through
    trimmed_frames = ['MLT', 'C', 'ANI', 'MLA', 'LC', 'YVS']
    expected = [Protein(frame) for frame in trimmed_frames]
    observed = list(self.sgc.translate_six_frames(
        rna, start='optional', stop='optional'))
    self.assertEqual(observed, expected)
def test_translate_trim_to_cds(self):
    rna = RNA('UAAUUGCCUCAUUAAUAACAAUGA')

    # With start/stop honored: locate the first start codon, discard what
    # precedes it, turn the alternative start codon into M, and cut at the
    # first stop codon that follows the start codon.
    cds = Protein('MPH')
    for mode in {'require', 'optional'}:
        translated = self.sgc.translate(rna, start=mode, stop=mode)
        self.assertEqual(translated, cds)

    # With start/stop ignored all eight codons are translated.
    untrimmed = Protein('*LPH**Q*')
    translated = self.sgc.translate(rna, start='ignore', stop='ignore')
    self.assertEqual(translated, untrimmed)

    # alternative reading frame disrupts cds:
    #     AAUUGCCUCAUUAAUAACAAUGA
    #     NCLINNN
    with self.assertRaisesRegex(ValueError,
                                r'reading_frame=2.*start=\'require\''):
        self.sgc.translate(rna, reading_frame=2, start='require')
    with self.assertRaisesRegex(ValueError,
                                r'reading_frame=2.*stop=\'require\''):
        self.sgc.translate(rna, reading_frame=2, stop='require')

    shifted = Protein('NCLINNN')
    for mode in {'ignore', 'optional'}:
        translated = self.sgc.translate(rna, reading_frame=2, start=mode,
                                        stop=mode)
        self.assertEqual(translated, shifted)
def test_translate_varied_genetic_codes(self):
    # Spot check: translate one sequence with a couple of NCBI genetic
    # codes and with a custom genetic code.
    rna = RNA('AAUGAUGUGACUAUCAGAAGG')
    require_both = {'start': 'require', 'stop': 'require'}
    missing_start = r'reading_frame=1.*start=\'require\''

    # table_id=2
    expected = Protein('NDVTI**')
    observed = GeneticCode.from_ncbi(2).translate(rna)
    self.assertEqual(observed, expected)

    expected = Protein('MTI')
    observed = GeneticCode.from_ncbi(2).translate(rna, **require_both)
    self.assertEqual(observed, expected)

    # table_id=22
    expected = Protein('NDVTIRR')
    observed = GeneticCode.from_ncbi(22).translate(rna)
    self.assertEqual(observed, expected)

    with self.assertRaisesRegex(ValueError, missing_start):
        GeneticCode.from_ncbi(22).translate(rna, **require_both)

    # custom, no start codons
    custom_code = GeneticCode('MWN*' * 16, '-' * 64)
    expected = Protein('MM*MWN*')
    observed = custom_code.translate(rna)
    self.assertEqual(observed, expected)

    with self.assertRaisesRegex(ValueError, missing_start):
        custom_code.translate(rna, **require_both)
def compute_distance_matrix(msa_file, csvfile="distance_mat.csv"):
    """
    Load aligned sequences, compute a distance matrix and save it as CSV.

    Every record parsed from the FASTA alignment is wrapped in an ``RNA``
    object, the records are collected in an ``Alignment`` and the matrix is
    obtained from ``Alignment.distances()``. Per the original author's note
    the distances between the sequences use the hamming function
    (see also: scipy.spatial.distance.hamming).

    The output file holds one header row with the ids of the matrix,
    followed by the rows produced by iterating the matrix.

    @args msa_file: multiple sequence alignment in fasta format
    @type msa_file: str
    @args csvfile: output distance matrix file in csv format
    @type csvfile: str
    """
    records = [RNA(rec.seq, rec.id) for rec in SeqIO.parse(msa_file, "fasta")]
    master_dm = Alignment(records).distances()

    # The ``with`` block closes the file on exit, so no explicit close()
    # call is needed.
    with open(csvfile, "w") as output:
        writer = csv.writer(output, lineterminator="\n")
        writer.writerow(list(master_dm.ids))
        writer.writerows(master_dm)
def test_translate_reading_frame_non_empty_translation(self):
    rna = RNA('AUGGUGGAA')  # rc = UUCCACCAU

    # (reading frame, expected translation) for all six frames
    frame_cases = ((1, 'MVE'), (2, 'WW'), (3, 'GG'),
                   (-1, 'FHH'), (-2, 'ST'), (-3, 'PP'))
    for frame, protein_str in frame_cases:
        expected = Protein(protein_str)
        observed = self.sgc.translate(rna, reading_frame=frame)
        self.assertEqual(observed, expected)
def test_translate_preserves_metadata(self):
    rna = RNA('AUG', metadata={'foo': 'bar', 'baz': 42},
              positional_metadata={'foo': range(3)})
    observed = self.sgc.translate(rna)

    # metadata retained, positional metadata dropped
    expected = Protein('M', metadata={'foo': 'bar', 'baz': 42})
    self.assertEqual(observed, expected)
def test_update_ids_default_behavior(self):
    # 3 seqs: called without arguments, the expected new ids are 1, 2, 3
    relabeled = [('GAUUACA', "1"), ('UUG', "2"), ('U-----UGCC--', "3")]
    expected_sc = SequenceCollection(
        [RNA(residues, id=new_id) for residues, new_id in relabeled])
    expected_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
    result_sc, result_map = self.s2.update_ids()
    self._assert_sequence_collections_equal(result_sc, expected_sc)
    self.assertEqual(result_map, expected_map)

    # empty collection: returned unchanged together with an empty map
    result_sc, result_map = self.empty.update_ids()
    self._assert_sequence_collections_equal(result_sc, self.empty)
    self.assertEqual(result_map, {})
def test_update_ids_ids_parameter(self):
    # 3 seqs: the ids handed in through ``ids`` become the new ids
    relabeled = [('GAUUACA', "abc"), ('UUG', "def"),
                 ('U-----UGCC--', "ghi")]
    expected_sc = SequenceCollection(
        [RNA(residues, id=new_id) for residues, new_id in relabeled])
    expected_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
    result_sc, result_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
    self._assert_sequence_collections_equal(result_sc, expected_sc)
    self.assertEqual(result_map, expected_map)

    # empty collection with an empty list of ids
    result_sc, result_map = self.empty.update_ids(ids=[])
    self._assert_sequence_collections_equal(result_sc, self.empty)
    self.assertEqual(result_map, {})
def test_constructor_not_monomorphic(self):
    # Each case pairs the expected error pattern with a factory for a list
    # of sequences that mixes types. The factory is only called inside the
    # ``with`` block, so the sequences are built where the error is
    # expected.
    cases = (
        ('mixed types.*RNA.*DNA',
         lambda: [DNA(''), RNA('')]),
        ('mixed types.*float.*Protein',
         lambda: [Protein(''), Protein(''), 42.0, Protein('')]),
    )
    for pattern, make_sequences in cases:
        with six.assertRaisesRegex(self, TypeError, pattern):
            TabularMSA(make_sequences())
def test_update_ids_prefix(self):
    # 3 seqs: the expected new ids are the prefix followed by 1, 2, 3
    relabeled = [('GAUUACA', "abc1"), ('UUG', "abc2"),
                 ('U-----UGCC--', "abc3")]
    expected_sc = SequenceCollection(
        [RNA(residues, id=new_id) for residues, new_id in relabeled])
    expected_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
    result_sc, result_map = self.s2.update_ids(prefix='abc')
    self._assert_sequence_collections_equal(result_sc, expected_sc)
    self.assertEqual(result_map, expected_map)

    # empty collection: the prefix has nothing to apply to
    result_sc, result_map = self.empty.update_ids(prefix='abc')
    self._assert_sequence_collections_equal(result_sc, self.empty)
    self.assertEqual(result_map, {})
def test_translate_ncbi_table_id(self):
    # The same checks run against an RNA and the matching DNA sequence.
    for nucleotide_seq in (RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT')):
        # default
        self.assertEqual(nucleotide_seq.translate(), Protein('KFMH'))

        # table id 9 passed positionally
        self.assertEqual(nucleotide_seq.translate(9), Protein('NFMH'))
def test_translate_six_frames_preserves_metadata(self):
    rna = RNA('AUG', metadata={'foo': 'bar', 'baz': 42},
              positional_metadata={'foo': range(3)})

    # only the first two of the translations are inspected
    first_two = list(self.sgc.translate_six_frames(rna))[:2]

    # metadata retained, positional metadata dropped
    expected = [Protein(residues, metadata={'foo': 'bar', 'baz': 42})
                for residues in ('M', '')]
    self.assertEqual(first_two, expected)
def setUp(self):
    # Fixture: three DNA and three RNA sequences, four lists combining
    # them, and one SequenceCollection per list plus an empty collection.
    def tagged(constructor, residues, seq_id):
        # Build a sequence whose metadata carries only its id.
        return constructor(residues, metadata={'id': seq_id})

    self.d1 = tagged(DNA, 'GATTACA', "d1")
    self.d2 = tagged(DNA, 'TTG', "d2")
    self.d3 = tagged(DNA, 'GTATACA', "d3")
    self.r1 = tagged(RNA, 'GAUUACA', "r1")
    self.r2 = tagged(RNA, 'UUG', "r2")
    self.r3 = tagged(RNA, 'U-----UGCC--', "r3")

    self.seqs1 = [self.d1, self.d2]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2
    self.seqs4 = [self.d1, self.d3]

    self.s1, self.s2, self.s3, self.s4 = [
        SequenceCollection(seqs)
        for seqs in (self.seqs1, self.seqs2, self.seqs3, self.seqs4)]
    self.empty = SequenceCollection([])
def setUp(self):
    # Fixture for the phylip writer tests: alignments that can be written
    # (``self.objs``), the data-file paths listed alongside them
    # (``self.fps``), and alignments that cannot be written together with
    # the expected error message regexps (``self.invalid_objs``).

    # ids all same length, seqs longer than 10 chars
    dna_3_seqs = Alignment([
        DNA('..ACC-GTTGG..', metadata={'id': "d1"}),
        DNA('TTACCGGT-GGCC', metadata={'id': "d2"}),
        DNA('.-ACC-GTTGC--', metadata={'id': "d3"})])

    # id lengths from 0 to 10, with mixes of numbers, characters, and
    # spaces. sequence characters are a mix of cases and gap characters.
    # sequences are shorter than 10 chars
    variable_length_ids = Alignment([
        RNA('.-ACGU', metadata={'id': ''}),
        RNA('UGCA-.', metadata={'id': 'a'}),
        RNA('.ACGU-', metadata={'id': 'bb'}),
        RNA('ugca-.', metadata={'id': '1'}, validate=False),
        RNA('AaAaAa', metadata={'id': 'abcdefghij'}, validate=False),
        RNA('GGGGGG', metadata={'id': 'ab def42ij'})])

    # sequences with 20 chars = exactly two chunks of size 10
    two_chunks = Alignment([
        DNA('..ACC-GTTGG..AATGC.C', metadata={'id': 'foo'}),
        DNA('TTACCGGT-GGCCTA-GCAT', metadata={'id': 'bar'})])

    # single sequence with more than two chunks
    single_seq_long = Alignment([
        DNA('..ACC-GTTGG..AATGC.C----', metadata={'id': 'foo'})])

    # single sequence with only a single character (minimal writeable
    # alignment)
    single_seq_short = Alignment([DNA('-', metadata={'id': ''})])

    # alignments that can be written in phylip format
    self.objs = [dna_3_seqs, variable_length_ids, two_chunks,
                 single_seq_long, single_seq_short]

    # Materialized as a list: on Python 3 a bare map() is a one-shot
    # iterator, so the paths could only be walked once per fixture.
    self.fps = [get_data_path(name) for name in
                ['phylip_dna_3_seqs', 'phylip_variable_length_ids',
                 'phylip_two_chunks', 'phylip_single_seq_long',
                 'phylip_single_seq_short']]

    # alignments that cannot be written in phylip format, paired with their
    # expected error message regexps
    self.invalid_objs = [
        # no seqs
        (Alignment([]), 'one sequence'),

        # no positions
        (Alignment([DNA('', metadata={'id': "d1"}),
                    DNA('', metadata={'id': "d2"})]), 'one position'),

        # ids too long
        (Alignment([RNA('ACGU', metadata={'id': "foo"}),
                    RNA('UGCA', metadata={'id': "alongsequenceid"})]),
         '10.*alongsequenceid')
    ]
def test_bool(self):
    # Expected falsy: no sequences at all, or only zero-length sequences.
    no_sequences = TabularMSA([])
    self.assertFalse(no_sequences)

    one_empty_sequence = TabularMSA([RNA('')])
    self.assertFalse(one_empty_sequence)

    keyed_empty_sequences = TabularMSA(
        [RNA('', metadata={'id': 1}),
         RNA('', metadata={'id': 2})],
        key='id')
    self.assertFalse(keyed_empty_sequences)

    # Expected truthy: at least one position, even if it only holds gap
    # characters.
    single_position = TabularMSA([RNA('U')])
    self.assertTrue(single_position)

    gaps_only = TabularMSA([RNA('--'), RNA('..')])
    self.assertTrue(gaps_only)

    several_positions = TabularMSA([RNA('AUC'), RNA('GCA')])
    self.assertTrue(several_positions)
def test_update_ids_fn_parameter(self):
    def append_42(ids):
        # Callable handed to ``fn``: returns every id with '-42' appended.
        return [old_id + '-42' for old_id in ids]

    # 3 seqs: the expected new ids are the 'r*' ids with the suffix added
    relabeled = [('GAUUACA', "r1-42"), ('UUG', "r2-42"),
                 ('U-----UGCC--', "r3-42")]
    expected_sc = SequenceCollection(
        [RNA(residues, id=new_id) for residues, new_id in relabeled])
    expected_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
    result_sc, result_map = self.s2.update_ids(fn=append_42)
    self._assert_sequence_collections_equal(result_sc, expected_sc)
    self.assertEqual(result_map, expected_map)

    # empty collection
    result_sc, result_map = self.empty.update_ids(fn=append_42)
    self._assert_sequence_collections_equal(result_sc, self.empty)
    self.assertEqual(result_map, {})
def test_translate_stop_no_stop_codon(self):
    rna = RNA('GAAUCU')
    full_length = Protein('ES')

    # 'ignore' and 'optional' both give the complete translation
    for mode in {'ignore', 'optional'}:
        translated = self.sgc.translate(rna, stop=mode)
        self.assertEqual(translated, full_length)

    # stop='require' is expected to raise
    with six.assertRaisesRegex(self, ValueError,
                               'reading_frame=1.*stop=\'require\''):
        self.sgc.translate(rna, stop='require')
def test_iter_positions(self):
    # Expected columns of ``self.a2`` (built outside this method); the
    # first character of each column belongs to 'r1', the second to 'r2'.
    columns = ['UA', 'UC', 'AG', 'UU', '-U']

    # default: every position is a list of RNA objects carrying the ids
    observed = list(self.a2.iter_positions())
    expected = [
        [RNA(first, metadata={'id': 'r1'}),
         RNA(second, metadata={'id': 'r2'})]
        for first, second in columns
    ]
    self.assertEqual(observed, expected)

    # constructor=str: every position is a list of one-character strings
    observed = list(self.a2.iter_positions(constructor=str))
    expected = [list(column) for column in columns]
    self.assertEqual(observed, expected)
def test_multiple_msa_file(self):
    # Read the 'stockholm_multiple_msa' data file as RNA and compare the
    # result against a single expected MSA.
    path = get_data_path('stockholm_multiple_msa')
    observed = _stockholm_to_tabular_msa(path, constructor=RNA)

    rows = ['AAGGGUUAUUUAUAUACUUU',
            'UGCUAAGAGUGGGGAUGAUU',
            'GCCACAACCGAUUAGAUAGA',
            'UUAGAAACCGAUGGACCGAA']
    expected = TabularMSA(
        [RNA(row) for row in rows],
        metadata={'AC': 'G2134T23', 'ID': 'ARD'},
        positional_metadata={'AC_cons': list('GGGACUGGACAUCUAUUCAG')},
        index=['RTC2231', 'RTF2124', 'RTH3322', 'RTB1512'])
    self.assertEqual(observed, expected)
def test_translate_six_frames_ncbi_table_id(self):
    # rc = CAAUUU
    for nucleotide_seq in (RNA('AAAUUG'), DNA('AAATTG')):
        # default
        observed = list(nucleotide_seq.translate_six_frames())
        expected = [Protein(frame)
                    for frame in ('KL', 'N', 'I', 'QF', 'N', 'I')]
        self.assertEqual(observed, expected)

        # table id 9 passed positionally
        observed = list(nucleotide_seq.translate_six_frames(9))
        expected = [Protein(frame)
                    for frame in ('NL', 'N', 'I', 'QF', 'N', 'I')]
        self.assertEqual(observed, expected)
def test_translate_preserves_metadata(self):
    metadata = {'foo': 'bar', 'baz': 42}
    positional_metadata = {'foo': range(3)}
    rna = RNA('AUG', metadata=metadata,
              positional_metadata=positional_metadata)
    dna = DNA('ATG', metadata=metadata,
              positional_metadata=positional_metadata)

    for nucleotide_seq in (rna, dna):
        translated = nucleotide_seq.translate()

        # metadata retained, positional metadata dropped
        expected = Protein('M', metadata={'foo': 'bar', 'baz': 42})
        self.assertEqual(translated, expected)
def test_transcribe_preserves_all_metadata(self):
    # Transcribing an annotated DNA is expected to give an RNA that carries
    # the same metadata, positional metadata and interval metadata.
    intervals = IntervalMetadata(4)
    intervals.add([(0, 2)], metadata={'gene': 'p53'})

    def annotations():
        # Fresh dict/range objects on every call; only the interval
        # metadata object is shared between the two sequences.
        return dict(metadata={'foo': 'bar'},
                    positional_metadata={'foo': range(4)},
                    interval_metadata=intervals)

    expected = RNA('AGUU', **annotations())
    dna = DNA('AGTT', **annotations())
    observed = dna.transcribe()
    self.assertEqual(observed, expected)
def test_translate_six_frames_genetic_code_object(self):
    # A GeneticCode object is passed in place of a table id; with this
    # custom code every translated position is expected to be 'M'.
    all_m_code = GeneticCode('M' * 64, '-' * 64)

    for nucleotide_seq in (RNA('AAAUUG'), DNA('AAATTG')):
        observed = list(nucleotide_seq.translate_six_frames(all_m_code))
        expected = [Protein(frame)
                    for frame in ('MM', 'M', 'M', 'MM', 'M', 'M')]
        self.assertEqual(observed, expected)
def test_translate_six_frames_preserves_metadata(self):
    metadata = {'foo': 'bar', 'baz': 42}
    positional_metadata = {'foo': range(3)}
    rna = RNA('AUG', metadata=metadata,
              positional_metadata=positional_metadata)
    dna = DNA('ATG', metadata=metadata,
              positional_metadata=positional_metadata)

    for nucleotide_seq in (rna, dna):
        # only the first two of the translations are inspected
        first_two = list(nucleotide_seq.translate_six_frames())[:2]

        # metadata retained, positional metadata dropped
        expected = [Protein(residues, metadata={'foo': 'bar', 'baz': 42})
                    for residues in ('M', '')]
        self.assertEqual(first_two, expected)
def test_translate_start_no_accidental_mutation(self):
    # Handling of `start` mutates, in place, a vector derived from
    # GeneticCode._offset_table. No explicit copy is made in the current
    # code because it relies on numpy's advanced indexing, which always
    # returns a copy. Guard that assumption here in case the behavior
    # changes in the future.
    table_before = self.sgc._offset_table.copy()

    rna = RNA('CAUUUGCUGAAAUGA')
    translated = self.sgc.translate(rna, start='require')
    self.assertEqual(translated, Protein('MLK*'))

    # the table must be unchanged after the translation
    npt.assert_array_equal(self.sgc._offset_table, table_before)
def test_transcribe(self):
    # (DNA input, expected RNA output)
    cases = (
        # without changes
        ('', ''),
        ('A', 'A'),
        ('.ACGW-', '.ACGW-'),
        # with changes
        ('T', 'U'),
        ('TT', 'UU'),
        ('ATCTG', 'AUCUG'),
        ('TTTG', 'UUUG'),
    )
    for dna_str, rna_str in cases:
        self.assertEqual(DNA(dna_str).transcribe(), RNA(rna_str))
def test_kmer_frequencies(self):
    # absolute counts, k=3, no overlap
    counts_first = Counter({'GAT': 1, 'TAC': 1})
    counts_second = Counter({'TTG': 1})
    observed = self.s1.kmer_frequencies(k=3, overlap=False, relative=False)
    self.assertEqual(observed, [counts_first, counts_second])

    # relative frequencies, k=1
    freqs_first = defaultdict(float, {'A': 3 / 7., 'C': 1 / 7.,
                                      'G': 1 / 7., 'T': 2 / 7.})
    freqs_second = defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.})
    observed = self.s1.kmer_frequencies(k=1, relative=True)
    self.assertEqual(observed, [freqs_first, freqs_second])

    # relative frequencies, k=3, no overlap
    freqs_first = defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.})
    freqs_second = defaultdict(float, {'TTG': 1 / 1.})
    observed = self.s1.kmer_frequencies(k=3, overlap=False, relative=True)
    self.assertEqual(observed, [freqs_first, freqs_second])

    # an empty collection gives an empty list
    self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

    # Guard against a floating point precision bug. The tests for
    # Sequence.kmer_frequencies have more details.
    homopolymers = SequenceCollection([
        RNA('C' * 10, metadata={'id': 's1'}),
        RNA('G' * 10, metadata={'id': 's2'})
    ])
    observed = homopolymers.kmer_frequencies(1, relative=True)
    self.assertEqual(observed,
                     [defaultdict(float, {'C': 1.0}),
                      defaultdict(float, {'G': 1.0})])
def test_motif_pyrimidine_run(self):
    motif = "pyrimidine-run"

    # empty sequence: nothing to find
    empty = RNA("")
    self.assertEqual(list(empty.find_motifs(motif)), [])

    # default arguments: two separate runs are expected
    gapped = RNA("AARC--UCRG")
    hits = list(gapped.find_motifs(motif))
    self.assertEqual(hits, [slice(3, 4), slice(6, 8)])

    # min_length=3 with gap positions ignored: one run spanning the gaps
    gapped = RNA("AA-RC--UCR-G")
    hits = list(gapped.find_motifs(motif, min_length=3,
                                   ignore=gapped.gaps()))
    self.assertEqual(hits, [slice(4, 9)])
def test_motif_purine_run(self):
    motif = "purine-run"

    # empty sequence: nothing to find
    empty = RNA("")
    self.assertEqual(list(empty.find_motifs(motif)), [])

    # default arguments: two separate runs are expected
    gapped = RNA("AARC--UCRG")
    hits = list(gapped.find_motifs(motif))
    self.assertEqual(hits, [slice(0, 3), slice(8, 10)])

    # min_length=3 with gap positions ignored: one run spanning the gap
    gapped = RNA("AA-RC--UCR-G")
    hits = list(gapped.find_motifs(motif, min_length=3,
                                   ignore=gapped.gaps()))
    self.assertEqual(hits, [slice(0, 4)])
def test_translate_start_no_start_codon(self):
    # (RNA input, expected translation when `start` is not required)
    cases = (
        ('CAACAACAGCAA', 'QQQQ'),

        # non-start codon that translates to an AA that start codons also
        # map to. should catch bug if code attempts to search and trim
        # *after* translation -- this must happen *before* translation
        ('UUACAA', 'LQ'),
    )
    for rna_str, protein_str in cases:
        rna = RNA(rna_str)
        full_length = Protein(protein_str)

        for mode in {'ignore', 'optional'}:
            translated = self.sgc.translate(rna, start=mode)
            self.assertEqual(translated, full_length)

        # start='require' is expected to raise
        with six.assertRaisesRegex(self, ValueError,
                                   'reading_frame=1.*start=\'require\''):
            self.sgc.translate(rna, start='require')
def rnaAlign(seq1, seq2, gap_open_penalty, gap_extend_penalty, local=False):
    """Pairwise-align two RNA strings and summarize the result.

    Both inputs are upper-cased and wrapped in ``RNA`` before alignment.
    ``local_pairwise_align_nucleotide`` is used when ``local`` is true,
    ``global_pairwise_align_nucleotide`` otherwise; both receive the two gap
    penalties positionally.

    Returns a dict with these keys:
      'aln1', 'aln2' -- ``str()`` of the first and second aligned sequence
      'score'        -- the score returned by the aligner
      'similarity'   -- relative match frequency between the two aligned
                        sequences times 100, formatted to two decimals and
                        converted back to float
    """
    upper1 = seq1.upper()
    upper2 = seq2.upper()

    align = (local_pairwise_align_nucleotide if local
             else global_pairwise_align_nucleotide)
    # the third element returned by the aligner is not used
    alignment, score, _ = align(RNA(upper1), RNA(upper2),
                                gap_open_penalty, gap_extend_penalty)

    response = {}
    response['aln1'] = str(alignment[0])
    response['aln2'] = str(alignment[1])
    response['score'] = score
    percent = alignment[0].match_frequency(alignment[1], relative=True) * 100
    response['similarity'] = float('{:.2f}'.format(percent))
    return response
def test_degenerate_map(self):
    # degenerate character -> set of characters expected in its mapping
    expected = {
        'B': {'C', 'U', 'G'},
        'D': {'A', 'U', 'G'},
        'H': {'A', 'C', 'U'},
        'K': {'U', 'G'},
        'M': {'A', 'C'},
        'N': {'A', 'C', 'U', 'G'},
        'S': {'C', 'G'},
        'R': {'A', 'G'},
        'W': {'A', 'U'},
        'V': {'A', 'C', 'G'},
        'Y': {'C', 'U'},
    }

    # the map is read from an instance and from the class itself
    self.assertEqual(RNA('').degenerate_map, expected)
    self.assertEqual(RNA.degenerate_map, expected)
def test_reverse_transcribe_does_not_modify_input(self):
    rna = RNA('AUAU')
    converted = rna.reverse_transcribe()
    self.assertEqual(converted, DNA('ATAT'))

    # the source sequence still compares equal to a fresh copy
    self.assertEqual(rna, RNA('AUAU'))
def test_reverse_transcribe_preserves_all_metadata(self):
    # Metadata and positional metadata of the RNA are expected on the DNA
    # produced by reverse transcription.
    rna = RNA('AGUU',
              metadata={'foo': 'bar'},
              positional_metadata={'foo': range(4)})
    expected = DNA('AGTT',
                   metadata={'foo': 'bar'},
                   positional_metadata={'foo': range(4)})
    observed = rna.reverse_transcribe()
    self.assertEqual(observed, expected)