def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from an OK file, and raise an error on a badly formatted file

    Each malformed fixture must make get_next_from_file() raise
    sequences.Error before the file is exhausted. Every record read from
    the well-formed fixture must equal Fastq('ID', 'ACGTA', 'IIIII').
    '''
    bad_files = [
        'sequences_test_fail_no_AT.fq',
        'sequences_test_fail_no_seq.fq',
        'sequences_test_fail_no_plus.fq',
        'sequences_test_fail_no_qual.fq',
    ]

    for basename in bad_files:
        f_in = utils.open_file_read(os.path.join(data_dir, basename))
        try:
            fq = sequences.Fastq()
            with self.assertRaises(sequences.Error):
                while fq.get_next_from_file(f_in):
                    pass
        finally:
            # Close the handle even if the expected error was not raised,
            # so a failing assertion does not leak an open file.
            utils.close(f_in)

    # If the fixture cannot be opened, let the IOError propagate: the test
    # framework reports it with a traceback, which is more useful than
    # printing a message and calling sys.exit() from inside a test.
    f_in = open(os.path.join(data_dir, 'sequences_test_good_file.fq'))
    try:
        fq = sequences.Fastq()
        # NOTE(review): nothing is asserted if zero records are read;
        # consider also checking the number of records in the fixture.
        while fq.get_next_from_file(f_in):
            self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
    finally:
        utils.close(f_in)
def test_translate(self):
    '''Test nucleotide -> amino acid conversion works on a Fastq

    The input read is 64 codons long, grouped by the amino acid they
    encode and ending in three stop codons ('*' in the expected protein).
    '''
    nucleotides = 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA'
    nucleotide_quals = 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'
    protein = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'
    protein_quals = 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII'

    fq = sequences.Fastq('ID', nucleotides, nucleotide_quals)
    expected = sequences.Fastq('ID', protein, protein_quals)

    self.assertEqual(expected, fq.translate())
def test_to_Fastq(self):
    '''to_Fastq() should turn quality scores into characters, including out of range scores'''
    fasta = sequences.Fasta('X', 'AAAAA')

    # -1 and 0 are both expected to give '!', and 93 and 94 both give '~'.
    scores = [-1, 0, 40, 93, 94]
    expected = sequences.Fastq('X', 'AAAAA', '!!I~~')
    self.assertEqual(expected, fasta.to_Fastq(scores))

    # Qualities that do not fit the five-base sequence must be rejected.
    self.assertRaises(sequences.Error, fasta.to_Fastq, 'AAAAAAAAAAAAA')
def test_trim(self):
    '''trim() should remove the requested number of bases from the start and end'''
    untrimmed = '1234567890'

    # (bases off the start, bases off the end, what should be left)
    cases = [
        (0, 0, '1234567890'),
        (1, 0, '234567890'),
        (0, 1, '123456789'),
        (2, 2, '345678'),
    ]

    for from_start, from_end, remaining in cases:
        # trim() works in place, so each case starts from a fresh record.
        fq = sequences.Fastq('ID', untrimmed, untrimmed)
        fq.trim(from_start, from_end)
        self.assertEqual(fq, sequences.Fastq('ID', remaining, remaining))
def test_trim_Ns(self):
    '''trim_Ns() should strip Ns from both ends of a fastq sequence

    Every input below is expected to end up as 'ANNANA': runs of N (or n)
    at either end are removed, while Ns inside the sequence are kept.
    '''
    expected = sequences.Fastq('ID', 'ANNANA', '111111')

    untrimmed = [
        ('ANNANA', '111111'),
        ('NANNANA', '1111111'),
        ('NANNANAN', '11111111'),
        ('ANNANAN', '1111111'),
        ('NNNNNNANNANAN', '1111111111111'),
        ('NNANNANANn', '1111111111'),
    ]
    records = [sequences.Fastq('ID', bases, quals) for bases, quals in untrimmed]

    for record in records:
        record.trim_Ns()
        self.assertEqual(expected, record)
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains
    just one sequence, with the original sequences catted together,
    preserving their order.

    infile:  name of the input file, read with sequences.file_reader().
    outfile: name of the output file, written with utils.open_file_write().
             It is only opened after the whole input has been read.
    seqname: name given to the merged sequence (default 'union').

    The merged record is a Fastq (qualities catted together as well) when
    the first record of the input is a Fastq, and a Fasta otherwise.

    Raises IndexError if the input file contains no sequences.
    '''
    seq_parts = []
    qual_parts = []
    is_fastq = None  # decided by the first record; None means nothing read

    for seq in sequences.file_reader(infile):
        if is_fastq is None:
            is_fastq = isinstance(seq, sequences.Fastq)

        # Only the (immutable) strings are kept, so there is no need to
        # copy and hold on to every record object.
        seq_parts.append(seq.seq)
        if is_fastq:
            qual_parts.append(seq.qual)

    if is_fastq is None:
        raise IndexError("No sequences found in '" + str(infile) + "'")

    if is_fastq:
        merged = sequences.Fastq(seqname, ''.join(seq_parts), ''.join(qual_parts))
    else:
        merged = sequences.Fasta(seqname, ''.join(seq_parts))

    f = utils.open_file_write(outfile)
    try:
        print(merged, file=f)
    finally:
        # Close the output handle even if writing the record fails.
        utils.close(f)
def test_replace_interval(self):
    '''replace_interval() should swap an interval for new bases, and reject bad input

    The expected results treat the start and end positions as zero-based
    and inclusive.
    '''
    # (start, end, expected sequence) after inserting 'NEW' into 'ACGTA'
    fasta_cases = [
        (0, 0, 'NEWCGTA'),
        (4, 4, 'ACGTNEW'),
        (2, 3, 'ACNEWA'),
    ]
    for start, end, expected_seq in fasta_cases:
        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(start, end, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', expected_seq))

    # Start after end, end past the last base, and start past the last base.
    fa = sequences.Fasta('ID', 'ACGTA')
    for start, end in [(3, 2), (1, 5), (5, 10)]:
        with self.assertRaises(sequences.Error):
            fa.replace_interval(start, end, 'x')

    # (start, end, expected sequence, expected qualities) after inserting
    # 'NEW' with qualities 'III' into 'ACGTA' / 'ABCDE'
    fastq_cases = [
        (0, 0, 'NEWCGTA', 'IIIBCDE'),
        (4, 4, 'ACGTNEW', 'ABCDIII'),
        (2, 3, 'ACNEWA', 'ABIIIE'),
    ]
    for start, end, expected_seq, expected_qual in fastq_cases:
        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(start, end, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', expected_seq, expected_qual))

    # fq is still the record from the last case above. New bases and new
    # qualities of different lengths must be rejected.
    with self.assertRaises(sequences.Error):
        fq.replace_interval(1, 1, 'x', 'xx')
def test_file_reader_fastq(self):
    '''file_reader should yield the expected record for every entry of a fastq file'''
    fastq_path = os.path.join(data_dir, 'sequences_test_good_file.fq')
    records = sequences.file_reader(fastq_path)
    expected = sequences.Fastq('ID', 'ACGTA', 'IIIII')

    for record in records:
        self.assertEqual(record, expected)
def test_to_Fasta_and_qual(self):
    '''to_Fasta_and_qual() should return the Fasta plus a list of numeric quality scores'''
    fastq = sequences.Fastq('ID', 'ACGT', '>ADI')

    fasta, scores = fastq.to_Fasta_and_qual()

    self.assertEqual(fasta, sequences.Fasta('ID', 'ACGT'))
    # The expected scores equal each character's ASCII code minus 33.
    self.assertListEqual(scores, [29, 32, 35, 40])
def test_revcomp(self):
    '''revcomp() should reverse complement the bases and reverse the qualities

    The input mixes upper and lower case, and the expected result keeps
    the case of each base.
    '''
    record = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890')
    expected = sequences.Fastq('ID', 'nacgtNACGT', '0987654321')

    record.revcomp()  # works in place

    self.assertEqual(record, expected)
def test_init_length_mismatch(self):
    '''Fastq() should raise an error when the sequence and quality lengths differ'''
    # One base, two quality characters.
    self.assertRaises(sequences.Error, sequences.Fastq, 'X', 'A', 'II')
def setUp(self):
    '''Store a small Fastq record on the test case as self.fastq'''
    name, bases, quals = 'ID', 'ACGTA', 'IIIII'
    self.fastq = sequences.Fastq(name, bases, quals)
def test_expand_nucleotides(self):
    '''expand_nucleotides() should list every sequence that an ambiguous sequence stands for

    Each expansion is expected to be named '<original name>.<n>', with n
    counting from 1 in the order the expansions are returned.
    '''
    # (name, sequence, expected expanded sequences in order)
    fasta_cases = [
        ('1', 'A', ['A']),
        ('2', 'C', ['C']),
        ('3', 'G', ['G']),
        ('4', 'T', ['T']),
        ('6', 'R', ['A', 'G']),
        ('7', 'Y', ['C', 'T']),
        ('8', 'S', ['C', 'G']),
        ('9', 'W', ['A', 'T']),
        ('10', 'K', ['G', 'T']),
        ('11', 'M', ['A', 'C']),
        ('12', 'B', ['C', 'G', 'T']),
        ('13', 'D', ['A', 'G', 'T']),
        ('14', 'H', ['A', 'C', 'T']),
        ('15', 'V', ['A', 'C', 'G']),
        ('16', 'N', ['A', 'C', 'G', 'T']),
        ('17', 'ART', ['AAT', 'AGT']),
        ('18', 'ARRT', ['AAAT', 'AAGT', 'AGAT', 'AGGT']),
        ('19', 'ARTR', ['AATA', 'AATG', 'AGTA', 'AGTG']),
    ]

    tests = []
    for name, seq, expansions in fasta_cases:
        expected = [
            sequences.Fasta('{}.{}'.format(name, number), expanded)
            for number, expanded in enumerate(expansions, 1)
        ]
        tests.append((sequences.Fasta(name, seq), expected))

    # A Fastq expands the same way, with the quality string kept as it is.
    tests.append((
        sequences.Fastq('20', 'ART', 'GHI'),
        [
            sequences.Fastq('20.1', 'AAT', 'GHI'),
            sequences.Fastq('20.2', 'AGT', 'GHI'),
        ],
    ))

    for ambiguous, expected in tests:
        self.assertListEqual(ambiguous.expand_nucleotides(), expected)