def test_file_to_dict(self):
    '''file_to_dict should load each record of the test file under its ID'''
    loaded = {}
    tasks.file_to_dict(os.path.join(data_dir, 'sequences_test.fa'), loaded)

    # The fixture is expected to hold four records named 1..4, all 'ACGTA'
    wanted = {str(n): sequences.Fasta(str(n), 'ACGTA') for n in range(1, 5)}

    self.assertSequenceEqual(loaded.keys(), wanted.keys())
    for key in map(str, range(1, 5)):
        self.assertEqual(loaded[key].id, wanted[key].id)
        self.assertEqual(loaded[key].seq, wanted[key].seq)
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Split an interleaved file of paired sequences into two files.

    Records are consumed two at a time from infile: the first of each pair
    is written to outfile_1 and the second to outfile_2. With fasta_out=True
    every record is written as a sequences.Fasta built from its id and seq,
    otherwise the record is printed as it was read.

    Raises Error if the input ends after the first record of a pair.'''
    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)

    def write_record(record, handle):
        if fasta_out:
            print(sequences.Fasta(record.id, record.seq), file=handle)
        else:
            print(record, file=handle)

    try:
        for seq in seq_reader:
            write_record(seq, f_1)
            try:
                # Use the object handed back by the reader for the mate, so
                # this does not rely on the reader updating 'seq' in place.
                mate = next(seq_reader)
            except StopIteration:
                raise Error('Error getting mate for sequence. Cannot continue')
            write_record(mate, f_2)
    finally:
        # Always release both output handles, including on error
        utils.close(f_1)
        utils.close(f_2)
def test_get_next_from_embl_file(self):
    '''Embl.get_next_from_file should yield each record of the EMBL test file in order'''
    handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
    record = sequences.Embl()
    records_read = 0

    while record.get_next_from_file(handle):
        records_read += 1
        wanted = sequences.Fasta('seq' + str(records_read), expected_embl[records_read - 1])
        self.assertEqual(record, wanted)

    utils.close(handle)
def test_trim_Ns(self):
    '''trim_Ns() should strip leading/trailing N or n and leave internal ones alone'''
    trimmed = sequences.Fasta('ID', 'ANNANA')
    untrimmed_bases = [
        'ANNANA',
        'NANNANA',
        'NANNANAN',
        'ANNANAN',
        'NNNNNNANNANAN',
        'NNANNANANn',
    ]

    for bases in untrimmed_bases:
        candidate = sequences.Fasta('ID', bases)
        candidate.trim_Ns()
        self.assertEqual(trimmed, candidate)
def test_file_reader_phylip(self):
    '''file_reader should parse interleaved and sequential phylip files'''
    phylip_names = (
        'sequences_test_phylip.interleaved',
        'sequences_test_phylip.interleaved2',
        'sequences_test_phylip.sequential',
    )
    wanted = [
        sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
        sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
        sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA'),
    ]

    # All three layouts must give the same records
    for name in phylip_names:
        reader = sequences.file_reader(os.path.join(data_dir, name))
        for index, seq in enumerate(reader):
            self.assertEqual(wanted[index], seq)

    # Files written by seaview have a slightly different first line,
    # so one of those is checked as well
    seaview_wanted = [
        sequences.Fasta('seq1', 96 * 'G' + 'T'),
        sequences.Fasta('seq2', 94 * 'A' + 'G'),
    ]
    seaview_path = os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview')
    for index, seq in enumerate(sequences.file_reader(seaview_path)):
        print(seq)
        self.assertEqual(seaview_wanted[index], seq)
def test_is_complete_orf(self):
    '''is_complete_orf() should give the expected answer for each test sequence'''
    cases = [
        ('TTT', False),
        ('TTTTAA', True),
        ('TTTTAATAA', False),
        ('TTGTAA', True),
        ('TTTAAC', True),
        ('TGA', False),
        ('TGAA', False),
    ]

    for bases, wanted in cases:
        fa = sequences.Fasta('ID', bases)
        self.assertEqual(fa.is_complete_orf(), wanted)
def get_next_from_file(self, f):
    '''Read the next record from the open caf file handle f into this object.

    The object is reset via __init__() first. The record must start with a
    "DNA : <id>" line followed by the sequence lines, a blank line, a
    "BaseQuality : ..." line, one line of whitespace-separated integer
    qualities and another blank line. After that, tag lines are read up to
    the next blank line or the end of the file; Insert_size, Ligation_no,
    Clone and "Clipping QUAL" tags are stored on the object.

    Returns None when there are no more records, True when one was read.
    Raises Error when the file does not have the expected layout.'''
    self.__init__()
    line = f.readline()
    if not line:
        return None

    # Skip any blank lines in front of the record
    while line == '\n':
        line = f.readline()

    # Only blank lines were left before the end of the file: nothing to read
    if not line:
        return None

    if not line.startswith('DNA : '):
        raise Error("Error reading caf file. Expected line starting with 'DNA : ...'")

    self.id = line.rstrip().split()[2]

    line = f.readline()
    seq = []

    # Stop at end of file as well as at the blank line. A truncated file
    # would otherwise loop forever here, because readline() keeps returning
    # ''. The BaseQuality check below then reports the truncation.
    while line not in ('', '\n'):
        seq.append(line.rstrip())
        line = f.readline()

    self.seq = sequences.Fasta(self.id, ''.join(seq))

    line = f.readline()
    if not line.startswith('BaseQuality : '):
        raise Error("Error reading caf file. Expected line starting with 'BaseQuality : ...'")

    quals = [int(x) for x in f.readline().rstrip().split()]
    self.seq = self.seq.to_Fastq(quals)

    # An explicit check rather than an assert, which is removed under python -O
    line = f.readline()
    if line != '\n':
        raise Error('Error reading caf file. Expected blank line after base qualities')
    line = f.readline()

    while line not in ['', '\n']:
        a = line.rstrip().split()
        if a[0] == 'Insert_size':
            self.insert_min, self.insert_max = int(a[1]), int(a[2])
        elif a[0] == 'Ligation_no':
            self.ligation = a[1]
        elif a[0] == 'Clone':
            self.clone = a[1]
        elif a[0] == 'Clipping' and a[1] == 'QUAL':
            # Values from the file are reduced by one before being stored
            self.clip_start, self.clip_end = int(a[2]) - 1, int(a[3]) - 1

        line = f.readline()

    return True
def test_get_next_from_gbk_file(self):
    '''Embl.get_next_from_file should also read the records of a genbank file'''
    wanted_bases = [
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa',
    ]
    handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
    record = sequences.Embl()
    records_read = 0

    while record.get_next_from_file(handle):
        records_read += 1
        wanted = sequences.Fasta('NAME' + str(records_read), wanted_bases[records_read - 1])
        self.assertEqual(record, wanted)

    utils.close(handle)
def test_looks_like_gene(self):
    '''Test looks_like_gene, including its dependence on sequences.genetic_code'''
    tests = [
        (sequences.Fasta('ID', 'TTT'), False),
        (sequences.Fasta('ID', 'TTGTAA'), True),
        (sequences.Fasta('ID', 'ttgTAA'), True),
        (sequences.Fasta('ID', 'TTGTTTTAA'), True),
        (sequences.Fasta('ID', 'TTGTAATTTTAA'), False),
        (sequences.Fasta('ID', 'TTGTTTTGAA'), False),
    ]

    for seq, expected in tests:
        self.assertEqual(seq.looks_like_gene(), expected)

    # genetic_code is module-level state shared by every test. Put back the
    # value found on entry even when an assertion fails, so a failure here
    # cannot change the outcome of later tests.
    original_genetic_code = sequences.genetic_code
    try:
        sequences.genetic_code = 1
        self.assertFalse(sequences.Fasta('ID', 'ATTCAGTAA').looks_like_gene())
        sequences.genetic_code = 11
        self.assertTrue(sequences.Fasta('ID', 'ATTCAGTAA').looks_like_gene())
    finally:
        sequences.genetic_code = original_genetic_code
def test_gc_content(self):
    """gc_content() should give the expected value as a fraction and as a percentage"""
    cases = [
        ('cgCG', 1.0),
        ('tTaA', 0.0),
        ('GCAT', 0.5),
        ('GCATNN', 0.5),
        ('GCATNNS', 0.6),
        ('GCATNNSK', 0.5),
    ]

    for bases, fraction in cases:
        fa = sequences.Fasta('ID', bases)
        self.assertAlmostEqual(fa.gc_content(), fraction)
        self.assertAlmostEqual(fa.gc_content(as_decimal=False), fraction * 100)
def scaffolds_to_contigs(infile, outfile, number_contigs=False):
    '''Writes the contigs of every scaffold in infile to outfile, as given by
    contig_coords() of each scaffold. By default each contig is named
    scaffold_id.start.end using 1-based coordinates. With number_contigs=True
    the contigs of a scaffold are named scaffold_id.1, scaffold_id.2, etc instead.'''
    reader = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for scaffold in reader:
        for number, coords in enumerate(scaffold.contig_coords(), start=1):
            if number_contigs:
                contig_name = scaffold.id + '.' + str(number)
            else:
                contig_name = '.'.join([scaffold.id, str(coords.start + 1), str(coords.end + 1)])

            # coords.end is inclusive, hence the + 1 in the slice
            contig_seq = scaffold[coords.start:coords.end + 1]
            print(sequences.Fasta(contig_name, contig_seq), file=out_handle)

    utils.close(out_handle)
def test_file_reader_embl(self):
    '''file_reader should read a good embl file and raise Error on bad ones'''
    good_path = os.path.join(data_dir, 'sequences_test.embl')
    for index, seq in enumerate(sequences.file_reader(good_path)):
        wanted = sequences.Fasta('seq' + str(index + 1), expected_embl[index])
        self.assertEqual(seq, wanted)

    for bad_name in ('sequences_test.embl.bad', 'sequences_test.embl.bad2'):
        bad_path = os.path.join(data_dir, bad_name)
        with self.assertRaises(sequences.Error):
            # The error may be raised on opening or while iterating
            for seq in sequences.file_reader(bad_path):
                pass
def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
    '''Writes a multi fasta file of random ACGT sequences, each of the given length.
    Sequences are named prefix + number, counting up from first_number. With
    name_by_letters=True they are named prefix + A, B, ..., Z instead, going
    back to A after Z. The random number generator is seeded with seed.'''
    random.seed(a=seed)
    out_handle = utils.open_file_write(outfile)
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    for index in range(contigs):
        if name_by_letters:
            # wraps round to 'A' again after 'Z'
            label = alphabet[index % len(alphabet)]
        else:
            label = str(index + first_number)

        bases = ''.join(random.choice('ACGT') for _ in range(length))
        print(sequences.Fasta(prefix + label, bases), file=out_handle)

    utils.close(out_handle)
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
    '''Writes every sequence of infile to outfile in fasta format.

    sequences.Fasta.line_length is set to line_length while writing and put
    back to its previous value afterwards. With
    strip_after_first_whitespace=True, strip_after_first_whitespace() is
    called on each sequence before it is written. Fastq records are written
    as a Fasta made from their id and seq.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if isinstance(seq, sequences.Fastq):
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # line_length is a class attribute, so it must be restored even when
        # reading or writing fails, otherwise later output is affected
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)
def test_extend(self):
    '''extend() should report how many bases it added at each end and update the contig'''
    ctg = contig.Contig(sequences.Fasta('ID', 'ACCGT'))

    def check_extend(extend_args, wanted_added, wanted_bases):
        self.assertEqual(ctg.extend(*extend_args), wanted_added)
        self.assertEqual(ctg.fa, sequences.Fasta('ID', wanted_bases))

    # nothing to extend with yet
    check_extend((5, 2, 100), (0, 0), 'ACCGT')

    ctg.add_left_kmer('GT')
    check_extend((1, 2, 100), (2, 0), 'GTACCGT')
    check_extend((1, 2, 100), (0, 0), 'GTACCGT')

    ctg.add_right_kmer('TG')
    check_extend((1, 2, 100), (0, 2), 'GTACCGTTG')
    check_extend((1, 2, 100), (0, 0), 'GTACCGTTG')

    # both ends at once
    ctg.add_left_kmer('AG')
    ctg.add_right_kmer('GC')
    check_extend((1, 2, 100), (2, 2), 'AGGTACCGTTGGC')
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Reads all sequences from a multi fasta or fastq file and writes one
    sequence called seqname to outfile, made by joining the input sequences
    in the order they were read. The output is fastq if the first input
    record is a Fastq (qualities are joined too), otherwise fasta.'''
    # each record is copied before the reader moves on to the next one
    records = [copy.copy(record) for record in sequences.file_reader(infile)]
    joined_bases = ''.join([record.seq for record in records])

    if type(records[0]) == sequences.Fastq:
        joined_quals = ''.join([record.qual for record in records])
        del records[:]
        merged = sequences.Fastq(seqname, joined_bases, joined_quals)
    else:
        merged = sequences.Fasta(seqname, joined_bases)
        del records[:]

    out_handle = utils.open_file_write(outfile)
    print(merged, file=out_handle)
    utils.close(out_handle)
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    '''Writes every sequence of infile to outfile in fasta format.

    sequences.Fasta.line_length is set to line_length while writing and put
    back to its previous value afterwards. With
    strip_after_first_whitespace=True, strip_after_first_whitespace() is
    called on each sequence before it is written. Fastq records are written
    as a Fasta made from their id and seq.

    With check_unique=True the sequence names are counted (after any
    stripping). Once the whole output has been written, each repeated name
    is reported on stderr and Error is raised if there were any.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    used_names = {}  # sequence name => number of times seen

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if check_unique:
                used_names[seq.id] = used_names.get(seq.id, 0) + 1

            if isinstance(seq, sequences.Fastq):
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # line_length is a class attribute, so it must be restored even when
        # reading or writing fails, otherwise later output is affected
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
def test_contig_coords(self):
    '''contig_coords() should return the interval of every contig in a sequence'''
    Interval = intervals.Interval
    cases = [
        ('ACGT', [Interval(0, 3)]),
        ('NACGT', [Interval(1, 4)]),
        ('NNACGT', [Interval(2, 5)]),
        ('ACGTN', [Interval(0, 3)]),
        ('ACGTNN', [Interval(0, 3)]),
        ('NANNCGT', [Interval(1, 1), Interval(4, 6)]),
        ('ACNNNGTNA', [Interval(0, 1), Interval(5, 6), Interval(8, 8)]),
        ('ANNCGTNNAAAAA', [Interval(0, 0), Interval(3, 5), Interval(8, 12)]),
    ]

    for bases, wanted in cases:
        found = sequences.Fasta('ID', bases).contig_coords()
        self.assertListEqual(wanted, found)
def test_strip_illumina_suffix(self):
    '''strip_illumina_suffix() should only remove a /1 or /2 at the end of the ID'''
    names_before = ['name/1', 'name/2', 'name', 'name/1/2', 'name/2/1', 'name/3']
    names_after = ['name', 'name', 'name', 'name/1', 'name/2', 'name/3']

    seqs = [sequences.Fasta(name, 'A') for name in names_before]
    for seq in seqs:
        seq.strip_illumina_suffix()

    for seq, wanted in zip(seqs, names_after):
        self.assertEqual(seq.id, wanted)
def run(self):
    '''Produce a filtered fasta file.

    Works inside self.working_directory and changes back to the starting
    directory afterwards. If there are more contigs than ids to skip, nucmer
    is run on the fasta file against itself, and each contig not in
    self.ids_to_skip is discarded when it is either shorter than
    self.cutoff_contig_length, or has a non-self hit to a contig not already
    marked as contained that covers at least self.percent_match percent of
    its length. The remaining contigs are written with tasks.filter.
    Otherwise all contigs are written to the output file unchanged. A
    summary is written in both cases.'''
    original_dir = os.getcwd()
    os.chdir(self.working_directory)

    try:
        small_contigs = set()
        contained_contigs = set()

        if len(self.contigs) > len(self.ids_to_skip):
            alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_nucmer_filename(), min_percent_id=self.percent_match, run_promer=False)

            for contig_id in self.contigs.keys():
                if contig_id in self.ids_to_skip:
                    continue

                if len(self.contigs[contig_id]) < self.cutoff_contig_length:
                    small_contigs.add(contig_id)
                    continue

                for algn in alignments:
                    if (not algn.is_self_hit()) \
                      and algn.qry_name == contig_id \
                      and algn.ref_name != algn.qry_name \
                      and algn.ref_name not in contained_contigs \
                      and (algn.hit_length_qry / algn.qry_length) * 100 >= self.percent_match:
                        contained_contigs.add(contig_id)
                        # One qualifying hit is enough. Further hits could
                        # only add the same id again.
                        break

            discard = small_contigs.union(contained_contigs)
            ids_file = utils.write_ids_to_file(discard, "contig.ids.discard")
            tasks.filter(self.fasta_file, self.output_file, ids_file=ids_file, invert=True)

            if not self.debug:
                utils.delete(ids_file)
                utils.delete(self._build_nucmer_filename())
        else:
            output_fw = fastaqutils.open_file_write(self.output_file)
            for contig_id in self.contigs:
                print(sequences.Fasta(contig_id, self.contigs[contig_id]), file=output_fw)
            fastaqutils.close(output_fw)

        self._write_summary(small_contigs, contained_contigs)
    finally:
        # Do not leave the process in the working directory if anything fails
        os.chdir(original_dir)
def test_replace_interval(self):
    '''replace_interval() should replace the right bases (and qualities) or raise Error'''
    # Fasta: replace at the start, at the end and in the middle
    fasta_cases = [
        ((0, 0), 'NEWCGTA'),
        ((4, 4), 'ACGTNEW'),
        ((2, 3), 'ACNEWA'),
    ]
    for (start, end), wanted_bases in fasta_cases:
        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(start, end, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', wanted_bases))

    # Coordinates that are rejected
    fa = sequences.Fasta('ID', 'ACGTA')
    for start, end in [(3, 2), (1, 5), (5, 10)]:
        with self.assertRaises(sequences.Error):
            fa.replace_interval(start, end, 'x')

    # Fastq: same positions, qualities replaced as well
    fastq_cases = [
        ((0, 0), 'NEWCGTA', 'IIIBCDE'),
        ((4, 4), 'ACGTNEW', 'ABCDIII'),
        ((2, 3), 'ACNEWA', 'ABIIIE'),
    ]
    for (start, end), wanted_bases, wanted_quals in fastq_cases:
        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(start, end, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', wanted_bases, wanted_quals))

    # New bases and new qualities of different lengths, tried on the last
    # fastq from the loop above
    with self.assertRaises(sequences.Error):
        fq.replace_interval(1, 1, 'x', 'xx')
def test_add_insertions(self):
    '''add_insertions() in test mode with skip=4, window=0 should add an N after every 4 bases'''
    seq = sequences.Fasta('X', 'acgtacgtacgt')
    wanted = sequences.Fasta('X', 'acgtNacgtNacgt')

    seq.add_insertions(skip=4, window=0, test=True)

    self.assertEqual(seq, wanted)
def run(description):
    '''Command line entry point: simulate perfect paired end reads.

    Parses the command line, then for each input sequence that is at least
    mean insert + 4 * insert std deviation long, picks insert sizes from a
    normal distribution and fragment positions uniformly, and writes the
    read pairs as interleaved fastq, with the second read of each pair
    reverse complemented. All qualities are 'I'. With --fragments the
    sequence of each whole fragment is written to a second file. With --no_n
    a pair with an N or n in either read is thrown away and another one is
    picked instead.

    The description argument is not used: the parser description is set here.'''
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard deviation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    pair_counter = 1
    fout_frags = utils.open_file_write(options.fragments) if options.fragments else None

    try:
        for ref in seq_reader:
            # check if current seq is long enough
            if len(ref) < options.mean_insert + 4 * options.insert_std:
                print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
                continue

            # work out how many reads to simulate
            read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

            # it's possible that we pick the same fragment twice, in which case the
            # reads would get the same name. So remember the frag coords
            used_fragments = {}  # (middle_position, length) => count

            # do the simulation: pick insert size from normal distribution, and
            # position in genome from uniform distribution
            x = 0
            while x < read_pairs:
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
                while isize > len(ref) or isize < options.readlength:
                    isize = int(random.normalvariate(options.mean_insert, options.insert_std))
                middle_pos = random.randint(ceil(0.5 * isize), floor(len(ref) - 0.5 * isize))
                read_start1 = int(middle_pos - ceil(0.5 * isize))
                read_start2 = read_start1 + isize - options.readlength

                bases1 = ref.seq[read_start1:read_start1 + options.readlength]
                bases2 = ref.seq[read_start2:read_start2 + options.readlength]

                # Reject the pair before recording its fragment, so that a
                # rejected pair does not push up the '.dup.N' number given
                # to a later pair from the same fragment
                if options.no_n and ('n' in bases1 or 'N' in bases1 or 'n' in bases2 or 'N' in bases2):
                    continue

                readname = ':'.join([ref.id, str(pair_counter), str(read_start1 + 1), str(read_start2 + 1)])

                fragment = (middle_pos, isize)
                if fragment in used_fragments:
                    used_fragments[fragment] += 1
                    readname += '.dup.' + str(used_fragments[fragment])
                else:
                    used_fragments[fragment] = 1

                read1 = sequences.Fastq(readname + '/1', bases1, 'I' * options.readlength)
                read2 = sequences.Fastq(readname + '/2', bases2, 'I' * options.readlength)

                read2.revcomp()

                print(read1, file=fout)
                print(read2, file=fout)

                if fout_frags is not None:
                    frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                    print(frag, file=fout_frags)

                pair_counter += 1
                x += 1
    finally:
        # Close the outputs even if reading or simulating fails
        utils.close(fout)
        if fout_frags is not None:
            utils.close(fout_frags)
def setUp(self):
    '''Make a Fasta with ID 'ID' and sequence 'ACGTA', stored as self.fasta'''
    fixture = sequences.Fasta('ID', 'ACGTA')
    self.fasta = fixture
def test_make_into_gene_fasta(self):
    '''make_into_gene() should return (gene, strand, frame), or None when no gene is found'''
    print('sequences.genetic_code', sequences.genetic_code)

    # (input bases, None) or (input bases, (gene bases, strand, frame))
    cases = [
        ('T', None),
        ('TT', None),
        ('TTT', None),
        ('TTG', None),
        ('TAA', None),
        ('TTGAAATAA', ('TTGAAATAA', '+', 0)),
        ('TTGAAATAT', None),
        ('TTGTAA', ('TTGTAA', '+', 0)),
        ('TTGTAAA', ('TTGTAA', '+', 0)),
        ('TTGTAAAA', ('TTGTAA', '+', 0)),
        ('TTGTAAAAA', None),
        ('ATTGTAA', ('TTGTAA', '+', 1)),
        ('ATTGTAAA', ('TTGTAA', '+', 1)),
        ('ATTGTAAAA', ('TTGTAA', '+', 1)),
        ('ATTGTAAAAA', None),
        ('AATTGTAA', ('TTGTAA', '+', 2)),
        ('AATTGTAAA', ('TTGTAA', '+', 2)),
        ('AATTGTAAAA', ('TTGTAA', '+', 2)),
        ('AATTGTAAAAA', None),
        ('TTACAA', ('TTGTAA', '-', 0)),
        ('ATTACAA', ('TTGTAA', '-', 0)),
        ('AATTACAA', ('TTGTAA', '-', 0)),
        ('AAATTACAA', None),
        ('TTACAAA', ('TTGTAA', '-', 1)),
        ('ATTACAAA', ('TTGTAA', '-', 1)),
        ('AATTACAAA', ('TTGTAA', '-', 1)),
        ('AAATTACAAA', None),
        ('TTACAAAA', ('TTGTAA', '-', 2)),
        ('ATTACAAAA', ('TTGTAA', '-', 2)),
        ('AATTACAAAA', ('TTGTAA', '-', 2)),
        ('AAATTACAAAA', None),
    ]

    for bases, outcome in cases:
        if outcome is None:
            wanted = None
        else:
            gene_bases, strand, frame = outcome
            wanted = (sequences.Fasta('ID', gene_bases), strand, frame)

        self.assertEqual(sequences.Fasta('ID', bases).make_into_gene(), wanted)
def test_to_Fasta_and_qual(self):
    '''to_Fasta_and_qual() should return the Fasta and the quality scores as a list of ints'''
    fastq = sequences.Fastq('ID', 'ACGT', '>ADI')

    fasta_part, scores = fastq.to_Fasta_and_qual()

    self.assertEqual(fasta_part, sequences.Fasta('ID', 'ACGT'))
    self.assertListEqual(scores, [29, 32, 35, 40])
def test_subseq(self):
    '''subseq() should return the right bases, with None allowed for either end'''
    whole = sequences.Fasta('name', 'ACGTA')
    cases = [
        ((1, 4), 'CGT'),
        ((None, 4), 'ACGT'),
        ((1, None), 'CGTA'),
    ]

    for (start, end), wanted_bases in cases:
        self.assertEqual(whole.subseq(start, end), sequences.Fasta('name', wanted_bases))
def test_is_all_Ns(self):
    '''is_all_Ns() should work on the whole sequence and on a start/end range'''
    # (bases, keyword arguments for is_all_Ns, expected answer)
    cases = [
        ('n', {}, True),
        ('N', {}, True),
        ('nNn', {}, True),
        ('a', {}, False),
        ('', {}, False),
        ('anNg', {}, False),
        ('naN', {}, False),
        ('anNg', {'start': 0, 'end': 0}, False),
        ('anNg', {'start': 0, 'end': 1}, False),
        ('anNg', {'start': 1, 'end': 1}, True),
        ('anNg', {'start': 1, 'end': 2}, True),
        ('anNg', {'start': 1}, False),
        ('anN', {'start': 1}, True),
        ('anNg', {'end': 1}, False),
        ('nNA', {'end': 1}, True),
    ]

    for bases, kwargs, wanted in cases:
        check = self.assertTrue if wanted else self.assertFalse
        check(sequences.Fasta('ID', bases).is_all_Ns(**kwargs))

    # start after end is an error
    with self.assertRaises(sequences.Error):
        sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=0)
def test_expand_nucleotides(self):
    '''expand_nucleotides() should return one sequence per combination, named id.1, id.2, ...'''
    # (id, bases, bases of each expected expansion in order)
    fasta_cases = [
        ('1', 'A', ['A']),
        ('2', 'C', ['C']),
        ('3', 'G', ['G']),
        ('4', 'T', ['T']),
        ('6', 'R', ['A', 'G']),
        ('7', 'Y', ['C', 'T']),
        ('8', 'S', ['C', 'G']),
        ('9', 'W', ['A', 'T']),
        ('10', 'K', ['G', 'T']),
        ('11', 'M', ['A', 'C']),
        ('12', 'B', ['C', 'G', 'T']),
        ('13', 'D', ['A', 'G', 'T']),
        ('14', 'H', ['A', 'C', 'T']),
        ('15', 'V', ['A', 'C', 'G']),
        ('16', 'N', ['A', 'C', 'G', 'T']),
        ('17', 'ART', ['AAT', 'AGT']),
        ('18', 'ARRT', ['AAAT', 'AAGT', 'AGAT', 'AGGT']),
        ('19', 'ARTR', ['AATA', 'AATG', 'AGTA', 'AGTG']),
    ]

    for name, bases, expansions in fasta_cases:
        wanted = [
            sequences.Fasta(name + '.' + str(number), expanded)
            for number, expanded in enumerate(expansions, start=1)
        ]
        self.assertListEqual(sequences.Fasta(name, bases).expand_nucleotides(), wanted)

    # Fastq: every expansion keeps the original quality string
    fastq_wanted = [
        sequences.Fastq('20.1', 'AAT', 'GHI'),
        sequences.Fastq('20.2', 'AGT', 'GHI'),
    ]
    self.assertListEqual(sequences.Fastq('20', 'ART', 'GHI').expand_nucleotides(), fastq_wanted)
def test_replace_bases(self):
    '''replace_bases() should swap every occurrence of the old base for the new one'''
    seq = sequences.Fasta('X', 'AUCGTUUACT')
    wanted = sequences.Fasta('X', 'ATCGTTTACT')

    seq.replace_bases('U', 'T')

    self.assertEqual(seq, wanted)