def test_to_fastn(self):
    '''Check conversion of SAM records to Fasta/Fastq with to_fastn().

    Each case pairs a SAM record with the sequence object it is expected
    to convert to. Records whose quality field is '*' are expected to
    become Fasta objects, all others Fastq objects.
    '''
    cases = [
        (sam.SamRecord('ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
         fastn.Fastq('ID', 'ACGTA', 'IIIII')),
        (sam.SamRecord('ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
         fastn.Fastq('ID', 'TACGT', 'IIIII')),
        (sam.SamRecord('ID\t65\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
         fastn.Fastq('ID/1', 'ACGTA', 'IIIII')),
        (sam.SamRecord('ID\t129\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
         fastn.Fastq('ID/2', 'ACGTA', 'IIIII')),
        (sam.SamRecord('ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*'),
         fastn.Fasta('ID', 'ACGTA')),
        (sam.SamRecord('ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*'),
         fastn.Fasta('ID', 'TACGT')),
    ]

    for record, expected in cases:
        self.assertEqual(expected, record.to_fastn())
def test_get_next_from_file(self):
    '''get_next_from_file() should read every sequence from the test file.

    The records in fastn_unittest.fa are expected to be named 1, 2, 3, ...
    in file order, each with sequence ACGTA.
    '''
    f_in = utils.open_file_read('fastn_unittest.fa')
    try:
        fa = fastn.Fasta()
        counter = 1

        while fa.get_next_from_file(f_in):
            self.assertEqual(fa, fastn.Fasta(str(counter), 'ACGTA'))
            counter += 1
    finally:
        # Close the handle even when an assertion fails, so a failing test
        # does not leak the open file.
        utils.close(f_in)
def test_translate(self):
    '''Test that nucleotide -> amino acid conversion works on Fasta'''
    nucleotides = 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA'
    amino_acids = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'

    fa = fastn.Fasta('ID', nucleotides)
    expected = fastn.Fasta('ID', amino_acids)
    self.assertEqual(expected, fa.translate())
def test_split_capillary_id(self):
    '''Tests that a sanger capillary read name is split into its parts OK'''
    # Each case: (read name, expected result of split_capillary_id())
    cases = [
        ('abcde.p1k', {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1k'}),
        ('abcde.x.p1k', {'prefix': 'abcde.x', 'dir': 'fwd', 'suffix': 'p1k'}),
        ('abcde.p1ka', {'prefix': 'abcde', 'dir': 'fwd', 'suffix': 'p1ka'}),
        ('abcde.q1k', {'prefix': 'abcde', 'dir': 'rev', 'suffix': 'q1k'}),
        ('abcde.w2k', {'prefix': 'abcde', 'dir': 'unk', 'suffix': 'w2k'}),
    ]

    for read_name, wanted in cases:
        record = fastn.Fasta(read_name, 'A')
        self.assertEqual(record.split_capillary_id(), wanted)
def update_perfect_contigs(nucmer_hit, ref_fasta, contigs):
    '''Store the reference region covered by a nucmer hit as a contig.

    The slice [ref_start - 1:ref_end] of ref_fasta is wrapped in a
    fastn.Fasta and stored in contigs under the key
    (ref_name, ref_start, ref_end). An existing entry with the same key
    is overwritten.

    nucmer_hit -- object providing ref_name, ref_start and ref_end
    ref_fasta  -- sliceable reference sequence
    contigs    -- dictionary that is updated in place
    '''
    # NOTE(review): the original built a "ref_name:start-end" string into a
    # local called `id` (shadowing the builtin) but never used it; the contig
    # has always been stored with the placeholder name 'x'. The dead local is
    # removed here - confirm whether it was meant to be the contig name.
    region = ref_fasta[nucmer_hit.ref_start - 1:nucmer_hit.ref_end]
    key = (nucmer_hit.ref_name, nucmer_hit.ref_start, nucmer_hit.ref_end)
    contigs[key] = fastn.Fasta('x', region)
def test_getitem(self):
    '''Indexing and slicing a Fasta should match indexing its sequence string'''
    bases = 'AACGTGTCA'
    record = fastn.Fasta('x', bases)

    # A single index, a bounded slice and an open-ended slice
    for key in (1, slice(0, 2), slice(1, None)):
        self.assertEqual(bases[key], record[key])
def test_file_reader_fasta(self):
    '''file_reader should iterate through a fasta file correctly'''
    # Records are expected to be named 1, 2, 3, ... in file order
    for number, record in enumerate(fastn.file_reader('fastn_unittest.fa'), 1):
        self.assertEqual(record, fastn.Fasta(str(number), 'ACGTA'))
def test_contig_coords(self):
    '''contig_coords() should get the coords of all contigs in a sequence correctly'''
    Interval = genome_intervals.Interval

    # Each case: (sequence, expected list of contig intervals)
    cases = [
        (fastn.Fasta('ID', 'ACGT'), [Interval(0, 3)]),
        (fastn.Fasta('ID', 'NACGT'), [Interval(1, 4)]),
        (fastn.Fasta('ID', 'NNACGT'), [Interval(2, 5)]),
        (fastn.Fasta('ID', 'ACGTN'), [Interval(0, 3)]),
        (fastn.Fasta('ID', 'ACGTNN'), [Interval(0, 3)]),
        (fastn.Fasta('ID', 'NANNCGT'), [Interval(1, 1), Interval(4, 6)]),
        (fastn.Fasta('ID', 'ANNCGTNNAAAAA'),
         [Interval(0, 0), Interval(3, 5), Interval(8, 12)]),
    ]

    for record, wanted in cases:
        self.assertListEqual(wanted, record.contig_coords())
def test_search_string(self):
    '''Check that search() finds all the hits'''
    record = fastn.Fasta('X', 'AAA')

    # A query that is absent should give no hits at all
    self.assertTrue(len(record.search('G')) == 0)

    # Each case: (query, expected list of (position, strand) hits)
    cases = [
        ('AAA', [(0, '+')]),
        ('AA', [(0, '+'), (1, '+')]),
        ('TTT', [(0, '-')]),
    ]
    for query, wanted in cases:
        self.assertListEqual(record.search(query), wanted)
def test_file_to_dict(self):
    '''Check that file_to_dict() fills the dictionary correctly'''
    loaded = {}
    fastn.file_to_dict('fastn_unittest.fa', loaded)

    names = [str(number) for number in range(1, 5)]
    wanted = {name: fastn.Fasta(name, 'ACGTA') for name in names}

    self.assertSequenceEqual(loaded.keys(), wanted.keys())

    for name in names:
        self.assertEqual(loaded[name].id, wanted[name].id)
        self.assertEqual(loaded[name].seq, wanted[name].seq)
def to_fastn(self):
    '''Return this record as a fastn.Fasta or fastn.Fastq object.

    A Fasta is made when the quality string is '*', otherwise a Fastq.
    The sequence is reverse complemented when query_strand() is '-', and
    '/1' or '/2' is appended to the id for the first or second read of
    a pair.
    '''
    if self.qual != '*':
        record = fastn.Fastq(self.id, self.seq, self.qual)
    else:
        record = fastn.Fasta(self.id, self.seq)

    if self.query_strand() == '-':
        record.revcomp()

    if self.is_first_of_pair():
        record.id += '/1'
    elif self.is_second_of_pair():
        record.id += '/2'

    return record
def test_gaps(self):
    '''gaps() should find the gaps in a sequence correctly'''
    Interval = genome_intervals.Interval

    # Each case: (sequence, expected list of gap intervals)
    cases = [
        (fastn.Fasta('ID', 'ACGT'), []),
        (fastn.Fasta('ID', 'NACGT'), [Interval(0, 0)]),
        (fastn.Fasta('ID', 'NACGTN'), [Interval(0, 0), Interval(5, 5)]),
        (fastn.Fasta('ID', 'ANNCGT'), [Interval(1, 2)]),
        (fastn.Fasta('ID', 'NANNCGTNN'),
         [Interval(0, 0), Interval(2, 3), Interval(7, 8)]),
    ]

    for record, wanted in cases:
        self.assertListEqual(wanted, record.gaps())
def get_next_from_file(self, f):
    '''Read the next record from an open caf file into this object.

    Expected layout, as parsed below: optional blank lines, a 'DNA : <id>'
    line, sequence lines ended by a blank line, a 'BaseQuality : ' line
    followed by one line of space-separated integer qualities, a blank
    line, then tag lines (Insert_size, Ligation_no, Clone, Clipping QUAL)
    ended by a blank line or the end of the file.

    f -- open file handle positioned at the start of a record

    Returns None if the file is already at its end, otherwise True after
    filling in id, seq and whichever tag attributes were present.
    Raises Error if the 'DNA : ' or 'BaseQuality : ' line is missing.
    '''
    line = f.readline()
    if not line:
        return None

    # Skip blank lines in front of the record
    while line == '\n':
        line = f.readline()

    if not line.startswith('DNA : '):
        raise Error("Error reading caf file. Expected line starting with 'DNA : ...'")

    self.id = line.rstrip().split()[2]

    # Sequence lines run up to the next blank line. readline() returns ''
    # at the end of the file, so that must end the loop as well: testing
    # only for '\n' looped forever on a file truncated inside the sequence.
    # A truncated record now falls through to the BaseQuality check below
    # and raises Error.
    seq = []
    line = f.readline()
    while line not in ('', '\n'):
        seq.append(line.rstrip())
        line = f.readline()

    self.seq = fastn.Fasta(self.id, ''.join(seq))

    line = f.readline()
    if not line.startswith('BaseQuality : '):
        raise Error("Error reading caf file. Expected line starting with 'BaseQuality : ...'")

    # The qualities are on the single line after the BaseQuality header
    quals = [int(x) for x in f.readline().rstrip().split()]
    self.seq = self.seq.to_Fastq(quals)

    line = f.readline()
    assert line == '\n'
    line = f.readline()

    # Tag lines: keep the ones we know about, ignore everything else
    while line not in ['', '\n']:
        a = line.rstrip().split()
        if a[0] == 'Insert_size':
            self.insert_min, self.insert_max = int(a[1]), int(a[2])
        elif a[0] == 'Ligation_no':
            self.ligation = a[1]
        elif a[0] == 'Clone':
            self.clone = a[1]
        elif a[0] == 'Clipping' and a[1] == 'QUAL':
            self.clip_start, self.clip_end = int(a[2]), int(a[3])

        line = f.readline()

    return True
def test_trim_Ns(self):
    '''trim_Ns() should do the right trimming of a sequence'''
    wanted = fastn.Fasta('ID', 'ANNANA')

    # Every one of these should be trimmed down to ANNANA
    untrimmed = [
        fastn.Fasta('ID', bases)
        for bases in ('ANNANA', 'NANNANA', 'NANNANAN', 'ANNANAN',
                      'NNNNNNANNANAN', 'NNANNANANn')
    ]

    for record in untrimmed:
        record.trim_Ns()
        self.assertEqual(wanted, record)
def test_get_differences_from_ref(self):
    '''Check that get_differences_from_ref() finds the correct differences.

    Every case aligns a read to the same reference with a given cigar
    string and lists the differences that should be reported.
    '''
    ref = fastn.Fasta('ID', 'ACGTACGTACGT')

    # Each case: (cigar, read sequence, expected differences).
    # An unused extra Cigar("12M") object that the original test built
    # has been dropped.
    cases = [
        (cigar.Cigar("12M"), 'ACGTACGTACGT', []),
        (cigar.Cigar("12M"), 'AGGTACGTACGT', [(1, 'S', 'C/G', 1)]),
        (cigar.Cigar("1S12M"), 'AAGGTACGTACGT', [(1, 'S', 'C/G', 1)]),
        (cigar.Cigar("1S12M1S"), 'AAGGTACGTACGTA', [(1, 'S', 'C/G', 1)]),
        (cigar.Cigar("1M1I10M"), 'AiCGTACGTACGT', [(1, 'I', 'i', 1)]),
        (cigar.Cigar("3M1I3M1D3M"), 'AGGiTACTACGT',
         [(1, 'S', 'C/G', 1), (3, 'I', 'i', 1), (6, 'D', 'G', 1)]),
        (cigar.Cigar("2S3M1I3M1D3M5S"), 'ssAGGiTACTACGTsssss',
         [(1, 'S', 'C/G', 1), (3, 'I', 'i', 1), (6, 'D', 'G', 1)]),
    ]

    for cigar_obj, read_seq, wanted in cases:
        self.assertListEqual(cigar_obj.get_differences_from_ref(read_seq, ref), wanted)
def test_strip_illumina_suffix(self):
    '''Check that /1 and /2 are removed correctly from IDs'''
    # Each case: (name before stripping, name expected afterwards)
    cases = [
        ('name/1', 'name'),
        ('name/2', 'name'),
        ('name', 'name'),
        ('name/1/2', 'name/1'),
        ('name/2/1', 'name/2'),
        ('name/3', 'name/3'),
    ]

    records = [fastn.Fasta(before, 'A') for before, _ in cases]
    for record in records:
        record.strip_illumina_suffix()

    for record, (_, after) in zip(records, cases):
        self.assertEqual(record.id, after)
'Used to generate a fake set of contigs from a genome. At regular intervals it puts in a gap and then breaks into contigs', usage='%(prog)s <infile> <gap length> <contig length> <outfile>') parser.add_argument('infile', help='Name of fasta/q file to be read') parser.add_argument('gap_length', type=int, help='Length of gaps to be added') parser.add_argument('contig_length', type=int, help='Length of each contig') parser.add_argument('outfile', help='Name of output fasta file') options = parser.parse_args() seq_reader = fastn.file_reader(options.infile) f_out = utils.open_file_write(options.outfile) for seq in seq_reader: if len(seq) < 2 * options.contig_length + options.gap_length: print('Sequence', seq.id, 'too short (', len(seq), 'bases). Skipping', file=sys.stderr) i = 0 while i + options.contig_length < len(seq): contig = fastn.Fasta( seq.id + ':' + str(i + 1) + '-' + str(i + options.contig_length), seq[i:i + options.contig_length]) print(contig, file=f_out) i += options.contig_length + options.gap_length utils.close(f_out)
options = parser.parse_args()

# Gaps per sequence name, read from an optional tab-delimited file of
# name, start, end. One is subtracted from both coordinates on reading.
gaps = {}

if options.gaps_file:
    f = utils.open_file_read(options.gaps_file)
    for gap_line in f:
        seq_name, gap_start, gap_end = gap_line.rstrip().split('\t')
        interval = genome_intervals.Interval(int(gap_start) - 1, int(gap_end) - 1)
        gaps.setdefault(seq_name, []).append(interval)
    utils.close(f)

f_in = utils.open_file_read(options.fai_file)
f_out = utils.open_file_write(options.outfile)

# One sequence of As per line of the fai file (name in column 1, length in
# column 2), with Ns written over any gaps recorded for that name.
for fai_line in f_in:
    fields = fai_line.rstrip().split()
    record = fastn.Fasta(fields[0], 'A' * int(fields[1]))

    if record.id in gaps:
        bases = list(record.seq)
        for interval in gaps[record.id]:
            bases[interval.start:interval.end + 1] = ['N'] * len(interval)
        record.seq = ''.join(bases)

    print(record, file=f_out)

utils.close(f_in)
utils.close(f_out)
def test_revcomp(self):
    '''revcomp() should correctly reverse complement a sequence'''
    record = fastn.Fasta('ID', 'ACGTNacgtn')
    wanted = fastn.Fasta('ID', 'nacgtNACGT')

    record.revcomp()
    self.assertEqual(record, wanted)
type=int, help= 'Seed for random number generator. Default is to use python\'s default', default=None) parser.add_argument('contigs', type=int, help='Nunber of contigs to make') parser.add_argument('length', type=int, help='Length of each contig') parser.add_argument('outfile', help='Name of output file') options = parser.parse_args() random.seed(a=options.seed) fout = utils.open_file_write(options.outfile) letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') letters_index = 0 for i in range(options.contigs): if options.name_by_letters: name = letters[letters_index] letters_index += 1 if letters_index == len(letters): letters_index = 0 else: name = str(i + options.first_number) fa = fastn.Fasta( options.prefix + name, ''.join([random.choice('ACGT') for x in range(options.length)])) print(fa, file=fout) utils.close(fout)
def test_get_next_from_file(self):
    '''get_next_from_file() should read caf records from file correctly.

    Reads two records, one after the other, from caf_unittest.caf and
    compares each with a Caf object built by hand here: id, sequence with
    qualities, insert size range, ligation, clone and clipping coordinates.
    '''
    f_in = utils.open_file_read('caf_unittest.caf')

    # ---- first record ----
    c = caf.Caf()
    c.get_next_from_file(f_in)
    e = caf.Caf()
    e.id = 'pknbac5b2Aa01.p1k'
    # NOTE: there is no comma after the first string below, so the first two
    # literals are concatenated into one list element. The joined sequence is
    # the same either way.
    seq = ''.join(['NGGAGAGACTCGGACTAGTTCTACACCCTCACACCTTTGTCCTAAACCTTGAATCTAAGT'
                   'CCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCCTACGACC',
                   'CTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCAACACCCT',
                   'TACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCAACAACCTTCATTC',
                   'CAACACCCCAAACAACATCATTCCAACACCCCAAACAACATCATTCCAACACCCCAAACA',
                   'ACATCATTCCAACACCCCAAACAACATCATTCCAACACGGCAACAACATCATTCGAACAC',
                   'CCCTACAACATCATTCCAGCACCCCAACAACCTCCCTGCGAAACCCCGAATCCGAATTTT',
                   'GACACCCCTACAACCTTATTCTGACACCCCCAACAAACTTTCTCTAACACCCCAACAACG',
                   'TGACTACTAATACACCTAAAACCTTACTCCTAAACCCGGAATCCGACTTCTAATACCGCA',
                   'ACAACCTTCATTCCTAAACCCGGAATCTGAACCCTGAACCATTAAAACATAAAACGTGGA',
                   'AAATGAACCCCTGAACCATGAAAACCGTGAAAACCTATAACTTGGACCATGAACCTCTCA',
                   'ACCCCGAAATATGAGAACTTTGGAAACCCTAAATTTTGGGAAAACTCCTTTTTTTTTTTT',
                   'TTATTGTACATCCTGTGCGATGGTATACATTTTGGCGAATGCAAAAGAATTAGCATATAT',
                   'ATATGTGTAGGTCTTTGTGATGGTCAGGGGGGAGATCGACTAGGGTGTAGGTCTTTGTGA',
                   'TGGTCAAGGGAGATGGGCCAAAGGGAAGTCGGACAAGGTGAGATGGGCCAAGGAGATGGG',
                   'CCTAGGGTGGATGGGACAAGGGTGGATGGTCAGAGGTGGATGGTCAAGGGTGGATGGTCA',
                   'AGGATGAATGGGCAAGGGAGATGGGCAAAGTAGATGGGCAAGGGTGGATGGACAAGGTGG',
                   'ATGGCCAAAGTGGATGGCAAGGAGGATGGCCCAGGTAATAGGCAAGGAAATGGCCAGGTG',
                   'GATGGACCAGGTGGTGCCCTAATGGAGGCAGGGTGAAGTCCAGGAGGAGGCCCAGGAAAA',
                   'GGCCCAGAGAAACCCAAGGAAAGGCCCAGGGGGTGGGACAGGGGAAGCGCCAAGGGATGC',
                   'CAAGGTGGGGGCCAGAAAATAGCCCAGAAAAGGCCAAAATAAGCCAAGAAAAGCCCCAGA',
                   'AAACCCAAGAAA'])
    # Expected base qualities of the first record
    quals = [4, 4, 4, 4, 6, 6, 8, 6, 6, 6, 6, 10, 12, 11, 13, 13, 20, 19, 9, 10, 9, 9, 9, 19, 19, 34, 34, 39, 35, 35, 35, 37, 35, 34, 26, 26, 16, 17, 11, 21, 21, 32, 35, 37, 37, 32, 45, 23, 17, 17, 18, 27, 29, 32, 35, 32, 32, 32, 32, 39, 35, 35, 35, 35, 35, 37, 42, 31, 31, 14, 13, 13, 25, 25, 35, 40, 33, 29, 23, 23, 15, 25, 24, 35, 35, 35, 35, 23, 36, 18, 18, 23, 28, 33, 29, 29, 32, 32,
             32, 32, 35, 35, 32, 35, 35, 32, 35, 35, 44, 44, 37, 35, 28, 26, 24, 19, 23, 30, 33, 40, 32, 32, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 37, 30, 30, 27, 27, 21, 21, 21, 29, 26, 29, 29, 23, 23, 28, 37, 37, 50, 50, 40, 35, 35, 32, 32, 32, 35, 44, 37, 35, 35, 35, 35, 35, 35, 32, 32, 35, 35, 35, 35, 44, 42, 42, 41, 41, 41, 41, 41, 42, 41, 41, 41, 41, 41, 41, 44, 44, 42, 42, 42, 42, 42, 35, 37, 35, 35, 33, 37, 37, 44, 44, 44, 41, 42, 50, 42, 42, 42, 44, 44, 50, 50, 44, 44, 44, 44, 44, 44, 50, 50, 44, 44, 44, 44, 44, 41, 42, 44, 42, 42, 42, 44, 44, 42, 42, 41, 41, 41, 42, 44, 50, 50, 50, 44, 44, 44, 44, 44, 37, 37, 37, 37, 39, 41, 41, 44, 44, 44, 44, 47, 47, 44, 44, 44, 43, 43, 42, 42, 37, 37, 37, 41, 41, 42, 44, 44, 44, 44, 44, 42, 42, 42, 41, 41, 41, 44, 44, 44, 46, 42, 41, 37, 37, 37, 37, 37, 41, 42, 35, 35, 35, 35, 35, 35, 35, 42, 41, 42, 44, 50, 42, 42, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 37, 37, 41, 44, 44, 47, 37, 37, 33, 33, 33, 27, 27, 37, 37, 47, 47, 47, 47, 47, 50, 44, 44, 42, 50, 35, 35, 35, 42, 42, 44, 50, 50, 50, 42, 42, 42, 42, 35, 35, 37, 42, 50, 44, 44, 44, 44, 44, 44, 47, 47, 47, 47, 44, 50, 44, 44, 44, 44, 47, 47, 44, 47, 50, 50, 50, 48, 37, 17, 17, 13, 22, 22, 35, 36, 42, 42, 35, 35, 35, 37, 37, 42, 50, 35, 35, 35, 35, 37, 37, 35, 35, 33, 33, 33, 33, 42, 42, 42, 41, 41, 41, 41, 41, 41, 50, 37, 44, 44, 44, 42, 37, 37, 21, 21, 21, 33, 33, 42, 50, 50, 44, 44, 44, 44, 44, 44, 44, 42, 37, 44, 44, 44, 44, 42, 42, 42, 42, 42, 44, 44, 44, 50, 50, 44, 44, 44, 37, 37, 35, 33, 33, 21, 21, 33, 33, 33, 41, 42, 42, 41, 44, 44, 44, 44, 42, 42, 42, 42, 44, 41, 44, 37, 42, 37, 41, 41, 42, 42, 50, 50, 44, 44, 44, 44, 42, 42, 27, 33, 27, 33, 33, 37, 37, 50, 35, 35, 35, 37, 37, 44, 44, 50, 44, 44, 44, 37, 37, 35, 31, 31, 37, 37, 44, 44, 44, 44, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 37, 37, 28, 28, 23, 28, 26, 33, 33, 33, 29, 29, 29, 33, 35, 46, 33, 23, 23, 26, 33, 33, 50, 44, 37, 37, 30, 37, 37, 42, 50, 30, 30, 30, 37, 37, 23, 28, 15, 15, 11,
             15, 27, 37, 33, 37, 26, 26, 28, 37, 42, 48, 48, 37, 23, 23, 23, 31, 31, 33, 23, 23, 24, 31, 31, 31, 33, 24, 25, 21, 21, 21, 28, 31, 33, 42, 42, 42, 42, 44, 44, 44, 44, 30, 23, 16, 10, 10, 16, 24, 33, 24, 24, 24, 30, 33, 36, 42, 42, 44, 44, 42, 39, 39, 33, 46, 27, 28, 28, 33, 33, 37, 37, 37, 22, 22, 17, 19, 19, 33, 31, 33, 27, 27, 18, 18, 24, 29, 32, 33, 35, 33, 40, 40, 37, 34, 27, 27, 14, 14, 13, 13, 18, 12, 20, 25, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 47, 56, 56, 56, 47, 42, 42, 42, 42, 27, 23, 15, 11, 11, 27, 33, 42, 42, 33, 24, 10, 10, 10, 13, 15, 18, 14, 14, 14, 14, 14, 25, 30, 33, 30, 21, 21, 27, 27, 22, 15, 13, 22, 22, 19, 19, 15, 15, 11, 10, 17, 27, 27, 21, 15, 18, 13, 13, 16, 22, 24, 37, 31, 40, 40, 37, 47, 40, 37, 27, 27, 24, 24, 17, 20, 13, 10, 10, 11, 11, 14, 12, 19, 10, 10, 12, 14, 11, 10, 10, 10, 10, 15, 11, 15, 15, 25, 12, 12, 8, 8, 10, 17, 10, 21, 21, 8, 8, 8, 10, 10, 19, 25, 21, 19, 10, 10, 8, 9, 10, 12, 14, 17, 24, 22, 16, 16, 10, 9, 8, 14, 12, 12, 9, 9, 9, 9, 9, 9, 13, 19, 15, 18, 22, 22, 15, 15, 15, 15, 9, 10, 9, 8, 8, 9, 10, 14, 10, 10, 19, 15, 12, 9, 15, 4, 4, 4, 8, 8, 10, 12, 9, 8, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 16, 10, 10, 10, 8, 7, 7, 7, 7, 7, 13, 20, 19, 15, 15, 10, 10, 8, 8, 10, 10, 10, 15, 10, 8, 8, 9, 8, 9, 10, 11, 10, 8, 8, 8, 8, 8, 4, 8, 4, 7, 7, 9, 13, 16, 11, 10, 12, 11, 13, 8, 8, 8, 8, 8, 9, 10, 9, 9, 9, 8, 8, 8, 12, 8, 9, 9, 11, 10, 10, 7, 7, 9, 7, 8, 9, 11, 10, 9, 10, 9, 10, 7, 7, 7, 9, 8, 8, 10, 8, 8, 4, 7, 4, 4, 4, 4, 4, 8, 7, 7, 8, 9, 9, 7, 7, 9, 9, 9, 9, 8, 7, 7, 7, 7, 10, 10, 7, 8, 8, 9, 10, 10, 10, 14, 13, 9, 8, 7, 7, 7, 6, 6, 7, 7, 6, 6, 6, 6, 6, 8, 15, 10, 8, 8, 8, 8, 6, 7, 6, 6, 6, 6, 7, 7, 7, 8, 7, 4, 4, 4, 6, 6, 6, 6, 7, 13, 7, 7, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 9, 7, 7, 7, 6, 6, 6, 6, 6, 7, 9, 7, 7, 7, 8, 10, 8, 8, 8, 8, 9, 6, 6, 6, 6, 6, 6, 6, 7, 7, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 7, 6, 6, 7, 9, 7, 7, 11, 6, 6, 7, 6, 6, 8, 7, 7, 8, 8, 10, 8, 8, 8, 6, 6, 7, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7,
             7, 7, 7, 7, 12, 9, 14, 10, 10, 10, 10, 8, 9, 8, 8, 8, 7, 7, 7, 7, 7, 13, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 6, 6, 6, 6, 6, 8, 8, 8, 9, 7, 7, 7, 8, 9, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 6, 9, 6, 6, 6, 7, 7, 7, 7, 7, 8, 10, 10, 14, 9, 12, 7, 7, 7, 4, 4]
    # The expected sequence is built as a Fasta and then given its qualities
    e.seq = fastn.Fasta(e.id, seq)
    e.seq = e.seq.to_Fastq(quals)
    e.insert_min = 2000
    e.insert_max = 4000
    e.ligation = '96781'
    e.clone = 'pknbac5b2'
    e.clip_start = 23
    e.clip_end = 789
    self.assertEqual(c, e)

    # ---- second record, read from the same open file handle ----
    c.get_next_from_file(f_in)
    e = caf.Caf()
    e.id = 'pknbac5b2Aa02.p1k'
    seq = ''.join(['AAAGACATACGACCTTTTTTTTTTTCGATAACAAAGGGTATCCTTTCACCAGAAAAAAAA',
                   'AAAGAACATTCTTCTTTTTTCTTGAAGAACATACATTCTTTTTTTTATTTTATTTTTTTT',
                   'TTTCGACCCCTCAGTGTTGTGGTAGCATGATGTGTTGGACTTGAATGGTATATGTATTGA',
                   'TTGTTTCGTTCGTTATGTAATTTCCGGTTTTTCCCCGTGGCATCCGGATAGTGTATAGTA',
                   'TCCGGTCCCTGTGTTCAAAAAGTTTTTCCTTTTCCCCTTAAAGCAACTGAAGTTAAACCC',
                   'TGAACCTTACTACTGAACCCGGAATTTGACTTCTAAAACCCTGAAGAATGATTCCTATAA',
                   'CCCTAAAAAATCCAACCTAAAACATCCAAACTGAACCATAGAACCTTCCTCCTAAACCCG',
                   'GAATCTATGTTCTAACACCCTGACATCTTTGTCCTAAACCCTGAATCTAAGTTCTAACAT',
                   'CCTGACAACTCTCCCTCCTAAACCCGGAATCTAAATTCGTACACCCTGACACCTCCCCCC',
                   'TAAACCCGGAATCCGCATTCTAACACCCTGACAATTTCCTCCTGAAAAGCGGAATCTGAC',
                   'TTCTAACACCCTGACACCTTTGTCCTGAACCCGGAATCTAAGTTCTTACACCCGGACACC',
                   'TCCCTCCTAAATCCGGAATCTAAGTTCTAACACCCTCACACCTTTGTCCTAAACCTTGAA',
                   'TCTAAGTCCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCC',
                   'TACGACCCTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCA',
                   'ACACCCTTACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCTACAACT',
                   'TCATTCCTACACCCCAAACAACATCATCCCTACACCCCAAACAACATCATTCCTACACCC',
                   'CAAACACATCATCCAACACCCCATAACACATCATTCCAACACGGCAACAACATCATTCGA',
                   'AACACCCCTACAAATCATTGCAGCACCCCCACTACCTCCCTGCGTATACCCGTATTCGAA',
                   'ATTTTGACACCCCTACTACCTTTATCTGACACCCCCAAAAAACTCCTCTTAAACCCAACA',
                   'AGGGGACTATAATACCCCTAAAACTTTATCTTAACCGGAATCCGAATTCTATACCGAAAA',
                   'AACTTCTTTCCTAACCGGGATCTGTACCCCGAACTTTTAAAATTAAAGGGGAAATGAACC',
                   'CCTGACCAGATAACGGGAAACCTTTATTGTGACAGGAACTCCTACCGCAATATGAAAATT',
                   'GGACCCCAAATTTGGGAAACCCCTTTT'])
    # Expected base qualities of the second record
    quals = [9, 9, 6, 4, 4, 4, 4, 7, 6, 6, 8, 6, 6, 6, 7, 7, 14, 8, 8, 8, 10, 17, 21, 12, 9, 10, 10, 9, 11, 8, 9, 11, 11, 21, 12, 15, 15, 21, 24, 33, 32, 35, 29, 29, 22, 22, 15, 29, 25, 26, 18, 18, 18, 31, 31, 47, 56, 56, 56, 42, 36, 44, 28, 28, 28, 39, 33, 35, 30, 36, 33, 35, 35, 36, 35, 37, 42, 35, 35, 31, 29, 26, 26, 20, 33, 15, 22, 22, 29, 29, 32, 35, 35, 36, 35, 35, 42, 42, 37, 37, 42, 47, 47, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 44, 47, 47, 47, 47, 47, 35, 30, 30, 23, 24, 30, 45, 37, 37, 37, 35, 23, 23, 11, 11, 13, 23, 31, 21, 21, 19, 20, 23, 29, 23, 20, 16, 16, 30, 29, 29, 28, 28, 28, 24, 24, 17, 29, 29, 33, 33, 35, 31, 37, 18, 15, 12, 16, 16, 23, 27, 24, 32, 29, 32, 32, 24, 26, 29, 37, 29, 30, 35, 35, 33, 35, 35, 31, 33, 31, 31, 35, 31, 31, 31, 31, 27, 33, 33, 42, 35, 37, 37, 21, 21, 21, 21, 37, 37, 50, 50, 50, 50, 50, 33, 33, 18, 16, 15, 25, 19, 20, 33, 33, 33, 35, 35, 33, 33, 33, 18, 18, 18, 33, 24, 33, 33, 33, 27, 33, 33, 33, 33, 33, 22, 33, 33, 33, 24, 24, 21, 24, 24, 31, 31, 11, 11, 11, 31, 33, 44, 44, 37, 42, 42, 47, 44, 44, 44, 44, 44, 44, 44, 47, 50, 50, 42, 42, 42, 41, 42, 42, 47, 47, 37, 37, 27, 33, 33, 33, 33, 35, 35, 42, 41, 37, 37, 44, 50, 50, 33, 33, 27, 33, 37, 42, 42, 42, 41, 41, 33, 33, 27, 27, 33, 33, 37, 50, 35, 35, 35, 35, 35, 35, 35, 42, 35, 37, 35, 37, 35, 41, 37, 42, 42, 42, 42, 50, 50, 50, 42, 35, 33, 33, 21, 21, 16, 23, 19, 27, 27, 33, 35, 41, 50, 37, 35, 35, 42, 50, 50, 50, 44, 44, 44, 50, 42, 42, 37, 37, 35, 35, 35, 44, 44, 50, 50, 41, 37, 37, 37, 37, 35, 35, 35, 37, 37, 37, 44, 37, 37, 33, 33, 22, 33, 37, 35, 33, 33, 21, 21, 21, 33, 33, 41, 41, 44, 44, 44, 44, 44, 50, 50, 44, 44, 37, 50, 33, 33, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 50, 50, 44, 47, 44, 44, 48, 33, 21, 24, 21, 33, 33, 35, 37, 50, 50, 37, 35, 35, 50, 50, 56, 50, 50, 50, 50, 48, 33, 27, 33, 27, 33, 33, 44, 50, 50, 42, 37, 35, 42, 42, 50, 50, 50, 44, 44,
             44, 33, 33, 18, 19, 18, 33, 33, 42, 35, 35, 44, 44, 44, 50, 44, 44, 44, 50, 50, 44, 44, 37, 37, 33, 33, 35, 35, 35, 35, 35, 37, 50, 37, 27, 27, 24, 37, 33, 35, 35, 37, 35, 37, 37, 46, 33, 24, 24, 21, 33, 33, 39, 42, 42, 44, 50, 50, 56, 50, 50, 37, 35, 35, 33, 37, 33, 33, 35, 35, 35, 35, 33, 33, 33, 33, 27, 27, 27, 37, 37, 44, 37, 41, 41, 41, 50, 46, 33, 24, 24, 16, 31, 19, 27, 31, 37, 37, 44, 44, 44, 37, 50, 23, 23, 22, 29, 31, 33, 23, 23, 23, 23, 23, 28, 25, 33, 26, 26, 22, 28, 37, 42, 44, 42, 42, 44, 44, 44, 44, 46, 33, 16, 19, 14, 27, 31, 42, 50, 50, 50, 44, 44, 44, 50, 50, 26, 26, 21, 28, 31, 29, 29, 26, 26, 26, 30, 30, 39, 27, 37, 26, 30, 30, 42, 42, 42, 36, 33, 29, 33, 33, 33, 20, 21, 23, 17, 23, 31, 36, 42, 43, 56, 56, 47, 47, 42, 42, 33, 33, 29, 29, 23, 31, 25, 26, 26, 26, 30, 30, 36, 27, 33, 28, 31, 33, 35, 44, 33, 33, 28, 33, 35, 44, 48, 48, 48, 42, 47, 42, 42, 42, 48, 44, 44, 37, 34, 34, 44, 48, 42, 37, 34, 42, 48, 33, 33, 34, 30, 30, 33, 33, 40, 30, 37, 28, 28, 26, 27, 27, 25, 19, 16, 25, 29, 40, 31, 27, 15, 18, 13, 25, 27, 40, 40, 33, 40, 33, 33, 33, 40, 37, 23, 12, 12, 17, 11, 10, 15, 15, 13, 13, 13, 18, 27, 23, 28, 28, 28, 28, 37, 28, 32, 26, 23, 26, 26, 19, 29, 25, 24, 25, 24, 15, 15, 15, 12, 17, 24, 24, 21, 21, 21, 25, 22, 29, 25, 22, 21, 24, 25, 17, 17, 14, 14, 12, 14, 19, 24, 18, 18, 14, 21, 11, 15, 10, 15, 18, 22, 27, 25, 25, 29, 29, 29, 25, 26, 25, 21, 22, 25, 22, 22, 18, 15, 15, 15, 25, 19, 25, 25, 16, 24, 24, 20, 20, 22, 20, 15, 10, 10, 10, 12, 13, 20, 20, 12, 14, 14, 12, 12, 12, 15, 15, 15, 18, 18, 11, 10, 11, 11, 10, 10, 14, 15, 18, 18, 19, 17, 12, 11, 10, 10, 20, 15, 19, 24, 24, 24, 23, 15, 13, 7, 6, 6, 6, 6, 6, 12, 13, 12, 9, 8, 10, 10, 9, 6, 6, 6, 6, 10, 10, 13, 15, 15, 15, 15, 17, 9, 9, 9, 9, 9, 11, 11, 9, 7, 7, 7, 6, 4, 4, 6, 9, 9, 8, 8, 8, 10, 9, 8, 7, 7, 7, 7, 7, 9, 13, 10, 10, 10, 15, 12, 9, 9, 9, 15, 19, 15, 15, 11, 7, 7, 7, 7, 7, 7, 8, 8, 19, 10, 10, 10, 12, 12, 19, 11, 15, 18, 11, 14, 9, 9, 6, 6, 6, 6, 6, 6, 6, 8, 11, 20, 13, 17,
             14, 14, 9, 9, 10, 17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 11, 10, 12, 11, 10, 12, 9, 12, 8, 8, 8, 9, 12, 12, 8, 11, 7, 8, 8, 8, 8, 11, 9, 8, 6, 4, 4, 4, 6, 6, 7, 10, 10, 12, 9, 7, 7, 6, 6, 6, 6, 8, 6, 9, 10, 13, 8, 11, 8, 7, 7, 8, 7, 7, 7, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 14, 10, 8, 12, 8, 8, 6, 7, 9, 8, 7, 6, 8, 7, 4, 4, 7, 7, 6, 7, 6, 6, 6, 6, 8, 11, 8, 8, 8, 8, 12, 10, 12, 11, 11, 11, 10, 12, 10, 7, 7, 9, 4, 4, 8, 6, 6, 6, 6, 6, 6, 7, 10, 7, 7, 7, 7, 7, 9, 9, 9, 7, 7, 7, 6, 6, 6, 7, 7, 7, 10, 11, 9, 7, 6, 6, 8, 6, 6, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 9, 12, 10, 15, 15, 16, 7, 7, 6, 6, 6, 7, 6, 6, 6, 6, 6, 8, 7, 7, 8, 7, 8, 7, 7, 9, 8, 7, 7, 8, 8, 9, 7, 6, 7, 6, 9, 6, 7, 11, 7, 7, 11, 8, 8, 7, 10, 8, 9, 8, 6, 6, 6, 6, 7, 7, 7, 6, 6, 6, 8, 8, 7, 7, 6, 9, 7, 6, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 10, 12, 19, 13, 13, 10, 9, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 6, 6, 6, 6, 6, 4, 6, 4, 4, 4, 4, 4, 4, 6, 6, 6, 7, 7, 7, 7, 8, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 11, 12, 9]
    e.seq = fastn.Fasta(e.id, seq)
    e.seq = e.seq.to_Fastq(quals)
    e.insert_min = 2000
    e.insert_max = 4000
    e.ligation = '96781'
    e.clone = 'pknbac5b2'
    e.clip_start = 33
    e.clip_end = 848
    self.assertEqual(c, e)

    utils.close(f_in)
genome_intervals.merge_overlapping_in_list(hits) for hit in hits: if hit.end - hit.start + 1 >= options.min_seq_length: if ref_name not in contigs_to_print: contigs_to_print[ref_name] = [] contigs_to_print[ref_name].append(copy.copy(hit)) # remove any contigs that are completely contained in another contig for ref, l in contigs_to_print.items(): genome_intervals.remove_contained_in_list(l) # print the final perfect contigs f_out = utils.open_file_write(options.outprefix + '.fa') counter = 1 last_id = None for ref_name in sorted(contigs_to_print): counter = 1 for interval in contigs_to_print[ref_name]: id = ':'.join([ str(x) for x in [ref_name, counter, interval.start, interval.end] ]) print(fastn.Fasta(id, ref_seqs[ref_name][interval.start - 1:interval.end]), file=f_out) counter += 1 utils.close(f_out)
def test_to_Fasta_and_qual(self):
    '''Check that to_Fasta_and_qual() converts quality scores correctly'''
    fastq_record = fastn.Fastq('ID', 'ACGT', '>ADI')
    fasta_record, scores = fastq_record.to_Fasta_and_qual()

    self.assertEqual(fasta_record, fastn.Fasta('ID', 'ACGT'))
    self.assertListEqual(scores, [29, 32, 35, 40])
gc_hist = dict(zip(range(101), [0] * 101)) f_in = mh12_utils.open_file_read(options.infile) f_out = mh12_utils.open_file_write(options.txtout) while 1: seq = fastn.get_next_seq_from_file(f_in, filetype) if not seq: break if options.window: i = 0 while i < len(seq): tmp = fastn.Fasta(seq.id, seq.seq[i:i + options.window]) gc = tmp.gc() gc_hist[floor(gc)] += 1 print >> f_out, seq.id, str(i + 1), gc i += options.window else: gc = seq.gc() gc_hist[floor(gc)] += 1 print >> f_out, seq.id, gc f_in.close() f_out.close() c = not options.noclean mh12_utils.hist2Rplot(gc_hist,
def get_unique_tags(ref_index, tag_length, unique_tagged_seqs, untagged_seqs, unique_tags, log_fh, second_index=None):
    '''Find a uniquely mapping tag for as many untagged sequences as possible.

    One tag is cut from the middle of every sequence in untagged_seqs
    (the whole sequence if it is shorter than tag_length), written to a
    temporary fasta file and mapped with map_and_parse_sam(). A sequence
    whose tag has exactly one hit (in both indexes, when second_index is
    given) is moved from untagged_seqs to unique_tagged_seqs, and its tag
    details are appended to unique_tags.

    ref_index          -- index passed to map_and_parse_sam()
    tag_length         -- length of the tags to be made
    unique_tagged_seqs -- dictionary of sequences, updated in place
    untagged_seqs      -- dictionary of sequences, updated in place
    unique_tags        -- list of [id, start, end, tag sequence], appended to
    log_fh             -- passed through to map_and_parse_sam()
    second_index       -- optional second index the tag must also hit once

    Returns None. Does nothing if untagged_seqs is empty. Uses the
    module-level options.outprefix to name the temporary file, and exits
    the program if that file cannot be deleted afterwards.
    '''
    if not untagged_seqs:
        return

    tags_fasta_fname = options.outprefix + '.tags.test.fa'
    fout_tags = utils.open_file_write(tags_fasta_fname)
    tags = {}
    tag_info = {}

    # make fasta file of tags
    for seq_id, seq in untagged_seqs.items():
        if len(seq) < tag_length:
            # sequence too short: the tag is the whole sequence.
            # NOTE: the coordinates are stored as strings here but as ints
            # in the other branch. Kept as it was, since callers may rely
            # on it.
            tag = fastn.Fasta(seq.id + ':1-' + str(len(seq)), seq.seq)
            tag_info[seq_id] = [seq_id, '1', str(len(seq)), tag.seq]
        else:
            # take the tag from the middle of the sequence
            left_coord = int(0.5 * len(seq) - 0.5 * tag_length)
            right_coord = left_coord + tag_length - 1
            tag = fastn.Fasta(
                seq.id + ':' + str(left_coord + 1) + '-' + str(right_coord + 1),
                seq[left_coord:right_coord + 1])
            tag_info[seq_id] = [seq_id, left_coord + 1, right_coord + 1, tag.seq]

        print(tag, file=fout_tags)
        tags[seq_id] = copy.copy(seq)

    utils.close(fout_tags)

    # get the count of number of hits per tag from the results
    tag_counts = {}
    second_tag_counts = {}
    map_and_parse_sam(ref_index, tags_fasta_fname, tag_counts, log_fh)

    if second_index:
        map_and_parse_sam(second_index, tags_fasta_fname, second_tag_counts, log_fh)
        assert len(tag_counts) == len(second_tag_counts)

    # update the unique/non-unique tagged sequences
    for contig_name, hit_count in tag_counts.items():
        assert contig_name not in unique_tagged_seqs

        if second_index:
            second_hit_count = second_tag_counts[contig_name]
        else:
            second_hit_count = 1

        if hit_count == 1 == second_hit_count:
            unique_tagged_seqs[contig_name] = tags[contig_name]
            unique_tags.append(tag_info[contig_name])
            del untagged_seqs[contig_name]

    # Only a failure to delete the file is handled here. A bare except would
    # also swallow KeyboardInterrupt and SystemExit.
    try:
        os.unlink(tags_fasta_fname)
    except OSError:
        print('Error deleting file "' + tags_fasta_fname + '"', file=sys.stderr)
        sys.exit(1)
def test_to_Fastq(self):
    '''Check that to_Fastq() converts OK, including out of range quality scores'''
    record = fastn.Fasta('X', 'AAAAA')
    # The first and last scores are the out of range ones: the expected
    # string gives them the same characters as their in-range neighbours.
    scores = [-1, 0, 40, 93, 94]
    wanted = fastn.Fastq('X', 'AAAAA', '!!I~~')

    self.assertEqual(wanted, record.to_Fastq(scores))
def to_fasta(self, id):
    '''Return a fastn.Fasta called id, made from the currently selected bases.

    The i-th base of the sequence is self.bases[i][self.positions[i]].
    '''
    chosen = [self.bases[i][position] for i, position in enumerate(self.positions)]
    return fastn.Fasta(id, ''.join(chosen))
def test_replace_bases(self):
    '''Check that bases get replaced correctly'''
    record = fastn.Fasta('X', 'AUCGTUUACT')
    wanted = fastn.Fasta('X', 'ATCGTTTACT')

    record.replace_bases('U', 'T')
    self.assertEqual(record, wanted)
def to_fasta(self):
    '''Return a fastn.Fasta built from this object's id and seq'''
    name, bases = self.id, self.seq
    return fastn.Fasta(name, bases)
def setUp(self):
    '''Make the Fasta object (id ID, sequence ACGTA) stored as self.fasta'''
    fixture = fastn.Fasta('ID', 'ACGTA')
    self.fasta = fixture