def fastq_truncate(fname, max_len):
    """Write the reads of *fname* to stdout, truncated to *max_len* positions.

    fname   - the fastq filename
    max_len - the number of quality values (and sequence positions) to keep

    For a colorspace file whose read starts with one of the letters
    a/t/c/g (either case), one extra sequence character is kept so that the
    leading letter does not count against max_len.
    """
    colorspace = is_colorspace_fastq(fname)
    for name, seq, qual in read_fastq(fname):
        seq_keep = max_len
        if colorspace and seq[0] in "atcgATGC":
            # presumably the leading letter is the colorspace primer base
            # (no quality value of its own) -- confirm against read_fastq
            seq_keep = max_len + 1
        record = (name, seq[:seq_keep], qual[:max_len])
        sys.stdout.write('%s\n%s\n+\n%s\n' % record)
def fastq_csencode(fname):
    """Write each read of *fname* to stdout with its sequence passed through
    encoded_seq().

    Before encoding, a non-empty sequence loses its first two characters when
    it starts with one of A/T/C/G, otherwise its first character. The first
    quality value is always dropped.
    """
    for name, seq, qual in read_fastq(fname, quiet=False):
        if seq:
            skip = 2 if seq[0] in 'ATCG' else 1
            seq = seq[skip:]
        converted = encoded_seq(seq)
        sys.stdout.write('%s\n%s\n+\n%s\n' % (name, converted, qual[1:]))
def test_read_qualities(self):
    """
    Checks that reading a fastq file returns a non-empty collection of
    bases and a non-empty collection of their corresponding qualities
    :return:
    """
    bases, quals = fqu.read_fastq(full_file_name)
    for collection in (bases, quals):
        self.assertGreater(len(collection), 0)
def fastq_trim(fname, linker_5=None, linker_3=None, out=sys.stdout, pct_identity=0.8, min_trim=4, min_len=25, verbose=False):
    '''
    Strips linker sequences from the reads in a fastq file and writes the
    surviving reads to an output stream.

    fname - the fastq filename
    linker_5 - the 5' linker to remove
    linker_3 - the 3' linker to remove
    out - an output stream (eg: file, stdout)
    pct_identity - the alignment identity that must be exceeded
                   in order to strip away linkers
    min_trim - the distance away from the edges that the linkers must match w/in
    min_len - reads shorter than this after trimming are dropped
    verbose - write each read and its linker alignments to stderr

    The number of trimmed and removed reads is reported on stderr at the end.
    '''
    cs = is_colorspace_fastq(fname)
    scoring = support.localalign.NucleotideScoringMatrix(2, -1)
    aligner = support.localalign.LocalAlignment(scoring, -1)

    n_removed = 0
    n_trimmed = 0

    for name, seq, qual in read_fastq(fname):
        if verbose:
            sys.stderr.write('Read: %s\n : %s\n' % (name, seq))

        start, end = 0, len(seq)

        if linker_5:
            hit = aligner.align(seq, linker_5)
            if verbose:
                sys.stderr.write("5' alignment:\n")
                hit.dump(sys.stderr)
            # only accept a hit that begins close to the left edge
            if hit.r_pos < min_trim and hit.identity > pct_identity:
                start = hit.r_end

        if linker_3:
            hit = aligner.align(seq, linker_3)
            if verbose:
                sys.stderr.write("3' alignment:\n")
                hit.dump(sys.stderr)
            # only accept a hit that finishes close to the right edge
            if hit.r_end > len(seq) - min_trim and hit.identity > pct_identity:
                end = hit.r_pos

        kept = seq[start:end]
        if len(kept) < min_len:
            n_removed += 1
            continue

        if start > 0 or end < len(seq):
            n_trimmed += 1

        qual_end = end
        if cs and len(seq) != len(qual) and start == 0:
            # colorspace read whose sequence and quality lengths differ and
            # whose left edge is untouched: the quality slice ends one earlier
            qual_end = end - 1
        out.write('%s\n%s\n+\n%s\n' % (name, kept, qual[start:qual_end]))

    sys.stderr.write('Trimmed: %s\n' % n_trimmed)
    sys.stderr.write('Removed: %s (len)\n' % n_removed)
def fastq_split(fname, outbase, chunks, ignore_pairs=False, gz=False, count_fname=None):
    """Distribute the reads of *fname* round-robin over *chunks* output files.

    fname        - the fastq filename
    outbase      - output prefix; files are named <outbase>.<N>.fastq
                   (or <outbase>.<N>.fastq.gz), N starting at 1
    chunks       - the number of output files
    ignore_pairs - treat every record independently instead of asking
                   is_paired_fastq() whether the file is paired
    gz           - write gzip-compressed output
    count_fname  - optional file that receives the total number of records

    For paired input, consecutive records whose first name token is equal
    are kept in the same output file. Output is written to hidden '.tmp.*'
    files that are renamed to their final names only after every record was
    written successfully.
    """
    if ignore_pairs:
        is_paired = False
    else:
        is_paired = is_paired_fastq(fname)

    extension = 'fastq.gz' if gz else 'fastq'
    opener = gzip.open if gz else open

    outs = []
    fnames = []
    count = 0
    try:
        for chunk in range(chunks):
            fn = '%s.%s.%s' % (outbase, chunk + 1, extension)
            tmp = os.path.join(os.path.dirname(fn), '.tmp.%s' % os.path.basename(fn))
            fnames.append((tmp, fn))
            sys.stderr.write('Output file: %s\n' % fn)
            outs.append(opener(tmp, 'w'))

        i = 0
        last_name = None
        for name, seq, qual in read_fastq(fname):
            count += 1
            sn = name.split()[0]
            # advance to the next output unless this record continues a pair
            if not is_paired or sn != last_name:
                i += 1
                if i >= len(outs):
                    i = 0
            last_name = sn
            outs[i].write('%s\n%s\n+\n%s\n' % (name, seq, qual))
    finally:
        # always release the handles, even when reading or writing fails
        for out in outs:
            out.close()

    for tmp, final in fnames:
        os.rename(tmp, final)

    if count_fname:
        with open(count_fname, 'w') as f:
            # write() needs a string; passing the bare int raised TypeError
            f.write('%s\n' % count)
def test_create_hist(self):
    """
    Tests the building of a histogram of quality scores from a fastq file
    and displays it as a bar chart
    :return:
    """
    _, quals = fqu.read_fastq(full_file_name)
    quality_hist = fqu.create_hist(quals)
    n_bins = len(quality_hist)
    self.assertEqual(n_bins, 50)

    # bar chart of the read-quality histogram
    positions = range(len(quality_hist))
    plt.bar(positions, quality_hist)
    plt.show()
def test_find_gc_by_pos(self):
    """
    Tests the GC-by-position function and plots its result as a line
    :return:
    """
    reads, _ = fqu.read_fastq(full_file_name)
    gc_ratios = fqu.find_gc_by_pos(reads)
    n_positions = len(gc_ratios)
    self.assertEqual(n_positions, 100)

    # line plot of the GC ratio at each position for these reads
    x_axis = range(len(gc_ratios))
    plt.plot(x_axis, gc_ratios)
    plt.show()
def test_fastq_base_dist(self):
    """
    Quick test of the base distribution over all of our sequences
    joined together
    :return:
    """
    reads, _ = fqu.read_fastq(full_file_name)
    base_dist = dnau.get_frequency_counts("".join(reads))

    # 'N' means not confident
    expected_counts = (
        ('A', 21132),
        ('C', 28272),
        ('G', 28742),
        ('T', 21836),
        ('N', 18),
    )
    for base, expected in expected_counts:
        self.assertEqual(base_dist[base], expected)
def fastq_tag(fname, prefix, suffix):
    """Write the reads of *fname* to stdout with a tag added to each read name.

    fname  - the fastq filename
    prefix - text placed in front of the read name (may be None / empty)
    suffix - text placed after the read name (may be None / empty)

    Only the first whitespace-delimited token of the header is tagged; any
    remaining description is re-attached after a single space. When neither
    prefix nor suffix is given the name is passed through unchanged (it used
    to be replaced by an empty string).
    """
    for name, seq, qual in read_fastq(fname):
        # name[1:] drops the leading '@', which is re-added on output
        spl = name[1:].split(None, 1)
        read_id = spl[0] if spl else ''  # tolerate an empty header
        desc = spl[1] if len(spl) > 1 else None

        nname = '%s%s%s' % (prefix or '', read_id, suffix or '')
        if desc:
            nname = '%s %s' % (nname, desc)

        sys.stdout.write('@%s\n%s\n+\n%s\n' % (nname, seq, qual))
def fastq_merge(fnames, split_slashes=False):
    """Interleave several fastq files record by record onto stdout.

    fnames        - the fastq filenames, in the order their records should
                    appear within each group
    split_slashes - when a name contains '/', use the part before the first
                    '/' as the name and ' /<rest>' as the description

    In every round one record is taken from each file and all names must
    match; otherwise an error is written to stderr and the process exits
    with status 1. Merging stops as soon as any input is exhausted.
    """
    infiles = []
    first = True
    for fname in fnames:
        # only the first reader is allowed to be non-quiet
        infiles.append((fname, read_fastq(fname, quiet=not first)))
        first = False

    if not infiles:
        # nothing to merge; the loop below would otherwise never terminate
        return

    while True:
        lastname = None
        for fname, generator in infiles:
            # Catch only end-of-input. A bare except here also swallowed the
            # SystemExit raised by sys.exit(1) below, as well as
            # KeyboardInterrupt and genuine errors.
            try:
                name, seq, qual = next(generator)
            except StopIteration:
                return

            if split_slashes and '/' in name:
                spl = name.split('/', 1)
                name = spl[0]
                desc = ' /%s' % spl[1]
            else:
                cols = name.split()
                name = cols[0]
                if len(cols) > 1:
                    desc = cols[1]
                else:
                    desc = ''

            if not lastname:
                lastname = name
            elif name != lastname:
                sys.stderr.write('Files are not paired! (error in: %s)\nExpected: %s\nGot : %s\n' % (fname, lastname, name))
                sys.exit(1)

            sys.stdout.write('%s %s\n%s\n+\n%s\n' % (name, desc, seq, qual))
def fastq_convertqual(fname):
    """Write the reads of *fname* to stdout with every quality character
    shifted down by 31 code points.

    31 is the distance between an offset-64 and an offset-33 quality
    encoding; presumably this converts Illumina-scaled qualities to the
    Sanger scale -- confirm against the callers.

    The read name is written as yielded by read_fastq(). The other writers
    in this file treat that name as already starting with '@', so adding
    another '@' here produced '@@name' headers.
    """
    for name, seq, qual in read_fastq(fname):
        shifted = ''.join([chr(ord(q) - 31) for q in qual])
        sys.stdout.write('%s\n%s\n+\n%s\n' % (name, seq, shifted))
def filter(self):
    """Yield every record produced by read_fastq() unchanged.

    Each record increments self.kept; when self.verbose is set the first
    element of the record is also logged to stderr.
    """
    # NOTE(review): fname is neither a parameter nor an attribute of self
    # here; it is looked up in an enclosing scope -- confirm it is defined.
    records = read_fastq(fname, quiet=not self.verbose)
    for record in records:
        self.kept += 1
        if self.verbose:
            sys.stderr.write('[FASTQ] Read: %s\n' % record[0])
        yield record
def export_seq(fname):
    """Write the sequences of *fname* to stdout as '>name' / sequence pairs.

    The first character of each read name is dropped before it is written
    after the '>' marker.
    """
    for name, seq, _quals in read_fastq(fname, quiet=False):
        header = name[1:]
        sys.stdout.write('>%s\n%s\n' % (header, seq))
def export_qual(fname):
    """Write the qualities of *fname* to stdout as '>name' / score-list pairs.

    Each quality character is converted to its code point minus 33 and the
    resulting numbers are written space-separated. The first character of
    each read name is dropped before it is written after the '>' marker.
    """
    for name, _seq, quals in read_fastq(fname, quiet=False):
        scores = ' '.join(str(ord(ch) - 33) for ch in quals)
        header = name[1:]
        sys.stdout.write('>%s\n%s\n' % (header, scores))
def fastq_stats(fname, verbose=False): cs = is_colorspace_fastq(fname) if cs: print "Space:\tcolorspace" else: print "Space:\tnucleotide" pairs = is_paired_fastq(fname) if pairs > 0: print "Pairing:\tPaired-end (%s)" % pairs else: print "Pairing:\tFragmented" qual_totals = fastq_qualtype(fname) print "Quality scale:\t%s" % qual_totals[-1][1] if verbose: print ' '.join(['(%s,%s)' % (q[1], q[0]) for q in qual_totals]) lengths = [] posquals = [] qualities = [] total = [] total_reads = 0 line = 0 try: for name, seq, qual in read_fastq(fname): if not name[0] == '@': print 'Invalid formatted record [line %s]' % line break if cs: if len(seq) != len(qual) + 1: print 'Seq / qual mismatch [line %s]' % line break else: if len(seq) != len(qual): print 'Seq / qual mismatch [line %s]' % line break line += 4 total_reads += 1 while len(total) < len(qual) + 1: total.append(0) for x in xrange(len(qual) + 1): total[x] += 1 while len(qual) > len(lengths) - 1: lengths.append(0) qualities.append(0) posquals.append([]) lengths[len(qual)] += 1 for idx, q in enumerate([ord(x) for x in qual]): qualities[idx] += q while len(posquals[idx]) < (q - 32): posquals[idx].append(0) posquals[idx][q - 33] += 1 except KeyboardInterrupt: pass print "Number of reads:\t%s" % total_reads print "" mean, stdev, min_val, pct25, pct50, pct75, max_val = stats_counts(lengths) print "Length distribution" print 'Mean:\t%s' % mean print 'StdDev:\t%s' % stdev print 'Min:\t%s' % min_val print '25 percentile:\t%s' % pct25 print 'Median:\t%s' % pct50 print '75 percentile:\t%s' % pct75 print 'Max:\t%s' % max_val if verbose: print '' for idx, count in enumerate(lengths[::-1]): if count: print "%s\t%s" % (len(lengths) - idx - 1, count) print "Quality distribution" print "pos\tmean\tstdev\tmin\t25pct\t50pct\t75pct\tmax" for pos, quals in enumerate(posquals): if not quals: continue mean, stdev, min_val, pct25, pct50, pct75, max_val = stats_counts( quals) sys.stdout.write('%s\t' % (pos + 1)) 
sys.stdout.write('\t'.join([str(x) for x in stats_counts(quals)])) sys.stdout.write('\n') print "" print "Quality by position" for i, x in enumerate(qualities): q = x / total[i] if q > 33: sys.stdout.write(chr(q)) else: sys.stdout.write('~') if verbose: print '' for i, q in enumerate(qualities): print '[%s] %s' % (i, (q / total[i]) - 33) print ''
#!/usr/bin/env python ## category General ## desc Write out the read names ''' Writes out the read names present in the FASTQ file. ''' import os import sys from fastq_utils import read_fastq def usage(): print __doc__ print "Usage: fastqutils names filename.fastq{.gz}" sys.exit(1) if __name__ == '__main__': fname = None for arg in sys.argv[1:]: if os.path.exists(arg): fname = arg if not fname: usage() for name, seq, qual in read_fastq(fname): sys.stdout.write('%s\n' % name.split()[0][1:])
## category General ## desc Write out the read names ''' Writes out the read names present in the FASTQ file. ''' import os import sys from fastq_utils import read_fastq def usage(): print __doc__ print "Usage: fastqutils names filename.fastq{.gz}" sys.exit(1) if __name__ == '__main__': fname = None for arg in sys.argv[1:]: if os.path.exists(arg): fname = arg if not fname: usage() for name, seq, qual in read_fastq(fname): sys.stdout.write('%s\n' % name.split()[0][1:])
def fastq_stats(fname, verbose=False): cs = is_colorspace_fastq(fname) if cs: print "Space:\tcolorspace" else: print "Space:\tnucleotide" pairs = is_paired_fastq(fname) if pairs > 0: print "Pairing:\tPaired-end (%s)" % pairs else: print "Pairing:\tFragmented" qual_totals = fastq_qualtype(fname) print "Quality scale:\t%s" % qual_totals[-1][1] if verbose: print ' '.join(['(%s,%s)' % (q[1], q[0]) for q in qual_totals]) lengths = [] posquals = [] qualities = [] total = [] total_reads = 0 line = 0 try: for name, seq, qual in read_fastq(fname): if not name[0] == '@': print 'Invalid formatted record [line %s]' % line break if cs: if len(seq) != len(qual) + 1: print 'Seq / qual mismatch [line %s]' % line break else: if len(seq) != len(qual): print 'Seq / qual mismatch [line %s]' % line break line += 4 total_reads += 1 while len(total) < len(qual) + 1: total.append(0) for x in xrange(len(qual) + 1): total[x] += 1 while len(qual) > len(lengths) - 1: lengths.append(0) qualities.append(0) posquals.append([]) lengths[len(qual)] += 1 for idx, q in enumerate([ord(x) for x in qual]): qualities[idx] += q while len(posquals[idx]) < (q - 32): posquals[idx].append(0) posquals[idx][q - 33] += 1 except KeyboardInterrupt: pass print "Number of reads:\t%s" % total_reads print "" mean, stdev, min_val, pct25, pct50, pct75, max_val = stats_counts(lengths) print "Length distribution" print 'Mean:\t%s' % mean print 'StdDev:\t%s' % stdev print 'Min:\t%s' % min_val print '25 percentile:\t%s' % pct25 print 'Median:\t%s' % pct50 print '75 percentile:\t%s' % pct75 print 'Max:\t%s' % max_val if verbose: print '' for idx, count in enumerate(lengths[::-1]): if count: print "%s\t%s" % (len(lengths) - idx - 1, count) print "Quality distribution" print "pos\tmean\tstdev\tmin\t25pct\t50pct\t75pct\tmax" for pos, quals in enumerate(posquals): if not quals: continue mean, stdev, min_val, pct25, pct50, pct75, max_val = stats_counts(quals) sys.stdout.write('%s\t' % (pos + 1)) sys.stdout.write('\t'.join([str(x) 
for x in stats_counts(quals)])) sys.stdout.write('\n') print "" print "Quality by position" for i, x in enumerate(qualities): q = x / total[i] if q > 33: sys.stdout.write(chr(q)) else: sys.stdout.write('~') if verbose: print '' for i, q in enumerate(qualities): print '[%s] %s' % (i, (q / total[i]) - 33) print ''