def load_uniqued(all_quality, uniqued, readlen=None, nticks=20, baseQ=None, count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality, keyed by sequence.

    Each line of the file is expected to hold seven whitespace-separated
    fields: sequence, total count, quality string, comma-separated
    individuals, comma-separated per-individual counts, and two further
    fields that are not used here.

    all_quality   -- mapping updated in place: sequence -> dict with keys
                     'mIDs' (list of individuals), 'sum_quality' (numpy array,
                     count-weighted sum of decoded qualities), 'tot' (summed
                     count) and, when count_by_ind is set, 'count_by_ind'
                     (individual -> count)
    uniqued       -- path of the .uniqued file (opened through smartopen)
    readlen       -- if not None, sequences and qualities are truncated to
                     this length before merging
    nticks        -- approximate number of progress messages written to stderr
    baseQ         -- quality offset; when None it is determined with
                     get_baseQ() from the quality strings in the file
    count_by_ind  -- also accumulate per-individual counts

    Lines that do not split into exactly seven fields are reported on stderr
    and skipped.
    '''
    nreads = get_read_count(uniqued)

    # Scan quality strings until get_baseQ() commits to an offset.
    qfh = smartopen(uniqued)
    try:
        while baseQ is None:
            baseQ = get_baseQ(next(qfh).strip().split()[2])
    finally:
        qfh.close()
    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # Report progress roughly nticks times, but at least every line.
    tickon = max(1, nreads // nticks)

    print >> sys.stderr, '\tloading'
    fh = smartopen(uniqued)
    try:
        for i, line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (
                    i, nreads, (float(i) / nreads) * 100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (
                    i, len(fields), line, fields)
                # Skip the record: carrying on would reuse the previous
                # line's values (or fail on the first line).
                continue
            s, c, qstr, indivstr, indcnt = fields[:5]

            q = numpy.array([ord(ch) - baseQ for ch in qstr])
            c = int(c)
            inds = indivstr.split(',')
            if count_by_ind:
                indcntd = dict(zip(inds, map(int, indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(inds))
                entry['sum_quality'] += q * c
                entry['tot'] += c
                if count_by_ind:
                    # setdefault covers entries first loaded without
                    # per-individual counts.
                    counts = entry.setdefault('count_by_ind', {})
                    for ind, cnt in indcntd.items():
                        counts[ind] = counts.get(ind, 0) + cnt
            else:
                # Build the record explicitly so a plain dict works as well
                # as a defaultdict.
                entry = {
                    'mIDs': list(set(inds)),
                    'sum_quality': q * c,
                    'tot': c,
                }
                if count_by_ind:
                    entry['count_by_ind'] = indcntd
                all_quality[s] = entry
    finally:
        fh.close()
def _count_lines(path):
    '''Return the `wc -l` line count of path, decompressing with zcat when the name ends in "gz".'''
    if path.endswith('gz'):
        # zcat | wc -l without a shell, so odd characters in the filename
        # are never interpreted.
        zcat = Popen(['zcat', path], stdout=PIPE)
        wc = Popen(['wc', '-l'], stdin=zcat.stdout, stdout=PIPE)
        zcat.stdout.close()  # wc now owns the read end of the pipe
        out = wc.communicate()[0]
        zcat.wait()
    else:
        fh = open(path, 'rb')
        try:
            out = Popen(['wc', '-l'], stdin=fh, stdout=PIPE).communicate()[0]
        finally:
            fh.close()
    return int(out.strip())


def uniqued_to_fastq(uniqued, id_prefix=''):
    '''Write the sequences of a .uniqued file as a gzip-named fastq file.

    The output path is remove_ext(uniqued) + '-fromuni.fastq.gz'. Every input
    line becomes one 4-line record whose read name is id_prefix followed by
    the 0-based line index, whose sequence is field 0 and whose quality
    string is field 2 of the line.

    If the output already exists and get_read_count() of it equals the number
    of lines in the input, nothing is rewritten.

    Returns the output path.
    '''
    outname = remove_ext(uniqued) + '-fromuni.fastq.gz'

    # Count input lines only when there is an existing output to compare with.
    if os.path.exists(outname) and get_read_count(outname) == _count_lines(uniqued):
        print >> sys.stderr, 'output %s exists' % outname
        return outname

    print >> sys.stderr, 'convert %s to fastq' % uniqued
    fh = smartopen(uniqued)
    try:
        ofh = smartopen(outname, 'w')
        try:
            for i, l in enumerate(fh):
                fields = l.strip().split()
                ofh.write('@%s%s\n%s\n+\n%s\n' % (id_prefix, i, fields[0], fields[2]))
                if i % 1000 == 0:
                    print >> sys.stderr, '\r\t%s done' % i,
        finally:
            ofh.close()
    finally:
        fh.close()

    print >> sys.stderr, '%s done' % outname
    return outname
def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    '''Rewrite fastq file fq to ofq with the requested layout and quality offset.

    fq         -- input path; its line layout and quality offset are taken
                  from preprocess_radtag_lane.get_fastq_properties()
    ofq        -- output path, opened for writing through smartopen
    out_lnum   -- lines-per-record value passed to as_fq_line
    out_baseQ  -- quality offset passed to as_fq_line
    tickon     -- write a progress message to stderr every this-many reads;
                  0 or None disables progress output
    '''
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

    fh = preprocess_radtag_lane.smartopen(fq)
    try:
        ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
        try:
            for i in xrange(nreads):
                if tickon and i % tickon == 0:
                    print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (
                        i, nreads, (float(i) / nreads) * 100),
                n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                ofh.write(preprocess_radtag_lane.as_fq_line(
                    n, s, qs_to_q(qs, baseQ), out_baseQ, out_lnum))
        finally:
            # Closing matters most here: buffered output is only guaranteed
            # to reach disk once the handle is closed.
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '\n'
def convert_fastq(fq, ofq, out_lnum=4, out_baseQ=33, tickon=10000):
    '''Rewrite fastq file fq to ofq with the requested layout and quality offset.

    fq         -- input path; its line layout and quality offset are taken
                  from preprocess_radtag_lane.get_fastq_properties()
    ofq        -- output path, opened for writing through smartopen
    out_lnum   -- lines-per-record value passed to as_fq_line
    out_baseQ  -- quality offset passed to as_fq_line
    tickon     -- write a progress message to stderr every this-many reads;
                  0 or None disables progress output
    '''
    nreads = preprocess_radtag_lane.get_read_count(fq)
    lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq)

    fh = preprocess_radtag_lane.smartopen(fq)
    try:
        ofh = preprocess_radtag_lane.smartopen(ofq, 'w')
        try:
            for i in xrange(nreads):
                if tickon and i % tickon == 0:
                    print >> sys.stderr, '\r%s / %s (%0.1f%%)' % (
                        i, nreads, (float(i) / nreads) * 100),
                n, s, qs = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
                ofh.write(preprocess_radtag_lane.as_fq_line(
                    n, s, qs_to_q(qs, baseQ), out_baseQ, out_lnum))
        finally:
            # Closing matters most here: buffered output is only guaranteed
            # to reach disk once the handle is closed.
            ofh.close()
    finally:
        fh.close()
    print >> sys.stderr, '\n'
def _count_lines(path):
    '''Return the `wc -l` line count of path, decompressing with zcat when the name ends in "gz".'''
    if path.endswith('gz'):
        # zcat | wc -l without a shell, so odd characters in the filename
        # are never interpreted.
        zcat = Popen(['zcat', path], stdout=PIPE)
        wc = Popen(['wc', '-l'], stdin=zcat.stdout, stdout=PIPE)
        zcat.stdout.close()  # wc now owns the read end of the pipe
        out = wc.communicate()[0]
        zcat.wait()
    else:
        fh = open(path, 'rb')
        try:
            out = Popen(['wc', '-l'], stdin=fh, stdout=PIPE).communicate()[0]
        finally:
            fh.close()
    return int(out.strip())


def uniqued_to_fastq(uniqued, id_prefix=''):
    '''Write the sequences of a .uniqued file as a gzip-named fastq file.

    The output path is remove_ext(uniqued) + '-fromuni.fastq.gz'. Every input
    line becomes one 4-line record whose read name is id_prefix followed by
    the 0-based line index, whose sequence is field 0 and whose quality
    string is field 2 of the line.

    If the output already exists and get_read_count() of it equals the number
    of lines in the input, nothing is rewritten.

    Returns the output path.
    '''
    outname = remove_ext(uniqued) + '-fromuni.fastq.gz'

    # Count input lines only when there is an existing output to compare with.
    if os.path.exists(outname) and get_read_count(outname) == _count_lines(uniqued):
        print >> sys.stderr, 'output %s exists' % outname
        return outname

    print >> sys.stderr, 'convert %s to fastq' % uniqued
    fh = smartopen(uniqued)
    try:
        ofh = smartopen(outname, 'w')
        try:
            for i, l in enumerate(fh):
                fields = l.strip().split()
                ofh.write('@%s%s\n%s\n+\n%s\n' % (id_prefix, i, fields[0], fields[2]))
                if i % 1000 == 0:
                    print >> sys.stderr, '\r\t%s done' % i,
        finally:
            ofh.close()
    finally:
        fh.close()

    print >> sys.stderr, '%s done' % outname
    return outname
# Merge every .uniqued file into the shared all_quality table, keeping
# per-individual counts.
for uniqued in uniqueds:
    load_uniqued(all_quality,uniqued,count_by_ind=True)
print >> sys.stderr, 'LOAD COMPLETE. WRITE BY-SIZE.'
# ofbysize is indexed by size below; each value is used as a uniqued path.
ofbysize = write_uniqued_by_size(all_quality,bysize_dir)
# Drop the reference to the merged table before the mapping steps.
del all_quality
# Touch the marker file named by bysize_done.
ret = os.system('touch %s' % bysize_done)
# Process the by-size files from the largest key down.
sizes = sorted(ofbysize.keys(),reverse=True)
for i in sizes:
    print >> sys.stderr, '\nSTART %s' % i
    uni = ofbysize[i]
    ufq = uniqued_to_fastq(uni)
    nreads = get_read_count(ufq)
    if os.path.exists(denovo_ref):
        # A de novo reference already exists: map against the contaminant
        # reference first, then map that result against the de novo reference.
        dn_len = ref_len(denovo_ref)
        noncontam_ubam = subtractive_map(ufq,contam_fa,stampy=False,readnames_only=False)
        unmapped = subtractive_map(noncontam_ubam,denovo_ref,force_index=True)
    else:
        # No de novo reference yet: only the contaminant mapping is run.
        dn_len = 0
        unmapped = subtractive_map(ufq,contam_fa,stampy=False)
    print >> sys.stderr, '\nGET %s UNMAPPED' % len(unmapped)
    funi = os.path.splitext(uni)[0]+'.filtered.gz'
    # NOTE(review): unmapped entries are converted to int here; presumably
    # they are the numeric read names written by uniqued_to_fastq -- confirm.
    filter_uniqued(uni,funi,map(int,unmapped))
    outdir = os.path.splitext(uni)[0]+'-rtd'
    print >> sys.stderr, '\nRTD'
for paired end, argv: cutsite,fq1,fq2,outfile1,outfile2 ''' import preprocess_radtag_lane import os, sys barcode_len = 5 tick = 10000 #update progress every this-many reads if __name__ == "__main__": if len(sys.argv) == 4: cutsite, fq, outfile = sys.argv[1:] rc = preprocess_radtag_lane.get_read_count(fq) lnum, baseQ = preprocess_radtag_lane.get_fastq_properties(fq) fh = preprocess_radtag_lane.smartopen(fq) ofh = preprocess_radtag_lane.smartopen(outfile, 'w') found = 0 for i in range(rc): if i > 0 and i % tick == 0: print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \ (i,rc,(float(i)/rc)*100,found,(float(found)/i)*100), n, s, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum) if s[barcode_len:barcode_len + len(cutsite)] == cutsite: line = preprocess_radtag_lane.as_fq_line(n, s, q, None, lnum) ofh.write(line) found += 1
# Merge every .uniqued file into the shared all_quality table, keeping
# per-individual counts.
for uniqued in uniqueds:
    load_uniqued(all_quality, uniqued, count_by_ind=True)
print >> sys.stderr, 'LOAD COMPLETE. WRITE BY-SIZE.'
# ofbysize is indexed by size below; each value is used as a uniqued path.
ofbysize = write_uniqued_by_size(all_quality, bysize_dir)
# Drop the reference to the merged table before the mapping steps.
del all_quality
# Touch the marker file named by bysize_done.
ret = os.system('touch %s' % bysize_done)
# Process the by-size files from the largest key down.
sizes = sorted(ofbysize.keys(), reverse=True)
for i in sizes:
    print >> sys.stderr, '\nSTART %s' % i
    uni = ofbysize[i]
    ufq = uniqued_to_fastq(uni)
    nreads = get_read_count(ufq)
    if os.path.exists(denovo_ref):
        # A de novo reference already exists: map against the contaminant
        # reference first, then map that result against the de novo reference.
        dn_len = ref_len(denovo_ref)
        noncontam_ubam = subtractive_map(ufq, contam_fa, stampy=False, readnames_only=False)
        unmapped = subtractive_map(noncontam_ubam, denovo_ref, force_index=True)
    else:
        # No de novo reference yet: only the contaminant mapping is run.
        dn_len = 0
        unmapped = subtractive_map(ufq, contam_fa, stampy=False)
    print >> sys.stderr, '\nGET %s UNMAPPED' % len(unmapped)
def load_uniqued(all_quality, uniqued, readlen=None, nticks=20, baseQ=None, count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality, keyed by sequence.

    Each line of the file is expected to hold seven whitespace-separated
    fields: sequence, total count, quality string, comma-separated
    individuals, comma-separated per-individual counts, and two further
    fields that are not used here.

    all_quality   -- mapping updated in place: sequence -> dict with keys
                     'mIDs' (list of individuals), 'sum_quality' (numpy array,
                     count-weighted sum of decoded qualities), 'tot' (summed
                     count) and, when count_by_ind is set, 'count_by_ind'
                     (individual -> count)
    uniqued       -- path of the .uniqued file (opened through smartopen)
    readlen       -- if not None, sequences and qualities are truncated to
                     this length before merging
    nticks        -- approximate number of progress messages written to stderr
    baseQ         -- quality offset; when None it is determined with
                     get_baseQ() from the quality strings in the file
    count_by_ind  -- also accumulate per-individual counts

    Lines that do not split into exactly seven fields are reported on stderr
    and skipped.
    '''
    nreads = get_read_count(uniqued)

    # Scan quality strings until get_baseQ() commits to an offset.
    qfh = smartopen(uniqued)
    try:
        while baseQ is None:
            baseQ = get_baseQ(next(qfh).strip().split()[2])
    finally:
        qfh.close()
    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    # Report progress roughly nticks times, but at least every line.
    tickon = max(1, nreads // nticks)

    print >> sys.stderr, '\tloading'
    fh = smartopen(uniqued)
    try:
        for i, line in enumerate(fh):
            if i % tickon == 0:
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (
                    i, nreads, (float(i) / nreads) * 100)

            fields = line.strip().split()
            if len(fields) != 7:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (
                    i, len(fields), line, fields)
                # Skip the record: carrying on would reuse the previous
                # line's values (or fail on the first line).
                continue
            s, c, qstr, indivstr, indcnt = fields[:5]

            q = numpy.array([ord(ch) - baseQ for ch in qstr])
            c = int(c)
            inds = indivstr.split(',')
            if count_by_ind:
                indcntd = dict(zip(inds, map(int, indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(inds))
                entry['sum_quality'] += q * c
                entry['tot'] += c
                if count_by_ind:
                    # setdefault covers entries first loaded without
                    # per-individual counts.
                    counts = entry.setdefault('count_by_ind', {})
                    for ind, cnt in indcntd.items():
                        counts[ind] = counts.get(ind, 0) + cnt
            else:
                # Build the record explicitly so a plain dict works as well
                # as a defaultdict.
                entry = {
                    'mIDs': list(set(inds)),
                    'sum_quality': q * c,
                    'tot': c,
                }
                if count_by_ind:
                    entry['count_by_ind'] = indcntd
                all_quality[s] = entry
    finally:
        fh.close()
from editdist import distance
from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count

idx_bp = 5     # bases trimmed from the end of each read
cut_bp = 5     # bases trimmed from the start of each read
lnum = 4       # lines per fastq record
min_seqs = 7   # minimum mean per-individual count for a sequence to be kept

# Usage: <script> uniqued fastq
uniqued, fastq = sys.argv[1:]

# Read length is taken from the first record of the fastq file.
probe_fh = smartopen(fastq)
try:
    readlen = len(next_read_from_fh(probe_fh, lnum)[1])
finally:
    probe_fh.close()
print >> sys.stderr, 'readlen: %s' % readlen

num_reads = get_read_count(fastq, 4)
# Never let the progress interval reach 0 for inputs under 200 reads.
tickon = max(1, num_reads // 200)

# Collect the trimmed sequences whose mean per-individual count (field 4 of
# the uniqued line) reaches min_seqs.
s = ''  # keeps the summary message below valid for an empty uniqued file
useq_set = set()
ufh = smartopen(uniqued)  # smartopen, as used for uniqued files elsewhere
try:
    for l in ufh:
        fields = l.strip().split()
        s, cntstr = fields[0], fields[4]
        cnt = numpy.mean([int(i) for i in cntstr.split(',')])
        if cnt >= min_seqs:
            useq_set.add(s[cut_bp:readlen - idx_bp])
finally:
    ufh.close()
useqs = list(useq_set)

print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (
    len(useqs), len(s[cut_bp:readlen - idx_bp]))
fh = smartopen(fastq)
from editdist import distance from preprocess_radtag_lane import next_read_from_fh, smartopen, get_read_count idx_bp = 5 cut_bp = 5 lnum = 4 min_seqs = 7 uniqued, fastq = sys.argv[1:] readlen = len(next_read_from_fh(smartopen(fastq),4)[1]) print >> sys.stderr, 'readlen: %s' % readlen num_reads = get_read_count(fastq,4) tickon = num_reads/200 useqs = [] for l in open(uniqued): s,cntstr = l.strip().split()[0], l.strip().split()[4] cnt = numpy.mean([int(i) for i in cntstr.split(',')]) if cnt >= min_seqs: useqs.append(s[cut_bp:readlen-idx_bp]) useqs = list(set(useqs)) print >> sys.stderr, '%s unique %sbp sequences in uniqued file' % (len(useqs),len(s[cut_bp:readlen-idx_bp])) fh = smartopen(fastq) for i in range(num_reads):
# Usage: <script> fastq [start:end]
# With no bounds argument the whole read is used ("0:").
if len(sys.argv) == 2:
    fq = sys.argv[1]
    boundstr = "0:"
else:
    fq, boundstr = sys.argv[1:]

start, end = boundstr.split(':')
start = int(start)
lnum, baseQ, readlen = get_fastq_properties(fq)
# An empty end bound means "through the end of the read"; an explicit one is
# converted to int so both bounds have the same type.
end = readlen if end == '' else int(end)

readcount = preprocess_radtag_lane.get_read_count(fq)
qsc_n = 0                        # number of reads accumulated
qsc_tot = numpy.zeros(readlen)   # per-position sum of quality scores
qsc_by_read = []

fh = smartopen(fq)
# Progress interval; at least 1 so files with fewer than 1000 reads do not
# cause a modulo-by-zero below.
tickon = max(1, readcount // 1000)
for i in range(readcount):
    if i % tickon == 0:
        print >> sys.stderr, '\r%0.1f' % ((i / float(readcount)) * 100),
    t, r, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
    qsc = [ord(c) - baseQ for c in q]
    qsc_n += 1
    qsc_tot += qsc
# Usage: <script> fastq [start:end]
# With no bounds argument the whole read is used ("0:").
if len(sys.argv) == 2:
    fq = sys.argv[1]
    boundstr = "0:"
else:
    fq, boundstr = sys.argv[1:]

start, end = boundstr.split(':')
start = int(start)
lnum, baseQ, readlen = get_fastq_properties(fq)
# An empty end bound means "through the end of the read"; an explicit one is
# converted to int so both bounds have the same type.
end = readlen if end == '' else int(end)

readcount = preprocess_radtag_lane.get_read_count(fq)
qsc_n = 0                        # number of reads accumulated
qsc_tot = numpy.zeros(readlen)   # per-position sum of quality scores
qsc_by_read = []

fh = smartopen(fq)
# Progress interval; at least 1 so files with fewer than 1000 reads do not
# cause a modulo-by-zero below.
tickon = max(1, readcount // 1000)
for i in range(readcount):
    if i % tickon == 0:
        print >> sys.stderr, '\r%0.1f' % ((i / float(readcount)) * 100),
    t, r, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
    qsc = [ord(c) - baseQ for c in q]
    qsc_n += 1
    qsc_tot += qsc
for paired end, argv: cutsite,fq1,fq2,outfile1,outfile2 ''' import preprocess_radtag_lane import os,sys barcode_len = 5 tick = 10000 #update progress every this-many reads if __name__ == "__main__": if len(sys.argv) == 4: cutsite,fq,outfile = sys.argv[1:] rc = preprocess_radtag_lane.get_read_count(fq) lnum,baseQ = preprocess_radtag_lane.get_fastq_properties(fq) fh = preprocess_radtag_lane.smartopen(fq) ofh = preprocess_radtag_lane.smartopen(outfile,'w') found = 0 for i in range(rc): if i>0 and i % tick == 0: print >> sys.stderr, '\r%s / %s (%0.1f%%) found %s (%0.1f%%)' % \ (i,rc,(float(i)/rc)*100,found,(float(found)/i)*100), n,s,q = preprocess_radtag_lane.next_read_from_fh(fh,lnum) if s[barcode_len:barcode_len+len(cutsite)] == cutsite: line = preprocess_radtag_lane.as_fq_line(n,s,q,None,lnum) ofh.write(line) found += 1