def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None,count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality, keyed by sequence.

    given a .uniqued file produced by preprocess_radtag_lane.py,
    loads data into all_quality, ensuring sequences remain unique.

    Each line must hold 7 whitespace-separated fields:
    sequence, count, quality string, comma-separated individuals,
    comma-separated per-individual counts, r2, r2cnt
    (the last two are parsed but not used here).

    all_quality  -- mapping sequence -> dict, updated in place with keys
                    'mIDs' (list of individuals), 'sum_quality' (numpy array of
                    count-weighted quality scores), 'tot' (summed count) and,
                    when count_by_ind is set, 'count_by_ind' (individual -> count).
                    NOTE(review): new entries are created by indexing
                    all_quality[s], so this is presumably a defaultdict(dict)
                    -- confirm against callers.
    uniqued      -- path of the .uniqued file (opened with smartopen)
    readlen      -- if not None, sequence and qualities are truncated to this
                    length before merging
    nticks       -- approximate number of progress messages written to stderr
    baseQ        -- quality offset; if None it is detected with get_baseQ from
                    the first line(s) of the file
    count_by_ind -- also accumulate per-individual counts

    Lines that do not split into exactly 7 fields are reported on stderr
    and skipped.  Returns None.  Raises StopIteration if baseQ is None and
    the file ends before get_baseQ can determine it.
    '''

    nreads = get_read_count(uniqued)

    if baseQ is None:
        qfh = smartopen(uniqued)
        try:
            while baseQ is None:
                line = qfh.next()
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            # release the handle even if detection fails
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    tickon = nreads/nticks
    if tickon < 1:
        tickon = 1

    print >> sys.stderr, '\tloading'

    fh = smartopen(uniqued)
    try:
        for i,line in enumerate(fh):
            if i % tickon == 0:
                # guard against a zero read count when computing the percentage
                if nreads:
                    pct = (float(i)/nreads)*100
                else:
                    pct = 0
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i,nreads,pct)

            fields = line.strip().split()
            try:
                s,c,qstr,indivstr,indcnt,r2,r2cnt = fields
            except ValueError:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (i,len(fields),line,fields)
                # skip the malformed line; falling through would reuse the
                # fields of the previous line (or fail on the first line)
                continue

            q = numpy.array([ord(ch)-baseQ for ch in qstr])
            c = int(c)
            individuals = indivstr.split(',')
            indiv = set(individuals)
            if count_by_ind:
                indcntd = dict(zip(individuals,map(int,indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(indiv))
                entry['sum_quality'] += q*c
                entry['tot'] += c
                if count_by_ind:
                    counts = entry['count_by_ind']
                    for ind,cnt in indcntd.items():
                        counts[ind] = counts.get(ind,0) + cnt
            else:
                # relies on all_quality supplying a fresh dict for a new key
                entry = all_quality[s]
                entry['mIDs'] = list(indiv)
                entry['sum_quality'] = q*c
                entry['tot'] = c
                if count_by_ind:
                    entry['count_by_ind'] = indcntd
    finally:
        fh.close()
def load_uniqued(all_quality,uniqued,readlen=None,nticks=20,baseQ=None):
    '''Merge the records of a .uniqued file into all_quality, keyed by sequence.

    given a .uniqued file produced by preprocess_radtag_lane.py,
    loads data into all_quality, ensuring sequences remain unique.

    Each line must hold 7 whitespace-separated fields:
    sequence, count, quality string, comma-separated individuals,
    per-individual counts, r2, r2cnt (the last three are parsed but not
    used here).

    all_quality -- mapping sequence -> dict, updated in place with keys
                   'mIDs' (list of individuals), 'sum_quality' (numpy array of
                   count-weighted quality scores) and 'tot' (summed count).
                   NOTE(review): new entries are created by indexing
                   all_quality[s], so this is presumably a defaultdict(dict)
                   -- confirm against callers.
    uniqued     -- path of the .uniqued file
    readlen     -- if not None, sequence and qualities are truncated to this
                   length before merging
    nticks      -- approximate number of progress messages written to stderr
    baseQ       -- quality offset; if None it is detected with get_baseQ from
                   the first line(s) of the file

    Lines that do not split into exactly 7 fields are reported on stderr
    and skipped.  Returns None.  Raises StopIteration if baseQ is None and
    the file ends before get_baseQ can determine it.
    '''

    print >> sys.stderr, '%s readcount: ' % (uniqued),
    #number of sequences
    # argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to wc verbatim
    wc_out = Popen(['wc','-l',uniqued],stdout=PIPE).communicate()[0]
    nreads = int(wc_out.split()[0])
    print >> sys.stderr, nreads

    if baseQ is None:
        qfh = open(uniqued)
        try:
            while baseQ is None:
                line = qfh.next()
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            # release the handle even if detection fails
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    tickon = nreads/nticks
    if tickon < 1:
        tickon = 1

    print >> sys.stderr, '\tloading'

    fh = open(uniqued)
    try:
        for i,line in enumerate(fh):
            if i % tickon == 0:
                # wc -l counts newline characters, so a single line without a
                # trailing newline yields nreads == 0; avoid dividing by it
                if nreads:
                    pct = (float(i)/nreads)*100
                else:
                    pct = 0
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i,nreads,pct)

            fields = line.strip().split()
            try:
                s,c,qstr,indivstr,indcnt,r2,r2cnt = fields
            except ValueError:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (i,len(fields),line,fields)
                # skip the malformed line; falling through would reuse the
                # fields of the previous line (or fail on the first line)
                continue

            q = numpy.array([ord(ch)-baseQ for ch in qstr])
            c = int(c)
            indiv = set(indivstr.split(','))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(indiv))
                entry['sum_quality'] += q*c
                entry['tot'] += c
            else:
                # relies on all_quality supplying a fresh dict for a new key
                entry = all_quality[s]
                entry['mIDs'] = list(indiv)
                entry['sum_quality'] = q*c
                entry['tot'] = c
    finally:
        fh.close()
def load_lines_from_uniqued(source_uniques, rv_sort=True, sort_key=lambda x: (len(x[0]), int(x[1])), keep_source_id=False):
    '''Load, normalise and sort the lines of one or more .uniqued files.

    source_uniques -- iterable of file paths
    rv_sort        -- passed as reverse= to the final sort
    sort_key       -- key function applied to a parsed line (list of
                      whitespace-split fields); the default sorts on
                      (length of field 0, integer value of field 1)
    keep_source_id -- see return value

    For each file the quality base is taken from the first line for which
    get_baseQ (applied to field 2) returns a value; if it is 64, field 2
    of every line in that file is re-encoded to base 33.
    Blank lines are skipped.

    if keep_source_id is True
    returns list of 2-tuples (parsed_line, uniqued_id), where uniqued_id is
    the file's basename without extension
    (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued")
    else returns list of parsed lines.
    '''

    uniquedlines = []
    for f in source_uniques:
        print >> sys.stderr, 'load %s ...' % f,
        fh = open(f)
        try:
            lines = []
            for raw in fh:
                fields = raw.strip().split()
                # a blank line has no quality field and cannot be sorted
                if fields:
                    lines.append(fields)
        finally:
            fh.close()
        print >> sys.stderr, '%s lines' % len(lines)

        #get qual base
        baseQ = None
        for l in lines:
            baseQ = get_baseQ(l[2])
            if baseQ is not None:
                break
        print >> sys.stderr, 'qual base: %s' % baseQ

        if baseQ == 64:
            print >> sys.stderr, 'Translate quality encoding to base 33 ...',
            for l in lines:
                l[2] = ''.join([chr(ord(c) - 64 + 33) for c in l[2]])
            print >> sys.stderr, 'done'

        if keep_source_id:
            uniqued_id = os.path.basename(os.path.splitext(f)[0])
            uniquedlines.extend(zip(lines, [uniqued_id] * len(lines)))
        else:
            uniquedlines.extend(lines)

    print >> sys.stderr, 'sort',
    if keep_source_id:
        # elements are (parsed_line, uniqued_id); sort on the parsed line only
        uniquedlines.sort(reverse=rv_sort, key=lambda x: sort_key(x[0]))
    else:
        uniquedlines.sort(reverse=rv_sort, key=sort_key)
    print >> sys.stderr, 'done'

    return uniquedlines
def load_lines_from_uniqued(source_uniques,rv_sort = True, sort_key = lambda x: (len(x[0]),int(x[1])), keep_source_id = False):
    '''Load, normalise and sort the lines of one or more .uniqued files.

    source_uniques -- iterable of file paths
    rv_sort        -- passed as reverse= to the final sort
    sort_key       -- key function applied to a parsed line (list of
                      whitespace-split fields); the default sorts on
                      (length of field 0, integer value of field 1)
    keep_source_id -- see return value

    For each file the quality base is taken from the first line for which
    get_baseQ (applied to field 2) returns a value; if it is 64, field 2
    of every line in that file is re-encoded to base 33.
    Blank lines are skipped.

    if keep_source_id is True
    returns list of 2-tuples (parsed_line, uniqued_id), where uniqued_id is
    the file's basename without extension
    (eg 100617_lane6_PE for "data/100617/100617_lane6_PE.uniqued")
    else returns list of parsed lines.
    '''

    uniquedlines = []
    for f in source_uniques:
        print >> sys.stderr, 'load %s ...' % f,
        fh = open(f)
        try:
            lines = []
            for raw in fh:
                fields = raw.strip().split()
                # a blank line has no quality field and cannot be sorted
                if fields:
                    lines.append(fields)
        finally:
            fh.close()
        print >> sys.stderr, '%s lines' % len(lines)

        #get qual base
        baseQ = None
        for l in lines:
            baseQ = get_baseQ(l[2])
            if baseQ is not None:
                break
        print >> sys.stderr, 'qual base: %s' % baseQ

        if baseQ == 64:
            print >> sys.stderr, 'Translate quality encoding to base 33 ...',
            for l in lines:
                l[2] = ''.join([chr(ord(c)-64+33) for c in l[2]])
            print >> sys.stderr, 'done'

        if keep_source_id:
            uniqued_id = os.path.basename(os.path.splitext(f)[0])
            uniquedlines.extend( zip( lines,[uniqued_id]*len(lines) ) )
        else:
            uniquedlines.extend(lines)

    print >> sys.stderr, 'sort',
    if keep_source_id:
        # elements are (parsed_line, uniqued_id); sort on the parsed line only
        uniquedlines.sort(reverse = rv_sort,key = lambda x: sort_key(x[0]))
    else:
        uniquedlines.sort(reverse = rv_sort,key = sort_key)
    print >> sys.stderr, 'done'

    return uniquedlines
def get_fastq_properties(fq):
    '''Inspect the start of a fastq file and report its basic format properties.

    fq -- path of the file (opened with smartopen)

    Returns (lnum, baseQ, readlen):
    lnum    -- 4 if the file starts with '@', otherwise 1; passed on to
               preprocess_radtag_lane.next_read_from_fh
               (presumably the number of lines per read -- confirm)
    baseQ   -- first non-None value returned by
               preprocess_radtag_lane.get_baseQ for a read's quality string
    readlen -- length of the read at which baseQ was determined

    The three values are also written to stderr.
    '''
    # peek at the first character, closing the probe handle afterwards
    probe = smartopen(fq)
    try:
        first_char = probe.read(1)
    finally:
        probe.close()

    if first_char == '@':
        lnum = 4
    else:
        lnum = 1
    print >> sys.stderr, 'fastq format lnum: %s' % lnum

    baseQ = None
    qfh = smartopen(fq)
    try:
        # keep reading until a quality string pins down the base
        while baseQ is None:
            t, r, q = preprocess_radtag_lane.next_read_from_fh(qfh, lnum)
            baseQ = preprocess_radtag_lane.get_baseQ(q)
    finally:
        # release the handle even if the reader raises
        qfh.close()
    print >> sys.stderr, 'fastq format baseQ: %s' % baseQ

    readlen = len(r)
    print >> sys.stderr, 'fastq format readlen: %s' % readlen

    return lnum, baseQ, readlen
def get_fastq_properties(fq):
    '''Inspect the start of a fastq file and report its basic format properties.

    fq -- path of the file (opened with smartopen)

    Returns (lnum,baseQ,readlen):
    lnum    -- 4 if the file starts with '@', otherwise 1; passed on to
               preprocess_radtag_lane.next_read_from_fh
               (presumably the number of lines per read -- confirm)
    baseQ   -- first non-None value returned by
               preprocess_radtag_lane.get_baseQ for a read's quality string
    readlen -- length of the read at which baseQ was determined

    The three values are also written to stderr.
    '''
    # peek at the first character, closing the probe handle afterwards
    probe = smartopen(fq)
    try:
        first_char = probe.read(1)
    finally:
        probe.close()

    if first_char == '@':
        lnum = 4
    else:
        lnum = 1
    print >> sys.stderr, 'fastq format lnum: %s' % lnum

    baseQ = None
    qfh = smartopen(fq)
    try:
        # keep reading until a quality string pins down the base
        while baseQ is None:
            t,r,q = preprocess_radtag_lane.next_read_from_fh(qfh,lnum)
            baseQ = preprocess_radtag_lane.get_baseQ(q)
    finally:
        # release the handle even if the reader raises
        qfh.close()
    print >> sys.stderr, 'fastq format baseQ: %s' % baseQ

    readlen = len(r)
    print >> sys.stderr, 'fastq format readlen: %s' % readlen

    return lnum,baseQ,readlen
def load_uniqued(all_quality, uniqued, readlen=None, nticks=20, baseQ=None, count_by_ind=False):
    '''Merge the records of a .uniqued file into all_quality, keyed by sequence.

    given a .uniqued file produced by preprocess_radtag_lane.py,
    loads data into all_quality, ensuring sequences remain unique.

    Each line must hold 7 whitespace-separated fields:
    sequence, count, quality string, comma-separated individuals,
    comma-separated per-individual counts, r2, r2cnt
    (the last two are parsed but not used here).

    all_quality  -- mapping sequence -> dict, updated in place with keys
                    'mIDs' (list of individuals), 'sum_quality' (numpy array of
                    count-weighted quality scores), 'tot' (summed count) and,
                    when count_by_ind is set, 'count_by_ind' (individual -> count).
                    NOTE(review): new entries are created by indexing
                    all_quality[s], so this is presumably a defaultdict(dict)
                    -- confirm against callers.
    uniqued      -- path of the .uniqued file (opened with smartopen)
    readlen      -- if not None, sequence and qualities are truncated to this
                    length before merging
    nticks       -- approximate number of progress messages written to stderr
    baseQ        -- quality offset; if None it is detected with get_baseQ from
                    the first line(s) of the file
    count_by_ind -- also accumulate per-individual counts

    Lines that do not split into exactly 7 fields are reported on stderr
    and skipped.  Returns None.  Raises StopIteration if baseQ is None and
    the file ends before get_baseQ can determine it.
    '''

    nreads = get_read_count(uniqued)

    if baseQ is None:
        qfh = smartopen(uniqued)
        try:
            while baseQ is None:
                line = qfh.next()
                qstr = line.strip().split()[2]
                baseQ = get_baseQ(qstr)
        finally:
            # release the handle even if detection fails
            qfh.close()

    print >> sys.stderr, 'uniqued qualities base %s' % (baseQ)

    tickon = nreads / nticks
    if tickon < 1:
        tickon = 1

    print >> sys.stderr, '\tloading'

    fh = smartopen(uniqued)
    try:
        for i, line in enumerate(fh):
            if i % tickon == 0:
                # guard against a zero read count when computing the percentage
                if nreads:
                    pct = (float(i) / nreads) * 100
                else:
                    pct = 0
                print >> sys.stderr, '\t\t%s / %s (%d%%)' % (i, nreads, pct)

            fields = line.strip().split()
            try:
                s, c, qstr, indivstr, indcnt, r2, r2cnt = fields
            except ValueError:
                print >> sys.stderr, 'line %s split: incorrect element number (%s) line:\n%ssplit:\n%s\n' % (
                    i, len(fields), line, fields)
                # skip the malformed line; falling through would reuse the
                # fields of the previous line (or fail on the first line)
                continue

            q = numpy.array([ord(ch) - baseQ for ch in qstr])
            c = int(c)
            individuals = indivstr.split(',')
            indiv = set(individuals)
            if count_by_ind:
                indcntd = dict(
                    zip(individuals, map(int, indcnt.split(','))))

            if readlen is not None:
                s = s[:readlen]
                q = q[:readlen]

            if s in all_quality:
                entry = all_quality[s]
                entry['mIDs'] = list(set(entry['mIDs']).union(indiv))
                entry['sum_quality'] += q * c
                entry['tot'] += c
                if count_by_ind:
                    counts = entry['count_by_ind']
                    for ind, cnt in indcntd.items():
                        counts[ind] = counts.get(ind, 0) + cnt
            else:
                # relies on all_quality supplying a fresh dict for a new key
                entry = all_quality[s]
                entry['mIDs'] = list(indiv)
                entry['sum_quality'] = q * c
                entry['tot'] = c
                if count_by_ind:
                    entry['count_by_ind'] = indcntd
    finally:
        fh.close()