def check_merged_reads(merge_d, source_d):
    """Compare read counts of merged lane files against their source fastqs.

    For every (index sequence, index) pair in ``idx_lookup``, each read
    number (1, 2) and each lane (1-8), the matching per-lane source files
    under ``source_d`` are globbed and their read counts summed.  Only when
    at least one source file matches is the corresponding merged file under
    ``merge_d`` examined.

    Returns a dict keyed by merged-file path.  The value is True/False for
    whether the summed source read count equals the merged file's read
    count, or None when the merged file does not exist.
    """
    results = {}
    for idx_seq, idx in idx_lookup.items():
        for read_num in [1, 2]:
            for lane in range(1, 9):
                source_glob = '%s/*_%s_L00%s_R%s*.fastq.gz' % (
                    source_d, idx_seq, lane, read_num)
                sources = glob(source_glob)
                if not sources:
                    # Nothing was sequenced for this index/lane/read combination.
                    continue

                source_sum = sum(
                    preprocess_radtag_lane.get_read_count(src) for src in sources)
                merge_f = '%s/s_%s_%s_sequence_index%s.txt.gz' % (
                    merge_d, lane, read_num, idx)

                if not os.path.exists(merge_f):
                    # Sources exist but the merged file is missing.
                    results[merge_f] = None
                    continue

                merge_sum = preprocess_radtag_lane.get_read_count(merge_f)
                results[merge_f] = (source_sum == merge_sum)
    return results
#!/usr/bin/env python import Seq, os,sys from radtag_denovo import preprocess_radtag_lane from Util import smartopen def join_pair(r1,r2,num_n=10,qual_n='#'): return [r1[0],r1[1]+'N'*num_n+str(Seq.Sequence(r2[1]).rc()),r1[2]+qual_n*num_n+''.join(reversed(r2[2]))] if __name__ == "__main__": f1,f2 = sys.argv[1:] fh1 = smartopen(f1) fh2 = smartopen(f2) rc = preprocess_radtag_lane.get_read_count(f1) for i in xrange(rc): if i % 1000 == 0: print >> sys.stderr, '\r%s / %s' % (i,rc), r1 = preprocess_radtag_lane.next_read_from_fh(fh1,4) r2 = preprocess_radtag_lane.next_read_from_fh(fh2,4) print preprocess_radtag_lane.as_fq4_lines(*join_pair(r1,r2)) print >> sys.stderr, '\ndone'
# Command line: <fastq> [start:end].  When the bounds are omitted they default
# to "0:", i.e. from the first base through the end of the read.
if len(sys.argv) == 2:
    fq = sys.argv[1]
    boundstr = "0:"
else:
    fq, boundstr = sys.argv[1:]

start, end = boundstr.split(':')
start = int(start)
lnum, baseQ, readlen = get_fastq_properties(fq)
# An empty end bound means "up to the read length".  An explicit end bound is
# converted to int just like start, so both bounds are always integers
# (previously an explicit end was left as a string).
if end == '':
    end = readlen
else:
    end = int(end)

readcount = preprocess_radtag_lane.get_read_count(fq)

qsc_n = 0                        # number of reads accumulated so far
qsc_tot = numpy.zeros(readlen)   # per-position sum of quality scores
qsc_by_read = []

fh = smartopen(fq)
# Report progress about every 0.1% of the reads.  The interval is clamped to
# at least 1: with fewer than 1000 reads the integer division gives 0 and
# "i % tickon" would raise ZeroDivisionError.
tickon = max(1, readcount // 1000)
# xrange avoids materialising a list with one int per read.
for i in xrange(readcount):
    if i % tickon == 0:
        print >> sys.stderr, '\r%0.1f' % ((i/float(readcount)) * 100),
    t, r, q = preprocess_radtag_lane.next_read_from_fh(fh, lnum)
    # Convert each quality character to a score by subtracting baseQ.
    qsc = [ord(c)-baseQ for c in q]
    qsc_n += 1
    qsc_tot += qsc
# NOTE(review): this is a fragment -- the enclosing function/loop and the `if`
# that the leading `raise OSError` / `else:` belong to are outside the visible
# source, so the exact nesting of the statements below cannot be confirmed.
# What the visible statements do:
#   * When opts.check_donefiles is set and `donefile` exists, the stale marker
#     is removed with `rm -f`; a non-zero exit status raises OSError('FAILED').
#   * When `readfile` is a tuple it is unpacked as (r1, r2); the read counts
#     of both files must match, otherwise ValueError is raised.
#   * num_parts = (readct1*2)/opts.reads_per_part, raised to 1 if it is < 1.
#   * For each part i in 1..num_parts a stampy.py command using
#     `--processpart i/num_parts` is built, wrapped in run_safe.py with a
#     `<sampart>.done` argument.  The command is appended to `cmds` and stored
#     in `cmd_by_sam[sampart]`; the part's SAM path is appended to
#     `samparts_by_bam[rg_ref_bam]`.
raise OSError else: if opts.check_donefiles: if os.path.exists(donefile): print >> sys.stderr, '.done file for bam %s found, but bam is missing; remove %s ...' % (rg_ref_bam,donefile), ret = os.system('rm -f %s' % donefile) if ret == 0: print >> sys.stderr, 'DONE' else: raise OSError, 'FAILED' if isinstance(readfile,tuple): r1,r2 = readfile readct1 = preprocess_radtag_lane.get_read_count(r1) readct2 = preprocess_radtag_lane.get_read_count(r2) if readct1 != readct2: raise ValueError, 'read counts do not match, abort' num_parts = (readct1*2)/opts.reads_per_part if num_parts < 1: num_parts = 1 print >> sys.stderr, 'map in %s part(s)' % num_parts samparts_by_bam[rg_ref_bam] = [] for i in xrange(1,num_parts+1): sampart = samfbase+'_%05dof%05d.sam' % (i,num_parts) samparts_by_bam[rg_ref_bam].append(sampart) cmdstr = 'run_safe.py \"module load %s; stampy.py -g %s -h %s --gatkcigarworkaround --overwrite --readgroup=%s --processpart %s/%s -o %s %s -M %s %s\" %s.done' % (stampy_module,t,t,readgroup_arg,i,num_parts,sampart,stampy_argstr,r1,r2,sampart) cmds.append(cmdstr) cmd_by_sam[sampart] = cmdstr
#!/usr/bin/env python '''calculate the percent of reads in a lane properly resolved by barcode ''' import os,sys from radtag_denovo import preprocess_radtag_lane from glob import glob fastq,analysis_folder = sys.argv[1:] tot_reads = preprocess_radtag_lane.get_read_count(fastq) indiv_barcoded = {} fqs = glob('%s/*1_sequence.33.fq4*' % analysis_folder) for fq in fqs: indiv_barcoded[fq] = preprocess_radtag_lane.get_read_count(fq) print float(sum(indiv_barcoded.values()))/tot_reads