def construct_header_from_reference_fasta(ref_fasta_filename):
    g = FastaData(open(ref_fasta_filename).read())
    chrs = {}
    for name in sorted(g.keys()):
        chrs[name] = len(g[name])
        sys.stderr.write(name + " is there at length " + str(len(g[name])) + "\n")
    header = ''
    header += "@HD\tVN:1.0\tSO:coordinate\n"
    for chr in sorted(chrs):
        header += "@SQ\tSN:" + chr + "\tLN:" + str(chrs[chr]) + "\n"
    header += "@PG\tID:SamBasics.py\tVN:1.0\n"
    return header
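# Hypothetical usage of the header builder above; the FASTA file name is
# made up for illustration. Each header field is tab-delimited:
#
#   header = construct_header_from_reference_fasta('genome.fa')
#   sys.stdout.write(header)
#
# This emits something like (e.g. for a human reference):
#   @HD  VN:1.0  SO:coordinate
#   @SQ  SN:chr1  LN:248956422
#   @PG  ID:SamBasics.py  VN:1.0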
def main():
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-r', '--reference', help="reference genome FASTA")
    parser.add_argument('--no_qual', action='store_true',
                        help="don't output quality values")
    args = parser.parse_args()
    ref = {}
    if args.reference:
        ref = FastaData(open(args.reference, 'rb').read())
    if args.input == '-':
        args.input = sys.stdin
    else:
        args.input = open(args.input)
    # SAM header fields must be tab-delimited
    h1 = '@HD\tVN:1.0\tSO:unsorted'
    h2 = '@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:' + ' '.join(sys.argv)
    print h1
    print h2
    if ref:
        for chr in sorted(ref.keys()):
            print "@SQ\tSN:" + chr + "\tLN:" + str(len(ref[chr]))
    inf = FastqHandle(args.input)
    for e in inf:
        o = ''
        o += e.name + "\t"
        o += "4\t"   # FLAG: unmapped
        o += "*\t"   # RNAME
        o += "0\t"   # POS
        o += "0\t"   # MAPQ
        o += "*\t"   # CIGAR
        o += "*\t"   # RNEXT
        o += "0\t"   # PNEXT
        o += "0\t"   # TLEN
        o += e.seq + "\t"
        if args.no_qual:
            o += "*\t"
        else:
            o += e.qual + "\t"
        o += "XO:Z:NM"
        print o
def main():
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN or specify a BAM file")
    parser.add_argument('-r', '--reference', help="Reference fasta", required=True)
    args = parser.parse_args()
    ref = None
    if args.reference:
        ref = FastaData(open(args.reference, 'rb').read())
    if args.input == '-':
        args.input = SamStream(sys.stdin, reference=ref)
    else:
        args.input = BAMFile(args.input, reference=ref)
    for e in args.input:
        if e.is_aligned():
            print e.get_PSL()
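# Hypothetical command lines for the SAM/BAM-to-PSL converter above; the
# script and file names are assumptions, not taken from the source:
#
#   sam_to_psl.py -r reference.fa alignments.bam > alignments.psl
#   samtools view -h alignments.bam | sam_to_psl.py -r reference.fa - > alignments.psl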
def main(): parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed") parser.add_argument('-o','--output',help='FILENAME is output') parser.add_argument('--min_intron_size',type=int,default=68,help='minimum intron size') parser.add_argument('infile',help='FILENAME of sam file or "-" for STDIN') parser.add_argument('reference_genome',help='FILENAME of the reference genome') args = parser.parse_args() # get our reference genome sys.stderr.write("reading reference genome\n") #g = SequenceBasics.read_fasta_into_hash(args.reference_genome) g = FastaData(open(args.reference_genome).read()) sys.stderr.write("finished reading reference genome\n") inf = sys.stdin read_mapping_count = {} junctions = {} if args.infile != '-': inf = open(args.infile) sys.stderr.write("reading through sam file\n") zall = 0 zn = 0 while True: line = inf.readline() if not line: break line = line.rstrip() if SamBasics.is_header(line): continue d = SamBasics.sam_line_to_dictionary(line) chrom = d['rname'] if chrom =='*': continue if chrom not in g.keys(): sys.stderr.write("WARNING: "+chrom+" not in reference, skipping\n") continue mate = 'U' if SamBasics.check_flag(d['flag'],int('0x4',16)): #check if its unmapped continue # we can ignore the unmapped things for now if SamBasics.check_flag(d['flag'],int('0x40',16)): mate = 'L' elif SamBasics.check_flag(d['flag'],int('0x80',16)): mate = 'R' actual_read = d['qname']+"\t"+mate if actual_read not in read_mapping_count: read_mapping_count[actual_read] = 0 read_mapping_count[actual_read] += 1 has_intron = 0 start_loc = d['pos'] current_loc = start_loc bounds = [] for i in range(0,len(d['cigar_array'])): ce = d['cigar_array'][i] if ce['op'] == 'N' and ce['val'] >= args.min_intron_size: has_intron = 1 lbound = current_loc # should be the intron start base index-1 current_loc += ce['val'] rbound = current_loc # should be the second exon start base index-1 right_size = d['cigar_array'][i+1]['val'] bounds.append([lbound,rbound,right_size]) elif ce['op'] == 'D': current_loc += ce['val'] elif re.match('[=XMSHP]',ce['op']): current_loc += ce['val'] if has_intron == 0: continue # there are no splices to report here #print actual_read #print d['cigar'] #print d #print start_loc #print bounds for bound in bounds: zall += 1 intronflank = g[chrom][bound[0]-1:bound[0]+1].upper() + '-' + \ g[chrom][bound[1]-3:bound[1]-1].upper() strand = '' if is_canon(intronflank): # its a positive strand strand = '+' elif is_revcanon(intronflank): # its a negative strand strand = '-' else: # We can't deal with the non-canonical splice sorry zn += 1 sys.stderr.write("WARNING skipping non-canonical splice ("+str(zn)+"/"+str(zall)+")\r") continue # If we are still in we have successfully found a splice out_chrom = chrom out_start = bound[0]-51 out_end = bound[1]+49 out_name = '*' # this will be done later out_score = 50 out_strand = strand out_thickStart = out_start out_thickEnd = out_end out_rgb = '0,0,0' out_block_count = 2 out_block_sizes = '50,50' out_block_starts = '0,'+str(bound[1]-bound[0]+50) bed = [] bed.append(out_chrom) bed.append(str(out_start)) bed.append(str(out_end)) bed.append(out_name) bed.append(str(out_score)) bed.append(out_strand) bed.append(str(out_thickStart)) bed.append(str(out_thickEnd)) bed.append(out_rgb) bed.append(str(out_block_count)) bed.append(out_block_sizes) bed.append(out_block_starts) entry = "\t".join(bed) if entry not in junctions: junctions[entry] = {} junctions[entry]['reads'] = set() 
junctions[entry]['positions'] = set() junctions[entry]['right_sizes'] = set() junctions[entry]['reads'].add(actual_read) junctions[entry]['positions'].add(d['pos']) junctions[entry]['right_sizes'].add(bound[2]) sys.stderr.write("\n") sys.stderr.write("finished reading sam\n") of = sys.stdout if args.output: of = open(args.output,'w') if len(junctions) > 0: # if we have stuff lets print a header of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n") for entry in junctions: nR = len(junctions[entry]['reads']) width = max(junctions[entry]['right_sizes'])-min(junctions[entry]['right_sizes']) nNR = len(junctions[entry]['positions']) nUR = 0 nMR = 0 for read in junctions[entry]['reads']: if read_mapping_count[read] == 1: nUR += 1 elif read_mapping_count[read] > 1: nMR += 1 else: sys.stderr.write("ERROR: nonsense read count\n") return name = '('+str(nR)+')['+str(width)+'_'+str(nNR)+']('+str(nUR)+'/'+str(nMR)+')' bed = entry.split("\t") bed[3] = name of.write("\t".join(bed)+"\n")
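# is_canon and is_revcanon are called above but defined elsewhere in the
# source. Minimal sketches follow, assuming the usual canonical splice
# motifs (GT-AG, GC-AG, AT-AC) and their reverse complements; the real
# definitions may accept a different motif set.
def is_canon(intronflank):
    # donor-acceptor pairs as seen on the forward strand
    return intronflank in ['GT-AG', 'GC-AG', 'AT-AC']

def is_revcanon(intronflank):
    # reverse complements of the canonical pairs, read on the forward strand
    return intronflank in ['CT-AC', 'CT-GC', 'GT-AT']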
def main():
    # do our inputs
    args = do_inputs()
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    sys.stderr.write("reading in fasta\n")
    f = FastaData(open(args.reference).read())
    sh = GPDStream(inf)
    # one sort | bed_to_bed_depth.py pipeline per GC-content bin
    bin_handles = []
    for i in range(0, args.number_of_bins):
        fname = args.tempdir + '/' + str(i) + '.bed.gz'
        cmd2 = 'bed_to_bed_depth.py - -o ' + fname
        p2 = Popen(cmd2.split(), stdin=PIPE, close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, close_fds=True)
        bin_handles.append([p1, p2, fname, i])
    if args.best_X_covered:
        sys.stderr.write("work out stratified data\n")
        cmd3 = 'bed_depth_to_stratified_coverage.py --minimum_coverage 10 ' + \
               '--output_key ' + args.tempdir + '/key -r ' + args.reference + \
               ' - -o ' + args.tempdir + '/combo.bed.gz'
        pstrat3 = Popen(cmd3.split(), stdin=PIPE, close_fds=True)
        cmd2 = 'bed_to_bed_depth.py -'
        pstrat2 = Popen(cmd2.split(), stdin=PIPE, stdout=pstrat3.stdin, close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        pstrat1 = Popen(cmd1.split(), stdin=PIPE, stdout=pstrat2.stdin, close_fds=True)
    num = 0
    for gpd in sh:
        num += 1
        if num % 1000 == 0:
            sys.stderr.write(str(num) + "   \r")
        results = []
        if args.minimum_sequence_length:
            if gpd.get_length() < args.minimum_sequence_length:
                continue
            seq = gpd.get_sequence(f).upper()
            seq_obj = Seq(seq)
            n_count = seq_obj.n_count()
            if len(seq) - n_count < args.min_non_N:
                continue
            gc = seq_obj.gc_content()
            gc_bin = int(args.number_of_bins * gc)
            if gc_bin == args.number_of_bins:
                gc_bin -= 1
            for exon in gpd.exons:
                bed_bin = ["\t".join([str(x) for x in exon.rng.get_bed_array()]), gc_bin]
                results.append(bed_bin)
        elif args.fragment:
            seqlen = gpd.get_length()
            if seqlen < args.fragment:
                continue
            sfrags = int(float(seqlen) / float(args.fragment))
            sremain = seqlen % args.fragment
            offset = 0
            if random.random() < 0.5:
                offset = sremain
            for i in range(0, sfrags):
                gsub = gpd.subset(i * args.fragment + offset,
                                  (i + 1) * args.fragment + offset)
                seq = gsub.get_sequence(f).upper()
                seq_obj = Seq(seq)
                n_count = seq_obj.n_count()
                if len(seq) - n_count < args.min_non_N:
                    continue
                gc = seq_obj.gc_content()
                gc_bin = int(args.number_of_bins * gc)
                if gc_bin == args.number_of_bins:
                    gc_bin -= 1
                for exon in gsub.exons:
                    bed_bin = ["\t".join([str(x) for x in exon.rng.get_bed_array()]), gc_bin]
                    results.append(bed_bin)
        for val in results:
            [bed, bin] = val
            bin_handles[bin][0].stdin.write(bed + "\n")
            if args.best_X_covered:
                pstrat1.stdin.write(bed + "\n")
    sys.stderr.write("\n")
    for v in bin_handles:
        v[0].communicate()
        v[1].communicate()
    if args.best_X_covered:
        pstrat1.communicate()
        pstrat2.communicate()
        pstrat3.communicate()
        # If we want stratified data we should do it here
        sys.stderr.write("read the key\n")
        d = {}
        with open(args.tempdir + '/key') as kinf:
            header = kinf.readline()  # skip the key header line
            for line in kinf:
                fields = line.rstrip().split("\t")
                d[int(fields[0])] = int(fields[1])
        if args.best_X_covered not in d:
            sys.stderr.write("ERROR: the number of bases you specified is "
                             "probably too large, or it is not a 1 or 5 "
                             "followed by zeros\n")
            sys.exit()
        num = d[args.best_X_covered]
        ninf = gzip.open(args.tempdir + '/combo.bed.gz')
        nof = gzip.open(args.tempdir + '/strat.bed.gz', 'w')
        for line in ninf:
            fields = line.rstrip().split("\t")
            if int(fields[3]) >= num:
                nof.write("\t".join(fields[:-1]) + "\n")
        nof.close()
        ninf.close()
        for i in range(0, len(bin_handles)):
            v = bin_handles[i]
            fname = v[2]
            fname2 = args.tempdir + '/' + str(v[3]) + '.strata.bed.gz'
            gof = open(fname2, 'w')
            cmd2 = 'gzip'
            p2 = Popen(cmd2.split(), stdout=gof, stdin=PIPE)
            cmd1 = 'bedtools intersect -a ' + fname + ' -b ' + \
                   args.tempdir + '/strat.bed.gz'
            p1 = Popen(cmd1.split(), stdout=p2.stdin)
            p1.communicate()
            p2.communicate()
            gof.close()
            # replace the name of the file the final output will read from
            bin_handles[i][2] = fname2
    # Now we have bed depths for each bin
    for v in bin_handles:
        fname = v[2]
        depths = {}
        bin = v[3]
        inf = gzip.open(fname)
        for line in inf:
            fields = line.rstrip().split("\t")
            bases = int(fields[2]) - int(fields[1])
            depth = int(fields[3])
            if depth not in depths:
                depths[depth] = 0
            depths[depth] += bases
        inf.close()
        for depth in sorted(depths.keys()):
            of.write(str(bin) + "\t" + str(depth) + "\t" + str(depths[depth]) + "\n")
    of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    # make our error profile report
    sys.stderr.write("Reading reference fasta\n")
    ref = FastaData(open(args.reference).read())
    sys.stderr.write("Reading alignments\n")
    epf = ErrorProfileFactory()
    if args.random:
        bf = None
        if args.input_index:
            bf = BAMFile(args.input, reference=ref, index_file=args.input_index)
            bf.read_index(index_file=args.input_index)
        else:
            bf = BAMFile(args.input, reference=ref)
            bf.read_index()
        if not bf.has_index():
            sys.stderr.write("Random access requires an index be set\n")
            sys.exit()  # added: without an index the random fetches below cannot work
        z = 0
        strand = 'target'
        if args.query:
            strand = 'query'
        con = 0
        while True:
            rname = random.choice(bf.index.get_names())
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if not coord:
                continue
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                    sys.stderr.write(str(z) + " alignments, " + str(con) +
                                     " min context coverage\r")
                if args.max_alignments <= z:
                    break
                if args.stopping_point <= con:
                    break
    else:
        bf = BAMFile(args.input, reference=ref)
        z = 0
        strand = 'target'
        if args.query:
            strand = 'query'
        con = 0
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                    sys.stderr.write(str(z) + " alignments, " + str(con) +
                                     " min context coverage\r")
                if args.max_alignments <= z:
                    break
                if args.stopping_point <= con:
                    break
    sys.stderr.write("\n")
    sys.stderr.write("working with:\n")
    sys.stderr.write(str(z) + " alignments, " + str(con) + " min context coverage\n")
    epf.write_context_error_report(args.tempdir + '/err.txt', strand)
    for ofile in args.output:
        cmd = args.rscript_path + ' ' + \
              os.path.dirname(os.path.realpath(__file__)) + \
              '/plot_base_error_context.r ' + args.tempdir + '/err.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())
    sys.stderr.write("finished\n")
    if args.output_raw:
        of = open(args.output_raw, 'w')
        with open(args.tempdir + "/err.txt") as inf:
            for line in inf:
                of.write(line)
        of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    random.seed(args.seed)
    total = 0  # renamed from 'sum' to avoid shadowing the builtin
    if args.reference_genome:
        ref = FastaData(open(args.reference_genome).read())
        for name in ref.keys():
            total += len(ref[name])
    else:
        with open(args.reference_lengths) as inf:
            for line in inf:
                f = line.rstrip().split("\t")
                total += int(f[1])
    # build strata cutoffs at 1x, 5x, 10x, 50x ... of the minimum coverage
    c = args.minimum_coverage
    z = 0
    values = {}
    while c < total:
        z += 1
        values[c] = z
        c = c * 5
        if c >= total:
            break
        z += 1
        values[c] = z
        c = c * 2
    z += 1
    values[total] = z
    for c in sorted(values.keys()):
        values[c] = z - values[c] + 1
    ### Now values contains the stratified coverage values
    if args.output_key:
        of = open(args.output_key, 'w')
        of.write("bp_size\tstrata_label\n")
        for c in sorted(values.keys()):
            of.write(str(c) + "\t" + str(values[c]) + "\n")
        of.close()
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    depths = {}
    vals = []
    z = 0
    for line in inf:
        z += 1
        if z % 100000 == 0:
            sys.stderr.write(str(z) + " bed entries read \r")
        f = line.rstrip().split("\t")
        addition = 0
        if not args.dont_make_unique:
            # jitter each depth slightly so identical depths stay distinct
            addition = args.unique_scale * random.random()
        vals.append([f[0], int(f[1]), int(f[2]), float(f[3]) + addition])
    z = 0
    sys.stderr.write("\n")
    for f in vals:
        z += 1
        if z % 100000 == 0:
            sys.stderr.write(str(z) + " bed entries processed \r")
        # keep track of the number of bases at each depth
        depth = f[3]
        cov = f[2] - f[1]
        if depth not in depths:
            depths[depth] = 0
        depths[depth] += cov
    sys.stderr.write("\n")
    # assign each depth to a stratum, working from the deepest depth down
    stratas = sorted(values.keys())
    pos = 0
    depth_strata = {}
    for d in reversed(sorted(depths.keys())):
        pos += depths[d]
        while stratas[0] < pos:
            stratas.pop(0)
        depth_strata[d] = values[stratas[0]]
    # merge adjacent entries that share a chromosome and strata label
    vals[0][3] = depth_strata[vals[0][3]]
    buffer = vals[0]
    for val in vals[1:]:
        val[3] = depth_strata[val[3]]
        if val[1] == buffer[2] and val[3] == buffer[3] and val[0] == buffer[0]:
            buffer[2] = val[2]
            continue
        else:
            of.write(buffer[0] + "\t" + str(buffer[1]) + "\t" +
                     str(buffer[2]) + "\t" + str(buffer[3]) + "\n")
            buffer = val
    of.write(buffer[0] + "\t" + str(buffer[1]) + "\t" +
             str(buffer[2]) + "\t" + str(buffer[3]) + "\n")
    of.close()
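# Worked example for the strata cutoffs built at the top of the function
# above (hypothetical numbers): with minimum_coverage=10 and a total
# reference length of 2,000 bp, the cutoff loop produces
#   values = {10: 1, 50: 2, 100: 3, 500: 4, 1000: 5, 2000: 6}
# and the reversal (values[c] = z - values[c] + 1) relabels them so the
# largest cutoff gets strata label 1:
#   values = {10: 6, 50: 5, 100: 4, 500: 3, 1000: 2, 2000: 1}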
def main():
    # do our inputs
    args = do_inputs()
    global of
    of = sys.stdout
    if args.output:
        if args.output[-4:] == '.bam':
            cmd = 'samtools view -Sb - -o ' + args.output
            p = Popen(cmd.split(), stdin=PIPE)
            of = p.stdin
        else:
            sys.stderr.write("ERROR: stdout and .bam are the only valid output formats\n")
            sys.exit()
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    sys.stderr.write("reading reference genome\n")
    ref = FastaData(open(args.reference).read())
    shared = {}
    for chr in sorted(ref.keys()):
        sys.stderr.write("reading " + chr + "\n")
        shared[chr] = ref[chr].upper()
        ref.remove(chr)
    sys.stderr.write("finished reading shared memory reference\n")
    sys.stderr.write("Now make the header\n")
    of.write("@HD\tVN:1.0\tSO:unknown\n")
    of.write("@PG\tID:SLR\n")
    for chr in sorted(shared.keys()):
        of.write("@SQ\tSN:" + chr + "\tLN:" + str(len(shared[chr])) + "\n")
    if args.threads > 1:
        poo = Pool(processes=args.threads)
    buffer = []
    max_buffer = 1
    z = 0
    for line in inf:
        z += 1
        if z % 1000 == 0:
            sys.stderr.write(str(z) + "   \r")
        buffer.append(line)
        if len(buffer) >= max_buffer:
            if args.threads == 1:
                results = do_buffer(buffer, shared, args)
                do_out(results)
            else:
                poo.apply_async(do_buffer, args=(buffer[:], shared, args),
                                callback=do_out)
            buffer = []
    if len(buffer) > 0:
        if args.threads == 1:
            results = do_buffer(buffer, shared, args)
            do_out(results)
        else:
            poo.apply_async(do_buffer, args=(buffer[:], shared, args),
                            callback=do_out)
    if args.threads > 1:
        poo.close()
        poo.join()
    sys.stderr.write("\n")
    if args.output:
        p.communicate()
    else:
        of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args): sys.stderr.write("Read reference fasta\n") fasta = FastaData(open(args.reference_fasta).read()) sys.stderr.write("Read alignment file\n") bf = BAMFile(args.bam_input,reference=fasta) bf.read_index() total_qualities = [] for j in range(0,100): total_qualities.append([]) ef = ErrorProfileFactory() mincontext = 0 alignments = 0 for i in range(0,args.max_alignments): rname = random.choice(bf.index.get_names()) coord = bf.index.get_longest_target_alignment_coords_by_name(rname) if not coord: continue bam = bf.fetch_by_coord(coord) qual = bam.value('qual') do_qualities(total_qualities,qual) if not bam.is_aligned(): continue alignments += 1 ef.add_alignment(bam) if i%100 == 0: mincontext = ef.get_min_context_count('target') if mincontext: if mincontext >= args.min_context and alignments >= args.min_alignments: break sys.stderr.write(str(i+1)+" lines "+str(alignments)+"/"+str(args.min_alignments)+" alignments "+str(mincontext)+"/"+str(args.min_context)+" mincontext \r") sys.stderr.write("\n") sys.stderr.write(str(mincontext)+" minimum contexts observed\n") target_context = ef.get_target_context_error_report() general_error_stats = ef.get_alignment_errors().get_stats() general_error_report = ef.get_alignment_errors().get_report() # convert report to table general_all = [x.split("\t") for x in general_error_report.rstrip().split("\n")] general_head = general_all[0] #print [y for y in general_all[1:]] general_data = [[y[0],y[1],int(y[2]),int(y[3])] for y in general_all[1:]] general_error_report = {'head':general_head,'data':general_data} quality_counts = [] for vals in total_qualities: garr = [] grp = {} for v in vals: if v[0] not in grp: grp[v[0]] = {}# check ordinal if v[1] not in grp[v[0]]: grp[v[0]][v[1]] = 0 # run length grp[v[0]][v[1]]+=1 for ordval in sorted(grp.keys()): for runlen in sorted(grp[ordval].keys()): garr.append([ordval,runlen,grp[ordval][runlen]]) quality_counts.append(garr) #Quailty counts now has 100 bins, each has an ordered array of # [ordinal_quality, run_length, observation_count] # Can prepare an output output = {} output['quality_counts'] = quality_counts output['context_error'] = target_context output['alignment_error'] = general_error_report output['error_stats'] = general_error_stats of = None if args.output[-3:]=='.gz': of = gzip.open(args.output,'w') else: of = open(args.output,'w') of.write(base64.b64encode(zlib.compress(json.dumps(output)))+"\n") of.close() # Temporary working directory step 3 of 3 - Cleanup if not args.specific_tempdir: rmtree(args.tempdir)
def main(args): sys.stderr.write("Reading our reference Fasta\n") ref = FastaData(open(args.reference, 'rb').read()) sys.stderr.write("Finished reading our reference Fasta\n") bf = None if args.input_index: bf = BAMFile(args.input, reference=ref, index_file=args.input_index) bf.read_index(index_file=args.input_index) else: bf = BAMFile(args.input, reference=ref) bf.read_index() epf = ErrorProfileFactory() if args.random: if not bf.has_index(): sys.stderr.write( "Random access requires our format of index bgi to be set\n") sys.exit() z = 0 while True: rname = random.choice(bf.index.get_names()) coord = bf.index.get_longest_target_alignment_coords_by_name(rname) if not coord: continue e = bf.fetch_by_coord(coord) if e.is_aligned(): epf.add_alignment(e) z += 1 #print z if z % 100 == 1: con = epf.get_alignment_errors().alignment_length if args.max_length <= con: break sys.stderr.write( str(con) + "/" + str(args.max_length) + " bases from " + str(z) + " alignments\r") sys.stderr.write("\n") else: z = 0 for e in bf: if e.is_aligned(): epf.add_alignment(e) z += 1 #print z if z % 100 == 1: con = epf.get_alignment_errors().alignment_length if args.max_length <= con: break sys.stderr.write( str(con) + "/" + str(args.max_length) + " bases from " + str(z) + " alignments\r") sys.stderr.write("\n") of = open(args.tempdir + '/report.txt', 'w') of.write(epf.get_alignment_errors().get_report()) of.close() for ofile in args.output: cmd = args.rscript_path + ' ' + os.path.dirname( os.path.realpath(__file__) ) + '/plot_alignment_errors.r ' + args.tempdir + '/report.txt ' + ofile + ' ' if args.scale: cmd += ' '.join([str(x) for x in args.scale]) sys.stderr.write(cmd + "\n") call(cmd.split()) if args.output_raw: of = open(args.output_raw, 'w') with open(args.tempdir + "/report.txt") as inf: for line in inf: of.write(line) of.close() if args.output_stats: of = open(args.output_stats, 'w') of.write(epf.get_alignment_errors().get_stats()) of.close() sys.stderr.write("finished\n") # Temporary working directory step 3 of 3 - Cleanup if not args.specific_tempdir: rmtree(args.tempdir)
def set_reference_genome(self, ref_genome):
    self.ref_genome_set = True
    self.ref_genome = FastaData(open(ref_genome).read())