def main(args):
    """Sample reads from an indexed BAM and write an error-profile blob.

    Up to ``args.max_alignments`` reads are drawn at random through the BAM
    index.  Every fetched read contributes its quality string to the quality
    bins (via ``do_qualities``); only aligned reads are added to the error
    profile.  Sampling stops early once both ``args.min_context`` and
    ``args.min_alignments`` are satisfied.

    The result is a dict with keys ``quality_counts``, ``context_error``,
    ``alignment_error`` and ``error_stats``.  It is written to
    ``args.output`` as a single line of base64-encoded, zlib-compressed
    JSON, gzip-wrapped when the file name ends in ``.gz``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``reference_fasta``, ``bam_input``, ``max_alignments``,
            ``min_context``, ``min_alignments``, ``output``,
            ``specific_tempdir`` and ``tempdir``.
    """
    sys.stderr.write("Read reference fasta\n")
    # Close the reference handle as soon as its contents are loaded.
    with open(args.reference_fasta) as fasta_handle:
        fasta = FastaData(fasta_handle.read())
    sys.stderr.write("Read alignment file\n")
    bf = BAMFile(args.bam_input, reference=fasta)
    bf.read_index()

    # 100 bins, filled in place by do_qualities().
    total_qualities = [[] for _ in range(100)]
    ef = ErrorProfileFactory()
    mincontext = 0
    alignments = 0
    # The index does not change while sampling, so fetch the names once.
    names = bf.index.get_names()
    for i in range(args.max_alignments):
        rname = random.choice(names)
        coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
        if not coord:
            continue
        bam = bf.fetch_by_coord(coord)
        do_qualities(total_qualities, bam.value('qual'))
        if not bam.is_aligned():
            continue
        alignments += 1
        ef.add_alignment(bam)
        # NOTE(review): this check keys on the loop index, so it only runs
        # when a draw whose index is a multiple of 100 also yielded an
        # aligned read -- confirm that is intended.
        if i % 100 == 0:
            mincontext = ef.get_min_context_count('target')
            if (mincontext and mincontext >= args.min_context
                    and alignments >= args.min_alignments):
                break
            sys.stderr.write(str(i+1)+" lines "+str(alignments)+"/"+str(args.min_alignments)+" alignments "+str(mincontext)+"/"+str(args.min_context)+" mincontext \r")
    sys.stderr.write("\n")
    sys.stderr.write(str(mincontext)+" minimum contexts observed\n")

    target_context = ef.get_target_context_error_report()
    general_error_stats = ef.get_alignment_errors().get_stats()
    general_error_report = ef.get_alignment_errors().get_report()

    # Convert the tab-delimited report into a header row plus typed rows.
    general_all = [x.split("\t")
                   for x in general_error_report.rstrip().split("\n")]
    general_error_report = {
        'head': general_all[0],
        'data': [[y[0], y[1], int(y[2]), int(y[3])] for y in general_all[1:]],
    }

    # Each bin becomes a sorted list of
    # [ordinal_quality, run_length, observation_count].
    quality_counts = []
    for vals in total_qualities:
        counts = {}
        for v in vals:
            key = (v[0], v[1])  # (ordinal quality, run length)
            counts[key] = counts.get(key, 0) + 1
        quality_counts.append(
            [[ordval, runlen, counts[(ordval, runlen)]]
             for ordval, runlen in sorted(counts)])

    output = {
        'quality_counts': quality_counts,
        'context_error': target_context,
        'alignment_error': general_error_report,
        'error_stats': general_error_stats,
    }

    # zlib and base64 operate on bytes; encode explicitly and write in
    # binary mode so the same code runs on Python 2 and Python 3.
    payload = base64.b64encode(
        zlib.compress(json.dumps(output).encode('utf-8'))) + b"\n"
    opener = gzip.open if args.output[-3:] == '.gz' else open
    with opener(args.output, 'wb') as of:
        of.write(payload)

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Build a base-context error report from a BAM and plot it with R.

    Aligned reads are added to an ``ErrorProfileFactory`` until either
    ``args.max_alignments`` reads have been used or the minimum context
    count reaches ``args.stopping_point``.  With ``args.random`` the reads
    are drawn at random through the BAM index; otherwise the BAM is read
    sequentially.  The context error report is written to
    ``<tempdir>/err.txt``, ``plot_base_error_context.r`` is run once per
    entry in ``args.output``, and the raw report is optionally copied to
    ``args.output_raw``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``reference``, ``input``, ``input_index``, ``random``,
            ``query``, ``max_alignments``, ``stopping_point``, ``tempdir``,
            ``output``, ``rscript_path``, ``scale``, ``output_raw`` and
            ``specific_tempdir``.
    """
    # make our error profile report
    sys.stderr.write("Reading reference fasta\n")
    # Close the reference handle as soon as its contents are loaded.
    with open(args.reference) as ref_handle:
        ref = FastaData(ref_handle.read())
    sys.stderr.write("Reading alignments\n")
    epf = ErrorProfileFactory()
    strand = 'query' if args.query else 'target'

    def random_entries():
        """Yield randomly chosen reads fetched through the BAM index, forever."""
        names = bf.index.get_names()
        while True:
            rname = random.choice(names)
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if coord:
                yield bf.fetch_by_coord(coord)

    if args.random:
        if args.input_index:
            bf = BAMFile(args.input, reference=ref,
                         index_file=args.input_index)
            bf.read_index(index_file=args.input_index)
        else:
            bf = BAMFile(args.input, reference=ref)
            bf.read_index()
        if not bf.has_index():
            # Random access cannot proceed without an index; stop here
            # instead of failing later on bf.index.
            sys.stderr.write("Random access requires an index be set\n")
            sys.exit(1)
        entries = random_entries()
    else:
        bf = BAMFile(args.input, reference=ref)
        entries = bf

    z = 0
    con = 0
    for e in entries:
        if not e.is_aligned():
            continue
        epf.add_alignment(e)
        z += 1
        # The min-context count is only refreshed every 100 alignments.
        if z % 100 == 1:
            con = epf.get_min_context_count(strand)
        sys.stderr.write(
            str(z) + " alignments, " + str(con) + " min context coverage\r")
        if args.max_alignments <= z:
            break
        if args.stopping_point <= con:
            break
    sys.stderr.write("\n")
    sys.stderr.write('working with:' + "\n")
    sys.stderr.write(
        str(z) + " alignments, " + str(con) + " min context coverage" + "\n")

    err_path = os.path.join(args.tempdir, 'err.txt')
    epf.write_context_error_report(err_path, strand)

    script = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'plot_base_error_context.r')
    for ofile in args.output:
        # Build the argument list directly so script, tempdir and output
        # paths containing whitespace reach R intact.  rscript_path is still
        # whitespace-split, as before, so values like "Rscript --vanilla"
        # keep working.
        cmd = args.rscript_path.split() + [script, err_path, ofile]
        if args.scale:
            cmd.extend(str(x) for x in args.scale)
        sys.stderr.write(' '.join(cmd) + "\n")
        call(cmd)
    sys.stderr.write("finished\n")

    if args.output_raw:
        with open(err_path) as inf, open(args.output_raw, 'w') as of:
            of.writelines(inf)

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Build an alignment error report from a BAM and plot it with R.

    Aligned reads are added to an ``ErrorProfileFactory``.  Every 100
    alignments the accumulated alignment length is compared against
    ``args.max_length`` and reading stops once it is reached.  With
    ``args.random`` the reads are drawn at random through the BAM index;
    otherwise the BAM is read sequentially.  The report is written to
    ``<tempdir>/report.txt``, ``plot_alignment_errors.r`` is run once per
    entry in ``args.output``, and the raw report and stats are optionally
    written to ``args.output_raw`` and ``args.output_stats``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``reference``, ``input``, ``input_index``, ``random``,
            ``max_length``, ``tempdir``, ``output``, ``rscript_path``,
            ``scale``, ``output_raw``, ``output_stats`` and
            ``specific_tempdir``.
    """
    sys.stderr.write("Reading our reference Fasta\n")
    # Close the reference handle as soon as its contents are loaded.
    with open(args.reference, 'rb') as ref_handle:
        ref = FastaData(ref_handle.read())
    sys.stderr.write("Finished reading our reference Fasta\n")

    if args.input_index:
        bf = BAMFile(args.input, reference=ref, index_file=args.input_index)
        bf.read_index(index_file=args.input_index)
    else:
        bf = BAMFile(args.input, reference=ref)
        bf.read_index()
    epf = ErrorProfileFactory()

    def random_entries():
        """Yield randomly chosen reads fetched through the BAM index, forever."""
        names = bf.index.get_names()
        while True:
            rname = random.choice(names)
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if coord:
                yield bf.fetch_by_coord(coord)

    if args.random:
        if not bf.has_index():
            sys.stderr.write(
                "Random access requires our format of index bgi to be set\n")
            # Exit non-zero so calling pipelines see the failure.
            sys.exit(1)
        entries = random_entries()
    else:
        entries = bf

    z = 0
    for e in entries:
        if not e.is_aligned():
            continue
        epf.add_alignment(e)
        z += 1
        # Length check and progress line run once per 100 alignments.
        if z % 100 == 1:
            con = epf.get_alignment_errors().alignment_length
            if args.max_length <= con:
                break
            sys.stderr.write(
                str(con) + "/" + str(args.max_length) + " bases from " +
                str(z) + " alignments\r")
    sys.stderr.write("\n")

    report_path = os.path.join(args.tempdir, 'report.txt')
    with open(report_path, 'w') as of:
        of.write(epf.get_alignment_errors().get_report())

    script = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'plot_alignment_errors.r')
    for ofile in args.output:
        # Build the argument list directly so script, tempdir and output
        # paths containing whitespace reach R intact.  rscript_path is still
        # whitespace-split, as before, so values like "Rscript --vanilla"
        # keep working.
        cmd = args.rscript_path.split() + [script, report_path, ofile]
        if args.scale:
            cmd.extend(str(x) for x in args.scale)
        sys.stderr.write(' '.join(cmd) + "\n")
        call(cmd)

    if args.output_raw:
        with open(report_path) as inf, open(args.output_raw, 'w') as of:
            of.writelines(inf)
    if args.output_stats:
        with open(args.output_stats, 'w') as of:
            of.write(epf.get_alignment_errors().get_stats())
    sys.stderr.write("finished\n")

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)