def extract_mismatched_reads(min_distance, max_distance, mappable, bam,
                             in_fh, out_fh, bam_to_sam):
    """Filter a BAM down to reads whose tags pass a distance/mappability test.

    Reads a CSV stream from in_fh (expected fields per row:
    id, ?, tag, distance, mappability-flag), keeps the tags whose absolute
    distance lies in [min_distance, max_distance] and whose mappability
    matches *mappable*, then streams *bam* (converted via the external
    bam_to_sam command) through bias.SamFilter keeping only those tags.
    """
    bias.log_stderr(
        'Filtering based on min distance %i, max_distance %i, mappable %s'
        % (min_distance, max_distance, mappable))

    allowed_tags = set()
    candidates = 0
    for line in in_fh:
        fields = line.strip().split(',')
        # Guard clause: skip header or malformed rows — a data row has
        # exactly 5 comma-separated fields and a numeric first field.
        if len(fields) != 5 or not fields[0].isdigit():
            continue
        candidates += 1
        distance = abs(int(fields[3]))
        # A zero in the last column marks the read as mappable.
        is_mappable = int(fields[4]) == 0
        # Keep the tag when the distance is in range and the row's
        # mappability agrees with the requested filter (truthiness match,
        # exactly as the original `A and B or not A and not B` test).
        if min_distance <= distance <= max_distance and is_mappable == bool(mappable):
            allowed_tags.add(fields[2])

    bias.log_stderr('%i allowed tags from %i possibles'
                    % (len(allowed_tags), candidates))
    bias.SamFilter(sam_fh=bias.BamReaderExternal(bam_to_sam, bam),
                   target_fh=out_fh,
                   allowed_tags=allowed_tags,
                   log=bias.log_stderr)
def compare_bams(bams, mapq, compare_position, subset_detail, mismatch_detail,
                 xmfa, origin, target, out_fh, bam_to_sam):
    """Diff a set of BAM files and write a plain-text report to out_fh.

    The report contains per-file mapq statistics, a mapped/unmapped
    commonality table keyed by a bitmask over the input files, and —
    depending on the flags — mapq-vs-position statistics and per-read
    mismatch details.  mismatch_detail == -1 disables mismatch collection;
    when *xmfa* is given, mismatch positions are additionally mapped through
    a Mauve alignment from strand *origin* to strand *target*.
    """
    diff = bias.SamDiff(
        [bias.BamReaderExternal(bam_to_sam, sam_file) for sam_file in bams],
        mapq_min=mapq,
        compare_position=compare_position,
        subset_detail=subset_detail,
        mismatch_detail=None if mismatch_detail == -1 else mismatch_detail)

    out_fh.write("mapq stats\n==========\n")
    out_fh.write("i:\tn\tmax\tmin\tmean\tsd\tfilename\n")
    for idx, stats in enumerate(diff.mapq_stats):
        out_fh.write('%i:\t%i\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % (
            idx, stats['mapped'], stats['max'], stats['min'],
            stats['mean'], stats['sd'], bams[idx]))

    # Commonality keys are bitmasks over the inputs; render them as
    # fixed-width binary so each bit column corresponds to one BAM file.
    out_fh.write("\nmapped vs unmapped commonality\n===================\n")
    bit_format = "{0:0%ib}: {1}\n" % (len(bams))
    for key in sorted(diff.totals.keys()):
        out_fh.write(bit_format.format(key, diff.totals[key]))

    if compare_position:
        # The position-difference commonality report is currently disabled.
        pass

    if subset_detail:
        out_fh.write("\nmapq vs position differences\n===================\n")
        out_fh.write("i:\tmax\tmin\tmean\tsd\thist\n")
        key_format = "{0:0%ib}" % (len(bams))
        for key, value in diff.mapq_subset_stats.items():
            out_fh.write('%s:\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % (
                key_format.format(key), value['max'], value['min'],
                value['mean'], value['sd'], value['hist']))

    if mismatch_detail > -1:
        out_fh.write("\nmismatch details\n===================\n")
        if xmfa is None:
            mauve_map = None
            out_fh.write("pos,alt,read_id,wrongness\n")
        else:
            # NOTE(review): the xmfa handle is never closed here —
            # presumably bias.MauveMap consumes it eagerly; confirm before
            # converting this to a `with` block.
            mauve_map = bias.MauveMap(open(xmfa, 'r'),
                                      src_strand=origin,
                                      target_strand=target)
            out_fh.write("pos,alt,read_id,wrongness,nearestmap\n")

        mismatch_count = 0
        unpaired_count = 0
        for read, value in diff.mismatch_stats.items():
            # Reads missing either the position ('p') or alternative ('a')
            # entry cannot be scored.
            if 'p' not in value or 'a' not in value:
                unpaired_count += 1
                continue
            wrongness = value['p'] - value['a']
            if mauve_map is None:
                out_fh.write('%i,%i,%s,%i\n' % (
                    value['p'], value['a'], read, wrongness))
            else:
                nearest = mauve_map.find_nearest_target(int(value['a']))
                out_fh.write('%i,%i,%s,%i,%i\n' % (
                    value['p'], value['a'], read, wrongness, nearest))
            mismatch_count += 1

        bias.log_stderr(
            "%i mismatches with incorrect alternatives; %i unpaired reads"
            % (mismatch_count, unpaired_count))
def run(cmd):
    """Log *cmd* to stderr, execute it through the shell, and return the
    os.system exit status (0 on success).

    Fix: the original discarded os.system's return value, so callers could
    never detect a failed command; it is now returned (backward compatible —
    the function previously returned None and visible callers ignore it).

    NOTE(review): the command string is passed to a shell; callers must not
    build it from untrusted input.
    """
    bias.log_stderr(cmd)
    return os.system(cmd)
for line in open(stats_tmp, "r"): fields = line.strip().split() if len(fields) > 3 and fields[3] == "mapped": return int(fields[0]) finally: run("rm {0} {1}".format(sam_tmp, stats_tmp)) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Choose reference") parser.add_argument("--fastq", help="set of reads") parser.add_argument("--sample", type=int, default=10000, help="number of reads to sample") parser.add_argument("--skipindex", action="store_true", default=False, help="skip fasta indexing step") parser.add_argument("references", nargs="+", help="reference fasta file(s)") args = parser.parse_args() # sample fastq fastq_tmp = "tmp{0}.fq".format(random.randint(1, 1e6)) sample_fastq(args.fastq, fastq_tmp, args.sample) # assess with unmapped results = {} bias.log_stderr("{0} reference sequences".format(len(args.references))) for reference in args.references: results[reference] = unmapped_count(reference, fastq_tmp, skipindex=args.skipindex) for k in sorted(results, key=results.get): sys.stdout.write("{0}\t{1}\n".format(k, results[k])) run("rm {0}".format(fastq_tmp))