def extract_mismatched_reads( min_distance, max_distance, mappable, bam, in_fh, out_fh, bam_to_sam ):
  """Collect read tags from in_fh whose distance falls in [min_distance, max_distance]
  and whose mappability flag matches *mappable*, then write the subset of *bam*
  containing only those reads to out_fh via bias.SamFilter.

  in_fh rows are CSV: 5 fields, first numeric; field 2 is the read tag,
  field 3 the signed distance, field 4 the mappability flag (0 == mappable).
  """
  bias.log_stderr( 'Filtering based on min distance %i, max_distance %i, mappable %s' % ( min_distance, max_distance, mappable ) )

  allowed_tags = set()
  candidates = 0
  for row in in_fh:
    cols = row.strip().split(',')
    # skip header/malformed rows: exactly 5 fields with a numeric first column
    if len(cols) != 5 or not cols[0].isdigit():
      continue
    candidates += 1
    dist = abs(int(cols[3]))
    row_mappable = int(cols[4]) == 0
    # keep the tag when distance is in range and mappability agrees with the request
    if min_distance <= dist <= max_distance and row_mappable == bool(mappable):
      allowed_tags.add(cols[2])
  bias.log_stderr( '%i allowed tags from %i possibles' % ( len(allowed_tags), candidates ) )

  bias.SamFilter( sam_fh=bias.BamReaderExternal( bam_to_sam, bam ), target_fh=out_fh, allowed_tags=allowed_tags, log=bias.log_stderr )
# --- Example 2 ---
def compare_bams( bams, mapq, compare_position, subset_detail, mismatch_detail, xmfa, origin, target, out_fh, bam_to_sam ):
  """Diff mapping results across several BAM files and write a text report to out_fh.

  Sections emitted:
    * per-file mapq summary statistics (n/max/min/mean/sd)
    * mapped-vs-unmapped commonality, keyed by a zero-padded bitmask with one
      bit per input BAM
    * if subset_detail: mapq statistics per position-difference subset
    * if mismatch_detail > -1: per-read mismatch rows; when xmfa is given the
      actual position is also mapped through a Mauve alignment
        (origin/target select the strands for the Mauve mapping)

  mismatch_detail == -1 disables mismatch tracking in SamDiff entirely.
  """
  diff = bias.SamDiff( [ bias.BamReaderExternal( bam_to_sam, sam_file ) for sam_file in bams ], mapq_min=mapq, compare_position=compare_position, subset_detail=subset_detail, mismatch_detail=None if mismatch_detail == -1 else mismatch_detail )

  out_fh.write( "mapq stats\n==========\n" )
  out_fh.write( "i:\tn\tmax\tmin\tmean\tsd\tfilename\n" )
  for idx, stats in enumerate( diff.mapq_stats ):
    out_fh.write( '%i:\t%i\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % ( idx, stats['mapped'], stats['max'], stats['min'], stats['mean'], stats['sd'], bams[idx] ) )

  # Zero-padded binary rendering of a commonality key: one bit per input BAM.
  # Built once here instead of being rebuilt on every loop iteration below.
  bin_fmt = "{0:0%ib}" % len(bams)

  out_fh.write( "\nmapped vs unmapped commonality\n===================\n" )
  for key in sorted( diff.totals.keys() ):
    out_fh.write( "%s: %s\n" % ( bin_fmt.format( key ), diff.totals[key] ) )

  # NOTE(review): a "commonality including position differences" section used to
  # hang off compare_position but was commented out upstream; nothing is emitted
  # for it, so the dead branch has been removed.

  if subset_detail:
    out_fh.write( "\nmapq vs position differences\n===================\n" )
    out_fh.write( "i:\tmax\tmin\tmean\tsd\thist\n" )
    for key, value in diff.mapq_subset_stats.items():
      out_fh.write( '%s:\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % ( bin_fmt.format( key ), value['max'], value['min'], value['mean'], value['sd'], value['hist'] ) )

  if mismatch_detail > -1:
    out_fh.write( "\nmismatch details\n===================\n" )
    if xmfa is None:
      mauve_map = None
      out_fh.write( "pos,alt,read_id,wrongness\n" )
    else:
      # NOTE(review): this file handle is handed to MauveMap and never closed
      # here — presumably MauveMap consumes it fully; confirm before changing.
      mauve_map = bias.MauveMap( open(xmfa, 'r'), src_strand=origin, target_strand=target )
      out_fh.write( "pos,alt,read_id,wrongness,nearestmap\n" )
    mismatch_count = 0
    unpaired_count = 0
    for read, value in diff.mismatch_stats.items():
      # a usable row needs both 'p' (position) and 'a' (alternative) entries
      if 'p' in value and 'a' in value:
        if mauve_map is None:
          out_fh.write( '%i,%i,%s,%i\n' % ( value['p'], value['a'], read, value['p'] - value['a'] ) )
        else:
          nearest = mauve_map.find_nearest_target( int(value['a']) )
          out_fh.write( '%i,%i,%s,%i,%i\n' % ( value['p'], value['a'], read, value['p'] - value['a'], nearest ) )
        mismatch_count += 1
      else:
        unpaired_count += 1
    bias.log_stderr( "%i mismatches with incorrect alternatives; %i unpaired reads" % ( mismatch_count, unpaired_count ) )
def run(cmd):
    """Log *cmd* to stderr, then execute it through the system shell."""
    # NOTE(review): cmd goes straight to os.system — callers must not pass
    # untrusted input.
    bias.log_stderr(cmd)
    os.system(cmd)
        for line in open(stats_tmp, "r"):
            fields = line.strip().split()
            if len(fields) > 3 and fields[3] == "mapped":
                return int(fields[0])
    finally:
        run("rm {0} {1}".format(sam_tmp, stats_tmp))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Choose reference")
    parser.add_argument("--fastq", help="set of reads")
    parser.add_argument("--sample", type=int, default=10000, help="number of reads to sample")
    parser.add_argument("--skipindex", action="store_true", default=False, help="skip fasta indexing step")
    parser.add_argument("references", nargs="+", help="reference fasta file(s)")
    args = parser.parse_args()

    # sample the fastq into a uniquely named temp file.
    # random.randint requires int bounds; the previous 1e6 was a float, which
    # raises on modern Python 3 (and was deprecated from 3.10).
    fastq_tmp = "tmp{0}.fq".format(random.randint(1, 1000000))
    sample_fastq(args.fastq, fastq_tmp, args.sample)

    try:
        # count unmapped reads for the sampled fastq against each reference
        results = {}
        bias.log_stderr("{0} reference sequences".format(len(args.references)))
        for reference in args.references:
            results[reference] = unmapped_count(reference, fastq_tmp, skipindex=args.skipindex)

        # report references sorted by unmapped count (best candidates first)
        for k in sorted(results, key=results.get):
            sys.stdout.write("{0}\t{1}\n".format(k, results[k]))
    finally:
        # always remove the sampled fastq, even when a mapping step fails
        run("rm {0}".format(fastq_tmp))