Пример #1
0
def construct_header_from_reference_fasta(ref_fasta_filename):
  """Build a SAM header string from a reference FASTA file.

  The FASTA is loaded with FastaData, the name and length of every
  sequence are reported on stderr, and a header is returned that
  declares SO:coordinate, holds one @SQ line per sequence (in sorted
  name order) and ends with a @PG line for SamBasics.py.

  :param ref_fasta_filename: path of the reference FASTA file
  :return: the header text; every line is terminated by a newline
  """
  # A context manager closes the FASTA handle as soon as it has been read
  # instead of leaking it until garbage collection.
  with open(ref_fasta_filename) as fasta_handle:
    g = FastaData(fasta_handle.read())
  #g = SequenceBasics.read_fasta_into_hash(ref_fasta_filename)
  header_lines = ["@HD\tVN:1.0\tSO:coordinate\n"]
  for name in sorted(g.keys()):
    length = len(g[name])
    sys.stderr.write(name+" is there at length "+str(length)+"\n")
    header_lines.append("@SQ\tSN:"+name+"\tLN:"+str(length)+"\n")
  header_lines.append("@PG\tID:SamBasics.py\tVN:1.0\n")
  return ''.join(header_lines)
Пример #2
0
def construct_header_from_reference_fasta(ref_fasta_filename):
    """Return a SAM header describing the sequences of a reference FASTA.

    Every sequence name and length found by FastaData is logged to stderr.
    The returned header starts with an @HD line declaring SO:coordinate,
    lists one @SQ line per sequence in sorted name order, and finishes
    with a @PG line identifying SamBasics.py.

    :param ref_fasta_filename: path of the reference FASTA file
    :return: header text with a trailing newline on every line
    """
    # Close the FASTA file deterministically; the original left the
    # handle open after reading.
    with open(ref_fasta_filename) as fasta_handle:
        g = FastaData(fasta_handle.read())
    #g = SequenceBasics.read_fasta_into_hash(ref_fasta_filename)
    lengths = {}
    for name in sorted(g.keys()):
        lengths[name] = len(g[name])
        sys.stderr.write(name + " is there at length " + str(lengths[name]) +
                         "\n")
    parts = ["@HD\tVN:1.0\tSO:coordinate\n"]
    for name in sorted(lengths):
        parts.append("@SQ\tSN:" + name + "\tLN:" + str(lengths[name]) + "\n")
    parts.append("@PG\tID:SamBasics.py\tVN:1.0\n")
    return ''.join(parts)
Пример #3
0
def main():
    """Write FASTQ reads to stdout as unaligned SAM records.

    Command line: ``input`` (a FASTQ path, or ``-`` for STDIN), an
    optional ``-r/--reference`` FASTA used only to emit @SQ header lines,
    and ``--no_qual`` to replace the quality string with ``*``.

    Every read is written with flag 4, ``*`` for the reference name,
    CIGAR and mate reference, zeros for the numeric fields, and the tag
    ``XO:Z:NM``.
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-r', '--reference', help="reference genome FASTA")
    parser.add_argument('--no_qual',
                        action='store_true',
                        help="dont put in quality")

    args = parser.parse_args()
    ref = {}
    if args.reference:
        # Close the reference handle as soon as it has been read.
        with open(args.reference, 'rb') as ref_handle:
            ref = FastaData(ref_handle.read())
    if args.input == '-':
        args.input = sys.stdin
    else:
        args.input = open(args.input)

    try:
        print('@HD\tVN:1.0\tSO:unsorted')
        print('@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:' +
              ' '.join(sys.argv))
        if ref:
            for name in sorted(ref.keys()):
                print("@SQ\tSN:" + name + "\t" + 'LN:' + str(len(ref[name])))
        for e in FastqHandle(args.input):
            qual = "*" if args.no_qual else e.qual
            # Column order: QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT
            # TLEN SEQ QUAL, followed by the XO tag.
            print("\t".join([
                e.name, "4", "*", "0", "0", "*", "*", "0", "0", e.seq, qual,
                "XO:Z:NM"
            ]))
    finally:
        # Only close handles this function opened; leave STDIN alone.
        if args.input is not sys.stdin:
            args.input.close()
def main():
  """Convert a FASTQ stream into unaligned SAM records on stdout.

  Arguments are read from the command line: ``input`` (FASTQ path or
  ``-`` for STDIN), optional ``-r/--reference`` FASTA that contributes
  @SQ header lines, and ``--no_qual`` which writes ``*`` instead of the
  quality string.

  Each read becomes one record with flag 4, placeholder ``*``/``0``
  alignment columns and the tag ``XO:Z:NM``.
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Use - for STDIN")
  parser.add_argument('-r','--reference',help="reference genome FASTA")
  parser.add_argument('--no_qual',action='store_true',help="dont put in quality")

  args = parser.parse_args()
  ref = {}
  if args.reference:
    # The reference is fully read here, so release the handle right away.
    with open(args.reference,'rb') as ref_handle:
      ref = FastaData(ref_handle.read())
  if args.input == '-':
    args.input = sys.stdin
  else:
    args.input = open(args.input)

  try:
    print('@HD\tVN:1.0\tSO:unsorted')
    print('@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:'+' '.join(sys.argv))
    if ref:
      for name in sorted(ref.keys()):
        print("@SQ\tSN:"+name+"\t"+'LN:'+str(len(ref[name])))
    for e in FastqHandle(args.input):
      fields = [e.name,"4","*","0","0","*","*","0","0",e.seq]
      if args.no_qual:
        fields.append("*")
      else:
        fields.append(e.qual)
      fields.append("XO:Z:NM")
      print("\t".join(fields))
  finally:
    # Close the input only when it is a file opened above, never STDIN.
    if args.input is not sys.stdin:
      args.input.close()
def main():
    """Print the PSL line of every aligned entry of a SAM stream or BAM file.

    Command line: ``input`` (``-`` reads SAM from STDIN through SamStream,
    anything else is opened as a BAMFile) and the required
    ``-r/--reference`` FASTA, which is handed to the reader.
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN or specify a BAM file")
    parser.add_argument('-r',
                        '--reference',
                        help="Reference fasta",
                        required=True)
    args = parser.parse_args()

    ref = None
    if args.reference:
        # Read the whole FASTA, then close the handle instead of leaking it.
        with open(args.reference, 'rb') as ref_handle:
            ref = FastaData(ref_handle.read())

    if args.input == '-':
        args.input = SamStream(sys.stdin, reference=ref)
    else:
        args.input = BAMFile(args.input, reference=ref)
    for e in args.input:
        if e.is_aligned():
            print(e.get_PSL())
Пример #6
0
def main():
  """Read a SAM file and write its splice junctions as a BED track.

  Command line: ``infile`` (SAM path or ``-`` for STDIN),
  ``reference_genome`` (FASTA path), ``-o/--output`` (defaults to stdout)
  and ``--min_intron_size`` (default 68).

  Every ``N`` CIGAR operation at least ``min_intron_size`` long is treated
  as a junction.  Junctions whose flanking reference bases pass neither
  ``is_canon`` nor ``is_revcanon`` are skipped with a warning.  The
  remaining junctions are collapsed into unique 12-column BED entries and
  named ``(nR)[width_nNR](nUR/nMR)`` from the reads supporting them.
  """
  parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
  parser.add_argument('-o','--output',help='FILENAME is output')
  parser.add_argument('--min_intron_size',type=int,default=68,help='minimum intron size')
  parser.add_argument('infile',help='FILENAME of sam file or "-" for STDIN')
  parser.add_argument('reference_genome',help='FILENAME of the reference genome')
  args = parser.parse_args()

  # get our reference genome
  sys.stderr.write("reading reference genome\n")
  #g = SequenceBasics.read_fasta_into_hash(args.reference_genome)
  g = FastaData(open(args.reference_genome).read())
  sys.stderr.write("finished reading reference genome\n")

  inf = sys.stdin
  # read_mapping_count: "qname<TAB>mate" -> number of mapped records seen
  read_mapping_count = {}
  # junctions: BED entry text -> sets of supporting reads, start positions
  # and right-hand block sizes
  junctions = {}
  if args.infile != '-':
    inf = open(args.infile)
  sys.stderr.write("reading through sam file\n")
  zall = 0 # junctions examined
  zn = 0 # junctions skipped as non-canonical
  while True:
    line = inf.readline()
    if not line: break
    line = line.rstrip()
    if SamBasics.is_header(line): continue
    d = SamBasics.sam_line_to_dictionary(line)
    chrom = d['rname']
    if chrom =='*': continue
    if chrom not in g.keys():
      sys.stderr.write("WARNING: "+chrom+" not in reference, skipping\n")
      continue
    # mate is 'L' for flag 0x40, 'R' for flag 0x80, otherwise 'U'
    mate = 'U'
    if SamBasics.check_flag(d['flag'],int('0x4',16)): #check if its unmapped
      continue  # we can ignore the unmapped things for now
    if SamBasics.check_flag(d['flag'],int('0x40',16)):
      mate = 'L'
    elif SamBasics.check_flag(d['flag'],int('0x80',16)):
      mate = 'R'
    actual_read = d['qname']+"\t"+mate
    if actual_read not in read_mapping_count:
      read_mapping_count[actual_read] = 0
    read_mapping_count[actual_read] += 1
    has_intron = 0
    start_loc = d['pos']
    current_loc = start_loc
    # bounds collects [lbound, rbound, right_size] for each junction
    bounds  = []
    for i in range(0,len(d['cigar_array'])):
      ce = d['cigar_array'][i]
      if ce['op'] == 'N' and ce['val'] >= args.min_intron_size:
        has_intron = 1
        lbound = current_loc # should be the intron start base index-1
        current_loc += ce['val']
        rbound = current_loc # should be the second exon start base index-1
        # NOTE(review): raises IndexError if the N operation is the last
        # CIGAR element - confirm that cannot occur in the input.
        right_size = d['cigar_array'][i+1]['val']
        bounds.append([lbound,rbound,right_size])
      elif ce['op'] == 'D':
        current_loc += ce['val']
      # NOTE(review): S, H and P also advance current_loc here, and an N
      # shorter than min_intron_size advances nothing - confirm intended.
      elif re.match('[=XMSHP]',ce['op']):
        current_loc += ce['val'] 
    if has_intron == 0: continue # there are no splices to report here
    #print actual_read
    #print d['cigar']
    #print d
    #print start_loc
    #print bounds
    for bound in bounds:
      zall += 1
      # Two reference bases taken near each junction bound, joined by '-';
      # presumably the intron's donor/acceptor dinucleotides.
      intronflank = g[chrom][bound[0]-1:bound[0]+1].upper() + '-' + \
                    g[chrom][bound[1]-3:bound[1]-1].upper()
      strand = ''
      if is_canon(intronflank): # its a positive strand
        strand = '+'
      elif is_revcanon(intronflank): # its a negative strand
        strand = '-'
      else:
        # We can't deal with the non-canonical splice sorry
        zn += 1
        sys.stderr.write("WARNING skipping non-canonical splice ("+str(zn)+"/"+str(zall)+")\r")
        continue
      # If we are still in we have successfully found a splice
      # The BED entry has two 50-base blocks, one on each side of the
      # junction.
      out_chrom = chrom
      out_start = bound[0]-51
      out_end = bound[1]+49
      out_name = '*' # this will be done later
      out_score = 50
      out_strand = strand
      out_thickStart = out_start
      out_thickEnd = out_end
      out_rgb = '0,0,0'
      out_block_count = 2
      out_block_sizes = '50,50'
      out_block_starts = '0,'+str(bound[1]-bound[0]+50)
      bed = []
      bed.append(out_chrom)
      bed.append(str(out_start))
      bed.append(str(out_end))
      bed.append(out_name)
      bed.append(str(out_score))
      bed.append(out_strand)
      bed.append(str(out_thickStart))
      bed.append(str(out_thickEnd))
      bed.append(out_rgb)
      bed.append(str(out_block_count))
      bed.append(out_block_sizes)
      bed.append(out_block_starts)
      # The joined text (with the placeholder name) is the key that
      # collapses identical junctions from different reads.
      entry = "\t".join(bed)
      if entry not in junctions:
        junctions[entry] = {}
        junctions[entry]['reads'] = set()
        junctions[entry]['positions'] = set()
        junctions[entry]['right_sizes'] = set()
      junctions[entry]['reads'].add(actual_read)
      junctions[entry]['positions'].add(d['pos'])
      junctions[entry]['right_sizes'].add(bound[2])
  sys.stderr.write("\n")
  sys.stderr.write("finished reading sam\n")
  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  if len(junctions) > 0: # if we have stuff lets print a header
    of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
  for entry in junctions:
    # nR: supporting reads; width: spread of right-hand block sizes;
    # nNR: distinct alignment start positions;
    # nUR / nMR: reads seen once / more than once among mapped records.
    nR = len(junctions[entry]['reads'])
    width = max(junctions[entry]['right_sizes'])-min(junctions[entry]['right_sizes'])
    nNR = len(junctions[entry]['positions'])
    nUR = 0
    nMR = 0
    for read in junctions[entry]['reads']:
      if read_mapping_count[read] == 1:
        nUR += 1
      elif read_mapping_count[read] > 1:
        nMR += 1
      else:
        sys.stderr.write("ERROR: nonsense read count\n")
        return
    name = '('+str(nR)+')['+str(width)+'_'+str(nNR)+']('+str(nUR)+'/'+str(nMR)+')'
    bed = entry.split("\t")
    # Replace the '*' placeholder in the name column with the summary.
    bed[3] = name
    of.write("\t".join(bed)+"\n")    
def main():
    """Tabulate per-GC-bin coverage depth for the exons of a GPD input.

    Options come from ``do_inputs()`` (defined elsewhere).  Each GPD entry
    (or each fixed-size fragment of it when ``args.fragment`` is set) is
    assigned to one of ``args.number_of_bins`` bins by the GC content of
    its sequence, and its exons are piped as BED lines through
    ``sort | bed_to_bed_depth.py`` into one gzip file per bin inside
    ``args.tempdir``.  With ``args.best_X_covered`` the per-bin files are
    additionally intersected with a stratified coverage file.  Finally one
    line ``bin<TAB>depth<TAB>bases`` per bin and depth is written to
    ``args.output`` (or stdout).
    """
    #do our inputs
    args = do_inputs()

    of = sys.stdout
    if args.output: of = open(args.output, 'w')

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    sys.stderr.write("reading in fasta\n")
    f = FastaData(open(args.reference).read())
    sh = GPDStream(inf)
    # NOTE(review): gc_bins is not used anywhere below.
    gc_bins = range(0, args.number_of_bins)
    # One [sort process, depth process, output filename, bin index] per bin.
    bin_handles = []
    for i in range(0, args.number_of_bins):
        fname = args.tempdir + '/' + str(i) + '.bed.gz'
        # p2 is started first so that p1 can write into its stdin.
        cmd2 = 'bed_to_bed_depth.py - -o ' + fname
        p2 = Popen(cmd2.split(), stdin=PIPE, close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, close_fds=True)
        bin_handles.append([p1, p2, fname, i])

    if args.best_X_covered:
        sys.stderr.write("work out stratified data\n")
        # Extra pipeline fed with every BED line regardless of bin:
        # sort | bed_to_bed_depth.py | bed_depth_to_stratified_coverage.py
        cmd3 = 'bed_depth_to_stratified_coverage.py --minimum_coverage 10 --output_key ' + args.tempdir + '/key' + ' -r ' + args.reference + ' - -o ' + args.tempdir + '/combo.bed.gz'
        pstrat3 = Popen(cmd3.split(), stdin=PIPE, close_fds=True)
        cmd2 = 'bed_to_bed_depth.py -'
        pstrat2 = Popen(cmd2.split(),
                        stdin=PIPE,
                        stdout=pstrat3.stdin,
                        close_fds=True)
        cmd1 = 'sort -k 1,1 -k2,2n -k3,3n -T ' + args.tempdir
        pstrat1 = Popen(cmd1.split(),
                        stdin=PIPE,
                        stdout=pstrat2.stdin,
                        close_fds=True)

    num = 0
    for gpd in sh:
        num += 1
        if (num % 1000 == 0): sys.stderr.write(str(num) + "     \r")
        # results holds [bed line, gc bin] pairs for this entry
        results = []
        if args.minimum_sequence_length:
            # Whole-entry mode: one GC bin for the entire sequence.
            if gpd.get_length() < args.minimum_sequence_length: continue
            seq = gpd.get_sequence(f).upper()
            seq_obj = Seq(seq)
            n_count = seq_obj.n_count()
            if len(seq) - n_count < args.min_non_N: continue
            gc = seq_obj.gc_content()
            gc_bin = int(args.number_of_bins * gc)
            # Clamp so the top value lands in the last bin
            # (presumably reached when gc_content() returns 1.0).
            if gc_bin == args.number_of_bins: gc_bin -= 1
            for exon in gpd.exons:
                bed_bin = [
                    "\t".join([str(x) for x in exon.rng.get_bed_array()]),
                    gc_bin
                ]
                results.append(bed_bin)
        elif args.fragment:
            # Fragment mode: cut the entry into args.fragment-sized pieces
            # and bin each piece separately.
            seqlen = gpd.get_length()
            if seqlen < args.fragment: continue
            sfrags = int(float(seqlen) / float(args.fragment))
            sremain = seqlen % args.fragment
            offset = 0
            # Randomly skip the leftover bases at the start instead of
            # leaving them at the end.
            if random.random() < 0.5: offset = sremain
            #print '^^^'
            for i in range(0, sfrags):
                gsub = gpd.subset(i * args.fragment + offset,
                                  (i + 1) * args.fragment + offset)
                seq = gsub.get_sequence(f).upper()
                seq_obj = Seq(seq)
                n_count = seq_obj.n_count()
                if len(seq) - n_count < args.min_non_N: continue
                gc = seq_obj.gc_content()
                gc_bin = int(args.number_of_bins * gc)
                if gc_bin == args.number_of_bins: gc_bin -= 1
                for exon in gsub.exons:
                    bed_bin = [
                        "\t".join([str(x) for x in exon.rng.get_bed_array()]),
                        gc_bin
                    ]
                    results.append(bed_bin)

        for val in results:
            [bed, bin] = val
            bin_handles[bin][0].stdin.write(bed + "\n")
            if args.best_X_covered:
                pstrat1.stdin.write(bed + "\n")
                #if not gc: print len(gpd.get_sequence(f))

    sys.stderr.write("\n")
    # Finish each pipeline: the sort process first, then its consumer.
    for v in bin_handles:
        v[0].communicate()
        v[1].communicate()
    if args.best_X_covered:
        pstrat1.communicate()
        pstrat2.communicate()
        pstrat3.communicate()
        # If we want stratified data we should do it here
        sys.stderr.write("read the key\n")
        # d maps the first key column to the second, both as ints
        d = {}
        with open(args.tempdir + '/key') as inf:
            header = inf.readline()
            for line in inf:
                # NOTE: f is rebound from the FastaData object to a list of
                # fields here; the reference is not used after this point.
                f = line.rstrip().split("\t")
                d[int(f[0])] = int(f[1])
        if args.best_X_covered not in d:
            sys.stderr.write(
                "ERROR: the number of bases you specified is probably too big you didn't make the digit begin with 1 or 5 and restof the numbers be zero\n"
            )
            # NOTE(review): sys.exit() without an argument exits with
            # status 0 although this is an error path.
            sys.exit()
        num = d[args.best_X_covered]
        ninf = gzip.open(args.tempdir + '/combo.bed.gz')
        nof = gzip.open(args.tempdir + '/strat.bed.gz', 'w')
        # Keep regions whose fourth column is at least num, writing them
        # without their last column.
        for line in ninf:
            f = line.rstrip().split("\t")
            if int(f[3]) >= num:
                nof.write("\t".join(f[:-1]) + "\n")
        nof.close()
        ninf.close()
        for i in range(0, len(bin_handles)):
            v = bin_handles[i]
            fname = v[2]
            fname2 = args.tempdir + '/' + str(v[3]) + '.strata.bed.gz'
            gof = open(fname2, 'w')
            # bedtools intersect | gzip > fname2
            cmd2 = 'gzip'
            p2 = Popen(cmd2.split(), stdout=gof, stdin=PIPE)
            cmd1 = 'bedtools intersect -a ' + fname + ' -b ' + args.tempdir + '/strat.bed.gz'
            p1 = Popen(cmd1.split(), stdout=p2.stdin)
            p1.communicate()
            p2.communicate()
            gof.close()
            # lets just replace the name of the file that the final output will read from
            bin_handles[i][2] = fname2
    # Now we have bed depths for each bin
    for v in bin_handles:
        fname = v[2]
        #sys.stderr.write(fname+" ... prosessing\n")
        # depths maps a depth value to the total number of bases at it
        depths = {}
        bin = v[3]
        inf = gzip.open(fname)
        for line in inf:
            f = line.rstrip().split("\t")
            bases = int(f[2]) - int(f[1])
            depth = int(f[3])
            if depth not in depths: depths[depth] = 0
            depths[depth] += bases
        inf.close()
        for depth in sorted(depths.keys()):
            of.write(
                str(bin) + "\t" + str(depth) + "\t" + str(depths[depth]) +
                "\n")
    of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
Пример #8
0
def main(args):
    """Build a context error report from BAM alignments and plot it with R.

    Aligned entries of ``args.input`` are added to an ErrorProfileFactory,
    either by random access through the BAM index (``args.random``) or by
    iterating the file.  Collection stops once ``args.max_alignments``
    alignments were used or the minimum context count (refreshed every
    100 alignments) reaches ``args.stopping_point``.  The report is written
    to ``<tempdir>/err.txt``, ``plot_base_error_context.r`` is run once per
    entry of ``args.output``, and the report is optionally copied to
    ``args.output_raw``.

    :param args: parsed options; reads reference, input, input_index,
        random, query, max_alignments, stopping_point, tempdir,
        specific_tempdir, output, output_raw, rscript_path and scale
    :raises SystemExit: with status 1 when random access is requested but
        the BAM file has no index
    """
    # make our error profile report
    sys.stderr.write("Reading reference fasta\n")
    # Close the reference handle once it has been read.
    with open(args.reference) as ref_handle:
        ref = FastaData(ref_handle.read())
    sys.stderr.write("Reading alignments\n")
    epf = ErrorProfileFactory()
    strand = 'target'
    if args.query: strand = 'query'
    z = 0
    con = 0
    if args.random:
        if args.input_index:
            bf = BAMFile(args.input,
                         reference=ref,
                         index_file=args.input_index)
            bf.read_index(index_file=args.input_index)
        else:
            bf = BAMFile(args.input, reference=ref)
            bf.read_index()
        if not bf.has_index():
            sys.stderr.write("Random access requires an index be set\n")
            # Stop here: without an index the sampling loop below cannot
            # work, and the sibling alignment-error tool exits as well.
            sys.exit(1)
        while True:
            rname = random.choice(bf.index.get_names())
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if not coord: continue
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                # The context count is only refreshed every 100 alignments.
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                sys.stderr.write(
                    str(z) + " alignments, " + str(con) +
                    " min context coverage\r")
                if args.max_alignments <= z: break
                if args.stopping_point <= con: break

    else:
        bf = BAMFile(args.input, reference=ref)
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_min_context_count(strand)
                sys.stderr.write(
                    str(z) + " alignments, " + str(con) +
                    " min context coverage\r")
                if args.max_alignments <= z: break
                if args.stopping_point <= con: break
    sys.stderr.write("\n")
    sys.stderr.write('working with:' + "\n")
    sys.stderr.write(
        str(z) + " alignments, " + str(con) + " min context coverage" + "\n")
    epf.write_context_error_report(args.tempdir + '/err.txt', strand)

    for ofile in args.output:
        cmd = args.rscript_path + ' ' + os.path.dirname(
            os.path.realpath(__file__)
        ) + '/plot_base_error_context.r ' + args.tempdir + '/err.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())
    sys.stderr.write("finished\n")
    if args.output_raw:
        # Both handles are closed even if the copy fails part-way.
        with open(args.output_raw, 'w') as of:
            with open(args.tempdir + "/err.txt") as inf:
                for line in inf:
                    of.write(line)
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Replace BED depth values by coverage-strata labels and merge rows.

    The total reference size is taken from ``args.reference_genome`` (a
    FASTA) or, when that is not set, from the second column of
    ``args.reference_lengths``.  Strata thresholds start at
    ``args.minimum_coverage`` and are multiplied alternately by 5 and 2
    until they reach the reference size; the largest threshold gets label
    1.  Input rows are ranked by depth (highest first, optionally made
    unique with a small random addition) and each depth is given the label
    of the first threshold not smaller than the number of bases covered at
    that depth or deeper.  Adjacent rows on the same chromosome with the
    same label are merged before being written.

    :param args: parsed options; reads seed, reference_genome,
        reference_lengths, minimum_coverage, output_key, input, output,
        dont_make_unique and unique_scale
    """
    random.seed(args.seed)
    # Named total_bases so the builtin `sum` is not shadowed.
    total_bases = 0
    if args.reference_genome:
        with open(args.reference_genome) as ref_handle:
            ref = FastaData(ref_handle.read())
        for name in ref.keys():
            total_bases += len(ref[name])
    else:
        with open(args.reference_lengths) as inf:
            for line in inf:
                f = line.rstrip().split("\t")
                total_bases += int(f[1])
    c = args.minimum_coverage
    z = 0
    values = {}
    while c < total_bases:
        z += 1
        values[c] = z
        c = c * 5
        if c >= total_bases: break
        z += 1
        values[c] = z
        c = c * 2
    z += 1
    values[total_bases] = z
    # Invert the numbering so the largest threshold carries label 1.
    for c in sorted(values.keys()):
        values[c] = z - values[c] + 1
    ### Now values contains the stratified coverage values
    if args.output_key:
        with open(args.output_key, 'w') as key_out:
            key_out.write("bp_size\tstrata_label\n")
            for c in sorted(values.keys()):
                key_out.write(str(c) + "\t" + str(values[c]) + "\n")
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz': inf = gzip.open(args.input)
        else: inf = open(args.input)

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz': of = gzip.open(args.output, 'w')
        else: of = open(args.output, 'w')
    try:
        vals = []
        z = 0
        try:
            for line in inf:
                z += 1
                if z % 100000 == 0:
                    sys.stderr.write(str(z) + "    bed entries read   \r")
                f = line.rstrip().split("\t")
                addition = 0
                if not args.dont_make_unique:
                    # Tiny random offset so equal depths get a strict order.
                    addition = +args.unique_scale * random.random()
                vals.append(
                    [f[0], int(f[1]), int(f[2]), float(f[3]) + addition])
        finally:
            # Close the input only when this function opened it.
            if inf is not sys.stdin:
                inf.close()
        z = 0
        sys.stderr.write("\n")
        depths = {}
        for f in vals:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + "    bed entries read   \r")
            #keep track of the number of bases at each depth
            depth = f[3]
            cov = f[2] - f[1]
            if depth not in depths: depths[depth] = 0
            depths[depth] += cov
        sys.stderr.write("\n")
        stratas = sorted(values.keys())
        idx = 0
        pos = 0
        depth_strata = {}
        for d in sorted(depths.keys(), reverse=True):
            pos += depths[d]
            # Advance to the first threshold that still holds `pos` bases.
            while stratas[idx] < pos:
                idx += 1
            depth_strata[d] = values[stratas[idx]]
        # Empty input simply yields empty output; the original crashed
        # with IndexError on vals[0].
        if vals:
            merged = vals[0]
            merged[3] = depth_strata[merged[3]]
            for val in vals[1:]:
                val[3] = depth_strata[val[3]]
                if (val[1] == merged[2] and val[3] == merged[3]
                        and val[0] == merged[0]):
                    # Touching row with the same label: extend the region.
                    merged[2] = val[2]
                    continue
                of.write(merged[0] + "\t" + str(merged[1]) + "\t" +
                         str(merged[2]) + "\t" + str(merged[3]) + "\n")
                merged = val
            of.write(merged[0] + "\t" + str(merged[1]) + "\t" +
                     str(merged[2]) + "\t" + str(merged[3]) + "\n")
    finally:
        # Never close the process-wide stdout; just flush it.
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
Пример #10
0
def main():
  """Run every input line through do_buffer and emit SAM or BAM output.

  Options come from ``do_inputs()`` (defined elsewhere).  Output goes to
  stdout, or - when ``args.output`` ends in ``.bam`` - into the stdin of a
  ``samtools view -Sb`` process writing that file.  The reference FASTA is
  loaded and upper-cased into a plain dict, a SAM header is written, and
  the input lines are handed to ``do_buffer`` one buffer at a time,
  either inline or through a multiprocessing Pool when
  ``args.threads > 1``.  Results are passed to ``do_out``, which
  presumably writes to the module-level ``of`` set here.

  :raises SystemExit: with status 1 for an output name that is not .bam
  """
  #do our inputs
  args = do_inputs()
  global of
  of = sys.stdout
  if args.output:
    if args.output[-4:] == '.bam':
      # An argv list keeps an output path with whitespace in one piece;
      # splitting a command string would break it apart.
      p = Popen(['samtools','view','-Sb','-','-o',args.output],stdin=PIPE)
      of = p.stdin
    else:
      sys.stderr.write("ERROR: stdout and .bam are the only valid output formats\n")
      # Signal the failure with a non-zero exit status.
      sys.exit(1)
  inf = sys.stdin
  if args.input != '-':
    if args.input[-3:] == '.gz':
      inf = gzip.open(args.input)
    else: inf = open(args.input)
  sys.stderr.write("reading reference genome\n")
  with open(args.reference) as ref_handle:
    ref = FastaData(ref_handle.read())
  #shared = manager.dict()
  shared = {}
  for name in sorted(ref.keys()):
    sys.stderr.write("reading "+name+"\n")
    shared[name] = ref[name].upper()
    # Drop the original copy once the upper-cased one is stored.
    ref.remove(name)
  sys.stderr.write("finished reading shared memory reference\n")
  sys.stderr.write("Now make the header\n")
  of.write("@HD\tVN:1.0\tSO:unknown\n")
  of.write("@PG\tID:SLR\n")
  for name in sorted(shared.keys()):
    of.write("@SQ\tSN:"+name+"\tLN:"+str(len(shared[name]))+"\n")

  # A pool is only used for more than one thread; any other value runs
  # inline (the original raised NameError for a thread count below 1).
  pool = None
  if args.threads > 1:
    pool = Pool(processes=args.threads)

  def submit(lines):
    # Process one buffer of lines, inline or on the pool.
    if pool is None:
      do_out(do_buffer(lines,shared,args))
    else:
      pool.apply_async(do_buffer,args=(lines,shared,args,),callback=do_out)

  pending = []
  max_buffer = 1
  z = 0
  for line in inf:
    z += 1
    if z%1000==0: sys.stderr.write(str(z)+"   \r")
    pending.append(line)
    if len(pending) >= max_buffer:
      submit(pending)
      # Rebind instead of clearing so a queued buffer is never mutated.
      pending = []
  if len(pending) > 0:
    submit(pending)
  if inf is not sys.stdin:
    inf.close()

  if pool is not None:
    pool.close()
    pool.join()

  sys.stderr.write("\n")
  if args.output:
    # Closes samtools' stdin and waits for it to finish writing the BAM.
    p.communicate()
  else:
    # Flush rather than close the process-wide stdout.
    of.flush()

  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
Пример #11
0
def main(args):
  """Label BED depth rows with coverage strata and merge adjacent rows.

  The reference size comes from ``args.reference_genome`` (FASTA) or,
  failing that, from column two of ``args.reference_lengths``.  Strata
  thresholds grow from ``args.minimum_coverage`` by alternating factors
  of 5 and 2 up to the reference size, and the largest threshold is
  label 1.  Rows are ranked by depth, highest first (with an optional
  small random addition to break ties); every depth receives the label
  of the first threshold that is not smaller than the number of bases
  covered at that depth or deeper.  Touching rows on one chromosome
  with equal labels are merged on output.

  :param args: parsed options; reads seed, reference_genome,
      reference_lengths, minimum_coverage, output_key, input, output,
      dont_make_unique and unique_scale
  """
  random.seed(args.seed)
  # total_bases instead of `sum`, which would shadow the builtin.
  total_bases = 0
  if args.reference_genome:
    with open(args.reference_genome) as ref_handle:
      ref = FastaData(ref_handle.read())
    for name in ref.keys():
      total_bases += len(ref[name])
  else:
    with open(args.reference_lengths) as inf:
      for line in inf:
        f = line.rstrip().split("\t")
        total_bases += int(f[1])
  c = args.minimum_coverage
  z = 0
  values = {}
  while c < total_bases:
    z += 1
    values[c] = z
    c = c*5
    if c >= total_bases: break
    z += 1
    values[c] = z
    c = c*2
  z +=1
  values[total_bases] = z
  # Flip the numbering: the biggest threshold ends up with label 1.
  for c in sorted(values.keys()):
    values[c] = z-values[c]+1
  ### Now values contains the stratified coverage values
  if args.output_key:
    with open(args.output_key,'w') as key_out:
      key_out.write("bp_size\tstrata_label\n")
      for c in sorted(values.keys()):
        key_out.write(str(c)+"\t"+str(values[c])+"\n")
  inf = sys.stdin
  if args.input != '-':
    if args.input[-3:]=='.gz': inf = gzip.open(args.input)
    else: inf = open(args.input)

  of = sys.stdout
  if args.output:
    if args.output[-3:]=='.gz': of = gzip.open(args.output,'w')
    else: of = open(args.output,'w')
  try:
    vals = []
    z = 0
    try:
      for line in inf:
        z += 1
        if z % 100000 == 0: sys.stderr.write(str(z)+"    bed entries read   \r")
        f = line.rstrip().split("\t")
        addition = 0
        # Small random offset gives equal depths a strict ordering.
        if not args.dont_make_unique: addition = +args.unique_scale*random.random()
        vals.append([f[0],int(f[1]),int(f[2]),float(f[3])+addition])
    finally:
      # Only close an input this function opened itself.
      if inf is not sys.stdin:
        inf.close()
    z = 0
    sys.stderr.write("\n")
    depths = {}
    for f in vals:
      z += 1
      if z % 100000 == 0: sys.stderr.write(str(z)+"    bed entries read   \r")
      #keep track of the number of bases at each depth
      depth = f[3]
      cov = f[2]-f[1]
      if depth not in depths:  depths[depth] = 0
      depths[depth] += cov
    sys.stderr.write("\n")
    stratas = sorted(values.keys())
    idx = 0
    pos = 0
    depth_strata = {}
    for d in sorted(depths.keys(),reverse=True):
      pos += depths[d]
      # Move on to the first threshold that can hold `pos` bases.
      while stratas[idx] < pos:
        idx += 1
      depth_strata[d] = values[stratas[idx]]
    # With no input rows nothing is written; the original raised
    # IndexError on vals[0].
    if vals:
      merged = vals[0]
      merged[3] = depth_strata[merged[3]]
      for val in vals[1:]:
        val[3] = depth_strata[val[3]]
        if val[1]==merged[2] and val[3]==merged[3] and val[0]==merged[0]:
          # Same label and touching coordinates: grow the current region.
          merged[2] = val[2]
          continue
        of.write(merged[0]+"\t"+str(merged[1])+"\t"+str(merged[2])+"\t"+str(merged[3])+"\n")
        merged = val
      of.write(merged[0]+"\t"+str(merged[1])+"\t"+str(merged[2])+"\t"+str(merged[3])+"\n")
  finally:
    # stdout belongs to the process, so flush it instead of closing it.
    if of is sys.stdout:
      of.flush()
    else:
      of.close()
Пример #12
0
def main(args):
  """Sample alignments from a BAM file and dump error/quality statistics.

  Up to ``args.max_alignments`` alignments are picked at random through
  the BAM index.  Their quality strings are passed to ``do_qualities``
  (defined elsewhere) together with 100 per-bin lists, and aligned ones
  are added to an ErrorProfileFactory.  Sampling stops early once, at a
  multiple-of-100 iteration, both ``args.min_context`` and
  ``args.min_alignments`` are reached.  The collected statistics are
  written to ``args.output`` as one base64 line of zlib-compressed JSON
  (gzip-wrapped when the name ends in ``.gz``).

  :param args: parsed options; reads reference_fasta, bam_input,
      max_alignments, min_alignments, min_context, output, tempdir and
      specific_tempdir
  """
  sys.stderr.write("Read reference fasta\n")
  # Release the reference handle right after reading it.
  with open(args.reference_fasta) as ref_handle:
    fasta = FastaData(ref_handle.read())
  sys.stderr.write("Read alignment file\n")
  bf = BAMFile(args.bam_input,reference=fasta)
  bf.read_index()
  # 100 independent lists, filled by do_qualities
  total_qualities = [[] for _ in range(0,100)]
  ef = ErrorProfileFactory()
  mincontext = 0
  alignments = 0
  for i in range(0,args.max_alignments):
    rname = random.choice(bf.index.get_names())
    coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
    if not coord: continue
    bam = bf.fetch_by_coord(coord)
    qual = bam.value('qual')
    do_qualities(total_qualities,qual)
    if not bam.is_aligned(): continue
    alignments += 1
    ef.add_alignment(bam)
    # The stopping criteria are only evaluated every 100th iteration.
    if i%100 == 0:
      mincontext = ef.get_min_context_count('target')
      if mincontext:
        if mincontext >= args.min_context and alignments >= args.min_alignments: break
    sys.stderr.write(str(i+1)+" lines   "+str(alignments)+"/"+str(args.min_alignments)+" alignments   "+str(mincontext)+"/"+str(args.min_context)+" mincontext        \r")
  sys.stderr.write("\n")
  sys.stderr.write(str(mincontext)+" minimum contexts observed\n")
  target_context = ef.get_target_context_error_report()
  general_error_stats = ef.get_alignment_errors().get_stats()
  general_error_report = ef.get_alignment_errors().get_report()
  # convert report to table
  general_all = [x.split("\t") for x in general_error_report.rstrip().split("\n")]
  general_head = general_all[0]
  general_data = [[y[0],y[1],int(y[2]),int(y[3])] for y in general_all[1:]]
  general_error_report = {'head':general_head,'data':general_data}
  quality_counts = []
  for vals in total_qualities:
    # Count observations per (ordinal quality, run length) pair.
    counts = {}
    for v in vals:
      pair = (v[0],v[1])
      counts[pair] = counts.get(pair,0)+1
    # Sorting the pairs orders by ordinal first, then by run length.
    quality_counts.append([[pair[0],pair[1],counts[pair]] for pair in sorted(counts)])
  #Quailty counts now has 100 bins, each has an ordered array of
  # [ordinal_quality, run_length, observation_count]

  # Can prepare an output
  output = {}
  output['quality_counts'] = quality_counts
  output['context_error'] = target_context
  output['alignment_error'] = general_error_report
  output['error_stats'] = general_error_stats
  if args.output[-3:]=='.gz':
    of = gzip.open(args.output,'w')
  else: of = open(args.output,'w')
  try:
    of.write(base64.b64encode(zlib.compress(json.dumps(output)))+"\n")
  finally:
    # Close the output even when serialisation or the write fails.
    of.close()
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
Пример #13
0
def main(args):
    """Build an alignment error report from a BAM file and plot it with R.

    Aligned entries of ``args.input`` are added to an ErrorProfileFactory,
    sampled at random through the index when ``args.random`` is set and
    read in file order otherwise.  Every 100 alignments the accumulated
    alignment length is checked and collection stops once it reaches
    ``args.max_length``.  The report goes to ``<tempdir>/report.txt``,
    ``plot_alignment_errors.r`` is run once per entry of ``args.output``,
    and the raw report and the statistics are optionally written to
    ``args.output_raw`` and ``args.output_stats``.

    :param args: parsed options; reads reference, input, input_index,
        random, max_length, tempdir, specific_tempdir, output, output_raw,
        output_stats, rscript_path and scale
    :raises SystemExit: with status 1 when random access is requested but
        no index is available
    """

    sys.stderr.write("Reading our reference Fasta\n")
    # Close the reference handle once its content has been read.
    with open(args.reference, 'rb') as ref_handle:
        ref = FastaData(ref_handle.read())
    sys.stderr.write("Finished reading our reference Fasta\n")
    bf = None
    if args.input_index:
        bf = BAMFile(args.input, reference=ref, index_file=args.input_index)
        bf.read_index(index_file=args.input_index)
    else:
        bf = BAMFile(args.input, reference=ref)
        bf.read_index()
    epf = ErrorProfileFactory()
    if args.random:
        if not bf.has_index():
            sys.stderr.write(
                "Random access requires our format of index bgi to be set\n")
            # Report the failure through a non-zero exit status.
            sys.exit(1)
        z = 0
        while True:
            rname = random.choice(bf.index.get_names())
            coord = bf.index.get_longest_target_alignment_coords_by_name(rname)
            if not coord: continue
            e = bf.fetch_by_coord(coord)
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                # The length limit is only checked every 100 alignments.
                if z % 100 == 1:
                    con = epf.get_alignment_errors().alignment_length
                    if args.max_length <= con: break
                    sys.stderr.write(
                        str(con) + "/" + str(args.max_length) +
                        " bases from " + str(z) + " alignments\r")
        sys.stderr.write("\n")
    else:
        z = 0
        for e in bf:
            if e.is_aligned():
                epf.add_alignment(e)
                z += 1
                if z % 100 == 1:
                    con = epf.get_alignment_errors().alignment_length
                    if args.max_length <= con: break
                    sys.stderr.write(
                        str(con) + "/" + str(args.max_length) +
                        " bases from " + str(z) + " alignments\r")
        sys.stderr.write("\n")
    with open(args.tempdir + '/report.txt', 'w') as of:
        of.write(epf.get_alignment_errors().get_report())

    for ofile in args.output:
        cmd = args.rscript_path + ' ' + os.path.dirname(
            os.path.realpath(__file__)
        ) + '/plot_alignment_errors.r ' + args.tempdir + '/report.txt ' + ofile + ' '
        if args.scale:
            cmd += ' '.join([str(x) for x in args.scale])
        sys.stderr.write(cmd + "\n")
        call(cmd.split())

    if args.output_raw:
        # Context managers close both files even if the copy fails.
        with open(args.output_raw, 'w') as of:
            with open(args.tempdir + "/report.txt") as inf:
                for line in inf:
                    of.write(line)
    if args.output_stats:
        with open(args.output_stats, 'w') as of:
            of.write(epf.get_alignment_errors().get_stats())
    sys.stderr.write("finished\n")
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
Пример #14
0
 def set_reference_genome(self, ref_genome):
     """Load a reference FASTA and attach it to this object.

     :param ref_genome: path of the reference FASTA file
     """
     # Read through a context manager so the file handle is closed, and
     # only flag the genome as set after loading has succeeded.
     with open(ref_genome) as fasta_handle:
         loaded = FastaData(fasta_handle.read())
     self.ref_genome = loaded
     self.ref_genome_set = True
Пример #15
0
def main():
    """Feed every input line to do_buffer and write SAM or BAM output.

    Options are obtained from ``do_inputs()`` (defined elsewhere).  Output
    is stdout, or the stdin of a ``samtools view -Sb`` process when
    ``args.output`` ends in ``.bam``.  The reference FASTA is loaded into
    a dict of upper-cased sequences, a SAM header is written, and input
    lines are processed buffer by buffer with ``do_buffer`` - inline, or
    on a multiprocessing Pool when ``args.threads > 1``.  Each result is
    handed to ``do_out``, which presumably writes to the module-level
    ``of`` assigned here.

    :raises SystemExit: with status 1 for an output name that is not .bam
    """
    #do our inputs
    args = do_inputs()
    global of
    of = sys.stdout
    if args.output:
        if args.output[-4:] == '.bam':
            # Pass argv as a list so an output path containing whitespace
            # is not torn apart the way splitting a command string would.
            p = Popen(['samtools', 'view', '-Sb', '-', '-o', args.output],
                      stdin=PIPE)
            of = p.stdin
        else:
            sys.stderr.write(
                "ERROR: stdout and .bam are the only valid output formats\n")
            # Exit non-zero so callers can detect the failure.
            sys.exit(1)
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    sys.stderr.write("reading reference genome\n")
    with open(args.reference) as ref_handle:
        ref = FastaData(ref_handle.read())
    #shared = manager.dict()
    shared = {}
    for name in sorted(ref.keys()):
        sys.stderr.write("reading " + name + "\n")
        shared[name] = ref[name].upper()
        # The upper-cased copy is stored, so drop the original.
        ref.remove(name)
    sys.stderr.write("finished reading shared memory reference\n")
    sys.stderr.write("Now make the header\n")
    of.write("@HD\tVN:1.0\tSO:unknown\n")
    of.write("@PG\tID:SLR\n")
    for name in sorted(shared.keys()):
        of.write("@SQ\tSN:" + name + "\tLN:" + str(len(shared[name])) + "\n")

    # Use a pool only for more than one thread; every other value runs
    # inline (the original raised NameError for a thread count below 1).
    pool = None
    if args.threads > 1:
        pool = Pool(processes=args.threads)

    def submit(lines):
        # Process one buffer of lines, inline or on the pool.
        if pool is None:
            do_out(do_buffer(lines, shared, args))
        else:
            pool.apply_async(do_buffer,
                             args=(
                                 lines,
                                 shared,
                                 args,
                             ),
                             callback=do_out)

    pending = []
    max_buffer = 1
    z = 0
    for line in inf:
        z += 1
        if z % 1000 == 0: sys.stderr.write(str(z) + "   \r")
        pending.append(line)
        if len(pending) >= max_buffer:
            submit(pending)
            # Rebind rather than clear so a queued buffer stays intact.
            pending = []
    if len(pending) > 0:
        submit(pending)
    if inf is not sys.stdin:
        inf.close()

    if pool is not None:
        pool.close()
        pool.join()

    sys.stderr.write("\n")
    if args.output:
        # Closes samtools' stdin and waits until the BAM is written.
        p.communicate()
    else:
        # Flush instead of closing the process-wide stdout.
        of.flush()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)