示例#1
0
def construct_header_from_reference_fasta(ref_fasta_filename):
  """Build SAM header text from the sequences of a reference FASTA.

  The file is read in full and parsed with FastaData.  The name and length
  of every sequence are reported on stderr, in sorted name order.

  Args:
    ref_fasta_filename: path of the reference FASTA file.

  Returns:
    A string holding an @HD line (VN:1.0, SO:coordinate), one @SQ line per
    sequence sorted by name, and a closing @PG line.  Every line ends with a
    newline.
  """
  # Release the file handle as soon as the contents are in memory instead of
  # leaving it to the garbage collector.
  with open(ref_fasta_filename) as fasta_handle:
    g = FastaData(fasta_handle.read())
  sq_lines = []
  for name in sorted(g.keys()):
    length = len(g[name])  # fetch the sequence once, reuse for log and header
    sys.stderr.write(name+" is there at length "+str(length)+"\n")
    sq_lines.append("@SQ\tSN:"+name+"\tLN:"+str(length)+"\n")
  header = "@HD\tVN:1.0\tSO:coordinate\n"
  header += ''.join(sq_lines)
  header += "@PG\tID:SamBasics.py\tVN:1.0\n"
  return header
示例#2
0
def construct_header_from_reference_fasta(ref_fasta_filename):
    """Return a SAM header describing the sequences in a reference FASTA.

    The FASTA is read completely and handed to FastaData.  Each sequence name
    and its length is logged to stderr while the header is assembled.

    Args:
        ref_fasta_filename: path of the reference FASTA file.

    Returns:
        Header text made of an @HD line (VN:1.0, SO:coordinate), one @SQ line
        per sequence in sorted name order, and a final @PG line; each line is
        newline-terminated.
    """
    # The context manager closes the FASTA handle deterministically; the
    # previous open(...).read() left it open until garbage collection.
    with open(ref_fasta_filename) as fasta_handle:
        g = FastaData(fasta_handle.read())

    pieces = ["@HD\tVN:1.0\tSO:coordinate\n"]
    for seq_name in sorted(g.keys()):
        seq_length = str(len(g[seq_name]))
        sys.stderr.write(seq_name + " is there at length " + seq_length +
                         "\n")
        pieces.append("@SQ\tSN:" + seq_name + "\tLN:" + seq_length + "\n")
    pieces.append("@PG\tID:SamBasics.py\tVN:1.0\n")
    return ''.join(pieces)
示例#3
0
def main():
    """Write the reads of the input as unaligned SAM records on stdout.

    Command line: a positional input path ('-' for STDIN), an optional
    -r/--reference FASTA used only to emit @SQ header lines, and --no_qual to
    replace the quality string with '*'.

    Output is an @HD and @PG header, optional @SQ lines sorted by sequence
    name, then one record per entry yielded by FastqHandle.  Each record has
    flag 4, '*' / 0 placeholders for the alignment columns, and the tag
    XO:Z:NM.
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-r', '--reference', help="reference genome FASTA")
    parser.add_argument('--no_qual',
                        action='store_true',
                        help="dont put in quality")
    args = parser.parse_args()

    ref = {}
    if args.reference:
        # Close the reference as soon as it has been read into memory.
        with open(args.reference, 'rb') as ref_handle:
            ref = FastaData(ref_handle.read())

    opened_input = args.input != '-'
    args.input = open(args.input) if opened_input else sys.stdin
    try:
        # print(x) with a single argument behaves the same as the statement
        # form on Python 2 and is also valid Python 3 syntax.
        print('@HD\tVN:1.0\tSO:unsorted')
        print('@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:' +
              ' '.join(sys.argv))
        if ref:
            for seq_name in sorted(ref.keys()):
                print("@SQ\tSN:" + seq_name + "\t" + 'LN:' +
                      str(len(ref[seq_name])))
        for e in FastqHandle(args.input):
            qual = "*" if args.no_qual else e.qual
            # QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL tag
            print("\t".join([e.name, "4", "*", "0", "0", "*", "*", "0", "0",
                             e.seq, qual, "XO:Z:NM"]))
    finally:
        # Only close what this function opened; leave STDIN alone.
        if opened_input:
            args.input.close()
def main():
  """Convert the input reads to unaligned SAM and print them on stdout.

  Arguments come from the command line: the input path ('-' for STDIN), an
  optional -r/--reference FASTA that only contributes @SQ header lines, and
  --no_qual to emit '*' instead of the quality string.

  After the @HD / @PG header (plus @SQ lines sorted by name when a reference
  is given), every entry yielded by FastqHandle becomes one record with flag
  4, placeholder alignment columns and the tag XO:Z:NM.
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Use - for STDIN")
  parser.add_argument('-r','--reference',help="reference genome FASTA")
  parser.add_argument('--no_qual',action='store_true',help="dont put in quality")
  args = parser.parse_args()

  ref = {}
  if args.reference:
    # The handle is closed once the FASTA text has been read.
    with open(args.reference,'rb') as ref_handle:
      ref = FastaData(ref_handle.read())

  opened_input = args.input != '-'
  if opened_input:
    args.input = open(args.input)
  else:
    args.input = sys.stdin

  # Columns 2-9 are constant for an unaligned record:
  # FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN
  placeholder_columns = ["4","*","0","0","*","*","0","0"]
  try:
    # Single-argument print(...) is identical to the statement form on
    # Python 2 and also parses on Python 3.
    print('@HD\tVN:1.0\tSO:unsorted')
    print('@PG\tID:FA2UN\tPN:FA2UN\tVN:2016-06-09\tCL:'+' '.join(sys.argv))
    if ref:
      for seq_name in sorted(ref.keys()):
        print("@SQ\tSN:"+seq_name+"\t"+'LN:'+str(len(ref[seq_name])))
    for e in FastqHandle(args.input):
      fields = [e.name] + placeholder_columns + [e.seq]
      fields.append("*" if args.no_qual else e.qual)
      fields.append("XO:Z:NM")
      print("\t".join(fields))
  finally:
    # Close only the file opened here, never STDIN.
    if opened_input:
      args.input.close()
示例#5
0
def _junction_bounds(cigar_array, start_loc, min_intron_size):
  """Locate the splice junctions described by one alignment's CIGAR.

  Args:
    cigar_array: sequence of dicts with an 'op' character and an integer 'val'.
    start_loc: alignment start position, the SAM POS value.
    min_intron_size: N operations shorter than this are not reported.

  Returns:
    A list of [lbound, rbound, right_size] lists, one per reported N op.
    lbound is the position of the first intron base, rbound the position of
    the first base after the intron, and right_size the length of the CIGAR
    operation that follows the N.
  """
  # Only these operations consume reference bases in the SAM specification.
  # Soft clips, hard clips, padding and insertions must not move the
  # reference cursor, otherwise every junction of a clipped read is shifted.
  reference_ops = ('M', 'D', 'N', '=', 'X')
  bounds = []
  current_loc = start_loc
  for i, ce in enumerate(cigar_array):
    if ce['op'] not in reference_ops:
      continue
    if ce['op'] == 'N' and ce['val'] >= min_intron_size:
      right_size = cigar_array[i+1]['val']
      bounds.append([current_loc, current_loc + ce['val'], right_size])
    # A short N is not reported but still spans reference bases, so the
    # cursor advances for it as well.
    current_loc += ce['val']
  return bounds

def main():
  """Read a SAM file and write its splice junctions as a BED track.

  Command line: -o/--output file (default stdout), --min_intron_size
  (default 68), the SAM file ('-' for STDIN) and the reference genome FASTA.

  Alignments on '*', on chromosomes missing from the reference, or flagged
  unmapped (0x4) are skipped.  Each sufficiently long N operation whose
  flanking intron bases pass is_canon / is_revcanon becomes a two-block BED
  entry with 50 bases on either side.  Other splices are counted and skipped
  with a warning.  The name column is written as
  (reads)[right_size_range_distinct_positions](unique/multi).
  """
  parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
  parser.add_argument('-o','--output',help='FILENAME is output')
  parser.add_argument('--min_intron_size',type=int,default=68,help='minimum intron size')
  parser.add_argument('infile',help='FILENAME of sam file or "-" for STDIN')
  parser.add_argument('reference_genome',help='FILENAME of the reference genome')
  args = parser.parse_args()

  # get our reference genome
  sys.stderr.write("reading reference genome\n")
  with open(args.reference_genome) as ref_handle:
    g = FastaData(ref_handle.read())
  sys.stderr.write("finished reading reference genome\n")
  # Build the name set once; calling g.keys() for every SAM line is wasteful.
  known_chroms = set(g.keys())

  inf = sys.stdin
  read_mapping_count = {}
  junctions = {}
  if args.infile != '-':
    inf = open(args.infile)
  sys.stderr.write("reading through sam file\n")
  zall = 0  # junctions examined
  zn = 0    # junctions skipped as non-canonical
  # readline() is kept (rather than file iteration) so piped input is
  # consumed line by line as it arrives.
  for line in iter(inf.readline, ''):
    line = line.rstrip()
    if SamBasics.is_header(line): continue
    d = SamBasics.sam_line_to_dictionary(line)
    chrom = d['rname']
    if chrom == '*': continue
    if chrom not in known_chroms:
      sys.stderr.write("WARNING: "+chrom+" not in reference, skipping\n")
      continue
    if SamBasics.check_flag(d['flag'],0x4): # unmapped
      continue  # we can ignore the unmapped things for now
    mate = 'U'
    if SamBasics.check_flag(d['flag'],0x40):
      mate = 'L'
    elif SamBasics.check_flag(d['flag'],0x80):
      mate = 'R'
    actual_read = d['qname']+"\t"+mate
    read_mapping_count[actual_read] = read_mapping_count.get(actual_read,0) + 1

    for lbound, rbound, right_size in _junction_bounds(d['cigar_array'],d['pos'],args.min_intron_size):
      zall += 1
      # First two and last two bases of the intron.
      intronflank = g[chrom][lbound-1:lbound+1].upper() + '-' + \
                    g[chrom][rbound-3:rbound-1].upper()
      if is_canon(intronflank): # its a positive strand
        strand = '+'
      elif is_revcanon(intronflank): # its a negative strand
        strand = '-'
      else:
        # We can't deal with the non-canonical splice sorry
        zn += 1
        sys.stderr.write("WARNING skipping non-canonical splice ("+str(zn)+"/"+str(zall)+")\r")
        continue
      # If we are still in we have successfully found a splice
      out_start = str(lbound-51)
      out_end = str(rbound+49)
      entry = "\t".join([
        chrom,
        out_start,
        out_end,
        '*',          # name, filled in once all reads are counted
        '50',         # score
        strand,
        out_start,    # thickStart
        out_end,      # thickEnd
        '0,0,0',      # rgb
        '2',          # block count
        '50,50',      # block sizes
        '0,'+str(rbound-lbound+50),  # block starts
      ])
      info = junctions.get(entry)
      if info is None:
        info = {'reads': set(), 'positions': set(), 'right_sizes': set()}
        junctions[entry] = info
      info['reads'].add(actual_read)
      info['positions'].add(d['pos'])
      info['right_sizes'].add(right_size)
  if inf is not sys.stdin:
    inf.close()
  sys.stderr.write("\n")
  sys.stderr.write("finished reading sam\n")

  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  try:
    if len(junctions) > 0: # if we have stuff lets print a header
      of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
    for entry in junctions:
      info = junctions[entry]
      nR = len(info['reads'])
      width = max(info['right_sizes'])-min(info['right_sizes'])
      nNR = len(info['positions'])
      nUR = 0  # reads seen in exactly one alignment
      nMR = 0  # reads seen in several alignments
      for read in info['reads']:
        if read_mapping_count[read] == 1:
          nUR += 1
        elif read_mapping_count[read] > 1:
          nMR += 1
        else:
          sys.stderr.write("ERROR: nonsense read count\n")
          return
      name = '('+str(nR)+')['+str(width)+'_'+str(nNR)+']('+str(nUR)+'/'+str(nMR)+')'
      bed = entry.split("\t")
      bed[3] = name
      of.write("\t".join(bed)+"\n")
  finally:
    # Flush and release the output file, including on the early return above.
    if args.output:
      of.close()
def main(args):
    """Replace bed depth values with coverage-strata labels and merge runs.

    Args:
        args: namespace providing seed, reference_genome or
            reference_lengths, minimum_coverage, output_key, input
            ('-' for STDIN, '.gz' read through gzip), output (falsy for
            stdout, '.gz' written through gzip), dont_make_unique and
            unique_scale.

    The reference size is the sum of sequence lengths.  Strata thresholds
    start at minimum_coverage and grow by alternating factors of 5 and 2 up
    to the reference size; the smallest threshold gets the highest label and
    the full reference size gets label 1.  Bed entries are ranked from the
    deepest down, and each depth receives the label of the first threshold
    that is not below the number of bases accumulated so far.  Adjacent
    entries on the same chromosome with the same label are merged.

    Unless dont_make_unique is set, unique_scale * random.random() is added
    to every depth so that ties are broken reproducibly for a given seed.
    """
    random.seed(args.seed)

    total_bases = 0
    if args.reference_genome:
        with open(args.reference_genome) as ref_handle:
            ref = FastaData(ref_handle.read())
        for name in ref.keys():
            total_bases += len(ref[name])
    else:
        with open(args.reference_lengths) as lengths_handle:
            for line in lengths_handle:
                total_bases += int(line.rstrip().split("\t")[1])

    c = args.minimum_coverage
    z = 0
    values = {}
    while c < total_bases:
        z += 1
        values[c] = z
        c = c * 5
        if c >= total_bases:
            break
        z += 1
        values[c] = z
        c = c * 2
    z += 1
    values[total_bases] = z
    # Invert the numbering so the smallest threshold has the largest label.
    for c in sorted(values.keys()):
        values[c] = z - values[c] + 1
    ### Now values contains the stratified coverage values
    if args.output_key:
        with open(args.output_key, 'w') as key_handle:
            key_handle.write("bp_size\tstrata_label\n")
            for c in sorted(values.keys()):
                key_handle.write(str(c) + "\t" + str(values[c]) + "\n")

    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')

    try:
        vals = []
        z = 0
        for line in inf:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + "    bed entries read   \r")
            f = line.rstrip().split("\t")
            addition = 0
            if not args.dont_make_unique:
                addition = +args.unique_scale * random.random()
            vals.append([f[0], int(f[1]), int(f[2]), float(f[3]) + addition])
        sys.stderr.write("\n")

        #keep track of the number of bases at each depth
        depths = {}
        z = 0
        for f in vals:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + "    bed entries read   \r")
            depths[f[3]] = depths.get(f[3], 0) + (f[2] - f[1])
        sys.stderr.write("\n")

        stratas = sorted(values.keys())
        last_index = len(stratas) - 1
        index = 0
        pos = 0
        depth_strata = {}
        for d in sorted(depths.keys(), reverse=True):
            pos += depths[d]
            # Stop at the final threshold: when the bed covers more bases
            # than the reference holds, the excess stays in the last stratum
            # instead of running off the end of the list.
            while index < last_index and stratas[index] < pos:
                index += 1
            depth_strata[d] = values[stratas[index]]

        # Empty input produces empty output rather than an IndexError.
        if vals:
            vals[0][3] = depth_strata[vals[0][3]]
            current = vals[0]
            for val in vals[1:]:
                val[3] = depth_strata[val[3]]
                if (val[1] == current[2] and val[3] == current[3] and
                        val[0] == current[0]):
                    current[2] = val[2]  # extend the run
                    continue
                of.write(current[0] + "\t" + str(current[1]) + "\t" +
                         str(current[2]) + "\t" + str(current[3]) + "\n")
                current = val
            of.write(current[0] + "\t" + str(current[1]) + "\t" +
                     str(current[2]) + "\t" + str(current[3]) + "\n")
    finally:
        if inf is not sys.stdin:
            inf.close()
        # Never close the process-wide stdout; only files opened here.
        if of is sys.stdout:
            of.flush()
        else:
            of.close()
示例#7
0
def main():
  """Run do_buffer over the input lines and write SAM (or BAM) output.

  Options come from do_inputs().  Output goes to stdout, or through
  'samtools view -Sb' when the output name ends in '.bam'; any other output
  name is rejected.  The reference FASTA is loaded, upper-cased per sequence
  and used for the @SQ header lines.  Input lines are handed to do_buffer in
  batches of max_buffer lines, serially or through a multiprocessing Pool
  when args.threads > 1, and do_out receives every result.
  """
  #do our inputs
  args = do_inputs()
  global of  # do_out writes to this module-level handle
  of = sys.stdout
  p = None
  if args.output:
    if args.output[-4:] == '.bam':
      # Pass an argument list: splitting a command string breaks on output
      # paths that contain whitespace.
      p = Popen(['samtools','view','-Sb','-','-o',args.output],stdin=PIPE)
      of = p.stdin
    else:
      sys.stderr.write("ERROR: stdout and .bam are the only valid output formats\n")
      sys.exit(1)  # non-zero status so callers can detect the failure
  inf = sys.stdin
  if args.input != '-':
    if args.input[-3:] == '.gz':
      inf = gzip.open(args.input)
    else: inf = open(args.input)
  sys.stderr.write("reading reference genome\n")
  with open(args.reference) as ref_handle:
    ref = FastaData(ref_handle.read())
  #shared = manager.dict()
  shared = {}
  for name in sorted(ref.keys()):
    sys.stderr.write("reading "+name+"\n")
    shared[name] = ref[name].upper()
    ref.remove(name)  # drop the original once the upper-cased copy is stored
  sys.stderr.write("finished reading shared memory reference\n")
  sys.stderr.write("Now make the header\n")
  of.write("@HD\tVN:1.0\tSO:unknown\n")
  of.write("@PG\tID:SLR\n")
  for name in sorted(shared.keys()):
    of.write("@SQ\tSN:"+name+"\tLN:"+str(len(shared[name]))+"\n")

  pool = None
  if args.threads > 1:
    pool = Pool(processes=args.threads)

  def dispatch(batch):
    # Process one batch inline, or queue it on the pool.
    # NOTE(review): the AsyncResult is discarded, so an exception raised in
    # a worker is not reported -- confirm that is intended.
    if pool is None:
      do_out(do_buffer(batch,shared,args))
    else:
      pool.apply_async(do_buffer,args=(batch,shared,args,),callback=do_out)

  batch = []
  max_buffer = 1
  z = 0
  for line in inf:
    z += 1
    if z%1000==0: sys.stderr.write(str(z)+"   \r")
    batch.append(line)
    if len(batch) >= max_buffer:
      dispatch(batch)
      batch = []  # fresh list; the dispatched one is never mutated again
  if len(batch) > 0:
    dispatch(batch)

  if pool is not None:
    pool.close()
    pool.join()
  if inf is not sys.stdin:
    inf.close()

  sys.stderr.write("\n")
  if p is not None:
    p.communicate()
  else: of.close()

  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
示例#8
0
def main(args):
  """Relabel bed depths with coverage strata and merge adjacent entries.

  Args:
    args: namespace providing seed, reference_genome or reference_lengths,
      minimum_coverage, output_key, input ('-' for STDIN, '.gz' read through
      gzip), output (falsy for stdout, '.gz' written through gzip),
      dont_make_unique and unique_scale.

  The reference size is the sum of sequence lengths.  Thresholds start at
  minimum_coverage and grow by alternating factors of 5 and 2 up to the
  reference size; the smallest threshold gets the highest label and the full
  reference size gets label 1.  Depths are visited from the deepest down and
  each receives the label of the first threshold that is not below the bases
  accumulated so far.  Adjacent entries with the same chromosome and label
  are merged before being written.

  Unless dont_make_unique is set, unique_scale * random.random() is added to
  each depth, which breaks ties reproducibly for a given seed.
  """
  random.seed(args.seed)

  total_bases = 0
  if args.reference_genome:
    with open(args.reference_genome) as ref_handle:
      ref = FastaData(ref_handle.read())
    for name in ref.keys():
      total_bases += len(ref[name])
  else:
    with open(args.reference_lengths) as lengths_handle:
      for line in lengths_handle:
        total_bases += int(line.rstrip().split("\t")[1])

  c = args.minimum_coverage
  z = 0
  values = {}
  while c < total_bases:
    z += 1
    values[c] = z
    c = c*5
    if c >= total_bases: break
    z += 1
    values[c] = z
    c = c*2
  z += 1
  values[total_bases] = z
  # Flip the numbering: smallest threshold -> largest label.
  for c in sorted(values.keys()):
    values[c] = z-values[c]+1
  ### Now values contains the stratified coverage values
  if args.output_key:
    with open(args.output_key,'w') as key_handle:
      key_handle.write("bp_size\tstrata_label\n")
      for c in sorted(values.keys()):
        key_handle.write(str(c)+"\t"+str(values[c])+"\n")

  inf = sys.stdin
  if args.input != '-':
    if args.input[-3:]=='.gz': inf = gzip.open(args.input)
    else: inf = open(args.input)

  of = sys.stdout
  if args.output:
    if args.output[-3:]=='.gz': of = gzip.open(args.output,'w')
    else: of = open(args.output,'w')

  try:
    vals = []
    z = 0
    for line in inf:
      z += 1
      if z % 100000 == 0: sys.stderr.write(str(z)+"    bed entries read   \r")
      f = line.rstrip().split("\t")
      addition = 0
      if not args.dont_make_unique: addition = +args.unique_scale*random.random()
      vals.append([f[0],int(f[1]),int(f[2]),float(f[3])+addition])
    sys.stderr.write("\n")

    #keep track of the number of bases at each depth
    depths = {}
    z = 0
    for f in vals:
      z += 1
      if z % 100000 == 0: sys.stderr.write(str(z)+"    bed entries read   \r")
      depths[f[3]] = depths.get(f[3],0) + (f[2]-f[1])
    sys.stderr.write("\n")

    stratas = sorted(values.keys())
    last_index = len(stratas)-1
    index = 0
    pos = 0
    depth_strata = {}
    for d in sorted(depths.keys(),reverse=True):
      pos += depths[d]
      # Never step past the final threshold: coverage beyond the reference
      # size stays in the last stratum instead of raising IndexError.
      while index < last_index and stratas[index] < pos:
        index += 1
      depth_strata[d] = values[stratas[index]]

    # An empty bed yields empty output rather than an IndexError.
    if vals:
      vals[0][3] = depth_strata[vals[0][3]]
      current = vals[0]
      for val in vals[1:]:
        val[3] = depth_strata[val[3]]
        if val[1]==current[2] and val[3]==current[3] and val[0]==current[0]:
          current[2] = val[2]  # extend the run
          continue
        of.write(current[0]+"\t"+str(current[1])+"\t"+str(current[2])+"\t"+str(current[3])+"\n")
        current = val
      of.write(current[0]+"\t"+str(current[1])+"\t"+str(current[2])+"\t"+str(current[3])+"\n")
  finally:
    if inf is not sys.stdin:
      inf.close()
    # Leave the process-wide stdout open; close only files opened here.
    if of is sys.stdout:
      of.flush()
    else:
      of.close()
示例#9
0
def main():
    """Feed the input lines through do_buffer and emit SAM or BAM output.

    Options are obtained from do_inputs().  Results are written to stdout,
    or piped into 'samtools view -Sb' when the output name ends in '.bam';
    other output names are rejected.  The reference FASTA is loaded and
    upper-cased per sequence, and supplies the @SQ header lines.  Lines are
    processed in batches of max_buffer, inline or on a multiprocessing Pool
    when args.threads > 1, with do_out consuming each result.
    """
    #do our inputs
    args = do_inputs()
    global of  # do_out writes to this module-level handle
    of = sys.stdout
    p = None
    if args.output:
        if args.output[-4:] == '.bam':
            # An argument list keeps output paths with whitespace intact;
            # splitting a command string would tear them apart.
            p = Popen(['samtools', 'view', '-Sb', '-', '-o', args.output],
                      stdin=PIPE)
            of = p.stdin
        else:
            sys.stderr.write(
                "ERROR: stdout and .bam are the only valid output formats\n")
            sys.exit(1)  # report the failure through the exit status
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    sys.stderr.write("reading reference genome\n")
    with open(args.reference) as ref_handle:
        ref = FastaData(ref_handle.read())
    #shared = manager.dict()
    shared = {}
    for name in sorted(ref.keys()):
        sys.stderr.write("reading " + name + "\n")
        shared[name] = ref[name].upper()
        # Drop the original once the upper-cased copy is stored.
        ref.remove(name)
    sys.stderr.write("finished reading shared memory reference\n")
    sys.stderr.write("Now make the header\n")
    of.write("@HD\tVN:1.0\tSO:unknown\n")
    of.write("@PG\tID:SLR\n")
    for name in sorted(shared.keys()):
        of.write("@SQ\tSN:" + name + "\tLN:" + str(len(shared[name])) + "\n")

    pool = None
    if args.threads > 1:
        pool = Pool(processes=args.threads)

    def dispatch(batch):
        # Handle one batch inline, or queue it on the worker pool.
        # NOTE(review): the AsyncResult is dropped, so exceptions raised in
        # a worker go unreported -- confirm that is intended.
        if pool is None:
            do_out(do_buffer(batch, shared, args))
        else:
            pool.apply_async(do_buffer,
                             args=(batch, shared, args),
                             callback=do_out)

    batch = []
    max_buffer = 1
    z = 0
    for line in inf:
        z += 1
        if z % 1000 == 0:
            sys.stderr.write(str(z) + "   \r")
        batch.append(line)
        if len(batch) >= max_buffer:
            dispatch(batch)
            batch = []  # new list; the dispatched one is not touched again
    if len(batch) > 0:
        dispatch(batch)

    if pool is not None:
        pool.close()
        pool.join()
    if inf is not sys.stdin:
        inf.close()

    sys.stderr.write("\n")
    if p is not None:
        p.communicate()
    else:
        of.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)