def get_output(bedarray,z):
  """Render the coverage of ``bedarray`` as tab-separated text.

  bedarray -- ranges handed to ``ranges_to_coverage``
  z        -- value returned unchanged next to the text

  Returns ``[text, z]``; ``text`` holds one line per coverage range with the
  fields chr, start-1, end and payload.
  """
  # The original also built a sorted copy of bedarray that was never read;
  # that dead work is dropped here.
  covs = ranges_to_coverage(bedarray)
  # Join once instead of growing a string with += for every range.
  olines = ''.join(
    c.chr+"\t"+str(c.start-1)+"\t"+str(c.end)+"\t"+str(c.get_payload())+"\n"
    for c in covs)
  return [olines,z]
Exemplo n.º 2
0
def main(args):
    """Write per-locus exon coverage for a GPD stream.

    Args:
        args: parsed options. ``args.input`` is a path, or ``'-'`` for STDIN;
            ``args.output`` is an optional path (STDOUT when unset). Names
            ending in ``.gz`` are opened with ``gzip``.

    For every locus yielded by ``LocusStream`` the exon ranges of all its
    entries are pooled and passed to ``ranges_to_coverage``. Each resulting
    range is written as the values of ``get_bed_coordinates()`` followed by
    its payload, tab separated, one range per line.
    """
    inf = sys.stdin
    of = sys.stdout
    try:
        # NOTE(review): gzip.open() without a 't' mode is a binary stream on
        # Python 3 while str is read/written here -- confirm the target
        # interpreter before changing the modes.
        if args.input != '-':
            if re.search(r'\.gz$', args.input):
                inf = gzip.open(args.input)
            else:
                inf = open(args.input)
        if args.output:
            if re.search(r'\.gz$', args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
        for locus in LocusStream(GPDStream(inf)):
            exranges = [exon.get_range()
                        for entry in locus.get_payload()
                        for exon in entry.exons]
            for cov in ranges_to_coverage(exranges):
                coords = "\t".join(str(x) for x in cov.get_bed_coordinates())
                of.write(coords + "\t" + str(+cov.get_payload()) + "\n")
    finally:
        # Close only the handles opened here, so a caller's STDIN/STDOUT
        # stay usable, and release them even if processing fails.
        if of is not sys.stdout:
            of.close()
        if inf is not sys.stdin:
            inf.close()
Exemplo n.º 3
0
 def get_depth_per_transcript(self,mindepth=1):
   """Summarise exon coverage depth for each transcript.

   The exon ranges of every transcript are pooled into one coverage array
   (``ranges_to_coverage``); each transcript is then scored against it.

   mindepth -- payload threshold a coverage range must reach for its
               overlapping bases to count as covered

   Returns a dict keyed by ``tx.get_id()``; each value is a dict with the
   keys 'tx', 'average_coverage', 'fraction_covered', 'mindepth' and
   'length_covered'.
   """
   pooled = [exon.rng for tx in self.get_transcripts() for exon in tx.exons]
   cov = ranges_to_coverage(pooled)
   results = {}
   for tx in self.get_transcripts():
     tlen = tx.get_length()
     # [overlap size, depth] for each coverage range that touches an exon
     hits = []
     for exon in tx.exons:
       exon_rng = exon.rng
       for piece in cov:
         size = piece.overlap_size(exon_rng)
         if size > 0:
           hits.append([size,piece.get_payload()])
     depth_sum = sum(size*depth for size,depth in hits)
     deep_enough = sum(size for size,depth in hits if depth >= mindepth)
     results[tx.get_id()] = {
       'tx':tx,
       'average_coverage':float(depth_sum)/float(tlen),
       'fraction_covered':float(deep_enough)/float(tlen),
       'mindepth':mindepth,
       'length_covered':deep_enough,
     }
   return results
def main():
  """Report the average exon coverage of each GPD entry within its locus.

  Command line: ``input`` (GPD file, '-' for STDIN) and ``-o/--output``
  (STDOUT when unset; a name ending in '.gz' is opened with gzip).

  For each locus from LocusStream the exon ranges of all entries are pooled
  into a coverage array. One line is written per entry: gene name, exon
  count, length, and the sum of payload*length over the
  ``union_range_array`` results of its exons divided by its length.
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Use - for STDIN")
  parser.add_argument('-o','--output',help="output file or use STDOUT if not set")
  args = parser.parse_args()

  inf = sys.stdin
  of = sys.stdout
  try:
    if args.input != '-':
      inf = open(args.input)
    if args.output:
      # NOTE(review): gzip.open(..., 'w') is a binary stream on Python 3 while
      # str is written below -- confirm the target interpreter.
      if re.search(r'\.gz$',args.output):
        of = gzip.open(args.output,'w')
      else:
        of = open(args.output,'w')
    for rng in LocusStream(GPDStream(inf)):
      # progress indicator, overwritten in place on the terminal
      sys.stderr.write(rng.get_range_string()+"    \r")
      gpds = rng.get_payload()
      cov = ranges_to_coverage([y.get_range() for x in gpds for y in x.exons])
      # use the locus coverage on each gpd entry
      for gpd in gpds:
        totcov = 0
        for exon in gpd.exons:
          gcovs = union_range_array(exon.get_range(),cov,payload=2)
          totcov += sum(x.get_payload()*x.length() for x in gcovs)
        of.write(gpd.get_gene_name()+"\t"+str(gpd.get_exon_count())+"\t"+str(gpd.get_length())+"\t"+str(float(totcov)/float(gpd.get_length()))+"\n")
    sys.stderr.write("\n")
  finally:
    # The input was never closed before; close only what was opened here and
    # leave the process-wide STDIN/STDOUT alone.
    if of is not sys.stdout:
      of.close()
    if inf is not sys.stdin:
      inf.close()
def do_tx_line(ref_gpd, annots, reads, args):
    """Profile read coverage along one reference transcript.

    Args:
        ref_gpd: reference transcript; its exons define the coordinate axis.
        annots: not used by this function.
        reads: iterable of read alignments compared with ``ref_gpd``.
        args: options; reads ``allow_overflowed_matches``,
            ``minimum_read_count`` and ``only_covered_ends``.

    Returns:
        ``None`` when no read contributes, when fewer than
        ``args.minimum_read_count`` reads were used, or when the (trimmed)
        profile is shorter than ``args.minimum_read_count``. Otherwise
        ``[bins, read_count, exp, profile_length, exon_count]`` where ``bins``
        maps a bin label (string built from ``1 + int(100 * i / length)``) to
        the ``average`` of per-base depth divided by the read count.
    """
    pieces = []
    used_reads = 0
    for read in reads:
        if not args.allow_overflowed_matches:
            # Skip reads that extend past either end of the reference.
            if read.get_range().start < ref_gpd.get_range().start:
                continue
            if read.get_range().end > ref_gpd.get_range().end:
                continue
        shared = ref_gpd.union(read)
        pieces.extend(exon.rng for exon in shared.exons)
        used_reads += 1
    if not pieces:
        return None
    if used_reads < args.minimum_read_count:
        return None
    coverage = ranges_to_coverage(pieces)

    # Per-base depth along the transcript (exons laid end to end).
    depth = [0] * ref_gpd.get_length()
    offset = 0
    for exon_rng in [exon.rng for exon in ref_gpd.exons]:
        hits = []
        for block in coverage:
            common = block.union(exon_rng)
            if common:
                hits.append((common, block.get_payload()))
        for common, weight in hits:
            first = common.start - exon_rng.start + offset
            last = common.end - exon_rng.start + offset
            for pos in range(first, last + 1):
                depth[pos] += weight
        offset += exon_rng.length()

    profile = depth
    if args.only_covered_ends:
        # Trim leading/trailing zero-depth bases; keep everything if no base
        # is covered at all.
        covered = [pos for pos, value in enumerate(depth) if value != 0]
        lo = covered[0] if covered else 0
        hi = covered[-1] if covered else len(depth) - 1
        profile = depth[lo:hi + 1]
    exp = float(sum(profile)) / float(len(profile))
    if ref_gpd.get_strand() == '-':
        profile = profile[::-1]
    span = len(profile)
    if span < args.minimum_read_count:
        return None

    # Bin the normalised depths.
    binned = {}
    for pos, value in enumerate(profile):
        label = str(1 + int(100 * float(pos) / float(span)))
        binned.setdefault(label, []).append(float(value) / float(used_reads))
    for label in binned:
        binned[label] = average(binned[label])
    return [binned, used_reads, exp, span, ref_gpd.get_exon_count()]
def main():
    """Build an expression-weighted exon coverage file from an IDP folder.

    Command line: ``input`` (folder holding ``isoform.gpd`` and
    ``isoform.exp``), ``--offset``, ``--mult`` and ``-o/--output`` (STDOUT
    when unset; a name ending in '.gz' is opened with gzip).

    Each line of ``isoform.exp`` gives a transcript name and a value; the
    transcript's exon ranges are repeated ``int(value * mult + offset)``
    times. The pooled ranges go through ``ranges_to_coverage`` and each
    result is written as chr, start-1, end, payload (tab separated).
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="IDP output folder")
    parser.add_argument('--offset',
                        type=int,
                        default=1,
                        help="add this much to all expressions")
    parser.add_argument('--mult',
                        type=int,
                        default=10,
                        help="multiply all expressions by this much")
    parser.add_argument('-o',
                        '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    args.input = args.input.rstrip('/')
    txs = {}
    # 'with' releases the file even if a line fails to parse.
    with open(args.input + '/isoform.gpd') as inf:
        sys.stderr.write("Reading isoform.gpd\n")
        for line in inf:
            gpd = GPD(line)
            txs.setdefault(gpd.get_transcript_name(), []).extend(
                exon.get_range() for exon in gpd.exons)

    sys.stderr.write("Reading isoform.exp file\n")
    vals = []
    with open(args.input + '/isoform.exp') as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            v = int((float(f[1]) * args.mult) + args.offset)
            # one copy of the exons per unit of scaled expression
            # (a non-positive v contributes nothing)
            vals.extend(txs[f[0]] * v)

    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        # NOTE(review): gzip.open(..., 'w') is a binary stream on Python 3
        # while str is written below -- confirm the target interpreter.
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        # Leave the process-wide STDOUT open; close only our own file.
        if of is not sys.stdout:
            of.close()
def main():
  """Build a TPM-weighted exon coverage file from a GTF annotation.

  Command line: ``input`` (GTF path, '-' for STDIN, a name ending in '.gz' is
  opened with gzip), ``--offset``, ``--mult``, ``--min_exons`` and
  ``-o/--output`` (STDOUT when unset).

  'transcript' records set a weight of ``int(TPM*mult + offset)``; 'exon'
  records add a ``Bed(chr, start-1, end)`` to their transcript. Transcripts
  with at least ``min_exons`` exons contribute their exons ``weight`` times.
  The pooled ranges go through ``ranges_to_coverage`` and each result is
  written as chr, start-1, end, payload (tab separated).
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Use - for STDIN")
  parser.add_argument('--offset',type=int,default=0,help="add this much to transcript tpms")
  parser.add_argument('--mult',type=int,default=10,help="multiply this much to tpms")
  parser.add_argument('--min_exons',type=int,default=1,help="require at least this many exons")
  parser.add_argument('-o','--output',help="OUTPUT file or nothing for STDOUT")
  args = parser.parse_args()

  # NOTE(review): gzip.open() without a 't' mode is a binary stream on
  # Python 3 while str is read/written here -- confirm the target interpreter.
  inf = sys.stdin
  if args.input != '-':
    if args.input.endswith('.gz'):
      inf = gzip.open(args.input)
    else: inf = open(args.input)
  sys.stderr.write("Reading gtf file\n")
  txs = {}
  try:
    for line in inf:
      if line.startswith('#'): continue
      f = line.rstrip().split("\t")
      feature = f[2]
      if feature != 'exon' and feature != 'transcript': continue
      tx = re.search(r'transcript_id\s+"([^"]+)"',f[8]).group(1)
      if tx not in txs:
        txs[tx] = {'tpm':0,'exons':[]}
      if feature == 'transcript':
        tpm = float(re.search(r'TPM\s+"([^"]+)"',f[8]).group(1))
        txs[tx]['tpm'] = int((tpm*float(args.mult))+args.offset)
      else:
        txs[tx]['exons'].append(Bed(f[0],int(f[3])-1,int(f[4])))
  finally:
    if inf is not sys.stdin:
      inf.close()
  vals = []
  sys.stderr.write("Traversing annotation file\n")
  for tx in txs:
    exons = txs[tx]['exons']
    if len(exons) < args.min_exons: continue
    # one copy of the exons per unit of weight (non-positive adds nothing)
    vals.extend(exons*txs[tx]['tpm'])
  sys.stderr.write("Generating coverage file "+str(len(vals))+"\n")
  of = sys.stdout
  if args.output:
    if args.output.endswith('.gz'):
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')
  try:
    for v in ranges_to_coverage(vals):
      of.write(v.chr+"\t"+str(v.start-1)+"\t"+str(v.end)+"\t"+str(v.get_payload())+"\n")
  finally:
    # Leave the process-wide STDOUT open; close only our own file.
    if of is not sys.stdout:
      of.close()
Exemplo n.º 8
0
def do_locus(locus):
  """Return the exon coverage of one locus as a list of text lines.

  The exon ranges of every entry in ``locus.get_payload()`` are pooled and
  passed to ``ranges_to_coverage``. Each resulting range becomes one line:
  the values of ``get_bed_coordinates()`` followed by the payload, tab
  separated and newline terminated.
  """
  exranges = [exon.get_range()
              for entry in locus.get_payload()
              for exon in entry.exons]
  lines = []
  for cov in ranges_to_coverage(exranges):
    coords = "\t".join(str(x) for x in cov.get_bed_coordinates())
    lines.append(coords+"\t"+str(+cov.get_payload())+"\n")
  return lines
def do_locus(locus):
    """Format the pooled exon coverage of a locus.

    Collects ``exon.get_range()`` for each exon of each entry in
    ``locus.get_payload()``, runs ``ranges_to_coverage`` on the collection and
    returns one string per coverage range: its ``get_bed_coordinates()``
    values and its payload joined by tabs, ending in a newline.
    """
    exranges = []
    for entry in locus.get_payload():
        exranges.extend(exon.get_range() for exon in entry.exons)

    def render(cov):
        left = "\t".join(map(str, cov.get_bed_coordinates()))
        return left + "\t" + str(+cov.get_payload()) + "\n"

    return [render(cov) for cov in ranges_to_coverage(exranges)]
def main():
  """Build an expression-weighted exon coverage file from an IDP folder.

  Command line: ``input`` (folder holding ``isoform.gpd`` and
  ``isoform.exp``), ``--offset``, ``--mult`` and ``-o/--output`` (STDOUT when
  unset; a name ending in '.gz' is opened with gzip).

  Each line of ``isoform.exp`` gives a transcript name and a value; the
  transcript's exon ranges are repeated ``int(value*mult + offset)`` times.
  The pooled ranges go through ``ranges_to_coverage`` and each result is
  written as chr, start-1, end, payload (tab separated).
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="IDP output folder")
  parser.add_argument('--offset',type=int,default=1,help="add this much to all expressions")
  parser.add_argument('--mult',type=int,default=10,help="multiply all expressions by this much")
  parser.add_argument('-o','--output',help="OUTPUT file or nothing for STDOUT")
  args = parser.parse_args()

  args.input= args.input.rstrip('/')
  txs = {}
  # 'with' releases the file even if a line fails to parse.
  with open(args.input+'/isoform.gpd') as inf:
    sys.stderr.write("Reading isoform.gpd\n")
    for line in inf:
      gpd = GPD(line)
      txs.setdefault(gpd.get_transcript_name(),[]).extend(
        exon.get_range() for exon in gpd.exons)

  sys.stderr.write("Reading isoform.exp file\n")
  vals = []
  with open(args.input+'/isoform.exp') as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      v = int((float(f[1])*args.mult)+args.offset)
      # one copy of the exons per unit of scaled expression
      # (a non-positive v contributes nothing)
      vals.extend(txs[f[0]]*v)

  sys.stderr.write("Generating coverage file "+str(len(vals))+"\n")
  of = sys.stdout
  if args.output:
    # NOTE(review): gzip.open(..., 'w') is a binary stream on Python 3 while
    # str is written below -- confirm the target interpreter.
    if args.output.endswith('.gz'):
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')
  try:
    for v in ranges_to_coverage(vals):
      of.write(v.chr+"\t"+str(v.start-1)+"\t"+str(v.end)+"\t"+str(v.get_payload())+"\n")
  finally:
    # Leave the process-wide STDOUT open; close only our own file.
    if of is not sys.stdout:
      of.close()
Exemplo n.º 11
0
def main():
    """Report the average exon coverage of each GPD entry within its locus.

    Command line: ``input`` (GPD file, '-' for STDIN) and ``-o/--output``
    (STDOUT when unset; a name ending in '.gz' is opened with gzip).

    For each locus from LocusStream the exon ranges of all entries are pooled
    into a coverage array. One line is written per entry: gene name, exon
    count, length, and the sum of payload*length over the
    ``union_range_array`` results of its exons divided by its length.
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="output file or use STDOUT if not set")
    args = parser.parse_args()

    inf = sys.stdin
    of = sys.stdout
    try:
        if args.input != '-':
            inf = open(args.input)
        if args.output:
            # NOTE(review): gzip.open(..., 'w') is a binary stream on
            # Python 3 while str is written below -- confirm the target
            # interpreter.
            if re.search(r'\.gz$', args.output):
                of = gzip.open(args.output, 'w')
            else:
                of = open(args.output, 'w')
        for rng in LocusStream(GPDStream(inf)):
            # progress indicator, overwritten in place on the terminal
            sys.stderr.write(rng.get_range_string() + "    \r")
            gpds = rng.get_payload()
            cov = ranges_to_coverage(
                [y.get_range() for x in gpds for y in x.exons])
            # use the locus coverage on each gpd entry
            for gpd in gpds:
                totcov = 0
                for exon in gpd.exons:
                    gcovs = union_range_array(exon.get_range(), cov, payload=2)
                    totcov += sum(x.get_payload() * x.length() for x in gcovs)
                of.write(gpd.get_gene_name() + "\t" +
                         str(gpd.get_exon_count()) + "\t" +
                         str(gpd.get_length()) + "\t" +
                         str(float(totcov) / float(gpd.get_length())) + "\n")
        sys.stderr.write("\n")
    finally:
        # The input was never closed before; close only what was opened here
        # and leave the process-wide STDIN/STDOUT alone.
        if of is not sys.stdout:
            of.close()
        if inf is not sys.stdin:
            inf.close()
 def get_depth_per_transcript(self, mindepth=1):
     """Compute coverage statistics for each transcript.

     All exon ranges of all transcripts are merged into one coverage array
     via ``ranges_to_coverage``; every transcript is then measured against
     that array.

     mindepth -- minimum payload for overlapping bases to count as covered

     Returns a dict mapping ``tx.get_id()`` to a dict with the keys 'tx',
     'average_coverage', 'fraction_covered', 'mindepth' and
     'length_covered'.
     """
     all_exons = []
     for tx in self.get_transcripts():
         all_exons.extend(exon.rng for exon in tx.exons)
     cov = ranges_to_coverage(all_exons)

     results = {}
     for tx in self.get_transcripts():
         tlen = tx.get_length()
         weighted_bases = 0
         covered_bases = 0
         for exon in tx.exons:
             target = exon.rng
             for segment in cov:
                 overlap = segment.overlap_size(target)
                 if overlap <= 0:
                     continue
                 depth = segment.get_payload()
                 weighted_bases += overlap * depth
                 if depth >= mindepth:
                     covered_bases += overlap
         results[tx.get_id()] = {
             'tx': tx,
             'average_coverage': float(weighted_bases) / float(tlen),
             'fraction_covered': float(covered_bases) / float(tlen),
             'mindepth': mindepth,
             'length_covered': covered_bases
         }
     return results
def main():
    """Build a read-count-weighted exon coverage file.

    Command line: ``input`` (annotation table, '-' for STDIN, a name ending
    in '.gz' is opened with gzip), ``genepred``, ``--min_exons``, ``--full``
    and ``-o/--output`` (STDOUT when unset).

    The annotation is tab separated; columns 3, 4 and 5 are read as gene,
    transcript and match type, and rows are counted per transcript (only
    'full' rows with ``--full``). Exons come from the genepred file, keyed by
    transcript name. Each transcript with at least ``min_exons`` exons
    contributes its exons once per counted row; the pooled ranges go through
    ``ranges_to_coverage`` and each result is written as chr, start-1, end,
    payload (tab separated).
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('genepred', help="the genepred used for this alignqc")
    parser.add_argument('--min_exons',
                        type=int,
                        default=1,
                        help="At least this number of exons")
    parser.add_argument('--full',
                        action='store_true',
                        help="only use full matches")
    parser.add_argument('-o',
                        '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    # NOTE(review): gzip.open() without a 't' mode is a binary stream on
    # Python 3 while str is read/written here -- confirm the target
    # interpreter.
    inf = sys.stdin
    if args.input != '-':
        if args.input.endswith('.gz'):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    genes = {}
    sys.stderr.write("Reading annotation file\n")
    try:
        for line in inf:
            f = line.rstrip().split("\t")
            gene, tx, match_type = f[2], f[3], f[4]
            if args.full and match_type != 'full': continue
            if gene not in genes:
                genes[gene] = {'transcripts': {}, 'cnt': 0}
            counts = genes[gene]['transcripts']
            counts[tx] = counts.get(tx, 0) + 1
            genes[gene]['cnt'] += 1
    finally:
        if inf is not sys.stdin:
            inf.close()

    txs = {}
    sys.stderr.write("Reading genepred file\n")
    with open(args.genepred) as inf:
        for z, line in enumerate(inf, 1):
            # progress counter, overwritten in place on the terminal
            if z % 1000 == 0: sys.stderr.write(str(z) + "   \r")
            gpd = GPD(line)
            exs = [ex.get_range() for ex in gpd.exons]
            txs[gpd.get_transcript_name()] = exs
    sys.stderr.write("\n")
    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for gene in genes:
        for tx, v in genes[gene]['transcripts'].items():
            exons = txs[tx]
            if len(exons) < args.min_exons: continue
            # one copy of the exons per counted annotation row
            vals.extend(exons * v)
    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        # Leave the process-wide STDOUT open; close only our own file.
        if of is not sys.stdout:
            of.close()
def main():
    """Build a TPM-weighted exon coverage file from a GTF annotation.

    Command line: ``input`` (GTF path, '-' for STDIN, a name ending in '.gz'
    is opened with gzip), ``--offset``, ``--mult``, ``--min_exons`` and
    ``-o/--output`` (STDOUT when unset).

    'transcript' records set a weight of ``int(TPM * mult + offset)``; 'exon'
    records add a ``Bed(chr, start - 1, end)`` to their transcript.
    Transcripts with at least ``min_exons`` exons contribute their exons
    ``weight`` times. The pooled ranges go through ``ranges_to_coverage`` and
    each result is written as chr, start-1, end, payload (tab separated).
    """
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('--offset',
                        type=int,
                        default=0,
                        help="add this much to transcript tpms")
    parser.add_argument('--mult',
                        type=int,
                        default=10,
                        help="multiply this much to tpms")
    parser.add_argument('--min_exons',
                        type=int,
                        default=1,
                        help="require at least this many exons")
    parser.add_argument('-o',
                        '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    # NOTE(review): gzip.open() without a 't' mode is a binary stream on
    # Python 3 while str is read/written here -- confirm the target
    # interpreter.
    inf = sys.stdin
    if args.input != '-':
        if args.input.endswith('.gz'):
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)
    sys.stderr.write("Reading gtf file\n")
    txs = {}
    try:
        for line in inf:
            if line.startswith('#'): continue
            f = line.rstrip().split("\t")
            feature = f[2]
            if feature != 'exon' and feature != 'transcript': continue
            tx = re.search(r'transcript_id\s+"([^"]+)"', f[8]).group(1)
            if tx not in txs:
                txs[tx] = {'tpm': 0, 'exons': []}
            if feature == 'transcript':
                tpm = float(re.search(r'TPM\s+"([^"]+)"', f[8]).group(1))
                txs[tx]['tpm'] = int((tpm * float(args.mult)) + args.offset)
            else:
                txs[tx]['exons'].append(Bed(f[0], int(f[3]) - 1, int(f[4])))
    finally:
        if inf is not sys.stdin:
            inf.close()
    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for tx in txs:
        exons = txs[tx]['exons']
        if len(exons) < args.min_exons: continue
        # one copy of the exons per unit of weight (non-positive adds nothing)
        vals.extend(exons * txs[tx]['tpm'])
    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")
    of = sys.stdout
    if args.output:
        if args.output.endswith('.gz'):
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        for v in ranges_to_coverage(vals):
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end) +
                     "\t" + str(v.get_payload()) + "\n")
    finally:
        # Leave the process-wide STDOUT open; close only our own file.
        if of is not sys.stdout:
            of.close()
def main():
  """Build a read-count-weighted exon coverage file.

  Command line: ``input`` (annotation table, '-' for STDIN, a name ending in
  '.gz' is opened with gzip), ``genepred``, ``--min_exons``, ``--full`` and
  ``-o/--output`` (STDOUT when unset).

  The annotation is tab separated; columns 3, 4 and 5 are read as gene,
  transcript and match type, and rows are counted per transcript (only 'full'
  rows with ``--full``). Exons come from the genepred file, keyed by
  transcript name. Each transcript with at least ``min_exons`` exons
  contributes its exons once per counted row; the pooled ranges go through
  ``ranges_to_coverage`` and each result is written as chr, start-1, end,
  payload (tab separated).
  """
  parser = argparse.ArgumentParser(description="",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Use - for STDIN")
  parser.add_argument('genepred',help="the genepred used for this alignqc")
  parser.add_argument('--min_exons',type=int,default=1,help="At least this number of exons")
  parser.add_argument('--full',action='store_true',help="only use full matches")
  parser.add_argument('-o','--output',help="OUTPUT file or nothing for STDOUT")
  args = parser.parse_args()

  # NOTE(review): gzip.open() without a 't' mode is a binary stream on
  # Python 3 while str is read/written here -- confirm the target interpreter.
  inf = sys.stdin
  if args.input != '-':
    if args.input.endswith('.gz'):
      inf = gzip.open(args.input)
    else: inf = open(args.input)
  genes = {}
  sys.stderr.write("Reading annotation file\n")
  try:
    for line in inf:
      f = line.rstrip().split("\t")
      gene, tx, match_type = f[2], f[3], f[4]
      if args.full and match_type != 'full': continue
      if gene not in genes:
        genes[gene] = {'transcripts':{},'cnt':0}
      counts = genes[gene]['transcripts']
      counts[tx] = counts.get(tx,0)+1
      genes[gene]['cnt'] += 1
  finally:
    if inf is not sys.stdin:
      inf.close()

  txs = {}
  sys.stderr.write("Reading genepred file\n")
  with open(args.genepred) as inf:
    for z,line in enumerate(inf,1):
      # progress counter, overwritten in place on the terminal
      if z%1000==0: sys.stderr.write(str(z)+"   \r")
      gpd = GPD(line)
      # NOTE(review): reads the ``range`` attribute directly, while other
      # code in this file calls ex.get_range() -- confirm which accessor the
      # exon type exposes.
      exs = [ex.range for ex in gpd.exons]
      txs[gpd.get_transcript_name()] = exs
  sys.stderr.write("\n")
  vals = []
  sys.stderr.write("Traversing annotation file\n")
  for gene in genes:
    for tx,v in genes[gene]['transcripts'].items():
      exons = txs[tx]
      if len(exons) < args.min_exons: continue
      # one copy of the exons per counted annotation row
      vals.extend(exons*v)
  sys.stderr.write("Generating coverage file "+str(len(vals))+"\n")
  of = sys.stdout
  if args.output:
    if args.output.endswith('.gz'):
      of = gzip.open(args.output,'w')
    else:
      of = open(args.output,'w')
  try:
    for v in ranges_to_coverage(vals):
      of.write(v.chr+"\t"+str(v.start-1)+"\t"+str(v.end)+"\t"+str(v.get_payload())+"\n")
  finally:
    # Leave the process-wide STDOUT open; close only our own file.
    if of is not sys.stdout:
      of.close()