def read_next(inf, strata):
    """Return the next record of *inf* whose fourth column is at least *strata*.

    Lines are read one at a time with ``inf.readline()`` and split on tabs.
    The fourth column is parsed as an integer; lines where that value is
    below ``strata`` are skipped.

    Returns a ``Bed`` built from the first three columns (name, int start,
    int end) carrying the fourth-column integer as its payload, or ``False``
    once ``readline`` yields an empty value.
    """
    line = inf.readline()
    while line:
        columns = line.rstrip().split("\t")
        depth = int(columns[3])
        if not depth < strata:
            record = Bed(columns[0], int(columns[1]), int(columns[2]))
            record.set_payload(depth)
            return record
        # Below the threshold: move on to the following line.
        line = inf.readline()
    return False
def get_junctions(sams, args):
    """Collect the junctions implied by 'N' CIGAR operations in *sams*.

    Each entry of ``sams`` is numbered from 1 in iteration order. For every
    entry, the CIGAR operations whose ``'op'`` is one of ``M D N X =`` are
    kept; each kept ``'N'`` operation whose ``'val'`` is at least
    ``args.minimum_intron_size`` produces one ``Junction`` made of two
    single-base ``Bed`` ranges on ``sam.value('rname')``.

    Returns ``[junctions, reads]`` where ``junctions`` is a list of
    ``[Junction, read_number]`` pairs and ``reads`` maps each read number
    to its entry from ``sams``.
    """
    ref_op = re.compile('^[MDNX=]$')
    reads = {}
    junctions = []
    for number, sam in enumerate(sams, 1):
        reads[number] = sam
        ops = [op for op in sam.value('cigar_array') if ref_op.match(op['op'])]
        for idx, op in enumerate(ops):
            if op['op'] != 'N':
                continue
            if not op['val'] >= args.minimum_intron_size:
                continue
            # Offset of the gap: lengths of all kept operations before it,
            # added to the alignment position.
            left = sum([prev['val'] for prev in ops[:idx]]) + sam.value('pos')
            right = left + op['val']
            left_side = Bed(sam.value('rname'), left - 2, left - 1)
            right_side = Bed(sam.value('rname'), right - 1, right)
            junctions.append([Junction(left_side, right_side), number])
    return [junctions, reads]
def main():
    """Write a normalized, binned coverage profile for each reference transcript.

    Steps:
      1. Read ``args.reference_genepred`` and group transcripts by gene.
      2. If ``args.maximum_isoforms`` is positive, drop genes with more
         isoforms than that; then drop transcripts shorter than
         ``args.minimum_length`` (and genes left without transcripts).
      3. Write every exon to ``<tempdir>/gpd.bed`` and intersect the depth
         bed (``args.bed_depth``) against it with ``bedtools intersect -wo``,
         storing the result in ``<tempdir>/intersect.bed``.
      4. For each transcript passing ``args.minimum_average_depth`` and
         ``args.minimum_length`` (covered bases), build a per-position depth
         map, split the sorted positions into up to 100 bins, average each
         bin and divide by the largest bin average. Profiles of '-' strand
         transcripts are reversed.
      5. Write ``transcript_name<TAB>values...`` lines to ``args.output``,
         close it, and remove ``args.tempdir`` unless
         ``args.specific_tempdir`` is set.
    """
    # do our inputs
    args = do_inputs()

    sys.stderr.write("Reading reference genepred\n")
    ref = {}
    tx_strand = {}
    z = 0
    with open(args.reference_genepred) as inf:
        for line in inf:
            gpd = GPD(line)
            gname = gpd.get_gene_name()
            tname = gpd.get_transcript_name()
            tx_strand[tname] = gpd.get_strand()
            ref.setdefault(gname, []).append(gpd)
            z += 1
    sys.stderr.write("Read "+str(len(ref))+" genes and "+str(z)+" transcripts\n")

    if args.maximum_isoforms > 0:
        sys.stderr.write("Removing genes with more than "+str(args.maximum_isoforms)+" isoforms.\n")
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for gname in list(ref.keys()):
            if len(ref[gname]) > args.maximum_isoforms:
                del ref[gname]
        sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum([len(ref[x]) for x in ref]))+" transcripts\n")

    sys.stderr.write("Filtering by length "+str(args.minimum_length)+" bp\n")
    for gname in list(ref.keys()):
        passing = [gpd for gpd in ref[gname] if gpd.get_length() >= args.minimum_length]
        if passing:
            ref[gname] = passing
        else:
            del ref[gname]
    sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum([len(ref[x]) for x in ref]))+" transcripts\n")

    sys.stderr.write("Converting gpd into exon bed\n")
    beds = []
    for gname in ref:
        for gpd in ref[gname]:
            tname = gpd.get_transcript_name()
            for i, ex in enumerate(gpd.exons):
                # Row layout: the exon's bed array followed by gene name,
                # transcript name and exon index.
                beds.append(ex.get_range().get_bed_array()+[gname, tname, i])
    gpd_bed_path = args.tempdir+'/gpd.bed'
    intersect_path = args.tempdir+'/intersect.bed'
    with open(gpd_bed_path, 'w') as of:
        for bed in sorted(beds, key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5])):
            of.write("\t".join([str(x) for x in bed])+"\n")

    sys.stderr.write("intersecting with bed depth\n")
    # Build the argument vector directly instead of splitting a command
    # string, so a tempdir path containing whitespace stays one argument.
    cmd = ['bedtools', 'intersect', '-wo', '-a', '-', '-b', gpd_bed_path]
    with open(intersect_path, 'w') as of:
        p = Popen(cmd, stdin=args.bed_depth, stdout=of)
        p.communicate()
    if p.returncode != 0:
        sys.stderr.write("WARNING: bedtools intersect exited with status "+str(p.returncode)+"\n")

    coverage = {}
    sys.stderr.write("Reading the intersection\n")
    with open(intersect_path) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            # Columns 0-3 come from the depth bed (chrom, start, end, depth)
            # and 4-8 from gpd.bed (chrom, start, end, gene, transcript).
            # NOTE(review): this assumes the depth bed has exactly four
            # columns and get_bed_array() yields three -- confirm.
            gname = f[7]
            tname = f[8]
            depth = int(f[3])
            bed1 = Bed(f[0], int(f[1]), int(f[2]))
            bed2 = Bed(f[4], int(f[5]), int(f[6]))
            bed = bed1.union(bed2)
            bed.set_payload(depth)
            coverage.setdefault(gname, {}).setdefault(tname, []).append(bed)

    transcript_depths = {}
    for gname in coverage:
        for tname in coverage[gname]:
            segments = coverage[gname][tname]
            ref_gpd = [x for x in ref[gname] if x.get_transcript_name() == tname][0]
            rlen = ref_gpd.get_length()
            bases_covered = sum([x.length() for x in segments])
            bases_area = sum([x.length()*x.get_payload() for x in segments])
            avg_depth = float(bases_area)/float(rlen)
            if avg_depth < args.minimum_average_depth:
                continue
            if bases_covered < args.minimum_length:
                continue
            # Start every exonic position (zero indexed) at depth 0, then
            # overwrite the positions that have observed coverage.
            total_positions = {}
            for ex in ref_gpd.exons:
                b = ex.get_range().get_bed_array()
                for i in range(b[1], b[2]):
                    total_positions[i] = 0
            for b in segments:
                depth = b.get_payload()
                barr = b.get_bed_array()
                for i in range(barr[1], barr[2]):
                    total_positions[i] = depth
            transcript_depths[tname] = total_positions
    sys.stderr.write("have information needed to plot from "+str(len(transcript_depths))+" transcripts\n")

    for tname in transcript_depths:
        depths = transcript_depths[tname]
        positions = sorted(depths.keys())
        tx_len = len(positions)
        bins = {}
        for i, pos in enumerate(positions):
            # Map the i-th position onto a bin index in 0..99.
            bin_index = int(100*float(i)/float(tx_len))
            bins.setdefault(bin_index, []).append(depths[pos])
        for bin_index in bins:
            bins[bin_index] = average(bins[bin_index])
        biggest = float(max(bins.values()))
        if biggest == 0:
            # Every bin is empty, so the profile cannot be scaled by its
            # maximum; skip instead of dividing by zero.
            sys.stderr.write("WARNING: skipping "+tname+" because it has no coverage\n")
            continue
        tx_array = [float(bins[x])/biggest for x in sorted(bins.keys())]
        if tx_strand[tname] == '-':
            tx_array.reverse()
        args.output.write(tname+"\t"+"\t".join([str(x) for x in tx_array])+"\n")
    args.output.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def get_overlap(fileA, fileB, min_A, min_B):
    """Sweep two depth files together and total their sizes and overlap.

    Both files are opened with ``do_open`` and consumed through
    ``read_next`` using ``min_A`` / ``min_B`` as the respective thresholds.
    The current record of each file is compared with ``cmp``: on ``0`` the
    overlap size is added to the total and each side either keeps the tail
    left after subtracting the other record or advances to its next
    record; on ``-1`` file A advances; otherwise file B advances.
    NOTE(review): the sweep only moves forward, so both inputs are
    presumably sorted in the same order -- confirm with callers.

    Returns ``[min_A, min_B, sizeA, sizeB, tot]`` where the sizes are the
    summed ``length()`` of every record read from each file and ``tot`` is
    the summed overlap.
    """
    def _carry_tail(pieces, current):
        # If the last leftover piece reaches the end of ``current``, return
        # it as a new Bed carrying the same payload; otherwise None.
        # NOTE(review): start-1 presumably converts a 1-based ``start``
        # attribute back to the 0-based constructor argument -- confirm.
        if len(pieces) > 0 and pieces[-1].end == current.end:
            payload = current.get_payload()
            tail = Bed(pieces[-1].chr, pieces[-1].start-1, pieces[-1].end)
            tail.set_payload(payload)
            return tail
        return None

    handle_a = do_open(fileA)
    handle_b = do_open(fileB)
    cur_a = read_next(handle_a, min_A)
    cur_b = read_next(handle_b, min_B)
    shared = 0
    total_a = 0
    total_b = 0
    if cur_a:
        total_a = cur_a.length()
    if cur_b:
        total_b = cur_b.length()

    while cur_a and cur_b:
        order = cur_a.cmp(cur_b)
        if order == 0:
            shared += cur_a.overlap_size(cur_b)
            overlapped_a = cur_a
            kept = _carry_tail(cur_a.subtract(cur_b), cur_a)
            if kept is not None:
                cur_a = kept
            else:
                cur_a = read_next(handle_a, min_A)
                if cur_a:
                    total_a += cur_a.length()
            # B is trimmed against the A record that actually overlapped it,
            # not against whatever A has just become.
            kept = _carry_tail(cur_b.subtract(overlapped_a), cur_b)
            if kept is not None:
                cur_b = kept
            else:
                cur_b = read_next(handle_b, min_B)
                if cur_b:
                    total_b += cur_b.length()
        elif order == -1:
            cur_a = read_next(handle_a, min_A)
            if cur_a:
                total_a += cur_a.length()
        else:
            cur_b = read_next(handle_b, min_B)
            if cur_b:
                total_b += cur_b.length()

    # One side ran out; drain whatever remains on the other so its size
    # total covers the whole file.
    if cur_a:
        cur_a = read_next(handle_a, min_A)
        while cur_a:
            total_a += cur_a.length()
            cur_a = read_next(handle_a, min_A)
    if cur_b:
        cur_b = read_next(handle_b, min_B)
        while cur_b:
            total_b += cur_b.length()
            cur_b = read_next(handle_b, min_B)

    handle_a.close()
    handle_b.close()
    return [min_A, min_B, total_a, total_b, shared]
def main():
    """Turn a GTF with per-transcript TPM values into a coverage file.

    Reads the GTF named by the ``input`` argument (``-`` for STDIN, a
    ``.gz`` suffix selects ``gzip.open``). For every ``transcript`` line the
    ``TPM`` attribute is scaled to ``int(tpm * mult + offset)``; for every
    ``exon`` line a ``Bed`` (start converted to 0-based) is stored under its
    ``transcript_id``. Each transcript with at least ``--min_exons`` exons
    contributes its exon list that many times to the ranges handed to
    ``ranges_to_coverage``. The resulting ranges are written as
    ``chr<TAB>start-1<TAB>end<TAB>payload`` lines to ``--output`` (or STDOUT).
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Use - for STDIN")
    parser.add_argument('--offset', type=int, default=0,
                        help="add this much to transcript tpms")
    parser.add_argument('--mult', type=int, default=10,
                        help="multiply this much to tpms")
    parser.add_argument('--min_exons', type=int, default=1,
                        help="require at least this many exons")
    parser.add_argument('-o', '--output',
                        help="OUTPUT file or nothing for STDOUT")
    args = parser.parse_args()

    # NOTE(review): on Python 3 gzip.open defaults to binary mode, so the
    # text handling below would need 'rt'/'wt' there -- confirm the target
    # interpreter before changing the modes.
    inf = sys.stdin
    if args.input != '-':
        if args.input[-3:] == '.gz':
            inf = gzip.open(args.input)
        else:
            inf = open(args.input)

    # Raw strings keep "\s" a regex escape rather than an invalid string
    # escape; compiling once avoids re-resolving the pattern on every line.
    tx_pattern = re.compile(r'transcript_id\s+"([^"]+)"')
    tpm_pattern = re.compile(r'TPM\s+"([^"]+)"')

    sys.stderr.write("Reading gtf file\n")
    txs = {}
    try:
        for line in inf:
            if line.startswith('#'):
                continue
            f = line.rstrip().split("\t")
            if len(f) < 9:
                # Blank or truncated line: no feature/attribute columns.
                continue
            feature = f[2]
            if feature != 'exon' and feature != 'transcript':
                continue
            tx = tx_pattern.search(f[8]).group(1)
            if tx not in txs:
                txs[tx] = {'tpm': 0, 'exons': []}
            if feature == 'transcript':
                tpm = float(tpm_pattern.search(f[8]).group(1))
                txs[tx]['tpm'] = int((tpm * float(args.mult)) + args.offset)
            else:
                # GTF starts are 1-based; Bed is built with a 0-based start.
                txs[tx]['exons'].append(Bed(f[0], int(f[3]) - 1, int(f[4])))
    finally:
        inf.close()

    vals = []
    sys.stderr.write("Traversing annotation file\n")
    for tx in txs:
        exons = txs[tx]['exons']
        copies = txs[tx]['tpm']
        if len(exons) < args.min_exons:
            continue
        # Weight the transcript by repeating its exons once per scaled TPM
        # unit (a non-positive count adds nothing).
        for _ in range(copies):
            vals.extend(exons)
    sys.stderr.write("Generating coverage file " + str(len(vals)) + "\n")

    of = sys.stdout
    if args.output:
        if args.output[-3:] == '.gz':
            of = gzip.open(args.output, 'w')
        else:
            of = open(args.output, 'w')
    try:
        covs = ranges_to_coverage(vals)
        for v in covs:
            of.write(v.chr + "\t" + str(v.start - 1) + "\t" + str(v.end)
                     + "\t" + str(v.get_payload()) + "\n")
    finally:
        of.close()