def do_buffer(buffer, txome, args):
    """Annotate a batch of genepred lines and format the hits as report rows.

    Each element of ``buffer`` is indexed as ``(line, z)``: ``line`` is passed
    to ``GPD`` and ``z`` is echoed back as the first output column.  Every
    parsed entry is handed to ``annotate_line`` together with ``txome`` and
    ``args``; entries for which it returns a falsy value are skipped.

    Returns a list of newline-terminated, tab-delimited strings, one per
    annotated entry.  An empty ``buffer`` yields an empty list.
    """
    results = []
    for line_z in buffer:
        line = line_z[0]
        z = line_z[1]
        gpd = GPD(line)
        v = annotate_line(gpd, txome, args)
        if not v:
            continue
        # v[0] is a flag selecting the 'full' versus 'partial' label.
        match_type = 'full' if v[0] else 'partial'
        exon_count = v[2]
        most_consecutive_exons = v[3]
        read_exon_count = v[4]
        tx_exon_count = v[5]
        overlap_size = v[6]
        read_length = v[7]
        tx_length = v[8]
        # v[9] is the matched reference entry; it exposes the same accessors
        # as the read's GPD plus a payload.
        target = v[9]
        fields = [
            str(z),
            gpd.get_gene_name(),
            target.get_gene_name(),
            target.get_transcript_name(),
            match_type,
            str(exon_count),
            str(most_consecutive_exons),
            str(read_exon_count),
            str(tx_exon_count),
            str(overlap_size),
            str(read_length),
            str(tx_length),
            gpd.get_range().get_range_string(),
            target.get_range().get_range_string(),
            str(target.get_payload()),
        ]
        results.append("\t".join(fields) + "\n")
    return results
def main():
    """Compute normalized per-transcript coverage profiles from a bed depth.

    Steps, in order:
      1. Read the reference genepred, grouping transcripts by gene name.
      2. Optionally drop genes with more than ``args.maximum_isoforms``
         transcripts, then drop transcripts shorter than
         ``args.minimum_length``.
      3. Write every remaining exon to ``<tempdir>/gpd.bed`` and intersect it
         with ``args.bed_depth`` using ``bedtools intersect -wo``.
      4. For transcripts passing the average-depth and covered-length
         thresholds, build a per-position depth map over the exons.
      5. Average the depths into percentile bins along the transcript, scale
         by the largest bin, reverse for '-' strand transcripts, and write
         one tab-delimited row per transcript to ``args.output``.
      6. Remove the temporary directory unless a specific one was requested.
    """
    # do our inputs
    args = do_inputs()

    sys.stderr.write("Reading reference genepred\n")
    ref = {}
    tx_strand = {}
    z = 0
    with open(args.reference_genepred) as inf:
        for line in inf:
            gpd = GPD(line)
            gname = gpd.get_gene_name()
            tname = gpd.get_transcript_name()
            tx_strand[tname] = gpd.get_strand()
            ref.setdefault(gname, []).append(gpd)
            z += 1
    sys.stderr.write("Read "+str(len(ref))+" genes and "+str(z)+" transcripts\n")

    if args.maximum_isoforms > 0:
        sys.stderr.write("Removing genes with more than "+str(args.maximum_isoforms)+" isoforms.\n")
        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError on Python 3.
        for gname in list(ref.keys()):
            if len(ref[gname]) > args.maximum_isoforms:
                del ref[gname]
        sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum([len(ref[x]) for x in ref]))+" transcripts\n")

    sys.stderr.write("Filtering by length "+str(args.minimum_length)+" bp\n")
    # Snapshot the keys here as well, since genes may be deleted in the loop.
    for gname in list(ref.keys()):
        passing = [gpd for gpd in ref[gname]
                   if not gpd.get_length() < args.minimum_length]
        if len(passing) == 0:
            del ref[gname]
        else:
            ref[gname] = passing
    sys.stderr.write("Now have "+str(len(ref))+" genes and "+str(sum([len(ref[x]) for x in ref]))+" transcripts\n")

    sys.stderr.write("Converting gpd into exon bed\n")
    beds = []
    for gname in ref:
        for gpd in ref[gname]:
            tname = gpd.get_transcript_name()
            for i, ex in enumerate(gpd.exons):
                beds.append(ex.get_range().get_bed_array()+[gname, tname, i])
    gpd_bed_path = args.tempdir+'/gpd.bed'
    with open(gpd_bed_path, 'w') as of:
        for bed in sorted(beds, key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5])):
            of.write("\t".join([str(x) for x in bed])+"\n")

    sys.stderr.write("intersecting with bed depth\n")
    intersect_path = args.tempdir+'/intersect.bed'
    # Pass the command as an argv list so a temp directory containing
    # whitespace is not split into several arguments.
    cmd = ['bedtools', 'intersect', '-wo', '-a', '-', '-b', gpd_bed_path]
    # The context manager guarantees the output handle is closed before the
    # file is reopened for reading below.
    with open(intersect_path, 'w') as of:
        p = Popen(cmd, stdin=args.bed_depth, stdout=of)
        p.communicate()

    coverage = {}
    sys.stderr.write("Reading the intersection\n")
    with open(intersect_path) as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            # Columns 0-3 come from the depth bed (chrom, start, end, depth);
            # columns 4-8 come from gpd.bed (chrom, start, end, gene, transcript).
            gname = f[7]
            tname = f[8]
            depth = int(f[3])
            bed1 = Bed(f[0], int(f[1]), int(f[2]))
            bed2 = Bed(f[4], int(f[5]), int(f[6]))
            bed = bed1.union(bed2)
            bed.set_payload(depth)
            coverage.setdefault(gname, {}).setdefault(tname, []).append(bed)

    transcript_depths = {}
    for gname in coverage:
        for tname in coverage[gname]:
            segments = coverage[gname][tname]
            ref_gpd = [x for x in ref[gname] if x.get_transcript_name() == tname][0]
            rlen = ref_gpd.get_length()
            bases_covered = sum([x.length() for x in segments])
            bases_area = sum([x.length()*x.get_payload() for x in segments])
            avg_depth = float(bases_area)/float(rlen)
            if avg_depth < args.minimum_average_depth:
                continue
            if bases_covered < args.minimum_length:
                continue
            # Start every exonic position at depth zero, then overwrite the
            # positions that have coverage.
            total_positions = {}
            for ex in ref_gpd.exons:
                b = ex.get_range().get_bed_array()
                for i in range(b[1], b[2]):
                    total_positions[i] = 0  # zero indexed
            for b in segments:
                depth = b.get_payload()
                barr = b.get_bed_array()
                for i in range(barr[1], barr[2]):
                    total_positions[i] = depth
            transcript_depths[tname] = total_positions
    sys.stderr.write("have information needed to plot from "+str(len(transcript_depths))+" transcripts\n")

    for tname in transcript_depths:
        depths = transcript_depths[tname]
        positions = sorted(depths.keys())
        tx_len = len(positions)
        bins = {}
        for i in range(0, tx_len):
            # Map position index i to a percentile bin in the range 0..99.
            bin_index = int(100*float(i)/float(tx_len))
            bins.setdefault(bin_index, []).append(depths[positions[i]])
        for bin_index in bins:
            bins[bin_index] = average(bins[bin_index])
        # NOTE(review): if every bin averages to zero this division raises
        # ZeroDivisionError -- confirm whether zero-depth rows can reach here.
        biggest = float(max(bins.values()))
        tx_array = [float(bins[x])/biggest for x in sorted(bins.keys())]
        # Positions are sorted by coordinate, so '-' strand profiles are
        # flipped to the opposite orientation.
        if tx_strand[tname] == '-':
            tx_array.reverse()
        args.output.write(tname+"\t"+"\t".join([str(x) for x in tx_array])+"\n")
    args.output.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)