def union_exons(refbed):
    '''
    Merge all exons listed in the BED file `refbed` and return whatever
    build_bitsets produces for the merged regions.
    '''
    from qcmodule import BED

    parsed = BED.ParseBED(refbed)
    merged = BED.unionBed3(parsed.getExon())
    return build_bitsets(merged)
def process_gene_model(gene_model):
    '''
    Split the gene model (BED file `gene_model`) into CDS exon, 5'UTR,
    3'UTR, intron and up/down-stream intergenic (1kb, 5kb, 10kb) regions.

    Every class is merged with BED.unionBed3 and then "purified" with
    BED.subtractBed3 so that CDS exons take precedence over UTRs, UTRs over
    introns, and all genic classes over the intergenic flanks.

    Returns a 20-item tuple: the build_bitsets() result for
    (CDS exon, intron, 5'UTR, 3'UTR, up 1kb, up 5kb, up 10kb,
    down 1kb, down 5kb, down 10kb) followed by the cal_size() result for
    the same ten classes in the same order.
    '''
    print >>sys.stderr, "processing " + gene_model + ' ...',
    bed = BED.ParseBED(gene_model)

    # Raw feature lists, fetched in the same order as before.
    utr_3 = bed.getUTR(utr=3)
    utr_5 = bed.getUTR(utr=5)
    cds_exon = bed.getCDSExon()
    intron = bed.getIntron()

    # Collapse overlapping records inside each class.
    intron = BED.unionBed3(intron)
    cds_exon = BED.unionBed3(cds_exon)
    utr_5 = BED.unionBed3(utr_5)
    utr_3 = BED.unionBed3(utr_3)

    # CDS wins over UTR; CDS and UTR win over intron.
    utr_5 = BED.subtractBed3(utr_5, cds_exon)
    utr_3 = BED.subtractBed3(utr_3, cds_exon)
    for mask in (cds_exon, utr_5, utr_3):
        intron = BED.subtractBed3(intron, mask)

    # Intergenic flanks, keyed by (direction, size).
    fetch_order = [("up", 1000), ("down", 1000),
                   ("up", 5000), ("down", 5000),
                   ("up", 10000), ("down", 10000)]
    by_direction = [(direction, size)
                    for direction in ("up", "down")
                    for size in (1000, 5000, 10000)]

    flanks = {}
    for direction, size in fetch_order:
        flanks[(direction, size)] = bed.getIntergenic(direction=direction, size=size)

    # Merge overlapping flanks.
    for key in by_direction:
        flanks[key] = BED.unionBed3(flanks[key])

    # Remove everything that is genic from the flanks.
    for key in fetch_order:
        for mask in (cds_exon, utr_5, utr_3, intron):
            flanks[key] = BED.subtractBed3(flanks[key], mask)

    flank_regions = [flanks[key] for key in by_direction]

    # Lookup structures.
    cds_bits = build_bitsets(cds_exon)
    utr5_bits = build_bitsets(utr_5)
    utr3_bits = build_bitsets(utr_3)
    intron_bits = build_bitsets(intron)
    flank_bits = [build_bitsets(regions) for regions in flank_regions]

    # Sizes of each class.
    cds_len = cal_size(cds_exon)
    intron_len = cal_size(intron)
    utr3_len = cal_size(utr_3)
    utr5_len = cal_size(utr_5)
    flank_lens = [cal_size(regions) for regions in flank_regions]

    print >>sys.stderr, "Done"
    return ((cds_bits, intron_bits, utr5_bits, utr3_bits)
            + tuple(flank_bits)
            + (cds_len, intron_len, utr5_len, utr3_len)
            + tuple(flank_lens))
def _bigwig_has_chrom(bw, chrom):
    """Return True if a 1-bp probe of `chrom` in the BigWig `bw` succeeds."""
    try:
        bw.get_as_array(chrom, 0, 1).size
    except Exception:
        # Best-effort probe kept from the original code: any failure
        # (presumably get_as_array() returning None for a chromosome that is
        # not in the file) means "skip this chromosome".  KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return False
    return True


def _compute_wigsum(bw, chrom_sizes, refgene_bed, chunk_size, bigwig_path):
    """Return the total signal ("wigsum") stored in `bw`.

    If `refgene_bed` is given, only the merged exons of that gene model are
    summed; otherwise every chromosome in `chrom_sizes` is scanned in chunks
    of `chunk_size` bp.  `bigwig_path` is only used in progress messages.
    """
    wigsum = 0.0
    if refgene_bed:
        print >>sys.stderr, "Extract exons from " + refgene_bed
        obj = BED.ParseBED(refgene_bed)
        exons = obj.getExon()
        print >>sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >>sys.stderr, "Calculate wigsum covered by " + refgene_bed + ' only'
        for chrom, st, end in exons:
            if not _bigwig_has_chrom(bw, chrom):
                continue
            # NaNs are ignored by nansum, but older numpy returns NaN (not 0)
            # when every item is NaN, hence the explicit isnan check.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            wigsum += tmp
        print >>sys.stderr, "Total wigsum is %.2f\n" % wigsum
    else:
        print >>sys.stderr, "Calculate wigsum from " + bigwig_path
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                wigsum += tmp
        print >>sys.stderr, "\nTotal wigsum is %.2f\n" % wigsum
    return wigsum


def _write_bedgraph_runs(out, chrom, offset, signal):
    """Write `signal` to `out` as bedGraph lines, one per run of equal values.

    `signal[k]` is the value at 0-based position `offset + k`.  Consecutive
    positions holding the same non-zero value are merged into a single
    "chrom<TAB>start<TAB>end<TAB>value" line (0-based start, exclusive end);
    zero values are not written.
    """
    run_start = None
    run_value = None
    pos = offset
    for v in signal:
        if run_start is not None and v != run_value:
            out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')
            run_start = None
        if run_start is None and v != 0:
            run_start = pos
            run_value = v
        pos += 1
    if run_start is not None:
        out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')


def _write_scaled_chrom(out, bw, chr_name, chr_size, chunk_size, weight, out_format):
    """Write the signal of one chromosome, scaled by `weight`, in "WIG" or "BGR" format."""
    if out_format == "WIG":
        out.write('variableStep chrom=' + chr_name + '\n')
    for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
        bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
        if numpy.isnan(numpy.nansum(bw_signal)):
            continue
        bw_signal = numpy.nan_to_num(bw_signal) * weight
        if out_format == "WIG":
            # variableStep wig positions are 1-based.
            coord = interval[1]
            for v in bw_signal:
                coord += 1
                if v != 0:
                    out.write("%d\t%.2f\n" % (coord, v))
        else:
            _write_bedgraph_runs(out, chr_name, interval[1], bw_signal)


def main():
    """Normalize a BigWig file to a fixed total signal and write wig or bedGraph.

    The total signal ("wigsum") is computed either over the merged exons of
    the optional reference gene model or over the whole genome; every value
    is then multiplied by total_wigsum / wigsum and written to the output
    file.  Exits with status 0 after printing help when a required option is
    missing, and with status 1 for an unknown output format or a zero wigsum.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr",
                      help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before doing the expensive wigsum pass.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print >>sys.stderr, "unknown output format"
        sys.exit(1)

    OUT = open(options.output_wig, 'w')
    bw_handle = None
    try:
        # BigWig is a binary format, so open it in binary mode.
        bw_handle = open(options.BigWig_File, 'rb')
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = _compute_wigsum(bw, chrom_sizes, options.refgene_bed,
                                  options.chunk_size, options.BigWig_File)
        # Explicit test: once numpy scalars have been added, dividing by a
        # zero sum may yield inf instead of raising ZeroDivisionError.
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >>sys.stderr, "Normalizing bigwig file ..."
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Writing " + chr_name + " ..."
            _write_scaled_chrom(OUT, bw, chr_name, chr_size,
                                options.chunk_size, weight, out_format)
    finally:
        if bw_handle is not None:
            bw_handle.close()
        OUT.close()
def _bigwig_has_chrom(bw, chrom):
    """Return True if a 1-bp probe of `chrom` in the BigWig `bw` succeeds."""
    try:
        bw.get_as_array(chrom, 0, 1).size
    except Exception:
        # Best-effort probe kept from the original code: any failure
        # (presumably get_as_array() returning None for a chromosome that is
        # not in the file) means "skip this chromosome".  KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return False
    return True


def _compute_wigsum(bw, chrom_sizes, refgene_bed, chunk_size, bigwig_path):
    """Return the total signal ("wigsum") stored in `bw`.

    If `refgene_bed` is given, only the merged exons of that gene model are
    summed; otherwise every chromosome in `chrom_sizes` is scanned in chunks
    of `chunk_size` bp.  `bigwig_path` is only used in progress messages.
    """
    wigsum = 0.0
    if refgene_bed:
        print >>sys.stderr, "Extract exons from " + refgene_bed
        obj = BED.ParseBED(refgene_bed)
        exons = obj.getExon()
        print >>sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >>sys.stderr, "Calculate wigsum covered by " + refgene_bed + ' only'
        for chrom, st, end in exons:
            if not _bigwig_has_chrom(bw, chrom):
                continue
            # NaNs are ignored by nansum, but older numpy returns NaN (not 0)
            # when every item is NaN, hence the explicit isnan check.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            wigsum += tmp
        print >>sys.stderr, "Total wigsum is %.2f\n" % wigsum
    else:
        print >>sys.stderr, "Calculate wigsum from " + bigwig_path
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                wigsum += tmp
        print >>sys.stderr, "\nTotal wigsum is %.2f\n" % wigsum
    return wigsum


def main():
    """Normalize a BigWig file to a fixed total signal and write a wiggle file.

    The total signal ("wigsum") is computed either over the merged exons of
    the optional reference gene model or over the whole genome; every value
    is then multiplied by total_wigsum / wigsum and written in variableStep
    wig format.  Exits with status 0 after printing help when a required
    option is missing, and with status 1 when the wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize",
                      help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    bw_handle = None
    try:
        # BigWig is a binary format, so open it in binary mode.
        bw_handle = open(options.BigWig_File, 'rb')
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = _compute_wigsum(bw, chrom_sizes, options.refgene_bed,
                                  options.chunk_size, options.BigWig_File)
        # Explicit test: once numpy scalars have been added, dividing by a
        # zero sum may yield inf instead of raising ZeroDivisionError.
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >>sys.stderr, "Normalizing bigwig file, output wiggle file"
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Writing " + chr_name + " ..."
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                # variableStep wig positions are 1-based.
                coord = interval[1]
                for v in numpy.nan_to_num(bw_signal):
                    coord += 1
                    if v != 0:
                        OUT.write("%d\t%.4f\n" % (coord, v * weight))
    finally:
        if bw_handle is not None:
            bw_handle.close()
        OUT.close()
def _bigwig_has_chrom(bw, chrom):
    """Return True if a 1-bp probe of `chrom` in the BigWig `bw` succeeds."""
    try:
        bw.get_as_array(chrom, 0, 1).size
    except Exception:
        # Best-effort probe kept from the original code: any failure
        # (presumably get_as_array() returning None for a chromosome that is
        # not in the file) means "skip this chromosome".  KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return False
    return True


def _compute_wigsum(bw, chrom_sizes, refgene_bed, chunk_size, bigwig_path):
    """Return the total signal ("wigsum") stored in `bw`.

    If `refgene_bed` is given, only the merged exons of that gene model are
    summed; otherwise every chromosome in `chrom_sizes` is scanned in chunks
    of `chunk_size` bp.  `bigwig_path` is only used in progress messages.
    """
    wigsum = 0.0
    if refgene_bed:
        print >>sys.stderr, "Extract exons from " + refgene_bed
        obj = BED.ParseBED(refgene_bed)
        exons = obj.getExon()
        print >>sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >>sys.stderr, "Calculate wigsum covered by " + refgene_bed + ' only'
        for chrom, st, end in exons:
            if not _bigwig_has_chrom(bw, chrom):
                continue
            # NaNs are ignored by nansum, but older numpy returns NaN (not 0)
            # when every item is NaN, hence the explicit isnan check.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            wigsum += tmp
        print >>sys.stderr, "Total wigsum is %.2f\n" % wigsum
    else:
        print >>sys.stderr, "Calculate wigsum from " + bigwig_path
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                wigsum += tmp
        print >>sys.stderr, "\nTotal wigsum is %.2f\n" % wigsum
    return wigsum


def _write_bedgraph_runs(out, chrom, offset, signal):
    """Write `signal` to `out` as bedGraph lines, one per run of equal values.

    `signal[k]` is the value at 0-based position `offset + k`.  Consecutive
    positions holding the same non-zero value are merged into a single
    "chrom<TAB>start<TAB>end<TAB>value" line (0-based start, exclusive end);
    zero values are not written.
    """
    run_start = None
    run_value = None
    pos = offset
    for v in signal:
        if run_start is not None and v != run_value:
            out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')
            run_start = None
        if run_start is None and v != 0:
            run_start = pos
            run_value = v
        pos += 1
    if run_start is not None:
        out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')


def _write_scaled_chrom(out, bw, chr_name, chr_size, chunk_size, weight, out_format):
    """Write the signal of one chromosome, scaled by `weight`, in "WIG" or "BGR" format."""
    if out_format == "WIG":
        out.write('variableStep chrom=' + chr_name + '\n')
    for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
        bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
        if numpy.isnan(numpy.nansum(bw_signal)):
            continue
        bw_signal = numpy.nan_to_num(bw_signal) * weight
        if out_format == "WIG":
            # variableStep wig positions are 1-based.
            coord = interval[1]
            for v in bw_signal:
                coord += 1
                if v != 0:
                    out.write("%d\t%.2f\n" % (coord, v))
        else:
            _write_bedgraph_runs(out, chr_name, interval[1], bw_signal)


def main():
    """Normalize a BigWig file to a fixed total signal and write wig or bedGraph.

    The total signal ("wigsum") is computed either over the merged exons of
    the optional reference gene model or over the whole genome; every value
    is then multiplied by total_wigsum / wigsum and written to the output
    file.  Exits with status 0 after printing help when a required option is
    missing, and with status 1 for an unknown output format or a zero wigsum.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile",action="store",type="string",dest="BigWig_File",help="Input BigWig file. [required]")
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file. [required]")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t","--wigsum",action="store",type="int",dest="total_wigsum",default=100000000,help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed format. [optional]")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=500000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f","--format",action="store",type="string",dest="out_format",default="bgr",help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before doing the expensive wigsum pass.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print >>sys.stderr, "unknown output format"
        sys.exit(1)

    OUT = open(options.output_wig,'w')
    bw_handle = None
    try:
        # BigWig is a binary format, so open it in binary mode.
        bw_handle = open(options.BigWig_File, 'rb')
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = _compute_wigsum(bw, chrom_sizes, options.refgene_bed,
                                  options.chunk_size, options.BigWig_File)
        # Explicit test: once numpy scalars have been added, dividing by a
        # zero sum may yield inf instead of raising ZeroDivisionError.
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum/WIG_SUM

        print >>sys.stderr, "Normalizing bigwig file ..."
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Writing " + chr_name + " ..."
            _write_scaled_chrom(OUT, bw, chr_name, chr_size,
                                options.chunk_size, weight, out_format)
    finally:
        if bw_handle is not None:
            bw_handle.close()
        OUT.close()
def _sum_pybigwig_signal(bw, chrom_sizes, refgene_bed, chunk_size, bigwig_path):
    """Return the total signal ("wigsum") stored in the pyBigWig handle `bw`.

    If `refgene_bed` is given, only the merged exons of that gene model are
    summed; otherwise every chromosome in `chrom_sizes` is scanned in chunks
    of `chunk_size` bp.  `bigwig_path` is only used in progress messages.
    """
    total = 0.0
    if refgene_bed:
        print("Extract exons from " + refgene_bed, file=sys.stderr)
        obj = BED.ParseBED(refgene_bed)
        exons = obj.getExon()
        print("Merge overlapping exons ...", file=sys.stderr)
        exons = BED.unionBed3(exons)
        print("Calculate wigsum covered by " + refgene_bed + ' only', file=sys.stderr)
        for chrom, st, end in exons:
            # The gene model may list chromosomes that are absent from the
            # BigWig file, or exons reaching past the chromosome end; skip or
            # clamp them instead of passing an invalid interval to pyBigWig.
            if chrom not in chrom_sizes:
                continue
            end = min(end, chrom_sizes[chrom])
            if st >= end:
                continue
            if bw.stats(chrom, st, end)[0] is None:
                continue
            # NaNs are ignored by nansum, but older numpy returns NaN (not 0)
            # when every item is NaN, hence the explicit isnan check.
            tmp = numpy.nansum(bw.values(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            total += tmp
        print("Total wigsum is %.2f\n" % total, file=sys.stderr)
    else:
        print("Calculate wigsum from " + bigwig_path, file=sys.stderr)
        for chr_name, chr_size in chrom_sizes.items():
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue
            print("Processing " + chr_name + " ...", file=sys.stderr)
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
                if bw.stats(interval[0], interval[1], interval[2])[0] is None:
                    continue
                tmp = numpy.nansum(bw.values(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                total += tmp
        print("\nTotal wigsum is %.2f\n" % total, file=sys.stderr)
    return total


def _write_bedgraph_runs(out, chrom, offset, signal):
    """Write `signal` to `out` as bedGraph lines, one per run of equal values.

    `signal[k]` is the value at 0-based position `offset + k`.  Consecutive
    positions holding the same non-zero value are merged into a single
    "chrom<TAB>start<TAB>end<TAB>value" line (0-based start, exclusive end);
    zero values are not written.
    """
    run_start = None
    run_value = None
    pos = offset
    for v in signal:
        if run_start is not None and v != run_value:
            out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')
            run_start = None
        if run_start is None and v != 0:
            run_start = pos
            run_value = v
        pos += 1
    if run_start is not None:
        out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')


def main():
    """Normalize a BigWig file to a fixed total signal and write wig or bedGraph.

    Chromosome sizes are read from the BigWig header.  The total signal
    ("wigsum") is computed either over the merged exons of the optional
    reference gene model or over the whole genome; every value is then
    multiplied by total_wigsum / wigsum and written to the output file.
    Exits with status 0 after printing help when a required option is
    missing or when the input is not a BigWig file, and with status 1 for an
    unknown output format or a zero wigsum.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
                      help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000,
                      help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr",
                      help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before doing the expensive wigsum pass.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print("unknown output format", file=sys.stderr)
        sys.exit(1)

    with open(options.output_wig, 'w') as OUT:
        bw = pyBigWig.open(options.BigWig_File)
        try:
            if not bw.isBigWig():
                print("%s is not a bigwig file!" % options.BigWig_File, file=sys.stderr)
                # NOTE(review): exit status 0 on this error is kept for
                # backward compatibility; a non-zero status may be preferable.
                sys.exit(0)

            print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
            chrom_sizes = dict(bw.chroms())

            WIG_SUM = _sum_pybigwig_signal(bw, chrom_sizes, options.refgene_bed,
                                           options.chunk_size, options.BigWig_File)
            # Explicit test: once numpy scalars have been added, dividing by
            # a zero sum may yield inf instead of raising ZeroDivisionError.
            if WIG_SUM == 0:
                print("Error, WIG_SUM cannot be 0", file=sys.stderr)
                sys.exit(1)
            weight = options.total_wigsum / WIG_SUM

            print("Normalizing bigwig file ...", file=sys.stderr)
            for chr_name, chr_size in chrom_sizes.items():
                if bw.stats(chr_name, 0, chr_size)[0] is None:
                    print("Skip " + chr_name + "!", file=sys.stderr)
                    continue
                print("Writing " + chr_name + " ...", file=sys.stderr)
                if out_format == "WIG":
                    OUT.write('variableStep chrom=' + chr_name + '\n')
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    bw_signal = bw.values(chr_name, interval[1], interval[2])
                    if numpy.isnan(numpy.nansum(bw_signal)):
                        continue
                    bw_signal = numpy.nan_to_num(bw_signal) * weight
                    if out_format == "WIG":
                        # variableStep wig positions are 1-based.
                        coord = interval[1]
                        for v in bw_signal:
                            coord += 1
                            if v != 0:
                                OUT.write("%d\t%.2f\n" % (coord, v))
                    else:
                        _write_bedgraph_runs(OUT, chr_name, interval[1], bw_signal)
        finally:
            bw.close()
def main():
    """Count SAM alignments falling into each class of gene-model region.

    The reference gene model is split into CDS exons, 5'/3' UTRs, introns
    and 1kb/5kb/10kb up/down-stream intergenic flanks (each class merged and
    made disjoint from the higher-priority classes).  Every block of every
    uniquely mapped read is reduced to its mid-point and stored per
    chromosome; base_read() then reports total bases and read count per
    class.  All output goes to stderr.  Exits with status 0 after printing
    help or when an input file does not exist.
    """
    usage="%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in SAM format. Use \"-\" represents standard input [required]")
    parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. [required]")
    (options,args)=parser.parse_args()

    if not (options.input_file and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.ref_gene_model):
        print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
        sys.exit(0)
    if os.path.exists(options.input_file):
        file_obj = open(options.input_file)
    elif options.input_file == '-':
        file_obj = sys.stdin
    else:
        print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    print >>sys.stderr, "processing " + options.ref_gene_model + ' ...',
    obj = BED.ParseBED(options.ref_gene_model)
    utr_3 = obj.getUTR(utr=3)
    utr_5 = obj.getUTR(utr=5)
    cds_exon = obj.getCDSExon()
    intron = obj.getIntron()

    intron = BED.unionBed3(intron)
    cds_exon = BED.unionBed3(cds_exon)
    utr_5 = BED.unionBed3(utr_5)
    utr_3 = BED.unionBed3(utr_3)

    # CDS wins over UTR; CDS and UTR win over intron.
    utr_5 = BED.subtractBed3(utr_5, cds_exon)
    utr_3 = BED.subtractBed3(utr_3, cds_exon)
    for mask in (cds_exon, utr_5, utr_3):
        intron = BED.subtractBed3(intron, mask)

    # Intergenic flanks keyed by (direction, size): fetch, merge, then strip
    # everything that is genic.
    flank_keys = [("up", 1000), ("down", 1000),
                  ("up", 5000), ("down", 5000),
                  ("up", 10000), ("down", 10000)]
    flanks = {}
    for direction, size in flank_keys:
        flanks[(direction, size)] = obj.getIntergenic(direction=direction, size=size)
    for key in flank_keys:
        flanks[key] = BED.unionBed3(flanks[key])
    for key in flank_keys:
        for mask in (cds_exon, utr_5, utr_3, intron):
            flanks[key] = BED.subtractBed3(flanks[key], mask)
    print >>sys.stderr, "Done"

    ranges = {}
    totalReads = 0
    spliceReads = 0
    cUR = 0
    multiMapReads = 0

    print >>sys.stderr, "reading SAM file",
    try:
        for line in file_obj:
            if line.startswith("@"):
                continue
            fields = line.rstrip('\n ').split()
            flagCode = int(fields[1])
            if (flagCode & 0x0004) != 0:
                continue    # skip unmapped reads
            totalReads += 1
            if not SAM.ParseSAM._uniqueHit_pat.search(line):
                multiMapReads += 1    # skip multiple mapped reads
                continue

            chrom = fields[2].upper()
            chromStart = int(fields[3]) - 1
            # e.g. "9M4721N63M3157N8M" gives [9, 4721, 63, 3157, 8]:
            # even indexes are block sizes, odd indexes are gaps.
            comb = [int(i) for i in SAM.ParseSAM._splicedHit_pat.findall(fields[5])]
            cUR += (len(comb) + 1) // 2
            if len(comb) > 1:
                spliceReads += 1

            for i in range(0, len(comb), 2):
                st = chromStart + sum(comb[:i])
                mid = int(st) + (comb[i] // 2)
                if chrom not in ranges:
                    ranges[chrom] = Intersecter()
                # Always add the fragment; previously the first fragment seen
                # on each chromosome was dropped because the add lived in the
                # "else" branch of the check above.
                ranges[chrom].add_interval(Interval(mid, mid))
    finally:
        if file_obj is not sys.stdin:
            file_obj.close()
    print >>sys.stderr, "Done"

    print >>sys.stderr, "Total Reads: " + str(totalReads)
    print >>sys.stderr, "Multiple Hits: " + str(multiMapReads)
    print >>sys.stderr, "Unique Hits: " + str(totalReads-multiMapReads)
    print >>sys.stderr, "Spliced Hits: " + str(spliceReads)
    print >>sys.stderr, "Total fragments: " + str(cUR)

    print >>sys.stderr, "\nAssignning reads ...",
    # (label, regions) in the order the counts are computed.
    count_order = [
        ("Intronic region:", intron),
        ("CDS Exons:", cds_exon),
        ("5'UTR Exons:", utr_5),
        ("3'UTR Exons:", utr_3),
        ("TSS up 1kb:", flanks[("up", 1000)]),
        ("TSS up 5kb:", flanks[("up", 5000)]),
        ("TSS up 10kb:", flanks[("up", 10000)]),
        ("TES down 1kb:", flanks[("down", 1000)]),
        ("TES down 5kb:", flanks[("down", 5000)]),
        ("TES down 10kb:", flanks[("down", 10000)]),
    ]
    counts = {}
    for label, regions in count_order:
        counts[label] = base_read(regions, ranges)
    print >>sys.stderr, "Done"

    report_order = ["CDS Exons:", "5'UTR Exons:", "3'UTR Exons:", "Intronic region:",
                    "TSS up 1kb:", "TSS up 5kb:", "TSS up 10kb:",
                    "TES down 1kb:", "TES down 5kb:", "TES down 10kb:"]
    print >>sys.stderr, "========================================================="
    print >>sys.stderr, "Group\tTotal_bases\tReads_count\tReads/Kb"
    for label in report_order:
        bases, reads = counts[label]
        # A region class can be empty; report a density of 0 instead of
        # dividing by zero.
        if bases:
            per_kb = reads * 1000.0 / bases
        else:
            per_kb = 0.0
        print >>sys.stderr, "%s\t%d\t%d\t%5.2f" % (label, bases, reads, per_kb)
    print >>sys.stderr, "========================================================="
def _bigwig_has_chrom(bw, chrom):
    """Return True if a 1-bp probe of `chrom` in the BigWig `bw` succeeds."""
    try:
        bw.get_as_array(chrom, 0, 1).size
    except Exception:
        # Best-effort probe kept from the original code: any failure
        # (presumably get_as_array() returning None for a chromosome that is
        # not in the file) means "skip this chromosome".  KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return False
    return True


def _compute_wigsum(bw, chrom_sizes, refgene_bed, chunk_size, bigwig_path):
    """Return the total signal ("wigsum") stored in `bw`.

    If `refgene_bed` is given, only the merged exons of that gene model are
    summed; otherwise every chromosome in `chrom_sizes` is scanned in chunks
    of `chunk_size` bp.  `bigwig_path` is only used in progress messages.
    """
    wigsum = 0.0
    if refgene_bed:
        print >>sys.stderr, "Extract exons from " + refgene_bed
        obj = BED.ParseBED(refgene_bed)
        exons = obj.getExon()
        print >>sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >>sys.stderr, "Calculate wigsum covered by " + refgene_bed + " only"
        for chrom, st, end in exons:
            if not _bigwig_has_chrom(bw, chrom):
                continue
            # NaNs are ignored by nansum, but older numpy returns NaN (not 0)
            # when every item is NaN, hence the explicit isnan check.
            tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
            if numpy.isnan(tmp):
                continue
            wigsum += tmp
        print >>sys.stderr, "Total wigsum is %.2f\n" % wigsum
    else:
        print >>sys.stderr, "Calculate wigsum from " + bigwig_path
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
                tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                wigsum += tmp
        print >>sys.stderr, "\nTotal wigsum is %.2f\n" % wigsum
    return wigsum


def main():
    """Normalize a BigWig file to a fixed total signal and write a wiggle file.

    The total signal ("wigsum") is computed either over the merged exons of
    the optional reference gene model or over the whole genome; every value
    is then multiplied by total_wigsum / wigsum and written in variableStep
    wig format.  Exits with status 0 after printing help when a required
    option is missing, and with status 1 when the wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File",
        help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig",
        help="Output wig file. [required]"
    )
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]",
    )
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, "w")
    bw_handle = None
    try:
        # BigWig is a binary format, so open it in binary mode.
        bw_handle = open(options.BigWig_File, "rb")
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = _compute_wigsum(bw, chrom_sizes, options.refgene_bed,
                                  options.chunk_size, options.BigWig_File)
        # Explicit test: once numpy scalars have been added, dividing by a
        # zero sum may yield inf instead of raising ZeroDivisionError.
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >>sys.stderr, "Normalizing bigwig file, output wiggle file"
        for chr_name, chr_size in chrom_sizes.items():
            if not _bigwig_has_chrom(bw, chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Writing " + chr_name + " ..."
            OUT.write("variableStep chrom=" + chr_name + "\n")
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                # variableStep wig positions are 1-based.
                coord = interval[1]
                for v in numpy.nan_to_num(bw_signal):
                    coord += 1
                    if v != 0:
                        OUT.write("%d\t%.4f\n" % (coord, v * weight))
    finally:
        if bw_handle is not None:
            bw_handle.close()
        OUT.close()