def union_exons(refbed):
    '''
    Build bitsets from the union of all exons defined in a BED file.

    refbed: path of the reference gene model handed to BED.ParseBED.
    Returns whatever build_bitsets() produces for the exon list after it
    has been passed through BED.unionBed3.
    '''
    from qcmodule import BED

    gene_model = BED.ParseBED(refbed)
    merged_exons = BED.unionBed3(gene_model.getExon())
    return build_bitsets(merged_exons)
def main():
    '''
    Combine two BigWig files position by position and write the result as a
    variableStep wig file.

    Command line options select the two BigWig inputs (-i/-j), the keyword of
    the combining function looked up on the twoList module (-a), the output
    file (-o), the chromosome size table (-s) and the chunk size used to walk
    each chromosome (-c).  Help is printed and the program exits when a
    required file option is missing; an unknown or missing action is reported
    on stderr before any work is done.
    '''
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="BigWig files")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="BigWig files")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once, up front, so a missing or
    # misspelled keyword is reported immediately instead of failing with a
    # TypeError/AttributeError in the middle of the run.
    call_back = None
    if options.action:
        call_back = getattr(twoList, options.action, None)
    if not callable(call_back):
        print >>sys.stderr, "Unknown or missing action (-a): " + str(options.action)
        sys.exit(1)

    OUT=open(options.output_wig,'w')
    # BigWig is a binary format; open the inputs in binary mode.
    bw_handle1 = open(options.BigWig_File1,'rb')
    bw_handle2 = open(options.BigWig_File2,'rb')
    try:
        bw1 = BigWigFile( file=bw_handle1 )
        bw2 = BigWigFile( file=bw_handle2 )
        chrom_sizes = load_chromsize(options.chromSize)

        for chr_name, chr_size in chrom_sizes.items():        #iterate each chrom
            print >>sys.stderr, "Processing " + chr_name + " ..."
            OUT.write('variableStep chrom='+chr_name+'\n')
            for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
                # coord is advanced before each value is written, so the
                # first value of a chunk is reported at interval[1] + 1.
                coord = interval[1]
                bw_signal1 = bw1.get_as_array(chr_name,interval[1],interval[2])
                bw_signal2 = bw2.get_as_array(chr_name,interval[1],interval[2])
                if all_nan(bw_signal1) and all_nan(bw_signal2):
                    continue
                bw_signal1 = replace_nan( bw_signal1 )
                bw_signal2 = replace_nan( bw_signal2 )
                for v in call_back(bw_signal1,bw_signal2):
                    coord +=1
                    if v != 0: print >>OUT, "%d\t%.2f" % (coord,v)
    finally:
        # Make sure buffered output reaches disk and handles are released
        # even when a chunk fails.
        OUT.close()
        bw_handle1.close()
        bw_handle2.close()
def main():
    '''
    Combine two BigWig files (read through pyBigWig) position by position and
    write the result as a variableStep wig file.

    Chromosome sizes are taken from the headers of both files; when both
    files list a chromosome the size reported by the second file is used.
    The combining function is looked up by keyword (-a) on the twoList
    module.  A signal that cannot be read for a chunk (chromosome absent from
    that file, or no data) is treated as zeros, the same way NaN positions
    are; a chunk is skipped only when neither file has data for it.
    Positions whose combined value is 0 are not written.
    '''
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file.")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file. Both BigWig files should use the same reference genome.")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once and fail early on a bad keyword
    # instead of raising from getattr() in the middle of the run.
    call_back = getattr(twoList, options.action, None) if options.action else None
    if not callable(call_back):
        print("Unknown or missing action (-a): " + str(options.action), file=sys.stderr)
        sys.exit(1)

    def read_signal(bw, chrom, start, end):
        '''Return the per-base values of one chunk, or None when *bw* has no data for it.'''
        try:
            if bw.stats(chrom, start, end)[0] is None:
                return None
            return numpy.array(bw.values(chrom, start, end), dtype=float)
        except RuntimeError:
            # pyBigWig raises RuntimeError for a chromosome or bounds the
            # file does not contain (e.g. chromosome present in only one input).
            return None

    bw1 = pyBigWig.open(options.BigWig_File1)
    bw2 = pyBigWig.open(options.BigWig_File2)
    OUT = open(options.output_wig,'w')
    try:
        print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
        chrom_sizes = {}
        chrom_sizes.update(bw1.chroms())
        chrom_sizes.update(bw2.chroms())    # second file wins on conflicts

        for chr_name, chr_size in chrom_sizes.items():        #iterate each chrom
            print("Processing " + chr_name + " ...", file=sys.stderr)
            OUT.write('variableStep chrom='+chr_name+'\n')
            for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
                start, end = interval[1], interval[2]
                bw_signal1 = read_signal(bw1, chr_name, start, end)
                bw_signal2 = read_signal(bw2, chr_name, start, end)
                if bw_signal1 is None and bw_signal2 is None:
                    continue
                # A missing track contributes zeros so both arrays keep the
                # same length for the pairwise operation.
                if bw_signal1 is None:
                    bw_signal1 = numpy.zeros(end - start)
                if bw_signal2 is None:
                    bw_signal2 = numpy.zeros(end - start)
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                bw_signal1 = numpy.nan_to_num( bw_signal1 )
                bw_signal2 = numpy.nan_to_num( bw_signal2 )

                # coord is advanced before each value is written, so the
                # first value of a chunk is reported at start + 1.
                coord = start
                for v in call_back(bw_signal1,bw_signal2):
                    coord +=1
                    if v != 0 : print("%d\t%.2f" % (coord,v), file=OUT)
    finally:
        OUT.close()
        bw1.close()
        bw2.close()
def main():
    '''
    Convert a sorted, indexed BAM file into two variableStep wig files, one
    per strand ("<prefix>_Forward.wig" and "<prefix>_Reverse.wig").

    Each chromosome listed in the chromosome size file is walked in bins of
    -b bases; the alignments of every bin are handed to build_wig() together
    with the optional extension (-e) and the returned position->value tables
    are written out.  Chromosomes for which fetching alignments fails are
    reported and skipped.
    '''
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in BAM format. BAM file must be sorted and indexed using samTools. HowTo: http://genome.ucsc.edu/goldenPath/help/bam.html")
    parser.add_option("-r","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output wig files(s). \"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\" will be generated")
    parser.add_option("-b","--bin",action="store",type="int",dest="bin",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-e","--extension",action="store",type="int",dest="extension",default=None,help="Extended coverage from 5' end of read. default=%default (full read coverage will be used)")
    (options,args)=parser.parse_args()

    if not (options.output_prefix and options.input_file and options.chromSize):
        parser.print_help()
        sys.exit(0)
    for path in (options.input_file,options.chromSize):
        if not os.path.exists(path):
            print >>sys.stderr, '\n\n' + path + " does NOT exists" + '\n'
            sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        # The separating space before "does" was missing in the message.
        print >>sys.stderr, "index file " + options.input_file + '.bai' + " does not exists"
        sys.exit(0)

    def write_bin(handle, wig, bin_start, bin_end):
        '''Write the entries of *wig* falling in (bin_start, bin_end] in ascending position order.'''
        if len(wig) == 0:
            return
        for pos in xrange(bin_start + 1, bin_end + 1):
            if pos in wig:
                handle.write("%d\t%d\n" % (pos, wig[pos]))

    chrom_sizes = load_chromsize(options.chromSize)
    samfile = SAM.ParseBAM(options.input_file)

    FWOUT = open(options.output_prefix + "_Forward.wig",'w')
    RWOUT = open(options.output_prefix + "_Reverse.wig",'w')
    try:
        for chr_name, chr_size in chrom_sizes.items():        #iterate each chrom
            # Probe the whole chromosome first; a failure means the BAM has
            # nothing usable for it.
            try:
                samfile.fetchAlignments(chr_name,0,chr_size)
            except Exception:
                print >>sys.stderr, "No alignments for " + chr_name + '. skipped'
                continue
            print >>sys.stderr, "Processing " + chr_name + " ..."
            FWOUT.write('variableStep chrom='+chr_name+'\n')
            RWOUT.write('variableStep chrom='+chr_name+'\n')
            #cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
            for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.bin):
                alignedReads = samfile.fetchAlignments(interval[0],interval[1],interval[2])
                (Fwig,Rwig) = build_wig(alignedReads,options.extension)
                write_bin(FWOUT, Fwig, interval[1], interval[2])
                write_bin(RWOUT, Rwig, interval[1], interval[2])
    finally:
        # Flush and release both outputs even if a bin fails.
        FWOUT.close()
        RWOUT.close()
def main():
    '''
    Combine two BigWig files position by position and write the result as a
    variableStep wig file, keeping only values >= the -m threshold.

    The combining function is looked up by keyword (-a) on the twoList
    module.  A signal that cannot be read for a chunk is treated as zeros,
    the same way NaN positions are; a chunk is skipped when neither file
    yields a signal or when both signals consist of NaN only.
    '''
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-m","--min_signal",action="store",type="float",dest="min_score",default=0.0,help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Resolve the combining function once and fail early on a bad keyword.
    call_back = None
    if options.action:
        call_back = getattr(twoList, options.action, None)
    if not callable(call_back):
        print >>sys.stderr, "Unknown or missing action (-a): " + str(options.action)
        sys.exit(1)

    def read_signal(bw, chrom, start, end):
        '''Return the signal array of one chunk, or None when it cannot be read.'''
        try:
            return bw.get_as_array(chrom, start, end)
        except Exception:
            return None

    OUT=open(options.output_wig,'w')
    # BigWig is a binary format; open the inputs in binary mode.
    bw_handle1 = open(options.BigWig_File1,'rb')
    bw_handle2 = open(options.BigWig_File2,'rb')
    try:
        bw1 = BigWigFile( file=bw_handle1 )
        bw2 = BigWigFile( file=bw_handle2 )
        chrom_sizes = load_chromsize(options.chromSize)

        for chr_name, chr_size in chrom_sizes.items():        #iterate each chrom
            print >>sys.stderr, "Processing " + chr_name + " ..."
            OUT.write('variableStep chrom='+chr_name+'\n')
            for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
                start, end = interval[1], interval[2]
                bw_signal1 = read_signal(bw1, chr_name, start, end)
                bw_signal2 = read_signal(bw2, chr_name, start, end)
                if bw_signal1 is None and bw_signal2 is None:
                    continue
                # A track that is missing for this chunk contributes zeros so
                # both arrays keep the same length for the pairwise operation.
                if bw_signal1 is None:
                    bw_signal1 = numpy.zeros(end - start)
                if bw_signal2 is None:
                    bw_signal2 = numpy.zeros(end - start)
                if len(bw_signal1) == 0 and len(bw_signal2) == 0:
                    continue
                # Explicit all-NaN test: numpy.nansum() of an all-NaN array is
                # NaN only on old numpy releases and 0 on current ones, which
                # would let empty chunks through as rows of zeros.
                if numpy.isnan(bw_signal1).all() and numpy.isnan(bw_signal2).all():
                    continue
                bw_signal1 = numpy.nan_to_num( bw_signal1 )
                bw_signal2 = numpy.nan_to_num( bw_signal2 )

                # coord is advanced before each value is written, so the
                # first value of a chunk is reported at start + 1.
                coord = start
                for v in call_back(bw_signal1,bw_signal2):
                    coord +=1
                    if v >= options.min_score: print >>OUT, "%d\t%.2f" % (coord,v)
    finally:
        OUT.close()
        bw_handle1.close()
        bw_handle2.close()
def process_gene_model(gene_model):
    '''
    Split a BED gene model into feature classes and return, for each class,
    the structure built by build_bitsets() and the size from cal_size().

    Feature classes: CDS exons, introns, 5'UTR, 3'UTR and the regions
    returned by getIntergenic() for 1kb/5kb/10kb up and down.  Every class is
    passed through BED.unionBed3; UTRs have the CDS exons subtracted,
    introns have CDS exons and both UTRs subtracted, and each intergenic set
    has CDS exons, both UTRs and introns subtracted.

    Returns a 20-tuple: ranges for (cds, intron, utr5, utr3, up 1k/5k/10k,
    down 1k/5k/10k) followed by the sizes in the same order.
    '''
    print >>sys.stderr, "processing " + gene_model + ' ...',
    model = BED.ParseBED(gene_model)
    three_prime = model.getUTR(utr=3)
    five_prime = model.getUTR(utr=5)
    coding = model.getCDSExon()
    introns = model.getIntron()

    introns = BED.unionBed3(introns)
    coding = BED.unionBed3(coding)
    five_prime = BED.unionBed3(five_prime)
    three_prime = BED.unionBed3(three_prime)

    five_prime = BED.subtractBed3(five_prime, coding)
    three_prime = BED.subtractBed3(three_prime, coding)
    for masked in (coding, five_prime, three_prime):
        introns = BED.subtractBed3(introns, masked)

    flank_sizes = (1000, 5000, 10000)
    upstream = {}
    downstream = {}
    for flank in flank_sizes:
        upstream[flank] = model.getIntergenic(direction="up", size=flank)
        downstream[flank] = model.getIntergenic(direction="down", size=flank)

    #merge integenic region
    for flank in flank_sizes:
        upstream[flank] = BED.unionBed3(upstream[flank])
    for flank in flank_sizes:
        downstream[flank] = BED.unionBed3(downstream[flank])

    #purify intergenic region
    for flank in flank_sizes:
        for table in (upstream, downstream):
            for masked in (coding, five_prime, three_prime, introns):
                table[flank] = BED.subtractBed3(table[flank], masked)

    #build intervalTree
    cds_exon_ranges = build_bitsets(coding)
    utr_5_ranges = build_bitsets(five_prime)
    utr_3_ranges = build_bitsets(three_prime)
    intron_ranges = build_bitsets(introns)
    up_ranges = tuple([build_bitsets(upstream[flank]) for flank in flank_sizes])
    down_ranges = tuple([build_bitsets(downstream[flank]) for flank in flank_sizes])

    exon_size = cal_size(coding)
    intron_size = cal_size(introns)
    utr3_size = cal_size(three_prime)
    utr5_size = cal_size(five_prime)
    up_sizes = tuple([cal_size(upstream[flank]) for flank in flank_sizes])
    down_sizes = tuple([cal_size(downstream[flank]) for flank in flank_sizes])

    print >>sys.stderr, "Done"
    return ((cds_exon_ranges, intron_ranges, utr_5_ranges, utr_3_ranges)
            + up_ranges + down_ranges
            + (exon_size, intron_size, utr5_size, utr3_size)
            + up_sizes + down_sizes)
def main():
    '''
    Rescale a BigWig signal so that its total ("wigsum") equals -t and write
    the result as wig or bedGraph.

    The wigsum is computed either over the merged exons of the gene model
    given with -r, or over every chromosome of the chromosome size file that
    the BigWig file can serve.  Every value is multiplied by
    total_wigsum / wigsum; zero values are not written.  The program exits
    with status 1 when the output format is unknown or the wigsum is 0.
    '''
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before the (slow) wigsum pass instead of after it.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print >> sys.stderr, "unknown output format"
        sys.exit(1)

    OUT = open(options.output_wig, 'w')
    bw_handle = open(options.BigWig_File, 'rb')    # BigWig is binary
    try:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        def chrom_present(name):
            '''True when the BigWig file returns an array for the first base of *name*.'''
            try:
                bw.get_as_array(name, 0, 1).size
            except Exception:
                return False
            return True

        WIG_SUM = 0.0
        if (options.refgene_bed):
            print >> sys.stderr, "Extract exons from " + options.refgene_bed
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            print >> sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
            for chrom, st, end in exons:
                if not chrom_present(chrom):
                    continue
                bw_signal = bw.get_as_array(chrom, st, end)
                tmp = numpy.nansum(bw_signal)
                # Depending on the numpy version an all-NaN region sums to
                # NaN; never let that poison the running total.
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >> sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():    #iterate each chrom
                if not chrom_present(chr_name):
                    print >> sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >> sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >> sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Explicit test: dividing by a numpy float zero yields inf rather
        # than raising, so an exception handler cannot be relied on here.
        if WIG_SUM == 0:
            print >> sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >> sys.stderr, "Normalizing bigwig file ..."
        for chr_name, chr_size in chrom_sizes.items():    #iterate each chrom
            if not chrom_present(chr_name):
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue

            print >> sys.stderr, "Writing " + chr_name + " ..."
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    # wig positions are 1-based: advance before writing.
                    coord = interval[1]
                    for v in bw_signal:
                        coord += 1
                        if v != 0: print >> OUT, "%d\t%.2f" % (coord, v)
                else:
                    # bedGraph: one line per run of adjacent, equal, non-zero
                    # values, as 0-based half-open [start, start + length).
                    run_start = interval[1]
                    for value, run in groupby(bw_signal):
                        run_length = sum(1 for _ in run)
                        if value != 0:
                            print >> OUT, chr_name + '\t' + str(run_start) + '\t' + str(run_start + run_length) + '\t' + str(value)
                        run_start += run_length
    finally:
        OUT.close()
        bw_handle.close()
def main(): usage="%prog [options]" + '\n' + __doc__ + "\n" parser = OptionParser(usage,version="%prog " + __version__) parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format. BAM file should be sorted and indexed.") parser.add_option("-r","--genelist",action="store",type="string",dest="gene_list",help="Gene list in bed foramt. All reads hits to exon regions (defined by this gene list) will be saved into one BAM file, the remaining reads will saved into another BAM file.") parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. \"prefix.in.bam\" file contains reads mapped to the gene list specified by \"-r\", \"prefix.ex.bam\" contains reads that cannot mapped to gene list. \"prefix.junk.bam\" contains qcfailed or unmapped reads.") (options,args)=parser.parse_args() if not (options.input_file and options.gene_list): parser.print_help() sys.exit(0) if not os.path.exists(options.gene_list): print >>sys.stderr, '\n\n' + options.gene_list + " does NOT exists" + '\n' #parser.print_help() sys.exit(0) if not os.path.exists(options.input_file): print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' sys.exit(0) #build bitset for gene list print >>sys.stderr, 'reading ' + options.gene_list + ' ... 
', obj = BED.ParseBED(options.gene_list) exons = obj.getExon() exon_ranges = build_bitsets(exons) print >>sys.stderr, 'Done' samfile = pysam.Samfile(options.input_file,'rb') out1 = pysam.Samfile(options.output_prefix + '.in.bam','wb',template=samfile) #bam file containing reads hit to exon region out2 = pysam.Samfile(options.output_prefix + '.ex.bam','wb',template=samfile) #bam file containing reads not hit to exon region out3 = pysam.Samfile(options.output_prefix + '.junk.bam','wb',template=samfile) #bam file containing reads not hit to exon region total_alignment = 0 in_alignment = 0 ex_alignment = 0 bad_alignment = 0 print >>sys.stderr, "spliting " + options.input_file + " ...", try: while(1): aligned_read = samfile.next() total_alignment += 1 if aligned_read.is_qcfail: bad_alignment +=1 out3.write(aligned_read) continue if aligned_read.is_unmapped: bad_alignment +=1 out3.write(aligned_read) continue chrom = samfile.getrname(aligned_read.tid) chrom=chrom.upper() read_start = aligned_read.pos mate_start = aligned_read.mpos #read_exons = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar) if aligned_read.mate_is_unmapped: #only one end mapped if chrom not in exon_ranges: out2.write(aligned_read) ex_alignment += 1 continue else: if len(exon_ranges[chrom].find(read_start, read_start +1)) >= 1: out1.write(aligned_read) in_alignment += 1 continue elif len(exon_ranges[chrom].find(read_start, read_start +1)) == 0: out2.write(aligned_read) ex_alignment += 1 continue else: #both end mapped if chrom not in exon_ranges: out2.write(aligned_read) ex_alignment += 1 continue else: if (len(exon_ranges[chrom].find(read_start, read_start +1)) >= 1) or (len(exon_ranges[chrom].find(mate_start, mate_start +1)) >= 1): out1.write(aligned_read) in_alignment += 1 else: out2.write(aligned_read) ex_alignment += 1 except StopIteration: print >>sys.stderr, "Done" print "%-55s%d" % ("Total records:",total_alignment) print "%-55s%d" % (options.output_prefix + '.in.bam (Reads 
consumed by input gene list):',in_alignment) print "%-55s%d" % (options.output_prefix + '.ex.bam (Reads not consumed by input gene list):',ex_alignment) print "%-55s%d" % (options.output_prefix + '.junk.bam (qcfailed, unmapped reads):',bad_alignment)
def main():
    '''
    Rescale a BigWig signal so that its total ("wigsum") equals -t and write
    the result as a variableStep wig file.

    The wigsum is computed either over the merged exons of the gene model
    given with -r, or over every chromosome of the chromosome size file that
    the BigWig file can serve.  Every value is multiplied by
    total_wigsum / wigsum; zero values are not written.  The program exits
    with status 1 when the wigsum is 0.
    '''
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    bw_handle = open(options.BigWig_File, 'rb')    # BigWig is binary
    try:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        def chrom_present(name):
            '''True when the BigWig file returns an array for the first base of *name*.'''
            try:
                bw.get_as_array(name, 0, 1).size
            except Exception:
                return False
            return True

        WIG_SUM = 0.0
        if (options.refgene_bed):
            print >> sys.stderr, "Extract exons from " + options.refgene_bed
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            print >> sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
            for chrom, st, end in exons:
                if not chrom_present(chrom):
                    continue
                bw_signal = bw.get_as_array(chrom, st, end)
                tmp = numpy.nansum(bw_signal)
                # Depending on the numpy version an all-NaN region sums to
                # NaN; never let that poison the running total.
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >> sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():    #iterate each chrom
                if not chrom_present(chr_name):
                    print >> sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >> sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >> sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Explicit test: dividing by a numpy float zero yields inf rather
        # than raising, so an exception handler cannot be relied on here.
        if WIG_SUM == 0:
            print >> sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >> sys.stderr, "Normalizing bigwig file, output wiggle file"
        for chr_name, chr_size in chrom_sizes.items():    #iterate each chrom
            if not chrom_present(chr_name):
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue
            print >> sys.stderr, "Writing " + chr_name + " ..."
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                # wig positions are 1-based: coord is advanced before writing.
                coord = interval[1]
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal)
                for v in bw_signal:
                    coord += 1
                    if v != 0: print >> OUT, "%d\t%.4f" % (coord, v * weight)
    finally:
        OUT.close()
        bw_handle.close()
def main():
    '''
    Rescale a BigWig signal so that its total ("wigsum") equals -t and write
    the result as wig or bedGraph.

    The wigsum is computed either over the merged exons of the gene model
    given with -r, or over every chromosome of the chromosome size file that
    the BigWig file can serve.  Every value is multiplied by
    total_wigsum / wigsum; zero values are not written.  The program exits
    with status 1 when the output format is unknown or the wigsum is 0.
    '''
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile",action="store",type="string",dest="BigWig_File",help="Input BigWig file. [required]")
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file. [required]")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t","--wigsum",action="store",type="int",dest="total_wigsum",default=100000000,help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed format. [optional]")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=500000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f","--format",action="store",type="string",dest="out_format",default="bgr",help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options,args)=parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before the (slow) wigsum pass instead of after it.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print >>sys.stderr, "unknown output format"
        sys.exit(1)

    OUT=open(options.output_wig,'w')
    bw_handle = open(options.BigWig_File,'rb')    # BigWig is binary
    try:
        bw = BigWigFile( file=bw_handle )
        chrom_sizes = load_chromsize(options.chromSize)

        def chrom_present(name):
            '''True when the BigWig file returns an array for the first base of *name*.'''
            try:
                bw.get_as_array(name,0,1).size
            except Exception:
                return False
            return True

        WIG_SUM=0.0
        if (options.refgene_bed):
            print >>sys.stderr, "Extract exons from " + options.refgene_bed
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            print >>sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
            for chrom,st,end in exons:
                if not chrom_present(chrom):
                    continue
                bw_signal = bw.get_as_array(chrom,st,end)
                tmp = numpy.nansum(bw_signal)
                # Depending on the numpy version an all-NaN region sums to
                # NaN; never let that poison the running total.
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():        #iterate each chrom
                if not chrom_present(chr_name):
                    print >>sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >>sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
                    bw_signal = bw.get_as_array(interval[0],interval[1],interval[2])
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Explicit test: dividing by a numpy float zero yields inf rather
        # than raising, so an exception handler cannot be relied on here.
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum/WIG_SUM

        print >>sys.stderr, "Normalizing bigwig file ..."
        for chr_name, chr_size in chrom_sizes.items():        #iterate each chrom
            if not chrom_present(chr_name):
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue

            print >>sys.stderr, "Writing " + chr_name + " ..."
            if out_format == "WIG":
                OUT.write('variableStep chrom='+chr_name+'\n')
            for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
                bw_signal = bw.get_as_array(chr_name,interval[1],interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    # wig positions are 1-based: advance before writing.
                    coord = interval[1]
                    for v in bw_signal:
                        coord +=1
                        if v != 0: print >>OUT, "%d\t%.2f" % (coord,v)
                else:
                    # bedGraph: one line per run of adjacent, equal, non-zero
                    # values, as 0-based half-open [start, start + length).
                    run_start = interval[1]
                    for value, run in groupby(bw_signal):
                        run_length = sum(1 for _ in run)
                        if value != 0:
                            print >>OUT, chr_name + '\t' + str(run_start) +'\t' + str(run_start + run_length) + '\t' + str(value)
                        run_start += run_length
    finally:
        OUT.close()
        bw_handle.close()
def _iter_bedgraph_runs(values, offset):
    """Yield ``(start, end, value)`` for every run of equal, non-zero values.

    ``values`` is the per-base signal of one chunk and ``offset`` is the
    0-based coordinate of ``values[0]``.  Intervals are 0-based, half-open
    and are produced in ascending coordinate order, which is the row layout
    bedGraph expects.  Runs whose value is 0 are not reported.
    """
    start = offset
    for value, run in groupby(values):
        end = start + sum(1 for _ in run)
        if value != 0:
            yield start, end, value
        start = end


def main():
    """Rescale a BigWig file to a fixed total signal and write wig/bedGraph.

    The total signal ("wigsum") of the input is computed either over the
    merged exons of ``--refgene`` or, when no gene model is given, over every
    chromosome listed in the BigWig header (walked in ``--chunk`` sized
    tiles).  Every per-base value is then multiplied by
    ``--wigsum / observed wigsum`` and written to ``--output`` in the format
    selected by ``--format`` ("wig" or "bgr").

    Exits with status 0 after printing help when a required option is
    missing or when the input is not a BigWig file, and with status 1 when
    the output format is unknown or the observed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]"
    )
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size",
        default=500000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]"
    )
    parser.add_option(
        "-f", "--format", action="store", type="string", dest="out_format",
        default="bgr",
        help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default"
    )
    options, _ = parser.parse_args()

    if not (options.BigWig_File and options.output_wig):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before the (slow) wigsum pass and before the
    # output file is touched.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print("unknown output format", file=sys.stderr)
        sys.exit(1)

    bw = pyBigWig.open(options.BigWig_File)
    if not bw.isBigWig():
        print("%s is not a bigwig file!" % options.BigWig_File,
              file=sys.stderr)
        sys.exit(0)

    print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
    chrom_sizes = dict(bw.chroms())

    wig_sum = 0.0
    if options.refgene_bed:
        print("Extract exons from " + options.refgene_bed, file=sys.stderr)
        exons = BED.ParseBED(options.refgene_bed).getExon()
        print("Merge overlapping exons ...", file=sys.stderr)
        exons = BED.unionBed3(exons)
        print("Calculate wigsum covered by " + options.refgene_bed + ' only',
              file=sys.stderr)
        for chrom, st, end in exons:
            # pyBigWig raises on chromosomes missing from the file, so gene
            # models covering extra contigs must be skipped explicitly.
            if chrom not in chrom_sizes:
                continue
            if bw.stats(chrom, st, end)[0] is None:
                continue
            tmp = numpy.nansum(bw.values(chrom, st, end))
            # Some numpy versions return nan for an all-nan slice.
            if numpy.isnan(tmp):
                continue
            wig_sum += tmp
        print("Total wigsum is %.2f\n" % wig_sum, file=sys.stderr)
    else:
        print("Calculate wigsum from " + options.BigWig_File,
              file=sys.stderr)
        for chr_name, chr_size in chrom_sizes.items():
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue
            print("Processing " + chr_name + " ...", file=sys.stderr)
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                if bw.stats(interval[0], interval[1], interval[2])[0] is None:
                    continue
                tmp = numpy.nansum(
                    bw.values(interval[0], interval[1], interval[2]))
                if numpy.isnan(tmp):
                    continue
                wig_sum += tmp
        print("\nTotal wigsum is %.2f\n" % wig_sum, file=sys.stderr)

    # An explicit test is required: once numpy floats have been added,
    # dividing by zero yields inf instead of raising ZeroDivisionError.
    if wig_sum == 0:
        print("Error, WIG_SUM cannot be 0", file=sys.stderr)
        sys.exit(1)
    weight = options.total_wigsum / wig_sum

    print("Normalizing bigwig file ...", file=sys.stderr)
    with open(options.output_wig, 'w') as OUT:
        for chr_name, chr_size in chrom_sizes.items():
            if bw.stats(chr_name, 0, chr_size)[0] is None:
                print("Skip " + chr_name + "!", file=sys.stderr)
                continue

            print("Writing " + chr_name + " ...", file=sys.stderr)
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')

            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.values(chr_name, interval[1], interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    # wig positions are 1-based.
                    for coord, v in enumerate(bw_signal, interval[1] + 1):
                        if v != 0:
                            print("%d\t%.2f" % (coord, v), file=OUT)
                else:
                    for start, end, v in _iter_bedgraph_runs(bw_signal,
                                                             interval[1]):
                        print(chr_name + '\t' + str(start) + '\t' +
                              str(end) + '\t' + str(v), file=OUT)
    bw.close()
def _scan_strand_peaks(bw_obj, chr_name, chr_size, strand, options, chrom_sizes):
    """Collect every covered base of one chromosome/strand as a candidate peak.

    Walks the chromosome in ``options.chunk_size`` tiles and, for each base
    whose BigWig value is not nan, records its area (``sum_bwfile`` with
    ``options.extention_size``) and its height.

    Returns ``(peak_area, peak_height, tree)``.  Both dicts are keyed by
    ``"chrom\\tposition\\tstrand"`` with a 1-based position; ``tree`` is an
    IntervalTree holding one ``[position-1, position)`` interval per
    candidate with the area as its value.
    """
    peak_area = {}
    peak_height = {}
    tree = IntervalTree()
    progress = 0
    coord = 0   # running 1-based position; assumes the tiles are contiguous from 0
    for interval in BED.tillingBed(chrName = chr_name, chrSize = chr_size, stepSize = options.chunk_size):
        chunk = bw_obj.get_as_array(interval[0], interval[1], interval[2])
        if chunk is None:
            # Nothing iterable came back for this tile; there is no signal to scan.
            continue
        for val in chunk:
            coord += 1
            if numpy.isnan(val): continue
            area_value = sum_bwfile(chr_name, coord, options.extention_size, bw_obj, chrom_sizes)
            key = chr_name + "\t" + str(coord) + "\t" + strand
            peak_area[key] = area_value
            peak_height[key] = val
            # Every candidate is inserted, including the first one seen on
            # the chromosome (it used to be dropped when the tree was created).
            tree.insert_interval( Interval( coord-1, coord, value=area_value) )
        finish_part = int(interval[2]*100/chr_size)
        if finish_part > progress:
            print >>sys.stderr, " %d%% finished\r" % (finish_part),
            progress = finish_part
    return peak_area, peak_height, tree


def _write_significant_peaks(OUT, peak_area, peak_height, tree, options, pv_cutoff):
    """Score the candidates, merge nearby ones and write the survivors to OUT.

    Each candidate is scored with ``cal_poisson_pvalue`` against ``tree``;
    candidates are then merged with ``merge_peaks`` (``options.fuzzy_size``)
    and those scoring below ``pv_cutoff`` are discarded.  Output columns are
    chrom, start, end, area, rounded score, strand, height (tab separated).
    """
    peak_pvalue = {}
    for key in peak_area:
        pos = int(key.split("\t")[1])
        peak_pvalue[key] = cal_poisson_pvalue(int(peak_area[key]), pos-1, pos, tree, options.window_size, options.bg_root_num)

    for key, height in merge_peaks(peak_height, fuzziness=options.fuzzy_size).items():
        (chrom, end, strand) = key.split('\t')
        end = int(end)
        pvalue = peak_pvalue[key]
        if pvalue < pv_cutoff: continue
        print >>OUT, '\t'.join([chrom, str(end - 1), str(end), str(peak_area[key]), str(round(pvalue)), strand, str(height)])


def main():
    """Call single-nucleotide peaks from forward and reverse BigWig files.

    Forward-strand peaks are written first, then reverse-strand peaks, to
    ``<out-prefix>.single_nt_peak.xls``.  Exits with status 0 after printing
    help when a required option is missing, or after reporting a missing
    input file.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-b","--forward",action="store",type="string",dest="forward_bw",help="BigWig file for forward reads (extend 1 nt from 5' end of read)")
    parser.add_option("-d","--reverse",action="store",type="string",dest="reverse_bw",help="BigWig file for reverse reads (extend 1 nt from 5' end of read)")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files")
    parser.add_option("-z","--fuzziness",action="store",type="int",dest="fuzzy_size",default=10,help="Peaks within fuzzy window will be merged. default=%default (bp)")
    parser.add_option("-w","--bgw",action="store",type="int",dest="window_size",default=200,help="Background window size used to determine background signal level (lambda in Poisson model). default=%default (bp)")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-p","--pvalue",action="store",type="float",dest="pvalue_cutoff",default=0.1,help="Pvalue cutoff for peak detection. default=%default")
    parser.add_option("-r","--bg-root-num",action="store",type="float",dest="bg_root_num",default=100,help="Background peak root number. default=%default")
    parser.add_option("-e","--extention",action="store",type="int",dest="extention_size",default=5,help="Window size used to calculate peak area. Larger number will signficantly reduce speed, and make peak calling more meaningless. default=%default")
    (options,args)=parser.parse_args()

    if not (options.output_prefix and options.chromSize and options.forward_bw and options.reverse_bw):
        parser.print_help()
        sys.exit(0)
    for path in (options.chromSize,options.forward_bw,options.reverse_bw):
        if not os.path.exists(path):
            print >>sys.stderr, '\n\n' + path + " does NOT exists" + '\n'
            sys.exit(0)

    chrom_sizes = load_chromsize(options.chromSize)
    # The cutoff is moved onto a -10*log10 scale before being compared with
    # the scores; presumably cal_poisson_pvalue reports on that scale -- TODO confirm.
    pv_cutoff = -10*math.log10(options.pvalue_cutoff)

    with open(options.forward_bw) as fw_handle, open(options.reverse_bw) as rv_handle, open(options.output_prefix + ".single_nt_peak.xls",'w') as OUT:
        fw_bw_obj = BigWigFile( file = fw_handle)
        rv_bw_obj = BigWigFile( file = rv_handle)

        signal.signal(signal.SIGINT, signal_handler)
        print >>sys.stderr, logo

        #calculate peak height and peak area for forward bigwig
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.forward_bw + ' ...'
        for chr_name, chr_size in chrom_sizes.items():
            # NOTE(review): only chrY is processed. This looks like a debugging
            # leftover; it is kept so that results do not change silently.
            if chr_name != 'chrY': continue
            print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
            peak_area, peak_height, tree = _scan_strand_peaks(fw_bw_obj, chr_name, chr_size, '+', options, chrom_sizes)
            print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.forward_bw + ' ...'
            _write_significant_peaks(OUT, peak_area, peak_height, tree, options, pv_cutoff)

        #calculate peak height and peak area for reverse bigwig
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.reverse_bw + ' ...'
        for chr_name, chr_size in chrom_sizes.items():
            # NOTE(review): same chrY-only restriction as the forward pass.
            if chr_name != 'chrY': continue
            print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
            peak_area, peak_height, tree = _scan_strand_peaks(rv_bw_obj, chr_name, chr_size, '-', options, chrom_sizes)
            print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.reverse_bw + ' ... '
            _write_significant_peaks(OUT, peak_area, peak_height, tree, options, pv_cutoff)
def _purified_intergenic(obj, direction, size, genic_parts):
    """Return merged intergenic regions with every genic part subtracted.

    ``obj.getIntergenic(direction, size)`` is merged with ``BED.unionBed3``
    and each element of ``genic_parts`` is then removed, in order, with
    ``BED.subtractBed3``.
    """
    regions = BED.unionBed3(obj.getIntergenic(direction=direction,size=size))
    for part in genic_parts:
        regions = BED.subtractBed3(regions,part)
    return regions


def _index_read_midpoints(file_obj):
    """Read SAM lines and index the midpoint of every aligned block.

    Header lines, unmapped reads (flag 0x4) and reads that do not match
    ``SAM.ParseSAM._uniqueHit_pat`` are skipped.

    Returns ``(ranges, totalReads, multiMapReads, spliceReads, cUR)`` where
    ``ranges`` maps the upper-cased chromosome name to an Intersecter holding
    one zero-length interval per block midpoint and ``cUR`` is the number of
    blocks ("fragments") seen.
    """
    ranges={}
    totalReads=0
    spliceReads=0
    cUR=0
    multiMapReads=0
    for line in file_obj:
        if line.startswith("@"):continue
        fields=line.rstrip('\n ').split()
        flagCode=int(fields[1])
        if (flagCode & 0x0004) != 0: continue       #skip unmap reads
        totalReads +=1
        if not SAM.ParseSAM._uniqueHit_pat.search(line):    #skip multiple mapped reads
            multiMapReads +=1
            continue

        chrom = fields[2].upper()
        chromStart = int(fields[3])-1
        comb=[int(i) for i in SAM.ParseSAM._splicedHit_pat.findall(fields[5])]  #"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8']
        cUR += (len(comb) +1)//2
        if(len(comb)>1):
            spliceReads += 1

        # Even slots of comb are block sizes, odd slots are the gaps between them.
        for i in range(0,len(comb),2):
            st = chromStart + sum(comb[:i])
            mid = st + comb[i]//2
            if chrom not in ranges:
                ranges[chrom] = Intersecter()
            # Always add the midpoint: the first block of each chromosome
            # used to be lost when the Intersecter was created.
            ranges[chrom].add_interval( Interval( mid, mid ) )
    return ranges, totalReads, multiMapReads, spliceReads, cUR


def main():
    """Report how uniquely mapped reads distribute over gene-model features.

    Builds non-overlapping CDS, 5'UTR, 3'UTR, intron and 1/5/10 kb
    upstream/downstream region sets from ``--refgene``, indexes block
    midpoints of the reads in ``--input-file`` (a SAM file, or "-" for
    standard input) and prints total bases, read count and reads per kb for
    each region set to stderr.  A region set with 0 bases is reported with a
    density of 0.00.

    Exits with status 0 after printing help when a required option is
    missing, or after reporting a missing file.
    """
    usage="%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in SAM format. Use \"-\" represents standard input [required]")
    parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. [required]")
    (options,args)=parser.parse_args()

    if not (options.input_file and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.ref_gene_model):
        print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
        sys.exit(0)
    if os.path.exists(options.input_file):
        file_obj=open(options.input_file)
    elif options.input_file == '-':
        file_obj=sys.stdin
    else:
        print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    try:
        print >>sys.stderr, "processing " + options.ref_gene_model + ' ...',
        obj = BED.ParseBED(options.ref_gene_model)
        utr_3 = obj.getUTR(utr=3)
        utr_5 = obj.getUTR(utr=5)
        cds_exon = obj.getCDSExon()
        intron = obj.getIntron()

        intron = BED.unionBed3(intron)
        cds_exon=BED.unionBed3(cds_exon)
        utr_5 = BED.unionBed3(utr_5)
        utr_3 = BED.unionBed3(utr_3)

        # Priority when features overlap: CDS > UTR > intron > intergenic.
        utr_5 = BED.subtractBed3(utr_5,cds_exon)
        utr_3 = BED.subtractBed3(utr_3,cds_exon)
        for part in (cds_exon, utr_5, utr_3):
            intron = BED.subtractBed3(intron,part)

        genic_parts = (cds_exon, utr_5, utr_3, intron)
        intergenic = {}
        for size in (1000, 5000, 10000):
            for direction in ("up", "down"):
                intergenic[(direction, size)] = _purified_intergenic(obj, direction, size, genic_parts)
        print >>sys.stderr, "Done"

        print >>sys.stderr, "reading SAM file",
        (ranges, totalReads, multiMapReads, spliceReads, cUR) = _index_read_midpoints(file_obj)
        print >>sys.stderr, "Done"
    finally:
        if file_obj is not sys.stdin:
            file_obj.close()

    print >>sys.stderr, "Total Reads: " + str(totalReads)
    print >>sys.stderr, "Multiple Hits: " + str(multiMapReads)
    print >>sys.stderr, "Unique Hits: " + str(totalReads-multiMapReads)
    print >>sys.stderr, "Spliced Hits: " + str(spliceReads)
    print >>sys.stderr, "Total fragments: " + str(cUR)

    print >>sys.stderr, "\nAssignning reads ...",
    regions = {
        "CDS Exons:": cds_exon,
        "5'UTR Exons:": utr_5,
        "3'UTR Exons:": utr_3,
        "Intronic region:": intron,
        "TSS up 1kb:": intergenic[("up", 1000)],
        "TSS up 5kb:": intergenic[("up", 5000)],
        "TSS up 10kb:": intergenic[("up", 10000)],
        "TES down 1kb:": intergenic[("down", 1000)],
        "TES down 5kb:": intergenic[("down", 5000)],
        "TES down 10kb:": intergenic[("down", 10000)],
    }
    # base_read is called in the same order as before the refactoring.
    count_order = ("Intronic region:", "CDS Exons:", "5'UTR Exons:", "3'UTR Exons:",
                   "TSS up 1kb:", "TSS up 5kb:", "TSS up 10kb:",
                   "TES down 1kb:", "TES down 5kb:", "TES down 10kb:")
    report_order = ("CDS Exons:", "5'UTR Exons:", "3'UTR Exons:", "Intronic region:",
                    "TSS up 1kb:", "TSS up 5kb:", "TSS up 10kb:",
                    "TES down 1kb:", "TES down 5kb:", "TES down 10kb:")
    counts = {}
    for label in count_order:
        counts[label] = base_read(regions[label],ranges)
    print >>sys.stderr, "Done"

    print >>sys.stderr, "========================================================="
    print >>sys.stderr, "Group\tTotal_bases\tReads_count\tReads/Kb"
    for label in report_order:
        (bases, reads) = counts[label]
        # An empty region set must not abort the report with ZeroDivisionError.
        density = reads*1000.0/bases if bases else 0.0
        print >>sys.stderr, "%s\t%d\t%d\t%5.2f" % (label, bases, reads, density)
    print >>sys.stderr, "========================================================="
def _chrom_in_bigwig(bw, chr_name):
    """Return True when a 1-base probe of ``chr_name`` yields an array.

    The probe fails (and False is returned) when ``get_as_array`` raises or
    returns something without a ``size`` attribute; presumably that is how a
    chromosome missing from the BigWig shows up -- TODO confirm.
    """
    try:
        bw.get_as_array(chr_name, 0, 1).size
    except Exception:
        return False
    return True


def main():
    """Rescale a BigWig file to a fixed total signal and write a wiggle file.

    The total signal ("wigsum") is computed either over the merged exons of
    ``--refgene`` or, without a gene model, over every chromosome of the
    ``--chromSize`` file (walked in ``--chunk`` sized tiles).  Every non-zero
    per-base value is then multiplied by ``--wigsum / observed wigsum`` and
    written as variableStep wiggle to ``--output``.

    Exits with status 0 after printing help when a required option is
    missing and with status 1 when the observed wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File",
        help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig",
        help="Output wig file. [required]"
    )
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]",
    )
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size",
        default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    with open(options.BigWig_File) as bw_handle:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        WIG_SUM = 0.0
        if options.refgene_bed:
            print >>sys.stderr, "Extract exons from " + options.refgene_bed
            exons = BED.ParseBED(options.refgene_bed).getExon()
            print >>sys.stderr, "Merge overlapping exons ..."
            exons = BED.unionBed3(exons)
            print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + " only"
            for chrom, st, end in exons:
                if not _chrom_in_bigwig(bw, chrom):
                    continue
                tmp = numpy.nansum(bw.get_as_array(chrom, st, end))
                # Some numpy versions return nan for an all-nan slice.
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
        else:
            print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
            for chr_name, chr_size in chrom_sizes.items():
                if not _chrom_in_bigwig(bw, chr_name):
                    print >>sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >>sys.stderr, "Processing " + chr_name + " ..."
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    tmp = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

        # Tested explicitly: a numpy-float zero divides to inf instead of
        # raising, and the former handler crashed on a misspelled sys.exit.
        if WIG_SUM == 0:
            print >>sys.stderr, "Error, WIG_SUM cannot be 0"
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        print >>sys.stderr, "Normalizing bigwig file, output wiggle file"
        with open(options.output_wig, "w") as OUT:
            for chr_name, chr_size in chrom_sizes.items():
                if not _chrom_in_bigwig(bw, chr_name):
                    print >>sys.stderr, "Skip " + chr_name + "!"
                    continue
                print >>sys.stderr, "Writing " + chr_name + " ..."
                OUT.write("variableStep chrom=" + chr_name + "\n")
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                    if numpy.isnan(numpy.nansum(bw_signal)):
                        continue
                    bw_signal = numpy.nan_to_num(bw_signal)
                    # wig positions are 1-based.
                    for coord, v in enumerate(bw_signal, interval[1] + 1):
                        if v != 0:
                            print >>OUT, "%d\t%.4f" % (coord, v * weight)