def union_exons(refbed):
    """Merge every exon listed in *refbed* and index the merged set.

    The exons come from ``BED.ParseBED(refbed).getExon()``, are collapsed
    with ``BED.unionBed3`` and then handed to ``build_bitsets``; whatever
    ``build_bitsets`` returns is passed back to the caller unchanged.
    """
    from qcmodule import BED

    merged_exons = BED.unionBed3(BED.ParseBED(refbed).getExon())
    return build_bitsets(merged_exons)
def process_gene_model(gene_model):
    """Partition the gene model file *gene_model* into feature classes.

    CDS exons, 5' UTRs, 3' UTRs and introns are extracted, merged with
    ``BED.unionBed3`` and made mutually exclusive with ``BED.subtractBed3``
    (CDS exons take priority, then the UTRs, then introns).  Flanking
    regions returned by ``getIntergenic`` for 1 kb, 5 kb and 10 kb in both
    the "up" and "down" direction are merged and stripped of everything
    that belongs to one of the four genic classes.

    Returns a 20-tuple:
      * the ``build_bitsets`` result for CDS exons, introns, 5' UTRs and
        3' UTRs, followed by up 1/5/10 kb and down 1/5/10 kb;
      * the ``cal_size`` result for the same ten classes in the same order.

    Progress is reported on stderr.
    """
    print >>sys.stderr, "processing " + gene_model + ' ...',
    bed = BED.ParseBED(gene_model)
    three_prime = bed.getUTR(utr=3)
    five_prime = bed.getUTR(utr=5)
    coding = bed.getCDSExon()
    introns = bed.getIntron()

    # Merge overlapping records inside each genic class.
    introns = BED.unionBed3(introns)
    coding = BED.unionBed3(coding)
    five_prime = BED.unionBed3(five_prime)
    three_prime = BED.unionBed3(three_prime)

    # Make the genic classes mutually exclusive.
    five_prime = BED.subtractBed3(five_prime, coding)
    three_prime = BED.subtractBed3(three_prime, coding)
    for mask in (coding, five_prime, three_prime):
        introns = BED.subtractBed3(introns, mask)

    flank_widths = (1000, 5000, 10000)
    sides = ("up", "down")

    # Flanking regions, keyed by (direction, width).
    flanks = {}
    for width in flank_widths:
        for side in sides:
            flanks[side, width] = bed.getIntergenic(direction=side, size=width)

    # Merge the intergenic regions.
    for side in sides:
        for width in flank_widths:
            flanks[side, width] = BED.unionBed3(flanks[side, width])

    # Purify the intergenic regions: drop anything that is genic.
    genic_masks = (coding, five_prime, three_prime, introns)
    for width in flank_widths:
        for side in sides:
            region = flanks[side, width]
            for mask in genic_masks:
                region = BED.subtractBed3(region, mask)
            flanks[side, width] = region

    # Order in which the flanking classes appear in the returned tuple.
    flank_order = [(side, width) for side in sides for width in flank_widths]

    # Build the lookup structures.
    coding_ranges = build_bitsets(coding)
    five_prime_ranges = build_bitsets(five_prime)
    three_prime_ranges = build_bitsets(three_prime)
    intron_ranges = build_bitsets(introns)
    flank_ranges = tuple(build_bitsets(flanks[key]) for key in flank_order)

    coding_size = cal_size(coding)
    intron_size = cal_size(introns)
    three_prime_size = cal_size(three_prime)
    five_prime_size = cal_size(five_prime)
    flank_sizes = tuple(cal_size(flanks[key]) for key in flank_order)

    print >>sys.stderr, "Done"
    return (
        (coding_ranges, intron_ranges, five_prime_ranges, three_prime_ranges)
        + flank_ranges
        + (coding_size, intron_size, five_prime_size, three_prime_size)
        + flank_sizes
    )
def main():
    """Rescale a BigWig track to a fixed total signal and write wig/bedGraph.

    Command-line entry point (optparse).  The total signal ("wigsum") is
    measured either over the merged exons of the optional reference gene
    model (``-r``) or over every chromosome listed in the chromosome size
    file (``-s``).  Every value is then multiplied by
    ``total_wigsum / measured_wigsum`` and written as variableStep wig
    (``-f wig``) or as bedGraph (``-f bgr``).

    Exit status:
      * 0 after printing the help text when a required option is missing;
      * 1 when the output format is unknown or the measured wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format up front, before the output file is truncated and
    # before the (slow) wigsum pass runs.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    def log(message):
        """Write one progress line to stderr."""
        sys.stderr.write(message + "\n")

    # BigWig is a binary format, so the handle is opened in binary mode.
    # Both files are closed on every exit path, including sys.exit().
    with open(options.output_wig, 'w') as out, \
            open(options.BigWig_File, 'rb') as bw_handle:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        probed = {}  # chromosome name -> True/False, filled lazily

        def has_signal(chrom):
            """Tell whether *chrom* can be read from the BigWig file.

            Any failure while fetching the first base marks the chromosome
            as unreadable; it is then skipped instead of aborting the run.
            """
            if chrom not in probed:
                try:
                    bw.get_as_array(chrom, 0, 1).size
                    probed[chrom] = True
                except Exception:
                    probed[chrom] = False
            return probed[chrom]

        wig_sum = 0.0
        if options.refgene_bed:
            log("Extract exons from " + options.refgene_bed)
            exons = BED.ParseBED(options.refgene_bed).getExon()
            log("Merge overlapping exons ...")
            exons = BED.unionBed3(exons)
            log("Calculate wigsum covered by " + options.refgene_bed + ' only')
            for chrom, st, end in exons:
                if not has_signal(chrom):
                    continue
                part = numpy.nansum(bw.get_as_array(chrom, st, end))
                # nansum ignores NaN; the isnan guard covers NumPy versions
                # that return NaN (not 0) when every item is NaN.
                if numpy.isnan(part):
                    continue
                wig_sum += part
            log("Total wigsum is %.2f\n" % wig_sum)
        else:
            log("Calculate wigsum from " + options.BigWig_File)
            for chr_name, chr_size in chrom_sizes.items():
                if not has_signal(chr_name):
                    log("Skip " + chr_name + "!")
                    continue
                log("Processing " + chr_name + " ...")
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    part = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                    if numpy.isnan(part):
                        continue
                    wig_sum += part
            log("\nTotal wigsum is %.2f\n" % wig_sum)

        # Explicit test: dividing by a NumPy zero yields inf instead of
        # raising, so a try/except around the division is not reliable.
        if wig_sum == 0:
            log("Error, WIG_SUM cannot be 0")
            sys.exit(1)
        weight = options.total_wigsum / wig_sum

        log("Normalizing bigwig file ...")
        for chr_name, chr_size in chrom_sizes.items():
            if not has_signal(chr_name):
                log("Skip " + chr_name + "!")
                continue
            log("Writing " + chr_name + " ...")
            if out_format == "WIG":
                out.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                chunk_start = interval[1]
                bw_signal = bw.get_as_array(chr_name, chunk_start, interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                scaled = numpy.nan_to_num(bw_signal) * weight
                if out_format == "WIG":
                    # wig coordinates are 1-based
                    for coord, value in enumerate(scaled, chunk_start + 1):
                        if value != 0:
                            out.write("%d\t%.2f\n" % (coord, value))
                else:
                    # bedGraph: one line per run of identical non-zero
                    # values, 0-based half-open; runs never span two chunks.
                    pos = chunk_start
                    for value, run in groupby(scaled):
                        run_len = sum(1 for _ in run)
                        if value != 0:
                            out.write(chr_name + '\t' + str(pos) + '\t' + str(pos + run_len) + '\t' + str(value) + '\n')
                        pos += run_len
def main():
    """Split an alignment file into reads inside / outside a gene list.

    Command-line entry point (optparse).  Three BAM files are written:
      * ``<prefix>.in.bam``   - records whose start position (or whose
        mate's start position, when the mate is mapped) falls in an exon
        of the gene list;
      * ``<prefix>.ex.bam``   - all other mapped records;
      * ``<prefix>.junk.bam`` - QC-failed or unmapped records.

    A summary of the record counts is printed on stdout.  The help text is
    printed and the program exits with status 0 when ``-i``, ``-r`` or
    ``-o`` is missing; it also exits with status 0 when an input file does
    not exist.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Alignment file in BAM or SAM format. BAM file should be sorted and indexed.")
    parser.add_option("-r", "--genelist", action="store", type="string", dest="gene_list", help="Gene list in bed foramt. All reads hits to exon regions (defined by this gene list) will be saved into one BAM file, the remaining reads will saved into another BAM file.")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output BAM files. \"prefix.in.bam\" file contains reads mapped to the gene list specified by \"-r\", \"prefix.ex.bam\" contains reads that cannot mapped to gene list. \"prefix.junk.bam\" contains qcfailed or unmapped reads.")
    (options, args) = parser.parse_args()

    # The output prefix is required as well: without it the file names
    # below cannot be built (None + '.in.bam' raises TypeError).
    if not (options.input_file and options.gene_list and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.gene_list):
        sys.stderr.write('\n\n' + options.gene_list + " does NOT exists" + '\n\n')
        sys.exit(0)
    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" + '\n\n')
        sys.exit(0)

    # Build the exon lookup for the gene list.
    sys.stderr.write('reading ' + options.gene_list + ' ... ')
    exon_ranges = build_bitsets(BED.ParseBED(options.gene_list).getExon())
    sys.stderr.write(' Done\n')

    total_alignment = 0
    in_alignment = 0
    ex_alignment = 0
    bad_alignment = 0

    opened = []  # every alignment handle that must be closed at the end
    try:
        samfile = pysam.Samfile(options.input_file, 'rb')
        opened.append(samfile)
        # records that hit an exon region
        out1 = pysam.Samfile(options.output_prefix + '.in.bam', 'wb', template=samfile)
        opened.append(out1)
        # records that do not hit an exon region
        out2 = pysam.Samfile(options.output_prefix + '.ex.bam', 'wb', template=samfile)
        opened.append(out2)
        # QC-failed or unmapped records
        out3 = pysam.Samfile(options.output_prefix + '.junk.bam', 'wb', template=samfile)
        opened.append(out3)

        sys.stderr.write("spliting " + options.input_file + " ...")
        for aligned_read in samfile:
            total_alignment += 1
            if aligned_read.is_qcfail or aligned_read.is_unmapped:
                bad_alignment += 1
                out3.write(aligned_read)
                continue

            chrom = samfile.getrname(aligned_read.tid).upper()
            read_start = aligned_read.pos
            mate_start = aligned_read.mpos

            hit = False
            if chrom in exon_ranges:
                exon_index = exon_ranges[chrom]
                hit = len(exon_index.find(read_start, read_start + 1)) >= 1
                # NOTE(review): the mate is looked up on the read's own
                # chromosome, as in the original code - confirm this is
                # intended for pairs mapped to different chromosomes.
                if not hit and not aligned_read.mate_is_unmapped:
                    hit = len(exon_index.find(mate_start, mate_start + 1)) >= 1

            if hit:
                out1.write(aligned_read)
                in_alignment += 1
            else:
                out2.write(aligned_read)
                ex_alignment += 1
        sys.stderr.write(" Done\n")
    finally:
        # Closing flushes the BAM writers; do it even if the loop fails.
        for handle in opened:
            handle.close()

    sys.stdout.write("%-55s%d\n" % ("Total records:", total_alignment))
    sys.stdout.write("%-55s%d\n" % (options.output_prefix + '.in.bam (Reads consumed by input gene list):', in_alignment))
    sys.stdout.write("%-55s%d\n" % (options.output_prefix + '.ex.bam (Reads not consumed by input gene list):', ex_alignment))
    sys.stdout.write("%-55s%d\n" % (options.output_prefix + '.junk.bam (qcfailed, unmapped reads):', bad_alignment))
def main():
    """Rescale a BigWig track to a fixed total signal and write a wig file.

    Command-line entry point (optparse).  The total signal ("wigsum") is
    measured either over the merged exons of the optional reference gene
    model (``-r``) or over every chromosome listed in the chromosome size
    file (``-s``).  Every non-zero value is then multiplied by
    ``total_wigsum / measured_wigsum`` and written in variableStep wig
    format.

    Exit status:
      * 0 after printing the help text when a required option is missing;
      * 1 when the measured wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    def log(message):
        """Write one progress line to stderr."""
        sys.stderr.write(message + "\n")

    # BigWig is a binary format, so the handle is opened in binary mode.
    # Both files are closed on every exit path, including sys.exit().
    with open(options.output_wig, 'w') as out, \
            open(options.BigWig_File, 'rb') as bw_handle:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        probed = {}  # chromosome name -> True/False, filled lazily

        def has_signal(chrom):
            """Tell whether *chrom* can be read from the BigWig file.

            Any failure while fetching the first base marks the chromosome
            as unreadable; it is then skipped instead of aborting the run.
            """
            if chrom not in probed:
                try:
                    bw.get_as_array(chrom, 0, 1).size
                    probed[chrom] = True
                except Exception:
                    probed[chrom] = False
            return probed[chrom]

        wig_sum = 0.0
        if options.refgene_bed:
            log("Extract exons from " + options.refgene_bed)
            exons = BED.ParseBED(options.refgene_bed).getExon()
            log("Merge overlapping exons ...")
            exons = BED.unionBed3(exons)
            log("Calculate wigsum covered by " + options.refgene_bed + ' only')
            for chrom, st, end in exons:
                if not has_signal(chrom):
                    continue
                part = numpy.nansum(bw.get_as_array(chrom, st, end))
                # nansum ignores NaN; the isnan guard covers NumPy versions
                # that return NaN (not 0) when every item is NaN.
                if numpy.isnan(part):
                    continue
                wig_sum += part
            log("Total wigsum is %.2f\n" % wig_sum)
        else:
            log("Calculate wigsum from " + options.BigWig_File)
            for chr_name, chr_size in chrom_sizes.items():
                if not has_signal(chr_name):
                    log("Skip " + chr_name + "!")
                    continue
                log("Processing " + chr_name + " ...")
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    part = numpy.nansum(bw.get_as_array(interval[0], interval[1], interval[2]))
                    if numpy.isnan(part):
                        continue
                    wig_sum += part
            log("\nTotal wigsum is %.2f\n" % wig_sum)

        # Explicit test: dividing by a NumPy zero yields inf instead of
        # raising, so a try/except around the division is not reliable.
        if wig_sum == 0:
            log("Error, WIG_SUM cannot be 0")
            sys.exit(1)
        weight = options.total_wigsum / wig_sum

        log("Normalizing bigwig file, output wiggle file")
        for chr_name, chr_size in chrom_sizes.items():
            if not has_signal(chr_name):
                log("Skip " + chr_name + "!")
                continue
            log("Writing " + chr_name + " ...")
            out.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                chunk_start = interval[1]
                bw_signal = bw.get_as_array(chr_name, chunk_start, interval[2])
                if numpy.isnan(numpy.nansum(bw_signal)):
                    continue
                # wig coordinates are 1-based; zero positions are omitted
                for coord, value in enumerate(numpy.nan_to_num(bw_signal), chunk_start + 1):
                    if value != 0:
                        out.write("%d\t%.4f\n" % (coord, value * weight))
def main():
    """Rescale a BigWig track to a fixed total signal and write wig/bedGraph.

    Command-line entry point (optparse), reading the input with pyBigWig.
    Chromosome sizes come from the BigWig header.  The total signal
    ("wigsum") is measured either over the merged exons of the optional
    reference gene model (``-r``) or over every chromosome in the header.
    Every value is then multiplied by ``total_wigsum / measured_wigsum``
    and written as variableStep wig (``-f wig``) or bedGraph (``-f bgr``).

    Exit status:
      * 0 after printing the help text when a required option is missing,
        or when the input is not a BigWig file;
      * 1 when the output format is unknown or the measured wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format up front, before the output file is truncated and
    # before the (slow) wigsum pass runs.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        print("unknown output format", file=sys.stderr)
        sys.exit(1)

    with open(options.output_wig, 'w') as out:
        bw = pyBigWig.open(options.BigWig_File)
        try:
            if not bw.isBigWig():
                print("%s is not a bigwig file!" % options.BigWig_File, file=sys.stderr)
                sys.exit(0)

            print("Get chromosome sizes from BigWig header ...", file=sys.stderr)
            chrom_sizes = dict(bw.chroms())

            wig_sum = 0.0
            if options.refgene_bed:
                print("Extract exons from " + options.refgene_bed, file=sys.stderr)
                exons = BED.ParseBED(options.refgene_bed).getExon()
                print("Merge overlapping exons ...", file=sys.stderr)
                exons = BED.unionBed3(exons)
                print("Calculate wigsum covered by " + options.refgene_bed + ' only', file=sys.stderr)
                for chrom, st, end in exons:
                    # Exons on chromosomes the BigWig header does not list
                    # carry no signal; skip them rather than query them.
                    if chrom not in chrom_sizes:
                        continue
                    if bw.stats(chrom, st, end)[0] is None:
                        continue
                    part = numpy.nansum(bw.values(chrom, st, end))
                    # nansum ignores NaN; the isnan guard covers NumPy
                    # versions that return NaN when every item is NaN.
                    if numpy.isnan(part):
                        continue
                    wig_sum += part
                print("Total wigsum is %.2f\n" % wig_sum, file=sys.stderr)
            else:
                print("Calculate wigsum from " + options.BigWig_File, file=sys.stderr)
                for chr_name, chr_size in chrom_sizes.items():
                    if bw.stats(chr_name, 0, chr_size)[0] is None:
                        print("Skip " + chr_name + "!", file=sys.stderr)
                        continue
                    print("Processing " + chr_name + " ...", file=sys.stderr)
                    for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                        if bw.stats(interval[0], interval[1], interval[2])[0] is None:
                            continue
                        part = numpy.nansum(bw.values(interval[0], interval[1], interval[2]))
                        if numpy.isnan(part):
                            continue
                        wig_sum += part
                print("\nTotal wigsum is %.2f\n" % wig_sum, file=sys.stderr)

            # Explicit test: dividing by a NumPy zero yields inf instead of
            # raising, so a try/except around the division is not reliable.
            if wig_sum == 0:
                print("Error, WIG_SUM cannot be 0", file=sys.stderr)
                sys.exit(1)
            weight = options.total_wigsum / wig_sum

            print("Normalizing bigwig file ...", file=sys.stderr)
            for chr_name, chr_size in chrom_sizes.items():
                if bw.stats(chr_name, 0, chr_size)[0] is None:
                    print("Skip " + chr_name + "!", file=sys.stderr)
                    continue
                print("Writing " + chr_name + " ...", file=sys.stderr)
                if out_format == "WIG":
                    out.write('variableStep chrom=' + chr_name + '\n')
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                    chunk_start = interval[1]
                    bw_signal = bw.values(chr_name, chunk_start, interval[2])
                    if numpy.isnan(numpy.nansum(bw_signal)):
                        continue
                    scaled = numpy.nan_to_num(bw_signal) * weight
                    if out_format == "WIG":
                        # wig coordinates are 1-based
                        for coord, value in enumerate(scaled, chunk_start + 1):
                            if value != 0:
                                out.write("%d\t%.2f\n" % (coord, value))
                    else:
                        # bedGraph: one line per run of identical non-zero
                        # values, 0-based half-open; runs never span chunks.
                        pos = chunk_start
                        for value, run in groupby(scaled):
                            run_len = sum(1 for _ in run)
                            if value != 0:
                                out.write(chr_name + '\t' + str(pos) + '\t' + str(pos + run_len) + '\t' + str(value) + '\n')
                            pos += run_len
        finally:
            bw.close()