def main():
    """Combine two BigWig files position by position and write a wiggle file.

    Both BigWig files are read chunk by chunk (``--chunk`` bases at a time)
    for every chromosome listed in the chromosome-size file.  The function
    named by ``--action`` is looked up on ``twoList`` and applied to the two
    signal arrays; every non-zero result is written as a 1-based
    ``variableStep`` wiggle record.

    Prints the help text and exits when a required option is missing.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="BigWig files")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="BigWig files")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options,args)=parser.parse_args()

    # --action is required as well: without it getattr() below would fail
    # with an unhelpful TypeError.
    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig
            and options.chromSize and options.action):
        parser.print_help()
        sys.exit(0)

    # Resolve the operation once, before any file is created, instead of
    # once per chunk.
    call_back = getattr(twoList, options.action, None)
    if call_back is None:
        parser.error("unknown action: %s" % options.action)

    chrom_sizes = load_chromsize(options.chromSize)
    # BigWig is a binary format, so the inputs are opened in 'rb' mode; the
    # with-block guarantees that all three handles are closed.
    with open(options.output_wig, 'w') as OUT, \
            open(options.BigWig_File1, 'rb') as handle1, \
            open(options.BigWig_File2, 'rb') as handle2:
        bw1 = BigWigFile(file=handle1)
        bw2 = BigWigFile(file=handle2)
        for chr_name, chr_size in chrom_sizes.items():
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal1 = bw1.get_as_array(chr_name, interval[1], interval[2])
                bw_signal2 = bw2.get_as_array(chr_name, interval[1], interval[2])
                # nothing to report when neither file has data for the chunk
                if all_nan(bw_signal1) and all_nan(bw_signal2):
                    continue
                bw_signal1 = replace_nan(bw_signal1)
                bw_signal2 = replace_nan(bw_signal2)
                for v in call_back(bw_signal1, bw_signal2):
                    coord += 1  # wiggle positions are 1-based
                    if v != 0:
                        OUT.write("%d\t%.2f\n" % (coord, v))
def Main():
    """Compare per-gene mean signal of two BigWig files and draw a scatter plot.

    For every gene in ``args.geneList`` (columns: name, chrom, start, end) the
    mean of each BigWig signal over the interval, shifted by +50, is written
    to ``<output>.list`` and collected for plotting.  The scatter plot is
    saved as ``<output>_dot.eps``.
    """
    global args
    args = ParseArg()
    bw1 = BigWigFile(open(args.percentile1))
    bw2 = BigWigFile(open(args.percentile2))
    gout = WriteToFile(args.output + ".list")
    perc_array1 = []
    perc_array2 = []
    for line in ReadFromFile(args.geneList):
        row = line.strip().split()
        gene, chrom = row[0], row[1]
        start, end = int(row[2]), int(row[3])
        array1 = bw1.get_as_array(chrom, start, end)
        array2 = bw2.get_as_array(chrom, start, end)
        # genes without signal in either file are left out entirely
        if array1 is None or array2 is None:
            continue
        perc1 = np.mean(array1) + 50
        perc2 = np.mean(array2) + 50
        print >> gout, '%s\t%s\t%d\t%d\t%f\t%f' % (gene, chrom, start, end,
                                                   perc1, perc2)
        perc_array1.append(perc1)
        perc_array2.append(perc2)

    # scatter plot
    sns.set()
    plt.scatter(perc_array1, perc_array2, marker=',', color='black', s=1,
                alpha=0.1)
    plt.axes().set_aspect('equal')
    plt.xlabel(args.x, fontsize=20)
    plt.ylabel(args.y, fontsize=20)
    plt.ylim(0, 100)
    plt.xlim(0, 100)
    plt.tick_params(axis='both', which='major', labelsize=20, width=2)
    axis = plt.gca()
    ticks = [0, 20, 40, 60, 80, 100]
    axis.set_yticks(ticks)
    axis.set_xticks(ticks)
    for side in ('left', 'bottom', 'right', 'top'):
        axis.spines[side].set_linewidth(2)
    plt.subplots_adjust(bottom=.2, left=.2)
    # dashed red lines showing the threshold to call changed domains
    thresholds = (
        ([0, 89.5], [10.5, 100]),
        ([10.5, 100], [0, 89.5]),
    )
    for xs, ys in thresholds:
        plt.plot(xs, ys, linewidth=0.5, linestyle='--', color='red')
    plt.savefig(args.output + '_dot.eps', format='eps')
    plt.close()
    logging("DONE!!!")
class BigWigWrapper(object):
    """A wrapper for bx-python BigWig file.

    Indexing the wrapper with an interval object (anything exposing
    ``chrom``, ``start`` and ``end``) returns the result of
    ``BigWigFile.get_as_array`` for that interval.
    """

    def __init__(self, filepath):
        # BigWig is a binary format: open in 'rb' so that reading is also
        # correct on platforms that translate text-mode data.  The handle is
        # kept so that it can be released with close().
        self._handle = open(filepath, 'rb')
        self.bw = BigWigFile(self._handle)

    def __getitem__(self, iv):
        """Return the signal for interval ``iv`` as delivered by bx-python."""
        return self.bw.get_as_array(iv.chrom, iv.start, iv.end)

    def close(self):
        """Close the underlying file handle."""
        self._handle.close()
def findInsertions(bwFile, bedData, interval, x):
    """Return the BigWig signal around BED record ``x``.

    bwFile   - path of the BigWig file
    bedData  - indexable collection of BED records (chrom, start, end, ...)
    interval - 'start' or 'end' to centre the window on that coordinate; any
               other value spans the whole record
    x        - index of the record in ``bedData``

    The window is extended by the module-level ``options.l`` to the left and
    ``options.r`` to the right.  Returns whatever ``get_as_array`` delivers
    (which may be ``None``), or an all-NaN array of the window length when
    the lookup raises ``OverflowError``.
    """
    record = bedData[x]
    if interval == 'start':
        sL = int(record[1]) - options.l
        sR = int(record[1]) + options.r
    elif interval == 'end':
        sL = int(record[2]) - options.l
        sR = int(record[2]) + options.r
    else:
        sL = int(record[1]) - options.l
        sR = int(record[2]) + options.r

    # The with-block closes the handle even when get_as_array raises
    # something other than OverflowError.
    with open(bwFile, "rb") as f:
        bigwig_class = BigWigFile(f)
        try:
            signal = bigwig_class.get_as_array(record[0], sL, sR)
        except OverflowError:
            signal = np.array([np.nan] * (sR - sL))

    # NOTE: the previous implementation also computed the mean of the signal
    # but then overwrote it with the raw signal before returning; that dead
    # computation was removed, the returned value is unchanged.
    return signal
def summarize(self, interval, bins=None, method='summarize',
              function='mean'):
    """Return the signal of ``self.fn`` over ``interval`` as a NumPy array.

    interval - object with ``chrom``, ``start`` and ``stop`` attributes
    bins     - number of bins; ``None`` returns one value per base
    method   - 'get_as_array' (per-base values), 'ucsc_summarize' (delegates
               to ``self.ucsc_summarize``) or anything else for bx-python's
               ``BigWigFile.summarize``
    function - 'sum', 'mean', 'min', 'max' or 'std' ('coverage' is accepted
               for 'ucsc_summarize' only)

    Missing data (NaN / inf) is replaced by 0.  Raises ``ValueError`` when
    ``function`` is not supported by the 'ucsc_summarize' method.
    """
    # Dividing by a zero valid_count yields NaN together with a NumPy
    # "invalid" warning.  That is expected here, so the warning is silenced
    # for the duration of the call; the context manager restores the previous
    # setting on every exit path, including the early return and the raise.
    with np.errstate(invalid='ignore'):
        if (bins is None) or (method == 'get_as_array'):
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).get_as_array(
                    interval.chrom,
                    interval.start,
                    interval.stop,
                )
            if s is None:
                s = np.zeros((interval.stop - interval.start, ))
            else:
                s[np.isnan(s)] = 0
            return s

        if method == 'ucsc_summarize':
            if function in ['mean', 'min', 'max', 'std', 'coverage']:
                return self.ucsc_summarize(interval, bins, function=function)
            raise ValueError(
                'function "%s" not supported by UCSC\'s bigWigSummary'
                % function)

        with open(self.fn, 'rb') as handle:
            summary = BigWigFile(handle).summarize(
                interval.chrom, interval.start, interval.stop, bins)
        if summary is None:
            return np.zeros((bins, ))

        if function == 'sum':
            s = summary.sum_data
        elif function == 'mean':
            s = summary.sum_data / summary.valid_count
            s[np.isnan(s)] = 0
        elif function == 'min':
            s = summary.min_val
            s[np.isinf(s)] = 0
        elif function == 'max':
            s = summary.max_val
            s[np.isinf(s)] = 0
        elif function == 'std':
            # NOTE: this is sum_squares / valid_count, not a true standard
            # deviation; kept as in the original implementation.
            s = (summary.sum_squares / summary.valid_count)
            s[np.isnan(s)] = 0
        else:
            # unknown function: hand back the raw summary object, as before
            s = summary
        return s
def getChromatinDataSeries(bigwigFile, libraryTable, sgInfoTable, tssTable, colname = '', naValue = 0):
    """Return the mean BigWig signal under every sgRNA as a pandas Series.

    bigwigFile   - path of the BigWig file
    libraryTable - table whose index becomes the index of the result
    sgInfoTable  - table with 'gene_name', 'transcript_list', 'strand',
                   'pam coordinate' and 'length' columns
    tssTable     - table with a 'chromosome' column, keyed by
                   (gene name, comma-joined transcript list)
    colname      - name of the returned Series
    naValue      - replacement for missing scores

    sgRNAs whose gene is not in ``tssTable`` (negative controls) and sgRNAs
    without BigWig data receive ``naValue``.
    """
    chromDict = tssTable['chromosome'].to_dict()

    chromatinScores = []
    # The handle has to stay open while get_as_array is being called; the
    # with-block closes it afterwards.  'rb' because BigWig is binary.
    with open(bigwigFile, 'rb') as bwHandle:
        bwindex = BigWigFile(bwHandle)
        for _, sgInfo in sgInfoTable.iterrows():
            geneTup = (sgInfo['gene_name'], ','.join(sgInfo['transcript_list']))

            if geneTup not in chromDict:  # negative controls
                chromatinScores.append(np.nan)
                continue

            pam = sgInfo['pam coordinate']
            if sgInfo['strand'] == '+':
                sgRange = pam + sgInfo['length']
            else:
                sgRange = pam - sgInfo['length']

            chrom = chromDict[geneTup]
            chromatinArray = bwindex.get_as_array(chrom, min(pam, sgRange),
                                                  max(pam, sgRange))
            if chromatinArray is not None and len(chromatinArray) > 0:
                chromatinScores.append(np.nanmean(chromatinArray))
            else:
                # often chrY when using K562 data
                chromatinScores.append(np.nan)

    chromatinSeries = pd.Series(chromatinScores, index=libraryTable.index, name = colname)

    return chromatinSeries.fillna(naValue)
def get_mean_phastcons(bedtool, phastcons_location):
    """
    Get means phastcons scores for all intervals in a bed tool

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file

    Returns a NumPy array with one mean per interval, in input order.
    Intervals without data (empty result, or chromosome absent from the
    BigWig file so that bx-python returns None) get a score of 0.
    """
    # zeros instead of an uninitialised ndarray: 0 is also the "no data" score
    data = np.zeros(len(bedtool))

    # BigWig is binary; the with-block closes the handle when done
    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)
        for i, bedline in enumerate(bedtool):
            conservation_values = bw.get_as_array(bedline.chrom,
                                                  bedline.start,
                                                  bedline.stop)
            if conservation_values is not None and len(conservation_values) > 0:
                data[i] = np.mean(conservation_values)
    return data
def get_mean_phastcons(bedtool, phastcons_location, sample_size = 1000):
    """
    Get means phastcons scores for all intervals in a bed tool

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file
    sample_size - maximum number of randomly chosen intervals to score

    Returns a list of mean scores.  Intervals for which the BigWig file has
    no entry (bx-python returns None) are skipped; empty results score 0.
    """
    data = []
    # BigWig is binary, hence 'rb'.  All reads happen inside the with-block
    # because the reader pulls data from the handle on demand.
    with open(phastcons_location, 'rb') as bw_file:
        bw = BigWigFile(bw_file)

        for bedline in bedtool.random_subset(min(len(bedtool), sample_size)):
            conservation_values = bw.get_as_array(bedline.chrom,
                                                  bedline.start,
                                                  bedline.stop)
            # explicit check instead of catching the TypeError of len(None)
            if conservation_values is None:
                continue
            if len(conservation_values) > 0:
                data.append(np.mean(conservation_values))
            else:
                data.append(0)
    return data
def get_mean_phastcons(bedtool, phastcons_location, sample_size=1000):
    """
    Get means phastcons scores for all intervals in a bed tool

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file
    sample_size - maximum number of randomly chosen intervals to score

    Returns a list of mean scores.  Intervals for which the BigWig file has
    no entry (bx-python returns None) are skipped; empty results score 0.
    """
    data = []
    # BigWig is binary, hence 'rb'.  All reads happen inside the with-block
    # because the reader pulls data from the handle on demand.
    with open(phastcons_location, 'rb') as bw_file:
        bw = BigWigFile(bw_file)

        for bedline in bedtool.random_subset(min(len(bedtool), sample_size)):
            conservation_values = bw.get_as_array(bedline.chrom,
                                                  bedline.start,
                                                  bedline.stop)
            # explicit check instead of catching the TypeError of len(None)
            if conservation_values is None:
                continue
            if len(conservation_values) > 0:
                data.append(np.mean(conservation_values))
            else:
                data.append(0)
    return data
def Main():
    """Average a BigWig signal in fixed windows and write BED, wig and bigWig.

    For every chromosome of the genome file the signal is read, NaN is
    replaced by 0 and the mean of each ``args.window`` sized window is taken
    (optionally passed through ``Smooth``).  The windows are written to
    ``<output>/<name>.bed`` and ``<output>/<name>.wig``; the wig file is
    finally converted with the external ``wigToBigWig`` tool.
    """
    global args
    args = ParseArg()
    bw = BigWigFile(open(args.bigwig))
    CheckFolderExist(args.output)
    fout = WriteToFile(args.output + '/' + args.name + '.bed')
    wout = WriteToFile(args.output + '/' + args.name + '.wig')
    genome = LoadGenome(args.genome)
    if args.smooth:
        logging("Options: turn on smooth mode")
    for chrom in SortGenome(genome):
        chrom_size = genome[chrom]
        logging("Process: %s\t%d" % (chrom, chrom_size))
        array = bw.get_as_array(chrom, 0, chrom_size)
        if array is None:
            # chromosome is in the genome file but not in the bigwig
            logging("Skip: %s\tno data in bigwig" % chrom)
            continue
        array[np.isnan(array)] = 0

        # mean per window; the last window may be shorter than args.window
        agg_array = np.array([
            np.mean(array[start:start + args.window])
            for start in range(0, len(array), args.window)
        ])
        if args.smooth:
            smooth_array = Smooth(agg_array)
        else:
            smooth_array = agg_array

        wout.write("variableStep chrom=%s span=%d\n" % (chrom, args.window))
        last = len(smooth_array) - 1
        for nn, value in enumerate(smooth_array):
            start = nn * args.window
            if nn == last:
                # The final window ends at the chromosome end and gets its
                # own header with the reduced span.  This is checked first so
                # that a chromosome consisting of a single window is clipped
                # as well.
                end = chrom_size
                wout.write("variableStep chrom=%s span=%d\n"
                           % (chrom, chrom_size - start))
            else:
                end = start + args.window
            fout.write("%s\t%d\t%d\t%.6f\n" % (chrom, start, end, float(value)))
            wout.write("%d\t%.6f\n" % (start + 1, float(value)))
    # make sure everything is on disk before wigToBigWig reads the wig file
    fout.flush()
    wout.flush()
    wig2bw = "wigToBigWig -clip %s %s %s" % (args.output + '/' + args.name + '.wig',
                                             args.genome,
                                             args.output + '/' + args.name + '.bw')
    os.system(wig2bw)
    logging("Finish: TSA_smooth DONE!!!")
def main():
    """Combine two BigWig files position by position and write a wiggle file.

    Both BigWig files are read chunk by chunk (``--chunk`` bases at a time)
    for every chromosome listed in the chromosome-size file.  The function
    named by ``--action`` is looked up on ``twoList`` and applied to the two
    signal arrays; results of at least ``--min_signal`` are written as
    1-based ``variableStep`` wiggle records.

    A chunk for which one file has no data is treated as all-zero for that
    file; chunks without data in both files are skipped.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-m","--min_signal",action="store",type="float",dest="min_score",default=0.0,help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default")
    (options,args)=parser.parse_args()

    # --action is required as well: without it getattr() below would fail
    # with an unhelpful TypeError.
    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig
            and options.chromSize and options.action):
        parser.print_help()
        sys.exit(0)

    # Resolve the operation once, before any file is created.
    call_back = getattr(twoList, options.action, None)
    if call_back is None:
        parser.error("unknown action: %s" % options.action)

    def fetch(bw, chrom, start, end):
        """Return the chunk signal, or an all-NaN array if it is unavailable."""
        try:
            signal = bw.get_as_array(chrom, start, end)
        except Exception:
            # deliberate best effort: a failing lookup counts as "no data"
            signal = None
        if signal is None:
            # same length as a real chunk so both inputs stay aligned
            signal = numpy.repeat(numpy.nan, max(end - start, 0))
        return signal

    chrom_sizes = load_chromsize(options.chromSize)
    with open(options.output_wig, 'w') as OUT, \
            open(options.BigWig_File1, 'rb') as handle1, \
            open(options.BigWig_File2, 'rb') as handle2:
        bw1 = BigWigFile(file=handle1)
        bw2 = BigWigFile(file=handle2)
        for chr_name, chr_size in chrom_sizes.items():
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal1 = fetch(bw1, chr_name, interval[1], interval[2])
                bw_signal2 = fetch(bw2, chr_name, interval[1], interval[2])
                # Skip chunks without data in both files.  isnan().all() is
                # used because nansum() of an all-NaN array is 0 rather than
                # NaN on current NumPy; it is also True for empty arrays.
                if numpy.isnan(bw_signal1).all() and numpy.isnan(bw_signal2).all():
                    continue
                bw_signal1 = numpy.nan_to_num(bw_signal1)
                bw_signal2 = numpy.nan_to_num(bw_signal2)
                for v in call_back(bw_signal1, bw_signal2):
                    coord += 1  # wiggle positions are 1-based
                    if v >= options.min_score:
                        OUT.write("%d\t%.2f\n" % (coord, v))
def summarize(self, interval, bins=None, method='summarize',
              function='mean'):
    """Return the signal of ``self.fn`` over ``interval`` as a NumPy array.

    interval - object with ``chrom``, ``start`` and ``stop`` attributes
    bins     - number of bins; ``None`` returns one value per base
    method   - 'get_as_array' (per-base values), 'ucsc_summarize' (delegates
               to ``self.ucsc_summarize``) or anything else for bx-python's
               ``BigWigFile.summarize``
    function - 'sum', 'mean', 'min', 'max' or 'std' ('coverage' is accepted
               for 'ucsc_summarize' only)

    Missing data (NaN / inf) is replaced by 0.  Raises ``ValueError`` when
    ``function`` is not supported by the 'ucsc_summarize' method.
    """
    # Dividing by a zero valid_count yields NaN together with a NumPy
    # "invalid" warning.  That is expected here, so the warning is silenced
    # for the duration of the call; the context manager restores the previous
    # setting on every exit path, including the early return and the raise.
    with np.errstate(invalid='ignore'):
        if (bins is None) or (method == 'get_as_array'):
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).get_as_array(
                    interval.chrom,
                    interval.start,
                    interval.stop,)
            if s is None:
                s = np.zeros((interval.stop - interval.start,))
            else:
                s[np.isnan(s)] = 0
            return s

        if method == 'ucsc_summarize':
            if function in ['mean', 'min', 'max', 'std', 'coverage']:
                return self.ucsc_summarize(interval, bins, function=function)
            raise ValueError(
                'function "%s" not supported by UCSC\'s bigWigSummary'
                % function)

        with open(self.fn, 'rb') as handle:
            summary = BigWigFile(handle).summarize(
                interval.chrom,
                interval.start,
                interval.stop, bins)
        if summary is None:
            return np.zeros((bins,))

        if function == 'sum':
            s = summary.sum_data
        elif function == 'mean':
            s = summary.sum_data / summary.valid_count
            s[np.isnan(s)] = 0
        elif function == 'min':
            s = summary.min_val
            s[np.isinf(s)] = 0
        elif function == 'max':
            s = summary.max_val
            s[np.isinf(s)] = 0
        elif function == 'std':
            # NOTE: this is sum_squares / valid_count, not a true standard
            # deviation; kept as in the original implementation.
            s = (summary.sum_squares / summary.valid_count)
            s[np.isnan(s)] = 0
        else:
            # unknown function: hand back the raw summary object, as before
            s = summary
        return s
def wig_reader(infile, chrom_sizes=None, informat='wiggle', bin_size=2000):
    '''Iterate over the signal of a wiggle or bigwig file.

    infile: either a wiggle or bigwig format file
    chrom_sizes: dict of chrom_name: size, only needed if format is bigwig
    informat: either 'wiggle' or 'bigwig' (case-insensitive)
    bin_size: chunk size used to walk through a bigwig file

    yields: (chrom, start, end, score).  For bigwig input, consecutive
    positions with the same score inside one chunk are merged into a single
    record; chunks without data or whose signal sums to 0 are skipped.
    '''
    if informat.upper() == 'WIGGLE':
        for chrom, start, end, strand, score in bx.wiggle.IntervalReader(
                infile):
            yield (chrom, start, end, score)
    elif informat.upper() == 'BIGWIG':
        if chrom_sizes is None:
            raise Exception("chrom_sizes is required for 'bigwig' input")
        # 'rb' because bigwig is binary; closed when the generator finishes
        with open(infile, 'rb') as handle:
            bw_obj = BigWigFile(file=handle)
            for chr_name, chr_size in list(chrom_sizes.items()):
                for chrom, st, end in BED.tillingBed(chrName=chr_name,
                                                     chrSize=chr_size,
                                                     stepSize=bin_size):
                    sig_list = bw_obj.get_as_array(chrom, st, end)
                    if sig_list is None:
                        continue
                    sig_list = numpy.nan_to_num(sig_list)
                    if numpy.sum(sig_list) == 0:
                        continue
                    low_bound = st
                    point_num = 1
                    score = sig_list[0]
                    for value in (sig_list[1:]):
                        if value == score:
                            point_num += 1
                        else:
                            yield ((chrom, low_bound, low_bound + point_num,
                                    score))
                            score = value
                            low_bound = low_bound + point_num
                            point_num = 1
                    # emit the run that is still open at the end of the
                    # chunk; it was silently dropped before
                    yield ((chrom, low_bound, low_bound + point_num, score))
    else:
        raise Exception("Unknown format. Must be 'wiggle' or 'bigwig'")
def bigwig_to_wav(args):
    """Write one wav file per BED region from the signal of a BigWig file.

    Reads from ``args``: bigwig_file, bed_file, output_dir, smooth ('none',
    'boxcar' or 'gaussian'), window_size, zoom, sample_rate, n_channels.

    For every BED region the signal is fetched (NaN -> 0), resampled with
    ``scipy.ndimage.zoom``, optionally smoothed, and handed to ``modulate``,
    which writes ``<output_dir>/<chrom>:<start>-<end>.wav``.  Regions without
    data in the BigWig file and blank BED lines are skipped.

    Raises ``ValueError`` for an unknown ``args.smooth`` value.
    """
    import numpy as np
    from bx.bbi.bigwig_file import BigWigFile
    from scipy.signal import convolve
    from scipy.stats import norm
    from scipy.ndimage import zoom

    smooth_filter = None
    scale_factors = None
    if args.smooth == 'boxcar':
        smooth_filter = np.ones(args.window_size, dtype=np.float32)
    elif args.smooth == 'gaussian':
        smooth_filter = norm.pdf(
            np.linspace(-3, 3, args.window_size * 3,
                        endpoint=True)).astype(np.float32)
    elif args.smooth != 'none':
        raise ValueError('unknown smooth method: %s' % args.smooth)
    if args.smooth != 'none':
        # response of the filter to a constant signal; used to undo the
        # attenuation at both ends of the smoothed signal
        scale_factors = convolve(np.ones(smooth_filter.shape[0]),
                                 smooth_filter, mode='same')

    if not os.path.exists(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)

    logger.info('read input BigWigfile: ' + args.bigwig_file)
    logger.info('read input BED file: ' + args.bed_file)
    # the with-block closes both inputs, also when a region fails
    with open(args.bigwig_file, 'rb') as f_bigwig, \
            open(args.bed_file, 'r') as f_bed:
        bigwig = BigWigFile(f_bigwig)
        for line in f_bed:
            if not line.strip():
                continue
            c = line.strip().split('\t')
            chrom = c[0]
            start = int(c[1])
            end = int(c[2])
            signal = bigwig.get_as_array(chrom, start, end)
            if signal is None:
                logger.info('no signal for %s:%d-%d, skipped'
                            % (chrom, start, end))
                continue
            x = np.nan_to_num(signal)
            # zoom the signals
            x = zoom(x, args.zoom)
            if args.smooth != 'none':
                # smooth the raw signal with a moving window
                x = convolve(x, smooth_filter, mode='same')
                # scale the signal; '//' keeps the slice bounds integral
                # (same values as the former integer '/')
                filter_length = smooth_filter.shape[0]
                head = filter_length // 2
                tail = -filter_length // 2
                x[:head] /= scale_factors[:head]
                x[tail:] /= scale_factors[tail:]
                if x.shape[0] > filter_length:
                    x[head:tail] /= np.sum(smooth_filter)
            wav_file = os.path.join(args.output_dir,
                                    '%s:%d-%d.wav' % (chrom, start, end))
            logger.info('create wav file: ' + wav_file)
            modulate(x, wav_file, sample_rate=args.sample_rate,
                     n_channels=args.n_channels)
class Phylop(object):
    """Look up mean PhyloP conservation scores in a bigwig file.

    The instance keeps the file open until ``close()`` is called; it can also
    be used as a context manager.
    """

    def __init__(self, bw_fname):
        """
        :param bw_fname: Phylop 100way bigwig file name.
        """
        # bigwig is a binary format, hence 'rb'
        self.bw_handle = open(os.path.expanduser(bw_fname), 'rb')
        self.bw = BigWigFile(self.bw_handle)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get(self, chrom, start, end, flanking=0):
        """
        Return the mean score of the interval, ignoring missing positions.

        :param chrom: chr1, chr2, etc.
        :param start: 0-based.
        :param end: 1-based.
        :param flanking: length of flanking sequence on each side.
        :return: NaN-ignoring mean, or NaN when the file has no data for
            ``chrom``.
        """
        # a flank must not push the start below the first base
        values = self.bw.get_as_array(chrom, max(0, start - flanking),
                                      end + flanking)
        if values is None:
            return np.nan
        return np.nanmean(values)

    def calculate(self, fname, out_fname):
        """
        Append phylop1/phylop3/phylop7 scores to every line of a SNP BED.

        :param fname: SNP BED.
        :param out_fname: output file.
        """
        with open(fname) as bed_f, open(out_fname, 'w') as out_f:
            out_f.write('\t'.join(self._build_header()) + '\n')
            for line in bed_f:
                cols = line.rstrip().split('\t')
                chrom, start, end = cols[:3]
                start = int(start)
                end = int(end)
                # score of the SNP itself and with 3 / 7 flanking bases
                scores = [
                    self.get(chrom, start, end),
                    self.get(chrom, start, end, 3),
                    self.get(chrom, start, end, 7)
                ]
                out_f.write('\t'.join(map(str, cols + scores)) + '\n')

    def close(self):
        """Close the bigwig file handle."""
        self.bw_handle.close()

    def _build_header(self):
        """Return the column names of the output file."""
        header = [
            '#chrom_snp', 'start_snp', 'end_snp', 'ref', 'alt', 'feature',
            'gene_id', 'chrom', 'start', 'end', 'name', 'score', 'strand',
            'distance'
        ]
        header += ['phylop1', 'phylop3', 'phylop7']
        return header
def findInsertions(bwFile, bedData, x):
    """Return the BigWig signal over BED record ``x``, oriented by strand.

    bwFile  - path of the BigWig file; replaced by a per-chromosome
              ``<options.b><options.tn5>.<chrom>.Scores.bw`` path when
              ``options.tn5`` is set
    bedData - indexable collection of BED records (chrom, start, end
              [, strand])
    x       - index of the record in ``bedData``

    The interval is extended by the module-level ``options.l`` / ``options.r``.
    Returns whatever ``get_as_array`` delivers (possibly ``None``), or an
    all-NaN array when the lookup raises ``OverflowError``; the array is
    reversed for records whose fourth column is "-".
    """
    record = bedData[x]
    if options.tn5 is not None:
        bwFile = options.b + options.tn5 + "." + record[0] + ".Scores.bw"
    sL = int(record[1]) - options.l
    sR = int(record[2]) + options.r

    # get signal data; the with-block closes the handle on every exit path
    with open(bwFile, "rb") as f:
        bw = BigWigFile(f)
        try:
            signal = bw.get_as_array(record[0], sL, sR)
        except OverflowError:
            signal = np.array([np.nan] * (sR - sL))

    # the strand column is optional
    try:
        strand = record[3]
    except IndexError:
        strand = None

    out = signal
    # None (no data) cannot be sliced, so it is passed through unchanged
    if out is not None and strand == "-":
        out = out[::-1]
    return out
def bwSmooth(c):
    """Smooth the BigWig signal of one chunk and return it as bedGraph rows.

    ``c`` holds (chrom, start, end).  The signal of ``options.a`` over that
    chunk is convolved with the module-level ``wSmooth`` window and sampled
    every ``step`` bases.  Rows with a positive value that end before
    ``chrLen`` and lie within the first ``chunkSize/step`` bins are returned
    as an array of (chrom, start, end, value) columns.
    """
    # fetch the raw signal for the chunk
    handle = open(options.a)
    reader = BigWigFile(handle)
    chrN = c[0]
    sPos = int(c[1])
    ePos = int(c[2])
    signal = reader.get_as_array(chrN, sPos, ePos)
    handle.close()

    # no data at all -> flat zero signal; missing positions count as 0
    # NOTE(review): the source has a commented-out "else:" in front of the
    # NaN replacement; it is assumed to run unconditionally - confirm.
    if signal is None:
        signal = np.zeros(ePos - sPos)
    signal[np.isnan(signal)] = 0
    convM = np.convolve(signal, wSmooth, 'same')

    # one output bin per step
    sList = np.arange(sPos, ePos, step)
    eList = sList + step
    chrList = np.array([chrN] * len(sList))
    sampleAt = range(step/2, chunkSize+padLen, step)
    meanSig = convM[sampleAt]

    # keep positive bins inside the chromosome, drop the padding bins
    positive = meanSig > 0
    insideChrom = eList < chrLen
    idx = positive * insideChrom
    idx[chunkSize/step:] = False

    startCol = np.array(sList[idx], dtype=str)
    pData = np.c_[chrList[idx], startCol, eList[idx], meanSig[idx]]
    return pData
def profile_bwfile(inbed,bwfile):
    '''retrieve signal from bigwig file for each entry in input bed file

    inbed  - path of a BED-like file (chrom, start, end, ...)
    bwfile - path of the bigwig file

    Prints "chrom<TAB>start<TAB>end<TAB>v1,v2,..." to stdout for every
    region.  Comment, 'track', 'browser' and blank lines are skipped; lines
    that cannot be parsed are reported on stderr.  A region without data in
    the bigwig file is printed with an empty signal column.
    '''
    # bigwig is binary ('rb'); both handles are closed by the with-block
    with open(bwfile, 'rb') as bw_handle, open(inbed) as bed_handle:
        bw = BigWigFile(file=bw_handle)
        for line in bed_handle:
            if line.startswith(('#', 'track', 'browser')):
                continue
            if not line.strip():
                continue
            line = line.rstrip('\r\n')
            fields = line.split()
            try:
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
            except (IndexError, ValueError):
                sys.stderr.write(
                    "Must be chrom [space] start [space] end: " + line + "\n")
                continue
            signal = bw.get_as_array(chrom, start, end)
            # None means the bigwig has no entry for this chromosome
            bw_signal = [] if signal is None else list(signal)
            sys.stdout.write(chrom + '\t' + str(start) + '\t' + str(end) + '\t'
                             + ','.join(str(i) for i in bw_signal) + '\n')
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype="png"):
    '''Calculate reads coverage over gene body, from 5' to 3'.

    Each gene's exonic bases are reduced to percentile positions by
    ``mystat.percentile_list`` and the bigwig signal at those positions is
    summed per percentile over all genes.

    bigFile - bigwig format file
    refbed  - 12-column BED file with the gene models
    outfile - prefix of the two output files
              (<outfile>.geneBodyCoverage.txt and
              <outfile>.geneBodyCoverage_plot.r)
    gtype   - R graphics device used in the generated script

    Genes whose mRNA is shorter than 100 bases are skipped.
    '''
    if refbed is None:
        sys.stderr.write("You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    coverage = collections.defaultdict(int)
    gene_count = 0
    # All four files are closed by the with-block; bigwig is binary ('rb').
    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as OUT1, \
            open(outfile + ".geneBodyCoverage.txt", 'w') as OUT2, \
            open(bigFile, 'rb') as bw_handle, \
            open(refbed, 'r') as bed_handle:
        bw = BigWigFile(file=bw_handle)
        sys.stderr.write("calculating coverage over gene body ...\n")

        for line in bed_handle:
            if line.startswith(('#', 'track', 'browser')):
                continue
            gene_count += 1
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                strand = fields[5]
                exon_starts = [int(v) + tx_start
                               for v in fields[11].rstrip(',\n').split(',')]
                exon_sizes = [int(v)
                              for v in fields[10].rstrip(',\n').split(',')]
                if len(exon_starts) != len(exon_sizes):
                    raise ValueError("blockSizes and blockStarts differ in length")
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " + line)
                continue
            exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))
            # The length test applies to the complete mRNA; it used to run
            # after the first exon already, which dropped every gene whose
            # first exon is shorter than 100 bases.
            if len(gene_all_base) < 100:
                continue
            # walk minus-strand genes from their 5' end as well
            gene_all_base.sort(reverse=(strand == '-'))

            percentile_base = mystat.percentile_list(gene_all_base)
            for i, position in enumerate(percentile_base):
                sig = bw.get_as_array(chrom, position - 1, position)
                if sig is None:
                    continue
                coverage[i] += np.nan_to_num(sig[0])
            sys.stderr.write(" %d genes finished\r" % gene_count)

        y_coord = []
        OUT2.write("percentile\tcount\n")
        # sorted: dict order is arbitrary, but y has to line up with x
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            OUT2.write(str(i) + '\t' + str(coverage[i]) + '\n')
        OUT1.write("%s(\'%s\')\n" % (gtype, outfile + ".geneBodyCoverage." + gtype))
        OUT1.write("x=0:100\n")
        OUT1.write("y=c(" + ','.join(y_coord) + ')\n')
        OUT1.write("plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')\n" % gene_count)
        OUT1.write("dev.off()\n")
def main():
    """Normalize a BigWig file to a fixed total wigsum.

    The total signal (wigsum) is computed either over the merged exons of
    ``--refgene`` or over all chromosomes of ``--chromSize``.  Every value is
    then multiplied by ``--wigsum / total`` and written as wiggle ("wig") or
    bedGraph ("bgr") to ``--output``.  Chromosomes without data in the BigWig
    file are skipped.

    Exits with status 1 for an unknown output format or a total wigsum of 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # validate the format before the expensive wigsum pass
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    OUT = open(options.output_wig, 'w')
    bw_handle = open(options.BigWig_File, 'rb')
    try:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        known = {}

        def has_chrom(chrom):
            """Tell whether the BigWig file has data for ``chrom`` (cached)."""
            if chrom not in known:
                try:
                    known[chrom] = bw.get_as_array(chrom, 0, 1) is not None
                except Exception:
                    # best effort, as before: a failing probe means "skip"
                    known[chrom] = False
            return known[chrom]

        WIG_SUM = 0.0
        if (options.refgene_bed):
            sys.stderr.write("Extract exons from " + options.refgene_bed + "\n")
            obj = BED.ParseBED(options.refgene_bed)
            exons = obj.getExon()
            sys.stderr.write("Merge overlapping exons ...\n")
            exons = BED.unionBed3(exons)
            sys.stderr.write("Calculate wigsum covered by " + options.refgene_bed + ' only\n')
            for chrom, st, end in exons:
                if not has_chrom(chrom):
                    continue
                bw_signal = bw.get_as_array(chrom, st, end)
                # nan is ignored by nansum; older NumPy returns nan when all
                # items are nan, so that case is skipped explicitly
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
            sys.stderr.write("Total wigsum is %.2f\n\n" % WIG_SUM)
        else:
            sys.stderr.write("Calculate wigsum from " + options.BigWig_File + "\n")
            for chr_name, chr_size in chrom_sizes.items():
                if not has_chrom(chr_name):
                    sys.stderr.write("Skip " + chr_name + "!\n")
                    continue
                sys.stderr.write("Processing " + chr_name + " ...\n")
                for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                               stepSize=options.chunk_size):
                    bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                    tmp = numpy.nansum(bw_signal)
                    if numpy.isnan(tmp):
                        continue
                    WIG_SUM += tmp
            sys.stderr.write("\nTotal wigsum is %.2f\n\n" % WIG_SUM)

        # Explicit test: dividing by a NumPy zero does not raise, it only
        # warns and yields inf.
        if WIG_SUM == 0:
            sys.stderr.write("Error, WIG_SUM cannot be 0\n")
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        sys.stderr.write("Normalizing bigwig file ...\n")
        for chr_name, chr_size in chrom_sizes.items():
            if not has_chrom(chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue

            sys.stderr.write("Writing " + chr_name + " ...\n")
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                           stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                bw_signal = numpy.nan_to_num(bw_signal) * weight

                if out_format == "WIG":
                    for v in bw_signal:
                        coord += 1  # wiggle positions are 1-based
                        if v != 0:
                            OUT.write("%d\t%.2f\n" % (coord, v))
                    continue

                # bedGraph: collect the positions of every distinct value ...
                v2p = collections.defaultdict(list)  # value -> 1-based positions
                range2p = {}  # 0-based start -> [length, value]
                for v in bw_signal:
                    coord += 1
                    if v != 0:
                        v2p[v].append(coord)
                # ... and merge consecutive positions into ranges: index
                # minus position is constant within a run
                for v in v2p:
                    for _, grp in groupby(enumerate(v2p[v]),
                                          lambda pair: pair[0] - pair[1]):
                        run = [pos for _, pos in grp]
                        range2p[run[0] - 1] = [len(run), v]
                for i in sorted(range2p):
                    OUT.write(chr_name + '\t' + str(i) + '\t'
                              + str(i + range2p[i][0]) + '\t'
                              + str(range2p[i][1]) + '\n')
    finally:
        bw_handle.close()
        OUT.close()
inFile.close();

# Read the chromosome sizes: one "name<TAB>size" pair per line.
chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r');
chromSizes = {};
for line in chromSizesFile:
    # skip comments and blank lines (a bare newline has no size column)
    if line is None or line == "" or line[0]=="#" or not line.strip(): continue
    data=line.rstrip().split("\t");
    chromSizes[data[0]]=int(data[1]);
chromSizesFile.close()

# Dump every translated chromosome as a fixedStep wiggle track under its new
# name.
curBW = BigWigFile(open(args.inBW))
outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w");
outStream.write("track type=wiggle_0\n")
for chr in transChrs:
    values = curBW.get_as_array( chr, 0, chromSizes[oldToNew[chr]] )
    if values is not None:
        sys.stderr.write("Adding %s -> %s\n"%(chr, oldToNew[chr]));
        outStream.write("fixedStep chrom=%s start=1 step=1\n"%(oldToNew[chr]))
        outStream.write("\n".join(map(str,values)));
        outStream.write("\n");
# The wiggle file must be complete on disk before wigToBigWig reads it.
outStream.close()

# Convert to bigwig.  No pipes are attached, so the tool's own messages go
# straight to the terminal and communicate() returns (None, None); success is
# therefore judged by the exit status.
toBW = subprocess.Popen(["wigToBigWig","%s.wig.gz"%(args.outFPre),args.chrsFile,"%s.bw"%(args.outFPre)])
temp = toBW.communicate()
if toBW.returncode != 0:
    sys.stderr.write("wigToBigWig: exited with status %s\n"%(toBW.returncode));
elif os.path.isfile("%s.bw"%(args.outFPre)):
    # if no errors, delete the original
    os.remove("%s.wig.gz"%(args.outFPre))
def main():
    """Scale a bigWig file to a fixed total signal and write it as wiggle.

    The total signal ("wigsum") is summed either over the merged exons of
    the optional reference gene model, or over every chromosome listed in
    the chromosome size file. Every non-zero position is then multiplied by
    ``total_wigsum / wigsum`` and written in variableStep format.

    Exits with status 0 after printing help when a required option is
    missing, and with status 1 when the measured wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string",
                      dest="BigWig_File",
                      help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string",
                      dest="output_wig",
                      help="Output wig file. [required]")
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed format. [optional]")
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size",
        default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, 'w')
    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    exons = []
    WIG_SUM = 0.0

    if (options.refgene_bed):
        print >> sys.stderr, "Extract exons from " + options.refgene_bed
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print >> sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >> sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only'
        for chrom, st, end in exons:
            # Probe one base: chromosomes the bigWig cannot serve are skipped.
            try:
                bw.get_as_array(chrom, 0, 1).size
            except Exception:
                continue
            bw_signal = bw.get_as_array(chrom, st, end)
            # nan will be ignored. but if all items are 'nan', the result
            # summay is 'nan' NOT 0
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        print >> sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
    else:
        print >> sys.stderr, "Calculate wigsum from " + options.BigWig_File
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            try:
                bw.get_as_array(chr_name, 0, 1).size
            except Exception:
                print >> sys.stderr, "Skip " + chr_name + "!"
                continue
            print >> sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(chrName=chr_name,
                                           chrSize=chr_size,
                                           stepSize=options.chunk_size):
                bw_signal = bw.get_as_array(interval[0], interval[1],
                                            interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        print >> sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

    # A zero wigsum would make the scaling factor undefined; report it and
    # stop instead of failing with an unrelated error.
    if WIG_SUM == 0:
        print >> sys.stderr, "Error, WIG_SUM cannot be 0"
        OUT.close()
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    print >> sys.stderr, "Normalizing bigwig file, output wiggle file"
    for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
        try:
            bw.get_as_array(chr_name, 0, 1).size
        except Exception:
            print >> sys.stderr, "Skip " + chr_name + "!"
            continue

        print >> sys.stderr, "Writing " + chr_name + " ..."
        OUT.write('variableStep chrom=' + chr_name + '\n')
        for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                       stepSize=options.chunk_size):
            # coord becomes the 1-based position of each value in the chunk.
            coord = interval[1]
            bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            bw_signal = numpy.nan_to_num(bw_signal)
            for v in bw_signal:
                coord += 1
                if v != 0:
                    print >> OUT, "%d\t%.4f" % (coord, v * weight)
    OUT.close()
wSize = options.w wSmooth = np.ones(wSize) step = options.s #### SCRIPT ##### # split genome into chunks for i in range(0,len(gSizes)): # break chrs into pieces chrN = gSizes[i][0] chrLen = int(gSizes[i][1]) sVals = np.arange(1,int(gSizes[i][1]),chunkSize) # read in bigWig for j in range(0,len(sVals)): # get data, pass if not available signal = bw.get_as_array(chrN,sVals[j],sVals[j]+chunkSize+padLen) try: signal.any() except: continue # smooth data print chrN, sVals[j] signal[np.isnan(signal)] = 0 convM = np.convolve(signal,wSmooth,'same') # save data sList = np.arange(sVals[j],sVals[j]+chunkSize+padLen,step) eList = sList+step chrList = np.array([chrN]*len(sList)) meanSig = convM[range(step/2,chunkSize+padLen,step)] # save out
def main(): usage="%prog [options]" parser = OptionParser(usage,version="%prog " + __version__) parser.add_option("-p","--peak-file",action="store",type="string",dest="peak_file",help="Peak file generated by ChEAP_PeakCalling") parser.add_option("-f","--forward",action="store",type="string",dest="forward_peak",help="BigWig file of forward peak (first 5nt)") parser.add_option("-r","--reverse",action="store",type="string",dest="reverse_peak",help="BigWig file of reverse peak (first 5nt)") parser.add_option("-c","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.") parser.add_option("-w","--window",action="store",type="int",dest="window_size",default=5,help="Window size (on genome) to calculate cross strand distance. default=%default") parser.add_option("-s","--shift-size",action="store",type="int",dest="max_distance",default=100,help="Maximum shift size. 
default=%default") (options,args)=parser.parse_args() if not (options.peak_file and options.forward_peak and options.reverse_peak and options.chromSize): parser.print_help() sys.exit(0) if options.window_size <1: print >>sys.stderr, "window size must be intreger larger than 1" parser.print_help() sys.exit(0) fwd = BigWigFile( file=open(options.forward_peak) ) rev = BigWigFile( file=open(options.reverse_peak) ) chrom_sizes = load_chromsize(options.chromSize) shiftSize=collections.defaultdict(list) count=0 avg_eud=collections.defaultdict(int) #average euclidean distance over window for line in open(options.peak_file,'r'): if line.startswith('#'): continue if not line.rstrip(): continue fields=line.rstrip().split() if fields[3] == '-': continue if int(fields[4]) <30: continue chrom = fields[0] peak_pos = int(fields[2]) peak_start = peak_pos - options.window_size peak_end = peak_pos + options.window_size if peak_start <0: peak_start=0 if peak_end > chrom_sizes[chrom]: peak_end = chrom_sizes[chrom] fwd_signal = fwd.get_as_array(chrom,peak_start,peak_end) #if all_nan(fwd_signal): # continue fwd_signal = replace_nan( fwd_signal ) for offset in range(0,options.max_distance+1): rev_signal = rev.get_as_array(chrom,peak_start + offset, peak_end + offset) rev_signal = replace_nan( rev_signal ) #print >>OUT, chrom + ":" + str(peak_start) + '-' + str(peak_end) + '\t' + str(offset) + '\t' + str(twoList.euclidean_distance(fwd_signal,rev_signal)) shiftSize[chrom + str(peak_pos)].append(twoList.euclidean_distance(fwd_signal,rev_signal)) for k in shiftSize: if len(set(shiftSize[k]))==1: continue count +=1 norm_factor = max(shiftSize[k]) for indx, val in enumerate(shiftSize[k]): avg_eud[indx] += val/norm_factor for k,v in avg_eud.items(): print str(k) + '\t' + str(v/count)
def _lineage_names(tree, spe):
    """Return the node names on the path from leaf `spe` up to the root."""
    node = tree.get_leaves_by_name(spe)[0]
    names = []
    while not node.is_root():
        names.append(node.name)
        node = node.up
    names.append(node.name)  # add the root
    return names


def _match_positions_to_count(poslist, counts):
    """Subsample `poslist` when its length disagrees with the total of `counts`."""
    total_spe = sum(counts[branch] for branch in counts)
    # in some cases the number of tfbs will not equal to leaf number
    if total_spe != len(poslist):
        warning(
            "The program think the number of TFBS (eg. %s) in target species is not realistic. Sample %d TFBS from total %d TFBS"
            % (poslist[0], total_spe, len(poslist)))
        kept = sorted(random.sample(xrange(len(poslist)), total_spe))
        poslist = [poslist[i] for i in kept]
    return poslist


def _ages_by_score_rank(scores, lineage, counts):
    """Give each position the branch whose slot matches its score rank.

    Branch names are repeated `counts[name]` times in leaf-to-root order,
    and the position with the k-th smallest score receives the k-th name.
    """
    rank = {}
    for position, index in enumerate(np.argsort(np.array(scores))):
        rank[index] = position
    slots = []
    for name in lineage:
        if name in counts:
            slots.extend([name] * counts[name])
    return [slots[rank[nn]] for nn in range(len(scores))]


def retrieve_boo(boo_list, spe, out_filename, mode, phyloP_filename,
                 maf_folder, tree):
    """Write the age (branch of origin) of each binding site; main species is spe.

    Parameters:
        boo_list: iterable of tables with a 'pos' list of "chrom:start-stop"
            strings and a 'count' dict mapping branch name to a count.
        spe: name of the leaf in `tree` for the main species.
        out_filename: path handed to WriteToFile for the output.
        mode: "simple" (repeat each branch by its count), "phyloP" (order
            sites by summed phyloP score) or "maf" (order sites by the
            score from the MAF blocks).
        phyloP_filename: bigWig of phyloP scores; required for "phyloP".
        maf_folder: folder of MAF files; required for "maf".
        tree: species tree providing get_leaves_by_name and leaf iteration.

    Each output line is "chrom\tstart\tstop\tbranch". The process exits with
    status 1 on an unknown mode, a missing required input, or when the
    number of ages differs from the number of positions.
    """
    out = WriteToFile(out_filename)
    if mode == "phyloP":
        if phyloP_filename is None:
            error(
                "outorder method is phyloP, a bigwig phyloP file must be provided"
            )
            exit(1)
        phyloP_bw = BigWigFile(open(phyloP_filename))
        lineage = _lineage_names(tree, spe)
    elif mode == "maf":
        if maf_folder is None:
            error("outrder method is maf, maf file folder must be provided")
            exit(1)
        maf_block = MafFile(spe, maf_folder)
        lineage = _lineage_names(tree, spe)
        # get spe_list
        spe_list = [leaf.name for leaf in tree]
    elif mode == "simple":
        # "simple" needs no extra input. It used to fall into the
        # "Unknown mode" branch here, which made the simple branch in the
        # loop below unreachable.
        pass
    else:
        print >> sys.stderr, "Unknown mode: %s" % (mode)
        exit(1)

    for boo_table in boo_list:
        poslist = boo_table['pos']
        counts = boo_table['count']
        agelist = []
        if len(counts) == 0:
            warning(
                "The program think the number of TFBS (eg. %s) in target species is not realistic. Skip it."
                % (poslist[0]))
            continue

        if mode == "simple":
            for branch in counts.keys():
                agelist.extend([branch] * counts[branch])
        elif mode == "phyloP":
            poslist = _match_positions_to_count(poslist, counts)
            scores = []
            for region in poslist:
                region_pos = region.replace(':', '-').split('-')
                chrom = region_pos[0]
                start = int(region_pos[1])
                stop = int(region_pos[2])
                array = phyloP_bw.get_as_array(chrom, start, stop)
                if array is None:
                    # No data returned for this region: score it like a
                    # region that is entirely NaN.
                    scores.append(0.0)
                    continue
                array[np.isnan(array)] = 0.0
                scores.append(sum(array))
            agelist = _ages_by_score_rank(scores, lineage, counts)
        elif mode == "maf":
            poslist = _match_positions_to_count(poslist, counts)
            scores = []
            for region in poslist:
                region_pos = region.replace(':', '-').split('-')
                chrom = region_pos[0]
                start = int(region_pos[1])
                stop = int(region_pos[2])
                scores.append(
                    maf_block.score(chrom, start, stop, spe, spe_list))
            agelist = _ages_by_score_rank(scores, lineage, counts)
        else:
            error("Unknown outorder method%s" % (mode))
            exit(1)

        if len(poslist) != len(agelist):
            error("motif count is not equal to boo count")
            exit(1)

        for pos, age in zip(poslist, agelist):
            chrom, start, stop = pos.replace(':', '-').split('-')
            print >> out, chrom + '\t' + start + '\t' + stop + '\t' + age
def _scan_strand_chromosome(bw_obj, chr_name, chr_size, strand, options,
                            chrom_sizes, peak_roots, peak_height, ranges):
    """Record area, height and interval for every covered base of one chromosome."""
    progress = 0
    coord = 0
    #for each chunk
    #cut chrom into bins, interval such as ('chr1', 235000000, 236000000)
    for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size):
        for val in bw_obj.get_as_array(interval[0],interval[1],interval[2]):
            coord += 1  #coord is 1-based on genome
            if numpy.isnan(val):
                continue
            area_value = sum_bwfile(chr_name, coord, options.extention_size, bw_obj, chrom_sizes)
            #key is chrom + position + strand, value is area
            key = chr_name + "\t" + str(coord) + "\t" + strand
            peak_roots[key] = area_value
            peak_height[key] = val
            if chr_name not in ranges:
                ranges[chr_name] = IntervalTree()
            # Insert every position, including the first one seen on this
            # chromosome (it used to be dropped when the tree was created).
            ranges[chr_name].insert_interval( Interval( coord-1, coord, value=area_value) )
        finish_part = int(interval[2]*100/chr_size)
        if finish_part > progress:
            print >>sys.stderr, " %d%% finished\r" % (finish_part),
            progress = finish_part


def _write_significant_peaks(OUT, peak_roots, peak_height, ranges, options,
                             pv_cutoff):
    """Score every position, merge nearby peaks and write those passing the cutoff."""
    peak_pvalue = {}
    for k in peak_roots:
        chrom = k.split("\t")[0]
        coord = int(k.split("\t")[1])
        peak_pvalue[k] = cal_poisson_pvalue(int(peak_roots[k]), coord-1, coord, ranges[chrom],options.window_size,options.bg_root_num)

    peak_filtered = merge_peaks(peak_height,fuzziness=options.fuzzy_size)
    for k,v in peak_filtered.items():
        (chrom,end,strand) = k.split('\t')
        end = int(end)
        start = end -1
        height = str(v)
        area = str(peak_roots[k])
        pvalue = peak_pvalue[k]
        # pvalue is compared against -10*log10(cutoff), so larger is better.
        if pvalue < pv_cutoff:
            continue
        print >>OUT, '\t'.join([chrom, str(start), str(end), area,str(round(pvalue)),strand,height])


def main():
    """Call single-nucleotide peaks from forward and reverse strand bigWig files.

    For each strand, every position with signal gets a peak height (the
    signal), a peak area (sum_bwfile over the extension window) and a score
    from cal_poisson_pvalue. Peaks are merged with merge_peaks, and those
    scoring at least -10*log10(pvalue_cutoff) are written to
    "<out-prefix>.single_nt_peak.xls" as
    chrom, start, end, area, rounded score, strand, height.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-b","--forward",action="store",type="string",dest="forward_bw",help="BigWig file for forward reads (extend 1 nt from 5' end of read)")
    parser.add_option("-d","--reverse",action="store",type="string",dest="reverse_bw",help="BigWig file for reverse reads (extend 1 nt from 5' end of read)")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files")
    parser.add_option("-z","--fuzziness",action="store",type="int",dest="fuzzy_size",default=10,help="Peaks within fuzzy window will be merged. default=%default (bp)")
    parser.add_option("-w","--bgw",action="store",type="int",dest="window_size",default=200,help="Background window size used to determine background signal level (lambda in Poisson model). default=%default (bp)")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    parser.add_option("-p","--pvalue",action="store",type="float",dest="pvalue_cutoff",default=0.1,help="Pvalue cutoff for peak detection. default=%default")
    parser.add_option("-r","--bg-root-num",action="store",type="float",dest="bg_root_num",default=100,help="Background peak root number. default=%default")
    parser.add_option("-e","--extention",action="store",type="int",dest="extention_size",default=5,help="Window size used to calculate peak area. Larger number will signficantly reduce speed, and make peak calling more meaningless. default=%default")
    (options,args)=parser.parse_args()

    if not (options.output_prefix and options.chromSize and options.forward_bw and options.reverse_bw):
        parser.print_help()
        sys.exit(0)
    for file in (options.chromSize,options.forward_bw,options.reverse_bw):
        if not os.path.exists(file):
            print >>sys.stderr, '\n\n' + file + " does NOT exists" + '\n'
            sys.exit(0)

    chrom_sizes = load_chromsize(options.chromSize)
    OUT = open(options.output_prefix + ".single_nt_peak.xls",'w')
    fw_bw_obj = BigWigFile( file = open(options.forward_bw))
    rv_bw_obj = BigWigFile( file = open(options.reverse_bw))

    rv_peak_roots = {}
    rv_peak_height = {}
    rv_ranges={}
    pv_cutoff = -10*math.log10(options.pvalue_cutoff)

    signal.signal(signal.SIGINT, signal_handler)
    print >>sys.stderr, logo

    #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    #calculate peak height and peak area for forward bigwig
    print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.forward_bw + ' ...'
    for chr_name, chr_size in chrom_sizes.items():  #iterate each chrom
        # Forward-strand state is rebuilt for every chromosome, so scoring
        # and output happen inside this loop.
        fw_peak_roots = {}  #key is chr,pos,strand: ("chr19 51345387 +"), value is area
        fw_peak_height = {}
        fw_ranges={}
        # NOTE(review): only chrY is processed. This looks like a leftover
        # debugging filter; confirm before removing it.
        if chr_name != 'chrY':
            continue
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
        _scan_strand_chromosome(fw_bw_obj, chr_name, chr_size, "+", options,
                                chrom_sizes, fw_peak_roots, fw_peak_height,
                                fw_ranges)
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.forward_bw + ' ...'
        _write_significant_peaks(OUT, fw_peak_roots, fw_peak_height,
                                 fw_ranges, options, pv_cutoff)

    #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    #calculate peak height and peak area for reverse bigwig
    print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + options.reverse_bw + ' ...'
    for chr_name, chr_size in chrom_sizes.items():  #iterate each chrom
        # NOTE(review): same chrY-only filter as the forward pass.
        if chr_name != 'chrY':
            continue
        print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Processing " + chr_name + " ..."
        _scan_strand_chromosome(rv_bw_obj, chr_name, chr_size, "-", options,
                                chrom_sizes, rv_peak_roots, rv_peak_height,
                                rv_ranges)

    # Reverse-strand state accumulates across chromosomes, so it is scored
    # and written once, after the loop.
    print >>sys.stderr, "@" + strftime("%Y-%m-%d %H:%M:%S") + ": Calculating pvalues for " + options.reverse_bw + ' ... '
    _write_significant_peaks(OUT, rv_peak_roots, rv_peak_height, rv_ranges,
                             options, pv_cutoff)
    OUT.close()
#make a matrix of the data allData = np.empty([totalLength,len(IDs)]); if args.eliminateMissing>0: keepThese = np.ones([totalLength]).astype(bool); # onlt those for which data was observed in all tracks for i in range(0,len(IDs)): #input GB tracks curBW = BigWigFile(open(files[i])) curTot = 0; if args.verbose>1: sys.stderr.write("Inputting data for %s.\n"%(IDs[i])); for chr in chrOrder: if args.verbose>1: sys.stderr.write(" Inputting data for %s.\n"%(chr)); if args.verbose>2: sys.stderr.write(" Getting data from BW.\n"); values = curBW.get_as_array( chr, 0, chromSizes[chr] ) if values is None: sys.stderr.write("%s is missing %s... skipping it for all\n"%(IDs[i],chr)); chrOrder.remove(chr) allData = np.delete(allData, [range(curTot, (curTot+np.sum(useThese[chr])))],0); if args.eliminateMissing>0: keepThese = np.delete(keepThese, [range(curTot, (curTot+np.sum(useThese[chr])))],0); totalLength = totalLength - np.sum(useThese[chr]); del useThese[chr] del chromSizes[chr] continue if args.verbose>2: sys.stderr.write(" Checking for missing data.\n"); if args.eliminateMissing>0: #keepThese[curTot:(curTot+sum(useThese[chr]))] = np.logical_and(keepThese[curTot:(curTot+sum(useThese[chr]))], np.logical_not(np.isnan( values ))[useThese[chr]]); keepThese[np.add(curTot,np.nonzero(np.isnan( values[useThese[chr]])))] = False; #print(np.add(curTot,np.nonzero(np.isnan( values[useThese[chr]]))))
Create a site profile vector showing the average signal accumulated from a bigwig file around the center of each interval from a BED file. Output is the average signal value at that relative position across the intervals. usage: %prog bigwig_file.bw padding < bed_file.bed """ import sys from numpy import * from bx.intervals.io import GenomicIntervalReader from bx.bbi.bigwig_file import BigWigFile bw = BigWigFile( open( sys.argv[1] ) ) padding = int( sys.argv[2] ) totals = zeros( padding*2, dtype=float64 ) valid = zeros( padding*2, dtype=int32 ) for interval in GenomicIntervalReader( sys.stdin ): center = floor( ( interval.start + interval.end ) / 2 ) values = bw.get_as_array( interval.chrom, center - padding, center + padding ) # Determine which positions had data and mask the rest for totalling invalid = isnan( values ) values[ invalid ] = 0 totals += values valid += ( ~ invalid ) savetxt( sys.stdout, totals/valid )
def main(args):
    """Open the bigWig named by ``args.bigWigFile`` and query one region.

    The region comes from the names ``chrom``, ``st`` and ``end``, which are
    not defined in this function, and the returned array is discarded.
    """
    handle = open(args.bigWigFile)
    bigwig = BigWigFile(handle)
    bigwig.get_as_array(chrom, st, end)
def coverageGeneBody_bigwig(bigFile,refbed,outfile,gtype="png"):
    '''Calculate reads coverage over gene body, from 5'to 3'. each gene will
    be equally divided into 100 regsions. bigFile is bigwig format file.

    refbed is a 12-column BED gene model. Genes whose exons total fewer
    than 100 bases are skipped. Two files are written:
    outfile + ".geneBodyCoverage.txt" (percentile and summed signal) and
    outfile + ".geneBodyCoverage_plot.r" (an R script plotting the average,
    with gtype as the graphics device and file extension).'''
    if refbed is None:
        print >>sys.stderr,"You must specify a bed file representing gene model\n"
        exit(0)
    OUT1 = open(outfile + ".geneBodyCoverage_plot.r",'w')
    OUT2 = open(outfile + ".geneBodyCoverage.txt",'w')

    bw = BigWigFile( file = open(bigFile) )
    print >>sys.stderr, "calculating coverage over gene body ..."
    coverage=collections.defaultdict(int)
    gene_count = 0
    with open(refbed,'r') as gene_models:
        for line in gene_models:
            try:
                if line.startswith(('#','track','browser')):continue
                gene_count += 1
                # Parse fields from gene tabls
                fields = line.split()
                chrom = fields[0]
                tx_start = int( fields[1] )
                strand = fields[5]
                # BED12: column 12 holds block starts relative to tx_start,
                # column 11 holds block sizes.
                exon_starts = [ int(x) + tx_start for x in fields[11].rstrip( ',\n' ).split( ',' ) ]
                exon_sizes = [ int(x) for x in fields[10].rstrip( ',\n' ).split( ',' ) ]
                exon_ends = [ st + size for st, size in zip(exon_starts, exon_sizes) ]
            except (IndexError, ValueError):
                print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line,
                continue

            gene_all_base=[]
            for st,end in zip(exon_starts,exon_ends):
                gene_all_base.extend(range(st+1,end+1))
            # Judge the length on the whole transcript. Checking inside the
            # exon loop rejected any gene whose leading exons were short.
            if len(gene_all_base) <100:
                continue
            # Order bases 5'->3' so percentiles run along the transcript.
            gene_all_base.sort(reverse=(strand == '-'))
            #get 101 points from each gene's coordinates
            percentile_base = mystat.percentile_list (gene_all_base)

            for i, base in enumerate(percentile_base):
                sig = bw.get_as_array(chrom,base-1,base)
                if sig is None:continue
                coverage[i] += np.nan_to_num(sig[0])
            print >>sys.stderr, " %d genes finished\r" % gene_count,

    y_coord=[]
    print >>OUT2, "percentile\tcount"
    # Sorted so both output files list percentiles in increasing order.
    for i in sorted(coverage):
        y_coord.append(str(coverage[i]))
        print >>OUT2, str(i) + '\t' + str(coverage[i])
    print >>OUT1, "%s(\'%s\')" % (gtype, outfile + ".geneBodyCoverage." + gtype)
    print >>OUT1, "x=0:100"
    print >>OUT1, "y=c(" + ','.join(y_coord) + ')'
    # NOTE(review): gene_count includes genes skipped above, as before.
    print >>OUT1, "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')" % gene_count
    print >>OUT1, "dev.off()"
    OUT1.close()
    OUT2.close()
def main():
    """Scale a bigWig file to a fixed total signal and write it as wiggle.

    The total signal ("wigsum") is summed either over the merged exons of
    the optional reference gene model, or over every chromosome listed in
    the chromosome size file. Every non-zero position is then multiplied by
    ``total_wigsum / wigsum`` and written in variableStep format.

    Exits with status 0 after printing help when a required option is
    missing, and with status 1 when the measured wigsum is zero.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--bwfile", action="store", type="string", dest="BigWig_File",
        help="Input BigWig file. [required]"
    )
    parser.add_option(
        "-o", "--output", action="store", type="string", dest="output_wig",
        help="Output wig file. [required]"
    )
    parser.add_option(
        "-s", "--chromSize", action="store", type="string", dest="chromSize",
        help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]",
    )
    parser.add_option(
        "-t", "--wigsum", action="store", type="int", dest="total_wigsum",
        default=100000000,
        help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]",
    )
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="refgene_bed",
        help="Reference gene model in bed format. [optional]",
    )
    parser.add_option(
        "-c", "--chunk", action="store", type="int", dest="chunk_size",
        default=100000,
        help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]",
    )
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    OUT = open(options.output_wig, "w")
    bw = BigWigFile(file=open(options.BigWig_File))
    chrom_sizes = load_chromsize(options.chromSize)
    exons = []
    WIG_SUM = 0.0

    if options.refgene_bed:
        print >>sys.stderr, "Extract exons from " + options.refgene_bed
        obj = BED.ParseBED(options.refgene_bed)
        exons = obj.getExon()
        print >>sys.stderr, "Merge overlapping exons ..."
        exons = BED.unionBed3(exons)
        print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + " only"
        for chrom, st, end in exons:
            # Probe one base: chromosomes the bigWig cannot serve are skipped.
            try:
                bw.get_as_array(chrom, 0, 1).size
            except Exception:
                continue
            bw_signal = bw.get_as_array(chrom, st, end)
            # nan will be ignored. but if all items are 'nan', the result
            # summay is 'nan' NOT 0
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            WIG_SUM += tmp
        print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM
    else:
        print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File
        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            try:
                bw.get_as_array(chr_name, 0, 1).size
            except Exception:
                print >>sys.stderr, "Skip " + chr_name + "!"
                continue
            print >>sys.stderr, "Processing " + chr_name + " ..."
            for interval in BED.tillingBed(
                chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size
            ):
                bw_signal = bw.get_as_array(interval[0], interval[1], interval[2])
                tmp = numpy.nansum(bw_signal)
                if numpy.isnan(tmp):
                    continue
                WIG_SUM += tmp
        print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM

    # A zero wigsum would make the scaling factor undefined; report it and
    # stop instead of failing with an unrelated error.
    if WIG_SUM == 0:
        print >>sys.stderr, "Error, WIG_SUM cannot be 0"
        OUT.close()
        sys.exit(1)
    weight = options.total_wigsum / WIG_SUM

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    print >>sys.stderr, "Normalizing bigwig file, output wiggle file"
    for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
        try:
            bw.get_as_array(chr_name, 0, 1).size
        except Exception:
            print >>sys.stderr, "Skip " + chr_name + "!"
            continue

        print >>sys.stderr, "Writing " + chr_name + " ..."
        OUT.write("variableStep chrom=" + chr_name + "\n")
        for interval in BED.tillingBed(
            chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size
        ):
            # coord becomes the 1-based position of each value in the chunk.
            coord = interval[1]
            bw_signal = bw.get_as_array(chr_name, interval[1], interval[2])
            tmp = numpy.nansum(bw_signal)
            if numpy.isnan(tmp):
                continue
            bw_signal = numpy.nan_to_num(bw_signal)
            for v in bw_signal:
                coord += 1
                if v != 0:
                    print >>OUT, "%d\t%.4f" % (coord, v * weight)
    OUT.close()
class BigWig(object):
    """Wrapper around a bigWig file that also exposes chromosome sizes.

    `sizes` maps chromosome name to length and is parsed directly from the
    file's header and chromosome B+ tree. Data queries are delegated to a
    BigWigFile opened on the same path.
    """

    def __init__(self, filename):
        self.filename = filename
        self.determine_sizes()
        # bigWig is a binary format, so the data handle is opened as bytes.
        self.bwf = BigWigFile(open(filename, "rb"))

    def determine_sizes(self):
        """Fill `self.sizes` and the header attributes from the file.

        Raises IOError when the file does not start with the bigWig magic
        number, and ValueError when the chromosome tree magic is wrong.
        """
        self.sizes = {}
        with open(self.filename, "rb") as fh:
            # read magic number to guess endianness
            magic = fh.read(4)
            if magic == b'&\xfc\x8f\x88':
                endianness = '<'
            elif magic == b'\x88\x8f\xfc&':
                endianness = '>'
            else:
                raise IOError("The file is not in bigwig format")

            # read the header
            info = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
            self.version = info[0]
            self.zoom_levels = info[1]
            self.chromosome_tree_offset = info[2]
            self.full_data_offset = info[3]
            self.full_index_offset = info[4]
            self.field_count = info[5]
            self.defined_field_count = info[6]
            self.auto_SQL_offset = info[7]
            self.total_summary_offset = info[8]
            self.uncompress_buf_size = info[9]

            # go to the data
            fh.seek(self.chromosome_tree_offset)

            # read magic again
            magic = fh.read(4)
            if magic == b'\x91\x8c\xcax':
                endianness = '<'
            elif magic == b'x\xca\x8c\x91':
                endianness = '>'
            else:
                raise ValueError("Wrong magic for this bigwig data file")

            info2 = struct.unpack(endianness + 'IIIQQ', fh.read(28))
            self.block_size = info2[0]
            self.key_size = info2[1]
            self.val_size = info2[2]
            self.item_count = info2[3]

            # Leaf items are (name, chrom id, chrom size); items of an
            # internal node are (name, file offset of the child node). Both
            # are key_size + 8 bytes long.
            leaf_format = endianness + str(self.key_size) + 'sII'
            node_format = endianness + str(self.key_size) + 'sQ'
            item_size = self.key_size + 8

            # Walk the whole tree: with many chromosomes the root is an
            # internal node and the sizes live in its descendants.
            pending = [fh.tell()]
            at_root = True
            while pending:
                fh.seek(pending.pop())
                is_leaf, _, count = struct.unpack(endianness + 'BBH',
                                                  fh.read(4))
                if at_root:
                    self.is_leaf = is_leaf
                    self.count = count
                    at_root = False
                for _ in range(count):
                    raw = fh.read(item_size)
                    if not is_leaf:
                        pending.append(struct.unpack(node_format, raw)[1])
                        continue
                    key, chrom_id, chrom_size = struct.unpack(leaf_format,
                                                              raw)
                    # Names are NUL-padded to key_size bytes.
                    key = key.replace(b'\x00', b'')
                    if not isinstance(key, str):
                        key = key.decode('latin-1')
                    self.sizes[key] = chrom_size

    def get_as_array(self, chrom, start, end):
        """Delegate to BigWigFile.get_as_array."""
        return self.bwf.get_as_array(chrom, start, end)

    def get(self, chrom, start, end):
        """Delegate to BigWigFile.get."""
        return self.bwf.get(chrom, start, end)

    def query(self, chrom, start, end, number):
        """Delegate to BigWigFile.query."""
        return self.bwf.query(chrom, start, end, number)
if args.inFile2 is not None: inFile2 = BedReader(open(args.inFile2)) else: raise Exception("Unrecognized format!"); for locus in scanThese: if args.verbose>0: print("Scanning %s"%(locus[GENOMEDATA.NAME])) stF = max(locus[GENOMEDATA.ST] - padding,0); enF = locus[GENOMEDATA.EN] + padding + inclusive; try: if (locus[GENOMEDATA.STR]=="-" and args.inFile2 is not None): if args.format=="BIGWIG" or args.format=="BW" or args.format=="BIGBED" or args.format=="BB": values = inFile2.get_as_array( locus[GENOMEDATA.CHR], stF, enF ) else: if args.format=="BIGWIG" or args.format=="BW" or args.format=="BIGBED" or args.format=="BB": values = inFile1.get_as_array( locus[GENOMEDATA.CHR], stF, enF ) except OverflowError as e: sys.stderr.write("OverflowError at '%s'; st=%d, en=%d\n"%(locus[GENOMEDATA.NAME],locus[GENOMEDATA.ST],locus[GENOMEDATA.EN])); raise(e); if values is None and args.correctChr>0: #try again adding chr or taking it away if locus[GENOMEDATA.CHR][:3]=="chr": locus[GENOMEDATA.CHR] = locus[GENOMEDATA.CHR][3:] else: locus[GENOMEDATA.CHR]="chr"+ locus[GENOMEDATA.CHR]; try: if (locus[GENOMEDATA.STR]=="-" and args.inFile2 is not None): if args.format=="BIGWIG" or args.format=="BW" or args.format=="BIGBED" or args.format=="BB":
# Combine two bigWig tracks chunk by chunk with applyFunction, write the
# result as a gzipped fixedStep wig, then convert it with wigToBigWig.
curBW1 = BigWigFile(open(args.inBW1))
curBW2 = BigWigFile(open(args.inBW2))
outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w");
outStream.write("track type=wiggle_0\n")
for chr in chromSizes.keys():
    last = 0;
    final = chromSizes[chr];
    # A fixedStep header is needed at the start of the chromosome and again
    # after any skipped chunk, so later values keep their true coordinates.
    needHeader = True;
    sys.stderr.write("Outputting data for %s:\n"%(chr));
    while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once
        # Compute the chunk end before reporting it; it used to be read
        # before its first assignment.
        curLast = np.min([last+args.chunks,final]);
        if args.verbose>0: sys.stderr.write(" Section %i - %i:\n"%(last,curLast));
        values1 = curBW1.get_as_array( chr, last, curLast )
        values2 = curBW2.get_as_array( chr, last, curLast )
        if values1 is not None and values2 is not None:
            values = applyFunction(values1,values2,args.function);
            if needHeader:
                # wig coordinates are 1-based; the chunk starts at 0-based `last`.
                outStream.write("fixedStep chrom=%s start=%i step=1\n"%(chr,last+1))
                needHeader = False;
            outStream.write("\n".join(map(str,values)));
            outStream.write("\n");
        else:
            # This chunk is missing from one of the inputs.
            needHeader = True;
        outStream.flush();
        last=curLast;
outStream.close();
toBW = subprocess.Popen(["wigToBigWig","%s.wig.gz"%(args.outFPre),args.chrsFile,"%s.bw"%(args.outFPre)])
or left_or_right == 'r' and strand == '-'): # 5' site if (chrom, coordinate) in annotated_5p: fivep_splice_site_counts = ( annotated_fivep_splice_site_counts ) line_counts = annotated_line_counts else: fivep_splice_site_counts = ( unannotated_fivep_splice_site_counts ) line_counts = unannotated_line_counts if strand == '+': bwvals = bw.get_as_array( chrom, coordinate - args.extension, coordinate + args.extension ) if bwvals is None: continue for i, j in enumerate( xrange(-args.extension, args.extension) ): if not math.isnan(bwvals[i]): fivep_splice_site_counts[j] += bwvals[i] line_counts[j] += 1 elif strand == '-': bwvals = bw.get_as_array( chrom, coordinate - (args.extension - 1), coordinate + (args.extension + 1)
# Apply every function in allFunctions to a bigWig track chunk by chunk and
# write the result as a gzipped fixedStep wig. Each chunk is read with up to
# additionalFlankSize extra bases on both sides, and the flanks are trimmed
# off again before writing.
curBW = BigWigFile(open(args.inBW))
outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w");
outStream.write("track type=wiggle_0\n")
for chr in chromSizes.keys():
    last = 0;
    final = chromSizes[chr];
    # A fixedStep header is needed at the start of the chromosome and again
    # after any skipped chunk, so later values keep their true coordinates.
    needHeader = True;
    sys.stderr.write("Outputting data for %s:\n"%(chr));
    while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once
        # Compute the chunk end before reporting it; it used to be read
        # before its first assignment.
        curLast = np.min([last+args.chunks,final]);
        if args.verbose>0: sys.stderr.write(" Section %i - %i:\n"%(last,curLast));
        curEnd = np.min([curLast+additionalFlankSize, final]);
        curSt = np.max([last-additionalFlankSize,0]);
        values = curBW.get_as_array( chr, curSt, curEnd )
        if values is not None:
            for f in allFunctions:
                values = applyFunction(values,f);
            # set them only to the middle part of this data so that the additionalFlankSize regions are not output.
            values = values[(last - curSt):(curLast-last + (last-curSt))];
            if needHeader:
                # wig coordinates are 1-based; the chunk starts at 0-based `last`.
                outStream.write("fixedStep chrom=%s start=%i step=1\n"%(chr,last+1))
                needHeader = False;
            outStream.write("\n".join(map(str,values)));
            outStream.write("\n");
        else:
            # This chunk is missing from the input.
            needHeader = True;
        outStream.flush();
        last=curLast;
outStream.close();
def summarize(self, interval, bins=None, method='summarize',
              function='mean', zero_inf=True, zero_nan=True):
    """
    Return the bigWig signal over `interval`.

    Parameters
    ----------

    interval : object
        Object with chrom (str), start (int) and stop (int) attributes.

    bins : int or None
        Number of bins; if None, bins will be the length of the interval

    method : summarize | ucsc_summarize | get_as_array
        "summarize" and "get_as_array" use bx-python; "ucsc_summarize" uses
        bigWigSummarize. See other notes in docstring for
        metaseq.array_helpers._local_coverage. If None, defaults to
        "summarize".

    function : sum | mean | min | max | std | coverage
        Determines the nature of the summarized values. Ignored if
        `method="get_as_array"` (or if `bins` is None); "coverage" is only
        valid if method is "ucsc_summarize" and "sum" only if method is
        "summarize".

    zero_inf, zero_nan : bool
        If `zero_inf` is True, set any inf or -inf to zero before returning.
        If `zero_nan` is True, set any nan values to zero before returning.

    Returns
    -------
    For the bx-python methods, a NumPy array with one value per base (when
    `bins` is None or `method="get_as_array"`) or one value per bin; an
    all-zero array of that length is returned when bx-python reports no data
    for the interval. For "ucsc_summarize", whatever `self.ucsc_summarize`
    returns.

    Raises
    ------
    ValueError
        If `method` is unknown or `function` is not supported by the chosen
        method.
    """
    if method is None:
        method = 'summarize'

    # We may be dividing by zero in some cases, which raises a warning in
    # NumPy based on the IEEE 754 standard (see
    # http://docs.scipy.org/doc/numpy/reference/generated/
    # numpy.seterr.html)
    #
    # That's OK -- we're expecting that to happen sometimes. So temporarily
    # disable this error reporting for the duration of this method. The
    # context manager restores the caller's settings on every exit path
    # (normal return, early return and exceptions).
    with np.errstate(invalid='ignore'):
        if (bins is None) or (method == 'get_as_array'):
            # Close the file as soon as the query is done; bigWig is a
            # binary format so it is opened in binary mode.
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).get_as_array(
                    interval.chrom,
                    interval.start,
                    interval.stop,
                )
            if s is None:
                s = np.zeros((interval.stop - interval.start, ))
            else:
                if zero_nan:
                    s[np.isnan(s)] = 0
                if zero_inf:
                    s[np.isinf(s)] = 0

        elif method == 'ucsc_summarize':
            if function in ['mean', 'min', 'max', 'std', 'coverage']:
                return self.ucsc_summarize(interval, bins, function=function)
            raise ValueError(
                'function "%s" not supported by UCSC\'s bigWigSummary'
                % function)

        elif method == 'summarize':
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).summarize(
                    interval.chrom,
                    interval.start,
                    interval.stop,
                    bins,
                )
            if s is None:
                s = np.zeros((bins, ))
            elif function == 'sum':
                s = s.sum_data
            elif function == 'mean':
                # Bins without valid values give 0/0 -> nan.
                s = s.sum_data / s.valid_count
                if zero_nan:
                    s[np.isnan(s)] = 0
            elif function == 'min':
                s = s.min_val
                if zero_inf:
                    s[np.isinf(s)] = 0
            elif function == 'max':
                s = s.max_val
                if zero_inf:
                    s[np.isinf(s)] = 0
            elif function == 'std':
                # NOTE(review): this is the mean of squares
                # (sum_squares / valid_count), not a standard deviation.
                # Left unchanged so existing callers see the same values --
                # confirm the intended formula.
                s = (s.sum_squares / s.valid_count)
                if zero_nan:
                    s[np.isnan(s)] = 0
            else:
                raise ValueError(
                    'function "%s" not supported by bx-python' % function)

        else:
            raise ValueError(
                "method '%s' not in [summarize, ucsc_summarize, get_as_array]"
                % method)

    return s
def _chrom_in_bigwig(bw, chrom):
    """Return True if a 1-bp probe query on `chrom` yields an array from `bw`."""
    try:
        return bw.get_as_array(chrom, 0, 1) is not None
    except Exception:
        # Best-effort probe: a chromosome that cannot be queried is skipped
        # by the callers rather than aborting the whole run.
        return False


def _interval_wigsum(bw, chrom, start, end):
    """Return the NaN-ignoring signal sum over [start, end), or 0.0 if unusable."""
    signal = bw.get_as_array(chrom, start, end)
    if signal is None:
        return 0.0
    total = numpy.nansum(signal)
    # Some NumPy versions return nan (not 0) when every item is nan.
    if numpy.isnan(total):
        return 0.0
    return total


def _wigsum_over_exons(bw, refgene_bed):
    """Return the wigsum restricted to the merged exons of `refgene_bed`."""
    sys.stderr.write("Extract exons from " + refgene_bed + "\n")
    exons = BED.ParseBED(refgene_bed).getExon()
    sys.stderr.write("Merge overlapping exons ...\n")
    exons = BED.unionBed3(exons)
    sys.stderr.write("Calculate wigsum covered by " + refgene_bed + ' only\n')
    total = 0.0
    present = {}  # chrom -> probe result, so each chromosome is probed once
    for chrom, st, end in exons:
        if chrom not in present:
            present[chrom] = _chrom_in_bigwig(bw, chrom)
        if present[chrom]:
            total += _interval_wigsum(bw, chrom, st, end)
    sys.stderr.write("Total wigsum is %.2f\n\n" % total)
    return total


def _wigsum_genome_wide(bw, chrom_sizes, chunk_size, bigwig_name):
    """Return the wigsum over every chromosome in `chrom_sizes` found in `bw`."""
    sys.stderr.write("Calculate wigsum from " + bigwig_name + "\n")
    total = 0.0
    for chr_name, chr_size in chrom_sizes.items():
        if not _chrom_in_bigwig(bw, chr_name):
            sys.stderr.write("Skip " + chr_name + "!\n")
            continue
        sys.stderr.write("Processing " + chr_name + " ...\n")
        for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=chunk_size):
            total += _interval_wigsum(bw, interval[0], interval[1], interval[2])
    sys.stderr.write("\nTotal wigsum is %.2f\n\n" % total)
    return total


def _scaled_signal(bw, chrom, start, end, weight):
    """Return the signal over [start, end) with nan -> 0, times `weight`; None if unusable."""
    signal = bw.get_as_array(chrom, start, end)
    if signal is None or numpy.isnan(numpy.nansum(signal)):
        return None
    return numpy.nan_to_num(signal) * weight


def _write_wig_chunk(out, start, signal):
    """Write the non-zero values of `signal` as variableStep lines (1-based positions)."""
    coord = start
    for v in signal:
        coord += 1
        if v != 0:
            out.write("%d\t%.2f\n" % (coord, v))


def _write_bedgraph_chunk(out, chrom, start, signal):
    """Write runs of equal, non-zero adjacent values of `signal` as bedGraph lines."""
    run_start = None
    run_value = None
    pos = start  # 0-based coordinate of the value being examined
    for v in signal:
        if run_start is not None and v != run_value:
            out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')
            run_start = None
        if run_start is None and v != 0:
            run_start = pos
            run_value = v
        pos += 1
    if run_start is not None:
        out.write(chrom + '\t' + str(run_start) + '\t' + str(pos) + '\t' + str(run_value) + '\n')


def main():
    """Normalize a bigWig file to a fixed total wigsum and write it as wig or bedGraph.

    The wigsum is computed either over the merged exons of the optional
    reference gene model or, without one, over every chromosome listed in the
    chromosome size file. Every signal value is then multiplied by
    (requested wigsum / measured wigsum) and the non-zero results are written
    in the requested format. Exits with status 1 when the output format is
    unknown or the measured wigsum is 0.
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile", action="store", type="string", dest="BigWig_File", help="Input BigWig file. [required]")
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file. [required]")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]")
    parser.add_option("-t", "--wigsum", action="store", type="int", dest="total_wigsum", default=100000000, help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in bed format. [optional]")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=500000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]")
    parser.add_option("-f", "--format", action="store", type="string", dest="out_format", default="bgr", help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default")
    (options, args) = parser.parse_args()

    if not (options.BigWig_File and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)

    # Reject a bad format before the (slow) wigsum pass and before the output
    # file is truncated.
    out_format = options.out_format.upper()
    if out_format not in ("WIG", "BGR"):
        sys.stderr.write("unknown output format\n")
        sys.exit(1)

    # Both handles are closed on every exit path, including sys.exit().
    with open(options.output_wig, 'w') as OUT, open(options.BigWig_File, 'rb') as bw_handle:
        bw = BigWigFile(file=bw_handle)
        chrom_sizes = load_chromsize(options.chromSize)

        if options.refgene_bed:
            WIG_SUM = _wigsum_over_exons(bw, options.refgene_bed)
        else:
            WIG_SUM = _wigsum_genome_wide(bw, chrom_sizes, options.chunk_size, options.BigWig_File)

        # Checked explicitly: a NumPy float zero would turn the division into
        # inf (with a warning) instead of raising ZeroDivisionError.
        if WIG_SUM == 0:
            sys.stderr.write("Error, WIG_SUM cannot be 0\n")
            sys.exit(1)
        weight = options.total_wigsum / WIG_SUM

        sys.stderr.write("Normalizing bigwig file ...\n")
        for chr_name, chr_size in chrom_sizes.items():
            if not _chrom_in_bigwig(bw, chr_name):
                sys.stderr.write("Skip " + chr_name + "!\n")
                continue
            sys.stderr.write("Writing " + chr_name + " ...\n")
            if out_format == "WIG":
                OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                scaled = _scaled_signal(bw, chr_name, interval[1], interval[2], weight)
                if scaled is None:
                    continue
                if out_format == "WIG":
                    _write_wig_chunk(OUT, interval[1], scaled)
                else:
                    _write_bedgraph_chunk(OUT, chr_name, interval[1], scaled)