def getRawCoverageLargeMode(self, chrom, start, end, largeMode=10):
    """Retrieve a downsampled array with the genome coverage.

    Keyword arguments:
    chrom -- Chromosome name.
    start -- 0-based start of the window.
    end -- End of the window (exclusive).
    largeMode -- Keep every largeMode-th base of the coverage vector.

    Return:
    numpy array of length ceil((end-start)/largeMode) with the interval
    values from the bigwig painted over the window.
    """
    out = np.zeros(end - start)
    infile = BigWigFile(self.path)
    # try/finally so the bigwig handle is released even if fetch() raises
    # (the original leaked it on error).
    try:
        for wig in infile.fetch(chrom, start, end):
            # wig is (start, end, value); paint the value over its span.
            out[wig[0] - start:wig[1] - start] = wig[2]
    finally:
        infile.close()
    return out[::largeMode]
def bigwig_fetcher(bigwig, ichr, istart, iend):
    """Return the list of scores for bigwig intervals overlapping
    [istart, iend) on chromosome ichr.

    Fixes vs original: the file was opened twice (the first handle was
    leaked), and a `bw.wWigIO.close()` call sat unreachably after the
    return statement. The with-block now owns the single handle.
    """
    scores = []
    with BigWigFile(bigwig) as bw:
        for interval in bw.fetch(chrom=ichr, start=istart, stop=iend):
            scores.append(interval.score)
    return scores
def coverage_from_bigwig(self, bigwig_file, stepsize=100):
    """Return list of arrays describing the coverage of each genomicRegions from <bigwig_file>.

    *Keyword arguments:*

    - bigwig_file -- path to bigwig file
    - stepsize -- used stepsize

    *Output:*

    Class variable <coverage>: a list where the elements correspond to
    the GenomicRegion. The list elements give the number of reads
    falling into the GenomicRegion.
    """
    # On OS X (or for remote files) ngslib is unusable, so shell out to
    # UCSC's bigWigSummary binary instead.
    if platform == "darwin" or "http" in bigwig_file:
        self.coverage = []
        for gr in self.genomicRegions:
            steps = int(abs(gr.final - gr.initial) / stepsize)
            # NOTE(review): this window is shifted by -stepsize on BOTH
            # ends, while the Linux branch below uses
            # initial-stepsize/2 .. final+stepsize/2 — looks
            # inconsistent; confirm the intended coordinates.
            cmd = ["bigWigSummary", bigwig_file, gr.chrom,
                   str(gr.initial - stepsize), str(gr.final - stepsize),
                   str(steps)]
            try:
                output = subprocess.check_output(cmd, shell=False,
                                                 stderr=subprocess.STDOUT)
                # bigWigSummary prints "n/a" for empty bins; map those to 0.
                ds = [0 if "n/a" in x else float(x)
                      for x in output.strip().split()]
                self.coverage.append(np.array(ds))
            except Exception:
                # Best-effort: skip regions bigWigSummary cannot summarize.
                # (Was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.)
                continue
    # Linux platform: use ngslib's pileup directly.
    else:
        # Import deferred on purpose — ngslib does not install on OS X.
        from ngslib import BigWigFile
        self.coverage = []
        bwf = BigWigFile(bigwig_file)
        for gr in self.genomicRegions:
            # Pad the window by half a step on each side, clamped to
            # valid coordinates.
            depth = bwf.pileup(gr.chrom,
                               max(0, int(gr.initial - stepsize / 2)),
                               max(1, int(gr.final + stepsize / 2)))
            ds = [depth[d] for d in range(0, gr.final - gr.initial, stepsize)]
            self.coverage.append(np.array(ds))
        bwf.close()
def bigwig_loader(bigwig, chromInfo):
    """Load every per-base score of the bigwig into memory.

    Keyword arguments:
    bigwig -- Path to the bigwig file.
    chromInfo -- Tab-separated file of (chromosome name, size) rows.

    Side effect: appends the scores to the module-level `phylop_bigwig`
    mapping (assumes it maps chromosome name -> list — TODO confirm
    against the caller).
    """
    print >> sys.stderr, "Loading the BigWig file in RAM memory ...",
    bw = BigWigFile(bigwig)
    # Fix: the chromInfo handle was opened inline and never closed.
    chrom_info_fh = open(chromInfo)
    try:
        for row in csv.reader(chrom_info_fh, delimiter='\t'):
            chrom = row[0]  # renamed from `chr` — do not shadow the builtin
            chrom_size = int(row[1])
            for i in bw.fetch(chrom=chrom, start=0, stop=chrom_size):
                phylop_bigwig[chrom].append(i.score)
    finally:
        chrom_info_fh.close()
        bw.wWigIO.close()
    print >> sys.stderr, "OK",
def __init__(self, src_file, title='', version=None):
    """Open a BigWig datasource.

    Keyword arguments:
    src_file -- Path to the bigwig file.
    title -- Annotation title; the output column is '<title>_score'.
    version -- Datasource version, forwarded to the superclass.
    """
    # only necessary to import ngslib if instance of BigWigDatasource is created
    # This should not run on OS X machines
    from ngslib import BigWigFile
    super(BigWigDatasource, self).__init__(src_file, title=title, version=version)
    self.output_headers = [title + '_score']
    self.bigwig_fh = BigWigFile(src_file)
    # Idiom fix: startswith() already returns a bool —
    # `True if X else False` was redundant.
    self.has_chr = self.bigwig_fh.chroms[0].startswith('chr')
class BigWigDatasource(Datasource):
    """
    A datasource derived from a BigWig file.

    For variants spanning a genomic range (i.e. non SNVs), the median of
    values from the BigWig are returned.
    """
    def __init__(self, src_file, title='', version=None):
        """Open the bigwig and record whether its chromosomes use the
        'chr' prefix (used to normalize mutation coordinates later).
        """
        # only necessary to import ngslib if instance of BigWigDatasource is created
        # This should not run on OS X machines
        from ngslib import BigWigFile
        super(BigWigDatasource, self).__init__(src_file, title=title, version=version)
        self.output_headers = [title + '_score']
        self.bigwig_fh = BigWigFile(src_file)
        # Idiom fix: startswith() already returns a bool.
        self.has_chr = self.bigwig_fh.chroms[0].startswith('chr')

    def annotate_mutation(self, mutation):
        """Attach the bigwig score for the mutation's span.

        Single overlapping value -> that value; several -> their median;
        none -> None. Returns the annotated mutation.
        """
        # Match the bigwig's chromosome naming convention.
        if self.has_chr and not mutation.chr.startswith('chr'):
            chrn = 'chr' + mutation.chr
        else:
            chrn = mutation.chr
        # start - 1 because bigwig format is zero-based coords
        variant_start, variant_end = int(mutation.start) - 1, int(mutation.end)
        scores = [r[2] for r in self.bigwig_fh.fetch(chrom=chrn, start=variant_start, stop=variant_end)]
        if not scores:
            final_score = None
        elif len(scores) == 1:
            final_score = scores[0]
        else:
            final_score = np.median(scores)
        mutation.createAnnotation(self.output_headers[0], final_score, annotationSource=self.title)
        return mutation

    def close(self):
        """Release the underlying bigwig file handle."""
        self.bigwig_fh.close()
def phastCons46way_score(self, stepsize=100, phastCons46way_dir="/data/phastCons46way/"):
    """Load the phastCons46way bigwig files to fetch the scores as coverage.

    *Keyword arguments:*

    - stepsize -- used stepsize
    - phastCons46way_dir -- directory holding the per-chromosome
      <chrom>.phastCons46way.bw files (new parameter; defaults to the
      previously hard-coded path, so existing callers are unaffected)
    """
    self.coverage = []
    for gr in self.genomicRegions:
        # One bigwig per chromosome.
        bwf = BigWigFile(os.path.join(phastCons46way_dir, gr.chrom + ".phastCons46way.bw"))
        # Fix: the original closed only the LAST handle (close() was
        # outside the loop), leaking one file handle per region.
        try:
            depth = bwf.pileup(gr.chrom, gr.initial - stepsize / 2, gr.final + stepsize / 2)
            ds = []
            for i in range(0, gr.final - gr.initial):
                # Mean score over a stepsize-wide window starting at i.
                window = [depth[j] for j in range(i, i + stepsize)]
                ds.append(sum(window) / len(window))
            # Report 5'->3' with respect to the region's strand.
            if gr.orientation == "-":
                self.coverage.append(np.array(list(reversed(ds))))
            else:
                self.coverage.append(np.array(ds))
        finally:
            bwf.close()
def bigwig_mean(bigwig, chr, start, end):
    """Return the sum of interval scores in [start, end) divided by the
    window length (0 for an empty window).

    Fixes vs original: the file was opened twice (the first handle was
    leaked) and a `bw.wWigIO.close()` call was unreachable after the
    return; the with-block now owns the single handle.
    """
    score_sum = 0
    with BigWigFile(bigwig) as bw:
        for interval in bw.fetch(chrom=chr, start=start, stop=end):
            score_sum += interval.score
    if (end - start) != 0:
        return score_sum / (end - start)
    return 0
def coverage_from_bigwig(self, bigwig_file, stepsize=100):
    """Return list of arrays describing the coverage of each genomicRegions from <bigwig_file>.

    *Keyword arguments:*

    - bigwig_file -- path to bigwig file
    - stepsize -- used stepsize

    *Output:*

    Class variable <coverage>: a list where the elements correspond to
    the GenomicRegion. The list elements give the number of reads
    falling into the GenomicRegion.
    """
    # Prefer ngslib; fall back to pyBigWig when it is not installed.
    # The try body is now ONLY the import (the original wrapped the whole
    # loop, and used the Py2-only `except ImportError, e` with `e` unused).
    try:
        from ngslib import BigWigFile
        ngslib_available = True
    except ImportError:
        ngslib_available = False

    self.coverage = []
    if ngslib_available:
        bwf = BigWigFile(bigwig_file)
        for gr in self.genomicRegions:
            # Pad the window by half a step on each side, clamped to
            # valid coordinates.
            depth = bwf.pileup(gr.chrom,
                               max(0, int(gr.initial - stepsize / 2)),
                               max(1, int(gr.final + stepsize / 2)))
            ds = [depth[d] for d in range(0, gr.final - gr.initial, stepsize)]
            self.coverage.append(np.array(ds))
        bwf.close()
    else:
        import pyBigWig
        bwf = pyBigWig.open(bigwig_file)
        for gr in self.genomicRegions:
            steps = int(len(gr) / stepsize)
            # stats() yields None for empty bins; map those to 0.
            ds = bwf.stats(gr.chrom, gr.initial, gr.final, type="mean", nBins=steps)
            ds = [x if x else 0 for x in ds]
            self.coverage.append(np.array(ds))
        bwf.close()
def __init__(self, file_name):
    """Initializes GenomicSignal.

    Keyword arguments:
    file_name -- Path to a .bam, .bw or .bigwig file; the extension
                 decides which reader is opened.
    """
    self.file_name = file_name
    self.bam = None
    self.bw = None
    self.sg_coefs = None
    self.is_bam = False
    self.is_bw = False
    # Hoisted: the extension was computed three times in the original.
    ext = self.file_name.split(".")[-1].upper()
    if ext == "BAM":
        self.is_bam = True
        self.bam = Samfile(file_name, "rb")
    elif ext in ("BW", "BIGWIG"):
        self.is_bw = True
        self.bw = BigWigFile(file_name)
    else:
        pass  # TODO ERROR: unsupported extension — object stays unusable
def coverage_from_bigwig(self, bigwig_file, stepsize=100):
    """Return list of arrays describing the coverage of each genomicRegions from <bigwig_file>.

    *Keyword arguments:*

    - bigwig_file -- path to bigwig file
    - stepsize -- used stepsize

    *Output:*

    Class variable <coverage>: a list where the elements correspond to
    the GenomicRegion. The list elements give the number of reads
    falling into the GenomicRegion.
    """
    try:
        from ngslib import BigWigFile
        self.coverage = []
        bwf = BigWigFile(bigwig_file)
        for gr in self.genomicRegions:
            # Window padded by half a step per side, clamped to valid coords.
            depth = bwf.pileup(gr.chrom,
                               max(0, int(gr.initial - stepsize / 2)),
                               max(1, int(gr.final + stepsize / 2)))
            ds = [depth[d] for d in range(0, gr.final - gr.initial, stepsize)]
            self.coverage.append(np.array(ds))
        bwf.close()
    # Fix: dropped the Python-2-only `except ImportError, e` (the bound
    # `e` was never used and the syntax breaks on Python 3).
    except ImportError:
        # ngslib unavailable — fall back to pyBigWig's binned means.
        import pyBigWig
        self.coverage = []
        bwf = pyBigWig.open(bigwig_file)
        for gr in self.genomicRegions:
            steps = int(len(gr) / stepsize)
            ds = bwf.stats(gr.chrom, gr.initial, gr.final, type="mean", nBins=steps)
            # stats() yields None for empty bins; map those to 0.
            ds = [x if x else 0 for x in ds]
            self.coverage.append(np.array(ds))
        bwf.close()
def getChromSizesNGSLIB(self):
    """Return a dict mapping chromosome name -> size, read via ngslib.

    Fix vs original: the bigwig handle is now closed in a finally block,
    so it is released even if chromSizes() raises.
    """
    infile = BigWigFile(self.path)
    try:
        # chromSizes() returns parallel sequences of names and sizes.
        out = infile.chromSizes()
        return dict(zip(out[0], out[1]))
    finally:
        infile.close()
class GenomicSignal:
    """ Represents a genomic signal. It should be used to fetch normalized and slope
        signals from a bam or bw file.
        Usage:
        1. Initialize class.
        2. Call load_sg_coefs once.
        3. Call get_signal as many times as needed.

        Authors: Eduardo G. Gusmao.

        Methods:

        load_sg_coefs(self, slope_window_size):
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 98, per_slope = 98)
        Gets the signal associated with self.bam or self.bw based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the
        normalization and slope evaluation procedures.

        hon_norm(self, sequence, mean, std):
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        boyle_norm(self, sequence):
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        savitzky_golay_coefficients(self, window_size, order, deriv):
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial),
        deriv (derivative needed).

        slope(self, sequence, sg_coefs):
        Evaluates the slope of sequence given the sg_coefs loaded.
    """

    # NOTE(review): this class relies on Python 2 semantics in several
    # places — integer '/' division results are used as slice/list indices
    # (window/2, k_nb/2, len(sg_coefs)/2) and fBiasDict.keys()[0] indexes a
    # keys view. Confirm the project targets Python 2 before porting.

    def __init__(self, file_name):
        """ Initializes GenomicSignal.

        Keyword arguments:
        file_name -- Path to a .bam, .bw or .bigwig file; the extension
                     selects the reader (Samfile vs BigWigFile).
        """
        self.file_name = file_name
        self.bam = None            # pysam-style Samfile when input is BAM
        self.bw = None             # BigWigFile when input is bigwig
        self.sg_coefs = None       # set by load_sg_coefs before get_signal
        self.is_bam = False
        self.is_bw = False
        if(self.file_name.split(".")[-1].upper() == "BAM"):
            self.is_bam = True
            self.bam = Samfile(file_name,"rb")
        elif(self.file_name.split(".")[-1].upper() == "BW" or self.file_name.split(".")[-1].upper() == "BIGWIG"):
            self.is_bw = True
            self.bw = BigWigFile(file_name)
        else: pass # TODO ERROR

    def load_sg_coefs(self, slope_window_size):
        """ Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        # Order 2 polynomial, first derivative: coefficients for a slope filter.
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, ext, initial_clip = 1000, ext_both_directions=False):
        """ Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extention. Eg. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        ext_both_directions -- If True, reads are extended in both directions
                               (uses PileupRegion.__call2__ instead of __call__).

        Return:
        tag_count -- Total signal.
        """
        # Fetch raw signal
        pileup_region = PileupRegion(start,end,ext)
        if(self.is_bam):
            # Older pysam (0.7.5) supports the callback API; newer versions
            # require manual iteration over the fetched alignments.
            if(ps_version == "0.7.5"):
                self.bam.fetch(reference=ref, start=start, end=end, callback = pileup_region)
            else:
                iter = self.bam.fetch(reference=ref, start=start, end=end)
                if(not ext_both_directions):
                    for alignment in iter: pileup_region.__call__(alignment)
                else:
                    for alignment in iter: pileup_region.__call2__(alignment)
            raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
        elif(self.is_bw):
            signal = self.bw.pileup(ref, start, end)
            raw_signal = array([min(e,initial_clip) for e in signal])
        # Std-based clipping: cap everything at mean + 10 std to tame outliers.
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]
        # Tag count
        try: tag_count = sum(clip_signal)
        except Exception: tag_count = 0
        return tag_count

    def get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 99.5, per_slope = 98,
                   bias_table = None, genome_file_name = None, ext_both_directions=False, print_wig = None):
        """ Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the
        normalization and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extention. Eg. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Fasta file used by the bias correction step.
        ext_both_directions -- If True, reads are extended in both directions.
        print_wig -- If set, prefix for wig files to which the clipped,
                     normalized and slope signals are appended.

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        # Fetch raw signal
        pileup_region = PileupRegion(start,end,ext)
        if(self.is_bam):
            # Same pysam version split as in get_tag_count.
            if(ps_version == "0.7.5"):
                self.bam.fetch(reference=ref, start=start, end=end, callback = pileup_region)
            else:
                iter = self.bam.fetch(reference=ref, start=start, end=end)
                if(not ext_both_directions):
                    for alignment in iter: pileup_region.__call__(alignment)
                else:
                    for alignment in iter: pileup_region.__call2__(alignment)
            raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
        elif(self.is_bw):
            signal = self.bw.pileup(ref, start, end)
            raw_signal = array([min(e,initial_clip) for e in signal])
        # Std-based clipping: cap at mean + 10 std to tame outliers.
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]
        # Bias correction (no-op when bias_table is None).
        bias_corrected_signal = self.bias_correction(clip_signal, bias_table, genome_file_name, ref, start, end)
        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))
        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)
        # Slope signal via Savitzky-Golay convolution (requires load_sg_coefs).
        slope_signal = self.slope(hon_signal, self.sg_coefs)
        # Hon normalization on slope signal (between-dataset slope smoothing);
        # percentile taken over absolute values so both slope signs count.
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)
        # Writing signal (append mode: callers accumulate many regions per file).
        if(print_wig):
            signal_file = open(print_wig+"signal.wig","a")
            norm_file = open(print_wig+"norm.wig","a")
            slope_file = open(print_wig+"slope.wig","a")
            signal_file.write("fixedStep chrom="+ref+" start="+str(start+1)+" step=1\n"+"\n".join([str(e) for e in clip_signal])+"\n")
            norm_file.write("fixedStep chrom="+ref+" start="+str(start+1)+" step=1\n"+"\n".join([str(e) for e in hon_signal])+"\n")
            slope_file.write("fixedStep chrom="+ref+" start="+str(start+1)+" step=1\n"+"\n".join([str(e) for e in slopehon_signal])+"\n")
            signal_file.close()
            norm_file.close()
            slope_file.close()
        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end):
        """ Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.
        genome_file_name -- Fasta file the k-mer sequences are fetched from.
        chrName -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.

        Return:
        bias_corrected_signal -- Bias-corrected sequence.
        """
        # No bias table -> nothing to correct.
        if(not bias_table): return signal
        # Parameters
        window = 50
        defaultKmerValue = 1.0
        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table.table[0]; rBiasDict = bias_table.table[1]
        # k-mer length inferred from the first key of the forward bias dict.
        k_nb = len(fBiasDict.keys()[0])
        p1 = start; p2 = end
        # Window padded by half the smoothing window, then by half a k-mer,
        # so every position in [p1, p2) has full context.
        p1_w = p1 - (window/2); p2_w = p2 + (window/2)
        p1_wk = p1_w - (k_nb/2); p2_wk = p2_w + (k_nb/2)
        # Raw counts: forward 5' ends into nf, reverse 3' ends into nr.
        nf = [0.0] * (p2_w-p1_w); nr = [0.0] * (p2_w-p1_w)
        for r in self.bam.fetch(chrName, p1_w, p2_w):
            if((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos-p1_w] += 1.0
            if((r.is_reverse) and ((r.aend-1) < p2_w)): nr[r.aend-1-p1_w] += 1.0
        # Smoothed counts: rolling window sums; order of the +=/-= updates
        # is load-bearing, do not reorder.
        Nf = []; Nr = []; fSum = sum(nf[:window]); rSum = sum(nr[:window]); fLast = nf[0]; rLast = nr[0]
        for i in range((window/2),len(nf)-(window/2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast; fSum += nf[i+(window/2)]; fLast = nf[i-(window/2)+1]
            rSum -= rLast; rSum += nr[i+(window/2)]; rLast = nr[i-(window/2)+1]
        # Fetching sequence. NOTE(review): the forward fetch is offset by
        # (-1,-2) and the reverse by (+2,+1) — presumably strand-specific
        # cut-site offsets; confirm against the bias-table construction.
        currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName,p1_wk+2,
                      p2_wk+1)).upper())
        # Iterating on sequence to create signal: per-position k-mer bias
        # looked up on each strand, defaulting when the k-mer is unknown.
        af = []; ar = []
        for i in range((k_nb/2),len(currStr)-(k_nb/2)+1):
            fseq = currStr[i-(k_nb/2):i+(k_nb/2)]
            rseq = currRevComp[len(currStr)-(k_nb/2)-i:len(currStr)+(k_nb/2)-i]
            try: af.append(fBiasDict[fseq])
            except Exception: af.append(defaultKmerValue)
            try: ar.append(rBiasDict[rseq])
            except Exception: ar.append(defaultKmerValue)
        # Calculating bias and writing to wig file: expected counts nhat are
        # the smoothed totals redistributed by relative k-mer bias; the
        # corrected signal is the summed log-ratios of observed vs expected.
        fSum = sum(af[:window]); rSum = sum(ar[:window]); fLast = af[0]; rLast = ar[0]
        bias_corrected_signal = []
        for i in range((window/2),len(af)-(window/2)):
            nhatf = Nf[i-(window/2)]*(af[i]/fSum)
            nhatr = Nr[i-(window/2)]*(ar[i]/rSum)
            zf = log(nf[i]+1)-log(nhatf+1)
            zr = log(nr[i]+1)-log(nhatr+1)
            bias_corrected_signal.append(zf+zr)
            fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1]
            rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1]
        # Termination
        fastaFile.close()
        return bias_corrected_signal

    def hon_norm(self, sequence, mean, std):
        """ Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """
        # Sigmoid squashing into (-1, 1): zeros stay zero, positive and
        # negative values are mapped symmetrically.
        norm_seq = []
        for e in sequence:
            if(e == 0.0): norm_seq.append(0.0)
            elif(e > 0.0): norm_seq.append(1.0/(1.0+(exp(-(e-mean)/std))))
            else: norm_seq.append(-1.0/(1.0+(exp(-(-e-mean)/std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """ Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence.
        """
        # Divide by the mean of the strictly-positive entries.
        # NOTE(review): raises/returns nan if no entry is > 0 — confirm
        # callers guarantee a non-empty positive signal.
        mean = array([e for e in sequence if e>0]).mean()
        norm_seq = [(float(e)/mean) for e in sequence]
        return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """ Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial),
        deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """
        # Get statistics
        #try: # TODO Errors
        window_size = abs(int(window_size))
        order = abs(int(order))
        #except ValueError, msg:
        #    raise ValueError("windowSize and order have to be of type int")
        #if windowSize % 2 != 1 or windowSize < 1:
        #    raise TypeError("windowSize size must be a positive odd number")
        #if windowSize < order + 2:
        #    raise TypeError("windowSize is too small for the polynomials order")
        order_range = range(order+1)
        half_window = (window_size -1) // 2
        # Precompute Coefficients: Vandermonde-style design matrix over the
        # centered window; the deriv-th row of its pseudo-inverse gives the
        # filter, reversed for use with convolve.
        b = mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
        m = linalg.pinv(b).A[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """ Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        # Convolve and trim half a filter length from each end so the
        # output aligns with the input (Python 2 integer division).
        slope_seq = convolve(sequence, sg_coefs)
        slope_seq = [e for e in slope_seq[(len(sg_coefs)/2):(len(slope_seq)-(len(sg_coefs)/2))]]
        return slope_seq