def compareSignal(bamFilesList, binLength, numberOfSamples,
                  defaultFragmentLength, outFileName, outFileFormat,
                  outFileNameLambda=None, region=None, extendPairedEnds=True,
                  numberOfProcessors=1, Nsigmas=2, maxSignalRatio=10,
                  verbose=False):
    """
    Estimate treatment/control ratios from sampled bins and write the
    resulting track(s) with writeBedGraph.

    The arguments are:

    'bamFilesList', two bam files; index 0 is handled as the treatment
                    and index 1 as the control.
    'binLength', bin size given to getNumReadsPerBin and used as
                 ``tileSize`` when writing the output.
    'numberOfSamples', forwarded to getNumReadsPerBin.
    'defaultFragmentLength', forwarded to getNumReadsPerBin and
                 writeBedGraph.
    'outFileName', output file written using ``computePvalue``.
    'outFileFormat', passed to writeBedGraph as ``format``.
    'outFileNameLambda', if given, a second file is written using
                 ``computeLambda``.
    'region', 'extendPairedEnds', 'numberOfProcessors', forwarded to
                 writeBedGraph.
    'Nsigmas', a bin counts as enriched when its treatment count is at
                 least ``mean + Nsigmas * std`` of the treatment counts.
    'maxSignalRatio', multiplied by the treatment/control total ratio to
                 obtain the adjusted signal ratio.
    'verbose', print the number of sampled regions.

    Raises NameError when the estimated treatment/control signal ratio is
    below 1.
    """
    treatmentBam = bamHandler.openBam(bamFilesList[0])
    controlBam = bamHandler.openBam(bamFilesList[1])

    # ratio based purely on the number of mapped reads; only reported
    treatmentControlRatioMapped = \
        float(treatmentBam.mapped) / controlBam.mapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have non zero counts are
    #    considered (skipZeros=True)
    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength,
                                             numberOfSamples,
                                             defaultFragmentLength,
                                             numberOfProcessors,
                                             skipZeros=True,
                                             verbose=verbose)

    if verbose:
        print("number of non-zero regions sampled: {}".format(
            num_reads_per_region.shape[0]))

    # 2. get mean, std and total of treatment (col 1) and control (col 2)
    #    axis=0 means: by column
    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0)
    treatmentStd, _ = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate residual in treatment & control data, at regions for
    #    which treatment signal exceeds mean + std * Nsigmas
    #    (these are expected to be the regions at which the
    #    signal > mean-signal, so the residual signal is positive)
    threshold = treatmentMean + treatmentStd * Nsigmas
    overRows = np.where(num_reads_per_region[:, 0] >= threshold)[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]

    # NOTE(review): if no row passes the threshold the means below are NaN
    # and the NaN propagates into the ratios without raising.
    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    # NOTE(review): this raises a positive ratio that is *below*
    # adjSignalRatio up to adjSignalRatio. Given the parameter name
    # 'maxSignalRatio' a cap (">") may have been intended -- confirm before
    # changing; the comparison is kept as found.
    if treatmentSignalRatio < adjSignalRatio and treatmentSignalRatio > 0:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        # bamFilesList[1] is the control and bamFilesList[0] the treatment,
        # so they are reported in that order.
        raise NameError(
            ("estimated signal in control file {} is greater than estimated "
             "signal in treatmant file {}. Perhaps the file names are "
             "swapped?").format(bamFilesList[1], bamFilesList[0]))

    controlSignalRatio = 1.0 / treatmentSignalRatio
    controlRatio = 1.0 / treatmentControlRatio

    print("Treatment mean: {:.2f}, Treatment total:{:.2f}".format(
        treatmentMean, treatmentTotal))
    print("Control mean: {:.2f}, Control total:{}".format(
        controlMean, controlTotal))
    print(("the ratio of treatment vs. control for enriched regions is: "
           "{:.2f}").format(treatmentSignalRatio))
    print(("the ratio of treatment vs. control ratio: {:.2f} "
           "(if based on mapped reads: {:.2f})").format(
               treatmentControlRatio, treatmentControlRatioMapped))

    # values handed to the per-tile functions (computePvalue/computeLambda)
    funcArgs = {'controlMean': controlMean,
                'treatmentMean': treatmentMean,
                'controlSignalRatio': controlSignalRatio,
                'controlRatio': controlRatio,
                'treatmentControlRatio': treatmentControlRatio}

    writeBedGraph.writeBedGraph(bamFilesList,
                                outFileName,
                                defaultFragmentLength,
                                computePvalue,
                                funcArgs,
                                tileSize=binLength,
                                region=region,
                                format=outFileFormat,
                                zerosToNans=False,
                                numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameLambda:
        writeBedGraph.writeBedGraph(bamFilesList,
                                    outFileNameLambda,
                                    defaultFragmentLength,
                                    computeLambda,
                                    funcArgs,
                                    tileSize=binLength,
                                    region=region,
                                    format=outFileFormat,
                                    zerosToNans=False,
                                    numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        defaultFragmentLength, normalizationLength,
                        avg_method='median', numberOfProcessors=1,
                        verbose=False, chrsToSkip=None):
    r"""
    Estimate scale factors for a pair of bam files from sampled read counts.

    Read counts per bin are obtained from getNumReadsPerBin and summarized
    in several ways: by number of mapped reads, by the signal extraction
    scaling (SES) method, by mean and by median. The value returned under
    'size_factors' is the SES estimate.

    The arguments are:

    'bamFilesList', list with the two bam files to normalize

    'binLength', the window size in bp, where reads are going to be
                 counted.

    'numberOfSamples', Number of sites to sample.

    'defaultFragmentLength', if the reads are not paired, this value is
                 used to extend the reads.

    'normalizationLength', length, in bp, to normalize the data.
                 For a value of 1, factors are given such that on average
                 1 fragment per base pair is found.
                 (Not read by the code of this function.)

    'avg_method', defines how the different values are to be summarized.
                 The options are 'mean' and 'median'. Only affects the
                 'reads_per_bin' entry of the result.

    'chrsToSkip', names of the chromosomes to be excluded from the scale
                 estimation. Usually the chrX is included. Defaults to no
                 chromosome.

    For example, to test about 1 million regions of length 500 bp,
    the binLength will be 500 and the numberOfSamples is going
    to be the size of the genome divided by the 1 million.
    This number is not exact because regions in which all counts are 0
    are not taken into account.

    Returns a dictionary with the keys 'size_factors',
    'size_factors_based_on_mapped_reads', 'size_factors_SES',
    'size_factors_based_on_mean', 'size_factors_based_on_median', 'mean',
    'meanSES', 'median', 'reads_per_bin', 'std' and 'sites_sampled'.

    Raises NameError if bamFilesList does not contain exactly two files.

    The test data contains reads for 200 bp

    >>> test = Tester()
    >>> dict = estimateScaleFactor([test.bamFile1, test.bamFile2], 50, 4, 0, 1)
    >>> dict['size_factors']
    array([ 1. ,  0.5])
    >>> dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    # NOTE(review): estimateScaleFactor is defined a second time further
    # down in this file; that later definition rebinds the name, so this
    # copy is shadowed once the module has been loaded.

    # SES compares exactly one ChIP sample with one input sample; checking
    # here avoids an IndexError after the (expensive) sampling step.
    if len(bamFilesList) != 2:
        raise NameError("SES scale factors are only defined for 2 files")

    # a fresh list per call instead of a shared mutable default
    if chrsToSkip is None:
        chrsToSkip = []

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]
    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')
    sizeFactorBasedOnMappedReads = \
        sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    num_reads_per_bin = getNumReadsPerBin(
        bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
        numberOfProcessors=numberOfProcessors, verbose=verbose,
        chrsToSkip=chrsToSkip)

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()

    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input
    sortedChip = np.sort(num_reads_per_bin[0, :])
    sortedInput = np.sort(num_reads_per_bin[1, :])
    p = sortedChip.cumsum()
    q = sortedInput.cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value. The divisor is cast to
    # float so that integer counts are not floor-divided under Python 2.
    diff = np.abs(p / float(p[-1]) - q / float(q[-1]))

    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]

    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    # mean count of the bins ranked below maxIndex, for each sample
    meanSES = [np.mean(sortedChip[:maxIndex]),
               np.mean(sortedInput[:maxIndex])]

    # SES factors: ratio of the cumulative sums at the selected rank
    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            # fall back to the 99 percentile for very sparse samples
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median
    sizeFactor = sizeFactorsSES

    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads':
                sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}
def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
                        defaultFragmentLength, normalizationLength,
                        avg_method='median', numberOfProcessors=1,
                        verbose=False, chrsToSkip=None):
    r"""
    Estimate scale factors for a pair of bam files from sampled read counts.

    Read counts per bin are obtained from getNumReadsPerBin and summarized
    in several ways: by number of mapped reads, by the signal extraction
    scaling (SES) method, by mean and by median. The value returned under
    'size_factors' is the SES estimate.

    The arguments are:

    'bamFilesList', list with the two bam files to normalize

    'binLength', the window size in bp, where reads are going to be
                 counted.

    'numberOfSamples', Number of sites to sample.

    'defaultFragmentLength', if the reads are not paired, this value is
                 used to extend the reads.

    'normalizationLength', length, in bp, to normalize the data.
                 For a value of 1, factors are given such that on average
                 1 fragment per base pair is found.
                 (Not read by the code of this function.)

    'avg_method', defines how the different values are to be summarized.
                 The options are 'mean' and 'median'. Only affects the
                 'reads_per_bin' entry of the result.

    'chrsToSkip', names of the chromosomes to be excluded from the scale
                 estimation. Usually the chrX is included. Defaults to no
                 chromosome.

    For example, to test about 1 million regions of length 500 bp,
    the binLength will be 500 and the numberOfSamples is going
    to be the size of the genome divided by the 1 million.
    This number is not exact because regions in which all counts are 0
    are not taken into account.

    Returns a dictionary with the keys 'size_factors',
    'size_factors_based_on_mapped_reads', 'size_factors_SES',
    'size_factors_based_on_mean', 'size_factors_based_on_median', 'mean',
    'meanSES', 'median', 'reads_per_bin', 'std' and 'sites_sampled'.

    Raises NameError if bamFilesList does not contain exactly two files.

    The test data contains reads for 200 bp

    >>> test = Tester()
    >>> dict = estimateScaleFactor([test.bamFile1, test.bamFile2], 50, 4, 0, 1)
    >>> dict['size_factors']
    array([ 1. ,  0.5])
    >>> dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    # NOTE(review): an earlier definition of estimateScaleFactor exists in
    # this file; this later one rebinds the name and is the one callers get.

    # SES compares exactly one ChIP sample with one input sample; checking
    # here avoids an IndexError after the (expensive) sampling step.
    if len(bamFilesList) != 2:
        raise NameError("SES scale factors are only defined for 2 files")

    # a fresh list per call instead of a shared mutable default
    if chrsToSkip is None:
        chrsToSkip = []

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]
    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')
    sizeFactorBasedOnMappedReads = \
        sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    num_reads_per_bin = getNumReadsPerBin(
        bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
        numberOfProcessors=numberOfProcessors, verbose=verbose,
        chrsToSkip=chrsToSkip)

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()

    # size factors based on order statistics
    # see Signal extraction scaling (SES) method in: Diaz et al (2012)
    # Normalization, bias correction, and peak calling for ChIP-seq.
    # Statistical applications in genetics and molecular biology, 11(3).

    # using the same names as in Diaz paper
    # p refers to ChIP, q to input
    sortedChip = np.sort(num_reads_per_bin[0, :])
    sortedInput = np.sort(num_reads_per_bin[1, :])
    p = sortedChip.cumsum()
    q = sortedInput.cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays.
    # both p and q are normalized by this value. The divisor is cast to
    # float so that integer counts are not floor-divided under Python 2.
    diff = np.abs(p / float(p[-1]) - q / float(q[-1]))

    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]

    # Take a lower rank to move to a region with probably
    # less peaks and more background.
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    # mean count of the bins ranked below maxIndex, for each sample
    meanSES = [np.mean(sortedChip[:maxIndex]),
               np.mean(sortedInput[:maxIndex])]

    # SES factors: ratio of the cumulative sums at the selected rank
    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only those read numbers that are below the 90
    # percentile to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            # fall back to the 99 percentile for very sparse samples
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median
    sizeFactor = sizeFactorsSES

    return {'size_factors': sizeFactor,
            'size_factors_based_on_mapped_reads':
                sizeFactorBasedOnMappedReads,
            'size_factors_SES': sizeFactorsSES,
            'size_factors_based_on_mean': mean.min() / mean,
            'size_factors_based_on_median': median.min() / median,
            'mean': mean,
            'meanSES': meanSES,
            'median': median,
            'reads_per_bin': readsPerBin,
            'std': std,
            'sites_sampled': sitesSampled}