def getChrLen(cls, genome, chr):
    """Return the length of chromosome ``chr`` in ``genome``.

    Lengths are looked up in the ``cls._chrLengths`` cache first. On a cache
    miss the length is obtained with ``cls.getNumElementsInFastaFile`` from
    the chromosome's ``.fa`` file in the genome's sequence track, and the
    value is stored in the cache before it is returned.

    Raises:
        AssertionError: if ``genome`` or ``chr`` is None.
        ArgumentValueError: if reading the fasta file raises IOError, which
            is reported as the chromosome not being part of the genome.
    """
    assert genome is not None
    assert chr is not None

    # For the unit-tests: fixed lengths for two chromosomes of 'testgenome'
    # (the genome name is matched case-insensitively). Any other chromosome
    # of that genome falls through to the normal lookup below.
    if genome.lower() == 'testgenome':
        unitTestLengths = {'chr21': 46944323, 'chrM': 16571}
        if chr in unitTestLengths:
            return unitTestLengths[chr]

    # Cache hit: both the genome and the chromosome are already known.
    if chr in cls._chrLengths.get(genome, ()):
        return cls._chrLengths[genome][chr]

    try:
        # Imported locally, as in the original code.
        from gold.util.CommonFunctions import createOrigPath
        fastaPath = createOrigPath(
            genome, cls.getSequenceTrackName(genome), chr + '.fa')
        chrLength = cls.getNumElementsInFastaFile(fastaPath)
    except IOError:
        raise ArgumentValueError(
            "Error: chromosome '%s' is not part of genome '%s'."
            % (chr, genome))

    # Remember the value so the fasta file is only read once per chromosome.
    cls._chrLengths.setdefault(genome, {})[chr] = chrLength
    return chrLength
def computePurePseudoPvalue(observation, mcSamples, tail):
    """Return a Monte Carlo p-value for ``observation``.

    The value is ``tailFactor * (m + 1) / (n + 1)`` capped at 1.0, where
    ``n`` is ``len(mcSamples)``, ``m`` is the count returned by
    ``computeNumMoreExtreme`` and ``tailFactor`` is 2.0 for 'two-tail' and
    1.0 for 'right-tail' and 'left-tail'.

    Raises:
        ArgumentValueError: if ``tail`` is not one of the three values above.
    """
    sampleCount = len(mcSamples)

    if tail == 'two-tail':
        multiplier = 2.0
    elif tail in ['right-tail', 'left-tail']:
        multiplier = 1.0
    else:
        raise ArgumentValueError('Invalid value for tails argument:', tail)

    extremeCount = computeNumMoreExtreme(observation, mcSamples, tail)

    # The multiplier is a float, so this is true division on Python 2 too.
    uncapped = multiplier * (extremeCount + 1) / (sampleCount + 1)
    return min(1.0, uncapped)
def computeNumMoreExtreme(observation, mcSamples, tails):
    """Count the samples that are at least as extreme as ``observation``.

    Args:
        observation: the observed value the samples are compared with.
        mcSamples: iterable of sampled values. It is traversed exactly once,
            so one-shot iterators and generators are handled correctly.
        tails: 'right-tail' (count samples >= observation), 'left-tail'
            (count samples <= observation) or 'two-tail' (the smaller of
            those two counts).

    Returns:
        The count for the requested tail, as an int.

    Raises:
        ArgumentValueError: if ``tails`` is not one of the three values.
    """
    # Validate first so an invalid argument fails before any work is done.
    if tails not in ('right-tail', 'left-tail', 'two-tail'):
        raise ArgumentValueError('Invalid value for tails argument:', tails)

    # Both counts are gathered in one pass. A sample equal to the
    # observation is counted on both sides, so the two tests are independent.
    numMoreExtremeRight = 0
    numMoreExtremeLeft = 0
    for res in mcSamples:
        if res >= observation:
            numMoreExtremeRight += 1
        if res <= observation:
            numMoreExtremeLeft += 1

    if tails == 'right-tail':
        return numMoreExtremeRight
    if tails == 'left-tail':
        return numMoreExtremeLeft
    return min(numMoreExtremeLeft, numMoreExtremeRight)
def _init(self, kernelType=None, kernelStdev=None, minimumOffsetValue=1, **kwArgs): #assert kernelType in ['gaussian','divideByOffset'] #divideByOffset: weigh by 1/x, where x is offset from center, meaning integral of region (on one side) 0-x is log(x). if kernelType == 'gaussian': assert kernelStdev is not None self._kernelStdev = float(kernelStdev) elif kernelType == 'divideByOffset': assert minimumOffsetValue is not None self._minimumOffsetValue = float(minimumOffsetValue) else: raise ArgumentValueError('Invalid kernelType') self._kernelType = kernelType
def customHeaders(self, customHeaders):
    """Replace the stored custom headers with those in ``customHeaders``.

    ``self._customHeaders`` is reset to an empty OrderedDict, after which
    every entry whose value is not None is passed to
    ``self.setCustomHeader``. Entries with a None value are skipped.

    Raises:
        InvalidFormatError: if a value is the empty string.
        ArgumentValueError: if the lower-cased key is already present in
            ``self._customHeaders`` (headers are case insensitive).
    """
    self._customHeaders = OrderedDict()

    for headerName, headerValue in customHeaders.iteritems():
        if headerValue is None:
            continue

        if headerValue == '':
            raise InvalidFormatError(
                'Empty header values not allowed. '
                'Please use ".", the period character, to '
                'indicate missing values')

        # NOTE(review): this duplicate check presumably relies on
        # setCustomHeader storing keys lower-cased - confirm.
        if headerName.lower() in self._customHeaders:
            raise ArgumentValueError(
                'Custom header "{}" appears multiple times in the '
                'header list. Note that custom headers are case '
                'insensitive (e.g., "ABC" and "abc" is the same '
                'header).'.format(headerName))

        self.setCustomHeader(headerName, headerValue)
def attributes(self, attributes):
    """Replace the stored attributes with those in ``attributes``.

    ``self._attributes`` is reset to an empty OrderedDict, after which every
    entry whose value is not None is passed to ``self.setAttribute``. When
    ``self._doUnquote`` is set, the value is first decoded with
    ``urlDecodePhrase``. Entries with a None value are skipped.

    Raises:
        InvalidFormatError: if a value is the empty string.
        ArgumentValueError: if the lower-cased key is already present in
            ``self._attributes`` (attributes are case insensitive).
    """
    self._attributes = OrderedDict()

    for attrName, attrValue in attributes.iteritems():
        if attrValue is None:
            continue

        # The emptiness check is done on the raw value, before any decoding.
        if attrValue == '':
            raise InvalidFormatError(
                'Empty attribute contents not allowed. '
                'Please use ".", the period character, to '
                'indicate missing values')

        if self._doUnquote:
            attrValue = urlDecodePhrase(attrValue)

        # NOTE(review): this duplicate check presumably relies on
        # setAttribute storing keys lower-cased - confirm.
        if attrName.lower() in self._attributes:
            raise ArgumentValueError(
                'Attribute "{}" appears multiple times in the '
                'attribute list. Note that attributes are case '
                'insensitive (e.g., "ABC" and "abc" is the same '
                'attribute).'.format(attrName))

        self.setAttribute(attrName, attrValue)
def _compute(self):
    """Estimate a Monte Carlo p-value for the observed statistic.

    Randomized results are appended to ``self._randResults`` until it holds
    ``self._numResamplings`` values (results already present are kept). The
    observation is then compared with these results and the null
    distribution is summarised.

    Returns:
        None if the observation is None or ``anyIsNan`` reports NaN for it.
        Otherwise an OrderedDict with, in order: the p-value
        (``self.PVAL_KEY``), the observation ('TSMC_' + raw statistic class
        name), 'MeanOfNullDistr', 'MedianOfNullDistr', 'SdNullDistr',
        'DiffFromMean', the number of resamplings
        (``self.NUM_SAMPLES_KEY``), 'NumSamplesNotNan' and the number of
        samples at least as extreme as the observation (``self.M_KEY``).
        'NumPointsTr1' / 'NumPointsTr2' are added when the instance has
        ``_pointCount1`` / ``_pointCount2``, and 'fullNullDistribution'
        when the 'includeFullNullDistribution' keyword argument is 'yes'.

    Raises:
        ArgumentValueError: if ``self._tails`` is not 'right-tail',
            'left-tail' or 'two-tail'.
    """
    # Outside 'minimal' mode, give up before any resampling is done when the
    # real statistic has no usable result.
    if self._kwArgs.get('minimal') != True and (
            self._realChild.getResult() is None
            or anyIsNan(self._realChild.getResult())):
        return None

    # TODO: change this to a "is this a parallel run?" check (the
    # resampling below always runs serially in this loop).
    for i in xrange(len(self._randResults), self._numResamplings):
        randChild = self._createRandomizedStat(i)
        # getResult() is called here so that the result is created; the
        # values are read back from self._randResults below.
        self._randResults.append(randChild.getResult())

    numpyRandResults = array(self._randResults)

    if self._observation is None:
        self._observation = self._realChild.getResult()

    # In 'minimal' mode the usability check on the observation happens only
    # here, after the resampling.
    if self._kwArgs.get('minimal') == True and (
            self._observation is None or anyIsNan(self._observation)):
        return None

    # Summary statistics of the null distribution ignore NaN results.
    nonNanNumpyRandResults = numpyRandResults[~isnan(numpyRandResults)]
    assert len(numpyRandResults) == self._numResamplings
    numberOfNonNanRandResults = len(nonNanNumpyRandResults)

    meanOfNullDistr = nonNanNumpyRandResults.mean(dtype='float64')
    medianOfNullDistr = median(nonNanNumpyRandResults)
    sdOfNullDistr = nonNanNumpyRandResults.std(dtype='float64')
    diffObsMean = (self._observation - meanOfNullDistr)

    # Results equal to the observation count as extreme on both sides.
    numMoreExtremeRight = sum(1 for res in self._randResults
                              if res >= self._observation)
    numMoreExtremeLeft = sum(1 for res in self._randResults
                             if res <= self._observation)

    if self._tails == 'right-tail':
        numMoreExtreme = numMoreExtremeRight
        tailFactor = 1.0
    elif self._tails == 'left-tail':
        numMoreExtreme = numMoreExtremeLeft
        tailFactor = 1.0
    elif self._tails == 'two-tail':
        numMoreExtreme = min(numMoreExtremeLeft, numMoreExtremeRight)
        tailFactor = 2.0
    else:
        raise ArgumentValueError('Invalid value for tails argument:',
                                 self._tails)

    # For more info on the formula for calculating p-values:
    # "Permutation P-values should never be zero: calculating exact P-values
    # when permutations are randomly drawn"
    # (http://www.ncbi.nlm.nih.gov/pubmed/21044043)
    # tailFactor is a float, so this is true division on Python 2 as well.
    pval = tailFactor * (numMoreExtreme + 1) / (self._numResamplings + 1)
    pval = min(1.0, pval)

    resDict = OrderedDict([
        (self.PVAL_KEY, pval),
        ('TSMC_' + self.getRawStatisticMainClassName(), self._observation),
        ('MeanOfNullDistr', meanOfNullDistr),
        ('MedianOfNullDistr', medianOfNullDistr),
        ('SdNullDistr', sdOfNullDistr),
        ('DiffFromMean', diffObsMean),
        (self.NUM_SAMPLES_KEY, self._numResamplings),
        ('NumSamplesNotNan', numberOfNonNanRandResults),
        (self.M_KEY, numMoreExtreme)])

    # When a point count is available and is below 1, the p-value is
    # reported as None. Both tracks use self.PVAL_KEY so the entry created
    # above is the one that gets cleared.
    if hasattr(self, '_pointCount1'):
        numElTr1 = self._pointCount1.getResult()
        if numElTr1 < 1:
            resDict[self.PVAL_KEY] = None
        resDict['NumPointsTr1'] = numElTr1

    if hasattr(self, '_pointCount2'):
        numElTr2 = self._pointCount2.getResult()
        if numElTr2 < 1:
            resDict[self.PVAL_KEY] = None
        resDict['NumPointsTr2'] = numElTr2

    if self._kwArgs.get('includeFullNullDistribution') == 'yes':
        resDict['fullNullDistribution'] = ','.join(
            [str(x) for x in nonNanNumpyRandResults])

    assert len(self._randResults) == self._numResamplings
    return resDict