def renameExistingStdTrackIfNeeded(cls, genome, stdTrackName):
    """
    Repair the on-disk layout for a standard track whose leaf directory was
    renamed: if the parent directory contains a real subdirectory but none
    named after the last element of stdTrackName, symlink the expected name
    to the existing directory and migrate the TrackInfo record.
    """
    oldTrackName = None
    # Check both the non-overlapping and overlap-allowing preprocessed trees.
    for allowOverlaps in [False, True]:
        parentDir = createDirPath(stdTrackName[:-1], genome, allowOverlaps=allowOverlaps)
        if os.path.exists(parentDir):
            dirContents = os.listdir(parentDir)
            # Real directories only; existing symlinks are excluded.
            realDirs = [x for x in dirContents
                        if os.path.isdir(os.path.join(parentDir, x))
                        and not os.path.islink(os.path.join(parentDir, x))]
            reqDirName = stdTrackName[-1]
            reqDirPath = os.path.join(parentDir, reqDirName)
            from gold.application.LogSetup import logMessage
            logMessage('Checking ' + reqDirPath)
            # NOTE(review): os.readlink may return a relative path; isdir is
            # then resolved against the CWD, not parentDir -- confirm intent.
            if os.path.islink(reqDirPath) and not os.path.isdir(os.readlink(reqDirPath)):
                # This is to fix a bug that ended in the symlink pointing to a file
                os.remove(reqDirPath)
                logMessage('Removed ' + reqDirPath)
            if realDirs and reqDirName not in dirContents:
                # Assume the first real dir is the old leaf name; link to it.
                oldTrackName = stdTrackName[:-1] + [realDirs[0]]
                os.symlink(realDirs[0], reqDirPath)
    if oldTrackName is not None:
        # Move the stored track metadata over to the requested name.
        ti = TrackInfo(genome, oldTrackName)
        ti.trackName = stdTrackName
        ti.store()
def _computeBinomialTail(cls, x, size, prob, tail):
    """
    Compute a binomial tail p-value for x successes in size trials with
    success probability prob.

    Uses the normal approximation when both the expected success and failure
    counts reach MIN_SUCCESSES_FOR_NORM_APPROXIMATION; otherwise uses the
    exact binomial, unless x exceeds MAX_SUCCESSES_FOR_BINOMIAL_RUNTIME.

    tail is one of 'less', 'more' or 'different' (two-sided). Returns None
    when the exact computation would be too slow or when tail is unsupported
    (previously an unsupported tail raised UnboundLocalError on return).
    """
    from gold.application.RSetup import r
    x, size, prob = int(x), int(size), float(prob)
    # Bug fix: pval was unbound if tail did not match any branch.
    pval = None
    if prob * size >= cls.MIN_SUCCESSES_FOR_NORM_APPROXIMATION <= (1 - prob) * size:
        # Normal approximation with the binomial mean and standard deviation.
        mean = size * prob
        sd = (size * prob * (1 - prob)) ** 0.5
        lessPval = r.pnorm(x, mean, sd)
        if tail == 'less':
            pval = lessPval
        elif tail == 'more':
            pval = 1 - lessPval
        elif tail == 'different':
            pval = min(1, 2 * min(lessPval, 1 - lessPval))
        else:
            from gold.application.LogSetup import logMessage, logging
            logMessage('Unsupported tail (%s) encountered in _computeBinomialTail.' % tail,
                       level=logging.WARN)
    elif x > cls.MAX_SUCCESSES_FOR_BINOMIAL_RUNTIME:
        return None  # exact computation too expensive for this x
    else:
        if tail == 'less':
            pval = r.pbinom(x, size, prob)
        elif tail == 'more':
            # P(X >= x) = 1 - P(X <= x-1)
            pval = 1 - r.pbinom(x - 1, size, prob)
        elif tail == 'different':
            pval = min(1, 2 * min(r.pbinom(x, size, prob),
                                  1 - r.pbinom(x - 1, size, prob)))
    return pval
def __init__(self, region, track, track2=None, *args, **kwArgs):
    """Store the analysis region and track(s), then delegate setup to _init."""
    from config.Config import IS_EXPERIMENTAL_INSTALLATION
    if 'isExperimental' in kwArgs:
        expFlag = kwArgs['isExperimental'].lower()
        if expFlag not in ['false', 'true']:
            logMessage('isExperimental has value other than false/true',
                       level=logging.WARN)
            raise ShouldNotOccurError(
                'isExperimental has value other than false/true.')
        if expFlag == 'true':
            # Experimental statistics require an experimental installation.
            assert IS_EXPERIMENTAL_INSTALLATION, IS_EXPERIMENTAL_INSTALLATION
    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])
    self._region = region
    self._track = track
    # Only store a second track when one was actually supplied.
    if track2 not in [None, []]:
        self._track2 = track2
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def execute(cls, choices, galaxyFn=None, username=''):
    """
    Galaxy tool entry point: build a base-pair Venn diagram from the chosen
    tracks and write the resulting HTML page to galaxyFn.
    """
    print choices
    logMessage('CreateBpsVennDIagram choices input: ' + repr(choices))
    debugstring = 'i execute\n'
    genome, trackNames = CreateBpsVennDIagram.getTrackNamesFromFormParameters(choices)
    trackNameStrings = [':'.join(tn) for tn in trackNames]
    print trackNameStrings
    geSourceList, trackNamesWithoutPath = CreateBpsVennDIagram.getGeSourceList(genome, trackNames)
    # Make input similar, if it is many files or one category.bed file:
    # turn everything into a categoryBedList.
    if len(trackNames) == 1:
        # assume input is one category.bed file
        categoryBedList, categoryNames = CreateBpsVennDIagram.getCategoryBedList(geSourceList[0])
    else:
        categoryBedList = CreateBpsVennDIagram.collapseToCategoryBedList(geSourceList, trackNamesWithoutPath)
        categoryNames = trackNamesWithoutPath
    # Make category selection list; all categories are considered 'in' from
    # this tool. Used by subsequent methods also callable from other tools.
    labelToPrime = CreateBpsVennDIagram.getPrimeList()
    counter = 0
    catInfo = OrderedDict()
    # Map each category to a label/prime pair (primes give each state
    # combination a unique product) and its originating track name.
    for c in categoryNames:
        thisTrackName = trackNameStrings[0]
        if len(trackNames) > 1:
            thisTrackName = trackNameStrings[counter]
        debugstring += str(thisTrackName) + '\n'
        catInfo[c] = {'label': labelToPrime.keys()[counter],
                      'prime': labelToPrime.values()[counter],
                      'selection': 'in',
                      'fullTrackName': thisTrackName}
        counter = counter + 1
    # collapse to startorstop and state lists
    posDict, catDict = CreateBpsVennDIagram.getPosCatDictsFromCategoryBedList(categoryBedList, catInfo)
    # iterate list and get stateBPCounter and stateRegions
    stateBPCounter, stateRegions, thisdebugstring = CreateBpsVennDIagram.getStateCount(posDict, catDict)
    debugstring += 'stateBPCounter: ' + str(stateBPCounter) + '\n'
    utfil = open(galaxyFn, 'w')
    utfil.write(CreateBpsVennDIagram.getHtmlString(catInfo, stateBPCounter, genome))
    utfil.close()
    # Turn the stateBPCounter into the object used by javascript
    # NOTE(review): the triple-quote below opens a disabled code section that
    # continues beyond this excerpt.
    '''
def _combineResults(self):
    """Pearson correlation of the paired child results; 2.5 sentinel when empty."""
    if not self._childResults:
        # No data points collected: return the out-of-range sentinel.
        return 2.5
    logMessage(repr(self._childResults))
    xs, ys = zip(*self._childResults)
    return self._pearsonr(xs, ys)
def _computeBinomialPval(cls, x, size, prob, tail):
    """
    Compute a binomial tail p-value for x successes in size trials with
    success probability prob.

    Uses the normal approximation when both expected success and failure
    counts reach MIN_SUCCESSES_FOR_NORM_APPROXIMATION; otherwise the exact
    binomial, unless x exceeds MAX_SUCCESSES_FOR_BINOMIAL_RUNTIME.

    tail is one of 'less', 'more' or 'different' (two-sided). Returns None
    when the exact computation would be too slow or when tail is unsupported
    (previously an unsupported tail raised UnboundLocalError on return).
    """
    from proto.RSetup import r
    x, size, prob = int(x), int(size), float(prob)
    # Bug fix: pval was unbound if tail did not match any branch.
    pval = None
    if prob * size >= cls.MIN_SUCCESSES_FOR_NORM_APPROXIMATION <= (1 - prob) * size:
        # Normal approximation with the binomial mean and standard deviation.
        mean = size * prob
        sd = (size * prob * (1 - prob)) ** 0.5
        lessPval = r.pnorm(x, mean, sd)
        if tail == 'less':
            pval = lessPval
        elif tail == 'more':
            pval = 1 - lessPval
        elif tail == 'different':
            pval = min(1, 2 * min(lessPval, 1 - lessPval))
        else:
            from gold.application.LogSetup import logMessage, logging
            # Bug fix: the message previously named _computeBinomialTail.
            logMessage(
                'Unsupported tail (%s) encountered in _computeBinomialPval.' % tail,
                level=logging.WARN)
    elif x > cls.MAX_SUCCESSES_FOR_BINOMIAL_RUNTIME:
        return None  # exact computation too expensive for this x
    else:
        if tail == 'less':
            pval = r.pbinom(x, size, prob)
        elif tail == 'more':
            # P(X >= x) = 1 - P(X <= x-1)
            pval = 1 - r.pbinom(x - 1, size, prob)
        elif tail == 'different':
            pval = min(1, 2 * min(r.pbinom(x, size, prob),
                                  1 - r.pbinom(x - 1, size, prob)))
    return pval
def __init__(self, region, trackStructure, *args, **kwArgs):
    """Store the analysis region and track structure, then delegate to _init."""
    from config.Config import IS_EXPERIMENTAL_INSTALLATION  # @UnresolvedImport
    if 'isExperimental' in kwArgs:
        flagText = kwArgs['isExperimental'].lower()
        if flagText not in ['false', 'true']:
            logMessage('isExperimental has value other than false/true',
                       level=logging.WARN)
            raise ShouldNotOccurError(
                'isExperimental has value other than false/true.')
        if flagText == 'true':
            # Experimental statistics require an experimental installation.
            assert IS_EXPERIMENTAL_INSTALLATION, IS_EXPERIMENTAL_INSTALLATION
    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])
    self._region = region
    self._trackStructure = trackStructure
    #TODO:boris 20150924, Code for checking if query and reference (track and track2) are the same track.
    #We should decide if we will allow this in the future.
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def getValuesFromBedFile(cls, genome, fn, colorPattern=(1,0,0)): resDict = defaultdict(list) valDict = defaultdict(list) lineTab = [] if type(fn) == type(None): return resDict elif isinstance(fn, basestring): lineTab = open(fn,'r').read().split('\n') else: lineTab = fn.returnComposed().split('\n') valueList = [] for line in lineTab: lineTab = line.split('\t') try: chrom = lineTab[0] valDict[chrom]+=[float(lineTab[3])] except: logMessage(line) maxVal = max(max(valDict.values())) for chrom in GenomeInfo.getChrList(genome): if valDict.has_key(chrom): try: resDict[chrom]+= [tuple([255 - (int(val*255/maxVal)*v) for v in colorPattern]) for val in valDict[chrom]] except: logMessage ('Ny rundeeee: '+ str([v for v in valDict[chrom][:10]])+ ': '+str(maxVal)) print 'count', len(valDict.values()) return resDict, maxVal
def _determineStatClass(self, flushMemoized=True):
    """
    Validate each candidate statistic class by running it minimally on a
    dummy bin source. Incompatible classes are skipped (logged when VERBOSE),
    or the exception is re-raised when PASS_ON_VALIDSTAT_EXCEPTIONS is set.
    """
    assert( hasattr(self, '_track') )
    assert( hasattr(self, '_track2') )
    dummyGESource = MinimalBinSource(self._genome)
    if len(self._statClassList) == 0:
        logMessage('Stat class list is empty, for analysisDef: ' + self._analysisLine,
                   level = logging.WARNING)
        if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
            raise ShouldNotOccurError('Stat class list is empty. Analysisdef: ' + self._analysisLine)
    # NOTE(review): there is no break on success -- presumably the successful
    # StatJob run records the valid stat elsewhere; confirm against callers.
    for statClass in self._statClassList:
        if DebugConfig.VERBOSE:
            logMessage('Checking validity of stat class "{}" for analysisDef "{}".'.format(
                statClass.__name__, self.getDefAfterChoices()))
        trackA, trackB = self._track, self._track2
        if trackA is None:
            continue
        try:
            # Minimal run checks track/statistic compatibility.
            StatJob(dummyGESource, trackA, trackB, statClass, minimal=True,
                    **self.getAllChoices(filterByActivation=True)).run(False, flushMemoized=flushMemoized)
        except IncompatibleTracksError, e:
            if DebugConfig.VERBOSE:
                logException(e, level=logging.DEBUG,
                             messagePrefix='Warning: error in _determineStatClass for stat: %s' % statClass.__name__)
            if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                raise
        except (AssertionError, IncompatibleAssumptionsError, IdenticalTrackNamesError), e:
            if DebugConfig.VERBOSE:
                logException(e, level=logging.DEBUG,
                             messagePrefix='Warning: error in _determineStatClass for stat: %s' % statClass.__name__)
            if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                raise
def __init__(self, region, trackStructure, *args, **kwArgs):
    """Store region and trackStructure; remaining kwargs are handed to _init."""
    from config.Config import IS_EXPERIMENTAL_INSTALLATION  # @UnresolvedImport
    if 'isExperimental' in kwArgs:
        flag = kwArgs['isExperimental'].lower()
        isRecognized = flag in ['false', 'true']
        if not isRecognized:
            logMessage('isExperimental has value other than false/true',
                       level=logging.WARN)
            raise ShouldNotOccurError(
                'isExperimental has value other than false/true.')
        if flag == 'true':
            # Experimental statistics require an experimental installation.
            assert IS_EXPERIMENTAL_INSTALLATION, IS_EXPERIMENTAL_INSTALLATION
    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])
    self._region = region
    self._trackStructure = trackStructure
    #TODO:boris 20150924, Code for checking if query and reference (track and track2) are the same track.
    #We should decide if we will allow this in the future.
    #TODO: This should probably instead happen in the default _init method, so that when this is
    # overridden, one needs to explicitly store kwArgs if desired.
    #As it is now, parameters will be handled explicitly in _init while still becoming part of self_kwArgs
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def __init__(self, region, track, track2, rawStatistic, randTrackClass=None, assumptions=None,
             tails=None, numResamplings=2000, randomSeed=None, **kwArgs):
    """
    Set up a Monte Carlo randomization test: resolve the tails argument, the
    manual random seed, the randomized-track classes (from either
    randTrackClass or an 'assumptions' string of the form 'cls1_cls2'),
    and the raw statistic class.
    """
    if tails == None:
        if 'tail' in kwArgs:
            # Legacy argument: translate 'tail' values into 'tails' vocabulary.
            tailTranslator = {'more': 'right-tail', 'less': 'left-tail', 'different': 'two-tail'}
            tails = tailTranslator[kwArgs['tail']]
            if DebugConfig.VERBOSE:
                logMessage('Argument tail provided instead of tails to RandomizationManagerStatUnsplittable',
                           level=logging.DEBUG)
        else:
            tails = 'right-tail'  # or 'two-tail'?
            logMessage('No tails argument provided to RandomizationManagerStatUnsplittable',
                       level=logging.DEBUG)
    if track2 is None:
        self._track2 = None  #to allow track2 to be passed on as None to rawStatistics without error. For use by single-track MC-tests..
    from gold.util.RandomUtil import getManualSeed, setManualSeed
    # Only set the manual seed once, and only for explicit non-'Random' seeds.
    if randomSeed is not None and randomSeed != 'Random' and getManualSeed() is None:
        setManualSeed(int(randomSeed))
    if 'mcSetupScheme' in kwArgs:
        kwArgs = copy(kwArgs)  #to not edit original dict..
        if kwArgs['mcSetupScheme'] != 'custom':
            assert not 'maxSamples' in kwArgs  #check that specific values are not redundantly set
    # Statistic.__init__(self, region, track, track2, rawStatistic=rawStatistic, randTrackClass=randTrackClass, assumptions=assumptions, tails=tails, numResamplings=numResamplings, randomSeed=randomSeed, **kwArgs)
    # Exactly one of randTrackClass / assumptions must be supplied.
    assert (randTrackClass is None) ^ (assumptions is None)  # xor
    if assumptions is not None:
        assert assumptions.count('_') == 1, assumptions
        randTrackClass1, randTrackClass2 = assumptions.split('_')
    else:
        randTrackClass1 = None
        randTrackClass2 = randTrackClass
    # Resolve class-name strings to classes; 'None'/'' mean no randomization.
    self._randTrackClass1, self._randTrackClass2 = \
        [ ( globals()[clsDef] if clsDef not in ['None',''] else None ) \
          if isinstance(clsDef, basestring) else clsDef
          for clsDef in [randTrackClass1, randTrackClass2]]
    assert not (randTrackClass1 is None and randTrackClass2 is None)
    for cls in [self._randTrackClass1, self._randTrackClass2]:
        assert cls in [None, PermutedSegsAndSampledIntersegsTrack, \
                       PermutedSegsAndIntersegsTrack, RandomGenomeLocationTrack,
                       SegsSampledByIntensityTrack, ShuffledMarksTrack,
                       SegsSampledByDistanceToReferenceTrack, PointsSampledFromBinaryIntensityTrack]
    self._rawStatistic = self.getRawStatisticClass(rawStatistic)
    self._tails = tails
    if kwArgs.get('minimal') == True:
        # Minimal mode (stat-class validation runs): a single resampling.
        self._numResamplings = 1
        self._kwArgs['maxSamples'] = 1
    else:
        self._numResamplings = int(numResamplings)
    CompBinManager.ALLOW_COMP_BIN_SPLITTING = False
    self._randResults = []
    self._observation = None
    #to load r libraries for McFdr:
    McFdr._initMcFdr()
def computeStep(self):
    """
    Advance the computation one step: try to load a memoized result,
    otherwise step each child until all have results, then compute this
    statistic's own result and memoize it.
    """
    self._trace('computeStep')
    if not self.hasResult():
        self._loadMemoizedResult()
    if self.hasResult():
        return
    for child in self._children:
        if not child.hasResult():
            child.computeStep()
    if not all([child.hasResult() for child in self._children]):
        # Children not finished yet; retry on the next step.
        return
    self._trace('_compute')
    #The method _compute may either return the result, or set the result variable directly:
    res = None
    with StatisticExceptionHandling(**self._kwArgs):
        res = self._compute()
    if DebugConfig.VERBOSE:
        logMessage('Result of statistic %s in region %s: %s' % (getClassName(self), self._region, res))
    if not self.hasResult():
        #Only set _result if this was not set directly by the previous call to _compute
        self._result = res
    self._storeMemoizedResult()
def __init__(self, region, track, track2=None, *args, **kwArgs):
    """
    Store the region and track(s); reject runs where both tracks have the
    same name, then delegate remaining keyword arguments to _init.
    """
    from config.Config import IS_EXPERIMENTAL_INSTALLATION
    if 'isExperimental' in kwArgs:
        x = kwArgs['isExperimental'].lower()
        if not x in ['false','true']:
            logMessage('isExperimental has value other than false/true', level=logging.WARN)
            raise ShouldNotOccurError('isExperimental has value other than false/true.')
        if x=='true':
            # Experimental statistics require an experimental installation.
            assert IS_EXPERIMENTAL_INSTALLATION
    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])
    self._region = region
    self._track = track
    if track2 not in [None, []]:
        if track.trackName == track2.trackName:
            #if not kwArgs.get('allowIdenticalTracks') in [True,'True']: #Does not work, as all kwArgs are not sent further down in createChildren, meaning that a base statistic like RawDataStat would not find allowIdenticalTracks and throw exception..
            #if not IS_EXPERIMENTAL_INSTALLATION: #does not work either, as results in: gold.util.CustomExceptions.IncompatibleTracksError: Track 'Unmarked segments (Sample tracks)'was created, but not touched by statistic
            from gold.util.CustomExceptions import IdenticalTrackNamesError
            raise IdenticalTrackNamesError("Track names are identical. Track name = " + ':'.join(track.trackName))
        self._track2 = track2
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def computeStep(self):
    """
    Advance one bin at a time: step the current child statistic until it has
    a result, collect it (None on expected per-bin failures), clean up, and
    construct the child statistic for the next bin.
    """
    self._trace('computeStep')
    # NOTE(review): the handler for this outer 'try' lies beyond this
    # excerpt (presumably StopIteration from self._bins.next()) -- confirm
    # against the full source before editing.
    try:
        try:
            if not self._curChild.hasResult():
                self._curChild.computeStep()
                if not self._curChild.hasResult():
                    return
            nextRes = self._curChild.getResult()
            if DebugConfig.VERBOSE:
                logMessage('Result of statistic %s in region %s: %s' %
                           (getClassName(self._curChild), self._curChild._region, nextRes))
        except (CentromerError, NoneResultError), e:
            # Expected per-bin failures: record None instead of a value.
            nextRes = None
            if DebugConfig.VERBOSE:
                logException(e, level=logging.DEBUG)
            if DebugConfig.PASS_ON_NONERESULT_EXCEPTIONS:
                raise
        self._childResults.append(nextRes)
        tempRefHolderChild = self._curChild  # To avoid children of this _curChild to be collected in the next line.
                                             # It will live long enough for createChildren to be called on new _curChild
        self._curChild.afterComputeCleanup()  # In case a global analysis was run without prior local analyses
        self._curChild = None  #first sets curchild to None to free memory even when self._bins.next() raises StopIteration..
        self._curChild = self._getChildObject(self._bins.next())
        self._curChild.createChildren()
def isValidForListing(self):
    """
    Return whether this analysis should appear in listings: it must have
    text parts available and resolve to a valid statistic class.
    """
    anyTextParts = len(self._analysisParts) > 0
    # NOTE(review): the else below is taken to pair with the outer 'if'
    # (analyses without text implicitly return None/falsy) -- confirm intent.
    if not anyTextParts:
        if DebugConfig.VERBOSE:
            logMessage('Analysisdef "{}" does not have any text available for listing. '.format(self.getDef()) +
                       'Skipping...')
    else:
        return self.getStat(flushMemoized=False) is not None
def _logAssumptionReduction(self, removedAssumptions):
    """Log each removed assumption, re-apply it as a choice and re-resolve the stat class."""
    for removed in removedAssumptions:
        logMessage('Assumption "' + str(removed) + '" was removed from analysisDef: ' + self.getDef())
        self.setChoice(self.ASSUMP_LABEL_KEY, removed)
        self._determineStatClass()
def getOrigFn(genome, trackName, suffix, fileTree='standardized'):
    """Return the single original filename matching suffix, or None when not exactly one."""
    fns = getOrigFns(genome, trackName, suffix, fileTree=fileTree)
    if len(fns) == 1:
        return fns[0]
    # Zero or several candidates: warn (experimental installs only) and bail.
    if IS_EXPERIMENTAL_INSTALLATION:
        from gold.application.LogSetup import logMessage, logging
        logMessage('getOrigFn - Cannot decide among zero or several filenames: %s' % fns,
                   logging.WARNING)
    return None
def globalAnalysisEnded(self): if not self._printProgress: return if self._startGlobalTime is None: logMessage( 'Called globalAnalysisEnded without globalAnalysisStarted being called before.', level=logging.WARN) print "\nglobal analysis took %f seconds" % (time.time() - self._startGlobalTime)
def __init__(self, region, track, track2, rawStatistic, randTrackClass=None, assumptions=None,
             tails=None, numResamplings=2000, randomSeed=None, **kwArgs):
    """
    Set up a Monte Carlo randomization test: resolve tails, the manual random
    seed, the randomized-track classes (from randTrackClass or an
    'assumptions' string 'cls1_cls2'), and the raw statistic class.
    """
    if tails==None:
        if 'tail' in kwArgs:
            # Legacy argument: translate 'tail' values into 'tails' vocabulary.
            tailTranslator = {'more':'right-tail', 'less':'left-tail', 'different':'two-tail'}
            tails = tailTranslator[kwArgs['tail']]
            if DebugConfig.VERBOSE:
                logMessage('Argument tail provided instead of tails to RandomizationManagerStatUnsplittable',
                           level=logging.DEBUG)
        else:
            tails = 'right-tail' # or 'two-tail'?
            logMessage('No tails argument provided to RandomizationManagerStatUnsplittable',
                       level=logging.DEBUG)
    if track2 is None:
        self._track2 = None #to allow track2 to be passed on as None to rawStatistics without error. For use by single-track MC-tests..
    from gold.util.RandomUtil import getManualSeed, setManualSeed
    # Only set the manual seed once, and only for explicit non-'Random' seeds.
    if randomSeed is not None and randomSeed != 'Random' and getManualSeed() is None:
        setManualSeed(int(randomSeed))
    Statistic.__init__(self, region, track, track2, rawStatistic=rawStatistic,
                       randTrackClass=randTrackClass, assumptions=assumptions, tails=tails,
                       numResamplings=numResamplings, randomSeed=randomSeed, **kwArgs)
    # Exactly one of randTrackClass / assumptions must be supplied.
    assert (randTrackClass is None) ^ (assumptions is None) # xor
    if assumptions is not None:
        assert assumptions.count('_') == 1, assumptions
        randTrackClass1, randTrackClass2 = assumptions.split('_')
    else:
        randTrackClass1 = None
        randTrackClass2 = randTrackClass
    # Resolve class-name strings to classes; 'None'/'' mean no randomization.
    self._randTrackClass1, self._randTrackClass2 = \
        [ ( globals()[clsDef] if clsDef not in ['None',''] else None ) \
          if type(clsDef) is str else clsDef for clsDef in [randTrackClass1, randTrackClass2] ]
    assert not (randTrackClass1 is None and randTrackClass2 is None)
    for cls in [self._randTrackClass1, self._randTrackClass2]:
        assert cls in [None, PermutedSegsAndSampledIntersegsTrack, \
                       PermutedSegsAndIntersegsTrack, RandomGenomeLocationTrack,
                       SegsSampledByIntensityTrack, ShuffledMarksTrack]
    self._rawStatistic = self.getRawStatisticClass(rawStatistic)
    self._tails = tails
    if kwArgs.get('minimal') == True:
        # Minimal mode (stat-class validation runs): a single resampling.
        self._numResamplings = 1
        self._kwArgs['maxSamples'] = 1
    else:
        self._numResamplings = int(numResamplings)
    CompBinManager.ALLOW_COMP_BIN_SPLITTING = False
    self._randResults = []
    self._observation = None
    #to load r libraries for McFdr:
    McFdr._initMcFdr()
def _handleMissingStat(self):
    """Log and raise when a run was started without a valid statistic class."""
    from gold.application.LogSetup import logMessage, logging
    from gold.description.RunDescription import RunDescription
    import gold.description.Analysis as AnalysisModule
    msg = 'Started run with invalid statistic... Def: ' + self._analysisDef
    logMessage(msg, level=logging.ERROR)
    raise Exception(msg)
def _determineStatClass(self):
    """
    Validate each candidate statistic class by running it minimally on a
    dummy bin source; incompatible classes are skipped (logged when VERBOSE),
    or the exception re-raised when PASS_ON_VALIDSTAT_EXCEPTIONS is set.
    """
    assert( hasattr(self, '_track') )
    assert( hasattr(self, '_track2') )
    dummyGESource = MinimalBinSource(self._genome)
    if len(self._statClassList)==0:
        logMessage('Stat class list is empty, for analysisDef: ' + self._analysisLine, level = logging.WARNING)
        if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
            raise ShouldNotOccurError('Stat class list is empty. Analysisdef: '+self._analysisLine)
    for statClass in self._statClassList:
        if DebugConfig.VERBOSE:
            logMessage(statClass.__name__ + ': Trying (' + self.getDefAfterChoices() + ')')
        # Single (non-reversed) track ordering; the reversed variant is disabled.
        for trackA, trackB in [[self._track, self._track2]]:
            if trackA == None:
                continue
            try:
                StatJob(dummyGESource, trackA, trackB, statClass, minimal=True, **self.getChoices()).run(False)
                #In order not to mess up integration tests
                initSeed()
                # A valid statistic must actually touch (format-convert) its tracks.
                for track in [trackA, trackB]:
                    if track is not None and track.formatConverters is None:
                        raise IncompatibleTracksError('Track ' + prettyPrintTrackName(track.trackName) +\
                                                      'was created, but not touched by statistic')
            except IncompatibleTracksError, e:
                if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                    raise
                if DebugConfig.VERBOSE:
                    logException(e, message='(Warning: error in _determineStatClass for stat: %s)' % statClass.__name__)
            except (AssertionError, IncompatibleAssumptionsError, IdenticalTrackNamesError), e:
                if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                    raise
                if DebugConfig.VERBOSE:
                    logException(e, message='(Warning: error in _determineStatClass for stat: %s)' % statClass.__name__)
            except OSError, e:
                if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                    raise
                elif not 'withOverlaps' in str(e):
                    # Only tolerate missing 'withOverlaps' preprocessed data.
                    raise
def storeResult(cls, stat):
    """Persist a statistic's result in the disk memoization store, unless it came from disk."""
    if stat.resultLoadedFromDisk():
        return
    if not cls._shouldUseDiskMemoization(stat, store=True):
        return
    memoPath = cls._createMemoPath(stat)
    key = cls._createMemoKey(stat)
    if DebugConfig.VERBOSE:
        logMessage('Storing result "{}" in "{}"'.format(stat._result, memoPath))
    cls.memoDataCollection[memoPath][key] = stat._result
def getNullModel(self):
    """Return the null-model text, swapping T1/T2 references when tracks are reversed."""
    nullModel = self.getChoiceText(self.ASSUMP_LABEL_KEY)
    if self._reversed:
        if re.search('[^ ,.]T[12][^ ,.]', nullModel):
            logMessage('found instance of T1/T2 in null-model that may not refer to tracks as assumed in getNullModel')
        assert not 'tempT2' in nullModel
        # Three-step swap via a temporary token so T1->T2 does not clobber T2->T1.
        nullModel = nullModel.replace('T1', 'tempT2').replace('T2', 'T1').replace('tempT2', 'T2')
    return nullModel
def _getH0andH1Text(self, coreCls):
    """Format an 'H0 vs H1' description block, or None when either hypothesis is missing."""
    h0 = self._results._analysis.getH0()
    h1 = self._results._analysis.getH1()
    if h0 is None or h1 is None:
        logMessage('Did not find H0 or H1. Their values: ' + str(h0) +' and ' + str(h1))
        return None
    core = coreCls()
    core.descriptionLine('H0', h0, indent=True)
    core.line('vs')
    core.descriptionLine('H1', h1, indent=True)
    return str(core)
def _constructBins(regSpec, binSpec, genome, trackNames): # Construct and check bins try: from quick.application.GalaxyInterface import GalaxyInterface userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome, trackNames) return [None, userBinSource] except Exception, e: results = Results([], [], '') results.addError(InvalidRunSpecException('Error in specification of analysis region or binsize: ' + str(e))) logMessage('Error in specification of analysis region (' + regSpec +') or binsize: (' + binSpec + ')') if DebugConfig.PASS_ON_BATCH_EXCEPTIONS: raise return [results, None]
def getNullModel(self):
    """Return the null-model text, swapping T1/T2 references when tracks are reversed."""
    nullModel = self.getChoiceText(self.ASSUMP_LABEL_KEY)
    if self.reversed:
        if re.search('[^ ,.]T[12][^ ,.]', nullModel):
            logMessage('Found instance of T1/T2 in null-model that may not refer to tracks as assumed in getNullModel')
        assert not 'tempT2' in nullModel
        # Three-step swap via a temporary token so T1->T2 does not clobber T2->T1.
        nullModel = nullModel.replace('T1', 'tempT2').replace('T2', 'T1').replace('tempT2', 'T2')
    return nullModel
def _getH0andH1Text(self, coreCls):
    """Format an 'H0 vs H1' description block, or None when either hypothesis is missing."""
    h0 = self._results._h0
    h1 = self._results._h1
    if h0 is None or h1 is None:
        logMessage('Did not find H0 or H1. Their values: ' + str(h0) + ' and ' + str(h1))
        return None
    core = coreCls()
    core.descriptionLine('H0', h0, indent=True)
    core.line('vs')
    core.descriptionLine('H1', h1, indent=True)
    return str(core)
def getH1(self):
    """Return the H1 text matching the selected tail choice, or None when no tail is chosen."""
    tailChoice = self.getChoice(self.TAIL_KEY)
    if tailChoice is None:
        return None
    # The H1 option key is suffixed by the tail choice, e.g. 'H1_more'.
    H1 = self.getChoice(self.H1_KEY + '_' + tailChoice)
    if H1 is None:
        logMessage('Could not find H1, probably mismatch between tail-choice and corresponding H1-option in analysisDef '+\
                   '(tail choice: %s, options: %s)' % (self.getChoice('tail'), self.getAllOptionsAsKeys() ) )
    return H1
def monitor_load():
    """
    Daemon loop: poll the task queue's load average every 5 seconds and
    submit a Titan cluster job whenever load exceeds LOAD_THRESHOLD,
    rate-limited by JOB_SUBMISSION_WAIT_PERIOD. Never returns.
    """
    taskQueueManager = TaskQueueManagerFactory.getTaskQueueManager()
    nextPossibleJobRequestTime = time.time()
    while True:
        loadAverage = taskQueueManager.getLoadAverage()
        if loadAverage > LOAD_THRESHOLD:
            if time.time() > nextPossibleJobRequestTime:
                logMessage("load over threshold, submitting titan job", level=5, logger=PARALLEL_LOGGER)
                # Back off before the next submission is allowed.
                nextPossibleJobRequestTime = time.time() + JOB_SUBMISSION_WAIT_PERIOD
                TitanJobScript.submitJob()
        time.sleep(5)
def getOrigFns(genome, trackName, suffix, fileTree='standardized'):
    """List original files under the given file tree matching suffix, skipping hidden/backup files."""
    assert fileTree in ['standardized', 'collected', 'parsing error']
    from gold.application.LogSetup import logMessage, logging
    path = getOrigPathForFileTree(genome, trackName, fileTree)
    if not os.path.exists(path):
        if IS_EXPERIMENTAL_INSTALLATION:
            logMessage('getOrigFn - Path does not exist: ' + path, logging.WARNING)
        return []
    fns = []
    for entry in os.listdir(path):
        fullPath = path + os.sep + entry
        # Skip directories, non-matching suffixes, hidden/temp prefixes
        # ('.', '_', '#') and backup suffixes ('~', '#').
        if (os.path.isfile(fullPath) and entry.endswith(suffix)
                and entry[0] not in ['.', '_', '#']
                and entry[-1] not in ['~', '#']):
            fns.append(fullPath)
    return fns
def getPreviewFile(trackNameTuple):
    """
    Fetch a preview of a public-dataset file over ZeroMQ (REQ socket on
    localhost:5559) and return it wrapped in a NamedTemporaryFile.
    Returns None for folder entries or empty file names.
    """
    logMessage('trackNameTuple := ' + repr(trackNameTuple))
    if trackNameTuple[-1].find(',FOLDER') > 0:
        # Folders have no file preview.
        return None
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("tcp://localhost:5559")
    # Strip any trailing ',...' annotation from the leaf element.
    trackNameTuple[-1] = trackNameTuple[-1].split(',')[0]
    datasetId = getDatasetId(trackNameTuple[1])
    subtype = trackNameTuple[2]
    fileName = '/'.join(trackNameTuple[3:]).replace(',FOLDER', '').split(',')[0]
    if fileName == '':
        return None
    subList = [subtype, fileName]
    paramlist = [
        'params:=' + '<#>'.join([datasetId, repr(subList)]),
        'operation:=GetFilePreviewFromPublicDataset',
        'class:=dataStorageServicePub'
    ]
    socket.send(messageSep.join(paramlist))
    filePreview = socket.recv_unicode().encode('ascii', 'ignore')
    from tempfile import NamedTemporaryFile
    tempfile = NamedTemporaryFile()
    tempfile.write(filePreview)
    logMessage('fileName := ' + fileName)
    logMessage('NamedTemporaryFile := ' + tempfile.name)
    logMessage('FilePreview := ' + filePreview)
    # NOTE(review): returned without flush/seek -- confirm callers can read
    # the full preview from the file.
    return tempfile
def getPreviewFile(trackNameTuple, userName, pwd):
    """
    Fetch a preview of a public-dataset file over ZeroMQ (REQ socket on
    localhost:5559) and return it wrapped in a NamedTemporaryFile.
    Returns None for folder entries or empty file names.

    NOTE(review): userName and pwd are accepted but never used in this body.
    """
    logMessage('trackNameTuple := '+ repr(trackNameTuple))
    if trackNameTuple[-1].find(',FOLDER')>0:
        # Folders have no file preview.
        return None
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("tcp://localhost:5559")
    # Strip any trailing ',...' annotation from the leaf element.
    trackNameTuple[-1] = trackNameTuple[-1].split(',')[0]
    datasetId = getDatasetId(trackNameTuple[1])
    subtype = trackNameTuple[2]
    fileName = '/'.join(trackNameTuple[3:]).replace(',FOLDER','').split(',')[0]
    if fileName =='':
        return None
    subList = [subtype, fileName]
    paramlist = ['params:='+'<#>'.join([ datasetId, repr(subList)]),
                 'operation:=GetFilePreviewFromPublicDataset','class:=dataStorageServicePub']
    socket.send(messageSep.join(paramlist))
    filePreview = socket.recv_unicode().encode('ascii','ignore')
    from tempfile import NamedTemporaryFile
    tempfile = NamedTemporaryFile()
    tempfile.write(filePreview)
    logMessage('fileName := '+fileName)
    logMessage('NamedTemporaryFile := '+tempfile.name)
    logMessage('FilePreview := '+ filePreview)
    return tempfile
def loadResult(cls, stat):
    """
    Load a memoized result for stat from the disk memoization store, if one
    exists, and install it on the statistic via setMemoizedResult.
    """
    if not cls._shouldUseDiskMemoization(stat, store=False):
        return
    memoPath = cls._createMemoPath(stat)
    key = cls._createMemoKey(stat)
    memoDict = cls.memoDataCollection[memoPath]
    if key in memoDict:
        res = memoDict[key]
        if DebugConfig.VERBOSE:
            logMessage('Loading result "{}" in "{}"'.format(res, memoPath))
        # Bug fix: setMemoizedResult was previously called twice for the
        # same result (once before the VERBOSE check and once after).
        stat.setMemoizedResult(res)
def isCompatibleWith(self, sourceFormat, exceptionList=[]):
    """Check attribute-wise compatibility with a source track format (None on self = wildcard)."""
    assert (not isinstance(sourceFormat, TrackFormatReq))
    attrList = [attr for attr in self._getAttributes(includeReqExtensions=False)
                if attr[1:] not in exceptionList]
    pairedAttrs = [[getattr(obj, attr) for obj in [self, sourceFormat]]
                   for attr in attrList]
    if DebugConfig.VERBOSE:
        logMessage(
            "Checking track format compatibility. Paired attributes: " +
            ', '.join("{}: {}".format(x, y) for x, y in zip(attrList, pairedAttrs)))
    # Each attribute must either be unset on self (None) or equal in both.
    for selfVal, sourceVal in pairedAttrs:
        if selfVal is not None and selfVal != sourceVal:
            return False
    return True
def computeStep(self):
    """
    Advance the computation one step: try to load a memoized result,
    otherwise step each child until all have results, then compute this
    statistic's own result and store it.
    """
    self._trace('computeStep')
    if not self.hasResult():
        self._loadMemoized()
    if self.hasResult():
        return
    for child in self._children:
        if not child.hasResult():
            child.computeStep()
    if not all([child.hasResult() for child in self._children]):
        # Children not finished yet; retry on the next step.
        return
    self._trace('_compute')
    #The method _compute may either return the result, or set the result variable directly:
    res = None
    with StatisticExceptionHandling(**self._kwArgs):
        res = self._compute()
    if DebugConfig.VERBOSE:
        logMessage('Result of statistic %s in region %s: %s' % (getClassName(self), self._region, res))
    if not self.hasResult():
        #Only set _result if this was not set directly by the previous call to _compute
        self._result = res
    self._storeResult()
def inferValType(valList, shapeOffset=0):
    """
    Infer the value-type string for a value container.

    Returns False for None, 'number' for plain lists/tuples, and for numpy
    arrays (or SmartMemmaps) a type string derived from dtype and shape
    ('number', 'population', 'tc', 'char', 'category', their vector forms,
    'mean_sd', or 'unsupported list'). Raises ShouldNotOccurError for
    unrecognized inputs.
    """
    if valList is None:
        return False
    if type(valList) in [list, tuple]:
        return 'number'
    if isinstance(valList, numpy.ndarray) or isinstance(valList, SmartMemmap):
        nDims = len(valList.shape)
        dtype = valList.dtype

        def _dtypeIn(*names):
            # True when dtype equals any of the named numpy dtypes.
            return any(dtype == numpy.dtype(name) for name in names)

        if nDims == 2 + shapeOffset and valList.shape[1 + shapeOffset] == 2 \
                and dtype == numpy.dtype('float128'):
            return 'mean_sd'
        elif _dtypeIn('float32', 'float64', 'float128'):
            if nDims == 1 + shapeOffset:
                return 'number'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'population'
        if _dtypeIn('int32', 'int64'):
            if nDims == 1 + shapeOffset:
                return 'number (integer)'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'population'
        elif _dtypeIn('int8', 'bool8'):
            if nDims == 1 + shapeOffset:
                return 'tc'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'tc_vector'
        elif dtype == numpy.dtype('S1'):
            if nDims == 1 + shapeOffset:
                return 'char'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'char_vector'
        elif _dtypeIsStringLongerThanOne(dtype):
            if nDims == 1 + shapeOffset:
                return 'category'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'category_vector'
        if valList.shape[1 + shapeOffset] == 0:
            return 'unsupported list'
        logMessage('Shape or dtype not recognized: ' + str(valList.shape) + ' and ' + str(valList.dtype))
        raise ShouldNotOccurError()
    else:
        logMessage('Type of valList not recognized: ' + str(type(valList)))
        raise ShouldNotOccurError()
def _constructBins(regSpec, binSpec, genome, trackName1, trackName2):
    """Construct and validate the analysis-region bin source.

    Returns a two-element list: [None, userBinSource] on success, or
    [resultsWithError, None] if the region/binsize specification is invalid
    (unless DebugConfig.PASS_ON_BATCH_EXCEPTIONS, in which case the original
    exception is re-raised).
    """
    #Construct and check bins
    try:
        #userBinSource= UserBinSource(regSpec, binSpec)
        from quick.application.GalaxyInterface import GalaxyInterface
        # from config.Config import DEFAULT_GENOME
        userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome, trackName1, trackName2)
        return [None, userBinSource]
    except Exception, e:
        # Wrap the failure in an (otherwise empty) Results object so callers
        # can report the error instead of crashing.
        #results = Results(trackName1, trackName2, statClassName)
        results = Results([], [], '')
        results.addError(InvalidRunSpecException('Error in specification of analysis region or binsize: ' + str(e)))
        logMessage('Error in specification of analysis region (' + regSpec + ') or binsize: (' + binSpec + ')')
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
        return [results, None]
def getRevEngBatchLine(trackName1, trackName2, cleanedTrackName1, cleanedTrackName2, analysisDef, \
                       regSpec, binSpec, genome, manualSeed, **kwArgs):
    """Reverse-engineer the batch-runner line corresponding to an interactive run.

    Builds 'genome<sep>regSpec<sep>binSpec<sep>track1<sep>track2<sep>StatClass(params)'
    joined by BATCH_COL_SEPARATOR, with track name parts URL-quoted. If a manual
    random seed was used, the 'randomSeed=Random' choice is replaced by the
    concrete seed. Returns a warning string (never raises) if the line cannot
    be generated.
    """
    #analysisDef is assumed to be unquoted
    #if this is to work, must check explicitly against special keywords in regSpec (or check that regSpec is a valid region that is to have region..)...
    #if not genome in regSpec:
    #    regSpec = genome+':'+regSpec
    try:
        if DebugConfig.VERBOSE:
            logMessage('getting RevEngBatchLine:')
        #analysisDef =analysisDef.replace('%20PointCountInSegsPvalStat%2C','') #REMOVE
        #print 'NOWAG: ',analysisDef
        analysis = Analysis(analysisDef, genome, cleanedTrackName1, cleanedTrackName2, **kwArgs)
        stat = analysis.getStat()
        if stat is None:
            return 'No corr batch line, as no valid statistic was found.. '
        #print 'CAME HERE'
        statClassName = stat.__name__
        #fixme: Add space, but this is not checked in batchrunner...
        # Hypothesis-selection choices (H0/H1_*) are implicit in the statistic
        # and are excluded from the parameter list.
        params = ','.join(['='.join([choicePair[0], str(manualSeed)]) \
                           if (manualSeed is not None and choicePair[0] == 'randomSeed' and choicePair[1] == 'Random') else '='.join(choicePair) \
                           for choicePair in analysis.getChoices().items() \
                           if choicePair[0] not in ['H0','H1_more','H1_less','H1_different','H1_ha1','H1_ha2','H1_ha3','H1_ha4','H1_ha5'] ])
        statText = statClassName + '(' + params + ')'
        #return BATCH_COL_SEPARATOR.join([regSpec, binSpec, \
        #                                 (':'.join(trackName1)).replace(' ','_'),\
        #                                 (':'.join(trackName2)).replace(' ','_') if trackName2 is not None else 'None',\
        #                                 statText])
        #assert unquote(regSpec) == regSpec
        assert unquote(binSpec) == binSpec #To assure that unquote can be safely applied to binSpec without any consequences (we don't want to always quote, but still want the possibility to use quoted history track names)
        batchElements = [genome, regSpec, binSpec, \
                         (':'.join([quote(x, safe='') for x in trackName1])),\
                         (':'.join([quote(x, safe='') for x in trackName2])) if trackName2 is not None else 'None',\
                         statText]
        #batchElements = [el.replace(BATCH_COL_SEPARATOR, '\\' + BATCH_COL_SEPARATOR) for el in batchElements]
        #batchElements = [quote(el, safe='') for el in batchElements]
        return BATCH_COL_SEPARATOR.join(batchElements)
    except Exception,e:
        #raise
        logException(e,logging.WARNING,'Could not generate corresponding batch line: ')
        #if DebugConfig.VERBOSE:
        logMessage('analysisDef, genome, trackName1, trackName2: \n' + str([analysisDef, genome, trackName1, trackName2]) )
        return 'Warning: Could not generate corresponding batch line.'
def _trace(self, methodName):
    """Log a trace line for methodName if tracing is enabled in TRACE_STAT.

    Each instance is lazily assigned a per-class sequential id (stored in a
    class-level registry on Statistic) so repeated trace lines from the same
    object can be correlated. Region and track names are appended when the
    corresponding TRACE_STAT flags are set.
    """
    if not TRACE_STAT[methodName]:
        return

    # Lazily create the class-level registry of per-class instance counters.
    if not hasattr(Statistic, 'objAddresses'):
        Statistic.objAddresses = {}

    clsName = self.__class__.__name__
    # First trace for this instance: hand out the next id for its class.
    if not hasattr(self, '_traceId'):
        if not clsName in Statistic.objAddresses:
            Statistic.objAddresses[clsName] = 0
        self._traceId = str(Statistic.objAddresses[clsName])
        Statistic.objAddresses[clsName] += 1

    msg = clsName + '(' + self._traceId + ').' + methodName
    if TRACE_STAT['printRegions']:
        msg += ' (' + str(self._region) + ')'
    if TRACE_STAT['printTrackNames']:
        trackPart = str(self._track.trackName)
        if hasattr(self, '_track2'):
            trackPart += ',' + str(self._track2.trackName)
        msg += ' (' + trackPart + ')'
    logMessage(msg)
def downloadFirstFtpFile(localFile, FtpAddress):
    """Download the first file listed on an FTP server to localFile.

    FtpAddress is parsed by stripping '/' and splitting on ':' — presumably
    on the form 'ftp://user:password@server:port' (TODO confirm against
    callers). Only the first file name returned by NLST is downloaded.

    Fix: the original leaked the FTP connection and the local file handle
    whenever login/listing/transfer raised; both are now closed via
    try/finally. Behavior is otherwise unchanged (localFile is only created
    if the listing is non-empty).
    """
    user, pwdServ, port = FtpAddress.replace('/', '').split(':')[1:]
    pwd, server = pwdServ.split('@')
    logMessage('localFile: ' + localFile)

    ftp_h = FTP()
    ftp_h.connect(server, port)
    try:
        ftp_h.login(user, pwd)
        filenames = []
        ftp_h.retrlines('NLST', filenames.append)
        if filenames:
            # Only the first listed file is fetched (original looped but
            # broke after the first iteration).
            utfil = open(localFile, 'wb')
            try:
                ftp_h.retrbinary('RETR ' + filenames[0], utfil.write)
            finally:
                utfil.close()
    finally:
        ftp_h.close()
def inferValType(valList, shapeOffset=0):
    """Classify the track value type of valList from its Python type, dtype and shape.

    NOTE(review): this appears to be a second, duplicate definition of
    inferValType in this file — confirm which one is actually in effect and
    consider removing the other.
    """
    if valList is None:
        return False
    elif type(valList) in [list,tuple]:
        return 'number'
    elif isinstance(valList, numpy.ndarray) or isinstance(valList, SmartMemmap):
        # (N, 2) float128 arrays encode mean/sd pairs
        if len(valList.shape) == 2 + shapeOffset and valList.shape[1 + shapeOffset] == 2 and valList.dtype == numpy.dtype('float128'):
            return 'mean_sd'
        elif any(valList.dtype == numpy.dtype(x) for x in ['float32', 'float64', 'float128']):
            if len( valList.shape ) == 1 + shapeOffset:
                return 'number'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'population'
        if any(valList.dtype == numpy.dtype(x) for x in ['int32', 'int64']):
            if len( valList.shape ) == 1 + shapeOffset:
                return 'number (integer)'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'population'
        elif any(valList.dtype == numpy.dtype(x) for x in ['int8', 'bool8']):
            if len( valList.shape ) == 1 + shapeOffset:
                return 'tc'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'tc_vector'
        elif valList.dtype == numpy.dtype('S1'):
            if len( valList.shape ) == 1 + shapeOffset:
                return 'char'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'char_vector'
        elif _dtypeIsStringLongerThanOne(valList.dtype):
            if len( valList.shape ) == 1 + shapeOffset:
                return 'category'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'category_vector'
        # Fallthrough: dtype matched nothing above, or shape was ambiguous
        if valList.shape[1 + shapeOffset] == 0:
            return 'unsupported list'
        logMessage('Shape or dtype not recognized: ' + str(valList.shape) + ' and ' + str(valList.dtype) )
        raise ShouldNotOccurError()
    else:
        logMessage('Type of valList not recognized: ' + str(type(valList)))
        raise ShouldNotOccurError()
def _parseDef(self, id):
    """Parse an analysisDef string into analysis parts and statistic classes.

    Fills self._analysisParts with plain-text segments, AnalysisOption objects
    (bracketed '[...]' clauses) and whitespace runs, and self._statClassList
    with the Statistic classes named after a '->' suffix (looked up in
    STAT_CLASS_DICT; unknown names are logged and dropped).
    """
    self._analysisParts = []
    self._statClassList = []

    # ([^-[]* #pure text - not '['
    #print 'NOWAG id:', id
    parts = re.findall('''
        # Match pure text (part[0]):
        ( (?: [^-[]* (?:-(?!>))? )* #1. pure text - not '[' or '-',
                                    #2. separated by a possible '-' that is not before a '>'
                                    #1 and 2 is repeated as long as necessary
          [^-[\s]+) #should not end with whitespace,
                    #as this may belong to the '->'-expression
        # Match option clause (part[1])
        |( \[ [^[\]]* \] ) #Matches an expression inside brackets '[]'
        # Match specification of statistic classes (part[2])
        |( \s? \-> \s? .* )
        # Match any additional whitespace (part[3])
        |(\s*)
        ''', id, flags=re.VERBOSE)

    from gold.statistic.AllStatistics import STAT_CLASS_DICT
    for part in parts:
        if part[0] != '':
            self._analysisParts.append(part[0])
        if part[1] != '':
            self._analysisParts.append(AnalysisOption(part[1]))
        if part[2] != '':
            statNames = part[2].replace('->','').replace(' ','').split(',')
            #self._statClassList = statNames
            self._statClassList = [STAT_CLASS_DICT[statName] for statName in statNames \
                                   if STAT_CLASS_DICT.get(statName) is not None]
            if len(self._statClassList)==0:
                if len(statNames)==0:
                    logMessage('No statistic found when parsing analysisDef: ' + self._analysisLine)
                else:
                    logMessage('Specified statistics not found in STAT_CLASS_DICT. Statistics:%s, and keys in STAT_CLASS_DICT: %s' % (str(statNames), str(STAT_CLASS_DICT)) )
        if part[3] != '':
            self._analysisParts.append(part[3])
logException(e, message='(Error in _determineStatClass, with statClass %s)' % statClass.__name__) #if VERBOSE: # print 'Warning: exception in getStat: ', \ # statClass.__name__ + ': ' + e.__class__.__name__ + ': ' + str(e) # traceback.print_exc(file=sys.stdout) else: #self._reversed = reversed #self._conversionsUsed = len(trackA.conversionsUsed) > 0 or \ # ((trackB is not None) and len(trackB.conversionsUsed) > 0) ##self._validStatClass = functools.partial(statClass, **self.getChoices()) #functools.update_wrapper(self._validStatClass, statClass) validStatClass = wrapClass(statClass, keywords=self.getChoices() ) #self.setConverters( self._track.formatConverters, self._track2.formatConverters if self._track2 is not None else None) #self._updateOptions() if DebugConfig.VERBOSE: logMessage(statClass.__name__ + ': OK') # print statClass.__name__ + ': OK' return validStatClass return None def _appendConverterOptions(self, track, labelKey): if track is None: return if self.getChoice(labelKey) is not None: assert(self.getChoice(labelKey) == getClassName(track.formatConverters[0])) return labelPair = (labelKey, '_Treat ' + prettyPrintTrackName(track.trackName) + ' as') choicePairs = [ ( getClassName(fc), fc.getOutputDescription(TrackInfo(self._genome, track.trackName).trackFormatName) ) \
if time.time() > nextPossibleJobRequestTime: #logging.getLogger(PARALLEL_LOGGER).debug("load over threshold, submitting titan job") logMessage("load over threshold, submitting titan job", level=5, logger=PARALLEL_LOGGER) nextPossibleJobRequestTime = time.time() + JOB_SUBMISSION_WAIT_PERIOD TitanJobScript.submitJob() time.sleep(5) class MyManager(BaseManager): pass pid = os.getpid() pidFileName = GALAXY_BASE_DIR + "/taskQueue.pid" with open(pidFileName, "w") as f: f.write(str(pid)) MyManager.register("TaskQueueReferent", TaskQueueManagerFactory.getTaskQueueManager) MyManager.register("shutdown", shutdown) loadThread = threading.Thread(target=monitor_load) loadThread.daemon = True loadThread.start() manager = MyManager(address=("", PP_MANAGER_PORT), authkey=PP_PASSPHRASE) server = manager.get_server() #logging.getLogger(PARALLEL_LOGGER).debug("Task queue started, serving forever...") logMessage("Task queue started, serving forever...", level=5, logger=PARALLEL_LOGGER) server.serve_forever()
def submit(self, func, args=(), depfuncs=(), modules=(), callback=None,
           callbackargs=(), group='default', globals=None, restrictions=None):
    """Submits function to the execution queue

        func - function to be executed
        args - tuple with arguments of the 'func'
        depfuncs - tuple with functions which might be called from 'func'
        modules - tuple with module names to import
        callback - callback function which will be called with argument
                list equal to callbackargs+(result,)
                as soon as calculation is done
        callbackargs - additional arguments for callback function
        group - job group, is used when wait(group) is called to wait for
        jobs in a given group to finish
        globals - dictionary from which all modules, functions and classes
        will be imported, for instance: globals=globals()
        restrictions - optional list of scheduling restrictions attached to
        the job group (passed to Job). Defaults to an empty list.
    """
    # Fix: 'restrictions' previously defaulted to a mutable list ([]), which
    # is shared across all calls; use a None sentinel instead.
    if restrictions is None:
        restrictions = []

    # perform some checks for frequent mistakes
    if self._exiting:
        raise DestroyedServerError("Cannot submit jobs: server"\
                " instance has been destroyed")

    if not isinstance(args, tuple):
        raise TypeError("args argument must be a tuple")

    if not isinstance(depfuncs, tuple):
        raise TypeError("depfuncs argument must be a tuple")

    if not isinstance(modules, tuple):
        raise TypeError("modules argument must be a tuple")

    if not isinstance(callbackargs, tuple):
        raise TypeError("callbackargs argument must be a tuple")

    if globals is not None and not isinstance(globals, dict):
        raise TypeError("globals argument must be a dictionary")

    for module in modules:
        if not isinstance(module, types.StringType):
            raise TypeError("modules argument must be a list of strings")

    tid = self.__gentid()

    if globals:
        modules += tuple(self.__find_modules("", globals))
        modules = tuple(set(modules))
        self.logger.debug("Task %i will autoimport next modules: %s" %
                (tid, str(modules)))
        # Pick up functions and (old-style) classes from the caller's globals
        # as extra dependencies to ship to the worker.
        for object1 in globals.values():
            if isinstance(object1, types.FunctionType) \
                    or isinstance(object1, types.ClassType):
                depfuncs += (object1, )

    task = _Task(self, tid, callback, callbackargs, group)

    self.__waittasks_lock.acquire()
    self.__waittasks.append(task)
    self.__waittasks_lock.release()

    # if the function is a method of a class add self to the arguments list
    if isinstance(func, types.MethodType) and func.im_self is not None:
        args = (func.im_self, ) + args

    # if there is an instance of a user deined class in the arguments add
    # whole class to dependancies
    for arg in args:
        # Checks for both classic or new class instances
        if isinstance(arg, types.InstanceType) \
                or str(type(arg))[:6] == "<class":
            # do not include source for imported modules
            if ppcommon.is_not_imported(arg, modules):
                depfuncs += tuple(ppcommon.get_class_hierarchy(arg.__class__))

    # if there is a function in the arguments add this
    # function to dependancies
    for arg in args:
        if isinstance(arg, types.FunctionType):
            depfuncs += (arg, )

    #Add task id
    args += (tid, )

    sfunc = self.__dumpsfunc((func, ) + depfuncs, modules)
    sargs = pickle.dumps(args, self.__pickle_proto)

    with self.__jobs_lock:
        # Lazily create one Job per task group and enqueue it.
        if task.group not in self.__jobs:
            job = Job(task.group, restrictions)
            job.worker = self.getFreeWorker()
            self.__jobs[task.group] = job
            self.__queue.append(job)
        self.__jobs[task.group].addTask((task, sfunc, sargs))

    #logging.getLogger(PARALLEL_LOGGER).debug("task %i submitted in group %s" , tid, group)
    logMessage("task %i submitted in group %s" % (tid, group), level = 5, logger=PARALLEL_LOGGER)
    self.__scheduler()
    return task
def validateAndPossiblyResetLocalResults(cls, stats):
    """Sequential Monte Carlo / McFdr convergence check for per-bin statistics.

    For each stat in stats, decides whether its MC p-value is 'determined'
    (enough extreme samples M, max samples reached, invalid result, or — if
    fdrCriterion is 'individual' — FDR below threshold). Undetermined stats
    get their cached _result deleted and _numResamplings increased so they
    will be resampled. Returns the number of stats not yet determined by any
    criterion (including FDR).

    Thresholds come from the first stat's kwArgs: mThreshold (default 20),
    fdrThreshold (default 0.1), maxSamples (default 50000; 'unlimited' ->
    no cap), fdrCriterion ('individual' or 'simultaneous').
    """
    #return 0#to short-circuit this functionality as it is currently in development
    #return McFdr.dummyStub(stats)
    if len(stats)==0:
        return 0
    #else:
    #print 'LEN: ',len(stats)
    mt = stats[0]._kwArgs.get('mThreshold')
    ft = stats[0]._kwArgs.get('fdrThreshold')
    ms = stats[0]._kwArgs.get('maxSamples')
    fc = stats[0]._kwArgs.get('fdrCriterion')
    M_THRESHOLD = int(mt) if mt is not None else 20
    FDR_THRESHOLD = float(ft) if ft is not None else 0.1
    if ms is None:
        MAX_SAMPLES = 50000
    elif type(ms) is int:
        MAX_SAMPLES = ms
    elif ms.lower() == 'unlimited':
        MAX_SAMPLES = None
    else:
        MAX_SAMPLES = int(ms)
    #print 'M_THRESHOLD:%i, FDR_THRESHOLD:%.2f, MAX_SAMPLES:%s' % (M_THRESHOLD,FDR_THRESHOLD,str(MAX_SAMPLES))
    #print 'min samples:%i, samples per chunk:%i' % (stats[0]._numResamplings, NUM_SAMPLES_PER_CHUNK)
    assert fc in [None, 'individual','simultaneous'], 'fdrCriterion:'+str(fc)
    individualFdr = (fc == 'individual')
    #print 'FDR criterion: %s' % fc
    if fc is None:
        logMessage('Warning: empty fdrCriterion, using simultaneous')
    #USE_MC_FDR = True #if false, use only standard sequential MC, without checking q-values
    from gold.application.RSetup import r
    import numpy
    #print '<pre>'
    #pvals = [x.getResult()[RandomizationManagerStatUnsplittable.PVAL_KEY] for x in stats]
    # Collect p-value, M (extreme-sample count) and sample count per stat;
    # stats whose result lookup fails are marked invalid.
    pvals = range(len(stats))
    allMs = range(len(stats))
    allNumSamples = range(len(stats))
    isInValid = range(len(stats))
    for i,x in enumerate(stats):
        try:
            pvals[i] = x.getResult()[RandomizationManagerStatUnsplittable.PVAL_KEY]
            allMs[i] = x.getResult()[RandomizationManagerStatUnsplittable.M_KEY]
            allNumSamples[i] = x.getResult()[RandomizationManagerStatUnsplittable.NUM_SAMPLES_KEY]
            isInValid[i] = False
        except:
            pvals[i] = None
            allMs[i] = None
            allNumSamples[i] = None
            isInValid[i] = True
    #print 'P: ',pvals
    #print 'Stats: ',stats
    #print 'LEN: ',len(stats)
    fdrVals = McFdr.adjustPvalues(pvals, verbose=False)
    #if not type(fdrVals) in (list,tuple):
    #    fdrVals = [fdrVals]
    #print 'FDR: ', fdrVals
    #allMs = [x.getResult()[RandomizationManagerStatUnsplittable.M_KEY] for x in stats] #maybe just access stat object directly to get this..
    #allMs = range(len(stats))
    #for i,x in enumerate(stats):
    #    try:
    #        allMs[i] = x.getResult()[RandomizationManagerStatUnsplittable.M_KEY]
    #    except:
    #        allMs[i] = None
    #determinedByM = [M_THRESHOLD is not None and m is not None and m>=M_THRESHOLD for m in allMs]
    determinedByM = [M_THRESHOLD is not None and m>=M_THRESHOLD for m in allMs]
    determinedByFdr = [FDR_THRESHOLD is not None and not numpy.isnan(f) and f<FDR_THRESHOLD for f in fdrVals]
    determinedByMaxSamples = [MAX_SAMPLES is not None and n>=MAX_SAMPLES for n in allNumSamples]
    statIndividuallyDetermined = list(any(x) for x in zip(determinedByM,determinedByMaxSamples,isInValid)) #determined by anything except FDR, as the latter is not necessarily handled on a per test level..
    statDeterminedByAnyMeans = list(any(x) for x in zip(statIndividuallyDetermined, determinedByFdr)) #determined individually or by FDR
    assert len(stats) == len(pvals) == len(fdrVals) == len(allMs) == len(determinedByM) == len(determinedByFdr) == len(statIndividuallyDetermined)
    #print '</pre>'
    #print allMs
    #print fdrVals
    #ndIndexes = [i for i in range(len(statDetermined)) if not statDetermined[i]]
    #print 'INDEXES: ' + ','.join([str(x) for x in ndIndexes]), '<br>'
    #print 'M-VALUES: ' + ','.join([str(allMs[x]) for x in ndIndexes]), '<br>'
    #print 'P-VALUES: ' + ','.join([str(pvals[x]) for x in ndIndexes]), '<br>'
    #print 'FDR-VALUES: ' + ','.join([str(fdrVals[x]) for x in ndIndexes]), '<br>'
    # Reset undetermined stats so another chunk of MC samples will be drawn.
    for i in range(len(statIndividuallyDetermined)):
        determined = statIndividuallyDetermined[i] or (individualFdr and determinedByFdr[i])
        if not determined:
            if hasattr(stats[i], '_result'):
                del stats[i]._result
            else:
                print 'no _result to delete at index %i in stats: '%i #, stats
                print 'obj details: ',stats[i]._region
            stats[i]._numResamplings += cls.NUM_SAMPLES_PER_CHUNK #get number from mcFdr..
    #return all(statDeterminedByAnyMeans)
    #returns number of not determined stats..
    return sum((1 if not determined else 0) for determined in statDeterminedByAnyMeans)
def _run_local(self, task, sfunc, sargs, worker, job): """Runs a job locally""" if self._exiting: return self.logger.info("Task %i started", task.tid) start_time = time.time() sresult = None while sresult == None: try: worker.t.csend(sfunc) worker.t.send(sargs) except: if self._exiting: return if SHOW_EXPECTED_EXCEPTIONS: self.logger.debug("Exception sending task in _run_local (possibly expected)", exc_info=True) #print "exception in run_local for group %s for worker %s when fetching result for tid %s: (thread is %s)" % (job.group, worker, task.tid, threading.current_thread()) raise while sresult == None: try: sresult = worker.t.receive(timeout=10) except pptransport.TimeoutException: #Can happen because of R crashing... if worker.is_alive(): continue else: print "Worker appears to have crashed for task %d, reinserting task..." worker.stop() job.worker = self.getFreeWorker() self.insert(sfunc, sargs, job, task) self.__scheduler() return except: if self._exiting: return if SHOW_EXPECTED_EXCEPTIONS: self.logger.debug("Exception receiving result in _run_local (possibly expected)", exc_info=True) raise task.finalize(sresult) # remove the job from the waiting list if self.__waittasks: with self.__waittasks_lock: self.__waittasks.remove(task) job.taskFinished() worker.free() self.__add_to_active_tasks(-1) if not self._exiting: self.__stat_add_time("local", time.time()-start_time) #self.logger.debug("Task %i ended", task.tid) #logging.getLogger(PARALLEL_LOGGER).debug("Task %i ended", task.tid) logMessage("Task %i ended" % task.tid, level = 5, logger = PARALLEL_LOGGER) self.__scheduler()
def validateAndPossiblyResetGlobalResult(cls, stat):
    """Convergence check for a single (global) Monte Carlo statistic.

    Mirrors validateAndPossiblyResetLocalResults for one stat: determined if
    M >= mThreshold, p-value below fdrThreshold, maxSamples reached, or the
    result is invalid. If undetermined, deletes the cached _result and
    increases _numResamplings so more samples are drawn. Returns 1 if the
    stat is NOT yet determined, otherwise 0.
    """
    mt = stat._kwArgs.get('mThreshold')
    ft = stat._kwArgs.get('fdrThreshold')
    ms = stat._kwArgs.get('maxSamples')
    fc = stat._kwArgs.get('fdrCriterion')
    M_THRESHOLD = int(mt) if mt is not None else 20
    FDR_THRESHOLD = float(ft) if ft is not None else 0.1
    if ms is None:
        MAX_SAMPLES = 50000
    elif type(ms) is int:
        MAX_SAMPLES = ms
    elif ms.lower() == 'unlimited':
        MAX_SAMPLES = None
    else:
        MAX_SAMPLES = int(ms)
    #print 'M_THRESHOLD:%i, FDR_THRESHOLD:%.2f, MAX_SAMPLES:%s' % (M_THRESHOLD,FDR_THRESHOLD,str(MAX_SAMPLES))
    #print 'min samples:%i, samples per chunk:%i' % (stats[0]._numResamplings, NUM_SAMPLES_PER_CHUNK)
    assert fc in [None, 'individual','simultaneous'], 'fdrCriterion:'+str(fc)
    individualFdr = (fc == 'individual')
    #print 'FDR criterion: %s' % fc
    if fc is None:
        logMessage('Warning: empty fdrCriterion, using simultaneous')
    #USE_MC_FDR = True #if false, use only standard sequential MC, without checking q-values
    from gold.application.RSetup import r
    import numpy
    #print '<pre>'
    #pvals = [x.getResult()[RandomizationManagerStatUnsplittable.PVAL_KEY] for x in stats]
    #pvals = range(len(stats))
    #allMs = range(len(stats))
    #allNumSamples = range(len(stats))
    #isInValid = range(len(stats))
    #for i,x in enumerate(stats):
    try:
        pval = stat.getResult()[RandomizationManagerStatUnsplittable.PVAL_KEY]
        mVal = stat.getResult()[RandomizationManagerStatUnsplittable.M_KEY]
        numSamples = stat.getResult()[RandomizationManagerStatUnsplittable.NUM_SAMPLES_KEY]
        isInValid = False
    except:
        pval= None
        mVal = None
        numSamples = None
        isInValid = True
        #raise
    #print type(pval)
    determinedByM = M_THRESHOLD is not None and mVal>=M_THRESHOLD
    determinedByFdr = FDR_THRESHOLD is not None and pval is not None and not numpy.isnan(pval) and pval<FDR_THRESHOLD
    determinedByMaxSamples = MAX_SAMPLES is not None and numSamples>=MAX_SAMPLES
    #print 'TEMP statdet1: ',determinedByM, determinedByFdr, determinedByMaxSamples
    statDetermined = any([determinedByM, determinedByFdr, determinedByMaxSamples,isInValid]) #determined by anything except FDR, as the latter is not necessarily handled on a per test level..
    #statDeterminedByAnyMeans = list(any(x) for x in zip(statIndividuallyDetermined, determinedByFdr)) #determined individually or by FDR
    #assert len(stats) == len(pvals) == len(fdrVals) == len(allMs) == len(determinedByM) == len(determinedByFdr) == len(statIndividuallyDetermined)
    if not statDetermined:
        if hasattr(stat, '_result'):
            del stat._result
        else:
            print 'no _result to delete at global level'
            #print 'obj details: ',stats._region
        stat._numResamplings += cls.NUM_SAMPLES_PER_CHUNK #get number from mcFdr..
    #return all(statDeterminedByAnyMeans)
    #returns number of not determined stats..
    #print 'TEMP statdet: ',statDetermined
    return (1 if not statDetermined else 0)
def globalAnalysisEnded(self): if not self._printProgress: return if self._startGlobalTime is None: logMessage('Called globalAnalysisEnded without globalAnalysisStarted being called before.', level=logging.WARN) print "\nglobal analysis took %f seconds" % (time.time() - self._startGlobalTime)
def _generateAnswerText(self, coreCls):
    """Build the HTML 'answer' section of a results page.

    Produces three boxes via coreCls (an HtmlCore-like factory): the question
    asked, a 'simplistic answer' (yes/maybe/no from p-value thresholds, or
    significant-bin counts when only local p-values exist), and a 'precise
    answer' (p-values, test statistic, null model, run-description links).
    Returns the assembled HTML as a string.
    """
    onlyLocalPvals = self._results.hasOnlyLocalPvals()
    globalPval = self._results.getGlobalResult().get(self._results.getPvalKey()) if not onlyLocalPvals else None
    localPvalsUrl = getRelativeUrlFromWebPath(os.sep.join([self._baseDir, 'table.html']))
    #problematic dependency towards fn in tablePresenter..
    #Change to something like this when StaticFile has been used throughout presenters..
    #from quick.util.StaticFile import GalaxyRunSpecificFile
    #GalaxyRunSpecificFile([], '', self._galaxyFn)

    core = coreCls()
    core.styleInfoBegin(styleClass="infomessagesmall answerbox question")
    core.header('You asked:')
    core.line(str(coreCls().highlight(self._getHeader())))
    core.styleInfoEnd()

    #Simplistic answer
    core.styleInfoBegin(styleClass="infomessagesmall answerbox simplisticanswer")
    core.header(str(coreCls().link('Simplistic answer:', '#', \
                                   args='''onclick="return toggle('simplistic_answer_expl')"''')))
    core.styleInfoBegin(styleId="simplistic_answer_expl", styleClass="infomessagesmall explanation")
    if onlyLocalPvals :
        core.line('''
Under "simplistic answer" you will find a simple statement on whether there were any findings for the local analysis.
The number of significant bins at 10% false discovery rate (FDR) is provided.<br>
<br>
It is not possible to draw a decisive conclusion based on a p-value, so the statements are only meant as simple indications.<br>
''')
    else:
        core.line('''
Under "simplistic answer" you will find a yes/maybe/no-conclusion answer to the question asked, based on a simple thresholding scheme on the p-value:<br>
"yes" if p-value < 0.01<br>
"maybe" if 0.01 < p-value < 0.1<br>
"no conclusion" if p-value > 0.1<br>
<br>
It is not possible to draw a decisive conclusion based on a p-value, so the statements are only meant as simple indications.<br>
''')
    core.styleInfoEnd()

    if onlyLocalPvals:
        numSign, numTested, numIgnored = self._results.getFdrSignBins()
        if numSign == numTested and numSign != 0:
            simplisticPhrase = 'Yes - the data suggests this for all bins'
        elif numSign>0:
            simplisticPhrase = 'Yes - the data suggests this at least in some bins'
            numSign, numTested, numIgnored = self._results.getFdrSignBins()
            simplisticPhrase += ' (%i significant bins out of %i, at %i%% FDR' % (numSign, numTested, self._results.FDR_THRESHOLD*100)
        else:
            simplisticPhrase = 'No support from data for this conclusion in any bin'
        core.line(str(coreCls().highlight(simplisticPhrase)))
    else:
        assert globalPval is not None
        directionality = ''
        # For two-sided tests, annotate whether the observed test statistic is
        # above or below its expected value.
        if self._results._analysis.isTwoSidedTest():
            tsValue, expTsValue = self._getTestStatisticAndExpectedValues()
            if tsValue is not None and expTsValue is not None and tsValue!=expTsValue:
                directionality = '(higher) ' if tsValue > expTsValue else '(lower) '
        if globalPval < 0.01:
            simplisticPhrase = 'Yes %s- the data suggests this' % directionality
        elif globalPval < 0.1:
            simplisticPhrase = 'Maybe %s- weak evidence' % directionality
        else:
            simplisticPhrase = 'No support from data for this conclusion'
        core.line(str(coreCls().highlight(simplisticPhrase + ' (p-value: ' + strWithStdFormatting(globalPval) + ')' )))
    core.styleInfoEnd()

    #Precise answer
    core.styleInfoBegin(styleClass="infomessagesmall answerbox preciseanswer")
    core.header(str(coreCls().link('Precise answer:', '#', \
                                   args='''onclick="return toggle('precise_answer_expl')"''')))
    core.styleInfoBegin(styleId="precise_answer_expl", styleClass="infomessagesmall explanation")
    if onlyLocalPvals :
        core.line('''
Significance testing evaluates a <b>null hypothesis (H0)</b> versus an <b>alternative hypothesis (H1)</b>.
Low <b>p-values</b> are evidence against H0.
The testing involves comparing the observed value of a <b>test statistic</b> to the distribution of the test statistic under a <b>null model</b>.
The testing was performed in each local bin, with a list of FDR-corrected p-values per bin provided.
''')
    else:
        core.line('''
Significance testing evaluates a <b>null hypothesis (H0)</b> versus an <b>alternative hypothesis (H1)</b>.
Low <b>p-values</b> are evidence against H0.
The testing involves comparing the observed value of a <b>test statistic</b> to the distribution of the test statistic under a <b>null model</b>.
''')
    core.styleInfoEnd()

    EffectSizeText = 'Please note that both the effect size and the p-value should be considered in order to assess the practical significance of a result.'
    FDR_text = '* False Discovery Rate: The expected proportion of false positive results among the significant bins is no more than %i%%.' \
        % (self._results.FDR_THRESHOLD*100)

    if onlyLocalPvals:
        numSign, numTested, numIgnored = self._results.getFdrSignBins()
        core.line(str(coreCls().highlight('%i significant bins out of %i, at %i' \
                                          % (numSign, numTested, self._results.FDR_THRESHOLD*100) + '% FDR*')))
        core.line('')
        localPvalsLink = str(coreCls().link('collection of FDR-corrected p-values per bin', localPvalsUrl))
        notComputeLink = str(coreCls().link('Not able to compute', '#', \
                                            args='''onclick="return toggle('no_global_pval_expl')"'''))
        core.line('A ' + localPvalsLink + ' was computed. ' + notComputeLink + ' a global p-value for this analysis.')
        core.styleInfoBegin(styleId="no_global_pval_expl", styleClass="infomessagesmall explanation")
        core.line('(Explanation to appear in box)')
        core.styleInfoEnd()
        if numIgnored > 0:
            core.line('')
            core.line('%s bin%s excluded due to lack of data.' % (numIgnored, 's' if numIgnored > 1 else ''))
        core.line('')
        core.line(EffectSizeText)
        core.line('')
        core.line(FDR_text)
        h0h1Text = self._getH0andH1Text(coreCls)
        if h0h1Text is not None:
            core.divider(withSpacing=True)
            core.line('In each bin, the test of')
            core.append(h0h1Text)
            core.line('was performed.')
    else:
        h0h1Text = self._getH0andH1Text(coreCls)
        if h0h1Text is not None:
            core.line('The p-value is %s for the test' % strWithStdFormatting(globalPval) )
            core.append(h0h1Text)
        else:
            core.line('The p-value is %s.' % strWithStdFormatting(globalPval) )
        core.line('')
        core.line('Low p-values are evidence against H0.')
        numSign, numTested, numIgnored = self._results.getFdrSignBins()
        if numTested+numIgnored > 1:
            localPvalsLink = str(coreCls().link('each bin separately', localPvalsUrl))
            excludeText = ' (%i bin%s excluded from FDR-analysis due to lacking p-values).' \
                % (numIgnored, 's' if numIgnored>1 else '.') if numIgnored>0 else ''
            core.line('')
            core.line('The test was also performed for ' + localPvalsLink + \
                      ', resulting in %i significant bins out of %i, at %i%% FDR*' % (numSign, numTested, self._results.FDR_THRESHOLD*100) +\
                      excludeText)
            core.line('')
            core.line(EffectSizeText)
            core.line('')
            core.line(FDR_text)

    nullModel = self._results._analysis.getNullModel()
    if nullModel is not None:
        core.divider(withSpacing=True)
        core.line('P-values were computed under the %s defined by the following preservation and randomization rules:' \
                  % str(coreCls().highlight('null model')))
        core.paragraph(nullModel, indent=True)

    testStatistic = self._results.getTestStatisticText()
    if testStatistic != None:
        #pick out relevant part:
        # Strip a leading 'Test statistic:'-like prefix from the description.
        mo = re.search('^[tT]est.[sS]tatistic ?:? ?',testStatistic)
        if mo!= None:
            testStatistic = testStatistic[mo.end():]
        #if len(testStatistic)>0 and testStatistic[0]=='(':
        #testStatistic = testStatistic[1:]
        #if len(testStatistic)>0 and testStatistic[-1]==')':
        #testStatistic = testStatistic[:-1]
        tsValue, expTsValue = self._getTestStatisticAndExpectedValues()
        core.divider(withSpacing=True)
        core.line('The %s used is:' % str(coreCls().highlight('test statistic')))
        core.paragraph(testStatistic, indent=True)
        if tsValue is not None:
            if expTsValue is not None:
                core.line('The value of the test statistic is %s, which is %s the expected value: %s.' \
                          % (strWithStdFormatting(tsValue), \
                             (str(coreCls().emphasize('higher')) + ' than' if tsValue > expTsValue else \
                              (str(coreCls().emphasize('lower')) + ' than' if tsValue < expTsValue else \
                               str(coreCls().emphasize('equal')) + ' to')), \
                             strWithStdFormatting(expTsValue)))
            else:
                core.line('The value of the test statistic is %s.' % (strWithStdFormatting(tsValue)))

    #temporary solution, as lacking objects needed to construct note-link directly..
    noteText = ''
    if self._results._runDescription is not None:
        #mo = re.search('<note.*note>', self._results._runDescription)
        mo = re.search('<a href[^>]*/notes/[^>]*>[^<]*</a>', self._results._runDescription)
        if mo is not None:
            noteLink = mo.string[mo.start():mo.end()]
            noteText = ' See ' + noteLink + ' for a more complete description of the test.'
    if noteText == '':
        logMessage('Note-link not found in runDescription, and thus omitted from results')

    core.divider(withSpacing=True)
    runDescLink = str(coreCls().link('run description', '#', \
                                     args='''onclick="return toggle('run_description')"'''))
    core.line('The p-values may be subject to further parameter choices, which are listed in the %s.' %\
              (runDescLink) + noteText)

    core.divider(withSpacing=True)
    resultsLink = str(coreCls().link('See full details', '#', \
                                     args='''onclick="return toggle('results_box')"'''))
    core.line(resultsLink + ' of the results in table form.')
    core.styleInfoEnd()

    return str(core)
def getRunDescription(trackName1, trackName2, trackNameIntensity, analysisDef, ubSource, revEngBatchLine, \
                      urlForTrackAutoSelection, manualSeed, **kwArgs):
    """
    Build the HTML 'run description' document for an analysis run.

    Sections emitted, in order: genome, each involved track (1, 2, intensity),
    the analysis with its chosen options, null/alternative hypotheses,
    analysis regions, solution (statistic description), time of analysis,
    optional track-autoselection URL and batch-run line, and references.

    Returns the complete document as an HTML string.
    """
    genome = ubSource.genome
    core = HtmlCore()

    analysis = Analysis(analysisDef, genome, trackName1, trackName2, **kwArgs)

    core.header('GENOME')
    core.append(GenomeInfo(genome).mainInfo(printEmpty=False))
    core.divider()

    # Format-converter choices come as (key, text) pairs; exactly two pairs
    # are expected for a two-track analysis, otherwise no choice is shown.
    formatChoices = analysis.getFormatConverterChoicesAsText().items()
    tr1FormatChoice, tr2FormatChoice = formatChoices if len(formatChoices) == 2 else (None, None)

    first = True
    for tn,label,formatChoice in zip([trackName1,trackName2,trackNameIntensity], \
                                     ['TRACK 1','TRACK 2','INTENSITY TRACK'], \
                                     [tr1FormatChoice,tr2FormatChoice,None]):
        # Skip tracks that are not part of this run.
        if tn in [None, []]:
            continue

        if not first:
            core.divider()
        core.header(label)

        trackInfo = TrackInfo(genome, tn)
        trackText = ''  # NOTE(review): never used afterwards — looks like dead code
        if ExternalTrackManager.isHistoryTrack(tn):
            assert len(tn)>=4, 'Length of external track name < 4: %s' % str(tn)
            core.descriptionLine('Name', ExternalTrackManager.extractNameFromHistoryTN(tn) + ' (from history)' + os.linesep)
        else:
            core.descriptionLine('Name', ':'.join(tn) + os.linesep)
        core.append(trackInfo.mainInfo(printEmpty=False))

        if formatChoice is not None:
            core.descriptionLine('Treated as', formatChoice[1])
        first = False

    core.divider()
    core.header('ANALYSIS')
    # Drop the leading 'label:' prefix of the analysis string representation.
    core.paragraph( ''.join(str(analysis).split(':')[1:]) )

    first = True
    for label,choice in analysis.getInterfaceChoicesAsText().items():
        # Emit the OPTIONS header only once, and only if there is at least one choice.
        if first:
            core.divider()
            core.header('OPTIONS')

        # Show the concrete seed actually used when the user selected 'Random'.
        if manualSeed is not None and label == 'Random seed' and choice == 'Random':
            choice = str(manualSeed)

        core.descriptionLine(label, choice)
        first = False

    h0 = analysis.getH0()
    if h0 is not None:
        core.divider()
        core.header('NULL HYPOTHESIS')
        core.paragraph(h0)

    h1 = analysis.getH1()
    if h1 is not None:
        core.divider()
        core.header('ALTERNATIVE HYPOTHESIS')
        core.paragraph(h1)

    core.divider()
    core.header('ANALYSIS REGIONS')
    if hasattr(ubSource, 'description'):
        core.paragraph(ubSource.description)

    core.divider()
    core.header('SOLUTION')
    statClass = analysis.getStat()
    #One alternative is to put getDescription in MagicStatFactory-hierarchy as class-method, and get real class behind partial-object.
    #if isinstance(statClass, functools.partial):
    #    statClass = statClass.func
    #core.paragraph( statClass.getDescription() )
    #Chosen alternative is to Instantiate an object, which will automatically give object of real class..
    #and then use the following two lines, which will get class in Statistic-hierarchy instead of MagicStatFactory-hierarchy ..
    try:
        # First analysis region, needed to instantiate the statistic object.
        reg = ubSource.__iter__().next()
    except:
        # NOTE(review): bare except — any failure to fetch a first region is
        # treated as 'no analysis regions specified'.
        core.paragraph('Solution not relevant, as there are no specified analysis regions..')
    else:
        track1, track2 = analysis.getTracks()
        if statClass is None:
            core.paragraph('Solution not available, due to currently invalid analysis')
            logMessage('Solution not available, with params: ' + str([trackName1, trackName2, analysisDef]), level=logging.WARN )
        else:
            # Instantiating gives an object of the real Statistic class, whose
            # description may contain <note>...</note> markers; rewrite those
            # into links to the static note pages.
            statObj = statClass(reg,track1, track2)
            statDescr = statObj.getDescription()
            replPat = '<a href=' + os.sep.join([STATIC_REL_PATH,'notes','stats','']) + r'\1>note</a>'
            statDescr = re.sub('<note>(.*)</note>', replPat, statDescr)
            core.paragraph( statDescr )

    core.divider()
    core.header('TIME OF ANALYSIS')
    core.paragraph('Analysis initiated at time: ' + str( datetime.datetime.now() ) )

    if urlForTrackAutoSelection not in [None, '']:
        core.divider()
        core.header('URL FOR TRACK AUTOSELECTION')
        #urlOptions = '&'.join(['track1=' + quote(':'.join(trackName1)), 'track2=' + quote(':'.join(trackName2))])
        #core.paragraph(URL_PREFIX + '/hyper?' + urlOptions)
        core.styleInfoBegin(styleClass='break-word')
        core.paragraph(urlForTrackAutoSelection)
        core.styleInfoEnd()

    if revEngBatchLine not in [None, '']:
        core.divider()
        core.header('CORRESPONDING BATCH-RUN LINE')
        #if any(ExternalTrackManager.isRedirectOrExternalTrack(tn) for tn in [trackName1, trackName2]):
        #    core.paragraph('Batch-run line not available with tracks from history')
        #else:
        core.styleInfoBegin(styleClass='break-word')
        core.paragraph(revEngBatchLine)
        core.styleInfoEnd()

    core.divider()
    core.header('REFERENCES')
    core.paragraph('The HyperBrowser system is described in:<br>"Sandve et al., <a href="http://genomebiology.com/2010/11/12/R121/">The Genomic HyperBrowser: inferential genomics at the sequence level</a>, Genome Biol. 2010;11(12):R121')

    # Cite the MCFDR paper only when that Monte Carlo sampling scheme was used.
    from gold.statistic.RandomizationManagerStat import RandomizationManagerStat
    if statClass is not None and RandomizationManagerStat.getMcSamplingScheme(statClass.keywords) == 'MCFDR':
        core.paragraph('The p-values of this analysis were computed using the MCFDR scheme for Monte Carlo based p-value computation'+\
                       ', described in:<br>Sandve et al., <a href="http://bioinformatics.oxfordjournals.org/content/early/2011/10/13/bioinformatics.btr568.long">Sequential Monte Carlo multiple testing</a>, Bioinformatics 2011')

    # description = \
    #'''
    #Run descriptions will be introduced in the next version of HB. <br>
    #Below is an example run description, which is a static text unconnected to your choices. The purpose is to get feedback from you on what this should look like:<br>
    #Track1 (refseg:genes): Unmarked points (converted from unmarked segments, taking midpoints)<br>
    #Track2 (DNA melting:meltmap): Function<br>
    #Bins: Chr1, divided into bins of 10 megabases<br>
    #Question: Are track1-points occurring with different frequency inside track2-segment than outside?<br>
    #Analysis:<br>
    #The main result is a p-value resulting from a statistical test connected to the question.<br>
    #The null-hypothesis assumes that the track1-points are randomly distributed according to a poisson-distribution, with the same number of points as in the original data. Track2-segment are assumed fixed as they are in the original data. This can be answered by a binomial test. The alternative hypothesis is then that the count of points inside segments has resulted from a different distribution of points, where the points are then either distributed more or less inside segments versus outside. See the note on this question in the user guide for further info.<br>
    #'''

    return str(core)
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """
    Validate and persist bounding regions into the bounding-region shelve file.

    boundingRegionTuples: iterable of objects with .region (chromosome, start,
        end) and .elCount (number of track elements in that region); regions
        must be grouped by chromosome and sorted, non-overlapping and
        non-adjoining within each chromosome.
    genomeElementChrList: chromosomes (sequences) that actually contain data.
    sparse: True if the track representation is sparse (element indices are
        recorded per chromosome and bin indices are computed); False if dense
        (each region's length must equal its element count).

    Raises InvalidFormatError for ungrouped, unsorted, overlapping, adjoining,
    zero-length or count-inconsistent regions, and when data exists for a
    chromosome that has no bounding region.
    """
    assert sparse in [False, True]

    tempContents = OrderedDict()  # chr -> (region start -> BoundingRegionInfo)
    genomeElementChrs = set(genomeElementChrList)

    lastRegion = None
    chrStartIdxs = OrderedDict()  # sparse only: chr -> first global element index
    chrEndIdxs = OrderedDict()    # sparse only: chr -> one-past-last global element index
    totElCount = 0
    totBinCount = 0

    # First pass: validate ordering constraints and record element index
    # ranges for every bounding region.
    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # New chromosome: all regions of one chromosome must appear
            # contiguously in the input.
            if br.region.chr in tempContents:
                raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)
            lastRegion = None
            tempContents[br.region.chr] = OrderedDict()  #sorteddict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            # Same chromosome as the previous region: enforce sorted order,
            # no overlap, and a gap between consecutive regions.
            if br.region < lastRegion:
                raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))

        if len(br.region) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length." % br.region)

        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))

        # Dense tracks get a global element index range per region; sparse
        # tracks only track per-chromosome ranges (filled in below).
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount

        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(br.region.start, br.region.end, startIdx, endIdx, 0, 0)

        lastRegion = br.region

    if sparse:
        # Second pass (sparse only): attach per-chromosome element index
        # ranges and global bin index ranges to each BoundingRegionInfo.
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
            for key in tempContents[chr].keys():
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]

                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                startBinIdx, endBinIdx)
                else:
                    # A chromosome without data must not claim any elements.
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, 0, 0, 0, 0)

            # Bin indices only advance for chromosomes that contain data.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr

    # Every chromosome with data must be covered by at least one bounding region.
    if len(genomeElementChrs - set(tempContents.keys())) > 0:
        raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(genomeElementChrs - set(tempContents.keys())))

    ensurePathExists(self._fn)

    # Pack each chromosome's per-start dict into an immutable holder before shelving.
    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))

    brShelve = safeshelve.open(self._fn)
    brShelve.update(tempContents)
    brShelve.close()

    # Wait until the shelve file becomes visible on disk (presumably to cope
    # with delayed file visibility, e.g. on network file systems).
    # NOTE(review): this loops forever if the file never appears — confirm
    # that safeshelve.open is guaranteed to create it.
    while not self.fileExists():
        from gold.application.LogSetup import logMessage
        logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)
def __init__(self, category, numSamples):
    """Set up an empty sample collector for a single category.

    :param category: identifier of the category this instance collects values for
    :param numSamples: number of samples requested for this category
    """
    self._category = category
    self._numSamples = numSamples
    # Debug trace of the category being set up ('kategori' is Norwegian for
    # 'category'); message text kept verbatim.
    logMessage('kategori = %s' % self._category)
    # No samples collected yet.
    self._count = 0
    self.valueDict = {}
def storePickledResults(self):
    """
    Pickle the accumulated results list to a run-specific 'results.pickle' file.

    Each result is shallow-copied and its _analysis reference cleared before
    dumping (presumably because the analysis object should not end up in the
    pickle — confirm). Persistence is best-effort: all failures are logged,
    never raised.
    """
    try:
        from cPickle import dump
        pickleStaticFile = GalaxyRunSpecificFile(['results.pickle'],self._galaxyFn)
        #print 'TEMP1: PATH: ',pickleStaticFile.getDiskPath(True)
        from copy import copy
        # Shallow-copy so the originals keep their _analysis references.
        pickleList = [copy(res) for res in self._resultsList]
        for res in pickleList:
            res._analysis=None
        dump(pickleList, pickleStaticFile.getFile())
        #dump(self._resultsList, pickleStaticFile.getFile())
    except Exception, e:
        logException(e, message='Not able to pickle results object')
    except:
        # Python 2 allows raising objects not derived from Exception;
        # this catches those stragglers.
        logMessage('Exception object not subclassing Exception encountered',level=logging.ERROR)

class ResultsViewer(object):
    # Factory class: __new__ dispatches to the viewer subclass that matches
    # the results' presentation collection type.
    def __new__(self, results, baseDir):
        #print 'TEMP1 ', results.values()
        presCollectionType = results.getPresCollectionType()
        #print 'presCollectionType: ',presCollectionType
        if presCollectionType == 'standard':
            return StandardResultsViewer.__new__(StandardResultsViewer, results, baseDir)
        elif presCollectionType == 'distribution':
            return DistributionResultsViewer.__new__(DistributionResultsViewer, results, baseDir)
        elif presCollectionType == 'dictofdicts':
            return DictOfDictsResultsViewer.__new__(DictOfDictsResultsViewer, results, baseDir)
        elif presCollectionType == 'matrix':
            return MatrixResultsViewer.__new__(MatrixResultsViewer, results, baseDir)
        elif presCollectionType == 'scatter':