def inferValType(valList, shapeOffset=0):
    """
    Infer the GTrack value-type string for a value column.

    :param valList: the values to inspect: None, a list/tuple, a numpy array,
        or a SmartMemmap wrapping one. Lists/tuples are assumed numeric.
    :param shapeOffset: number of leading array dimensions to skip before the
        per-element dimensions (presumably used when valList holds values for
        many elements at once -- TODO confirm against callers).
    :returns: one of the type strings ('number', 'mean_sd', 'population',
        'tc', 'char', 'category', their '_vector' variants,
        'number (integer)', 'unsupported list') or False when valList is None.
    :raises ShouldNotOccurError: if the shape/dtype combination or the type of
        valList is not recognized.
    """
    if valList is None:
        # No value column at all
        return False
    elif type(valList) in [list, tuple]:
        # Plain sequences are treated as numeric without further inspection
        return 'number'
    elif isinstance(valList, numpy.ndarray) or isinstance(valList, SmartMemmap):
        # A 2-column float128 array encodes (mean, sd) pairs; this must be
        # checked before the generic float branch below, which would otherwise
        # classify it as 'population'.
        if len(valList.shape) == 2 + shapeOffset and valList.shape[
                1 + shapeOffset] == 2 and valList.dtype == numpy.dtype('float128'):
            return 'mean_sd'
        elif any(valList.dtype == numpy.dtype(x)
                 for x in ['float32', 'float64', 'float128']):
            if len(valList.shape) == 1 + shapeOffset:
                return 'number'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'population'
        # NOTE: deliberately 'if' (not 'elif'): a float array whose extra
        # dimension is < 2 falls through the branch above without returning,
        # and continues through the checks below to the fallback handling.
        if any(valList.dtype == numpy.dtype(x) for x in ['int32', 'int64']):
            if len(valList.shape) == 1 + shapeOffset:
                return 'number (integer)'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'population'
        elif any(valList.dtype == numpy.dtype(x) for x in ['int8', 'bool8']):
            # int8/bool8 encode ternary/binary ('tc') values
            if len(valList.shape) == 1 + shapeOffset:
                return 'tc'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'tc_vector'
        elif valList.dtype == numpy.dtype('S1'):
            # Single-character strings
            if len(valList.shape) == 1 + shapeOffset:
                return 'char'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'char_vector'
        elif _dtypeIsStringLongerThanOne(valList.dtype):
            # Longer strings are categorical values
            if len(valList.shape) == 1 + shapeOffset:
                return 'category'
            elif valList.shape[1 + shapeOffset] >= 2:
                return 'category_vector'
        # NOTE(review): this indexes shape[1 + shapeOffset]; for arrays with
        # only 1 + shapeOffset dimensions that did not match any branch above,
        # this would raise IndexError rather than the intended error below --
        # presumably such arrays cannot reach this point; verify.
        if valList.shape[1 + shapeOffset] == 0:
            return 'unsupported list'
        logMessage('Shape or dtype not recognized: ' + str(valList.shape) +
                   ' and ' + str(valList.dtype))
        raise ShouldNotOccurError()
    else:
        logMessage('Type of valList not recognized: ' + str(type(valList)))
        raise ShouldNotOccurError()
def __init__(self, region, trackStructure, *args, **kwArgs):
    """
    Store the region and track structure, validate optional kwArgs flags,
    and delegate the remaining setup to self._init().
    """
    from config.Config import IS_EXPERIMENTAL_INSTALLATION  # @UnresolvedImport

    if 'isExperimental' in kwArgs:
        experimentalFlag = kwArgs['isExperimental'].lower()
        if experimentalFlag not in ['false', 'true']:
            logMessage('isExperimental has value other than false/true',
                       level=logging.WARN)
            raise ShouldNotOccurError(
                'isExperimental has value other than false/true.')
        if experimentalFlag == 'true':
            # Experimental analyses are only allowed on experimental installs
            assert IS_EXPERIMENTAL_INSTALLATION, IS_EXPERIMENTAL_INSTALLATION

    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])

    self._region = region
    self._trackStructure = trackStructure

    #TODO:boris 20150924, Code for checking if query and reference (track and track2) are the same track.
    #We should decide if we will allow this in the future.
    #TODO: This should probably instead happen in the default _init method, so that when this is
    # overridden, one needs to explicitly store kwArgs if desired.
    #As it is now, parameters will be handled explicitly in _init while still becoming part of self_kwArgs
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def __init__(self, region, trackStructure, *args, **kwArgs):
    """
    Validate optional kwArgs flags, store region/track structure, then
    finish construction via self._init().
    """
    from config.Config import IS_EXPERIMENTAL_INSTALLATION  # @UnresolvedImport

    if 'isExperimental' in kwArgs:
        flagValue = kwArgs['isExperimental'].lower()
        if flagValue not in ['false', 'true']:
            logMessage('isExperimental has value other than false/true',
                       level=logging.WARN)
            raise ShouldNotOccurError(
                'isExperimental has value other than false/true.')
        if flagValue == 'true':
            # Only permitted when the installation itself is experimental
            assert IS_EXPERIMENTAL_INSTALLATION, IS_EXPERIMENTAL_INSTALLATION

    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])

    self._region = region
    self._trackStructure = trackStructure

    #TODO:boris 20150924, Code for checking if query and reference (track and track2) are the same track.
    #We should decide if we will allow this in the future.
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def _track(self):
    """Return the first track of the query list in the track structure."""
    trackStructure = self._trackStructure
    hasNonEmptyQueryList = (TrackStructure.QUERY_KEY in trackStructure and
                            trackStructure.getQueryTrackList())
    if not hasNonEmptyQueryList:
        raise ShouldNotOccurError(
            'Track structure must contain a query list of at least one track'
        )
    return trackStructure.getQueryTrackList()[0]
def _adjustComplementaryEdgeWeightDict(complementEdgeWeightDict, id, edges, weights): for index, edgeId in enumerate(edges): weight = weights[index] if weights is not None else '' if id in complementEdgeWeightDict and edgeId in complementEdgeWeightDict[id]: complWeight = complementEdgeWeightDict[id][edgeId] try: equal = numpy.all(complWeight == weight | numpy.isnan(complWeight) & numpy.isnan(weight)) except TypeError: try: equal = (complWeight == weight) or (numpy.isnan(complWeight) and numpy.isnan(weight)) except (TypeError, ValueError): equal = numpy.all(complWeight == weight) if not equal: raise InvalidFormatError("Error: edge ('%s' <-> '%s') is not undirected. The weight must be equal in both directions (%s != %s)" % (edgeId, id, complementEdgeWeightDict[id][edgeId], weights[index])) del complementEdgeWeightDict[id][edgeId] if len(complementEdgeWeightDict[id]) == 0: del complementEdgeWeightDict[id] elif id == edgeId: continue elif edgeId in complementEdgeWeightDict: if id in complementEdgeWeightDict[edgeId]: raise ShouldNotOccurError('Error: the complementary edge(%s) has already been added to complementEdgeWeightDict["%s"] ... ' % (id, edgeId)) complementEdgeWeightDict[edgeId][id] = weight else: complementEdgeWeightDict[edgeId] = {id: weight}
def _determineStatClass(self, flushMemoized=True):
    """
    Try each candidate statistic class against a minimal dummy bin source to
    find out which ones are valid for the current tracks and choices.

    Incompatibility errors are logged (and optionally re-raised when
    DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS is set); other exceptions
    propagate to the caller.
    """
    assert hasattr(self, '_track')
    assert hasattr(self, '_track2')
    dummyGESource = MinimalBinSource(self._genome)
    if len(self._statClassList) == 0:
        logMessage('Stat class list is empty, for analysisDef: ' + self._analysisLine,
                   level=logging.WARNING)
        if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
            raise ShouldNotOccurError('Stat class list is empty. Analysisdef: ' + self._analysisLine)
    for statClass in self._statClassList:
        if DebugConfig.VERBOSE:
            logMessage('Checking validity of stat class "{}" for analysisDef "{}".'.format(statClass.__name__, self.getDefAfterChoices()))
        trackA, trackB = self._track, self._track2
        if trackA is None:
            continue
        try:
            # A minimal run that raises if statClass cannot handle the tracks
            StatJob(dummyGESource, trackA, trackB, statClass, minimal=True,
                    **self.getAllChoices(filterByActivation=True)).run(False, flushMemoized=flushMemoized)
        # Bug fix: 'except X, e' is Python-2-only syntax; 'as' works on 2.6+/3.
        except IncompatibleTracksError as e:
            if DebugConfig.VERBOSE:
                logException(e, level=logging.DEBUG,
                             messagePrefix='Warning: error in _determineStatClass for stat: %s' % statClass.__name__)
            if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                raise
        except (AssertionError, IncompatibleAssumptionsError, IdenticalTrackNamesError) as e:
            if DebugConfig.VERBOSE:
                logException(e, level=logging.DEBUG,
                             messagePrefix='Warning: error in _determineStatClass for stat: %s' % statClass.__name__)
            if DebugConfig.PASS_ON_VALIDSTAT_EXCEPTIONS:
                raise
def _resolveFunction(self, summaryFunc): if summaryFunc not in self.functionDict: raise ShouldNotOccurError( str(summaryFunc) + ' not in list, must be one of ' + str(sorted(self.functionDict.keys()))) else: return self.functionDict[summaryFunc]
def __init__(self, region, track, track2=None, *args, **kwArgs):
    """
    Store region and track(s), validate optional kwArgs flags, and finish
    construction via self._init().
    """
    from config.Config import IS_EXPERIMENTAL_INSTALLATION

    if 'isExperimental' in kwArgs:
        experimentalFlag = kwArgs['isExperimental'].lower()
        if experimentalFlag not in ['false', 'true']:
            logMessage('isExperimental has value other than false/true',
                       level=logging.WARN)
            raise ShouldNotOccurError(
                'isExperimental has value other than false/true.')
        if experimentalFlag == 'true':
            # Only permitted on experimental installations
            assert IS_EXPERIMENTAL_INSTALLATION, IS_EXPERIMENTAL_INSTALLATION
        #else:
        #    assert IS_EXPERIMENTAL_INSTALLATION

    if 'assumptions' in kwArgs:
        self._checkAssumptions(kwArgs['assumptions'])

    self._region = region
    self._track = track
    if track2 not in [None, []]:
        # Only set when a real second track is given
        self._track2 = track2
    self._kwArgs = kwArgs
    self._init(**kwArgs)
    self._trace('__init__')
def _getValInCorrectType(self, val, valueOrEdgeWeight='value', isEmptyElement=False):
    """
    Convert a raw GTrack value (or edge weight) to its correct type,
    inferring the type on the fly when the header did not declare it.

    Candidate types are tried in a fixed promotion order (binary -> number ->
    category -> character). self._valTypeIndexDict remembers the index of the
    type last used for this column, so later values can only keep or promote
    the type, never demote it. The inferred type and dimension are written
    into self._headerDict unless already declared in the file's own header.

    :param val: the raw value to convert
    :param valueOrEdgeWeight: which column is being converted ('value' or
        presumably 'edge weight' -- TODO confirm against callers)
    :param isEmptyElement: passed through to the parent class conversion
    :raises ShouldNotOccurError: if val matches none of the candidate types
    """
    headerDictInFile = self.getHeaderDictInFile()
    # Promotion order: a value matching an earlier type is the most specific
    valTypeList = ['binary', 'number', 'category', 'character']
    for i, valueType in enumerate(valTypeList):
        # Skip types more specific than what this column has already needed
        if valueOrEdgeWeight in self._valTypeIndexDict and self._valTypeIndexDict[
                valueOrEdgeWeight] > i:
            continue
        valTypeInfo = GtrackGenomeElementSource.VAL_TYPE_DICT[valueType]
        if self._isValOfParticularType(val, valTypeInfo):
            self._noteIfAllValuesAreMissing(valueOrEdgeWeight, val, valTypeInfo)
            # Remember the promotion level reached for this column
            self._valTypeIndexDict[valueOrEdgeWeight] = i
            valueDim = self._getGtrackValueDim(val, valTypeInfo, valueOrEdgeWeight)
            # Only record inferred header values when the file itself did not
            # declare them explicitly
            if not '%s type' % valueOrEdgeWeight in headerDictInFile:
                self._headerDict['%s type' % valueOrEdgeWeight] = valTypeList[i]
            if not '%s dimension' % valueOrEdgeWeight in headerDictInFile:
                self._headerDict['%s dimension' % valueOrEdgeWeight] = valueDim
            # Delegate the actual conversion to the parent class
            return GtrackGenomeElementSource._getValInCorrectType(
                self, val, valueOrEdgeWeight, isEmptyElement)
    raise ShouldNotOccurError()
def _compute(self): if self._summaryFunction: results = [] for i, child in enumerate(self._children): results.append(child.getResult()) return self._summaryFunction(results) else: raise ShouldNotOccurError('The summary function is not defined')
def _compute(self): if self._multitrackSummaryFunc: res = [child.getResult() for child in self._children] if self._multitrackSummaryFunc == 'RawResults': return res else: return self._multitrackSummaryFunc(res) else: raise ShouldNotOccurError('The summary function is not defined')
def returnToStoredState():
    """
    Restore the Python, numpy and R random-number generator states from the
    snapshot previously stored in random._storedStates.

    :raises ShouldNotOccurError: if no state has been stored
    """
    if random._storedStates is None:
        # Bug fix: the error object was returned instead of raised, so a
        # missing stored state silently fell through to setstate(None[0]).
        raise ShouldNotOccurError(
            'Tried to return to previous random state without a stored state.')
    random.setstate(random._storedStates[0])
    numpy.random.set_state(random._storedStates[1])
    from proto.RSetup import r
    # Restore R's RNG by assigning the saved .Random.seed inside R
    r('function(state) {.Random.seed <- state}')(random._storedStates[2])
def createChildren(self):
    """
    Create the child statistic for the next bin, unless a result already
    exists or a child is currently active.

    :raises ShouldNotOccurError: if the bin iterator is empty
    """
    if self.hasResult() or self._curChild is not None:
        return
    self._trace('_createChildren')
    try:
        # Py2.6+/Py3-compatible: next(it) instead of Python-2-only it.next()
        self._curChild = self._getChildObject(next(self._bins))
    except StopIteration as e:
        logException(e)
        raise ShouldNotOccurError('Splittable statistic should not have zero bins!')
def _compute(self): if self._summaryFunction: if self._summaryFunction == 'RawResults': resultList = [child.getResult() for child in self._children] return resultList else: childrenResList = [child.getResult() for child in self._children] return self._summaryFunction(childrenResList) else: raise ShouldNotOccurError('The summary function is not defined. Must be one of %' % str(sorted(self.functionDict.keys())))
def returnToStoredFullState(self):
    """
    Restore the Python, numpy and R RNG states from self._storedFullState,
    then clear the stored snapshot.

    :raises ShouldNotOccurError: if no state has been stored
    """
    if self._storedFullState is None:
        # Bug fix: the error object was returned instead of raised, so a
        # missing stored state fell through to setstate(None[0]).
        raise ShouldNotOccurError(
            'Tried to return to previous random state without a stored state.'
        )
    self.setstate(self._storedFullState[0])
    numpy.random.set_state(self._storedFullState[1])
    from proto.RSetup import r
    # Restore R's RNG by assigning the saved .Random.seed inside R
    r('function(state) {.Random.seed <- state}')(self._storedFullState[2])
    self._storedFullState = None
def _commonGetBpLevelArray(self, vals): if self.trackFormat.reprIsDense(): if self.allowOverlaps: raise ShouldNotOccurError() return vals else: bpLevelArray = numpy.zeros(self._bpSize()+1) numElements = self.getNumElements() if numElements > 0: bpLevelArray += self._getBpLevelModificationArray(self.startsAsNumpyArray(), vals) bpLevelArray -= self._getBpLevelModificationArray(self.endsAsNumpyArray(), vals) bpLevelArray = bpLevelArray.cumsum(dtype='float64') return bpLevelArray[:-1]
def __new__(cls, sortedGeSource):
    """
    Factory: pick the overlap clusterer matching the element geometry of the
    geSource (segments, points, partitions or functions), based on which of
    the 'start'/'end' prefixes are present.
    """
    prefixes = sortedGeSource.getPrefixList()
    hasStart = 'start' in prefixes
    hasEnd = 'end' in prefixes
    if hasStart:
        # start+end -> segments; start only -> points
        return (GEOverlapClusterer_Segment(sortedGeSource) if hasEnd
                else GEOverlapClusterer_Point(sortedGeSource))
    if hasEnd:
        # end only -> partitions
        return GEOverlapClusterer_Partition(sortedGeSource)
    # neither -> function (one value per bp)
    return GEOverlapClusterer_Function(sortedGeSource)
def createBoundingRegionShelve(genome, trackName, allowOverlaps):
    """
    Build and store the bounding-region shelve for a preprocessed track,
    then sanity-check that the stored element counts add up.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    isDense = collector.getTrackFormat().reprIsDense()

    boundingRegionTuples = collector.getBoundingRegionTuples(allowOverlaps)
    if not isDense:
        # Sparse tracks need the bounding regions in sorted order
        boundingRegionTuples = sorted(boundingRegionTuples)

    geChrList = collector.getPreProcessedChrs(allowOverlaps)
    brShelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
    brShelve.storeBoundingRegions(boundingRegionTuples, geChrList, not isDense)

    #Sanity check
    if brShelve.getTotalElementCount() != collector.getNumElements(allowOverlaps):
        raise ShouldNotOccurError("Error: The total element count for all bounding regions is not equal to the total number of genome elements. %s != %s" % \
                                  (brShelve.getTotalElementCount(), collector.getNumElements(allowOverlaps)) )
def _getArchiveReader(cls, choices):
    """
    Return the archive reader (tar or zip) matching the suffix of the
    selected archive file.
    """
    from gold.gsuite.GSuiteArchiver import TarArchiveReader, ZipArchiveReader
    from quick.application.ExternalTrackManager import ExternalTrackManager

    suffix = ExternalTrackManager.extractFileSuffixFromGalaxyTN(
        choices.archive, allowUnsupportedSuffixes=True)
    fn = ExternalTrackManager.extractFnFromGalaxyTN(choices.archive)

    readerClsForSuffix = {'gsuite.tar': TarArchiveReader,
                          'gsuite.zip': ZipArchiveReader}
    if suffix not in readerClsForSuffix:
        raise ShouldNotOccurError()
    return readerClsForSuffix[suffix](fn)
def findEmptyVal(valDataType):
    """
    Return the 'missing value' sentinel for a dtype name: '' for strings,
    BINARY_MISSING_VAL for integers, NaN for floats, False for booleans.

    :param valDataType: dtype name as a string (e.g. 'float64', 'S8')
    :raises ShouldNotOccurError: for unsupported dtype names
    """
    if 'str' in valDataType or 'S' in valDataType:
        return ''
    if 'int' in valDataType:
        from gold.util.CommonConstants import BINARY_MISSING_VAL
        return BINARY_MISSING_VAL
    if 'float' in valDataType:
        return numpy.nan
    if 'bool' in valDataType:
        return False
    from gold.util.CustomExceptions import ShouldNotOccurError
    raise ShouldNotOccurError('Error: valDataType (%s) not supported.' % valDataType)
def _compute(self): array = self._children[0].getResult().valsAsNumpyArray() if len(array)==0: return numpy.nan assert array.dtype == "float32" or array.dtype == "float64" if self._aggregateOperation == 'sum': return float(array.sum(dtype="float64")) #accumulator must be 64-bit or rounding errors occur elif self._aggregateOperation == 'min': return float(array.min()) elif self._aggregateOperation == 'max': res = float(array.max()) #assert not any([v.isnan() for v in ]) return res else: raise ShouldNotOccurError()
def returns(sometype):
    "Return type checking decorator"
    # convert decorator argument into a checker
    checker = Checker.create(sometype)
    if checker is None:
        if RAISE_DEVIANCES:
            raise ShouldNotOccurError(
                "@returns decorator got parameter of unsupported "
                "type %s" % type_name(sometype))
        else:
            logMessageOnce("@returns decorator got parameter of unsupported "
                           "type %s" % type_name(sometype),
                           level=5, logger=SIGNATURE_DEVIANCE_LOGGER)
            # Bug fix: with checker left as None, the checking proxy below
            # would crash with AttributeError on checker.check(result) at
            # call time. After logging the deviance, degrade gracefully to
            # no checking, mirroring the NO_CHECK path.
            def returns_proxy(method):
                return method
            return returns_proxy

    if NO_CHECK:
        # no type checking is performed, return decorated method itself
        def returns_proxy(method):
            return method
    else:
        def returns_proxy(method):
            def returns_invocation_proxy(*args, **kwargs):
                result = method(*args, **kwargs)
                if not checker.check(result):
                    if RAISE_DEVIANCES:
                        raise ReturnValueError(
                            "%s() has returned an invalid "
                            "value of type %s" % (method.__name__,
                                                  type_name(result)))
                    else:
                        logMessageOnce("%s() has returned an invalid "
                                       "value of type %s" % (method.__name__,
                                                             type_name(result)),
                                       level=5, logger=SIGNATURE_DEVIANCE_LOGGER)
                return result
            returns_invocation_proxy.__name__ = method.__name__
            return returns_invocation_proxy
    return returns_proxy
def _getGenome(cls, choices): if hasattr(choices, 'genome'): return choices.genome else: gsuites = cls._getAllSelectedGsuites(choices) if len(gsuites) > 0: genomes = set(gsuite.genome for gsuite in gsuites) if len(genomes) == 1: genome = genomes.pop() if genome: return genome raise ShouldNotOccurError( 'Genome information is not provided in the selected genomes. ' 'Subclass of UserBinMixin should add a genome choice box using GenomeMixin, ' 'or override the cls._getGenome method')
def _compute(self):
    """
    Build a TSResult over the track structure from all children's results and
    set its overall result via the multitrack summary function ('RawResults'
    keeps the raw list).

    :raises ShouldNotOccurError: if no summary function is configured
    """
    tsResult = TSResult(self._computeTrackStructure)
    rawResults = []
    # Py2.6+/Py3-compatible: items() instead of Python-2-only iteritems()
    for key, child in self._childrenDict.items():
        childRes = child.getResult()
        tsResult[key] = childRes
        rawResults.append(childRes.getResult())
    if self._multitrackSummaryFunc:
        if self._multitrackSummaryFunc == 'RawResults':
            tsResult.setResult(rawResults)
        else:
            tsResult.setResult(self._multitrackSummaryFunc(rawResults))
    else:
        raise ShouldNotOccurError('The summary function is not defined')
    return tsResult
def getGlobalSource(globalSourceStr, genome, minimal):
    """
    Return the bin source identified by globalSourceStr for the given genome;
    a minimal dummy source is returned when minimal is True.
    """
    # Deliberately '== True' (matches the original exact-comparison semantics)
    if minimal == True:
        return MinimalBinSource(genome)
    if globalSourceStr == 'test':
        return UserBinSource('TestGenome:chr21:10000000-15000000', '1000000')
    if globalSourceStr == 'chrs':
        return GenomeInfo.getChrRegs(genome)
    if globalSourceStr == 'chrarms':
        return GenomeInfo.getChrArmRegs(genome)
    if globalSourceStr == 'ensembl':
        return GenomeInfo.getStdGeneRegs(genome)
    if globalSourceStr == 'userbins':
        from gold.application.StatRunner import StatJob
        assert StatJob.USER_BIN_SOURCE is not None
        return StatJob.USER_BIN_SOURCE
        #return kwArgs['userBins']
    raise ShouldNotOccurError('globalSource not recognized')
def _storeMinimalMemoResult(cls, stat, minimalMemoResult):
    """
    Store the minimal-run memoized result for a statistic, refusing to
    overwrite an already stored entry.
    """
    minimalMemoKey = cls._createMinimalMemoKey(stat)
    minimalMemoDict = cls.memoDataCollection[cls.MINIMAL_MEMO_PATH]

    if minimalMemoKey in minimalMemoDict:
        raise ShouldNotOccurError(
            'Trying to store minimal memo result for "{}"'.format(
                minimalMemoKey) +
            'when already present in minimal memoized result dict. The existing result '
            'should have been loaded and used, and no result should subsequently be stored.'
        )

    if DebugConfig.VERBOSE:
        if minimalMemoResult.error:
            logMessage('Storing exception "{}" for "{}"'.format(
                minimalMemoResult.error.exc_value, minimalMemoKey))
        else:
            logMessage('Storing result "{}" for "{}"'.format(
                minimalMemoResult.result, minimalMemoKey))
    minimalMemoDict[minimalMemoKey] = minimalMemoResult
def parseAndStoreProfile(stdout, testName, revision, diskMemo):
    """
    Parse profiler output from a test run and store the call counts, CPU time
    and the cumulative/internal profile sections under
    storage[testName][revision][diskMemo].
    """
    # Disk-memoized runs print two extra profile sections before the ones
    # parsed here
    offset = 2 if diskMemo else 0

    splittedStdout = stdout.split(Profiler.PROFILE_HEADER + os.linesep)
    # Raw string for the regex pattern (avoids relying on '\.' surviving as a
    # literal escape in a normal string)
    totStats = re.findall(r'([0-9\.]+)', splittedStdout[offset + 1].splitlines()[0])
    funcCalls = totStats[0]
    if len(totStats) == 2:
        # No separate primitive-call count reported
        primCalls = funcCalls
        cpuTime = totStats[1]
    elif len(totStats) == 3:
        primCalls = totStats[1]
        cpuTime = totStats[2]
    else:
        raise ShouldNotOccurError()

    cumProfile = splittedStdout[offset + 1].split(Profiler.PROFILE_FOOTER)[0]
    intProfile = splittedStdout[offset + 2].split(Profiler.PROFILE_FOOTER)[0]

    storage = ProfilingStorage._getStorage('c')
    # Bug fix: has_key() is Python-2-only and long deprecated; 'in' works on
    # both shelve and dict objects.
    if testName not in storage:
        storage[testName] = {}
    if str(revision) not in storage[testName]:
        storage[testName][str(revision)] = {}
    storage[testName][str(revision)][str(diskMemo)] = {'funcCalls': funcCalls,
                                                       'primCalls': primCalls,
                                                       'cpuTime': cpuTime,
                                                       'cumProfile': cumProfile,
                                                       'intProfile': intProfile}
    storage.close()
def __iter__(self): try: i = 0 while not self._finished: if len(self._countList) > self._curCountListIdx and \ i == self._countList[self._curCountListIdx]: self._curCountListIdx += 1 if len(self._countList) == self._curCountListIdx: self._finished = True break yield self._geIter.next() i += 1 except StopIteration: if self._finished: raise else: raise ShouldNotOccurError( 'Premature stop. GESource was shorter than sum of countList.' )
def _validateAllTracksRead(self):
    """
    Sanity check after computation: every distinct track used by this
    statistic must have been flagged as read.

    :raises ShouldNotOccurError: if called before a result exists
    :raises IncompatibleTracksError: if a distinct track was never read
    """
    if not self.hasResult():
        raise ShouldNotOccurError("At this stage, statistic should either have result, "
                                  "or exception should have been raised")

    tracks = self.getAllTracks()
    trackUniqueKeys = [Track(tr.trackName).getUniqueKey(self.getGenome())
                       for tr in tracks]

    # Bug fix: xrange is Python-2-only; range works on both (the list of
    # tracks is small, so materializing it in Python 2 is harmless).
    for trackIndex, restTrackIndexes in allElementsVersusRest(range(len(trackUniqueKeys))):
        track = tracks[trackIndex]
        if track is not None and not track.hasBeenFlaggedAsRead():
            uniqueKeysForRestTracks = \
                set(trackUniqueKeys[i] for i in restTrackIndexes)
            # If several tracks are the same, memory memoization will only result
            # in one RawDataStat being created, for one Track object. This is a
            # wanted optimization. In other cases, something is probably wrong if
            # a track has not been touched. However, this rule may be revisited
            # when track structure functionality is implemented.
            if trackUniqueKeys[trackIndex] not in uniqueKeysForRestTracks:
                raise IncompatibleTracksError(
                    'Track ' + prettyPrintTrackName(track.trackName) +
                    ' was created, but not touched by statistic')
def getZeroBinsValidationMessage(self, regSpec, binSpec):
    """
    Signal that the region specification produced zero bins.

    NOTE(review): despite the 'get...Message' name, this raises rather than
    returns -- presumably callers rely on the exception; confirm before
    renaming. binSpec is currently unused but kept for interface
    compatibility.

    :raises ShouldNotOccurError: always
    """
    # Bug fix: the message was assembled as '...does not ' + ' describe...',
    # producing a double space in the user-visible text.
    raise ShouldNotOccurError('The region specification "%s" does not '
                              'describe any real regions' % regSpec)