def __init__(self, geSource):
    """Wrap `geSource` and prepare empty statistics containers.

    No pass over the source is made here; the statistics containers are
    filled later by _calcStatisticsInExtraPass().

    geSource -- the source to manage; it is wrapped via _decorateGESource().
    """
    self._geSource = self._decorateGESource(geSource)

    # Remains None until getBoundingRegionTuples() has decided the answer.
    self._boundingRegionsAndGEsCorresponds = None

    # The track format is derived from the undecorated source.
    valTypeName = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName()
    weightTypeName = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName()
    self._areValsCategorical = (valTypeName == 'Category')
    self._areEdgeWeightsCategorical = (weightTypeName == 'Category')

    # Distinct categories seen (only filled for categorical tracks).
    self._valCategories = set()
    self._edgeWeightCategories = set()

    # Per-chr statistics; each container creates its entries on first access.
    self._numElements = OrderedDefaultDict(int)
    maxStrLensKeys = self._getMaxStrLensKeys()
    self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, maxStrLensKeys))
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
def __init__(self, geSource):
    """Wrap `geSource` and prepare empty statistics containers.

    No pass over the source is made here; the statistics containers are
    filled later by _calcStatisticsInExtraPass().

    geSource -- the source to manage; it is wrapped via _decorateGESource().
    """
    self._geSource = self._decorateGESource(geSource)

    # Remains None until getBoundingRegionTuples() has decided the answer.
    self._boundingRegionsAndGEsCorrespond = None

    # The track format is derived from the undecorated source.
    valTypeName = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName()
    weightTypeName = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName()
    self._areValsCategorical = (valTypeName == 'Category')
    self._areEdgeWeightsCategorical = (weightTypeName == 'Category')

    # Distinct categories seen (only filled for categorical tracks).
    self._valCategories = set()
    self._edgeWeightCategories = set()

    # Per-chr statistics; each container creates its entries on first access.
    self._numElements = OrderedDefaultDict(int)
    maxStrLensKeys = self._getMaxStrLensKeys()
    self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, maxStrLensKeys))
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
class GESourceManager(object):
    """Wraps a genome element source and lazily gathers statistics about it.

    Element counts, value/edge-weight categories, maximum string lengths and
    maximum edge counts are collected per `chr` in one extra pass over the
    source. That pass runs the first time any statistics getter is called.
    """

    def __init__(self, geSource):
        """Wrap `geSource` and prepare empty statistics containers.

        geSource -- the source to manage; it is wrapped via _decorateGESource().
        """
        self._geSource = self._decorateGESource(geSource)
        # Remains None until getBoundingRegionTuples() has decided the answer.
        self._boundingRegionsAndGEsCorresponds = None
        self._areValsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName() == 'Category'
        self._areEdgeWeightsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName() == 'Category'
        self._valCategories = set()
        self._edgeWeightCategories = set()
        # Per-chr statistics; each container creates its entries on first access.
        self._numElements = OrderedDefaultDict(int)
        self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
        self._maxNumEdges = OrderedDefaultDict(int)
        # Statistics are computed lazily by _calcStatisticsInExtraPass().
        self._hasCalculatedStats = False

    def _decorateGESource(self, geSource):
        """Return the wrapped source that this manager iterates over."""
        return GEDependentAttributesHolder(geSource)

    def _getMaxStrLensKeys(self):
        """Return the prefixes for which maximum string lengths are tracked.

        'val' and 'weights' are included only when their data type is 'S'
        (presumably a string dtype -- confirm against the source classes).
        'id' and 'edges' are included whenever present, followed by every
        prefix that is not in RESERVED_PREFIXES.
        """
        prefixSet = set(self._geSource.getPrefixList())
        keys = []
        if 'val' in prefixSet and self._geSource.getValDataType() == 'S':
            keys.append('val')
        if 'id' in prefixSet:
            keys.append('id')
        if 'edges' in prefixSet:
            keys.append('edges')
        if 'weights' in prefixSet and self._geSource.getEdgeWeightDataType() == 'S':
            keys.append('weights')
        return keys + [x for x in prefixSet if x not in RESERVED_PREFIXES]

    @staticmethod
    def _initMaxStrLens(keys):
        """Return a new dict mapping every key in `keys` to 0."""
        return dict((x, 0) for x in keys)

    def _calcStatisticsInExtraPass(self):
        """Iterate over the source once and fill all statistics containers.

        Does nothing if the statistics have already been calculated. Warnings
        from the source are switched off during the pass and the previous
        setting is restored afterwards, also when the pass raises.
        """
        if self._hasCalculatedStats:
            return

        geSource = self._geSource
        prevPrintWarnings = geSource.getPrintWarnings()
        geSource.setPrintWarnings(False)
        try:
            for el in geSource:
                chrom = el.chr
                self._numElements[chrom] += 1

                # Blank elements are counted but contribute nothing else.
                if el.isBlankElement:
                    continue

                if self._areValsCategorical:
                    self._valCategories.add(el.val)

                if self._areEdgeWeightsCategorical:
                    self._edgeWeightCategories |= set(el.weights)

                chrMaxStrLens = self._maxStrLens[chrom]
                for prefix in chrMaxStrLens:
                    content = getattr(el, prefix, None)
                    if content is None:
                        continue

                    # A recorded length is never below 1, even for empty content.
                    if isinstance(content, basestring):
                        contentLen = max(1, len(content))
                    else:
                        contentLen = max([1] + [len(x) for x in flatten(content)])
                    chrMaxStrLens[prefix] = max(chrMaxStrLens[prefix], contentLen)

                    if prefix == 'edges':
                        self._maxNumEdges[chrom] = max(self._maxNumEdges[chrom], len(el.edges))
        finally:
            # Restore the caller's setting even if the pass failed.
            geSource.setPrintWarnings(prevPrintWarnings)

        self._hasCalculatedStats = True

    def getGESource(self):
        """Return the wrapped source."""
        return self._geSource

    def getBoundingRegionTuples(self):
        """Return the bounding region tuples to use for the source.

        Tuples whose region has no chr are dropped. If none remain, one tuple
        per chr found in the source is built instead, spanning from 0 to the
        chr length and carrying that chr's element count. The outcome is
        recorded for boundingRegionsAndGEsCorresponds().
        """
        boundingRegionTuples = [x for x in self._getBoundingRegionTuples()
                                if x.region.chr is not None]

        if boundingRegionTuples:
            self._boundingRegionsAndGEsCorresponds = True
            return boundingRegionTuples

        from gtrackcore.input.core.GenomeElementSource import BoundingRegionTuple
        from gtrackcore.track.core.GenomeRegion import GenomeRegion
        from gtrackcore.metadata.GenomeInfo import GenomeInfo

        genome = self._geSource.genome
        boundingRegionTuples = [
            BoundingRegionTuple(
                GenomeRegion(chr=chrom, start=0, end=GenomeInfo.getChrLen(genome, chrom)),
                self.getNumElementsForChr(chrom))
            for chrom in self.getAllChrs()]
        self._boundingRegionsAndGEsCorresponds = False
        return boundingRegionTuples

    def _getBoundingRegionTuples(self):
        """Return the bounding region tuples exactly as the source reports them."""
        return self._geSource.getBoundingRegionTuples()

    def boundingRegionsAndGEsCorresponds(self):
        """Return True if the source supplied its own bounding regions.

        Must be called after getBoundingRegionTuples(); asserts otherwise.
        """
        assert self._boundingRegionsAndGEsCorresponds is not None
        return self._boundingRegionsAndGEsCorresponds

    def getPrefixList(self):
        """Return the prefix list of the wrapped source."""
        return self._geSource.getPrefixList()

    def getValDataType(self):
        """Return the value data type of the wrapped source."""
        return self._geSource.getValDataType()

    def getValDim(self):
        """Return the value dimension of the wrapped source."""
        return self._geSource.getValDim()

    def getEdgeWeightDataType(self):
        """Return the edge weight data type of the wrapped source."""
        return self._geSource.getEdgeWeightDataType()

    def getEdgeWeightDim(self):
        """Return the edge weight dimension of the wrapped source."""
        return self._geSource.getEdgeWeightDim()

    def isSorted(self):
        """Return whether the wrapped source reports itself as sorted."""
        return self._geSource.isSorted()

    def getAllChrs(self):
        """Return the chrs for which at least one element was counted."""
        self._calcStatisticsInExtraPass()
        return self._numElements.keys()

    def getNumElements(self):
        """Return the total number of elements over all chrs."""
        self._calcStatisticsInExtraPass()
        return sum(self._numElements.values())

    def getNumElementsForChr(self, chr):
        """Return the number of elements counted for `chr`."""
        self._calcStatisticsInExtraPass()
        return self._numElements[chr]

    def getValCategories(self):
        """Return the set of values seen (filled only for categorical values)."""
        self._calcStatisticsInExtraPass()
        return self._valCategories

    def getEdgeWeightCategories(self):
        """Return the set of edge weights seen (filled only for categorical weights)."""
        self._calcStatisticsInExtraPass()
        return self._edgeWeightCategories

    def getMaxNumEdges(self):
        """Return the largest per-chr maximum edge count."""
        self._calcStatisticsInExtraPass()
        return max(self._maxNumEdges.values())

    def getMaxNumEdgesForChr(self, chr):
        """Return the maximum edge count recorded for `chr`."""
        self._calcStatisticsInExtraPass()
        return self._maxNumEdges[chr]

    def getMaxStrLens(self):
        """Return a dict with, per tracked prefix, the maximum over all chrs."""
        self._calcStatisticsInExtraPass()
        # The result must be returned; it was previously computed and dropped.
        return reduce(lambda x, y: dict((key, max(x[key], y[key])) for key in x.keys()),
                      self._maxStrLens.values())

    def getMaxStrLensForChr(self, chr):
        """Return the dict of maximum string lengths recorded for `chr`."""
        self._calcStatisticsInExtraPass()
        return self._maxStrLens[chr]
class GESourceManager(object):
    """Wraps a genome element source and lazily gathers statistics about it.

    Element counts, value/edge-weight categories, maximum string lengths and
    maximum edge counts are collected per `chr` in one extra pass over the
    source. That pass runs the first time any statistics getter is called.
    """

    def __init__(self, geSource):
        """Wrap `geSource` and prepare empty statistics containers.

        geSource -- the source to manage; it is wrapped via _decorateGESource().
        """
        self._geSource = self._decorateGESource(geSource)
        # Remains None until getBoundingRegionTuples() has decided the answer.
        self._boundingRegionsAndGEsCorrespond = None
        self._areValsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName() == 'Category'
        self._areEdgeWeightsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName() == 'Category'
        self._valCategories = set()
        self._edgeWeightCategories = set()
        # Per-chr statistics; each container creates its entries on first access.
        self._numElements = OrderedDefaultDict(int)
        self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
        self._maxNumEdges = OrderedDefaultDict(int)
        # Statistics are computed lazily by _calcStatisticsInExtraPass().
        self._hasCalculatedStats = False

    def _decorateGESource(self, geSource):
        """Return the wrapped source that this manager iterates over."""
        return GEDependentAttributesHolder(geSource)

    def _getMaxStrLensKeys(self):
        """Return the prefixes for which maximum string lengths are tracked.

        'val' and 'weights' are included only when their data type is 'S'
        (presumably a string dtype -- confirm against the source classes).
        'id' and 'edges' are included whenever present, followed by every
        prefix that is not in RESERVED_PREFIXES.
        """
        prefixSet = set(self._geSource.getPrefixList())
        keys = []
        if 'val' in prefixSet and self._geSource.getValDataType() == 'S':
            keys.append('val')
        if 'id' in prefixSet:
            keys.append('id')
        if 'edges' in prefixSet:
            keys.append('edges')
        if 'weights' in prefixSet and self._geSource.getEdgeWeightDataType() == 'S':
            keys.append('weights')
        return keys + [x for x in prefixSet if x not in RESERVED_PREFIXES]

    @staticmethod
    def _initMaxStrLens(keys):
        """Return a new dict mapping every key in `keys` to 0."""
        return dict((x, 0) for x in keys)

    def _calcStatisticsInExtraPass(self):
        """Iterate over the source once and fill all statistics containers.

        Does nothing if the statistics have already been calculated. Warnings
        from the source are switched off during the pass and the previous
        setting is restored afterwards, also when the pass raises.

        Raises NotImplementedError for a slice source that has any prefix
        requiring string length tracking.
        """
        if self._hasCalculatedStats:
            return

        geSource = self._geSource
        prevPrintWarnings = geSource.getPrintWarnings()
        geSource.setPrintWarnings(False)
        try:
            if geSource.isSliceSource():
                self._calcSliceSourceStatistics()
            else:
                self._calcElementSourceStatistics()
        finally:
            # Restore the caller's setting even if the pass failed.
            geSource.setPrintWarnings(prevPrintWarnings)

        self._hasCalculatedStats = True

    def _calcSliceSourceStatistics(self):
        """Count elements per chr for a slice source (no other statistics)."""
        if len(self._getMaxStrLensKeys()):
            raise NotImplementedError('Dimension calculation not yet implemented for slice-based GenomeElementSources.')

        prefixList = self._geSource.getPrefixList()
        for el in self._geSource:
            # The length of the first prefix's content is added to the count.
            self._numElements[el.chr] += len(getattr(el, prefixList[0]))

    def _calcElementSourceStatistics(self):
        """Collect all per-chr statistics from a source yielding single elements."""
        for el in self._geSource:
            chrom = el.chr
            self._numElements[chrom] += 1

            # Blank elements are counted but contribute nothing else.
            if el.isBlankElement:
                continue

            if self._areValsCategorical:
                self._valCategories.add(el.val)

            if self._areEdgeWeightsCategorical:
                self._edgeWeightCategories |= set(el.weights)

            chrMaxStrLens = self._maxStrLens[chrom]
            for prefix in chrMaxStrLens:
                content = getattr(el, prefix, None)
                if content is None:
                    continue

                # A recorded length is never below 1, even for empty content.
                if isinstance(content, basestring):
                    contentLen = max(1, len(content))
                else:
                    contentLen = max([1] + [len(x) for x in flatten(content)])
                chrMaxStrLens[prefix] = max(chrMaxStrLens[prefix], contentLen)

                if prefix == 'edges':
                    self._maxNumEdges[chrom] = max(self._maxNumEdges[chrom], len(el.edges))

    def getGESource(self):
        """Return the wrapped source."""
        return self._geSource

    def getBoundingRegionTuples(self):
        """Return the bounding region tuples to use for the source.

        Tuples whose region has no chr are dropped. If none remain, one tuple
        per chr found in the source is built instead, spanning from 0 to the
        chr length and carrying that chr's element count. The outcome is
        recorded for boundingRegionsAndGEsCorrespond().
        """
        boundingRegionTuples = [x for x in self._getBoundingRegionTuples()
                                if x.region.chr is not None]

        if boundingRegionTuples:
            self._boundingRegionsAndGEsCorrespond = True
            return boundingRegionTuples

        from gtrackcore.input.core.GenomeElementSource import BoundingRegionTuple
        from gtrackcore.track.core.GenomeRegion import GenomeRegion
        from gtrackcore.metadata.GenomeInfo import GenomeInfo

        genome = self._geSource.genome
        boundingRegionTuples = [
            BoundingRegionTuple(
                GenomeRegion(chr=chrom, start=0, end=GenomeInfo.getChrLen(genome, chrom)),
                self.getNumElementsForChr(chrom))
            for chrom in self.getAllChrs()]
        self._boundingRegionsAndGEsCorrespond = False
        return boundingRegionTuples

    def _getBoundingRegionTuples(self):
        """Return the bounding region tuples exactly as the source reports them."""
        return self._geSource.getBoundingRegionTuples()

    def boundingRegionsAndGEsCorrespond(self):
        """Return True if the source supplied its own bounding regions.

        Must be called after getBoundingRegionTuples(); asserts otherwise.
        """
        assert self._boundingRegionsAndGEsCorrespond is not None
        return self._boundingRegionsAndGEsCorrespond

    def getPrefixList(self):
        """Return the prefix list of the wrapped source."""
        return self._geSource.getPrefixList()

    def getValDataType(self):
        """Return the value data type of the wrapped source."""
        return self._geSource.getValDataType()

    def getValDim(self):
        """Return the value dimension of the wrapped source."""
        return self._geSource.getValDim()

    def getEdgeWeightDataType(self):
        """Return the edge weight data type of the wrapped source."""
        return self._geSource.getEdgeWeightDataType()

    def getEdgeWeightDim(self):
        """Return the edge weight dimension of the wrapped source."""
        return self._geSource.getEdgeWeightDim()

    def isSorted(self):
        """Return whether the wrapped source reports itself as sorted."""
        return self._geSource.isSorted()

    def getAllChrs(self):
        """Return the chrs for which at least one element was counted."""
        self._calcStatisticsInExtraPass()
        return self._numElements.keys()

    def getNumElements(self):
        """Return the total number of elements over all chrs."""
        self._calcStatisticsInExtraPass()
        return sum(self._numElements.values())

    def getNumElementsForChr(self, chr):
        """Return the number of elements counted for `chr`."""
        self._calcStatisticsInExtraPass()
        return self._numElements[chr]

    def getValCategories(self):
        """Return the set of values seen (filled only for categorical values)."""
        self._calcStatisticsInExtraPass()
        return self._valCategories

    def getEdgeWeightCategories(self):
        """Return the set of edge weights seen (filled only for categorical weights)."""
        self._calcStatisticsInExtraPass()
        return self._edgeWeightCategories

    def getMaxNumEdges(self):
        """Return the largest per-chr maximum edge count."""
        self._calcStatisticsInExtraPass()
        return max(self._maxNumEdges.values())

    def getMaxNumEdgesForChr(self, chr):
        """Return the maximum edge count recorded for `chr`."""
        self._calcStatisticsInExtraPass()
        return self._maxNumEdges[chr]

    def getMaxStrLens(self):
        """Return a dict with, per tracked prefix, the maximum over all chrs."""
        self._calcStatisticsInExtraPass()
        return reduce(lambda x, y: dict((key, max(x[key], y[key])) for key in x.keys()),
                      self._maxStrLens.values())

    def getMaxStrLensForChr(self, chr):
        """Return the dict of maximum string lengths recorded for `chr`."""
        self._calcStatisticsInExtraPass()
        return self._maxStrLens[chr]

    def getMaxChrStrLen(self):
        """Return the length of the longest chr name among the _maxStrLens keys."""
        self._calcStatisticsInExtraPass()
        # NOTE(review): _maxStrLens only gets entries for chrs with non-blank
        # elements and is never filled for slice sources, so max() raises
        # ValueError when it is empty -- confirm whether callers rely on that.
        return max(len(chr) for chr in self._maxStrLens.keys())