def _compute(self):
    """
    Select the segments whose nearest neighbouring segment is within
    self.threshold, and report that nearest distance as the segment value.

    The gap between two consecutive segments is starts[i + 1] - ends[i];
    negative gaps (overlapping segments) are clamped to 0. The nearest
    distance of a segment is the smaller of the gaps on its two sides; the
    first and the last segment only have one neighbour each.

    Returns a TrackView with the selected starts/ends and the nearest
    distances (int32) as values. Strands, ids, edges and weights are dropped.
    """
    # NOTE(review): the gap computation presumes segments sorted by position -
    # confirm against the child statistic.
    tv = self._children[0].getResult()
    starts = tv.startsAsNumpyArray()
    ends = tv.endsAsNumpyArray()

    dists = starts[1:] - ends[:-1]
    if len(dists) == 0:
        # Zero or one segment: no segment has a neighbour, so nothing is
        # selected. Starts, ends and values must all have the same length.
        selector = numpy.zeros(len(starts), dtype=bool)
        nearestDists = numpy.array([], dtype='int32')
    else:
        dists[dists < 0] = 0

        nearestDists = numpy.empty(len(starts), dtype=dists.dtype)
        # The end segments have a single neighbour each ...
        nearestDists[0] = dists[0]
        nearestDists[-1] = dists[-1]
        # ... all others take the smaller of the left and right gap.
        nearestDists[1:-1] = numpy.minimum(dists[:-1], dists[1:])

        selector = nearestDists <= self.threshold
        nearestDists = nearestDists[selector]

    return TrackView(genomeAnchor=tv.genomeAnchor,
                     startList=starts[selector],
                     endList=ends[selector],
                     valList=nearestDists.astype('int32'),
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=tv.allowOverlaps)
def __init__(self, vals=True, strands=True, anchor=None, valDType='float64'):
    """
    Build a test TrackView that has values and strands but no starts/ends.

    vals     -- handed to self._createList together with a random value list
                of the right length (presumably True selects the random
                list - confirm in _createList).
    strands  -- handled like vals, with a random strand list.
    anchor   -- [start, end] of the region on 'chr21' of 'TestGenome'. When
                omitted, the region starts at 10 and is len(vals) long, so
                vals must then be an explicit sequence.
    valDType -- dtype name forwarded to self._createList for the values.
    """
    # Identity checks rather than ==/!= None, so an array-like anchor is not
    # compared element by element.
    assert vals != True or anchor is not None
    if anchor is None:
        numElements = len(vals)
        anchor = [10, 10 + numElements]
    else:
        numElements = anchor[1] - anchor[0]

    vals = self._createList(vals, getRandValList(numElements), valDType)
    strands = self._createList(strands, getRandStrandList(numElements), 'bool8')

    #print (vals, strands, anchor)
    TrackView.__init__(self, GenomeRegion('TestGenome', 'chr21', anchor[0], anchor[1]),
                       None, None, vals, strands, None, None, None, 'crop', False)
def _compute(self):
    """
    Build a TrackView of the stretches selected by cover status 3 in the
    merged start/end events of the two child tracks.

    The event positions, the lengths between consecutive events and the
    cumulative cover status all come from self._findAllStartAndEndEvents.
    Status 3 presumably means "covered by both tracks" - confirm in that
    helper. The result has no values or strands and never allows overlaps.
    """
    firstTv = self._children[0].getResult()
    secondTv = self._children[1].getResult()

    firstStarts = firstTv.startsAsNumpyArray()
    firstEnds = firstTv.endsAsNumpyArray()
    secondStarts = secondTv.startsAsNumpyArray()
    secondEnds = secondTv.endsAsNumpyArray()

    eventPositions, eventLengths, coverStatus = self._findAllStartAndEndEvents(
        firstStarts, firstEnds, secondStarts, secondEnds)

    overlapStarts = eventPositions[coverStatus == 3]
    # The status mask is shortened by one element before selecting lengths.
    overlapLengths = eventLengths[coverStatus[:-1] == 3]
    overlapEnds = overlapStarts + overlapLengths

    return TrackView(genomeAnchor=firstTv.genomeAnchor,
                     startList=overlapStarts,
                     endList=overlapEnds,
                     valList=None, strandList=None, idList=None,
                     edgesList=None, weightsList=None,
                     borderHandling=firstTv.borderHandling,
                     allowOverlaps=False)
def _createTrackView(self, starts, ends, vals, strands, ids, edges, weights, extras, sourceRegion, allowOverlaps, sliceFull=False):
    """
    Create a 'crop' TrackView on self.genome / self.chr from plain sequences.

    Every column that is not None is converted with numpy's array(); values
    are forced to float64. extras is a dict of extra columns, or None for no
    extra columns. sourceRegion is a (start, end) pair for the genome anchor.
    With sliceFull set, the view is sliced to its genome anchor before it is
    returned.
    """
    def _asArray(column, **arrayArgs):
        # None marks an absent column and is passed through untouched.
        if column is None:
            return None
        return array(column, **arrayArgs)

    region = GenomeRegion(genome=self.genome, chr=self.chr,
                          start=sourceRegion[0], end=sourceRegion[1])

    columns = [_asArray(starts),
               _asArray(ends),
               _asArray(vals, dtype='float64'),
               _asArray(strands),
               _asArray(ids),
               _asArray(edges),
               _asArray(weights)]

    if extras is None:
        extraArrays = OrderedDict()
    else:
        extraArrays = OrderedDict([(key, array(extra)) for key, extra in extras.iteritems()])

    tv = TrackView(region, *(columns + ['crop', allowOverlaps]), extraLists=extraArrays)
    if sliceFull:
        tv.sliceElementsAccordingToGenomeAnchor()
    return tv
def _compute(self):
    """
    Turn the child track's segments into a step function of coverage.

    Every distinct start/end position becomes a border. The result is a
    partition from position 0 to the last chromosome position (chromosome
    length - 1) in which each piece carries the number of segments covering
    it. An empty child track yields one piece with value 0.

    Returns a TrackView with starts, ends and values only; overlaps are not
    allowed in the result.
    """
    tv = self._children[0].getResult()
    starts, ends = tv.startsAsNumpyArray(), tv.endsAsNumpyArray()

    # Net change in coverage at each border: +1 per start, -1 per end.
    borderDict = defaultdict(int)
    for start, end in zip(starts, ends):
        borderDict[start] += 1
        borderDict[end] -= 1
    sortedPos = sorted(borderDict)

    # Start border: the step function has to begin at position 0. When no
    # border sits at 0, a leading piece with coverage 0 is added.
    if sortedPos and sortedPos[0] == 0:
        startList, endList, valList = sortedPos, sortedPos[1:], []
    else:
        startList, endList, valList = [0] + sortedPos, sortedPos, [0]

    # End border: close the last piece at the chromosome end, or drop the
    # start of a piece that would begin at/after the chromosome end.
    chrEndPos = GenomeInfo.getChrLen(tv.genomeAnchor.genome, tv.genomeAnchor.chr) - 1
    if not endList or endList[-1] < chrEndPos:
        endList = endList + [chrEndPos]
    else:
        startList = startList[:-1]

    # Step-function values: running sum of the coverage changes.
    accVal = 0
    for pos in sortedPos:
        accVal += borderDict[pos]
        valList.append(accVal)

    # Exactly one value per piece: when the trailing start was dropped above,
    # the value accumulated at that border has no piece to belong to.
    del valList[len(startList):]

    return TrackView(genomeAnchor=tv.genomeAnchor,
                     startList=np.array(startList),
                     endList=np.array(endList),
                     valList=np.array(valList),
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=False)
def _compute(self):
    """
    Build a TrackView of the stretches selected by cover status 3 in the
    merged events of the two child tracks, valued from the first track.

    For every result stretch, the value is taken (as float) from the first
    segment of track 1 that fully contains it. The search for each stretch
    resumes at the segment that matched the previous one. An assertion
    checks that every stretch found a containing segment.
    """
    tv1 = self._children[0].getResult()
    tv2 = self._children[1].getResult()

    firstStarts = tv1.startsAsNumpyArray()
    firstEnds = tv1.endsAsNumpyArray()
    firstVals = tv1.valsAsNumpyArray()
    secondStarts = tv2.startsAsNumpyArray()
    secondEnds = tv2.endsAsNumpyArray()

    eventPositions, eventLengths, coverStatus = self._findAllStartAndEndEvents(
        firstStarts, firstEnds, secondStarts, secondEnds)

    # Status 3 presumably means "covered by both tracks" - confirm in
    # _findAllStartAndEndEvents. The same shortened mask selects both the
    # positions and the lengths.
    selected = coverStatus[:-1] == 3
    overlapStarts = eventPositions[selected]
    overlapEnds = overlapStarts + eventLengths[selected]

    overlapVals = []
    searchFrom = 0
    numFirstSegments = len(firstStarts)
    for overlapStart, overlapEnd in zip(overlapStarts, overlapEnds):
        candidate = searchFrom
        while candidate < numFirstSegments:
            if overlapStart >= firstStarts[candidate] and overlapEnd <= firstEnds[candidate]:
                overlapVals.append(float(firstVals[candidate]))
                searchFrom = candidate
                break
            candidate += 1

    assert len(overlapVals) == len(overlapStarts), overlapVals

    return TrackView(genomeAnchor=tv1.genomeAnchor,
                     startList=overlapStarts,
                     endList=overlapEnds,
                     valList=array(overlapVals),
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv1.borderHandling,
                     allowOverlaps=False)
def _compute(self):
    """
    Count the element starts of the child track per micro-bin.

    self._region is divided into consecutive bins of self.microBin positions
    (the last bin may be shorter). Returns a TrackView with one segment per
    micro-bin whose value is the number of starts falling into that bin.
    """
    # NOTE(review): the bin index is start // microBin, which presumes starts
    # are offsets relative to the region start - confirm.
    tv = self._children[0].getResult()
    starts = tv.startsAsNumpyArray()
    regionLen = len(self._region)

    # Explicit floor division: np.bincount needs integer bin indices whatever
    # the division semantics in effect for this module.
    binArray = starts // self.microBin
    binCounts = np.bincount(binArray)

    # np.bincount stops at the highest occupied bin; pad the empty tail.
    numMicroBins = int(math.ceil(float(regionLen) / self.microBin))
    binCounts = np.concatenate(
        [binCounts, np.zeros(numMicroBins - len(binCounts), dtype='int')])
    #print 'temp1: ', len(binCounts)

    startList = [i * self.microBin for i in range(len(binCounts))]
    expectedStarts = list(range(0, regionLen, self.microBin))
    assert startList == expectedStarts, (startList, expectedStarts)

    endList = [min((i + 1) * self.microBin, regionLen) for i in range(len(binCounts))]
    assert endList == startList[1:] + [regionLen]

    #print ','.join([str(x) for x in binCounts])
    return TrackView(genomeAnchor=tv.genomeAnchor,
                     startList=np.array(startList),
                     endList=np.array(endList),
                     valList=binCounts,
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=False)
def _getTrackView(self, region):
    """
    Produce a randomized TrackView for region and store it in self._cachedTV.

    The original track data is fetched through RawDataStat, checked with
    self._checkTrackFormat and required to be non-overlapping, 'crop'-handled
    and anchored exactly at region. Its arrays are randomized by
    self._createRandomizedNumpyArrays and post-processed by
    self._undoTrackViewChanges. The new view must satisfy
    self._trackFormatReq.
    """
    from gold.util.RandomUtil import random # To initialize random generators if not done previously

    origTV = RawDataStat(region, self._origTrack, self._trackFormatReq).getResult()
    self._checkTrackFormat(origTV)

    assert not origTV.allowOverlaps
    assert origTV.borderHandling == 'crop'
    assert region == origTV.genomeAnchor

    randomized = self._createRandomizedNumpyArrays(
        len(origTV.genomeAnchor),
        origTV.startsAsNumpyArray(),
        origTV.endsAsNumpyArray(),
        origTV.valsAsNumpyArray(),
        origTV.strandsAsNumpyArray(),
        origTV.idsAsNumpyArray(),
        origTV.edgesAsNumpyArray(),
        origTV.weightsAsNumpyArray(),
        origTV.allExtrasAsDictOfNumpyArrays(),
        region)
    starts, ends, vals, strands, ids, edges, weights, extras = randomized

    restored = self._undoTrackViewChanges(
        starts, ends, vals, strands, ids, edges, weights, extras, origTV)
    starts, ends, vals, strands, ids, edges, weights, extras = restored

    from gold.util.CommonFunctions import getClassName
    self._cachedTV = TrackView(origTV.genomeAnchor, starts, ends, vals, strands,
                               ids, edges, weights,
                               origTV.borderHandling, origTV.allowOverlaps,
                               extraLists=extras)

    assert self._trackFormatReq.isCompatibleWith(self._cachedTV.trackFormat), \
        'Incompatible track-format: ' + str(self._trackFormatReq) + ' VS ' + str(self._cachedTV.trackFormat)
    return self._cachedTV
def _getRandTrackView(self, region):
    """
    Return a randomized TrackView for region, based on self._origTrack.

    The arrays of the original view are randomized by
    self._createRandomizedNumpyArrays and post-processed by
    self._undoTrackViewChanges. The result keeps the original genome anchor,
    border handling and overlap setting.
    """
    origTV = self._origTrack.getTrackView(region)

    randomized = self._createRandomizedNumpyArrays(
        len(origTV.genomeAnchor),
        origTV.startsAsNumpyArray(),
        origTV.endsAsNumpyArray(),
        origTV.valsAsNumpyArray(),
        origTV.strandsAsNumpyArray(),
        origTV.idsAsNumpyArray(),
        origTV.edgesAsNumpyArray(),
        origTV.weightsAsNumpyArray(),
        origTV.allExtrasAsDictOfNumpyArrays(),
        region)
    starts, ends, vals, strands, ids, edges, weights, extras = randomized

    restored = self._undoTrackViewChanges(
        starts, ends, vals, strands, ids, edges, weights, extras, origTV)
    starts, ends, vals, strands, ids, edges, weights, extras = restored

    return TrackView(origTV.genomeAnchor, starts, ends, vals, strands, ids,
                     edges, weights, origTV.borderHandling,
                     origTV.allowOverlaps, extraLists=extras)
def _compute(self):
    """
    Keep only the segments of the child track that overlap no other segment.

    A segment is kept when it starts at or after the largest end seen among
    all preceding segments and the following segment (if any) starts at or
    after its end. An empty child track gives an empty result.

    Returns a TrackView with the kept starts, ends and values; overlaps are
    not allowed in the result.
    """
    # NOTE(review): looking only at the directly following segment presumes
    # the segments are sorted by start - confirm.
    tv = self._children[0].getResult()
    startL = list(tv.startsAsNumpyArray())
    endL = list(tv.endsAsNumpyArray())
    valL = tv.valsAsNumpyArray()

    numSegments = len(startL)
    overlapThreshold = 0  # largest end among the segments seen so far
    startRes, endRes, valRes = [], [], []

    for index in range(numSegments):
        start, end = startL[index], endL[index]
        isLast = index == numSegments - 1
        # The last segment has no successor to collide with.
        if start >= overlapThreshold and (isLast or startL[index + 1] >= end):
            startRes.append(start)
            endRes.append(end)
            valRes.append(valL[index])
        overlapThreshold = max(overlapThreshold, end)

    return TrackView(genomeAnchor=tv.genomeAnchor,
                     startList=np.array(startRes),
                     endList=np.array(endRes),
                     valList=np.array(valRes),
                     strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=False)
def getOneTrackViewFromPool(self, origTrack, randIndex):
    """
    Return the randomized TrackView of origTrack for the given randIndex.

    origTrack must be one of the tracks this pool was built from. The random
    track set for randIndex is computed on first use. Parameters for which
    the pool holds no data for this track/randIndex are passed to TrackView
    as None.
    """
    trackId = origTrack.getUniqueKey(self._region.genome)
    assert trackId in self._trackIdToIndexDict, \
        'given track should be in the original TrackStructure that was used to make this pool'
    trackIndex = self._trackIdToIndexDict[trackId]

    if randIndex not in self._randomTrackSets['starts']:
        self._computeRandomTrackSet(randIndex)

    origTV = origTrack.getTrackView(self._region)

    for tvParam in self._randomTrackSets:
        setsForParam = self._randomTrackSets[tvParam]
        try:
            setsForParam[randIndex][trackIndex]
        except KeyError:
            # The parameter does not exist for this track: record it as None.
            # setdefault keeps whatever other tracks already stored under the
            # same randIndex instead of replacing their dict.
            setsForParam.setdefault(randIndex, {})[trackIndex] = None

    return TrackView(
        genomeAnchor=origTV.genomeAnchor,
        startList=self._randomTrackSets['starts'][randIndex][trackIndex],
        endList=self._randomTrackSets['ends'][randIndex][trackIndex],
        valList=self._randomTrackSets['vals'][randIndex][trackIndex],
        strandList=self._randomTrackSets['strands'][randIndex][trackIndex],
        idList=self._randomTrackSets['ids'][randIndex][trackIndex],
        edgesList=self._randomTrackSets['edges'][randIndex][trackIndex],
        weightsList=self._randomTrackSets['weights'][randIndex][trackIndex],
        borderHandling=origTV.borderHandling,
        allowOverlaps=origTV.allowOverlaps)
def _compute(self): raise #not finished implementing.. tv = self._children[0].getResult() vals = tv.valsAsNumpyArray() numMicroBins = int(math.ceil(float(len(self._region)) / self.microBin)) miBinBorders = range(0, len(self._region), self.microBin) + [len(self._region)] miBins = [] #go from borders 0,10,20 to paired bin intervals 0,10, 10,20, 20,30 ... for i, b in enumerate(miBinBorders): miBins.append(b) if i != 0 and i != len(miBinBorders) - 1: miBins.append(b) accVals = vals.add.reduceat(vals, miBins)[::2] binCounts = accVals #print 'temp1: ', len(binCounts) #Fix asserts here.. assert [i * self.microBin for i in xrange(len(binCounts)) ] == range(0, len(self._region), self.microBin), ([ i * self.microBin for i in xrange(len(binCounts)) ], range(0, len(self._region), self.microBin)) startList = [i * self.microBin for i in xrange(len(binCounts))] assert [ min((i + 1) * self.microBin, len(self._region)) for i in xrange(len(binCounts)) ] == startList[1:] + [len(self._region)] endList = [ min((i + 1) * self.microBin, len(self._region)) for i in xrange(len(binCounts)) ] #print ','.join([str(x) for x in binCounts]) return TrackView(genomeAnchor=tv.genomeAnchor, startList=np.array(startList), endList=np.array(endList), valList=binCounts, \ strandList=None, idList=None, edgesList=None, weightsList=None, borderHandling=tv.borderHandling, allowOverlaps=False)
def loadTrackView(trackData, region, borderHandling, allowOverlaps, trackName=[]):
    """
    trackData : see TrackSource.getTrackData {'id' : smartmemmap}
    region : see GenomeRegion

    Load the arrays in trackData, slice them down to region and wrap them in
    a TrackView. For dense representations the slice borders are computed
    from the region (relative to the bounding region, if any). Otherwise they
    are looked up in the 'leftIndex'/'rightIndex' arrays, and the view is
    finally sliced according to its genome anchor.

    Raises IOError if a non-dense track lacks 'leftIndex' or 'rightIndex'.
    trackName is not used by this function.
    """
    #brShelve = BoundingRegionShelve(region.genome, trackName, allowOverlaps)
    brShelve = trackData.boundingRegionShelve
    brInfo = brShelve.getBoundingRegionInfo(region) if brShelve is not None else None

    # Built once rather than once per array name.
    nonExtraNames = set(RESERVED_PREFIXES) | set(['leftIndex', 'rightIndex'])
    extraArrayNames = [arrayName for arrayName in trackData if arrayName not in nonExtraNames]

    reservedArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo) for arrayName in RESERVED_PREFIXES]
    extraArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo) for arrayName in extraArrayNames]
    trackFormat = TrackFormat( *(reservedArrays + [OrderedDict(zip(extraArrayNames, extraArrays))]) )
    isDense = trackFormat.reprIsDense()

    if isDense:
        if brInfo is None:
            leftIndex = region.start
            rightIndex = region.end
        else:
            leftIndex = region.start - brInfo.start
            rightIndex = region.end - brInfo.start
    else:
        leftBin = CompBinManager.getBinNumber(region.start)
        rightBin = CompBinManager.getBinNumber(region.end-1)
        #leftBin = region.start/COMP_BIN_SIZE
        #rightBin = (region.end-1)/COMP_BIN_SIZE

        if trackData.get('leftIndex') is None or trackData.get('rightIndex') is None:
            raise IOError('Preprocessed track not found. TrackData: ' + ', '.join(trackData.keys()))

        leftIndex = TrackViewLoader._getArray(trackData, 'leftIndex', brInfo, leftBin)
        rightIndex = TrackViewLoader._getArray(trackData, 'rightIndex', brInfo, rightBin)

    slicedReservedArrays = [(arr[leftIndex:rightIndex] if arr is not None else None) for arr in reservedArrays]
    slicedExtraArrays = [(arr[leftIndex:rightIndex] if arr is not None else None) for arr in extraArrays]

    argList = [region] + slicedReservedArrays + [borderHandling, allowOverlaps] + \
              [OrderedDict(zip(extraArrayNames, slicedExtraArrays))]
    tv = TrackView( *(argList) )

    if not isDense:
        tv.sliceElementsAccordingToGenomeAnchor()
        #tv._doScatteredSlicing()
    return tv
def _getRawTrackView(self, region, borderHandling, allowOverlaps):
    """
    Build a placeholder TrackView for a one-position region, holding one
    array per prefix reported by the track's genome element source.

    The arrays carry fixed or "empty" filler entries (see findEmptyVal), not
    track data. Dense interval formats get two entries per array, all others
    one.

    Raises IncompatibleTracksError when overlaps are requested but the format
    is dense or the source reports that it has no overlapping elements.
    """
    assert len(region) == 1

    from collections import OrderedDict
    from gold.track.CommonMemmapFunctions import findEmptyVal
    from gold.track.TrackView import TrackView
    import numpy as np

    # The format description is taken from the genome element source.
    geSource = ExternalTrackManager.getGESourceFromGalaxyOrVirtualTN(self.trackName, region.genome)
    prefixList = geSource.getPrefixList()
    valDataType = geSource.getValDataType()
    valDim = geSource.getValDim()
    weightDataType = geSource.getEdgeWeightDataType()
    weightDim = geSource.getEdgeWeightDim()

    # Every column starts out absent; only prefixes in prefixList are filled.
    startList, endList, valList, strandList, idList, edgesList, weightsList = [None]*7
    extraLists=OrderedDict()

    tf = TrackFormat.createInstanceFromPrefixList(prefixList, valDataType, valDim, \
                                                  weightDataType, weightDim)
    if allowOverlaps and (tf.isDense() or geSource.hasNoOverlappingElements()):
        raise IncompatibleTracksError(prettyPrintTrackName(self.trackName) + ' with format: '\
                                      + str(tf) + ' does not satisfy ' + str(self._trackFormatReq))

    denseAndInterval = tf.isDense() and tf.isInterval()
    numEls = 2 if denseAndInterval else 1

    # A bare string type code 'S' is replaced by the fixed-width 'S2'.
    if valDataType == 'S':
        valDataType = 'S2'
    if weightDataType == 'S':
        weightDataType = 'S2'

    for prefix in prefixList:
        if prefix == 'start':
            startList = np.array([-1], dtype='int32')
        elif prefix == 'end':
            if denseAndInterval:
                endList = np.array([0, 1], dtype='int32')
            else:
                endList = np.array([0], dtype='int32')
        elif prefix == 'val':
            # Shape is (numEls, valDim) for multi-dimensional values, else (numEls,).
            valList = np.array([findEmptyVal(valDataType)] * valDim * numEls, \
                               dtype=valDataType).reshape((numEls, valDim) if valDim > 1 else numEls)
        elif prefix == 'strand':
            strandList = np.array([1] * numEls, dtype='int8')
        elif prefix == 'id':
            idList = np.array([''] * numEls, dtype='S1')
        elif prefix == 'edges':
            edgesList = np.array([['']] * numEls, dtype='S1')
        elif prefix == 'weights':
            # Shape is (numEls, 1, weightDim) for multi-dimensional weights, else (numEls, 1).
            weightsList = np.array([[[findEmptyVal(weightDataType)]]] * weightDim * numEls, \
                                   dtype=weightDataType).reshape((numEls, 1, weightDim) if weightDim > 1 else (numEls, 1))
        else:
            # Any other prefix becomes an extra column of empty strings.
            extraLists[prefix] = np.array([''] * numEls, dtype='S1')

    return TrackView(region, startList, endList, valList, strandList, idList, edgesList, weightsList, borderHandling, allowOverlaps, extraLists)
def __init__(self, vals=True, strands=True, anchor=None, valDType='float64'):
    """
    Create a test TrackView with values and strands but no starts or ends.

    anchor is the [start, end] of the region on 'chr21' of 'TestGenome'.
    Without an anchor the region starts at 10 and spans len(vals) positions,
    so vals must then be an explicit sequence. vals and strands are passed
    to self._createList together with freshly generated random lists.
    """
    assert (vals != True or anchor is not None)

    if anchor is not None:
        numElements = anchor[1] - anchor[0]
    else:
        numElements = len(vals)
        anchor = [10, 10 + numElements]

    # The random lists are generated in this order: values, then strands.
    randomVals = getRandValList(numElements)
    vals = self._createList(vals, randomVals, valDType)
    randomStrands = getRandStrandList(numElements)
    strands = self._createList(strands, randomStrands, 'bool8')

    #print (vals, strands, anchor)
    region = GenomeRegion('TestGenome', 'chr21', anchor[0], anchor[1])
    TrackView.__init__(self, region, None, None, vals, strands,
                       None, None, None, 'crop', False)
def runIntegrationTest():
    """
    Run AccuracyStat over two hand-made segment TrackViews and print the
    result, followed by the "cc" value extracted from each result element.
    """
    firstTrack = Track(['melting'])
    secondTrack = Track(['melting'])

    #regionIter = [_getRegion(c,s,e) for c,s,e in [('M',1000,2000),('M',2000,5000),('M',1000,15000)]]#('M',4000,4000)] ]
    regionIter = GenomeElementSource('Z:\\new_hb\\2sSegs.bed', 'hg18')

    # segments:
    genomeAnchor = GenomeRegion(genome='hg18', chr='chrM', start=0, end=50)
    firstView = TrackView(genomeAnchor, [2, 16, 23, 40], [9, 20, 26, 45], None, 4, None)
    secondView = TrackView(genomeAnchor, [4, 8, 22], [6, 16, 24], None, 3, None)

    # Alternatives that have been run here: RawOverlapStat, DerivedOverlapStat
    data = StatRunner.run(regionIter, firstTrack, secondTrack, AccuracyStat, firstView, secondView)
    print(data)

    param = "cc"
    for el in data:
        extractor = SingleValExtractor(el, param)
        print(extractor.getVal())
def getTrackView(self, trackBinIndex):
    """
    Return a segment-only TrackView ('crop' border handling) for the
    track/bin pair identified by trackBinIndex.

    Starts and lengths are read from the storage view of that index; ends
    are derived as start + length.
    """
    binPair = self._trackBinIndexer.getTrackBinPairForTrackBinIndex(trackBinIndex)
    storageView = self._getDataFrameView(trackBinIndex)

    segStarts = storageView.getArray(START_KEY)
    segEnds = segStarts + storageView.getArray(LENGTH_KEY)

    return TrackView(binPair.bin, segStarts, segEnds,
                     None, None, None, None, None,
                     borderHandling='crop',
                     allowOverlaps=self.allowOverlaps)
def _compute(self):
    """
    Render the intervals between the child track's positions as text lines.

    The start positions of the second child split the bin (size given by the
    first child) into consecutive intervals from 0 to binSize - 1. Each
    interval is numbered; the numbering runs backwards when the strand value
    equals 0. All elements of the bin must share one strand value.

    Returns a string with one line per interval:
    chr, start, end, number and strand, separated by tabs. Coordinates are
    shifted by the start of the genome anchor. An empty track gives ''.

    Raises InvalidFormatError if the bin holds more than one strand value.
    """
    binSize = self._children[0].getResult()
    tv = self._children[1].getResult()
    starts = list(tv.startsAsNumpyArray())
    ends = starts[:]
    vals = strandType = None

    if len(starts) > 0:
        # Pair every position with the next one: either prepend 0 as the
        # first start or drop the first end ...
        if starts[0] > 0:
            starts.insert(0, 0)
        else:
            del ends[0]
        # ... and either close the last interval at the bin end or drop the
        # start that has no interval left.
        if len(ends) == 0 or ends[-1] < binSize - 1:
            ends.append(binSize - 1)
        else:
            del starts[-1]

        strands = tv.strandsAsNumpyArray()
        # Identity check: comparing a numpy array with != None is elementwise.
        if strands is not None:
            strands = set(strands)
            if len(strands) > 1:
                raise InvalidFormatError(
                    'All strands within a bin must be of same sort: error at %s' % (tv.genomeAnchor))
            strandType = strands.pop()

        vals = range(len(starts) - 1, -1, -1) if strandType == 0 else range(len(starts))
        starts = np.array(starts) + tv.genomeAnchor.start
        ends = np.array(ends) + tv.genomeAnchor.start

    strTemplate = self._region.chr + '\t%s\t%s\t%s\t' + getStringFromStrand(strandType)
    return '\n'.join([strTemplate % (str(starts[i]), str(ends[i]), str(vals[i]))
                      for i in range(len(starts))])
def testInit(self):
    """
    TrackView construction: an empty or too-short column must be rejected
    with an AssertionError, while omitting columns (None) is accepted.
    """
    starts = [1, 11, 21]
    ends = [9, 19, 29]
    values = [5.2, -5, 0]
    strands = [False, True, False]
    ids = ['a1', 'b2', 'c3']
    edges = [['b2','c3'], ['a1',''], ['','']]
    weights = [[0.2,0.3], [-0.1,nan], [nan,nan]]
    extras = OrderedDict([('extra1', ['A','B','C']), ('extra2', ['1.0','2.0','3.0'])])

    genomeAnchor = GenomeRegion(self.genome, self.chr, 0, 100)

    # Column order matches the positional parameters of TrackView.
    columns = [starts, ends, values, strands, ids, edges, weights]

    def assertRejected(columnArgs, extraLists):
        self.assertRaises(AssertionError, TrackView, genomeAnchor,
                          *(list(columnArgs) + ['crop', False]),
                          extraLists=extraLists)

    # One column at a time replaced by an empty list, then an empty extra.
    for pos in range(len(columns)):
        assertRejected(columns[:pos] + [[]] + columns[pos + 1:], extras)
    assertRejected(columns, OrderedDict([('extra1', [])]))

    # Without starts, every other column carries one extra leading element.
    TrackView(genomeAnchor, None, [0] + ends, [nan] + values, [True] + strands,
              [''] + ids, [['','']] + edges, [[nan,nan]] + weights, 'crop', False,
              extraLists=OrderedDict([(x,[''] + y) for x,y in extras.items()]))

    # Columns that may be left out, by position in `columns`.
    for omitted in ([1], [2], [3], [4, 5, 6], [5, 6], [6]):
        columnArgs = [None if pos in omitted else column
                      for pos, column in enumerate(columns)]
        TrackView(genomeAnchor, *(columnArgs + ['crop', False]), extraLists=extras)

    # Extras may be left out as well.
    TrackView(genomeAnchor, starts, ends, values, strands, ids, edges, weights, 'crop', False)

    # One column at a time shortened by one element, then a short extra.
    for pos, column in enumerate(columns):
        assertRejected(columns[:pos] + [column[0:-1]] + columns[pos + 1:], extras)
    assertRejected(columns, {'cat': extras['extra1'][0:-1]})
def _compute(self):
    """
    Split the child track at every distinct start/end position and give each
    resulting interval a value combined from the category values of the
    segments covering it.

    The combination strategy is selected by self._combineMethod
    ('mostCommonCat' or 'freqOfCat'). Returns a TrackView whose segments are
    the intervals between consecutive border positions.
    """
    start = time.time()  # only referenced by the commented-out timing prints
    tv = self._children[0].getResult()
    starts, ends, vals = tv.startsAsNumpyArray(), tv.endsAsNumpyArray(), tv.valsAsNumpyArray()
    # Values reordered by end position, so the k-th end event uses endVals[k].
    endVals = vals[np.argsort(ends)]

    # position -> [number of segments starting here, number of segments ending here]
    borderDict = dict()
    for index, val in enumerate(starts):
        if val in borderDict:
            borderDict[val][0] += 1
        else:
            borderDict[val] = [1, 0]

        if ends[index] in borderDict:
            borderDict[ends[index]][1] += 1
        else:
            borderDict[ends[index]] = [0, 1]

    uniquePoints = sorted(borderDict.keys())
    borderArray = [borderDict[v] for v in uniquePoints]
    del starts, ends  # free memory that is no longer needed

    # NOTE(review): combineMethod stays unbound if self._combineMethod is
    # neither of these two names - confirm the callers guarantee one of them.
    if self._combineMethod == 'mostCommonCat':
        combineMethod = MostCommonCategory()
    elif self._combineMethod == 'freqOfCat':
        combineMethod = FrequencyOfCategory(self._category, self._numSamples)

    if len(uniquePoints) > 0:
        # FIXME: creates the array that will hold the results
        # (alternative: np.zeros(len(uniquePoints), dtype=vals.dtype))
        newVals = [combineMethod.getEmptyElement() for v in xrange(len(uniquePoints))]
        accStart = numVals = accEnd = 0
        #print 'before for-loop: ', time.time()-start
        countAddition = 0  # only referenced by the commented-out log message
        for uniqueIndex, uniqueRow in enumerate(borderArray):
            # The combined value is read BEFORE the events at this border are
            # applied, so newVals[i] describes the interval that ends at
            # uniquePoints[i]; newVals[0] is dropped on return.
            newVals[uniqueIndex] = combineMethod.getCombinedValueForRegion(numVals)
            newAccStart = accStart + uniqueRow[0]
            newAccEnd = accEnd + uniqueRow[1]

            # Feed the values of all segments starting at this border ...
            # (indexing vals by running start count presumes starts are
            # sorted ascending - TODO confirm)
            while accStart < newAccStart:
                combineMethod.updateForRegionStart(vals[accStart])
                accStart += 1
            # ... and of all segments ending at this border.
            while accEnd < newAccEnd:
                combineMethod.updateForRegionEnd(endVals[accEnd])
                accEnd += 1

            accEnd, accStart = newAccEnd, newAccStart
            numVals += uniqueRow[0] - uniqueRow[1]  # segments currently open
        #print 'after for-loop: ', time.time()-start
        #logMessage('Iterated through %i Subtractions' % countAddition)
        #logMessage(repr(combineMethod.valueDict.keys()))
    else:
        newVals = [combineMethod.getEmptyElement()]

    segBorders = np.array(uniquePoints) + tv.genomeAnchor.start
    return TrackView(genomeAnchor = tv.genomeAnchor, startList=segBorders[:-1], endList=segBorders[1:], valList=np.array(newVals[1:], dtype=combineMethod.getDataType()), \
                     strandList=None, idList=None, edgesList=None, weightsList=None, borderHandling=tv.borderHandling, allowOverlaps=tv.allowOverlaps)
def convert(cls, tv):
    """
    Return a new TrackView whose start positions are produced by
    cls._virtualListClass from tv's starts, ends and strands.

    All columns are reordered so that the new starts are ascending. The
    result has no end list and keeps tv's genome anchor, border handling and
    overlap setting. tv itself is left untouched.
    """
    import copy

    startList = cls._virtualListClass(tv._startList, tv._endList, tv._strandList)
    valList = tv._valList
    strandList = tv._strandList
    idList = tv._idList
    edgesList = tv._edgesList
    weightsList = tv._weightsList
    # Shallow copy: the reordering below assigns into this dict, and doing
    # that on tv's own dict would put tv's extras out of step with its
    # other, unsorted columns.
    extraLists = copy.copy(tv._extraLists)

    if len(startList) > 0:
        sortIndexes = numpy.argsort(startList)

        def _reorder(column):
            # Absent columns (None) stay absent.
            return column[sortIndexes] if column is not None else None

        startList = startList[sortIndexes]
        valList = _reorder(valList)
        strandList = _reorder(strandList)
        idList = _reorder(idList)
        edgesList = _reorder(edgesList)
        weightsList = _reorder(weightsList)
        for key in extraLists:
            extraLists[key] = _reorder(extraLists[key])

    newTv = TrackView(tv.genomeAnchor, startList, None, valList, strandList,
                      idList, edgesList, weightsList,
                      tv.borderHandling, tv.allowOverlaps, extraLists=extraLists)
    newTv = newTv[:]
    return newTv
def __init__(self, segments=None, starts=True, ends=True, vals=True, strands=False, ids=False, edges=False, weights=False, \
             extras=False, anchor=None, numElements=None, valDType='float64', borderHandling='crop', allowOverlaps=False):
    """
    Create a test TrackView on 'chr21' of 'TestGenome'.

    Each column argument is either a bool or an explicit sequence. The bools
    are handed to self._createList together with a randomly generated list
    (presumably True selects the random list and False leaves the column
    out - confirm in _createList). segments is an alternative way of giving
    starts and ends as (start, end) pairs. anchor defaults to [10, 1000];
    explicit start/end coordinates are shifted by anchor[0].

    Either at least one column must be given explicitly, or numElements must
    be set.
    """
    # Giving only one of starts/ends explicitly switches the other one off.
    if type(starts) != bool and ends == True:
        ends = False
    if type(ends) != bool and starts == True:
        starts = False

    assert not (starts==False and ends==False)
    assert segments!=False and segments!=True
    assert starts!=None and ends!=None and vals!=None and strands!=None
    assert segments==None or (starts==True and ends==True)
    assert not (isIter(weights) and not isIter(edges))
    assert (any( type(x) not in [bool,type(None)] for x in [segments,starts,ends,vals,strands,ids,edges,weights,extras]) and numElements==None) \
           or numElements!=None
    #assert(( (type(segments)!=bool or type(starts)!=bool or type(ends)!=bool or \
    #        type(vals)!=bool or type(strands)!=bool) and numElements==None )\
    #        or numElements!=None)
    #
    if anchor==None:
        anchor = [10,1000]

    # Unzip (start, end) pairs into separate lists.
    if segments != None:
        starts = []
        ends = []
        for seg in segments:
            starts.append(seg[0])
            ends.append(seg[1])

    # Pad edges (and weights) so that every element has the same number of entries.
    if isIter(edges):
        maxNumEdges = self._findMaxNumEls(edges)
        edges = self._appendEmptyToEnd(edges, '', maxNumEdges)
        if isIter(weights):
            weights = self._appendEmptyToEnd(weights, numpy.nan, maxNumEdges)

    # NOTE(review): the next statement builds a list and discards it - it has
    # no effect and looks like a leftover.
    [starts, ends, vals, strands, ids, edges, weights] + ([x for x in extras.values()] if isinstance(extras, dict) else [])
    # All explicitly given columns must have the same length; the first one
    # found defines numElements when it was not passed in.
    # NOTE(review): the loop variable shadows the builtin ``list``.
    for list in [starts, ends, vals, strands, ids, edges, weights] + ([x for x in extras.values()] if isinstance(extras, dict) else []):
        if type(list) != bool and numElements == None:
            numElements = len(list)
        assert(type(list) == bool or len(list) == numElements)

    # Shift explicit coordinates by the anchor start.
    # NOTE(review): this modifies the caller's starts/ends sequences in place.
    for coordList in [starts, ends]:
        if type(coordList) != bool:
            for j in range(len(coordList)):
                coordList[j] += anchor[0]

    randSegmentLists = getRandSegments(numElements, anchor[0], anchor[1])
    starts = self._createList(starts, randSegmentLists[0], 'int32')
    ends = self._createList(ends, randSegmentLists[1], 'int32')

    vals = self._createList(vals, getRandValList(numElements, valDType), valDType)
    strands = self._createList(strands, getRandStrandList(numElements), 'bool8')

    randIds, randEdges, randWeights = getRandGraphLists(numElements)
    ids = self._createList(ids, randIds, randIds.dtype)
    edges = self._createList(edges, randEdges, randEdges.dtype)
    weights = self._createList(weights, randWeights, 'float64')
    # One-dimensional weights get a trailing axis of length 0.
    if weights is not None and len(weights.shape) == 1:
        weights = weights.reshape(weights.shape + (0,))

    extras = self._createExtraLists(extras, 'S', numElements)

    # Without starts: prepend a leading entry to ends (and to vals/strands,
    # to keep the lengths equal), and make the last end the anchor end.
    # NOTE(review): the ==/!= None comparisons here are applied to values that
    # may be numpy arrays - confirm they behave as intended on the numpy
    # version in use.
    if starts == None:
        if ends[0] != 0:
            ends = numpy.append([anchor[0]], ends)
            if vals != None:
                vals = numpy.append([nan], vals)
            if strands != None:
                strands = numpy.append([True], strands)
        if ends[-1] != anchor[1]:
            ends[-1] = anchor[1]

    # print (starts, ends, vals, strands, anchor)
    TrackView.__init__(self, GenomeRegion('TestGenome', 'chr21', anchor[0], anchor[1]), starts, ends, vals, \
                       strands, ids, edges, weights, borderHandling, allowOverlaps, extraLists=extras)
def __iter__(self):
    """
    Yield exactly one TrackView for self._region, whose values are the
    int32 numbers 0 .. len(region) - 1. Overlaps are not allowed in it.
    """
    region = self._region
    positionVals = numpy.arange(len(region), dtype='int32')
    yield TrackView(self._region, valList=positionVals, allowOverlaps=False)
def _compute(self):
    """Label every interval between consecutive segment borders with its most frequent value(s).

    The child track view's starts and ends are merged into one sorted list of
    unique positions. For each interval between two neighbouring positions
    that is covered by at least one segment, the value written is the
    ';'-joined list of the value(s) with the highest count among the covering
    segments, followed by '(maxCount/numVals)'. Intervals covered by no
    segment keep the zero-initialised entry.

    Returns a TrackView with one element per interval between neighbouring
    positions (none when there are fewer than two unique positions).

    NOTE(review): start events read vals in their original order
    (vals[accStart:...]), which presumes the elements are sorted by start --
    confirm against the caller.
    """
    tv = self._children[0].getResult()
    nElements = tv.getNumElements()
    starts, ends, vals = tv.startsAsNumpyArray(), tv.endsAsNumpyArray(), tv.valsAsNumpyArray()

    # Element order by increasing end; necessary to find the correct values
    # for end events.
    sortedToOriginalEndIndices = np.argsort(ends)

    # np.unique is the replacement for the deprecated/removed np.unique1d and
    # returns (sorted unique positions, index of each input in that array).
    uniqueSortedPositions, uniquePosIndices = np.unique(
        np.concatenate((starts, ends)), return_inverse=True)

    if uniqueSortedPositions.size > 1:
        # Column 0 / 1: the number of segments starting / ending at each
        # unique position.
        posEventArray = np.zeros((uniqueSortedPositions.size, 2), dtype='int32')

        # starts (first nElements entries of the concatenated array)
        indices, counts = self._countDuplicates(uniquePosIndices[:nElements])
        posEventArray[indices, 0] = counts

        # ends (remaining entries)
        indices, counts = self._countDuplicates(uniquePosIndices[nElements:])
        posEventArray[indices, 1] = counts

        # Delete unnecessary arrays to free memory.
        del starts, ends, indices, counts, uniquePosIndices

        newVals = np.zeros(uniqueSortedPositions.size - 1, dtype=vals.dtype)
        uniqueVals = np.unique(vals)
        # Array with the current count for each unique value.
        uniqueValCounts = np.zeros(uniqueVals.size, dtype='int32')

        # NumPy record array for converting from category values to the
        # corresponding indices in the uniqueValCounts array: one row whose
        # field names are the stringified unique values and whose field
        # contents are 0 .. len(uniqueVals) - 1.
        recDType = np.dtype({
            'names': [str(x) for x in uniqueVals],
            'formats': ['int32'] * uniqueVals.size
        })
        uniqueValCountsIndices = np.array([tuple(range(len(uniqueVals)))], dtype=recDType)

        accStart = 0  # number of start events consumed so far
        accEnd = 0    # number of end events consumed so far
        for posEventIndex in xrange(len(posEventArray)):
            startEvents, endEvents = posEventArray[posEventIndex]

            # uniqueValCounts currently describes the interval that ends at
            # this position, hence the write to index posEventIndex - 1.
            numVals = uniqueValCounts.sum()
            if numVals > 0:
                maxCount = uniqueValCounts.max()
                newVals[posEventIndex - 1] = ';'.join([
                    str(x) for x in uniqueVals[np.where(uniqueValCounts == maxCount)]
                ]) + '(%i/%i)' % (maxCount, numVals)

            # Add the values of the segments starting at this position.
            # NOTE(review): indexing the record array with the event values is
            # expected to select the matching fields -- confirm on the NumPy
            # version in use.
            uniqueEventVals, counts = self._countDuplicates(
                vals[accStart:accStart + startEvents])
            if uniqueEventVals.size > 0:
                uniqueValCounts[uniqueValCountsIndices[uniqueEventVals].view('int32')] += counts

            # Remove the values of the segments ending at this position.
            uniqueEventVals, counts = self._countDuplicates(
                vals[sortedToOriginalEndIndices[accEnd:accEnd + endEvents]])
            if uniqueEventVals.size > 0:
                uniqueValCounts[uniqueValCountsIndices[uniqueEventVals].view('int32')] -= counts

            accStart += startEvents
            accEnd += endEvents
    else:
        newVals = np.array([], dtype=vals.dtype)

    # Shift the borders by the anchor start before building the result.
    segBorders = uniqueSortedPositions + tv.genomeAnchor.start

    return TrackView(genomeAnchor=tv.genomeAnchor, startList=segBorders[:-1], endList=segBorders[1:],
                     valList=newVals, strandList=None, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=tv.allowOverlaps)
def loadTrackView(trackData, region, borderHandling, allowOverlaps, trackName=[]):
    """Build the TrackView of a region from preprocessed track arrays.

    Arguments:
      trackData      : mapping from array name to array, see
                       TrackSource.getTrackData ({'id' : smartmemmap}). Its
                       boundingRegionShelve attribute is also read.
      region         : see GenomeRegion; region.start and region.end select
                       the part of the arrays to use.
      borderHandling : forwarded to TrackView.
      allowOverlaps  : forwarded to TrackView.
      trackName      : not used by this function body.

    Returns the TrackView for the region. For non-dense track formats the
    view is additionally trimmed with sliceElementsAccordingToGenomeAnchor().

    Raises IOError when the track format is not dense and trackData lacks
    the 'leftIndex' or 'rightIndex' array.
    """
    brShelve = trackData.boundingRegionShelve
    brInfo = brShelve.getBoundingRegionInfo(region) if brShelve is not None else None

    # Built once, outside the comprehension. list() keeps the concatenation
    # valid when keys() returns a view instead of a list.
    nonExtraNames = list(RESERVED_PREFIXES.keys()) + ['leftIndex', 'rightIndex']
    extraArrayNames = [arrayName for arrayName in trackData if arrayName not in nonExtraNames]

    reservedArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo)
                      for arrayName in RESERVED_PREFIXES]
    extraArrays = [TrackViewLoader._getArray(trackData, arrayName, brInfo)
                   for arrayName in extraArrayNames]
    trackFormat = TrackFormat(*(reservedArrays + [OrderedDict(zip(extraArrayNames, extraArrays))]))

    if trackFormat.reprIsDense():
        # Dense representation: slice directly by position, relative to the
        # bounding region start when bounding region info is available.
        if brInfo is None:
            leftIndex = region.start
            rightIndex = region.end
        else:
            leftIndex = region.start - brInfo.start
            rightIndex = region.end - brInfo.start
    else:
        # Otherwise the slice bounds are looked up in the 'leftIndex' and
        # 'rightIndex' arrays, using the bins of the first and last position
        # of the region.
        leftBin = CompBinManager.getBinNumber(region.start)
        rightBin = CompBinManager.getBinNumber(region.end - 1)

        if trackData.get('leftIndex') is None or trackData.get('rightIndex') is None:
            raise IOError('Preprocessed track not found. TrackData: ' + ', '.join(trackData.keys()))

        leftIndex = TrackViewLoader._getArray(trackData, 'leftIndex', brInfo, leftBin)
        rightIndex = TrackViewLoader._getArray(trackData, 'rightIndex', brInfo, rightBin)

    slicedReservedArrays = [(array[leftIndex:rightIndex] if array is not None else None)
                            for array in reservedArrays]
    slicedExtraArrays = [(array[leftIndex:rightIndex] if array is not None else None)
                         for array in extraArrays]

    # Positional order: region, the reserved arrays in RESERVED_PREFIXES
    # order, borderHandling, allowOverlaps, then the extra arrays by name.
    argList = [region] + slicedReservedArrays + [borderHandling, allowOverlaps] + \
              [OrderedDict(zip(extraArrayNames, slicedExtraArrays))]
    tv = TrackView(*(argList))

    if not trackFormat.reprIsDense():
        tv.sliceElementsAccordingToGenomeAnchor()
    return tv
def __init__(self, segments=None, starts=True, ends=True, vals=False, strands=False, ids=False, edges=False,
             weights=False, extras=False, anchor=None, numElements=None, valDType='float64',
             borderHandling='crop', allowOverlaps=False):
    """Set up a test TrackView on ('TestGenome', 'chr21', anchor[0], anchor[1]).

    starts, ends, vals, strands, ids, edges and weights each take either a
    bool or an explicit sequence with one entry per element. The meaning of a
    bool is determined by self._createList (not visible here) -- presumably
    True selects the generated random list and False drops the column;
    confirm there.

    Arguments:
      segments     : optional iterable of (start, end) pairs, unpacked into
                     starts and ends; starts and ends must then be left True.
      starts, ends : bool or explicit coordinates relative to anchor[0]. If
                     only one of them is explicit, the other is switched
                     from True to False.
      vals, strands, ids, edges, weights : bool or explicit sequences;
                     explicit weights are only accepted with explicit edges.
      extras       : bool or dict of extra columns, passed through
                     self._createExtraLists.
      anchor       : [start, end] of the region, [10, 1000] when omitted.
      numElements  : element count; mandatory when nothing explicit is
                     given, otherwise taken from the first explicit sequence
                     (all explicit sequences must share that length).
      valDType     : dtype of the value column.
      borderHandling, allowOverlaps : handed on to TrackView.__init__.

    Raises AssertionError for inconsistent argument combinations.

    NOTE: explicit starts/ends sequences are offset by anchor[0] in place.
    """
    # Explicit content for one border turns off the default for the other.
    if not isinstance(starts, bool) and ends == True:
        ends = False
    if not isinstance(ends, bool) and starts == True:
        starts = False

    assert not (starts == False and ends == False)
    assert segments != False and segments != True
    assert starts is not None and ends is not None and vals is not None and strands is not None
    assert segments is None or (starts == True and ends == True)
    assert not (isIter(weights) and not isIter(edges))

    # The element count must be inferable from explicit content or be given.
    givenArgs = [segments, starts, ends, vals, strands, ids, edges, weights, extras]
    hasExplicitContent = any(type(arg) not in [bool, type(None)] for arg in givenArgs)
    assert (hasExplicitContent and numElements is None) or numElements is not None

    if anchor is None:
        anchor = [10, 1000]

    if segments is not None:
        starts, ends = [], []
        for segment in segments:
            starts.append(segment[0])
            ends.append(segment[1])

    if isIter(edges):
        # Pad edge lists (and weight lists, if explicit) to a common width.
        maxNumEdges = self._findMaxNumEls(edges)
        edges = self._appendEmptyToEnd(edges, '', maxNumEdges)
        if isIter(weights):
            weights = self._appendEmptyToEnd(weights, numpy.nan, maxNumEdges)

    # The first explicit column fixes numElements; all others must match it.
    allColumns = [starts, ends, vals, strands, ids, edges, weights]
    if isinstance(extras, dict):
        allColumns = allColumns + [extraList for extraList in extras.values()]
    for column in allColumns:
        if type(column) != bool and numElements is None:
            numElements = len(column)
        assert type(column) == bool or len(column) == numElements

    # Move explicit, anchor-relative coordinates to genome coordinates.
    for coordList in (starts, ends):
        if isinstance(coordList, bool):
            continue
        for j, coord in enumerate(coordList):
            coordList[j] = coord + anchor[0]

    randStarts, randEnds = getRandSegments(numElements, anchor[0], anchor[1])[:2]
    starts = self._createList(starts, randStarts, 'int32')
    ends = self._createList(ends, randEnds, 'int32')

    randVals = getRandValList(numElements, valDType)
    vals = self._createList(vals, randVals, valDType)
    randStrands = getRandStrandList(numElements)
    strands = self._createList(strands, randStrands, 'bool8')

    randIds, randEdges, randWeights = getRandGraphLists(numElements)
    ids = self._createList(ids, randIds, randIds.dtype)
    edges = self._createList(edges, randEdges, randEdges.dtype)
    weights = self._createList(weights, randWeights, 'float64')

    # Give a one-dimensional weights array a second axis of length 0.
    if weights is not None and len(weights.shape) == 1:
        weights = weights.reshape(weights.shape + (0, ))

    extras = self._createExtraLists(extras, 'S', numElements)

    if starts is None:
        # Ends only: add the anchor start as leading border and pad the
        # value and strand columns to the same length.
        if ends[0] != 0:
            ends = numpy.append([anchor[0]], ends)
            if vals is not None:
                vals = numpy.append([nan], vals)
            if strands is not None:
                strands = numpy.append([True], strands)
        # The final border always sits on the anchor end.
        if ends[-1] != anchor[1]:
            ends[-1] = anchor[1]

    TrackView.__init__(self, GenomeRegion('TestGenome', 'chr21', anchor[0], anchor[1]),
                       starts, ends, vals, strands, ids, edges, weights,
                       borderHandling, allowOverlaps, extraLists=extras)