def checkUndirectedEdges(genome, trackName, allowOverlaps):
    """
    Check that no edge of a linked, undirected track is left without a match.

    Returns immediately unless the track format is linked and the collected
    metadata reports undirected edges. Otherwise, for every preprocessed
    chromosome, each element's non-empty edges (and the weights at the same
    positions, when the track data has a 'weights' array) are handed to
    PreProcessUtils._adjustComplementaryEdgeWeightDict together with a dict
    shared across all chromosomes.

    NOTE(review): presumably the helper removes an entry once the opposite
    edge with an equal weight has been seen -- confirm against the helper.

    Raises:
        InvalidFormatError: if the shared dict still holds entries after all
            chromosomes have been processed; the message lists each
            remaining (from, to[, weight]) entry on its own line.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    if not collector.getTrackFormat().isLinked() or not collector.hasUndirectedEdges():
        return

    pendingEdges = {}
    for chrom in collector.getPreProcessedChrs(allowOverlaps):
        chrData = TrackSource().getTrackData(trackName, genome, chrom, allowOverlaps)
        elementIds = chrData['id']
        edges = chrData['edges']
        weights = chrData.get('weights')

        for idx, elementId in enumerate(elementIds):
            edgeRow = edges[idx]
            # Empty strings are skipped; the same mask selects the weights
            # that belong to the remaining edges.
            nonEmpty = edgeRow != ''
            elementEdges = edgeRow[nonEmpty]
            elementWeights = None if weights is None else weights[idx][nonEmpty]
            PreProcessUtils._adjustComplementaryEdgeWeightDict(
                pendingEdges, elementId, elementEdges, elementWeights)

    if not pendingEdges:
        return

    unmatched = [(fromId, toId, fromIds[fromId])
                 for toId, fromIds in pendingEdges.items()
                 for fromId in fromIds]

    descriptions = []
    for fromId, toId, weight in unmatched:
        text = "from '%s' to '%s'" % (fromId, toId)
        if weight != '':
            text += " with weight '%s'" % weight
        descriptions.append(text)

    raise InvalidFormatError(
        "Error: All edges are not undirected. The following edges specifications "
        "are not matched by an opposite edge with equal weight:"
        + os.linesep + os.linesep.join(descriptions))
def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
    """
    Check that every id used in the 'edges' arrays also occurs in the 'id' arrays.

    Returns immediately when the track format is not linked. Otherwise the
    unique values of the 'id' and (flattened) 'edges' arrays are accumulated
    over all preprocessed chromosomes, empty strings are dropped from both,
    and the edge ids that are not among the element ids are collected.

    Raises:
        InvalidFormatError: if at least one edge id is not found among the
            element ids; the message lists the offending ids in sorted order.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    if not collector.getTrackFormat().isLinked():
        return

    knownIds = numpy.array([], dtype='S')
    referencedIds = numpy.array([], dtype='S')

    for chrom in collector.getPreProcessedChrs(allowOverlaps):
        chrData = TrackSource().getTrackData(trackName, genome, chrom, allowOverlaps)

        # Re-deduplicate after every chromosome so the accumulators only
        # ever hold unique values.
        chrIds = chrData['id'][:]
        knownIds = numpy.unique(numpy.concatenate((knownIds, chrIds)))

        chrEdgeIds = chrData['edges'][:].flatten()
        referencedIds = numpy.unique(numpy.concatenate((referencedIds, chrEdgeIds)))

    # Empty strings are not treated as ids.
    knownIds = knownIds[knownIds != '']
    referencedIds = referencedIds[referencedIds != '']

    missingIds = set(referencedIds).difference(set(knownIds))
    if missingIds:
        raise InvalidFormatError(
            "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
            + ', '.join(sorted(missingIds)))
def merge(genome, trackName, allowOverlaps):
    """
    Merge the per-chromosome arrays of a preprocessed track into one memmap file per array.

    The preprocessed chromosomes are taken from the track's metadata (sorted
    when the track format is not dense) and filtered through
    ChrMemmapFolderMerger._existingChrIter. For every array name found in the
    first chromosome's track data, the arrays of all chromosomes are combined
    in order with ChrMemmapFolderMerger.mergeArrays, and the result is
    written to a new memmap file under the track's directory path. The
    element and dtype dimensions encoded in the merged file name are the
    maxima of those parsed from the per-chromosome file names.

    Raises:
        EmptyGESourceError: if no chromosome is yielded by _existingChrIter.
    """
    path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    collector = PreProcMetaDataCollector(genome, trackName)
    chrList = collector.getPreProcessedChrs(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        chrList = sorted(chrList)

    existingChrList = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
    if not existingChrList:
        raise EmptyGESourceError(
            'No data lines has been read from source file (probably because it is empty).')

    firstChrTrackData = TrackSource().getTrackData(
        trackName, genome, existingChrList[0], allowOverlaps, forceChrFolders=True)

    # Snapshot the keys before looping: entries are deleted from
    # firstChrTrackData inside the loop, and iterating a live dict view while
    # the dict shrinks raises RuntimeError on Python 3.
    arrayNames = list(firstChrTrackData.keys())

    for arrayName in arrayNames:
        mergedArray = firstChrTrackData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(
            firstChrTrackData[arrayName].filename)[1:3]
        # Drop the reference to the source array as soon as it has been read.
        del firstChrTrackData[arrayName]

        for chrom in existingChrList[1:]:
            chrTrackData = TrackSource().getTrackData(
                trackName, genome, chrom, allowOverlaps, forceChrFolders=True)

            mergedArray = ChrMemmapFolderMerger.mergeArrays(
                mergedArray, np.array(chrTrackData[arrayName][:]))

            elementDimNew, dtypeDimNew = parseMemmapFileFn(
                chrTrackData[arrayName].filename)[1:3]
            # NOTE(review): assumes the parsed dims are mutually comparable
            # (e.g. never a mix of None and int) -- confirm for Python 3.
            elementDim = max(elementDim, elementDimNew)
            dtypeDim = max(dtypeDim, dtypeDimNew)

            del chrTrackData[arrayName]

        mergedFn = createMemmapFileFn(
            path, arrayName, elementDim, dtypeDim, str(mergedArray.dtype))

        f = np.memmap(mergedFn, dtype=mergedArray.dtype, mode='w+',
                      shape=mergedArray.shape)
        f[:] = mergedArray
        f.flush()
        # Release the memmap and the merged data before the next array is built.
        del f
        del mergedArray
def createBoundingRegionShelve(genome, trackName, allowOverlaps):
    """
    Store the track's bounding regions in a BoundingRegionShelve and verify the element count.

    The bounding region tuples come from the track's collected metadata and
    are sorted first when the track format is not dense. They are stored
    together with the list of preprocessed chromosomes and a flag that is
    True when the format is not dense.

    Raises:
        ShouldNotOccurError: if the shelve's total element count differs from
            the number of elements reported by the metadata collector.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    regionTuples = collector.getBoundingRegionTuples(allowOverlaps)

    notDense = not collector.getTrackFormat().reprIsDense()
    if notDense:
        regionTuples = sorted(regionTuples)

    preProcessedChrs = collector.getPreProcessedChrs(allowOverlaps)

    shelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
    shelve.storeBoundingRegions(regionTuples, preProcessedChrs, notDense)

    # Sanity check
    storedCount = shelve.getTotalElementCount()
    expectedCount = collector.getNumElements(allowOverlaps)
    if storedCount != expectedCount:
        raise ShouldNotOccurError(
            "Error: The total element count for all bounding regions is not equal "
            "to the total number of genome elements. %s != %s"
            % (storedCount, expectedCount))