def checkUndirectedEdges(genome, trackName, allowOverlaps):
    """Check that each edge of an undirected, linked track has a matching opposite edge.

    Returns immediately unless the collector's track format is linked and the
    collector reports undirected edges. Otherwise every preprocessed
    chromosome is loaded and each element's id, non-empty edges and (if a
    'weights' array is present) the corresponding weights are passed to
    PreProcessUtils._adjustComplementaryEdgeWeightDict together with one
    shared bookkeeping dict.

    Raises:
        InvalidFormatError: if the bookkeeping dict is non-empty once all
            chromosomes have been processed. The message lists one
            "from ... to ..." line per remaining entry.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    if not collector.getTrackFormat().isLinked() or not collector.hasUndirectedEdges():
        return

    # presumably maps toId -> {fromId: weight} for edges still lacking a
    # counterpart; the structure is maintained by the helper -- TODO confirm
    pendingEdges = {}

    for chrom in collector.getPreProcessedChrs(allowOverlaps):
        trackData = TrackSource().getTrackData(trackName, genome, chrom, allowOverlaps)

        ids = trackData['id']
        edges = trackData['edges']
        weights = trackData.get('weights')

        for idx, elementId in enumerate(ids):
            edgeRow = edges[idx]
            # Empty strings in the edge row are skipped.
            hasEdge = edgeRow != ''
            if weights is None:
                weightsAttr = None
            else:
                weightsAttr = weights[idx][hasEdge]
            PreProcessUtils._adjustComplementaryEdgeWeightDict(
                pendingEdges, elementId, edgeRow[hasEdge], weightsAttr)

    if not pendingEdges:
        return

    unmatchedPairs = []
    for toId, incoming in pendingEdges.items():
        unmatchedPairs.extend((fromId, toId, incoming[fromId]) for fromId in incoming)

    descriptions = []
    for fromId, toId, weight in unmatchedPairs:
        description = "from '%s' to '%s'" % (fromId, toId)
        if weight != '':
            description += " with weight '%s'" % weight
        descriptions.append(description)

    raise InvalidFormatError(
        "Error: All edges are not undirected. The following edges specifications "
        "are not matched by an opposite edge with equal weight:"
        + os.linesep + os.linesep.join(descriptions))
def removeOutdatedPreProcessedFiles(genome, trackName, allowOverlaps, mode):
    """Remove previously preprocessed data for a track, at most once per overlap rule.

    If PreProcessUtils.preProcFilesExist reports existing files and the
    metadata collector has not yet flagged them as removed for this overlap
    rule, the preprocessed directory is cleaned up:

    * mode == 'Real': plain files in the directory are unlinked, and
      subdirectories are removed only when
      PreProcessUtils._isOldTypeChromDirectory accepts them.
    * any other mode: nothing is deleted; a message is printed instead.

    In both cases the collector's removed-files flag is then set to True.
    Finally, in 'Real' mode, the TrackInfo time of preprocessing is reset.

    Raises:
        AssertionError: if the directory to clean does not start with
            Config.PROCESSED_DATA_PATH.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    if PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps) and \
            not collector.hasRemovedPreProcFiles(allowOverlaps):
        dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

        # Safety check before deleting anything. Raised explicitly rather
        # than with an `assert` statement, because asserts are stripped when
        # Python runs with -O and the deletion below would then be unguarded.
        # The exception type is unchanged for callers.
        if not dirPath.startswith(Config.PROCESSED_DATA_PATH):
            raise AssertionError(
                "Processed data path '%s' does not start with '%s'" %
                (dirPath, Config.PROCESSED_DATA_PATH))

        if mode == 'Real':
            # Single pre-formatted argument: valid as both a Python 2 print
            # statement and a Python 3 call. The two spaces reproduce the
            # separator the former `print 'text: ', dirPath` emitted.
            print('Removing outdated preprocessed data:  %s' % dirPath)
            for fn in os.listdir(dirPath):
                fullFn = os.path.join(dirPath, fn)
                if os.path.isfile(fullFn):
                    os.unlink(fullFn)
                elif os.path.isdir(fullFn):
                    # Only directories recognised by the helper are removed.
                    if PreProcessUtils._isOldTypeChromDirectory(fullFn, genome):
                        shutil.rmtree(fullFn)
        else:
            print('Would now have removed outdated preprocessed data if real run:  %s' % dirPath)

        collector.updateRemovedPreProcFilesFlag(allowOverlaps, True)

    # NOTE(review): the original indentation was lost; this reset is assumed
    # to run for every 'Real' call, not only when files were removed -- confirm.
    if mode == 'Real':
        ti = TrackInfo(genome, trackName)
        ti.resetTimeOfPreProcessing()
def merge(genome, trackName, allowOverlaps):
    """Merge the per-chromosome arrays of a track into one memmap file per array name.

    The chromosome list comes from the metadata collector (sorted unless the
    track format's representation is dense) and is filtered through
    ChrMemmapFolderMerger._existingChrIter. For every array name found in the
    first chromosome's track data, the arrays of all chromosomes are combined
    with ChrMemmapFolderMerger.mergeArrays and written to a new memmap file
    whose name is built by createMemmapFileFn from the largest element and
    dtype dimensions parsed from the source file names.

    Raises:
        EmptyGESourceError: if no chromosome is left after filtering.
    """
    path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    collector = PreProcMetaDataCollector(genome, trackName)
    chrList = collector.getPreProcessedChrs(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        chrList = sorted(chrList)

    existingChrs = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
    if not existingChrs:
        raise EmptyGESourceError('No data lines has been read from source file (probably because it is empty).')

    firstChr = existingChrs[0]
    remainingChrs = existingChrs[1:]
    firstTrackData = TrackSource().getTrackData(trackName, genome, firstChr,
                                                allowOverlaps, forceChrFolders=True)

    # Take a snapshot of the names first: entries are deleted from the
    # mapping while looping.
    for arrayName in list(firstTrackData.keys()):
        merged = firstTrackData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(firstTrackData[arrayName].filename)[1:3]
        del firstTrackData[arrayName]

        # Track data is fetched again for each array and chromosome, and the
        # entry is dropped as soon as it has been merged.
        for chrom in remainingChrs:
            chrTrackData = TrackSource().getTrackData(trackName, genome, chrom,
                                                      allowOverlaps, forceChrFolders=True)
            merged = ChrMemmapFolderMerger.mergeArrays(merged, np.array(chrTrackData[arrayName][:]))

            chrElementDim, chrDtypeDim = parseMemmapFileFn(chrTrackData[arrayName].filename)[1:3]
            elementDim = max(elementDim, chrElementDim)
            dtypeDim = max(dtypeDim, chrDtypeDim)
            del chrTrackData[arrayName]

        mergedFn = createMemmapFileFn(path, arrayName, elementDim, dtypeDim, str(merged.dtype))

        out = np.memmap(mergedFn, dtype=merged.dtype, mode='w+', shape=merged.shape)
        out[:] = merged
        out.flush()
        del out
        del merged
def preProcFilesExist(genome, trackName, allowOverlaps):
    """Return whether preprocessed files exist for the track under the given overlap rule.

    The answer stored in the metadata collector is returned when it is not
    None. Otherwise it is determined as follows and written back to the
    collector before being returned:

    * True if the track's BoundingRegionShelve file exists;
    * else, if the preprocessed directory exists, whatever
      PreProcessUtils._hasOldTypeChromSubDirs reports for it;
    * else False.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    storedAnswer = collector.preProcFilesExist(allowOverlaps)
    if storedAnswer is not None:
        return storedAnswer

    dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    if BoundingRegionShelve(genome, trackName, allowOverlaps).fileExists():
        # A disabled earlier variant looked for 'start'/'end'/'val'/'edges'
        # files in dirPath instead of relying on the shelve file alone.
        filesExist = True
    elif os.path.exists(dirPath):
        filesExist = PreProcessUtils._hasOldTypeChromSubDirs(dirPath, genome)
    else:
        filesExist = False

    collector.updatePreProcFilesExistFlag(allowOverlaps, filesExist)
    return filesExist
def _allGESourceManagers(self, trackName, allowOverlaps): collector = PreProcMetaDataCollector(self._genome, trackName) if allowOverlaps == False and collector.overlapRuleHasBeenFinalized(True): for i in range(1): self._status = 'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName) + \ (' (allowOverlaps: %s)' % allowOverlaps) yield self._getGESourceManagerFromTrack(trackName) else: for geSource in self._allGESources(trackName): if allowOverlaps == True: tf = TrackFormat.createInstanceFromGeSource(geSource) if tf.isDense() or geSource.hasNoOverlappingElements(): return self._status = 'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName) + \ (' (filename: "%s")' % geSource.getFileName() if geSource.hasOrigFile() else '') + \ (' (allowOverlaps: %s)' % allowOverlaps) if PreProcessUtils.shouldPreProcessGESource(trackName, geSource, allowOverlaps): yield self._getGESourceManagerFromGESource(geSource)
def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
    """Check that every id referenced in the 'edges' arrays exists in the 'id' arrays.

    Returns immediately unless the collector's track format is linked.
    Otherwise the unique values of the 'id' and (flattened) 'edges' arrays
    are accumulated over all preprocessed chromosomes, empty strings are
    dropped, and the two sets are compared.

    Raises:
        InvalidFormatError: if some edge id does not occur among the ids;
            the message lists the missing ids in sorted order.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    if not collector.getTrackFormat().isLinked():
        return

    knownIds = numpy.array([], dtype='S')
    referencedIds = numpy.array([], dtype='S')

    for chrom in collector.getPreProcessedChrs(allowOverlaps):
        trackData = TrackSource().getTrackData(trackName, genome, chrom, allowOverlaps)

        chrIds = trackData['id'][:]
        knownIds = numpy.unique(numpy.concatenate((knownIds, chrIds)))

        chrEdgeIds = trackData['edges'][:].flatten()
        referencedIds = numpy.unique(numpy.concatenate((referencedIds, chrEdgeIds)))

    # Empty strings are excluded from both sides of the comparison.
    knownIds = knownIds[knownIds != '']
    referencedIds = referencedIds[referencedIds != '']

    missingIds = set(referencedIds).difference(knownIds)
    if missingIds:
        raise InvalidFormatError(
            "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
            + ', '.join(sorted(missingIds)))
def _allGESourceManagers(self, trackName, allowOverlaps): collector = PreProcMetaDataCollector(self._genome, trackName) if allowOverlaps == False and collector.overlapRuleHasBeenFinalized( True): for i in range(1): self._status = 'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName) + \ (' (allowOverlaps: %s)' % allowOverlaps) yield self._getGESourceManagerFromTrack(trackName) else: for geSource in self._allGESources(trackName): if allowOverlaps == True: tf = TrackFormat.createInstanceFromGeSource(geSource) if tf.isDense() or geSource.hasNoOverlappingElements(): return self._status = 'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName) + \ (' (filename: "%s")' % geSource.getFileName() if geSource.hasOrigFile() else '') + \ (' (allowOverlaps: %s)' % allowOverlaps) if PreProcessUtils.shouldPreProcessGESource( trackName, geSource, allowOverlaps): yield self._getGESourceManagerFromGESource(geSource)
def _createPreProcFiles(self):
    """Record finalization metadata for the GE source and write its elements.

    The metadata collector is always updated with values read from the GE
    source and its manager. After that:

    * if the manager reports zero elements, nothing more is done;
    * if self._mode is not 'Real', the GE source is iterated to the end
      without writing anything;
    * otherwise every element is written through an OutputManager
      (writeRawSlice for slice sources, writeElement otherwise), the
      manager's chromosomes are flagged as preprocessed in the collector,
      and the output is closed.
    """
    manager = self._geSourceManager
    geSource = manager.getGESource()
    genome = geSource.genome

    collector = PreProcMetaDataCollector(genome, self._trackName)
    collector.updateMetaDataForFinalization(
        geSource.getFileSuffix(),
        geSource.getPrefixList(),
        geSource.getValDataType(),
        geSource.getValDim(),
        geSource.getEdgeWeightDataType(),
        geSource.getEdgeWeightDim(),
        geSource.hasUndirectedEdges(),
        geSource.getVersion(),
        PreProcessUtils.constructId(geSource),
        manager.getNumElements(),
        manager.getBoundingRegionTuples(),
        manager.getValCategories(),
        manager.getEdgeWeightCategories(),
        self._allowOverlaps)

    if manager.getNumElements() == 0:
        return

    if self._mode != 'Real':
        # Dry run: consume the source without producing output.
        for _ in geSource:
            pass
        return

    output = OutputManager(genome, self._trackName, self._allowOverlaps, manager)

    if geSource.isSliceSource():
        writeFunc = output.writeRawSlice
    else:
        writeFunc = output.writeElement

    # NOTE(review): if writing raises, output.close() below is never reached.
    for element in geSource:
        writeFunc(element)

    collector.flagChrsAsPreProcessed(self._allowOverlaps, manager.getAllChrs())
    output.close()
def merge(genome, trackName, allowOverlaps):
    """Write one combined memmap file per array name from the per-chromosome arrays.

    Chromosomes are taken from the metadata collector, sorted unless the
    track format's representation is dense, and then restricted to those
    produced by ChrMemmapFolderMerger._existingChrIter. The array names are
    those of the first remaining chromosome's track data. For each name the
    arrays of all chromosomes are joined with
    ChrMemmapFolderMerger.mergeArrays, and the result is stored in a memmap
    file named by createMemmapFileFn using the maximum element and dtype
    dimensions parsed from the source file names.

    Raises:
        EmptyGESourceError: if no chromosome remains after the restriction.
    """
    path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    collector = PreProcMetaDataCollector(genome, trackName)
    chrList = collector.getPreProcessedChrs(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        chrList = sorted(chrList)

    chrsOnDisk = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
    if not chrsOnDisk:
        raise EmptyGESourceError('No data lines has been read from source file (probably because it is empty).')

    headTrackData = TrackSource().getTrackData(trackName, genome, chrsOnDisk[0],
                                               allowOverlaps, forceChrFolders=True)
    tailChrs = chrsOnDisk[1:]

    # The names are copied into a list because the loop deletes entries
    # from the mapping it would otherwise be iterating over.
    arrayNames = list(headTrackData.keys())

    for arrayName in arrayNames:
        combined = headTrackData[arrayName][:]
        elementDim, dtypeDim = parseMemmapFileFn(headTrackData[arrayName].filename)[1:3]
        del headTrackData[arrayName]

        for chrom in tailChrs:
            # Track data is reloaded per array and chromosome and released
            # right after merging.
            nextTrackData = TrackSource().getTrackData(trackName, genome, chrom,
                                                       allowOverlaps, forceChrFolders=True)
            combined = ChrMemmapFolderMerger.mergeArrays(combined, np.array(nextTrackData[arrayName][:]))

            nextElementDim, nextDtypeDim = parseMemmapFileFn(nextTrackData[arrayName].filename)[1:3]
            elementDim = max(elementDim, nextElementDim)
            dtypeDim = max(dtypeDim, nextDtypeDim)
            del nextTrackData[arrayName]

        mergedFn = createMemmapFileFn(path, arrayName, elementDim, dtypeDim, str(combined.dtype))

        target = np.memmap(mergedFn, dtype=combined.dtype, mode='w+', shape=combined.shape)
        target[:] = combined
        target.flush()
        del target
        del combined
def createBoundingRegionShelve(genome, trackName, allowOverlaps):
    """Store the track's bounding regions in a BoundingRegionShelve and verify the element count.

    The bounding region tuples come from the metadata collector and are
    sorted unless the track format's representation is dense. They are
    stored together with the list of preprocessed chromosomes and a flag
    that is True for non-dense representations.

    Raises:
        ShouldNotOccurError: if the shelve's total element count differs
            from the element count recorded by the collector.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    brTuples = collector.getBoundingRegionTuples(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        brTuples = sorted(brTuples)

    preProcessedChrs = collector.getPreProcessedChrs(allowOverlaps)

    shelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
    reprIsSparse = not collector.getTrackFormat().reprIsDense()
    shelve.storeBoundingRegions(brTuples, preProcessedChrs, reprIsSparse)

    # Sanity check: both sources must agree on the number of elements.
    if shelve.getTotalElementCount() == collector.getNumElements(allowOverlaps):
        return

    raise ShouldNotOccurError(
        "Error: The total element count for all bounding regions is not equal to the total number of genome elements. %s != %s" %
        (shelve.getTotalElementCount(), collector.getNumElements(allowOverlaps)))
def _findTrackInfoBasedMetaData(self): if not self._foundTrackInfoBasedMetaData: if PreProcMetaDataCollector.hasKey(self._genome, self._trackName): collector = PreProcMetaDataCollector(self._genome, self._trackName) self._fileSuffix = collector.getFileSuffix() self._preProcVersion = collector.getPreProcVersion() self._id = collector.getId() self._undirectedEdges = True if collector.hasUndirectedEdges() else False else: ti = TrackInfo(self._genome, self._trackName) self._fileSuffix = ti.fileType self._preProcVersion = ti.preProcVersion self._id = ti.id self._undirectedEdges = True if ti.undirectedEdges else False
def _findTrackInfoBasedMetaData(self): if not self._foundTrackInfoBasedMetaData: if PreProcMetaDataCollector.hasKey(self._genome, self._trackName): collector = PreProcMetaDataCollector(self._genome, self._trackName) self._fileSuffix = collector.getFileSuffix() self._preProcVersion = collector.getPreProcVersion() self._id = collector.getId() self._undirectedEdges = True if collector.hasUndirectedEdges() else False else: ti = TrackInfo(self._genome, self._trackName) self._fileSuffix = ti.fileType self._preProcVersion = ti.preProcVersion self._id = ti.id self._undirectedEdges = True if ti.undirectedEdges else False
def process(self):
    """Preprocess and finalize every track returned by self._allTrackNames().

    For each track and for each overlap rule (True, then False):

    1. Each GE source manager from self._allGESourceManagers is, if
       self._shouldPreProcess() holds, cleaned of outdated files and run
       through a PreProcessGeSourceJob; the collector's dirty status is
       updated from the job.
    2. If any manager was seen, finalization is requested and the
       collector is dirty, the per-overlap-rule output is finalized: in
       'Real' mode with chromosome-folder merging enabled the bounding
       region shelve is created, chromosome folders are merged and then
       removed; afterwards the edge checks are run and the overlap rule is
       marked as finalized.

    After both overlap rules, if finalization is requested, the collector
    is finalized when dirty and its entry removed otherwise.

    Any exception removes the collector entry. It is re-raised when
    self.PASS_ON_EXCEPTIONS is true; otherwise it is reported through
    self._printExceptionMsg (Error=False for NotSupportedError, Error=True
    for everything else) and processing continues with the next track.

    Raises:
        AssertionError: if self._genome is None or a track name equals [''].
    """
    assert self._genome is not None, 'Error: genome must be specified when preprocessing tracks.'

    # NOTE(review): not read again within this method as shown.
    atLeastOneFinalized = False

    for trackName in self._allTrackNames():
        assert trackName != ['']
        overlapRulesProcessedForTrackName = []
        # NOTE(review): the collector is created from the track name before
        # the possible rename below -- confirm this is intended.
        collector = PreProcMetaDataCollector(self._genome, trackName)

        try:
            trackName = self._renameTrackNameIfIllegal(trackName)

            for allowOverlaps in [True, False]:
                anyGeSourceManagers = False

                for geSourceManager in self._allGESourceManagers(trackName, allowOverlaps):
                    anyGeSourceManagers = True

                    # PreProcess if needed
                    if self._shouldPreProcess():
                        PreProcessUtils.removeOutdatedPreProcessedFiles(
                            self._genome, trackName, allowOverlaps, self._mode)

                        # Print the track message only once per overlap rule.
                        if self._shouldPrintProcessMessages() and \
                                allowOverlaps not in overlapRulesProcessedForTrackName:
                            self._printProcessTrackMessage(trackName, allowOverlaps)
                            overlapRulesProcessedForTrackName.append(allowOverlaps)

                        self._status = 'Trying to preprocess geSource...'
                        geSourceJob = PreProcessGeSourceJob(
                            trackName, geSourceManager, allowOverlaps, self._mode)
                        anyWarnings = geSourceJob.process()

                        if self._raiseIfAnyWarnings and anyWarnings and \
                                trackName not in self._warningTrackNames:
                            self._warningTrackNames.append(trackName)

                        collector.updatePreProcDirtyStatus(geSourceJob.hasModifiedData())

                # Finalize overlapRule output if needed
                if anyGeSourceManagers and self._shouldFinalize() and collector.preProcIsDirty():
                    if self._mode == 'Real' and self._shouldMergeChrFolders():
                        self._status = 'Trying to combine chromosome vectors into combined vectors.'
                        PreProcessUtils.createBoundingRegionShelve(self._genome, trackName, allowOverlaps)
                        ChrMemmapFolderMerger.merge(self._genome, trackName, allowOverlaps)

                        self._status = 'Trying to remove chromosome folders'
                        PreProcessUtils.removeChrMemmapFolders(self._genome, trackName, allowOverlaps)

                    self._status = 'Trying to check whether 3D data is correct'
                    PreProcessUtils.checkIfEdgeIdsExist(self._genome, trackName, allowOverlaps)
                    # Run once: this check used to be called twice in a row
                    # with identical arguments.
                    PreProcessUtils.checkUndirectedEdges(self._genome, trackName, allowOverlaps)
                    collector.markOverlapRuleAsFinalized(allowOverlaps)

            # Finalize track if needed
            if self._shouldFinalize():
                if collector.preProcIsDirty():
                    self._status = 'Trying to finalize.'
                    collector.finalize(self._username, self._shouldPrintProcessMessages())
                    atLeastOneFinalized = True
                else:
                    collector.removeEntry()

        except NotSupportedError as e:
            collector.removeEntry()
            if self.PASS_ON_EXCEPTIONS:
                raise
            else:
                self._printExceptionMsg(e, trackName, Error=False)
        except Exception as e:
            collector.removeEntry()
            if self.PASS_ON_EXCEPTIONS:
                raise
            else:
                self._printExceptionMsg(e, trackName, Error=True)
def process(self):
    """Run preprocessing and finalization for all tracks from self._allTrackNames().

    Per track, both overlap rules are handled in the order True, False:

    * every GE source manager yielded by self._allGESourceManagers is
      preprocessed when self._shouldPreProcess() holds (outdated files are
      removed first, then a PreProcessGeSourceJob is run and the
      collector's dirty status is updated from it);
    * when at least one manager was seen, finalization is requested and
      the collector is dirty, the overlap rule's output is finalized: in
      'Real' mode with chromosome-folder merging enabled the bounding
      region shelve is created and chromosome folders are merged and
      removed; then the edge checks run and the overlap rule is marked as
      finalized.

    Once both rules are done and finalization is requested, the collector
    is finalized if dirty, otherwise its entry is removed.

    On any exception the collector entry is removed. The exception is
    re-raised if self.PASS_ON_EXCEPTIONS is true; otherwise it is passed to
    self._printExceptionMsg with Error=False for NotSupportedError and
    Error=True for all other exceptions, and the next track is processed.

    Raises:
        AssertionError: if self._genome is None or a track name equals [''].
    """
    assert self._genome is not None, 'Error: genome must be specified when preprocessing tracks.'

    # NOTE(review): not read again within this method as shown.
    atLeastOneFinalized = False

    for trackName in self._allTrackNames():
        assert trackName != ['']
        overlapRulesProcessedForTrackName = []
        # NOTE(review): the collector is built from the name as given; the
        # name may be replaced by the rename call below -- confirm intended.
        collector = PreProcMetaDataCollector(self._genome, trackName)

        try:
            trackName = self._renameTrackNameIfIllegal(trackName)

            for allowOverlaps in [True, False]:
                anyGeSourceManagers = False

                for geSourceManager in self._allGESourceManagers(trackName, allowOverlaps):
                    anyGeSourceManagers = True

                    # PreProcess if needed
                    if self._shouldPreProcess():
                        PreProcessUtils.removeOutdatedPreProcessedFiles(
                            self._genome, trackName, allowOverlaps, self._mode)

                        # The track message is printed once per overlap rule.
                        if self._shouldPrintProcessMessages() and \
                                allowOverlaps not in overlapRulesProcessedForTrackName:
                            self._printProcessTrackMessage(trackName, allowOverlaps)
                            overlapRulesProcessedForTrackName.append(allowOverlaps)

                        self._status = 'Trying to preprocess geSource...'
                        geSourceJob = PreProcessGeSourceJob(
                            trackName, geSourceManager, allowOverlaps, self._mode)
                        anyWarnings = geSourceJob.process()

                        if self._raiseIfAnyWarnings and anyWarnings and \
                                trackName not in self._warningTrackNames:
                            self._warningTrackNames.append(trackName)

                        collector.updatePreProcDirtyStatus(geSourceJob.hasModifiedData())

                # Finalize overlapRule output if needed
                if anyGeSourceManagers and self._shouldFinalize() and collector.preProcIsDirty():
                    if self._mode == 'Real' and self._shouldMergeChrFolders():
                        self._status = 'Trying to combine chromosome vectors into combined vectors.'
                        PreProcessUtils.createBoundingRegionShelve(self._genome, trackName, allowOverlaps)
                        ChrMemmapFolderMerger.merge(self._genome, trackName, allowOverlaps)

                        self._status = 'Trying to remove chromosome folders'
                        PreProcessUtils.removeChrMemmapFolders(self._genome, trackName, allowOverlaps)

                    self._status = 'Trying to check whether 3D data is correct'
                    PreProcessUtils.checkIfEdgeIdsExist(self._genome, trackName, allowOverlaps)
                    # A second, identical call to this check was removed;
                    # one pass is sufficient.
                    PreProcessUtils.checkUndirectedEdges(self._genome, trackName, allowOverlaps)
                    collector.markOverlapRuleAsFinalized(allowOverlaps)

            # Finalize track if needed
            if self._shouldFinalize():
                if collector.preProcIsDirty():
                    self._status = 'Trying to finalize.'
                    collector.finalize(self._username, self._shouldPrintProcessMessages())
                    atLeastOneFinalized = True
                else:
                    collector.removeEntry()

        except NotSupportedError as e:
            collector.removeEntry()
            if self.PASS_ON_EXCEPTIONS:
                raise
            else:
                self._printExceptionMsg(e, trackName, Error=False)
        except Exception as e:
            collector.removeEntry()
            if self.PASS_ON_EXCEPTIONS:
                raise
            else:
                self._printExceptionMsg(e, trackName, Error=True)
def _getGESourceManagerFromTrack(self, trackName):
    """Build an OverlapClusteringGESourceManager for an already preprocessed track.

    The bounding region tuples are read from the metadata collector with
    allowOverlaps=True and handed to the manager together with the genome
    and track name.
    """
    collector = PreProcMetaDataCollector(self._genome, trackName)
    origBrTuples = collector.getBoundingRegionTuples(allowOverlaps=True)
    return OverlapClusteringGESourceManager(self._genome, trackName, origBrTuples)