def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
    """Check that every id used in the 'edges' column also occurs as an 'id'.

    Nothing is checked when the track format reported by the metadata
    collector is not linked. Otherwise the 'id' and 'edges' arrays of all
    preprocessed chromosomes are collected, empty strings are discarded, and
    the two sets of ids are compared.

    Raises:
        InvalidFormatError: if one or more ids referenced by edges are missing
            from the 'id' arrays. The message lists them in sorted order.
    """
    metaData = PreProcMetaDataCollector(genome, trackName)
    if not metaData.getTrackFormat().isLinked():
        return

    knownIds = numpy.array([], dtype='S')
    referencedIds = numpy.array([], dtype='S')

    for chrom in metaData.getPreProcessedChrs(allowOverlaps):
        chromData = TrackSource().getTrackData(trackName, genome, chrom,
                                               allowOverlaps)
        # Deduplicate after every chromosome so that the accumulators only
        # ever hold unique values.
        knownIds = numpy.unique(
            numpy.concatenate((knownIds, chromData['id'][:])))
        flatEdges = chromData['edges'][:].flatten()
        referencedIds = numpy.unique(
            numpy.concatenate((referencedIds, flatEdges)))

    # Empty strings are ignored on both sides of the comparison.
    knownIds = knownIds[knownIds != '']
    referencedIds = referencedIds[referencedIds != '']

    danglingIds = set(referencedIds) - set(knownIds)
    if danglingIds:
        raise InvalidFormatError(
            "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
            + ', '.join(sorted(danglingIds)))
def checkUndirectedEdges(genome, trackName, allowOverlaps):
    """Check that all edges of an undirected, linked track are mirrored.

    The check is skipped unless the track format is linked and the collector
    reports undirected edges. For every element of every preprocessed
    chromosome, the non-empty edges (and the matching weights, when a
    'weights' array is present) are passed to
    PreProcessUtils._adjustComplementaryEdgeWeightDict together with a shared
    dict. Whatever is left in that dict afterwards is reported as unmatched.

    Raises:
        InvalidFormatError: if the dict is non-empty after all chromosomes
            have been handled. The message has one line per leftover edge.
    """
    metaData = PreProcMetaDataCollector(genome, trackName)
    if not metaData.getTrackFormat().isLinked() or \
            not metaData.hasUndirectedEdges():
        return

    leftoverEdges = {}

    for chrom in metaData.getPreProcessedChrs(allowOverlaps):
        chromData = TrackSource().getTrackData(trackName, genome, chrom,
                                               allowOverlaps)
        ids = chromData['id']
        edges = chromData['edges']
        weights = chromData.get('weights')

        for row, elementId in enumerate(ids):
            # Only the non-empty edge slots of this row are considered.
            isUsed = edges[row] != ''
            rowEdges = edges[row][isUsed]
            if weights is not None:
                rowWeights = weights[row][isUsed]
            else:
                rowWeights = None
            PreProcessUtils._adjustComplementaryEdgeWeightDict(
                leftoverEdges, elementId, rowEdges, rowWeights)

    if not leftoverEdges:
        return

    descriptions = []
    for toId, fromDict in leftoverEdges.items():
        for fromId, weight in fromDict.items():
            weightPart = " with weight '%s'" % weight if weight != '' else ''
            descriptions.append(
                "from '%s' to '%s'" % (fromId, toId) + weightPart)

    raise InvalidFormatError(
        "Error: All edges are not undirected. The following edges specifications "
        "are not matched by an opposite edge with equal weight:"
        + os.linesep + os.linesep.join(descriptions))
def removeOutdatedPreProcessedFiles(cls, genome, trackName, allowOverlaps, mode): collector = PreProcMetaDataCollector(genome, trackName) if cls.preProcFilesExist(genome, trackName, allowOverlaps) and not \ collector.hasRemovedPreProcFiles(allowOverlaps): dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps) assert (dirPath.startswith(PROCESSED_DATA_PATH)) if mode == 'Real': print 'Removing outdated preprocessed data: ', dirPath for fn in os.listdir(dirPath): fullFn = os.path.join(dirPath, fn) if os.path.isfile(fullFn): os.unlink(fullFn) if os.path.isdir(fullFn): if cls._isOldTypeChromDirectory(fullFn, genome): shutil.rmtree(fullFn) else: print 'Would now have removed outdated preprocessed data if real run: ', dirPath collector.updateRemovedPreProcFilesFlag(allowOverlaps, True) if mode == 'Real': ti = TrackInfo(genome, trackName) ti.resetTimeOfPreProcessing()
def merge(genome, trackName, allowOverlaps):
    """Merge the per-chromosome arrays of a track into one memmap per array.

    The chromosomes flagged as preprocessed are taken from the metadata
    collector (sorted unless the track representation is dense) and filtered
    through ChrMemmapFolderMerger._existingChrIter. For every array of the
    first existing chromosome, the arrays of the remaining chromosomes are
    appended with ChrMemmapFolderMerger.mergeArrays, and the result is written
    to a new memmap file in the track directory. The element and dtype
    dimensions used in the file name are the maxima over all chromosomes.

    Raises:
        EmptyGESourceError: if no chromosome is left after filtering.
    """
    path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    collector = PreProcMetaDataCollector(genome, trackName)
    chrList = collector.getPreProcessedChrs(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        chrList = sorted(chrList)

    existingChrs = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
    if not existingChrs:
        raise EmptyGESourceError(
            'No data lines has been read from source file (probably because it is empty).'
        )

    firstChr = existingChrs[0]
    remainingChrs = existingChrs[1:]

    firstData = TrackSource().getTrackData(trackName, genome, firstChr,
                                           allowOverlaps, forceChrFolders=True)

    # Snapshot of the keys: entries are deleted from firstData while looping.
    for arrayName in list(firstData.keys()):
        firstArray = firstData[arrayName]
        merged = firstArray[:]
        elementDim, dtypeDim = parseMemmapFileFn(firstArray.filename)[1:3]
        del firstArray
        del firstData[arrayName]

        for chrom in remainingChrs:
            chromData = TrackSource().getTrackData(trackName, genome, chrom,
                                                   allowOverlaps,
                                                   forceChrFolders=True)
            merged = ChrMemmapFolderMerger.mergeArrays(
                merged, np.array(chromData[arrayName][:]))

            chromDims = parseMemmapFileFn(chromData[arrayName].filename)[1:3]
            elementDim = max(elementDim, chromDims[0])
            dtypeDim = max(dtypeDim, chromDims[1])

            del chromData[arrayName]

        mergedFn = createMemmapFileFn(path, arrayName, elementDim, dtypeDim,
                                      str(merged.dtype))

        target = np.memmap(mergedFn, dtype=merged.dtype, mode='w+',
                           shape=merged.shape)
        target[:] = merged
        target.flush()
        # Drop the references before the next array is handled.
        del target
        del merged
def preProcFilesExist(cls, genome, trackName, allowOverlaps):
    """Return whether preprocessed files exist for the track and overlap rule.

    The answer stored in the metadata collector is returned when it is not
    None. Otherwise it is determined here: True when merged files exist
    (cls.mergedPreProcFilesExist), else the result of
    cls.oldTypePreProcFilesExist. The outcome is stored in the collector,
    together with the 'merged' result, before it is returned.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    cached = collector.preProcFilesExist(allowOverlaps)
    if cached is not None:
        return cached

    merged = cls.mergedPreProcFilesExist(genome, trackName, allowOverlaps)
    filesExist = True if merged else \
        cls.oldTypePreProcFilesExist(genome, trackName, allowOverlaps)

    collector.updatePreProcFilesExistFlag(allowOverlaps, filesExist, merged)
    return filesExist
def preProcFilesExist(genome, trackName, allowOverlaps):
    """Return whether preprocessed files exist for the track and overlap rule.

    The answer stored in the metadata collector is returned when it is not
    None. Otherwise it is determined here and stored in the collector:

    * True when the bounding region shelve file exists,
    * else the result of PreProcessUtils._hasOldTypeChromSubDirs when the
      track directory exists,
    * else False.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    filesExist = collector.preProcFilesExist(allowOverlaps)
    if filesExist is not None:
        return filesExist

    dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

    # The shelve file is what is tested for. (An earlier, commented-out
    # variant scanned dirPath for 'start'/'end'/'val'/'edges' files.)
    if BoundingRegionShelve(genome, trackName, allowOverlaps).fileExists():
        filesExist = True
    elif os.path.exists(dirPath):
        filesExist = PreProcessUtils._hasOldTypeChromSubDirs(dirPath, genome)
    else:
        filesExist = False

    collector.updatePreProcFilesExistFlag(allowOverlaps, filesExist)
    return filesExist
def removeChrMemmapFolders(genome, trackName, allowOverlaps):
    """Remove the directory of every chromosome flagged as preprocessed.

    The chromosome list comes from the metadata collector. Each directory
    path is asserted to exist and to be a directory before the whole tree is
    deleted with shutil.rmtree.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    for chrom in collector.getPreProcessedChrs(allowOverlaps):
        chromDir = createDirPath(trackName, genome, chrom, allowOverlaps)
        assert os.path.exists(chromDir), 'Path does not exist: ' + chromDir
        assert os.path.isdir(chromDir), 'Path is not a directory: ' + chromDir
        shutil.rmtree(chromDir)
def process(self):
    """Register metadata for the geSource and, if it has elements, process it.

    The metadata needed for finalization is always handed to the collector.
    When the geSource manager reports no elements, nothing more is done.
    Otherwise:

    * in 'Real' mode every genome element is written through an
      OutputManager, which is closed afterwards;
    * in any other mode the geSource is only iterated, without writing;
    * in 'UpdateMeta' and 'Real' mode the job is marked as dirty and the
      chromosomes are flagged as preprocessed in the collector.
    """
    manager = self._geSourceManager
    geSource = manager.getGESource()
    genome = geSource.genome

    collector = PreProcMetaDataCollector(genome, self._trackName)

    # Imported locally, as in the original code (presumably to avoid a
    # circular import -- TODO confirm).
    from gold.origdata.PreProcessTracksJob import PreProcessTracksJob

    collector.updateMetaDataForFinalization(
        geSource.getFileSuffix(),
        geSource.getPrefixList(),
        geSource.getValDataType(),
        geSource.getValDim(),
        geSource.getEdgeWeightDataType(),
        geSource.getEdgeWeightDim(),
        geSource.hasUndirectedEdges(),
        geSource.getVersion(),
        PreProcessTracksJob.VERSION,
        PreProcessUtils.constructId(geSource),
        manager.getNumElements(),
        manager.getBoundingRegionTuples(),
        manager.getValCategories(),
        manager.getEdgeWeightCategories(),
        self._allowOverlaps)

    if not manager.getNumElements() > 0:
        return

    if self._mode != 'Real':
        # Consume the source without writing anything.
        for _ in geSource:
            pass
    else:
        output = OutputManager(genome, self._trackName, self._allowOverlaps,
                               manager)
        if geSource.isSliceSource():
            writeFunc = output.writeRawSlice
        else:
            writeFunc = output.writeElement
        for ge in geSource:
            writeFunc(ge)
        output.close()

    if self._mode in ['UpdateMeta', 'Real']:
        self._dirty = True
        collector.flagChrsAsPreProcessed(self._allowOverlaps,
                                         manager.getAllChrs())
def createBoundingRegionShelve(genome, trackName, allowOverlaps):
    """Store the bounding regions of a track in its BoundingRegionShelve.

    The bounding region tuples and the preprocessed chromosomes are read from
    the metadata collector; the tuples are sorted unless the track
    representation is dense. After storing, the total element count of the
    shelve is compared with the element count known to the collector.

    Raises:
        ShouldNotOccurError: if the two element counts differ.
    """
    collector = PreProcMetaDataCollector(genome, trackName)

    brTuples = collector.getBoundingRegionTuples(allowOverlaps)
    if not collector.getTrackFormat().reprIsDense():
        brTuples = sorted(brTuples)

    preProcessedChrs = collector.getPreProcessedChrs(allowOverlaps)

    shelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
    isSparse = not collector.getTrackFormat().reprIsDense()
    shelve.storeBoundingRegions(brTuples, preProcessedChrs, isSparse)

    # Sanity check
    if shelve.getTotalElementCount() == collector.getNumElements(allowOverlaps):
        return

    raise ShouldNotOccurError(
        "Error: The total element count for all bounding regions is not equal to the total number of genome elements. %s != %s"
        % (shelve.getTotalElementCount(),
           collector.getNumElements(allowOverlaps)))
def _allGESourceManagers(self, trackName, allowOverlaps): trackNameStr = ':'.join(trackName) self._status = "Trying to create GESourceManager " \ "(trackName: {}, allowOverlaps: {})".format(trackNameStr, allowOverlaps) collector = PreProcMetaDataCollector(self._genome, trackName) if allowOverlaps == False and collector.overlapRuleHasBeenFinalized( True): for i in range(1): self._status = 'Trying to prepare preprocessing for track "%s"' % trackNameStr + \ (' (allowOverlaps: %s)' % allowOverlaps) yield self._getGESourceManagerFromTrack(trackName) else: for geSource in self._allGESources(trackName): if allowOverlaps == True: tf = TrackFormat.createInstanceFromGeSource(geSource) if tf.isDense() or geSource.hasNoOverlappingElements(): return self._status = 'Trying to prepare preprocessing for track "%s"' % trackNameStr + \ (' (filename: "%s")' % geSource.getFileName() if geSource.hasOrigFile() else '') + \ (' (allowOverlaps: %s)' % allowOverlaps) if PreProcessUtils.shouldPreProcessGESource( trackName, geSource, allowOverlaps): yield self._getGESourceManagerFromGESource(geSource)
def _findTrackInfoBasedMetaData(self): if not self._foundTrackInfoBasedMetaData: if PreProcMetaDataCollector.hasKey(self._genome, self._trackName): collector = PreProcMetaDataCollector(self._genome, self._trackName) self._fileSuffix = collector.getFileSuffix() self._geSourceVersion = collector.getGeSourceVersion() self._id = collector.getId() self._undirectedEdges = True if collector.hasUndirectedEdges() else False else: ti = TrackInfo(self._genome, self._trackName) self._fileSuffix = ti.fileType self._geSourceVersion = ti.geSourceVersion self._id = ti.id self._undirectedEdges = True if ti.undirectedEdges else False
def _createPreProcFiles(self):
    """Register metadata for the geSource and write its elements if needed.

    The metadata needed for finalization is always handed to the collector.
    When the geSource manager reports zero elements, nothing more is done.
    In 'Real' mode every genome element is written through an OutputManager,
    the chromosomes are flagged as preprocessed and the output is closed. In
    any other mode the geSource is only iterated, without writing.
    """
    manager = self._geSourceManager
    geSource = manager.getGESource()
    genome = geSource.genome

    collector = PreProcMetaDataCollector(genome, self._trackName)
    collector.updateMetaDataForFinalization(
        geSource.getFileSuffix(),
        geSource.getPrefixList(),
        geSource.getValDataType(),
        geSource.getValDim(),
        geSource.getEdgeWeightDataType(),
        geSource.getEdgeWeightDim(),
        geSource.hasUndirectedEdges(),
        geSource.getVersion(),
        PreProcessUtils.constructId(geSource),
        manager.getNumElements(),
        manager.getBoundingRegionTuples(),
        manager.getValCategories(),
        manager.getEdgeWeightCategories(),
        self._allowOverlaps)

    if manager.getNumElements() == 0:
        return

    if self._mode == 'Real':
        output = OutputManager(genome, self._trackName, self._allowOverlaps,
                               manager)
        if geSource.isSliceSource():
            writeFunc = output.writeRawSlice
        else:
            writeFunc = output.writeElement

        for ge in geSource:
            writeFunc(ge)

        collector.flagChrsAsPreProcessed(self._allowOverlaps,
                                         manager.getAllChrs())
        output.close()
    else:
        # Consume the source without writing anything.
        for _ in geSource:
            pass
def process(self):
    """Preprocess and finalize every track returned by self._allTrackNames.

    For each track and for each overlap rule (True first, then False):

    1. Every GESourceManager from self._allGESourceManagers is preprocessed
       with a PreProcessGeSourceJob when self._shouldPreProcess() is true.
       Outdated preprocessed files are removed first, and the dirty status
       of the collector is updated from the job afterwards.
    2. If there was at least one manager, finalization is enabled and the
       collector is dirty, the overlap rule is finalized. In 'Real' mode
       with chromosome folder merging enabled, this first creates the
       bounding region shelve, merges the chromosome arrays and removes the
       chromosome folders. The edge checks are then run and the overlap
       rule is marked as finalized.

    After both overlap rules, the track itself is finalized if the collector
    is dirty; otherwise the collector entry is removed.

    Any exception raised while handling a track removes the collector entry.
    Depending on DebugConfig.PASS_ON_PREPROCESS_EXCEPTIONS it is then either
    re-raised as PreprocessWarning (for NotSupportedError) or PreprocessError
    (for everything else), or only printed, after which the next track is
    handled.

    Raises:
        AssertionError: if self._genome is None or a track name is [''].
    """
    assert self._genome is not None, 'Error: genome must be specified when preprocessing tracks.'

    # NOTE(review): this flag is maintained but not returned in the code
    # visible here; confirm whether a trailing return was intended.
    atLeastOneFinalized = False

    for trackName in self._allTrackNames():
        assert trackName != ['']
        overlapRulesProcessedForTrackName = []
        # NOTE(review): the collector is created before the track name is
        # (possibly) renamed below -- confirm that this is intended.
        collector = PreProcMetaDataCollector(self._genome, trackName)

        try:
            trackName = self._renameTrackNameIfIllegal(trackName)

            for allowOverlaps in [True, False]:
                anyGeSourceManagers = False

                for geSourceManager in self._allGESourceManagers(
                        trackName, allowOverlaps):
                    anyGeSourceManagers = True

                    # PreProcess if needed
                    if self._shouldPreProcess():
                        PreProcessUtils.removeOutdatedPreProcessedFiles(
                            self._genome, trackName, allowOverlaps, self._mode)

                        # Print the track message only once per overlap rule.
                        if self._shouldPrintProcessMessages() and \
                                allowOverlaps not in overlapRulesProcessedForTrackName:
                            self._printProcessTrackMessage(trackName,
                                                           allowOverlaps)
                            overlapRulesProcessedForTrackName.append(
                                allowOverlaps)

                        self._status = 'Trying to preprocess geSource...'
                        geSourceJob = PreProcessGeSourceJob(
                            trackName, geSourceManager, allowOverlaps,
                            self._mode)
                        anyWarnings = geSourceJob.process()

                        if self._raiseIfAnyWarnings and anyWarnings and \
                                trackName not in self._warningTrackNames:
                            self._warningTrackNames.append(trackName)

                        collector.updatePreProcDirtyStatus(
                            geSourceJob.hasModifiedData())

                # Finalize overlapRule output if needed
                if anyGeSourceManagers and self._shouldFinalize() and \
                        collector.preProcIsDirty():
                    if self._mode == 'Real' and self._shouldMergeChrFolders():
                        self._status = 'Trying to combine chromosome vectors into combined vectors.'
                        PreProcessUtils.createBoundingRegionShelve(
                            self._genome, trackName, allowOverlaps)
                        ChrMemmapFolderMerger.merge(
                            self._genome, trackName, allowOverlaps)

                        self._status = 'Trying to remove chromosome folders'
                        PreProcessUtils.removeChrMemmapFolders(
                            self._genome, trackName, allowOverlaps)
                        collector.updatePreProcFilesExistFlag(
                            allowOverlaps, preProcFilesExist=True, merged=True)

                    self._status = 'Trying to check whether 3D data is correct'
                    PreProcessUtils.checkIfEdgeIdsExist(
                        self._genome, trackName, allowOverlaps)
                    # Run once only: the original called this twice in a row
                    # with identical arguments.
                    PreProcessUtils.checkUndirectedEdges(
                        self._genome, trackName, allowOverlaps)
                    collector.markOverlapRuleAsFinalized(allowOverlaps)

            # Finalize track if needed
            if self._shouldFinalize():
                if collector.preProcIsDirty():
                    self._status = 'Trying to finalize.'
                    collector.finalize(self._username,
                                       self._shouldPrintProcessMessages())
                    atLeastOneFinalized = True
                else:
                    collector.removeEntry()

        except NotSupportedError as e:
            collector.removeEntry()
            if DebugConfig.PASS_ON_PREPROCESS_EXCEPTIONS:
                raise_from(
                    PreprocessWarning(
                        self._addContextToExceptionMsg(e, trackName)), e)
            else:
                self._printExceptionMsg(e, trackName, Error=False)
        except Exception as e:
            collector.removeEntry()
            if DebugConfig.PASS_ON_PREPROCESS_EXCEPTIONS:
                raise_from(
                    PreprocessError(
                        self._addContextToExceptionMsg(e, trackName)), e)
            else:
                self._printExceptionMsg(e, trackName, Error=True)
def _getGESourceManagerFromTrack(self, trackName):
    """Return an OverlapClusteringGESourceManager for the given track.

    The manager is built from the bounding region tuples that the metadata
    collector holds for the allowOverlaps=True version of the track.
    """
    collector = PreProcMetaDataCollector(self._genome, trackName)
    brTuplesWithOverlaps = collector.getBoundingRegionTuples(allowOverlaps=True)
    return OverlapClusteringGESourceManager(self._genome, trackName,
                                            brTuplesWithOverlaps)