def removeOutdatedPreProcessedFiles(genome, trackName, allowOverlaps, mode): collector = PreProcMetaDataCollector(genome, trackName) if PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps) and not \ collector.hasRemovedPreProcFiles(allowOverlaps): dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps) assert dirPath.startswith(Config.PROCESSED_DATA_PATH), \ "Processed data path '%s' does not start with '%s'" % \ (dirPath, Config.PROCESSED_DATA_PATH) if mode == 'Real': print 'Removing outdated preprocessed data: ', dirPath for fn in os.listdir(dirPath): fullFn = os.path.join(dirPath, fn) if os.path.isfile(fullFn): os.unlink(fullFn) if os.path.isdir(fullFn): if PreProcessUtils._isOldTypeChromDirectory(fullFn, genome): shutil.rmtree(fullFn) else: print 'Would now have removed outdated preprocessed data if real run: ', dirPath collector.updateRemovedPreProcFilesFlag(allowOverlaps, True) if mode == 'Real': ti = TrackInfo(genome, trackName) ti.resetTimeOfPreProcessing()
def _calcAndStoreSubTrackCount(self, trackName):
    """
    Compute and store the sub-track count of trackName.

    The count is the sum of the stored subTrackCount of every subtype
    returned by ProcTrackOptions.getSubtypes, plus one if the track's own
    TrackInfo is valid. The result is written to the track's TrackInfo,
    which is then stored.
    """
    ownInfo = TrackInfo(self._genome, trackName)

    # Falsy sub-track counts contribute nothing to the total.
    total = sum(
        TrackInfo(self._genome, trackName + [subName]).subTrackCount or 0
        for subName in ProcTrackOptions.getSubtypes(self._genome, trackName, True)
    )
    if ownInfo.isValid():
        total += 1

    ownInfo.subTrackCount = total
    ownInfo.store()
def modifyTnRecord(genome, oldTn, newTn, verbose): trackInfo = TrackInfo(genome, oldTn) assert trackInfo.trackName == oldTn assert trackInfo.timeOfPreProcessing is not None, 'ERROR: trackInfo-object not complete for TN (is this track preprocessed?): ' + str(oldTn) #if trackInfo.timeOfPreProcessing is None: #print 'WARNING: timeOfPreProcessing is None for: ',oldTn trackInfo.trackName = newTn if not ONLY_SIMULATION: trackInfo.store() if verbose: print '(Storing track-info with new tn: %s)' % str(newTn) else: if verbose: print 'Would now store track-info with new tn: %s' % str(newTn)
def getTrackExtractionOptions(genome, trackName): from gtrackcore.track.core.Track import PlainTrack from gtrackcore.input.userbins.UserBinSource import MinimalBinSource from gtrackcore.extract.fileformats.FileFormatComposer import \ findMatchingFileFormatComposers, getComposerClsFromFileSuffix tf = PlainTrack(trackName).getTrackView( MinimalBinSource(genome)[0]).trackFormat extractionOptions = [] matchingComposers = findMatchingFileFormatComposers(tf) for composerInfo in matchingComposers: allOverlapRules = tf.getAllOverlapRules() for allowOverlaps in allOverlapRules: extractionOptions.append( \ (composerInfo.trackFormatName.capitalize() + \ ' ' + TrackExtractor.getFileFormatText(composerInfo.fileFormatName) + \ (', ' + (TrackExtractor.ALLOW_OVERLAPS_TRUE_TEXT if allowOverlaps else \ TrackExtractor.ALLOW_OVERLAPS_FALSE_TEXT) \ if len(allOverlapRules) > 1 else ''), \ composerInfo.fileSuffix) ) ti = TrackInfo(genome, trackName) if ti.fileType != '': try: extractionOptions.append( (TrackExtractor.ORIG_FILE_FORMAT_TEXT.capitalize() + \ ' ' + TrackExtractor.getFileSuffixText(ti.fileType), \ getComposerClsFromFileSuffix(ti.fileType).getDefaultFileNameSuffix())) except Exception, e: print 'Error:', e
def modifyTnRecord(genome, oldTn, newTn, verbose): trackInfo = TrackInfo(genome, oldTn) assert trackInfo.trackName == oldTn assert trackInfo.timeOfPreProcessing is not None, 'ERROR: trackInfo-object not complete for TN (is this track preprocessed?): ' + str( oldTn) #if trackInfo.timeOfPreProcessing is None: #print 'WARNING: timeOfPreProcessing is None for: ',oldTn trackInfo.trackName = newTn if not ONLY_SIMULATION: trackInfo.store() if verbose: print '(Storing track-info with new tn: %s)' % str(newTn) else: if verbose: print 'Would now store track-info with new tn: %s' % str(newTn)
def constructId(geSource):
    """
    Return the id of a genome element source.

    Sources without an original file supply their own id via getId().
    Otherwise the id is constructed by TrackInfo.constructIdFromPath from
    the genome, a path, the source version and PreProcessTracksJob.VERSION.
    The path is the file name itself for external sources, and the
    directory of the file name for all others.
    """
    from gtrackcore.preprocess.PreProcessTracksJob import PreProcessTracksJob

    if not geSource.hasOrigFile():
        return geSource.getId()

    if geSource.isExternal():
        origPath = geSource.getFileName()
    else:
        origPath = os.path.dirname(geSource.getFileName())

    return TrackInfo.constructIdFromPath(
        geSource.getGenome(), origPath,
        geSource.getVersion(), PreProcessTracksJob.VERSION)
def getUniqueKey(self, genome):
    """
    Return a hash identifying this track and its format requirements.

    The key combines the track name, the track id, and the allowOverlaps and
    borderHandling values of the track format requirement; neither of the
    latter two may be None. The track id is looked up from TrackInfo and
    kept on the instance when it is not already set.
    """
    formatReq = self._trackFormatReq
    assert None not in (formatReq.allowOverlaps(), formatReq.borderHandling())

    if not self._trackId:
        self._trackId = TrackInfo(genome, self.trackName).id

    keyParts = (
        tuple(self.trackName),
        self._trackId,
        formatReq.allowOverlaps(),
        formatReq.borderHandling(),
    )
    return hash(keyParts)
def isValidTrack(genome, trackName, fullAccess=False):
    """
    Tell whether trackName is a valid track for genome.

    The track's TrackInfo must report itself valid for the given fullAccess
    setting, and the directory contents of the track must hold at least one
    entry that is either a valid chromosome name of the genome or a
    bounding-region file name.
    """
    if not TrackInfo(genome, trackName).isValid(fullAccess):
        return False

    dirContents = ProcTrackOptions._getDirContents(genome, trackName)
    return any(
        GenomeInfo.isValidChr(genome, entry) or isBoundingRegionFileName(entry)
        for entry in dirContents
    )
def getUniqueKey(self, genome):
    """
    Return a hash identifying this track, its format converter and its
    format requirements.

    The key combines the track name, the track id, the class name and
    VERSION of the first format converter, and the allowOverlaps and
    borderHandling values of the track format requirement. A falsy track
    id, allowOverlaps or borderHandling value is replaced by '' in the key,
    as are both converter entries when there are no format converters.
    The track id is looked up from TrackInfo and kept on the instance when
    it is not already set.
    """
    if not self._trackId:
        self._trackId = TrackInfo(genome, self.trackName).id

    converters = self.formatConverters
    if converters:
        converterName = getClassName(converters[0])
        converterVersion = converters[0].VERSION
    else:
        converterName = converterVersion = ''

    # `x or ''` evaluates each accessor once, instead of once for the test
    # and once more for the value.
    formatReq = self._trackFormatReq
    return hash((tuple(self.trackName),
                 self._trackId or '',
                 converterName,
                 converterVersion,
                 formatReq.allowOverlaps() or '',
                 formatReq.borderHandling() or ''))
def getUniqueKey(self, genome):
    """
    Return a hash identifying this track, its single format converter and
    its format requirements.

    Exactly one format converter must be present, and neither allowOverlaps
    nor borderHandling of the track format requirement may be None. The
    track id is looked up from TrackInfo and kept on the instance when it is
    not already set.
    """
    converters = self.formatConverters
    assert converters is not None and len(converters) == 1, 'FC: ' + str(converters)

    formatReq = self._trackFormatReq
    assert None not in (formatReq.allowOverlaps(), formatReq.borderHandling())

    if not self._trackId:
        self._trackId = TrackInfo(genome, self.trackName).id

    onlyConverter = converters[0]
    keyParts = (
        tuple(self.trackName),
        self._trackId,
        getClassName(onlyConverter),
        onlyConverter.VERSION,
        formatReq.allowOverlaps(),
        formatReq.borderHandling(),
    )
    return hash(keyParts)
def _findTrackInfoBasedMetaData(self):
    """
    Load file suffix, preprocessing version, id and the undirected-edges
    flag for this track, unless that has already been done.

    The values come from PreProcMetaDataCollector when it has an entry for
    the genome and track name, and from the stored TrackInfo otherwise.
    """
    if self._foundTrackInfoBasedMetaData:
        return

    if PreProcMetaDataCollector.hasKey(self._genome, self._trackName):
        collector = PreProcMetaDataCollector(self._genome, self._trackName)
        self._fileSuffix = collector.getFileSuffix()
        self._preProcVersion = collector.getPreProcVersion()
        self._id = collector.getId()
        self._undirectedEdges = bool(collector.hasUndirectedEdges())
    else:
        ti = TrackInfo(self._genome, self._trackName)
        self._fileSuffix = ti.fileType
        self._preProcVersion = ti.preProcVersion
        self._id = ti.id
        self._undirectedEdges = bool(ti.undirectedEdges)

    # Mark the lookup as done; without this the guard above never triggers
    # and the metadata is reloaded on every call.
    # NOTE(review): assumes no other code is responsible for setting this
    # flag - confirm against the rest of the class.
    self._foundTrackInfoBasedMetaData = True
def _calcAndStoreSubTrackCount(self, trackName):
    """
    Compute the number of tracks at and below trackName and store it.

    Adds up the stored subTrackCount of each subtype listed by
    ProcTrackOptions.getSubtypes, counts the track itself when its
    TrackInfo is valid, and stores the total on that TrackInfo.
    """
    trackInfo = TrackInfo(self._genome, trackName)

    subCounts = [
        TrackInfo(self._genome, trackName + [subName]).subTrackCount
        for subName in ProcTrackOptions.getSubtypes(self._genome, trackName, True)
    ]
    # Only truthy counts are added; the track itself counts when valid.
    total = sum(count for count in subCounts if count)
    if trackInfo.isValid():
        total += 1

    trackInfo.subTrackCount = total
    trackInfo.store()
def shouldPreProcessGESource(trackName, geSource, allowOverlaps):
    """
    Decide whether a genome element source needs to be preprocessed.

    Preprocessed data counts as present when preprocessed files exist for
    the allowOverlaps setting and the stored TrackInfo is valid.

    Without an original file, preprocessing is needed only if no such data
    is present and the source is not external. With an original file,
    preprocessing is needed unless such data is present and both the stored
    id and the stored preprocessing version match the source.
    """
    genome = geSource.getGenome()
    storedInfo = TrackInfo(genome, trackName)
    validFilesExist = \
        PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps) and \
        storedInfo.isValid()

    if not geSource.hasOrigFile():
        return not (validFilesExist or geSource.isExternal())

    idMatches = PreProcessUtils.constructId(geSource) == storedInfo.id
    matchesStored = idMatches and geSource.getVersion() == storedInfo.preProcVersion

    return not (validFilesExist and matchesStored)
def extract(cls, trackName, regionList, fn, fileFormatName=DEFAULT_FILE_FORMAT_NAME, globalCoords=True,
            addSuffix=False, asOriginal=False, allowOverlaps=False, ignoreEmpty=False):
    """
    Extract the contents of a track in the given regions to the file fn.

    The composer is chosen from the file type stored in TrackInfo when
    asOriginal is set and that lookup succeeds; otherwise it is chosen from
    fileFormatName. With addSuffix, the extension of fn is replaced by the
    composer's default file name suffix.

    Returns:
        The name of the written file if the composer reports success,
        otherwise None.

    Raises:
        AssertionError: if regionList is empty.
    """
    from gtrackcore.input.adapters.TrackGenomeElementSource import TrackGenomeElementSource
    from gtrackcore.extract.fileformats.FileFormatComposer import \
        getComposerClsFromFileFormatName, getComposerClsFromFileSuffix

    assert len(regionList) > 0

    # The genome is taken from the first region.
    for region in regionList:
        genome = region.genome
        break

    # To silently extract correctly if track type is dense
    if allowOverlaps:
        allowOverlaps = os.path.exists(
            createDirPath(trackName, genome, allowOverlaps=True))

    trackGESource = TrackGenomeElementSource(genome, trackName, regionList,
                                             globalCoords=globalCoords,
                                             allowOverlaps=allowOverlaps,
                                             printWarnings=False)

    composerCls = None
    if asOriginal:
        ti = TrackInfo(genome, trackName)
        if ti.fileType != '':
            try:
                composerCls = getComposerClsFromFileSuffix(ti.fileType)
            except Exception:
                # Best effort: fall back to fileFormatName below. Catching
                # Exception rather than everything lets KeyboardInterrupt
                # and SystemExit propagate.
                pass

    if composerCls is None:
        composerCls = getComposerClsFromFileFormatName(fileFormatName)

    if addSuffix:
        fn = os.path.splitext(fn)[0] + '.' + composerCls.getDefaultFileNameSuffix()

    composer = composerCls(trackGESource)
    ok = composer.composeToFile(fn, ignoreEmpty=ignoreEmpty)

    if ok:
        return fn
def finalize(self, username, printMsg): ti = TrackInfo(self._genome, self._trackName) ti.fileType = self._fileSuffix trackFormat = self.getTrackFormat() ti.trackFormatName = trackFormat.getFormatName() ti.markType = trackFormat.getValTypeName() ti.weightType = trackFormat.getWeightTypeName() ti.undirectedEdges = self._undirectedEdges ti.preProcVersion = self._preProcVersion ti.origElCount = self._numElements[True] ti.clusteredElCount = self._numElements[False] if trackFormat.isDense() and trackFormat.isInterval(): ti.origElCount -= len(self._boundingRegionTuples[True]) ti.clusteredElCount -= len(self._boundingRegionTuples[False]) if True in self._valCategories: ti.numValCategories = len(self._valCategories[True]) if False in self._valCategories: ti.numClusteredValCategories = len(self._valCategories[False]) if True in self._edgeWeightCategories: ti.numEdgeWeightCategories = len(self._edgeWeightCategories[True]) ti.id = self._id ti.timeOfPreProcessing = datetime.datetime.now() ti.lastUpdatedBy = username if ti.hbContact == '': ti.hbContact = username ti.store() if printMsg: print "Finished preprocessing track '%s'." % ':'.join( self._trackName) print self.removeEntry()
def _calcAndStoreSubTrackCount(self, trackName):
    """
    Store a sub-track count of 1 for trackName when its TrackInfo is valid.

    Nothing is changed or stored for an invalid TrackInfo.
    """
    info = TrackInfo(self._genome, trackName)
    if not info.isValid():
        return

    info.subTrackCount = 1
    info.store()
def finalize(self, username, printMsg): ti = TrackInfo(self._genome, self._trackName) ti.fileType = self._fileSuffix trackFormat = self.getTrackFormat() ti.trackFormatName = trackFormat.getFormatName() ti.markType = trackFormat.getValTypeName() ti.weightType = trackFormat.getWeightTypeName() ti.undirectedEdges = self._undirectedEdges ti.preProcVersion = self._preProcVersion ti.origElCount = self._numElements[True] ti.clusteredElCount = self._numElements[False] if trackFormat.isDense() and trackFormat.isInterval(): ti.origElCount -= len(self._boundingRegionTuples[True]) ti.clusteredElCount -= len(self._boundingRegionTuples[False]) if True in self._valCategories: ti.numValCategories = len(self._valCategories[True]) if False in self._valCategories: ti.numClusteredValCategories = len(self._valCategories[False]) if True in self._edgeWeightCategories: ti.numEdgeWeightCategories = len(self._edgeWeightCategories[True]) ti.id = self._id ti.timeOfPreProcessing = datetime.datetime.now() ti.lastUpdatedBy = username if ti.hbContact == '': ti.hbContact = username ti.store() if printMsg: print "Finished preprocessing track '%s'." % ':'.join(self._trackName) print self.removeEntry()