def removeOutdatedPreProcessedFiles(genome, trackName, allowOverlaps, mode): collector = PreProcMetaDataCollector(genome, trackName) if PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps) and not \ collector.hasRemovedPreProcFiles(allowOverlaps): dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps) assert dirPath.startswith(Config.PROCESSED_DATA_PATH), \ "Processed data path '%s' does not start with '%s'" % \ (dirPath, Config.PROCESSED_DATA_PATH) if mode == 'Real': print 'Removing outdated preprocessed data: ', dirPath for fn in os.listdir(dirPath): fullFn = os.path.join(dirPath, fn) if os.path.isfile(fullFn): os.unlink(fullFn) if os.path.isdir(fullFn): if PreProcessUtils._isOldTypeChromDirectory(fullFn, genome): shutil.rmtree(fullFn) else: print 'Would now have removed outdated preprocessed data if real run: ', dirPath collector.updateRemovedPreProcFilesFlag(allowOverlaps, True) if mode == 'Real': ti = TrackInfo(genome, trackName) ti.resetTimeOfPreProcessing()
def _calcAndStoreSubTrackCount(self, trackName):
    """Recompute and store the sub-track count of ``trackName``.

    The count is the sum of the ``subTrackCount`` values of the
    ``TrackInfo`` objects for each subtype name returned by
    ``ProcTrackOptions.getSubtypes`` (falsy counts are skipped), plus one
    if the track's own ``TrackInfo`` is valid. The result is assigned to
    the track's ``TrackInfo`` and stored unconditionally.

    Args:
        trackName: Track name as a list; subtype names are appended to it
            to address each sub-track.
    """
    trackInfo = TrackInfo(self._genome, trackName)

    subTrackCounts = (
        TrackInfo(self._genome, trackName + [subName]).subTrackCount
        for subName in ProcTrackOptions.getSubtypes(self._genome, trackName, True)
    )
    total = sum(count for count in subTrackCounts if count)

    # The track itself counts as one when it is valid.
    if trackInfo.isValid():
        total += 1

    trackInfo.subTrackCount = total
    trackInfo.store()
def modifyTnRecord(genome, oldTn, newTn, verbose): trackInfo = TrackInfo(genome, oldTn) assert trackInfo.trackName == oldTn assert trackInfo.timeOfPreProcessing is not None, 'ERROR: trackInfo-object not complete for TN (is this track preprocessed?): ' + str(oldTn) #if trackInfo.timeOfPreProcessing is None: #print 'WARNING: timeOfPreProcessing is None for: ',oldTn trackInfo.trackName = newTn if not ONLY_SIMULATION: trackInfo.store() if verbose: print '(Storing track-info with new tn: %s)' % str(newTn) else: if verbose: print 'Would now store track-info with new tn: %s' % str(newTn)
def constructId(geSource):
    """Return the id describing the current state of ``geSource``.

    For sources without an original file, the source's own ``getId()`` is
    returned. Otherwise the id is built by
    ``TrackInfo.constructIdFromPath`` from the genome, a path, the source
    version and ``PreProcessTracksJob.VERSION``. The path is the file name
    itself for external sources, and the file's directory otherwise.

    Args:
        geSource: Genome element source to build the id for.
    """
    # Kept function-local and first, as in the original (presumably to
    # avoid an import cycle -- confirm before moving to module level).
    from gtrackcore_memmap.preprocess.PreProcessTracksJob import PreProcessTracksJob

    if not geSource.hasOrigFile():
        return geSource.getId()

    if geSource.isExternal():
        origPath = geSource.getFileName()
    else:
        origPath = os.path.dirname(geSource.getFileName())

    return TrackInfo.constructIdFromPath(
        geSource.getGenome(),
        origPath,
        geSource.getVersion(),
        PreProcessTracksJob.VERSION,
    )
def shouldPreProcessGESource(trackName, geSource, allowOverlaps):
    """Tell whether ``geSource`` needs to be preprocessed for ``trackName``.

    "Valid files exist" means preprocessed files are present
    (``PreProcessUtils.preProcFilesExist``) and the stored ``TrackInfo``
    is valid.

    * Source without an original file: preprocessing is needed only when
      no valid files exist and the source is not external.
    * Source with an original file: preprocessing is needed unless valid
      files exist and the stored record matches the source, i.e. the
      stored id equals ``PreProcessUtils.constructId(geSource)`` and the
      stored ``preProcVersion`` equals the source version.

    Args:
        trackName: Track name used to look up the stored ``TrackInfo``.
        geSource: Genome element source being considered.
        allowOverlaps: Selects which preprocessed variant is inspected.

    Returns:
        bool: True if the source should be preprocessed.
    """
    genome = geSource.getGenome()
    storedInfo = TrackInfo(genome, trackName)
    hasValidFiles = (
        PreProcessUtils.preProcFilesExist(genome, trackName, allowOverlaps)
        and storedInfo.isValid()
    )

    if not geSource.hasOrigFile():
        return not (hasValidFiles or geSource.isExternal())

    # Evaluated regardless of hasValidFiles, as in the original.
    idMatches = PreProcessUtils.constructId(geSource) == storedInfo.id
    matchesStored = idMatches and \
        geSource.getVersion() == storedInfo.preProcVersion

    return not (hasValidFiles and matchesStored)
def finalize(self, username, printMsg): ti = TrackInfo(self._genome, self._trackName) ti.fileType = self._fileSuffix trackFormat = self.getTrackFormat() ti.trackFormatName = trackFormat.getFormatName() ti.markType = trackFormat.getValTypeName() ti.weightType = trackFormat.getWeightTypeName() ti.undirectedEdges = self._undirectedEdges ti.preProcVersion = self._preProcVersion ti.origElCount = self._numElements[True] ti.clusteredElCount = self._numElements[False] if trackFormat.isDense() and trackFormat.isInterval(): ti.origElCount -= len(self._boundingRegionTuples[True]) ti.clusteredElCount -= len(self._boundingRegionTuples[False]) if True in self._valCategories: ti.numValCategories = len(self._valCategories[True]) if False in self._valCategories: ti.numClusteredValCategories = len(self._valCategories[False]) if True in self._edgeWeightCategories: ti.numEdgeWeightCategories = len(self._edgeWeightCategories[True]) ti.id = self._id ti.timeOfPreProcessing = datetime.datetime.now() ti.lastUpdatedBy = username if ti.hbContact == '': ti.hbContact = username ti.store() if printMsg: print "Finished preprocessing track '%s'." % ':'.join(self._trackName) print self.removeEntry()
def _calcAndStoreSubTrackCount(self, trackName):
    """Store a sub-track count of 1 for ``trackName`` if its record is valid.

    Nothing is changed or stored when the track's ``TrackInfo`` is not
    valid.

    Args:
        trackName: Track name used to look up the ``TrackInfo``.
    """
    trackInfo = TrackInfo(self._genome, trackName)
    if not trackInfo.isValid():
        return

    trackInfo.subTrackCount = 1
    trackInfo.store()