class RasterMapIndexer(object): def __init__(self, opts): self.opts = opts self.subscriber = ZmqSubscriber( **ZmqSubscriber.getOptionValues(self.opts)) self.cacheDir = os.path.join(DATA_PATH, 'cache') self.store = None self.delayBox = DelayBox(self.writeOutputTile, maxDelaySeconds=5, numBuckets=10) if opts.timeSeries: timeSeriesList = opts.timeSeries.split(',') print 'indexing only the following time series:' for valueCode in timeSeriesList: print ' %s' % valueCode timeSeriesSet = set(opts.timeSeries.split(',')) else: timeSeriesSet = None self.indexes = {} for m in meta.TIME_SERIES: if 'map' in m: if timeSeriesSet and m['valueCode'] not in timeSeriesSet: continue index = TileIndex(m, self) self.indexes[m['valueCode']] = index def writeOutputTile(self, info): indexName, tileParams = info self.indexes[indexName].writeOutputTile(tileParams) def start(self): self.store = LruCacheStore(FileStore(self.cacheDir), MAX_TILES_IN_MEMORY) self.delayBox.start() self.subscriber.start() for index in self.indexes.itervalues(): print print '###################################################' print '# initializing map index:', index.valueCode print '###################################################' index.start() def stop(self): logging.info('cleaning up indexer...') self.store.sync() self.delayBox.stop() for index in self.indexes.itervalues(): index.stop() logging.info(' ... done') def clean(self): plotUtil.rmIfPossible(self.cacheDir) for index in self.indexes.itervalues(): index.clean()
class RasterMapIndexer(object): def __init__(self, opts): self.opts = opts self.subscriber = ZmqSubscriber(**ZmqSubscriber.getOptionValues(self.opts)) self.cacheDir = os.path.join(DATA_PATH, 'cache') self.store = None self.delayBox = DelayBox(self.writeOutputTile, maxDelaySeconds=5, numBuckets=10) if opts.timeSeries: timeSeriesList = opts.timeSeries.split(',') print 'indexing only the following time series:' for valueCode in timeSeriesList: print ' %s' % valueCode timeSeriesSet = set(opts.timeSeries.split(',')) else: timeSeriesSet = None self.indexes = {} for m in meta.TIME_SERIES: if 'map' in m: if timeSeriesSet and m['valueCode'] not in timeSeriesSet: continue index = TileIndex(m, self) self.indexes[m['valueCode']] = index def writeOutputTile(self, info): indexName, tileParams = info self.indexes[indexName].writeOutputTile(tileParams) def start(self): self.store = LruCacheStore(FileStore(self.cacheDir), MAX_TILES_IN_MEMORY) self.delayBox.start() self.subscriber.start() for index in self.indexes.itervalues(): print print '###################################################' print '# initializing map index:', index.valueCode print '###################################################' index.start() def stop(self): logging.info('cleaning up indexer...') self.store.sync() self.delayBox.stop() for index in self.indexes.itervalues(): index.stop() logging.info(' ... done') def clean(self): plotUtil.rmIfPossible(self.cacheDir) for index in self.indexes.itervalues(): index.clean()
def start(self): self.store = LruCacheStore(FileStore(self.cacheDir), MAX_TILES_IN_MEMORY) self.delayBox.start() self.subscriber.start() for index in self.indexes.itervalues(): print print '###################################################' print '# initializing map index:', index.valueCode print '###################################################' index.start()
def start(self):
    """Open the segment store, hook up the live feed, load persisted status,
    and (in queue mode) kick off batch indexing."""
    self.store = LruCacheStore(FileStore(self.cacheDir),
                               SEGMENTS_IN_MEMORY_PER_TIME_SERIES)
    self.delayBox.start()
    if self.subscriber:
        handler = lambda topic, obj: self.handleRecord(obj)
        self.queryManager.subscribeDjango(self.subscriber, handler)
    self.running = True
    self.statusPath = os.path.join(self.segmentDir, 'status.json')
    self.statusStore = plotUtil.JsonStore(self.statusPath)
    defaultStatus = {
        'minTime': None,
        'maxTime': None,
        'numSamples': 0,
        'numSegments': 0,
    }
    self.status = self.statusStore.read(dflt=defaultStatus)
    self.statusStore.write(self.status)
    if self.queueMode:
        self.batchIndex()
def start(self):
    """Initialize the segment store and status file; start batch indexing
    if this index was constructed with batchIndexAtStart."""
    backingStore = FileStore(self.cacheDir)
    self.store = LruCacheStore(backingStore, SEGMENTS_IN_MEMORY_PER_TIME_SERIES)
    self.delayBox.start()
    if self.subscriber:
        def onRecord(topic, obj):
            self.handleRecord(obj)
        self.queryManager.subscribeDjango(self.subscriber, onRecord)
    self.running = True
    self.statusPath = os.path.join(self.segmentDir, 'status.json')
    self.statusStore = plotUtil.JsonStore(self.statusPath)
    self.status = self.statusStore.read(dflt={'minTime': None,
                                              'maxTime': None,
                                              'numSamples': 0,
                                              'numSegments': 0})
    self.statusStore.write(self.status)
    if self.queueMode:
        self.batchIndex()
def test_LruCacheStore(self):
    """Run the generic store test against an LRU cache over a FileStore."""
    def makeStore(path):
        # Small capacity (5) so the LRU eviction path is exercised.
        return LruCacheStore(FileStore(path), 5)
    self.storeTest(makeStore)
class SegmentIndex(object): @classmethod def getKeyFromSegmentIndex(cls, segmentIndex): level, t = segmentIndex return '%s_%s' % (level, t) @classmethod def getSegmentIndexContainingTime(cls, level, posixTimeMs): segmentLength = 2.0 ** level return (level, int(posixTimeMs / segmentLength)) @classmethod def getBucketIndexContainingTime(cls, level, posixTimeMs): segmentLength = 2.0 ** level segVal = posixTimeMs / segmentLength bucketIndex = int((segVal - int(segVal)) * settings.XGDS_PLOT_SEGMENT_RESOLUTION) return bucketIndex @classmethod def getSegmentIndicesContainingTime(cls, posixTimeMs): return [cls.getSegmentIndexContainingTime(level, posixTimeMs) for level in xrange(MIN_SEGMENT_LEVEL, MAX_SEGMENT_LEVEL)] def __init__(self, meta, subscriber, batchIndexAtStart=True): self.meta = meta self.subscriber = subscriber self.queueMode = batchIndexAtStart queryClass = getClassByName(self.meta['queryType']) self.queryManager = queryClass(self.meta) valueClass = getClassByName(self.meta['valueType']) self.valueManager = valueClass(self.meta, self.queryManager) self.valueCode = self.meta['valueCode'] self.cacheDir = os.path.join(DATA_PATH, 'plot', 'cache', self.valueCode) self.segmentDir = os.path.join(DATA_PATH, 'plot', self.valueCode) self.delayBox = DelayBox(self.writeOutputSegment, maxDelaySeconds=5, numBuckets=10) self.queue = deque() self.running = False self.status = None self.statusPath = None self.statusStore = None self.store = None self.batchProcessStartTime = None def start(self): self.store = LruCacheStore(FileStore(self.cacheDir), SEGMENTS_IN_MEMORY_PER_TIME_SERIES) self.delayBox.start() if self.subscriber: self.queryManager.subscribeDjango(self.subscriber, lambda topic, obj: self.handleRecord(obj)) self.running = True self.statusPath = os.path.join(self.segmentDir, 'status.json') self.statusStore = plotUtil.JsonStore(self.statusPath) self.status = self.statusStore.read(dflt={ 'minTime': None, 'maxTime': None, 'numSamples': 0, 'numSegments': 0 }) 
self.statusStore.write(self.status) if self.queueMode: self.batchIndex() def flushStore(self): print '--> flushing store for %s' % self.valueCode if self.running: self.store.sync() self.delayBox.sync() self.statusStore.write(self.status) def stop(self): if self.running: self.flushStore() self.delayBox.stop() self.running = False def handleRecord(self, obj): if self.queueMode: self.queue.append(obj) else: self.indexRecord(obj) def indexRecord(self, obj): if (self.queueMode and (self.status['numSamples'] % BATCH_SLEEP_NUM_SAMPLES) == 0): batchProcessDuration = time.time() - self.batchProcessStartTime if settings.XGDS_PLOT_BATCH_SLEEP_TIME_FACTOR > 0: sleepTime = batchProcessDuration * settings.XGDS_PLOT_BATCH_SLEEP_TIME_FACTOR print 'sleeping for %.3f seconds to avoid overloading server' % sleepTime time.sleep(sleepTime) else: time.sleep(0) self.batchProcessStartTime = time.time() posixTimeMs = self.queryManager.getTimestamp(obj) maxTime = self.status['maxTime'] or -99e+20 if posixTimeMs <= maxTime: print ('skipping old (duplicate?) 
record: posixTimeMs %.3f <= maxTime %.3f' % (posixTimeMs, self.status['maxTime'])) return self.status['maxTime'] = max(maxTime, posixTimeMs) minTime = self.status['minTime'] or 99e+20 self.status['minTime'] = min(minTime, posixTimeMs) self.status['numSamples'] += 1 if self.status['numSamples'] % 100 == 0: print '%d %s segment update' % (self.status['numSamples'], self.valueCode) for segmentIndex in self.getSegmentIndicesContainingTime(posixTimeMs): val = self.valueManager.getValue(obj) if val is None: continue self.addSample(segmentIndex, posixTimeMs, val) self.delayBox.addJob(segmentIndex) def addSample(self, segmentIndex, posixTimeMs, val): segmentKey = self.getKeyFromSegmentIndex(segmentIndex) try: segmentData = self.store[segmentKey] except KeyError: segmentData = self.valueManager.makeSegment() self.status['numSegments'] += 1 level, _t = segmentIndex bucketIndex = self.getBucketIndexContainingTime(level, posixTimeMs) segmentData.addSample(bucketIndex, posixTimeMs, val) self.store[segmentKey] = segmentData def writeJsonWithTmp(self, outPath, obj, styleArgs=None): if styleArgs is None: styleArgs = {} tmpPath = outPath + '.part' json.dump(plotUtil.compactFloats(obj), open(tmpPath, 'wb'), **styleArgs) os.rename(tmpPath, outPath) def writeOutputSegment(self, segmentIndex): level, t = segmentIndex segmentKey = self.getKeyFromSegmentIndex(segmentIndex) segmentData = self.store[segmentKey] outPath = '%s/%d/%d.json' % (self.segmentDir, level, t) outDir = os.path.dirname(outPath) if not os.path.exists(outDir): os.makedirs(outDir) if settings.XGDS_PLOT_PRETTY_PRINT_JSON_SEGMENTS: styleArgs = dict(sort_keys=True, indent=4) else: styleArgs = dict(separators=(',', ':')) self.writeJsonWithTmp(outPath, segmentData.getJsonObj(), styleArgs) def clean(self): # must call this before start() ! 
assert not self.running plotUtil.rmIfPossible(self.cacheDir) plotUtil.rmIfPossible(self.segmentDir) def flushQueue(self): while self.queue: self.indexRecord(self.queue.popleft()) def batchIndex(self): self.batchProcessStartTime = time.time() # index everything in db that comes after the last thing we indexed on # the previous run print '--> batch indexing %s' % self.valueCode while 1: recs = self.queryManager.getData(minTime=self.status['maxTime']) n = recs.count() if n == 0: break print '--> %d %s samples remaining' % (n, self.valueCode) for rec in recs[:BATCH_READ_NUM_SAMPLES]: self.indexRecord(rec) # avoid django debug log memory leak db.reset_queries() self.statusStore.write(self.status) # batch process new records that arrived while we were # processing the database table. print ('--> indexing %d %s samples that came in during batch indexing' % (len(self.queue), self.valueCode)) self.flushQueue() self.flushStore() # switch modes to process each new record as it comes in. print '--> switching to live data mode' self.queueMode = False
class SegmentIndex(object): @classmethod def getKeyFromSegmentIndex(cls, segmentIndex): level, t = segmentIndex return '%s_%s' % (level, t) @classmethod def getSegmentIndexContainingTime(cls, level, posixTimeMs): segmentLength = 2.0**level return (level, int(posixTimeMs / segmentLength)) @classmethod def getBucketIndexContainingTime(cls, level, posixTimeMs): segmentLength = 2.0**level segVal = posixTimeMs / segmentLength bucketIndex = int( (segVal - int(segVal)) * settings.XGDS_PLOT_SEGMENT_RESOLUTION) return bucketIndex @classmethod def getSegmentIndicesContainingTime(cls, posixTimeMs): return [ cls.getSegmentIndexContainingTime(level, posixTimeMs) for level in xrange(MIN_SEGMENT_LEVEL, MAX_SEGMENT_LEVEL) ] def __init__(self, meta, subscriber, batchIndexAtStart=True): self.meta = meta self.subscriber = subscriber self.queueMode = batchIndexAtStart queryClass = getClassByName(self.meta['queryType']) self.queryManager = queryClass(self.meta) valueClass = getClassByName(self.meta['valueType']) self.valueManager = valueClass(self.meta, self.queryManager) self.valueCode = self.meta['valueCode'] self.cacheDir = os.path.join(DATA_PATH, 'plot', 'cache', self.valueCode) self.segmentDir = os.path.join(DATA_PATH, 'plot', self.valueCode) self.delayBox = DelayBox(self.writeOutputSegment, maxDelaySeconds=5, numBuckets=10) self.queue = deque() self.running = False self.status = None self.statusPath = None self.statusStore = None self.store = None self.batchProcessStartTime = None def start(self): self.store = LruCacheStore(FileStore(self.cacheDir), SEGMENTS_IN_MEMORY_PER_TIME_SERIES) self.delayBox.start() if self.subscriber: self.queryManager.subscribeDjango( self.subscriber, lambda topic, obj: self.handleRecord(obj)) self.running = True self.statusPath = os.path.join(self.segmentDir, 'status.json') self.statusStore = plotUtil.JsonStore(self.statusPath) self.status = self.statusStore.read(dflt={ 'minTime': None, 'maxTime': None, 'numSamples': 0, 'numSegments': 0 }) 
self.statusStore.write(self.status) if self.queueMode: self.batchIndex() def flushStore(self): print '--> flushing store for %s' % self.valueCode if self.running: self.store.sync() self.delayBox.sync() self.statusStore.write(self.status) def stop(self): if self.running: self.flushStore() self.delayBox.stop() self.running = False def handleRecord(self, obj): if self.queueMode: self.queue.append(obj) else: self.indexRecord(obj) def indexRecord(self, obj): if (self.queueMode and (self.status['numSamples'] % BATCH_SLEEP_NUM_SAMPLES) == 0): batchProcessDuration = time.time() - self.batchProcessStartTime if settings.XGDS_PLOT_BATCH_SLEEP_TIME_FACTOR > 0: sleepTime = batchProcessDuration * settings.XGDS_PLOT_BATCH_SLEEP_TIME_FACTOR print 'sleeping for %.3f seconds to avoid overloading server' % sleepTime time.sleep(sleepTime) else: time.sleep(0) self.batchProcessStartTime = time.time() posixTimeMs = self.queryManager.getTimestamp(obj) maxTime = self.status['maxTime'] or -99e+20 if posixTimeMs <= maxTime: print( 'skipping old (duplicate?) 
record: posixTimeMs %.3f <= maxTime %.3f' % (posixTimeMs, self.status['maxTime'])) return self.status['maxTime'] = max(maxTime, posixTimeMs) minTime = self.status['minTime'] or 99e+20 self.status['minTime'] = min(minTime, posixTimeMs) self.status['numSamples'] += 1 if self.status['numSamples'] % 100 == 0: print '%d %s segment update' % (self.status['numSamples'], self.valueCode) for segmentIndex in self.getSegmentIndicesContainingTime(posixTimeMs): val = self.valueManager.getValue(obj) if val is None: continue self.addSample(segmentIndex, posixTimeMs, val) self.delayBox.addJob(segmentIndex) def addSample(self, segmentIndex, posixTimeMs, val): segmentKey = self.getKeyFromSegmentIndex(segmentIndex) try: segmentData = self.store[segmentKey] except KeyError: segmentData = self.valueManager.makeSegment() self.status['numSegments'] += 1 level, _t = segmentIndex bucketIndex = self.getBucketIndexContainingTime(level, posixTimeMs) segmentData.addSample(bucketIndex, posixTimeMs, val) self.store[segmentKey] = segmentData def writeJsonWithTmp(self, outPath, obj, styleArgs=None): if styleArgs is None: styleArgs = {} tmpPath = outPath + '.part' json.dump(plotUtil.compactFloats(obj), open(tmpPath, 'wb'), **styleArgs) os.rename(tmpPath, outPath) def writeOutputSegment(self, segmentIndex): level, t = segmentIndex segmentKey = self.getKeyFromSegmentIndex(segmentIndex) segmentData = self.store[segmentKey] outPath = '%s/%d/%d.json' % (self.segmentDir, level, t) outDir = os.path.dirname(outPath) if not os.path.exists(outDir): os.makedirs(outDir) if settings.XGDS_PLOT_PRETTY_PRINT_JSON_SEGMENTS: styleArgs = dict(sort_keys=True, indent=4) else: styleArgs = dict(separators=(',', ':')) self.writeJsonWithTmp(outPath, segmentData.getJsonObj(), styleArgs) def clean(self): # must call this before start() ! 
assert not self.running plotUtil.rmIfPossible(self.cacheDir) plotUtil.rmIfPossible(self.segmentDir) def flushQueue(self): while self.queue: self.indexRecord(self.queue.popleft()) def batchIndex(self): self.batchProcessStartTime = time.time() # index everything in db that comes after the last thing we indexed on # the previous run print '--> batch indexing %s' % self.valueCode while 1: recs = self.queryManager.getData(minTime=self.status['maxTime']) n = recs.count() if n == 0: break print '--> %d %s samples remaining' % (n, self.valueCode) for rec in recs[:BATCH_READ_NUM_SAMPLES]: self.indexRecord(rec) # avoid django debug log memory leak db.reset_queries() self.statusStore.write(self.status) # batch process new records that arrived while we were # processing the database table. print('--> indexing %d %s samples that came in during batch indexing' % (len(self.queue), self.valueCode)) self.flushQueue() self.flushStore() # switch modes to process each new record as it comes in. print '--> switching to live data mode' self.queueMode = False