def processCursor(cursor, constructObjectFunc, onIterationFunc=None, cursorSize=None, getCurrentIterationFunc=None):
    """Walk a cursor, build one object per item, and always close the cursor.

    The function works in one of two modes:

    * ``onIterationFunc is None``: every non-None object produced by
      ``constructObjectFunc`` is collected and the list is returned.
    * ``onIterationFunc`` given: each object is passed to the callback and
      nothing is collected, so the return value is always None.

    The callback is invoked as
    ``onIterationFunc(iteration, endIteration, finished, obj, 'base')``:
    once before the first item (``0, cursorSize, False, None``), once per
    item, and once after the loop with ``finished`` set to True.  If a
    per-item call returns exactly ``False`` the loop stops early.

    Args:
        cursor: iterable with a ``close()`` method; None short-circuits.
        constructObjectFunc: called with each raw cursor item.
        onIterationFunc: optional progress/consumer callback (see above).
        cursorSize: optional total used to derive ``endIteration``.  When
            given, the first iteration number seen is used to rebase all
            later iteration numbers so that they start at 1.
        getCurrentIterationFunc: called with each constructed object (only
            when ``onIterationFunc`` is given) to obtain its iteration
            number; returning None skips the item.  Defaults to a counter
            that yields 1, 2, 3, ...

    Returns:
        A non-empty list of constructed objects, or None.
    """
    # NOTE(review): this file contains a second definition of processCursor
    # further down; at import time the later definition rebinds the name.
    if cursor is None:
        return None

    try:
        stopwatch = Timer()
        collected = []

        if getCurrentIterationFunc is None:
            callCount = [0]

            def countCalls(unusedObject):
                callCount[0] += 1
                return callCount[0]

            getCurrentIterationFunc = countCalls

        hasCallback = onIterationFunc is not None
        stoppedEarly = False
        position = 0
        offset = 0
        lastPosition = cursorSize
        boundsKnown = False

        if hasCallback:
            onIterationFunc(offset, lastPosition, False, None, 'base')

        for rawItem in cursor:
            built = constructObjectFunc(rawItem)

            if not hasCallback:
                # Objects are only accumulated when there is no callback;
                # with a callback we may be streaming millions of rows
                # (more than fits in memory).
                if built is not None:
                    collected.append(built)
                continue

            position = getCurrentIterationFunc(built)
            if position is None:
                continue

            if cursorSize is not None and not boundsKnown:
                # Iterations don't have to be 0 indexed.
                offset = position - 1
                lastPosition = cursorSize - offset
                boundsKnown = True

            if boundsKnown:
                position -= offset

            keepGoing = onIterationFunc(position, lastPosition, False, built, 'base')
            if keepGoing is False:
                stoppedEarly = True
                break

        # Signal that we're finished.
        if hasCallback:
            if not stoppedEarly:
                position = lastPosition

            onIterationFunc(position, lastPosition, True, None, 'base')

            processedMs = stopwatch.time_since_constructed
            logger.info('Successfully processed cursor in %dms' % processedMs)

        elapsedMs = stopwatch.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' % (len(collected), stopwatch.__hash__(), elapsedMs))

        if not collected:
            return None
        return collected
    finally:
        cursor.close()
def processCursor(cursor, constructObjectFunc, onIterationFunc=None, cursorSize=None, getCurrentIterationFunc=None):
    """Consume a cursor item by item and close it when done.

    Without ``onIterationFunc`` the non-None results of
    ``constructObjectFunc(item)`` are gathered into a list.  With
    ``onIterationFunc`` each constructed object is instead handed to the
    callback as ``onIterationFunc(iteration, endIteration, finished, obj,
    'base')`` and nothing is gathered.

    Callback protocol (only when ``onIterationFunc`` is not None):
        1. One call before iterating: ``(0, cursorSize, False, None, 'base')``.
        2. One call per item whose iteration number is not None.  A return
           value that is exactly ``False`` aborts the loop.
        3. One closing call with ``finished`` True and ``obj`` None.  Its
           iteration is ``endIteration`` unless the loop was aborted, in
           which case it is the iteration of the aborting item.

    Args:
        cursor: iterable exposing ``close()``.  None returns None at once.
        constructObjectFunc: maps a raw cursor item to an object.
        onIterationFunc: optional callback as described above.
        cursorSize: optional size; when set, iteration numbers are shifted
            so the first one seen becomes 1 and ``endIteration`` becomes
            ``cursorSize`` minus that shift.
        getCurrentIterationFunc: optional function returning the iteration
            number of a constructed object (None means "skip this item").
            When omitted a 1-based call counter is used.

    Returns:
        The list of gathered objects, or None when that list is empty
        (which is always the case when ``onIterationFunc`` is given).
    """
    # NOTE(review): an earlier definition of processCursor exists in this
    # file; this later one is the binding that remains after import.
    if cursor is None:
        return None

    try:
        clock = Timer()

        if getCurrentIterationFunc is None:
            tally = [0]

            def nextTally(_ignored):
                tally[0] += 1
                return tally[0]

            getCurrentIterationFunc = nextTally

        if onIterationFunc is None:
            # No callback: keep every non-None object for the caller.
            gathered = []
            for row in cursor:
                candidate = constructObjectFunc(row)
                if candidate is not None:
                    gathered.append(candidate)
        else:
            # Callback mode: nothing is kept, because we may be processing
            # millions of rows (more than we can fit in memory).
            gathered = []
            aborted = False
            current = 0
            shift = 0
            final = cursorSize
            shiftResolved = False

            onIterationFunc(shift, final, False, None, 'base')

            for row in cursor:
                candidate = constructObjectFunc(row)

                current = getCurrentIterationFunc(candidate)
                if current is None:
                    continue

                if cursorSize is not None and not shiftResolved:
                    # Iterations don't have to be 0 indexed.
                    shift = current - 1
                    final = cursorSize - shift
                    shiftResolved = True

                if shiftResolved:
                    current -= shift

                if onIterationFunc(current, final, False, candidate, 'base') is False:
                    aborted = True
                    break

            # Signal that we're finished.
            if not aborted:
                current = final
            onIterationFunc(current, final, True, None, 'base')

            callbackMs = clock.time_since_constructed
            logger.info('Successfully processed cursor in %dms' % callbackMs)

        totalMs = clock.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' % (len(gathered), clock.__hash__(), totalMs))

        return gathered if gathered else None
    finally:
        cursor.close()
def cursorItemsFromCache(instanceId,
                         getCollectionFunc,
                         placeId=None,
                         epochMsStartRange=None,
                         epochMsEndRange=None,
                         pageNum=None,
                         pageSize=None,
                         typeSpecificQuery=None,
                         projection=None,
                         sortByTimestamp=None,
                         typeSpecificHint=None):
    """Build a MongoDB cursor over cached items for one instance.

    The query filters on ``timestamp`` (``$gte`` start, ``$lt`` end) and,
    when ``placeId`` is given, on ``geocode.providerId`` and
    ``geocode.placeId``.  ``typeSpecificQuery`` is merged in last and can
    therefore override those keys.

    Args:
        instanceId: passed to ``getCollectionFunc``; asserted non-None.
        getCollectionFunc: returns the collection to query; asserted
            non-None.
        placeId: optional mapping with ``'providerId'`` and ``'placeId'``
            keys.
        epochMsStartRange: optional inclusive lower timestamp bound.
        epochMsEndRange: optional exclusive upper timestamp bound.  Both
            bounds are first passed through ``fixEpochMsRange``.
        pageNum, pageSize: when both are given the cursor skips
            ``pageSize * pageNum`` documents and is limited to ``pageSize``.
        typeSpecificQuery: optional extra query dict merged into the filter.
        projection: optional object with ``do_query`` and ``projection``
            attributes.  If ``do_query`` is False no query is made.
        sortByTimestamp: sort ascending by ``timestamp``; None means True.
        typeSpecificHint: optional index hint that replaces the default one.

    Returns:
        The cursor, with an extra ``upper_bound_timestamp`` attribute set to
        the end bound (or ``getEpochMs()`` when there is no end bound), or
        None when ``projection.do_query`` is False.
    """
    # NOTE(review): a later definition of cursorItemsFromCache in this file
    # passes timeout=False to find(); this one does not.  Confirm which of
    # the two is intended.
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    upperBoundTimestamp = getEpochMs() if epochMsEndRange is None else epochMsEndRange

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    querySummary = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (
        instanceId, placeId, epochMsStartRange, epochMsEndRange,
        pageNum, pageSize, typeSpecificQuery, projection)

    stopwatch = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (stopwatch.__hash__(), querySummary))

    # Timestamp window: [start, end).
    timestampBounds = {}
    if epochMsEndRange is not None:
        timestampBounds['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timestampBounds['$gte'] = epochMsStartRange
    hasTimestampFilter = bool(timestampBounds)

    query = {}
    if hasTimestampFilter:
        query['timestamp'] = timestampBounds
    if placeId is not None:
        query['geocode.providerId'] = placeId['providerId']
        query['geocode.placeId'] = placeId['placeId']

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId,
    # so pick the index explicitly unless the caller supplied a hint.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    else:
        hintFields = []
        if placeId is not None:
            hintFields.append(('geocode.placeId', pymongo.ASCENDING))
        if hasTimestampFilter:
            hintFields.append(('timestamp', pymongo.ASCENDING))
        hint = hintFields or None

    if typeSpecificQuery is not None:
        query.update(typeSpecificQuery)

    if projection is None:
        cursor = collection.find(query)
    else:
        cursor = collection.find(query, projection.projection)
    cursor = cursor.hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        cursor = cursor.skip(pageSize * pageNum)
        cursor = cursor.limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    setupMs = stopwatch.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' % (setupMs, querySummary))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
def cursorItemsFromCache(instanceId, getCollectionFunc, placeId=None,
                         epochMsStartRange=None, epochMsEndRange=None,
                         pageNum=None, pageSize=None,
                         typeSpecificQuery=None, projection=None,
                         sortByTimestamp=None, typeSpecificHint=None):
    """Set up (but do not iterate) a MongoDB cursor over cached items.

    Steps, in order:
        1. Normalise the time range with ``fixEpochMsRange``.
        2. Return None straight away if ``projection.do_query`` is False.
        3. Fetch the collection via ``getCollectionFunc(instanceId)``.
        4. Build the filter from the time range, ``placeId`` and
           ``typeSpecificQuery`` (merged last).
        5. Choose an index hint, unless ``typeSpecificHint`` is supplied.
        6. Call ``find(..., timeout=False)``, then apply hint, optional
           ascending timestamp sort and optional paging.

    Args:
        instanceId: identifier handed to ``getCollectionFunc``; asserted
            to be non-None.
        getCollectionFunc: callable returning the collection; asserted to
            be non-None.
        placeId: optional mapping providing ``'placeId'`` and
            ``'providerId'``.
        epochMsStartRange: optional lower bound, matched with ``$gte``.
        epochMsEndRange: optional upper bound, matched with ``$lt``.
        pageNum, pageSize: paging is applied only when both are given
            (``skip(pageSize * pageNum).limit(pageSize)``).
        typeSpecificQuery: optional dict merged into the filter.
        projection: optional object exposing ``do_query`` and
            ``projection``.
        sortByTimestamp: defaults to True when None.
        typeSpecificHint: optional hint used instead of the computed one.

    Returns:
        The configured cursor carrying an ``upper_bound_timestamp``
        attribute, or None when the projection disables the query.
    """
    # NOTE(review): an earlier definition of cursorItemsFromCache in this
    # file calls find() without timeout=False; this later definition is the
    # one left bound after import.
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    if epochMsEndRange is not None:
        upperBoundTimestamp = epochMsEndRange
    else:
        upperBoundTimestamp = getEpochMs()

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    description = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (
        instanceId,
        placeId,
        epochMsStartRange,
        epochMsEndRange,
        pageNum,
        pageSize,
        typeSpecificQuery,
        projection,
    )

    clock = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (clock.__hash__(), description))

    timeWindow = {}
    if epochMsEndRange is not None:
        timeWindow['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timeWindow['$gte'] = epochMsStartRange

    filtersOnTime = len(timeWindow) > 0
    filtersOnPlace = placeId is not None

    criteria = {}
    if filtersOnTime:
        criteria['timestamp'] = timeWindow
    if filtersOnPlace:
        criteria['geocode.placeId'] = placeId['placeId']
        criteria['geocode.providerId'] = placeId['providerId']

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    elif filtersOnTime and filtersOnPlace:
        hint = [('geocode.placeId', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)]
    elif filtersOnTime:
        hint = [('timestamp', pymongo.ASCENDING)]
    elif filtersOnPlace:
        hint = [('geocode.placeId', pymongo.ASCENDING)]
    else:
        hint = None

    if typeSpecificQuery is not None:
        criteria.update(typeSpecificQuery)

    if projection is None:
        findArgs = (criteria,)
    else:
        findArgs = (criteria, projection.projection)
    # NOTE(review): timeout=False is the PyMongo 2.x spelling for disabling
    # the server-side idle cursor timeout -- confirm against the installed
    # PyMongo version.
    cursor = collection.find(*findArgs, timeout=False).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageSize is not None and pageNum is not None:
        documentsToSkip = pageSize * pageNum
        cursor = cursor.skip(documentsToSkip).limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    elapsedMs = clock.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' % (elapsedMs, description))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor