Пример #1
0
def processCursor(cursor,
                  constructObjectFunc,
                  onIterationFunc=None,
                  cursorSize=None,
                  getCurrentIterationFunc=None):
    """Drain ``cursor``, building one object per item, then close it.

    The function works in one of two modes:

    * Collecting (``onIterationFunc`` is None): every non-None object
      returned by ``constructObjectFunc(item)`` is appended to a list.
    * Reporting (``onIterationFunc`` given): every object is handed to
      ``onIterationFunc(iteration, endIteration, isFinal, obj, 'base')``
      and nothing is accumulated, so arbitrarily large cursors can be
      streamed without holding all rows in memory.

    In reporting mode the callback is also invoked once before the first
    item (iteration 0, ``obj`` None) and once after the loop with
    ``isFinal`` set to True.  A callback result that ``is False`` stops
    the loop early.

    Args:
        cursor: Iterable with a ``close()`` method; may be None.
        constructObjectFunc: Called with each raw item from the cursor.
        onIterationFunc: Optional progress/consumer callback (see above).
        cursorSize: Optional size used to derive ``endIteration``.  When
            given, reported iterations are shifted so that the first
            reported item has iteration 1.
        getCurrentIterationFunc: Optional function mapping a constructed
            object to its iteration value; returning None skips the
            item.  Defaults to a simple 1-based counter.

    Returns:
        The list of collected objects, or None when the cursor is None or
        nothing was collected (always the case in reporting mode).
    """
    if cursor is None:
        return None

    try:
        timer = Timer()
        collected = []

        if getCurrentIterationFunc is None:
            # One-element list so the nested function can mutate the count.
            seenCount = [0]

            def countItems(obj):
                seenCount[0] += 1
                return seenCount[0]

            getCurrentIterationFunc = countItems

        if onIterationFunc is None:
            for rawItem in cursor:
                built = constructObjectFunc(rawItem)
                if built is not None:
                    collected.append(built)
        else:
            stoppedEarly = False
            position = 0
            offset = None  # Stays None until the first reported item.
            lastPosition = cursorSize

            onIterationFunc(0, lastPosition, False, None, 'base')

            for rawItem in cursor:
                built = constructObjectFunc(rawItem)

                position = getCurrentIterationFunc(built)
                if position is None:
                    continue

                if cursorSize is not None:
                    if offset is None:
                        # Iterations don't have to be 0 indexed, so rebase
                        # everything on the first value we see.
                        offset = position - 1
                        lastPosition = cursorSize - offset
                    position -= offset

                outcome = onIterationFunc(position, lastPosition, False,
                                          built, 'base')
                if outcome is False:
                    stoppedEarly = True
                    break

            # Signal that we're finished.
            if not stoppedEarly:
                position = lastPosition
            onIterationFunc(position, lastPosition, True, None, 'base')

        elapsedProcessing = timer.time_since_constructed
        logger.info('Successfully processed cursor in %dms' %
                    elapsedProcessing)

        elapsedTotal = timer.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' %
                    (len(collected), timer.__hash__(), elapsedTotal))

        return collected if collected else None
    finally:
        cursor.close()
Пример #2
0
def processCursor(cursor, constructObjectFunc, onIterationFunc=None, cursorSize=None, getCurrentIterationFunc=None):
    """Iterate over ``cursor``, construct an object per item and close it.

    Without ``onIterationFunc`` the non-None results of
    ``constructObjectFunc(item)`` are gathered into a list.  With
    ``onIterationFunc`` each constructed object is passed to
    ``onIterationFunc(iteration, endIteration, isFinal, obj, 'base')``
    instead and nothing is gathered, which keeps memory use flat when the
    cursor holds more rows than fit in memory.

    The callback is additionally called once up front (iteration 0, no
    object) and once at the end with ``isFinal`` True.  If the callback
    returns exactly ``False`` the loop stops early.

    Args:
        cursor: Iterable exposing ``close()``; None is tolerated.
        constructObjectFunc: Converts a raw cursor item into an object.
        onIterationFunc: Optional per-item callback.
        cursorSize: Optional size; when given, iterations are rebased so
            the first reported item is iteration 1 and ``endIteration`` is
            adjusted by the same offset.
        getCurrentIterationFunc: Optional function returning the iteration
            value for an object (None means "skip this item").  A 1-based
            running counter is used when omitted.

    Returns:
        A non-empty list of constructed objects, otherwise None.
    """
    if cursor is None:
        return None

    try:
        timer = Timer()
        harvested = []

        if getCurrentIterationFunc is None:
            tally = [0]  # Mutable cell shared with the nested counter.

            def nextTally(obj):
                tally[0] = tally[0] + 1
                return tally[0]

            getCurrentIterationFunc = nextTally

        reporting = onIterationFunc is not None
        aborted = False
        boundsKnown = False
        firstStepOffset = 0
        step = 0
        finalStep = cursorSize

        if reporting:
            onIterationFunc(firstStepOffset, finalStep, False, None, 'base')

        for row in cursor:
            candidate = constructObjectFunc(row)

            if not reporting:
                # Results are only kept when nobody is consuming them via
                # the callback; see the docstring for the reason.
                if candidate is not None:
                    harvested.append(candidate)
                continue

            step = getCurrentIterationFunc(candidate)
            if step is None:
                continue

            if cursorSize is not None:
                if not boundsKnown:
                    # Iterations don't have to be 0 indexed.
                    firstStepOffset = step - 1
                    finalStep = cursorSize - firstStepOffset
                    boundsKnown = True
                step = step - firstStepOffset

            keepGoing = onIterationFunc(step, finalStep, False, candidate, 'base')
            if keepGoing is False:
                aborted = True
                break

        # Signal that we're finished.
        if reporting:
            if not aborted:
                step = finalStep
            onIterationFunc(step, finalStep, True, None, 'base')

        processingMs = timer.time_since_constructed
        logger.info('Successfully processed cursor in %dms' % processingMs)

        totalMs = timer.time_since_constructed
        logger.info('Successfully read %d items from cache (%d) in %dms' % (len(harvested), timer.__hash__(), totalMs))

        if harvested:
            return harvested
        return None
    finally:
        cursor.close()
Пример #3
0
def cursorItemsFromCache(instanceId,
                         getCollectionFunc,
                         placeId=None,
                         epochMsStartRange=None,
                         epochMsEndRange=None,
                         pageNum=None,
                         pageSize=None,
                         typeSpecificQuery=None,
                         projection=None,
                         sortByTimestamp=None,
                         typeSpecificHint=None):
    """Set up a MongoDB cursor over the cached items of one instance.

    The query is assembled from the optional filters and run against the
    collection returned by ``getCollectionFunc(instanceId)``.

    Args:
        instanceId: Identifier passed to ``getCollectionFunc``; must not
            be None.
        getCollectionFunc: Callable returning the collection to query;
            must not be None.
        placeId: Optional mapping with ``'providerId'`` and ``'placeId'``
            keys, matched against ``geocode.providerId`` and
            ``geocode.placeId``.
        epochMsStartRange: Optional inclusive lower bound (``$gte``) on
            ``timestamp``, after being passed through ``fixEpochMsRange``.
        epochMsEndRange: Optional exclusive upper bound (``$lt``) on
            ``timestamp``, after being passed through ``fixEpochMsRange``.
        pageNum: Page index; only used together with ``pageSize``.
        pageSize: Page length; only used together with ``pageNum``.
        typeSpecificQuery: Optional extra filter entries merged into the
            query last, so they override the generated ones on key clash.
        projection: Optional object with ``do_query`` and ``projection``
            attributes.  If ``do_query`` is False no query is made.
        sortByTimestamp: Sort ascending by ``timestamp``; None means True.
        typeSpecificHint: Optional index hint replacing the generated one.

    Returns:
        The configured cursor, carrying an extra ``upper_bound_timestamp``
        attribute, or None when ``projection.do_query`` is False.
    """
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(
        epochMsStartRange, epochMsEndRange)

    upperBoundTimestamp = (getEpochMs()
                           if epochMsEndRange is None else epochMsEndRange)

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    querySummary = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (
        instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum,
        pageSize, typeSpecificQuery, projection)

    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' %
                (timer.__hash__(), querySummary))

    # Timestamp bounds; left empty when neither end of the range is set.
    timeBounds = {}
    if epochMsEndRange is not None:
        timeBounds['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timeBounds['$gte'] = epochMsStartRange

    query = {}
    if timeBounds:
        query['timestamp'] = timeBounds
    if placeId is not None:
        query['geocode.providerId'] = placeId['providerId']
        query['geocode.placeId'] = placeId['placeId']

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId,
    # so an explicit hint is supplied for the fields being filtered on.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    else:
        hintFields = []
        if placeId is not None:
            hintFields.append(('geocode.placeId', pymongo.ASCENDING))
        if timeBounds:
            hintFields.append(('timestamp', pymongo.ASCENDING))
        hint = hintFields or None

    if typeSpecificQuery is not None:
        query.update(typeSpecificQuery)

    findArgs = [query]
    if projection is not None:
        findArgs.append(projection.projection)
    cursor = collection.find(*findArgs).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if pageNum is not None and pageSize is not None:
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    setupMs = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' %
                (setupMs, querySummary))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor
Пример #4
0
def cursorItemsFromCache(instanceId, getCollectionFunc, placeId=None, epochMsStartRange=None, epochMsEndRange=None, pageNum=None, pageSize=None, typeSpecificQuery=None, projection=None, sortByTimestamp=None, typeSpecificHint=None):
    """Create a MongoDB cursor over cached items for ``instanceId``.

    A filter document is built from the optional timestamp range, place
    and type-specific query, then executed with ``find`` on the collection
    returned by ``getCollectionFunc(instanceId)``.  ``find`` is called with
    ``timeout=False``.
    NOTE(review): presumably this is the legacy pymongo option that stops
    the server from timing the cursor out -- confirm against the pymongo
    version in use.

    Args:
        instanceId: Passed to ``getCollectionFunc``; must not be None.
        getCollectionFunc: Returns the collection to query; must not be
            None.
        placeId: Optional mapping with ``'placeId'`` and ``'providerId'``
            keys, matched against the ``geocode.*`` fields.
        epochMsStartRange: Optional ``$gte`` bound on ``timestamp`` (after
            ``fixEpochMsRange``).
        epochMsEndRange: Optional ``$lt`` bound on ``timestamp`` (after
            ``fixEpochMsRange``).
        pageNum: Page index, applied only when ``pageSize`` is also given.
        pageSize: Page length, applied only when ``pageNum`` is also given.
        typeSpecificQuery: Optional filter entries merged in last.
        projection: Optional object exposing ``do_query`` and
            ``projection``; ``do_query`` being False short-circuits.
        sortByTimestamp: Ascending ``timestamp`` sort; None means True.
        typeSpecificHint: Optional hint used instead of the generated one.

    Returns:
        The cursor with an added ``upper_bound_timestamp`` attribute, or
        None if ``projection.do_query`` is False.
    """
    if sortByTimestamp is None:
        sortByTimestamp = True

    epochMsStartRange, epochMsEndRange = fixEpochMsRange(epochMsStartRange, epochMsEndRange)

    if epochMsEndRange is not None:
        upperBoundTimestamp = epochMsEndRange
    else:
        upperBoundTimestamp = getEpochMs()

    if projection is not None and projection.do_query is False:
        return None

    assert instanceId is not None
    assert getCollectionFunc is not None
    collection = getCollectionFunc(instanceId)

    description = 'IN:%s, P:%s, ES:%s, EE:%s, PN:%s, PS:%s, T:%s, P:%s' % (instanceId, placeId, epochMsStartRange, epochMsEndRange, pageNum, pageSize, typeSpecificQuery, projection)

    timer = Timer()
    logger.info('Attempting to read items from cache (%d) -- %s' % (timer.__hash__(), description))

    hasPlace = placeId is not None

    timestampClause = {}
    if epochMsEndRange is not None:
        timestampClause['$lt'] = epochMsEndRange
    if epochMsStartRange is not None:
        timestampClause['$gte'] = epochMsStartRange
    hasTimestampClause = len(timestampClause) > 0

    filterDoc = {}
    if hasTimestampClause:
        filterDoc['timestamp'] = timestampClause
    if hasPlace:
        filterDoc['geocode.placeId'] = placeId['placeId']
        filterDoc['geocode.providerId'] = placeId['providerId']

    # MongoDB sometimes gets it wrong, particularly with geocode.placeId.
    if typeSpecificHint is not None:
        hint = typeSpecificHint
    elif hasTimestampClause and hasPlace:
        hint = [('geocode.placeId', pymongo.ASCENDING), ('timestamp', pymongo.ASCENDING)]
    elif hasTimestampClause:
        hint = [('timestamp', pymongo.ASCENDING)]
    elif hasPlace:
        hint = [('geocode.placeId', pymongo.ASCENDING)]
    else:
        hint = None

    if typeSpecificQuery is not None:
        filterDoc.update(typeSpecificQuery)

    if projection is None:
        positional = (filterDoc,)
    else:
        positional = (filterDoc, projection.projection)
    cursor = collection.find(*positional, timeout=False).hint(hint)

    if sortByTimestamp:
        cursor = cursor.sort([('timestamp', pymongo.ASCENDING)])

    if not (pageSize is None or pageNum is None):
        cursor = cursor.skip(pageSize * pageNum).limit(pageSize)

    # We use this to calculate progress through the cursor,
    # It is more efficient than using cursor.count.
    cursor.upper_bound_timestamp = upperBoundTimestamp

    elapsedMs = timer.time_since_constructed
    logger.info('Successfully setup cursor in %dms -- %s' % (elapsedMs, description))

    if Configuration.MONGO_EXPLAINS_ENABLED:
        logger.critical('Tweet/User Explain: %s' % unicode(cursor.explain()))

    return cursor