    def splitWorkload(self):
        """Divide the workload up into segments for skew analysis.

        Scans every session in self.state.workload for the earliest
        'start_time' and the latest 'end_time', then asks
        getSessionSegment() for the segment index of each session and appends
        the session to the matching bucket of self.workload_segments (a list
        holding one list per segment, self.state.skew_segments in total).

        Raises AssertionError if no start/end time could be found (e.g. the
        workload is empty) or if getSessionSegment() returns an index outside
        of [0, self.state.skew_segments).
        """

        workload = self.state.workload

        # Find the overall time window covered by the workload
        start_time = None
        end_time = None
        for sess in workload:
            if start_time is None or start_time > sess['start_time']:
                start_time = sess['start_time']
            if end_time is None or end_time < sess['end_time']:
                end_time = sess['end_time']
        ## FOR
        assert start_time is not None,\
            "Failed to find start time in %d sessions" % len(workload)
        assert end_time is not None,\
            "Failed to find end time in %d sessions" % len(workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)

        num_segments = self.state.skew_segments
        self.workload_segments = [ [] for _ in xrange(num_segments) ]

        # Tracks how many sessions landed in each segment. It is only used to
        # make the assertion message below more informative.
        segment_h = Histogram()
        for sess in workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert 0 <= idx < num_segments,\
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            # Without this the buckets created above would always stay empty
            self.workload_segments[idx].append(sess)
        ## FOR
def print_stats(host, port, w_db, w_col):
    """Log summary statistics for the sessions in the workload collection.

    The arguments are handed straight to initDB(). The sessions are then read
    through the module-level workload_db / workload_col names.

    Logged output:
      * the number of sessions,
      * the min / max / avg number of operations per session
        (min and max are reported as 0 and avg as None when there are no
        sessions),
      * the number of operations for each operation type.
    """
    print("")
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print("")

    #start connection and set global variables...
    connection = initDB(host, port, w_db, w_col)

    LOG.info("=" * 50)

    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)
    LOG.info("Number of operations per session:")

    # Number of operations in each session, one entry per session
    vals = []

    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        operations = session['operations']
        for op in operations:
            # NOTE(review): assumes every operation keeps its type under the
            # 'type' key -- confirm against the workload schema
            typeCnts.put(op['type'])
        ## FOR
        vals.append(len(operations))
    ## FOR

    minOpCnt = min(vals) if vals else 0
    maxOpCnt = max(vals) if vals else 0
    avgOpCnt = None
    if vals:
        # The parentheses matter: '%' and '/' share the same precedence, so
        # without them the string would be formatted first and then divided.
        avgOpCnt = "%.2f" % (float(sum(vals)) / len(vals))

    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))

    LOG.info("Number of operations by type:")
    # NOTE(review): this relies on Histogram.values() returning the distinct
    # items that were put() (not their counts) -- confirm in Histogram
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR

    LOG.info("=" * 50)

    def calculateSessions(self):
        """Find the operation pairs that should be treated as session boundaries.

        Outliers are calculated using the quartile method
        http://en.wikipedia.org/wiki/Quartile#Computing_methods

        Each entry of every list in self.sessOps is read as
        (op0, op1, time difference). A pair is an outlier when its time
        difference is at least upperQuartile + 1.5 * (upperQuartile -
        lowerQuartile). The (query_hash, query_hash) tuples of the outliers
        are counted, and the tuples whose count reaches the upper quartile of
        those counts are added to self.sessionBoundaries. When self.randomize
        is set, a random number of additional counts is forced in as well.

        Returns early, leaving self.sessionBoundaries untouched, if the
        quartiles of the time differences cannot be computed.
        """
        if self.debug:
            LOG.debug("Calculating time difference for operations in %d sessions" % len(self.sessOps))

        # Get the full list of all the time differences
        allDiffs = [ ]
        for clientOps in self.sessOps.values():
            allDiffs.extend(x[-1] for x in clientOps)
        allDiffs.sort()

        # Lower + Upper Quartiles
        lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)
        if lowerQuartile is None or upperQuartile is None:
            LOG.warn("Null quartiles! Can't continue!")
            # The arithmetic below would blow up on None
            return

        # Interquartile Range
        iqr = (upperQuartile - lowerQuartile) * 1.5
        if self.debug:
            LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)
            LOG.debug("  IQR: %s" % iqr)

        # Go through operations for each client and identify the
        # pairs of operations that are above the IQR in the upperQuartile
        opHist = Histogram()
        threshold = upperQuartile + iqr
        for clientOps in self.sessOps.values():
            for op0, op1, opDiff in clientOps:
                if opDiff >= threshold:
                    opHist.put((op0["query_hash"], op1["query_hash"]))
            ## FOR
        ## FOR
        if self.debug:
            LOG.debug("Outlier Op Hashes:\n%s" % opHist)

        # I guess at this point we can just compute the outliers
        # again for the pairs of operations that have a time difference
        # outlier. We won't use the IQR. We'll just take the upper quartile
        # because that seems to give us the right answer
        outlierCounts = sorted(opHist.getCounts())
        lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)
        if self.debug:
            LOG.debug("Calculating stats for %d count outliers" % len(outlierCounts))
            LOG.debug("  Lower Quartile: %s" % lowerQuartile)
            LOG.debug("  Upper Quartile: %s" % upperQuartile)

        # If we're doing this randomly, force somewhere between one and 10%
        # of the outlier counts to be chosen regardless of the quartile.
        # 'force' must be bound on both branches because the loop below
        # decrements it unconditionally.
        if self.randomize:
            num_outliers = len(outlierCounts)
            max_force = int(num_outliers * 0.10)
            # randint(1, 0) raises ValueError, so fall back to a single
            # forced outlier when 10% rounds down to zero
            force = 1 if max_force <= 1 else random.randint(1, max_force)
            LOG.warn("Forcing %d random outliers out of %d to be chosen from workload", force, num_outliers)
        else:
            force = 0

        for cnt in outlierCounts:
            if cnt >= upperQuartile or (self.randomize and force > 0):
                self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
                force -= 1
        ## FOR
        LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
    def sessionizeWorkload(self):
        """
            Split the Sessions based on the gap between operation times

            Every Session fetched from self.metadata_db is first fed to a
            Sessionizer through process(). Each of those original sessions is
            then handed to sessionize(), the sessions it returns are saved,
            and statistics from before and after the split are logged.

            Raises AssertionError if the operations of the new sessions do
            not add up to the operation count of the session that they were
            split from. Errors raised while saving a new session are logged
            and re-raised.
        """
        LOG.info("Sessionizing sample workload")

        s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

        # We first feed in all of the operations in for each session
        nextSessId = -1
        origTotal = 0
        origHistogram = Histogram()
        sessions = []
        for sess in self.metadata_db.Session.fetch():
            s.process(sess['session_id'], sess['operations'])
            nextSessId = max(nextSessId, sess['session_id'])
            numOps = len(sess['operations'])
            origHistogram.put(numOps)
            origTotal += numOps
            # Remember the original sessions so that the second pass below
            # only iterates over them
            sessions.append(sess)
        ## FOR
        nextSessId += 1

        avg_ops = 0 if origHistogram.getSampleCount() == 0 else (
            origTotal / float(origHistogram.getSampleCount()))
        LOG.info("BEFORE Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f\n" +
                 "  Next Session Id: %d", \
                 origHistogram.getSampleCount(), \
                 avg_ops, nextSessId)

        # Then split them into separate sessions
        # NOTE(review): if the Sessionizer has to compute its session
        # boundaries in a separate step before sessionize() can be used, that
        # call is missing here -- confirm against the Sessionizer class
        newTotal = 0
        newHistogram = Histogram()

        # We have to do this because otherwise we will start to process
        # the new sessions that we just inserted... I know...
        for sess in sessions:
            newSessions = s.sessionize(sess, nextSessId)
            nextSessId += len(newSessions)

            # And then add all of our new sessions
            # Count the number of operations so that can see the change
            if self.debug:
                LOG.debug("Split Session %d [%d ops] into %d separate sessions", \
                          sess['session_id'], len(sess['operations']), len(newSessions))
            totalOps = 0
            for newSess in newSessions:
                # NOTE(review): assumes the new session objects expose
                # save() -- confirm against the Session document class
                try:
                    newSess.save()
                except Exception:
                    LOG.error("Unexpected error when saving new Session\n%s",
                              newSess)
                    raise
                newOpCtr = len(newSess['operations'])
                totalOps += newOpCtr
                newHistogram.put(newOpCtr)
                if self.debug:
                    LOG.debug("Session %d -> %d Ops" %
                              (newSess['session_id'], newOpCtr))
            ## FOR
            # Make sure that all of our operations end up in a session
            assert len(sess['operations']) == totalOps, \
                "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
            newTotal += totalOps

            # Mark the original session as deletable
            # deletable.append(sess)
        ## FOR

        # The AFTER numbers have to come from the new sessions, not from the
        # original ones
        avg_ops = 0 if newHistogram.getSampleCount() == 0 else (
            newTotal / float(newHistogram.getSampleCount()))
        LOG.info("AFTER Sessionization\n" +
                 "  # of Sessions: %d\n" +
                 "  Avg Ops per Session: %.2f", \
                 newHistogram.getSampleCount(), \
                 avg_ops)
        if self.debug:
            LOG.debug("Ops per Session\n%s" % newHistogram)
    ## DEF