def print_stats(host, port, w_db, w_col):
    print ""
    LOG.info("..:: MongoDesigner Workload Info ::..")
    print ""

    # Start connection and set global variables...
    connection = initDB(host, port, w_db, w_col)

    LOG.info("=" * 50)
    session_cnt = workload_db[workload_col].find().count()
    LOG.info("Number of sessions: %d", session_cnt)

    LOG.info("Number of operations per session:")
    maxOpCnt = 0
    minOpCnt = sys.maxint
    vals = []
    typeCnts = Histogram()
    for session in workload_db[workload_col].find():
        for op in session['operations']:
            typeCnts.put(op['type'])
        op_cnt = len(session['operations'])
        minOpCnt = min(op_cnt, minOpCnt)
        maxOpCnt = max(op_cnt, maxOpCnt)
        vals.append(op_cnt)
    ## FOR

    avgOpCnt = None
    if vals:
        avgOpCnt = "%.2f" % (float(sum(vals)) / float(len(vals)))
    LOG.info("%10s: %d" % ("min", minOpCnt))
    LOG.info("%10s: %d" % ("max", maxOpCnt))
    LOG.info("%10s: %s" % ("avg", avgOpCnt))

    LOG.info("Number of operations by type:")
    for opType in typeCnts.values():
        LOG.info("%10s: %d" % (opType, typeCnts[opType]))
    ## FOR

    LOG.info("=" * 50)
    return
def gen_hist(percent_particles):
    all_data_len = len(all_data)
    data_len_1p = int(all_data_len * percent_particles)
    print("Generating hist for %s, %s" %
          (all_data_len * percent_particles, percent_particles))

    sub_data = all_data[:data_len_1p]
    data_hist = Histogram(sub_data, 512)
    # print("Sub data: ", len(sub_data))
    # print("Data Hist: ", data_hist.bin_edges[-1],
    #       data_hist.hist[-1], sum(data_hist.hist))

    global accurate_hist
    accurate_hist, mass_per_bin = data_hist._rebalance(32)
    # print("Accurate Hist: ", accurate_hist[-1],
    #       mass_per_bin, sum(data_hist.hist))
    # print(data_hist.bin_edges, data_hist.hist)
    # print(accurate_hist)
    del data_hist
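## Hedged sketch (not the project's Histogram API): gen_hist() above relies on
## Histogram._rebalance(32), which is assumed to pick 32 new bin edges so that
## each output bin holds roughly the same total mass. The standalone helper
## below only illustrates that equal-mass rebalancing idea with stdlib Python.
def rebalance_sketch(counts, edges, nbins_out):
    """Return (new_edges, mass_per_bin) so each output bin holds ~equal mass."""
    total = float(sum(counts))
    # Cumulative mass at every right bin edge
    cum = []
    acc = 0.0
    for c in counts:
        acc += c
        cum.append(acc)
    new_edges = [edges[0]]
    for k in range(1, nbins_out):
        target = total * k / nbins_out
        # Find the first input bin whose cumulative mass reaches the target
        # and interpolate linearly inside it
        for i, c in enumerate(cum):
            if c >= target:
                prev = cum[i - 1] if i > 0 else 0.0
                frac = (target - prev) / max(counts[i], 1e-12)
                new_edges.append(edges[i] + frac * (edges[i + 1] - edges[i]))
                break
    new_edges.append(edges[-1])
    return new_edges, total / nbins_out

## Example: collapse 8 uneven bins into 4 roughly equal-mass bins
## rebalance_sketch([1, 1, 4, 2, 8, 2, 1, 1], list(range(9)), 4)
## -> ([0, 2.75, 4.25, 4.875, 8], 5.0)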
def plot(self):
    reneg_bins = self.renegotiate()

    self.ranks_data = flatten(self.ranks_data)

    ref_hist = Histogram(data=self.ranks_produced_flattened,
                         nbins=self.num_pivots_sent)
    ref_hist.rebalance(self.num_bins_final)

    cur_hist = Histogram(data=self.ranks_data, bin_edges=reneg_bins.bin_edges)

    fig, ax = plt.subplots()
    plot1 = ax.bar(range(32), cur_hist.hist)

    mean_load = len(self.ranks_data) / 32
    ax.plot([-1, 32], [mean_load, mean_load], color='orange', linewidth=1)
    ax.text(21, mean_load * 1.05, 'Ideal (balanced) load', color='#c04e01')

    ax.set_xlabel("Rank ID")
    ax.set_ylabel("Load")

    plt.tight_layout()
    plt.savefig("../vis/ASCR/naive_lb_2.pdf")
def generateCollectionHistograms(self):
    col_keys = dict([(col_name, Histogram()) for col_name in self.collections])
    for sess in self.workload:
        for op in sess["operations"]:
            if op["collection"].find("$cmd") != -1:
                continue
            if op["collection"] not in col_keys:
                LOG.warn("Missing: " + op["collection"])
                continue
            fields = workload.getReferencedFields(op)
            h = col_keys[op["collection"]]
            for i in xrange(1, len(fields) + 1):
                map(h.put, itertools.combinations(fields, i))
        ## FOR (op)
    ## FOR (sess)
    return (col_keys)
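## Hedged illustration (not project code): for an op that references the fields
## ('a', 'b', 'c'), the combinations loop in generateCollectionHistograms()
## above records every non-empty combination of those fields. A
## collections.Counter stands in for the project's Histogram here.
import itertools
from collections import Counter

fields = ('a', 'b', 'c')
h = Counter()
for i in range(1, len(fields) + 1):
    for combo in itertools.combinations(fields, i):
        h[combo] += 1
## h now holds one count each for ('a',), ('b',), ('c',), ('a', 'b'),
## ('a', 'c'), ('b', 'c') and ('a', 'b', 'c') -- 2**3 - 1 = 7 keys, so the
## per-collection histograms grow exponentially in the number of fields.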
def calculateSessions(self):
    # Calculate outliers using the quartile method
    # http://en.wikipedia.org/wiki/Quartile#Computing_methods
    if self.debug:
        LOG.debug("Calculating time difference for operations in %d sessions" % len(self.sessOps))

    # Get the full list of all the time differences
    allDiffs = []
    for clientOps in self.sessOps.values():
        allDiffs += [x[-1] for x in clientOps]
    allDiffs = sorted(allDiffs)
    numDiffs = len(allDiffs)
    #print "\n".join(map(str, allDiffs))

    # Lower + Upper Quartiles
    lowerQuartile, upperQuartile = mathutil.quartiles(allDiffs)
    if lowerQuartile is None or upperQuartile is None:
        LOG.warn("Null quartiles! Can't continue!")
        return
    # Interquartile Range
    iqr = (upperQuartile - lowerQuartile) * 1.5

    if self.debug:
        LOG.debug("Calculating stats for %d op pairs" % len(allDiffs))
        LOG.debug("  Lower Quartile: %s" % lowerQuartile)
        LOG.debug("  Upper Quartile: %s" % upperQuartile)
        LOG.debug("  IQR: %s" % iqr)

    # Go through operations for each client and identify the
    # pairs of operations that are above the IQR in the upperQuartile
    opHist = Histogram()
    prevOpHist = Histogram()
    nextOpHist = Histogram()
    threshold = upperQuartile + iqr
    for sessId, clientOps in self.sessOps.iteritems():
        for op0, op1, opDiff in clientOps:
            if opDiff >= threshold:
                prevOpHist.put(op0["query_hash"])
                nextOpHist.put(op1["query_hash"])
                opHist.put((op0["query_hash"], op1["query_hash"]))
        ## FOR
    ## FOR
    if self.debug:
        LOG.debug("Outlier Op Hashes:\n%s" % opHist)

    # I guess at this point we can just compute the outliers
    # again for the pairs of operations that have a time difference
    # outlier. We won't use the IQR. We'll just take the upper quartile
    # because that seems to give us the right answer
    outlierCounts = sorted(opHist.getCounts())
    lowerQuartile, upperQuartile = mathutil.quartiles(outlierCounts)
    if self.debug:
        LOG.debug("Calculating stats for %d count outliers" % len(outlierCounts))
        LOG.debug("  Lower Quartile: %s" % lowerQuartile)
        LOG.debug("  Upper Quartile: %s" % upperQuartile)

    self.sessionBoundaries.clear()

    # If we're doing this randomly, we want each session to have roughly
    # the same number of operations as RANDOMIZE_TARGET
    if self.randomize:
        num_outliers = len(outlierCounts)
        force = 1 if int(num_outliers * 0.10) == 1 else random.randint(1, int(num_outliers * 0.10))
        LOG.warn("Forcing %d random outliers out of %d to be chosen from workload", force, num_outliers)
    else:
        force = 0

    for cnt in outlierCounts:
        if cnt >= upperQuartile or (self.randomize and force > 0):
            self.sessionBoundaries |= set(opHist.getValuesForCount(cnt))
            force -= 1
    ## FOR
    LOG.debug("Found %d outlier hashes" % len(self.sessionBoundaries))
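## Hedged sketch of the outlier rule used in calculateSessions() above.
## mathutil.quartiles() is project code; the standalone version below only
## illustrates the quartile/IQR math (the "exclude the median" method from
## the Wikipedia page referenced above).
def quartiles_sketch(vals):
    vals = sorted(vals)
    def median(xs):
        m = len(xs) // 2
        return xs[m] if len(xs) % 2 else (xs[m - 1] + xs[m]) / 2.0
    n = len(vals)
    return median(vals[:n // 2]), median(vals[(n + 1) // 2:])

## Example with ten inter-operation gaps (hypothetical numbers):
diffs = [1, 2, 2, 3, 3, 3, 4, 5, 40, 90]
q1, q3 = quartiles_sketch(diffs)       # q1 = 2, q3 = 5
threshold = q3 + (q3 - q1) * 1.5       # 9.5 -> the 40 and 90 gaps are outliers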
def sessionizeWorkload(self):
    """
    Split the Sessions based on the gap between operation times
    """
    LOG.info("Sessionizing sample workload")
    s = Sessionizer(self.metadata_db, randomize=self.random_sessionizer)

    # We first feed in all of the operations for each session
    nextSessId = -1
    origTotal = 0
    origHistogram = Histogram()
    sessions = []
    for sess in self.metadata_db.Session.fetch():
        s.process(sess['session_id'], sess['operations'])
        nextSessId = max(nextSessId, sess['session_id'])
        origHistogram.put(len(sess['operations']))
        origTotal += len(sess['operations'])
        sessions.append(sess)
    ## FOR
    nextSessId += 1
    avg_ops = 0 if origHistogram.getSampleCount() == 0 else (origTotal / float(origHistogram.getSampleCount()))
    LOG.info("BEFORE Sessionization\n" +
             "  # of Sessions: %d\n" +
             "  Avg Ops per Session: %.2f\n" +
             "  Next Session Id: %d",
             origHistogram.getSampleCount(),
             avg_ops, nextSessId)

    # Then split them into separate sessions
    s.calculateSessions()
    newTotal = 0
    newHistogram = Histogram()

    # We have to do this because otherwise we will start to process
    # the new sessions that we just inserted... I know...
    for sess in sessions:
        newSessions = s.sessionize(sess, nextSessId)
        nextSessId += len(newSessions)

        # And then add all of our new sessions
        # Count the number of operations so that we can see the change
        if self.debug:
            LOG.debug("Split Session %d [%d ops] into %d separate sessions",
                      sess['session_id'], len(sess['operations']), len(newSessions))
        totalOps = 0
        for newSess in newSessions:
            try:
                newSess.save()
            except:
                LOG.error("Unexpected error when saving new Session\n%s", pformat(newSess))
                raise
            newOpCtr = len(newSess['operations'])
            totalOps += newOpCtr
            newHistogram.put(newOpCtr)
            if self.debug:
                LOG.debug("Session %d -> %d Ops" % (newSess['session_id'], newOpCtr))
        ## FOR

        # Make sure that all of our operations end up in a session
        assert len(sess['operations']) == totalOps, \
            "Expected %d operations, but new sessions only had %d" % (len(sess['operations']), totalOps)
        newTotal += totalOps

        # Mark the original session as deletable
        # deletable.append(sess)
        sess.delete()
    ## FOR
    avg_ops = 0 if newHistogram.getSampleCount() == 0 else (newTotal / float(newHistogram.getSampleCount()))
    LOG.info("AFTER Sessionization\n" +
             "  # of Sessions: %d\n" +
             "  Avg Ops per Session: %.2f",
             newHistogram.getSampleCount(),
             avg_ops)
    if self.debug:
        LOG.debug("Ops per Session\n%s" % newHistogram)

    return
## DEF
## CLASS
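## Hedged sketch only: Sessionizer.sessionize() above is project code that
## splits a Session at operation pairs whose query-hash pair landed in the
## sessionBoundaries set. The helper below illustrates the simpler underlying
## idea of cutting a timestamp stream wherever a gap crosses the outlier
## threshold computed in calculateSessions().
def split_at_gaps_sketch(timestamps, threshold):
    """Group consecutive timestamps; start a new group when a gap >= threshold."""
    groups = [[timestamps[0]]]
    for prev, cur in zip(timestamps, timestamps[1:]):
        if cur - prev >= threshold:
            groups.append([])
        groups[-1].append(cur)
    return groups

## split_at_gaps_sketch([0, 1, 2, 30, 31, 90], threshold=9.5)
## -> [[0, 1, 2], [30, 31], [90]]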
class SkewCostComponent(AbstractCostComponent):

    def __init__(self, state):
        AbstractCostComponent.__init__(self, state)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        # Keep track of how many times that we accessed each node
        self.nodeCounts = Histogram()
        self.workload_segments = []

        # Pre-split the workload into separate intervals
        self.splitWorkload()
    ## DEF

    def getCostImpl(self, design):
        """Calculate the network cost for each segment for skew analysis"""
        # If there is only one node, then the cost is always zero
        if self.state.num_nodes == 1:
            LOG.info("Computed Skew Cost: %f", 0.0)
            return 0.0

        op_counts = [0] * self.state.skew_segments
        segment_skew = [0] * self.state.skew_segments
        for i in range(0, len(self.workload_segments)):
            # TODO: We should cache this so that we don't have to call it twice
            segment_skew[i], op_counts[i] = self.calculateSkew(design, self.workload_segments[i])
        weighted_skew = sum([segment_skew[i] * op_counts[i] for i in xrange(len(self.workload_segments))])
        cost = weighted_skew / float(sum(op_counts))
        LOG.info("Computed Skew Cost: %f", cost)
        return cost
    ## DEF

    def calculateSkew(self, design, segment):
        """
        Calculate the cluster skew factor for the given workload segment
        See Alg.#3 from Pavlo et al. 2012:
        http://hstore.cs.brown.edu/papers/hstore-partitioning.pdf
        """
        if self.debug:
            LOG.debug("Computing skew cost for %d sessions over %d segments",
                      len(segment), self.state.skew_segments)

        self.nodeCounts.clear()

        # Iterate over each session and get the list of nodes
        # that we estimate that each of its operations will need to touch
        num_ops = 0
        err_ops = 0
        for sess in segment:
            for op in sess['operations']:
                # Skip anything that doesn't have a design configuration
                if not design.hasCollection(op['collection']):
                    if self.debug:
                        LOG.debug("Not in design: SKIP - %s Op #%d on %s", op['type'], op['query_id'], op['collection'])
                    continue
                if design.isRelaxed(op['collection']):
                    if self.debug:
                        LOG.debug("Relaxed: SKIP - %s Op #%d on %s", op['type'], op['query_id'], op['collection'])
                    continue
                col_info = self.state.collections[op['collection']]
                cache = self.state.getCacheHandle(col_info)

                # This just returns an estimate of which nodes we expect
                # the op to touch. We don't know exactly which ones they will
                # be because auto-sharding could put shards anywhere...
                try:
                    node_ids = self.state.__getNodeIds__(cache, design, op)
                    map(self.nodeCounts.put, node_ids)
                    num_ops += 1
                except:
                    if self.debug:
                        LOG.warn("Failed to estimate touched nodes for op\n%s" % pformat(op))
                    err_ops += 1
                    continue
            ## FOR (op)
        ## FOR (sess)
        if self.debug:
            LOG.info("Total ops %s, errors %s", num_ops, err_ops)

        if self.debug:
            LOG.debug("Node Count Histogram:\n%s", self.nodeCounts)

        total = self.nodeCounts.getSampleCount()
        if not total:
            return (0.0, num_ops)

        best = 1 / float(self.state.num_nodes)
        skew = 0.0
        for i in xrange(self.state.num_nodes):
            ratio = self.nodeCounts.get(i, 0) / float(total)
            if ratio < best:
                ratio = best + ((1 - ratio / best) * (1 - best))
            skew += math.log(ratio / best)
        return skew / (math.log(1 / best) * self.state.num_nodes), num_ops
    ## DEF

    ## -----------------------------------------------------------------------
    ## WORKLOAD SEGMENTATION
    ## -----------------------------------------------------------------------

    def splitWorkload(self):
        """Divide the workload up into segments for skew analysis"""
        start_time = None
        end_time = None
        for i in xrange(len(self.state.workload)):
            if start_time is None or start_time > self.state.workload[i]['start_time']:
                start_time = self.state.workload[i]['start_time']
            if end_time is None or end_time < self.state.workload[i]['end_time']:
                end_time = self.state.workload[i]['end_time']
        assert start_time is not None, \
            "Failed to find start time in %d sessions" % len(self.state.workload)
        assert end_time is not None, \
            "Failed to find end time in %d sessions" % len(self.state.workload)

        if self.debug:
            LOG.debug("Workload Segments - START:%d / END:%d", start_time, end_time)
        self.workload_segments = [[] for i in xrange(0, self.state.skew_segments)]
        segment_h = Histogram()
        for sess in self.state.workload:
            idx = self.getSessionSegment(sess, start_time, end_time)
            segment_h.put(idx)
            assert 0 <= idx < self.state.skew_segments, \
                "Invalid workload segment '%d' for Session #%d\n%s" % (idx, sess['session_id'], segment_h)
            self.workload_segments[idx].append(sess)
        ## FOR
    ## DEF

    def getSessionSegment(self, sess, start_time, end_time):
        """Return the segment offset that the given Session should be assigned to"""
        timestamp = sess['start_time']
        if timestamp == end_time:
            timestamp -= 1
        ratio = (timestamp - start_time) / float(end_time - start_time)
        return min(self.state.skew_segments - 1, int(self.state.skew_segments * ratio))  # HACK
    ## DEF
## CLASS
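## Hedged standalone sketch of the skew metric computed by calculateSkew()
## above (Alg. 3 in Pavlo et al. 2012), working from a plain dict of per-node
## access counts instead of the project's Histogram.
import math

def skew_sketch(node_counts, num_nodes):
    total = float(sum(node_counts.values()))
    if not total:
        return 0.0
    best = 1.0 / num_nodes          # ideal fraction of accesses per node
    skew = 0.0
    for node in range(num_nodes):
        ratio = node_counts.get(node, 0) / total
        if ratio < best:
            # Under-loaded nodes are penalized symmetrically to over-loaded ones
            ratio = best + ((1 - ratio / best) * (1 - best))
        skew += math.log(ratio / best)
    return skew / (math.log(1 / best) * num_nodes)

## skew_sketch({0: 100}, 4)                     -> 1.0 (all traffic on one node)
## skew_sketch({0: 25, 1: 25, 2: 25, 3: 25}, 4) -> 0.0 (perfectly balanced)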