def testPickle(self):
    h = Histogram()
    letters = [x for x in string.letters] + ["-"]
    for i in xrange(0, 100):
        key = ""
        for x in xrange(0, 10):
            key += random.choice(letters)
        assert len(key) > 0
        h.put(key, delta=random.randint(1, 10))
        assert h[key] > 0
    ## FOR

    # Serialize
    import pickle
    p = pickle.dumps(h, -1)
    assert p

    # Deserialize
    clone = pickle.loads(p)
    assert clone
    for key in h.keys():
        self.assertEquals(h[key], clone[key])
    ## FOR
    self.assertEquals(h.getSampleCount(), clone.getSampleCount())
    self.assertEquals(sorted(h.getMinCountKeys()), sorted(clone.getMinCountKeys()))
def computeInStats(query, h=None):
    for k, v in query.iteritems():
        if k == "#in":
            if h is None:
                h = Histogram()
            h.put(len(v))
        elif isinstance(v, list):
            for inner in v:
                if isinstance(inner, dict):
                    h = computeInStats(inner, h)
        elif isinstance(v, dict):
            h = computeInStats(v, h)
    return h
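A minimal usage sketch of computeInStats follows. It relies only on the project's Histogram having a printable form (as used elsewhere in this code); the nested query literal and the "#in"/"#or" keys are made up for illustration.

# Hypothetical example: count the sizes of "#in" lists in a nested query document.
# The query below is illustrative only; keys follow the "#"-prefixed convention
# assumed by computeInStats.
sample_query = {
    "status": {"#in": ["NEW", "OPEN", "CLOSED"]},
    "#or": [
        {"owner": {"#in": ["alice", "bob"]}},
        {"priority": 1},
    ],
}

h = computeInStats(sample_query)
# The histogram records one sample per "#in" clause, keyed by the list length,
# so here it should contain one sample for 3 and one sample for 2.
print h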
def fixInvalidCollections(self):
    searchKey = {"operations.collection": constants.INVALID_COLLECTION_MARKER}
    for session in self.metadata_db.Session.find(searchKey):
        dirty = False
        for op in session["operations"]:
            if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                continue
            if self.debug:
                LOG.debug("Attempting to fix corrupted Operation:\n%s" % pformat(op))

            # For each field referenced in the query, build a histogram of
            # which collections have a field with the same name
            fields = workload.getReferencedFields(op)
            h = Histogram()
            for c in self.metadata_db.Collection.find():
                for f in c['fields']:
                    if f in fields:
                        h.put(c['name'])
                ## FOR
            ## FOR

            matches = h.getMaxCountKeys()
            if len(matches) == 0:
                LOG.warn("No matching collection was found for corrupted operation\n%s" % pformat(op))
                continue
            elif len(matches) > 1:
                LOG.warn("More than one matching collection was found for corrupted operation %s\n%s" % (matches, pformat(op)))
                continue
            else:
                op["collection"] = matches[0]
                dirty = True
                self.fix_ctr += 1
                LOG.info("Fixed corrupted collection in operation\n%s" % pformat(op))
            ## IF
        ## FOR (operations)

        if dirty:
            session.save()
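The matching step above can be exercised without a metadata database. The sketch below reproduces the same histogram-voting idea with made-up catalog data; it assumes the project's Histogram class is importable (here as "from util import Histogram"), and every other name is illustrative.

# Illustrative only: pick the collection whose catalog fields overlap most with
# the fields referenced by a corrupted operation, using the same voting scheme.
from util import Histogram   # assumed import path for the project's Histogram

catalog = {                               # made-up collection -> field names
    "users":  ["_id", "name", "email"],
    "orders": ["_id", "user_id", "total"],
}
referenced_fields = set(["user_id", "total"])

h = Histogram()
for col_name, field_names in catalog.iteritems():
    for f in field_names:
        if f in referenced_fields:
            h.put(col_name)

matches = h.getMaxCountKeys()
if len(matches) == 1:
    print "Best match:", matches[0]       # -> "orders"
else:
    print "Ambiguous or no match:", matches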
class State:
    """Cost Model State"""

    ## -----------------------------------------------------------------------
    ## INTERNAL CACHE STATE
    ## -----------------------------------------------------------------------

    class Cache:
        """
        Internal cache for a single collection.
        Note that this is different than the LRUBuffer cache stuff.
        These are cached look-ups that the CostModel uses for figuring out
        what operations do.
        """
        def __init__(self, col_info, num_nodes):
            # The number of pages needed to do a full scan of this collection
            # The worst case for all other operations is if we have to do
            # a full scan that requires us to evict the entire buffer
            # Hence, we multiply the max pages by two
            # self.fullscan_pages = (col_info['max_pages'] * 2)
            self.fullscan_pages = col_info["doc_count"] * 2
            assert self.fullscan_pages > 0, "Zero max_pages for collection '%s'" % col_info["name"]

            # Cache of Best Index Tuples
            # QueryHash -> BestIndex
            self.best_index = {}

            # Cache of Regex Operations
            # QueryHash -> Boolean
            self.op_regex = {}

            # Cache of Touched Node Ids
            # QueryId -> [NodeId]
            self.op_nodeIds = {}

            # Cache of Document Ids
            # QueryId -> Index/Collection DocumentIds
            self.collection_docIds = {}
            self.index_docIds = {}
        ## DEF

        def reset(self):
            self.best_index.clear()
            self.op_regex.clear()
            self.op_nodeIds.clear()
            self.collection_docIds.clear()
            self.index_docIds.clear()
            self.op_count = 0
            self.msg_count = 0
            self.network_reset = True
        ## DEF

        def __str__(self):
            ret = ""
            max_len = max(map(len, self.__dict__.iterkeys())) + 1
            f = " %-" + str(max_len) + "s %s\n"
            for k, v in self.__dict__.iteritems():
                if isinstance(v, dict):
                    v_str = "[%d entries]" % len(v)
                else:
                    v_str = str(v)
                ret += f % (k + ":", v_str)
            return ret
        ## DEF
    ## CLASS

    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        # LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None             # working workload
        self.originalWorload = workload  # points to the original workload

        self.weight_network = config.get("weight_network", 1.0)
        self.weight_disk = config.get("weight_disk", 1.0)
        self.weight_skew = config.get("weight_skew", 1.0)
        self.max_num_nodes = config.get("nodes", 1)

        # Convert MB to bytes
        self.max_memory = config["max_memory"] * 1024 * 1024
        self.skew_segments = config["skew_intervals"]  # Why? "- 1"
        self.address_size = config["address_size"] / 4

        self.estimator = NodeEstimator(collections, self.max_num_nodes)
        self.window_size = config["window_size"]

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}
    ## DEF

    def init_xref(self, workload):
        """Initialize the cross reference based on the current working workload"""
        self.col_sess_xref = dict([(col_name, []) for col_name in self.col_names])
        self.col_op_xref = dict([(col_name, []) for col_name in self.col_names])
        self.__buildCrossReference__(workload)
    ## DEF

    def updateWorkload(self, workload):
        self.workload = workload
        self.init_xref(workload)
    ## DEF

    def restoreOriginalWorkload(self):
        self.workload = self.originalWorload
        self.init_xref(self.workload)
    ## DEF

    def __buildCrossReference__(self, workload):
        for sess in workload:
            cols = set()
            for op in sess["operations"]:
                col_name = op["collection"]
                if col_name in self.col_sess_xref:
                    self.col_op_xref[col_name].append(op)
                    cols.add(col_name)
            ## FOR (op)
            for col_name in cols:
                self.col_sess_xref[col_name].append(sess)
        ## FOR (sess)

    def invalidateCache(self, col_name):
        if col_name in self.cache_handles:
            if self.debug:
                LOG.debug("Invalidating cache for collection '%s'", col_name)
            self.cache_handles[col_name].reset()
    ## DEF

    def getCacheHandleByName(self, col_info):
        """
        Return a cache handle for the given collection name.
        This is the preferred method because it requires fewer hashes.
        """
        cache = self.cache_handles.get(col_info["name"], None)
        if cache is None:
            cache = State.Cache(col_info, self.max_num_nodes)
            self.cache_handles[col_info["name"]] = cache
        return cache
    ## DEF

    def getCacheHandle(self, col_info):
        return self.getCacheHandleByName(col_info)
    ## DEF

    def reset(self):
        """Reset all of the internal state and cache information"""
        # Clear out caches for all collections
        self.cache_handles.clear()
        self.estimator.reset()

    def calcNumNodes(self, design, maxCardinality):
        num_nodes = {}
        for col_name in self.collections.keys():
            num_nodes[col_name] = self.max_num_nodes
            if maxCardinality[col_name] is not None and design.hasCollection(col_name):
                cardinality = 1
                shard_keys = design.getShardKeys(col_name)
                if shard_keys is None or len(shard_keys) == 0:
                    continue
                for shard_key in shard_keys:
                    if shard_key not in self.collections[col_name]["fields"] or \
                       "cardinality" not in self.collections[col_name]["fields"][shard_key]:
                        continue
                    field_cardinality = self.collections[col_name]["fields"][shard_key]["cardinality"]
                    if field_cardinality > 0:
                        cardinality *= field_cardinality
                cardinality_ratio = maxCardinality[col_name] / float(cardinality)
                if cardinality_ratio == 1:
                    cardinality_ratio = 0
                elif cardinality_ratio < 2:
                    cardinality_ratio = 1
                else:
                    cardinality_ratio = int(math.ceil(math.log(cardinality_ratio, 2)))
                col_num_nodes = self.max_num_nodes - cardinality_ratio
                if col_num_nodes <= 0:
                    col_num_nodes = 1
                num_nodes[col_name] = col_num_nodes
        return num_nodes

    ## -----------------------------------------------------------------------
    ## UTILITY CODE
    ## -----------------------------------------------------------------------

    def __getIsOpRegex__(self, cache, op):
        isRegex = cache.op_regex.get(op["query_hash"], None)
        if isRegex is None:
            isRegex = workload.isOpRegex(op)
            if self.cache_enable:
                if self.debug:
                    self.cache_miss_ctr.put("op_regex")
                cache.op_regex[op["query_hash"]] = isRegex
        elif self.debug:
            self.cache_hit_ctr.put("op_regex")
        return isRegex
    ## DEF

    def __getNodeIds__(self, cache, design, op, num_nodes=None):
        node_ids = cache.op_nodeIds.get(op["query_id"], None)
        if node_ids is None:
            try:
                node_ids = self.estimator.estimateNodes(design, op, num_nodes)
            except:
                if self.debug:
                    LOG.error("Failed to estimate touched nodes for op #%d\n%s", op["query_id"], pformat(op))
                raise
            if self.cache_enable:
                if self.debug:
                    self.cache_miss_ctr.put("op_nodeIds")
                cache.op_nodeIds[op["query_id"]] = node_ids
            if self.debug:
                LOG.debug("Estimated Touched Nodes for Op #%d: %d", op["query_id"], len(node_ids))
        elif self.debug:
            self.cache_hit_ctr.put("op_nodeIds")
        return node_ids
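The node-count reduction in calcNumNodes comes down to a small piece of arithmetic. The standalone sketch below mirrors that arithmetic with made-up cardinalities so the clamping behavior is easy to see; the function and parameter names are illustrative and not part of the State API.

import math

def reduced_node_count(max_num_nodes, max_cardinality, shard_key_cardinality):
    # Mirrors the arithmetic in State.calcNumNodes: the further the shard key's
    # cardinality falls below the collection's maximum cardinality, the fewer
    # nodes the collection can usefully be spread across (never fewer than one).
    ratio = max_cardinality / float(shard_key_cardinality)
    if ratio == 1:
        penalty = 0
    elif ratio < 2:
        penalty = 1
    else:
        penalty = int(math.ceil(math.log(ratio, 2)))
    return max(1, max_num_nodes - penalty)

# Example: 16 nodes available, collection max cardinality 100000,
# shard key cardinality only 100 -> ratio 1000 -> penalty ceil(log2(1000)) = 10
print reduced_node_count(16, 100000, 100)   # -> 6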
def hash(self, op):
    """Compute a deterministic signature for the given operation based on its keys"""

    fields = None
    updateFields = None

    # QUERY
    if op["type"] == constants.OP_TYPE_QUERY:
        # The query field has our where clause
        if not "#query" in op["query_content"][0]:
            msg = "Missing query field in query_content for operation #%d" % op["query_id"]
            if self.debug:
                LOG.warn(pformat(op))
            raise Exception(msg)
        fields = op["query_content"][0][constants.REPLACE_KEY_DOLLAR_PREFIX + "query"]

    # UPDATE
    elif op["type"] == constants.OP_TYPE_UPDATE:
        # The first element in the content field is the WHERE clause
        fields = op["query_content"][0]
        # We use a separate field for the updated columns so that they are
        # hashed separately from the WHERE clause
        updateFields = op['query_content'][1]

    # INSERT
    elif op["type"] == constants.OP_TYPE_INSERT:
        # They could be inserting more than one document here,
        # which all may have different fields...
        # So we will need to build a histogram for which keys are referenced
        # and use the ones that appear the most
        # XXX: We'll only consider keys in the first-level
        h = Histogram()
        for doc in op["query_content"]:
            assert type(doc) == dict, "Unexpected insert value:\n%s" % pformat(doc)
            for k in doc.keys():
                h.put(k)
        ## FOR
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Insert '%s' Keys Histogram:\n%s" % (op["collection"], h))
        maxKeys = h.getMaxCountKeys()
        assert len(maxKeys) > 0, \
            "No keys were found in %d insert documents?" % len(op["query_content"])

        fields = { }
        for doc in op["query_content"]:
            for k, v in doc.iteritems():
                if k in maxKeys:
                    fields[k] = v
            ## FOR
        ## FOR

    # DELETE
    elif op["type"] == constants.OP_TYPE_DELETE:
        # The first element in the content field is the WHERE clause
        fields = op["query_content"][0]

    # UNKNOWN!
    else:
        raise Exception("Unexpected query type: %s" % op["type"])

    # Extract the list of fields that are used
    try:
        fieldsHash = self.computeFieldsHash(fields)
    except:
        LOG.error("Unexpected error when processing operation %d [fields=%s]" % (op["query_id"], str(fields)))
        raise
    updateHash = self.computeFieldsHash(updateFields) if updateFields else None

    t = (op["collection"], op["type"], fieldsHash, updateHash)
    h = long(hash(t))
    LOG.debug("%s %s => HASH:%d" % (fields, t, h))
    self.histogram.put(h)
    return h
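For reference, here is a hedged walk-through of the signature construction for a simple query operation. computeFieldsHash is not shown in this excerpt, so the stand-in below (hashing the sorted field names) is purely hypothetical, and the "$query" type string is only an illustrative stand-in for constants.OP_TYPE_QUERY.

# Hypothetical sketch of the (collection, type, fieldsHash, updateHash) signature.
def _fake_fields_hash(fields):
    # Stand-in for the real computeFieldsHash, which is not shown here.
    if fields is None:
        return None
    return hash(tuple(sorted(fields.keys())))

op = {
    "collection": "users",
    "type": "$query",                      # illustrative stand-in for constants.OP_TYPE_QUERY
    "query_content": [{"#query": {"name": "alice", "age": 30}}],
}

where_clause = op["query_content"][0]["#query"]
t = (op["collection"], op["type"], _fake_fields_hash(where_clause), None)
print "Signature tuple:", t
print "Hash:", long(hash(t))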
class Results:
    def __init__(self, config=None):
        self.start = None
        self.stop = None
        self.txn_id = 0
        self.opCount = 0
        self.completed = [ ]  # (txnName, timestamp)
        self.txn_counters = Histogram()
        self.txn_times = { }
        self.running = { }
        self.config = config

    def startBenchmark(self):
        """Mark the benchmark as having been started"""
        assert self.start == None
        LOG.debug("Starting benchmark statistics collection")
        self.start = time.time()
        return self.start

    def stopBenchmark(self):
        """Mark the benchmark as having been stopped"""
        assert self.start != None
        assert self.stop == None
        LOG.debug("Stopping benchmark statistics collection")
        self.stop = time.time()

    def startTransaction(self, txn):
        self.txn_id += 1
        id = self.txn_id
        self.running[id] = (txn, time.time())
        return id

    def abortTransaction(self, id):
        """Abort a transaction and discard its times"""
        assert id in self.running
        txn_name, txn_start = self.running[id]
        del self.running[id]

    def stopTransaction(self, id, opCount, latencies=[]):
        """Record that the benchmark completed an invocation of the given transaction"""
        assert id in self.running
        timestamp = time.time()

        txn_name, txn_start = self.running[id]
        del self.running[id]
        self.completed.append((txn_name, timestamp, latencies))

        duration = timestamp - txn_start
        total_time = self.txn_times.get(txn_name, 0)
        self.txn_times[txn_name] = total_time + duration

        # OpCount
        if opCount is not None:
            self.opCount += opCount
        else:
            LOG.debug("ithappens")

        # Txn Counter Histogram
        self.txn_counters.put(txn_name)
        assert self.txn_counters[txn_name] > 0

        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Completed %s in %f sec" % (txn_name, duration))
    ## DEF

    @staticmethod
    def show_table(title, headers, table, line_width):
        cols_width = [len(header) for header in headers]
        for row in table:
            row_width = 0
            for i in range(len(headers)):
                if len(row[i]) > cols_width[i]:
                    cols_width[i] = len(row[i])
                row_width += cols_width[i]
            row_width += 4 * (len(headers) - 1)
            if row_width > line_width:
                line_width = row_width
        output = ("%s\n" % ("=" * line_width))
        output += ("%s\n" % title)
        output += ("%s\n" % ("-" * line_width))
        for i in range(len(headers)):
            header = headers[i]
            output += ("%s%s" % (header, " " * (cols_width[i] - len(header))))
            if i != len(headers) - 1:
                output += " " * 4
        output += "\n"
        for row in table:
            for i in range(len(headers)):
                cell = row[i]
                output += ("%s%s" % (cell, " " * (cols_width[i] - len(cell))))
                if i != len(headers) - 1:
                    output += " " * 4
            output += "\n"
        output += ("%s\n" % ("-" * line_width))
        return output, line_width

    def show_latencies(self, line_width):
        latencies = []
        output = ""
        for txn_stats in self.completed:
            latencies.extend(txn_stats[2])
        if len(latencies) > 0:
            latencies = sorted(latencies, key=itemgetter(0))
            percents = [0.1, 0.2, 0.5, 0.8, 0.9, 0.999]
            latency_table = []
            slowest_ops = []
            for percent in percents:
                index = int(math.floor(percent * len(latencies)))
                percent_str = "%0.1f%%" % (percent * 100)
                millis_sec_str = "%0.4f" % (latencies[index][0])
                latency_table.append((percent_str, millis_sec_str))
            latency_headers = ["Queries(%)", "Latency(ms)"]
            output, line_width = \
                Results.show_table("Latency Report", latency_headers, latency_table, line_width)
            if self.config is not None and self.config["default"]["slow_ops_num"] > 0:
                num_ops = self.config["default"]["slow_ops_num"]
                slowest_ops_headers = ["#", "Latency(ms)", "Session Id", "Operation Id", "Type", "Collection", "Predicates"]
                for i in range(num_ops):
                    if i < len(latencies):
                        slowest_ops.append([
                            "%d" % i,
                            "%0.4f" % (latencies[len(latencies) - i - 1][0]),
                            str(latencies[len(latencies) - i - 1][1]),
                            str(latencies[len(latencies) - i - 1][2]),
                            latencies[len(latencies) - i - 1][3],
                            latencies[len(latencies) - i - 1][4],
                            json.dumps(latencies[len(latencies) - i - 1][5])
                        ])
                slowest_ops_output, line_width = \
                    Results.show_table("Top %d Slowest Operations" % num_ops, slowest_ops_headers, slowest_ops, line_width)
                output += ("\n%s" % slowest_ops_output)
        return output

    def append(self, r):
        self.opCount += r.opCount
        for txn_name in r.txn_counters.keys():
            self.txn_counters.put(txn_name, delta=r.txn_counters[txn_name])

            orig_time = self.txn_times.get(txn_name, 0)
            self.txn_times[txn_name] = orig_time + r.txn_times[txn_name]
            #LOG.info("resOps="+str(r.opCount))
            #LOG.debug("%s [cnt=%d, time=%d]" % (txn_name, self.txn_counters[txn_name], self.txn_times[txn_name]))
        ## HACK
        if type(r.completed) == list:
            self.completed.extend(r.completed)
        if not self.start:
            self.start = r.start
        else:
            self.start = min(self.start, r.start)
        if not self.stop:
            self.stop = r.stop
        else:
            self.stop = max(self.stop, r.stop)
    ## DEF

    def __str__(self):
        return self.show()

    def show(self, load_time=None):
        if self.start == None:
            msg = "Attempting to get benchmark results before it was started"
            LOG.warn(msg)
            raise Exception(msg)
        if self.stop == None:
            duration = time.time() - self.start
        else:
            duration = self.stop - self.start

        col_width = 18
        total_width = (col_width * 4) + 2
        f = "\n " + (("%-" + str(col_width) + "s") * 4)
        line = "-" * total_width

        ret = u"" + "=" * total_width + "\n"
        if load_time != None:
            ret += "Data Loading Time: %d seconds\n\n" % (load_time)

        ret += "Execution Results after %d seconds\n%s" % (duration, line)
        ret += f % ("", "Executed", u"Total Time (ms)", "Rate")

        total_time = duration
        total_cnt = self.txn_counters.getSampleCount()
        #total_running_time = 0

        for txn in sorted(self.txn_counters.keys()):
            txn_time = self.txn_times[txn]
            txn_cnt = "%6d - %4.1f%%" % (self.txn_counters[txn], (self.txn_counters[txn] / float(total_cnt)) * 100)
            rate = u"%.02f txn/s" % ((self.txn_counters[txn] / total_time))
            #total_running_time += txn_time
            #rate = u"%.02f op/s" % ((self.txn_counters[txn] / total_time))
            rate = u"%.02f op/s" % ((self.opCount / total_time))
            ret += f % (txn, txn_cnt, str(txn_time * 1000), rate)
            #LOG.info("totalOps="+str(self.totalOps))
            # total_time += txn_time
        ret += "\n" + ("-" * total_width)

        rate = 0
        if total_time > 0:
            # TXN RATE
            rate = total_cnt / float(total_time)
        #total_rate = "%.02f txn/s" % rate
        total_rate = "%.02f op/s" % rate
        #total_rate = str(rate)

        ret += f % ("TOTAL", str(total_cnt), str(total_time * 1000), total_rate)
        return ("%s\n%s" % (ret, self.show_latencies(total_width))).encode('utf-8')
    ## DEF
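A minimal usage sketch of the Results API above, assuming the module-level names it depends on (Histogram, LOG, time, logging, itemgetter, math, json) are available; the transaction name and operation count below are made up.

# Illustrative only: one benchmark run with a single recorded transaction.
r = Results(config=None)
r.startBenchmark()

txn_id = r.startTransaction("READ_RECORD")   # made-up transaction name
# ... execute the workload for this transaction ...
r.stopTransaction(txn_id, opCount=4)         # no per-op latencies recorded

r.stopBenchmark()
print r.show()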
## ----------------------------------------------

metadata_db = conn[config.get(configutil.SECT_MONGODB, 'metadata_db')]
dataset_db = conn[config.get(configutil.SECT_MONGODB, 'dataset_db')]

colls = dict()
for col_info in metadata_db.Collection.fetch({"workload_queries": {"$gt": 0}}):
    # Skip any collection that doesn't have any documents in it
    if not col_info['doc_count'] or not col_info['avg_doc_size']:
        continue
    colls[col_info['name']] = col_info
if not colls:
    raise Exception("No collections were found in metadata catalog")

for sess in metadata_db.Session.fetch():
    for op in sess["operations"]:
        QUERY_COUNTS.put(op["query_hash"])
        if not op["query_hash"] in QUERY_HASH_XREF:
            QUERY_HASH_XREF[op["query_hash"]] = [ ]
        QUERY_HASH_XREF[op["query_hash"]].append(op)
        QUERY_COLLECTION_COUNTS.put(op["collection"])
    ## FOR
## FOR
LOG.info("Total # of Unique Queries: %d", len(QUERY_COUNTS.values()))

TOTAL_DB_SIZE = sum([col_info["data_size"] for col_info in colls.itervalues()])
LOG.debug("Estimated Total Database Size: %d" % TOTAL_DB_SIZE)
TOTAL_QUERY_COUNT = QUERY_COLLECTION_COUNTS.getSampleCount()
LOG.debug("Total # of Queries: %d" % TOTAL_QUERY_COUNT)

# HACK: Fix collections
for col_name, col_info in colls.iteritems():