def testGetAllValues(self):
    """
    Verify that catalog.getAllValues() returns a hashable tuple that
    contains every top-level value of TestUtilMethods.TEST_FIELDS.

    Nested dicts and lists are compared in tuple form (the dict's values
    as a tuple, or the list converted to a tuple); any other value is
    compared as-is.
    """
    values = catalog.getAllValues(TestUtilMethods.TEST_FIELDS)
    self.assertIsNotNone(values)
    self.assertIsInstance(values, tuple)

    # Make sure we can hash it
    hash_v = hash(values)
    self.assertIsNotNone(hash_v)

    # dict.values() is available on both Python 2 and Python 3, whereas
    # itervalues() only exists on Python 2.
    for v in TestUtilMethods.TEST_FIELDS.values():
        if isinstance(v, dict):
            expected = tuple(v.values())
        elif isinstance(v, list):
            expected = tuple(v)
        else:
            expected = v
        self.assertIn(expected, values)
def getCostImpl(self, design, num_nodes=None):
    """
    Estimate the disk cost for a design and a workload.

    Note: If this is being invoked with overallCost(), then the diskCost()
    should be calculated before skewCost() because we will reuse the same
    histogram of how often nodes are touched in the workload.

    Args:
        design: The candidate design. It is queried through hasCollection()
            and isRelaxed() and forwarded to the helper methods that pick
            an index and the touched nodes for each operation.
        num_nodes: Accepted for interface compatibility; this method does
            not read it.

    Returns:
        The ratio of estimated page hits to worst-case page hits as a
        float, or 0 when the worst-case total is zero.

    Side effects:
        Updates self.total_index_insertion_penalty, self.total_op_contents
        and self.err_ctr. When self.state.cache_enable is set, it also
        stores the chosen index information and document ids in the
        per-collection cache handle.
    """
    # Ok strap on your helmet, this is the magical part of the whole thing!
    self.buildEmbeddingCostDictionary(design)

    # Outline:
    # + For each operation, we need to figure out what document(s) it's going
    #   to need to touch. From this we want to compute a unique hash signature
    #   for those documents so that we can identify what node those documents
    #   reside on and whether those documents are in our working set memory.
    #
    # + For each node, we are going to have a single LRU buffer that simulates
    #   the working set for all collections and indexes in the database.
    #   Document entries are going to be tagged based on whether they are
    #   part of an index or a collection.
    #
    # + Now when we iterate through each operation in our workload, we are
    #   going to need to first figure out what index (if any) it will need
    #   to use and how it will be used (i.e., equality look-up or range scan).
    #   We can then compute the hash for the look-up keys.
    #   If that key is in the LRU buffer, then we will update its entry's last
    #   accessed timestamp. If it's not, then we will increase the page hit
    #   counter and evict some other entry.
    #   After evaluating the target index, we will check whether the index
    #   covers the query. If it does, then we're done.
    #   If not, then we need to compute the hash for the "base" documents that
    #   it wants to access (i.e., in the collection). Then just as before, we
    #   will check whether it's in our buffer, make an eviction if not, and
    #   update our page hit counter.
    #   There are several additional corner cases that we need to handle:
    #      INSERT/UPDATE: Check whether it's an upsert query
    #      INSERT/UPDATE/DELETE: We assume that they're using a WAL and
    #                            therefore writing dirty pages is "free"
    #      UPDATE/DELETE: Check whether the "multi" flag is set to true, which
    #                     will tell us to stop the scan after the first
    #                     matching document is found.
    #
    # NOTE: We don't need to keep track of evicted tuples. It's either in the
    #       LRU buffer or not.
    # TODO: We may want to figure out how to estimate whether we are traversing
    #       indexes on the right-hand side of the tree. We could somehow
    #       preserve the sort order of the keys when we hash them...

    # Worst case is when every query requires a full collection scan
    # Best case, every query is satisfied by main memory
    totalWorst = 0
    totalCost = 0
    total_index_penalty = 0
    total_worst_index_penalty = 0

    for sess in self.state.workload:
        for op in sess["operations"]:
            # Bind the collection name once; the debug messages below used
            # to reference an undefined `col_name`.
            col_name = op["collection"]

            # Is the collection in the design - if not ignore
            if not design.hasCollection(col_name):
                if self.debug:
                    LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                continue
            # Relaxed collections are skipped as well (the log text is shared
            # with the branch above).
            if design.isRelaxed(col_name):
                if self.debug:
                    LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                continue
            col_info = self.state.collections[col_name]

            # Initialize cache if necessary
            # We will always want to do this regardless of whether caching is enabled
            cache = self.state.getCacheHandle(col_info)

            # Check whether we have a cached index selection based on query_hashes
            indexKeys, covering, index_size, slot_size = cache.best_index.get(
                op["query_hash"], (None, None, None, None)
            )
            if indexKeys is None:
                indexKeys, covering, index_size, slot_size = self.guess_op_info(design, op)
                if self.state.cache_enable:
                    if self.debug:
                        self.state.cache_miss_ctr.put("best_index")
                    cache.best_index[op["query_hash"]] = (indexKeys, covering, index_size, slot_size)
            elif self.debug:
                self.state.cache_hit_ctr.put("best_index")

            pageHits = 0
            maxHits = 0
            indexKeyInsertionPenalty = 0
            worst_index_penalty = 0

            isRegex = self.state.__getIsOpRegex__(cache, op)

            try:
                opNodes = self.state.__getNodeIds__(cache, design, op)
            except Exception:
                # Best effort: count the failure and move on to the next op
                if self.debug:
                    LOG.warning("Failed to estimate touched nodes for op\n%s", pformat(op))
                self.err_ctr += 1
                continue

            for content in workload.getOpContents(op):
                for node_id in opNodes:
                    lru = self.buffers[node_id]
                    self.total_op_contents += 1

                    # The worst case for every (content, node) pair is a full scan
                    maxHits += cache.fullscan_pages
                    indexKeyInsertionPenalty += self.getIndexKeyInsertionPenalty(indexKeys, content)
                    worst_index_penalty += 1

                    # If slot size is too large, we consider it as a full page scan
                    if slot_size >= constants.SLOT_SIZE_LIMIT:
                        pageHits += cache.fullscan_pages
                        continue

                    # TODO: Need to handle whether it's a scan or an equality predicate
                    # TODO: We need to handle when we have a regex predicate. These are tricky
                    #       because they may use an index that will examine a subset of
                    #       collections and then execute a regex on just those documents.

                    # If we have a target index, hit that up
                    if indexKeys and not isRegex:  # FIXME
                        documentId = cache.index_docIds.get(op["query_id"], None)
                        if documentId is None:
                            values = catalog.getFieldValues(indexKeys, content)
                            try:
                                documentId = hash(values)
                            except Exception:
                                if self.debug:
                                    LOG.error(
                                        "Failed to compute index documentIds for op #%d - %s\n%s",
                                        op["query_id"],
                                        values,
                                        pformat(op),
                                    )
                                self.err_ctr += 1
                                # Give up on the remaining nodes for this content
                                break
                            if self.state.cache_enable:
                                if self.debug:
                                    self.state.cache_miss_ctr.put("index_docIds")
                                cache.index_docIds[op["query_id"]] = documentId
                        elif self.debug:
                            self.state.cache_hit_ctr.put("index_docIds")

                        hits = lru.getDocumentFromIndex(indexKeys, index_size)
                        pageHits += hits
                        if self.debug:
                            LOG.debug(
                                "Node #%02d: Estimated %d index scan pageHits for op #%d on %s.%s",
                                node_id,
                                hits,
                                op["query_id"],
                                col_name,
                                indexKeys,
                            )

                    # If we don't have an index, then we know that it's a full scan
                    # because the collections are unordered
                    if not indexKeys:
                        if self.debug:
                            LOG.debug(
                                "No index available for op #%d. Will have to do full scan on '%s'",
                                op["query_id"],
                                col_name,
                            )
                        pageHits += cache.fullscan_pages

                    # Otherwise, if it's not a covering index, then we need to hit up
                    # the collection to retrieve the whole document
                    elif not covering:
                        documentId = cache.collection_docIds.get(op["query_id"], None)
                        if documentId is None:
                            values = catalog.getAllValues(content)
                            try:
                                documentId = hash(values)
                            except Exception:
                                if self.debug:
                                    LOG.error(
                                        "Failed to compute collection documentIds for op #%d - %s\n%s",
                                        op["query_id"],
                                        values,
                                        pformat(op),
                                    )
                                self.err_ctr += 1
                                # Give up on the remaining nodes for this content
                                break
                            if self.state.cache_enable:
                                if self.debug:
                                    self.state.cache_miss_ctr.put("collection_docIds")
                                cache.collection_docIds[op["query_id"]] = documentId
                        elif self.debug:
                            self.state.cache_hit_ctr.put("collection_docIds")

                        hits = lru.getDocumentFromCollection(col_name, documentId, slot_size)
                        pageHits += hits
                        if self.debug:
                            LOG.debug(
                                "Node #%02d: Estimated %d collection scan pageHits for op #%d on %s",
                                node_id,
                                hits,
                                op["query_id"],
                                col_name,
                            )

                    # We have a covering index, which means that we don't have
                    # to do a look-up on the document in the collection.
                    # maxHits was already increased at the top of this iteration,
                    # so the final ratio is still counted correctly.
                    else:
                        assert op["type"] != constants.OP_TYPE_INSERT
                ## FOR (node)
            ## FOR (content)

            totalCost += pageHits
            totalWorst += maxHits
            total_index_penalty += indexKeyInsertionPenalty
            total_worst_index_penalty += worst_index_penalty

            if self.debug:
                LOG.debug(
                    "Op #%d on '%s' -> [pageHits:%d / worst:%d]",
                    op["query_id"],
                    col_name,
                    pageHits,
                    maxHits,
                )
            assert pageHits <= maxHits, "Estimated pageHits [%d] is greater than worst [%d] for op #%d\n%s" % (
                pageHits,
                maxHits,
                op["query_id"],
                pformat(op),
            )
        ## FOR (op)
    ## FOR (sess)

    self.total_index_insertion_penalty = total_index_penalty
    # Add index insertion penalty to the total cost
    if not self.no_index_insertion_penalty:
        totalCost += total_index_penalty
        totalWorst += total_worst_index_penalty

    # The final disk cost is the ratio of our estimated disk access cost divided
    # by the worst possible cost for this design. If we don't have a worst case,
    # then the cost is simply zero
    if self.debug:
        LOG.info("Total operation contents %s, errors %s", self.total_op_contents, self.err_ctr)
    assert totalCost <= totalWorst, "Estimated total pageHits [%d] is greater than worst case pageHits [%d]" % (
        totalCost,
        totalWorst,
    )
    final_cost = float(totalCost) / float(totalWorst) if totalWorst else 0
    evicted = sum(lru.evicted for lru in self.buffers)
    LOG.info(
        "Computed Disk Cost: %s [pageHits=%d / worstCase=%d / evicted=%d]",
        final_cost,
        totalCost,
        totalWorst,
        evicted,
    )
    return final_cost
def getCostImpl(self, design):
    """
    Estimate the disk cost for a design and a workload.

    Note: If this is being invoked with overallCost(), then the diskCost()
    should be calculated before skewCost() because we will reuse the same
    histogram of how often nodes are touched in the workload.

    Args:
        design: The candidate design. It is queried through hasCollection()
            and isRelaxed() and forwarded to the helper methods that pick
            an index and the touched nodes for each operation.

    Returns:
        The ratio of estimated page hits to worst-case page hits as a
        float, or 0 when the worst-case total is zero.

    Side effects:
        Updates self.total_index_insertion_penalty, self.total_op_contents
        and self.err_ctr. When self.state.cache_enable is set, it also
        stores the chosen index information and document ids in the
        per-collection cache handle.
    """
    # Ok strap on your helmet, this is the magical part of the whole thing!
    self.buildEmbeddingCostDictionary(design)

    # Outline:
    # + For each operation, we need to figure out what document(s) it's going
    #   to need to touch. From this we want to compute a unique hash signature
    #   for those documents so that we can identify what node those documents
    #   reside on and whether those documents are in our working set memory.
    #
    # + For each node, we are going to have a single LRU buffer that simulates
    #   the working set for all collections and indexes in the database.
    #   Document entries are going to be tagged based on whether they are
    #   part of an index or a collection.
    #
    # + Now when we iterate through each operation in our workload, we are
    #   going to need to first figure out what index (if any) it will need
    #   to use and how it will be used (i.e., equality look-up or range scan).
    #   We can then compute the hash for the look-up keys.
    #   If that key is in the LRU buffer, then we will update its entry's last
    #   accessed timestamp. If it's not, then we will increase the page hit
    #   counter and evict some other entry.
    #   After evaluating the target index, we will check whether the index
    #   covers the query. If it does, then we're done.
    #   If not, then we need to compute the hash for the "base" documents that
    #   it wants to access (i.e., in the collection). Then just as before, we
    #   will check whether it's in our buffer, make an eviction if not, and
    #   update our page hit counter.
    #   There are several additional corner cases that we need to handle:
    #      INSERT/UPDATE: Check whether it's an upsert query
    #      INSERT/UPDATE/DELETE: We assume that they're using a WAL and
    #                            therefore writing dirty pages is "free"
    #      UPDATE/DELETE: Check whether the "multi" flag is set to true, which
    #                     will tell us to stop the scan after the first
    #                     matching document is found.
    #
    # NOTE: We don't need to keep track of evicted tuples. It's either in the
    #       LRU buffer or not.
    # TODO: We may want to figure out how to estimate whether we are traversing
    #       indexes on the right-hand side of the tree. We could somehow
    #       preserve the sort order of the keys when we hash them...

    # Worst case is when every query requires a full collection scan
    # Best case, every query is satisfied by main memory
    totalWorst = 0
    totalCost = 0
    total_index_penalty = 0
    total_worst_index_penalty = 0

    for sess in self.state.workload:
        for op in sess["operations"]:
            # Bind the collection name once; the debug messages below used
            # to reference an undefined `col_name`.
            col_name = op["collection"]

            # Is the collection in the design - if not ignore
            if not design.hasCollection(col_name):
                if self.debug:
                    LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                continue
            # Relaxed collections are skipped as well (the log text is shared
            # with the branch above).
            if design.isRelaxed(col_name):
                if self.debug:
                    LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                continue
            col_info = self.state.collections[col_name]

            # Initialize cache if necessary
            # We will always want to do this regardless of whether caching is enabled
            cache = self.state.getCacheHandle(col_info)

            # Check whether we have a cached index selection based on query_hashes
            indexKeys, covering, index_size, slot_size = cache.best_index.get(
                op["query_hash"], (None, None, None, None)
            )
            if indexKeys is None:
                indexKeys, covering, index_size, slot_size = self.guess_op_info(design, op)
                if self.state.cache_enable:
                    if self.debug:
                        self.state.cache_miss_ctr.put("best_index")
                    cache.best_index[op["query_hash"]] = (indexKeys, covering, index_size, slot_size)
            elif self.debug:
                self.state.cache_hit_ctr.put("best_index")

            pageHits = 0
            maxHits = 0
            indexKeyInsertionPenalty = 0
            worst_index_penalty = 0

            isRegex = self.state.__getIsOpRegex__(cache, op)

            try:
                opNodes = self.state.__getNodeIds__(cache, design, op)
            except Exception:
                # Best effort: count the failure and move on to the next op
                if self.debug:
                    LOG.warning("Failed to estimate touched nodes for op\n%s", pformat(op))
                self.err_ctr += 1
                continue

            for content in workload.getOpContents(op):
                for node_id in opNodes:
                    lru = self.buffers[node_id]
                    self.total_op_contents += 1

                    # The worst case for every (content, node) pair is a full scan
                    maxHits += cache.fullscan_pages
                    indexKeyInsertionPenalty += self.getIndexKeyInsertionPenalty(indexKeys, content)
                    worst_index_penalty += 1

                    # If slot size is too large, we consider it as a full page scan
                    if slot_size >= constants.SLOT_SIZE_LIMIT:
                        pageHits += cache.fullscan_pages
                        continue

                    # TODO: Need to handle whether it's a scan or an equality predicate
                    # TODO: We need to handle when we have a regex predicate. These are tricky
                    #       because they may use an index that will examine a subset of
                    #       collections and then execute a regex on just those documents.

                    # If we have a target index, hit that up
                    if indexKeys and not isRegex:  # FIXME
                        documentId = cache.index_docIds.get(op["query_id"], None)
                        if documentId is None:
                            values = catalog.getFieldValues(indexKeys, content)
                            try:
                                documentId = hash(values)
                            except Exception:
                                if self.debug:
                                    LOG.error(
                                        "Failed to compute index documentIds for op #%d - %s\n%s",
                                        op["query_id"],
                                        values,
                                        pformat(op),
                                    )
                                self.err_ctr += 1
                                # Give up on the remaining nodes for this content
                                break
                            if self.state.cache_enable:
                                if self.debug:
                                    self.state.cache_miss_ctr.put("index_docIds")
                                cache.index_docIds[op["query_id"]] = documentId
                        elif self.debug:
                            self.state.cache_hit_ctr.put("index_docIds")

                        hits = lru.getDocumentFromIndex(indexKeys, index_size)
                        pageHits += hits
                        if self.debug:
                            LOG.debug(
                                "Node #%02d: Estimated %d index scan pageHits for op #%d on %s.%s",
                                node_id,
                                hits,
                                op["query_id"],
                                col_name,
                                indexKeys,
                            )

                    # If we don't have an index, then we know that it's a full scan
                    # because the collections are unordered
                    if not indexKeys:
                        if self.debug:
                            LOG.debug(
                                "No index available for op #%d. Will have to do full scan on '%s'",
                                op["query_id"],
                                col_name,
                            )
                        pageHits += cache.fullscan_pages

                    # Otherwise, if it's not a covering index, then we need to hit up
                    # the collection to retrieve the whole document
                    elif not covering:
                        documentId = cache.collection_docIds.get(op["query_id"], None)
                        if documentId is None:
                            values = catalog.getAllValues(content)
                            try:
                                documentId = hash(values)
                            except Exception:
                                if self.debug:
                                    LOG.error(
                                        "Failed to compute collection documentIds for op #%d - %s\n%s",
                                        op["query_id"],
                                        values,
                                        pformat(op),
                                    )
                                self.err_ctr += 1
                                # Give up on the remaining nodes for this content
                                break
                            if self.state.cache_enable:
                                if self.debug:
                                    self.state.cache_miss_ctr.put("collection_docIds")
                                cache.collection_docIds[op["query_id"]] = documentId
                        elif self.debug:
                            self.state.cache_hit_ctr.put("collection_docIds")

                        hits = lru.getDocumentFromCollection(col_name, documentId, slot_size)
                        pageHits += hits
                        if self.debug:
                            LOG.debug(
                                "Node #%02d: Estimated %d collection scan pageHits for op #%d on %s",
                                node_id,
                                hits,
                                op["query_id"],
                                col_name,
                            )

                    # We have a covering index, which means that we don't have
                    # to do a look-up on the document in the collection.
                    # maxHits was already increased at the top of this iteration,
                    # so the final ratio is still counted correctly.
                    else:
                        assert op["type"] != constants.OP_TYPE_INSERT
                ## FOR (node)
            ## FOR (content)

            totalCost += pageHits
            totalWorst += maxHits
            total_index_penalty += indexKeyInsertionPenalty
            total_worst_index_penalty += worst_index_penalty

            if self.debug:
                LOG.debug(
                    "Op #%d on '%s' -> [pageHits:%d / worst:%d]",
                    op["query_id"],
                    col_name,
                    pageHits,
                    maxHits,
                )
            assert pageHits <= maxHits, "Estimated pageHits [%d] is greater than worst [%d] for op #%d\n%s" % (
                pageHits,
                maxHits,
                op["query_id"],
                pformat(op),
            )
        ## FOR (op)
    ## FOR (sess)

    self.total_index_insertion_penalty = total_index_penalty
    # Add index insertion penalty to the total cost
    if not self.no_index_insertion_penalty:
        totalCost += total_index_penalty
        totalWorst += total_worst_index_penalty

    # The final disk cost is the ratio of our estimated disk access cost divided
    # by the worst possible cost for this design. If we don't have a worst case,
    # then the cost is simply zero
    if self.debug:
        LOG.info("Total operation contents %s, errors %s", self.total_op_contents, self.err_ctr)
    assert totalCost <= totalWorst, "Estimated total pageHits [%d] is greater than worst case pageHits [%d]" % (
        totalCost,
        totalWorst,
    )
    final_cost = float(totalCost) / float(totalWorst) if totalWorst else 0
    evicted = sum(lru.evicted for lru in self.buffers)
    LOG.info(
        "Computed Disk Cost: %s [pageHits=%d / worstCase=%d / evicted=%d]",
        final_cost,
        totalCost,
        totalWorst,
        evicted,
    )
    return final_cost