Exemplo n.º 1
    def getCostImpl(self, design, num_nodes=None):
        """
            Estimate the Disk Cost for a design and a workload.

            Every operation of every session in self.state.workload is
            replayed against the per-node LRU buffers in self.buffers. The
            returned value is the estimated number of page hits divided by
            the worst case (a full collection scan for every touched node),
            or 0 when the worst case is zero.

            Note: If this is being invoked with overallCost(), then the diskCost()
            should be calculated before skewCost() because we will reuse the same
            histogram of how often nodes are touched in the workload

            design    -- the design being evaluated
            num_nodes -- accepted for interface compatibility; not used here

            Side effects: updates self.total_op_contents, self.err_ctr and
            self.total_index_insertion_penalty, and may populate the
            per-collection cache handles.
        """
        # Outline:
        # + For each operation, we need to figure out what document(s) it's going
        #   to need to touch. From this we want to compute a unique hash signature
        #   for those document so that we can identify what node those documents
        #   reside on and whether those documents are in our working set memory.
        # + For each node, we are going to have a single LRU buffer that simulates
        #   the working set for all collections and indexes in the database.
        #   Documents entries are going to be tagged based on whether they are
        #   part of an index or a collection.
        # + Now when we iterate through each operation in our workload, we are
        #   going to need to first figure out what index (if any) it will need
        #   to use and how it will be used (i.e., equality look-up or range scan).
        #   We can then compute the hash for the look-up keys.
        #   If that key is in the LRU buffer, then we will update its entry's last
        #   accessed timestamp. If it's not, then we will increase the page hit
        #   counter and evict some other entry.
        #   After evaluating the target index, we will check whether the index
        #   covers the query. If it does, then we're done
        #   If not, then we need to compute hash for the "base" documents that it
        #   wants to access (i.e., in the collection). Then just as before, we
        #   will check whether its in our buffer, make an eviction if not, and
        #   update our page hit counter.
        #   There are several additional corner cases that we need to handle:
        #      INSERT/UPDATE: Check whether it's an upsert query
        #      INSERT/UPDATE/DELETE: We assume that they're using a WAL and therefore
        #                            writing dirty pages is "free"
        #      UPDATE/DELETE: Check whether the "multi" flag is set to true, which will
        #                     tell us to stop the scan after the first matching document
        #                     is found.
        # NOTE: We don't need to keep track of evicted tuples. It's either in the LRU buffer or not.
        # TODO: We may want to figure out how to estimate whether we are traversing
        #       indexes on the right-hand side of the tree. We could somehow preserve
        #       the sort order of the keys when we hash them...

        # Worst case is when every query requires a full collection scan
        # Best case, every query is satisfied by main memory
        totalWorst = 0
        totalCost = 0
        total_index_penalty = 0
        total_worst_index_penalty = 0

        for sess in self.state.workload:
            for op in sess["operations"]:
                col_name = op["collection"]

                # Operations on collections that are not part of the design,
                # or that have been relaxed, do not contribute to the cost
                if not design.hasCollection(col_name):
                    if self.debug:
                        LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                    continue
                if design.isRelaxed(col_name):
                    if self.debug:
                        LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                    continue
                col_info = self.state.collections[col_name]

                # Initialize cache if necessary
                # We will always want to do this regardless of whether caching is enabled
                cache = self.state.getCacheHandle(col_info)

                # Check whether we have a cached index selection based on query_hashes
                indexKeys, covering, index_size, slot_size = cache.best_index.get(
                    op["query_hash"], (None, None, None, None)
                )
                if indexKeys is None:
                    indexKeys, covering, index_size, slot_size = self.guess_op_info(design, op)
                    if self.state.cache_enable:
                        if self.debug:
                            self.state.cache_miss_ctr.put("best_index")
                        cache.best_index[op["query_hash"]] = (indexKeys, covering, index_size, slot_size)
                # NOTE(review): nothing is recorded on a cache hit. If the state
                # object keeps a hit counter next to cache_miss_ctr, update it
                # here when self.debug is set.

                pageHits = 0
                maxHits = 0
                indexKeyInsertionPenalty = 0
                worst_index_penalty = 0

                isRegex = self.state.__getIsOpRegex__(cache, op)

                # An operation whose target nodes cannot be estimated is
                # counted as an error and left out of the cost
                try:
                    opNodes = self.state.__getNodeIds__(cache, design, op)
                except Exception:
                    if self.debug:
                        LOG.warn("Failed to estimate touched nodes for op\n%s" % pformat(op))
                    self.err_ctr += 1
                    continue

                for content in workload.getOpContents(op):
                    for node_id in opNodes:
                        lru = self.buffers[node_id]
                        self.total_op_contents += 1
                        maxHits += cache.fullscan_pages

                        indexKeyInsertionPenalty += self.getIndexKeyInsertionPenalty(indexKeys, content)
                        worst_index_penalty += 1

                        # If slot size is too large, we consider it as a full page scan.
                        # Nothing else may be charged for this node, otherwise
                        # pageHits would exceed maxHits.
                        if slot_size >= constants.SLOT_SIZE_LIMIT:
                            pageHits += cache.fullscan_pages
                            continue

                        # TODO: Need to handle whether it's a scan or an equality predicate
                        # TODO: We need to handle when we have a regex predicate. These are tricky
                        #       because they may use an index that will examine all or a subset of
                        #       collections and then execute a regex on just those documents.

                        # If we have a target index, hit that up
                        if indexKeys and not isRegex:  # FIXME
                            documentId = cache.index_docIds.get(op["query_id"], None)
                            if documentId is None:
                                values = catalog.getFieldValues(indexKeys, content)
                                try:
                                    documentId = hash(values)
                                except TypeError:
                                    if self.debug:
                                        LOG.error(
                                            "Failed to compute index documentIds for op #%d - %s\n%s",
                                            op["query_id"], values, pformat(op)
                                        )
                                    self.err_ctr += 1
                                    # The values do not depend on the node, so the
                                    # remaining nodes would fail the same way
                                    break

                                if self.state.cache_enable:
                                    if self.debug:
                                        self.state.cache_miss_ctr.put("index_docIds")
                                    cache.index_docIds[op["query_id"]] = documentId

                            hits = lru.getDocumentFromIndex(indexKeys, index_size)
                            pageHits += hits
                            if self.debug:
                                LOG.debug(
                                    "Node #%02d: Estimated %d index scan pageHits for op #%d on %s.%s",
                                    node_id, hits, op["query_id"], op["collection"], indexKeys
                                )

                        # If we don't have an index, then we know that it's a full scan because the
                        # collections are unordered
                        if not indexKeys:
                            if self.debug:
                                LOG.debug(
                                    "No index available for op #%d. Will have to do full scan on '%s'",
                                    op["query_id"], op["collection"]
                                )
                            pageHits += cache.fullscan_pages
                        # Otherwise, if it's not a covering index, then we need to hit up
                        # the collection to retrieve the whole document
                        elif not covering:
                            documentId = cache.collection_docIds.get(op["query_id"], None)
                            if documentId is None:
                                values = catalog.getAllValues(content)
                                try:
                                    documentId = hash(values)
                                except TypeError:
                                    if self.debug:
                                        LOG.error(
                                            "Failed to compute collection documentIds for op #%d - %s\n%s",
                                            op["query_id"], values, pformat(op)
                                        )
                                    self.err_ctr += 1
                                    break

                                if self.state.cache_enable:
                                    if self.debug:
                                        self.state.cache_miss_ctr.put("collection_docIds")
                                    cache.collection_docIds[op["query_id"]] = documentId

                            hits = lru.getDocumentFromCollection(op["collection"], documentId, slot_size)
                            pageHits += hits
                            if self.debug:
                                LOG.debug(
                                    "Node #%02d: Estimated %d collection scan pageHits for op #%d on %s",
                                    node_id, hits, op["query_id"], op["collection"]
                                )
                        # We have a covering index, which means that we don't have
                        # to do a look-up on the document in the collection.
                        else:
                            assert op["type"] != constants.OP_TYPE_INSERT
                    ## FOR (node)
                ## FOR (content)
                totalCost += pageHits
                totalWorst += maxHits
                total_index_penalty += indexKeyInsertionPenalty
                total_worst_index_penalty += worst_index_penalty

                if self.debug:
                    LOG.debug(
                        "Op #%d on '%s' -> [pageHits:%d / worst:%d]",
                        op["query_id"], op["collection"], pageHits, maxHits
                    )
                assert pageHits <= maxHits, "Estimated pageHits [%d] is greater than worst [%d] for op #%d\n%s" % (
                    pageHits, maxHits, op["query_id"], pformat(op)
                )
            ## FOR (op)
        ## FOR (sess)

        self.total_index_insertion_penalty = total_index_penalty

        # Add index insertion penalty to the total cost
        if not self.no_index_insertion_penalty:
            totalCost += total_index_penalty
            totalWorst += total_worst_index_penalty
        ## IF

        # The final disk cost is the ratio of our estimated disk access cost divided
        # by the worst possible cost for this design. If we don't have a worst case,
        # then the cost is simply zero
        if self.debug:
            LOG.info("Total operation contents %s, errors %s", self.total_op_contents, self.err_ctr)
        assert totalCost <= totalWorst, "Estimated total pageHits [%d] is greater than worst case pageHits [%d]" % (
            totalCost, totalWorst
        )
        final_cost = float(totalCost) / float(totalWorst) if totalWorst else 0
        evicted = sum(lru.evicted for lru in self.buffers)
        LOG.info(
            "Computed Disk Cost: %s [pageHits=%d / worstCase=%d / evicted=%d]",
            final_cost, totalCost, totalWorst, evicted
        )
        return final_cost
Exemplo n.º 2
    def getCostImpl(self, design):
        """
            Estimate the Disk Cost for a design and a workload.

            Every operation of every session in self.state.workload is
            replayed against the per-node LRU buffers in self.buffers. The
            returned value is the estimated number of page hits divided by
            the worst case (a full collection scan for every touched node),
            or 0 when the worst case is zero.

            Note: If this is being invoked with overallCost(), then the diskCost()
            should be calculated before skewCost() because we will reuse the same
            histogram of how often nodes are touched in the workload

            design -- the design being evaluated

            Side effects: updates self.total_op_contents, self.err_ctr and
            self.total_index_insertion_penalty, and may populate the
            per-collection cache handles.
        """
        # Outline:
        # + For each operation, we need to figure out what document(s) it's going
        #   to need to touch. From this we want to compute a unique hash signature
        #   for those document so that we can identify what node those documents
        #   reside on and whether those documents are in our working set memory.
        # + For each node, we are going to have a single LRU buffer that simulates
        #   the working set for all collections and indexes in the database.
        #   Documents entries are going to be tagged based on whether they are
        #   part of an index or a collection.
        # + Now when we iterate through each operation in our workload, we are
        #   going to need to first figure out what index (if any) it will need
        #   to use and how it will be used (i.e., equality look-up or range scan).
        #   We can then compute the hash for the look-up keys.
        #   If that key is in the LRU buffer, then we will update its entry's last
        #   accessed timestamp. If it's not, then we will increase the page hit
        #   counter and evict some other entry.
        #   After evaluating the target index, we will check whether the index
        #   covers the query. If it does, then we're done
        #   If not, then we need to compute hash for the "base" documents that it
        #   wants to access (i.e., in the collection). Then just as before, we
        #   will check whether its in our buffer, make an eviction if not, and
        #   update our page hit counter.
        #   There are several additional corner cases that we need to handle:
        #      INSERT/UPDATE: Check whether it's an upsert query
        #      INSERT/UPDATE/DELETE: We assume that they're using a WAL and therefore
        #                            writing dirty pages is "free"
        #      UPDATE/DELETE: Check whether the "multi" flag is set to true, which will
        #                     tell us to stop the scan after the first matching document
        #                     is found.
        # NOTE: We don't need to keep track of evicted tuples. It's either in the LRU buffer or not.
        # TODO: We may want to figure out how to estimate whether we are traversing
        #       indexes on the right-hand side of the tree. We could somehow preserve
        #       the sort order of the keys when we hash them...

        # Worst case is when every query requires a full collection scan
        # Best case, every query is satisfied by main memory
        totalWorst = 0
        totalCost = 0
        total_index_penalty = 0
        total_worst_index_penalty = 0
        for sess in self.state.workload:
            for op in sess['operations']:
                col_name = op['collection']

                # Operations on collections that are not part of the design,
                # or that have been relaxed, do not contribute to the cost
                if not design.hasCollection(col_name):
                    if self.debug: LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                    continue
                if design.isRelaxed(col_name):
                    if self.debug: LOG.debug("NOT in design: SKIP - All operations on %s", col_name)
                    continue
                col_info = self.state.collections[col_name]

                # Initialize cache if necessary
                # We will always want to do this regardless of whether caching is enabled
                cache = self.state.getCacheHandle(col_info)

                # Check whether we have a cached index selection based on query_hashes
                indexKeys, covering, index_size, slot_size = cache.best_index.get(op["query_hash"], (None, None, None, None))
                if indexKeys is None:
                    indexKeys, covering, index_size, slot_size = self.guess_op_info(design, op)
                    if self.state.cache_enable:
                        if self.debug: self.state.cache_miss_ctr.put("best_index")
                        cache.best_index[op["query_hash"]] = (indexKeys, covering, index_size, slot_size)
                # NOTE(review): nothing is recorded on a cache hit. If the state
                # object keeps a hit counter next to cache_miss_ctr, update it
                # here when self.debug is set.

                pageHits = 0
                maxHits = 0
                indexKeyInsertionPenalty = 0
                worst_index_penalty = 0
                isRegex = self.state.__getIsOpRegex__(cache, op)

                # An operation whose target nodes cannot be estimated is
                # counted as an error and left out of the cost
                try:
                    opNodes = self.state.__getNodeIds__(cache, design, op)
                except Exception:
                    if self.debug:
                        LOG.warn("Failed to estimate touched nodes for op\n%s" % pformat(op))
                    self.err_ctr += 1
                    continue

                for content in workload.getOpContents(op):
                    for node_id in opNodes:
                        lru = self.buffers[node_id]
                        self.total_op_contents += 1
                        maxHits += cache.fullscan_pages
                        indexKeyInsertionPenalty += self.getIndexKeyInsertionPenalty(indexKeys, content)
                        worst_index_penalty += 1

                        # If slot size is too large, we consider it as a full page scan.
                        # Nothing else may be charged for this node, otherwise
                        # pageHits would exceed maxHits.
                        if slot_size >= constants.SLOT_SIZE_LIMIT:
                            pageHits += cache.fullscan_pages
                            continue

                        # TODO: Need to handle whether it's a scan or an equality predicate
                        # TODO: We need to handle when we have a regex predicate. These are tricky
                        #       because they may use an index that will examine all or a subset of
                        #       collections and then execute a regex on just those documents.

                        # If we have a target index, hit that up
                        if indexKeys and not isRegex: # FIXME
                            documentId = cache.index_docIds.get(op['query_id'], None)
                            if documentId is None:
                                values = catalog.getFieldValues(indexKeys, content)
                                try:
                                    documentId = hash(values)
                                except TypeError:
                                    if self.debug: LOG.error("Failed to compute index documentIds for op #%d - %s\n%s",\
                                        op['query_id'], values, pformat(op))
                                    self.err_ctr += 1
                                    # The values do not depend on the node, so the
                                    # remaining nodes would fail the same way
                                    break
                                if self.state.cache_enable:
                                    if self.debug: self.state.cache_miss_ctr.put("index_docIds")
                                    cache.index_docIds[op['query_id']] = documentId
                            hits = lru.getDocumentFromIndex(indexKeys, index_size)
                            pageHits += hits
                            if self.debug:
                                LOG.debug("Node #%02d: Estimated %d index scan pageHits for op #%d on %s.%s",\
                                    node_id, hits, op["query_id"], op["collection"], indexKeys)

                        # If we don't have an index, then we know that it's a full scan because the
                        # collections are unordered
                        if not indexKeys:
                            if self.debug:
                                LOG.debug("No index available for op #%d. Will have to do full scan on '%s'",\
                                    op["query_id"], op["collection"])
                            pageHits += cache.fullscan_pages
                        # Otherwise, if it's not a covering index, then we need to hit up
                        # the collection to retrieve the whole document
                        elif not covering:
                            documentId = cache.collection_docIds.get(op['query_id'], None)
                            if documentId is None:
                                values = catalog.getAllValues(content)
                                try:
                                    documentId = hash(values)
                                except TypeError:
                                    if self.debug: LOG.error("Failed to compute collection documentIds for op #%d - %s\n%s",\
                                        op['query_id'], values, pformat(op))
                                    self.err_ctr += 1
                                    break
                                if self.state.cache_enable:
                                    if self.debug: self.state.cache_miss_ctr.put("collection_docIds")
                                    cache.collection_docIds[op['query_id']] = documentId
                            hits = lru.getDocumentFromCollection(op['collection'], documentId, slot_size)
                            pageHits += hits
                            if self.debug:
                                LOG.debug("Node #%02d: Estimated %d collection scan pageHits for op #%d on %s",\
                                    node_id, hits, op["query_id"], op["collection"])
                        # We have a covering index, which means that we don't have
                        # to do a look-up on the document in the collection.
                        else:
                            assert op['type'] != constants.OP_TYPE_INSERT
                    ## FOR (node)
                ## FOR (content)
                totalCost += pageHits
                totalWorst += maxHits
                total_index_penalty += indexKeyInsertionPenalty
                total_worst_index_penalty += worst_index_penalty
                if self.debug:
                    LOG.debug("Op #%d on '%s' -> [pageHits:%d / worst:%d]",\
                        op["query_id"], op["collection"], pageHits, maxHits)
                assert pageHits <= maxHits,\
                    "Estimated pageHits [%d] is greater than worst [%d] for op #%d\n%s" %\
                    (pageHits, maxHits, op["query_id"], pformat(op))
            ## FOR (op)
        ## FOR (sess)

        self.total_index_insertion_penalty = total_index_penalty
        # Add index insertion penalty to the total cost
        if not self.no_index_insertion_penalty:
            totalCost += total_index_penalty
            totalWorst += total_worst_index_penalty
        ## IF
        # The final disk cost is the ratio of our estimated disk access cost divided
        # by the worst possible cost for this design. If we don't have a worst case,
        # then the cost is simply zero
        if self.debug: LOG.info("Total operation contents %s, errors %s", self.total_op_contents, self.err_ctr)
        assert totalCost <= totalWorst,\
            "Estimated total pageHits [%d] is greater than worst case pageHits [%d]" % (totalCost, totalWorst)
        final_cost = float(totalCost) / float(totalWorst) if totalWorst else 0
        evicted = sum(lru.evicted for lru in self.buffers)
        LOG.info("Computed Disk Cost: %s [pageHits=%d / worstCase=%d / evicted=%d]",\
                 final_cost, totalCost, totalWorst, evicted)
        return final_cost
Exemplo n.º 3
    def estimateNodes(self, design, op):
        """
            For the given operation and a design object,
            return an estimate of the set of node ids that we think that
            the query will be executed on.

            design -- the design that supplies the sharding keys
            op     -- the operation; its 'collection', 'type', 'predicates'
                      and 'query_id' entries are read

            Raises Exception when a predicate on the sharding key has a type
            other than regex, range or equality.

            Side effects: every returned node id is recorded in
            self.nodeCounts and self.op_count is incremented.
        """
        results = set()
        broadcast = True
        shardingKeys = design.getShardKeys(op['collection'])

        if self.debug:
            LOG.debug("Computing node estimate for Op #%d [sharding=%s]", \
                      op['query_id'], shardingKeys)

        # Inserts always go to a single node
        if op['type'] == constants.OP_TYPE_INSERT:
            # Get the documents that they're trying to insert and then
            # compute their hashes based on the sharding key
            # Because there is no logical replication, each document will
            # be inserted in one and only one node
            for content in workload.getOpContents(op):
                values = catalog.getFieldValues(shardingKeys, content)
                results.add(self.computeTouchedNode(values))
            ## FOR
            broadcast = False

        # Network costs of SELECT, UPDATE, DELETE queries are based off
        # of using the sharding key in the predicate
        elif op['predicates']:
            # Collect the predicate types that are applied to sharding keys.
            # The query only avoids a broadcast if at least one exists.
            predicate_types = set()
            for k, v in op['predicates'].iteritems():
                if design.inShardKeyPattern(op['collection'], k):
                    broadcast = False
                    predicate_types.add(v)
            if self.debug:
                LOG.debug("Op #%d %s Predicates: %s [broadcast=%s / predicateTypes=%s]",\
                          op['query_id'], op['collection'], op['predicates'], broadcast, list(predicate_types))

            if not broadcast:
                ## ----------------------------------------------
                ## PRED_TYPE_REGEX
                ## ----------------------------------------------
                if constants.PRED_TYPE_REGEX in predicate_types:
                    # Any query that is using a regex on the sharding key must be broadcast to every node
                    # It's not completely accurate but it's just easier that way
                    broadcast = True

                ## ----------------------------------------------
                ## PRED_TYPE_RANGE
                ## ----------------------------------------------
                elif constants.PRED_TYPE_RANGE in predicate_types:
                    # If it's a scan, then we need to first figure out what
                    # node they will start the scan at, and then just approximate
                    # what it will do by adding N nodes to the touched list starting
                    # from that first node. We will wrap around to zero
                    # NOTE(review): `k` is whichever predicate key the loop above
                    # visited last, which is not necessarily a sharding key --
                    # confirm that this is the field guessNodes() should see.
                    num_touched = self.guessNodes(design, op['collection'], k)
                    if self.debug:
                        LOG.info("Estimating that Op #%d on '%s' touches %d nodes",\
                                 op["query_id"], op["collection"], num_touched)
                    for content in workload.getOpContents(op):
                        values = catalog.getFieldValues(shardingKeys, content)
                        if self.debug: LOG.debug("%s -> %s", shardingKeys, values)
                        try:
                            node_id = self.computeTouchedNode(values)
                        except Exception:
                            if self.debug:
                                LOG.error(
                                    "Unexpected error when computing touched nodes\n%s"
                                    % pformat(values))
                            # Without a starting node there is nothing to add
                            raise
                        for i in xrange(num_touched):
                            if node_id >= self.num_nodes: node_id = 0
                            results.add(node_id)
                            node_id += 1
                        ## FOR
                    ## FOR
                ## ----------------------------------------------
                ## PRED_TYPE_EQUALITY
                ## ----------------------------------------------
                elif constants.PRED_TYPE_EQUALITY in predicate_types:
                    for content in workload.getOpContents(op):
                        values = catalog.getFieldValues(shardingKeys, content)
                        results.add(self.computeTouchedNode(values))
                    ## FOR
                ## ----------------------------------------------
                ## BUSTED!
                ## ----------------------------------------------
                else:
                    raise Exception("Unexpected predicate types '%s' for op #%d" %
                                    (list(predicate_types), op['query_id']))
        ## IF

        if broadcast:
            if self.debug:
                LOG.debug("Op #%d on '%s' is a broadcast query to all nodes",\
                          op["query_id"], op["collection"])
            # set.update() instead of map(): the latter only has an effect
            # when map() is evaluated eagerly
            results.update(xrange(self.num_nodes))

        for node_id in results:
            self.nodeCounts.put(node_id)
        self.op_count += 1
        return results
Exemplo n.º 4
    def estimateNodes(self, design, op):
        """
            For the given operation and a design object,
            return an estimate of the set of node ids that we think that
            the query will be executed on.

            design -- the design that supplies the sharding keys
            op     -- the operation; its 'collection', 'type', 'predicates'
                      and 'query_id' entries are read

            Raises Exception when a predicate on the sharding key has a type
            other than regex, range or equality.

            Side effects: every returned node id is recorded in
            self.nodeCounts and self.op_count is incremented.
        """
        results = set()
        broadcast = True
        shardingKeys = design.getShardKeys(op['collection'])

        if self.debug:
            LOG.debug("Computing node estimate for Op #%d [sharding=%s]", \
                      op['query_id'], shardingKeys)

        # Inserts always go to a single node
        if op['type'] == constants.OP_TYPE_INSERT:
            # Get the documents that they're trying to insert and then
            # compute their hashes based on the sharding key
            # Because there is no logical replication, each document will
            # be inserted in one and only one node
            for content in workload.getOpContents(op):
                values = catalog.getFieldValues(shardingKeys, content)
                results.add(self.computeTouchedNode(values))
            ## FOR
            broadcast = False

        # Network costs of SELECT, UPDATE, DELETE queries are based off
        # of using the sharding key in the predicate
        elif op['predicates']:
            # Collect the predicate types that are applied to sharding keys.
            # The query only avoids a broadcast if at least one exists.
            predicate_types = set()
            for k,v in op['predicates'].iteritems():
                if design.inShardKeyPattern(op['collection'], k):
                    broadcast = False
                    predicate_types.add(v)
            if self.debug:
                LOG.debug("Op #%d %s Predicates: %s [broadcast=%s / predicateTypes=%s]",\
                          op['query_id'], op['collection'], op['predicates'], broadcast, list(predicate_types))

            if not broadcast:
                ## ----------------------------------------------
                ## PRED_TYPE_REGEX
                ## ----------------------------------------------
                if constants.PRED_TYPE_REGEX in predicate_types:
                    # Any query that is using a regex on the sharding key must be broadcast to every node
                    # It's not completely accurate but it's just easier that way
                    broadcast = True

                ## ----------------------------------------------
                ## PRED_TYPE_RANGE
                ## ----------------------------------------------
                elif constants.PRED_TYPE_RANGE in predicate_types:
                    # If it's a scan, then we need to first figure out what
                    # node they will start the scan at, and then just approximate
                    # what it will do by adding N nodes to the touched list starting
                    # from that first node. We will wrap around to zero
                    # NOTE(review): `k` is whichever predicate key the loop above
                    # visited last, which is not necessarily a sharding key --
                    # confirm that this is the field guessNodes() should see.
                    num_touched = self.guessNodes(design, op['collection'], k)
                    if self.debug:
                        LOG.info("Estimating that Op #%d on '%s' touches %d nodes",\
                                 op["query_id"], op["collection"], num_touched)
                    for content in workload.getOpContents(op):
                        values = catalog.getFieldValues(shardingKeys, content)
                        if self.debug: LOG.debug("%s -> %s", shardingKeys, values)
                        try:
                            node_id = self.computeTouchedNode(values)
                        except Exception:
                            if self.debug:
                                LOG.error("Unexpected error when computing touched nodes\n%s" % pformat(values))
                            # Without a starting node there is nothing to add
                            raise
                        for i in xrange(num_touched):
                            if node_id >= self.num_nodes: node_id = 0
                            results.add(node_id)
                            node_id += 1
                        ## FOR
                    ## FOR
                ## ----------------------------------------------
                ## PRED_TYPE_EQUALITY
                ## ----------------------------------------------
                elif constants.PRED_TYPE_EQUALITY in predicate_types:
                    for content in workload.getOpContents(op):
                        values = catalog.getFieldValues(shardingKeys, content)
                        results.add(self.computeTouchedNode(values))
                    ## FOR
                ## ----------------------------------------------
                ## BUSTED!
                ## ----------------------------------------------
                else:
                    raise Exception("Unexpected predicate types '%s' for op #%d" % (list(predicate_types), op['query_id']))
        ## IF

        if broadcast:
            if self.debug: LOG.debug("Op #%d on '%s' is a broadcast query to all nodes",\
                                     op["query_id"], op["collection"])
            # set.update() instead of map(): the latter only has an effect
            # when map() is evaluated eagerly
            results.update(xrange(self.num_nodes))

        for node_id in results:
            self.nodeCounts.put(node_id)
        self.op_count += 1
        return results
Exemplo n.º 5
    def estimateNodes(self, design, op, num_nodes=None):
        """
            For the given operation and a design object,
            return an estimate of the set of node ids that we think
            the query will be executed on.

            design    -- design object; queried for the collection's sharding
                         keys and for whether a field is part of the shard
                         key pattern
            op        -- operation dict; this method reads its 'collection',
                         'query_id', 'type' and 'predicates' entries
            num_nodes -- forwarded unchanged to computeTouchedNode() and
                         colNumNodes() (NOTE(review): presumably an override
                         of the cluster size -- confirm against those helpers)

            Returns the set of touched node ids. As a side effect every
            touched node id is recorded in self.nodeCounts and self.op_count
            is incremented by one.

            Raises Exception if the predicates cover all of the sharding keys
            but use none of the known predicate types.
        """
        results = set()
        broadcast = True
        shardingKeys = design.getShardKeys(op['collection'])

        if self.debug:
            LOG.debug("Computing node estimate for Op #%d [sharding=%s]",
                      op['query_id'], shardingKeys)

        # If there are no sharding keys
        # All requests on this collection will be routed to the primary node
        # We assume the node 0 is the primary node
        if not shardingKeys:
            broadcast = False
            results.add(0)

        # Inserts always go to a single node
        elif op['type'] == constants.OP_TYPE_INSERT:
            # Get the documents that they're trying to insert and then
            # compute their hashes based on the sharding key
            # Because there is no logical replication, each document will
            # be inserted in one and only one node
            for content in workload.getOpContents(op):
                values = catalog.getFieldValues(shardingKeys, content)
                results.add(self.computeTouchedNode(op['collection'], shardingKeys, values, num_nodes))
            ## FOR
            broadcast = False

        # Network costs of SELECT, UPDATE, DELETE queries are based off
        # of using the sharding key in the predicate
        elif len(op['predicates']) > 0:
            # Collect the predicate fields that are part of the shard key
            # pattern, along with the value stored for each of them (the
            # checks below treat those values as predicate types)
            predicate_fields = set()
            predicate_types = set()
            for k, v in op['predicates'].items():
                if design.inShardKeyPattern(op['collection'], k):
                    predicate_fields.add(k)
                    predicate_types.add(v)
            ## FOR

            # The query can only be routed if every sharding key is covered
            if len(predicate_fields) == len(shardingKeys):
                broadcast = False
            if self.debug:
                LOG.debug("Op #%d %s Predicates: %s [broadcast=%s / predicateTypes=%s]",
                          op['query_id'], op['collection'], op['predicates'], broadcast, list(predicate_types))

            if not broadcast:
                ## ----------------------------------------------
                ## PRED_TYPE_REGEX
                ## ----------------------------------------------
                if constants.PRED_TYPE_REGEX in predicate_types:
                    # Any query that is using a regex on the sharding key must be broadcast to every node
                    # It's not completely accurate, but it's just easier that way
                    broadcast = True

                ## ----------------------------------------------
                ## PRED_TYPE_RANGE
                ## ----------------------------------------------
                elif constants.PRED_TYPE_RANGE in predicate_types:
                    # Range scans are treated as touching every node
                    broadcast = True

                ## ----------------------------------------------
                ## PRED_TYPE_EQUALITY
                ## ----------------------------------------------
                elif constants.PRED_TYPE_EQUALITY in predicate_types:
                    for content in workload.getOpContents(op):
                        values = catalog.getFieldValues(shardingKeys, content)
                        results.add(self.computeTouchedNode(op['collection'], shardingKeys, values, num_nodes))
                    ## FOR

                ## ----------------------------------------------
                ## BUSTED!
                ## ----------------------------------------------
                else:
                    raise Exception("Unexpected predicate types '%s' for op #%d" % (list(predicate_types), op['query_id']))
            ## IF
        ## IF

        if broadcast:
            if self.debug:
                LOG.debug("Op #%d on '%s' is a broadcast query to all nodes",
                          op["query_id"], op["collection"])
            # set.update() instead of map(): map() is lazy on Python 3 and
            # would silently add nothing
            results.update(range(0, self.colNumNodes(num_nodes, op["collection"])))

        # Explicit loop for the same reason: the put() calls must really run
        for node_id in results:
            self.nodeCounts.put(node_id)
        self.op_count += 1
        return results