def processOpFields(self, fields, op, content):
    """Record how every key of *content* is used by the operation *op*.

    For each key in ``content`` that is not a flag marker (i.e. does not
    start with ``constants.REPLACE_KEY_DOLLAR_PREFIX``):

    * a field entry is created in ``fields`` if the key is not there yet,
      and its ``'query_use_count'`` is incremented;
    * unless ``op`` is an insert, ``op['predicates'][key]`` is set to the
      regex / range / equality predicate type. Regex and range overwrite any
      existing entry; equality is only recorded when the key has no
      predicate yet.

    Both ``fields`` and ``op['predicates']`` are modified in place.
    Returns None.
    """
    if self.debug:
        LOG.debug("Processing operation fields\n%s", pformat(content))

    # items() rather than iteritems(): the latter only exists on Python 2.
    for key, value in content.items():
        # Skip anything that starts with our special char
        # Those are flag markers used by MongoDB's queries
        if key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
            continue

        # We need to add the field to the collection if it doesn't
        # already exist. This will occur if this op was an aggregate,
        # which we ignore when recreating the schema
        if key not in fields:
            fields[key] = catalog.Collection.fieldFactory(
                key, catalog.fieldTypeToString(type(value)))
        fields[key]['query_use_count'] += 1

        # No predicate for insert operations
        # No projections for insert operations
        if op['type'] != constants.OP_TYPE_INSERT:
            # Update how this key was used with predicates
            if workload.isOpRegex(op, field=key):
                op['predicates'][key] = constants.PRED_TYPE_REGEX
            elif isinstance(value, dict):
                op['predicates'][key] = constants.PRED_TYPE_RANGE
            elif key not in op['predicates']:
                op['predicates'][key] = constants.PRED_TYPE_EQUALITY

        ## TODO: Should we expect there to be field names with dot notation here?
        ##       Or should have all been cleaned out by the converters?
    ## FOR
def testIsOpRegex(self):
    """workload.isOpRegex() must return a true value for an operation
    whose query content holds a '#regex' clause on '_id'."""
    # Integer literals carry no 'L' suffix: the suffix is a syntax error on
    # Python 3 and is not needed on Python 2, where ints promote to long
    # automatically.
    op = {
        'collection': 'blah',
        'predicates': {'_id': constants.PRED_TYPE_REGEX},
        'query_aggregate': True,
        'query_content': [
            {
                '#query': {'_id': {'#options': 'XXXXXXX', '#regex': 'YYYYY'}},
                'count': 'site.songs',
                'fields': None,
            },
        ],
        'query_group': None,
        'query_hash': 3563430808431869716,
        'query_id': 579750519,
        'query_limit': -1,
        'query_offset': 0,
        'query_size': 125,
        'query_time': 1338410992.894204,
        'resp_content': [{'n': 16, 'ok': 1}],
        'resp_id': 108641633,
        'resp_size': 64,
        'resp_time': 1338410992.911907,
        'type': constants.OP_TYPE_QUERY,
        'update_multi': None,
        'update_upsert': None,
    }
    ret = workload.isOpRegex(op)
    self.assertTrue(ret)
def processOpFields(self, fields, op, content):
    """Update field usage counters and predicate types from *content*.

    Every key of ``content`` that does not start with
    ``constants.REPLACE_KEY_DOLLAR_PREFIX`` is processed as follows:

    * if the key is missing from ``fields`` a new entry is built with
      ``catalog.Collection.fieldFactory``; the entry's
      ``'query_use_count'`` is then incremented;
    * when ``op['type']`` is not an insert, ``op['predicates'][key]`` is set
      to the regex type (if ``workload.isOpRegex`` says so), else to the
      range type (if the value is a dict), else to the equality type, the
      latter only when the key has no predicate recorded yet.

    ``fields`` and ``op['predicates']`` are mutated in place; nothing is
    returned.
    """
    if self.debug:
        LOG.debug("Processing operation fields\n%s", pformat(content))

    # dict.iteritems() is Python 2 only; items() behaves the same here and
    # also runs on Python 3.
    for name, val in content.items():
        # Skip anything that starts with our special char
        # Those are flag markers used by MongoDB's queries
        if name.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
            continue

        # We need to add the field to the collection if it doesn't
        # already exist. This will occur if this op was an aggregate,
        # which we ignore when recreating the schema
        if name not in fields:
            type_name = catalog.fieldTypeToString(type(val))
            fields[name] = catalog.Collection.fieldFactory(name, type_name)
        fields[name]['query_use_count'] += 1

        # No predicate for insert operations
        # No projections for insert operations
        if op['type'] != constants.OP_TYPE_INSERT:
            # Update how this key was used with predicates
            if workload.isOpRegex(op, field=name):
                op['predicates'][name] = constants.PRED_TYPE_REGEX
            elif isinstance(val, dict):
                op['predicates'][name] = constants.PRED_TYPE_RANGE
            elif name not in op['predicates']:
                op['predicates'][name] = constants.PRED_TYPE_EQUALITY

        ## TODO: Should we expect there to be field names with dot notation here?
        ##       Or should have all been cleaned out by the converters?
    ## FOR
def __getIsOpRegex__(self, cache, op):
    """Return the regex flag for *op*, consulting ``cache.op_regex`` first.

    The cache is keyed by ``op["query_hash"]``. On a hit the stored value is
    returned (and the hit counter is bumped in debug mode). On a miss the
    flag is computed with ``workload.isOpRegex`` and, if caching is enabled,
    stored back (bumping the miss counter in debug mode).
    """
    query_hash = op["query_hash"]
    cached = cache.op_regex.get(query_hash)

    # Cache hit: anything other than None counts, including False
    if cached is not None:
        if self.debug:
            self.cache_hit_ctr.put("op_regex")
        return cached

    # Cache miss: compute the flag and remember it when caching is on
    computed = workload.isOpRegex(op)
    if self.cache_enable:
        if self.debug:
            self.cache_miss_ctr.put("op_regex")
        cache.op_regex[query_hash] = computed
    return computed
def guess_op_info(self, design, op):
    """
        Guess which index of the design the operation would use.

        Returns a 4-tuple ``(best_index, covering, index_size, slot_size)``:

        * best_index -- the index of the op's collection with the highest
          ratio of usable leading keys, or None if no index matches at all
        * covering   -- True if the op is a query with a projection and all
          of its referenced keys form a prefix of best_index
        * index_size -- summed getIndexSize() of best_index for the
          collection and its children in parent_to_children_map, or 1 when
          index size estimation is disabled
        * slot_size  -- 1 if the collection has no entry in col_cost_map,
          otherwise the ceiling of that entry, multiplied by 100 whenever
          it is not 1
    """
    # Simply choose the index that has most of the fields
    # referenced in the operation.
    col_name = op["collection"]
    indexes = design.getIndexes(col_name)

    # Collect every key referenced by the op, in order: the keys of each
    # query document first ...
    op_index_list = []
    for query in workload.getOpContents(op):
        op_index_list.extend(query.keys())

    # ... followed by the projection keys.
    # The op["query_fields"] is the projection
    projectionFields = op.get("query_fields", None)
    hasProjectionField = bool(projectionFields)
    if hasProjectionField:
        op_index_list.extend(projectionFields.keys())

    best_index = None
    best_ratio = None
    for index in indexes:
        # Count the leading index keys that the op can make use of
        field_cnt = 0
        for indexKey in index:
            indexMatch = indexKey in op_index_list
            # We can't use a field if it's being used in a regex operation
            if indexMatch and not workload.isOpRegex(op, field=indexKey):
                field_cnt += 1
            if not indexMatch or field_cnt >= len(op_index_list):
                break
        field_ratio = field_cnt / float(len(index))

        if not best_index or field_ratio >= best_ratio:
            # If the ratios are the same, then choose the
            # one with the most keys
            if field_ratio == best_ratio and len(index) <= len(best_index):
                continue
            # An index without a single usable key is never selected
            if field_ratio != 0:
                best_index = index
                best_ratio = field_ratio
    ## FOR
    if self.debug:
        LOG.debug("Op #%d - BestIndex:%s / BestRatio:%s",
                  op["query_id"], best_index, best_ratio)

    # Check whether this is a covering index: the op's keys must match the
    # leading keys of the best index, position by position. A best index
    # shorter than the key list yields a shorter slice and thus False.
    covering = False
    if hasProjectionField and best_index and op["type"] == constants.OP_TYPE_QUERY:
        covering = list(best_index)[:len(op_index_list)] == op_index_list

    # Get the size of the best index
    if not self.no_index_size_estimation:
        index_size = 0
        index_size += getIndexSize(self.state.collections[col_name], best_index)
        if col_name in self.parent_to_children_map:
            for child in self.parent_to_children_map[col_name]:
                index_size += getIndexSize(self.state.collections[child], best_index)
            ## FOR
        ## IF
    else:
        index_size = 1

    # Get the slot size of this operation
    assert col_name not in self.child_collections, (
        "collection %s should not be queried.\n child_collecitons: %s\ndesign: \n%s"
        % (col_name, self.child_collections, design)
    )
    if col_name in self.col_cost_map:
        slot_size = int(math.ceil(self.col_cost_map[col_name]))
    else:
        slot_size = 1
    if slot_size != 1:
        slot_size *= 100

    return best_index, covering, index_size, slot_size
def guess_op_info(self, design, op):
    """
        Pick the most suitable index of the design for the given operation.

        Returns a 4-tuple ``(best_index, covering, index_size, slot_size)``:

        * best_index -- the index of the op's collection with the highest
          ratio of usable leading keys, or None if no index matches at all
        * covering   -- True if the op is a query with a projection and all
          of its referenced keys form a prefix of best_index
        * index_size -- summed getIndexSize() of best_index for the
          collection and its children in parent_to_children_map, or 1 when
          index size estimation is disabled
        * slot_size  -- 1 if the collection has no entry in col_cost_map,
          otherwise the ceiling of that entry, multiplied by 100 whenever
          it is not 1
    """
    # Simply choose the index that has most of the fields
    # referenced in the operation.
    col_name = op['collection']
    indexes = design.getIndexes(col_name)

    # Keys referenced by the op, in order: query document keys first ...
    op_index_list = []
    for query in workload.getOpContents(op):
        op_index_list.extend(query.keys())

    # ... then the projection keys.
    # The op['query_fields'] is the projection
    projectionFields = op.get('query_fields', None)
    hasProjectionField = bool(projectionFields)
    if hasProjectionField:
        op_index_list.extend(projectionFields.keys())

    best_index = None
    best_ratio = None
    for candidate in indexes:
        # Count the leading keys of this index that the op can make use of
        field_cnt = 0
        for indexKey in candidate:
            indexMatch = (indexKey in op_index_list)
            # We can't use a field if it's being used in a regex operation
            if indexMatch and not workload.isOpRegex(op, field=indexKey):
                field_cnt += 1
            if not indexMatch or field_cnt >= len(op_index_list):
                break
        field_ratio = field_cnt / float(len(candidate))

        if not best_index or field_ratio >= best_ratio:
            # If the ratios are the same, then choose the
            # one with the most keys
            if field_ratio == best_ratio and len(candidate) <= len(best_index):
                continue
            # An index without a single usable key is never selected
            if field_ratio != 0:
                best_index = candidate
                best_ratio = field_ratio
    ## FOR
    if self.debug:
        LOG.debug("Op #%d - BestIndex:%s / BestRatio:%s",
                  op['query_id'], best_index, best_ratio)

    # Check whether this is a covering index: the op's keys have to equal
    # the leading keys of the best index, position by position. If the best
    # index is shorter than the key list the slice is shorter too, so the
    # comparison is False.
    covering = False
    if hasProjectionField and best_index and op['type'] == constants.OP_TYPE_QUERY:
        covering = list(best_index)[:len(op_index_list)] == op_index_list

    # Get the size of the best index
    if not self.no_index_size_estimation:
        index_size = 0
        index_size += getIndexSize(self.state.collections[col_name], best_index)
        if col_name in self.parent_to_children_map:
            for child in self.parent_to_children_map[col_name]:
                index_size += getIndexSize(self.state.collections[child], best_index)
            ## FOR
        ## IF
    else:
        index_size = 1

    # Get the slot size of this operation
    assert col_name not in self.child_collections, "collection %s should not be queried.\n child_collecitons: %s\ndesign: \n%s" % (col_name, self.child_collections, design)
    if col_name in self.col_cost_map:
        slot_size = int(math.ceil(self.col_cost_map[col_name]))
    else:
        slot_size = 1
    if slot_size != 1:
        slot_size *= 100

    return best_index, covering, index_size, slot_size