def testPickle(self):
    """Round-trip a randomly populated Histogram through pickle and verify equality."""
    hist = Histogram()
    alphabet = list(string.letters) + ["-"]

    # Load the histogram with 100 random 10-character keys, each with a
    # random delta, sanity-checking as we go
    for _ in xrange(0, 100):
        key = "".join(random.choice(alphabet) for _ in xrange(0, 10))
        assert len(key) > 0
        hist.put(key, delta=random.randint(1, 10))
        assert hist[key] > 0
    ## FOR

    # Serialize with the highest available pickle protocol
    import pickle
    payload = pickle.dumps(hist, -1)
    assert payload

    # Deserialize and compare the clone against the source histogram
    clone = pickle.loads(payload)
    assert clone
    for key in hist.keys():
        self.assertEquals(hist[key], clone[key])
    ## FOR
    self.assertEquals(hist.getSampleCount(), clone.getSampleCount())
    self.assertEquals(sorted(hist.getMinCountKeys()),
                      sorted(clone.getMinCountKeys()))
def __init__(self, collections, workload, config):
    """Initialize cost-model state from the collection catalog, workload, and config dict."""
    assert isinstance(collections, dict)
    # LOG.setLevel(logging.DEBUG)
    self.debug = LOG.isEnabledFor(logging.DEBUG)

    self.collections = collections
    self.col_names = list(collections.iterkeys())
    self.workload = None  # working workload
    self.originalWorload = workload  # points to the original workload

    # Relative weights of the cost components
    self.weight_network = config.get('weight_network', 1.0)
    self.weight_disk = config.get('weight_disk', 1.0)
    self.weight_skew = config.get('weight_skew', 1.0)
    self.num_nodes = config.get('nodes', 1)

    # Convert MB to bytes
    self.max_memory = config['max_memory'] * 1024 * 1024
    self.skew_segments = config['skew_intervals']  # Why? "- 1"
    self.address_size = config['address_size'] / 4
    self.estimator = NodeEstimator(collections, self.num_nodes)
    self.window_size = config['window_size']

    # Build indexes from collections to sessions/operations
    # Note that this won't change dynamically based on denormalization schemes
    # It's up to the cost components to figure things out based on that
    self.restoreOriginalWorkload()

    # We need to know the number of operations in the original workload
    # so that all of our calculations are based on that
    self.orig_op_count = sum(len(sess["operations"]) for sess in self.originalWorload)

    ## ----------------------------------------------
    ## CACHING
    ## ----------------------------------------------
    self.cache_enable = True
    self.cache_miss_ctr = Histogram()
    self.cache_hit_ctr = Histogram()
    # ColName -> CacheHandle
    self.cache_handles = {}
def __init__(self):
    """Reset all transaction bookkeeping for a new benchmark run."""
    # Wall-clock boundaries of the run
    self.start = None
    self.stop = None
    # Monotonically increasing transaction identifier
    self.txn_id = 0
    self.opCount = 0
    # Finished transactions as (txnName, timestamp) pairs
    self.completed = []
    # Per-transaction-name counts and timing samples
    self.txn_counters = Histogram()
    self.txn_times = {}
    # Transactions currently in flight
    self.running = {}
def __init__(self, collections, num_nodes):
    """Estimator state: per-node access counts over the given collection catalog."""
    assert isinstance(collections, dict)
    # LOG.setLevel(logging.DEBUG)
    self.debug = LOG.isEnabledFor(logging.DEBUG)

    self.collections = collections
    self.num_nodes = num_nodes

    # Running totals: how many times each node was accessed, and how many
    # operations have been processed overall
    self.nodeCounts = Histogram()
    self.op_count = 0
def computeInStats(query, h=None):
    """Collect a histogram of '#in' clause sizes found anywhere in the query.

    The Histogram is created lazily on the first '#in' key encountered, so
    the function returns None when the query contains no '#in' clauses.
    Recurses into nested dicts and into dicts embedded in list values.
    """
    for key, value in query.iteritems():
        if key == "#in":
            # Record how many values this IN clause enumerates
            if h is None:
                h = Histogram()
            h.put(len(value))
        elif isinstance(value, list):
            # Descend into any sub-documents nested inside the list
            for element in value:
                if isinstance(element, dict):
                    h = computeInStats(element, h)
        elif isinstance(value, dict):
            h = computeInStats(value, h)
    return h
def fixInvalidCollections(self):
    """Repair operations whose collection name was marked invalid.

    For each corrupted operation, builds a histogram of candidate
    collections whose field names overlap the operation's referenced
    fields. The fix is applied only when exactly one candidate wins;
    zero or multiple candidates are logged and skipped. Sessions with
    at least one repaired operation are saved back to the metadata DB.
    """
    searchKey = {
        "operations.collection": constants.INVALID_COLLECTION_MARKER
    }
    for session in self.metadata_db.Session.find(searchKey):
        # BUG FIX: 'dirty' must be initialized once per session, not once
        # per operation. It used to be reset at the top of the operations
        # loop, so a fix applied to an earlier operation was silently
        # discarded unless the *last* operation also happened to be fixed.
        dirty = False
        for op in session["operations"]:
            if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                continue
            if self.debug:
                LOG.debug("Attempting to fix corrupted Operation:\n%s" % pformat(op))

            # For each field referenced in the query, build a histogram of
            # which collections have a field with the same name
            fields = workload.getReferencedFields(op)
            h = Histogram()
            for c in self.metadata_db.Collection.find():
                for f in c['fields']:
                    if f in fields:
                        h.put(c['name'])
                ## FOR
            ## FOR

            matches = h.getMaxCountKeys()
            if len(matches) == 0:
                LOG.warn(
                    "No matching collection was found for corrupted operation\n%s" % pformat(op))
                continue
            elif len(matches) > 1:
                LOG.warn(
                    "More than one matching collection was found for corrupted operation %s\n%s" % (matches, pformat(op)))
                continue
            else:
                op["collection"] = matches[0]
                dirty = True
                self.fix_ctr += 1
                LOG.info("Fix corrupted collection in operation\n%s" % pformat(op))
            ## IF
        ## FOR (operations)
        if dirty:
            session.save()
"workload_percent", ] STRIP_FIELDS = [ "predicates", "query_hash", "query_time", "query_size", "query_type", "query_id", "orig_query", "resp_.*", ] STRIP_REGEXES = [re.compile(r) for r in STRIP_FIELDS] QUERY_COUNTS = Histogram() QUERY_COLLECTION_COUNTS = Histogram() QUERY_HASH_XREF = {} QUERY_TOP_LIMIT = 10 ## ============================================== ## DUMP SCHEMA ## ============================================== def dumpSchema(writer, collection, fields, spacer=""): cur_spacer = spacer if len(spacer) > 0: cur_spacer += " - " for f_name in sorted(fields.iterkeys(), key=lambda x: x != "_id"): row = [] f = fields[f_name] for key in SCHEMA_COLUMNS:
def hash(self, op):
    """Compute a deterministic signature for the given operation based on its keys

    The signature combines the operation's collection, type, a hash of the
    fields referenced in its predicate, and (for updates) a hash of the
    updated fields. The resulting value is also recorded in self.histogram.
    Raises Exception for queries missing their '#query' clause and for
    unknown operation types.
    """
    fields = None
    updateFields = None

    # QUERY
    if op["type"] == constants.OP_TYPE_QUERY:
        # The query field has our where clause
        if not "#query" in op["query_content"][0]:
            msg = "Missing query field in query_content for operation #%d" % op["query_id"]
            if self.debug:
                LOG.warn(pformat(op))
            raise Exception(msg)
        # NOTE(review): the literal "#query" check above presumably matches
        # REPLACE_KEY_DOLLAR_PREFIX + "query" used here -- confirm the prefix is "#"
        fields = op["query_content"][0][constants.REPLACE_KEY_DOLLAR_PREFIX + "query"]

    # UPDATE
    elif op["type"] == constants.OP_TYPE_UPDATE:
        # The first element in the content field is the WHERE clause
        fields = op["query_content"][0]
        # We use a separate field for the updated columns so that
        updateFields = op['query_content'][1]

    # INSERT
    elif op["type"] == constants.OP_TYPE_INSERT:
        # They could be inserting more than one document here,
        # which all may have different fields...
        # So we will need to build a histogram for which keys are referenced
        # and use the ones that appear the most
        # XXX: We'll only consider keys in the first-level
        h = Histogram()
        for doc in op["query_content"]:
            assert type(doc) == dict, "Unexpected insert value:\n%s" % pformat(doc)
            for k in doc.keys():
                h.put(k)
        ## FOR
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Insert '%s' Keys Histogram:\n%s" % (op["collection"], h))
        maxKeys = h.getMaxCountKeys()
        assert len(maxKeys) > 0, \
            "No keys were found in %d insert documents?" % len(op["query_content"])

        # Keep only the values for the most frequently occurring keys
        fields = { }
        for doc in op["query_content"]:
            for k, v in doc.iteritems():
                if k in maxKeys:
                    fields[k] = v
            ## FOR
        ## FOR

    # DELETE
    elif op["type"] == constants.OP_TYPE_DELETE:
        # The first element in the content field is the WHERE clause
        fields = op["query_content"][0]

    # UNKNOWN!
    else:
        raise Exception("Unexpected query type: %s" % op["type"])

    # Extract the list of fields that are used
    try:
        fieldsHash = self.computeFieldsHash(fields)
    except:
        LOG.error("Unexpected error when processing operation %d [fields=%s]" % (op["query_id"], str(fields)))
        raise
    updateHash = self.computeFieldsHash(updateFields) if updateFields else None

    # hash(t) below is the Python built-in applied to the signature tuple,
    # not a recursive call to this method
    t = (op["collection"], op["type"], fieldsHash, updateHash)
    h = long(hash(t))
    LOG.debug("%s %s => HASH:%d" % (fields, t, h))
    self.histogram.put(h)
    return h
def __init__(self):
    """Set up an empty operation-signature histogram."""
    self.debug = LOG.isEnabledFor(logging.DEBUG)
    self.histogram = Histogram()
def processDataFields(self, col_info, fields, doc):
    """
        Recursively traverse a single document and extract out the field information

        col_info - catalog entry for the collection; mutated in place
                   ('data_size' accumulated, 'interesting' appended to)
        fields   - field-metadata dict to populate; mutated in place
        doc      - the document (dict) to examine
    """
    if self.debug:
        LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get('parent_col', None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == '_id':
            continue
        if k == constants.FUNCTIONAL_FIELD:
            continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug:
                LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            # Field already known: refresh its type from this document
            fields[k]['type'] = f_type_str
            # Sanity check
            # This won't work if the data is not uniform
            #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not 'distinct_values' in fields[k]:
            fields[k]['distinct_values'] = set()
        if not "num_values" in fields[k]:
            fields[k]['num_values'] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not 'size_histogram' in fields[k]:
            fields[k]['size_histogram'] = Histogram()
        # Maintain a histogram of list lengths
        if not 'list_len' in fields[k]:
            fields[k]['list_len'] = Histogram()

        # Fields referenced by queries get flagged as 'interesting'
        if fields[k]['query_use_count'] > 0 and not k in col_info['interesting']:
            col_info['interesting'].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            # (a single-key dict whose key starts with the replaced '$' prefix)
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well) from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        # presumably another single-entry operator dict -- take its value
                        v = v.values()[0]
                    fields[k]['type'] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]['type'], v)
                        self.total_field_ctr += 1
                    except:
                        # Size estimation failed: count the error and skip this value
                        if self.debug:
                            LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                      k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr)
                        continue
                    col_info['data_size'] += size
                    fields[k]['size_histogram'].put(size)
                    fields[k]['distinct_values'].add(v)
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
                ## FOR
            else:
                # Regular embedded document: recurse into its fields
                if self.debug:
                    LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not 'fields' in fields[k]:
                    fields[k]['fields'] = {}
                self.processDataFields(col_info, fields[k]['fields'], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug:
                LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not 'fields' in fields[k]:
                fields[k]['fields'] = {}
            list_len = len(doc[k])
            fields[k]['list_len'].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug:
                        LOG.debug(
                            "Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]['fields'], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    # the list has different types of values
                    inner = fields[k]['fields'].get(constants.LIST_INNER_FIELD, {})
                    inner['type'] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner['type'], doc[k][i])
                        self.total_field_ctr += 1
                    except:
                        # Size estimation failed: count the error and skip this entry
                        if self.debug:
                            LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr)
                        continue
                    fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                    fields[k]['size_histogram'].put(inner_size)
                    fields[k]['distinct_values'].add(doc[k][i])
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
            ## FOR (list)

        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]['type'], v)
                self.total_field_ctr += 1
            except:
                # NOTE(review): unlike the other branches, this LOG.error is
                # not gated on self.debug -- possibly intentional, confirm
                LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                          k, col_info['name'], pformat(fields[k]))
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]",
                         self.total_field_ctr, self.err_field_ctr)
                continue
            col_info['data_size'] += size
            fields[k]['size_histogram'].put(size)
            fields[k]['distinct_values'].add(v)
            fields[k]['num_values'] += 1
            if parent_col:
                fields[k]['parent_col'] = parent_col
def main():
    """Entry point: parse CLI options, build the simulated host, generate load, and run."""
    # parser = optparse.OptionParser()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-v', '--verbose', dest='verbose', action='count',
                        help='Increase verbosity (specify'
                        ' multiple times for more)')
    parser.add_argument('-g', '--print-hist', action='store_true', dest='hist',
                        help='Print request latency histogram', default=False)
    parser.add_argument('-c', '--cores', dest='cores', action='store',
                        help='Set the number of cores of the system',
                        default=8)
    parser.add_argument('-n', '--network-cores', dest='network_cores',
                        action='store', help='Set the number of networking'
                        ' cores of the system', default=0)
    parser.add_argument('-s', '--seed', dest='seed', action='store',
                        help='Set the seed for request generator')
    parser.add_argument('-t', '--sim_time', dest='sim_time', action='store',
                        help='Set the simulation time', default=500000)
    parser.add_argument('--workload-conf', dest='work_conf', action='store',
                        help='Configuration file for the load generation'
                        ' functions', default="../config/work.json")

    group = parser.add_argument_group('Host Options')
    group.add_argument('--host-type', dest='host_type', action='store',
                       help=('Set the host configuration (global queue,'
                             ' local queue, shinjuku, per flow queues,'
                             ' static core allocation)'), default='global')
    group.add_argument('--deq-cost', dest='deq_cost', action='store',
                       help='Set the dequeuing cost', default=0.0)
    group.add_argument('--queue-policy', dest='queue_policy', action='store',
                       help=('Set the queue policy to be followed by the per'
                             ' flow queue, ignored in any other queue'
                             ' configuration'), default='FlowQueues')
    parser.add_argument_group(group)

    group = parser.add_argument_group('Print Options')
    group.add_argument('--print-values', dest='print_values',
                       action='store_true', help='Print all the latencies for'
                       ' each flow', default=False)
    group.add_argument('--output-file', dest='output_file', action='store',
                       help='File to print all latencies', default=None)

    opts = parser.parse_args()

    # Seeding
    if opts.seed:
        random.seed(int(opts.seed))
        np.random.seed(int(opts.seed))

    # Setup logging
    # Guard against opts.verbose being None when -v was not given
    log_level = logging.WARNING
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose is not None and opts.verbose >= 2:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    # Initialize the different components of the system
    env = simpy.Environment()

    # Parse the configuration file (close the handle when done)
    with open(opts.work_conf) as conf_file:
        flow_config = json.load(conf_file)

    # Create a histogram per flow and a global histogram
    histograms = Histogram(len(flow_config), float(opts.cores), flow_config,
                           opts)

    # Get the queue configuration
    host_conf = getattr(sys.modules[__name__], gen_dict[opts.host_type])
    sim_host = host_conf(env, int(opts.cores), histograms,
                         float(opts.deq_cost), flow_config, opts)
    # TODO: Update so that it's parametrizable
    # print "Warning: Need to update sim.py for parameterization and Testing"
    # First list is time slice, second list is load
    # sim_host = StaticCoreAllocationHost(env, int(opts.cores),
    #                                     float(opts.deq_cost), [0.0, 0.0],
    #                                     histograms, len(flow_config),
    #                                     [0.4, 0.4])

    multigenerator = MultipleRequestGenerator(env, sim_host)

    # Need to generate less load when we have shinjuku because one
    # of the cores is just the dispatcher.
    # BUG FIX: this adjustment used to live inside the per-flow loop below,
    # so with multiple flows the core count was decremented once per flow
    # instead of once overall.
    if opts.host_type == "shinjuku":
        opts.cores = int(opts.cores) - 1

    # Create one object per flow
    for flow in flow_config:
        params = flow
        #work_gen = getattr(sys.modules[__name__],
        #                   gen_dict[params["work_gen"]])
        multigenerator.add_generator(
            RequestGenerator(env, sim_host, int(opts.cores), params))
    multigenerator.begin_generation()

    # Run the simulation
    # BUG FIX: argparse delivers -t as a string; cast so simpy can compare it
    env.run(until=float(opts.sim_time))

    # Print results in json format
    histograms.print_info()