Example #1
    def testPickle(self):
        h = Histogram()
        letters = [x for x in string.letters] + ["-"]

        for i in xrange(0, 100):
            key = ""
            for x in xrange(0, 10):
                key += random.choice(letters)
            assert len(key) > 0

            h.put(key, delta=random.randint(1, 10))
            assert h[key] > 0
        ## FOR

        # Serialize
        import pickle
        p = pickle.dumps(h, -1)
        assert p

        # Deserialize
        clone = pickle.loads(p)
        assert clone

        for key in h.keys():
            self.assertEquals(h[key], clone[key])
        ## FOR
        self.assertEquals(h.getSampleCount(), clone.getSampleCount())
        self.assertEquals(sorted(h.getMinCountKeys()),
                          sorted(clone.getMinCountKeys()))
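The test above exercises the Histogram interface that the rest of these examples rely on: put(key, delta=...), dict-style lookup, keys(), getSampleCount(), and getMinCountKeys()/getMaxCountKeys(). The project's actual Histogram class is not shown here; the following is only a minimal sketch of an interface consistent with those calls, for readers following along without the source.

# Minimal sketch of the Histogram interface assumed by these examples.
# Illustration only, not the project's actual implementation.
class Histogram(dict):
    def put(self, key, delta=1):
        self[key] = self.get(key, 0) + delta

    def getSampleCount(self):
        return sum(self.values())

    def getMinCountKeys(self):
        if not self:
            return []
        low = min(self.values())
        return [k for k, v in self.items() if v == low]

    def getMaxCountKeys(self):
        if not self:
            return []
        high = max(self.values())
        return [k for k, v in self.items() if v == high]

Because this sketch subclasses dict, an instance would also pickle and unpickle the way the test above expects.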
Example #2
    def __init__(self, collections, workload, config):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)

        self.collections = collections
        self.col_names = [col_name for col_name in collections.iterkeys()]
        self.workload = None  # working workload
        self.originalWorkload = workload  # points to the original workload

        self.weight_network = config.get('weight_network', 1.0)
        self.weight_disk = config.get('weight_disk', 1.0)
        self.weight_skew = config.get('weight_skew', 1.0)
        self.num_nodes = config.get('nodes', 1)

        # Convert MB to bytes
        self.max_memory = config['max_memory'] * 1024 * 1024
        self.skew_segments = config['skew_intervals']  # Why? "- 1"
        self.address_size = config['address_size'] / 4

        self.estimator = NodeEstimator(collections, self.num_nodes)

        self.window_size = config['window_size']

        # Build indexes from collections to sessions/operations
        # Note that this won't change dynamically based on denormalization schemes
        # It's up to the cost components to figure things out based on that
        self.restoreOriginalWorkload()

        # We need to know the number of operations in the original workload
        # so that all of our calculations are based on that
        self.orig_op_count = 0
        for sess in self.originalWorkload:
            self.orig_op_count += len(sess["operations"])
        ## FOR

        ## ----------------------------------------------
        ## CACHING
        ## ----------------------------------------------
        self.cache_enable = True
        self.cache_miss_ctr = Histogram()
        self.cache_hit_ctr = Histogram()

        # ColName -> CacheHandle
        self.cache_handles = {}
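The constructor reads a fixed set of keys from config. A hypothetical config dict that satisfies those reads (the values below are invented placeholders, not defaults from the project) might look like:

# Hypothetical cost-model configuration; key names match the reads above,
# the values are placeholders.
config = {
    'weight_network': 1.0,
    'weight_disk': 1.0,
    'weight_skew': 1.0,
    'nodes': 4,
    'max_memory': 4096,       # in MB; the constructor converts this to bytes
    'skew_intervals': 10,
    'address_size': 64,
    'window_size': 1024,
}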
Example #3
 def __init__(self):
     self.start = None
     self.stop = None
     self.txn_id = 0
     self.opCount = 0
     self.completed = []  # (txnName, timestamp)
     self.txn_counters = Histogram()
     self.txn_times = {}
     self.running = {}
Example #4
    def __init__(self, collections, num_nodes):
        assert isinstance(collections, dict)
        #        LOG.setLevel(logging.DEBUG)
        self.debug = LOG.isEnabledFor(logging.DEBUG)
        self.collections = collections
        self.num_nodes = num_nodes

        # Keep track of how many times that we accessed each node
        self.nodeCounts = Histogram()
        self.op_count = 0
Example #5
def computeInStats(query, h=None):
    for k, v in query.iteritems():
        if k == "#in":
            if h is None: h = Histogram()
            h.put(len(v))
        elif isinstance(v, list):
            for inner in v:
                if isinstance(inner, dict):
                    h = computeInStats(inner, h)
        elif isinstance(v, dict):
            h = computeInStats(v, h)
    return h
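computeInStats() walks a query document recursively and records the length of every "#in" list it finds into a single shared histogram. A small usage sketch (the query shape below is made up for illustration):

# Hypothetical rewritten query; "#in" stands in for an $in clause
query = {"status": {"#in": ["new", "open", "closed"]},
         "#or": [{"owner": {"#in": [1, 2]}}]}
h = computeInStats(query)
# h now maps list length -> occurrence count: one sample of length 3, one of length 2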
Example #6
    def fixInvalidCollections(self):
        searchKey = {
            "operations.collection": constants.INVALID_COLLECTION_MARKER
        }
        for session in self.metadata_db.Session.find(searchKey):
            # Track whether any operation in this session gets repaired.
            # (Resetting this inside the operation loop would discard fixes
            # made to earlier operations in the same session.)
            dirty = False
            for op in session["operations"]:
                if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                    continue

                if self.debug:
                    LOG.debug("Attempting to fix corrupted Operation:\n%s" %
                              pformat(op))

                # For each field referenced in the query, build a histogram of
                # which collections have a field with the same name
                fields = workload.getReferencedFields(op)
                h = Histogram()
                for c in self.metadata_db.Collection.find():
                    for f in c['fields']:
                        if f in fields:
                            h.put(c['name'])
                    ## FOR
                ## FOR

                matches = h.getMaxCountKeys()
                if len(matches) == 0:
                    LOG.warn(
                        "No matching collection was found for corrupted operation\n%s"
                        % pformat(op))
                    continue
                elif len(matches) > 1:
                    LOG.warn(
                        "More than one matching collection was found for corrupted operation %s\n%s"
                        % (matches, pformat(op)))
                    continue
                else:
                    op["collection"] = matches[0]
                    dirty = True
                    self.fix_ctr += 1
                    LOG.info("Fixed corrupted collection in operation\n%s" %
                             pformat(op))
                ## IF
            ## FOR (operations)

            if dirty: session.save()
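The repair above is essentially a vote: every collection that defines one of the fields referenced by the corrupted operation gets one tick in the histogram, and the fix is applied only when getMaxCountKeys() returns exactly one winner. A standalone sketch of that decision (the collection and field names are invented):

# Hypothetical catalog: collection name -> field names it defines
collection_fields = {"users":  ["user_id", "name"],
                     "orders": ["user_id", "created", "total"]}
referenced = set(["user_id", "created"])   # fields used by the broken operation

h = Histogram()
for col_name, col_field_names in collection_fields.items():
    for f in col_field_names:
        if f in referenced:
            h.put(col_name)
matches = h.getMaxCountKeys()   # ["orders"]: a single winner, so it is safe to fix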
Example #7
    "workload_percent",
]

STRIP_FIELDS = [
    "predicates",
    "query_hash",
    "query_time",
    "query_size",
    "query_type",
    "query_id",
    "orig_query",
    "resp_.*",
]
STRIP_REGEXES = [re.compile(r) for r in STRIP_FIELDS]

QUERY_COUNTS = Histogram()
QUERY_COLLECTION_COUNTS = Histogram()
QUERY_HASH_XREF = {}
QUERY_TOP_LIMIT = 10


## ==============================================
## DUMP SCHEMA
## ==============================================
def dumpSchema(writer, collection, fields, spacer=""):
    cur_spacer = spacer
    if len(spacer) > 0: cur_spacer += " - "
    for f_name in sorted(fields.iterkeys(), key=lambda x: x != "_id"):
        row = []
        f = fields[f_name]
        for key in SCHEMA_COLUMNS:
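STRIP_FIELDS/STRIP_REGEXES above name bookkeeping attributes that should not show up in the dumped schema or query report. A hedged sketch of how such patterns could be applied (the stripFields helper is hypothetical and not part of the original script):

# Hypothetical helper: drop any key matched by one of the STRIP_REGEXES patterns
def stripFields(doc):
    return dict((k, v) for k, v in doc.items()
                if not any(r.match(k) for r in STRIP_REGEXES))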
Example #8
    def hash(self, op):
        """Compute a deterministic signature for the given operation based on its keys"""
        
        fields = None
        updateFields = None
        
        # QUERY
        if op["type"] == constants.OP_TYPE_QUERY:
            # The query field has our where clause
            if not "#query" in op["query_content"][0]:
                msg = "Missing query field in query_content for operation #%d" % op["query_id"]
                if self.debug: LOG.warn(pformat(op))
                raise Exception(msg)

            fields = op["query_content"][0][constants.REPLACE_KEY_DOLLAR_PREFIX + "query"]

        # UPDATE
        elif op["type"] == constants.OP_TYPE_UPDATE:
            # The first element in the content field is the WHERE clause
            fields = op["query_content"][0]
            
            # We use a separate field for the updated columns so that
            # they are hashed separately from the WHERE clause fields
            updateFields = op['query_content'][1]

        # INSERT
        elif op["type"] == constants.OP_TYPE_INSERT:
            # They could be inserting more than one document here,
            # which all may have different fields...
            # So we will need to build a histogram of which keys are referenced
            # and use the ones that appear the most often.
            # XXX: We'll only consider keys at the first level
            h = Histogram()
            for doc in op["query_content"]:
                assert type(doc) == dict, "Unexpected insert value:\n%s" % pformat(doc)
                for k in doc.keys():
                    h.put(k)
            ## FOR
            if LOG.isEnabledFor(logging.DEBUG):
                LOG.debug("Insert '%s' Keys Histogram:\n%s" % (op["collection"], h))
            maxKeys = h.getMaxCountKeys()
            assert len(maxKeys) > 0, \
                "No keys were found in %d insert documents?" % len(op["query_content"])
            
            fields = { }
            for doc in op["query_content"]:
                for k, v in doc.iteritems():
                    if k in maxKeys:
                        fields[k] = v
                ## FOR
            ## FOR
            
        # DELETE
        elif op["type"] == constants.OP_TYPE_DELETE:
            # The first element in the content field is the WHERE clause
            fields = op["query_content"][0]
        # UNKNOWN!
        else:
            raise Exception("Unexpected query type: %s" % op["type"])
        
        # Extract the list of fields that are used
        try:
            fieldsHash = self.computeFieldsHash(fields)
        except:
            LOG.error("Unexpected error when processing operation %d [fields=%s]" % (op["query_id"], str(fields)))
            raise
        updateHash = self.computeFieldsHash(updateFields) if updateFields else None
        
        t = (op["collection"], op["type"], fieldsHash, updateHash)
        h = long(hash(t))
        LOG.debug("%s %s => HASH:%d" % (fields, t, h))
        self.histogram.put(h)
        return h
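For inserts, the method keeps only the keys that occur most often across the batch of documents, as counted by the histogram. A small standalone sketch of that selection (the documents are invented):

# Hypothetical batch of insert documents with uneven key sets
docs = [{"a": 1, "b": 2}, {"a": 3}, {"a": 4, "b": 5}]
h = Histogram()
for doc in docs:
    for k in doc.keys():
        h.put(k)
maxKeys = h.getMaxCountKeys()   # ["a"]: the only key present in every document
fields = {}
for doc in docs:
    for k, v in doc.items():
        if k in maxKeys:
            fields[k] = v       # later documents overwrite earlier values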
Example #9
 def __init__(self):
     self.histogram = Histogram()
     self.debug = LOG.isEnabledFor(logging.DEBUG)
Example #10
    def processDataFields(self, col_info, fields, doc):
        """
            Recursively traverse a single document and extract out the field information
        """
        if self.debug:
            LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

        # Check if the current doc has parent_col, but this will only apply to its fields
        parent_col = doc.get('parent_col', None)

        for k, v in doc.iteritems():
            # Skip if this is the _id field
            if constants.SKIP_MONGODB_ID_FIELD and k == '_id': continue
            if k == constants.FUNCTIONAL_FIELD: continue
            f_type = type(v)
            f_type_str = catalog.fieldTypeToString(f_type)

            if not k in fields:
                # This is only subset of what we will compute for each field
                # See catalog.Collection for more information
                if self.debug:
                    LOG.debug("Creating new field entry for '%s'" % k)
                fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
            else:
                fields[k]['type'] = f_type_str
                # Sanity check
                # This won't work if the data is not uniform
                #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

            # We will store the distinct values for each field in a set
            # that is embedded in the field. We will delete it when
            # we call computeFieldStats()
            if not 'distinct_values' in fields[k]:
                fields[k]['distinct_values'] = set()
            if not "num_values" in fields[k]:
                fields[k]['num_values'] = 0
            # Likewise, we will also store a histogram for the different sizes
            # of each field. We will use this later on to compute the weighted average
            if not 'size_histogram' in fields[k]:
                fields[k]['size_histogram'] = Histogram()
            # Maintain a histogram of list lengths
            if not 'list_len' in fields[k]:
                fields[k]['list_len'] = Histogram()

            if fields[k]['query_use_count'] > 0 and not k in col_info[
                    'interesting']:
                col_info['interesting'].append(k)

            ## ----------------------------------------------
            ## NESTED FIELDS
            ## ----------------------------------------------
            if isinstance(v, dict):
                # Check for a special data field
                if len(v) == 1 and v.keys()[0].startswith(
                        constants.REPLACE_KEY_DOLLAR_PREFIX):
                    v = v[v.keys()[0]]
                    # HACK to handle lists (hopefully dicts as well) from nested IN clauses...
                    all_values = v if isinstance(v, list) else [v]
                    for v in all_values:
                        if isinstance(v, dict):
                            v = v.values()[0]

                        fields[k]['type'] = catalog.fieldTypeToString(type(v))
                        try:
                            size = catalog.getEstimatedSize(
                                fields[k]['type'], v)
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                    k, col_info['name'], pformat(fields[k]))
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]",
                                self.total_field_ctr, self.err_field_ctr)
                            continue
                        col_info['data_size'] += size
                        fields[k]['size_histogram'].put(size)
                        fields[k]['distinct_values'].add(v)
                        fields[k]['num_values'] += 1
                        if parent_col:
                            fields[k]['parent_col'] = parent_col
                    ## FOR
                else:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field for '%s'" %
                                  k)
                    if not 'fields' in fields[k]: fields[k]['fields'] = {}
                    self.processDataFields(col_info, fields[k]['fields'],
                                           doc[k])

            ## ----------------------------------------------
            ## LIST OF VALUES
            ## Could be either scalars or dicts. If it's a dict, then we'll just
            ## store the nested field information in the 'fields' value
            ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
            ## store the field information for the inner values.
            ## ----------------------------------------------
            elif isinstance(v, list):
                if self.debug:
                    LOG.debug("Extracting keys in nested list for '%s'" % k)
                if not 'fields' in fields[k]: fields[k]['fields'] = {}

                list_len = len(doc[k])
                fields[k]['list_len'].put(list_len)
                for i in xrange(list_len):
                    inner_type = type(doc[k][i])
                    # More nested documents...
                    if inner_type == dict:
                        if self.debug:
                            LOG.debug(
                                "Extracting keys in nested field in list position %d for '%s'"
                                % (i, k))
                        self.processDataFields(col_info, fields[k]['fields'],
                                               doc[k][i])
                    else:
                        # TODO: We probably should store a list of types here in case
                        #       the list has different types of values
                        inner = fields[k]['fields'].get(
                            constants.LIST_INNER_FIELD, {})
                        inner['type'] = catalog.fieldTypeToString(inner_type)
                        try:
                            inner_size = catalog.getEstimatedSize(
                                inner['type'], doc[k][i])
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]",
                                self.total_field_ctr, self.err_field_ctr)
                            continue

                        fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                        fields[k]['size_histogram'].put(inner_size)
                        fields[k]['distinct_values'].add(doc[k][i])
                        fields[k]['num_values'] += 1
                        if parent_col:
                            fields[k]['parent_col'] = parent_col
                ## FOR (list)
            ## ----------------------------------------------
            ## SCALAR VALUES
            ## ----------------------------------------------
            else:
                try:
                    size = catalog.getEstimatedSize(fields[k]['type'], v)
                    self.total_field_ctr += 1
                except:
                    LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                              k, col_info['name'], pformat(fields[k]))
                    self.err_field_ctr += 1
                    LOG.info("Total fields so far [%s], error fields [%s]",
                             self.total_field_ctr, self.err_field_ctr)
                    continue

                col_info['data_size'] += size
                fields[k]['size_histogram'].put(size)
                fields[k]['distinct_values'].add(v)
                fields[k]['num_values'] += 1
                if parent_col:
                    fields[k]['parent_col'] = parent_col
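Each field's size_histogram maps an observed byte size to the number of times it was seen; the comments above say it is later reduced to a weighted average field size. A minimal sketch of that reduction (the helper name is hypothetical):

# Hypothetical reduction of a size histogram (size -> count) to a weighted
# average, as hinted at by the comments above.
def weightedAverageSize(size_histogram):
    samples = sum(size_histogram.values())
    if samples == 0:
        return 0.0
    total = sum(size * count for size, count in size_histogram.items())
    return float(total) / samples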
Example #11
def main():
    # parser = optparse.OptionParser()
    parser = argparse.ArgumentParser(description='')

    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='count',
                        help='Increase verbosity (specify'
                        ' multiple times for more)')
    parser.add_argument('-g',
                        '--print-hist',
                        action='store_true',
                        dest='hist',
                        help='Print request latency histogram',
                        default=False)
    parser.add_argument('-c',
                        '--cores',
                        dest='cores',
                        action='store',
                        help='Set the number of cores of the system',
                        default=8)
    parser.add_argument('-n',
                        '--network-cores',
                        dest='network_cores',
                        action='store',
                        help='Set the number of networking'
                        ' cores of the system',
                        default=0)
    parser.add_argument('-s',
                        '--seed',
                        dest='seed',
                        action='store',
                        help='Set the seed for request generator')
    parser.add_argument('-t',
                        '--sim_time',
                        dest='sim_time',
                        action='store',
                        type=int,
                        help='Set the simulation time',
                        default=500000)
    parser.add_argument('--workload-conf',
                        dest='work_conf',
                        action='store',
                        help='Configuration file for the load generation'
                        ' functions',
                        default="../config/work.json")

    group = parser.add_argument_group('Host Options')
    group.add_argument('--host-type',
                       dest='host_type',
                       action='store',
                       help=('Set the host configuration (global queue,'
                             ' local queue, shinjuku, per flow queues,'
                             ' static core allocation)'),
                       default='global')
    group.add_argument('--deq-cost',
                       dest='deq_cost',
                       action='store',
                       help='Set the dequeuing cost',
                       default=0.0)
    group.add_argument('--queue-policy',
                       dest='queue_policy',
                       action='store',
                       help=('Set the queue policy to be followed by the per'
                             ' flow queue, ignored in any other queue'
                             ' configuration'),
                       default='FlowQueues')

    group = parser.add_argument_group('Print Options')
    group.add_argument('--print-values',
                       dest='print_values',
                       action='store_true',
                       help='Print all the latencies for'
                       ' each flow',
                       default=False)
    group.add_argument('--output-file',
                       dest='output_file',
                       action='store',
                       help='File to print all latencies',
                       default=None)

    opts = parser.parse_args()

    # Seeding
    if opts.seed:
        random.seed(int(opts.seed))
        np.random.seed(int(opts.seed))

    # Setup logging
    log_level = logging.WARNING
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose >= 2:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    # Initialize the different components of the system
    env = simpy.Environment()

    # Parse the configuration file
    flow_config = json.loads(open(opts.work_conf).read())

    # Create a histogram per flow and a global histogram
    histograms = Histogram(len(flow_config), float(opts.cores), flow_config,
                           opts)

    # Get the queue configuration
    host_conf = getattr(sys.modules[__name__], gen_dict[opts.host_type])
    sim_host = host_conf(env, int(opts.cores), histograms,
                         float(opts.deq_cost), flow_config, opts)

    # TODO:Update so that it's parametrizable
    # print "Warning: Need to update sim.py for parameterization and Testing"
    # First list is time slice, second list is load
    # sim_host = StaticCoreAllocationHost(env, int(opts.cores),
    #                                     float(opts.deq_cost), [0.0, 0.0],
    #                                     histograms, len(flow_config),
    #                                     [0.4, 0.4])

    multigenerator = MultipleRequestGenerator(env, sim_host)

    # Need to generate less load when we have shinjuku because one of the
    # cores is just the dispatcher. Adjust the count once, before the
    # per-flow loop, so it is not decremented again for every flow.
    if opts.host_type == "shinjuku":
        opts.cores = int(opts.cores) - 1

    # Create one request generator per flow
    for flow in flow_config:
        params = flow
        #work_gen = getattr(sys.modules[__name__],
        #                   gen_dict[params["work_gen"]])

        multigenerator.add_generator(
            RequestGenerator(env, sim_host, int(opts.cores), params))

    multigenerator.begin_generation()

    # Run the simulation
    env.run(until=opts.sim_time)

    # Print results in json format
    histograms.print_info()