Example #1
2
    def __init__(self, metadata_db, dataset_db):
        self.metadata_db = metadata_db
        self.dataset_db = dataset_db
        self.stop_on_error = False
        self.limit = None
        self.skip = None
        self.clean = None
        self.no_load = False
        self.no_reconstruct = False

        self.total_ops = 0
        self.err_field_ctr = 0
        self.total_field_ctr = 0
        
        self.hasher = OpHasher()
        self.rng = random.Random()
        self.debug = LOG.isEnabledFor(logging.DEBUG)
Example #2
0
    def __init__(self, metadata_db, dataset_db):
        self.metadata_db = metadata_db
        self.dataset_db = dataset_db
        self.stop_on_error = False
        self.limit = None
        self.skip = None
        self.clean = None
        self.no_load = False
        self.no_reconstruct = False

        self.total_ops = 0
        self.err_field_ctr = 0
        self.total_field_ctr = 0

        self.hasher = OpHasher()
        self.rng = random.Random()
        self.debug = LOG.isEnabledFor(logging.DEBUG)
Example #3
0
class AbstractConverter():
    def __init__(self, metadata_db, dataset_db):
        self.metadata_db = metadata_db
        self.dataset_db = dataset_db
        self.stop_on_error = False
        self.limit = None
        self.skip = None
        self.clean = None
        self.no_load = False
        self.no_reconstruct = False

        self.total_ops = 0
        self.err_field_ctr = 0
        self.total_field_ctr = 0
        self.fix_ctr = 0  # incremented by fixInvalidCollections()

        self.hasher = OpHasher()
        self.rng = random.Random()
        self.debug = LOG.isEnabledFor(logging.DEBUG)

    ## DEF

    def reset(self):
        # FIXME: This should be done with a single update query
        for col_info in self.metadata_db.Collection.fetch():
            for k, v in catalog.Collection.default_values.iteritems():
                if type(v) in [list, dict]: continue
                col_info[k] = v
            ## FOR

            ## TODO: This needs to be recursive
            for k, v in col_info['fields'].iteritems():
                v['query_use_count'] = 0
                v['query_hash'] = 0
                v['cardinality'] = 0
                v['selectivity'] = 0.0
                v['avg_size'] = 0
            col_info.save()
        ## FOR

    ## DEF

    def process(self,
                no_load=False,
                no_post_process=False,
                page_size=constants.DEFAULT_PAGE_SIZE):
        if not no_load: self.loadImpl()
        #self.printAllSessionsTime()
        if not no_post_process: self.postProcess(page_size)
        # self.printAllCollectionInfo()

    ## DEF

    def loadImpl(self):
        raise NotImplementedError("Unimplemented %s.loadImpl()" %
                                  self.__init__.im_class)

    ## DEF

    def printAllSessionsTime(self):
        for sess in self.metadata_db.Session.fetch():
            print "start_time: %s", pformat(sess['start_time'])
            print "end_time: %s", pformat(sess['end_time'])
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def printAllOperations(self):
        for sess in self.metadata_db.Session.fetch():
            for op in sess['operations']:
                print "ops\n%s", pformat(op)
            ## FOR
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def printAllDocs(self):
        for col_name in self.dataset_db.collection_names():
            for doc in self.dataset_db[col_name].find():
                print "col_name: ", col_name
                print "doc\n", pformat(doc)
            ## FOR
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def countDocs(self):
        counter = 0
        for colName in self.dataset_db.collection_names():
            for doc in self.dataset_db[colName].find():
                counter += 1
            ## FOR
        ## FOR
        print "doc count: ", counter

    ## DEF

    def printAllCollectionInfo(self):
        for colName in self.dataset_db.collection_names():
            # Skip ignored collections
            if colName.split(".")[0] in constants.IGNORED_COLLECTIONS:
                continue
            col_info = self.metadata_db.Collection.one({'name': colName})
            print "col_name: ", col_info['name']
            print "col_info\n", pformat(col_info)
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def postProcess(self, page_size=constants.DEFAULT_PAGE_SIZE):
        """
            The PostProcessor performs final calculations on an already loaded metadata
            and workload database. These calculations can be computed regardless of whether
            the catalog information came from MongoSniff or MySQL.
            This should only be invoked once after doing the initial loading.
        """
        # STEP 1: Add query hashes
        self.addQueryHashes()

        # STEP 2: Process workload
        self.computeWorkloadStats()

        # STEP 3: Process dataset
        self.extractSchemaCatalog()

        # Let the converter implementation have a shot at it
        self.postProcessImpl()

        # Calculate the embedding ratio
        self.calculateEmbeddingRatio()

        # Finalize workload percentage statistics for each collection
        percent_total = 0.0
        for col_info in self.metadata_db.Collection.find():
            for k in ['doc_count', 'workload_queries', 'avg_doc_size']:
                assert not col_info[k] is None, "%s.%s == None" % (
                    col_info['name'], k)
            col_info['workload_percent'] = col_info[
                'workload_queries'] / float(self.total_ops)
            percent_total += col_info['workload_percent']

            col_info['max_pages'] = min(
                1,
                col_info['doc_count'] * col_info['avg_doc_size'] / page_size)
            col_info.save()
        ## FOR
        assert round(percent_total, 1) > 0.0 and round(percent_total, 1) <= 1.0,\
            "Invalid workload percent total %f [totalOps=%d]" % (percent_total, self.total_ops)

    ## DEF
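
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): the per-collection
    ## arithmetic at the end of postProcess(), with made-up numbers. For
    ## total_ops = 200 and a collection with workload_queries = 50,
    ## doc_count = 1000, avg_doc_size = 256, page_size = 4096:
    ##
    ##   workload_percent = 50 / float(200)            # 0.25
    ##   max_pages        = min(1, 1000 * 256 / 4096)  # min(1, 62) -> 1 as written
    ##
    ## The final assert checks that the workload_percent values summed over all
    ## collections round to a total in (0.0, 1.0].
    ## ----------------------------------------------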

    def postProcessImpl(self):
        LOG.warn("Unimplemented %s.postProcessImpl()" % self.__init__.im_class)
        pass

    ## DEF

    def getEmbeddingRatio(self, parent_col, child_col, foreign_key):
        '''
            Count how many unique values the parent collection and the child collection
            have for the given foreign_key.
        '''
        commom_values = self.getCommonKeyValues(foreign_key, parent_col,
                                                child_col)

        parent_count_dict = self.getCountOfValues(parent_col, foreign_key,
                                                  commom_values)
        child_count_dict = self.getCountOfValues(child_col, foreign_key,
                                                 commom_values)
        #LOG.info("parent dict: %s", parent_count_dict)
        #LOG.info("child dict: %s", child_count_dict)

        geomean, parent_average = self.getRatio(parent_count_dict,
                                                child_count_dict,
                                                commom_values)
        return geomean, parent_average, commom_values

    ## DEF

    def getRatio(self, parent_count_dict, child_count_dict, values):
        ratio_list = []
        parent_sum = 0
        parent_count = 0

        for value in values:
            ratio_list.append(
                float(child_count_dict[value]) / parent_count_dict[value])
            parent_count += 1
            parent_sum += parent_count_dict[value]
        ## FOR

        if len(ratio_list) == 0:
            ratio_list.append(0)
        ## IF

        if parent_count > 0:
            parent_average = float(parent_sum) / parent_count
        else:
            parent_average = 0
        ## IF

        return mstats.gmean(ratio_list), parent_average

    ## DEF
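
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): how getRatio()
    ## behaves for a toy 1:N parent->child relationship. Names and numbers
    ## here are made up.
    ##
    ##   parent_count_dict = {"u1": 1, "u2": 1, "u3": 1}   # one parent doc per key value
    ##   child_count_dict  = {"u1": 2, "u2": 4, "u3": 8}   # several child docs per key value
    ##   values = ["u1", "u2", "u3"]
    ##
    ##   ratio_list      -> [2.0, 4.0, 8.0]
    ##   mstats.gmean()  -> 4.0   (geometric mean of the per-value ratios)
    ##   parent_average  -> 1.0   (average parent docs per shared value)
    ##
    ## A geometric mean > 1.0 with parent_average between 1.0 and 2.0 is what
    ## calculateFieldEmbeddingRato() later treats as evidence of a 1:N
    ## relationship that makes the child a candidate for embedding.
    ## ----------------------------------------------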

    def getCountOfValues(self, col_name, key, values):
        """
            Build a histogram of the number of documents found for each of the given values.
        """
        value_count_dict = {x: 0 for x in values}

        self.dataset_db[col_name].create_index(key)

        for value in values:
            value_count_dict[value] = self.dataset_db[col_name].find({
                key: value
            }).count()
        ## FOR

        self.dataset_db[col_name].drop_indexes()

        return value_count_dict

    ## DEF

    def getCommonKeyValues(self, key, parent_col, child_col):
        parent_distinct_values = set(self.dataset_db[parent_col].distinct(key))
        child_distinct_values = set(self.dataset_db[child_col].distinct(key))

        intersect_values = parent_distinct_values.intersection(
            child_distinct_values)

        sample_num = len(intersect_values) if len(
            intersect_values
        ) < 10000 else 10000  # FIXME: hard-coded sample cap of 10000 for now
        sampled_values = self.rng.sample(intersect_values, sample_num)

        if self.debug: LOG.info("sampled values: %s", len(sampled_values))

        return sampled_values

    ## DEF
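
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): getCommonKeyValues()
    ## intersects the distinct key values of the two collections and samples at
    ## most 10000 of them. With made-up values:
    ##
    ##   parent_distinct_values = {1, 2, 3, 4}
    ##   child_distinct_values  = {3, 4, 5}
    ##   intersect_values       -> {3, 4}
    ##   self.rng.sample(intersect_values, 2)   # both values, since 2 < 10000
    ## ----------------------------------------------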

    def calculateEmbeddingRatio(self):
        for colName in self.dataset_db.collection_names():
            # Skip ignored collections
            if colName.split(".")[0] in constants.IGNORED_COLLECTIONS:
                continue
            if colName.endswith("$cmd"): continue

            LOG.info("Calculating embedding ratio for collection '%s'",
                     colName)

            # Get the collection information object
            # We will use this to store the number times each key is referenced in a query
            col_info = self.metadata_db.Collection.one({'name': colName})
            assert col_info, "col_name %s" % colName

            self.calculateFieldEmbeddingRato(col_info, col_info['fields'])
        ## FOR

    ## DEF

    def calculateFieldEmbeddingRato(self, col_info, fields):
        for k, field in fields.iteritems():
            # Compute list information
            if 'parent_candidates' in field and len(
                    field['parent_candidates']) > 0:
                for tup in field['parent_candidates']:
                    parent_col = tup[0]

                    if parent_col.endswith("$cmd"): continue

                    foreign_key = tup[1]
                    child_col = col_info['name']

                    count = 0  # We do not know the relationship between these two collections, so we try them as both child and parent
                    while count < 2:
                        if count == 1:
                            tmp = parent_col
                            parent_col = child_col
                            child_col = tmp
                        # IF
                        if self.debug:
                            LOG.info("%s   --->   %s, key: %s", child_col,
                                     parent_col, foreign_key)

                        ratio, parent_average, commom_values = self.getEmbeddingRatio(
                            parent_col, child_col, foreign_key)

                        if self.debug:
                            LOG.info("ratio: %s", ratio)
                            LOG.info("parent_average: %s", parent_average)

                        if ratio > 1.0 and (
                                parent_average >= 1.0 and parent_average <= 2.0
                                and len(commom_values) > 1
                        ):  # if it is a 1:N relationship from parent to child
                            parent_col_info = self.metadata_db.Collection.fetch_one(
                                {"name": parent_col})
                            if self.debug:
                                LOG.info("%s might be embedded into %s",
                                         child_col, parent_col)

                            if child_col.find('.') != -1:
                                ## HACK HACK HACK
                                child_col = child_col.replace('.', '__')
                            ## IF
                            if child_col in parent_col_info['embedding_ratio']:
                                previous_ratio = parent_col_info[
                                    'embedding_ratio'][child_col]
                                if ratio > previous_ratio:
                                    parent_col_info['embedding_ratio'][
                                        child_col] = ratio
                            else:
                                parent_col_info['embedding_ratio'][
                                    child_col] = ratio

                            parent_col_info.save()
                        ## IF

                        count += 1
                    ## WHILE
                ## FOR
            ## IF

            if 'parent_col' in field and field[
                    'parent_col'] and 'parent_key' in field and field[
                        'parent_key']:  # This is probably only used by mysql trace
                # add the ratio to the parent collection
                parent_col_info = self.metadata_db.Collection.fetch_one(
                    {"name": field['parent_col']})
                ratio, parent_average, commom_values = self.getEmbeddingRatio(
                    field['parent_col'], col_info['name'], field["parent_key"])

                if col_info['name'] in parent_col_info['embedding_ratio']:
                    previous_ratio = parent_col_info['embedding_ratio'][
                        col_info['name']]
                    if ratio > previous_ratio:
                        parent_col_info['embedding_ratio'][
                            col_info['name']] = ratio
                else:
                    parent_col_info['embedding_ratio'][
                        col_info['name']] = ratio
                ## IF

                LOG.info("child col: %s", col_info['name'])
                LOG.info("parent col: %s", field['parent_col'])
                LOG.info("embedding_ratio: %s",
                         parent_col_info['embedding_ratio'][col_info['name']])
                parent_col_info.save()

            if 'fields' in field and field['fields']:
                self.computeFieldStats(col_info, field['fields'])
        ## FOR

    ## DEF

    def addQueryHashes(self):
        total = self.metadata_db.Session.find().count()
        LOG.info("Adding query hashes to %d sessions" % total)
        invalid_sess = []
        for sess in self.metadata_db.Session.fetch():
            # It is possible that the query content of some operations is incomplete,
            # which will cause sess.save() to fail. To work around this, we just delete
            # the incomplete op from the session
            for i in xrange(len(sess['operations']) - 1, -1, -1):
                op = sess['operations'][i]
                # For some reason even though the hash is an int, it
                # gets converted into a long when it shows up in MongoDB
                # I don't know why and frankly, I don't really care
                # So we'll just cast everything to a long and make sure that
                # the Session schema expects it to be a long
                try:
                    op["query_hash"] = long(self.hasher.hash(op))
                    if self.debug:
                        LOG.debug("  %d.%d -> Hash:%d", sess["session_id"],
                                  op["query_id"], op["query_hash"])
                except Exception:
                    sess['operations'].remove(op)
                ## TRY
            try:
                sess.save()
            except Exception:
                if self.debug:
                    LOG.info(
                        "session can not be saved due to operations error. Dump:\n%s",
                        sess['operations'])
                sess.delete()
            if self.debug: LOG.debug("Updated Session #%d", sess["session_id"])
        ## FOR
        if self.debug:
            LOG.debug("Query Class Histogram:\n%s" % self.hasher.histogram)
        ## IF

    ## DEF

    def computeWorkloadStats(self):
        """Process Workload Trace"""
        LOG.info("Computing database statistics from workload trace")

        # We'll maintain a local cache of Collection objects so that
        # we only have to look-up/create each one once, and then we can
        # save them all together at the end
        collectionCache = {}
        for sess in self.metadata_db.Session.fetch():
            start_time = None
            end_time = None

            for op in sess['operations']:
                # We need to know the total number of queries that we've seen
                self.total_ops += 1

                # The start_time is the timestamp of when the first query occurs
                if not start_time: start_time = op['query_time']
                start_time = min(start_time, op['query_time'])

                # The end_time is the timestamp of when the last response arrives
                if 'resp_time' in op and op['resp_time']:
                    end_time = max(end_time, op['resp_time'])
                elif not end_time and op['query_time']:
                    end_time = op['query_time']

                # Get the collection information object
                # We will use this to store the number times each key is referenced in a query
                if not op['collection'] in collectionCache:
                    if op['collection'] in constants.IGNORED_COLLECTIONS or op[
                            'collection'].endswith("$cmd"):
                        continue
                    col_info = self.metadata_db.Collection.one(
                        {'name': op['collection']})
                    if not col_info:
                        col_info = self.metadata_db.Collection()
                        col_info['name'] = op['collection']
                    collectionCache[op['collection']] = col_info
                    col_info['workload_queries'] = 0

                col_info = collectionCache[op['collection']]
                col_info['workload_queries'] += 1

                if not 'predicates' in op or not op['predicates']:
                    op['predicates'] = {}

                try:
                    for content in workload.getOpContents(op):
                        self.processOpFields(col_info['fields'], op, content)
                except:
                    LOG.error("Unexpected error for operation #%d in Session #%d\n%s", \
                              op['query_id'], sess['session_id'], pformat(op))
                    raise
            ## FOR (operations)

            if start_time and end_time:
                sess['start_time'] = start_time
                sess['end_time'] = end_time

            if self.debug:
                LOG.debug("Updating Session #%d" % sess['session_id'])
            try:
                sess.save()
            except:
                LOG.error("Failed to update Session #%d", sess['session_id'])
                raise
        ## FOR (sessions)

        # Save off all our skanky stink boxes right here...
        for col_info in collectionCache.itervalues():
            col_info.save()

    ## DEF

    def processOpFields(self, fields, op, content):
        if self.debug:
            LOG.debug("Processing operation fields\n%s", pformat(content))
        for k, v in content.iteritems():
            # Skip anything that starts with our special char
            # Those are flag markers used by MongoDB's queries
            if k.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                continue

            # We need to add the field to the collection if it doesn't
            # already exist. This will occur if this op was an aggregate,
            # which we ignore when recreating the schema
            f_type = type(v)
            if not k in fields:
                fields[k] = catalog.Collection.fieldFactory(
                    k, catalog.fieldTypeToString(f_type))
            fields[k]['query_use_count'] += 1

            # No predicate for insert operations
            # No projections for insert operations
            if op['type'] != constants.OP_TYPE_INSERT:
                # Update how this key was used with predicates
                if workload.isOpRegex(op, field=k):
                    op['predicates'][k] = constants.PRED_TYPE_REGEX
                elif isinstance(v, dict):
                    op['predicates'][k] = constants.PRED_TYPE_RANGE
                elif not k in op['predicates']:
                    op['predicates'][k] = constants.PRED_TYPE_EQUALITY

            ## TODO: Should we expect there to be field names with dot notation here?
            ##       Or should have all been cleaned out by the converters?

        ## FOR

        return

    ## DEF
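
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): how a query's
    ## predicates get classified by processOpFields(). For a hypothetical
    ## find() operation on {"name": "abc", "age": {"#gt": 21}} (with the
    ## dollar prefix already rewritten to the project's replacement marker),
    ## the resulting op['predicates'] would look roughly like:
    ##
    ##   {"name": constants.PRED_TYPE_EQUALITY,   # scalar value
    ##    "age":  constants.PRED_TYPE_RANGE}      # nested operator document
    ##
    ## Regex values are tagged PRED_TYPE_REGEX via workload.isOpRegex(), and
    ## insert operations get no predicates at all.
    ## ----------------------------------------------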

    ## ==============================================
    ## SCHEMA EXTRACTION
    ## ==============================================

    def extractSchemaCatalog(self, sample_rate=100):
        """
            Iterates through all documents and infers the schema.
            This only needs to extract the schema skeleton. The
            post-processing stuff in the AbstractConverter will populate
            the statistics information for each collection
        """
        LOG.info("Extracting database schema catalog from workload trace")
        for colName in self.dataset_db.collection_names():
            # Skip ignored collections
            if colName.split(".")[0] in constants.IGNORED_COLLECTIONS:
                continue
            LOG.info(
                "Extracting schema catalog information from collection '%s'",
                colName)

            # Get the collection information object
            # We will use this to store the number times each key is referenced in a query
            col_info = self.metadata_db.Collection.one({'name': colName})
            if not col_info:
                col_info = self.metadata_db.Collection()
                col_info['name'] = colName

            col_info['doc_count'] = 0
            col_info['data_size'] = 0L

            # Examine each document in the dataset for this collection
            for doc in self.dataset_db[colName].find():
                col_info['doc_count'] += 1
                if random.randint(0, 100) <= sample_rate:
                    try:
                        self.processDataFields(col_info, col_info['fields'],
                                               doc)
                    except:
                        msg = "Unexpected error when processing '%s' data fields" % colName
                        msg += "\n" + pformat(doc)
                        LOG.error(msg)
                        raise
            ## FOR

            # Calculate average tuple size (if we have at least one)
            if not col_info['doc_count']:
                col_info['avg_doc_size'] = int(col_info['data_size'])
            else:
                col_info['avg_doc_size'] = int(col_info['data_size'] /
                                               col_info['doc_count'])

            # Calculate cardinality and selectivity
            self.computeFieldStats(col_info, col_info['fields'])

            if self.debug:
                LOG.debug("Saved new catalog entry for collection '%s'" %
                          colName)
            try:
                col_info.save()
            except Exception:
                if self.debug:
                    LOG.info("abnormal col_info\n%s", pformat(col_info))
                raise
        ## FOR

    ## DEF

    def processDataFields(self, col_info, fields, doc):
        """
            Recursively traverse a single document and extract out the field information
        """
        if self.debug:
            LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

        # Check if the current doc has parent_col, but this will only apply to its fields
        parent_col = doc.get('parent_col', None)

        for k, v in doc.iteritems():
            # Skip if this is the _id field
            if constants.SKIP_MONGODB_ID_FIELD and k == '_id': continue
            if k == constants.FUNCTIONAL_FIELD: continue
            f_type = type(v)
            f_type_str = catalog.fieldTypeToString(f_type)

            if not k in fields:
                # This is only subset of what we will compute for each field
                # See catalog.Collection for more information
                if self.debug:
                    LOG.debug("Creating new field entry for '%s'" % k)
                fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
            else:
                fields[k]['type'] = f_type_str
                # Sanity check
                # This won't work if the data is not uniform
                #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

            # We will store the distinct values for each field in a set
            # that is embedded in the field. We will delete it when
            # we call computeFieldStats()
            if not 'distinct_values' in fields[k]:
                fields[k]['distinct_values'] = set()
            if not "num_values" in fields[k]:
                fields[k]['num_values'] = 0
            # Likewise, we will also store a histogram for the different sizes
            # of each field. We will use this later on to compute the weighted average
            if not 'size_histogram' in fields[k]:
                fields[k]['size_histogram'] = Histogram()
            # Maintain a histogram of list lengths
            if not 'list_len' in fields[k]:
                fields[k]['list_len'] = Histogram()

            if fields[k]['query_use_count'] > 0 and not k in col_info[
                    'interesting']:
                col_info['interesting'].append(k)

            ## ----------------------------------------------
            ## NESTED FIELDS
            ## ----------------------------------------------
            if isinstance(v, dict):
                # Check for a special data field
                if len(v) == 1 and v.keys()[0].startswith(
                        constants.REPLACE_KEY_DOLLAR_PREFIX):
                    v = v[v.keys()[0]]
                    # HACK to handle lists (and hopefully dicts as well) from nested IN clauses...
                    all_values = v if isinstance(v, list) else [v]
                    for v in all_values:
                        if isinstance(v, dict):
                            v = v.values()[0]

                        fields[k]['type'] = catalog.fieldTypeToString(type(v))
                        try:
                            size = catalog.getEstimatedSize(
                                fields[k]['type'], v)
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                    k, col_info['name'], pformat(fields[k]))
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]",
                                self.total_field_ctr, self.err_field_ctr)
                            continue
                        col_info['data_size'] += size
                        fields[k]['size_histogram'].put(size)
                        fields[k]['distinct_values'].add(v)
                        fields[k]['num_values'] += 1
                        if parent_col:
                            fields[k]['parent_col'] = parent_col
                    ## FOR
                else:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field for '%s'" %
                                  k)
                    if not 'fields' in fields[k]: fields[k]['fields'] = {}
                    self.processDataFields(col_info, fields[k]['fields'],
                                           doc[k])

            ## ----------------------------------------------
            ## LIST OF VALUES
            ## Could be either scalars or dicts. If it's a dict, then we'll just
            ## store the nested field information in the 'fields' value
            ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
            ## store the field information for the inner values.
            ## ----------------------------------------------
            elif isinstance(v, list):
                if self.debug:
                    LOG.debug("Extracting keys in nested list for '%s'" % k)
                if not 'fields' in fields[k]: fields[k]['fields'] = {}

                list_len = len(doc[k])
                fields[k]['list_len'].put(list_len)
                for i in xrange(list_len):
                    inner_type = type(doc[k][i])
                    # More nested documents...
                    if inner_type == dict:
                        if self.debug:
                            LOG.debug(
                                "Extracting keys in nested field in list position %d for '%s'"
                                % (i, k))
                        self.processDataFields(col_info, fields[k]['fields'],
                                               doc[k][i])
                    else:
                        # TODO: We probably should store a list of types here in case
                        #       the list has different types of values
                        inner = fields[k]['fields'].get(
                            constants.LIST_INNER_FIELD, {})
                        inner['type'] = catalog.fieldTypeToString(inner_type)
                        try:
                            inner_size = catalog.getEstimatedSize(
                                inner['type'], doc[k][i])
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]",
                                self.total_field_ctr, self.err_field_ctr)
                            continue

                        fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                        fields[k]['size_histogram'].put(inner_size)
                        fields[k]['distinct_values'].add(doc[k][i])
                        fields[k]['num_values'] += 1
                        if parent_col:
                            fields[k]['parent_col'] = parent_col
                ## FOR (list)
            ## ----------------------------------------------
            ## SCALAR VALUES
            ## ----------------------------------------------
            else:
                try:
                    size = catalog.getEstimatedSize(fields[k]['type'], v)
                    self.total_field_ctr += 1
                except:
                    LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                              k, col_info['name'], pformat(fields[k]))
                    self.err_field_ctr += 1
                    LOG.info("Total fields so far [%s], error fields [%s]",
                             self.total_field_ctr, self.err_field_ctr)
                    continue

                col_info['data_size'] += size
                fields[k]['size_histogram'].put(size)
                fields[k]['distinct_values'].add(v)
                fields[k]['num_values'] += 1
                if parent_col:
                    fields[k]['parent_col'] = parent_col
        ## FOR

    ## DEF
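
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): the field skeleton
    ## that processDataFields() builds for a made-up document such as
    ##
    ##   {"_id": ObjectId(...), "name": "abc", "tags": ["x", "y"], "addr": {"city": "Pittsburgh"}}
    ##
    ## Roughly (the _id field is skipped when SKIP_MONGODB_ID_FIELD is set):
    ##   fields["name"]                              scalar -> size histogram + distinct values
    ##   fields["tags"]["list_len"]                  Histogram of list lengths (here: 2)
    ##   fields["tags"]["fields"][LIST_INNER_FIELD]  type info for the scalar list entries
    ##   fields["addr"]["fields"]["city"]            nested documents recurse into 'fields'
    ##
    ## The temporary 'distinct_values', 'num_values', 'size_histogram' and
    ## 'list_len' entries are consumed and removed later by computeFieldStats().
    ## ----------------------------------------------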

    def computeFieldStats(self, col_info, fields):
        """
            Recursively calculate the cardinality of each field.
            This should only be invoked after processDataFields() has been called
        """
        for k, field in fields.iteritems():
            # Compute list information
            if 'list_len' in field:
                if len(field['list_len']) > 0:
                    all_values = field['list_len'].getAllValues()
                    field['list_len_min'] = int(min(all_values))
                    field['list_len_max'] = int(max(all_values))
                    field['list_len_avg'] = numpy.average(all_values)
                    field['list_len_stdev'] = numpy.std(all_values)
                del field['list_len']

            # Compute a weighted average for each field
            if 'size_histogram' in field:
                h = field['size_histogram']
                total = 0.0
                for size, count in h.iteritems():
                    if count: total += (size * count)
                num_samples = h.getSampleCount()
                if num_samples:
                    field['avg_size'] = int(math.ceil(total / num_samples))
                else:
                    field['avg_size'] = 0
                del field['size_histogram']
            # Use the distinct values set to determine cardinality + selectivity
            if 'distinct_values' in field:
                #                print "*" * 20
                #                print "col_name: ", col_info['name']
                #                print "field: ", k
                #                print "distinct values: ", pformat(field['distinct_values'])
                field['cardinality'] = len(field['distinct_values'])
                if field['num_values'] == 0:
                    field['selectivity'] = 0.0
                else:
                    field['selectivity'] = float(
                        field['cardinality']) / field['num_values']
                del field['distinct_values']
                del field['num_values']

            if 'fields' in field and field['fields']:
                self.computeFieldStats(col_info, field['fields'])
        ## FOR

    ## DEF
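
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): the statistics
    ## computeFieldStats() derives for a single field, with made-up counts.
    ## If a field was seen 100 times (num_values) with 25 distinct values and
    ## its size histogram holds 100 samples totalling 1000 bytes:
    ##
    ##   cardinality = 25
    ##   selectivity = 25 / float(100)                 # 0.25
    ##   avg_size    = int(math.ceil(1000 / 100.0))    # 10 bytes (weighted average)
    ##
    ## list_len_min/max/avg/stdev are taken from the list-length histogram
    ## collected in processDataFields() before that histogram is discarded.
    ## ----------------------------------------------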

    ## ==============================================
    ## OPERATION FIXIN'
    ## ==============================================

    def fixInvalidCollections(self):
        searchKey = {
            "operations.collection": constants.INVALID_COLLECTION_MARKER
        }
        for session in self.metadata_db.Session.find(searchKey):
            # Track whether any operation in this session was repaired
            dirty = False
            for op in session["operations"]:
                if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                    continue

                if self.debug:
                    LOG.debug("Attempting to fix corrupted Operation:\n%s" %
                              pformat(op))

                # For each field referenced in the query, build a histogram of
                # which collections have a field with the same name
                fields = workload.getReferencedFields(op)
                h = Histogram()
                for c in self.metadata_db.Collection.find():
                    for f in c['fields']:
                        if f in fields:
                            h.put(c['name'])
                    ## FOR
                ## FOR

                matches = h.getMaxCountKeys()
                if len(matches) == 0:
                    LOG.warn(
                        "No matching collection was found for corrupted operation\n%s"
                        % pformat(op))
                    continue
                elif len(matches) > 1:
                    LOG.warn(
                        "More than one matching collection was found for corrupted operation %s\n%s"
                        % (matches, pformat(op)))
                    continue
                else:
                    op["collection"] = matches[0]
                    dirty = True
                    self.fix_ctr += 1
                    LOG.info("Fixed corrupted collection in operation\n%s" %
                             pformat(op))
                ## IF
            ## FOR (operations)

            if dirty: session.save()
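
## ==============================================
## Usage sketch (illustrative, not part of the original file): a concrete
## converter is expected to subclass AbstractConverter, implement loadImpl()
## and optionally postProcessImpl(), and then be driven through process().
## The class name and database handles below are placeholders.
## ==============================================
##
##   class MyTraceConverter(AbstractConverter):
##       def loadImpl(self):
##           ...  # parse the trace and populate metadata_db / dataset_db
##       def postProcessImpl(self):
##           pass  # optional converter-specific fix-ups
##
##   converter = MyTraceConverter(metadata_db, dataset_db)
##   converter.process(no_load=False, no_post_process=False)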
Example #4
0
class AbstractConverter:
    def __init__(self, metadata_db, dataset_db, num_nodes=1):
        self.metadata_db = metadata_db
        self.dataset_db = dataset_db
        self.stop_on_error = False
        self.limit = None
        self.skip = None
        self.clean = None
        self.no_load = False
        self.no_reconstruct = False

        self.total_ops = 0
        self.err_field_ctr = 0
        self.total_field_ctr = 0

        self.hasher = OpHasher()
        self.rng = random.Random()
        self.debug = LOG.isEnabledFor(logging.DEBUG)
        self.num_nodes = num_nodes

    ## DEF

    def reset(self):
        # FIXME: This should be done with a single update query
        for col_info in self.metadata_db.Collection.fetch():
            for k, v in catalog.Collection.default_values.iteritems():
                if type(v) in [list, dict]:
                    continue
                col_info[k] = v
            ## FOR

            ## TODO: This needs to be recursive
            for k, v in col_info["fields"].iteritems():
                v["query_use_count"] = 0
                v["query_hash"] = 0
                v["cardinality"] = 0
                v["ranges"] = []
                v["selectivity"] = 0.0
                v["avg_size"] = 0
            col_info.save()
        ## FOR

    ## DEF

    def process(self, no_load=False, no_post_process=False, page_size=constants.DEFAULT_PAGE_SIZE):
        if not no_load:
            self.loadImpl()
        # self.printAllSessionsTime()
        if not no_post_process:
            self.postProcess(page_size)
        # self.printAllCollectionInfo()

    ## DEF

    def loadImpl(self):
        raise NotImplementedError("Unimplemented %s.loadImpl()" % self.__init__.im_class)

    ## DEF

    def printAllSessionsTime(self):
        for sess in self.metadata_db.Session.fetch():
            print "start_time: %s", pformat(sess["start_time"])
            print "end_time: %s", pformat(sess["end_time"])
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def printAllOperations(self):
        for sess in self.metadata_db.Session.fetch():
            for op in sess["operations"]:
                print "ops\n%s", pformat(op)
            ## FOR
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def printAllDocs(self):
        for col_name in self.dataset_db.collection_names():
            for doc in self.dataset_db[col_name].find():
                print "col_name: ", col_name
                print "doc\n", pformat(doc)
            ## FOR
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def countDocs(self):
        counter = 0
        for colName in self.dataset_db.collection_names():
            for doc in self.dataset_db[colName].find():
                counter += 1
            ## FOR
        ## FOR
        print "doc count: ", counter

    ## DEF

    def printAllCollectionInfo(self):
        for colName in self.dataset_db.collection_names():
            # Skip ignored collections
            if colName.split(".")[0] in constants.IGNORED_COLLECTIONS:
                continue
            col_info = self.metadata_db.Collection.one({"name": colName})
            print "col_name: ", col_info["name"]
            print "col_info\n", pformat(col_info)
        ## FOR
        exit("CUPCAKE")

    ## DEF

    def postProcess(self, page_size=constants.DEFAULT_PAGE_SIZE):
        """
            The PostProcessor performs final calculations on an already loaded metadata
            and workload database. These calculations can be computed regardless of whether
            the catalog information came from MongoSniff or MySQL.
            This should only be invoked once after doing the initial loading.
        """
        # STEP 1: Add query hashes
        self.addQueryHashes()

        # STEP 2: Process workload
        self.computeWorkloadStats()

        # STEP 3: Process dataset
        self.extractSchemaCatalog()

        # Let the converter implementation have a shot at it
        self.postProcessImpl()

        # Calculate the embedding ratio
        self.calculateEmbeddingRatio()

        # Finalize workload percentage statistics for each collection
        percent_total = 0.0
        for col_info in self.metadata_db.Collection.find():
            for k in ["doc_count", "workload_queries", "avg_doc_size"]:
                assert not col_info[k] is None, "%s.%s == None" % (col_info["name"], k)
            col_info["workload_percent"] = col_info["workload_queries"] / float(self.total_ops)
            percent_total += col_info["workload_percent"]

            col_info["max_pages"] = min(1, col_info["doc_count"] * col_info["avg_doc_size"] / page_size)
            col_info.save()
        ## FOR
        assert (
            round(percent_total, 1) > 0.0 and round(percent_total, 1) <= 1.0
        ), "Invalid workload percent total %f [totalOps=%d]" % (percent_total, self.total_ops)

    ## DEF

    def postProcessImpl(self):
        LOG.warn("Unimplemented %s.postProcessImpl()" % self.__init__.im_class)
        pass

    ## DEF

    def getEmbeddingRatio(self, parent_col, child_col, foreign_key, key):
        """
            Count how many unique values the parent collection has for the given
            foreign_key and the child collection has for the given key.
        """
        commom_values = self.getCommonKeyValues(foreign_key, key, parent_col, child_col)

        parent_count_dict = self.getCountOfValues(parent_col, foreign_key, commom_values)
        child_count_dict = self.getCountOfValues(child_col, key, commom_values)
        # LOG.info("parent dict: %s", parent_count_dict)
        # LOG.info("child dict: %s", child_count_dict)

        geomean, parent_average = self.getRatio(parent_count_dict, child_count_dict, commom_values)
        return geomean, parent_average, commom_values

    ## DEF

    def getRatio(self, parent_count_dict, child_count_dict, values):
        ratio_list = []
        parent_sum = 0
        parent_count = 0

        for value in values:
            if parent_count_dict[value] == 0:
                if child_count_dict[value] == 0:
                    ratio_list.append(0)
                else:
                    ratio_list.append(float("inf"))
            else:
                ratio_list.append(float(child_count_dict[value]) / parent_count_dict[value])
            parent_count += 1
            parent_sum += parent_count_dict[value]
        ## FOR

        if len(ratio_list) == 0:
            ratio_list.append(0)
        ## IF

        if parent_count > 0:
            parent_average = float(parent_sum) / parent_count
        else:
            parent_average = 0
        ## IF

        return mstats.gmean(ratio_list), parent_average

    ## DEF
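
    ## ----------------------------------------------
    ## Illustrative sketch (not part of the original class): this getRatio()
    ## guards against values with no matching parent documents instead of
    ## raising ZeroDivisionError. With made-up counts:
    ##
    ##   parent_count_dict = {"a": 0, "b": 0, "c": 1}
    ##   child_count_dict  = {"a": 0, "b": 3, "c": 2}
    ##
    ##   value "a" contributes 0, value "b" contributes float("inf"), and
    ##   value "c" contributes 2.0 to ratio_list before mstats.gmean() is
    ##   taken over the list.
    ## ----------------------------------------------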

    def getCountOfValues(self, col_name, key, values):
        """
            Build a histogram of the number of documents found for each of the given values.
        """
        value_count_dict = {x: 0 for x in values}

        self.dataset_db[col_name].create_index(key)

        for value in values:
            value_count_dict[value] = self.dataset_db[col_name].find({key: value}).count()
        ## FOR

        self.dataset_db[col_name].drop_indexes()

        return value_count_dict

    ## DEF

    def getCommonKeyValues(self, foreign_key, key, parent_col, child_col):
        parent_distinct_values = set(self.dataset_db[parent_col].distinct(foreign_key))
        child_distinct_values = set(self.dataset_db[child_col].distinct(key))

        intersect_values = parent_distinct_values.intersection(child_distinct_values)

        sample_num = (
            len(intersect_values) if len(intersect_values) < 10000 else 10000
        )  # FIXME: hard-coded sample cap of 10000 for now
        sampled_values = self.rng.sample(intersect_values, sample_num)

        if self.debug:
            LOG.info("sampled values: %s", len(sampled_values))

        return sampled_values

    ## DEF

    def calculateEmbeddingRatio(self):
        for colName in self.dataset_db.collection_names():
            # Skip ignored collections
            if colName.split(".")[0] in constants.IGNORED_COLLECTIONS:
                continue
            if colName.endswith("$cmd"):
                continue

            LOG.info("Calculating embedding ratio for collection '%s'", colName)

            # Get the collection information object
            # We will use this to store the number times each key is referenced in a query
            col_info = self.metadata_db.Collection.one({"name": colName})
            assert col_info, "col_name %s" % colName

            self.calculateFieldEmbeddingRato(col_info, col_info["fields"])
        ## FOR

    ## DEF

    def calculateFieldEmbeddingRato(self, col_info, fields):
        for k, field in fields.iteritems():
            # Compute list information
            if "parent_candidates" in field and len(field["parent_candidates"]) > 0:
                for tup in field["parent_candidates"]:
                    parent_col = tup[0]

                    if parent_col.endswith("$cmd"):
                        continue

                    foreign_key = tup[1]
                    child_col = col_info["name"]

                    # We do not know the relationship between these two collections,
                    # so we try them as both child and parent
                    count = 0
                    while count < 2:
                        if count == 1:
                            tmp = parent_col
                            parent_col = child_col
                            child_col = tmp
                        # IF
                        if self.debug:
                            LOG.info("%s   --->   %s, key: %s", child_col, parent_col, foreign_key)

                        ratio, parent_average, commom_values = self.getEmbeddingRatio(
                            parent_col, child_col, foreign_key, k
                        )

                        if self.debug:
                            LOG.info("ratio: %s", ratio)
                            LOG.info("parent_average: %s", parent_average)

                        if ratio > 1.0 and (
                            parent_average >= 1.0 and parent_average <= 2.0 and len(commom_values) > 1
                        ):  # if it is a 1:N relationship from parent to child
                            parent_col_info = self.metadata_db.Collection.fetch_one({"name": parent_col})
                            if self.debug:
                                LOG.info("%s might be embedded into %s", child_col, parent_col)

                            if child_col.find(".") != -1:
                                ## HACK HACK HACK
                                child_col = child_col.replace(".", "__")
                            ## IF
                            if child_col in parent_col_info["embedding_ratio"]:
                                previous_ratio = parent_col_info["embedding_ratio"][child_col]
                                if ratio > previous_ratio:
                                    parent_col_info["embedding_ratio"][child_col] = ratio
                            else:
                                parent_col_info["embedding_ratio"][child_col] = ratio

                            parent_col_info.save()
                        ## IF

                        count += 1
                    ## WHILE
                ## FOR
            ## IF

            if (
                "parent_col" in field and field["parent_col"] and "parent_key" in field and field["parent_key"]
            ):  # This is probably only used by mysql trace
                # add the ratio to the parent collection
                parent_col_info = self.metadata_db.Collection.fetch_one({"name": field["parent_col"]})
                ratio, parent_average, commom_values = self.getEmbeddingRatio(
                    field["parent_col"], col_info["name"], field["parent_key"], k
                )

                if col_info["name"] in parent_col_info["embedding_ratio"]:
                    previous_ratio = parent_col_info["embedding_ratio"][col_info["name"]]
                    if ratio > previous_ratio:
                        parent_col_info["embedding_ratio"][col_info["name"]] = ratio
                else:
                    parent_col_info["embedding_ratio"][col_info["name"]] = ratio
                ## IF

                LOG.info("child col: %s", col_info["name"])
                LOG.info("parent col: %s", field["parent_col"])
                LOG.info("embedding_ratio: %s", parent_col_info["embedding_ratio"][col_info["name"]])

                parent_col_info.save()

            if "fields" in field and field["fields"]:
                self.computeFieldStats(col_info, field["fields"])
        ## FOR

    ## DEF

    def addQueryHashes(self):
        total = self.metadata_db.Session.find().count()
        LOG.info("Adding query hashes to %d sessions" % total)
        invalid_sess = []
        for sess in self.metadata_db.Session.fetch():
            # It is possible that the query content of some operations is incomplete,
            # which will cause sess.save() to fail. To work around this, we just delete
            # the incomplete op from the session
            for i in xrange(len(sess["operations"]) - 1, -1, -1):
                op = sess["operations"][i]
                # For some reason even though the hash is an int, it
                # gets converted into a long when it shows up in MongoDB
                # I don't know why and frankly, I don't really care
                # So we'll just cast everything to a long and make sure that
                # the Session schema expects it to be a long
                try:
                    op["query_hash"] = long(self.hasher.hash(op))
                    if self.debug:
                        LOG.debug("  %d.%d -> Hash:%d", sess["session_id"], op["query_id"], op["query_hash"])
                except Exception:
                    sess["operations"].remove(op)
                ## TRY
            try:
                sess.save()
            except Exception:
                if self.debug:
                    LOG.info("session can not be saved due to operations error. Dump:\n%s", sess["operations"])
                sess.delete()
            if self.debug:
                LOG.debug("Updated Session #%d", sess["session_id"])
        ## FOR
        if self.debug:
            LOG.debug("Query Class Histogram:\n%s" % self.hasher.histogram)
        ## IF

    ## DEF

    def computeWorkloadStats(self):
        """Process Workload Trace"""
        LOG.info("Computing database statistics from workload trace")

        # We'll maintain a local cache of Collection objects so that
        # we only have to look-up/create each one once, and then we can
        # save them all together at the end
        collectionCache = {}
        for sess in self.metadata_db.Session.fetch():
            start_time = None
            end_time = None

            for op in sess["operations"]:
                # We need to know the total number of queries that we've seen
                self.total_ops += 1

                # The start_time is the timestamp of when the first query occurs
                if not start_time:
                    start_time = op["query_time"]
                start_time = min(start_time, op["query_time"])

                # The end_time is the timestamp of when the last response arrives
                if "resp_time" in op and op["resp_time"]:
                    end_time = max(end_time, op["resp_time"])
                elif not end_time and op["query_time"]:
                    end_time = op["query_time"]

                # Get the collection information object
                # We will use this to store the number times each key is referenced in a query
                if not op["collection"] in collectionCache:
                    if op["collection"] in constants.IGNORED_COLLECTIONS or op["collection"].endswith("$cmd"):
                        continue
                    col_info = self.metadata_db.Collection.one({"name": op["collection"]})
                    if not col_info:
                        col_info = self.metadata_db.Collection()
                        col_info["name"] = op["collection"]
                    collectionCache[op["collection"]] = col_info
                    col_info["workload_queries"] = 0

                col_info = collectionCache[op["collection"]]
                col_info["workload_queries"] += 1

                if not "predicates" in op or not op["predicates"]:
                    op["predicates"] = {}

                try:
                    for content in workload.getOpContents(op):
                        self.processOpFields(col_info["fields"], op, content)
                except:
                    LOG.error(
                        "Unexpected error for operation #%d in Session #%d\n%s",
                        op["query_id"],
                        sess["session_id"],
                        pformat(op),
                    )
                    raise
            ## FOR (operations)

            if start_time and end_time:
                sess["start_time"] = start_time
                sess["end_time"] = end_time

            if self.debug:
                LOG.debug("Updating Session #%d" % sess["session_id"])
            try:
                sess.save()
            except:
                LOG.error("Failed to update Session #%d", sess["session_id"])
                raise
        ## FOR (sessions)

        # Save off all our skanky stink boxes right here...
        for col_info in collectionCache.itervalues():
            col_info.save()

    ## DEF

    def processOpFields(self, fields, op, content):
        if self.debug:
            LOG.debug("Processing operation fields\n%s", pformat(content))
        for k, v in content.iteritems():
            # Skip anything that starts with our special char
            # Those are flag markers used by MongoDB's queries
            if k.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                continue

            # We need to add the field to the collection if it doesn't
            # already exist. This will occur if this op was an aggregate,
            # which we ignore when recreating the schema
            f_type = type(v)
            if not k in fields:
                fields[k] = catalog.Collection.fieldFactory(k, catalog.fieldTypeToString(f_type))

            if op["type"] != constants.OP_TYPE_INSERT:
                fields[k]["query_use_count"] += 1

            # No predicate for insert operations
            # No projections for insert operations
            if op["type"] != constants.OP_TYPE_INSERT:
                # Update how this key was used with predicates
                if workload.isOpRegex(op, field=k):
                    op["predicates"][k] = constants.PRED_TYPE_REGEX
                elif isinstance(v, dict):
                    op["predicates"][k] = constants.PRED_TYPE_RANGE
                elif not k in op["predicates"]:
                    op["predicates"][k] = constants.PRED_TYPE_EQUALITY

            ## TODO: Should we expect there to be field names with dot notation here?
            ##       Or should have all been cleaned out by the converters?

        ## FOR

        return

    ## DEF

    ## ==============================================
    ## SCHEMA EXTRACTION
    ## ==============================================

    def extractSchemaCatalog(self, sample_rate=100):
        """
            Iterates through all documents and infers the schema.
            This only needs to extract the schema skeleton. The
            post-processing stuff in the AbstractConverter will populate
            the statistics information for each collection
        """
        LOG.info("Extracting database schema catalog from workload trace")
        for colName in self.dataset_db.collection_names():
            # Skip ignored collections
            if colName.split(".")[0] in constants.IGNORED_COLLECTIONS:
                continue
            LOG.info("Extracting schema catalog information from collection '%s'", colName)

            # Get the collection information object
            # We will use this to store the number times each key is referenced in a query
            col_info = self.metadata_db.Collection.one({"name": colName})
            if not col_info:
                col_info = self.metadata_db.Collection()
                col_info["name"] = colName

            col_info["doc_count"] = 0
            col_info["data_size"] = 0L

            # Examine each document in the dataset for this collection
            for doc in self.dataset_db[colName].find():
                col_info["doc_count"] += 1
                if random.randint(0, 100) <= sample_rate:
                    try:
                        self.processDataFields(col_info, col_info["fields"], doc)
                    except:
                        msg = "Unexpected error when processing '%s' data fields" % colName
                        msg += "\n" + pformat(doc)
                        LOG.error(msg)
                        raise
            ## FOR

            # Calculate the average document size (guard against an empty collection)
            if not col_info["doc_count"]:
                col_info["avg_doc_size"] = int(col_info["data_size"])
            else:
                col_info["avg_doc_size"] = int(col_info["data_size"] / col_info["doc_count"])

            # Calculate cardinality and selectivity
            self.computeFieldStats(col_info, col_info["fields"])

            if self.debug:
                LOG.debug("Saved new catalog entry for collection '%s'" % colName)
            try:
                col_info.save()
            except Exception:
                if self.debug:
                    LOG.info("abnormal col_info\n%s", pformat(col_info))
                raise
        ## FOR

    ## DEF
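
    ## Hypothetical usage sketch (not part of the original source; "MyConverter"
    ## and the "orders" collection are made-up names, and both databases are
    ## assumed to be populated already):
    ##
    ##   converter = MyConverter(metadata_db, dataset_db)
    ##   converter.extractSchemaCatalog(sample_rate=25)   # inspect ~25% of the documents
    ##   col_info = metadata_db.Collection.one({"name": "orders"})
    ##   print col_info["avg_doc_size"], col_info["fields"].keys()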

    def processDataFields(self, col_info, fields, doc):
        """
            Recursively traverse a single document and extract its field information
        """
        if self.debug:
            LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

        # Check whether the current doc has a parent_col; if so, it applies only to this doc's fields
        parent_col = doc.get("parent_col", None)

        for k, v in doc.iteritems():
            # Skip if this is the _id field
            if constants.SKIP_MONGODB_ID_FIELD and k == "_id":
                continue
            if k == constants.FUNCTIONAL_FIELD:
                continue
            f_type = type(v)
            f_type_str = catalog.fieldTypeToString(f_type)

            if not k in fields:
                # This is only a subset of what we will compute for each field
                # See catalog.Collection for more information
                if self.debug:
                    LOG.debug("Creating new field entry for '%s'" % k)
                fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
            else:
                fields[k]["type"] = f_type_str
                # Sanity check (disabled: it fails when the data is not uniform)
                # if v is not None:
                #     assert fields[k]['type'] == f_type_str, \
                #         "Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

            # We will store the distinct values for each field in a set
            # that is embedded in the field. We will delete it when
            # we call computeFieldStats()
            if not "distinct_values" in fields[k]:
                fields[k]["distinct_values"] = set()
            if not "num_values" in fields[k]:
                fields[k]["num_values"] = 0
            # Likewise, we will also store a histogram for the different sizes
            # of each field. We will use this later on to compute the weighted average
            if not "size_histogram" in fields[k]:
                fields[k]["size_histogram"] = Histogram()
            # Maintain a histogram of list lengths
            if not "list_len" in fields[k]:
                fields[k]["list_len"] = Histogram()

            if fields[k]["query_use_count"] > 0 and not k in col_info["interesting"]:
                col_info["interesting"].append(k)

            ## ----------------------------------------------
            ## NESTED FIELDS
            ## ----------------------------------------------
            if isinstance(v, dict):
                # Check for a special data field
                if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                    v = v[v.keys()[0]]
                    # HACK to handle lists (and hopefully dicts as well) from nested IN clauses...
                    all_values = v if isinstance(v, list) else [v]
                    for v in all_values:
                        if isinstance(v, dict):
                            v = v.values()[0]

                        fields[k]["type"] = catalog.fieldTypeToString(type(v))
                        try:
                            size = catalog.getEstimatedSize(fields[k]["type"], v)
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error(
                                    "Failed to estimate size for field '%s' in collection '%s'\n%s",
                                    k,
                                    col_info["name"],
                                    pformat(fields[k]),
                                )
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr
                            )
                            continue
                        col_info["data_size"] += size
                        fields[k]["size_histogram"].put(size)
                        fields[k]["distinct_values"].add(v)
                        fields[k]["num_values"] += 1
                        if parent_col:
                            fields[k]["parent_col"] = parent_col
                    ## FOR
                else:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field for '%s'" % k)
                    if not "fields" in fields[k]:
                        fields[k]["fields"] = {}
                    self.processDataFields(col_info, fields[k]["fields"], doc[k])

            ## ----------------------------------------------
            ## LIST OF VALUES
            ## The entries could be either scalars or dicts. If an entry is a dict,
            ## then we'll just store the nested field information in the 'fields' value.
            ## If it's a scalar, then we'll use a special marker 'LIST_INNER_FIELD' to
            ## store the field information for the inner values.
            ## ----------------------------------------------
            elif isinstance(v, list):
                if self.debug:
                    LOG.debug("Extracting keys in nested list for '%s'" % k)
                if not "fields" in fields[k]:
                    fields[k]["fields"] = {}

                list_len = len(doc[k])
                fields[k]["list_len"].put(list_len)
                for i in xrange(list_len):
                    inner_type = type(doc[k][i])
                    # More nested documents...
                    if inner_type == dict:
                        if self.debug:
                            LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                        self.processDataFields(col_info, fields[k]["fields"], doc[k][i])
                    else:
                        # TODO: We probably should store a list of types here in case
                        #       the list has different types of values
                        inner = fields[k]["fields"].get(constants.LIST_INNER_FIELD, {})
                        inner["type"] = catalog.fieldTypeToString(inner_type)
                        try:
                            inner_size = catalog.getEstimatedSize(inner["type"], doc[k][i])
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error(
                                    "Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",
                                    i,
                                    k,
                                    col_info["name"],
                                    pformat(fields[k]),
                                )
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr
                            )
                            continue

                        fields[k]["fields"][constants.LIST_INNER_FIELD] = inner
                        fields[k]["size_histogram"].put(inner_size)
                        fields[k]["distinct_values"].add(doc[k][i])
                        fields[k]["num_values"] += 1
                        if parent_col:
                            fields[k]["parent_col"] = parent_col
                ## FOR (list)
            ## ----------------------------------------------
            ## SCALAR VALUES
            ## ----------------------------------------------
            else:
                try:
                    size = catalog.getEstimatedSize(fields[k]["type"], v)
                    self.total_field_ctr += 1
                except:
                    LOG.error(
                        "Failed to estimate size for field %s in collection %s\n%s",
                        k,
                        col_info["name"],
                        pformat(fields[k]),
                    )
                    self.err_field_ctr += 1
                    LOG.info("Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr)
                    continue

                col_info["data_size"] += size
                fields[k]["size_histogram"].put(size)
                fields[k]["distinct_values"].add(v)
                fields[k]["num_values"] += 1
                if parent_col:
                    fields[k]["parent_col"] = parent_col
        ## FOR

    ## DEF
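
    ## Illustrative sketch (not part of the original source) of what
    ## processDataFields() builds for a document like {"name": "abc", "tags": ["x", "y"]}:
    ##   fields["name"] gets its size_histogram / distinct_values / num_values updated,
    ##   fields["tags"]["list_len"] gets put(2), and the per-element information is
    ##   kept under fields["tags"]["fields"][constants.LIST_INNER_FIELD].
    ## The temporary bookkeeping keys (distinct_values, num_values, size_histogram,
    ## list_len) are consumed and deleted by computeFieldStats() below.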

    def computeFieldStats(self, col_info, fields):
        """
            Recursively calculate the cardinality of each field.
            This should only be invoked after processDataFields() has been called
        """
        for k, field in fields.iteritems():
            # Compute list information
            if "list_len" in field:
                if len(field["list_len"]) > 0:
                    all_values = field["list_len"].getAllValues()
                    field["list_len_min"] = int(min(all_values))
                    field["list_len_max"] = int(max(all_values))
                    field["list_len_avg"] = numpy.average(all_values)
                    field["list_len_stdev"] = numpy.average(all_values)
                del field["list_len"]

            # Compute a weighted average for each field
            if "size_histogram" in field:
                h = field["size_histogram"]
                total = 0.0
                for size, count in h.iteritems():
                    if count:
                        total += size * count
                num_samples = h.getSampleCount()
                if num_samples:
                    field["avg_size"] = int(math.ceil(total / num_samples))
                else:
                    field["avg_size"] = 0
                del field["size_histogram"]
            # Use the distinct values set to determine cardinality + selectivity
            if "distinct_values" in field:
                field["cardinality"] = len(field["distinct_values"])
                if field["num_values"] == 0:
                    field["selectivity"] = 0.0
                else:
                    field["selectivity"] = float(field["cardinality"]) / field["num_values"]
                self.computeFieldRange(field)
                del field["distinct_values"]
                del field["num_values"]

            if "fields" in field and field["fields"]:
                self.computeFieldStats(col_info, field["fields"])
        ## FOR

    ## DEF
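
    ## Worked example (illustrative, not part of the original source): if a field's
    ## size_histogram recorded {8: 3, 16: 1}, then avg_size = ceil((8*3 + 16*1) / 4) = 10;
    ## if it saw 4 values of which 3 were distinct, then cardinality = 3 and
    ## selectivity = 3 / 4.0 = 0.75.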

    def computeFieldRange(self, field):
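        """
            Split the sorted distinct values of a field into at most
            self.num_nodes range boundaries, stored in field['ranges'].
            This must run while 'distinct_values' is still attached to
            the field, i.e. from within computeFieldStats().
        """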
        num_ranges = self.num_nodes
        sorted_distinct_values = sorted([value for value in field["distinct_values"] if value is not None])
        if len(sorted_distinct_values) > 0:
            field["ranges"] = []
            if num_ranges <= 1:
                field["ranges"].append(sorted_distinct_values[0])
            else:
                range_len = len(sorted_distinct_values) / num_ranges
                if range_len == 0:
                    range_len = 1
                i = 0
                while i < len(sorted_distinct_values) and len(field["ranges"]) < num_ranges:
                    field["ranges"].append(sorted_distinct_values[i])
                    i += range_len
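
    ## DEF

    ## Worked example (illustrative, not part of the original source): with
    ## distinct values [1, 2, 3, 4, 5, 6, 7, 8] and self.num_nodes = 4,
    ## range_len = 8 / 4 = 2 and field["ranges"] becomes [1, 3, 5, 7],
    ## i.e. the lower bound of each of the four ranges.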

    ## ==============================================
    ## OPERATION FIXIN'
    ## ==============================================

    def fixInvalidCollections(self):
        searchKey = {"operations.collection": constants.INVALID_COLLECTION_MARKER}
        for session in self.metadata_db.Session.find(searchKey):
            # Track whether any operation in this session was repaired
            dirty = False
            for op in session["operations"]:
                if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                    continue

                if self.debug:
                    LOG.debug("Attempting to fix corrupted Operation:\n%s" % pformat(op))

                # For each field referenced in the query, build a histogram of
                # which collections have a field with the same name
                fields = workload.getReferencedFields(op)
                h = Histogram()
                for c in self.metadata_db.Collection.find():
                    for f in c["fields"]:
                        if f in fields:
                            h.put(c["name"])
                    ## FOR
                ## FOR

                matches = h.getMaxCountKeys()
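                # Illustrative note (not part of the original source): only when exactly
                # one collection's catalog entry defines fields matching the op's
                # referenced fields do we adopt it below; zero or multiple matches
                # leave the operation untouched.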
                if len(matches) == 0:
                    LOG.warn("No matching collection was found for corrupted operation\n%s" % pformat(op))
                    continue
                elif len(matches) > 1:
                    LOG.warn(
                        "More than one matching collection was found for corrupted operation %s\n%s"
                        % (matches, pformat(op))
                    )
                    continue
                else:
                    op["collection"] = matches[0]
                    dirty = True
                    self.fix_ctr += 1
                    LOG.info("Fix corrupted collection in operation\n%s" % pformat(op))
                    ## IF
                    ## FOR (operations)

            if dirty:
                session.save()