class TestAbstractConverter(MongoDBTestCase):

    def setUp(self):
        MongoDBTestCase.setUp(self)

        # Generate some fake workload sessions
        for i in xrange(0, NUM_SESSIONS):
            sess = self.metadata_db.Session()
            sess['session_id'] = i
            sess['ip_client'] = "client:%d" % (1234+i)
            sess['ip_server'] = "server:5678"
            sess['start_time'] = time.time()
            sess['end_time'] = time.time() + 5
            for j in xrange(0, NUM_OPS_PER_SESSION):
                _id = str(random.random())
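                # Pack the session index into the high bits and the operation index
                # into the low bits so that every query id is unique across sessions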
                queryId = long((i<<16) + j)

                responseContent = {"_id": _id}
                responseId = (queryId<<8)
                for f in xrange(0, NUM_FIELDS):
                    f_name = "field%02d" % f
                    if f % 2 == 0:
                        responseContent[f_name] = random.randint(0, 100)
                    else:
                        responseContent[f_name] = str(random.randint(1000, 100000))
                ## FOR

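                # Reuse the response fields as the query predicate. The "$query" key is
                # written with REPLACE_KEY_DOLLAR_PREFIX because keys beginning with '$'
                # cannot be stored as-is in MongoDB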
                queryContent = { constants.REPLACE_KEY_DOLLAR_PREFIX + "query": responseContent }
                op = workload.Session.operationFactory()
                op['collection']    = COLLECTION_NAME
                op['type']          = constants.OP_TYPE_QUERY
                op['query_id']      = queryId
                op['query_content'] = [ queryContent ]
                op['resp_content']  = [ responseContent ]
                op['resp_id']       = responseId
                sess['operations'].append(op)
            ## FOR (ops)

            sess.save()
        ## FOR (sess)
        self.assertEqual(NUM_SESSIONS, self.metadata_db.Session.find().count())

        self.converter = AbstractConverter(self.metadata_db, self.dataset_db)
    ## DEF

    def testAddQueryHashes(self):
        # First make sure that all of the operations' query hashes are null
        for sess in self.metadata_db.Session.find():
            self.assertNotEqual(0, len(sess['operations']))
            for op in sess['operations']:
                self.assertIsNone(op['query_hash'])
        ## FOR

        # Now add the hashes. They should all be the same
        self.converter.addQueryHashes()
        firstHash = None
        for sess in self.metadata_db.Session.find():
            self.assertNotEqual(0, len(sess['operations']))
            for op in sess['operations']:
                if not firstHash:
                    firstHash = op['query_hash']
                    self.assertIsNotNone(firstHash)
                self.assertEqual(firstHash, op['query_hash'])
        ## FOR

    ## DEF

    def testPostProcess(self):
        """
            Check whether we can successfully extract the database schema
            into our internal catalog
        """
        Reconstructor(self.metadata_db, self.dataset_db).process()
        self.converter.postProcess()

        col_info = self.metadata_db.Collection.one({"name": COLLECTION_NAME})
#        pprint(col_info)

        # Workload-derived Attributes
        self.assertEqual(NUM_SESSIONS*NUM_OPS_PER_SESSION, col_info['workload_queries'])
        self.assertAlmostEqual(1.0, col_info['workload_percent'])

        # Fields
        # Add one for the '_id' field count
        self.assertEqual(NUM_FIELDS + 1, len(col_info['fields']))
        for k, field in col_info['fields'].iteritems():
            self.assertEqual(NUM_SESSIONS*NUM_OPS_PER_SESSION, field['query_use_count'])

    ## DEF

    def testProcessDataFieldsSimple(self):
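        # Each key is named after the Python type of its value so that the field
        # 'type' inferred by processDataFields() can be checked against the key itself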
        doc = {
            'int':     123,
            'str':     'abc',
            'float':   123.4,
            # TODO
            #'list':    range(10),
            #'dict': ....
        }

        col_info = catalog.Collection()
        col_info['data_size'] = 0

        fields = { }
        self.converter.processDataFields(col_info, fields, doc)
        self.assertIsNotNone(fields)
        self.assertEqual(dict, type(fields))
        for key, val in doc.iteritems():
            self.assertIn(key, fields)
            f = fields[key]
            self.assertIsNotNone(f)
            self.assertEqual(key, f['type'])
        ## FOR
    ## DEF

    def testProcessDataFields(self):
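        # A nested document that mixes lists, embedded sub-documents, and "$date"
        # timestamp markers (mimicking an anonymized trace record) to exercise
        # recursive field extraction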
        doc = {
            "similar_artists" : [
                "50e130f676d6081483d7aeaf90702caa/7",
                "3b6fac3e5e112ae35414480ccc5eb154/23",
                ],
            "name" : "596ea227ea0ce4dadbca2f06bddd30c9/15",
            "created" : {
                "\$date" : 1335871184519l,
                },
            "image" : {
                "large" : "1b942d952ccd004325c997c012d49354/49",
                "extralarge" : "bd11cf67bd8ee7653a1cfdf782c4ffaa/49",
                "small" : "f5728a43a9e3efac9a0670cc66c2229f/48",
                "medium" : "00b6d53c70a4fe656a4fc867ed9aceed/48",
                "mega" : "6998e2abb589312f0fd358943865bf3c/61"
            },
            "last_modified" : {
                "\$date" : datetime.now(),
                },
            "alias_md5s" : [
                "2b763d64b83180c5512a962d5c4d5115/34"
            ],
            "aliases" : [
                "3019b6686229c4cf5089431332dee196/15"
            ]
        }

        col_info = catalog.Collection()
        col_info['data_size'] = 0

        fields = { }
        self.converter.processDataFields(col_info, fields, doc)
        self.assertIsNotNone(fields)
        self.assertNotEqual(len(fields), 0)
#        pprint(fields)

        # Check to make sure that we have a field entry for each
        # key in our original document. We will need to check recursively
        # to make sure that our nested keys get picked up
        for key, val in doc.iteritems():
            self.assertIn(key, fields)
            f = fields[key]
            self.assertNotEqual(f, None)