Exemplo n.º 1
0
 def setUp(self):
     """Build a disk cost component and flatten the workload operations.

     Runs the shared CostModelTestCase fixture first, then creates a
     DiskCostComponent with ``no_index_size_estimation`` set to False and
     gathers the operations of every session, in session order, into
     ``self.ops``.
     """
     CostModelTestCase.setUp(self)
     self.cm = DiskCostComponent(self.state)
     self.cm.no_index_size_estimation = False
     self.ops = []
     for sess in self.workload:
         # extend() rather than map(self.ops.append, ...): calling map()
         # only for its side effect builds a throwaway list of Nones and
         # silently does nothing on interpreters where map() is lazy.
         self.ops.extend(sess["operations"])
class TestDiskCostIndexesWithProjection(CostModelTestCase):

    def setUp(self):
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cm.no_index_insertion_penalty = True
        
    ## DEF
    def testDiskCostIndexes(self):
        """Check whether disk cost calculations work correctly"""
        # First get the disk cost when there are no indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])

        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # The cost should be exactly equal to one, which means that every operation
        # has to perform a full sequential scan on the collection
        self.assertEqual(cost0, 1.0)

        # Now add one index. The disk cost should be lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], ["field01"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
        self.assertGreater(cost0, cost1)
        
        # Now add one more index. The disk cost should be lower again
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], ["field01", "field00"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost2 = self.cm.getCost(d)
        print "diskCost2:", cost2
        
        # Now add the one index. The disk cost should be much lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], ["field01", "field00", "field02"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost3 = self.cm.getCost(d)
        print "diskCost3:", cost3
        self.assertGreater(cost2, cost3)
class TestDiskCostIndexesWithProjection(CostModelTestCase):
    def setUp(self):
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cm.no_index_insertion_penalty = True

    ## DEF
    def testDiskCostIndexes(self):
        """Check whether disk cost calculations work correctly"""
        # First get the disk cost when there are no indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])

        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # The cost should be exactly equal to one, which means that every operation
        # has to perform a full sequential scan on the collection
        self.assertEqual(cost0, 1.0)

        # Now add one index. The disk cost should be lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], ["field01"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
        self.assertGreater(cost0, cost1)

        # Now add one more index. The disk cost should be lower again
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], ["field01", "field00"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost2 = self.cm.getCost(d)
        print "diskCost2:", cost2

        # Now add the one index. The disk cost should be much lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], ["field01", "field00", "field02"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost3 = self.cm.getCost(d)
        print "diskCost3:", cost3
        self.assertGreater(cost2, cost3)
 def setUp(self):
     """Build a disk cost component and flatten the workload operations.

     Runs the shared CostModelTestCase fixture first, then creates a
     DiskCostComponent with ``no_index_size_estimation`` set to False and
     gathers the operations of every session, in session order, into
     ``self.ops``.
     """
     CostModelTestCase.setUp(self)
     self.cm = DiskCostComponent(self.state)
     self.cm.no_index_size_estimation = False
     self.ops = []
     for sess in self.workload:
         # extend() rather than map(self.ops.append, ...): calling map()
         # only for its side effect builds a throwaway list of Nones and
         # silently does nothing on interpreters where map() is lazy.
         self.ops.extend(sess["operations"])
 def setUp(self):
     """Run the base fixture and create a DiskCostComponent whose
     ``no_index_insertion_penalty`` flag is set to True."""
     CostModelTestCase.setUp(self)
     component = DiskCostComponent(self.state)
     component.no_index_insertion_penalty = True
     self.cm = component
 def setUp(self):
     """Run the base fixture, create the disk and network cost components
     and record the names (keys) of all known collections."""
     CostModelTestCase.setUp(self)
     self.cm = DiskCostComponent(self.state)
     self.cmn = NetworkCostComponent(self.state)
     # Snapshot of the collection keys as a plain list
     self.col_names = list(self.collections.iterkeys())
class TestWorkloadCombiner(CostModelTestCase):

    def setUp(self):
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cmn = NetworkCostComponent(self.state)
        self.col_names = [ x for x in self.collections.iterkeys()]
    ## DEF

    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d.addCollection(col_info['name'])

        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1
                
        print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)

    ## DEF
    
    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collection A, B, C and we want to embed C to A
            If we build index on field00 of A and field02 of C
            The cost after query combination should be lower
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        
        cost0 = self.cm.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])
            
        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)
                
        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print "cost1 " + str(cost1)
        
        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        print "child collection ", self.cm.child_collections
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
    ## def
    
    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduce after embedding collections
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d1.addCollection(col_info['name'])
            d1.addIndex(col_info['name'], ['field00', 'field02'])
            self.state.invalidateCache(col_info['name'])

        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print "cost1 " + str(cost1)

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print "cost2 " + str(cost2)

        self.assertEqual(cost2, cost0)
    ## def

    def testNotCollectionEmbeddingProcessShouldReturnNone(self):
        """
            If the given design has no collection embedding, we should return right away
        """
        d0 = Design()
        for i in xrange(len(CostModelTestCase.COLLECTION_NAMES)):
            col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[i]]
            d0.addCollection(col_info['name'])
            d0.addIndex(col_info['name'], ['field00', 'field02'])

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        combinedWorkload = combiner.process(d0)
        self.assertEqual(None, combinedWorkload)
class TestWorkloadCombiner(CostModelTestCase):

    def setUp(self):
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.col_names = [ x for x in self.collections.iterkeys()]
        ## DEF

    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = 0
        for sess in self.workload:
            for op in sess["operations"]:
                original_number_of_queries += 1

        print "orignal number of queries: " + str(original_number_of_queries)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)

        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = 0
        for sess in combinedWorkload:
            for op in sess["operations"]:
                number_of_queries_from_combined_workload += 1
                
        print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)

        self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
        
    def testDiskCostNotChangedAfterQueryCombination(self):
        """Disk cost should not be changed after query combination"""
        d = Design()
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)
        
        cost0 = self.cm.getCost(d)
        print "cost0 " + str(cost0)

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = Design()
        d = Design()
        for col_name in self.collections.iterkeys():
            d.addCollection(col_name)
            self.state.invalidateCache(col_name)
            
        d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)

        combinedWorkload = combiner.process(d)
        self.state.updateWorkload(combinedWorkload)
                
        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)

        print "cost1 " + str(cost1)
        
        self.assertEqual(cost0, cost1)
 def setUp(self):
     """Run the base fixture and create a DiskCostComponent whose
     ``no_index_insertion_penalty`` flag is set to True."""
     CostModelTestCase.setUp(self)
     component = DiskCostComponent(self.state)
     component.no_index_insertion_penalty = True
     self.cm = component
class TestDiskCostIndexes(CostModelTestCase):

    def setUp(self):
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cm.no_index_insertion_penalty = True
    # DEF
    def testDiskCostIndexes(self):
        """Check whether disk cost calculations work correctly"""
        # First get the disk cost when there are no indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])

        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # The cost should be exactly equal to one, which means that every operation
        # has to perform a full sequential scan on the collection
        self.assertEqual(cost0, 1.0)

        # Now add the all indexes. The disk cost should be lower
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], col_info['interesting'])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
        self.assertGreater(cost0, cost1)

    def testDiskCostOnDifferentIndexes(self):
        """Check how indexes will affect the disk cost"""
        # 1. Put index on both of the fields seperately
        d = Design()
        d.addCollection(CostModelTestCase.COLLECTION_NAME)
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field00"])
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01"])

        self.cm.reset()
        self.cm.state.reset()
        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0

        # 3. Put indexes on both field together
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(CostModelTestCase.COLLECTION_NAME)
        d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01", "field00"])
        self.state.invalidateCache(col_info['name'])

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1

        self.assertGreater(cost0, cost1)

    def testDiskCostCaching(self):
        """Check whether disk cost calculations work correctly with caching enabled"""
        self.cm.cache_enable = True

        # Give the mofo a full Design with indexes
        d = Design()
        col_info = self.collections[CostModelTestCase.COLLECTION_NAME]
        d.addCollection(col_info['name'])
        d.addIndex(col_info['name'], col_info['interesting'])
            ## FOR
        cost0 = self.cm.getCost(d)
        print "diskCost0:", cost0
        # FIXME self.assertGreater(cost0, 0.0)

        # We should get the same cost back after we execute it a second time
        cost1 = self.cm.getCost(d)
        print "diskCost1:", cost1
class TestDiskCostGuessIndex(CostModelTestCase):
    """Tests for DiskCostComponent.guess_op_info(): which index is picked
    for an operation, whether it is covering, and its estimated size."""

    def setUp(self):
        """Build a disk cost component and flatten the workload operations.

        Creates a DiskCostComponent with ``no_index_size_estimation`` set to
        False and gathers the operations of every session, in session order,
        into ``self.ops``.
        """
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cm.no_index_size_estimation = False
        self.ops = []
        for sess in self.workload:
            # extend() rather than map(self.ops.append, ...): calling map()
            # only for its side effect builds a throwaway list of Nones and
            # silently does nothing on interpreters where map() is lazy.
            self.ops.extend(sess["operations"])

    def testGuessIndex_consistentAnswer(self):
        """Check that guessIndex always returns the same answer for the same input"""

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])

        # All operations except the last two are checked (same bound as the
        # original index loop; the reason for skipping them is not visible
        # here).
        for op in self.ops[:-2]:
            last_index, last_covering = (None, None)
            # Repeat the guess many times; '_' because the counter is unused
            # (the original reused 'i' and shadowed the outer loop variable).
            for _ in xrange(100):
                best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
                self.assertIsNotNone(best_index)
                self.assertIsNotNone(covering)
                if last_index is not None:
                    self.assertEqual(last_index, best_index)
                    self.assertEqual(last_covering, covering)
                last_index, last_covering = (best_index, covering)
            ## FOR
    ## DEF

    def testGuessIndex_indexInIncorrectOrder(self):
        """
            Design with index (field01, field00)
            1. query uses index (field00)
            result: not using index because that query uses indexes in order
            2. query uses index (field01)
            result: using index (field01, field00) because this is the best match
            3. query uses index (field01, field00)
            result: using index (field01, field00) because they match the best

            Design with index (field00, field01)
            4. query uses index (field01, field00)
            result: NOTE(review): this was documented as "using no index
            because the index order is not correct", but the assertions
            below expect a two-field index to be returned -- confirm which
            behavior is intended

            Design with index (field01, field02, field00)
            5. query uses index (field01, field00)
            result: using index (field01, field02, field00) because they match the best
            result: not cover index because the index order in design is not correct
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])

        # query 1: get query, queries on field00
        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertIsNone(best_index)
        self.assertFalse(covering)

        # query 2: get query, queries on field01
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 3: get query, queries on field01 and field00
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 4: same operation against an index with the fields swapped
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertFalse(covering)

        # query 5: same operation against a three-field index
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field02", "field00"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 3)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field02")
        self.assertEqual(best_index[2], "field00")
        self.assertFalse(covering)

    def testGuessIndex_indexChooseTheMostMatch(self):
        """
            Design with index (field01, field00), (field01),
            1. query uses index (field01) without projection field
            result: using index (field01) because they match the most
            2. query used index (field01, field00) without projection field
            result: using index (field01, field00) because they match the most

            If we have a design building indexes on (field01) only
            3. query uses index (field01, field00) without projection field
            result: using index (field01) because they match the most

            If we have a design building indexes on (field01, field03, field00), (field01)
            4. query uses index (field01, field00)
            result: using index (field01) because field01 is shorter
        """
        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field00"])
        d.addIndex("apple", ["field01"])

        # query 1: get query
        op = self.ops[1]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 2:  get query
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], 'field01')
        self.assertEqual(best_index[1], 'field00')
        self.assertFalse(covering)

        ## query 3:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01"])

        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 4:
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field01", "field03", "field00"])
        d.addIndex("apple", ["field01"])
        op = self.ops[2]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

    def testGuessIndex_indexChooseWithProjectionField(self):
        """
            If one index in the design contains both the field a query
            filters on and the field it projects, that index should be
            chosen over a shorter one holding only the filter field, and it
            should be reported as covering
        """
        # If we have a design with index (field00), (field00, field02)
        # 1. query uses field00 but its projection field is {field02: xx}
        # result: we should choose (field00, field02) as the best index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field02"])
        d.addIndex("apple", ["field00"])

        op = self.ops[0]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field02")
        self.assertTrue(covering)
    ## DEF

    def testGuessIndex_indexChooseWithoutProjectionField(self):
        """
            If a query uses all the indexes but doesn't have a projection field,
            we still think it is not a covering index
        """
        # If we have a design with indexes(field00, field01)
        # 1. query uses (field00, field01) but there is no projection field
        # result: we should choose (field00, field01) but the index is not a covering index

        # initialize design
        d = Design()
        d.addCollection("apple")
        d.addIndex("apple", ["field00", "field01"])

        op = self.ops[3]

        # Guess index
        best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)

        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field01")
        self.assertFalse(covering)
    ## DEF

    def testGuessIndex_IndexSizeEstimation(self):
        """
            Check if the size of the indexes vary
        """
        d = Design()
        d.addCollection("apple")

        d.addIndex("apple", ["field00"])
        d.addIndex("apple", ["field01"])
        d.addIndex("apple", ["field00", "field01"])

        # Each operation is paired with the index size expected for it
        expectations = [
            # op0 use index (field00)
            (self.ops[0], 24+8),
            # op1 use index (field01)
            (self.ops[1], 24+8),
            # op2 use index (field01, field00)
            (self.ops[2], 24+24+8),
            # op3 use index (field00, field01)
            (self.ops[3], 24+24+8),
        ]

        # Guess index
        for op, expected_size in expectations:
            best_index, covering, index_size, slot_size = self.cm.guess_op_info(d, op)
            self.assertEqual(expected_size, index_size)
    ## DEF

    def testGuessIndex_IndexSizeEstimation_Denormalization(self):
        """
            If collection A is denormalized into B, then the index for collection B should have larger size now
            (If and only if the index is built on a field that is included by both collection A and collection B)
        """
        d = Design()
        d.addCollection("apple")
        d.addCollection("microsoft")
        d.addCollection("google")

        d.addIndex("apple", ["field00"])
        d.addIndex("microsoft", ["field00"])
        d.addIndex("google", ["field00"])

        # op4 use index (field00) but it only goes to collection microsoft
        op4 = self.ops[4]

        # Guess index

        # Without denormalization
        best_index, covering, index_size_0, slot_size = self.cm.guess_op_info(d, op4)

        # With one denormalization
        d.setDenormalizationParent("apple", "microsoft")
        self.cm.buildEmbeddingCostDictionary(d)
        best_index, covering, index_size_1, slot_size = self.cm.guess_op_info(d, op4)

        self.assertGreater(index_size_1, index_size_0)

        # With chained denormalization
        self.cm.reset()
        d.setDenormalizationParent("google", "apple")
        self.cm.buildEmbeddingCostDictionary(d)
        best_index, covering, index_size_2, slot_size = self.cm.guess_op_info(d, op4)

        self.assertGreater(index_size_2, index_size_1)
Exemplo n.º 12
0
class TestDiskCostGuessIndex(CostModelTestCase):
    """
        Tests for DiskCostComponent.guess_op_info(), whose result is unpacked
        throughout as (best_index, covering, index_size, slot_size).
    """

    def setUp(self):
        """Create the disk cost component and flatten all session operations into self.ops"""
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cm.no_index_size_estimation = False
        # extend() is used rather than map(self.ops.append, ...): map() is lazy on
        # Python 3, so relying on it for side effects would leave self.ops empty
        self.ops = []
        for sess in self.workload:
            self.ops.extend(sess["operations"])

    ## DEF

    def _appleDesign(self, *indexes):
        """
            Return a new Design containing only collection "apple" with one index
            per positional argument, added in the order given
        """
        d = Design()
        d.addCollection("apple")
        for index_keys in indexes:
            d.addIndex("apple", index_keys)
        ## FOR
        return d

    ## DEF

    def testGuessIndex_consistentAnswer(self):
        """Check that guessIndex always returns the same answer for the same input"""

        # initialize design
        d = self._appleDesign(
            ["field00", "field01"],
            ["field01", "field00"],
            ["field00"],
            ["field01"],
        )

        # Every operation except the last two is checked
        for op in self.ops[:-2]:
            last_index, last_covering = (None, None)
            for _ in range(100):
                best_index, covering, _, _ = self.cm.guess_op_info(d, op)
                self.assertIsNotNone(best_index)
                self.assertIsNotNone(covering)
                if last_index is not None:
                    self.assertEqual(last_index, best_index)
                    self.assertEqual(last_covering, covering)
                last_index, last_covering = (best_index, covering)
            ## FOR
        ## FOR

    ## DEF

    def testGuessIndex_indexInIncorrectOrder(self):
        """
            Design with index (field01, field00)
            1. query uses index (field00)
            result: not using index because that query uses indexes in order
            2. query uses index (field01)
            result: using index (field01, field00) because this is the best match
            3. query uses index (field01, field00)
            result: using index (field01, field00) because they match the best

            Design with index (field00, field01)
            4. query uses index (field01, field00)
            result: using no index because the index order is not correct

            Design with index (field01, field02, field00)
            5. query uses index (field01, field00)
            result: using index (field01, field02, field00) because they match the best
            result: not cover index because the index order in design is not correct
        """
        # initialize design
        d = self._appleDesign(["field01", "field00"])

        # query 1: get query, queries on field00
        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[0])

        self.assertIsNone(best_index)
        self.assertFalse(covering)

        # query 2: get query, queries on field01
        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[1])

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 3: get query, queries on field01 and field00
        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[2])

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field00")
        self.assertFalse(covering)

        # query 4:
        # NOTE(review): the assertions below expect a two-field index to be chosen,
        # which disagrees with the "using no index" statement in the docstring --
        # confirm which of the two is the intended behavior
        d = self._appleDesign(["field00", "field01"])

        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[2])

        self.assertEqual(len(best_index), 2)
        self.assertFalse(covering)

        # query 5:
        d = self._appleDesign(["field01", "field02", "field00"])

        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[2])

        self.assertEqual(len(best_index), 3)
        self.assertEqual(best_index[0], "field01")
        self.assertEqual(best_index[1], "field02")
        self.assertEqual(best_index[2], "field00")
        self.assertFalse(covering)

    ## DEF

    def testGuessIndex_indexChooseTheMostMatch(self):
        """
            Design with index (field01, field00), (field01),
            1. query uses index (field01) without projection field
            result: using index (field01) because they match the most
            2. query used index (field01, field00) without projection field
            result: using index (field01, field00) because they match the most

            If we have a design building indexes on (field01) only
            3. query uses index (field01, field00) without projection field
            result: using index (field01) because they match the most

            If we have a design building indexes on (field01, field03, field00), (field01)
            4. query uses index (field01, field00)
            result: using index (field01) because field01 is shorter
        """
        # initialize design
        d = self._appleDesign(["field01", "field00"], ["field01"])

        # query 1: get query
        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[1])

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 2: get query
        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[2])

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], 'field01')
        self.assertEqual(best_index[1], 'field00')
        self.assertFalse(covering)

        # query 3:
        d = self._appleDesign(["field01"])

        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[2])

        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

        # query 4:
        d = self._appleDesign(["field01", "field03", "field00"], ["field01"])

        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[2])

        self.assertEqual(len(best_index), 1)
        self.assertEqual(best_index[0], 'field01')
        self.assertFalse(covering)

    ## DEF

    def testGuessIndex_indexChooseWithProjectionField(self):
        """
            If a query uses one of the indexes the design has but its projection uses
            one of the indexes the design has, we should choose the index with both
            query index and projection index
        """
        # If we have a design with index (field00), (field00, field02)
        # 1. query uses field00 but its projection field is {field02: xx}
        # result: we should choose (field00, field02) as the best index

        # initialize design
        d = self._appleDesign(["field00", "field02"], ["field00"])

        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[0])

        self.assertEqual(len(best_index), 2)
        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field02")
        self.assertTrue(covering)

    ## DEF

    def testGuessIndex_indexChooseWithoutProjectionField(self):
        """
            If a query uses all the indexes but doesn't have a projection field,
            we still think it is not a covering index
        """
        # If we have a design with indexes(field00, field01)
        # 1. query uses (field00, field01) but there is no projection field
        # result: we should choose (field00, field01) but the index is not a covering index

        # initialize design
        d = self._appleDesign(["field00", "field01"])

        best_index, covering, _, _ = self.cm.guess_op_info(d, self.ops[3])

        self.assertEqual(best_index[0], "field00")
        self.assertEqual(best_index[1], "field01")
        self.assertFalse(covering)

    ## DEF

    def testGuessIndex_IndexSizeEstimation(self):
        """
            Check if the size of the indexes vary
        """
        d = self._appleDesign(["field00"], ["field01"], ["field00", "field01"])

        # op0 uses index (field00) and op1 uses index (field01):
        # both are expected to report the single-field index size
        for op in (self.ops[0], self.ops[1]):
            _, _, index_size, _ = self.cm.guess_op_info(d, op)
            self.assertEqual(24 + 8, index_size)
        ## FOR

        # op2 uses index (field01, field00) and op3 uses index (field00, field01):
        # both are expected to report the two-field index size
        for op in (self.ops[2], self.ops[3]):
            _, _, index_size, _ = self.cm.guess_op_info(d, op)
            self.assertEqual(24 + 24 + 8, index_size)
        ## FOR

    ## DEF

    def testGuessIndex_IndexSizeEstimation_Denormalization(self):
        """
            If collection A is denormalized into B, then the index for collection B should have larger size now
            (If and only if the index is built on a field that is included by both collection A and collection B)
        """
        collection_names = ("apple", "microsoft", "google")

        d = Design()
        for col_name in collection_names:
            d.addCollection(col_name)
        ## FOR
        for col_name in collection_names:
            d.addIndex(col_name, ["field00"])
        ## FOR

        # op4 use index (field00) but it only goes to collection microsoft
        op4 = self.ops[4]

        # Without denormalization
        _, _, index_size_0, _ = self.cm.guess_op_info(d, op4)

        # With one denormalization
        d.setDenormalizationParent("apple", "microsoft")
        self.cm.buildEmbeddingCostDictionary(d)
        _, _, index_size_1, _ = self.cm.guess_op_info(d, op4)

        self.assertGreater(index_size_1, index_size_0)

        # With chained denormalization
        self.cm.reset()
        d.setDenormalizationParent("google", "apple")
        self.cm.buildEmbeddingCostDictionary(d)
        _, _, index_size_2, _ = self.cm.guess_op_info(d, op4)

        self.assertGreater(index_size_2, index_size_1)

    ## DEF
Exemplo n.º 13
0
 def setUp(self):
     """Run the base fixture setup, then create the disk and network cost components."""
     CostModelTestCase.setUp(self)
     self.cm = DiskCostComponent(self.state)
     self.cmn = NetworkCostComponent(self.state)
     # Snapshot of the collection keys as a plain list
     self.col_names = list(self.collections.iterkeys())
Exemplo n.º 14
0
class TestWorkloadCombiner(CostModelTestCase):
    """
        Tests for WorkloadCombiner.process() and for how the disk and network
        cost components react to the combined workload it returns.
    """

    def setUp(self):
        """Create the disk/network cost components and record the collection names"""
        CostModelTestCase.setUp(self)
        self.cm = DiskCostComponent(self.state)
        self.cmn = NetworkCostComponent(self.state)
        self.col_names = list(self.collections.keys())

    ## DEF

    def _buildDesign(self, index_keys=None, invalidate_cache=False):
        """
            Return a new Design containing every collection in
            CostModelTestCase.COLLECTION_NAMES.

            index_keys: when not None, an index on these keys is added to each
                collection (a fresh list is passed for every collection)
            invalidate_cache: when True, self.state.invalidateCache() is called
                for each collection right after it has been added
        """
        d = Design()
        for col_name in CostModelTestCase.COLLECTION_NAMES:
            col_info = self.collections[col_name]
            d.addCollection(col_info['name'])
            if index_keys is not None:
                d.addIndex(col_info['name'], list(index_keys))
            if invalidate_cache:
                self.state.invalidateCache(col_info['name'])
        ## FOR
        return d

    ## DEF

    @staticmethod
    def _countOperations(workload):
        """Return the total number of operations over all sessions in the workload"""
        return sum(1 for sess in workload for _ in sess["operations"])

    ## DEF

    def testQueriesCombination(self):
        """Test if the total number of queries are reduced"""
        original_number_of_queries = self._countOperations(self.workload)

        print("orignal number of queries: " + str(original_number_of_queries))

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d = self._buildDesign()
        d.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d)

        number_of_queries_from_combined_workload = self._countOperations(combinedWorkload)

        print("number of queries after query combination: " + str(
            number_of_queries_from_combined_workload))

        self.assertGreater(original_number_of_queries,
                           number_of_queries_from_combined_workload)

    ## DEF

    def testDiskCostChangesAfterQueryCombination(self):
        """
            Assume we have collection A, B, C and we want to embed C to A
            If we build index on field00 of A and field02 of C
            The cost after query combination should be lower
        """
        d0 = self._buildDesign(index_keys=['field00', 'field02'])

        cost0 = self.cm.getCost(d0)
        print("cost0 " + str(cost0))

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = self._buildDesign(index_keys=['field00', 'field02'], invalidate_cache=True)
        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cm.reset()
        self.cm.state.reset()
        cost1 = self.cm.getCost(d1)

        print("cost1 " + str(cost1))

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cm.reset()
        # Two spaces after the label keep the output identical to the former
        # comma-separated print statement
        print("child collection  " + str(self.cm.child_collections))
        self.cm.state.reset()
        cost2 = self.cm.getCost(d0)

        print("cost2 " + str(cost2))

        self.assertEqual(cost2, cost0)

    ## DEF

    def testNetworkCostShouldReduceAfterQueryCombination(self):
        """
            Network cost should be reduced after embedding collections
        """
        d0 = self._buildDesign(index_keys=['field00', 'field02'])
        cost0 = self.cmn.getCost(d0)
        print("cost0 " + str(cost0))

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        # initialize a design with denormalization
        d1 = self._buildDesign(index_keys=['field00', 'field02'], invalidate_cache=True)
        d1.setDenormalizationParent("koalas", "apples")

        combinedWorkload = combiner.process(d1)
        self.state.updateWorkload(combinedWorkload)

        self.cmn.reset()
        self.cmn.state.reset()
        cost1 = self.cmn.getCost(d1)

        print("cost1 " + str(cost1))

        self.assertGreater(cost0, cost1)

        # Cost should remain the same after restoring the original workload
        self.state.restoreOriginalWorkload()
        self.cmn.reset()
        self.cmn.state.reset()
        cost2 = self.cmn.getCost(d0)

        print("cost2 " + str(cost2))

        self.assertEqual(cost2, cost0)

    ## DEF

    def testNotCollectionEmbeddingProcessShouldReturnNone(self):
        """
            If the given design has no collection embedding, we should return right away
        """
        d0 = self._buildDesign(index_keys=['field00', 'field02'])

        # Initialize a combiner
        combiner = WorkloadCombiner(self.col_names, self.workload)

        combinedWorkload = combiner.process(d0)
        self.assertIsNone(combinedWorkload)

    ## DEF
 def setUp(self):
     """Run the base fixture setup, then attach a disk cost component built on self.state."""
     CostModelTestCase.setUp(self)
     disk_component = DiskCostComponent(self.state)
     self.cm = disk_component
class TestDiskCost_IndexInsertionPenalty(CostModelTestCase):

    def setUp(self):
        """Run the base fixture setup, then attach a disk cost component built on self.state"""
        CostModelTestCase.setUp(self)
        disk_component = DiskCostComponent(self.state)
        self.cm = disk_component
    # DEF

    def testDiskCost_IndexInsertionPenalty(self):
        """
            IndexInsertionPenalty should be high if we build bad indexes

            Every collection in COLLECTION_NAMES gets the same index, the cost is
            recomputed, and total_index_insertion_penalty is read back. Indexes on
            (field00), (field01) and (field00, field01) must all give the same
            penalty; every index that includes field02 must give a strictly
            greater penalty than the (field00) baseline.
        """
        def measure_penalty(index_keys):
            # Index every collection on index_keys, reset the component and its
            # state, recompute the cost and report the resulting penalty
            design = Design()
            for name in CostModelTestCase.COLLECTION_NAMES:
                design.addCollection(name)
                design.addIndex(name, list(index_keys))
            ## FOR

            self.cm.reset()
            self.cm.state.reset()
            self.cm.getCost(design)
            return self.cm.total_index_insertion_penalty

        # 1: baseline
        baseline = measure_penalty(["field00"])

        # 2, 3: these indexes must match the baseline penalty exactly
        for same_cost_keys in (["field01"], ["field00", "field01"]):
            self.assertEqual(baseline, measure_penalty(same_cost_keys))
        ## FOR

        # 4, 5, 6: these indexes must be penalized more than the baseline
        costlier_indexes = (
            ["field00", "field02"],
            ["field01", "field02"],
            ["field00", "field01", "field02"],
        )
        for costlier_keys in costlier_indexes:
            self.assertGreater(measure_penalty(costlier_keys), baseline)
        ## FOR
    ## DEF
    
    def testDiskCost_IndexInsertionPenalty_integrated_to_cost_component(self):
        """