def test_potential_indexes(self):
    """Compare `potential_indexes()` of three workloads with expected index sets."""
    first_column_only = {Index([column_A_0])}
    all_three_columns = {
        Index([column_A_0]),
        Index([column_A_1]),
        Index([column_A_2]),
    }

    # (queries placed in the workload, indexes the workload should propose)
    scenarios = (
        ([query_0], first_column_only),
        ([query_1], all_three_columns),
        ([query_0, query_1], all_three_columns),
    )
    for queries, expected_indexes in scenarios:
        workload = Workload(queries, database_name="test_DB")
        self.assertEqual(set(workload.potential_indexes()), expected_indexes)
def setUpClass(cls):
    """Create the shared test database and two workloads of generated queries."""
    scale_factor = 0.001
    cls.db_name = "tpch_test_db_index_selection"
    cls.index_selection = IndexSelection()

    # This connection is closed again as soon as the TableGenerator is built.
    generating_connector = PostgresDatabaseConnector(None, autocommit=True)
    table_generator = TableGenerator(
        "tpch",
        scale_factor,
        generating_connector,
        explicit_database_name=cls.db_name,
    )
    generating_connector.close()

    cls.index_selection.setup_db_connector(cls.db_name, "postgres")

    def filtered_workload(query_ids):
        # Filter workload: presumably only the listed query ids are generated
        # (TODO confirm against QueryGenerator).
        generator = QueryGenerator(
            "tpch",
            scale_factor,
            cls.index_selection.db_connector,
            query_ids,
            table_generator.columns,
        )
        return Workload(generator.queries)

    cls.small_tpch = filtered_workload([3, 14])
    cls.tpch_5_and_6 = filtered_workload([5, 6])
def test_workload_indexable_columns(self):
    """The indexable columns of a workload are the columns used by its queries."""
    table = Table("TableA")
    col_a = Column(name="ColA")
    col_b = Column(name="ColB")
    col_c = Column(name="ColC")
    for column in (col_a, col_b, col_c):
        table.add_column(column)

    # Both queries filter on ColA; each adds one further column.
    filter_a_b = Query(
        17,
        "SELECT * FROM TableA WHERE ColA = 4 AND ColB = 5;",
        columns=[col_a, col_b],
    )
    filter_a_c = Query(
        18,
        "SELECT * FROM TableA WHERE ColA = 3 AND ColC = 2;",
        columns=[col_a, col_c],
    )

    workload = Workload([filter_a_b, filter_a_c], "test_DB")

    # Compared in sorted order, so the order of the returned columns is irrelevant.
    expected_columns = sorted([col_a, col_b, col_c])
    self.assertEqual(sorted(workload.indexable_columns()), expected_columns)
def setUp(self):
    """Prepare a DropHeuristicAlgorithm on a mock connector with a two-query workload."""
    self.connector = MockConnector()
    self.algo = DropHeuristicAlgorithm(database_connector=self.connector)

    self.all_columns = [Column("Col0"), Column("Col1"), Column("Col2")]
    self.column_0, self.column_1, self.column_2 = self.all_columns
    self.table = Table("TableA")
    self.table.add_columns(self.all_columns)

    # One single-column index per column.
    self.index_0, self.index_1, self.index_2 = (
        Index([column]) for column in self.all_columns
    )

    first_column_query = Query(
        0, "SELECT * FROM TableA WHERE Col0 = 4;", [self.column_0]
    )
    all_columns_query = Query(
        1,
        "SELECT * FROM TableA WHERE Col0 = 1 AND Col1 = 2 AND Col2 = 3;",
        self.all_columns,
    )

    self.database_name = "test_DB"
    self.workload = Workload([first_column_query, all_columns_query])
    self.algo.workload = self.workload

    # Cost requests are answered by this test class's own `_calculate_cost_mock`.
    cost_mock = MagicMock(side_effect=self._calculate_cost_mock)
    self.algo.cost_evaluation.calculate_cost = cost_mock
def test_calculate_indexes_1MB_2column(self, get_utilized_indexes_mock):
    """With `budget_MB` 1 and index width 2, only the index on column_A_0 remains.

    `get_utilized_indexes_mock` is presumably injected by a patch decorator
    that is not part of this block (TODO confirm).
    """
    parameters = {"max_index_width": 2, "budget_MB": 1}
    algorithm = RelaxationAlgorithm(
        database_connector=self.connector, parameters=parameters
    )

    cost_evaluation = algorithm.cost_evaluation
    cost_evaluation.cache = mock_cache
    cost_evaluation._prepare_cost_calculation = self.set_estimated_index_sizes
    cost_evaluation.estimate_size = self.set_estimated_index_size

    # The mocked lookup reports a one-column and a two-column index.
    utilized_indexes = {
        Index([column_A_0], 1000 * 1000),
        Index([column_A_0, column_A_1], 2000 * 1000),
    }
    get_utilized_indexes_mock.return_value = (utilized_indexes, None)

    workload = Workload([query_0, query_1])
    selected_indexes = algorithm.calculate_best_indexes(workload)

    # The single column index is dropped first, because of the lower penalty.
    # The multi column index is prefixed second.
    self.assertEqual(set(selected_indexes), {Index([column_A_0])})
def setUpClass(cls):
    """Define a five-column table and a three-query workload shared by the tests."""
    cls.db_name = "TestDB"
    cls.table = Table("TestTableA")

    cls.columns = [
        Column(name) for name in ("Col0", "Col1", "Col2", "Col3", "Col4")
    ]
    cls.table.add_columns(cls.columns)

    col_0, col_1 = cls.columns[0], cls.columns[1]
    # Only Col0 and Col1 are referenced by the queries.
    on_col_0 = Query(0, "SELECT * FROM TestTableA WHERE Col0 = 4", [col_0])
    on_col_1 = Query(1, "SELECT * FROM TestTableA WHERE Col1 = 3", [col_1])
    on_both = Query(
        2,
        "SELECT * FROM TestTableA WHERE Col0 = 14 AND Col1 = 13",
        [col_0, col_1],
    )
    cls.queries = [on_col_0, on_col_1, on_both]

    cls.workload = Workload(cls.queries, cls.db_name)
def test_calculate_indexes_1MB_2column(self):
    """With `budget` 1 and at most two index columns, only Index([column_A_0]) remains."""

    def exploit_virtual_indexes_stub(workload):
        # Fresh Index objects on every call, exactly like the stubbed original.
        candidates = {
            Index([column_A_0], 1000 * 1000),
            Index([column_A_0, column_A_1], 2000 * 1000),
        }
        return None, candidates

    parameters = {"max_index_columns": 2, "budget": 1}
    algorithm = RelaxationAlgorithm(
        database_connector=self.connector, parameters=parameters
    )

    cost_evaluation = algorithm.cost_evaluation
    cost_evaluation.cache = mock_cache
    cost_evaluation._prepare_cost_calculation = self.set_estimated_index_sizes
    cost_evaluation.estimate_size = self.set_estimated_index_size
    algorithm._exploit_virtual_indexes = exploit_virtual_indexes_stub

    workload = Workload([query_0, query_1], self.database_name)
    selected_indexes = algorithm.calculate_best_indexes(workload)

    # The single column index is dropped first, because of the lower penalty.
    # The multi column index is prefixed second.
    self.assertEqual(set(selected_indexes), {Index([column_A_0])})
def test_calculate_indexes_3000MB_2column(self):
    """With `budget` 3 both stubbed candidate indexes are expected to be kept."""

    def exploit_virtual_indexes_stub(workload):
        # Fresh Index objects on every call, exactly like the stubbed original.
        candidates = {
            Index([column_A_0], 1000 * 1000),
            Index([column_A_0, column_A_1], 2000 * 1000),
        }
        return None, candidates

    parameters = {"max_index_columns": 2, "budget": 3}
    algorithm = RelaxationAlgorithm(
        database_connector=self.connector, parameters=parameters
    )

    cost_evaluation = algorithm.cost_evaluation
    cost_evaluation.cache = mock_cache
    cost_evaluation._prepare_cost_calculation = self.set_estimated_index_sizes
    cost_evaluation.estimate_size = self.set_estimated_index_size
    algorithm._exploit_virtual_indexes = exploit_virtual_indexes_stub

    workload = Workload([query_0, query_1], self.database_name)
    selected_indexes = algorithm.calculate_best_indexes(workload)

    expected_indexes = {Index([column_A_0]), Index([column_A_0, column_A_1])}
    self.assertEqual(set(selected_indexes), expected_indexes)
def setUp(self):
    """Prepare an EPICAlgorithm on a mock connector with sized single-column indexes."""
    self.connector = MockConnector()
    self.algo = EPICAlgorithm(database_connector=self.connector)

    self.all_columns = [Column("ColA"), Column("ColB"), Column("ColC")]
    self.column_1, self.column_2, self.column_3 = self.all_columns
    self.table = Table("TableA")
    self.table.add_columns(self.all_columns)

    # One index per column, each with a fixed estimated size (5, 1 and 3).
    sized_indexes = []
    for column, estimated_size in zip(self.all_columns, (5, 1, 3)):
        index = Index([column])
        index.estimated_size = estimated_size
        sized_indexes.append(index)
    self.index_1, self.index_2, self.index_3 = sized_indexes

    first_column_query = Query(
        0, "SELECT * FROM TableA WHERE ColA = 4;", [self.column_1]
    )
    all_columns_query = Query(
        1,
        "SELECT * FROM TableA WHERE ColA = 1 AND ColB = 2 AND ColC = 3;",
        self.all_columns,
    )

    self.database_name = "test_DB"
    self.workload = Workload(
        [first_column_query, all_columns_query], self.database_name
    )
    self.algo.workload = self.workload
def test_calculate_indexes_3000MB_2column(self, get_utilized_indexes_mock):
    """With `budget_MB` 3 both mocked utilized indexes are expected to be kept.

    `get_utilized_indexes_mock` is presumably injected by a patch decorator
    that is not part of this block (TODO confirm).
    """
    parameters = {"max_index_width": 2, "budget_MB": 3}
    algorithm = RelaxationAlgorithm(
        database_connector=self.connector, parameters=parameters
    )

    cost_evaluation = algorithm.cost_evaluation
    cost_evaluation.cache = mock_cache
    cost_evaluation._prepare_cost_calculation = self.set_estimated_index_sizes
    cost_evaluation.estimate_size = self.set_estimated_index_size

    # The mocked lookup reports a one-column and a two-column index.
    utilized_indexes = {
        Index([column_A_0], 1000 * 1000),
        Index([column_A_0, column_A_1], 2000 * 1000),
    }
    get_utilized_indexes_mock.return_value = (utilized_indexes, None)

    workload = Workload([query_0, query_1])
    selected_indexes = algorithm.calculate_best_indexes(workload)

    expected_indexes = {Index([column_A_0]), Index([column_A_0, column_A_1])}
    self.assertEqual(set(selected_indexes), expected_indexes)
def test_get_utilized_indexes(self):
    """Check `get_utilized_indexes` with and without detailed query information.

    A mocked cost evaluation returns canned values per query number. Without
    `detailed_query_information` the per-query details must be empty; with it,
    each query maps to its costs and utilized indexes.
    """

    class CostEvaluationMock:
        """Cost evaluation stand-in that answers by `query.nr` (0 or 1)."""

        def which_indexes_utilized_and_cost(_, query, indexes):
            # Returns [utilized indexes, cost with indexes] for the known queries.
            if query.nr == 0:
                return [{self.index_0}, 17]
            if query.nr == 1:
                return [{self.index_0, self.index_2}, 14]

        def calculate_cost(_, workload, indexes):
            # Only the "cost without indexes" of a single query may be requested.
            assert len(workload.queries) == 1, (
                "get_utilized_indexes' calculate_cost_mock should not be "
                "called with workloads that contain more than one query"
            )
            assert indexes == [], (
                "get_utilized_indexes' calculate_cost_mock should not be "
                "called with indexes"
            )

            query = workload.queries[0]
            if query.nr == 0:
                return 170
            if query.nr == 1:
                return 140

    query_0 = Query(0, "SELECT * FROM tablea WHERE col0 = 4;", [self.column_a_0])
    query_1 = Query(
        1,
        (
            # A space was missing between "17" and "AND", which is malformed SQL.
            "SELECT * FROM tablea as a, tableb as b WHERE a.col0 = 4 AND "
            "a.col1 = 17 AND b.col0 = 3;"
        ),
        [self.column_a_0, self.column_a_1, self.column_b_0],
    )
    workload = Workload([query_0, query_1])
    candidates = candidates_per_query(workload, 2, syntactically_relevant_indexes)

    # Default call: no per-query details are collected.
    utilized_indexes, query_details = get_utilized_indexes(
        workload, candidates, CostEvaluationMock()
    )
    self.assertEqual(query_details, {})
    self.assertEqual(utilized_indexes, {self.index_0, self.index_2})

    expected_first_result = {
        "cost_without_indexes": 170,
        "cost_with_indexes": 17,
        "utilized_indexes": {self.index_0},
    }
    expected_second_result = {
        "cost_without_indexes": 140,
        "cost_with_indexes": 14,
        "utilized_indexes": {self.index_0, self.index_2},
    }

    # Detailed call: every query is mapped to its costs and utilized indexes.
    utilized_indexes, query_details = get_utilized_indexes(
        workload, candidates, CostEvaluationMock(), detailed_query_information=True
    )
    self.assertEqual(query_details[query_0], expected_first_result)
    self.assertEqual(query_details[query_1], expected_second_result)
    self.assertEqual(utilized_indexes, {self.index_0, self.index_2})
def test_workload(self):
    """A Workload exposes the queries and database name it was built with."""
    queries = [
        Query(17, "SELECT * FROM TableA;"),
        Query(18, "SELECT * FROM nation;"),
    ]
    db_name = "test_DB"

    # A copy is handed in so the comparison is against an independent list.
    workload = Workload(list(queries), db_name)

    self.assertEqual(workload.queries, queries)
    self.assertEqual(workload.database_name, db_name)
def test_calculate_best_only_executable_once(self):
    """`calculate_best_indexes` may run once; a second call raises AssertionError.

    The database connector opened for this test is closed again when the
    test finishes, whether it passes or fails.
    """
    workload = Workload([])
    connector = PostgresDatabaseConnector(None, autocommit=True)
    try:
        selection_algorithm = NoIndexAlgorithm(connector)
        self.assertFalse(selection_algorithm.did_run)

        selection_algorithm.calculate_best_indexes(workload)
        self.assertTrue(selection_algorithm.did_run)

        with self.assertRaises(AssertionError):
            selection_algorithm.calculate_best_indexes(workload)
    finally:
        # Previously the connection was left open after the test.
        connector.close()
def test_calculate_indexes_2indexes_2columns(self):
    """With two indexes of width two allowed, one two-column index is expected."""

    def skip_cost_preparation(indexes, store_size=False):
        # Cost preparation is disabled; costs are taken from the mocked cache.
        return None

    parameters = {"max_indexes": 2, "max_index_width": 2}
    algorithm = AutoAdminAlgorithm(
        database_connector=self.connector, parameters=parameters
    )
    algorithm.cost_evaluation.cache = mock_cache
    algorithm.cost_evaluation._prepare_cost_calculation = skip_cost_preparation

    workload = Workload([query_0, query_1])
    selected_indexes = algorithm.calculate_best_indexes(workload)

    self.assertEqual(set(selected_indexes), {Index([column_A_0, column_A_1])})
def test_db2advis_algorithm(self):
    """The db2advis result must not contain more indexes than there are candidates.

    The candidates are the syntactically relevant indexes (width <= 3) of the
    single query in the workload.
    """
    parameters = {}
    db2advis_algorithm = self.index_selection.create_algorithm_object(
        "db2advis", parameters
    )

    workload = Workload([self.small_tpch.queries[0]])
    # candidates_per_query returns one entry per query; the workload has only one.
    possible = candidates_per_query(
        workload,
        max_index_width=3,
        candidate_generator=syntactically_relevant_indexes,
    )[0]
    indexes = db2advis_algorithm.calculate_best_indexes(workload)

    # assertGreaterEqual reports both lengths on failure, unlike assertTrue.
    self.assertGreaterEqual(len(possible), len(indexes))
def test_exploit_virtual_indexes(self):
    """Check per-query results and index candidates of `_exploit_virtual_indexes`.

    Index simulation and query plans are mocked; only the plan for the query
    on Table0 names an index.
    """

    def fake_simulate_index(index, store_size):
        index.hypopg_name = f"<1337>btree_{index.columns}"

    # For some reason, the database decides to only use an index for one of
    # the filters
    def fake_get_plan(query):
        if "Table0" not in query.text:
            return {"Total Cost": 5, "Plans": [{"Simple Table Retrieve": "table1"}]}
        return {
            "Total Cost": 17,
            "Plans": [{"Index Name": "<1337>btree_(C table0.col1,)"}],
        }

    two_filter_query = Query(
        0,
        "SELECT * FROM Table0 WHERE Col0 = 1 AND Col1 = 2;",
        [self.column_0, self.column_1],
    )
    no_filter_query = Query(1, "SELECT * FROM Table1;", [])
    workload = Workload([two_filter_query, no_filter_query], "database_name")

    self.algo.database_connector.get_plan = MagicMock(side_effect=fake_get_plan)
    self.algo.what_if.simulate_index = MagicMock(side_effect=fake_simulate_index)
    self.algo.what_if.drop_all_simulated_indexes = MagicMock()

    query_results, index_candidates = self.algo._exploit_virtual_indexes(workload)
    self.assertEqual(len(query_results), len(workload.queries))

    expected_results = {
        two_filter_query: {
            "cost_without_indexes": 17,
            "cost_with_recommended_indexes": 17,
            "recommended_indexes": {Index([self.column_1])},
        },
        no_filter_query: {
            "cost_without_indexes": 5,
            "cost_with_recommended_indexes": 5,
            "recommended_indexes": set(),
        },
    }
    for query in (two_filter_query, no_filter_query):
        self.assertEqual(query_results[query], expected_results[query])

    self.assertEqual(index_candidates, {Index([self.column_1])})
def test_cache_hit(self):
    """Repeating an identical cost request is answered from the cache."""

    def assert_counters(requests, hits, connector_calls):
        self.assertEqual(self.cost_evaluation.cost_requests, requests)
        self.assertEqual(self.cost_evaluation.cache_hits, hits)
        self.assertEqual(self.connector.get_cost.call_count, connector_calls)

    # Nothing has been requested yet.
    self.assertEqual(self.cost_evaluation.cost_requests, 0)
    self.assertEqual(self.cost_evaluation.cache_hits, 0)

    workload = Workload([self.queries[0]])

    # First request: goes to the connector.
    self.cost_evaluation.calculate_cost(workload, indexes=set())
    assert_counters(requests=1, hits=0, connector_calls=1)

    # Same request again: counted as a hit, the connector is not called again.
    self.cost_evaluation.calculate_cost(workload, indexes=set())
    assert_counters(requests=2, hits=1, connector_calls=1)
def test_cache_hit_different_index_same_columns(self):
    """A second, distinct Index object on the same column still hits the cache."""

    def assert_counters(requests, hits, connector_calls):
        self.assertEqual(self.cost_evaluation.cost_requests, requests)
        self.assertEqual(self.cost_evaluation.cache_hits, hits)
        self.assertEqual(self.connector.get_cost.call_count, connector_calls)

    # Nothing has been requested yet.
    self.assertEqual(self.cost_evaluation.cost_requests, 0)
    self.assertEqual(self.cost_evaluation.cache_hits, 0)

    workload = Workload([self.queries[0]])

    first_index_set = {Index([self.columns[0]])}
    self.cost_evaluation.calculate_cost(workload, first_index_set)
    assert_counters(requests=1, hits=0, connector_calls=1)

    # Deliberately a new Index instance rather than a reuse of the first one.
    second_index_set = {Index([self.columns[0]])}
    self.cost_evaluation.calculate_cost(workload, second_index_set)
    assert_counters(requests=2, hits=1, connector_calls=1)
def test_no_cache_hit_unseen(self):
    """A request with an index on the query's column is not served from the cache."""

    def assert_counters(requests, hits, connector_calls):
        self.assertEqual(self.cost_evaluation.cost_requests, requests)
        self.assertEqual(self.cost_evaluation.cache_hits, hits)
        self.assertEqual(self.connector.get_cost.call_count, connector_calls)

    # Nothing has been requested yet.
    self.assertEqual(self.cost_evaluation.cost_requests, 0)
    self.assertEqual(self.cost_evaluation.cache_hits, 0)

    workload = Workload([self.queries[0]], self.db_name)
    index_on_query_column = Index([self.columns[0]])

    # Baseline request without any index.
    self.cost_evaluation.calculate_cost(workload, indexes=set())
    assert_counters(requests=1, hits=0, connector_calls=1)

    # Adding the index leads to a second connector call instead of a hit.
    self.cost_evaluation.calculate_cost(workload, {index_on_query_column})
    assert_counters(requests=2, hits=0, connector_calls=2)

    self.connector.simulate_index.assert_called_with(index_on_query_column)
def test_cache_hit_non_relevant_index(self):
    """An index on a column the query does not use still yields a cache hit."""

    def assert_counters(requests, hits, connector_calls):
        self.assertEqual(self.cost_evaluation.cost_requests, requests)
        self.assertEqual(self.cost_evaluation.cache_hits, hits)
        self.assertEqual(self.connector.get_cost.call_count, connector_calls)

    # Nothing has been requested yet.
    self.assertEqual(self.cost_evaluation.cost_requests, 0)
    self.assertEqual(self.cost_evaluation.cache_hits, 0)

    workload = Workload([self.queries[0]])
    # The workload's query is queries[0]; this index is on columns[1].
    unrelated_index = Index([self.columns[1]])

    # Baseline request without any index.
    self.cost_evaluation.calculate_cost(workload, indexes=set())
    assert_counters(requests=1, hits=0, connector_calls=1)

    # The extra index does not trigger another connector cost call.
    self.cost_evaluation.calculate_cost(workload, {unrelated_index})
    assert_counters(requests=2, hits=1, connector_calls=1)

    self.connector.simulate_index.assert_called_with(unrelated_index)
def setUpClass(cls):
    """Create the test database, connect to it and build a workload of queries 5 and 6."""
    cls.db_name = "tpch_test_db_database"
    cls.scale_factor = 0.001

    setup_connector = PostgresDatabaseConnector(None, autocommit=True)
    tables = TableGenerator(
        "tpch",
        cls.scale_factor,
        setup_connector,
        explicit_database_name=cls.db_name,
    )

    # Connection to the named test database, kept on the class for the tests.
    cls.db = PostgresDatabaseConnector(cls.db_name)
    generated_queries = QueryGenerator(
        "tpch", cls.scale_factor, cls.db, [5, 6], tables.columns
    ).queries
    cls.workload = Workload(generated_queries)

    # The setup connection is closed only after the workload has been built.
    setup_connector.close()
def test_candidates_per_query(self):
    """`candidates_per_query` calls the candidate generator once per query."""
    max_width = 2
    extra_query = Query(18, """SELECT * FROM 1;""")
    workload = Workload([self.query_0, extra_query])

    # NOTE(review): the mock *returns* the real generator function instead of
    # calling it; only the number of results and the call arguments are checked.
    generator_mock = MagicMock(return_value=syntactically_relevant_indexes)

    candidates = candidates_per_query(
        workload,
        max_index_width=max_width,
        candidate_generator=generator_mock,
    )

    self.assertEqual(len(candidates), len(workload.queries))
    # The last call must be for the second query, an earlier one for the first.
    generator_mock.assert_called_with(extra_query, max_width)
    generator_mock.assert_any_call(self.query_0, max_width)
def test_calculate_best_indexes_scenario_3(self):
    """The chosen index depends on the budget: single-column at 2, multi-column at 4."""
    two_filter_query = Query(
        0,
        "SELECT * FROM TableA WHERE ColA = 1 AND ColB = 2;",
        [self.column_1, self.column_2],
    )
    workload = Workload([two_filter_query])
    self.algo.cost_evaluation.calculate_cost = MagicMock(
        side_effect=self._calculate_cost_mock_3
    )

    scenarios = (
        # Budget too small for multi
        (2, [self.column_2]),
        # Picks multi with best ratio
        (4, [self.column_2, self.column_1]),
    )
    for budget, expected_columns in scenarios:
        self.algo.budget = budget
        indexes = self.algo._calculate_best_indexes(workload)
        self.assertEqual(indexes, [Index(expected_columns)])
def test_calculate_best(self):
    """`calculate_best_indexes` on this algorithm raises NotImplementedError."""
    empty_workload = Workload([], self.db_name)
    algorithm = self.selection_algorithm

    with self.assertRaises(NotImplementedError):
        algorithm.calculate_best_indexes(empty_workload)
def test_cost_eval_cost_empty_workload(self):
    """The cost of a workload without queries and without indexes is 0."""
    empty_workload = Workload([], self.db_name)
    no_indexes = []

    evaluation = self.selection_algorithm.cost_evaluation
    self.assertEqual(evaluation.calculate_cost(empty_workload, no_indexes), 0)
def test_workload(self):
    """A Workload exposes the queries it was built with, in the same order."""
    queries = [
        Query(17, "SELECT * FROM TableA;"),
        Query(18, "SELECT * FROM nation;"),
    ]

    # A copy is handed in so the comparison is against an independent list.
    workload = Workload(list(queries))

    self.assertEqual(workload.queries, queries)