def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
    LOG.info('Use ddpg to recommend configuration')
    result_id = result_info['newest_result_id']
    result = Result.objects.filter(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    agg_data = DataUtil.aggregate_data(result)
    metric_data = agg_data['y_matrix'].flatten()
    cleaned_agg_data = clean_knob_data(agg_data['X_matrix'],
                                       agg_data['X_columnlabels'], session)
    knob_labels = np.array(cleaned_agg_data[1]).flatten()
    knob_num = len(knob_labels)
    metric_num = len(metric_data)

    ddpg = DDPG(n_actions=knob_num, n_states=metric_num, alr=ACTOR_LEARNING_RATE,
                clr=CRITIC_LEARNING_RATE, gamma=GAMMA, batch_size=DDPG_BATCH_SIZE,
                tau=TAU)
    if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory is not None:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    knob_data = ddpg.choose_action(metric_data)
    LOG.info('recommended knob: %s', knob_data)

    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
    knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(
        knob_data.reshape(1, -1))[0]
    conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['result_id'] = result_id
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: ddpg'
    for k in knob_labels:
        LOG.info('%s: %f', k, conf_map[k])
    return conf_map_res
def aggregate_target_results(result_id):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if latest_pipeline_run is None:
        result = Result.objects.filter(pk=result_id)
        knobs_ = KnobCatalog.objects.filter(dbms=result[0].dbms, tunable=True)
        knobs_catalog = {k.name: k for k in knobs_}
        knobs = {k: v for k, v in list(knobs_catalog.items())}
        # Generate a config randomly
        random_knob_result = gen_random_data(knobs)
        agg_data = DataUtil.aggregate_data(result)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = True
        agg_data['config_recommend'] = random_knob_result
        return agg_data

    # Aggregate all knob config results tried by the target so far in this
    # tuning session and this tuning workload.
    newest_result = Result.objects.get(pk=result_id)
    target_results = Result.objects.filter(session=newest_result.session,
                                           dbms=newest_result.dbms,
                                           workload=newest_result.workload)
    if len(target_results) == 0:
        raise Exception('Cannot find any results for session_id={}, dbms_id={}'
                        .format(newest_result.session, newest_result.dbms))
    agg_data = DataUtil.aggregate_data(target_results)
    agg_data['newest_result_id'] = result_id
    agg_data['bad'] = False
    return agg_data
def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
    LOG.info('Use ddpg to recommend configuration')
    result_id = result_info['newest_result_id']
    result = Result.objects.filter(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    agg_data = DataUtil.aggregate_data(result)
    metric_data = agg_data['y_matrix'].flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_labels = np.array(cleaned_knob_data[1]).flatten()
    knob_num = len(knob_labels)
    metric_num = len(metric_data)

    ddpg = DDPG(n_actions=knob_num, n_states=metric_num)
    if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory is not None:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    knob_data = ddpg.choose_action(normalized_metric_data)

    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
    knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(
        knob_data.reshape(1, -1))[0]
    conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['result_id'] = result_id
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: ddpg'
    return conf_map_res
def aggregate_target_results(result_id):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    latest_pipeline_run = PipelineRun.objects.get_latest()
    newest_result = Result.objects.get(pk=result_id)
    if latest_pipeline_run is None or newest_result.session.tuning_session == 'randomly_generate':
        result = Result.objects.filter(pk=result_id)
        knobs_ = KnobCatalog.objects.filter(dbms=result[0].dbms, tunable=True)
        knobs_catalog = {k.name: k for k in knobs_}
        knobs = {k: v for k, v in list(knobs_catalog.items())}
        # Generate a config randomly
        random_knob_result = gen_random_data(knobs)
        agg_data = DataUtil.aggregate_data(result)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = True
        agg_data['config_recommend'] = random_knob_result
        return agg_data

    # Aggregate all knob config results tried by the target so far in this
    # tuning session and this tuning workload.
    target_results = Result.objects.filter(session=newest_result.session,
                                           dbms=newest_result.dbms,
                                           workload=newest_result.workload)
    if len(target_results) == 0:
        raise Exception('Cannot find any results for session_id={}, dbms_id={}'.format(
            newest_result.session, newest_result.dbms))
    agg_data = DataUtil.aggregate_data(target_results)
    agg_data['newest_result_id'] = result_id
    agg_data['bad'] = False
    return agg_data
def test_combine(self):
    test_dedup_row_labels = np.array(["Workload-0", "Workload-1"])
    test_dedup_x = np.matrix([[0.22, 5, "string", "11:11", "fsync", True],
                              [0.21, 6, "string", "11:12", "fsync", True]])
    test_dedup_y = np.matrix([[30, 30, 40],
                              [10, 10, 40]])

    test_x, test_y, row_labels = DataUtil.combine_duplicate_rows(
        test_dedup_x, test_dedup_y, test_dedup_row_labels)

    self.assertEqual(len(test_x), len(test_y))
    self.assertEqual(len(test_x), len(row_labels))
    self.assertEqual(row_labels[0], tuple([test_dedup_row_labels[0]]))
    self.assertEqual(row_labels[1], tuple([test_dedup_row_labels[1]]))
    self.assertTrue((test_x[0] == test_dedup_x[0]).all())
    self.assertTrue((test_x[1] == test_dedup_x[1]).all())
    self.assertTrue((test_y[0] == test_dedup_y[0]).all())
    self.assertTrue((test_y[1] == test_dedup_y[1]).all())

    test_row_labels = np.array(["Workload-0", "Workload-1",
                                "Workload-2", "Workload-3"])
    test_x_matrix = np.matrix([[0.22, 5, "string", "timestamp", "enum", True],
                               [0.3, 5, "rstring", "timestamp2", "enum", False],
                               [0.22, 5, "string", "timestamp", "enum", True],
                               [0.3, 5, "r", "timestamp2", "enum", False]])
    test_y_matrix = np.matrix([[20, 30, 40],
                               [30, 30, 40],
                               [20, 30, 40],
                               [32, 30, 40]])

    test_x, test_y, row_labels = DataUtil.combine_duplicate_rows(
        test_x_matrix, test_y_matrix, test_row_labels)

    self.assertTrue(len(test_x) <= len(test_x_matrix))
    self.assertTrue(len(test_y) <= len(test_y_matrix))
    self.assertEqual(len(test_x), len(test_y))
    self.assertEqual(len(test_x), len(row_labels))

    row_labels_set = set(row_labels)
    self.assertTrue(tuple(["Workload-0", "Workload-2"]) in row_labels_set)
    self.assertTrue(("Workload-1",) in row_labels_set)
    self.assertTrue(("Workload-3",) in row_labels_set)

    rows = set()
    for i in test_x:
        self.assertTrue(tuple(i) not in rows)
        self.assertTrue(i in test_x_matrix)
        rows.add(tuple(i))

    rowys = set()
    for i in test_y:
        self.assertTrue(tuple(i) not in rowys)
        self.assertTrue(i in test_y_matrix)
        rowys.add(tuple(i))
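# A hedged sketch (not the project's DataUtil implementation) of the behavior the
# test above exercises: rows of X with identical knob settings are merged into one
# row, the merged row's metrics are aggregated (the median is an assumption here;
# the real code may aggregate differently), and the row label becomes the tuple of
# the original labels.
import numpy as np

def combine_duplicate_rows_sketch(X, y, rowlabels):
    X_unique, idxs, invs, counts = np.unique(
        X, return_index=True, return_inverse=True, return_counts=True, axis=0)
    y_unique = np.empty((X_unique.shape[0], y.shape[1]))
    labels_unique = []
    for i, _ in enumerate(counts):
        dup_idxs = np.arange(X.shape[0])[invs == i]   # original rows mapping here
        y_unique[i, :] = np.median(y[dup_idxs, :], axis=0)
        labels_unique.append(tuple(rowlabels[j] for j in dup_idxs))
    return X_unique, y_unique, labels_unique

X = np.array([[1, 2], [3, 4], [1, 2]])
y = np.array([[10.0], [20.0], [10.0]])
print(combine_duplicate_rows_sketch(X, y, ["w0", "w1", "w2"]))
# Two unique rows remain; "w0" and "w2" collapse into the label ("w0", "w2").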
def aggregate_target_results(result_id, algorithm):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    newest_result = Result.objects.get(pk=result_id)
    has_pipeline_data = PipelineData.objects.filter(
        workload=newest_result.workload).exists()
    if not has_pipeline_data or newest_result.session.tuning_session == 'randomly_generate':
        if not has_pipeline_data and newest_result.session.tuning_session == 'tuning_session':
            LOG.debug("Background tasks haven't run for this workload yet, "
                      "picking random data.")

        result = Result.objects.filter(pk=result_id)
        knobs = SessionKnob.objects.get_knobs_for_session(newest_result.session)

        # Generate a config randomly
        random_knob_result = gen_random_data(knobs)
        agg_data = DataUtil.aggregate_data(result)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = True
        agg_data['config_recommend'] = random_knob_result
        LOG.debug('%s: Finished generating a random config.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm), JSONUtil.dumps(agg_data, pprint=True))
    else:
        # Aggregate all knob config results tried by the target so far in this
        # tuning session and this tuning workload.
        target_results = Result.objects.filter(session=newest_result.session,
                                               dbms=newest_result.dbms,
                                               workload=newest_result.workload)
        if len(target_results) == 0:
            raise Exception('Cannot find any results for session_id={}, dbms_id={}'.format(
                newest_result.session, newest_result.dbms))
        agg_data = DataUtil.aggregate_data(target_results)
        agg_data['newest_result_id'] = result_id
        agg_data['bad'] = False

        # Clean knob data
        cleaned_agg_data = clean_knob_data(agg_data['X_matrix'],
                                           agg_data['X_columnlabels'],
                                           newest_result.session)
        agg_data['X_matrix'] = np.array(cleaned_agg_data[0])
        agg_data['X_columnlabels'] = np.array(cleaned_agg_data[1])
        LOG.debug('%s: Finished aggregating target results.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm), JSONUtil.dumps(agg_data, pprint=True))
    return agg_data, algorithm
def configuration_recommendation_ddpg(result_info):  # pylint: disable=invalid-name
    start_ts = time.time()
    LOG.info('Use ddpg to recommend configuration')
    result_id = result_info['newest_result_id']
    result_list = Result.objects.filter(pk=result_id)
    result = result_list.first()
    session = result.session
    params = JSONUtil.loads(session.hyperparameters)
    agg_data = DataUtil.aggregate_data(result_list)
    metric_data, _ = clean_metric_data(agg_data['y_matrix'],
                                       agg_data['y_columnlabels'], session)
    metric_data = metric_data.flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_labels = np.array(cleaned_knob_data[1]).flatten()
    knob_num = len(knob_labels)
    metric_num = len(metric_data)

    ddpg = DDPG(n_actions=knob_num, n_states=metric_num,
                a_hidden_sizes=params['DDPG_ACTOR_HIDDEN_SIZES'],
                c_hidden_sizes=params['DDPG_CRITIC_HIDDEN_SIZES'],
                use_default=params['DDPG_USE_DEFAULT'])
    if session.ddpg_actor_model is not None and session.ddpg_critic_model is not None:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory is not None:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    knob_data = ddpg.choose_action(normalized_metric_data)

    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels, session))
    knob_data = MinMaxScaler().fit(knob_bounds).inverse_transform(
        knob_data.reshape(1, -1))[0]
    conf_map = {k: knob_data[i] for i, k in enumerate(knob_labels)}

    conf_map_res = create_and_save_recommendation(recommended_knobs=conf_map,
                                                  result=result,
                                                  status='good',
                                                  info='INFO: ddpg')
    save_execution_time(start_ts, "configuration_recommendation_ddpg", result)
    return conf_map_res
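# The denormalization trick used above is worth seeing in isolation. A MinMaxScaler
# fit on the stacked (min, max) bound pairs of each knob maps real knob values into
# [0, 1]; calling inverse_transform on the DDPG action (assumed here, as in the code
# above, to lie in [0, 1] per knob) rescales it into each knob's legal range. A
# minimal sketch with made-up bounds:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

knob_bounds = np.array([[64.0, 0.0],       # row 0: minimum of each knob
                        [4096.0, 100.0]])  # row 1: maximum of each knob
action = np.array([0.5, 0.25])             # DDPG output, one value per knob
knob_values = MinMaxScaler().fit(knob_bounds).inverse_transform(
    action.reshape(1, -1))[0]
print(knob_values)  # [2080. 25.]: each component linearly rescaled to its bounds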
def test_no_featured_categorical(self):
    featured_knobs = ['global.backend_flush_after',
                      'global.bgwriter_delay',
                      'global.wal_writer_delay',
                      'global.work_mem']
    postgresdb = DBMSCatalog.objects.get(pk=1)
    categorical_info = DataUtil.dummy_encoder_helper(featured_knobs,
                                                     dbms=postgresdb)
    self.assertEqual(len(categorical_info['n_values']), 0)
    self.assertEqual(len(categorical_info['categorical_features']), 0)
    self.assertEqual(categorical_info['cat_columnlabels'], [])
    self.assertEqual(categorical_info['noncat_columnlabels'], featured_knobs)
def test_no_featured_categorical(self):
    featured_knobs = ['global.backend_flush_after',
                      'global.bgwriter_delay',
                      'global.wal_writer_delay',
                      'global.work_mem']
    postgres96 = DBMSCatalog.objects.get(pk=1)
    categorical_info = DataUtil.dummy_encoder_helper(featured_knobs,
                                                     dbms=postgres96)
    self.assertEqual(len(categorical_info['n_values']), 0)
    self.assertEqual(len(categorical_info['categorical_features']), 0)
    self.assertEqual(categorical_info['cat_columnlabels'], [])
    self.assertEqual(categorical_info['noncat_columnlabels'], featured_knobs)
def test_featured_categorical(self):
    featured_knobs = ['global.backend_flush_after',
                      'global.bgwriter_delay',
                      'global.wal_writer_delay',
                      'global.work_mem',
                      'global.wal_sync_method']  # last knob categorical
    postgres96 = DBMSCatalog.objects.get(pk=1)
    categorical_info = DataUtil.dummy_encoder_helper(featured_knobs,
                                                     dbms=postgres96)
    self.assertEqual(len(categorical_info['n_values']), 1)
    self.assertEqual(categorical_info['n_values'][0], 4)
    self.assertEqual(len(categorical_info['categorical_features']), 1)
    self.assertEqual(categorical_info['categorical_features'][0], 4)
    self.assertEqual(categorical_info['cat_columnlabels'], ['global.wal_sync_method'])
    self.assertEqual(categorical_info['noncat_columnlabels'], featured_knobs[:-1])
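# For context, a hedged sketch of what a dummy encoder built from this kind of
# categorical_info has to do: expand a categorical knob with n_values possible
# settings into n_values 0/1 indicator columns while passing the numeric knobs
# through. This is plain numpy, not the project's DummyEncoder class, and the
# data below is made up (one numeric knob plus one categorical code column).
import numpy as np

def dummy_encode(X, cat_idx, n_values):
    # The categorical column is assumed to hold integer codes 0..n_values-1.
    codes = X[:, cat_idx].astype(int)
    indicators = np.eye(n_values)[codes]    # one row of 0/1 flags per sample
    noncat = np.delete(X, cat_idx, axis=1)  # remaining numeric columns
    return np.hstack([noncat, indicators])

X = np.array([[512.0, 3], [1024.0, 0]])
print(dummy_encode(X, cat_idx=1, n_values=4))
# [[ 512.    0.    0.    0.    1.]
#  [1024.    1.    0.    0.    0.]]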
def aggregate_results():
    unique_clusters = WorkloadCluster.objects.all()
    unique_clusters = filter(lambda x: x.isdefault is False, unique_clusters)
    all_data = {}
    all_labels = {}
    for cluster in unique_clusters:
        results = ResultData.objects.filter(cluster=cluster)
        if len(results) < 2:
            continue
        if cluster.dbms.pk not in all_labels:
            knob_labels = np.asarray(
                sorted(JSONUtil.loads(results[0].param_data).keys()))
            metric_labels = np.asarray(
                sorted(JSONUtil.loads(results[0].metric_data).keys()))
            all_labels[cluster.dbms.pk] = (knob_labels, metric_labels)
        else:
            knob_labels, metric_labels = all_labels[cluster.dbms.pk]
        entry = DataUtil.aggregate_data(results, knob_labels, metric_labels)
        key = (cluster.dbms.pk, cluster.hardware.pk)
        if key not in all_data:
            all_data[key] = {}
        all_data[key][cluster.pk] = entry

    ts = now()
    tsf = ts.strftime("%Y%m%d-%H%M%S")
    for (dbkey, hwkey), cluster_data in all_data.iteritems():
        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.AGGREGATED_DATA].replace(' ', '').upper()
        savepaths = {}
        for clusterkey, entry in cluster_data.iteritems():
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbkey, hwkey,
                                                clusterkey, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[clusterkey] = savepath
            np.savez_compressed(savepath, **entry)

        value = {'data': savepaths}
        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbkey)
        new_res.hardware = Hardware.objects.get(pk=hwkey)
        new_res.creation_timestamp = ts
        new_res.task_type = PipelineTaskType.AGGREGATED_DATA
        new_res.value = JSONUtil.dumps(value)
        new_res.save()
def aggregate_data(workload):
    # Aggregates both the knob & metric data for the given workload.
    #
    # Parameters:
    #     workload: aggregate data belonging to this specific workload
    #
    # Returns: two dictionaries containing the knob & metric data as
    # a tuple

    # Find the results for this workload
    wkld_results = Result.objects.filter(workload=workload)

    # Now call the aggregate_data helper function to combine all knob &
    # metric data into matrices and also create row/column labels
    # (see the DataUtil class in website/utils.py)
    #
    # The aggregate_data helper function returns a dictionary of the form:
    #   - 'X_matrix': the knob data as a 2D numpy matrix (results x knobs)
    #   - 'y_matrix': the metric data as a 2D numpy matrix (results x metrics)
    #   - 'rowlabels': list of result ids that correspond to the rows in
    #         both X_matrix & y_matrix
    #   - 'X_columnlabels': a list of the knob names corresponding to the
    #         columns in the knob_data matrix
    #   - 'y_columnlabels': a list of the metric names corresponding to the
    #         columns in the metric_data matrix
    aggregated_data = DataUtil.aggregate_data(wkld_results)

    # Separate knob & workload data into two "standard" dictionaries of the
    # same form
    knob_data = {
        'data': aggregated_data['X_matrix'],
        'rowlabels': aggregated_data['rowlabels'],
        'columnlabels': aggregated_data['X_columnlabels']
    }
    metric_data = {
        'data': aggregated_data['y_matrix'],
        'rowlabels': copy.deepcopy(aggregated_data['rowlabels']),
        'columnlabels': aggregated_data['y_columnlabels']
    }

    # Return the knob & metric data
    return knob_data, metric_data
def aggregate_target_results(result_id):
    # Check that we've completed the background tasks at least once. We need
    # this data in order to make a configuration recommendation (until we
    # implement a sampling technique to generate new training data).
    latest_pipeline_run = PipelineRun.objects.get_latest()
    if latest_pipeline_run is None:
        raise Exception("No previous data available. Implement me!")

    # Aggregate all knob config results tried by the target so far in this
    # tuning session.
    newest_result = Result.objects.get(pk=result_id)
    target_results = Result.objects.filter(session=newest_result.session,
                                           dbms=newest_result.dbms)
    if len(target_results) == 0:
        raise Exception('Cannot find any results for session_id={}, dbms_id={}'
                        .format(newest_result.session, newest_result.dbms))
    agg_data = DataUtil.aggregate_data(target_results)
    agg_data['newest_result_id'] = result_id
    return agg_data
def aggregate_data(wkld_results):
    # Aggregates both the knob & metric data for the given workload.
    #
    # Parameters:
    #     wkld_results: result data belonging to this specific workload
    #
    # Returns: two dictionaries containing the knob & metric data as
    # a tuple

    # Now call the aggregate_data helper function to combine all knob &
    # metric data into matrices and also create row/column labels
    # (see the DataUtil class in website/utils.py)
    #
    # The aggregate_data helper function returns a dictionary of the form:
    #   - 'X_matrix': the knob data as a 2D numpy matrix (results x knobs)
    #   - 'y_matrix': the metric data as a 2D numpy matrix (results x metrics)
    #   - 'rowlabels': list of result ids that correspond to the rows in
    #         both X_matrix & y_matrix
    #   - 'X_columnlabels': a list of the knob names corresponding to the
    #         columns in the knob_data matrix
    #   - 'y_columnlabels': a list of the metric names corresponding to the
    #         columns in the metric_data matrix
    start_ts = time.time()
    aggregated_data = DataUtil.aggregate_data(
        wkld_results, ignore=['range_test', 'default', '*'])

    # Separate knob & workload data into two "standard" dictionaries of the
    # same form
    knob_data = {
        'data': aggregated_data['X_matrix'],
        'rowlabels': aggregated_data['rowlabels'],
        'columnlabels': aggregated_data['X_columnlabels']
    }
    metric_data = {
        'data': aggregated_data['y_matrix'],
        'rowlabels': copy.deepcopy(aggregated_data['rowlabels']),
        'columnlabels': aggregated_data['y_columnlabels']
    }

    # Return the knob & metric data
    save_execution_time(start_ts, "aggregate_data")
    return knob_data, metric_data
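# The two dictionaries returned above are deliberately parallel: row i of
# knob_data['data'] and row i of metric_data['data'] come from the same Result,
# identified by rowlabels[i]. A tiny consumer illustrating that contract (the
# shapes and label values here are made up, not real pipeline data):
import numpy as np

knob_data = {'data': np.zeros((3, 5)), 'rowlabels': [11, 12, 13],
             'columnlabels': ['knob_a', 'knob_b', 'knob_c', 'knob_d', 'knob_e']}
metric_data = {'data': np.zeros((3, 2)), 'rowlabels': [11, 12, 13],
               'columnlabels': ['throughput', 'latency_99']}

assert knob_data['rowlabels'] == metric_data['rowlabels']
assert knob_data['data'].shape == (len(knob_data['rowlabels']),
                                   len(knob_data['columnlabels']))
assert metric_data['data'].shape == (len(metric_data['rowlabels']),
                                     len(metric_data['columnlabels']))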
def aggregate_target_results(result_id):
    newest_result = Result.objects.get(pk=result_id)
    target_results = Result.objects.filter(application=newest_result.application,
                                           dbms=newest_result.dbms)
    if len(target_results) == 0:
        raise Exception('Cannot find any results for app_id={}, dbms_id={}'.format(
            newest_result.application, newest_result.dbms))
    target_result_datas = [ResultData.objects.get(result=tres)
                           for tres in target_results]
    knob_labels = np.asarray(
        sorted(JSONUtil.loads(target_result_datas[0].param_data).keys()))
    metric_labels = np.asarray(
        sorted(JSONUtil.loads(target_result_datas[0].metric_data).keys()))
    agg_data = DataUtil.aggregate_data(target_result_datas, knob_labels,
                                       metric_labels)
    agg_data['newest_result_id'] = result_id
    return agg_data
def test_aggregate(self):
    workload2 = Result.objects.filter(workload=2)
    num_results = Result.objects.filter(workload=2).count()
    knobs = list(JSONUtil.loads(workload2[0].knob_data.data).keys())
    metrics = list(JSONUtil.loads(workload2[0].metric_data.data).keys())
    num_knobs = len(knobs)
    num_metrics = len(metrics)

    test_result = DataUtil.aggregate_data(workload2)

    self.assertTrue('X_matrix' in list(test_result.keys()))
    self.assertTrue('y_matrix' in list(test_result.keys()))
    self.assertTrue('rowlabels' in list(test_result.keys()))
    self.assertTrue('X_columnlabels' in list(test_result.keys()))
    self.assertTrue('y_columnlabels' in list(test_result.keys()))

    self.assertEqual(test_result['X_columnlabels'], knobs)
    self.assertEqual(test_result['y_columnlabels'], metrics)
    self.assertEqual(test_result['X_matrix'].shape[0], num_results)
    self.assertEqual(test_result['y_matrix'].shape[0], num_results)
    self.assertEqual(test_result['X_matrix'].shape[1], num_knobs)
    self.assertEqual(test_result['y_matrix'].shape[1], num_metrics)
def run_knob_identification(knob_data, metric_data, dbms):
    # Performs knob identification on the knob & metric data and returns
    # a set of ranked knobs.
    #
    # Parameters:
    #     knob_data & metric_data are dictionaries of the form:
    #         - 'data': 2D numpy matrix of knob/metric data
    #         - 'rowlabels': a list of identifiers for the rows in the matrix
    #         - 'columnlabels': a list of the knob/metric names corresponding
    #               to the columns in the data matrix
    #     dbms is the foreign key pointing to target dbms in DBMSCatalog
    #
    # When running the lasso algorithm, the knob_data matrix is the set of
    # independent variables (X) and the metric_data is the set of
    # dependent variables (y).
    knob_matrix = knob_data['data']
    knob_columnlabels = knob_data['columnlabels']
    metric_matrix = metric_data['data']
    metric_columnlabels = metric_data['columnlabels']

    # Remove constant columns from knob_matrix and metric_matrix
    nonconst_knob_matrix = []
    nonconst_knob_columnlabels = []
    for col, cl in zip(knob_matrix.T, knob_columnlabels):
        if np.any(col != col[0]):
            nonconst_knob_matrix.append(col.reshape(-1, 1))
            nonconst_knob_columnlabels.append(cl)
    assert len(nonconst_knob_matrix) > 0, "Need more data to train the model"
    nonconst_knob_matrix = np.hstack(nonconst_knob_matrix)

    nonconst_metric_matrix = []
    nonconst_metric_columnlabels = []
    for col, cl in zip(metric_matrix.T, metric_columnlabels):
        if np.any(col != col[0]):
            nonconst_metric_matrix.append(col.reshape(-1, 1))
            nonconst_metric_columnlabels.append(cl)
    nonconst_metric_matrix = np.hstack(nonconst_metric_matrix)

    # Determine which knobs need encoding (enums with >2 possible values)
    categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels,
                                                     dbms)
    # Encode categorical variables first (at least, before standardizing)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    encoded_knob_matrix = dummy_encoder.fit_transform(nonconst_knob_matrix)
    encoded_knob_columnlabels = dummy_encoder.new_labels

    # Standardize values in each column to N(0, 1)
    standardizer = StandardScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(nonconst_metric_matrix)

    # Shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0], seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

    # Run lasso algorithm
    lasso_model = LassoPath()
    lasso_model.fit(shuffled_knob_matrix, shuffled_metric_matrix,
                    encoded_knob_columnlabels)

    # Consolidate categorical feature columns, and reset to original names
    encoded_knobs = lasso_model.get_ranked_features()
    consolidated_knobs = consolidate_columnlabels(encoded_knobs)

    return consolidated_knobs
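# The LassoPath ranking step deserves a standalone illustration. The idea is that
# knobs whose coefficients become nonzero earliest as the L1 penalty is relaxed are
# the most impactful. A hedged sketch using scikit-learn's lasso_path on synthetic
# data; the project's LassoPath class is assumed to work along these lines but also
# handles multi-output metric matrices.
import numpy as np
from sklearn.linear_model import lasso_path

rng = np.random.RandomState(17)
X = rng.randn(100, 4)                            # 100 samples, 4 standardized knobs
y = 3.0 * X[:, 0] + 0.5 * X[:, 2] + 0.01 * rng.randn(100)

# coefs has shape (n_features, n_alphas); alphas decrease along the path
_, coefs, _ = lasso_path(X, y)
entered = coefs != 0
first_step = np.where(entered.any(axis=1), entered.argmax(axis=1), coefs.shape[1])
ranking = np.argsort(first_step)                 # earliest entry = most important
print(ranking)                                   # expect feature 0 first, then feature 2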
def configuration_recommendation(target_data):
    if target_data['scores'] is None:
        raise NotImplementedError('Implement me!')
    best_wkld_id = target_data['mapped_workload'][0]

    # Load specific workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_obj = newest_result.application.target_objective
    dbms_id = newest_result.dbms.pk
    hw_id = newest_result.application.hardware.pk
    agg_data = PipelineResult.get_latest(dbms_id, hw_id,
                                         PipelineTaskType.AGGREGATED_DATA)
    if agg_data is None:
        return None
    data_map = JSONUtil.loads(agg_data.value)
    if best_wkld_id not in data_map['data']:
        raise Exception(('Cannot find mapped workload '
                         '(id={}) in aggregated data').format(best_wkld_id))
    workload_data = np.load(data_map['data'][best_wkld_id])

    # Mapped workload data
    X_wkld_matrix = workload_data['X_matrix']
    y_wkld_matrix = workload_data['y_matrix']
    wkld_rowlabels = workload_data['rowlabels']
    X_columnlabels = workload_data['X_columnlabels']
    y_columnlabels = workload_data['y_columnlabels']

    # Target workload data
    X_target_matrix = target_data['X_matrix']
    y_target_matrix = target_data['y_matrix']
    target_rowlabels = target_data['rowlabels']

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter knobs
    ranked_knobs = JSONUtil.loads(PipelineResult.get_latest(
        dbms_id, hw_id, PipelineTaskType.RANKED_KNOBS).value)[:10]  # FIXME
    X_idxs = [i for i in range(X_columnlabels.shape[0])
              if X_columnlabels[i] in ranked_knobs]
    X_wkld_matrix = X_wkld_matrix[:, X_idxs]
    X_target_matrix = X_target_matrix[:, X_idxs]
    X_columnlabels = X_columnlabels[X_idxs]

    # Filter metrics by current target objective metric
    y_idx = [i for i in range(y_columnlabels.shape[0])
             if y_columnlabels[i] == target_obj]
    if len(y_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_obj))
    elif len(y_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(y_idx), target_obj))
    y_wkld_matrix = y_wkld_matrix[:, y_idx]
    y_target_matrix = y_target_matrix[:, y_idx]
    y_columnlabels = y_columnlabels[y_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_wkld_matrix, y_wkld_matrix, wkld_rowlabels = DataUtil.combine_duplicate_rows(
        X_wkld_matrix, y_wkld_matrix, wkld_rowlabels)
    X_target_matrix, y_target_matrix, target_rowlabels = DataUtil.combine_duplicate_rows(
        X_target_matrix, y_target_matrix, target_rowlabels)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_wkld_matrix.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target_matrix]
    for i, row in enumerate(X_wkld_matrix):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_wkld_matrix = X_wkld_matrix[dups_filter, :]
    y_wkld_matrix = y_wkld_matrix[dups_filter, :]
    wkld_rowlabels = wkld_rowlabels[dups_filter]

    # Combine Xs and scale
    X_matrix = np.vstack([X_target_matrix, X_wkld_matrix])
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target_matrix.shape[0] < 5:  # FIXME
        y_target_scaler = None
        y_wkld_scaler = StandardScaler()
        y_matrix = np.vstack([y_target_matrix, y_wkld_matrix])
        y_scaled = y_wkld_scaler.fit_transform(y_matrix)
    else:
        try:
            y_target_scaler = StandardScaler()
            y_wkld_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target_matrix)
            y_wkld_scaled = y_wkld_scaler.fit_transform(y_wkld_matrix)
            y_scaled = np.vstack([y_target_scaled, y_wkld_scaled])
        except ValueError:
            y_target_scaler = None
            y_wkld_scaler = StandardScaler()
            y_matrix = np.vstack([y_target_matrix, y_wkld_matrix])
            y_scaled = y_wkld_scaler.fit_transform(y_matrix)

    # Give target rows a smaller ridge (noise) term than workload rows
    ridge = np.empty(X_scaled.shape[0])
    ridge[:X_target_matrix.shape[0]] = 0.01
    ridge[X_target_matrix.shape[0]:] = 0.1

    # FIXME
    num_samples = 5
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    for i in range(X_scaled.shape[1]):
        col_min = X_scaled[:, i].min()
        col_max = X_scaled[:, i].max()
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    model = GPR_GD()
    model.fit(X_scaled, y_scaled, ridge)
    res = model.predict(X_samples)
    best_idx = np.argmin(res.minL.ravel())
    best_conf = res.minL_conf[best_idx, :]
    best_conf = X_scaler.inverse_transform(best_conf)
    conf_map = {k: best_conf[i] for i, k in enumerate(X_columnlabels)}
    return conf_map
def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad']:
        assert target_data is not None
        return target_data
    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:

        workload_obj = Workload.objects.get(pk=unique_workload)
        wkld_results = Result.objects.filter(workload=workload_obj)
        if wkld_results.exists() is False:
            # Delete the workload
            workload_obj.delete()
            continue

        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:IMPORTANT_KNOB_NUMBER]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [i for i in range(X_matrix.shape[1])
                                if X_columnlabels[i] in global_ranked_knobs]
            pruned_metric_idxs = [i for i in range(y_matrix.shape[1])
                                  if y_columnlabels[i] in global_pruned_metrics]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    assert len(workload_data) > 0

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=DEFAULT_LENGTH_SCALE,
                          magnitude=DEFAULT_MAGNITUDE,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=DEFAULT_RIDGE)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(np.subtract(predictions, y_target)),
                               axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    # scores_info = {workload_id: (workload_name, score)}
    scores_info = {}
    for workload_id, similarity_score in list(scores.items()):
        workload_name = Workload.objects.get(pk=workload_id).name
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
            best_workload_name = workload_name
        scores_info[workload_id] = (workload_name, similarity_score)
    target_data['mapped_workload'] = (best_workload_id, best_workload_name,
                                      best_score)
    target_data['scores'] = scores_info
    return target_data
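# The per-workload score above is just the mean Euclidean distance between two
# matrices, computed after both the GP predictions and the target's observed
# metrics have been binned by deciles. A compact sketch of that distance step
# alone (the inputs are made up; the real pipeline gets `predictions` from the
# Gaussian process models):
import numpy as np

predictions = np.array([[1, 4], [2, 7]])   # decile-binned predicted metrics
y_target = np.array([[1, 5], [3, 7]])      # decile-binned observed metrics
dists = np.sqrt(np.sum(np.square(predictions - y_target), axis=1))
score = np.mean(dists)                      # lower score = more similar workload
print(dists, score)                         # [1. 1.] 1.0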
def map_workload(map_workload_input):
    start_ts = time.time()
    target_data, algorithm = map_workload_input

    if target_data['bad']:
        assert target_data is not None
        target_data['pipeline_run'] = None
        LOG.debug('%s: Skipping workload mapping.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(target_data, pprint=True))
        return target_data, algorithm

    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()
    assert latest_pipeline_run is not None
    target_data['pipeline_run'] = latest_pipeline_run.pk

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    params = JSONUtil.loads(session.hyperparameters)
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware,
        workload__project=target_workload.project)

    # FIXME (dva): we should also compute the global (i.e., overall) ranked_knobs
    # and pruned metrics but we just use those from the first workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()

    workload_data = {}
    # Compute workload mapping data for each unique workload
    for unique_workload in unique_workloads:

        workload_obj = Workload.objects.get(pk=unique_workload)
        wkld_results = Result.objects.filter(workload=workload_obj)
        if wkld_results.exists() is False:
            # Delete the workload
            workload_obj.delete()
            continue

        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        knob_data["data"], knob_data["columnlabels"] = clean_knob_data(
            knob_data["data"], knob_data["columnlabels"], newest_result.session)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:params['IMPORTANT_KNOB_NUMBER']]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [i for i in range(X_matrix.shape[1])
                                if X_columnlabels[i] in global_ranked_knobs]
            pruned_metric_idxs = [i for i in range(y_matrix.shape[1])
                                  if y_columnlabels[i] in global_pruned_metrics]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    if len(workload_data) == 0:
        # The background task that aggregates the data has not finished running yet
        target_data.update(mapped_workload=None, scores=None)
        LOG.debug('%s: Skipping workload mapping because there is no parsed workload.\n',
                  AlgorithmType.name(algorithm))
        return target_data, algorithm

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            if params['GPR_USE_GPFLOW']:
                model_kwargs = {'lengthscales': params['GPR_LENGTH_SCALE'],
                                'variance': params['GPR_MAGNITUDE'],
                                'noise_variance': params['GPR_RIDGE']}
                tf.reset_default_graph()
                graph = tf.get_default_graph()
                gpflow.reset_default_session(graph=graph)
                m = gpr_models.create_model(params['GPR_MODEL_NAME'], X=X_scaled,
                                            y=y_col, **model_kwargs)
                gpr_result = gpflow_predict(m.model, X_target)
            else:
                model = GPRNP(length_scale=params['GPR_LENGTH_SCALE'],
                              magnitude=params['GPR_MAGNITUDE'],
                              max_train_size=params['GPR_MAX_TRAIN_SIZE'],
                              batch_size=params['GPR_BATCH_SIZE'])
                model.fit(X_scaled, y_col, ridge=params['GPR_RIDGE'])
                gpr_result = model.predict(X_target)
            predictions[:, j] = gpr_result.ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(np.subtract(predictions, y_target)),
                               axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    best_workload_name = None
    scores_info = {}
    for workload_id, similarity_score in list(scores.items()):
        workload_name = Workload.objects.get(pk=workload_id).name
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
            best_workload_name = workload_name
        scores_info[workload_id] = (workload_name, similarity_score)
    target_data.update(mapped_workload=(best_workload_id, best_workload_name,
                                        best_score),
                       scores=scores_info)
    LOG.debug('%s: Finished mapping the workload.\n\ndata=%s\n',
              AlgorithmType.name(algorithm), JSONUtil.dumps(target_data, pprint=True))
    save_execution_time(start_ts, "map_workload", newest_result)
    return target_data, algorithm
def run_workload_characterization(metric_data, dbms=None):
    # Performs workload characterization on the metric_data and returns
    # a set of pruned metrics.
    #
    # Parameters:
    #     metric_data is a dictionary of the form:
    #         - 'data': 2D numpy matrix of metric data (results x metrics)
    #         - 'rowlabels': a list of identifiers for the rows in the matrix
    #         - 'columnlabels': a list of the metric names corresponding to
    #               the columns in the data matrix
    start_ts = time.time()
    matrix = metric_data['data']
    columnlabels = metric_data['columnlabels']
    LOG.debug("Workload characterization ~ initial data size: %s", matrix.shape)

    views = None if dbms is None else VIEWS_FOR_PRUNING.get(dbms.type, None)
    matrix, columnlabels = DataUtil.clean_metric_data(matrix, columnlabels, views)
    LOG.debug("Workload characterization ~ cleaned data size: %s", matrix.shape)

    # Bin each column (metric) in the matrix by its decile
    binner = Bin(bin_start=1, axis=0)
    binned_matrix = binner.fit_transform(matrix)

    # Remove any constant columns
    nonconst_matrix = []
    nonconst_columnlabels = []
    for col, cl in zip(binned_matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(cl)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"
    nonconst_matrix = np.hstack(nonconst_matrix)
    LOG.debug("Workload characterization ~ nonconst data size: %s",
              nonconst_matrix.shape)

    # Remove any duplicate columns
    unique_matrix, unique_idxs = np.unique(nonconst_matrix, axis=1,
                                           return_index=True)
    unique_columnlabels = [nonconst_columnlabels[idx] for idx in unique_idxs]
    LOG.debug("Workload characterization ~ final data size: %s",
              unique_matrix.shape)
    n_rows, n_cols = unique_matrix.shape

    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
    shuffled_matrix = unique_matrix[shuffle_indices, :]

    # Fit factor analysis model
    fa_model = FactorAnalysis()
    # For now we use 5 latent variables
    fa_model.fit(shuffled_matrix, unique_columnlabels, n_components=5)

    # Components: metrics * factors
    components = fa_model.components_.T.copy()
    LOG.info("Workload characterization first part costs %.0f seconds.",
             time.time() - start_ts)

    # Run KMeans for # clusters k in range(1, num_nonduplicate_metrics - 1).
    # K should be much smaller than n_cols in detK; for now max_cluster <= 20.
    kmeans_models = KMeansClusters()
    kmeans_models.fit(components, min_cluster=1,
                      max_cluster=min(n_cols - 1, 20),
                      sample_labels=unique_columnlabels,
                      estimator_params={'n_init': 50})

    # Compute the optimal number of clusters, k, using gap statistics
    gapk = create_kselection_model("gap-statistic")
    gapk.fit(components, kmeans_models.cluster_map_)
    LOG.debug("Found optimal number of clusters: %d", gapk.optimal_num_clusters_)

    # Get pruned metrics: the closest samples to each cluster center
    pruned_metrics = kmeans_models.cluster_map_[
        gapk.optimal_num_clusters_].get_closest_samples()

    # Return pruned metrics
    save_execution_time(start_ts, "run_workload_characterization")
    LOG.info("Workload characterization finished in %.0f seconds.",
             time.time() - start_ts)
    return pruned_metrics
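# A hedged, scikit-learn-only sketch of the pruning pipeline above: factor analysis
# projects metrics into a small latent space, k-means clusters the metrics in that
# space, and one representative metric is kept per cluster (here the member closest
# to its centroid). The real code selects k with the gap statistic via its own
# KMeansClusters/create_kselection_model classes; this sketch just fixes k = 2 on
# made-up data.
import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
matrix = rng.rand(50, 6)                       # 50 results x 6 metrics
labels = ['m%d' % i for i in range(6)]

fa = FactorAnalysis(n_components=5).fit(matrix)
components = fa.components_.T                  # metrics x factors

km = KMeans(n_clusters=2, n_init=50, random_state=0).fit(components)
pruned = []
for c in range(2):
    members = np.where(km.labels_ == c)[0]
    center = km.cluster_centers_[c]
    closest = members[np.argmin(np.linalg.norm(components[members] - center, axis=1))]
    pruned.append(labels[closest])
print(pruned)                                  # one representative metric per cluster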
def create_workload_mapping_data():
    agg_datas = PipelineResult.objects.filter(
        task_type=PipelineTaskType.AGGREGATED_DATA)
    dbmss = set([ad.dbms.pk for ad in agg_datas])
    hardwares = set([ad.hardware.pk for ad in agg_datas])

    for dbms_id, hw_id in itertools.product(dbmss, hardwares):
        data = PipelineResult.get_latest(dbms_id, hw_id,
                                         PipelineTaskType.AGGREGATED_DATA)
        file_info = JSONUtil.loads(data.value)
        cluster_data = OrderedDict()
        for cluster, path in file_info['data'].iteritems():
            compressed_data = np.load(path)
            X_matrix = compressed_data['X_matrix']
            y_matrix = compressed_data['y_matrix']
            X_columnlabels = compressed_data['X_columnlabels']
            y_columnlabels = compressed_data['y_columnlabels']
            rowlabels = compressed_data['rowlabels']

            # Filter metrics and knobs
            ranked_knobs = JSONUtil.loads(PipelineResult.get_latest(
                dbms_id, hw_id, PipelineTaskType.RANKED_KNOBS).value)[:10]  # FIXME
            pruned_metrics = JSONUtil.loads(PipelineResult.get_latest(
                dbms_id, hw_id, PipelineTaskType.PRUNED_METRICS).value)
            knob_idxs = [i for i in range(X_matrix.shape[1])
                         if X_columnlabels[i] in ranked_knobs]
            metric_idxs = [i for i in range(y_matrix.shape[1])
                           if y_columnlabels[i] in pruned_metrics]
            X_matrix = X_matrix[:, knob_idxs]
            X_columnlabels = X_columnlabels[knob_idxs]
            y_matrix = y_matrix[:, metric_idxs]
            y_columnlabels = y_columnlabels[metric_idxs]

            # Combine duplicate rows
            X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
                X_matrix, y_matrix, rowlabels)

            cluster_data[cluster] = {
                'X_matrix': X_matrix,
                'y_matrix': y_matrix,
                'X_columnlabels': X_columnlabels,
                'y_columnlabels': y_columnlabels,
                'rowlabels': rowlabels,
            }

        Xs = np.vstack([entry['X_matrix'] for entry in cluster_data.values()])
        ys = np.vstack([entry['y_matrix'] for entry in cluster_data.values()])

        X_scaler = StandardScaler(copy=False)
        X_scaler.fit(Xs)
        y_scaler = StandardScaler(copy=False)
        y_scaler.fit_transform(ys)
        y_binner = Bin(axis=0)
        y_binner.fit(ys)
        del Xs
        del ys

        task_name = PipelineTaskType.TYPE_NAMES[
            PipelineTaskType.WORKLOAD_MAPPING_DATA].replace(' ', '').upper()
        timestamp = data.creation_timestamp
        tsf = timestamp.strftime("%Y%m%d-%H%M%S")
        savepaths = {}
        for cluster, entry in cluster_data.iteritems():
            X_scaler.transform(entry['X_matrix'])
            y_scaler.transform(entry['y_matrix'])
            fname = '{}_{}_{}_{}_{}.npz'.format(task_name, dbms_id, hw_id,
                                                cluster, tsf)
            savepath = os.path.join(PIPELINE_DIR, fname)
            savepaths[cluster] = savepath
            np.savez_compressed(savepath, **entry)

        X_scaler_path = os.path.join(PIPELINE_DIR, '{}_XSCALER_{}_{}_{}.npz'.format(
            task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(X_scaler_path, mean=X_scaler.mean_,
                            scale=X_scaler.scale_)
        y_scaler_path = os.path.join(PIPELINE_DIR, '{}_YSCALER_{}_{}_{}.npz'.format(
            task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_scaler_path, mean=y_scaler.mean_,
                            scale=y_scaler.scale_)
        y_deciles_path = os.path.join(PIPELINE_DIR, '{}_YDECILES_{}_{}_{}.npz'.format(
            task_name, dbms_id, hw_id, tsf))
        np.savez_compressed(y_deciles_path, deciles=y_binner.deciles_)

        value = {
            'data': savepaths,
            'X_scaler': X_scaler_path,
            'y_scaler': y_scaler_path,
            'y_deciles': y_deciles_path,
            'X_columnlabels': cluster_data.values()[0]['X_columnlabels'].tolist(),
            'y_columnlabels': cluster_data.values()[0]['y_columnlabels'].tolist(),
        }

        new_res = PipelineResult()
        new_res.dbms = DBMSCatalog.objects.get(pk=dbms_id)
        new_res.hardware = Hardware.objects.get(pk=hw_id)
        new_res.creation_timestamp = timestamp
        new_res.task_type = PipelineTaskType.WORKLOAD_MAPPING_DATA
        new_res.value = JSONUtil.dumps(value, pprint=True)
        new_res.save()
def train_ddpg(result_id):
    LOG.info('Add training data to ddpg and train ddpg')
    result = Result.objects.get(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    params = JSONUtil.loads(session.hyperparameters)
    session_results = Result.objects.filter(session=session,
                                            creation_time__lt=result.creation_time)
    result_info = {}
    result_info['newest_result_id'] = result_id

    # Extract data from result and previous results
    result = Result.objects.filter(pk=result_id)
    if len(session_results) == 0:
        base_result_id = result_id
        prev_result_id = result_id
    else:
        base_result_id = session_results[0].pk
        prev_result_id = session_results[len(session_results) - 1].pk
    base_result = Result.objects.filter(pk=base_result_id)
    prev_result = Result.objects.filter(pk=prev_result_id)

    agg_data = DataUtil.aggregate_data(result)
    base_metric_data = (DataUtil.aggregate_data(base_result))['y_matrix'].flatten()
    prev_metric_data = (DataUtil.aggregate_data(prev_result))['y_matrix'].flatten()

    result = Result.objects.get(pk=result_id)
    target_objective = result.session.target_objective
    prev_obj_idx = [i for i, n in enumerate(agg_data['y_columnlabels'])
                    if n == target_objective]

    # Clean metric data
    metric_data, metric_labels = clean_metric_data(agg_data['y_matrix'],
                                                   agg_data['y_columnlabels'],
                                                   session)
    metric_data = metric_data.flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]

    # Clean knob data
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_data = np.array(cleaned_knob_data[0])
    knob_labels = np.array(cleaned_knob_data[1])
    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels.flatten(), session))
    knob_data = MinMaxScaler().fit(knob_bounds).transform(knob_data)[0]
    knob_num = len(knob_data)
    metric_num = len(metric_data)
    LOG.info('knob_num: %d, metric_num: %d', knob_num, metric_num)

    # Filter ys by current target objective metric
    target_obj_idx = [i for i, n in enumerate(metric_labels)
                      if n == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))
    objective = metric_data[target_obj_idx]
    base_objective = base_metric_data[prev_obj_idx]
    prev_objective = prev_metric_data[prev_obj_idx]
    metric_meta = db.target_objectives.get_metric_metadata(
        result.session.dbms.pk, result.session.target_objective)

    # Calculate the reward
    if params['DDPG_SIMPLE_REWARD']:
        objective = objective / base_objective
        if metric_meta[target_objective].improvement == '(less is better)':
            reward = -objective
        else:
            reward = objective
    else:
        if metric_meta[target_objective].improvement == '(less is better)':
            if objective - base_objective <= 0:  # positive reward
                reward = (np.square((2 * base_objective - objective) / base_objective) - 1)\
                    * abs(2 * prev_objective - objective) / prev_objective
            else:  # negative reward
                reward = -(np.square(objective / base_objective) - 1) \
                    * objective / prev_objective
        else:
            if objective - base_objective > 0:  # positive reward
                reward = (np.square(objective / base_objective) - 1) \
                    * objective / prev_objective
            else:  # negative reward
                reward = -(np.square((2 * base_objective - objective) / base_objective) - 1)\
                    * abs(2 * prev_objective - objective) / prev_objective
    LOG.info('reward: %f', reward)

    # Update ddpg
    ddpg = DDPG(n_actions=knob_num, n_states=metric_num,
                alr=params['DDPG_ACTOR_LEARNING_RATE'],
                clr=params['DDPG_CRITIC_LEARNING_RATE'],
                gamma=params['DDPG_GAMMA'],
                batch_size=params['DDPG_BATCH_SIZE'],
                a_hidden_sizes=params['DDPG_ACTOR_HIDDEN_SIZES'],
                c_hidden_sizes=params['DDPG_CRITIC_HIDDEN_SIZES'],
                use_default=params['DDPG_USE_DEFAULT'])
    if session.ddpg_actor_model and session.ddpg_critic_model:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    ddpg.add_sample(normalized_metric_data, knob_data, reward,
                    normalized_metric_data)
    for _ in range(params['DDPG_UPDATE_EPOCHS']):
        ddpg.update()
    session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model()
    session.ddpg_reply_memory = ddpg.replay_memory.get()
    session.save()
    return result_info
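# A worked instance of the piecewise reward above, for a less-is-better metric
# such as latency, with made-up numbers: base is the session's first result, prev
# the most recent one, and the new objective improves on both, so the positive
# branch fires.
base_objective, prev_objective, objective = 100.0, 90.0, 80.0
assert objective - base_objective <= 0
reward = (((2 * base_objective - objective) / base_objective) ** 2 - 1) \
    * abs(2 * prev_objective - objective) / prev_objective
print(round(reward, 3))  # 0.489: bigger improvements over base and prev => bigger reward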
def combine_workload(target_data):
    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]
    latest_pipeline_run = PipelineRun.objects.get(pk=target_data['pipeline_run'])
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    params = JSONUtil.loads(session.hyperparameters)
    cleaned_workload_knob_data = clean_knob_data(workload_knob_data["data"],
                                                 workload_knob_data["columnlabels"],
                                                 newest_result.session)

    X_workload = np.array(cleaned_workload_knob_data[0])
    X_columnlabels = np.array(cleaned_workload_knob_data[1])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'),
                        X_columnlabels, target_data['X_columnlabels'])
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'),
                        y_columnlabels, target_data['y_columnlabels'])

    # Filter Xs by the top-ranked knobs (IMPORTANT_KNOB_NUMBER of them)
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:params['IMPORTANT_KNOB_NUMBER']]
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels)
                        if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels)
                      if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))
    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    if ENABLE_DUMMY_ENCODER:
        categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                         mapped_workload.dbms)
        dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                     categorical_info['categorical_features'],
                                     categorical_info['cat_columnlabels'],
                                     categorical_info['noncat_columnlabels'])
        X_matrix = dummy_encoder.fit_transform(X_matrix)
        binary_encoder = categorical_info['binary_vars']
        # The two variables below are needed to correctly determine the
        # max/min values on the dummy-encoded columns
        binary_index_set = set(categorical_info['binary_vars'])
        total_dummies = dummy_encoder.total_dummies()
    else:
        dummy_encoder = None
        binary_encoder = None
        binary_index_set = set()
        total_dummies = 0

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    metric_meta = db.target_objectives.get_metric_metadata(
        newest_result.session.dbms.pk, newest_result.session.target_objective)
    lessisbetter = (metric_meta[target_objective].improvement ==
                    db.target_objectives.LESS_IS_BETTER)
    # To maximize a more-is-better objective (e.g., throughput), use
    # gradient descent to minimize its negation
    if not lessisbetter:
        y_scaled = -y_scaled

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=binary_encoder,
        init_flip_prob=params['INIT_FLIP_PROB'],
        flip_prob_decay=params['FLIP_PROB_DECAY'])

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    X_scaler_matrix = np.zeros([1, X_scaled.shape[1]])

    session_knobs = SessionKnob.objects.get_knobs_for_session(newest_result.session)

    # Set min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            for knob in session_knobs:
                if X_columnlabels[i] == knob["name"]:
                    X_scaler_matrix[0][i] = knob["minval"]
                    col_min = X_scaler.transform(X_scaler_matrix)[0][i]
                    X_scaler_matrix[0][i] = knob["maxval"]
                    col_max = X_scaler.transform(X_scaler_matrix)[0][i]
        X_min[i] = col_min
        X_max[i] = col_max

    return X_columnlabels, X_scaler, X_scaled, y_scaled, X_max, X_min, \
        dummy_encoder, constraint_helper

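# Illustrative sketch (not part of the original task code): the dups_filter
# loop in combine_workload keeps a workload row only if its knob vector does
# not also appear in the target data, so overlapping configurations are
# represented by the target's own measurements. A standalone version of that
# filtering step, assuming plain 2D numpy arrays:
def _demo_drop_target_duplicates(X_workload, y_workload, X_target):
    """Drop workload rows whose knob vectors also appear in X_target."""
    import numpy as np
    target_row_tups = {tuple(row) for row in X_target}
    dups_filter = np.array([tuple(row) not in target_row_tups
                            for row in X_workload])
    return X_workload[dups_filter, :], y_workload[dups_filter, :]
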
def train_ddpg(result_id):
    LOG.info('Add training data to ddpg and train ddpg')
    result = Result.objects.get(pk=result_id)
    session = Result.objects.get(pk=result_id).session
    session_results = Result.objects.filter(session=session,
                                            creation_time__lt=result.creation_time)
    result_info = {}
    result_info['newest_result_id'] = result_id
    if len(session_results) == 0:
        LOG.info('No previous result. Abort.')
        return result_info

    # Extract data from result
    result = Result.objects.filter(pk=result_id)
    base_result_id = session_results[0].pk
    base_result = Result.objects.filter(pk=base_result_id)

    agg_data = DataUtil.aggregate_data(result)
    metric_data = agg_data['y_matrix'].flatten()
    base_metric_data = (DataUtil.aggregate_data(base_result))['y_matrix'].flatten()
    metric_scalar = MinMaxScaler().fit(metric_data.reshape(1, -1))
    normalized_metric_data = metric_scalar.transform(metric_data.reshape(1, -1))[0]

    # Clean knob data
    cleaned_knob_data = clean_knob_data(agg_data['X_matrix'],
                                        agg_data['X_columnlabels'], session)
    knob_data = np.array(cleaned_knob_data[0])
    knob_labels = np.array(cleaned_knob_data[1])
    knob_bounds = np.vstack(DataUtil.get_knob_bounds(knob_labels.flatten(), session))
    knob_data = MinMaxScaler().fit(knob_bounds).transform(knob_data)[0]
    knob_num = len(knob_data)
    metric_num = len(metric_data)
    LOG.info('knob_num: %d, metric_num: %d', knob_num, metric_num)

    # Filter ys by current target objective metric
    result = Result.objects.get(pk=result_id)
    target_objective = result.session.target_objective
    target_obj_idx = [i for i, n in enumerate(agg_data['y_columnlabels'])
                      if n == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))
    objective = metric_data[target_obj_idx]
    base_objective = base_metric_data[target_obj_idx]
    metric_meta = db.target_objectives.get_metric_metadata(
        result.session.dbms.pk, result.session.target_objective)

    # Calculate the reward: normalize the objective by the base result's
    # objective and negate it when a smaller value is better
    objective = objective / base_objective
    if metric_meta[target_objective].improvement == db.target_objectives.LESS_IS_BETTER:
        reward = -objective
    else:
        reward = objective
    LOG.info('reward: %f', reward)

    # Update ddpg
    ddpg = DDPG(n_actions=knob_num, n_states=metric_num, alr=ACTOR_LEARNING_RATE,
                clr=CRITIC_LEARNING_RATE, gamma=0, batch_size=DDPG_BATCH_SIZE)
    if session.ddpg_actor_model and session.ddpg_critic_model:
        ddpg.set_model(session.ddpg_actor_model, session.ddpg_critic_model)
    if session.ddpg_reply_memory:
        ddpg.replay_memory.set(session.ddpg_reply_memory)
    ddpg.add_sample(normalized_metric_data, knob_data, reward, normalized_metric_data)
    for _ in range(25):
        ddpg.update()
    session.ddpg_actor_model, session.ddpg_critic_model = ddpg.get_model()
    session.ddpg_reply_memory = ddpg.replay_memory.get()
    session.save()
    return result_info

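# Illustrative sketch (not part of the original task code): train_ddpg scores
# each result against the session's first ("base") result and flips the sign
# when a smaller objective value is better, so a larger reward is always
# better. A minimal standalone version of that reward rule:
def _demo_ddpg_reward(objective, base_objective, lessisbetter):
    """Compute a DDPG reward for a scalar objective value.

    Dividing by base_objective normalizes out the workload's scale; negation
    makes e.g. latency (less is better) yield higher reward as it decreases.
    """
    reward = objective / base_objective
    return -reward if lessisbetter else reward

# e.g. _demo_ddpg_reward(80.0, 100.0, lessisbetter=True) == -0.8, which is
# greater (better) than _demo_ddpg_reward(120.0, 100.0, True) == -1.2
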
def configuration_recommendation(target_data):
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        target_data_res = {}
        target_data_res['status'] = 'bad'
        target_data_res['info'] = 'WARNING: no training data, the config is generated randomly'
        target_data_res['recommendation'] = target_data['config_recommend']
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by the top-ranked knobs (IMPORTANT_KNOB_NUMBER of them)
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels)
                        if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels)
                      if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))

    metric_meta = MetricCatalog.objects.get_metric_meta(
        newest_result.session.dbms, newest_result.session.target_objective)
    if metric_meta[target_objective].improvement == '(less is better)':
        lessisbetter = True
    else:
        lessisbetter = False

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)
    # The two variables below are needed to correctly determine the max/min
    # values on the dummy-encoded columns
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    knobs_mem = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                           tunable=True, resource=1)
    knobs_mem_catalog = {k.name: k for k in knobs_mem}
    mem_max = newest_result.workload.hardware.memory
    X_mem = np.zeros([1, X_scaled.shape[1]])
    X_default = np.empty(X_scaled.shape[1])

    # Get default knob values
    for i, k_name in enumerate(X_columnlabels):
        k = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                       name=k_name)[0]
        X_default[i] = k.default
    X_default_scaled = X_scaler.transform(X_default.reshape(1, X_default.shape[0]))[0]

    # Determine min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            if X_columnlabels[i] in knobs_mem_catalog:
                X_mem[0][i] = mem_max * 1024 * 1024 * 1024  # mem_max GB
                col_max = X_scaler.transform(X_mem)[0][i]
            # Set the min value to the default value
            # FIXME: support multiple user-selectable methods
            col_min = X_default_scaled[i]
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # To maximize a more-is-better objective (e.g., throughput), use
    # gradient descent to minimize its negation
    if not lessisbetter:
        y_scaled = -y_scaled

    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    i = 0
    while i < TOP_NUM_CONFIG:
        try:
            item = q.get_nowait()
            # TensorFlow breaks if we use the training data points themselves
            # as starting points for GPRGD, so we add a small bias to each
            # starting point. GPR_EPS defaults to 0.001.
            X_samples = np.vstack((X_samples, X_scaled[item[1]] + GPR_EPS))
            i = i + 1
        except queue.Empty:
            break

    model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                  magnitude=DEFAULT_MAGNITUDE,
                  max_train_size=MAX_TRAIN_SIZE,
                  batch_size=BATCH_SIZE,
                  num_threads=NUM_THREADS,
                  learning_rate=DEFAULT_LEARNING_RATE,
                  epsilon=DEFAULT_EPSILON,
                  max_iter=MAX_ITER,
                  sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                  mu_multiplier=DEFAULT_MU_MULTIPLIER)
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
    res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we impose max/min limits during GPRGD training, some precision
    # may be lost: e.g., 0.99...99 >= 1.0 may be True on the scaled data, and
    # after we inverse-transform the scaled data the difference becomes much
    # larger and can no longer be ignored. Here we check the range on the
    # original data directly and make sure the recommended config lies within it.
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: training data size is {}'.format(X_scaled.shape[0])
    return conf_map_res

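# Illustrative sketch (not part of the original task code): the PriorityQueue
# above simply selects the TOP_NUM_CONFIG best configurations seen so far
# (smallest scaled y, since more-is-better metrics were negated) to seed
# GPRGD's gradient descent. A standalone version of that selection:
def _demo_top_k_start_points(y_scaled, k):
    """Return row indices of the k smallest y values, mimicking the queue."""
    import queue
    q = queue.PriorityQueue()
    for x in range(y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))
    idxs = []
    while len(idxs) < k:
        try:
            idxs.append(q.get_nowait()[1])
        except queue.Empty:
            break
    return idxs
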
def map_workload(target_data):
    # Get the latest version of pipeline data that's been computed so far.
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad']:
        assert target_data is not None
        return target_data

    assert latest_pipeline_run is not None

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    target_workload = newest_result.workload
    X_columnlabels = np.array(target_data['X_columnlabels'])
    y_columnlabels = np.array(target_data['y_columnlabels'])

    # Find all pipeline data belonging to the latest version with the same
    # DBMS and hardware as the target
    pipeline_data = PipelineData.objects.filter(
        pipeline_run=latest_pipeline_run,
        workload__dbms=target_workload.dbms,
        workload__hardware=target_workload.hardware)

    # FIXME (dva): we should also compute the global (i.e., overall)
    # ranked_knobs and pruned metrics but we just use those from the first
    # workload for now
    initialized = False
    global_ranked_knobs = None
    global_pruned_metrics = None
    ranked_knob_idxs = None
    pruned_metric_idxs = None

    # Compute workload mapping data for each unique workload
    unique_workloads = pipeline_data.values_list('workload', flat=True).distinct()
    assert len(unique_workloads) > 0
    workload_data = {}
    for unique_workload in unique_workloads:
        # Load knob & metric data for this workload
        knob_data = load_data_helper(pipeline_data, unique_workload,
                                     PipelineTaskType.KNOB_DATA)
        metric_data = load_data_helper(pipeline_data, unique_workload,
                                       PipelineTaskType.METRIC_DATA)
        X_matrix = np.array(knob_data["data"])
        y_matrix = np.array(metric_data["data"])
        rowlabels = np.array(knob_data["rowlabels"])
        assert np.array_equal(rowlabels, metric_data["rowlabels"])

        if not initialized:
            # For now set ranked knobs & pruned metrics to be those computed
            # for the first workload
            global_ranked_knobs = load_data_helper(
                pipeline_data, unique_workload,
                PipelineTaskType.RANKED_KNOBS)[:IMPORTANT_KNOB_NUMBER]
            global_pruned_metrics = load_data_helper(
                pipeline_data, unique_workload, PipelineTaskType.PRUNED_METRICS)
            ranked_knob_idxs = [i for i in range(X_matrix.shape[1])
                                if X_columnlabels[i] in global_ranked_knobs]
            pruned_metric_idxs = [i for i in range(y_matrix.shape[1])
                                  if y_columnlabels[i] in global_pruned_metrics]

            # Filter X & y columnlabels by top ranked_knobs & pruned_metrics
            X_columnlabels = X_columnlabels[ranked_knob_idxs]
            y_columnlabels = y_columnlabels[pruned_metric_idxs]
            initialized = True

        # Filter X & y matrices by top ranked_knobs & pruned_metrics
        X_matrix = X_matrix[:, ranked_knob_idxs]
        y_matrix = y_matrix[:, pruned_metric_idxs]

        # Combine duplicate rows (rows with same knob settings)
        X_matrix, y_matrix, rowlabels = DataUtil.combine_duplicate_rows(
            X_matrix, y_matrix, rowlabels)

        workload_data[unique_workload] = {
            'X_matrix': X_matrix,
            'y_matrix': y_matrix,
            'rowlabels': rowlabels,
        }

    # Stack all X & y matrices for preprocessing
    Xs = np.vstack([entry['X_matrix'] for entry in list(workload_data.values())])
    ys = np.vstack([entry['y_matrix'] for entry in list(workload_data.values())])

    # Scale the X & y values, then compute the deciles for each column in y
    X_scaler = StandardScaler(copy=False)
    X_scaler.fit(Xs)
    y_scaler = StandardScaler(copy=False)
    y_scaler.fit_transform(ys)
    y_binner = Bin(bin_start=1, axis=0)
    y_binner.fit(ys)
    del Xs
    del ys

    # Filter the target's X & y data by the ranked knobs & pruned metrics.
    X_target = target_data['X_matrix'][:, ranked_knob_idxs]
    y_target = target_data['y_matrix'][:, pruned_metric_idxs]

    # Now standardize the target's data and bin it by the deciles we just
    # calculated
    X_target = X_scaler.transform(X_target)
    y_target = y_scaler.transform(y_target)
    y_target = y_binner.transform(y_target)

    scores = {}
    for workload_id, workload_entry in list(workload_data.items()):
        predictions = np.empty_like(y_target)
        X_workload = workload_entry['X_matrix']
        X_scaled = X_scaler.transform(X_workload)
        y_workload = workload_entry['y_matrix']
        y_scaled = y_scaler.transform(y_workload)
        for j, y_col in enumerate(y_scaled.T):
            # Using this workload's data, train a Gaussian process model
            # and then predict the performance of each metric for each of
            # the knob configurations attempted so far by the target.
            y_col = y_col.reshape(-1, 1)
            model = GPRNP(length_scale=DEFAULT_LENGTH_SCALE,
                          magnitude=DEFAULT_MAGNITUDE,
                          max_train_size=MAX_TRAIN_SIZE,
                          batch_size=BATCH_SIZE)
            model.fit(X_scaled, y_col, ridge=DEFAULT_RIDGE)
            predictions[:, j] = model.predict(X_target).ypreds.ravel()
        # Bin each of the predicted metric columns by deciles and then
        # compute the score (i.e., distance) between the target workload
        # and each of the known workloads
        predictions = y_binner.transform(predictions)
        dists = np.sqrt(np.sum(np.square(
            np.subtract(predictions, y_target)), axis=1))
        scores[workload_id] = np.mean(dists)

    # Find the best (minimum) score
    best_score = np.inf
    best_workload_id = None
    for workload_id, similarity_score in list(scores.items()):
        if similarity_score < best_score:
            best_score = similarity_score
            best_workload_id = workload_id
    target_data['mapped_workload'] = (best_workload_id, best_score)
    target_data['scores'] = scores
    return target_data

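# Illustrative sketch (not part of the original task code): the workload-
# mapping score above is just the mean Euclidean distance between the
# decile-binned predicted metrics and the decile-binned target metrics; the
# candidate workload with the smallest score is chosen. A standalone version
# of that final distance computation, assuming two equal-shape 2D arrays:
def _demo_workload_score(binned_predictions, binned_y_target):
    """Mean row-wise Euclidean distance between two binned metric matrices."""
    import numpy as np
    dists = np.sqrt(np.sum(np.square(
        np.subtract(binned_predictions, binned_y_target)), axis=1))
    return np.mean(dists)
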
def configuration_recommendation(target_data):
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        target_data_res = {}
        target_data_res['status'] = 'bad'
        target_data_res['info'] = 'WARNING: no training data, the config is generated randomly'
        target_data_res['recommendation'] = target_data['config_recommend']
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by the top-ranked knobs (IMPORTANT_KNOB_NUMBER of them)
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels)
                        if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels)
                      if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))

    metric_meta = MetricCatalog.objects.get_metric_meta(
        newest_result.session.dbms, newest_result.session.target_objective)
    if metric_meta[target_objective].improvement == '(less is better)':
        lessisbetter = True
    else:
        lessisbetter = False

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)
    # The two variables below are needed to correctly determine the max/min
    # values on the dummy-encoded columns
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    knobs_mem = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                           tunable=True, resource=1)
    knobs_mem_catalog = {k.name: k for k in knobs_mem}
    mem_max = newest_result.workload.hardware.memory
    X_mem = np.zeros([1, X_scaled.shape[1]])
    X_default = np.empty(X_scaled.shape[1])

    # Get default knob values
    for i, k_name in enumerate(X_columnlabels):
        k = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                       name=k_name)[0]
        X_default[i] = k.default
    X_default_scaled = X_scaler.transform(
        X_default.reshape(1, X_default.shape[0]))[0]

    # Determine min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            if X_columnlabels[i] in knobs_mem_catalog:
                X_mem[0][i] = mem_max * 1024 * 1024 * 1024  # mem_max GB
                col_max = min(col_max, X_scaler.transform(X_mem)[0][i])
            # Set the min value to the default value
            # FIXME: support multiple user-selectable methods
            col_min = X_default_scaled[i]
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # To maximize a more-is-better objective (e.g., throughput), use
    # gradient descent to minimize its negation
    if not lessisbetter:
        y_scaled = -y_scaled

    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    i = 0
    while i < TOP_NUM_CONFIG:
        try:
            item = q.get_nowait()
            # TensorFlow breaks if we use the training data points themselves
            # as starting points for GPRGD, so we add a small bias to each
            # starting point (GPR_EPS defaults to 0.001). If the starting
            # point is already at X_max, we subtract the bias instead to make
            # sure it stays within the range.
            dist = sum(np.square(X_max - X_scaled[item[1]]))
            if dist < 0.001:
                X_samples = np.vstack((X_samples, X_scaled[item[1]] - abs(GPR_EPS)))
            else:
                X_samples = np.vstack((X_samples, X_scaled[item[1]] + abs(GPR_EPS)))
            i = i + 1
        except queue.Empty:
            break

    model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                  magnitude=DEFAULT_MAGNITUDE,
                  max_train_size=MAX_TRAIN_SIZE,
                  batch_size=BATCH_SIZE,
                  num_threads=NUM_THREADS,
                  learning_rate=DEFAULT_LEARNING_RATE,
                  epsilon=DEFAULT_EPSILON,
                  max_iter=MAX_ITER,
                  sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                  mu_multiplier=DEFAULT_MU_MULTIPLIER)
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
    res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we impose max/min limits during GPRGD training, some precision
    # may be lost: e.g., 0.99...99 >= 1.0 may be True on the scaled data, and
    # after we inverse-transform the scaled data the difference becomes much
    # larger and can no longer be ignored. Here we check the range on the
    # original data directly and make sure the recommended config lies within it.
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: training data size is {}'.format(
        X_scaled.shape[0])
    return conf_map_res

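# Illustrative sketch (not part of the original task code): this version
# refines the GPR_EPS bias. When a starting point already sits (nearly) at
# X_max, adding the bias would push it out of range, so the bias is
# subtracted instead. A standalone version of that decision for 1D arrays:
def _demo_biased_start_point(x_start, x_max, eps=0.001):
    """Offset a GPRGD starting point by +/-eps, staying within the range."""
    import numpy as np
    dist = np.sum(np.square(x_max - x_start))
    return x_start - abs(eps) if dist < 0.001 else x_start + abs(eps)
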
def configuration_recommendation(recommendation_input):
    target_data, algorithm = recommendation_input
    LOG.info('configuration_recommendation called')

    if target_data['bad'] is True:
        target_data_res = dict(
            status='bad',
            result_id=target_data['newest_result_id'],
            info='WARNING: no training data, the config is generated randomly',
            recommendation=target_data['config_recommend'],
            pipeline_run=target_data['pipeline_run'])
        LOG.debug('%s: Skipping configuration recommendation.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(target_data, pprint=True))
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]
    latest_pipeline_run = PipelineRun.objects.get(pk=target_data['pipeline_run'])
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    cleaned_workload_knob_data = clean_knob_data(workload_knob_data["data"],
                                                 workload_knob_data["columnlabels"],
                                                 newest_result.session)

    X_workload = np.array(cleaned_workload_knob_data[0])
    X_columnlabels = np.array(cleaned_workload_knob_data[1])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by the top-ranked knobs (IMPORTANT_KNOB_NUMBER of them)
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels)
                        if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels)
                      if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))

    metric_meta = db.target_objectives.get_metric_metadata(
        newest_result.session.dbms.pk, newest_result.session.target_objective)
    lessisbetter = (metric_meta[target_objective].improvement ==
                    db.target_objectives.LESS_IS_BETTER)

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)
    # The two variables below are needed to correctly determine the max/min
    # values on the dummy-encoded columns
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    X_scaler_matrix = np.zeros([1, X_scaled.shape[1]])

    session_knobs = SessionKnob.objects.get_knobs_for_session(newest_result.session)

    # Set min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            for knob in session_knobs:
                if X_columnlabels[i] == knob["name"]:
                    X_scaler_matrix[0][i] = knob["minval"]
                    col_min = X_scaler.transform(X_scaler_matrix)[0][i]
                    X_scaler_matrix[0][i] = knob["maxval"]
                    col_max = X_scaler.transform(X_scaler_matrix)[0][i]
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # To maximize a more-is-better objective (e.g., throughput), use
    # gradient descent to minimize its negation
    if not lessisbetter:
        y_scaled = -y_scaled

    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    i = 0
    while i < TOP_NUM_CONFIG:
        try:
            item = q.get_nowait()
            # TensorFlow breaks if we use the training data points themselves
            # as starting points for GPRGD, so we add a small bias to each
            # starting point (GPR_EPS defaults to 0.001). If the starting
            # point is already at X_max, we subtract the bias instead to make
            # sure it stays within the range.
            dist = sum(np.square(X_max - X_scaled[item[1]]))
            if dist < 0.001:
                X_samples = np.vstack((X_samples, X_scaled[item[1]] - abs(GPR_EPS)))
            else:
                X_samples = np.vstack((X_samples, X_scaled[item[1]] + abs(GPR_EPS)))
            i = i + 1
        except queue.Empty:
            break

    session = newest_result.session
    res = None

    if algorithm == AlgorithmType.DNN:
        # Neural network model
        model_nn = NeuralNet(n_input=X_samples.shape[1],
                             batch_size=X_samples.shape[0],
                             explore_iters=DNN_EXPLORE_ITER,
                             noise_scale_begin=DNN_NOISE_SCALE_BEGIN,
                             noise_scale_end=DNN_NOISE_SCALE_END,
                             debug=DNN_DEBUG,
                             debug_interval=DNN_DEBUG_INTERVAL)
        if session.dnn_model is not None:
            model_nn.set_weights_bin(session.dnn_model)
        model_nn.fit(X_scaled, y_scaled, fit_epochs=DNN_TRAIN_ITER)
        res = model_nn.recommend(X_samples, X_min, X_max,
                                 explore=DNN_EXPLORE, recommend_epochs=MAX_ITER)
        session.dnn_model = model_nn.get_weights_bin()
        session.save()

    elif algorithm == AlgorithmType.GPR:
        # Default GPR model
        model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                      magnitude=DEFAULT_MAGNITUDE,
                      max_train_size=MAX_TRAIN_SIZE,
                      batch_size=BATCH_SIZE,
                      num_threads=NUM_THREADS,
                      learning_rate=DEFAULT_LEARNING_RATE,
                      epsilon=DEFAULT_EPSILON,
                      max_iter=MAX_ITER,
                      sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                      mu_multiplier=DEFAULT_MU_MULTIPLIER)
        model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
        res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we impose max/min limits during GPRGD training, some precision
    # may be lost: e.g., 0.99...99 >= 1.0 may be True on the scaled data, and
    # after we inverse-transform the scaled data the difference becomes much
    # larger and can no longer be ignored. Here we check the range on the
    # original data directly and make sure the recommended config lies within it.
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = dict(status='good',
                        result_id=target_data['newest_result_id'],
                        recommendation=conf_map,
                        info='INFO: training data size is {}'.format(
                            X_scaled.shape[0]),
                        pipeline_run=latest_pipeline_run.pk)
    LOG.debug('%s: Finished selecting the next config.\n\ndata=%s\n',
              AlgorithmType.name(algorithm),
              JSONUtil.dumps(conf_map_res, pprint=True))
    return conf_map_res

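# Illustrative sketch (not part of the original task code): after inverse-
# transforming the optimizer's output, the recommended config is clamped
# element-wise back into the original (unscaled) knob ranges, since the
# scaling round-trip can leak small amounts of precision:
def _demo_clip_config(best_config, x_min_inv, x_max_inv):
    """Clamp each knob value into [x_min_inv[i], x_max_inv[i]]."""
    import numpy as np
    return np.maximum(np.minimum(best_config, x_max_inv), x_min_inv)
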
def configuration_recommendation(target_data):
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        target_data_res = {}
        target_data_res['status'] = 'bad'
        target_data_res['info'] = 'WARNING: no training data, the config is generated randomly'
        target_data_res['recommendation'] = target_data['config_recommend']
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by top 10 ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:10]  # FIXME
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels)
                        if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels)
                      if cl == target_objective]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))
    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data
    dups_filter = np.ones(X_workload.shape[0], dtype=bool)
    target_row_tups = [tuple(row) for row in X_target]
    for i, row in enumerate(X_workload):
        if tuple(row) in target_row_tups:
            dups_filter[i] = False
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs then scale
    X_matrix = np.vstack([X_target, X_workload])
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_target_scaler = None
        y_workload_scaler = StandardScaler()
        y_matrix = np.vstack([y_target, y_workload])
        y_scaled = y_workload_scaler.fit_transform(y_matrix)
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaler = StandardScaler()
            y_workload_scaler = StandardScaler()
            y_target_scaled = y_target_scaler.fit_transform(y_target)
            y_workload_scaled = y_workload_scaler.fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            y_target_scaler = None
            y_workload_scaler = StandardScaler()
            y_scaled = y_workload_scaler.fit_transform(y_target)

    # FIXME (dva): check if these are good values for the ridge
    ridge = np.empty(X_scaled.shape[0])
    ridge[:X_target.shape[0]] = 0.01
    ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = 20
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    for i in range(X_scaled.shape[1]):
        col_min = X_scaled[:, i].min()
        col_max = X_scaled[:, i].max()
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # FIXME: maximizing the throughput is hardcoded here
    # Use gradient descent to minimize -throughput
    y_scaled = -y_scaled

    model = GPRGD()
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge)
    res = model.predict(X_samples)

    # FIXME: whether we select the min or the max for the best config
    # depends on the target objective
    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = {}
    conf_map_res['status'] = 'good'
    conf_map_res['recommendation'] = conf_map
    conf_map_res['info'] = 'INFO: training data size is {}'.format(X_scaled.shape[0])
    return conf_map_res

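# Illustrative sketch (not part of the original task code): candidate
# configurations are drawn uniformly at random, column by column, between
# each knob's observed (scaled) min and max. The "smarter sampling" the
# FIXME above asks for would replace this step:
def _demo_uniform_samples(x_min, x_max, num_samples):
    """x_min/x_max: 1D arrays of per-column bounds; returns (num_samples, d)."""
    import numpy as np
    d = x_min.shape[0]
    samples = np.empty((num_samples, d))
    for i in range(d):
        samples[:, i] = np.random.rand(num_samples) * (x_max[i] - x_min[i]) + x_min[i]
    return samples
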
def run_background_tasks():
    start_ts = time.time()
    LOG.info("Starting background tasks...")
    # Find the modified and unmodified workloads; we only need to recompute
    # data for the modified workloads.
    modified_workloads = Workload.objects.filter(status=WorkloadStatusType.MODIFIED)
    num_modified = modified_workloads.count()
    non_modified_workloads = Workload.objects.filter(status=WorkloadStatusType.PROCESSED)
    non_modified_workloads = list(non_modified_workloads.values_list('pk', flat=True))
    last_pipeline_run = PipelineRun.objects.get_latest()
    LOG.debug("Workloads: # modified: %s, # processed: %s, # total: %s",
              num_modified, len(non_modified_workloads),
              Workload.objects.all().count())

    if num_modified == 0:
        # No previous workload data yet. Try again later.
        LOG.info("No modified workload data yet. Ending background tasks.")
        return

    # Create a new entry in the PipelineRun table to store the output of each
    # of the background tasks
    pipeline_run_obj = PipelineRun(start_time=now(), end_time=None)
    pipeline_run_obj.save()

    for i, workload in enumerate(modified_workloads):
        workload.status = WorkloadStatusType.PROCESSING
        workload.save()
        wkld_results = Result.objects.filter(workload=workload)
        num_wkld_results = wkld_results.count()
        workload_name = '{}@{}.{}'.format(workload.dbms.key, workload.project.name,
                                          workload.name)

        LOG.info("Starting workload %s (%s/%s, # results: %s)...", workload_name,
                 i + 1, num_modified, num_wkld_results)

        if num_wkld_results == 0:
            # Delete the workload
            LOG.info("Deleting workload %s because it has no results.", workload_name)
            workload.delete()
            continue

        if num_wkld_results < MIN_WORKLOAD_RESULTS_COUNT:
            # Check that there are enough results in the workload
            LOG.info("Not enough results in workload %s (# results: %s, "
                     "# required: %s).", workload_name, num_wkld_results,
                     MIN_WORKLOAD_RESULTS_COUNT)
            workload.status = WorkloadStatusType.PROCESSED
            workload.save()
            continue

        LOG.info("Aggregating data for workload %s...", workload_name)
        # Aggregate the knob & metric data for this workload
        knob_data, metric_data = aggregate_data(wkld_results)
        LOG.debug("Aggregated knob data: rowlabels=%s, columnlabels=%s, data=%s.",
                  len(knob_data['rowlabels']), len(knob_data['columnlabels']),
                  knob_data['data'].shape)
        LOG.debug("Aggregated metric data: rowlabels=%s, columnlabels=%s, data=%s.",
                  len(metric_data['rowlabels']), len(metric_data['columnlabels']),
                  metric_data['data'].shape)
        LOG.info("Done aggregating data for workload %s.", workload_name)

        num_valid_results = knob_data['data'].shape[0]  # pylint: disable=unsubscriptable-object
        if num_valid_results < MIN_WORKLOAD_RESULTS_COUNT:
            # Check that there are enough valid results in the workload
            LOG.info("Not enough valid results in workload %s (# valid results: "
                     "%s, # required: %s).", workload_name, num_valid_results,
                     MIN_WORKLOAD_RESULTS_COUNT)
            workload.status = WorkloadStatusType.PROCESSED
            workload.save()
            continue

        # knob_data and metric_data are 2D numpy arrays. Convert them into
        # JSON-friendly (nested) lists and then save them as new PipelineData
        # objects.
        knob_data_copy = copy.deepcopy(knob_data)
        knob_data_copy['data'] = knob_data_copy['data'].tolist()
        knob_data_copy = JSONUtil.dumps(knob_data_copy)
        knob_entry = PipelineData(pipeline_run=pipeline_run_obj,
                                  task_type=PipelineTaskType.KNOB_DATA,
                                  workload=workload,
                                  data=knob_data_copy,
                                  creation_time=now())
        knob_entry.save()

        metric_data_copy = copy.deepcopy(metric_data)
        metric_data_copy['data'] = metric_data_copy['data'].tolist()
        metric_data_copy = JSONUtil.dumps(metric_data_copy)
        metric_entry = PipelineData(pipeline_run=pipeline_run_obj,
                                    task_type=PipelineTaskType.METRIC_DATA,
                                    workload=workload,
                                    data=metric_data_copy,
                                    creation_time=now())
        metric_entry.save()

        # Execute the Workload Characterization task to compute the list of
        # pruned metrics for this workload and save them in a new PipelineData
        # object.
        LOG.info("Pruning metrics for workload %s...", workload_name)
        pruned_metrics = run_workload_characterization(metric_data=metric_data,
                                                       dbms=workload.dbms)
        LOG.info("Done pruning metrics for workload %s (# pruned metrics: %s).\n\n"
                 "Pruned metrics: %s\n", workload_name, len(pruned_metrics),
                 pruned_metrics)
        pruned_metrics_entry = PipelineData(
            pipeline_run=pipeline_run_obj,
            task_type=PipelineTaskType.PRUNED_METRICS,
            workload=workload,
            data=JSONUtil.dumps(pruned_metrics),
            creation_time=now())
        pruned_metrics_entry.save()

        # Workload target objective data
        ranked_knob_metrics = sorted(wkld_results.distinct('session').values_list(
            'session__target_objective', flat=True).distinct())
        LOG.debug("Target objectives for workload %s: %s", workload_name,
                  ', '.join(ranked_knob_metrics))

        if KNOB_IDENT_USE_PRUNED_METRICS:
            ranked_knob_metrics = sorted(set(ranked_knob_metrics) |
                                         set(pruned_metrics))

        # Use the set of metrics to filter the metric_data
        metric_idxs = [i for i, metric_name in enumerate(metric_data['columnlabels'])
                       if metric_name in ranked_knob_metrics]
        ranked_metric_data = {
            'data': metric_data['data'][:, metric_idxs],
            'rowlabels': copy.deepcopy(metric_data['rowlabels']),
            'columnlabels': [metric_data['columnlabels'][i] for i in metric_idxs]
        }

        # Execute the Knob Identification task to compute an ordered list of
        # knobs ranked by their impact on the DBMS's performance. Save them in
        # a new PipelineData object.
        LOG.info("Ranking knobs for workload %s (use pruned metric data: %s)...",
                 workload_name, KNOB_IDENT_USE_PRUNED_METRICS)
        sessions = []
        for result in wkld_results:
            if result.session not in sessions:
                sessions.append(result.session)
        rank_knob_data = copy.deepcopy(knob_data)
        rank_knob_data['data'], rank_knob_data['columnlabels'] = \
            DataUtil.clean_knob_data(knob_data['data'], knob_data['columnlabels'],
                                     sessions)
        ranked_knobs = run_knob_identification(knob_data=rank_knob_data,
                                               metric_data=ranked_metric_data,
                                               dbms=workload.dbms)
        LOG.info("Done ranking knobs for workload %s (# ranked knobs: %s).\n\n"
                 "Ranked knobs: %s\n", workload_name, len(ranked_knobs),
                 ranked_knobs)
        ranked_knobs_entry = PipelineData(
            pipeline_run=pipeline_run_obj,
            task_type=PipelineTaskType.RANKED_KNOBS,
            workload=workload,
            data=JSONUtil.dumps(ranked_knobs),
            creation_time=now())
        ranked_knobs_entry.save()

        workload.status = WorkloadStatusType.PROCESSED
        workload.save()
        LOG.info("Done processing workload %s (%s/%s).", workload_name, i + 1,
                 num_modified)

    LOG.info("Finished processing %s modified workloads.", num_modified)

    non_modified_workloads = Workload.objects.filter(pk__in=non_modified_workloads)
    # Update the latest pipeline data for the non-modified workloads to point
    # at this pipeline run
    PipelineData.objects.filter(workload__in=non_modified_workloads,
                                pipeline_run=last_pipeline_run) \
        .update(pipeline_run=pipeline_run_obj)

    # Set end_time to the current time to indicate that we are done running
    # the background tasks
    pipeline_run_obj.end_time = now()
    pipeline_run_obj.save()
    save_execution_time(start_ts, "run_background_tasks")
    LOG.info("Finished background tasks (%.0f seconds).", time.time() - start_ts)
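# Illustrative sketch (not part of the original task code): PipelineData
# stores matrices as JSON text, so the 2D numpy arrays are deep-copied and
# converted to nested lists before serialization, then converted back with
# np.array on load. The real code uses JSONUtil and the PipelineData model;
# the stdlib json module stands in for both here:
def _demo_pipeline_data_roundtrip(data_dict):
    """data_dict: {'data': 2D np.ndarray, 'rowlabels': [...], 'columnlabels': [...]}"""
    import copy
    import json
    import numpy as np
    data_copy = copy.deepcopy(data_dict)
    data_copy['data'] = data_copy['data'].tolist()
    serialized = json.dumps(data_copy)           # what would be saved in PipelineData.data
    loaded = json.loads(serialized)
    loaded['data'] = np.array(loaded['data'])    # back to a matrix on the way out
    return loaded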