def test_no_categoricals(self):
    """Encoding data without categorical columns leaves values and labels as given."""
    data = [[1, 2, 3], [4, 5, 6]]
    plain_labels = ['a', 'b', 'c']

    # Empty n_values / categorical_features / cat_columnlabels: nothing to encode.
    encoder = DummyEncoder([], [], [], plain_labels)
    encoded = encoder.fit_transform(data)
    labels_after = encoder.new_labels

    self.assertTrue(np.all(data == encoded))
    self.assertEqual(plain_labels, labels_after)
def test_inverse_transform(self):
    """Encoding then decoding a matrix with one categorical column restores it."""
    raw = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
    # Column 1 is the single categorical feature with three values; the
    # expected matrix carries its three dummy columns first.
    one_hot_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
    encoder = DummyEncoder([3], [1], ['label'], ['a', 'b'])

    encoded = encoder.fit_transform(raw)
    self.assertTrue(np.all(encoded == one_hot_expected))

    decoded = encoder.inverse_transform(encoded)
    self.assertTrue(np.all(raw == decoded))
def test_apply_constraints_unscaled(self):
    """apply_constraints on unscaled input turns the dummy block into a one-hot."""
    encoder = DummyEncoder([3], [0], ['a'], [])
    encoder.fit([[0, 17]])
    helper = ParamConstraintHelper(StandardScaler(), encoder,
                                   init_flip_prob=0.3, flip_prob_decay=0.5)

    # The first three entries are the dummies of the categorical variable;
    # the last entry is the non-categorical value.
    noisy_row = [0.1, 0.2, 0.3, 17]
    wanted_row = [0, 0, 1, 17]

    corrected = helper.apply_constraints(noisy_row, scaled=False, rescale=False)
    self.assertTrue(np.all(corrected == wanted_row))
def test_mixed_categorical(self):
    """One categorical column among numeric ones is expanded and relabelled."""
    raw = [[1, 0, 2], [1, 1, 2], [1, 2, 2]]
    encoder = DummyEncoder([3], [1], ['label'], ['a', 'b'])

    # Dummy columns (and their generated labels) come before the untouched ones.
    encoded_expected = [[1, 0, 0, 1, 2], [0, 1, 0, 1, 2], [0, 0, 1, 1, 2]]
    labels_expected = ['label____0', 'label____1', 'label____2', 'a', 'b']

    encoded = encoder.fit_transform(raw)
    labels_after = encoder.new_labels

    self.assertTrue(np.all(encoded_expected == encoded))
    self.assertEqual(labels_expected, labels_after)
def test_randomize_categorical_features(self):
    """Repeated randomization keeps valid one-hot blocks and reaches every level.

    Variable 0 is categorical with 3 values, variable 1 is not categorical and
    variable 2 is categorical with 4 values.
    """
    levels_first = 3
    levels_second = 4
    encoder = DummyEncoder([levels_first, levels_second], [0, 2], ['a', 'b'], [])
    encoder.fit([[0, 17, 0]])
    helper = ParamConstraintHelper(StandardScaler(), encoder,
                                   init_flip_prob=0.3, flip_prob_decay=0.5)

    # A sample encoded feature row; note that the non-categorical variable
    # sits on the right.
    row = np.array([0, 0, 1, 1, 0, 0, 0, 17], dtype=float)

    # One (slice of the row, per-level hit counter) pair per categorical variable.
    blocks = [
        (slice(0, levels_first), np.zeros(levels_first)),
        (slice(levels_first, levels_first + levels_second), np.zeros(levels_second)),
    ]

    for _ in range(20):
        # Possibly flip the categorical features.
        row = helper.randomize_categorical_features(row, scaled=False, rescale=False)

        for span, hits in blocks:
            dummies = row[span]
            # Every dummy must be exactly 0 or 1, and exactly one must be set.
            zero_or_one = np.logical_or(dummies == 0, dummies == 1)
            self.assertTrue(np.all(zero_or_one))
            self.assertEqual(np.sum(dummies), 1)
            hits[np.argmax(dummies)] += 1

        # The non-categorical value must be left alone.
        self.assertEqual(row[-1], 17)

    # Each level of each categorical variable must have been selected at least once.
    for _, hits in blocks:
        for count in hits:
            self.assertTrue(count > 0)
def test_apply_constraints(self):
    """apply_constraints on a perturbed scaled row recovers the original scaled row."""
    encoder = DummyEncoder([3], [0], ['a'], [])
    encoder.fit([[0, 17]])

    scaler = StandardScaler()
    samples = np.array([[0, 0, 1, 17], [1, 0, 0, 17]], dtype=float)
    scaled_samples = scaler.fit_transform(samples)
    helper = ParamConstraintHelper(scaler, encoder,
                                   init_flip_prob=0.3, flip_prob_decay=0.5)

    reference = scaled_samples[0]
    perturbed = reference.copy()
    perturbed[:3] += 0.1  # should still represent [0, 0, 1] encoding

    corrected = helper.apply_constraints(perturbed)
    self.assertTrue(np.all(reference == corrected))
def configuration_recommendation(target_data):
    """Recommend the next knob configuration for the target workload.

    Combines the target's own results with the data of its mapped workload,
    keeps only the top-ranked knobs and the session's target objective, fits a
    GPRGD model on the scaled data and returns the configuration with the
    smallest predicted (sign-adjusted) objective.

    Parameters:
        target_data: dictionary read with the keys 'bad', 'config_recommend',
            'mapped_workload', 'newest_result_id', 'X_matrix', 'y_matrix',
            'rowlabels', 'X_columnlabels' and 'y_columnlabels'.

    Returns:
        A dictionary with the keys 'status' ('bad' or 'good'), 'info' and
        'recommendation'. When target_data['bad'] is True the recommendation
        is target_data['config_recommend'] passed through unchanged.

    Raises:
        Exception: if the workload and target column labels differ, or if the
            target objective is not found exactly once among the metrics.
    """
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        return {
            'status': 'bad',
            'info': 'WARNING: no training data, the config is generated randomly',
            'recommendation': target_data['config_recommend'],
        }

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by the top IMPORTANT_KNOB_NUMBER ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [
        i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs
    ]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = newest_result.session.target_objective
    target_obj_idx = [
        i for i, cl in enumerate(y_columnlabels) if cl == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))

    metric_meta = MetricCatalog.objects.get_metric_meta(
        newest_result.session.dbms, newest_result.session.target_objective)
    lessisbetter = metric_meta[target_objective].improvement == '(less is better)'

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data. A set gives constant-time membership tests
    # instead of scanning a list of target rows for every workload row.
    target_row_tups = {tuple(row) for row in X_target}
    dups_filter = np.array(
        [tuple(row) not in target_row_tups for row in X_workload], dtype=bool)
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)

    # The two variables below are needed to correctly determine max/min on dummies
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)

    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) together with the workload's
        # y values. I'm not sure if 5 is the right cutoff.
        y_scaled = StandardScaler().fit_transform(
            np.vstack([y_target, y_workload]))
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaled = StandardScaler().fit_transform(y_target)
            y_workload_scaled = StandardScaler().fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            # Fall back to a single scaler. Target and workload rows are
            # scaled together so that y_scaled keeps exactly one row per row
            # of X_scaled (which stacks target rows, then workload rows).
            y_scaled = StandardScaler().fit_transform(
                np.vstack([y_target, y_workload]))

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    num_columns = X_scaled.shape[1]
    X_samples = np.empty((num_samples, num_columns))
    X_min = np.empty(num_columns)
    X_max = np.empty(num_columns)

    knobs_mem = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                           tunable=True,
                                           resource=1)
    knobs_mem_catalog = {k.name: k for k in knobs_mem}
    mem_max = newest_result.workload.hardware.memory
    X_mem = np.zeros([1, num_columns])
    X_default = np.empty(num_columns)

    # Get default knob values
    # NOTE(review): X_default is sized by the encoded column count but filled
    # per entry of X_columnlabels; confirm the two line up when categorical
    # knobs are dummy encoded.
    for i, k_name in enumerate(X_columnlabels):
        k = KnobCatalog.objects.filter(dbms=newest_result.session.dbms,
                                       name=k_name)[0]
        X_default[i] = k.default

    X_default_scaled = X_scaler.transform(
        X_default.reshape(1, X_default.shape[0]))[0]

    # Determine min/max for knob values
    for i in range(num_columns):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            if X_columnlabels[i] in knobs_mem_catalog:
                X_mem[0][i] = mem_max * 1024 * 1024 * 1024  # mem_max GB
                col_max = min(col_max, X_scaler.transform(X_mem)[0][i])

            # Set min value to the default value
            # FIXME: support multiple methods can be selected by users
            col_min = X_default_scaled[i]

        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    # Pop the TOP_NUM_CONFIG best observed rows (smallest y first) and use
    # them as additional starting points.
    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    for _ in range(TOP_NUM_CONFIG):
        try:
            item = q.get_nowait()
        except queue.Empty:
            break
        # Tensorflow get broken if we use the training data points as
        # starting points for GPRGD. We add a small bias for the
        # starting points. GPR_EPS default value is 0.001
        # if the starting point is X_max, we minus a small bias to
        # make sure it is within the range.
        start_point = X_scaled[item[1]]
        dist = sum(np.square(X_max - start_point))
        if dist < 0.001:
            X_samples = np.vstack((X_samples, start_point - abs(GPR_EPS)))
        else:
            X_samples = np.vstack((X_samples, start_point + abs(GPR_EPS)))

    model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                  magnitude=DEFAULT_MAGNITUDE,
                  max_train_size=MAX_TRAIN_SIZE,
                  batch_size=BATCH_SIZE,
                  num_threads=NUM_THREADS,
                  learning_rate=DEFAULT_LEARNING_RATE,
                  epsilon=DEFAULT_EPSILON,
                  max_iter=MAX_ITER,
                  sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                  mu_multiplier=DEFAULT_MU_MULTIPLIER)
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
    res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we have max/min limits in the GPRGD training session, it may
    # lose some precision. e.g. 0.99..99 >= 1.0 may be True on the scaled data;
    # when we inversely transform the scaled data, the difference becomes much
    # larger and cannot be ignored. Here we check the range on the original
    # data directly, and make sure the recommended config lies within the range
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    return {
        'status': 'good',
        'recommendation': conf_map,
        'info': 'INFO: training data size is {}'.format(X_scaled.shape[0]),
    }
def _drop_constant_columns(matrix, columnlabels):
    """Collect the columns of `matrix` that are not constant, with their labels.

    A column is kept when at least one of its entries differs from its first
    entry. Returns a list of column arrays reshaped to (n_rows, 1) and the
    parallel list of kept labels.
    """
    kept_columns = []
    kept_labels = []
    for col, cl in zip(matrix.T, columnlabels):
        if np.any(col != col[0]):
            kept_columns.append(col.reshape(-1, 1))
            kept_labels.append(cl)
    return kept_columns, kept_labels


def run_knob_identification(knob_data, metric_data, dbms):
    """Perform knob identification on the knob & metric data.

    When running the lasso algorithm, the knob_data matrix is the set of
    independent variables (X) and the metric_data is the set of dependent
    variables (y).

    Parameters:
        knob_data, metric_data: dictionaries of the form:
            - 'data': 2D numpy matrix of knob/metric data
            - 'rowlabels': a list of identifiers for the rows in the matrix
            - 'columnlabels': a list of the knob/metric names corresponding
              to the columns in the data matrix
        dbms: the foreign key pointing to the target dbms in DBMSCatalog

    Returns:
        The ranked knobs, with dummy-encoded columns consolidated back to
        their original knob names.

    Raises:
        AssertionError: if every knob column is constant.
        ValueError: if every metric column is constant.
    """
    knob_matrix = knob_data['data']
    knob_columnlabels = knob_data['columnlabels']
    metric_matrix = metric_data['data']
    metric_columnlabels = metric_data['columnlabels']

    # Remove constant columns from knob_matrix and metric_matrix
    knob_columns, nonconst_knob_columnlabels = _drop_constant_columns(
        knob_matrix, knob_columnlabels)
    if not knob_columns:
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under `python -O`; the exception type is kept for callers.
        raise AssertionError("Need more data to train the model")
    nonconst_knob_matrix = np.hstack(knob_columns)

    metric_columns, _ = _drop_constant_columns(metric_matrix, metric_columnlabels)
    if not metric_columns:
        # np.hstack([]) would raise a ValueError with an unhelpful message.
        raise ValueError("Need more data to train the model: "
                         "all metric columns are constant")
    nonconst_metric_matrix = np.hstack(metric_columns)

    # Determine which knobs need encoding (enums with >2 possible values)
    categorical_info = DataUtil.dummy_encoder_helper(nonconst_knob_columnlabels,
                                                     dbms)
    # Encode categorical variables first (at least, before standardizing)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    encoded_knob_matrix = dummy_encoder.fit_transform(nonconst_knob_matrix)
    encoded_knob_columnlabels = dummy_encoder.new_labels

    # Standardize values in each column to N(0, 1). The scaler is re-fitted by
    # each fit_transform call, so knobs and metrics are scaled independently.
    standardizer = StandardScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(nonconst_metric_matrix)

    # Shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0], seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

    # Run lasso algorithm
    lasso_model = LassoPath()
    lasso_model.fit(shuffled_knob_matrix, shuffled_metric_matrix,
                    encoded_knob_columnlabels)

    # Consolidate categorical feature columns, and reset to original names
    encoded_knobs = lasso_model.get_ranked_features()
    consolidated_knobs = consolidate_columnlabels(encoded_knobs)

    return consolidated_knobs
def _drop_constant_columns(matrix, columnlabels):
    """Collect the columns of `matrix` that are not constant, with their labels.

    A column is kept when at least one of its entries differs from its first
    entry. Returns a list of column arrays reshaped to (n_rows, 1) and the
    parallel list of kept labels.
    """
    kept_columns = []
    kept_labels = []
    for col, cl in zip(matrix.T, columnlabels):
        if np.any(col != col[0]):
            kept_columns.append(col.reshape(-1, 1))
            kept_labels.append(cl)
    return kept_columns, kept_labels


def run_knob_identification(knob_data, metric_data, dbms):
    """Perform knob identification on the knob & metric data.

    When running the lasso algorithm, the knob_data matrix is the set of
    independent variables (X) and the metric_data is the set of dependent
    variables (y).

    Parameters:
        knob_data, metric_data: dictionaries of the form:
            - 'data': 2D numpy matrix of knob/metric data
            - 'rowlabels': a list of identifiers for the rows in the matrix
            - 'columnlabels': a list of the knob/metric names corresponding
              to the columns in the data matrix
        dbms: the foreign key pointing to the target dbms in DBMSCatalog

    Returns:
        The ranked knobs, with dummy-encoded columns consolidated back to
        their original knob names.

    Raises:
        AssertionError: if every knob column is constant.
        ValueError: if every metric column is constant.
    """
    knob_matrix = knob_data['data']
    knob_columnlabels = knob_data['columnlabels']
    metric_matrix = metric_data['data']
    metric_columnlabels = metric_data['columnlabels']

    # Remove constant columns from knob_matrix and metric_matrix
    knob_columns, nonconst_knob_columnlabels = _drop_constant_columns(
        knob_matrix, knob_columnlabels)
    if not knob_columns:
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under `python -O`; the exception type is kept for callers.
        raise AssertionError("Need more data to train the model")
    nonconst_knob_matrix = np.hstack(knob_columns)

    metric_columns, _ = _drop_constant_columns(metric_matrix, metric_columnlabels)
    if not metric_columns:
        # np.hstack([]) would raise a ValueError with an unhelpful message.
        raise ValueError("Need more data to train the model: "
                         "all metric columns are constant")
    nonconst_metric_matrix = np.hstack(metric_columns)

    # Determine which knobs need encoding (enums with >2 possible values)
    categorical_info = DataUtil.dummy_encoder_helper(
        nonconst_knob_columnlabels, dbms)
    # Encode categorical variables first (at least, before standardizing)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    encoded_knob_matrix = dummy_encoder.fit_transform(nonconst_knob_matrix)
    encoded_knob_columnlabels = dummy_encoder.new_labels

    # Standardize values in each column to N(0, 1). The scaler is re-fitted by
    # each fit_transform call, so knobs and metrics are scaled independently.
    standardizer = StandardScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(
        nonconst_metric_matrix)

    # Shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0],
                                          seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

    # Run lasso algorithm
    lasso_model = LassoPath()
    lasso_model.fit(shuffled_knob_matrix, shuffled_metric_matrix,
                    encoded_knob_columnlabels)

    # Consolidate categorical feature columns, and reset to original names
    encoded_knobs = lasso_model.get_ranked_features()
    consolidated_knobs = consolidate_columnlabels(encoded_knobs)

    return consolidated_knobs
def configuration_recommendation(recommendation_input):
    """Recommend the next knob configuration using the selected algorithm.

    Combines the target's own results with the data of its mapped workload,
    keeps only the top-ranked knobs and the session's target objective, fits
    either a neural network (AlgorithmType.DNN) or a GPRGD model
    (AlgorithmType.GPR) on the scaled data and returns the configuration with
    the smallest predicted (sign-adjusted) objective.

    Parameters:
        recommendation_input: a (target_data, algorithm) pair. target_data is
            a dictionary read with the keys 'bad', 'config_recommend',
            'newest_result_id', 'pipeline_run', 'mapped_workload', 'X_matrix',
            'y_matrix', 'rowlabels', 'X_columnlabels' and 'y_columnlabels'.

    Returns:
        A dictionary with the keys 'status' ('bad' or 'good'), 'result_id',
        'recommendation', 'info' and 'pipeline_run'. When target_data['bad']
        is True the recommendation is target_data['config_recommend'] passed
        through unchanged.

    Raises:
        Exception: if the workload and target column labels differ, if the
            target objective is not found exactly once among the metrics, or
            if `algorithm` is neither AlgorithmType.DNN nor AlgorithmType.GPR.
    """
    target_data, algorithm = recommendation_input
    LOG.info('configuration_recommendation called')

    if target_data['bad'] is True:
        target_data_res = dict(
            status='bad',
            result_id=target_data['newest_result_id'],
            info='WARNING: no training data, the config is generated randomly',
            recommendation=target_data['config_recommend'],
            pipeline_run=target_data['pipeline_run'])
        LOG.debug('%s: Skipping configuration recommendation.\n\ndata=%s\n',
                  AlgorithmType.name(algorithm),
                  JSONUtil.dumps(target_data, pprint=True))
        return target_data_res

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    latest_pipeline_run = PipelineRun.objects.get(
        pk=target_data['pipeline_run'])
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    # Fetched once; the original issued the identical query a second time
    # further down.
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    cleaned_workload_knob_data = clean_knob_data(
        workload_knob_data["data"], workload_knob_data["columnlabels"],
        session)

    X_workload = np.array(cleaned_workload_knob_data[0])
    X_columnlabels = np.array(cleaned_workload_knob_data[1])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by the top IMPORTANT_KNOB_NUMBER ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [
        i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs
    ]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = session.target_objective
    target_obj_idx = [
        i for i, cl in enumerate(y_columnlabels) if cl == target_objective
    ]
    if len(target_obj_idx) == 0:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))

    metric_meta = db.target_objectives.get_metric_metadata(
        session.dbms.pk, session.target_objective)
    lessisbetter = metric_meta[
        target_objective].improvement == db.target_objectives.LESS_IS_BETTER

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data. A set gives constant-time membership tests
    # instead of scanning a list of target rows for every workload row.
    target_row_tups = {tuple(row) for row in X_target}
    dups_filter = np.array(
        [tuple(row) not in target_row_tups for row in X_workload], dtype=bool)
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)

    # The two variables below are needed to correctly determine max/min on dummies
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)

    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) together with the workload's
        # y values. I'm not sure if 5 is the right cutoff.
        y_scaled = StandardScaler().fit_transform(
            np.vstack([y_target, y_workload]))
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaled = StandardScaler().fit_transform(y_target)
            y_workload_scaled = StandardScaler().fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            # Fall back to a single scaler. Target and workload rows are
            # scaled together so that y_scaled keeps exactly one row per row
            # of X_scaled (which stacks target rows, then workload rows).
            y_scaled = StandardScaler().fit_transform(
                np.vstack([y_target, y_workload]))

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=categorical_info['binary_vars'],
        init_flip_prob=INIT_FLIP_PROB,
        flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    num_columns = X_scaled.shape[1]
    X_samples = np.empty((num_samples, num_columns))
    X_min = np.empty(num_columns)
    X_max = np.empty(num_columns)
    X_scaler_matrix = np.zeros([1, num_columns])

    session_knobs = SessionKnob.objects.get_knobs_for_session(session)

    # Set min/max for knob values
    for i in range(num_columns):
        if i < total_dummies or i in binary_index_set:
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            # A session knob with a matching name overrides the observed
            # range with its configured minval/maxval, expressed in scaled
            # units.
            for knob in session_knobs:
                if X_columnlabels[i] == knob["name"]:
                    X_scaler_matrix[0][i] = knob["minval"]
                    col_min = X_scaler.transform(X_scaler_matrix)[0][i]
                    X_scaler_matrix[0][i] = knob["maxval"]
                    col_max = X_scaler.transform(X_scaler_matrix)[0][i]
        X_min[i] = col_min
        X_max[i] = col_max
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    # Pop the TOP_NUM_CONFIG best observed rows (smallest y first) and use
    # them as additional starting points.
    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    for _ in range(TOP_NUM_CONFIG):
        try:
            item = q.get_nowait()
        except queue.Empty:
            break
        # Tensorflow get broken if we use the training data points as
        # starting points for GPRGD. We add a small bias for the
        # starting points. GPR_EPS default value is 0.001
        # if the starting point is X_max, we minus a small bias to
        # make sure it is within the range.
        start_point = X_scaled[item[1]]
        dist = sum(np.square(X_max - start_point))
        if dist < 0.001:
            X_samples = np.vstack((X_samples, start_point - abs(GPR_EPS)))
        else:
            X_samples = np.vstack((X_samples, start_point + abs(GPR_EPS)))

    if algorithm == AlgorithmType.DNN:
        # neural network model
        model_nn = NeuralNet(n_input=X_samples.shape[1],
                             batch_size=X_samples.shape[0],
                             explore_iters=DNN_EXPLORE_ITER,
                             noise_scale_begin=DNN_NOISE_SCALE_BEGIN,
                             noise_scale_end=DNN_NOISE_SCALE_END,
                             debug=DNN_DEBUG,
                             debug_interval=DNN_DEBUG_INTERVAL)
        # Resume from the weights stored on the session, when there are any
        if session.dnn_model is not None:
            model_nn.set_weights_bin(session.dnn_model)
        model_nn.fit(X_scaled, y_scaled, fit_epochs=DNN_TRAIN_ITER)
        res = model_nn.recommend(X_samples, X_min, X_max,
                                 explore=DNN_EXPLORE, recommend_epochs=MAX_ITER)
        session.dnn_model = model_nn.get_weights_bin()
        session.save()

    elif algorithm == AlgorithmType.GPR:
        # default gpr model
        model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                      magnitude=DEFAULT_MAGNITUDE,
                      max_train_size=MAX_TRAIN_SIZE,
                      batch_size=BATCH_SIZE,
                      num_threads=NUM_THREADS,
                      learning_rate=DEFAULT_LEARNING_RATE,
                      epsilon=DEFAULT_EPSILON,
                      max_iter=MAX_ITER,
                      sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                      mu_multiplier=DEFAULT_MU_MULTIPLIER)
        model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
        res = model.predict(X_samples, constraint_helper=constraint_helper)

    else:
        # Previously `res` stayed None here and the failure surfaced as an
        # AttributeError on `res.minl` below.
        raise Exception(
            'Unsupported algorithm for configuration recommendation: {}'.format(
                algorithm))

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we have max/min limits in the GPRGD training session, it may
    # lose some precision. e.g. 0.99..99 >= 1.0 may be True on the scaled data;
    # when we inversely transform the scaled data, the difference becomes much
    # larger and cannot be ignored. Here we check the range on the original
    # data directly, and make sure the recommended config lies within the range
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    conf_map_res = dict(
        status='good',
        result_id=target_data['newest_result_id'],
        recommendation=conf_map,
        info='INFO: training data size is {}'.format(X_scaled.shape[0]),
        pipeline_run=latest_pipeline_run.pk)
    LOG.debug('%s: Finished selecting the next config.\n\ndata=%s\n',
              AlgorithmType.name(algorithm),
              JSONUtil.dumps(conf_map_res, pprint=True))

    return conf_map_res
def combine_workload(target_data):
    """Merge target and mapped-workload data into scaled training matrices.

    Loads the knob data, metric data and ranked knobs stored for the mapped
    workload, restricts the knobs to the top ``IMPORTANT_KNOB_NUMBER`` ranked
    ones (taken from the session hyperparameters) and the metrics to the
    session's target objective, removes duplicated rows, optionally
    dummy-encodes the knobs, scales X and y, and computes per-column bounds.

    Args:
        target_data: mapping that provides the keys ``'mapped_workload'``,
            ``'pipeline_run'``, ``'newest_result_id'``, ``'X_matrix'``,
            ``'y_matrix'``, ``'rowlabels'``, ``'X_columnlabels'`` and
            ``'y_columnlabels'``.

    Returns:
        The tuple ``(X_columnlabels, X_scaler, X_scaled, y_scaled, X_max,
        X_min, dummy_encoder, constraint_helper)``. Note that ``X_max``
        precedes ``X_min``. ``dummy_encoder`` is ``None`` when
        ``ENABLE_DUMMY_ENCODER`` is false.

    Raises:
        Exception: if the workload and target column labels differ, or if the
            target objective does not occur exactly once among the metrics.
    """
    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    latest_pipeline_run = PipelineRun.objects.get(
        pk=target_data['pipeline_run'])
    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    # The newest result and its session are fetched once and reused below.
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    params = JSONUtil.loads(session.hyperparameters)
    cleaned_workload_knob_data = clean_knob_data(
        workload_knob_data["data"],
        workload_knob_data["columnlabels"],
        session)

    X_workload = np.array(cleaned_workload_knob_data[0])
    X_columnlabels = np.array(cleaned_workload_knob_data[1])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'),
                        X_columnlabels, target_data['X_columnlabels'])
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'),
                        y_columnlabels, target_data['y_columnlabels'])

    # Filter Xs by the top-ranked knobs (count given by IMPORTANT_KNOB_NUMBER)
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(
        ranked_knobs.data)[:params['IMPORTANT_KNOB_NUMBER']]
    ranked_knob_idxs = [
        i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs
    ]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = session.target_objective
    target_obj_idx = [
        i for i, cl in enumerate(y_columnlabels) if cl == target_objective
    ]
    if not target_obj_idx:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(
            ('Found {} instances of target objective in '
             'metrics (target_obj={})').format(len(target_obj_idx),
                                               target_objective))

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data. A set gives O(1) membership tests.
    target_row_tups = {tuple(row) for row in X_target}
    dups_filter = np.array(
        [tuple(row) not in target_row_tups for row in X_workload],
        dtype=bool)
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    if ENABLE_DUMMY_ENCODER:
        categorical_info = DataUtil.dummy_encoder_helper(
            X_columnlabels, mapped_workload.dbms)
        dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                     categorical_info['categorical_features'],
                                     categorical_info['cat_columnlabels'],
                                     categorical_info['noncat_columnlabels'])
        X_matrix = dummy_encoder.fit_transform(X_matrix)
        binary_encoder = categorical_info['binary_vars']
        # The two variables below are needed to determine max/min correctly
        # on the dummy columns.
        binary_index_set = set(categorical_info['binary_vars'])
        total_dummies = dummy_encoder.total_dummies()
    else:
        dummy_encoder = None
        binary_encoder = None
        binary_index_set = set()
        total_dummies = 0

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_scaled = StandardScaler().fit_transform(
            np.vstack([y_target, y_workload]))
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaled = StandardScaler().fit_transform(y_target)
            y_workload_scaled = StandardScaler().fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            # Fall back to one scaler over all rows so that y_scaled keeps
            # one row per row of X_scaled (target rows first, then workload
            # rows). With an empty workload this equals scaling y_target.
            y_scaled = StandardScaler().fit_transform(
                np.vstack([y_target, y_workload]))

    metric_meta = db.target_objectives.get_metric_metadata(
        session.dbms.pk, session.target_objective)
    lessisbetter = metric_meta[
        target_objective].improvement == db.target_objectives.LESS_IS_BETTER
    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(
        scaler=X_scaler,
        encoder=dummy_encoder,
        binary_vars=binary_encoder,
        init_flip_prob=params['INIT_FLIP_PROB'],
        flip_prob_decay=params['FLIP_PROB_DECAY'])

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    X_scaler_matrix = np.zeros([1, X_scaled.shape[1]])

    session_knobs = SessionKnob.objects.get_knobs_for_session(session)

    # Set min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            # Dummy and binary columns are bounded to [0, 1].
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            # NOTE(review): X_columnlabels holds the pre-encoding labels but
            # is indexed with an encoded-column index here; presumably the
            # two line up for non-dummy columns -- confirm against
            # DummyEncoder.
            for knob in session_knobs:
                if X_columnlabels[i] == knob["name"]:
                    # Push the session's min/max for this knob through the
                    # scaler to obtain the bounds in scaled space.
                    X_scaler_matrix[0][i] = knob["minval"]
                    col_min = X_scaler.transform(X_scaler_matrix)[0][i]
                    X_scaler_matrix[0][i] = knob["maxval"]
                    col_max = X_scaler.transform(X_scaler_matrix)[0][i]
        X_min[i] = col_min
        X_max[i] = col_max

    return X_columnlabels, X_scaler, X_scaled, y_scaled, X_max, X_min,\
        dummy_encoder, constraint_helper
def configuration_recommendation(target_data):
    """Recommend the next knob configuration for the target workload.

    Combines the target data with the data of its mapped workload, keeps the
    top ``IMPORTANT_KNOB_NUMBER`` ranked knobs and the session's target
    objective metric, dummy-encodes and scales the data, fits a ``GPRGD``
    model and returns the configuration with the lowest predicted value.

    Args:
        target_data: mapping that provides the key ``'bad'`` and, depending
            on its value, either ``'config_recommend'`` or the keys
            ``'mapped_workload'``, ``'newest_result_id'``, ``'X_matrix'``,
            ``'y_matrix'``, ``'rowlabels'``, ``'X_columnlabels'`` and
            ``'y_columnlabels'``.

    Returns:
        A dict with the keys ``'status'``, ``'recommendation'`` and
        ``'info'``. When ``target_data['bad']`` is ``True`` the status is
        ``'bad'`` and the recommendation is ``target_data['config_recommend']``
        unchanged; otherwise the status is ``'good'`` and the recommendation
        maps each selected knob name to its recommended value.

    Raises:
        Exception: if the workload and target column labels differ, or if the
            target objective does not occur exactly once among the metrics.
    """
    LOG.info('configuration_recommendation called')
    latest_pipeline_run = PipelineRun.objects.get_latest()

    if target_data['bad'] is True:
        return {
            'status': 'bad',
            'info': 'WARNING: no training data, the config is generated randomly',
            'recommendation': target_data['config_recommend'],
        }

    # Load mapped workload data
    mapped_workload_id = target_data['mapped_workload'][0]

    mapped_workload = Workload.objects.get(pk=mapped_workload_id)
    workload_knob_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.KNOB_DATA)
    workload_knob_data = JSONUtil.loads(workload_knob_data.data)
    workload_metric_data = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.METRIC_DATA)
    workload_metric_data = JSONUtil.loads(workload_metric_data.data)

    X_workload = np.array(workload_knob_data['data'])
    X_columnlabels = np.array(workload_knob_data['columnlabels'])
    y_workload = np.array(workload_metric_data['data'])
    y_columnlabels = np.array(workload_metric_data['columnlabels'])
    rowlabels_workload = np.array(workload_metric_data['rowlabels'])

    # Target workload data
    newest_result = Result.objects.get(pk=target_data['newest_result_id'])
    session = newest_result.session
    X_target = target_data['X_matrix']
    y_target = target_data['y_matrix']
    rowlabels_target = np.array(target_data['rowlabels'])

    if not np.array_equal(X_columnlabels, target_data['X_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical X columnlabels (sorted knob names)'))
    if not np.array_equal(y_columnlabels, target_data['y_columnlabels']):
        raise Exception(('The workload and target data should have '
                         'identical y columnlabels (sorted metric names)'))

    # Filter Xs by the top IMPORTANT_KNOB_NUMBER ranked knobs
    ranked_knobs = PipelineData.objects.get(
        pipeline_run=latest_pipeline_run,
        workload=mapped_workload,
        task_type=PipelineTaskType.RANKED_KNOBS)
    ranked_knobs = JSONUtil.loads(ranked_knobs.data)[:IMPORTANT_KNOB_NUMBER]
    ranked_knob_idxs = [i for i, cl in enumerate(X_columnlabels) if cl in ranked_knobs]
    X_workload = X_workload[:, ranked_knob_idxs]
    X_target = X_target[:, ranked_knob_idxs]
    X_columnlabels = X_columnlabels[ranked_knob_idxs]

    # Filter ys by current target objective metric
    target_objective = session.target_objective
    target_obj_idx = [i for i, cl in enumerate(y_columnlabels) if cl == target_objective]
    if not target_obj_idx:
        raise Exception(('Could not find target objective in metrics '
                         '(target_obj={})').format(target_objective))
    elif len(target_obj_idx) > 1:
        raise Exception(('Found {} instances of target objective in '
                         'metrics (target_obj={})').format(len(target_obj_idx),
                                                           target_objective))

    metric_meta = MetricCatalog.objects.get_metric_meta(session.dbms,
                                                        session.target_objective)
    lessisbetter = metric_meta[target_objective] == '(less is better)'

    y_workload = y_workload[:, target_obj_idx]
    y_target = y_target[:, target_obj_idx]
    y_columnlabels = y_columnlabels[target_obj_idx]

    # Combine duplicate rows in the target/workload data (separately)
    X_workload, y_workload, rowlabels_workload = DataUtil.combine_duplicate_rows(
        X_workload, y_workload, rowlabels_workload)
    X_target, y_target, rowlabels_target = DataUtil.combine_duplicate_rows(
        X_target, y_target, rowlabels_target)

    # Delete any rows that appear in both the workload data and the target
    # data from the workload data. A set gives O(1) membership tests.
    target_row_tups = {tuple(row) for row in X_target}
    dups_filter = np.array(
        [tuple(row) not in target_row_tups for row in X_workload],
        dtype=bool)
    X_workload = X_workload[dups_filter, :]
    y_workload = y_workload[dups_filter, :]
    rowlabels_workload = rowlabels_workload[dups_filter]

    # Combine target & workload Xs for preprocessing
    X_matrix = np.vstack([X_target, X_workload])

    # Dummy encode categorical variables
    categorical_info = DataUtil.dummy_encoder_helper(X_columnlabels,
                                                     mapped_workload.dbms)
    dummy_encoder = DummyEncoder(categorical_info['n_values'],
                                 categorical_info['categorical_features'],
                                 categorical_info['cat_columnlabels'],
                                 categorical_info['noncat_columnlabels'])
    X_matrix = dummy_encoder.fit_transform(X_matrix)

    # The two variables below are needed to determine max/min correctly on
    # the dummy columns.
    binary_index_set = set(categorical_info['binary_vars'])
    total_dummies = dummy_encoder.total_dummies()

    # Scale to N(0, 1)
    X_scaler = StandardScaler()
    X_scaled = X_scaler.fit_transform(X_matrix)
    if y_target.shape[0] < 5:  # FIXME
        # FIXME (dva): if there are fewer than 5 target results so far
        # then scale the y values (metrics) using the workload's
        # y_scaler. I'm not sure if 5 is the right cutoff.
        y_scaled = StandardScaler().fit_transform(
            np.vstack([y_target, y_workload]))
    else:
        # FIXME (dva): otherwise try to compute a separate y_scaler for
        # the target and scale them separately.
        try:
            y_target_scaled = StandardScaler().fit_transform(y_target)
            y_workload_scaled = StandardScaler().fit_transform(y_workload)
            y_scaled = np.vstack([y_target_scaled, y_workload_scaled])
        except ValueError:
            # Fall back to one scaler over all rows so that y_scaled keeps
            # one row per row of X_scaled (target rows first, then workload
            # rows). With an empty workload this equals scaling y_target.
            y_scaled = StandardScaler().fit_transform(
                np.vstack([y_target, y_workload]))

    # Set up constraint helper
    constraint_helper = ParamConstraintHelper(scaler=X_scaler,
                                              encoder=dummy_encoder,
                                              binary_vars=categorical_info['binary_vars'],
                                              init_flip_prob=INIT_FLIP_PROB,
                                              flip_prob_decay=FLIP_PROB_DECAY)

    # FIXME (dva): check if these are good values for the ridge
    # ridge = np.empty(X_scaled.shape[0])
    # ridge[:X_target.shape[0]] = 0.01
    # ridge[X_target.shape[0]:] = 0.1

    # FIXME: we should generate more samples and use a smarter sampling
    # technique
    num_samples = NUM_SAMPLES
    X_samples = np.empty((num_samples, X_scaled.shape[1]))
    X_min = np.empty(X_scaled.shape[1])
    X_max = np.empty(X_scaled.shape[1])
    knobs_mem = KnobCatalog.objects.filter(
        dbms=session.dbms, tunable=True, resource=1)
    knobs_mem_catalog = {k.name: k for k in knobs_mem}
    mem_max = newest_result.workload.hardware.memory
    X_mem = np.zeros([1, X_scaled.shape[1]])
    X_default = np.empty(X_scaled.shape[1])

    # Get default knob values
    # NOTE(review): X_default has one slot per encoded column but is filled
    # per pre-encoding label; presumably both have the same length -- confirm.
    for i, k_name in enumerate(X_columnlabels):
        k = KnobCatalog.objects.filter(dbms=session.dbms, name=k_name)[0]
        X_default[i] = k.default

    X_default_scaled = X_scaler.transform(X_default.reshape(1, X_default.shape[0]))[0]

    # Determine min/max for knob values
    for i in range(X_scaled.shape[1]):
        if i < total_dummies or i in binary_index_set:
            # Dummy and binary columns are bounded to [0, 1].
            col_min = 0
            col_max = 1
        else:
            col_min = X_scaled[:, i].min()
            col_max = X_scaled[:, i].max()
            if X_columnlabels[i] in knobs_mem_catalog:
                X_mem[0][i] = mem_max * 1024 * 1024 * 1024  # mem_max GB
                col_max = X_scaler.transform(X_mem)[0][i]

            # Set min value to the default value
            # FIXME: support multiple methods can be selected by users
            col_min = X_default_scaled[i]

        X_min[i] = col_min
        X_max[i] = col_max
        # Draw uniform random starting points within [col_min, col_max).
        X_samples[:, i] = np.random.rand(num_samples) * (col_max - col_min) + col_min

    # Maximize the throughput, moreisbetter
    # Use gradient descent to minimize -throughput
    if not lessisbetter:
        y_scaled = -y_scaled

    # Rank the training rows by their (possibly negated) scaled metric so the
    # best ones can be added as extra starting points.
    q = queue.PriorityQueue()
    for x in range(0, y_scaled.shape[0]):
        q.put((y_scaled[x][0], x))

    for _ in range(TOP_NUM_CONFIG):
        try:
            item = q.get_nowait()
        except queue.Empty:
            break
        # TensorFlow breaks if the training data points themselves are used
        # as starting points for GPRGD, so a small bias is added to each
        # starting point. GPR_EPS default value is 0.001
        X_samples = np.vstack((X_samples, X_scaled[item[1]] + GPR_EPS))

    model = GPRGD(length_scale=DEFAULT_LENGTH_SCALE,
                  magnitude=DEFAULT_MAGNITUDE,
                  max_train_size=MAX_TRAIN_SIZE,
                  batch_size=BATCH_SIZE,
                  num_threads=NUM_THREADS,
                  learning_rate=DEFAULT_LEARNING_RATE,
                  epsilon=DEFAULT_EPSILON,
                  max_iter=MAX_ITER,
                  sigma_multiplier=DEFAULT_SIGMA_MULTIPLIER,
                  mu_multiplier=DEFAULT_MU_MULTIPLIER)
    model.fit(X_scaled, y_scaled, X_min, X_max, ridge=DEFAULT_RIDGE)
    res = model.predict(X_samples, constraint_helper=constraint_helper)

    best_config_idx = np.argmin(res.minl.ravel())
    best_config = res.minl_conf[best_config_idx, :]
    best_config = X_scaler.inverse_transform(best_config)
    # Decode one-hot encoding into categorical knobs
    best_config = dummy_encoder.inverse_transform(best_config)

    # Although we have max/min limits in the GPRGD training session, it may
    # lose some precision, e.g. 0.99..99 >= 1.0 may be True on the scaled
    # data. When the scaled data is inversely transformed the difference
    # becomes much larger and cannot be ignored. Here we check the range on
    # the original data directly, and make sure the recommended config lies
    # within the range.
    X_min_inv = X_scaler.inverse_transform(X_min)
    X_max_inv = X_scaler.inverse_transform(X_max)
    best_config = np.minimum(best_config, X_max_inv)
    best_config = np.maximum(best_config, X_min_inv)

    conf_map = {k: best_config[i] for i, k in enumerate(X_columnlabels)}
    return {
        'status': 'good',
        'recommendation': conf_map,
        'info': 'INFO: training data size is {}'.format(X_scaled.shape[0]),
    }