import os
import random
import sys

from multiprocessing.dummy import Pool as ThreadPool

# NOTE: `logger`, the GP/GMM helper modules (gp_create_model, gp_prediction,
# gp_selection, gp_outlier_detection, gmm_create_model, gmm_selection),
# `lib_data`, and the helpers `_rand_with_constraints`, `_rand_init`, and
# `_calculate_lowest_mu_threaded` are defined or imported elsewhere in this
# module.


def outlierDetection(samples_x, samples_y_aggregation):
    """Detect outliers by leave-one-out GP prediction (single-threaded)."""
    outliers = []
    for samples_idx, _ in enumerate(samples_x):
        # sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
        #                  % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))

        # Create a diagnostic regression model which removes the sample that
        # we want to evaluate
        diagnostic_regressor_gp = gp_create_model.create_model(
            samples_x[0:samples_idx] + samples_x[samples_idx + 1:],
            samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
        mu, sigma = gp_prediction.predict(samples_x[samples_idx],
                                          diagnostic_regressor_gp['model'])

        # 2.33 is the z-score for 98% confidence level
        if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
            outliers.append({"samples_idx": samples_idx,
                             "expected_mu": mu,
                             "expected_sigma": sigma,
                             "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)})

    outliers = outliers if outliers else None
    return outliers
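# A minimal, self-contained illustration of the leave-one-out z-score test
# above, using made-up numbers in place of the GP model. `is_outlier` is a
# hypothetical helper for illustration only, not part of this module.
def is_outlier(observed, mu, sigma, z_score=2.33):
    """Return True when `observed` lies outside mu +/- z_score * sigma."""
    return abs(observed - mu) > (z_score * sigma)

# With mu=2.0 and sigma=1.5, the 98%-confidence band is 2.0 +/- 3.495:
assert is_outlier(10.0, mu=2.0, sigma=1.5)     # |10.0 - 2.0| = 8.0 > 3.495
assert not is_outlier(2.5, mu=2.0, sigma=1.5)  # |2.5 - 2.0| = 0.5 <= 3.495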
def _outlierDetection_threaded(inputs):
    """Detect whether one sample is an outlier (thread-pool worker)."""
    [samples_idx, samples_x, samples_y_aggregation] = inputs
    sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
                     % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
    outlier = None

    # Create a diagnostic regression model which removes the sample that we
    # want to evaluate
    diagnostic_regressor_gp = gp_create_model.create_model(
        samples_x[0:samples_idx] + samples_x[samples_idx + 1:],
        samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
    mu, sigma = gp_prediction.predict(samples_x[samples_idx],
                                      diagnostic_regressor_gp['model'])

    # 2.33 is the z-score for 98% confidence level
    if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
        outlier = {"samples_idx": samples_idx,
                   "expected_mu": mu,
                   "expected_sigma": sigma,
                   "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)}
    return outlier
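# `_selection` below calls gp_outlier_detection.outlierDetection_threaded(...).
# A hedged sketch of what that driver plausibly looks like: fan the worker
# above out over a thread pool and keep the non-None results. The body is an
# assumption based on the call site, not the module's confirmed implementation.
def outlierDetection_threaded(samples_x, samples_y_aggregation):
    """Run the leave-one-out outlier test for every sample in parallel."""
    threads_inputs = [[samples_idx, samples_x, samples_y_aggregation]
                      for samples_idx, _ in enumerate(samples_x)]
    threads_pool = ThreadPool(4)
    threads_results = threads_pool.map(_outlierDetection_threaded, threads_inputs)
    threads_pool.close()
    threads_pool.join()

    outliers = [result for result in threads_results if result is not None]
    return outliers if outliers else None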
def _selection(self, samples_x, samples_y_aggregation, samples_y,
               x_bounds, x_types, max_resampling_per_x=3,
               threshold_samplessize_exploitation=12,
               threshold_samplessize_resampling=50, no_candidates=False,
               minimize_starting_points=None, minimize_constraints_fun=None):
    next_candidate = None
    candidates = []
    samples_size_all = sum([len(i) for i in samples_y])
    samples_size_unique = len(samples_y)

    # ===== STEP 1: Compute the current optimum =====
    gp_model = gp_create_model.create_model(samples_x, samples_y_aggregation)
    lm_current = gp_selection.selection(
        "lm", samples_y_aggregation, x_bounds, x_types, gp_model['model'],
        minimize_starting_points,
        minimize_constraints_fun=minimize_constraints_fun)
    if not lm_current:
        return None

    if no_candidates is False:
        candidates.append({'hyperparameter': lm_current['hyperparameter'],
                           'expected_mu': lm_current['expected_mu'],
                           'expected_sigma': lm_current['expected_sigma'],
                           'reason': "exploitation_gp"})

        # ===== STEP 2: Get recommended configurations for exploration =====
        results_exploration = gp_selection.selection(
            "lc", samples_y_aggregation, x_bounds, x_types, gp_model['model'],
            minimize_starting_points,
            minimize_constraints_fun=minimize_constraints_fun)
        if results_exploration is not None:
            if _num_past_samples(results_exploration['hyperparameter'],
                                 samples_x, samples_y) == 0:
                candidates.append({'hyperparameter': results_exploration['hyperparameter'],
                                   'expected_mu': results_exploration['expected_mu'],
                                   'expected_sigma': results_exploration['expected_sigma'],
                                   'reason': "exploration"})
                logger.info("DEBUG: 1 exploration candidate selected\n")
        else:
            logger.info("DEBUG: No suitable exploration candidates were found\n")

        # ===== STEP 3: Get recommended configurations for exploitation =====
        if samples_size_all >= threshold_samplessize_exploitation:
            logger.info("Getting candidates for exploitation...\n")
            try:
                gmm = gmm_create_model.create_model(samples_x, samples_y_aggregation)
                results_exploitation = gmm_selection.selection(
                    x_bounds, x_types, gmm['clusteringmodel_good'],
                    gmm['clusteringmodel_bad'], minimize_starting_points,
                    minimize_constraints_fun=minimize_constraints_fun)
                if results_exploitation is not None:
                    if _num_past_samples(results_exploitation['hyperparameter'],
                                         samples_x, samples_y) == 0:
                        candidates.append({'hyperparameter': results_exploitation['hyperparameter'],
                                           'expected_mu': results_exploitation['expected_mu'],
                                           'expected_sigma': results_exploitation['expected_sigma'],
                                           'reason': "exploitation_gmm"})
                        logger.info("DEBUG: 1 exploitation_gmm candidate selected\n")
                else:
                    logger.info("DEBUG: No suitable exploitation_gmm candidates were found\n")
            except ValueError as exception:
                # The exception: ValueError: Fitting the mixture model failed
                # because some components have ill-defined empirical covariance
                # (for instance caused by singleton or collapsed samples).
                # Try to decrease the number of components, or increase reg_covar.
                logger.info("DEBUG: No suitable exploitation_gmm candidates were "
                            "found due to exception.")
                logger.info(exception)

        # ===== STEP 4: Get a list of outliers =====
        if (threshold_samplessize_resampling is not None) and \
                (samples_size_unique >= threshold_samplessize_resampling):
            logger.info("Getting candidates for re-sampling...\n")
            results_outliers = gp_outlier_detection.outlierDetection_threaded(
                samples_x, samples_y_aggregation)
            if results_outliers is not None:
                for results_outlier in results_outliers:
                    # Only re-sample a configuration that has not already been
                    # re-sampled too many times
                    if _num_past_samples(samples_x[results_outlier['samples_idx']],
                                         samples_x, samples_y) < max_resampling_per_x:
                        candidates.append({'hyperparameter': samples_x[results_outlier['samples_idx']],
                                           'expected_mu': results_outlier['expected_mu'],
                                           'expected_sigma': results_outlier['expected_sigma'],
                                           'reason': "resampling"})
                        logger.info("DEBUG: 1 re-sampling candidate selected\n")
            else:
                logger.info("DEBUG: No suitable resampling candidates were found\n")

        if candidates:
            # ===== STEP 5: Compute the information gain of each candidate towards the optimum =====
            logger.info("Evaluating information gain of %d candidates...\n"
                        % len(candidates))
            next_improvement = 0

            threads_inputs = [[candidate, samples_x, samples_y, x_bounds, x_types,
                               minimize_constraints_fun, minimize_starting_points]
                              for candidate in candidates]
            threads_pool = ThreadPool(4)
            # Evaluate what would happen if we actually sampled each candidate
            threads_results = threads_pool.map(_calculate_lowest_mu_threaded, threads_inputs)
            threads_pool.close()
            threads_pool.join()

            for threads_result in threads_results:
                if threads_result['expected_lowest_mu'] < lm_current['expected_mu']:
                    # Information gain: how much lower the expected optimum becomes
                    temp_improvement = threads_result['expected_lowest_mu'] \
                        - lm_current['expected_mu']
                    if next_improvement > temp_improvement:
                        next_improvement = temp_improvement
                        next_candidate = threads_result['candidate']
        else:
            # ===== STEP 6: If we have no candidates, randomly pick one =====
            logger.info("DEBUG: No candidates from exploration, exploitation, "
                        "and resampling. We will randomly pick one for next_candidate\n")

            next_candidate = _rand_with_constraints(x_bounds, x_types) \
                if minimize_starting_points is None else minimize_starting_points[0]
            next_candidate = lib_data.match_val_type(next_candidate, x_bounds, x_types)
            expected_mu, expected_sigma = gp_prediction.predict(next_candidate,
                                                                gp_model['model'])
            next_candidate = {'hyperparameter': next_candidate, 'reason': "random",
                              'expected_mu': expected_mu, 'expected_sigma': expected_sigma}

    # ===== STEP 7: If the current optimum already occurs in the history, or a
    # random draw falls below the exploration probability, take the next
    # candidate as an exploration step =====
    outputs = self._pack_output(lm_current['hyperparameter'])
    ap = random.uniform(0, 1)
    if outputs in self.total_data or ap <= self.exploration_probability:
        if next_candidate is not None:
            outputs = self._pack_output(next_candidate['hyperparameter'])
        else:
            random_parameter = _rand_init(x_bounds, x_types, 1)[0]
            outputs = self._pack_output(random_parameter)
    self.total_data.append(outputs)
    return outputs
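# `_num_past_samples` is referenced throughout `_selection` but not defined in
# this excerpt. A minimal sketch of the behaviour its call sites imply (an
# assumption, not the repository's confirmed implementation): count how many
# measurements already exist for a given hyperparameter vector.
def _num_past_samples(x, samples_x, samples_y):
    """Return how many times hyperparameter `x` has been measured so far."""
    try:
        # samples_y[i] holds the list of measurements for samples_x[i]
        return len(samples_y[samples_x.index(x)])
    except ValueError:
        return 0  # `x` has never been sampled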