Exemplo n.º 1
0
def outlierDetection(samples_x, samples_y_aggregation):
    outliers = []
    for samples_idx, _ in enumerate(samples_x):
        #sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
        #  \ % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
        diagnostic_regressor_gp = gp_create_model.create_model(\
                                        samples_x[0:samples_idx] + samples_x[samples_idx + 1:],\
                                        samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
        mu, sigma = gp_prediction.predict(samples_x[samples_idx],
                                          diagnostic_regressor_gp['model'])
        # 2.33 is the z-score for 98% confidence level
        if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
            outliers.append({"samples_idx": samples_idx,
                             "expected_mu": mu,
                             "expected_sigma": sigma,
                             "difference": \
                                abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)})

    outliers = outliers if outliers else None
    return outliers
Exemplo n.º 2
0
def _outlierDetection_threaded(inputs):
    """
    Detect the outlier
    """
    [samples_idx, samples_x, samples_y_aggregation] = inputs
    sys.stderr.write("[%s] DEBUG: Evaluating %dth of %d samples\n"
                     % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
    outlier = None

    # Create a diagnostic regression model which removes the sample that we
    # want to evaluate
    diagnostic_regressor_gp = gp_create_model.create_model(
        samples_x[0:samples_idx] + samples_x[samples_idx + 1:],
        samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
    mu, sigma = gp_prediction.predict(
        samples_x[samples_idx], diagnostic_regressor_gp['model'])

    # 2.33 is the z-score for 98% confidence level
    if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
        outlier = {"samples_idx": samples_idx,
                   "expected_mu": mu,
                   "expected_sigma": sigma,
                   "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)}
    return outlier
Exemplo n.º 3
0
    def _selection(self,
                   samples_x,
                   samples_y_aggregation,
                   samples_y,
                   x_bounds,
                   x_types,
                   max_resampling_per_x=3,
                   threshold_samplessize_exploitation=12,
                   threshold_samplessize_resampling=50,
                   no_candidates=False,
                   minimize_starting_points=None,
                   minimize_constraints_fun=None):

        next_candidate = None
        candidates = []
        samples_size_all = sum([len(i) for i in samples_y])
        samples_size_unique = len(samples_y)

        # ===== STEP 1: Compute the current optimum =====
        gp_model = gp_create_model.create_model(samples_x,
                                                samples_y_aggregation)
        lm_current = gp_selection.selection(
            "lm",
            samples_y_aggregation,
            x_bounds,
            x_types,
            gp_model['model'],
            minimize_starting_points,
            minimize_constraints_fun=minimize_constraints_fun)
        if not lm_current:
            return None

        if no_candidates is False:
            candidates.append({
                'hyperparameter': lm_current['hyperparameter'],
                'expected_mu': lm_current['expected_mu'],
                'expected_sigma': lm_current['expected_sigma'],
                'reason': "exploitation_gp"
            })

            # ===== STEP 2: Get recommended configurations for exploration =====
            results_exploration = gp_selection.selection(
                "lc",
                samples_y_aggregation,
                x_bounds,
                x_types,
                gp_model['model'],
                minimize_starting_points,
                minimize_constraints_fun=minimize_constraints_fun)

            if results_exploration is not None:
                if _num_past_samples(results_exploration['hyperparameter'],
                                     samples_x, samples_y) == 0:
                    candidates.append({
                        'hyperparameter':
                        results_exploration['hyperparameter'],
                        'expected_mu':
                        results_exploration['expected_mu'],
                        'expected_sigma':
                        results_exploration['expected_sigma'],
                        'reason':
                        "exploration"
                    })
                    logger.info("DEBUG: 1 exploration candidate selected\n")
            else:
                logger.info("DEBUG: No suitable exploration candidates were")

            # ===== STEP 3: Get recommended configurations for exploitation =====
            if samples_size_all >= threshold_samplessize_exploitation:
                print("Getting candidates for exploitation...\n")
                try:
                    gmm = gmm_create_model.create_model(
                        samples_x, samples_y_aggregation)
                    results_exploitation = gmm_selection.selection(
                        x_bounds,
                        x_types,
                        gmm['clusteringmodel_good'],
                        gmm['clusteringmodel_bad'],
                        minimize_starting_points,
                        minimize_constraints_fun=minimize_constraints_fun)

                    if results_exploitation is not None:
                        if _num_past_samples(
                                results_exploitation['hyperparameter'],
                                samples_x, samples_y) == 0:
                            candidates.append({'hyperparameter': results_exploitation['hyperparameter'],\
                                               'expected_mu': results_exploitation['expected_mu'],\
                                               'expected_sigma': results_exploitation['expected_sigma'],\
                                               'reason': "exploitation_gmm"})
                            logger.info(
                                "DEBUG: 1 exploitation_gmm candidate selected\n"
                            )
                    else:
                        logger.info(
                            "DEBUG: No suitable exploitation_gmm candidates were found\n"
                        )

                except ValueError as exception:
                    # The exception: ValueError: Fitting the mixture model failed
                    # because some components have ill-defined empirical covariance
                    # (for instance caused by singleton or collapsed samples).
                    # Try to decrease the number of components, or increase reg_covar.
                    logger.info(
                        "DEBUG: No suitable exploitation_gmm candidates were found due to exception."
                    )
                    logger.info(exception)

            # ===== STEP 4: Get a list of outliers =====
            if (threshold_samplessize_resampling is not None) and \
                        (samples_size_unique >= threshold_samplessize_resampling):
                logger.info("Getting candidates for re-sampling...\n")
                results_outliers = gp_outlier_detection.outlierDetection_threaded(
                    samples_x, samples_y_aggregation)

                if results_outliers is not None:
                    for results_outlier in results_outliers:
                        if _num_past_samples(
                                samples_x[results_outlier['samples_idx']],
                                samples_x, samples_y) < max_resampling_per_x:
                            candidates.append({'hyperparameter': samples_x[results_outlier['samples_idx']],\
                                               'expected_mu': results_outlier['expected_mu'],\
                                               'expected_sigma': results_outlier['expected_sigma'],\
                                               'reason': "resampling"})
                    logger.info("DEBUG: %d re-sampling candidates selected\n")
                else:
                    logger.info(
                        "DEBUG: No suitable resampling candidates were found\n"
                    )

            if candidates:
                # ===== STEP 5: Compute the information gain of each candidate towards the optimum =====
                logger.info(
                    "Evaluating information gain of %d candidates...\n")
                next_improvement = 0

                threads_inputs = [[
                    candidate, samples_x, samples_y, x_bounds, x_types,
                    minimize_constraints_fun, minimize_starting_points
                ] for candidate in candidates]
                threads_pool = ThreadPool(4)
                # Evaluate what would happen if we actually sample each candidate
                threads_results = threads_pool.map(
                    _calculate_lowest_mu_threaded, threads_inputs)
                threads_pool.close()
                threads_pool.join()

                for threads_result in threads_results:
                    if threads_result['expected_lowest_mu'] < lm_current[
                            'expected_mu']:
                        # Information gain
                        temp_improvement = threads_result[
                            'expected_lowest_mu'] - lm_current['expected_mu']

                        if next_improvement > temp_improvement:
                            next_improvement = temp_improvement
                            next_candidate = threads_result['candidate']
            else:
                # ===== STEP 6: If we have no candidates, randomly pick one =====
                logger.info(
                    "DEBUG: No candidates from exploration, exploitation,\
                                 and resampling. We will random a candidate for next_candidate\n"
                )

                next_candidate = _rand_with_constraints(x_bounds, x_types) \
                                    if minimize_starting_points is None else minimize_starting_points[0]
                next_candidate = lib_data.match_val_type(
                    next_candidate, x_bounds, x_types)
                expected_mu, expected_sigma = gp_prediction.predict(
                    next_candidate, gp_model['model'])
                next_candidate = {
                    'hyperparameter': next_candidate,
                    'reason': "random",
                    'expected_mu': expected_mu,
                    'expected_sigma': expected_sigma
                }

        # ===== STEP 7: If current optimal hyperparameter occurs in the history or exploration probability is less than the threshold, take next config as exploration step  =====
        outputs = self._pack_output(lm_current['hyperparameter'])
        ap = random.uniform(0, 1)
        if outputs in self.total_data or ap <= self.exploration_probability:
            if next_candidate is not None:
                outputs = self._pack_output(next_candidate['hyperparameter'])
            else:
                random_parameter = _rand_init(x_bounds, x_types, 1)[0]
                outputs = self._pack_output(random_parameter)
        self.total_data.append(outputs)
        return outputs
Exemplo n.º 4
0
                        temp_improvement = threads_result['expected_lowest_mu'] - lm_current['expected_mu']

                        if next_improvement > temp_improvement:
                            next_improvement = temp_improvement
                            next_candidate = threads_result['candidate']
            else:
                # ===== STEP 6: If we have no candidates, randomly pick one =====
                logger.info(
                    "DEBUG: No candidates from exploration, exploitation,\
                                 and resampling. We will random a candidate for next_candidate\n"
                )

                next_candidate = _rand_with_constraints(x_bounds, x_types) \
                                    if minimize_starting_points is None else minimize_starting_points[0]
                next_candidate = lib_data.match_val_type(next_candidate, x_bounds, x_types)
                expected_mu, expected_sigma = gp_prediction.predict(next_candidate, gp_model['model'])
                next_candidate = {'hyperparameter': next_candidate, 'reason': "random",
                                  'expected_mu': expected_mu, 'expected_sigma': expected_sigma}

        # ===== STEP 7: If current optimal hyperparameter occurs in the history or exploration probability is less than the threshold, take next config as exploration step  =====
        outputs = self._pack_output(lm_current['hyperparameter'])
        ap = random.uniform(0, 1)
        if outputs in self.history_parameters or ap<=self.exploration_probability:
            if next_candidate is not None:
                outputs = self._pack_output(next_candidate['hyperparameter'])
            else:
                random_parameter = _rand_init(x_bounds, x_types, 1)[0]
                outputs = self._pack_output(random_parameter)
        self.history_parameters.append(outputs)
        return outputs