Python balanced_scheduling示例，suod.models.parallel_processes.balanced_scheduling Python示例

示例#1

0

显示文件

文件： do_not_use_demo_full.py 项目： hsw2012s/temp

rp_flag_global = True
objective_dim = 6
rp_method = 'discrete'

# build flags for random projection
rp_flags, base_estimator_names = build_codes(base_estimators, rp_clf_list,
                                             rp_ng_clf_list, rp_flag_global)

# load the pre-trained cost predictor to forecast the train cost
clf_train = joblib.load(
    os.path.join('../suod', 'models', 'saved_models', 'bps_train.joblib'))

time_cost_pred = cost_forecast_meta(clf_train, X, base_estimator_names)

# schedule the tasks
n_estimators_list, starts, n_jobs = balanced_scheduling(
    time_cost_pred, n_estimators, n_jobs)

print(starts)  # this is the list of being split
start = time.time()

print('Parallel Training...')

# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
    delayed(_parallel_fit)(n_estimators_list[i],
                           base_estimators[starts[i]:starts[i + 1]],
                           X,
                           n_estimators,
                           rp_flags[starts[i]:starts[i + 1]],

示例#2

0

显示文件

    def decision_function(self, X):
        """Predict raw anomaly scores of X using the fitted detectors.

        The anomaly score of an input sample is computed based on the fitted
        detector. For consistency, outliers are assigned with
        higher anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_pred_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use simple equal split by sklearn
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        if self.verbose:
            print('Parallel score prediction...')
            start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
                                      verbose=True)(
            delayed(_parallel_decision_function)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                self.approximators[starts[i]:starts[i + 1]],
                X,
                self.n_estimators,
                # self.rp_flags[starts[i]:starts[i + 1]],
                self.jl_transformers_[starts[i]:starts[i + 1]],
                self.approx_flags[starts[i]:starts[i + 1]],
                verbose=True)
            for i in range(n_jobs))

        # fit the base models
        if self.verbose:
            print('Parallel Score Prediction without Approximators '
                  'Total Time:', time.time() - start)

        # unfold and generate the label matrix
        predicted_scores = np.zeros([n_samples, self.n_estimators])
        for i in range(n_jobs):
            predicted_scores[:, starts[i]:starts[i + 1]] = np.asarray(
                all_results_scores[i]).T

        return predicted_scores

示例#3

0

显示文件

    def fit(self, X, y=None):
        """Fit estimator. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).

        Returns
        -------
        self
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # Validate max_features for random projection
        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            self.max_features_ = self.max_features
        else:  # float
            self.max_features_ = int(self.max_features * n_features)

        # build flags for random projection
        self.rp_flags_, _ = build_codes(
            self.n_estimators, self.base_estimators, self.rp_clf_list,
            self.rp_ng_clf_list, self.rp_flag_global)

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_fit_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            # use BPS
            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use the default sklearn equal split
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        print('Parallel Training...')
        start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
            delayed(_parallel_fit)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                X,
                self.n_estimators,
                self.rp_flags[starts[i]:starts[i + 1]],
                self.max_features_,
                self.rp_method,
                verbose=self.verbose)
            for i in range(n_jobs))

        print('Balanced Scheduling Total Train Time:', time.time() - start)

        # reformat and unfold the lists. Save the trained estimators and transformers
        all_results = list(map(list, zip(*all_results)))

        # overwrite estimators
        self.base_estimators = _unfold_parallel(all_results[0], n_jobs)
        self.jl_transformers_ = _unfold_parallel(all_results[1], n_jobs)

        return self

示例#4

0

显示文件

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_pred_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use simple equal split by sklearn
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        print('Parallel label prediction...')
        start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results_pred = Parallel(n_jobs=n_jobs, max_nbytes=None,
                                    verbose=True)(
            delayed(_parallel_predict)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                self.approximators[starts[i]:starts[i + 1]],
                X,
                self.n_estimators,
                # self.rp_flags[starts[i]:starts[i + 1]],
                self.jl_transformers_[starts[i]:starts[i + 1]],
                self.approx_flags[starts[i]:starts[i + 1]],
                self.contamination,
                verbose=True)
            for i in range(n_jobs))

        print('Parallel Label Predicting without Approximators Total Time:',
              time.time() - start)

        # unfold and generate the label matrix
        predicted_labels = np.zeros([n_samples, self.n_estimators])
        for i in range(n_jobs):
            predicted_labels[:, starts[i]:starts[i + 1]] = np.asarray(
                all_results_pred[i]).T

        return predicted_labels

示例#5

0

显示文件

    def predict_proba(self, X):
        """Predict the probability of a sample being outlier. Two approaches
        are possible:

        1. simply use Min-max conversion to linearly transform the outlier
           scores into the range of [0,1]. The model must be
           fitted first.
        2. use unifying scores, see :cite:`kriegel2011interpreting`.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        method : str, optional (default='linear')
            probability conversion method. It must be one of
            'linear' or 'unify'.

        Returns
        -------
        outlier_probability : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. Return the outlier probability, ranging
            in [0,1].
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_pred_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use simple equal split by sklearn
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        if self.verbose:
            print('Parallel score prediction...')
            start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results_scores = Parallel(
            n_jobs=n_jobs, max_nbytes=None, verbose=True)(
                delayed(_parallel_predict_proba)(
                    n_estimators_list[i],
                    self.base_estimators[starts[i]:starts[i + 1]],
                    self.approximators[starts[i]:starts[i + 1]],
                    X,
                    self.n_estimators,
                    # self.rp_flags[starts[i]:starts[i + 1]],
                    self.jl_transformers_[starts[i]:starts[i + 1]],
                    self.approx_flags[starts[i]:starts[i + 1]],
                    verbose=True) for i in range(n_jobs))

        # fit the base models
        if self.verbose:
            print(
                'Parallel Score Prediction without Approximators '
                'Total Time:',
                time.time() - start)

        # unfold and generate the label matrix
        predicted_scores = np.zeros([n_samples, self.n_estimators])
        for i in range(n_jobs):
            predicted_scores[:, starts[i]:starts[i + 1]] = np.asarray(
                all_results_scores[i]).T

        return predicted_scores