rp_flag_global = True objective_dim = 6 rp_method = 'discrete' # build flags for random projection rp_flags, base_estimator_names = build_codes(base_estimators, rp_clf_list, rp_ng_clf_list, rp_flag_global) # load the pre-trained cost predictor to forecast the train cost clf_train = joblib.load( os.path.join('../suod', 'models', 'saved_models', 'bps_train.joblib')) time_cost_pred = cost_forecast_meta(clf_train, X, base_estimator_names) # schedule the tasks n_estimators_list, starts, n_jobs = balanced_scheduling( time_cost_pred, n_estimators, n_jobs) print(starts) # this is the list of being split start = time.time() print('Parallel Training...') # TODO: code cleanup. There is an existing bug for joblib on Windows: # https://github.com/joblib/joblib/issues/806 # max_nbytes can be dropped on other OS all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)( delayed(_parallel_fit)(n_estimators_list[i], base_estimators[starts[i]:starts[i + 1]], X, n_estimators, rp_flags[starts[i]:starts[i + 1]],
def decision_function(self, X):
    """Compute raw anomaly scores of X with every fitted base estimator.

    The estimators are split into groups, each group is scored by a
    parallel worker (``_parallel_decision_function``), and the per-group
    results are stitched back into one score matrix. By convention,
    outliers are assigned higher anomaly scores.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples. Sparse matrices are accepted only
        if they are supported by the base estimator.

    Returns
    -------
    anomaly_scores : numpy array of shape (n_samples, n_estimators)
        One column of anomaly scores per base estimator, in estimator
        order.
    """
    X = check_array(X)
    n_samples = X.shape[0]

    # Work out how the estimators are distributed over the workers.
    if not self.bps_flag:
        # plain equal split, as done by sklearn
        job_sizes, starts, n_jobs = _partition_estimators(
            self.n_estimators, self.n_jobs)
    else:
        # cost-aware split: forecast each estimator's cost with the
        # pre-trained predictor, then balance the groups
        cost_model = joblib.load(self.cost_forecast_loc_pred_)
        forecast = cost_forecast_meta(
            cost_model, X, self.base_estimator_names)
        job_sizes, starts, n_jobs = balanced_scheduling(
            forecast, self.n_estimators, self.n_jobs)

    if self.verbose:
        print('Parallel score prediction...')
    start = time.time()

    # Estimator index range handled by each worker.
    chunks = [slice(starts[i], starts[i + 1]) for i in range(n_jobs)]
    score_job = delayed(_parallel_decision_function)

    # max_nbytes=None works around a joblib bug on Windows
    # (https://github.com/joblib/joblib/issues/806); it can be dropped
    # on other operating systems.
    per_job_scores = Parallel(
        n_jobs=n_jobs, max_nbytes=None, verbose=True)(
        score_job(
            job_sizes[job],
            self.base_estimators[chunk],
            self.approximators[chunk],
            X,
            self.n_estimators,
            self.jl_transformers_[chunk],
            self.approx_flags[chunk],
            verbose=True)
        for job, chunk in enumerate(chunks))

    if self.verbose:
        print('Parallel Score Prediction without Approximators Total Time:',
              time.time() - start)

    # Each worker returns its scores estimator-major; transpose and drop
    # them into the matching column range of the output matrix.
    score_matrix = np.zeros([n_samples, self.n_estimators])
    for job, chunk in enumerate(chunks):
        score_matrix[:, chunk] = np.asarray(per_job_scores[job]).T

    return score_matrix
def fit(self, X, y=None):
    """Fit all base estimators in parallel. y is ignored by the visible code.

    The estimators are split into groups (cost-aware when ``bps_flag`` is
    set, otherwise an equal split), each group is fitted by a parallel
    worker (``_parallel_fit``), and the fitted estimators and their
    random-projection transformers are stored on the instance.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels). Not used here;
        kept for API compatibility.

    Returns
    -------
    self
        The fitted instance. ``base_estimators`` is overwritten with the
        fitted estimators, and ``max_features_``, ``rp_flags_`` and
        ``jl_transformers_`` are set.
    """
    X = check_array(X)
    n_features = X.shape[1]

    # Resolve max_features for random projection: an integer is taken as
    # an absolute count, anything else as a fraction of n_features.
    if isinstance(self.max_features, (numbers.Integral, np.integer)):
        self.max_features_ = self.max_features
    else:  # float
        self.max_features_ = int(self.max_features * n_features)

    # build flags for random projection
    self.rp_flags_, _ = build_codes(
        self.n_estimators, self.base_estimators, self.rp_clf_list,
        self.rp_ng_clf_list, self.rp_flag_global)

    # decide how the estimators are distributed over the workers
    if self.bps_flag:
        # load the pre-trained cost predictor to forecast the train cost
        cost_predictor = joblib.load(self.cost_forecast_loc_fit_)
        time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                            self.base_estimator_names)
        # balanced parallel scheduling based on the forecast cost
        n_estimators_list, starts, n_jobs = balanced_scheduling(
            time_cost_pred, self.n_estimators, self.n_jobs)
    else:
        # use the default sklearn equal split
        n_estimators_list, starts, n_jobs = _partition_estimators(
            self.n_estimators, self.n_jobs)

    # Progress output is gated on self.verbose, matching
    # decision_function and predict_proba.
    if self.verbose:
        print('Parallel Training...')
    start = time.time()

    # max_nbytes=None works around a joblib bug on Windows
    # (https://github.com/joblib/joblib/issues/806); it can be dropped
    # on other operating systems.
    all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
        delayed(_parallel_fit)(
            n_estimators_list[i],
            self.base_estimators[starts[i]:starts[i + 1]],
            X,
            self.n_estimators,
            # Use the flags computed above (self.rp_flags_); the previous
            # code read self.rp_flags, which this method never sets.
            self.rp_flags_[starts[i]:starts[i + 1]],
            self.max_features_,
            self.rp_method,
            verbose=self.verbose)
        for i in range(n_jobs))

    if self.verbose:
        print('Balanced Scheduling Total Train Time:', time.time() - start)

    # Transpose the per-job result tuples into per-field lists, then
    # flatten each field across jobs.
    all_results = list(map(list, zip(*all_results)))

    # overwrite estimators with their fitted versions and save the
    # transformers
    self.base_estimators = _unfold_parallel(all_results[0], n_jobs)
    self.jl_transformers_ = _unfold_parallel(all_results[1], n_jobs)

    return self
def predict(self, X):
    """Predict labels for X with every fitted base estimator.

    The estimators are split into groups (cost-aware when ``bps_flag`` is
    set, otherwise an equal split), each group is evaluated by a parallel
    worker (``_parallel_predict``), and the per-group results are stitched
    back into one label matrix.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    labels : numpy array of shape (n_samples, n_estimators)
        One column of predicted labels per base estimator, in estimator
        order.
    """
    X = check_array(X)
    n_samples = X.shape[0]

    # decide how the estimators are distributed over the workers
    if self.bps_flag:
        # load the pre-trained cost predictor to forecast the prediction
        # cost
        cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
        time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                            self.base_estimator_names)
        n_estimators_list, starts, n_jobs = balanced_scheduling(
            time_cost_pred, self.n_estimators, self.n_jobs)
    else:
        # use simple equal split by sklearn
        n_estimators_list, starts, n_jobs = _partition_estimators(
            self.n_estimators, self.n_jobs)

    # Progress output is gated on self.verbose, matching
    # decision_function and predict_proba.
    if self.verbose:
        print('Parallel label prediction...')
    start = time.time()

    # max_nbytes=None works around a joblib bug on Windows
    # (https://github.com/joblib/joblib/issues/806); it can be dropped
    # on other operating systems.
    all_results_pred = Parallel(n_jobs=n_jobs, max_nbytes=None,
                                verbose=True)(
        delayed(_parallel_predict)(
            n_estimators_list[i],
            self.base_estimators[starts[i]:starts[i + 1]],
            self.approximators[starts[i]:starts[i + 1]],
            X,
            self.n_estimators,
            self.jl_transformers_[starts[i]:starts[i + 1]],
            self.approx_flags[starts[i]:starts[i + 1]],
            self.contamination,
            verbose=True)
        for i in range(n_jobs))

    if self.verbose:
        print('Parallel Label Predicting without Approximators Total Time:',
              time.time() - start)

    # Each worker returns its labels estimator-major; transpose and drop
    # them into the matching column range of the label matrix.
    predicted_labels = np.zeros([n_samples, self.n_estimators])
    for i in range(n_jobs):
        predicted_labels[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_pred[i]).T

    return predicted_labels
def predict_proba(self, X):
    """Predict, per base estimator, the probability of a sample being an outlier.

    The estimators are split into groups, each group is evaluated by a
    parallel worker (``_parallel_predict_proba``), and the per-group
    results are stitched back into one matrix. The conversion from raw
    scores to probabilities happens inside the worker and is not
    configurable here; this method takes no ``method`` argument.
    NOTE(review): values are presumably in [0, 1] -- confirm against
    ``_parallel_predict_proba``.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    outlier_probability : numpy array of shape (n_samples, n_estimators)
        One column of outlier probabilities per base estimator, in
        estimator order.
    """
    X = check_array(X)
    n_samples = X.shape[0]

    # Choose the work split: cost-aware scheduling when BPS is enabled,
    # otherwise the equal split provided by sklearn.
    if not self.bps_flag:
        group_sizes, starts, n_jobs = _partition_estimators(
            self.n_estimators, self.n_jobs)
    else:
        # the pre-trained cost predictor forecasts the per-estimator cost
        predictor = joblib.load(self.cost_forecast_loc_pred_)
        predicted_cost = cost_forecast_meta(
            predictor, X, self.base_estimator_names)
        group_sizes, starts, n_jobs = balanced_scheduling(
            predicted_cost, self.n_estimators, self.n_jobs)

    if self.verbose:
        print('Parallel score prediction...')
    start = time.time()

    # Estimator index range assigned to each worker.
    spans = [slice(starts[k], starts[k + 1]) for k in range(n_jobs)]
    proba_job = delayed(_parallel_predict_proba)

    # max_nbytes=None works around a joblib bug on Windows
    # (https://github.com/joblib/joblib/issues/806); it can be dropped
    # on other operating systems.
    runner = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)
    group_results = runner(
        proba_job(
            group_sizes[k],
            self.base_estimators[span],
            self.approximators[span],
            X,
            self.n_estimators,
            self.jl_transformers_[span],
            self.approx_flags[span],
            verbose=True)
        for k, span in enumerate(spans))

    if self.verbose:
        elapsed = time.time() - start
        print('Parallel Score Prediction without Approximators Total Time:',
              elapsed)

    # Workers return estimator-major results; transpose each one into its
    # column range of the output matrix.
    proba_matrix = np.zeros([n_samples, self.n_estimators])
    for k, span in enumerate(spans):
        proba_matrix[:, span] = np.asarray(group_results[k]).T

    return proba_matrix