from costcla.models import BayesMinimumRiskClassifier

def _create_bmr_model(model, X_val, y_val, calibration=True):
    """Fit a Bayes minimum-risk wrapper on validation-set probabilities."""
    # Score the validation set with the already-fitted base model.
    y_hat_val_proba = model.predict_proba(X_val)
    # Calibrate the probabilities and learn the BMR decision rule.
    bmr = BayesMinimumRiskClassifier(calibration=calibration)
    bmr.fit(y_val, y_hat_val_proba)
    return model, bmr
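# A minimal usage sketch for the helper above, assuming data prepared
# elsewhere: X_train/y_train, a held-out X_val/y_val split, and a
# per-example cost_mat_test with columns [FP, FN, TP, TN]. All of these
# names are illustrative, not part of the original snippet.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf, bmr = _create_bmr_model(rf, X_val, y_val, calibration=True)
# The BMR wrapper turns calibrated probabilities plus costs into
# cost-minimizing class decisions.
y_pred_test = bmr.predict(rf.predict_proba(X_test), cost_mat_test)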
from sklearn.metrics import accuracy_score
from costcla.models import BayesMinimumRiskClassifier

def cost_sensitive_classification(model, X_train, X_test, y_train, y_test, cost_mat_test):
    # Wrap the fitted base model's probabilities in a Bayes minimum-risk rule.
    c_model = BayesMinimumRiskClassifier()
    y_prob_test = model.predict_proba(X_test)
    # Calibrate on the supplied labels (ideally a held-out validation set).
    c_model.fit(y_test, y_prob_test)
    # Pick the class that minimizes expected cost under cost_mat_test.
    y_pred_test_c_model = c_model.predict(y_prob_test, cost_mat_test)
    c_accuracy = accuracy_score(y_test, y_pred_test_c_model)
    return c_accuracy, y_pred_test_c_model
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from costcla.models import BayesMinimumRiskClassifier

def main():
    X_train, X_test, y_train, y_test = load_data(train=True, test_size=0.4)
    classifiers = {"RF": {"f": RandomForestClassifier()},
                   "DT": {"f": DecisionTreeClassifier()}}
    ci_models = ['DT', 'RF']

    # Fit the classifiers using the training dataset
    for model in classifiers.keys():
        classifiers[model]["f"].fit(X_train, y_train)
        classifiers[model]["c"] = classifiers[model]["f"].predict(X_test)
        classifiers[model]["p"] = classifiers[model]["f"].predict_proba(X_test)
        classifiers[model]["p_train"] = classifiers[model]["f"].predict_proba(X_train)

    measures = {"F1Score": f1_score, "Precision": precision_score,
                "Recall": recall_score, "Accuracy": accuracy_score}
    results = pd.DataFrame(columns=__labels__)

    for model in ci_models:
        classifiers[model + "-BMR"] = {"f": BayesMinimumRiskClassifier()}
        # Fit; calibration should really be done on a separate validation set
        classifiers[model + "-BMR"]["f"].fit(y_test, classifiers[model]["p"])
        # Predict, using the per-example cost matrix for the test split
        classifiers[model + "-BMR"]["c"] = classifiers[model + "-BMR"]["f"].predict(
            classifiers[model]["p"], cost_mat_test)
from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
from costcla.models import BayesMinimumRiskClassifier
from costcla.metrics import cost_loss

def bayesian_clas(train, test, val_train, val_test, auto_calibration=False,
                  calibration_func=None, clf=None, CostMatrix=None, CostMatrixTrain=None):
    """train/test are the training features/labels; val_train/val_test the validation ones."""
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    # Reuse the scaler fitted on the training data (refitting it here would leak).
    val_train = scaler.transform(val_train)
    if calibration_func is None:
        model = clf.fit(train, test)
    else:
        cc = CalibratedClassifierCV(clf, method=calibration_func, cv=3)
        model = cc.fit(train, test)
    prob_test = model.predict_proba(val_train)
    # Note: with auto_calibration=True, bmr.fit(labels, probs) must be called
    # before predict; the defaults here rely on calibration=False.
    bmr = BayesMinimumRiskClassifier(calibration=auto_calibration)
    pred_test = bmr.predict(prob_test, CostMatrix)
    prob_train = model.predict_proba(train)
    bmr_train = BayesMinimumRiskClassifier(calibration=auto_calibration)
    pred_train = bmr_train.predict(prob_train, CostMatrixTrain)
    print(classification_report(val_test, pred_test))
    loss = cost_loss(val_test, pred_test, CostMatrix)
    print("%d\n" % loss)
    print(confusion_matrix(val_test, pred_test).T)
    return pred_train, pred_test
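# An illustrative call to bayesian_clas, under the assumption that the cost
# matrices follow costcla's [FP, FN, TP, TN] column convention; every
# variable name below is hypothetical.
from sklearn.ensemble import RandomForestClassifier

pred_train, pred_val = bayesian_clas(
    X_train, y_train, X_val, y_val,
    auto_calibration=False,
    calibration_func='sigmoid',      # forwarded to CalibratedClassifierCV
    clf=RandomForestClassifier(random_state=0),
    CostMatrix=cost_mat_val,         # per-example costs for the validation set
    CostMatrixTrain=cost_mat_train)  # per-example costs for the training set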
cost_matrix = np.hstack((fp, fn, tp, tn))

print("no cost minimization")
clf = RandomForestClassifier(random_state=0, n_estimators=100)
model = clf.fit(X_train, y_train)
pred_test = model.predict(X_test)
print(classification_report(y_test, pred_test, target_names=data.target_names))
loss = cost_loss(y_test, pred_test, cost_matrix)
print("%d\n" % loss)
print(confusion_matrix(y_test, pred_test).T)  # transpose to align with slides

print("no calibration")
clf = RandomForestClassifier(random_state=0, n_estimators=100)
model = clf.fit(X_train, y_train)
prob_test = model.predict_proba(X_test)
bmr = BayesMinimumRiskClassifier(calibration=False)
pred_test = bmr.predict(prob_test, cost_matrix)
print(classification_report(y_test, pred_test, target_names=data.target_names))
loss = cost_loss(y_test, pred_test, cost_matrix)
print("%d\n" % loss)
print(confusion_matrix(y_test, pred_test).T)  # transpose to align with slides

print("costcla calibration on training set")
clf = RandomForestClassifier(random_state=0, n_estimators=100)
model = clf.fit(X_train, y_train)
prob_train = model.predict_proba(X_train)
bmr = BayesMinimumRiskClassifier(calibration=True)
bmr.fit(y_train, prob_train)
prob_test = model.predict_proba(X_test)
pred_test = bmr.predict(prob_test, cost_matrix)
print(classification_report(y_test, pred_test, target_names=data.target_names))
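# For context: the cost_matrix consumed above is a per-example array with one
# column each for false-positive, false-negative, true-positive and
# true-negative costs. A minimal sketch of how such columns might be built;
# the constant costs are made-up numbers, not from the original.
import numpy as np

n = len(y_test)
fp = np.full((n, 1), 4.0)    # assumed cost of a false positive
fn = np.full((n, 1), 10.0)   # assumed cost of a false negative
tp = np.zeros((n, 1))        # correct predictions assumed cost-free
tn = np.zeros((n, 1))
cost_matrix = np.hstack((fp, fn, tp, tn))  # shape: [n_samples, 4]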
def _fit_bmr_model(self, X, y):
    """Private function used to fit the BayesMinimumRisk model."""
    self.f_bmr = BayesMinimumRiskClassifier()
    X_bmr = self.predict_proba(X)
    self.f_bmr.fit(y, X_bmr)
    return self
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 combination='majority_voting',
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(BaseBagging, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.combination = combination
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y, cost_mat, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem, where the columns
            represent the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        # X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])  # Not in sklearn version 0.15

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        # Free allocated memory, if any
        self.estimators_ = None

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
                                                             self.n_jobs)
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                cost_mat,
                seeds[starts[i]:starts[i + 1]],
                verbose=self.verbose)
            for i in range(n_jobs))

        # Reduce
        self.estimators_ = list(itertools.chain.from_iterable(
            t[0] for t in all_results))
        self.estimators_samples_ = list(itertools.chain.from_iterable(
            t[1] for t in all_results))
        self.estimators_features_ = list(itertools.chain.from_iterable(
            t[2] for t in all_results))

        self._evaluate_oob_savings(X, y, cost_mat)

        if self.combination in ['stacking', 'stacking_proba', 'stacking_bmr',
                                'stacking_proba_bmr']:
            self._fit_stacking_model(X, y, cost_mat)

        if self.combination in ['majority_bmr', 'weighted_bmr', 'stacking_bmr',
                                'stacking_proba_bmr']:
            self._fit_bmr_model(X, y)

        return self

    def _fit_bmr_model(self, X, y):
        """Private function used to fit the BayesMinimumRisk model."""
        self.f_bmr = BayesMinimumRiskClassifier()
        X_bmr = self.predict_proba(X)
        self.f_bmr.fit(y, X_bmr)
        return self

    def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
        """Private function used to fit the stacking model."""
        self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                         max_iter=max_iter)
        X_stacking = _create_stacking_set(self.estimators_,
                                          self.estimators_features_,
                                          self.estimators_weight_,
                                          X, self.combination)
        self.f_staking.fit(X_stacking, y, cost_mat)
        return self

    # TODO: _evaluate_oob_savings in parallel
    def _evaluate_oob_savings(self, X, y, cost_mat):
        """Private function used to calculate the OOB savings of each estimator."""
        estimators_weight = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):
            # Test if all examples were used for training
            if not np.any(~samples):
                # Then use training
                oob_pred = estimator.predict(X[:, features])
                oob_savings = max(0, savings_score(y, oob_pred, cost_mat))
            else:
                # Then use OOB
                oob_pred = estimator.predict((X[~samples])[:, features])
                oob_savings = max(0, savings_score(y[~samples], oob_pred,
                                                   cost_mat[~samples]))

            estimators_weight.append(oob_savings)

        # Control in case where all weights are 0
        if sum(estimators_weight) == 0:
            self.estimators_weight_ = (np.ones(len(estimators_weight)) /
                                       len(estimators_weight))
        else:
            self.estimators_weight_ = (np.array(estimators_weight) /
                                       sum(estimators_weight)).tolist()

        return self

    def _validate_y(self, y):
        # Default implementation
        return column_or_1d(y, warn=True)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from costcla.models import BayesMinimumRiskClassifier, CostSensitiveLogisticRegression

sets = train_test_split(data.data, data.target, data.cost_mat,
                        test_size=0.33, random_state=10)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_rf)
print('The auc_score of RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

y_prob_test = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict_proba(X_test)
f_bmr = BayesMinimumRiskClassifier(calibration=True)
f_bmr.fit(y_test, y_prob_test)
y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_bmr)
print('The auc_score of RandomForest with BayesMinimumRiskClassifier is {:.2f}'.format(
    metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of CostSensitiveLogisticRegression is {:.2f}'.format(
    metrics.auc(fpr, tpr)))
print('*' * 90)