Пример #1
0
def _create_bmr_model(model, X_val, y_val, calibration=True):
    """Fit a BayesMinimumRiskClassifier on the model's validation probabilities.

    Parameters
    ----------
    model : object
        Already-fitted estimator exposing ``predict_proba``.
    X_val : array-like
        Validation features passed to ``model.predict_proba``.
    y_val : array-like
        Validation labels handed to the BMR classifier's ``fit``.
    calibration : bool, default True
        Forwarded unchanged to ``BayesMinimumRiskClassifier``.

    Returns
    -------
    tuple
        ``(model, bmr)`` -- the untouched input model and the fitted BMR
        classifier.
    """
    # Probabilities are computed first, then used to fit the BMR wrapper.
    val_proba = model.predict_proba(X_val)

    risk_classifier = BayesMinimumRiskClassifier(calibration=calibration)
    risk_classifier.fit(y_val, val_proba)

    return model, risk_classifier
Пример #2
0
def cost_sensitive_classification(model, X_train, X_test, y_train, y_test, cost_mat_test):
    """Apply Bayes-minimum-risk decisions on top of a fitted probabilistic model.

    Parameters
    ----------
    model : object
        Already-fitted estimator exposing ``predict_proba``.
    X_train, y_train : array-like
        Not used by this function; kept so existing callers keep working.
    X_test : array-like
        Features the model is scored on.
    y_test : array-like
        True labels for ``X_test``.
    cost_mat_test : array-like
        Cost matrix passed to ``BayesMinimumRiskClassifier.predict``.

    Returns
    -------
    tuple
        ``(c_accuracy, y_pred_test_c_model)`` -- the accuracy of the
        cost-sensitive predictions against ``y_test`` and the predictions
        themselves.
    """
    c_model = BayesMinimumRiskClassifier()
    y_prob_test = model.predict_proba(X_test)

    # NOTE(review): the BMR model is fitted on the very test labels it is then
    # evaluated against, so the reported accuracy is optimistic. Fitting on a
    # held-out calibration set would avoid this -- confirm intent with callers
    # before changing it.
    c_model.fit(y_test, y_prob_test)

    y_pred_test_c_model = c_model.predict(y_prob_test, cost_mat_test)
    c_accuracy = accuracy_score(y_test, y_pred_test_c_model)

    return c_accuracy, y_pred_test_c_model
Пример #3
0
# --- Variant 1: random forest probabilities, BMR without calibration -------
# X_train, X_test, y_train, y_test, cost_matrix and data are defined before
# this excerpt.
clf = RandomForestClassifier(random_state=0, n_estimators=100)
model = clf.fit(X_train, y_train)
prob_test = model.predict_proba(X_test)
# calibration=False: the BMR classifier is used directly, without calling fit.
bmr = BayesMinimumRiskClassifier(calibration=False)
pred_test = bmr.predict(prob_test, cost_matrix)
print(classification_report(y_test, pred_test, target_names=data.target_names))
loss = cost_loss(y_test, pred_test, cost_matrix)
# "%d" prints the loss as an integer; the "\n" adds an extra blank line.
print("%d\n" % loss)
print(confusion_matrix(y_test, pred_test).T)  # transpose to align with slides

# --- Variant 2: BMR with calibration=True, fitted on the training set ------
print("costcla calibration on training set")
clf = RandomForestClassifier(random_state=0, n_estimators=100)
model = clf.fit(X_train, y_train)
# The BMR model is fitted on the training labels and the forest's
# probabilities for those same training rows.
prob_train = model.predict_proba(X_train)
bmr = BayesMinimumRiskClassifier(calibration=True)
bmr.fit(y_train, prob_train)
prob_test = model.predict_proba(X_test)
pred_test = bmr.predict(prob_test, cost_matrix)
print(classification_report(y_test, pred_test, target_names=data.target_names))
loss = cost_loss(y_test, pred_test, cost_matrix)
print("%d\n" % loss)
print(confusion_matrix(y_test, pred_test).T)  # transpose to align with slides

# --- Variant 3: scikit-learn sigmoid calibration, BMR without calibration --
print("\nsigmoid calibration")
clf = RandomForestClassifier(random_state=0, n_estimators=100)
# The forest is wrapped in CalibratedClassifierCV (sigmoid method, cv=3), so
# the probabilities come from the wrapper rather than the raw forest.
cc = CalibratedClassifierCV(clf, method="sigmoid", cv=3)
model = cc.fit(X_train, y_train)
prob_test = model.predict_proba(X_test)
bmr = BayesMinimumRiskClassifier(calibration=False)
pred_test = bmr.predict(prob_test, cost_matrix)
# NOTE(review): unlike the two variants above, no cost loss or confusion
# matrix is printed within this excerpt -- the snippet may be truncated here.
print(classification_report(y_test, pred_test, target_names=data.target_names))
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """
    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 combination='majority_voting',
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        """Store the ensemble hyper-parameters; no validation happens here."""
        super(BaseBagging, self).__init__(base_estimator=base_estimator,
                                          n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.combination = combination
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y, cost_mat, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem
            Where the columns represents the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        sample_weight : array-like, shape = [n_samples] or None
            Accepted for signature compatibility only: this method never reads
            it, so it has no effect on the fitted ensemble.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the resolved ``max_samples`` is not in ``(0, n_samples]`` or the
            resolved ``max_features`` is not in ``(0, n_features]``.
        """
        random_state = check_random_state(self.random_state)

        # X and y are not converted/validated here: per the original note,
        # check_X_y is not available in scikit-learn 0.15.

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        # Integers are absolute counts; anything else is treated as a fraction.
        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * n_samples)

        if not (0 < max_samples <= n_samples):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        # Free allocated memory, if any
        self.estimators_ = None

        # Parallel loop: each job builds its share of the estimators using its
        # own slice of the pre-drawn seeds.
        n_jobs, n_estimators, starts = _partition_estimators(
            self.n_estimators, self.n_jobs)
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_estimators)(n_estimators[i],
                                                self,
                                                X,
                                                y,
                                                cost_mat,
                                                seeds[starts[i]:starts[i + 1]],
                                                verbose=self.verbose)
            for i in range(n_jobs))

        # Reduce: every job returns (estimators, samples, features); flatten
        # each component across jobs.
        self.estimators_ = list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_samples_ = list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.estimators_features_ = list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        # Must run before the stacking model is fitted: it sets
        # ``estimators_weight_``, which ``_fit_stacking_model`` reads.
        self._evaluate_oob_savings(X, y, cost_mat)

        if self.combination in ('stacking', 'stacking_proba', 'stacking_bmr',
                                'stacking_proba_bmr'):
            self._fit_stacking_model(X, y, cost_mat)

        if self.combination in ('majority_bmr', 'weighted_bmr', 'stacking_bmr',
                                'stacking_proba_bmr'):
            self._fit_bmr_model(X, y)

        return self

    def _fit_bmr_model(self, X, y):
        """Private function used to fit the BayesMinimumRisk model.

        The model is fitted on ``y`` and this ensemble's own ``predict_proba``
        output for ``X`` and stored as ``self.f_bmr``. Returns self.
        """
        self.f_bmr = BayesMinimumRiskClassifier()
        X_bmr = self.predict_proba(X)
        self.f_bmr.fit(y, X_bmr)
        return self

    def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
        """Private function used to fit the stacking model.

        Fits a CostSensitiveLogisticRegression on the stacking set built from
        the fitted estimators and stores it as ``self.f_staking`` (the
        attribute name is kept as-is because other code may read it).
        Returns self.
        """
        self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                         max_iter=max_iter)
        X_stacking = _create_stacking_set(self.estimators_,
                                          self.estimators_features_,
                                          self.estimators_weight_, X,
                                          self.combination)
        self.f_staking.fit(X_stacking, y, cost_mat)
        return self

    # TODO: _evaluate_oob_savings in parallel
    def _evaluate_oob_savings(self, X, y, cost_mat):
        """Private function used to calculate the OOB Savings of each estimator.

        Each estimator's weight is its savings score on the rows it was not
        trained on, floored at 0, then normalised so the weights sum to 1.
        The result is stored in ``self.estimators_weight_`` as a plain list in
        every case. Returns self.
        """
        estimators_weight = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):
            # ``samples`` is inverted and used as a row index, so it is
            # presumably a boolean mask of the rows drawn for this estimator.
            oob_mask = ~samples

            if np.any(oob_mask):
                # Score only on the out-of-bag rows.
                oob_pred = estimator.predict((X[oob_mask])[:, features])
                score = savings_score(y[oob_mask], oob_pred,
                                      cost_mat[oob_mask])
            else:
                # Every example was used for training, so there is nothing
                # out-of-bag: fall back to the full training set.
                oob_pred = estimator.predict(X[:, features])
                score = savings_score(y, oob_pred, cost_mat)

            # Negative savings are treated as zero weight.
            estimators_weight.append(max(0, score))

        total_weight = sum(estimators_weight)
        if total_weight == 0:
            # All weights are 0: fall back to uniform weights. Converted to a
            # list so the attribute has the same type as in the branch below.
            n_weights = len(estimators_weight)
            self.estimators_weight_ = (np.ones(n_weights) /
                                       n_weights).tolist()
        else:
            self.estimators_weight_ = (np.array(estimators_weight) /
                                       total_weight).tolist()

        return self

    def _validate_y(self, y):
        """Return ``y`` as a 1-d array (default implementation)."""
        return column_or_1d(y, warn=True)
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 combination='majority_voting',
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        """Store the ensemble hyper-parameters; no validation happens here."""
        super(BaseBagging, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.combination = combination
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y, cost_mat, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem
            Where the columns represents the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        sample_weight : array-like, shape = [n_samples] or None
            Accepted for signature compatibility only: this method never reads
            it, so it has no effect on the fitted ensemble.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the resolved ``max_samples`` is not in ``(0, n_samples]`` or the
            resolved ``max_features`` is not in ``(0, n_features]``.
        """
        random_state = check_random_state(self.random_state)

        # X and y are not converted/validated here: per the original note,
        # check_X_y is not available in scikit-learn 0.15.

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        # Integers are absolute counts; anything else is treated as a fraction.
        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * n_samples)

        if not (0 < max_samples <= n_samples):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        # Free allocated memory, if any
        self.estimators_ = None

        # Parallel loop: each job builds its share of the estimators using its
        # own slice of the pre-drawn seeds.
        n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
                                                             self.n_jobs)
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                cost_mat,
                seeds[starts[i]:starts[i + 1]],
                verbose=self.verbose)
            for i in range(n_jobs))

        # Reduce: every job returns (estimators, samples, features); flatten
        # each component across jobs.
        self.estimators_ = list(itertools.chain.from_iterable(
            t[0] for t in all_results))
        self.estimators_samples_ = list(itertools.chain.from_iterable(
            t[1] for t in all_results))
        self.estimators_features_ = list(itertools.chain.from_iterable(
            t[2] for t in all_results))

        # Must run before the stacking model is fitted: it sets
        # ``estimators_weight_``, which ``_fit_stacking_model`` reads.
        self._evaluate_oob_savings(X, y, cost_mat)

        if self.combination in ('stacking', 'stacking_proba', 'stacking_bmr',
                                'stacking_proba_bmr'):
            self._fit_stacking_model(X, y, cost_mat)

        if self.combination in ('majority_bmr', 'weighted_bmr', 'stacking_bmr',
                                'stacking_proba_bmr'):
            self._fit_bmr_model(X, y)

        return self

    def _fit_bmr_model(self, X, y):
        """Private function used to fit the BayesMinimumRisk model.

        The model is fitted on ``y`` and this ensemble's own ``predict_proba``
        output for ``X`` and stored as ``self.f_bmr``. Returns self.
        """
        self.f_bmr = BayesMinimumRiskClassifier()
        X_bmr = self.predict_proba(X)
        self.f_bmr.fit(y, X_bmr)
        return self

    def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
        """Private function used to fit the stacking model.

        Fits a CostSensitiveLogisticRegression on the stacking set built from
        the fitted estimators and stores it as ``self.f_staking`` (the
        attribute name is kept as-is because other code may read it).
        Returns self.
        """
        self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                         max_iter=max_iter)
        X_stacking = _create_stacking_set(self.estimators_, self.estimators_features_,
                                          self.estimators_weight_, X, self.combination)
        self.f_staking.fit(X_stacking, y, cost_mat)
        return self

    # TODO: _evaluate_oob_savings in parallel
    def _evaluate_oob_savings(self, X, y, cost_mat):
        """Private function used to calculate the OOB Savings of each estimator.

        Each estimator's weight is its savings score on the rows it was not
        trained on, floored at 0, then normalised so the weights sum to 1.
        The result is stored in ``self.estimators_weight_`` as a plain list in
        every case. Returns self.
        """
        estimators_weight = []
        for estimator, samples, features in zip(self.estimators_, self.estimators_samples_,
                                                self.estimators_features_):
            # ``samples`` is inverted and used as a row index, so it is
            # presumably a boolean mask of the rows drawn for this estimator.
            oob_mask = ~samples

            if np.any(oob_mask):
                # Score only on the out-of-bag rows.
                oob_pred = estimator.predict((X[oob_mask])[:, features])
                score = savings_score(y[oob_mask], oob_pred, cost_mat[oob_mask])
            else:
                # Every example was used for training, so there is nothing
                # out-of-bag: fall back to the full training set.
                oob_pred = estimator.predict(X[:, features])
                score = savings_score(y, oob_pred, cost_mat)

            # Negative savings are treated as zero weight.
            estimators_weight.append(max(0, score))

        total_weight = sum(estimators_weight)
        if total_weight == 0:
            # All weights are 0: fall back to uniform weights. Converted to a
            # list so the attribute has the same type as in the branch below.
            n_weights = len(estimators_weight)
            self.estimators_weight_ = (np.ones(n_weights) / n_weights).tolist()
        else:
            self.estimators_weight_ = (np.array(estimators_weight) / total_weight).tolist()

        return self

    def _validate_y(self, y):
        """Return ``y`` as a 1-d array (default implementation)."""
        return column_or_1d(y, warn=True)
                        data.target,
                        data.cost_mat,
                        test_size=0.33,
                        random_state=10)
# Unpack the train/test split; the cost matrices are split alongside X and y.
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

# Fit the forest once and reuse it for both hard predictions and
# probabilities; with random_state=0 a second fit on the same data would
# reproduce the same model.
rf_model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# --- Baseline: plain random forest -----------------------------------------
# NOTE: roc_curve is given hard class predictions here (and below), not
# scores, so each AUC is computed from those predicted labels.
y_pred_test_rf = rf_model.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_rf)
print('The auc_score of RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

# --- Random forest + Bayes minimum risk ------------------------------------
y_prob_test = rf_model.predict_proba(X_test)

f_bmr = BayesMinimumRiskClassifier(calibration=True)
# NOTE(review): the BMR model is fitted on the test labels it is evaluated
# against below, so this score is optimistic -- confirm this is intended.
f_bmr.fit(y_test, y_prob_test)
y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_bmr)
print(
    'The auc_score of using RandomForest and BayesMinimumRiskClassifier is {:.2f}'
    .format(metrics.auc(fpr, tpr)))
print('*' * 90)

# --- Cost-sensitive logistic regression ------------------------------------
f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
# Score the predictions computed just above; the previous code referenced an
# undefined name (y_pred_test_lr) and raised NameError.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of CostSensitiveLogisticRegression is {:.2f}'.format(
    metrics.auc(fpr, tpr)))
print('*' * 90)
Пример #7
0
            # Probability calibration using Isotonic Method
            cc = CalibratedClassifierCV(clf, method="isotonic", cv=3)
            model = cc.fit(data, target)
            prob_test = model.predict_proba(X_test)
            bmr = BayesMinimumRiskClassifier(calibration=False)
            prediction = bmr.predict(prob_test, cost_matrix)
            loss = cost_loss(y_test[:, e], prediction, cost_matrix)
            pred_BR.append(prediction)
            cost_BR.append(loss)
            
        elif cm == 2:
            # Probability calibration using CostCla calibration            
            model = clf.fit(data, target)
            prob_train = model.predict_proba(data)
            bmr = BayesMinimumRiskClassifier(calibration=True)
            bmr.fit(target, prob_train)
            prob_test = model.predict_proba(X_test)
            prediction = bmr.predict(prob_test, cost_matrix)
            loss = cost_loss(y_test[:, e], prediction, cost_matrix)
            pred_BR.append(prediction)
            cost_BR.append(loss)

        elif cm == 3:
            # Cost minimization using class weighting
            clf = LogisticRegression(C=12.0, class_weight={0: 1, 1: cost})
            model = clf.fit(data, target)
            prediction = model.predict(X_test)
            loss = cost_loss(y_test[:, e], prediction, cost_matrix)
            pred_BR.append(prediction)
            cost_BR.append(loss)