class RC30(ClassifierMixin, BaseEstimator):
    """Random forest whose class-1 scores are calibrated on a hold-out split.

    ``fit`` splits the training data 70/30, fits a ``RandomForestClassifier``
    on the 70% part and then fits a calibrator that maps the forest's
    column-1 probabilities on the remaining 30% to the held-out labels.

    Parameters
    ----------
    n_estimators : int, default=30
        Forwarded to ``RandomForestClassifier``.
    max_depth : int, default=3
        Forwarded to ``RandomForestClassifier``.
    min_samples_split : int, default=2
        Forwarded to ``RandomForestClassifier``.
    min_samples_leaf : int, default=1
        Forwarded to ``RandomForestClassifier``.
    ctype : {"isotonic", "logistic"}, default="isotonic"
        Which calibrator to fit on the held-out forest scores.

    Attributes
    ----------
    model : RandomForestClassifier
        The forest fitted on the 70% split (set by ``fit``).
    calibrator : LogisticRegression or IsotonicRegression
        The calibrator fitted on the 30% split (set by ``fit``).
    is_fitted_ : bool
        Set to ``True`` at the end of ``fit``.

    Notes
    -----
    The code always reads probability column 1 and the isotonic calibrator is
    clipped to [0, 1], so labels are presumably binary and encoded as 0/1 --
    confirm against callers.
    """

    # Calibrator kinds understood by ``fit`` / ``predict_proba``.
    _CTYPES = ("logistic", "isotonic")

    def __init__(self,
                 n_estimators=30,
                 max_depth=3,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 ctype="isotonic"):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.ctype = ctype

    def _check_ctype(self):
        """Raise ``ValueError`` unless ``self.ctype`` is a supported calibrator."""
        if self.ctype not in self._CTYPES:
            raise ValueError(
                "ctype must be one of %r, got %r" % (self._CTYPES, self.ctype))

    def _forest_scores(self, X):
        """Return the forest's column-1 probabilities shaped for the calibrator.

        The logistic calibrator receives an ``(n, 1)`` feature matrix, the
        isotonic calibrator a 1-D vector of length ``n``.
        """
        proba = self.model.predict_proba(X)
        if self.ctype == "logistic":
            return proba[:, [1]]
        return proba[:, 1]

    def fit(self, X, y):
        """Fit the forest on a 70% split and the calibrator on the other 30%.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``ctype`` is not ``"logistic"`` or ``"isotonic"``.
        """
        X, y = check_X_y(X, y)
        # Fail before training: previously an unknown ctype left the estimator
        # without a calibrator and ``predict_proba`` silently returned None.
        self._check_ctype()

        self.model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf)

        if self.ctype == "logistic":
            # The huge C is the inverse regularisation strength, i.e. the
            # penalty is made negligible.
            self.calibrator = LogisticRegression(C=1e20, solver="lbfgs")
        else:
            self.calibrator = IsotonicRegression(
                y_min=0, y_max=1, out_of_bounds="clip")

        # No random_state is passed, so the split differs between calls.
        X0, X1, y0, y1 = train_test_split(X, y, test_size=0.3)
        self.model.fit(X0, y0)
        self.calibrator.fit(self._forest_scores(X1), y1)

        self.is_fitted_ = True
        return self

    def predict_proba(self, X):
        """Return calibrated class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, 2)
            For ``ctype="logistic"`` this is the calibrator's own
            ``predict_proba`` output; for ``ctype="isotonic"`` column 1 is the
            calibrated score and column 0 is its complement.

        Raises
        ------
        ValueError
            If ``ctype`` is not ``"logistic"`` or ``"isotonic"``.
        """
        X = check_array(X)
        check_is_fitted(self, 'is_fitted_')
        self._check_ctype()

        scores = self._forest_scores(X)
        if self.ctype == "logistic":
            return self.calibrator.predict_proba(scores)

        proba = np.zeros((X.shape[0], 2))
        proba[:, 1] = self.calibrator.predict(scores)
        proba[:, 0] = 1 - proba[:, 1]
        return proba

    def predict(self, X):
        """Return the index of the most probable column of ``predict_proba``.

        Needed because ``ClassifierMixin.score`` calls ``predict``. The result
        is a column index (0 or 1), which equals the class label only when the
        labels are encoded as 0/1.
        """
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)
class CaliForest(ClassifierMixin, BaseEstimator):
    """Bagged trees calibrated on their own out-of-bag predictions.

    ``fit`` trains ``n_estimators`` trees, each on a bootstrap resample of the
    rows, records every tree's column-1 probability for the rows it did not
    see (out-of-bag, OOB), and fits a calibrator that maps the per-row mean
    OOB score to the label. Rows are weighted by ``alpha / beta`` with

    * ``alpha = alpha0 + n_oob / 2``
    * ``beta  = beta0 + var_oob * n_oob / 2``

    where ``n_oob`` is the number of OOB predictions a row received and
    ``var_oob`` their variance, so rows whose OOB predictions disagree get a
    smaller weight.

    Parameters
    ----------
    n_estimators : int, default=300
        Number of trees.
    criterion : str, default="gini"
        Forwarded to each tree.
    max_depth : int, default=5
        Forwarded to each tree.
    min_samples_split : int, default=2
        Forwarded to each tree.
    min_samples_leaf : int, default=1
        Forwarded to each tree.
    ctype : {"isotonic", "logistic"}, default="isotonic"
        Which calibrator to fit on the mean OOB scores.
    alpha0 : float, default=100
        Constant term of ``alpha`` in the sample weight above.
    beta0 : float, default=25
        Constant term of ``beta`` in the sample weight above.

    Attributes
    ----------
    estimators : list
        The fitted trees (set by ``fit``).
    calibrator : object
        The fitted calibrator (set by ``fit``).
    is_fitted_ : bool
        Set to ``True`` at the end of ``fit``.

    Notes
    -----
    The code always reads probability column 1, the isotonic calibrator is
    clipped to [0, 1] and ``predict`` returns a column index, so labels are
    presumably binary and encoded as 0/1 -- confirm against callers.
    """

    # Calibrator kinds understood by ``fit`` / ``predict_proba``.
    _CTYPES = ("logistic", "isotonic")

    def __init__(self,
                 n_estimators=300,
                 criterion="gini",
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 ctype="isotonic",
                 alpha0=100,
                 beta0=25):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.ctype = ctype
        self.alpha0 = alpha0
        self.beta0 = beta0

    def _check_ctype(self):
        """Raise ``ValueError`` unless ``self.ctype`` is a supported calibrator."""
        if self.ctype not in self._CTYPES:
            raise ValueError(
                "ctype must be one of %r, got %r" % (self._CTYPES, self.ctype))

    def fit(self, X, y):
        """Fit the trees on bootstrap samples and the calibrator on OOB scores.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense input only (``accept_sparse=False``).
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``ctype`` is not ``"logistic"`` or ``"isotonic"``.
        """
        X, y = check_X_y(X, y, accept_sparse=False)
        # Fail before training: previously an unknown ctype left
        # ``calibrator`` as None and ``predict_proba`` returned [1, 0] for
        # every row.
        self._check_ctype()

        # "sqrt" is what "auto" meant for a classification tree; the "auto"
        # spelling is rejected by recent scikit-learn releases.
        # (Assumes ``Tree`` is sklearn's DecisionTreeClassifier -- confirm
        # against the file's imports.)
        self.estimators = [
            Tree(criterion=self.criterion,
                 max_depth=self.max_depth,
                 min_samples_split=self.min_samples_split,
                 min_samples_leaf=self.min_samples_leaf,
                 max_features="sqrt")
            for _ in range(self.n_estimators)
        ]

        if self.ctype == "logistic":
            # NOTE(review): newer scikit-learn spells this ``penalty=None``
            # and rejects the string, while older releases reject ``None``.
            # Left unchanged until the supported version range is confirmed.
            self.calibrator = LR(penalty="none",
                                 solver="saga",
                                 max_iter=5000)
        else:
            self.calibrator = Iso(y_min=0,
                                  y_max=1,
                                  out_of_bounds="clip")

        n = X.shape[0]
        # Y_oob[i, t]: tree t's score for row i, NaN where row i was in-bag.
        Y_oob = np.full((n, self.n_estimators), np.nan)
        # n_oob[i]: number of trees for which row i was out-of-bag.
        n_oob = np.zeros(n)
        in_bag = np.zeros((n, self.n_estimators), dtype=int)
        out_of_bag = np.full((n, self.n_estimators), True)

        # All bootstrap index sets are drawn up front, before any tree is
        # fitted; keep this order so the sequence of np.random calls is
        # unchanged.
        for eid in range(self.n_estimators):
            in_bag[:, eid] = np.random.choice(n, n)
            out_of_bag[in_bag[:, eid], eid] = False

        for eid, est in enumerate(self.estimators):
            ib_idx = in_bag[:, eid]
            oob_mask = out_of_bag[:, eid]
            est.fit(X[ib_idx, :], y[ib_idx])
            if not oob_mask.any():
                # The bootstrap happened to cover every row; there is nothing
                # to score and predicting on an empty array would fail.
                continue
            Y_oob[oob_mask, eid] = est.predict_proba(X[oob_mask, :])[:, 1]
            n_oob[oob_mask] += 1

        # Only rows with at least two OOB predictions take part in the
        # calibration.
        usable = n_oob > 1
        Y_oob_ = Y_oob[usable, :]
        n_oob_ = n_oob[usable]
        z_hat = np.nanmean(Y_oob_, axis=1)
        z_true = y[usable]

        beta = self.beta0 + np.nanvar(Y_oob_, axis=1) * n_oob_ / 2
        alpha = self.alpha0 + n_oob_ / 2
        z_weight = alpha / beta

        # The third positional argument is the per-sample weight.
        if self.ctype == "logistic":
            self.calibrator.fit(z_hat[:, np.newaxis], z_true, z_weight)
        else:
            self.calibrator.fit(z_hat, z_true, z_weight)

        self.is_fitted_ = True
        return self

    def predict_proba(self, X):
        """Return calibrated class probabilities.

        The column-1 probabilities of all trees are averaged and the mean is
        passed through the fitted calibrator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, 2)
            Column 1 is the calibrated score, column 0 its complement.

        Raises
        ------
        ValueError
            If ``ctype`` is not ``"logistic"`` or ``"isotonic"``.
        """
        X = check_array(X)
        check_is_fitted(self, 'is_fitted_')
        self._check_ctype()

        n = X.shape[0]
        n_est = len(self.estimators)
        z = np.zeros(n)
        y_mat = np.zeros((n, 2))
        for est in self.estimators:
            z += est.predict_proba(X)[:, 1]
        z /= n_est

        if self.ctype == "logistic":
            y_mat[:, 1] = self.calibrator.predict_proba(
                z[:, np.newaxis])[:, 1]
        else:
            y_mat[:, 1] = self.calibrator.predict(z)

        y_mat[:, 0] = 1 - y_mat[:, 1]
        return y_mat

    def predict(self, X):
        """Return the index of the most probable column of ``predict_proba``.

        The result is a column index (0 or 1), which equals the class label
        only when the labels are encoded as 0/1.
        """
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)