def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
    """Private function used to fit the stacking model."""
    self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                     max_iter=max_iter)
    X_stacking = _create_stacking_set(self.estimators_,
                                      self.estimators_features_,
                                      self.estimators_weight_,
                                      X, self.combination)
    self.f_staking.fit(X_stacking, y, cost_mat)
    return self
def train(class_index):
    # Load the raw training documents and build TF-IDF features.
    docs_bin = read_train("../Data/train-data.dat")
    X_train = tfIdf(docs_bin)
    # Binary labels for the given class, plus the example-dependent cost matrix.
    y_train = load_labels("../Data/train-label.dat", class_index)
    cost_mat_train = calculate_cost_matrix(y_train)
    # Fit one cost-sensitive logistic regression for this class.
    f = CostSensitiveLogisticRegression()
    f.fit(X_train, y_train, cost_mat_train)
    return f
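calculate_cost_matrix above is a project-specific helper whose definition is not shown. A minimal sketch of one plausible implementation, assuming the costcla layout where the four columns hold the per-example costs of false positives, false negatives, true positives and true negatives, and where misses on a rare class are penalized by inverse class frequency:

import numpy as np

def calculate_cost_matrix(y):
    # Hypothetical sketch: one row of [C_FP, C_FN, C_TP, C_TN] per example.
    n = len(y)
    fn_cost = n / max(1, np.sum(y))  # rarer positive class -> costlier miss
    cost_mat = np.ones((n, 4))
    cost_mat[:, 1] = fn_cost  # column 1 holds the false-negative cost
    cost_mat[:, 2:] = 0.0     # correct predictions cost nothing
    return cost_mat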
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 combination='majority_voting',
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(BaseBagging, self).__init__(base_estimator=base_estimator,
                                          n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.combination = combination
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y, cost_mat, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem, where the columns
            represent the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        # X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])  # Not in sklearn version 0.15

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        # Free allocated memory, if any
        self.estimators_ = None

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
                                                             self.n_jobs)
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                cost_mat,
                seeds[starts[i]:starts[i + 1]],
                verbose=self.verbose)
            for i in range(n_jobs))

        # Reduce
        self.estimators_ = list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_samples_ = list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.estimators_features_ = list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        self._evaluate_oob_savings(X, y, cost_mat)

        if self.combination in ['stacking', 'stacking_proba', 'stacking_bmr',
                                'stacking_proba_bmr']:
            self._fit_stacking_model(X, y, cost_mat)

        if self.combination in ['majority_bmr', 'weighted_bmr', 'stacking_bmr',
                                'stacking_proba_bmr']:
            self._fit_bmr_model(X, y)

        return self

    def _fit_bmr_model(self, X, y):
        """Private function used to fit the BayesMinimumRisk model."""
        self.f_bmr = BayesMinimumRiskClassifier()
        X_bmr = self.predict_proba(X)
        self.f_bmr.fit(y, X_bmr)
        return self

    def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
        """Private function used to fit the stacking model."""
        self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                         max_iter=max_iter)
        X_stacking = _create_stacking_set(self.estimators_,
                                          self.estimators_features_,
                                          self.estimators_weight_,
                                          X, self.combination)
        self.f_staking.fit(X_stacking, y, cost_mat)
        return self

    # TODO: _evaluate_oob_savings in parallel
    def _evaluate_oob_savings(self, X, y, cost_mat):
        """Private function used to calculate the OOB savings of each estimator."""
        estimators_weight = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):
            # Test if all examples were used for training
            if not np.any(~samples):
                # Then use the training set
                oob_pred = estimator.predict(X[:, features])
                oob_savings = max(0, savings_score(y, oob_pred, cost_mat))
            else:
                # Then use the OOB set
                oob_pred = estimator.predict((X[~samples])[:, features])
                oob_savings = max(0, savings_score(y[~samples], oob_pred,
                                                   cost_mat[~samples]))

            estimators_weight.append(oob_savings)

        # Control the case where all weights are 0
        if sum(estimators_weight) == 0:
            self.estimators_weight_ = (np.ones(len(estimators_weight)) /
                                       len(estimators_weight))
        else:
            self.estimators_weight_ = (np.array(estimators_weight) /
                                       sum(estimators_weight)).tolist()

        return self

    def _validate_y(self, y):
        # Default implementation
        return column_or_1d(y, warn=True)
cost_mat_train, cost_mat_test = cost_mat[:ratio], cost_mat[ratio:]
y_train, y_test = np.argmax(y_train, axis=1), np.argmax(y_test, axis=1)
print(y_train.shape, y_test.shape)

# random forest
rfc = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred_test_rf = rfc.predict(x_test)
print(evaluate(y_pred_test_rf, y_test, cost_mat_test))

# logistic regression
lr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred_test_lr = lr.predict(x_test)
print(evaluate(y_pred_test_lr, y_test, cost_mat_test))

# cost-sensitive decision tree
CSDT = CostSensitiveDecisionTreeClassifier().fit(x_train, y_train, cost_mat_train)
y_pred_test_csdt = CSDT.predict(x_test)
print(evaluate(y_pred_test_csdt, y_test, cost_mat_test))

# cost-sensitive logistic regression
CSLR = CostSensitiveLogisticRegression()
CSLR.fit(x_train, y_train, cost_mat_train)
y_pred_test_cslr = CSLR.predict(x_test)
print(evaluate(y_pred_test_cslr, y_test, cost_mat_test))
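evaluate is a project-specific helper whose definition is not shown here. A plausible minimal sketch, assuming it pairs a cost-blind metric with costcla's cost-aware savings_score:

from costcla.metrics import savings_score
from sklearn.metrics import accuracy_score

def evaluate(y_pred, y_true, cost_mat):
    # Hypothetical helper: report plain accuracy alongside the savings score,
    # which measures cost reduction relative to a trivial all-0/all-1 baseline.
    return {'accuracy': accuracy_score(y_true, y_pred),
            'savings': savings_score(y_true, y_pred, cost_mat)}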
print('The auc_score of RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

y_prob_test = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict_proba(X_test)
f_bmr = BayesMinimumRiskClassifier(calibration=True)
f_bmr.fit(y_test, y_prob_test)
y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_bmr)
print('The auc_score of using RandomForest and BayesMinimumRiskClassifier is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of CostSensitiveLogisticRegression is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveDecisionTreeClassifier()
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_csdt = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_csdt)
print('The auc_score of using CostSensitiveDecisionTreeClassifier is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)
print('The auc_score of using only RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_rf_t)
print('The auc_score of using RandomForest and ThresholdingOptimization is {:.2f}'.format(metrics.auc(fpr, tpr)))

# CostSensitiveLogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from costcla.datasets import load_creditscoring2
from costcla.models import CostSensitiveLogisticRegression
from costcla.metrics import savings_score

data = load_creditscoring2()
sets = train_test_split(data.data, data.target, data.cost_mat,
                        test_size=0.33, random_state=44)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

y_pred_test_lr = LogisticRegression(solver='lbfgs').fit(X_train, y_train).predict(X_test)

f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)

fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_lr)
print('The auc_score of using only LogisticRegression is {:.2f}'.format(metrics.auc(fpr, tpr)))
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of using CostSensitiveLogisticRegression is {:.2f}'.format(metrics.auc(fpr, tpr)))

# CostSensitiveDecisionTreeClassifier
# example-dependent
'''
1. While growing the tree, cost sensitivity enters mainly through the impurity,
   Ic(S) = min(Cost(f0(S)), Cost(f1(S))), which is plugged into the gain
   Gain(xj, lj), so the best split feature is chosen in a cost-sensitive way.
2. During pruning, the cost after deleting a node is computed.
3. Prediction is cost-sensitive as well: for each leaf, after training, the
   cost of predicting all its examples as 0 is compared with predicting all
   as 1, and the leaf predicts whichever class has the lower cost.
'''
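To make point 1 concrete, here is a minimal sketch of the leaf-cost impurity Ic(S) = min(Cost(f0(S)), Cost(f1(S))); the helper names are assumed for illustration, not costcla's actual internals:

import numpy as np

def leaf_cost(y, cost_mat, prediction):
    # Cost of labeling every example in the node as `prediction`,
    # given per-example costs [C_FP, C_FN, C_TP, C_TN].
    if prediction == 0:
        # y==1 incurs a false negative (col 1); y==0 a true negative (col 3)
        return np.sum(np.where(y == 1, cost_mat[:, 1], cost_mat[:, 3]))
    # y==1 incurs a true positive (col 2); y==0 a false positive (col 0)
    return np.sum(np.where(y == 1, cost_mat[:, 2], cost_mat[:, 0]))

def cost_impurity(y, cost_mat):
    # Ic(S) = min(Cost(f0(S)), Cost(f1(S)))
    return min(leaf_cost(y, cost_mat, 0), leaf_cost(y, cost_mat, 1))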
# calculate the number of positives in each class
num = {}
loss = []
loss_max = 0
losses = np.zeros((train.shape[0], 1))

for i, j in enumerate(col):
    print('===Fit ' + j)
    num[j] = train[j].sum()
    print('The # of ' + j + ' is ' + str(num[j]))

    model = CostSensitiveLogisticRegression(C=3)
    fn = nrow_train / num[j]
    cost_mat = np.ones((nrow_train, 4))
    # cost_mat[:, 1] = fn
    model.fit(X[:nrow_train], train[j], cost_mat)

    preds[:, i] = model.predict_proba(X[nrow_train:])[:, 1]
    pred_train = model.predict_proba(X[:nrow_train])[:, 1]
    logloss = log_loss(train[j], pred_train)
    print('log loss:', logloss)
    print('Avg_loss:', logloss / num[j])
    loss.append(logloss)

# report the average log loss across the label columns
print('mean column-wise log loss:', np.mean(loss))
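As written, the loop trains against an all-ones cost matrix, so every outcome costs the same and the model is effectively cost-insensitive; the commented-out line is where class imbalance would be priced in. A minimal sketch of the cost-sensitive variant, assuming the same variables:

# Cost-sensitive variant (assumed): charge false negatives by inverse class frequency.
cost_mat = np.ones((nrow_train, 4))
cost_mat[:, 1] = nrow_train / num[j]  # column 1 = false-negative cost
cost_mat[:, 2:] = 0.0                 # no cost for correct predictions
model.fit(X[:nrow_train], train[j], cost_mat)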