def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
    """Fit the cost-sensitive logistic regression used for stacking.

    A new ``CostSensitiveLogisticRegression`` is stored on
    ``self.f_staking`` and then fitted on the stacking set that
    ``_create_stacking_set`` builds from the fitted estimators, their
    feature subsets, their weights, ``X`` and ``self.combination``.

    Parameters
    ----------
    X : input samples, forwarded to ``_create_stacking_set``.
    y : targets, forwarded to the stacking model's ``fit``.
    cost_mat : cost matrix, forwarded to the stacking model's ``fit``.
    max_iter : int, default 100
        Forwarded to the ``CostSensitiveLogisticRegression`` constructor.

    Returns
    -------
    self
    """
    # The attribute name is spelled ``f_staking`` (sic); it is kept as-is
    # because code outside this method may read it.
    stacker = CostSensitiveLogisticRegression(
        verbose=self.verbose,
        max_iter=max_iter,
    )
    self.f_staking = stacker

    stacking_features = _create_stacking_set(
        self.estimators_,
        self.estimators_features_,
        self.estimators_weight_,
        X,
        self.combination,
    )
    stacker.fit(stacking_features, y, cost_mat)
    return self
def train(class_index):
    """Train a cost-sensitive logistic regression on the training files.

    The documents are read from ``../Data/train-data.dat`` and passed
    through ``tfIdf``; the labels come from ``../Data/train-label.dat``
    via ``load_labels``, and the cost matrix is derived from those labels
    with ``calculate_cost_matrix``.

    Parameters
    ----------
    class_index : forwarded to ``load_labels``
        (presumably selects which label column to train on -- confirm
        against ``load_labels``).

    Returns
    -------
    The fitted ``CostSensitiveLogisticRegression`` instance.
    """
    features = tfIdf(read_train("../Data/train-data.dat"))
    labels = load_labels("../Data/train-label.dat", class_index)
    costs = calculate_cost_matrix(labels)

    classifier = CostSensitiveLogisticRegression()
    classifier.fit(features, labels, costs)
    return classifier
# Split the cost matrix into train/test parts at row index `ratio`.
cost_mat_train, cost_mat_test = cost_mat[:ratio], cost_mat[ratio:]

# Collapse the 2-D label arrays to the index of the largest entry per row
# (presumably one-hot labels -> class indices; confirm against the loader).
y_train, y_test = np.argmax(y_train, axis=1), np.argmax(y_test, axis=1)

# print() is used as a function so this runs on Python 3 as well; the
# explicit format keeps the "shape shape" output of the old print statement
# identical on Python 2.
print('{} {}'.format(y_train.shape, y_test.shape))

# random forest
rfc = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred_test_rf = rfc.predict(x_test)
print(evaluate(y_pred_test_rf, y_test, cost_mat_test))

# logistic regression
lr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred_test_lr = lr.predict(x_test)
print(evaluate(y_pred_test_lr, y_test, cost_mat_test))

# cost-sensitive decision trees (fit additionally receives the training costs)
CSDT = CostSensitiveDecisionTreeClassifier().fit(x_train, y_train, cost_mat_train)
y_pred_test_csdt = CSDT.predict(x_test)
print(evaluate(y_pred_test_csdt, y_test, cost_mat_test))

# cost-sensitive logistic regression
CSLR = CostSensitiveLogisticRegression()
CSLR.fit(x_train, y_train, cost_mat_train)
y_pred_test_cslr = CSLR.predict(x_test)
print(evaluate(y_pred_test_cslr, y_test, cost_mat_test))
# Report the AUC for the ROC points (fpr, tpr) computed before this section.
print('The auc_score of RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

# Random forest probabilities post-processed by Bayes minimum risk.
y_prob_test = RandomForestClassifier(random_state=0).fit(
    X_train, y_train).predict_proba(X_test)
f_bmr = BayesMinimumRiskClassifier(calibration=True)
# NOTE(review): the BMR classifier is fitted on the same test labels and
# probabilities it is scored on below -- confirm this is intended.
f_bmr.fit(y_test, y_prob_test)
y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_bmr)
print(
    'The auc_score of using RandomForest and BayesMinimumRiskClassifieris{:.2f}'
    .format(metrics.auc(fpr, tpr)))
print('*' * 90)

# Cost-sensitive logistic regression.
f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
# Score the predictions of the model fitted just above; the previous code
# passed `y_pred_test_lr` here and so reported another model's AUC.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of CostSensitiveLogisticRegression is {:.2f}'.format(
    metrics.auc(fpr, tpr)))
print('*' * 90)

# Cost-sensitive decision tree; fitted once (it used to be fitted twice in a row).
f = CostSensitiveDecisionTreeClassifier()
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_csdt = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_csdt)
print('The auc_score of using CostSensitiveDecisionTreeClassifier is {:.2f}'.
      format(metrics.auc(fpr, tpr)))
print('*' * 90)
# Fit one cost-sensitive logistic regression per column in `col`, store the
# predicted probabilities for the rows after `nrow_train` in `preds`, and
# report the training log loss per column.
num = {}  # column name -> sum of that label column over the training rows
loss = []  # training log loss, one entry per column
loss_max = 0
losses = np.zeros((train.shape[0], 1))
for i, j in enumerate(col):
    print('===Fit ' + j)
    num[j] = train[j].sum()
    number = str(num[j])
    print('The # of' + j + ' is ' + number)

    model = CostSensitiveLogisticRegression(C=3)
    # Ratio of training rows to the column's label sum.
    # NOTE: currently unused -- every cost column stays at 1 because the
    # per-class weighting (`cost_mat[:, 1] = fn`) is disabled.
    fn = nrow_train / num[j]
    cost_mat = np.ones((nrow_train, 4))
    model.fit(X[:nrow_train], train[j], cost_mat)

    # Column 1 of predict_proba is written to `preds` for the remaining rows.
    preds[:, i] = model.predict_proba(X[nrow_train:])[:, 1]
    pred_train = model.predict_proba(X[:nrow_train])[:, 1]

    # Compute the training log loss once and reuse it for both the report
    # and the running list.
    logloss = log_loss(train[j], pred_train)
    print('log loss:', logloss)
    print('Avg_loss:', logloss / num[j])
    loss.append(logloss)

# Mean of the per-column training log losses.
print('mean column-wise log loss:', np.mean(loss))