Example #1
def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
    """Private function used to fit the stacking model."""
    self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose, max_iter=max_iter)
    # Build the meta-level training set from the fitted base estimators,
    # then fit a cost-sensitive logistic regression on top of it.
    X_stacking = _create_stacking_set(self.estimators_, self.estimators_features_,
                                      self.estimators_weight_, X, self.combination)
    self.f_staking.fit(X_stacking, y, cost_mat)
    return self
Example #2
def train(class_index):
    docs_bin = read_train("../Data/train-data.dat")
    X_train = tfIdf(docs_bin)

    y_train = load_labels("../Data/train-label.dat", class_index)

    cost_mat_train = calculate_cost_matrix(y_train)

    f = CostSensitiveLogisticRegression()
    f.fit(X_train, y_train, cost_mat_train)
    return f
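Example #2 depends on user-defined helpers (read_train, tfIdf, load_labels, calculate_cost_matrix) whose bodies are not shown. Below is a minimal sketch of what calculate_cost_matrix could look like, assuming costcla's convention that cost_mat has shape (n_samples, 4) with columns for false-positive, false-negative, true-positive and true-negative costs; the fp_cost and fn_cost defaults are illustrative placeholders, not the original author's values.

import numpy as np

def calculate_cost_matrix(y, fp_cost=1.0, fn_cost=5.0):
    """Hypothetical sketch of the helper: build the per-example cost matrix
    that costcla estimators expect, shape (n_samples, 4), with columns for
    false-positive, false-negative, true-positive and true-negative costs."""
    y = np.asarray(y)
    cost_mat = np.zeros((y.shape[0], 4))
    cost_mat[:, 0] = fp_cost  # every false positive costs fp_cost
    cost_mat[:, 1] = fn_cost  # every false negative costs fn_cost
    # correct predictions (columns 2 and 3) are left at zero cost
    return cost_mat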
Example #3
cost_mat_train, cost_mat_test = cost_mat[:ratio], cost_mat[ratio:]

y_train, y_test = np.argmax(y_train, axis=1), np.argmax(y_test, axis=1)

print(y_train.shape, y_test.shape)

#random forest
rfc = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred_test_rf = rfc.predict(x_test)

print(evaluate(y_pred_test_rf, y_test, cost_mat_test))

#logistic regression
lr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred_test_lr = lr.predict(x_test)

print(evaluate(y_pred_test_lr, y_test, cost_mat_test))

#cost-sensitive decision trees
CSDT = CostSensitiveDecisionTreeClassifier().fit(x_train, y_train,
                                                 cost_mat_train)
y_pred_test_csdt = CSDT.predict(x_test)

print(evaluate(y_pred_test_csdt, y_test, cost_mat_test))

#cost-sensitive lr
CSLR = CostSensitiveLogisticRegression()
CSLR.fit(x_train, y_train, cost_mat_train)
y_pred_test_cslr = CSLR.predict(x_test)

print(evaluate(y_pred_test_cslr, y_test, cost_mat_test))
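Example #3 reports results through a user-defined evaluate(y_pred, y_true, cost_mat) helper that is not shown. A minimal sketch of such a helper, assuming costcla's cost_loss and savings_score metrics; the returned dictionary layout is an assumption for illustration, not the original author's code.

from costcla.metrics import cost_loss, savings_score
from sklearn.metrics import f1_score

def evaluate(y_pred, y_true, cost_mat):
    # Hypothetical evaluate(): combine a standard quality metric with
    # costcla's example-dependent cost and savings scores.
    return {
        'f1': f1_score(y_true, y_pred),
        'total_cost': cost_loss(y_true, y_pred, cost_mat),
        'savings': savings_score(y_true, y_pred, cost_mat),
    }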
Example #4
# NOTE: fpr and tpr for the plain RandomForest are not computed in this snippet.
print('The auc_score of RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

y_prob_test = RandomForestClassifier(random_state=0).fit(
    X_train, y_train).predict_proba(X_test)

# Calibrate the RandomForest probabilities and make Bayes minimum-risk
# decisions against the test cost matrix.
f_bmr = BayesMinimumRiskClassifier(calibration=True)
f_bmr.fit(y_test, y_prob_test)
y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_bmr)
print(
    'The auc_score of using RandomForest and BayesMinimumRiskClassifier is {:.2f}'
    .format(metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of CostSensitiveLogisticRegression is {:.2f}'.format(
    metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveDecisionTreeClassifier()
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_csdt = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_csdt)
print('The auc_score of using CostSensitiveDecisionTreeClassifier is {:.2f}'.
      format(metrics.auc(fpr, tpr)))
print('*' * 90)
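The comparison above ranks the models by ROC AUC computed from hard class predictions. costcla also provides cost-aware metrics that use the test cost matrix directly; a short sketch of scoring the same predictions that way, assuming the y_test, y_pred_test_* and cost_mat_test arrays from above are still in scope.

from costcla.metrics import cost_loss, savings_score

# Cost-based comparison of the three sets of test predictions from above.
for name, y_pred in [('RF + BMR', y_pred_test_bmr),
                     ('CSLR', y_pred_test_cslr),
                     ('CSDT', y_pred_test_csdt)]:
    print('{}: total cost = {:.2f}, savings = {:.2f}'.format(
        name,
        cost_loss(y_test, y_pred, cost_mat_test),
        savings_score(y_test, y_pred, cost_mat_test)))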
Example #5
# count the number of positive examples for each label
num = {}

loss = []
loss_max = 0
losses = np.zeros((train.shape[0], 1))

for i, j in enumerate(col):
    print('===Fit ' + j)
    num[j] = train[j].sum()
    print('The # of ' + j + ' is ' + str(num[j]))
    model = CostSensitiveLogisticRegression(C=3)
    fn = nrow_train / num[j]  # inverse frequency of the positive label
    cost_mat = np.ones((nrow_train, 4))  # costcla cost matrix: fp, fn, tp, tn costs per row
    #cost_mat[:, 1] = fn
    model.fit(X[:nrow_train], train[j], cost_mat)
    preds[:, i] = model.predict_proba(X[nrow_train:])[:, 1]  # test-set positive-class probabilities
    pred_train = model.predict_proba(X[:nrow_train])[:, 1]
    logloss = log_loss(train[j], pred_train)
    print('log loss:', logloss)
    print('Avg_loss:', logloss / num[j])
    loss.append(logloss)

    # calculate the log loss of each pair

print('mean column-wise log loss:', np.mean(loss))
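Example #5 computes fn = nrow_train / num[j] (the inverse frequency of the positive label) but leaves the corresponding cost-matrix assignment commented out. A minimal sketch of that weighting as a standalone helper; the name imbalance_cost_matrix and the zeroed costs for correct predictions are assumptions for illustration, not part of the original kernel.

import numpy as np

def imbalance_cost_matrix(n_samples, n_positive):
    # Hypothetical helper: unit false-positive cost, inverse-frequency
    # false-negative cost, and zero cost for correct predictions.
    cost_mat = np.ones((n_samples, 4))
    cost_mat[:, 1] = n_samples / n_positive
    cost_mat[:, 2:] = 0.0
    return cost_mat

# Inside the loop it would replace the all-ones matrix:
# cost_mat = imbalance_cost_matrix(nrow_train, num[j])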