示例#1
0
def run_classification_configuration(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold, trees, max_depth,
                                     min_child_weight, layer, cw=0.001):
    """Train one GCForest per pre-split fold and collect AUC / AUPR scores.

    The six ``*_10_fold`` arguments are parallel sequences that are zipped
    together; each position describes one fold (features, labels and the
    sample indices of its train and test parts).

    Args:
        X_train_10_fold, X_test_10_fold: per-fold feature matrices.
        y_train_10_fold, y_test_10_fold: per-fold label vectors; label ``1``
            is treated as the positive class.
        test_idx_10_fold, train_idx_10_fold: per-fold sample indices, stored
            unchanged next to the predictions.
        trees, max_depth, min_child_weight, layer, cw: forwarded to
            ``get_toy_config`` in the order
            ``(trees, max_depth, min_child_weight, cw, layer)``.

    Returns:
        A pair ``(scores, comparisons)`` where ``scores`` is
        ``[Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training]`` and ``comparisons`` is
        ``[test_true_predict_compare, train_true_predict_compare]``; each
        comparison entry is
        ``[indices, predicted labels, true labels, proba[:, 0], proba[:, 1]]``.
    """
    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    test_true_predict_compare = []
    train_true_predict_compare = []

    fold_iter = zip(X_train_10_fold, X_test_10_fold, y_train_10_fold,
                    y_test_10_fold, test_idx_10_fold, train_idx_10_fold)
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in fold_iter:
        # A fresh model per fold so that no state leaks between folds.
        config = get_toy_config(trees, max_depth, min_child_weight, cw, layer)
        gc = GCForest(config)
        # The fold's test data is handed to fit_transform as well, as before;
        # the encoded training output is not needed here.
        gc.fit_transform(X_train, y_train, X_test, y_test)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        test_true_predict_compare.append(
            [test_idx_fold, y_pred_test, y_test,
             y_predprob_test[:, 0], y_predprob_test[:, 1]])
        train_true_predict_compare.append(
            [train_idx_fold, y_pred_train, y_train,
             y_predprob_train[:, 0], y_predprob_train[:, 1]])

        # Column 1 of predict_proba is used as the score of the positive class.
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        folds_AUPR_training.append(auc(recall_training, precision_training))
        folds_AUPR_testing.append(auc(recall_testing, precision_testing))
        folds_AUC_training.append(roc_auc_score(y_train, y_predprob_train[:, 1]))
        folds_AUC_testing.append(roc_auc_score(y_test, y_predprob_test[:, 1]))

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)

    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
            folds_AUPR_training, folds_AUC_training], \
           [test_true_predict_compare, train_true_predict_compare]
示例#2
0
def RUN_2(best_th):
    """Evaluate a GCForest on the held-out test split with a fixed threshold.

    Main routine: given the decision threshold ``best_th``, train on the 80 %
    training split of the module-level ``comtest`` frame and report metrics
    on the 20 % test split.

    Args:
        best_th: decision threshold passed to ``blo`` together with the
            predicted probabilities.

    Returns:
        dict with keys ``"TPR"``, ``"TNR"``, ``"BER"``, ``"ACC"``, ``"MCC"``,
        ``"F1_score"``, ``"AUC"`` (taken from ``evaluating_indicator``) and
        ``"time"`` (seconds spent building, fitting and predicting with the
        model). Previously ``"time"`` was the mean of an empty list, i.e.
        always NaN with a RuntimeWarning.
    """
    import time  # local import keeps the block self-contained

    # Split the raw data into training and test sets. Features are columns
    # 1 .. second-to-last of `comtest`; the label is the last column.
    tiaocan_train, ceshi_train, tiaocan_train_test, ceshi_true = cross_validation.train_test_split(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
        comtest.iloc[0:len(comtest), -1],
        test_size=0.2,
        random_state=0)

    x_train = np.array(tiaocan_train, dtype=np.float16)
    y_train = np.array(tiaocan_train_test, dtype=np.float16)
    x_test = np.array(ceshi_train, dtype=np.float16)
    y_true = np.array(ceshi_true, dtype=np.float16)
    # x_train, y_train = RandomUnderSampler().fit_sample(x_train, y_train)
    # (optional under-sampling of the training set for class balance, disabled)

    # ---------------------------- GCForest ----------------------------
    started = time.time()
    comm = GCForest(config)
    comm.fit_transform(x_train, y_train)
    pro_comm_Pre = comm.predict_proba(x_test)
    elapsed = time.time() - started

    # Turn probabilities into class decisions using the supplied threshold.
    blo_comm_Pre = blo(pro_comm_Pre, best_th)
    eva_comm = evaluating_indicator(y_true=y_true,
                                    y_test=blo_comm_Pre,
                                    y_test_value=pro_comm_Pre)

    # np.mean over a one-element list keeps the original value types.
    metric_keys = ("TPR", "TNR", "BER", "ACC", "MCC", "F1_score", "AUC")
    result = {key: np.mean([eva_comm[key]]) for key in metric_keys}
    result["time"] = np.mean([elapsed])
    return result
示例#3
0
def RUN():
    """Estimate the decision threshold by 10-fold CV on the training split.

    The module-level ``comtest`` frame is split 80/20 exactly as in the
    evaluation routine; only the 80 % part is used here. For every fold a
    GCForest is trained and the candidate thresholds ``0 .. 99`` are passed to
    ``blo``; the fold's choice is the candidate with the smallest
    ``|TPR - TNR|`` (sensitivity and specificity as close as possible).

    Returns:
        The mean of the per-fold chosen thresholds. Previously each fold
        overwrote the result of the one before, so only the last fold's
        choice was returned.
    """
    tiaocan_train, ceshi_train, tiaocan_train_test, ceshi_true = cross_validation.train_test_split(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
        comtest.iloc[0:len(comtest), -1],
        test_size=0.2,
        random_state=0)
    skf = StratifiedKFold(n_splits=10)  # 10-fold cross-validation
    tiaocan_train = np.array(tiaocan_train, dtype=np.float16)
    tiaocan_train_test = np.array(tiaocan_train_test, dtype=np.float16)

    fold_positions = []  # one selected threshold index per fold
    for times, (train, test) in enumerate(
            skf.split(tiaocan_train, tiaocan_train_test), start=1):
        alltime_start = time.time()

        x_train = tiaocan_train[train]
        y_train = tiaocan_train_test[train]
        x_test = tiaocan_train[test]
        y_true = tiaocan_train_test[test]
        # x_train, y_train = RandomUnderSampler().fit_sample(x_train, y_train)
        # (optional under-sampling for class balance, disabled)

        # ---------------------------- GCForest ----------------------------
        comm = GCForest(config)
        comm.fit_transform(x_train, y_train)  # model training
        pro_comm_Pre = comm.predict_proba(x_test)

        # Evaluate every candidate threshold and record how far apart
        # sensitivity (TPR) and specificity (TNR) are.
        RightIndex = []
        for jj in range(100):
            blo_comm_Pre = blo(pro_comm_Pre, jj)
            eva_comm = evaluating_indicator(y_true=y_true,
                                            y_test=blo_comm_Pre,
                                            y_test_value=pro_comm_Pre)
            RightIndex.append(abs(eva_comm['TPR'] - eva_comm['TNR']))
        RightIndex = np.array(RightIndex, dtype=np.float16)
        # Keep this fold's best candidate instead of overwriting the result.
        fold_positions.append(np.argmin(RightIndex))

        alltime_end = time.time()
        print('done_0, 第%s次验证 , time: %s  s ' %
              (times, alltime_end - alltime_start))

    # Average the per-fold thresholds to obtain the final threshold.
    return np.mean(fold_positions)
示例#4
0
def cross_validation(X, y, k, cpu):
    """Run stratified k-fold cross-validation with a single GCForest instance.

    Args:
        X: feature array, indexed with the fold index arrays.
        y: label array, indexed with the fold index arrays.
        k: number of stratified folds.
        cpu: forwarded to ``get_toy_config`` as the ``cpu`` keyword.

    Returns:
        dict mapping ``"fold_1"``, ``"fold_2"``, ... to a dict with the
        predicted probabilities (``"yscore"``) and the true labels
        (``"ytest"``) of that fold's test part.
    """
    forest = GCForest(get_toy_config(cpu = cpu))
    splitter = StratifiedKFold(n_splits = k)
    fold_results = {}
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
        # The same classifier object is refitted on every fold.
        forest.fit_transform(X[train_idx], y[train_idx])
        fold_scores = forest.predict_proba(X[test_idx])
        fold_results["fold_" + str(fold_no)] = {
            "yscore": fold_scores,
            "ytest": y[test_idx],
        }
    return fold_results
示例#5
0
def run_gcforest(train_X,
                 test_X,
                 train_y,
                 test_y,
                 rounds=3,
                 layers=100,
                 seed=0):
    """Fit a GCForest on the training data and evaluate it on the test data.

    Args:
        train_X, train_y: training features and labels.
        test_X, test_y: test features and labels.
        rounds, layers, seed: forwarded positionally to ``get_toy_config``.

    Returns:
        Whatever ``gen_eval_metrics(test_y, scores)`` returns, where
        ``scores`` holds element 1 of every ``predict_proba`` row. Its first
        element is printed as the fold accuracy.
    """
    forest = GCForest(get_toy_config(rounds, layers, seed))  # config should be a dict
    forest.fit_transform(train_X, train_y)

    # Keep only element 1 of each probability row.
    positive_scores = np.array([row[1] for row in forest.predict_proba(test_X)])

    metrics = gen_eval_metrics(test_y, positive_scores)
    print('Fold accuracy: ' + str(metrics[0]))
    return metrics
示例#6
0
# Score an independent test set ('Wnt_feature_end.mat') with a previously
# trained, pickled model and save the predicted classes.
# `y`, `scaler`, `mask`, `sio`, `utils` and `accuracy_score` are defined
# outside this snippet.
train_label = y

# Load the two feature matrices and join them column-wise.
data_test = sio.loadmat('Wnt_feature_end.mat')
test_proteinA = data_test.get('feature_A')
test_protein_A = np.array(test_proteinA)
test_proteinB = data_test.get('feature_B')
test_protein_B = np.array(test_proteinB)
test_protein = np.concatenate((test_protein_A, test_protein_B), axis=1)
test_protein = np.array(test_protein)
# Apply the already-fitted scaler, then the feature-selection mask.
test_protein = scaler.transform(test_protein)
test_dim = test_protein[:, mask]
# The masked result is indexed with shape[2], i.e. it is 3-D here; the reshape
# drops the middle axis (presumably of length 1 -- TODO confirm against `mask`).
test_shu = np.reshape(test_dim, (test_dim.shape[0], test_dim.shape[2]))
[row1, column1] = np.shape(test_shu)
# Every test sample gets the label 1.
test_y_raw = np.ones(int(row1))

# Turn the label vector into a column (row matrix -> transpose -> ndarray).
test_y_ = np.mat(test_y_raw)
test_y = np.transpose(test_y_)
test_label = np.array(test_y)

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files from a trusted source.
with open("model_gc4.pkl", "rb") as f:
    gc = pickle.load(f)
y_score = gc.predict_proba(test_shu)
y_test = utils.to_categorical(test_label)
y_class = utils.categorical_probas_to_classes(y_score)
y_test_tmp = test_label
accu = accuracy_score(y_test_tmp, y_class)
print(accu)
acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
    len(y_class), y_class, y_test_tmp)
# Persist the predicted classes under the key 'yeast_Wnt_class'.
sio.savemat('yeast_Wnt_class.mat', {'yeast_Wnt_class': y_class})

# Replace the label containers by their `.values` attribute (presumably pandas
# objects converted to plain arrays -- defined outside this snippet).
y_train2 = y_train2.values
y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values


# In[90]:


# Build a GCForest from the default toy configuration.
config = get_toy_config()
model = GCForest(config)

# Fit on the (X_train2, y_train2) data; the test split is passed to
# fit_transform as well. Predictions are then made on the validation split.
model.fit_transform(X_train2, y_train2, X_test, y_test)
gc_valid_proba = model.predict_proba(X_valid)
gc_pred = model.predict(X_valid)


# In[14]:


models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    if name == 'cirrhosis':
        X, Y = load.cirrhosis_data()
    elif name == 't2d':
        X, Y = load.t2d_data()
    elif name == 'obesity':
        X, Y = load.obesity_data()
    else:
        raise Exception('the dataset is not defined!!!')

    output_features = pd.Series()
    for train, test in cv.split(X, Y):
        x_train = X.iloc[train]
        y_train = Y[train]

        x_test = X.iloc[test]
        y_test = Y[test]

        X_train = x_train.values.reshape(-1, 1, len(x_train.columns))
        X_test = x_test.values.reshape(-1, 1, len(x_test.columns))

        X_train_enc, _features = gc.fit_transform(X_train, y_train)

        probas_ = gc.predict_proba(X_test)
        output_features = avg_importance(output_features, _features)

    output_features = output_features.sort_values(ascending=False)
    columns = list(map(int, output_features.index.tolist()))
    output_features.index = X.columns[columns]

    output_features.to_csv("output/" + name)
示例#9
0
def _gcforest_metrics(y_true, y_pred, y_proba):
    """Compute, once, every value that the numbered metric report prints."""
    confmat = confusion_matrix(y_true, y_pred)
    return {
        'acc': accuracy_score(y_true, y_pred),
        # Specificity / sensitivity from the rows of the confusion matrix.
        'sp': confmat[0, 0] / (confmat[0, 0] + confmat[0, 1]),
        'sn': confmat[1, 1] / (confmat[1, 0] + confmat[1, 1]),
        'mcc': matthews_corrcoef(y_true, y_pred),
        'auc': roc_auc_score(y_true, y_proba, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro'),
        'report': classification_report(y_true, y_pred),
        'confmat': confmat,
    }


def _print_gcforest_report(metrics):
    """Print the numbered report; numbering and order match the original output."""
    print('1. The acc score of the model {}\n'.format(metrics['acc']))
    print('2. The sp score of the model {}\n'.format(metrics['sp']))
    print('3. The sn score of the model {}\n'.format(metrics['sn']))
    print('4. The mcc score of the model {}\n'.format(metrics['mcc']))
    print('9. The auc score of the model {}\n'.format(metrics['auc']))
    print('6. The recall score of the model {}\n'.format(metrics['recall']))
    print('5. The F-1 score of the model {}\n'.format(metrics['f1']))
    print('7. Classification report \n {} \n'.format(metrics['report']))
    print('8. Confusion matrix \n {} \n'.format(metrics['confmat']))


def GCForest_prediction(feature_data, result_data):
    """Run shuffled, stratified 5-fold CV with GCForest and print metric reports.

    A new GCForest (default toy config) is trained per fold. A report is
    printed for every fold and, at the end, for the out-of-fold predictions
    of the whole data set.

    Compared with the previous version, each metric is computed once per
    report instead of twice, and the unused encoded outputs (including an
    extra ``gc.transform`` pass over the test fold) are no longer computed.
    The printed text is unchanged.

    Args:
        feature_data: feature array, indexed with the fold index arrays; its
            first dimension is the number of samples.
        result_data: label array, indexed with the fold index arrays.

    Returns:
        None. All results are printed.
    """
    random_state = 2019
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits,
                            shuffle=True,
                            random_state=random_state).split(
                                feature_data, result_data)
    # Out-of-fold predictions, filled in fold by fold.
    test_pred = np.zeros(feature_data.shape[0])
    test_proba = np.zeros(feature_data.shape[0])
    # NOTE(review): the per-fold score arrays are filled but neither printed
    # nor returned by this function.
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)

    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]

        gc = GCForest(get_toy_config())  # config should be a dict
        gc.fit_transform(X_train, Y_train)
        y_pred = gc.predict(X_test)
        y_proba = gc.predict_proba(X_test)[:, 1]

        fold = _gcforest_metrics(Y_test, y_pred, y_proba)
        print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(
            fold['acc'] * 100))
        _print_gcforest_report(fold)

        recall_scores[j] = fold['recall']
        f1_scores[j] = fold['f1']
        acc_scores[j] = fold['acc']
        mcc_scores[j] = fold['mcc']

        test_pred[test_idx] = y_pred
        test_proba[test_idx] = y_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".
              format(j, fold['recall'], fold['acc'], fold['mcc'], fold['f1']))

    overall = _gcforest_metrics(result_data, test_pred, test_proba)
    print(
        "--------------------------------------深度森林------------------------------------"
    )
    _print_gcforest_report(overall)
示例#10
0
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer):
    """Train one GCForest per pre-split fold and collect evaluation scores.

    The six ``*_10_fold`` arguments are parallel sequences that are zipped
    together, one entry per fold. All remaining arguments are forwarded
    positionally, in signature order, to ``get_toy_config``.

    Fix: ``recall_100`` used to be computed on the 25 highest-scored test
    samples; it is now computed on the 100 highest-scored ones, matching its
    name (``recall_50`` already used 50).

    Returns:
        A pair ``(scores, comparisons)``. ``scores`` is
        ``[Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training, folds_recall_50, folds_recall_100]``, where the
        ``metrics3`` entries are (precision, recall, F-score) for label 1.
        ``comparisons`` is
        ``[test_true_predict_compare, train_true_predict_compare]`` with
        per-fold entries
        ``[indices, predicted labels, true labels, proba[:, 0], proba[:, 1]]``.
    """

    def _recall_in_top(ranked, k):
        # Recall of the predicted labels (column 2) against the true labels
        # (column 0), restricted to the first k rows of the ranked table.
        head = ranked.iloc[:k, :]
        return precision_recall_fscore_support(head[0],
                                               head[2],
                                               pos_label=1,
                                               average='binary')[1]

    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_50, folds_recall_100 = [], []

    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2,
                                rf_max_depth_2, xgb_tree, xgb_max_depth,
                                min_child_weight, lr, xgb_tree_2,
                                xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        print(config)
        gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        # Table with columns 0 = true label, 1 = positive-class score,
        # 2 = predicted label, sorted by score, highest first.
        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_50 = _recall_in_top(temp, 50)
        recall_100 = _recall_in_top(temp, 100)

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])

        # (precision, recall, F-score) of the hard predictions for label 1.
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_recall_50.append(recall_50)
        folds_recall_100.append(recall_100)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)

    return [
        Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training, folds_recall_50, folds_recall_100
    ], [test_true_predict_compare, train_true_predict_compare]
示例#11
0
# 5-fold stratified cross-validation of a GCForest model (`gc`, created
# outside this snippet) on scaled, PCA-reduced and shuffled features.
X_ = scale(X_)
new_X = pca(X_, percentage=0.90)
X_shuffle, y = get_shuffle(new_X, y_, random_state=1)
X_initial = X_shuffle
#y_raw=np.mat(label_)
#y=np.transpose(y_raw)
#X_train_origin, X_test_origin, y_train, y_test = train_test_split(X, y,test_size=0.2)
# Insert two singleton axes after the sample axis: (n, d) -> (n, 1, 1, d).
X = X_initial[:, np.newaxis, np.newaxis, :]
skf = StratifiedKFold(n_splits=5)
sepscores = []
# Accumulators for the stacked labels / scores. Both start with a dummy row
# [0.5, 0.5] that stays at index 0 of the stacked arrays.
ytest = np.ones((1, 2)) * 0.5
yscore = np.ones((1, 2)) * 0.5
for train, test in skf.split(X, y):
    # The same `gc` object is refitted on every fold.
    X_train_enc = gc.fit_transform(X[train], y[train])
    y_score = gc.predict_proba(X[test])
    yscore = np.vstack((yscore, y_score))
    y_test = utils.to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))
    # The ROC curve is computed on column 0 of the labels and of the scores.
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)
    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
        len(y_class), y_class, y_test_tmp)
    # One row of eight metrics per fold.
    sepscores.append(
        [acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc])
    print(
        'gcforest:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f'
        % (acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc))
# Array with one row per fold and one column per metric.
scores = np.array(sepscores)
示例#12
0
            x = np.concatenate((x_p_s, x_u_s), axis=0)
            y = np.concatenate((y_p_s, y_u_s), axis=0)

            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=1)
            # scaler = StandardScaler().fit(X_train)
            # X_train_transformed = scaler.transform(X_train)
            # X_test_transformed = scaler.transform(X_test)
            config = get_toy_config()
            gc = GCForest(config)
            gc.fit_transform(x_train, y_train)

            scores = gc.predict_proba(x_u_test)[:, 0]
            orderScores = np.argsort(-scores)
            orderList = [str(item) for item in orderScores]
            orderStr = ','.join(orderList)
            top = int(y_u_test.shape[0] * 0.25)
            topNIndex = orderScores[:top]
            t = 0
            while t < top:
                index = topNIndex[t]
                x_n = x_u[index]
                X_n = np.vstack((X_n, x_n))
                t += 1
        X_n = X_n[1:, :]
        X_n = np.unique(X_n, axis=0)
        Y_n = np.zeros(X_n.shape[0])
        X = np.concatenate((x_p, X_n), axis=0)
示例#13
0
文件: AOPEDF.py 项目: liugy111/AOPEDF
# Per-fold evaluation of a GCForest classifier: accuracy, ROC AUC and average
# precision are printed, collected, and a ROC curve is drawn for each fold.
# NOTE(review): `test_auc_fold`, `kf`, `feature` and `label` are not defined
# in the visible part of this snippet -- confirm they exist earlier in the file.
test_aupr_fold = []
for train_index, test_index in kf:
    Xtrain, Xtest = feature[train_index], feature[test_index]
    Ytrain, Ytest = label[train_index], label[test_index]

    # A new model is built for every fold.
    config = get_toy_config()
    rf = GCForest(config)
    # Labels are flattened before training.
    Ytrain = Ytrain.flatten()
    rf.fit_transform(Xtrain, Ytrain)

    # deep forest
    predict_y = rf.predict(Xtest)
    acc = accuracy_score(Ytest, predict_y)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    prob_predict_y = rf.predict_proba(
        Xtest
    )  # Give a result with probability values,the probability sum is 1
    # Column 1 is used as the score for the ROC / average-precision metrics.
    predictions_validation = prob_predict_y[:, 1]
    fpr, tpr, _ = roc_curve(Ytest, predictions_validation)
    roc_auc = auc(fpr, tpr)
    aupr = average_precision_score(Ytest, predictions_validation)
    print(roc_auc)
    print(aupr)
    test_auc_fold.append(roc_auc)
    test_aupr_fold.append(aupr)
    # ROC curve of this fold (blue) with the diagonal (red, dashed) as reference.
    plt.figure()
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
示例#14
0
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer, mode,
        seed):
    """Train one GCForest per pre-split fold and collect evaluation scores.

    The six ``*_10_fold`` arguments are parallel sequences that are zipped
    together, one entry per fold. The arguments from ``rf_tree`` to ``layer``
    are forwarded positionally, in signature order, to ``get_toy_config``.
    ``mode`` and ``seed`` are accepted for interface compatibility but are
    not used inside this function.

    Dead code from the previous version (an unused fold counter, three recall
    lists that were never filled, an unused average G-mean) was removed and
    the duplicated top-k recall computation was factored into a helper;
    results are unchanged.

    Returns:
        A pair ``(scores, comparisons)``. ``scores`` is a list with, by index:
        0-2 ``Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing``;
        3-5 ``Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing``;
        6-7 ``folds_AUPR_training, folds_AUC_training``;
        8-9 ``Avg_metrics3_testing, Avg_metrics3_training``
        ((precision, recall, F-score) for label 1);
        10-12 ``folds_recall_25, folds_recall_50, folds_G_mean``.
        ``comparisons`` is
        ``[test_true_predict_compare, train_true_predict_compare]`` with
        per-fold entries
        ``[indices, predicted labels, true labels, proba[:, 0], proba[:, 1]]``.

    Raises:
        ZeroDivisionError: if a test fold contains no sample that counts as
            negative (``tn + fp == 0``) when computing the specificity.
    """

    def _recall_in_top(ranked, k):
        # Recall of the predicted labels (column 2) against the true labels
        # (column 0), restricted to the first k rows of the ranked table.
        head = ranked.iloc[:k, :]
        return precision_recall_fscore_support(head[0],
                                               head[2],
                                               pos_label=1,
                                               average='binary')[1]

    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_25, folds_recall_50 = [], []
    folds_G_mean = []

    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2,
                                rf_max_depth_2, xgb_tree, xgb_max_depth,
                                min_child_weight, lr, xgb_tree_2,
                                xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        # Table with columns 0 = true label, 1 = positive-class score,
        # 2 = predicted label, sorted by score, highest first.
        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_25 = _recall_in_top(temp, 25)
        recall_50 = _recall_in_top(temp, 50)

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])

        # (precision, recall, F-score) of the hard predictions for label 1.
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        # G-mean = sqrt(recall * specificity) on the test fold.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test,
                                          labels=[0, 1]).ravel()
        specificity = float(tn) / float(tn + fp)
        recall = metrics3_testing[1]
        G_mean = np.sqrt(recall * specificity)

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_G_mean.append(G_mean)
        folds_recall_25.append(recall_25)
        folds_recall_50.append(recall_50)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)

    return [
        Avg_AUPR_training,
        Avg_AUPR_testing,
        folds_AUPR_testing,  # 0, 1, 2
        Avg_AUC_training,
        Avg_AUC_testing,
        folds_AUC_testing,  # 3, 4, 5
        folds_AUPR_training,
        folds_AUC_training,  # 6, 7
        Avg_metrics3_testing,
        Avg_metrics3_training,  # 8, 9
        folds_recall_25,
        folds_recall_50,
        folds_G_mean
    ], [test_true_predict_compare, train_true_predict_compare]
示例#15
0
     train = np.append(X_train[0:int(i)],X_train[int(j):],axis=0)
     y_train = np.append(Y_train[0:int(i)],Y_train[int(j):],axis=0)
     
     y_test_all.extend(list(y_test))
     gc.fit_transform(train, y_train)
     y_predict = gc.predict(test)
     y_predict_all.extend(list(y_predict))
     
     y_predict_prob = gc.predict_proba(test)[:,1]
     y_predict_prob_all.extend(list(y_predict_prob))
     i+=length
     j+=length'''
 
 gc.fit_transform(X_train, Y_train)    
 y_predict = gc.predict(X_test)
 y_predict_prob = gc.predict_proba(X_test)[:,1]
 acc = accuracy_score(Y_test, y_predict)
 print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))
 ROC_AUC_area=metrics.roc_auc_score(Y_test,y_predict_prob)
 print("ROC ="+str(ROC_AUC_area))
 ACC=metrics.accuracy_score(Y_test,y_predict)
 print("ACC:"+str(ACC))
 precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y_test, y_predict)
 F1_Score=metrics.f1_score(Y_test, y_predict)
 F_measure=F1_Score
 MCC=metrics.matthews_corrcoef(Y_test, y_predict)
 pos=TP+FN
 neg=FP+TN
 savedata=[[['gcforest',ACC,precision, recall,SN, SP, GM,F_measure,F1_Score,MCC,ROC_AUC_area,TP,FN,FP,TN,pos,neg]]]
 easy_excel.save(classifier+"_crossvalidation",[str(X_train.shape[1])],savedata,'cross_validation_'+classifier+"_"+outputname+'.xls')
 
示例#16
0
def run_classification_configuration(X_train_10_fold, X_test_10_fold,
                                     y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold, tree,
                                     max_depth, layer):
    """Train one GCForest per pre-split fold and collect evaluation scores.

    The six ``*_10_fold`` arguments are parallel sequences that are zipped
    together, one entry per fold. ``tree``, ``max_depth`` and ``layer`` are
    forwarded positionally to ``get_toy_config``.

    Fix: ``precision_recall_fscore_support`` expects ``(y_true, y_pred)``.
    The previous version passed the predictions first, which swapped the
    reported precision and recall (the F-score is symmetric and unaffected).

    Returns:
        A pair ``(scores, comparisons)``. ``scores`` is
        ``[Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training]``, where the ``metrics3`` entries are
        (precision, recall, F-score) for label 1. ``comparisons`` is
        ``[test_true_predict_compare, train_true_predict_compare]`` with
        per-fold entries
        ``[indices, predicted labels, true labels, proba[:, 0], proba[:, 1]]``.
    """
    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare = []
    train_true_predict_compare = []

    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(tree, max_depth, layer)
        gc = GCForest(config)
        # The fold's test data is handed to fit_transform as well, as before.
        gc.fit_transform(X_train, y_train, X_test, y_test)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])

        # Column 1 of predict_proba is used as the score of the positive class.
        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])

        # True labels first, predictions second.
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)

    return [
        Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training
    ], [test_true_predict_compare, train_true_predict_compare]