Example #1
def run_classification_configuration(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold, test_idx_10_fold, train_idx_10_fold, trees, max_depth, min_child_weight, layer, cw=0.001):
    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    test_true_predict_compare = []
    train_true_predict_compare = []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold, test_idx_10_fold, train_idx_10_fold):
#         X_train, X_test = X_train[:,12:], X_test[:,12:]
#         X_train, X_test = X_train[:,:12], X_test[:,:12]
        
        config = get_toy_config(trees, max_depth, min_child_weight, cw, layer)
        gc = GCForest(config)
        #print(config)
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0], y_predprob_test[:, 1]])  # 10-fold CV
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0], y_predprob_train[:, 1]])  # 10-fold CV
        
        precision_training, recall_training, _ = precision_recall_curve(y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:,1]) 
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:,1]) 

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing) 
    
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing, Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing, folds_AUPR_training, folds_AUC_training], [test_true_predict_compare, train_true_predict_compare]
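Examples #1, #5, #8, and #19 all call a get_toy_config helper that is not shown. Below is a minimal sketch of what such a helper could look like, following the cascade config format visible at the top of Example #10 (config["cascade"] = ca_config); the mapping of trees, max_depth, min_child_weight, cw, and layer onto config fields is an assumption, not the original code.

def get_toy_config(trees, max_depth, min_child_weight, cw, layer):
    # Assumed mapping: "layer" caps the cascade depth, "cw" is used as the XGBoost learning rate.
    ca_config = {
        "random_state": 0,
        "max_layers": layer,
        "early_stopping_rounds": 3,
        "n_classes": 2,
        "estimators": [
            {"n_folds": 5, "type": "RandomForestClassifier",
             "n_estimators": trees, "max_depth": max_depth, "n_jobs": -1},
            {"n_folds": 5, "type": "XGBClassifier",
             "n_estimators": trees, "max_depth": max_depth,
             "min_child_weight": min_child_weight, "learning_rate": cw,
             "nthread": -1},
        ],
    }
    return {"cascade": ca_config}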
Example #2
def run_model(features, adhd_labels, rand_params, verbose=True, test_size=0.2):
    """
    Run gcForest with parameters from the Optimizer, using a random train/test
    split of the original dataset (80% train / 20% test by default).

    :param features: (list) A matrix of phenotypic and functional-connectivity features
    :param adhd_labels: (list) The correct labels from the dataset
    :param rand_params: (dict) The generated random params from the Optimizer
    :param verbose: (bool) Whether to print classification report
    :param test_size: (float) How much of the dataset to use for testing
    :return: (float) accuracy, (dict) positive-class metrics, (dict) negative-class metrics, (dict) confusion-matrix counts
    """
    classifier = GCForest(  # Instantiate the gcForest algorithm using the random parameters we generated
        config=generate_gcforest_config(rand_params['mlp_layers'], rand_params['mlp_solver'],
                                        rand_params['logistic_regressions'],
                                        rand_params['svc_kernel'], rand_params['xgb_estimators'],
                                        rand_params['rf_estimators'],
                                        rand_params['early_stopping_iterations'], rand_params['positions']),
    )

    X_train, X_test, y_train, y_test = train_test_split(features, adhd_labels, test_size=test_size)
    # Split the data into random subsets (20% test, 80% train by default)
    classifier.fit_transform(np.array(X_train), np.array(y_train))  # Train the gcForest model
    y_pred = classifier.predict(np.array(X_test))  # Predict off of the test dataset
    y_test = np.array(y_test)
    if verbose:
        print "Classification Report\n", classification_report(y_test, y_pred)  # Print out some useful run information
        print "Accuracy:", accuracy_score(y_test, y_pred)
        print "Confusion Matrix\n", confusion_matrix(y_test, y_pred)
    positive_metrics = {
        'f1': f1_score(y_test, y_pred),  # Calculate the f1 for class "1"
        'precision': precision_score(y_test, y_pred),  # Calculate the precision for class "1"
        'recall': recall_score(y_test, y_pred),  # Calculate the recall for class "1"
    }
    negative_metrics = {
        'f1': f1_score(y_test, y_pred, pos_label=0),  # Calculate the f1 for class "0"
        'precision': precision_score(y_test, y_pred, pos_label=0),  # Calculate the precision for class "0"
        'recall': recall_score(y_test, y_pred, pos_label=0),  # Calculate the recall for class "0"
    }
    matrix = confusion_matrix(y_test, y_pred)
    confusion = {  # Return the attributes of the confusion matrix
        'true_negative': matrix[0][0],  # Predicted negative and actually negative
        'false_positive': matrix[0][1],  # Predicted positive but actually negative
        'false_negative': matrix[1][0],  # Predicted negative but actually positive
        'true_positive': matrix[1][1]  # Predicted positive and actually positive
    }
    scores = accuracy_score(y_test, y_pred), positive_metrics, negative_metrics, confusion
    # Get the accuracy, f1, precision and recall of the model

    return scores
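A hypothetical invocation of run_model: the key names match the generate_gcforest_config call above, but the values (and generate_gcforest_config itself) are illustrative assumptions.

rand_params = {
    'mlp_layers': (64, 32),
    'mlp_solver': 'adam',
    'logistic_regressions': 2,
    'svc_kernel': 'rbf',
    'xgb_estimators': 100,
    'rf_estimators': 200,
    'early_stopping_iterations': 3,
    'positions': [0, 1],
}
accuracy, positive_metrics, negative_metrics, confusion = run_model(
    features, adhd_labels, rand_params, verbose=True, test_size=0.2)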
Example #3
def GCForest_prediction(feature_data, result_data):
    random_state = 2019
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits,
                            shuffle=True,
                            random_state=random_state).split(
                                feature_data, result_data)
    test_pred = np.zeros(feature_data.shape[0])
    test_proba = np.zeros(feature_data.shape[0])
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        part_X_train_enc = X_train_enc[:, ::2]
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)
        part_X_test_enc = X_test_enc[:, ::2]
        y_proba = gc.predict_proba(X_test)[:, 1]
        acc = accuracy_score(Y_test, y_pred)
        print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(
            acc * 100))
        confmat = confusion_matrix(Y_test, y_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(
            accuracy_score(Y_test, y_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(
            matthews_corrcoef(Y_test, y_pred)))
        print('5. The auc score of the model {}\n'.format(
            roc_auc_score(Y_test, y_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(
            recall_score(Y_test, y_pred, average='macro')))
        print('7. The F-1 score of the model {}\n'.format(
            f1_score(Y_test, y_pred, average='macro')))
        print('8. Classification report \n {} \n'.format(
            classification_report(Y_test, y_pred)))
        print('9. Confusion matrix \n {} \n'.format(
            confusion_matrix(Y_test, y_pred)))

        recall = recall_score(Y_test, y_pred, average='macro')
        f1 = f1_score(Y_test, y_pred, average='macro')
        acc = accuracy_score(Y_test, y_pred)
        mcc = matthews_corrcoef(Y_test, y_pred)

        recall_scores[j] = recall
        f1_scores[j] = f1
        acc_scores[j] = acc
        mcc_scores[j] = mcc

        test_pred[test_idx] = y_pred
        test_proba[test_idx] = y_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".
              format(j, recall, acc, mcc, f1))
    confmat = confusion_matrix(result_data, test_pred)
    sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
    print(
        "-------------------------------- Deep Forest --------------------------------"
    )
    print('1. The acc score of the model {}\n'.format(
        accuracy_score(result_data, test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(
        matthews_corrcoef(result_data, test_pred)))
    print('5. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(
        recall_score(result_data, test_pred, average='macro')))
    print('7. The F-1 score of the model {}\n'.format(
        f1_score(result_data, test_pred, average='macro')))
    print('8. Classification report \n {} \n'.format(
        classification_report(result_data, test_pred)))
    print('9. Confusion matrix \n {} \n'.format(
        confusion_matrix(result_data, test_pred)))
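The sensitivity/specificity indexing above is easy to get wrong; a self-contained sketch of the same arithmetic using sklearn's ravel() unpacking (standard definitions, not part of the original example):

import numpy as np
from sklearn.metrics import confusion_matrix

def sn_sp_mcc(y_true, y_pred):
    # tn, fp, fn, tp follow sklearn's row = true class, column = predicted class layout
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    sn = tp / (tp + fn)  # sensitivity: confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
    sp = tn / (tn + fp)  # specificity: confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
    denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    mcc = (tp * tn - fp * fn) / denom if denom else 0.0
    return sn, sp, mcc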
Example #4
def GAGCForest_prediction(feature_data, result_data):
    n_splits = 5
    random_state = 2019  # not defined in the original snippet; assumed to mirror Example #3
    acc_scores = np.zeros(n_splits)
    recall_scores = np.zeros(n_splits)
    mcc_scores = np.zeros(n_splits)
    f1_scores = np.zeros(n_splits)
    skfolds = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=random_state).split(
                                  feature_data, result_data)
    new_test_pred = np.zeros(feature_data.shape[0])
    new_test_proba = np.zeros(feature_data.shape[0])
    for j, (train_idx, test_idx) in enumerate(skfolds):
        X_train = feature_data[train_idx]
        Y_train = result_data[train_idx]
        X_test = feature_data[test_idx]
        Y_test = result_data[test_idx]
        config = get_toy_config()
        gc = GCForest(config)  # should be a dict
        X_train_enc = gc.fit_transform(X_train, Y_train)
        y_pred = gc.predict(X_test)
        X_test_enc = gc.transform(X_test)
        # Resolve the fitness-function module and entry point
        AIM_M = __import__('aimfuc')
        AIM_F = 'gcforestCM'
        """============================ Variable setup ============================"""
        w1 = [0, 1]
        w2 = [0, 1]
        w3 = [0, 1]
        b1 = [1, 1]
        b2 = [1, 1]
        b3 = [1, 1]
        ranges = np.vstack([w1, w2, w3]).T  # range matrix for the decision variables
        borders = np.vstack([b1, b2, b3]).T  # boundary matrix for the decision variables
        precisions = [6] * 3  # encoding precision of each variable
        scales = [0] * 3  # arithmetic scale for each variable
        codes = [1] * 3  # Gray encoding for each variable
        """========================遗传算法参数设置========================="""
        # NIND = 50  # 种群规模
        # MAXGEN = 100  # 最大遗传代数
        # GGAP = 0.8  # 代沟:子代与父代个体不相同的概率为0.8
        # selectStyle = 'sus';  # 遗传算法的选择方式设为"sus"——随机抽样选择
        # recombinStyle = 'xovdp'  # 遗传算法的重组方式,设为两点交叉
        # recopt = 0.9  # 交叉概率
        # pm = 0.1  # 变异概率
        # SUBPOP = 1  # 设置种群数为1
        # maxormin = 1  #
        # 设置最大最小化目标标记为1,表示是最小化目标,-1则表示最大化目标

        FieldD = ga.crtfld(ranges, borders, precisions, codes, scales)  # build the GA field descriptor

        # Call the GA programming template
        [weightarray, pop_trace, var_trace,
         times] = new_code_templet(AIM_M,
                                   AIM_F,
                                   None,
                                   None,
                                   FieldD,
                                   problem='R',
                                   maxormin=-1,
                                   MAXGEN=10,
                                   NIND=50,
                                   SUBPOP=1,
                                   GGAP=0.8,
                                   selectStyle='sus',
                                   recombinStyle='xovsp',
                                   recopt=0.9,
                                   pm=0.7,
                                   distribute=True,
                                   proba=X_train_enc,
                                   result=Y_train,
                                   drawing=0)
        print('Elapsed time:', times, 'seconds')
        # w3 = 1 - weight[0] - weight[1]
        # print(weight)

        # weightarray = np.concatenate((weight, [w3]), axis=0)
        for element in weightarray:
            print(element)
        test_probaF = X_test_enc[:, ::2].T
        test_probaT = X_test_enc[:, 1::2].T
        test_predT = np.dot(weightarray, test_probaT)
        test_predF = np.dot(weightarray, test_probaF)
        test_pred = np.zeros(len(test_predT))
        test_proba = np.zeros(len(test_predT))
        for i in range(len(test_predT)):
            temper = test_predT[i] + test_predF[i]
            test_proba[i] = test_predT[i] / temper
            if test_predT[i] > test_predF[i]:
                test_pred[i] = 1
            else:
                test_pred[i] = 0
        confmat = confusion_matrix(Y_test, test_pred)
        sn = confmat[1, 1] / (confmat[1, 0] + confmat[1, 1])
        sp = confmat[0, 0] / (confmat[0, 0] + confmat[0, 1])
        print('1. The acc score of the model {}\n'.format(
            accuracy_score(Y_test, test_pred)))
        print('2. The sp score of the model {}\n'.format(sp))
        print('3. The sn score of the model {}\n'.format(sn))
        print('4. The mcc score of the model {}\n'.format(
            matthews_corrcoef(Y_test, test_pred)))

        print('5. The auc score of the model {}\n'.format(
            roc_auc_score(Y_test, test_proba, average='macro')))
        print('6. The recall score of the model {}\n'.format(
            recall_score(Y_test, test_pred, average='macro')))
        print('7. The F-1 score of the model {}\n'.format(
            f1_score(Y_test, test_pred, average='macro')))
        print('8. Classification report \n {} \n'.format(
            classification_report(Y_test, test_pred)))
        print('9. Confusion matrix \n {} \n'.format(
            confusion_matrix(Y_test, test_pred)))

        recall = recall_score(Y_test, test_pred, average='macro')
        f1 = f1_score(Y_test, test_pred, average='macro')
        acc = accuracy_score(Y_test, test_pred)
        mcc = matthews_corrcoef(Y_test, test_pred)
        recall_scores[j] = recall
        f1_scores[j] = f1
        acc_scores[j] = acc
        mcc_scores[j] = mcc
        new_test_pred[test_idx] = test_pred
        new_test_proba[test_idx] = test_proba
        print("CV- {} recall: {}, acc_score: {} , mcc_score: {}, f1_score: {}".
              format(j, recall, acc, mcc, f1))
    new_confmat = confusion_matrix(result_data, new_test_pred)
    sn = new_confmat[1, 1] / (new_confmat[1, 0] + new_confmat[1, 1])
    sp = new_confmat[0, 0] / (new_confmat[0, 0] + new_confmat[0, 1])
    print(
        "--------------------------------- Genetic Algorithm -----------------------------------------"
    )
    print('1. The acc score of the model {}\n'.format(
        accuracy_score(result_data, new_test_pred)))
    print('2. The sp score of the model {}\n'.format(sp))
    print('3. The sn score of the model {}\n'.format(sn))
    print('4. The mcc score of the model {}\n'.format(
        matthews_corrcoef(result_data, new_test_pred)))
    print('5. The auc score of the model {}\n'.format(
        roc_auc_score(result_data, new_test_proba, average='macro')))
    print('6. The recall score of the model {}\n'.format(
        recall_score(result_data, new_test_pred, average='macro')))
    print('7. The F-1 score of the model {}\n'.format(
        f1_score(result_data, new_test_pred, average='macro')))
    print('8. Classification report \n {} \n'.format(
        classification_report(result_data, new_test_pred)))
    print('9. Confusion matrix \n {} \n'.format(
        confusion_matrix(result_data, new_test_pred)))
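The per-sample loop above can be expressed as one vectorized step. A sketch of the weighted soft-voting fusion the GA tunes: even columns of X_test_enc hold per-estimator class-0 probabilities, odd columns class-1, and weightarray holds one weight per estimator (the helper itself is illustrative, not original code).

import numpy as np

def weighted_fusion(X_test_enc, weightarray):
    probaF = X_test_enc[:, ::2]   # class-0 probabilities, one column per estimator
    probaT = X_test_enc[:, 1::2]  # class-1 probabilities
    scoreT = probaT @ weightarray  # weighted class-1 evidence
    scoreF = probaF @ weightarray  # weighted class-0 evidence
    proba = scoreT / (scoreT + scoreF)  # renormalized positive-class probability
    pred = (scoreT > scoreF).astype(int)
    return pred, proba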
Example #5
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer):

    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_50, folds_recall_100 = [], []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2,
                                rf_max_depth_2, xgb_tree, xgb_max_depth,
                                min_child_weight, lr, xgb_tree_2,
                                xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        print(config)
        X_train_enc = gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_50 = precision_recall_fscore_support(temp.iloc[:50, :][0],
                                                    temp.iloc[:50, :][2],
                                                    pos_label=1,
                                                    average='binary')[1]
        recall_100 = precision_recall_fscore_support(temp.iloc[:100, :][0],
                                                     temp.iloc[:100, :][2],
                                                     pos_label=1,
                                                     average='binary')[1]

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])  #10-cv
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])  #10-cv

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        AUPR_training, AUPR_testing = auc(recall_training,
                                          precision_training), auc(
                                              recall_testing,
                                              precision_testing)
        AUC_training, AUC_testing = roc_auc_score(
            y_train,
            y_predprob_train[:, 1]), roc_auc_score(y_test, y_predprob_test[:,
                                                                           1])

        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_recall_50.append(recall_50)
        folds_recall_100.append(recall_100)
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)

    return [
        Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training, folds_recall_50, folds_recall_100
    ], [test_true_predict_compare, train_true_predict_compare]
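The top-k recall computed through the temp DataFrame above (rank by positive-class probability, then score the k highest-ranked samples) can be factored into a small helper; a sketch under that same convention, not part of the original code:

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def recall_at_k(y_true, y_proba_pos, y_pred, k):
    order = np.argsort(-np.asarray(y_proba_pos))[:k]  # indices of the k highest scores
    return precision_recall_fscore_support(
        np.asarray(y_true)[order], np.asarray(y_pred)[order],
        pos_label=1, average='binary')[1]  # [1] selects recall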
Example #6
            while t < top:
                index = topNIndex[t]
                x_n = x_u[index]
                X_n = np.vstack((X_n, x_n))
                t += 1
        X_n = X_n[1:, :]
        X_n = np.unique(X_n, axis=0)
        Y_n = np.zeros(X_n.shape[0])
        X = np.concatenate((x_p, X_n), axis=0)
        Y = np.concatenate((y_p, Y_n), axis=0)
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=1)  # retrain the classifier on positives plus reliable negatives
        config = get_toy_config()
        gc = GCForest(config)
        gc.fit_transform(x_train, y_train)
        y_pred = gc.predict(x_test)
        # acc = accuracy_score(y_test, y_pred)
        # print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

        nfolds = 5
        eRecalls = np.zeros(nfolds)
        ePrecisions = np.zeros(nfolds)
        ePRAUCs = np.zeros(nfolds)
        for i in range(nfolds):
            x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(
                x_p, y_p, test_size=0.2)
            x_u_train, x_u_test, y_u_train, y_u_test = train_test_split(
                x_u, y_u, test_size=0.2)
            X_test = np.concatenate((x_p_test, x_u_test), axis=0)
            Y_test = np.concatenate((y_p_test, y_u_test), axis=0)
Example #7
rs = np.random.randint(0, 1000, 1)[0]
kf = StratifiedKFold(n_splits=5, shuffle=True,
                     random_state=rs).split(feature, label[:, 0])

test_auc_fold = []
test_aupr_fold = []
for train_index, test_index in kf:
    Xtrain, Xtest = feature[train_index], feature[test_index]
    Ytrain, Ytest = label[train_index], label[test_index]

    config = get_toy_config()
    rf = GCForest(config)
    Ytrain = Ytrain.flatten()
    rf.fit_transform(Xtrain, Ytrain)

    # deep forest
    predict_y = rf.predict(Xtest)
    acc = accuracy_score(Ytest, predict_y)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    prob_predict_y = rf.predict_proba(
        Xtest)  # class probabilities for each sample; each row sums to 1
    predictions_validation = prob_predict_y[:, 1]
    fpr, tpr, _ = roc_curve(Ytest, predictions_validation)
    roc_auc = auc(fpr, tpr)
    aupr = average_precision_score(Ytest, predictions_validation)
    print(roc_auc)
    print(aupr)
    test_auc_fold.append(roc_auc)
    test_aupr_fold.append(aupr)
    plt.figure()
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
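The snippet ends mid-figure; a hedged completion of the per-fold ROC plot with standard matplotlib calls (the styling choices are assumptions):

    plt.plot([0, 1], [0, 1], 'r--', label='Chance')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()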
Example #8
def run_classification_configuration(X_train_10_fold, X_test_10_fold,
                                     y_train_10_fold, y_test_10_fold,
                                     test_idx_10_fold, train_idx_10_fold, tree,
                                     max_depth, layer):

    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare = []
    train_true_predict_compare = []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(tree, max_depth, layer)
        gc = GCForest(config)
        #         print(config)
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])  #10-cv
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])  #10-cv

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)
        AUPR_training = auc(recall_training, precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:, 1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:, 1])
        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)

    return [
        Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
        Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
        folds_AUPR_training, folds_AUC_training, Avg_metrics3_testing,
        Avg_metrics3_training
    ], [test_true_predict_compare, train_true_predict_compare]
Example #9
    for i in range(0, len(val_labels)):
        if val_labels[i] == 0:
            val_fea_0.append(val_fea[i])
        else:
            val_fea_1.append(val_fea[i])
    test_fea = val_fea_1[:int(len(val_fea_1) /
                              2)] + val_fea_0[:int(len(val_fea_0) / 2)]
    test_labels = [1] * int(len(val_fea_1) / 2) + [0] * int(len(val_fea_0) / 2)
    train_fea = val_fea_1[int(len(val_fea_1) / 2):] + val_fea_0[int(len(val_fea_0) / 2):]
    train_labels = [1] * (len(val_fea_1) - int(len(val_fea_1) / 2)) + \
                   [0] * (len(val_fea_0) - int(len(val_fea_0) / 2))
    train_data = [[t, l] for t, l in zip(train_fea, train_labels)]
    test_data = [[d, l] for d, l in zip(test_fea, test_labels)]
    random.shuffle(train_data)
    random.shuffle(test_data)
    test_fea = [d[0] for d in test_data]
    test_labels = [d[1] for d in test_data]
    train_fea = [d[0] for d in train_data]
    train_labels = [d[1] for d in train_data]
    gc = GCForest(get_toy_config())  # should be a dict
    X_train_enc = gc.fit_transform(np.array(train_fea), np.array(train_labels))
    i = 0
    while os.path.exists('./gcForest_model/' + str(i)):
        i += 1
    os.makedirs('./gcForest_model/' + str(i))
    #pickle.dump(gc,open('./gcForest_model/'+ str(i)+'/model.pkl','wb+'),protocol=True)
    y_pred = gc.predict(np.array(test_fea))
    print(classification_report(test_labels, y_pred))
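If the commented-out pickle.dump above is re-enabled, a with-block is the safer form (a sketch using the same directory layout as the os.makedirs call, with an explicit pickle protocol):

    import pickle

    with open('./gcForest_model/' + str(i) + '/model.pkl', 'wb') as f:
        pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)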
Example #10
    config["cascade"] = ca_config
    return config


config = get_toy_config(all_estimators=all_estimators)
gc = GCForest(config)
# If keeping the model in memory costs too much, you can tell gcforest
# not to hold the trained estimators in memory:
# gc.set_keep_model_in_mem(False)  # default is True
n_test = 500
# (X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train, y_train = train_dataset_x[:-n_test], train_dataset_y[:-n_test]
X_test_cv, y_test_cv = train_dataset_x[-n_test:], train_dataset_y[-n_test:]

X_train = X_train[:, np.newaxis, :, :]
X_test_cv = X_test_cv[:, np.newaxis, :, :]

X_train_enc = gc.fit_transform(X_train, y_train)

y_pred_cv = gc.predict(X_test_cv)
acc = accuracy_score(y_test_cv, y_pred_cv)
print("Test Accuracy CV of GcForest = {:.2f} %".format(acc * 100))

y_pred = gc.predict(X_test_preprocessed)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

# save the model to disk
with open(pickle_name, "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
Example #11
        x_train = X.iloc[train]
        y_train = Y[train]

        x_test = X.iloc[test]
        y_test = Y[test]

        x_train = x_train.values.reshape(-1, 1, len(x_train.columns))
        x_test = x_test.values.reshape(-1, 1, len(x_test.columns))

        X_train = x_train[:, np.newaxis, :, :]
        X_test = x_test[:, np.newaxis, :, :]

        X_train_enc = clf_gc.fit_transform(X_train, y_train)

        ###############################
        y_pred = clf_gc.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        gc_pred_acc.append(acc)
        ###########################################################
        X_test_enc = clf_gc.transform(X_test)
        X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
        X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
        X_train_origin = X_train.reshape((X_train.shape[0], -1))
        X_test_origin = X_test.reshape((X_test.shape[0], -1))
        X_train_enc = np.hstack((X_train_enc, X_train_origin))
        X_test_enc = np.hstack((X_test_enc, X_test_origin))
        clf = XGBClassifier(n_estimators=100, n_jobs=-1)
        clf.fit(X_train_enc, y_train)

        y_pred = clf.predict(X_test_enc)
        acc = accuracy_score(y_test, y_pred)
Example #12
     y_test = Y_train[int(i):int(j)]
     train = np.append(X_train[0:int(i)],X_train[int(j):],axis=0)
     y_train = np.append(Y_train[0:int(i)],Y_train[int(j):],axis=0)
     
     y_test_all.extend(list(y_test))
     gc.fit_transform(train, y_train)
     y_predict = gc.predict(test)
     y_predict_all.extend(list(y_predict))
     
     y_predict_prob = gc.predict_proba(test)[:,1]
     y_predict_prob_all.extend(list(y_predict_prob))
     i+=length
     j+=length'''
 
 gc.fit_transform(X_train, Y_train)
 y_predict = gc.predict(X_test)
 y_predict_prob = gc.predict_proba(X_test)[:, 1]
 acc = accuracy_score(Y_test, y_predict)
 print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
 ROC_AUC_area = metrics.roc_auc_score(Y_test, y_predict_prob)
 print("ROC = " + str(ROC_AUC_area))
 ACC = metrics.accuracy_score(Y_test, y_predict)
 print("ACC: " + str(ACC))
 precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y_test, y_predict)
 F1_Score = metrics.f1_score(Y_test, y_predict)
 F_measure = F1_Score
 MCC = metrics.matthews_corrcoef(Y_test, y_predict)
 pos = TP + FN
 neg = FP + TN
 savedata = [[['gcforest', ACC, precision, recall, SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN, pos, neg]]]
 easy_excel.save(classifier + "_crossvalidation", [str(X_train.shape[1])], savedata, 'cross_validation_' + classifier + "_" + outputname + '.xls')
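This example assumes a performance() helper that is not shown. A minimal sketch consistent with how its nine outputs are unpacked above; treating GM as the geometric mean of SN and SP is an assumption, mirroring the G_mean in Example #19.

import numpy as np
from sklearn.metrics import confusion_matrix

def performance(y_true, y_pred):
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    SN = recall                                  # sensitivity
    SP = TN / (TN + FP) if (TN + FP) else 0.0    # specificity
    GM = np.sqrt(SN * SP)                        # assumed: geometric mean of SN and SP
    return precision, recall, SN, SP, GM, TP, TN, FP, FN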
Example #13
if __name__ == "__main__":
    name = 'vor3'
    with open('../../datasets/train_set_' + name + '.p', "rb") as pickle_in:
        train_set = pickle.load(pickle_in)
    with open('../../datasets/train_label_' + name + '.p', "rb") as pickle_in:
        train_label = pickle.load(pickle_in)
    with open('../../datasets/test_set_' + name + '.p', "rb") as pickle_in:
        test_set = pickle.load(pickle_in)
    with open('../../datasets/test_label_' + name + '.p', "rb") as pickle_in:
        test_label = pickle.load(pickle_in)
    train_set = train_set.to_numpy()
    train_label = train_label.to_numpy()
    test_set = test_set.to_numpy()
    test_label = test_label.to_numpy()
    print(len(np.unique(train_label)), len(np.unique(train_set)))
    print(train_label.shape, train_set.shape)
    config = get_config()
    gc = GCForest(config)
    print(config)
    X_train_enc = gc.fit_transform(train_set, train_label)
    title_model = '../results/trained_deep_forest_' + name + '.p'
    with open(title_model, 'wb') as f:
        pickle.dump(gc, f)
    y_pred = gc.predict(test_set)
    acc = accuracy_score(test_label, y_pred)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
    report = classification_report(test_label, y_pred)
    title = "report_deep_forest_"+name+".txt"
    write_report = open(title,"w")
    write_report.write(report)
Example #14
    config1 = load_json("/home/qiang/repo/python/experiment-gcForest/cascade_clf/examples/demo_ca.json")
    # If keeping the model in memory costs too much, you can tell gcforest
    # not to hold the trained estimators in memory:
    # gc.set_keep_model_in_mem(False)  # default is True
    config2 = get_toy_config()
    acc_st = []
    acc_gc = []
    acc_rf = []
    for i in range(10):
        (X_train, y_train), (X_test, y_test) = uci_yeast.load_data()


        gc1 = GCForest(config1)
        gc1.fit_transform(X_train, y_train)
        y_pred = gc1.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        acc_st.append(acc)
        print("Test Accuracy of stacking GcForest = {:.2f} %".format(acc * 100))

        # X_train, y_train = X_train[:2000], y_train[:2000]
        # X_train = X_train[:, np.newaxis, :]
        # X_test = X_test[:, np.newaxis, :]

        gc2 = GCForest(config2)
        gc2.fit_transform(X_train, y_train)
        # X_enc is the concatenated predict_proba result of each estimator in the last layer of the GCForest model
        # X_enc.shape =
        #   (n_samples, n_estimators * n_classes): if cascade is provided
        #   (n_samples, n_estimators * n_classes, dimX, dimY): if only the fine-grained part is provided
        # You can also pass X_test, y_test to fit_transform; the accuracy on the test data is then logged during training.
Example #15
gc = GCForest(config)

X_train_enc = gc.fit_transform(X_train_oversampled, y_train_oversampled)

# dump
with open("../pkl/2018_test.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
# load
with open("../pkl/2018_test.pkl", "rb") as f:
    gc = pickle.load(f)

# #### test GcForest on valid datasets

# In[22]:

y_valid_pred = gc.predict(X_valid)
print("============= 2018 datasets' results on valid =============")
gc_f1, gc_accuracy, gc_precision, gc_recall = evaluate(y_valid, y_valid_pred)

# # load 2018 Test datasets

# In[23]:

lines = open("../data/water/txt/2018waterDataTesting.txt").readlines()
num_lines = len(lines) - 1

X_test = np.ones((num_lines, 9))
y_test = np.ones((num_lines, 1))
flag = 0

lines = np.delete(lines, 0, axis=0)
Example #16
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    # X_train, y_train = X_train[:2000], y_train[:2000]
    X_train = X_train[:, np.newaxis, :, :]
    X_test = X_test[:, np.newaxis, :, :]

    X_train_enc = gc.fit_transform(X_train, y_train)
    # X_enc is the concatenated predict_proba result of each estimator in the last layer of the GCForest model
    # X_enc.shape =
    #   (n_samples, n_estimators * n_classes): if cascade is provided
    #   (n_samples, n_estimators * n_classes, dimX, dimY): if only the fine-grained part is provided
    # You can also pass X_test, y_test to fit_transform; the accuracy on the test data is then logged during training:
    # X_train_enc, X_test_enc = gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test)
    # WARNING: if you set gc.set_keep_model_in_mem(True), you have to use
    # gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test) to evaluate your model.

    y_pred = gc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

    # You can try passing X_enc to another classifier on top of gcForest, e.g. xgboost/RF.
    X_test_enc = gc.transform(X_test)
    X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
    X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
    X_train_origin = X_train.reshape((X_train.shape[0], -1))
    X_test_origin = X_test.reshape((X_test.shape[0], -1))
    X_train_enc = np.hstack((X_train_origin, X_train_enc))
    X_test_enc = np.hstack((X_test_origin, X_test_enc))
    print("X_train_enc.shape={}, X_test_enc.shape={}".format(
        X_train_enc.shape, X_test_enc.shape))
    clf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1)
    clf.fit(X_train_enc, y_train)
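A short follow-up evaluating the stacked classifier, mirroring the accuracy check used earlier in this example (a sketch, not part of the original snippet):

    y_pred_stacked = clf.predict(X_test_enc)
    acc = accuracy_score(y_test, y_pred_stacked)
    print("Test Accuracy of stacked RandomForest = {:.2f} %".format(acc * 100))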
Example #17
def load_json(path):
    """
    Load a JSON config, allowing comment lines that start with //.
    """
    import json
    lines = []
    with open(path) as f:
        for row in f.readlines():
            if row.strip().startswith("//"):
                continue
            lines.append(row)
    return json.loads("\n".join(lines))

X, Y = load.obesity_data()

x_tr,x_te,y_tr,y_te = train_test_split(X,Y,random_state=42,stratify=Y)

clf_rf = RandomForestClassifier(n_estimators=200, random_state=0)
clf_rf.fit(x_tr,y_tr)
y_pred = clf_rf.predict(x_te)
print(accuracy_score(y_te,y_pred))


config = load_json("/home/qiang/repo/python/cascade_clf/examples/demo_ca.json")
clf_gc = GCForest(config)

clf_gc.fit_transform(x_tr.values, y_tr)
y_pred = clf_gc.predict(x_te.values)
print(accuracy_score(y_te, y_pred))
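A self-contained usage sketch for load_json: write a config with //-comment lines to a temporary file and parse it (the file contents are illustrative only):

import tempfile

demo = '''
// toy cascade config
{"cascade": {"n_classes": 2, "estimators": []}}
'''
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    f.write(demo)
config = load_json(f.name)
print(config["cascade"]["n_classes"])  # -> 2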
Example #18
y_train2 = y_train2.values
y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values


# In[90]:


config = get_toy_config()
model = GCForest(config)

model.fit_transform(X_train2, y_train2, X_test, y_test)
gc_valid_proba = model.predict_proba(X_valid)
gc_pred = model.predict(X_valid)


# In[14]:


models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    ExtraTreesClassifier(random_state=random_seed),
Example #19
def run_classification_configuration(
        X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
        test_idx_10_fold, train_idx_10_fold, rf_tree, rf_max_depth, rf_tree_2,
        rf_max_depth_2, xgb_tree, xgb_max_depth, min_child_weight, lr,
        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2, layer, mode,
        seed):

    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training = [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_25, folds_recall_50, folds_recall_100, folds_recall_200, folds_recall_400 = [], [], [], [], []
    folds_G_mean = []
    i = 0
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(
            X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,
            test_idx_10_fold, train_idx_10_fold):

        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2,
                                rf_max_depth_2, xgb_tree, xgb_max_depth,
                                min_child_weight, lr, xgb_tree_2,
                                xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        X_train_enc = gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)

        temp = pd.DataFrame([y_test, y_predprob_test[:, 1],
                             y_pred_test]).T.sort_values(by=1, ascending=False)
        recall_25 = precision_recall_fscore_support(temp.iloc[:25, :][0],
                                                    temp.iloc[:25, :][2],
                                                    pos_label=1,
                                                    average='binary')[1]
        recall_50 = precision_recall_fscore_support(temp.iloc[:50, :][0],
                                                    temp.iloc[:50, :][2],
                                                    pos_label=1,
                                                    average='binary')[1]

        test_true_predict_compare.append([
            test_idx_fold, y_pred_test, y_test, y_predprob_test[:, 0],
            y_predprob_test[:, 1]
        ])  #10-cv
        train_true_predict_compare.append([
            train_idx_fold, y_pred_train, y_train, y_predprob_train[:, 0],
            y_predprob_train[:, 1]
        ])  #10-cv

        precision_training, recall_training, _ = precision_recall_curve(
            y_train, y_predprob_train[:, 1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(
            y_test, y_predprob_test[:, 1], pos_label=1)

        AUPR_training, AUPR_testing = auc(recall_training,
                                          precision_training), auc(
                                              recall_testing,
                                              precision_testing)
        AUC_training, AUC_testing = roc_auc_score(
            y_train,
            y_predprob_train[:, 1]), roc_auc_score(y_test, y_predprob_test[:,
                                                                           1])

        metrics3_testing = precision_recall_fscore_support(
            y_test, y_pred_test, pos_label=1, average='binary')[:3]
        metrics3_training = precision_recall_fscore_support(
            y_train, y_pred_train, pos_label=1, average='binary')[:3]

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test,
                                          labels=[0, 1]).ravel()
        specificity = float(tn) / float(tn + fp)
        recall = metrics3_testing[1]
        G_mean = np.sqrt(recall * specificity)

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_G_mean.append(G_mean)
        folds_recall_25.append(recall_25)
        folds_recall_50.append(recall_50)
        i += 1
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis=0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis=0)
    Avg_G_mean = np.mean(folds_G_mean)

    return [
        Avg_AUPR_training,
        Avg_AUPR_testing,
        folds_AUPR_testing,  # indices 0-2
        Avg_AUC_training,
        Avg_AUC_testing,
        folds_AUC_testing,  # indices 3-5
        folds_AUPR_training,
        folds_AUC_training,  # indices 6-7
        Avg_metrics3_testing,
        Avg_metrics3_training,  # indices 8-9
        folds_recall_25,
        folds_recall_50,
        folds_G_mean
    ], [test_true_predict_compare, train_true_predict_compare]
    # folds_recall_100, folds_recall_200, folds_recall_400 are initialized above but never populated