Example #1
SVM_plot = plt.figure()
epochs = 50
for epoch in range(epochs):
    print(epoch + 1)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        shuffle=True)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    y_train = y_train.values
    ## Define the n-fold CV used for hyper-parameter optimization on the training set.
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=50, random_state=200889)

    ## Define an RBF-kernel SVM classifier
    model = SVC(kernel='rbf')

    ## Define the hyper-parameter grid searched on the training set.
    c_values = [0.000001, 0.00001, 0.0001, 0.001, 0.01]
    gamma = [0.00000001, 0.0000001, 0.000001]
    param_grid = dict(C=c_values, gamma=gamma)
    grid = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        cv=cv,
                        scoring='roc_auc',
                        n_jobs=-1)
    grid_result = grid.fit(x_train, y_train)
    print('Best model:', grid_result.best_estimator_)
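    # Hedged addition (not part of the original snippet): score the tuned SVC on
    # this epoch's held-out 20% split. SVC without probability=True still
    # exposes decision_function, which roc_auc_score accepts for binary labels.
    from sklearn.metrics import roc_auc_score
    best_svc = grid_result.best_estimator_
    print('Held-out ROC AUC:', roc_auc_score(y_test, best_svc.decision_function(x_test)))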
Example #2
]]

studies = [
    AcharyaStudy, HosseinZahdeStudy, FergusStudy, Fergus2013Study, IdowuStudy,
    HussainStudy, AhmedStudy, RenStudy, KhanStudy, PengStudy,
    JagerLibensekStudy
]

Xs = [
    X_acharya, X_hosseinzahde, X_fergus, X_fergus2013, X_idowu, X_husain,
    X_ahmed, X_ren, X_khan, X_peng, X_jagerlibensek
]

y = LabelEncoder().fit_transform(y)

validator = RepeatedStratifiedKFold(n_repeats=2, n_splits=10)

results = {}
tests = {}
models = {}

for i, (train, test) in enumerate(validator.split(X, y)):  # X: reference feature matrix (defined above this snippet)
    print("fold: %d" % i)
    models[i] = {}
    results[i] = {}
    tests[i] = {}
    for j in range(len(studies)):
        print("study: %s" % studies[j].__name__)
        models[i][j] = studies[j]().fit(Xs[j].iloc[train].values, y[train])
        results[i][j] = models[i][j].predict_proba(Xs[j].iloc[test].values)[:,
                                                                            1]
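        # Hedged addition: fill the (otherwise unused) tests dict with the
        # per-study, per-fold test AUC computed from the stored probabilities.
        from sklearn.metrics import roc_auc_score
        tests[i][j] = roc_auc_score(y[test], results[i][j])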
Example #3
C = np.arange(1e-05, 5.5, 0.1)
scoring = {'Accuracy': 'accuracy', 'AUC': 'roc_auc', 'Log_loss': 'neg_log_loss'}
log_reg = LogisticRegression()

#Simple pre-processing estimators
###############################################################################
std_scale = StandardScaler(with_mean=False, with_std=False)  # pass-through: no centering or scaling
#std_scale = StandardScaler()

#Defining the CV method: Using the Repeated Stratified K Fold
###############################################################################

n_folds=5
n_repeats=5

rskfold = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=2)

#Creating simple pipeline and defining the gridsearch
###############################################################################

log_clf_pipe = Pipeline(steps=[('scale',std_scale), ('clf',log_reg)])

log_clf = GridSearchCV(estimator=log_clf_pipe, cv=rskfold,
              scoring=scoring, return_train_score=True,
              param_grid=dict(clf__C=C), refit='Accuracy')

log_clf.fit(X, y)
results = log_clf.cv_results_

print('='*20)
print("best params: " + str(log_clf.best_estimator_))
Example #4
    tam = len(class_names) * figprop[len(class_names)]

results = {}
models = {}
nrounds = 1 if debug else 5
"""##kfold Experiments"""

X, y = None, None
for load in range(4):
    X, y = concatenate_datasets(X, y, eval('xn_' + str(load)),
                                eval('yn_' + str(load)))
for severity in severities:
    X, y = concatenate_datasets(X, y, eval('x' + str(severity)),
                                eval('y' + str(severity)))
rskf = RepeatedStratifiedKFold(n_splits=len(severities),
                               n_repeats=nrounds,
                               random_state=36851234)
fold = 0
count_round = 0

results['kfold'] = {}
models['kfold'] = {}
y_test_round = None
y_pred_round = {}

print("k-Fold")
for train_index, test_index in rskf.split(X, y):
    print("{}/{}".format(fold + 1, rskf.get_n_splits() // nrounds), end=" x ")
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    if y_test_round is None:
Example #5
# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))

# evaluate each model in turn
num_folds = 5
scoring = 'roc_auc'
results = []
names = []

for name, model in models:
    kfold = RepeatedStratifiedKFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(estimator=model,
                                 X=X_train,
                                 y=y_train,
                                 cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = f"{name} {cv_results.mean()} +/- ({cv_results.std()})"
    print(msg)
# -

# compare performance
fig1 = pyplot.figure()
fig1.suptitle('Algorithm comparison')
ax = fig1.add_subplot(111)
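
# Hedged completion (the snippet breaks off here): the usual box-and-whisker
# comparison of the per-fold scores collected above.
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()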
Example #6
def run_repeatedCV():
    sen_df = pd.DataFrame()
    spe_df = pd.DataFrame()
    auc_df = pd.DataFrame()
    mcc_df = pd.DataFrame()
    k = 0
    pos_data, kmer_data = pd_read_pattern()
    new_kmer = shuffle_data(kmer_data)
    #KDE kmer using top 30 percentile features with tfidf scores for test derived from train using transform()
    for i in range(0, 10):
        ctr = 0
        print("Window size: ", i + 1, "\n")
        if (i == 0):
            names2 = ['df_2_cv', 'df_3_cv', 'df_2_tf', 'df_3_tf']
        else:
            names2 = [
                'df_2_cv', 'df_3_cv', 'df_4_cv', 'df_2_tf', 'df_3_tf',
                'df_4_tf'
            ]
        X = new_kmer[i]
        y = new_kmer[i]['Label']
        print("kmer size", names2[ctr])
        ctr = ctr + 1
        rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
        for train_index, test_index in rskf.split(X, y):
            sen_kde = []
            spe_kde = []
            acc_kde = []
            auc_kde = []
            m_kde = []
            c = []
            k = k + 1
            print(k, end=",")
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            dat_train, dat_test, names = cv_tf_transformation(X_train, X_test)

            for j in range(0, len(dat_train)):
                dat_train[j]['Chr'] = dat_train[j]['Chr'].replace(['X'], '21')
                dat_test[j]['Chr'] = dat_test[j]['Chr'].replace(['X'], '21')
                train_x = dat_train[j].drop('Label', axis=1)
                train_y = dat_train[j]['Label']
                test_x = dat_test[j].drop('Label', axis=1)
                test_y = dat_test[j]['Label']
                X_red = feature_reduction_using_trees(train_x, train_y)
                rf = RandomForestClassifier()
                param_grid = {
                    'n_estimators': [50, 100, 200, 300, 400],
                    'max_features': ['sqrt', 'log2'],  # 'auto' (an alias of 'sqrt') was removed in sklearn 1.3
                    'max_depth': [2, 3, 5, 7],
                    'min_samples_leaf': [1, 3],
                    'min_samples_split': [2, 5, 10],
                }
                grid = GridSearchCV(rf, param_grid, cv=3)
                grid.fit(X_red, train_y.ravel())
                best_model = grid.best_estimator_
                best_model.fit(X_red, train_y.ravel())
                y_probs = best_model.predict_proba(test_x[X_red.columns])[:, 1]
                thresholds = arange(0, 1, 0.001)
                scores = [
                    roc_auc_score(test_y, convert_to_labels(y_probs, t))
                    for t in thresholds
                ]
                ix = argmax(scores)
                # Reuse the probabilities computed above instead of calling
                # predict_proba a second time.
                y_test_predictions = np.where(y_probs > thresholds[ix], 2, 1)
                sensi = sensitivity_score(test_y,
                                          y_test_predictions,
                                          pos_label=2)
                speci = specificity_score(test_y,
                                          y_test_predictions,
                                          pos_label=2)
                accu = accuracy_score(test_y, y_test_predictions)
                auro = roc_auc_score(test_y, y_test_predictions)
                mcc = metrics.matthews_corrcoef(test_y, y_test_predictions)
                c.append(X_red.columns)
                sen_kde.append(sensi)
                spe_kde.append(speci)
                acc_kde.append(accu)
                auc_kde.append(auro)
                m_kde.append(mcc)
            # DataFrame.append was removed in pandas 2.0; build a one-row frame
            # per fold and concatenate it instead. names2 already matches the
            # order of the per-dataset metric lists for both window sizes.
            sen_df = pd.concat(
                [sen_df, pd.DataFrame([dict(zip(names2, sen_kde))])],
                ignore_index=True)
            spe_df = pd.concat(
                [spe_df, pd.DataFrame([dict(zip(names2, spe_kde))])],
                ignore_index=True)
            auc_df = pd.concat(
                [auc_df, pd.DataFrame([dict(zip(names2, auc_kde))])],
                ignore_index=True)
            mcc_df = pd.concat(
                [mcc_df, pd.DataFrame([dict(zip(names2, m_kde))])],
                ignore_index=True)
    return sen_df, spe_df, auc_df, mcc_df
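
# Hedged usage sketch (assuming the helpers referenced above, pd_read_pattern,
# shuffle_data, cv_tf_transformation, feature_reduction_using_trees and
# convert_to_labels, are defined elsewhere in the module):
# sen_df, spe_df, auc_df, mcc_df = run_repeatedCV()

Example #7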
    query={},
    project={'key_id':1, 'recognized':1, 'word':1},
    host='localhost',
    port=27017,
    username=None,
    password=None,
    no_id=True,
    num_sample=10000
)

df = df.loc[df['recognized']].reset_index()

ids = df.index.values
word_class = df.word.values

rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=999)

cv_idx = 0
for train_idx, test_idx in rskf.split(ids, word_class):
    train_cv = df.loc[train_idx]
    test_cv = df.loc[test_idx]
    # sys.exit()  # debug stop; if left enabled, only the first fold is written
    train_cv.to_csv(f'{path_CV}/train_df_{cv_idx}.csv')
    test_cv.to_csv(f'{path_CV}/val_df_{cv_idx}.csv')
    cv_idx += 1


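Example #8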
RANDOM_STATE_CV = 124213

group_labels = [
    "[0 - 5)", "[5 - 10)", "[10 - 15)", "[15 - 20)", "[20 - 25)", "[25 - 30)",
    "[30 - 35)", "[35 - 40)", "[40 - 45)", "[45 - 50)", "[50 - 55)",
    "[55 - 60)", "[60 - 65)", "[65 - 70)", "[70 - 75)", "75+"
]

X = otu_df.loc[age_cohort.index, :].astype(float).values
y = age_cohort["target"].astype(float)
print(y.value_counts())
X = np.log(X + 1.0)
X, y = shuffle(X, y)

cv = RepeatedStratifiedKFold(n_splits=10,
                             n_repeats=10,
                             random_state=RANDOM_STATE_CV)

results = []
C_dist = [0.001, 0.01, 0.1, 1.0, 5.0]
C_dist = [0.001]  # overrides the fuller grid above with a single value
confusion_mats = []

for c in C_dist:
    print(c)
    # NOTE: this SVC is immediately shadowed by the RandomForestClassifier below.
    alg_rf = SVC(C=c,
                 kernel='linear',
                 class_weight='balanced',
                 random_state=RANDOM_STATE_SVM)
    alg_rf = RandomForestClassifier(n_estimators=256,
                                    class_weight='balanced',
Example #9
def objective(trial, train, test, raw_features):
    # start_time = timer()
    CLIP_FEATURES = False  # trial.suggest_categorical("clip", ["True", "False"])

    df_all_X = pd.concat([train.drop('target', axis=1), test], axis=0)
    if not CLIP_FEATURES:
        le = LabelEncoder()
        df_all_X = df_all_X.apply(le.fit_transform)
    else:
        # Any test value never seen in training gets replaced with this sentinel.
        missing_integer = df_all_X.max().max() + 15
        for col in test.columns:  # inspect every test-set column
            # Does this column contain out-of-vocabulary values?
            if not all(test[col].isin(train[col].value_counts().index.tolist())):
                for i in set(test[col]).difference(set(train[col])):
                    # Replace the out-of-bounds value with the sentinel.
                    test[col].replace(i, missing_integer, inplace=True)
                # Boolean indicator columns marking the substituted rows.
                train[f'{col}_{missing_integer}'] = 0
                test[f'{col}_{missing_integer}'] = 1
                test[f'{col}_{missing_integer}'].where(
                    test[col] == missing_integer, 0, inplace=True)
        df_all_X = pd.concat([train.drop('target', axis=1), test], axis=0)

    # Group low-frequency values into one bucket
    GROUP_LOW_FREQUENCY = False
    GROUP_LOW_FREQUENCY_THRESHOLD = 0  # trial.suggest_discrete_uniform("threshold", 0, 50, 1)

    if GROUP_LOW_FREQUENCY:
        for col in raw_features:
            value_counts_SS = df_all_X[col].value_counts()
            low_freq_values = value_counts_SS.index[
                value_counts_SS < GROUP_LOW_FREQUENCY_THRESHOLD]
            if len(low_freq_values) > 0:
                df_all_X[f'{col}_low_freq'] = 0
                for i in low_freq_values.tolist():
                    # .loc with a boolean mask assigns in place; chained
                    # .iloc[...] on a column copy may silently do nothing.
                    df_all_X.loc[df_all_X[col] == i, f'{col}_low_freq'] = 1

    Xtrn, Xtst = df_all_X.iloc[:len(train)], df_all_X.iloc[len(train):]
    le = LabelEncoder()
    y = pd.Series(le.fit_transform(train['target']))

    # ANCHOR CONSTRUCTION
    class0 = 0.8023844111083878
    class1 = 0.4973760913650416
    class2 = 0.8940055025348296
    class3 = 0.8641162667601383

    class_weights = [class0, class1, class2, class3]
    losses = []
    y_oof = np.zeros((Xtrn.shape[0], len(np.unique(y))))
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "val-mlogloss")
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                                   n_repeats=N_REPEATS,
                                   random_state=RANDOM_SEED)

    temp_map = {
        # "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.1,
                                                     0.8),
        "subsample": trial.suggest_loguniform("subsample", 0.1, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "gamma": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 3,
                                                     100),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }

    # ANCHOR CONSTRUCTION

    # from sklearn.model_selection import train_test_split
    # le = LabelEncoder()
    # y = le.fit_transform(train['target'])
    # X_A, X_B, y_A, y_B = train_test_split(train.drop('target', axis=1), y, test_size=0.33, random_state=42)
    # dtrain = xgb.DMatrix(X_A, label=y_A)
    # dtest = xgb.DMatrix(X_B, label=y_B)
    # params = {
    #     'objective': "multi:softprob",
    #     'eval_metric': 'mlogloss',
    #     'n_estimators': 10000,
    #     'booster': 'gbtree',
    #     'tree_method': 'gpu_hist',
    #     'num_class': 4
    # }
    # xgb_model = xgb.train(params,
    #     dtrain=dtrain,
    #     evals=[(dtest, 'val'), (dtrain, 'train')],
    #     verbose_eval=False)
    # tmp = xgb_model.predict(xgb.DMatrix(X_B))

    for i, (train_index, valid_index) in enumerate(rskf.split(Xtrn, y)):
        X_A, X_B = Xtrn.iloc[train_index, :], Xtrn.iloc[valid_index, :]
        y_A, y_B = y.iloc[train_index], y.iloc[valid_index]
        # sample_weight_fold = [class_weights[j] for j in y_A]
        params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'n_estimators': 10000,
            'booster': 'gbtree',
            'verbosity': 0,
            'tree_method': 'gpu_hist',
            'num_class': 4
        }
        dtrain = xgb.DMatrix(X_A, label=y_A)
        dtest = xgb.DMatrix(
            X_B, label=y_B)  #, weight=[class_weights[j] for j in y_B])
        dtestX = xgb.DMatrix(X_B)
        params.update(temp_map)
        # learning api https://tinyurl.com/yz8bqyfd
        xgb_model = xgb.train(
            params,
            dtrain=dtrain,
            evals=[(dtest, 'val'), (dtrain, 'train')],
            #     sample_weight=sample_weight_fold,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            callbacks=[pruning_callback],
            verbose_eval=False)

        # xgb_classifier = XGBClassifier(**params)
        # xgb_classifier.fit(
        #     X_A, y_A, eval_set=[(X_B, y_B)],
        #     sample_weight=sample_weight_fold,
        #     early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        #     callbacks=[pruning_callback]
        # )
        # tmp = xgb_classifier.predict_proba(X_B)
        tmp = xgb_model.predict(dtestX)
        y_oof[valid_index, :] = tmp / N_REPEATS
        loss = log_loss(y_B, tmp)
        losses.append(loss)
        # print(f'loss: {loss}')

    mean_running_loss = np.mean(losses)
    # print(f'average running loss: {mean_running_loss}')
    # oof_loss = log_loss(y, y_oof)
    # print(f'average repeat oof loss: {oof_loss}')
    # timer(start_time)

    # To avoid running out of memory and still save a copy of the best model.
    # YOU NEED THE FOLLOWING TWO LINES.
    # trial.set_user_attr(key="best_booster", value=copy.deepcopy(xgb_model)) # comment this out
    del xgb_model  # release memory https://tinyurl.com/ydw9nebm

    return mean_running_loss
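
# Hedged usage sketch (not in the original): wiring the objective into an
# Optuna study. train, test, raw_features and the module-level constants
# (N_SPLITS, N_REPEATS, RANDOM_SEED, EARLY_STOPPING_ROUNDS) are assumed to
# exist as in the snippet above.
import optuna
study = optuna.create_study(direction="minimize")
study.optimize(lambda t: objective(t, train, test, raw_features), n_trials=50)
print(study.best_params)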
Example #10
le = LabelEncoder()
df_all_X = df_all_X.apply(le.fit_transform)
Xtrn, Xtst = df_all_X.iloc[:len(train)], df_all_X.iloc[len(train):]
le = LabelEncoder()
y = pd.Series(le.fit_transform(train['target']))
losses = []
y_oof = np.zeros((Xtst.shape[0], len(np.unique(y))))
y_val = np.zeros((Xtrn.shape[0], len(np.unique(y))))

N_SPLITS = 7
N_REPEATS = 3
EARLY_STOPPING_ROUNDS = 10
RANDOM_SEED = 2021

rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                               n_repeats=N_REPEATS,
                               random_state=RANDOM_SEED)

dtestX = xgb.DMatrix(Xtst)
for i, (train_index, valid_index) in enumerate(rskf.split(Xtrn, y)):
    X_A, X_B = Xtrn.iloc[train_index, :], Xtrn.iloc[valid_index, :]
    y_A, y_B = y.iloc[train_index], y.iloc[valid_index]
    dtrain = xgb.DMatrix(X_A, label=y_A)
    dval = xgb.DMatrix(X_B,
                       label=y_B)  #, weight=[class_weights[j] for j in y_B])
    dvalX = xgb.DMatrix(X_B)
    # learning api https://tinyurl.com/yz8bqyfd
    xgb_model = xgb.train(params,
                          dtrain=dtrain,
                          evals=[(dval, 'val'), (dtrain, 'train')],
                          early_stopping_rounds=EARLY_STOPPING_ROUNDS,
Example #11
# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, slice(0, 8)),
                            ('scaler_moda', scaler_moda,
                             slice(8, len(X.columns)))])

# Create the Pipeline combining the ColumnTransformer and the classifier
pipeline = Pipeline([('imputer', imputer), ('scaler', scaler),
                     ('svm',
                      SVC(random_state=RANDOM_STATE,
                          class_weight=CLASS_WEIGHT,
                          probability=True))])

# Inner CV (5-times repeated stratified 2-fold GridSearchCV to pick the best parameters)
rskf = RepeatedStratifiedKFold(n_splits=2,
                               n_repeats=5,
                               random_state=RANDOM_STATE)  # inner
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=PARAM_GRID,
                           scoring=SCORING,
                           cv=rskf)

# # Outer CV (stratified 5-fold cross-validation to estimate accuracy)
# scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)  # outer
# print('Scores: {}' .format(scores['test_score']))
# print('Mean score: {}' .format(np.mean(scores['test_score'])))

# # Build a 'dummy' classifier and score it with the same 5-fold CV for a more realistic baseline
# dummy_clf = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE)
# dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)
# print('Dummy scores: {}' .format(dummy_scores['test_score']))
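
# Hedged addition (not in the original): run the inner search on the full data
# and inspect the winning SVM configuration; X, y, PARAM_GRID and SCORING are
# assumed to be defined above the snippet.
grid_search.fit(X, y)
print('Best params: {}'.format(grid_search.best_params_))
print('Best inner-CV score: {}'.format(grid_search.best_score_))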
Example #12
    'n_jobs': [-1],
    'random_state': [seed],
    'verbose': [0],
    #'class_weight': [],
})

rf_gridCV_2 = param_search(X_train, y_train, RandomForestClassifier, rf_params)
print(rf_gridCV_2.best_score_)

pd.DataFrame(rf_gridCV_2.cv_results_).to_csv("last_results.csv")

# Repeated cross validation
print("Running model...")
start = time.time()
model = rf_gridCV_2.best_estimator_
kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=seed)
cv_results = cross_validate(model,
                            X_train,
                            y_train,
                            scoring=['accuracy', 'roc_auc'],
                            cv=kfold,
                            n_jobs=-1,
                            verbose=2,
                            return_train_score=False)
print("Test accuracy:{}".format(cv_results['test_accuracy'].mean()))
print("Test ROC AUC:{}".format(cv_results['test_roc_auc'].mean()))
end = time.time()
print("Duration:{}".format(end - start))

pd.DataFrame(cv_results).to_csv("rep_cv_res.csv")
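
# Hedged addition: report the fold-to-fold spread alongside the means above.
print("Test accuracy: {:.4f} +/- {:.4f}".format(
    cv_results['test_accuracy'].mean(), cv_results['test_accuracy'].std()))
print("Test ROC AUC: {:.4f} +/- {:.4f}".format(
    cv_results['test_roc_auc'].mean(), cv_results['test_roc_auc'].std()))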
Example #13
    def roc(self,
            idx: int = None,
            estimator=None,
            fitness_idx: int = 0,
            cv: int = 5,
            reps: int = 1,
            positive_class: int = None,
            random_state: int = 0):
        """
        Function that allows to represent the ROC curve on the solutions found in the non-dominated front
        using cross validation with repetitions.

        Parameters
        ------------
        :param idx: int
            Index of the solution to represent.
        :param fitness_idx: int
            Index of the fitness function to represent.
        :param estimator: <optional> sklearn.base.BaseEstimator
            If none is provided the algorithm estimator will be used. This must support predictions with
            probabilities, otherwise it will throw an exception.
        :param cv: <optional> int
            Default 5
        :param reps: <optional> int
            Default 1
        :param positive_class: <optional> int
            By default the class selected as positive in the algorithm. In the case that the algorithm does
            not have a positive class and one is not provided, an exception will be thrown.
        :param random_state: <optional> int
        """
        import numpy as np
        import matplotlib.pyplot as plt
        import matplotlib.lines as mlines
        from sklearn.model_selection import RepeatedStratifiedKFold
        from sklearn.metrics import roc_curve, auc

        #  Get positive class
        if positive_class is None:
            positive_class = self.algorithm.positive_class

        #  Check if the estimator can be used to compute roc curves
        if estimator is None:
            estimator = self.algorithm.fitness[fitness_idx].estimator
            try:
                estimator.probability = True
            except AttributeError:
                raise UnsuitableClassifier(
                    "The classifier does not support probabilities, so the ROC curve cannot be "
                    "computed. Run the algorithm with a classifier that supports probabilities, "
                    "or pass one via the \"estimator\" argument.")

        # Get dataset
        x_data, y_data = self.algorithm.get_dataset()

        #  Get non-dominated solutions
        best_solutions = self._get_pareto_front()

        #  if the user has selected a certain solution use only that solution
        if idx is not None:
            indexes = [idx]
        else:
            indexes = list(range(len(best_solutions)))

        fig, ax = plt.subplots(figsize=(10, 5))

        viridis = plt.cm.get_cmap('viridis', len(indexes))

        solutions_legend = []

        for index in indexes:

            mean_tp = 0.0
            mean_fp = np.linspace(0, 1, 100)
            roc_auc = []

            #  Create cross-validation iterator
            cv_iterator = list(
                RepeatedStratifiedKFold(n_splits=cv,
                                        n_repeats=reps,
                                        random_state=random_state).split(
                                            x_data[:, best_solutions[index]],
                                            y_data))

            #  Compute the ROC curve for each fold of each solution
            for i, (train, test) in enumerate(cv_iterator):
                probs = estimator.fit(x_data[np.ix_(train, best_solutions[index])], y_data[train]) \
                    .predict_proba(x_data[np.ix_(test, best_solutions[index])])

                fp, tp, thresholds = roc_curve(y_data[test],
                                               probs[:, 1],
                                               pos_label=positive_class)

                mean_tp += np.interp(mean_fp, fp, tp)
                mean_tp[0] = 0.0

                #  Compute AUC
                roc_auc.append(auc(fp, tp))

                ax.plot(fp, tp, color=viridis(index), alpha=0.3)

            solutions_legend.append(
                mlines.Line2D([], [],
                              color=viridis(index),
                              marker='.',
                              markersize=5,
                              label='Solution (%d) AUC = %.3f +/- %.3f' %
                              (index, np.mean(roc_auc), np.std(roc_auc))))

        ax.plot([0, 1], [0, 1],
                linestyle='-.',
                color='black',
                label="Random Classifier")

        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver operating characteristic')

        plt.legend(handles=solutions_legend, loc="lower right")

        plt.show()
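
# Hedged usage sketch (assuming `report` is an instance of the class exposing
# this method, after the algorithm has been fitted; the name is illustrative):
# report.roc(cv=5, reps=3, positive_class=1, random_state=0)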