Exemplo n.º 1
0
def run_supervised_transfer_cv(seed,
                               dataset,
                               fold=3,
                               val_size=0,
                               k=-1,
                               batch_size=32,
                               groups=('WHITE', 'BLACK'),
                               learning_rate=0.01,
                               lr_decay=0.0,
                               dropout=0.5,
                               tune_epoch=200,
                               tune_lr=0.002,
                               train_epoch=1000,
                               L1_reg=0.001,
                               L2_reg=0.001,
                               hiddenLayers=(128, 64),
                               tune_batch=10):
    """Cross-validate supervised transfer learning between two groups.

    For every stratified fold of the ``groups[1]`` samples, an MLP is first
    trained on all ``groups[0]`` samples, then fine-tuned on the fold's
    training part, and finally scored on the fold's held-out part. The
    held-out scores of all folds are pooled into a single ROC AUC.

    Args:
        seed: Random state of the fold splitter; also used as the index
            label of the returned row.
        dataset: 5-tuple ``(X, Y, R, y_sub, y_strat)``. ``R`` is compared
            against ``groups`` to select samples and ``y_strat`` drives the
            stratification. The fourth element is not used here.
        fold: Number of stratified folds.
        val_size: If truthy, passed as ``test_size`` to carve a validation
            split out of each training fold for fine-tuning.
        k: If positive, keep only the ``k`` best features, selected with
            ``f_classif`` on each training fold.
        batch_size: Batch size of the pre-training step.
        groups: ``(source_group, target_group)`` values looked up in ``R``.
        learning_rate, lr_decay, dropout, L1_reg, L2_reg: Forwarded to
            ``MLP``.
        tune_epoch: Number of fine-tuning epochs.
        tune_lr: Learning rate assigned to the classifier before tuning.
        train_epoch: Number of pre-training epochs.
        hiddenLayers: Hidden layer sizes; a fresh list is handed to ``MLP``.
        tune_batch: Batch size of the fine-tuning step.

    Returns:
        A one-row ``DataFrame`` indexed by ``seed`` with the columns
        ``'folds'`` and ``'TL_Auc'``.
    """
    X, Y, R, _, y_strat = dataset

    target_mask = R == groups[1]
    X_b, y_b, R_b, y_strat_b = (X[target_mask], Y[target_mask],
                                R[target_mask], y_strat[target_mask])
    source_mask = R == groups[0]
    X_w, y_w = X[source_mask], Y[source_mask]

    fold_frames = []
    kf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)
    for train_index, test_index in kf.split(X_b, y_strat_b):
        X_train, X_test = X_b[train_index], X_b[test_index]
        Y_train, Y_test = y_b[train_index], y_b[test_index]
        R_test = R_b[test_index]
        strat_train = y_strat_b[train_index]

        pretrain_set = (X_w, y_w)
        if k > 0:
            # The selector is fitted on the target training fold only and
            # then applied to the test fold and the source-group data alike.
            k_best = SelectKBest(f_classif, k=k)
            k_best.fit(X_train, Y_train)
            X_train = k_best.transform(X_train)
            X_test = k_best.transform(X_test)
            pretrain_set = (k_best.transform(X_w), y_w)

        valid_data = None
        if val_size:
            X_train, X_val, Y_train, Y_val = train_test_split(
                X_train,
                Y_train,
                test_size=val_size,
                random_state=0,
                stratify=strat_train)
            valid_data = (X_val, Y_val)
        train_data = (X_train, Y_train)

        classifier = MLP(n_in=X_train.shape[1],
                         learning_rate=learning_rate,
                         lr_decay=lr_decay,
                         dropout=dropout,
                         L1_reg=L1_reg,
                         L2_reg=L2_reg,
                         hidden_layers_sizes=list(hiddenLayers))
        classifier.train(pretrain_set,
                         n_epochs=train_epoch,
                         batch_size=batch_size)
        classifier.learning_rate = tune_lr
        classifier.tune(train_data,
                        valid_data=valid_data,
                        batch_size=tune_batch,
                        n_epochs=tune_epoch)

        # Column 1 is presumably the positive-class score -- confirm
        # against MLP.get_score.
        scr = classifier.get_score(X_test)
        # Build the frame column by column: stacking the scores next to the
        # group labels in one array would coerce everything to a common
        # dtype (strings when R holds group names).
        fold_frames.append(
            pd.DataFrame({'scr': np.ravel(scr[:, 1]),
                          'R': np.ravel(R_test),
                          'Y': np.ravel(Y_test)},
                         index=list(test_index)))

    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    pooled = pd.concat(fold_frames)
    A_CI = roc_auc_score(pooled['Y'].values, pooled['scr'].values)
    res = {'folds': fold, 'TL_Auc': A_CI}
    return pd.DataFrame(res, index=[seed])
Exemplo n.º 2
0
def run_cv(seed,
           fold,
           X,
           Y,
           R,
           y_strat,
           val_size=0,
           pretrain_set=None,
           batch_size=32,
           k=-1,
           learning_rate=0.01,
           lr_decay=0.0,
           dropout=0.5,
           n_epochs=100,
           momentum=0.9,
           L1_reg=0.001,
           L2_reg=0.001,
           hiddenLayers=(128, 64)):
    """Run stratified k-fold cross-validation of an MLP and collect scores.

    For each fold an MLP is fitted on the training part (optionally after a
    pre-training step on ``pretrain_set``) and scored on the held-out part.
    The held-out features, scores, group labels and targets of all folds
    are returned together.

    Args:
        seed: Random state of the fold splitter.
        fold: Number of stratified folds.
        X, Y, R: Feature matrix, targets and group labels; all are indexed
            with the fold index arrays.
        y_strat: Labels used to stratify the folds (and the validation
            split, when ``val_size`` is set).
        val_size: If truthy, passed as ``test_size`` to carve a validation
            split out of each training fold.
        pretrain_set: Optional shared variable holding pre-training data.
            When given, the classifier is pre-trained and then tuned
            instead of trained directly.
        batch_size: Batch size for training / tuning.
        k: If positive, keep only the ``k`` best features, selected with
            ``f_classif`` on each training fold.
        learning_rate, lr_decay, dropout, momentum, L1_reg, L2_reg:
            Forwarded to ``MLP``.
        n_epochs: Number of training / tuning epochs.
        hiddenLayers: Hidden layer sizes; a fresh list is handed to ``MLP``.

    Returns:
        A ``DataFrame`` indexed by the original sample positions. Its
        columns are the feature positions ``0 .. m-1`` followed by
        ``'scr'``, ``'R'`` and ``'Y'``. The rows are built with
        ``np.column_stack``, so all values share one dtype (strings if
        ``R`` holds string labels).
    """
    has_pretrain = pretrain_set is not None
    # The raw pre-training matrix is only needed when it has to be
    # re-projected onto the features selected in each fold.
    X_w = pretrain_set.get_value(
        borrow=True) if k > 0 and has_pretrain else None

    # Use the same condition as the feature-selection branch below, so that
    # k == 0 (no selection) still labels every feature column.
    m = k if k > 0 else X.shape[1]
    columns = list(range(m)) + ['scr', 'R', 'Y']

    fold_frames = []
    kf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=seed)
    for train_index, test_index in kf.split(X, y_strat):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        R_test = R[test_index]
        strat_train = y_strat[train_index]

        if k > 0:
            k_best = SelectKBest(f_classif, k=k)
            k_best.fit(X_train, Y_train)
            X_train = k_best.transform(X_train)
            X_test = k_best.transform(X_test)

            if has_pretrain:
                # Re-project the original pre-training data with this
                # fold's selector and wrap it in a new shared variable.
                X_base = k_best.transform(X_w)
                pretrain_set = theano.shared(X_base,
                                             name='pretrain_set',
                                             borrow=True)

        valid_data = None
        if val_size:
            X_train, X_val, Y_train, Y_val = train_test_split(
                X_train,
                Y_train,
                test_size=val_size,
                random_state=0,
                stratify=strat_train)
            valid_data = (X_val, Y_val)
        train_data = (X_train, Y_train)

        classifier = MLP(n_in=X_train.shape[1],
                         learning_rate=learning_rate,
                         lr_decay=lr_decay,
                         dropout=dropout,
                         L1_reg=L1_reg,
                         L2_reg=L2_reg,
                         hidden_layers_sizes=list(hiddenLayers),
                         momentum=momentum)
        if has_pretrain:
            pretrain_config = {
                'pt_batchsize': 32,
                'pt_lr': 0.01,
                'pt_epochs': 500,
                'corruption_level': 0.3
            }
            classifier.pretrain(pretrain_set=pretrain_set,
                                pretrain_config=pretrain_config)
            classifier.tune(train_data,
                            valid_data=valid_data,
                            batch_size=batch_size,
                            n_epochs=n_epochs)
        else:
            classifier.train(train_data,
                             valid_data=valid_data,
                             batch_size=batch_size,
                             n_epochs=n_epochs)

        # Column 1 is presumably the positive-class score -- confirm
        # against MLP.get_score.
        X_scr = classifier.get_score(X_test)
        stacked = np.column_stack((X_test, X_scr[:, 1], R_test, Y_test))
        fold_frames.append(
            pd.DataFrame(stacked, index=list(test_index), columns=columns))

    if not fold_frames:
        return pd.DataFrame(columns=columns)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    return pd.concat(fold_frames)