Example #1
from os import path
import sys
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
from preprocessor import *

from sklearn.cross_validation import KFold

from seqlearn.perceptron import StructuredPerceptron
from seqlearn.evaluation import bio_f_score
import numpy as np

# p = Preprocessor(inputdir="../.././../../brat/data/10k-wikipedia/00")
p = Preprocessor()


kf = KFold(p.L.shape[0], n_folds=2)

for train_ids, test_ids in kf:

    L_train = p.L[train_ids]
    L_test = p.L[test_ids]
    X_train = np.zeros((0, p.X.shape[1]))
    y_train = np.zeros((0,))
    X_test = np.zeros((0, p.X.shape[1]))
    y_test = np.zeros((0,))

    for i, l in enumerate(L_train):
        start = sum(L_train[:i])
        end = sum(L_train[:i+1])
        X_train = np.vstack([X_train, p.X[start:end]])
        y_train = np.append(y_train, p.Y[start:end])
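    # At this point X_train / y_train hold the stacked feature rows and labels
    # of the training documents, located in p.X / p.Y via the per-document
    # lengths in L_train; the snippet is cut off before the matching loop that
    # would build X_test / y_test from L_test.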
Example #2
# -*- coding: utf-8 -*-
"""
Created on Tue Sep  6 10:10:40 2016
"""
import numpy as np
from sklearn.cross_validation import KFold

y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2])

print("kfold")
kf = KFold(9, n_folds=3)
print(len(kf))

for train, test in kf:
    print(train, test)

print()
for train, test in kf:
    print(y[train], y[test])

print()

from sklearn.cross_validation import StratifiedKFold
print("StratifiedKFold")
kf = StratifiedKFold(y, n_folds=3)
print(len(kf))
for train, test in kf:
    print(train, test)

print()
for train, test in kf:
    print(y[train], y[test])
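# A minimal sketch (not part of the original snippet) of the same demo with the
# modern sklearn.model_selection API, whose iterators split on demand:
from sklearn.model_selection import KFold as MSKFold, StratifiedKFold as MSStratifiedKFold

X_dummy = np.zeros((len(y), 1))  # .split() expects a feature array with one row per sample
for train, test in MSKFold(n_splits=3).split(X_dummy):
    print(y[train], y[test])
for train, test in MSStratifiedKFold(n_splits=3).split(X_dummy, y):
    print(y[train], y[test])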
Example #3
process_mask[:, 30:] = 0
process_mask_img = nibabel.Nifti1Image(process_mask, mask_img.get_affine())

### Searchlight computation ###################################################

# Make processing parallel
# /!\ As each thread will print its progress, n_jobs > 1 could mess up the
#     information output.
n_jobs = 1

### Define the cross-validation scheme used for validation.
# Here we use a KFold cross-validation on the session, which corresponds to
# splitting the samples in 4 folds and make 4 runs using each fold as a test
# set once and the others as learning sets
from sklearn.cross_validation import KFold
cv = KFold(y.size, n_folds=4)

import nilearn.decoding
# The radius is the one of the Searchlight sphere that will scan the volume
searchlight = nilearn.decoding.SearchLight(mask_img,
                                           process_mask_img=process_mask_img,
                                           radius=5.6,
                                           n_jobs=n_jobs,
                                           verbose=1,
                                           cv=cv)
searchlight.fit(fmri_img, y)

### F-scores computation ######################################################
from nilearn.input_data import NiftiMasker

# For decoding, standardizing is often very important
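# The example is cut off here; a plausible continuation (a sketch, assuming the
# same mask_img, fmri_img and y as above) standardizes the masked data and
# computes per-voxel ANOVA F-scores:
from sklearn.feature_selection import f_classif

nifti_masker = NiftiMasker(mask_img=mask_img, standardize=True)
fmri_masked = nifti_masker.fit_transform(fmri_img)
f_values, p_values = f_classif(fmri_masked, y)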
Example #4
def main():
    parser = argparse.ArgumentParser(
        description="Transform csv files into numpy array")
    parser.add_argument('-d',
                        '--data',
                        required=True,
                        help="The data directory")
    args = parser.parse_args()
    learning_rate = 0.0001
    L1_reg = 0.00
    L2_reg = 0.0001
    n_epochs = 1000
    batch_size = 32
    n_hidden = 1000
    ds = pickle.load(open(os.path.join(args.data, 'ds.npy'), 'rb'))
    trI = ds['trI']
    trX = ds['trX'].toarray()
    trY = ds['trY'].astype(np.int32)
    teI = ds['teI']
    teX = ds['teX'].toarray()
    allX = np.vstack((trX, teX))
    means, stds = calculate_mean_and_std(allX)
    normailize_by_zvalue(means, stds, allX)
    #normailize_by_minmax(allX);
    trX = allX[0:trX.shape[0], :]
    teX = allX[trX.shape[0]:trX.shape[0] + teX.shape[0], :]
    kf = KFold(trX.shape[0], n_folds=5)
    trainIds = None
    testIds = None
    for train, test in kf:
        trainIds = train
        testIds = test
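    # Note: this loop keeps only the indices of the final KFold split, so a
    # single train/validation partition (not true 5-fold CV) is used below.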
    cv_train_X = theano.shared(trX[trainIds, :], 'cv_train_X')
    cv_test_X = theano.shared(trX[testIds, :], 'cv_test_X')
    cv_train_Y = theano.shared(trY[trainIds], 'cv_train_Y')
    cv_test_Y = theano.shared(trY[testIds], 'cv_test_Y')
    ncvtr = len(trainIds)
    ncvte = len(testIds)
    nte = teX.shape[0]
    n_train_batches = int(np.ceil(len(trainIds) * 1.0 / batch_size))
    n_valid_batches = int(np.ceil(len(testIds) * 1.0 / batch_size))
    n_test_batches = int(np.ceil(teX.shape[0] * 1.0 / batch_size))
    teX = theano.shared(teX)
    rng = np.random.RandomState(1234)
    print('... building the model')
    left = T.lscalar()
    right = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    classifier = MLP(rng=rng, input=x, n_in=149, n_hidden=n_hidden, n_out=2)
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)
    validate_model = theano.function(inputs=[left, right],
                                     outputs=classifier.errors(y),
                                     givens={
                                         x: cv_test_X[left:right],
                                         y: cv_test_Y[left:right]
                                     })
    gparams = [T.grad(cost, param) for param in classifier.params]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]
    train_model = theano.function(inputs=[left, right],
                                  outputs=cost,
                                  updates=updates,
                                  givens={
                                      x: cv_train_X[left:right],
                                      y: cv_train_Y[left:right]
                                  })
    test_model = theano.function(inputs=[left, right],
                                 outputs=classifier.output,
                                 givens={
                                     x: teX[left:right],
                                 })
    print('... training')

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(
                minibatch_index * batch_size,
                min((minibatch_index + 1) * batch_size, ncvtr))
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [
                    validate_model(i * batch_size,
                                   min((i + 1) * batch_size, ncvte))
                    for i in range(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    probs = np.vstack([
        test_model(i * batch_size, min((i + 1) * batch_size, nte))
        for i in range(n_test_batches)
    ])
    fid = open('/local/db/uqdxingz/Santander/sub/mlp.csv', 'w')
    fid.write('ID,TARGET\n')
    for i in range(len(teI)):
        fid.write('%d,%.9f\n' % (int(teI[i]), probs[i][1]))
    fid.close()
Example #5
def make_mf_sliced_classification(subset_tr,
                                  subset_te,
                                  clf,
                                  n_round=3,
                                  target_col='median_relevance'):
    print '\n [make_mf_slice]'
    print clf
    mf_tr = np.zeros(len(subset_tr))
    mf_te = np.zeros(len(subset_te))

    #query-slice
    for cur_query in subset_tr.query_stem.value_counts().index:
        mask_tr = subset_tr.query_stem == cur_query
        mask_te = subset_te.query_stem == cur_query

        # build Bow
        vect = CountVectorizer(min_df=1, ngram_range=(1, 2))

        txts = (list((subset_tr[mask_tr]['title_ext']).values) + list(
            (subset_te[mask_te]['title_ext']).values))
        vect.fit(txts)

        X_loc_base = vect.transform(
            list((subset_tr[mask_tr]['title_ext']).values)).todense()
        X_loc_hold = vect.transform(
            list((subset_te[mask_te]['title_ext']).values)).todense()
        y_loc_train = subset_tr[mask_tr][target_col].values
        # intersect terms
        feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(
            np.sum(X_loc_hold, axis=0))[0]
        feat_mask = np.where(feat_counts > 0)[0]
        # build final feats matrix
        X_loc_base = np.hstack(
            (X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list]))
        X_loc_hold = np.hstack(
            (X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list]))

        # metafeatures iterators
        tmp_tr = np.zeros(sum(mask_tr))
        tmp_te = np.zeros(sum(mask_te))

        #print y_loc_train.shape, X_loc_base.shape

        for i in range(n_round):
            kf = KFold(len(y_loc_train),
                       n_folds=2,
                       shuffle=True,
                       random_state=42 + i * 1000)
            for ind_tr, ind_te in kf:
                X_tr = X_loc_base[ind_tr]
                X_te = X_loc_base[ind_te]
                y_tr = y_loc_train[ind_tr]
                y_te = y_loc_train[ind_te]

                clf.fit(X_tr, y_tr)
                tmp_tr[ind_te] += clf.predict(X_te)
                tmp_te += clf.predict(X_loc_hold) * 0.5
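                # With n_folds=2, each fold adds half of this round's holdout
                # prediction, so tmp_te accumulates one full prediction per
                # round before the division by n_round below.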
        mf_tr[mask_tr.values] = tmp_tr / n_round
        mf_te[mask_te.values] = tmp_te / n_round

    y_valid = subset_tr[target_col].values
    kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr))
    acc = np.mean(y_valid == np.round(mf_tr))
    print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc)
    return (mf_tr, mf_te)
Example #6
def run_cross_validation_create_models(nfolds=10):
    # input image dimensions
    batch_size = 16
    nb_epoch = 25
    random_state = 51
    restore_from_last_checkpoint = 1

    train_data, train_target, train_id, driver_id, unique_drivers = read_and_normalize_train_data(
    )

    yfull_train = dict()
    kf = KFold(len(unique_drivers),
               n_folds=nfolds,
               shuffle=True,
               random_state=random_state)
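    # Folds are drawn over unique drivers, so images from a given driver never
    # appear in both the training and validation split of the same fold.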
    num_fold = 0
    sum_score = 0
    for train_drivers, test_drivers in kf:
        model = VGG_16()
        unique_list_train = [unique_drivers[i] for i in train_drivers]
        X_train, Y_train, train_index = copy_selected_drivers(
            train_data, train_target, driver_id, unique_list_train)
        unique_list_valid = [unique_drivers[i] for i in test_drivers]
        X_valid, Y_valid, test_index = copy_selected_drivers(
            train_data, train_target, driver_id, unique_list_valid)

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        print('Train drivers: ', unique_list_train)
        print('Test drivers: ', unique_list_valid)

        kfold_weights_path = os.path.join(
            'cache', 'weights_kfold_vgg16_' + str(num_fold) + '.h5')
        if not os.path.isfile(
                kfold_weights_path) or restore_from_last_checkpoint == 0:
            callbacks = [
                EarlyStoppingByLossVal(monitor='val_loss',
                                       value=0.00001,
                                       verbose=1),
                EarlyStopping(monitor='val_loss', patience=5, verbose=0),
                ModelCheckpoint(kfold_weights_path,
                                monitor='val_loss',
                                save_best_only=True,
                                verbose=0),
            ]
            model.fit(X_train,
                      Y_train,
                      batch_size=batch_size,
                      nb_epoch=nb_epoch,
                      shuffle=True,
                      verbose=1,
                      validation_data=(X_valid, Y_valid),
                      callbacks=callbacks)
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)

        # score = model.evaluate(X_valid, Y_valid, show_accuracy=True, verbose=0)
        # print('Score log_loss: ', score[0])

        predictions_valid = model.predict(X_valid.astype('float32'),
                                          batch_size=batch_size,
                                          verbose=1)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        sum_score += score * len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

    score = sum_score / len(train_data)
    print("Log_loss train independent avg: ", score)

    predictions_valid = get_validation_predictions(train_data, yfull_train)

    print('Final log_loss: {}, nfolds: {} epoch: {}'.format(
        score, nfolds, nb_epoch))
    info_string = 'loss_' + str(score) \
                  + '_folds_' + str(nfolds) \
                  + '_ep_' + str(nb_epoch)

    save_useful_data(predictions_valid, train_id, model, info_string)

    score1 = log_loss(train_target, predictions_valid)
    if abs(score1 - score) > 0.0001:
        print('Check error: {} != {}'.format(score, score1))
Example #7
from sklearn.externals import joblib
import time
import numpy
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB

filename = '/Users/jzhy/Downloads/train.csv'
data = pd.read_csv(filename)

X = numpy.zeros((len(data.x), 4))
X[:, 0] = data.x
X[:, 1] = data.y
X[:, 2] = data.accuracy
X[:, 3] = data.time
Y = numpy.zeros((len(data.x), 1))
Y = data.place_id

XX = preprocessing.scale(X)
YY = numpy.unique(Y)

kf = KFold(len(X), n_folds=len(Y) / 10000 + 1)
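# KFold is used here only to cut the data into chunks of roughly 10000 samples;
# each fold's test indices are fed incrementally to partial_fit below.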

clf = MultinomialNB()

i = 0
for train, test in kf:
    clf.partial_fit(X[test, :], Y[test], classes=YY)
    i = i + 1
    print(i)
joblib.dump(clf, 'MultinomialNB.pkl')

exit()
Example #8
        y = np.array(y)
        y_tr = y[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
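# The function above is truncated; a self-contained sketch of the out-of-fold
# helper it appears to belong to (the name and signature are assumptions, and
# clf is expected to expose train/predict as in the snippet):
def get_oof_sketch(clf, x_train, y_train, x_test, kf, nfolds):
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.zeros((nfolds, x_test.shape[0]))
    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y = np.array(y_train)
        y_tr = y[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)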


full_df = pd.concat([train_df, test_df])
sub_item_id = test_df["item_id"]
len_train = len(train_df)
len_test = len(test_df)

kf = KFold(len_train, n_folds=NFOLDS, shuffle=True, random_state=SEED)

del train_df, test_df
gc.collect()

feature_engineering(full_df)
full_df, ready_full_df, tfvocab = data_vectorize(full_df)

#'alpha':20.0
ridge_params = {
    'alpha': 20.0,
    'fit_intercept': True,
    'normalize': False,
    'copy_X': True,
    'max_iter': None,
    'tol': 0.001,
Example #9
    def __init__(self):

        # Read train data
        train_data = []
        train_id = []
        train_target = []
        start_time = time.time()

        print('Read train images')
        folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
        for fld in folders:
            index = folders.index(fld)
            print('Load folder {} (Index: {})'.format(fld, index))
            path = os.path.join('/Users/Kevin/Desktop/fish_model/train', fld,
                                '*.jpg')
            files = glob.glob(path)
            for fl in files:
                flbase = os.path.basename(fl)
                img = get_im_cv2(fl)
                train_data.append(img)
                train_id.append(flbase)
                train_target.append(index)
            print('Read train data time: {} seconds'.format(
                round(time.time() - start_time, 2)))

        # Normalize train data
        print('Convert to numpy...')
        train_data = np.array(train_data, dtype=np.uint8)
        train_target = np.array(train_target, dtype=np.uint8)

        print('Reshape...')
        train_data = train_data.transpose((0, 3, 1, 2))

        print('Convert to float...')
        train_data = train_data.astype('float32')
        train_data = train_data / 255
        train_target = np_utils.to_categorical(train_target, 8)

        print('Train shape:', train_data.shape)
        print(train_data.shape[0], 'train samples')

        # CNN Model Building
        model = Sequential()
        model.add(
            ZeroPadding2D((1, 1), input_shape=(3, 64, 64), dim_ordering='th'))
        model.add(Convolution2D(4, 3, 3, activation='relu', dim_ordering='th'))
        model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
        model.add(Convolution2D(4, 3, 3, activation='relu', dim_ordering='th'))
        model.add(
            MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
        model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
        model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th'))
        model.add(ZeroPadding2D((1, 1), dim_ordering='th'))
        model.add(Convolution2D(8, 3, 3, activation='relu', dim_ordering='th'))
        model.add(
            MaxPooling2D(pool_size=(2, 2), strides=(2, 2), dim_ordering='th'))
        model.add(Flatten())
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(8, activation='softmax'))
        sgd = SGD(lr=1e-2, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(optimizer=sgd, loss='categorical_crossentropy')

        # CNN Training
        nfolds = 20
        batch_size = 16
        nb_epoch = 30
        random_state = 51
        yfull_train = dict()
        kf = KFold(len(train_id),
                   n_folds=nfolds,
                   shuffle=True,
                   random_state=random_state)
        num_fold = 0
        sum_score = 0
        models = []
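        # Note: a single compiled model is built above and trained further on
        # every fold; `models` therefore stores references to that same object.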
        for train_index, test_index in kf:
            X_train = train_data[train_index]
            Y_train = train_target[train_index]
            X_valid = train_data[test_index]
            Y_valid = train_target[test_index]
            num_fold += 1
            print('Start KFold number {} from {}'.format(num_fold, nfolds))
            print('Split train: ', len(X_train), len(Y_train))
            print('Split valid: ', len(X_valid), len(Y_valid))
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=3, verbose=0),
            ]
            model.fit(X_train,
                      Y_train,
                      batch_size=batch_size,
                      nb_epoch=nb_epoch,
                      shuffle=True,
                      verbose=2,
                      validation_data=(X_valid, Y_valid),
                      callbacks=callbacks)
            predictions_valid = model.predict(X_valid.astype('float32'),
                                              batch_size=batch_size,
                                              verbose=2)
            score = log_loss(Y_valid, predictions_valid)
            print('Score log_loss: ', score)
            sum_score += score * len(test_index)
            # Store valid predictions
            for i in range(len(test_index)):
                yfull_train[test_index[i]] = predictions_valid[i]
            models.append(model)

        score = sum_score / len(train_data)
        print("Log_loss train independent avg: ", score)
        info_string = 'loss_' + str(score) + '_folds_' + str(
            nfolds) + '_ep_' + str(nb_epoch)

        # Read test data
        path = os.path.join('/Users/Kevin/Desktop/fish_model/test', '*.jpg')
        files = sorted(glob.glob(path))

        test_data = []
        test_id = []
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            test_data.append(img)
            test_id.append(flbase)

        # Normalize test data
        start_time = time.time()
        test_data, test_id = load_test()

        test_data = np.array(test_data, dtype=np.uint8)
        test_data = test_data.transpose((0, 3, 1, 2))

        test_data = test_data.astype('float32')
        test_data = test_data / 255

        print('Test shape:', test_data.shape)
        print(test_data.shape[0], 'test samples')
        print('Read and process test data time: {} seconds'.format(
            round(time.time() - start_time, 2)))

        # CNN Prediction
        batch_size = 16
        num_fold = 0
        yfull_test = []
        test_id = []
        nfolds = len(models)
        for i in range(nfolds):
            model = models[i]
            num_fold += 1
            print('Start KFold number {} from {}'.format(num_fold, nfolds))
            test_prediction = model.predict(test_data,
                                            batch_size=batch_size,
                                            verbose=2)
            yfull_test.append(test_prediction)
        test_res = merge_several_folds_mean(yfull_test, nfolds)
        info_string = 'loss_' + info_string \
                    + '_folds_' + str(nfolds)
        create_submission(test_res, test_id, info_string)
Example #10
        elif pid==data[1]:
            no1=no1+1
            a1.append(int(data[4]))
            a2.append(int(data[5]))
            index.append(no1+no2)
        else:
            min_err=100000000000000000
            pid=data[1]
            a1.append(int(data[4]))
            a2.append(int(data[5]))
            index.append(no1+no2)
            no1=no1+1
            x, y = genData(no1-1,no2,N,a1,a2,stDev)
            m, n = np.shape(x)
            theta = np.ones(n)
            kf = KFold(len(x), n_folds=4)


            for train, test in kf:
                x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
                #print("%s %s" % (train, test))
                #print("%s %s" % (x_train,y_test))
                stDev=np.std(y_train)
                theta = gradientDescent(x_train, y_train,stDev, theta, alpha, m, numIterations)
                #print(theta)
                error=0
                err=0
                StDev=np.std(y_test)
                for i in range(0, len(x_test)):
                    if(a1[test[i]]>a2[test[i]]+theta[0]+theta[1]*StDev):
                        error=error+1
Example #11
features.extend(['manager_id_size', 'house_type_size'])

# In[13]:

features = list(set(features))

processMap(train_df)
processMap(test_df)
train_df = train_df.fillna(-1)
test_df = test_df.fillna(-1)
getCluster(train_df, test_df, 30)
getCluster(train_df, test_df, 10)

# K-fold evaluation for the statistical features
skf = KFold(len(train_df['interest_level']), 5, shuffle=True, random_state=42)
#dev set adding manager skill
for train, test in skf:
    performance_eval(train_df.iloc[train, :],
                     train_df.iloc[test, :],
                     feature='manager_id',
                     update_df=train_df,
                     smoothing=False)
    temporalManagerPerf_f(train_df.iloc[train, :],
                          train_df.iloc[test, :],
                          update_df=train_df)

performance_eval(train_df, test_df, feature='manager_id', smoothing=False)
temporalManagerPerf_f(train_df, test_df)

# statistics
Example #12
import numpy as np
import pandas as pd
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier

path = 'G:\BigDataEvent\血糖预测\\'  # data directory for the blood-glucose prediction task
# test = pd.read_csv(path + 'f_test_a_20180204.csv',encoding='gb2312')
# train = pd.read_csv(path + 'f_train_20180204.csv', encoding='gb2312')

#train_data, test_data = process_eigen.process_eigen(train, test)
train_data = pd.read_csv(path + 'process_train_fb.csv', encoding='gb2312')
test_data = pd.read_csv(path + 'process_test_fb.csv', encoding='gb2312')
print(train_data.info())
print('Start training...')
predictors = [f for f in test_data.columns if f not in ['label', 'id']]

train_preds = np.zeros(train_data.shape[0])
test_preds = np.zeros((test_data.shape[0], 5))
kf = KFold(len(train_data), n_folds=5, shuffle=True, random_state=50)
for i, (train_index, test_index) in enumerate(kf):
    print('Training round {}...'.format(i))
    train_feat = train_data.iloc[train_index]
    valid_feat = train_data.iloc[test_index]
    estimator = RandomForestClassifier(n_estimators=500,
                                       min_samples_split=70,
                                       max_depth=12,
                                       min_samples_leaf=10,
                                       max_features=16,
                                       random_state=10,
                                       oob_score=True)
    estimator.fit(train_feat[predictors], train_feat['label'])
    train_preds[test_index] += estimator.predict(valid_feat[predictors])
    print(estimator.oob_score_)
    test_preds[:, i] = estimator.predict(test_data[predictors])
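
# The example stops here; presumably the five per-fold columns of test_preds
# are combined into a single prediction, e.g. (a sketch, not in the original):
final_test_pred = test_preds.mean(axis=1)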
Example #13
    def get_ten_fold_crossvalid_perfermance(self, settings=None):
        fisher_mode = settings['fisher_mode']
        analysis_scr = []
        with_auc_score = settings['with_auc_score']
        reduce_ratio = settings['reduce_ratio']
        #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
        #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
        kf = KFold(self.ddi_obj.total_number_of_sequences,
                   n_folds=10,
                   shuffle=True)
        #for subset_no in range(1, 11):
        for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
            #for train_index, test_index in kf;
            print("Subset:", subset_no)
            print("Train index: ", train_index)
            print("Test index: ", test_index)
            #logger.info('subset number: ' + str(subset_no))
            (train_X_10fold,
             train_y_10fold), (train_X_reduced, train_y_reduced), (
                 test_X,
                 test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(
                     train_index,
                     test_index,
                     fisher_mode=fisher_mode,
                     reduce_ratio=reduce_ratio)
            standard_scaler = preprocessing.StandardScaler().fit(
                train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
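            # Every model configured below (the SVM variants, the SAE+SVM
            # combinations and the deep-learning settings) is trained on this
            # standardized or min-max-scaled data, and both its test-split and
            # train-split scores are appended to analysis_scr.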

            if settings['SVM']:
                print "SVM"
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            if settings['SVM_RBF']:
                print "SVM_RBF"
                L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(
                    scaled_train_X, train_y_reduced)

                predicted_test_y = L1_SVC_RBF_Selector.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM_RBF', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = L1_SVC_RBF_Selector.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM_RBF', isTest) +
                    tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            if settings['SVM_POLY']:
                print "SVM_POLY"
                L1_SVC_POLY_Selector = SVC(C=1, kernel='poly').fit(
                    scaled_train_X, train_y_reduced)

                predicted_test_y = L1_SVC_POLY_Selector.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM_POLY', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = L1_SVC_POLY_Selector.predict(
                    scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM_POLY', isTest) +
                    tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            min_max_scaler = Preprocessing_Scaler_with_mean_point5()
            X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
            X_train_pre_validation_minmax = min_max_scaler.transform(
                train_X_reduced)
            x_test_minmax = min_max_scaler.transform(test_X)

            x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
                X_train_pre_validation_minmax,
                train_y_reduced,
                test_size=0.4,
                random_state=42)
            finetune_lr = settings['finetune_lr']
            batch_size = settings['batch_size']
            pretraining_epochs = cal_epochs(
                settings['pretraining_interations'],
                x_train_minmax,
                batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = settings['pretrain_lr']
            training_epochs = cal_epochs(settings['training_interations'],
                                         x_train_minmax,
                                         batch_size=batch_size)
            hidden_layers_sizes = settings['hidden_layers_sizes']
            corruption_levels = settings['corruption_levels']
            settings['epoch_number'] = cal_epochs(
                settings['pretraining_interations'],
                x_train_minmax,
                batch_size=batch_size)
            # deep xy autoencoders
            settings['lrate'] = settings['lrate_pre'] + str(training_epochs)
            settings['n_ins'] = x_train_minmax.shape[1]
            if settings['DL_xy']:
                cfg = settings.copy()
                cfg['weight_y'] = 0.1
                print 'DL_xy'
                train_x = x_train_minmax
                train_y = y_train_minmax
                sdaf = Sda_xy_factory(cfg)
                sdaf.sda.pretraining(train_x, train_y)
                dnnf = DNN_factory(cfg)
                dnnf.dnn.load_pretrain_from_Sda(sdaf.sda)
                dnnf.dnn.finetuning((x_train_minmax, y_train_minmax),
                                    (x_validation_minmax, y_validation_minmax))

                training_predicted = dnnf.dnn.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'DL_xy', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = dnnf.dnn.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_xy', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))
            if settings['Sda_xy_with_first']:
                cfg = settings.copy()
                cfg['weight_y'] = 0.1
                cfg['firstlayer_xy'] = 1
                print 'firstlayer_xy'
                train_x = x_train_minmax
                train_y = y_train_minmax
                sdaf = Sda_xy_factory(cfg)
                sdaf.sda.pretraining(train_x, train_y)
                dnnf = DNN_factory(cfg)
                dnnf.dnn.load_pretrain_from_Sda(sdaf.sda)
                dnnf.dnn.finetuning((x_train_minmax, y_train_minmax),
                                    (x_validation_minmax, y_validation_minmax))

                training_predicted = dnnf.dnn.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'Sda_xy_with_first',
                    isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = dnnf.dnn.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'Sda_xy_with_first',
                     isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))
            if settings['Sda_new']:
                print 'Sda_new'
                cfg = settings.copy()
                train_x = x_train_minmax
                train_y = y_train_minmax
                cfg['n_ins'] = train_x.shape[1]
                sdaf = Sda_factory(cfg)
                sda = sdaf.sda.pretraining(train_x=train_x)
                sdaf.dnn.finetuning((x_train_minmax, y_train_minmax),
                                    (x_validation_minmax, y_validation_minmax))
                training_predicted = sdaf.dnn.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'Sda_new', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sdaf.dnn.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'Sda_new', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))

            #### new prepresentation
            x = X_train_pre_validation_minmax
            a_MAE_A = pretrain_a_Sda_with_estop(
                x,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=hidden_layers_sizes,
                corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(
                X_train_pre_validation_minmax)
            new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax)
            standard_scaler = preprocessing.StandardScaler().fit(
                new_x_train_minmax_A)
            new_x_train_scaled = standard_scaler.transform(
                new_x_train_minmax_A)
            new_x_test_scaled = standard_scaler.transform(new_x_test_minmax_A)
            new_x_train_combo = np.hstack((scaled_train_X, new_x_train_scaled))
            new_x_test_combo = np.hstack((scaled_test_X, new_x_test_scaled))

            if settings['SAE_SVM']:
                print 'SAE followed by SVM'

                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(new_x_train_scaled, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(new_x_test_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new
                predicted_train_y = Linear_SVC.predict(new_x_train_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM', isTest) +
                    tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            if settings['SAE_SVM_RBF']:
                print 'SAE followed by SVM RBF'
                x = X_train_pre_validation_minmax
                L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(
                    new_x_train_scaled, train_y_reduced)
                predicted_test_y = L1_SVC_RBF_Selector.predict(
                    new_x_test_scaled)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM_RBF', isTest) +
                    tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new
                predicted_train_y = L1_SVC_RBF_Selector.predict(
                    new_x_train_scaled)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM_RBF', isTest) +
                    tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            if settings['SAE_SVM_COMBO']:
                print 'SAE followed by SVM with combo feature'
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(new_x_train_combo, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(new_x_test_combo)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM_COMBO', isTest)
                    + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new
                predicted_train_y = Linear_SVC.predict(new_x_train_combo)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM_COMBO',
                     isTest) + tuple(
                         performance_score(train_y_reduced,
                                           predicted_train_y).values()))
            if settings['SAE_SVM_RBF_COMBO']:
                print 'SAE followed by SVM RBF with combo feature'
                L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(
                    new_x_train_combo, train_y_reduced)
                predicted_test_y = L1_SVC_RBF_Selector.predict(
                    new_x_test_combo)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM_RBF_COMBO',
                     isTest) + tuple(
                         performance_score(test_y,
                                           predicted_test_y).values()))  #new
                predicted_train_y = L1_SVC_RBF_Selector.predict(
                    new_x_train_combo)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SAE_SVM_RBF_COMBO',
                     isTest) + tuple(
                         performance_score(train_y_reduced,
                                           predicted_train_y).values()))

            if settings['DL']:
                print "direct deep learning"
                sda = train_a_Sda(x_train_minmax, pretrain_lr, finetune_lr,
                                  y_train_minmax,
                                  x_validation_minmax, y_validation_minmax,
                                  x_test_minmax, test_y,
                                  hidden_layers_sizes=hidden_layers_sizes,
                                  corruption_levels=corruption_levels,
                                  batch_size=batch_size,
                                  training_epochs=training_epochs,
                                  pretraining_epochs=pretraining_epochs,
                                  n_outs=settings['n_outs'])
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'DL', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))
            if settings['DL_old']:
                print "direct deep learning old without early stop"
                sda = trainSda(x_train_minmax, y_train,
                               x_validation_minmax, y_validation_minmax,
                               x_test_minmax, y_test, pretrain_lr, finetune_lr,
                               pretraining_X_minmax=None,
                               hidden_layers_sizes=hidden_layers_sizes,
                               corruption_levels=corruption_levels,
                               batch_size=batch_size,
                               training_epochs=training_epochs,
                               pretraining_epochs=pretraining_epochs,
                               n_outs=settings['n_outs'])

                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'DL_old', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_old', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))
            if settings['DL_U']:
                # deep learning using unlabeled data for pretraining
                print 'deep learning with unlabel data'
                pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
                pretraining_epochs = cal_epochs(
                    settings['pretraining_interations'],
                    x_train_minmax,
                    batch_size=batch_size)
                sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                       x_validation_minmax, y_validation_minmax,
                                       x_test_minmax, test_y,
                                       pretraining_X_minmax=pretraining_X_minmax,
                                       hidden_layers_sizes=hidden_layers_sizes,
                                       corruption_levels=corruption_levels,
                                       batch_size=batch_size,
                                       training_epochs=training_epochs,
                                       pretraining_epochs=pretraining_epochs,
                                       pretrain_lr=pretrain_lr,
                                       finetune_lr=finetune_lr,
                                       n_outs=settings['n_outs'])
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_unlabel.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          with_auc_score).values()))

                test_predicted = sda_unlabel.predict(x_test_minmax)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          with_auc_score).values()))
            if settings['DL_S']:
                # deep learning using split network
                y_test = test_y
                print 'deep learning using split network'
                # get the new representation for A set. first 784-D
                pretraining_epochs = cal_epochs(
                    settings['pretraining_interations'],
                    x_train_minmax,
                    batch_size=batch_size)

                x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
                print "original shape for A", x.shape
                a_MAE_A = pretrain_a_Sda_with_estop(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_A = a_MAE_A.transform(
                    x_train_minmax[:, :x_train_minmax.shape[1] / 2])
                x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]

                print "original shape for B", x.shape
                a_MAE_B = pretrain_a_Sda_with_estop(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_B = a_MAE_B.transform(
                    x_train_minmax[:, x_train_minmax.shape[1] / 2:])

                new_x_test_minmax_A = a_MAE_A.transform(
                    x_test_minmax[:, :x_test_minmax.shape[1] / 2])
                new_x_test_minmax_B = a_MAE_B.transform(
                    x_test_minmax[:, x_test_minmax.shape[1] / 2:])
                new_x_validation_minmax_A = a_MAE_A.transform(
                    x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
                new_x_validation_minmax_B = a_MAE_B.transform(
                    x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
                new_x_train_minmax_whole = np.hstack(
                    (new_x_train_minmax_A, new_x_train_minmax_B))
                new_x_test_minmax_whole = np.hstack(
                    (new_x_test_minmax_A, new_x_test_minmax_B))
                new_x_validationt_minmax_whole = np.hstack(
                    (new_x_validation_minmax_A, new_x_validation_minmax_B))


                sda_transformed = train_a_Sda(new_x_train_minmax_whole, pretrain_lr, finetune_lr,
                                              y_train_minmax,
                                              new_x_validationt_minmax_whole, y_validation_minmax,
                                              new_x_test_minmax_whole, y_test,
                                              hidden_layers_sizes=hidden_layers_sizes,
                                              corruption_levels=corruption_levels,
                                              batch_size=batch_size,
                                              training_epochs=training_epochs,
                                              pretraining_epochs=pretraining_epochs,
                                              n_outs=settings['n_outs'])

                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_transformed.predict(
                    new_x_train_minmax_whole)
                y_train = y_train_minmax

                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          with_auc_score).values()))

                test_predicted = sda_transformed.predict(
                    new_x_test_minmax_whole)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          with_auc_score).values()))
            if settings['DL_S_new']:
                # deep learning using split network
                print 'new deep learning using split network'

                cfg = settings.copy()
                p_sda = Parellel_Sda_factory(cfg)
                p_sda.supervised_training(x_train_minmax, x_validation_minmax,
                                          y_train_minmax, y_validation_minmax)

                isTest = False  #new
                training_predicted = p_sda.predict(x_train_minmax)
                y_train = y_train_minmax
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new', isTest) +
                    tuple(
                        performance_score(y_train, training_predicted,
                                          with_auc_score).values()))

                isTest = True  #new
                y_test = test_y
                test_predicted = p_sda.predict(x_test_minmax)
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new', isTest) +
                    tuple(
                        performance_score(y_test, test_predicted,
                                          with_auc_score).values()))
            if settings['DL_S_new_contraction']:
                print 'DL_S_new_contraction'
                cfg = settings.copy()
                cfg['contraction_level'] = 0.1
                p_sda = Parellel_Sda_factory(cfg)
                p_sda.supervised_training(x_train_minmax, x_validation_minmax,
                                          y_train_minmax, y_validation_minmax)

                isTest = False  #new
                training_predicted = p_sda.predict(x_train_minmax)
                y_train = y_train_minmax
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new_contraction',
                     isTest) + tuple(
                         performance_score(y_train, training_predicted,
                                           with_auc_score).values()))

                isTest = True  #new
                y_test = test_y
                test_predicted = p_sda.predict(x_test_minmax)
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new_contraction',
                     isTest) + tuple(
                         performance_score(y_test, test_predicted,
                                           with_auc_score).values()))

            if settings['DL_S_new_sparsity'] == 1:
                print 'DL_S_new_sparsity'
                cfg = settings.copy()
                cfg['sparsity'] = 0.1
                cfg['sparsity_weight'] = 0.1
                p_sda = Parellel_Sda_factory(cfg)
                p_sda.supervised_training(x_train_minmax, x_validation_minmax,
                                          y_train_minmax, y_validation_minmax)

                isTest = False  #new
                training_predicted = p_sda.predict(x_train_minmax)
                y_train = y_train_minmax
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new_sparsity',
                     isTest) + tuple(
                         performance_score(y_train, training_predicted,
                                           with_auc_score).values()))

                isTest = True  #new
                y_test = test_y
                test_predicted = p_sda.predict(x_test_minmax)
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new_sparsity',
                     isTest) + tuple(
                         performance_score(y_test, test_predicted,
                                           with_auc_score).values()))

            if settings['DL_S_new_weight_decay'] == 2:
                cfg = settings.copy()
                cfg['l2_reg'] = 0.1
                print 'l2_reg'
                p_sda = Parellel_Sda_factory(cfg)
                p_sda.supervised_training(x_train_minmax, x_validation_minmax,
                                          y_train_minmax, y_validation_minmax)

                isTest = False  #new
                training_predicted = p_sda.predict(x_train_minmax)
                y_train = y_train_minmax
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'l2_reg', isTest) +
                    tuple(
                        performance_score(y_train, training_predicted,
                                          with_auc_score).values()))

                isTest = True  #new
                y_test = test_y
                test_predicted = p_sda.predict(x_test_minmax)
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'l2_reg', isTest) +
                    tuple(
                        performance_score(y_test, test_predicted,
                                          with_auc_score).values()))

            if settings['DL_S_new_weight_decay'] == 1:
                print 'l1_reg'
                cfg = settings.copy()
                cfg['l1_reg'] = 0.1
                p_sda = Parellel_Sda_factory(cfg)
                p_sda.supervised_training(x_train_minmax, x_validation_minmax,
                                          y_train_minmax, y_validation_minmax)

                isTest = False  #new
                training_predicted = p_sda.predict(x_train_minmax)
                y_train = y_train_minmax
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'l1_reg', isTest) +
                    tuple(
                        performance_score(y_train, training_predicted,
                                          with_auc_score).values()))

                isTest = True  #new
                y_test = test_y
                test_predicted = p_sda.predict(x_test_minmax)
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'l1_reg', isTest) +
                    tuple(
                        performance_score(y_test, test_predicted,
                                          with_auc_score).values()))

            if settings['DL_S_new_Drop_out'] == 1:

                cfg = settings.copy()
                cfg['dropout_factor'] = 0.3
                print 'DL_S_new_Drop_out'
                p_sda = Parellel_Sda_factory(cfg)
                p_sda.supervised_training(x_train_minmax, x_validation_minmax,
                                          y_train_minmax, y_validation_minmax)

                isTest = False  #new
                training_predicted = p_sda.predict(x_train_minmax)
                y_train = y_train_minmax
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new_Drop_out',
                     isTest) + tuple(
                         performance_score(y_train, training_predicted,
                                           with_auc_score).values()))

                isTest = True  #new
                y_test = test_y
                test_predicted = p_sda.predict(x_test_minmax)
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S_new_Drop_out',
                     isTest) + tuple(
                         performance_score(y_test, test_predicted,
                                           with_auc_score).values()))

        report_name = filename + '_' + '_newDL_'.join(
            map(str, hidden_layers_sizes)) + '_' + str(
                pretrain_lr) + '_' + str(finetune_lr) + '_' + str(
                    settings['training_interations']) + '_' + current_date
        saveAsCsv(with_auc_score, report_name,
                  performance_score(test_y, predicted_test_y, with_auc_score),
                  analysis_scr)
示例#14
0
    GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
    [
        "Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",
        "FamilyId"
    ]
],
              [
                  LogisticRegression(random_state=1),
                  [
                      "Pclass", "Sex", "Fare", "FamilySize", "Title", "Age",
                      "Embarked"
                  ]
              ]]

# Initialize the cross-validation folds
kf = KFold(train.shape[0], n_folds=3, random_state=1)

predictions = []
for train_tmp, test_tmp in kf:
    train_target = train["Survived"].iloc[train_tmp]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(train[predictors].iloc[train_tmp, :], train_target)
        # Select and predict on the test fold
        # We need to use .astype(float) to convert the dataframe to all floats and avoid an sklearn error
        test_predictions = alg.predict_proba(
            train[predictors].iloc[test_tmp, :].astype(float))[:, 1]
示例#15
0
9. others
from shutil import copyfile
	copyfile(src,file)
# make and write file
script=open(os.path.join(output_file,'Dodge_'+str(idx)+'.txt'),'a')
script.write('1'+'\n')
script.close()
	
10. glob
path = os.path.join('..','data','train',fld,'*jpg')
files = glob.glob(path)

11. sklearn (a complete runnable sketch follows after note 13)
#K-Folds cross validation iterator
from sklearn.cross_validation import KFold
	kf = KFold(len(X_train), n_folds=n_fold, shuffle=True, random_state=random_state)
	for train_idx, cv_idx in kf: 

12. keras
callbacks = [EarlyStopping(monitor='val_loss', patience=3, verbose=0)]
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
      shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
      callbacks=callbacks)


model.add(Convolution2D(12,4,4, border_mode='same',trainable=False))

13. pandas
result1 = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])
result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
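
A complete sketch for note 11 above (its loop body is cut off); the toy arrays and the n_fold/random_state values are illustrative assumptions, not part of the original note:

import numpy as np
from sklearn.cross_validation import KFold

X_train = np.arange(20).reshape(10, 2)   # toy feature matrix
y_train = np.array([0, 1] * 5)           # toy labels
n_fold, random_state = 5, 42

kf = KFold(len(X_train), n_folds=n_fold, shuffle=True, random_state=random_state)
for train_idx, cv_idx in kf:
    X_tr, X_cv = X_train[train_idx], X_train[cv_idx]
    y_tr, y_cv = y_train[train_idx], y_train[cv_idx]
    # fit a model on (X_tr, y_tr) and validate it on (X_cv, y_cv) here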
示例#16
0
                                   max_depth=3),
        ["Pclass", "Sex", "Age", "Fare", "Embarked", "Title"]
    ],
    [
        LogisticRegression(random_state=1),
        ["Pclass", "Sex", "Fare", "Age", "Embarked", "Title"]
    ],
    [
        neighbors.KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5),
        ["Pclass", "Sex", "Fare", "Age", "Embarked", "Title"]
    ],
    [clf1, ["Pclass", "Sex", "Fare", "Age", "Embarked", "Title"]],
]

# Initialize the cross-validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # For each cross-validation fold, make predictions with each of the algorithms
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select the test fold and predict on it
        # .astype(float) converts the dataframe to all floats and avoids an sklearn error
        test_predictions = alg.predict_proba(
            titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Average the predictions from the different algorithms
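    # Sketch of the truncated continuation (assumes numpy is imported as np;
    # the unweighted mean and the 0.5 cutoff are illustrative choices, not
    # taken from the original): average the per-algorithm probabilities,
    # threshold them, and keep this fold's result.
    test_predictions = np.mean(full_test_predictions, axis=0)
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)

# The per-fold predictions are then usually stitched back together, e.g.
# predictions = np.concatenate(predictions, axis=0)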
示例#17
0
feats = df_train.drop("revenue", axis=1)
X = feats.values  #features
y = df_train["revenue"].values  #target

# Drop extreme-revenue outliers. numpy arrays have no pop(), and removing
# elements while iterating shifts the indices, so filter with a boolean mask.
outlier_mask = y > 10000000
X = X[~outlier_mask]
y = y[~outlier_mask]

### Linear Regression ###

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

kf = KFold(len(y), n_folds=15, shuffle=True)

y_pred = np.zeros(len(y), dtype=y.dtype)  # where we'll accumulate predictions
lr = LinearRegression()

# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    lr.fit(X_train, y_train)  # Train on the training data

    X_test = t.transform(X_test)
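    # Sketch of the truncated rest of the loop (an illustrative completion,
    # the original snippet is cut off here): predict on the held-out fold and
    # store the result in the out-of-fold array.
    y_pred[test_index] = lr.predict(X_test)

# Once every fold has been filled in, a typical summary would be:
# rmse = np.sqrt(np.mean((y_pred - y) ** 2))
# print("out-of-fold RMSE:", rmse)
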
 metricas = ("Metricas del modelo " + nombreClasificador).capitalize()
 imprimirTextoCentrado(metricas, tamanoConsola)
 mostrarMetricasGenerales(model, x_train, y_train, y_pred_train, "train")
 mostrarMetricasGenerales(model, x_test, y_test, y_pred_test, "test")
 imprimirTextoCentrado("", tamanoConsola, "*")
 imprimirTextoCentrado("Metricas importantes para clasificación",
                       tamanoConsola, "*")
 confusion_matrix_train = mostrarMetricasClasificacion(
     model, x_train, y_train, y_pred_train, "train")
 imprimirTextoCentrado("", tamanoConsola, "#")
 confusion_matrix_test = mostrarMetricasClasificacion(
     model, x_test, y_test, y_pred_test, "test")
 # Create a k-fold cross-validation iterator
 # Note: by default, the score used is the one returned by the
 #       estimator's score method (accuracy)
 cv = KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=0)
 scores = cross_val_score(model, x_train, y_train, cv=cv)
 print("Scores: ", (scores))
 print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores),
                                                 sem(scores)))
 print(
     "*******************************************************************")
 ###########################################################################
 if generarCompar:
     # Plotting
     # This section plots the results obtained: the emotional classification
     # is plotted against the independent variables, which in this case are
     # the pixels of the face images.
     mostrarGraficacionPrediVsReal(x_train, y_train, y_pred_train, "train",
                                   nombreClasificador)
示例#19
0

def loadDataSet(filename):
    strArr = [line.strip().split('\t') for line in open(filename).readlines()]
    dataSet = [map(float, line) for line in strArr]
    dataMat = np.mat(dataSet)
    m, n = np.shape(dataMat)
    return dataMat[:, :n - 1], dataMat[:, -1]


if __name__ == "__main__":
    x, y = loadDataSet('../2-knn/datingTestSet2.txt')

    # split the dataset
    m = np.shape(x)[0]
    kf = KFold(m, n_folds=5, shuffle=True)  # split the 1000 samples into 5 folds
    clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    for iteration, data in enumerate(kf, start=1):
        clf.fit(x[data[0]], np.ravel(y[data[0]]))
        answer = clf.predict(x[data[1]])
        print 'iteration', iteration
        print(classification_report(y[data[1]], answer))

    # train the KNN classifier
    # x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
    # clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    # clf.fit(x_train,np.ravel(y_train))
    # answer = clf.predict(x_test)
    # print(classification_report(y_test,answer))

    # precision, recall, thresholds = precision_recall_curve(y_test, answer)  # binary classification only
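    # precision_recall_curve only accepts binary labels; a hedged illustration
    # (not part of the original script) is to score one class against the rest,
    # here class 3 vs. everything else with a naive first/second-half split:
    from sklearn.metrics import precision_recall_curve
    y_bin = np.ravel(y) == 3
    clf_bin = neighbors.KNeighborsClassifier(n_neighbors=3)
    clf_bin.fit(x[:m // 2], y_bin[:m // 2])
    scores_bin = clf_bin.predict_proba(x[m // 2:])[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_bin[m // 2:],
                                                           scores_bin)
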
data = featureFormat(my_dataset, features_list)

### split into labels and features (this line assumes that the first
### feature in the array is the label, which is why "poi" must always
### be first in features_list)
labels, features = targetFeatureSplit(data)

### machine learning goes here!
### please name your classifier clf for easy export below

### deploying feature selection
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, labels, test_size=0.1, random_state=42)

### use KFold for split and validate algorithm
kf = KFold(len(labels), 3)
for train_indices, test_indices in kf:
    #make training and testing sets
    features_train = [features[ii] for ii in train_indices]
    features_test = [features[ii] for ii in test_indices]
    labels_train = [labels[ii] for ii in train_indices]
    labels_test = [labels[ii] for ii in test_indices]

t0 = time()

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
score = clf.score(features_test, labels_test)
print 'accuracy before tuning ', score

print "Decision tree algorithm time:", round(time() - t0, 3), "s"
示例#21
0
def main():
	if len(sys.argv) == 1:
		print 'need filename'
		sys.exit(-1)
	else:
		infilename = sys.argv[-1]
		print infilename

	#npzfile = np.load('data/unigram_bigram_ner_senti_pos_lda_data.npz')
	npzfile = np.load(infilename)
	X = npzfile['X'];
	y = npzfile['y'];
	#split the data into 8:2 -> training:testing 
	trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=0)

	print 'feature size: '+str(np.shape(X))

	
	feature_index = set() # store the best feature indices
	

	kf = KFold(trainX.shape[0], n_folds=5) # kfold on training set for feature selection
	for train_index, test_index in kf: 
		trainX_train, trainX_test = trainX[train_index], trainX[test_index]
		trainy_train, trainy_test = trainy[train_index], trainy[test_index]  # index into the training split, not the full y

		auc_best_global = 0; # best auc in each cross validation
		xtrainBest = []	 # store the best feature matrix for the training portion of trainX
		xtestBest = []  # store the best feature matrix for the testing portion of trainX
		residual_col_indices = set() # residual column indices to check for each iteration when adding new features
		for i in range(0,X.shape[1]): #init the set with all col indices
			residual_col_indices.add(i)

		for i in range(0, X.shape[1]):

			colInd_best = -1;
			auc_best_local = 0 # init to 0
			for colInd in residual_col_indices:

				if i == 0: # if it's the first feature to add
					xtrainCur = trainX_train[:,colInd].reshape(trainX_train.shape[0],-1) #convert to a column vector
					xtestCur = trainX_test[:,colInd].reshape(trainX_test.shape[0],-1)
				else: 
					xtrainCur = np.hstack((xtrainBest, trainX_train[:,colInd].reshape(trainX_train.shape[0],-1) ))
					xtestCur =  np.hstack((xtestBest, trainX_test[:,colInd].reshape(trainX_test.shape[0],-1) ))

				clf = LogisticRegression();
				clf.fit(xtrainCur, trainy_train)


				y_true, y_pred = trainy_test, clf.predict(xtestCur)
				auc = roc_auc_score(y_true, y_pred) # auc score
				
				if auc_best_local < auc:
					auc_best_local = auc
					colInd_best = colInd
					print 'auc = ' + str(auc_best_local) + '\tcolInd_best = '+str(colInd_best)
					

			if auc_best_global < auc_best_local : # if auc is increasing by adding new features
				if i == 0: # if it's the first feature to add
					xtrainBest = trainX_train[:,colInd_best].reshape(trainX_train.shape[0],-1)
					xtestBest = trainX_test[:,colInd_best].reshape(trainX_test.shape[0],-1)
				else:
					xtrainBest = np.hstack((xtrainBest,trainX_train[:,colInd_best].reshape(trainX_train.shape[0],-1)))
					xtestBest = np.hstack((xtestBest,trainX_test[:,colInd_best].reshape(trainX_test.shape[0],-1)))

				print 'feature index to add: '+str(colInd_best)
				feature_index.add(colInd_best) # union of all features selected during each k-fold CV
				residual_col_indices.remove(colInd_best)
				auc_best_global = auc_best_local

				if auc_best_global == 1:
					break;
			else: 
				break;
				

		print 'auc_best_global found on current trainX_test fold: '+str(auc_best_global)

	print '# features selected = '+str(len(feature_index)) 
	feature_index = list(feature_index)
	print 'feature_index = ' + str(feature_index)
	# should NOT sort feature_index before test!

	outfilename = infilename[0:-8] +'selected.npz' 
	np.savez(outfilename,X = X[:,feature_index], y = y)

	clf.fit(trainX[:,feature_index], trainy)
	testy_true, testy_pred = testy, clf.predict(testX[:,feature_index])
	auc_test = roc_auc_score(testy_true, testy_pred)		
	print 'auc test = '+str(auc_test)

	# ---------------------------------- tune params ----------------------------------

	# Set the parameters by cross-validation
	tuned_parameters = [{},
						{'penalty': ['l2'], 'C':np.logspace(-5, 4, 10), 'solver': ['sag'] ,'max_iter':[500] },
						{'penalty': ['l2'], 'C':np.logspace(-5, 4, 10), 'solver': ['newton-cg'] ,'max_iter':[500] },
						{'penalty': ['l2'], 'C':np.logspace(-5, 4, 10), 'solver': ['lbfgs'] ,'max_iter':[500] },
						{'penalty': ['l2','l1'], 'C':np.logspace(-5, 4, 10), 'solver': ['liblinear'] ,'max_iter':[500] }
						]
	clf = GridSearchCV(LogisticRegression(class_weight= 'balanced'), tuned_parameters, cv=5, scoring= None)

	clf.fit(trainX[:,feature_index], trainy)

	print("Best parameters set found on development set:")
	print(clf.best_params_)
	y_true, y_pred = testy, clf.predict(testX[:,feature_index])
	auc = roc_auc_score(y_true, y_pred)
	print 'accuracy = ' + str(accuracy_score(y_true, y_pred))
	print 'auc = ' + str(auc)
示例#22
0
lasagne.layers.set_all_param_values(net['prob'], d['param values'])
for i, (tr_ix, val_ix) in enumerate(kf):
    print('CV Fold', i)
    X_tr = X[tr_ix]
    y_tr = y[tr_ix]
    X_val = X[val_ix]
    y_val = y[val_ix]

    #net['new_output'] = DenseLayer(net['pool5/7x7_s1'], num_units=10, nonlinearity=softmax, W=lasagne.init.Normal(0.01))
    lasagne.layers.set_all_param_values(net['prob'], d['param values'])
    learning_rate.set_value(0.0002)

    for epoch in range(2):

        kf2 = KFold(len(y_tr),
                    n_folds=int(len(y_tr) // BATCH_SIZE),  # integer fold count
                    shuffle=True,
                    random_state=1)
        progbar = Progbar(int(len(y_tr) // BATCH_SIZE))
        for j, (_, ix) in enumerate(kf2):
            loss, acc = train_batch(ix)
            progbar.add(1)

        learning_rate.set_value(learning_rate.get_value() *
                                learning_rate_decay)

        v_ix = range(len(y_val))
        t_ix = range(len(y_tr))
        np.random.shuffle(v_ix)
        np.random.shuffle(t_ix)

        tr_loss_tot = 0.
示例#23
0
train_feat['id'] = train_feat['id'].apply(lambda x: 0 - int(x[1:])
                                          if 'p' in x else int(x[1:]))
test_feat['id'] = test_feat['id'].apply(lambda x: 0 - int(x[1:])
                                        if 'p' in x else int(x[1:]))

predictors = train_feat.columns.drop(
    ['label', 'enddate', 'hy_16.0', 'hy_91.0', 'hy_94.0'])

print('Starting 5-fold CV training...')
scores = []
t0 = time.time()
mean_score = []
train_preds = np.zeros(len(train_feat))
test_preds = np.zeros(len(test_feat))
kf = KFold(len(train_feat), n_folds=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf):
    lgb_train = lgb.Dataset(train_feat[predictors].iloc[train_index],
                            train_feat['label'].iloc[train_index])
    lgb_test = lgb.Dataset(train_feat[predictors].iloc[test_index],
                           train_feat['label'].iloc[test_index])

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 20,
        'num_leaves': 150,
        'learning_rate': 0.01,
        'subsample': 0.7,
    def get_ten_fold_crossvalid_perfermance(self, fisher_mode, settings=None):
        analysis_scr = []
        predicted_score = False
        reduce_ratio = 1
        #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
        #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
        kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds=10)
        #for subset_no in range(1, 11):
        for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
            #for train_index, test_index in kf;
            print("Subset:", subset_no)
            print("Train index: ", train_index)
            print("Test index: ", test_index)
            #logger.info('subset number: ' + str(subset_no))
            if 1:
                print "SVM"
                #start_index = int((subset_no - 1) * subset_size + 1)
                #if subset_no == 10:
                #    end_index  = int(max(start_index + subset_size, self.ddi_obj.total_number_of_sequences))
                #else:
                #    end_index  = int(start_index + subset_size)
                #print  start_index, end_index
                #(train_X_10fold, train_y_10fold),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(start_index, end_index, reduce_ratio = reduce_ratio)
                (train_X_10fold,
                 train_y_10fold), (train_X_reduced, train_y_reduced), (
                     test_X,
                     test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(
                         train_index, test_index, reduce_ratio=reduce_ratio)
                standard_scaler = preprocessing.StandardScaler().fit(
                    train_X_reduced)
                scaled_train_X = standard_scaler.transform(train_X_reduced)
                scaled_test_X = standard_scaler.transform(test_X)
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            # direct deep learning
            min_max_scaler = Precessing_Scaler_0_9()
            X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
            X_train_pre_validation_minmax = min_max_scaler.transform(
                train_X_reduced)
            x_test_minmax = min_max_scaler.transform(test_X)
            pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
            x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
                X_train_pre_validation_minmax,
                train_y_reduced,
                test_size=0.4,
                random_state=42)
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(5000,
                                            x_train_minmax,
                                            batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100]
            corruption_levels = [0.1, 0.1]
            if 1:
                print "direct deep learning"
                sda = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'DL', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))

            if 0:
                # deep learning using unlabeled data for pretraining
                print 'deep learning with unlabeled data'
                pretraining_epochs = cal_epochs(5000,
                                                pretraining_X_minmax,
                                                batch_size=batch_size)
                sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             pretraining_X_minmax = pretraining_X_minmax,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_unlabel.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_unlabel.predict(x_test_minmax)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))
            if 0:
                # deep learning using split network
                print 'deep learning using split network'
                # get the new representation for A set. first 784-D
                pretraining_epochs = 5000
                hidden_layers_sizes = [100, 100, 100]
                corruption_levels = [0, 0, 0]

                x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
                print "original shape for A", x.shape
                a_MAE_A = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_A = a_MAE_A.transform(
                    x_train_minmax[:, :x_train_minmax.shape[1] / 2])
                x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]

                print "original shape for B", x.shape
                a_MAE_B = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_B = a_MAE_B.transform(
                    x_train_minmax[:, x_train_minmax.shape[1] / 2:])

                new_x_test_minmax_A = a_MAE_A.transform(
                    x_test_minmax[:, :x_test_minmax.shape[1] / 2])
                new_x_test_minmax_B = a_MAE_B.transform(
                    x_test_minmax[:, x_test_minmax.shape[1] / 2:])
                new_x_validation_minmax_A = a_MAE_A.transform(
                    x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
                new_x_validation_minmax_B = a_MAE_B.transform(
                    x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
                new_x_train_minmax_whole = np.hstack(
                    (new_x_train_minmax_A, new_x_train_minmax_B))
                new_x_test_minmax_whole = np.hstack(
                    (new_x_test_minmax_A, new_x_test_minmax_B))
                new_x_validationt_minmax_whole = np.hstack(
                    (new_x_validation_minmax_A, new_x_validation_minmax_B))

                finetune_lr = 1
                batch_size = 100
                pretraining_epochs = cal_epochs(5000,
                                                x_train_minmax,
                                                batch_size=batch_size)
                #pretrain_lr=0.001
                pretrain_lr = 0.001
                training_epochs = 1500
                hidden_layers_sizes = [100, 100, 100]
                corruption_levels = [0, 0, 0]

                sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                     new_x_validationt_minmax_whole, y_validation_minmax ,
                     new_x_test_minmax_whole, y_test,
                     hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                     training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                     pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )

                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_transformed.predict(
                    new_x_train_minmax_whole)
                y_train = y_train_minmax

                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_transformed.predict(
                    new_x_test_minmax_whole)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))

        report_name = filename + '_' + '_test10fold_'.join(
            map(str, hidden_layers_sizes)
        ) + '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(
            reduce_ratio) + '_' + str(training_epochs) + '_' + current_date
        saveAsCsv(predicted_score, report_name,
                  performance_score(y_test, test_predicted, predicted_score),
                  analysis_scr)
示例#25
0
parser.add_argument('-b','--minbin', help='Minimum categorical bin size', type=int, default=1)
parser.add_argument('-cv','--cv', action='store_true')
parser.add_argument('-codetest','--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
parser.add_argument('-extra', '--extra', action='store_true')
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("BNP Parabas: classification...\n") 
clf = ExtraTreesRegressor(n_estimators=700, max_features=60, min_samples_split= 4, max_depth=40, n_jobs=-1, min_samples_leaf=2)

if m_params['cv']:
	# do cross validation scoring
	kf = KFold(X.shape[0], n_folds=4, shuffle=True, random_state=1)
	scr = np.zeros([len(kf)])
	oob_pred = np.zeros(X.shape[0])

	for i, (tr_ix, val_ix) in enumerate(kf):
		clf.fit(X[tr_ix], y[tr_ix])
		pred = clf.predict(X[val_ix])
		oob_pred[val_ix] = np.array(pred)
		scr[i] = log_loss(y[val_ix], np.array(pred))
		print('Train score is:', scr[i])
	print(log_loss(y, oob_pred))
	print(oob_pred[1:10])
	oob_filename = '../output/oob_pred_extrees_' + str(np.mean(scr)) + '.p'
	pkl.dump(oob_pred, open(oob_filename, 'wb'))

else:
    return et


sql = ''
Features_list, Target_list = SQLTrainData(sql)
from sklearn import cross_validation
from sklearn.cross_validation import KFold

train, test, y_train, y_test = cross_validation.train_test_split(
    Features_list, Target_list, test_size=0.3, random_state=2017)

ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0  # for reproducibility
NFOLDS = 5  # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)


# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
示例#27
0
# 2. Compute the TF-IDF features for all of the texts. Note that in this assignment you are asked to
# compute TF-IDF over all of the data. With this approach the features of the training set end up using
# information from the test sample, but that is perfectly legitimate here, since we never use the target
# values from the test set. In practice it is quite common for the features of the test objects to be known
# at training time, so they can be used when fitting the algorithm.

vectorizer = TfidfVectorizer()
vectorizer.fit_transform(X)

# 3. Using 5-fold cross-validation, find the smallest of the best-scoring values of the parameter C from the
# set [10^-5, 10^-4, ..., 10^4, 10^5] for an SVM with a linear kernel (kernel='linear'). Set random_state=241
# for both the SVM and the KFold. Use the fraction of correct answers (accuracy) as the quality metric.

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=241)
model = SVC(kernel='linear', random_state=241)
gs = grid_search.GridSearchCV(model, grid, scoring='accuracy', cv=cv)
gs.fit(vectorizer.transform(X), y)

score = 0
C = 0
for attempt in gs.grid_scores_:
    if attempt.mean_validation_score > score:
        score = attempt.mean_validation_score
        C = attempt.parameters['C']
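
# The manual scan over gs.grid_scores_ above can also be read directly from the
# fitted GridSearchCV object; these attributes should give the same C and score:
best_C = gs.best_params_['C']
best_score = gs.best_score_
print(best_C, best_score)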

# 4. Train the SVM on the whole dataset with the optimal parameter C found in the previous step.

model = SVC(kernel='linear', random_state=241, C=C)
model.fit(vectorizer.transform(X), y)
示例#28
0
#%%
# Lasso
model = linear_model.Lasso(alpha = 0.001)
model.fit(trainX, trainY)
prediction = model.predict(testX)
print("Lasso Accuracy: ", model.score(testX,testY))

#%%
#Ridge
model = linear_model.Ridge(alpha = 0.05, normalize=True)
model.fit(trainX, trainY)
prediction = model.predict(testX)
print("Ridge Accuracy: ", model.score(testX,testY))

#%%
kfold = KFold(len(wine), n_folds=10, shuffle=True, random_state=10)  # old-API KFold takes the sample count first, then n_folds

#%%
cvMean = []
results = []
classifiers = ['Linear Svm','Radial Svm','Logistic Regression','Decision Tree','KNN']
models = [svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),DecisionTreeClassifier(),KNeighborsClassifier(n_neighbors=3)]
for i in models:
    model = i
    result = cross_val_score(model, wine[wine.columns[:11]], wine['quality'],cv=kfold, scoring='accuracy')
    results.append(result)
    cvMean.append(result.mean())
new_models_df = pd.DataFrame(cvMean, index=classifiers)
new_models_df.columns = ['CV Mean']
new_models_df
def stacking_classifier(folds, models):    
    # Level 1 regression models
    regrs = models

    # k-fold cross validation with the requested number of folds
    kf = list(KFold(len(target_train_bin), n_folds=folds, shuffle = True, random_state = 1991))

   
    # Pre-allocate the data
    blend_train = np.zeros((regressors_train_pca.shape[0], len(regrs)))     # Number of training data x Number of classifiers
    blend_test = np.zeros((regressors_validation_pca.shape[0], len(regrs)))       # Number of testing data x Number of classifiers
                  
    
    # For each classifier, we train the number of fold times (=len(kf))
    for j, clf in enumerate(regrs):
        print('Training Regression Model [{}] - {}'.format(j, clf))
        blend_test_j = np.zeros((regressors_validation_pca.shape[0], len(kf))) # Number of testing data x Number of folds , we will take the mean of the predictions later
        for i, (train_index, cv_index) in enumerate(kf):
            print('Fold [{}]'.format(i))
            
            # This is the training and validation set
            X_train = regressors_train_pca[train_index]
            #Y_train = target_train_bin.iloc[train_index]
            Y_train = target_train_bin[train_index]
            X_cv = regressors_train_pca[cv_index]
            
            if(j == 0):
                # ANN
                Y_train = to_categorical(Y_train)
                clf.fit(X_train, Y_train, validation_split=0.2, epochs=5, batch_size=16, verbose=2)
            else:  
                clf.fit(X_train, Y_train)

            
            # This output will be the basis for our blended classifier to train against,
            # which is also the output of level 1 Regressors
            if(j==0):
                blend_train[cv_index, j] = clf.predict_classes(X_cv).flatten()
                blend_test_j[:, i] = clf.predict_classes(regressors_validation_pca).flatten()
            else:
                blend_train[cv_index, j] = clf.predict(X_cv).flatten()
                blend_test_j[:, i] = clf.predict(regressors_validation_pca).flatten()
           
        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)
    
    
    # Blending (predict Level 2 based on predictions on the train set)
 
#    ridgecv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=False)
#    ridgecv.fit(blend_train, target_train)
#    ridgecv.alpha_
    # Fit Ridge model with best alpha

#    bclf = Ridge(alpha=ridgecv.alpha_, normalize=False, max_iter=10000)
#    bclf.fit(blend_train, target_train)

    bclf =  LogisticRegression()
    bclf.fit(blend_train, target_train_bin)
    #bclf =NN_CLF_model(len(regrs))
    #bclf.fit(blend_train, to_categorical(target_train_bin), validation_split=0.2, epochs=5, batch_size=16, verbose=2)
    # Predict now
    predicted_level2_bin = bclf.predict(blend_test)
    #predicted_level2_bin = bclf.predict_classes(blend_test)
    score = accuracy_score(target_validation_bin, predicted_level2_bin)
    return score, predicted_level2_bin
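
# Hedged usage note (not from the original): stacking_classifier assumes the
# first entry of models is a Keras classifier, because index 0 is driven through
# to_categorical / predict_classes above, while the remaining entries are plain
# sklearn estimators. A call could look like:
# score, level2_predictions = stacking_classifier(
#     folds=5,
#     models=[build_keras_classifier(),          # hypothetical Keras model factory
#             RandomForestClassifier(n_estimators=200, random_state=1991),
#             GradientBoostingClassifier(random_state=1991)])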
示例#30
0
            perturb[i] = 0
        return grad

    d = Orange.data.Table('housing')
    d.X = np.hstack((d.X, np.ones((d.X.shape[0], 1))))
    d.shuffle()

    #    m = LinearRegressionLearner(lambda_=1.0)
    #    print(m(d)(d))

    #    # gradient check
    #    m = LinearRegressionLearner(lambda_=1.0)
    #    theta = np.random.randn(d.X.shape[1])
    #
    #    ga = m.cost_grad(theta, d.X, d.Y.ravel())[1]
    #    gm = numerical_grad(lambda t: m.cost_grad(t, d.X, d.Y.ravel())[0], theta)
    #
    #    print(np.sum((ga - gm)**2))

    for lambda_ in (0.01, 0.03, 0.1, 0.3, 1, 3):
        m = LinearRegressionLearner(lambda_=lambda_)
        scores = []
        for tr_ind, te_ind in KFold(d.X.shape[0]):
            s = np.mean((m(d[tr_ind])(d[te_ind]) - d[te_ind].Y.ravel())**2)
            scores.append(s)
        print('{:5.2f} {}'.format(lambda_, np.mean(scores)))

    m = LinearRegressionLearner(lambda_=0)
    print('test data', np.mean((m(d)(d) - d.Y.ravel())**2))
    print('majority', np.mean((np.mean(d.Y.ravel()) - d.Y.ravel())**2))
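
    # Hedged follow-up (not in the original): keep the mean CV error for each
    # lambda_ and refit with the best one instead of an arbitrary value.
    cv_means = {}
    for lambda_ in (0.01, 0.03, 0.1, 0.3, 1, 3):
        m = LinearRegressionLearner(lambda_=lambda_)
        fold_mse = [np.mean((m(d[tr])(d[te]) - d[te].Y.ravel())**2)
                    for tr, te in KFold(d.X.shape[0])]
        cv_means[lambda_] = np.mean(fold_mse)
    best_lambda = min(cv_means, key=cv_means.get)
    m = LinearRegressionLearner(lambda_=best_lambda)
    print('best lambda', best_lambda,
          'in-sample MSE', np.mean((m(d)(d) - d.Y.ravel())**2))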