Example #1
from sklearn.cross_validation import LabelShuffleSplit

def split(sources, n_iter=5, test_size=0.2, random_state=None):
    '''Generate train-test splits from an array of sources.

    Splits are grouped on the `key` field of the data.

    Parameters
    ----------
    sources : pd.DataFrame
        DataFrame containing the list of source data

    n_iter : int > 0
        Number of splits to generate

    test_size : float > 0
        Approximate fraction of points to land in the test set

    random_state : int or RandomState
        PRNG seed

    Yields
    ------
    train, test : sets of keys
        keys belonging to the train or test set respectively
    '''

    for train, test in LabelShuffleSplit(sources.key,
                                         n_iter=n_iter,
                                         test_size=test_size,
                                         random_state=random_state):

        yield set(sources.iloc[train].key), set(sources.iloc[test].key)
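
A minimal usage sketch for the generator above (the `sources` frame and its `key` column here are hypothetical):

import pandas as pd

# Hypothetical source table: each row carries a group `key`.
sources = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'c', 'c']})

for train_keys, test_keys in split(sources, n_iter=3, test_size=0.3, random_state=0):
    # Train and test keys never overlap.
    print(train_keys, test_keys)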
Example #2
def shufflesplit():
    from sklearn.cross_validation import LabelShuffleSplit
    labels = [1, 1, 2, 2, 3, 3, 4, 4]
    slo = LabelShuffleSplit(labels, n_iter=4, test_size=0.5, random_state=0)

    for train, test in slo:
        print("%s %s" % (train, test))
Example #3
    def __init__(self, train_or_test, shuffle=True, dir=None, random_state=1):
        # Assumes module-level imports: os, pickle, numpy as np,
        # sklearn's LabelShuffleSplit, and tensorpack's get_rng.
        assert train_or_test in ['train', 'test']

        DATASET_PATH = os.environ.get('DATASET_PATH', '/home/ubuntu/distracted-drivers-tf/dataset/data_large20.pkl')

        print('Loading dataset {}...'.format(DATASET_PATH))
        with open(DATASET_PATH, 'rb') as f:
            X_train_raw, y_train_raw, self.X_test, self.X_test_ids, driver_ids = pickle.load(f)

        # Map driver ids to integer labels so the split groups by driver.
        _, driver_indices = np.unique(np.array(driver_ids), return_inverse=True)

        ret = []
        # n_iter=1: a single driver-grouped train/validation split.
        for train_index, valid_index in LabelShuffleSplit(driver_indices, n_iter=1, test_size=0.2, random_state=random_state):

            x = X_train_raw  # .reshape(X_train_raw.shape[0], 3, 24, 32)
            y = np.argmax(y_train_raw, axis=1)

            # Scale to [0, 1] and subtract the mean pixel computed over
            # both the training and test images.
            x = x / np.float32(255)
            self.X_test = self.X_test / np.float32(255)

            self.pixel_mean = np.mean(np.vstack((self.X_test, x)), axis=0)
            x -= self.pixel_mean
            self.X_test -= self.pixel_mean

            with open("/home/ubuntu/tensorpack/examples/ResNet/pixel_mean.p", "wb") as f:
                pickle.dump(self.pixel_mean, f)

            X_train, Y_train = x[train_index, :, :, :], y[train_index]
            X_valid, Y_valid = x[valid_index, :, :, :], y[valid_index]

            if train_or_test == 'train':
                for img, label in zip(X_train, Y_train):
                    ret.append([img, label])
            else:
                for img, label in zip(X_valid, Y_valid):
                    ret.append([img, label])

        self.train_or_test = train_or_test
        self.data = ret
        self.shuffle = shuffle
        self.rng = get_rng(self)
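
The point of splitting on `driver_indices` is to keep each driver entirely on one side of the split, so validation measures generalization to unseen drivers. A minimal sketch of that grouping step, with hypothetical driver ids:

import numpy as np
from sklearn.cross_validation import LabelShuffleSplit

driver_ids = ['p002', 'p002', 'p012', 'p012', 'p014', 'p014']  # hypothetical
_, driver_indices = np.unique(np.array(driver_ids), return_inverse=True)

for train_idx, valid_idx in LabelShuffleSplit(driver_indices, n_iter=1,
                                              test_size=0.5, random_state=1):
    # No driver appears on both sides of the split.
    assert not set(driver_indices[train_idx]) & set(driver_indices[valid_idx])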
Example #4
def classification_model(model,
                         data,
                         predictors,
                         label,
                         categorical_features=None,
                         cv_label_name=None,
                         k=5,
                         test_size=0.1,
                         n_iter=100,
                         train_only=False):
    data_len = len(data)
    cv = None
    auc, r2, rmse, acc = [], [], [], []

    # print 'Predictors:', predictors
    predictors = [p.strip() for p in predictors]

    if cv_label_name is not None:
        cv_label = data[cv_label_name]
    else:
        cv_label = None

    if k is not None and cv_label is not None:
        cv = LabelKFold(cv_label, n_folds=k)
    elif k is not None and cv_label is None:
        cv = KFold(data_len, n_folds=k, shuffle=True)

    if k is None and test_size is not None and n_iter is not None and cv_label is not None:
        cv = LabelShuffleSplit(cv_label,
                               n_iter=n_iter,
                               test_size=test_size,
                               random_state=42)
    if k is None and test_size is not None and n_iter is not None and cv_label is None:
        cv = ShuffleSplit(data_len,
                          n_iter=n_iter,
                          test_size=test_size,
                          random_state=42)

    if cv is None:
        raise ValueError('no cross-validation strategy matches the given arguments')

    for train, test in cv:
        x_train = (data[predictors].iloc[train, :])
        y_train = data[label].iloc[train]
        x_test = (data[predictors].iloc[test, :])
        y_test = data[label].iloc[test]

        if categorical_features is not None:
            feature_idxs = [
                x_train.columns.get_loc(name) for name in categorical_features
            ]
            encoder = OneHotEncoder(categorical_features=feature_idxs)
            encoder.fit(np.vstack((x_train, x_test)))
            x_train = encoder.transform(x_train)
            x_test = encoder.transform(x_test)

        model.fit(x_train, y_train)
        if train_only:
            x_test = x_train
            y_test = y_train
        y_pred_p = model.predict_proba(x_test)[:, 1]
        y_pred_c = model.predict(x_test)

        a, b, c, d = binary_classification_metrics(y_test, y_pred_p, y_pred_c)

        auc.append(a)
        r2.append(b)
        rmse.append(c)
        acc.append(d)

    # print 'auc:', a
    # print 'r2:', b
    # print 'rmse:', c
    # print 'accuracy:', d

    return np.mean(auc), np.mean(r2), np.mean(rmse), np.mean(acc)
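
A hypothetical call, assuming the undefined `binary_classification_metrics` helper is available and rows are grouped by a `user_id` column:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
data = pd.DataFrame({'f1': rng.rand(100),
                     'f2': rng.rand(100),
                     'y': rng.randint(0, 2, 100),
                     'user_id': np.repeat(np.arange(20), 5)})

# k=5 with a cv label selects the LabelKFold branch above.
auc, r2, rmse, acc = classification_model(LogisticRegression(), data,
                                          predictors=['f1', 'f2'], label='y',
                                          cv_label_name='user_id', k=5)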

Example #5
def res_net_50():
    base_model = resnet50.ResNet50(input_shape=(WIDTH, HEIGHT, NB_CHANNELS), include_top=False, weights='imagenet')
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)
    model = Model(base_model.input, predictions)
    for layer in base_model.layers:
        layer.trainable = False

    model.compile(Adam(lr=1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
    return model
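
# Sketch (not from the source): freezing the base above means only the new
# softmax head trains at first. A common second stage unfreezes the base and
# recompiles with a smaller learning rate for end-to-end fine-tuning.
def fine_tune(model, base_model, lr=1e-5):
    for layer in base_model.layers:
        layer.trainable = True
    # Recompiling is required for the trainability change to take effect.
    model.compile(Adam(lr=lr), loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model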


for train_index, valid_index in LabelShuffleSplit(driver_indices, n_iter=MAX_FOLDS, test_size=0.2, random_state=67):
    print('Fold {}/{}'.format(num_folds + 1, MAX_FOLDS))

    X_train, y_train = X_train_raw[train_index, ...], y_train_raw[train_index, ...]
    X_valid, y_valid = X_train_raw[valid_index, ...], y_train_raw[valid_index, ...]
    X_train = X_train.transpose(0, 2, 3, 1)
    X_valid = X_valid.transpose(0, 2, 3, 1)
    model = choose_model(MODEL_NAME)
    model_path = os.path.join(MODEL_PATH, 'model_{}.json'.format(num_folds))
    with open(model_path, 'w') as f:
        f.write(model.to_json())

    # restore existing checkpoint, if it exists
    checkpoint_path = os.path.join(CHECKPOINT_PATH, 'model_{}.h5'.format(num_folds))
    if os.path.exists(checkpoint_path):
        print('Restoring fold from checkpoint.')
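        # NOTE: the snippet is cut off here; a typical continuation
        # (an assumption, not from the source) would be:
        model.load_weights(checkpoint_path)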
Example #6
mean_epi = mean_img(func_filename)
plot_stat_map(coef_img, mean_epi, title="SVM weights", display_mode="yx")
"""

###########################################################################
# Small sample recovery experiment

from sklearn.cross_validation import LabelShuffleSplit
from sklearn import metrics

# run a model on all the data
model.fit(fmri_masked, target, connectivity=connectivity)

# Capture the reference coefficients from the full-data fit once, before
# the model is refit on subsamples below.
coef_all = model.coef_
bin_coef_all = np.abs(coef_all) > np.percentile(np.abs(coef_all), 10)

for proportion in [1. / 6, 1. / 4, 1. / 3, 1. / 2]:
    slo = LabelShuffleSplit(sessions,
                            n_iter=10,
                            train_size=proportion,
                            random_state=0)
    coefs = []
    for train, _ in slo:
        coefs.append(
            model.fit(fmri_masked[train],
                      target[train],
                      connectivity=connectivity).coef_)

    auc = []
    for coef in coefs:
        # precision_recall_curve returns precision/recall, not fpr/tpr.
        precision, recall, _ = metrics.precision_recall_curve(bin_coef_all, np.abs(coef))
        auc.append(metrics.roc_auc_score(bin_coef_all, np.abs(coef)))
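
    # Sketch (not in the source): summarize the recovery experiment by
    # averaging the per-split scores for this train proportion.
    print('train proportion %.2f: mean recovery AUC %.3f'
          % (proportion, np.mean(auc)))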