def split(sources, n_iter=5, test_size=0.2, random_state=None):
    '''Generate train-test splits from an array of sources.

    Splits are conditioned on the `key` field of the data: the `key`
    column is handed to `LabelShuffleSplit` as the label array.

    Parameters
    ----------
    sources : pd.DataFrame
        DataFrame containing the list of source data; must expose a
        `key` column

    n_iter : int > 0
        Number of splits to generate

    test_size : float > 0
        Approximate fraction of points to land in the test set

    random_state : int or RandomState
        PRNG seed

    Yields
    ------
    train, test : sets of keys
        keys belonging to the train or test set respectively
    '''
    splitter = LabelShuffleSplit(sources.key,
                                 n_iter=n_iter,
                                 test_size=test_size,
                                 random_state=random_state)

    for train, test in splitter:
        # The splitter yields positional row indices, so rows must be
        # selected with `.iloc`.  `.loc` would interpret them as index
        # labels and pick the wrong rows (or raise) whenever `sources`
        # does not carry a default 0..n-1 index.
        yield set(sources.iloc[train].key), set(sources.iloc[test].key)
def shufflesplit():
    """Print the index arrays produced by a small LabelShuffleSplit demo.

    Eight samples carrying four distinct labels (two samples each) are
    split four times with ``test_size=0.5`` and a fixed seed of 0.  Each
    split is printed as the train indices followed by the test indices.
    """
    from sklearn.cross_validation import LabelShuffleSplit

    groups = [1, 1, 2, 2, 3, 3, 4, 4]
    splitter = LabelShuffleSplit(groups, n_iter=4, test_size=0.5,
                                 random_state=0)
    for train_idx, test_idx in splitter:
        print("%s %s" % (train_idx, test_idx))
def __init__(self, train_or_test, shuffle=True, dir=None,random_state=1):
    """Load the pickled dataset and keep either the train or validation part.

    The pickle at ``$DATASET_PATH`` (or the built-in default path) is
    loaded, one label-wise shuffle split over the driver ids is drawn,
    pixel values are scaled by 1/255 and mean-centred, and the selected
    subset is stored as ``self.data``, a list of ``[image, label]`` pairs.

    Side effects: prints the dataset path and writes the computed pixel
    mean to ``/home/ubuntu/tensorpack/examples/ResNet/pixel_mean.p``.

    Parameters
    ----------
    train_or_test : str
        ``'train'`` keeps the training side of the split, ``'test'`` keeps
        the held-out (validation) side.
    shuffle : bool
        Stored on ``self.shuffle``; not otherwise used here.
    dir : unused
        Accepted for interface compatibility; never read in this method.
    random_state : int
        Seed forwarded to ``LabelShuffleSplit``.

    Raises
    ------
    AssertionError
        If ``train_or_test`` is neither ``'train'`` nor ``'test'``.
    """
    assert train_or_test in ['train', 'test']

    DATASET_PATH = os.environ.get('DATASET_PATH', '/home/ubuntu/distracted-drivers-tf/dataset/data_large20.pkl')
    print('Loading dataset {}...'.format(DATASET_PATH))
    # NOTE(review): pickle.load can execute arbitrary code; DATASET_PATH is
    # taken from the environment, so only point it at trusted files.
    with open(DATASET_PATH, 'rb') as f:
        X_train_raw, y_train_raw, self.X_test, self.X_test_ids, driver_ids = pickle.load(f)

    # Map every driver id to an integer so the split is made per driver.
    _, driver_indices = np.unique(np.array(driver_ids), return_inverse=True)

    # n_iter=1: exactly one split is produced, so take it directly.
    train_index, valid_index = next(iter(
        LabelShuffleSplit(driver_indices, n_iter=1, test_size=0.2, random_state=random_state)))

    # Labels are reduced along axis 1 (presumably one-hot rows -- confirm).
    y = np.argmax(y_train_raw, axis=1)

    # Division allocates a new array, so the in-place centring below does
    # not touch the array that came out of the pickle.
    x = X_train_raw / np.float32(255)
    self.X_test = self.X_test / np.float32(255)

    # The mean is taken over the stacked test and train images together.
    self.pixel_mean = np.mean(np.vstack((self.X_test, x)), axis=0)
    x -= self.pixel_mean
    self.X_test -= self.pixel_mean

    # Use a context manager so the output file is always flushed and closed.
    with open("/home/ubuntu/tensorpack/examples/ResNet/pixel_mean.p", "wb") as mean_file:
        pickle.dump(self.pixel_mean, mean_file)

    chosen = train_index if train_or_test == 'train' else valid_index
    images = x[chosen, :, :, :]
    labels = y[chosen]

    self.train_or_test = train_or_test
    self.data = [[img, label] for img, label in zip(images, labels)]
    self.shuffle = shuffle
    self.rng = get_rng(self)
def classification_model(model, data, predictors, label, categorical_features=None, cv_label_name=None, k=5,
                         test_size=0.1, n_iter=100, train_only=False):
    """Cross-validate `model` on `data` and return the averaged fold metrics.

    Splitter selection:
      * `k` given            -> LabelKFold when `cv_label_name` is given,
                                otherwise a shuffled KFold;
      * `k` is None and both `test_size` and `n_iter` given
                             -> LabelShuffleSplit when `cv_label_name` is
                                given, otherwise ShuffleSplit (seed 42).

    Parameters
    ----------
    model : estimator providing fit / predict / predict_proba
    data : DataFrame holding predictors, the label and, optionally, the
        grouping column
    predictors : iterable of str
        Column names; surrounding whitespace is stripped before use.
    label : str
        Name of the target column.
    categorical_features : iterable of str or None
        Predictor columns to one-hot encode; the encoder is fitted on the
        stacked train and test rows of each fold.
    cv_label_name : str or None
        Column whose values are passed to the label-aware splitters.
    k, test_size, n_iter : splitter settings, see above.
    train_only : bool
        When true, each fold is scored on its own training rows.

    Returns
    -------
    tuple
        Means over folds of the four values returned by
        `binary_classification_metrics`, collected here as
        (auc, r2, rmse, accuracy).
    """
    n_rows = len(data)
    feature_names = [name.strip() for name in predictors]
    groups = data[cv_label_name] if cv_label_name is not None else None

    splitter = None
    if k is not None:
        if groups is not None:
            splitter = LabelKFold(groups, n_folds=k)
        else:
            splitter = KFold(n_rows, n_folds=k, shuffle=True)
    elif test_size is not None and n_iter is not None:
        if groups is not None:
            splitter = LabelShuffleSplit(groups, n_iter=n_iter, test_size=test_size, random_state=42)
        else:
            splitter = ShuffleSplit(n_rows, n_iter=n_iter, test_size=test_size, random_state=42)

    auc_scores, r2_scores, rmse_scores, acc_scores = [], [], [], []

    for train_idx, test_idx in splitter:
        x_train = data[feature_names].iloc[train_idx, :]
        y_train = data[label].iloc[train_idx]
        x_test = data[feature_names].iloc[test_idx, :]
        y_test = data[label].iloc[test_idx]

        if categorical_features is not None:
            column_positions = [x_train.columns.get_loc(name) for name in categorical_features]
            encoder = OneHotEncoder(categorical_features=column_positions)
            encoder.fit(np.vstack((x_train, x_test)))
            x_train = encoder.transform(x_train)
            x_test = encoder.transform(x_test)

        model.fit(x_train, y_train)

        # Score on the training rows themselves when requested.
        x_eval, y_eval = (x_train, y_train) if train_only else (x_test, y_test)

        positive_proba = model.predict_proba(x_eval)[:, 1]
        predicted_class = model.predict(x_eval)

        fold_auc, fold_r2, fold_rmse, fold_acc = binary_classification_metrics(
            y_eval, positive_proba, predicted_class)
        auc_scores.append(fold_auc)
        r2_scores.append(fold_r2)
        rmse_scores.append(fold_rmse)
        acc_scores.append(fold_acc)

    return np.mean(auc_scores), np.mean(r2_scores), np.mean(rmse_scores), np.mean(acc_scores)
def res_net_50():
    """Build and compile a classifier on top of a frozen ResNet50 base.

    The base network is created with ImageNet weights and without its top
    layers, for inputs of shape (WIDTH, HEIGHT, NB_CHANNELS).  Its output is
    globally average-pooled and fed to a softmax Dense layer with
    NUM_CLASSES units.  Every layer of the base is marked non-trainable
    before compiling with Adam (lr=1e-3), categorical cross-entropy loss
    and an accuracy metric.

    Returns
    -------
    The compiled model.
    """
    base_model = resnet50.ResNet50(input_shape=(WIDTH, HEIGHT, NB_CHANNELS), include_top=False, weights='imagenet')
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)
    model = Model(base_model.input, predictions)
    # Freeze the base so only the newly added Dense layer is trained.
    for layer in base_model.layers:
        layer.trainable = False
    model.compile(Adam(lr=1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# One iteration per fold; the split is drawn over `driver_indices` with a
# fixed seed.  NOTE(review): `num_folds` is not updated anywhere in the
# visible fragment -- presumably incremented further down; confirm.
for train_index, valid_index in LabelShuffleSplit(driver_indices, n_iter=MAX_FOLDS, test_size=0.2, random_state=67):
    print('Fold {}/{}'.format(num_folds + 1, MAX_FOLDS))
    X_train, y_train = X_train_raw[train_index, ...], y_train_raw[train_index, ...]
    X_valid, y_valid = X_train_raw[valid_index, ...], y_train_raw[valid_index, ...]
    # Move axis 1 to the last position (presumably channels-first ->
    # channels-last for the network input -- confirm).
    X_train = X_train.transpose(0, 2, 3, 1)
    X_valid = X_valid.transpose(0, 2, 3, 1)
    model = choose_model(MODEL_NAME)
    # Write this fold's model.to_json() output next to the other models.
    model_path = os.path.join(MODEL_PATH, 'model_{}.json'.format(num_folds))
    with open(model_path, 'w') as f:
        f.write(model.to_json())
    # restore existing checkpoint, if it exists
    checkpoint_path = os.path.join(CHECKPOINT_PATH, 'model_{}.h5'.format(num_folds))
    if os.path.exists(checkpoint_path):
        print('Restoring fold from checkpoint.')
mean_epi = mean_img(func_filename) plot_stat_map(coef_img, mean_epi, title="SVM weights", display_mode="yx") """
###########################################################################
# Small sample recovery experiment
#
# NOTE(review): the text above, up to the closing triple quote, is the tail
# of a string literal opened before this fragment; it is not executed.
# NOTE(review): the nesting below was reconstructed from a
# whitespace-collapsed source -- confirm against the original file.
from sklearn.cross_validation import LabelShuffleSplit
from sklearn import metrics

# run a model on all the data
model.fit(fmri_masked, target, connectivity=connectivity)

# Repeat the experiment for several training-set proportions.
for proportion in [1. / 6, 1. / 4, 1. / 3, 1. / 2]:
    # Ten random splits with `sessions` as the labels and a fixed seed.
    slo = LabelShuffleSplit(sessions, n_iter=10, train_size=proportion, random_state=0)
    # get the coefs:
    # NOTE(review): if this really sits inside the loop, from the second
    # proportion onward `model.coef_` comes from the last subset fit rather
    # than from the full-data fit above -- confirm this is intended.
    coef_all = model.coef_
    # Boolean mask: coefficients whose absolute value exceeds the 10th
    # percentile of all absolute coefficient values.
    bin_coef_all = np.abs(coef_all) > np.percentile(np.abs(coef_all), 10)
    coefs = []
    # Refit on the training part of every split and keep its coefficients.
    for train, _ in slo:
        coefs.append(
            model.fit(fmri_masked[train], target[train], connectivity=connectivity).coef_)
    auc = []
    for coef in coefs:
        # NOTE(review): `precision_recall_curve` is not imported in the
        # visible fragment, and per scikit-learn it returns (precision,
        # recall, thresholds), so the names `fpr`/`tpr` look mismatched.
        # Neither value is used below -- confirm whether this call is needed.
        fpr, tpr, _ = precision_recall_curve(bin_coef_all, np.abs(coef))
        # Score how well each subset's |coef| ranks the reference mask.
        auc.append(metrics.roc_auc_score(bin_coef_all, np.abs(coef)))