def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])
    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)
    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
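
# A self-contained sketch of the mask/indices equivalence asserted above,
# using the pre-0.18 scikit-learn API in which cv generators accept an
# `indices` flag (the exact version under test is an assumption):
import numpy as np
from sklearn import cross_validation as cval

kf_mask = cval.KFold(6, 3, indices=False)  # yields boolean masks
kf_ind = cval.KFold(6, 3, indices=True)    # yields integer index arrays
for (tr_m, te_m), (tr_i, te_i) in zip(kf_mask, kf_ind):
    assert np.array_equal(np.where(tr_m)[0], tr_i)
    assert np.array_equal(np.where(te_m)[0], te_i)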
def test_leave_label_out_changing_labels():
    # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
    # the labels variable is changed before calling __iter__
    labels = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    labels_changing = np.array(labels, copy=True)
    lolo = cval.LeaveOneLabelOut(labels)
    lolo_changing = cval.LeaveOneLabelOut(labels_changing)
    lplo = cval.LeavePLabelOut(labels, p=2)
    lplo_changing = cval.LeavePLabelOut(labels_changing, p=2)
    labels_changing[:] = 0
    for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
        for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)
def train_svm(train_vector, train_labels, object_ids):
    """
    train_svm - expects a vector of features and an n x 1 set of
    corresponding labels.

    Returns a trained SVM classifier.
    """
    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    # Grid search with nested cross-validation
    # (gamma and degree are ignored by the linear kernel)
    parameters = {
        'kernel': ['linear'],
        'C': [1, 1e1, 1e2, 1e3, 1e4],
        'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4],
        'degree': [2, 3, 4, 5]
    }
    #parameters = {'kernel': ['linear'], 'C': [1, 1e1], 'gamma': [1, 1e-1, 1e-2]}
    svm = GridSearchCV(SVC(probability=True), parameters,
                       score_func=f1_score, cv=lpl)

    # Train the SVM using the best parameters
    svm.fit(train_vector, train_labels)
    svm_best = svm.best_estimator_
    return svm_best
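
# A minimal usage sketch for train_svm above; the imports and synthetic data
# are assumptions, not part of the original snippet. object_ids groups
# samples by source object, so LeavePLabelOut(p=1) holds out one whole
# object per grid-search fold (old scikit-learn cross_validation API).
import numpy as np
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score

X_demo = np.random.rand(12, 4)        # 12 samples, 4 features
y_demo = np.array([0, 1] * 6)         # alternating binary labels
ids_demo = np.repeat([1, 2, 3], 4)    # 3 objects, 4 samples each
best_svm = train_svm(X_demo, y_demo, ids_demo)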
def train_gradient_boost(train_vector, train_labels, object_ids):
    """
    train_gradient_boost - expects a vector of features and an n x 1 set of
    corresponding labels.

    Returns a trained Gradient Boosting classifier.
    """
    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    parameters = {'n_estimators': [1000], 'learn_rate': [1e-1, 1e-2, 1e-3]}

    # Grid search with nested cross-validation
    #parameters = {'kernel': ['rbf'], 'C': [1, 1e1, 1e2, 1e3, 1e4], 'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4]}
    clf = GridSearchCV(GradientBoostingClassifier(max_depth=4), parameters,
                       score_func=f1_score, cv=lpl)

    # Train the classifier using the best parameters
    clf.fit(train_vector, train_labels)
    clf_best = clf.best_estimator_
    return clf_best
def main():
    c = mdb.connect(host=HOST, user=USER, passwd=PASSWORD, db=DATABASE)

    ## CLASSIFYING USERS FROM KEYSTROKE DYNAMICS
    ## get list of insights and labels
    #userlist = [3, 9]
    #users = []
    #for user in userlist:
    #    values = kt.get_insights_by_user(user, c).values[:, 0].tolist()
    #    users.extend([user] * len(values))
    #users = np.array(users)
    #
    #rfc = ensemble.RandomForestClassifier()
    #train_and_test_model(userlist, users, rfc, c)

    ## PREDICTING HAPPINESS LEVEL FROM ALL DATA
    #users = [45]
    users = [33, 35, 37, 39, 41, 43, 45, 47, 49]
    if len(sys.argv) > 1:
        model_type = sys.argv[1]
    else:
        raise ValueError("must specify model type as argument")
    split_by = 'hour'

    # get all data
    data = get_and_impute_data(users, True, split_by, c)
    happiness = get_var_by_users("happiness_level", users, split_by, c)
    energy = get_var_by_users("energy_level", users, split_by, c)
    relax = get_var_by_users("relax_level", users, split_by, c)
    insights = get_var_by_users("id", users, split_by, c)
    response = np.vstack((happiness, energy, relax)).T
    n_insights = len(np.unique(insights))

    # split dataset into training and evaluation datasets, holding out
    # roughly 20% of the insights; only the first split is used
    for train_idx, eval_idx in cross_validation.LeavePLabelOut(
            insights, int(n_insights * 0.2)):
        train_data = data[train_idx]
        eval_data = data[eval_idx]
        train_response = response[train_idx]
        eval_response = response[eval_idx]
        train_insights = insights[train_idx]
        eval_insights = insights[eval_idx]
        break

    #model = train_and_test_model(train_data, train_response[:, 1], train_insights, model_type, split_by, c, varname="energy level")
    #evaluate_model(eval_data, eval_response[:, 1], model, "%s model evaluation" % model_type, "energy level")
    model = train_and_test_model(train_data, train_response[:, 2],
                                 train_insights, model_type, split_by, c,
                                 varname="relaxation level")
    #evaluate_model(eval_data, eval_response[:, 2], model, "%s model evaluation" % models[model_type][2], "relaxation level")
    np.savetxt('response.txt', np.vstack((happiness, energy, relax)))
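
# The loop above uses LeavePLabelOut only to carve off one grouped
# train/eval split and breaks immediately. A self-contained sketch of the
# same pattern with synthetic "insight" ids (assumed old scikit-learn API):
import numpy as np
from sklearn import cross_validation

demo_insights = np.repeat([10, 20, 30, 40, 50], 2)  # 5 insights, 2 rows each
demo_cv = cross_validation.LeavePLabelOut(demo_insights, 1)
demo_train_idx, demo_eval_idx = next(iter(demo_cv))
# demo_eval_idx contains both rows of one insight, so no insight ever spans
# the train/eval boundary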
def leavePLabelOut(X, y_true, labels):
    y_pred = y_true * 0.0
    lpl = cross_validation.LeavePLabelOut(labels, p=1)
    i = 0
    for train_index, test_index in lpl:
        i += 1
        print "CrossVal " + str(i) + " of " + str(len(lpl))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_true[train_index], y_true[test_index]
        clf = RFClassifier(n_estimators=200)
        #clf = GBClassifier(n_estimators=5000, learning_rate=0.05, max_features=0.25)
        clf.fit(X_train, y_train)
        # predict_proba returns an (n_samples, n_classes) array; keep only
        # the positive-class column
        y_pred[test_index] = clf.predict_proba(X_test)[:, 1]
    roc_auc(y_true, y_pred)
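
# roc_auc is not defined in the snippet above; a plausible stand-in using
# sklearn.metrics (the helper the original code actually used is unknown):
from sklearn.metrics import roc_auc_score

def roc_auc(y_true, y_pred):
    # y_pred holds out-of-fold positive-class probabilities, so the AUC is
    # computed on predictions from models that never saw those samples
    print "ROC AUC: %.4f" % roc_auc_score(y_true, y_pred)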
def train_knn(train_vector, train_labels, test_vector, test_labels, scaler):
    """
    train_knn - expects train/test feature vectors with their corresponding
    labels and a fitted scaler; the number of neighbors is chosen by grid
    search.

    Returns the best knn classifier plus its probabilities, f1 score and
    classification report on the test set.
    """
    # Data scaling
    train_vector_scaled = scaler.transform(train_vector[0])
    test_vector_scaled = scaler.transform(test_vector[0])

    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(np.array(train_vector[1]), p=10,
                                          indices=True)

    '''
    # Check the split
    for train_index, test_index in lpl:
        print "TRAIN_INDEX: %s TEST_INDEX: %s" % (train_index, test_index)
        # the train/test_vector_scaled need to be arrays
        train = train_vector_scaled[train_index]
        test = train_vector_scaled[test_index]
    '''

    # Grid search with nested cross-validation
    parameters = {
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    }
    knn = GridSearchCV(KNeighborsClassifier(), parameters,
                       score_func=f1_score, cv=lpl)
    knn.fit(train_vector_scaled, train_labels)
    knn_best = knn.best_estimator_

    score = f1_score(test_labels, knn.predict(test_vector_scaled))
    proba = knn.predict_proba(test_vector_scaled)
    report = classification_report(test_labels,
                                   knn.predict(test_vector_scaled))
    return (knn_best, proba, score, report)
def train_univariate_selection(train_X, train_Y, object_ids=None,
                               score_fun=f1_score, verbose=0, n_jobs=6,
                               scale=False):
    '''
    Cross validates on the best percentage of features to keep when doing
    univariate feature selection
    '''
    # Setup cross validation
    if (object_ids is None) or (sum(train_Y) <= 10):
        print "Cannot perform leave one out cross validation"
        cv = 3  # default 3 fold cross validation
    else:
        # Leave one object out cross validation
        cv = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    if scale is True:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
    else:
        scaler = False

    univ_select = SelectPercentile(f_classif, percentile=10)
    X_features = univ_select.fit(train_X, train_Y).transform(train_X)

    # class weight normalizes the lack of positive examples
    svm = LinearSVC(dual=False, class_weight='auto')
    svm.fit(X_features, train_Y)

    pipeline = Pipeline([("features", univ_select), ("svm", svm)])
    param_grid = dict(features__percentile=[20, 40, 60, 80, 100],
                      svm__C=np.linspace(1, 1e6, 1000))
                      #svm__penalty=['l1', 'l2'])
    grid = GridSearchCV(pipeline, param_grid, cv=cv, verbose=verbose,
                        n_jobs=n_jobs, score_func=score_fun)
    grid.fit(train_X, train_Y)
    svm_best = grid.best_estimator_
    return svm_best, scaler
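
# The param_grid keys above use scikit-learn's "step__parameter" naming:
# features__percentile is routed to the SelectPercentile step and svm__C to
# the LinearSVC step. A minimal sketch with synthetic data and assumed
# imports:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.svm import LinearSVC

demo_pipe = Pipeline([("features", SelectPercentile(f_classif)),
                      ("svm", LinearSVC(dual=False))])
demo_pipe.set_params(features__percentile=40, svm__C=10.0)
demo_pipe.fit(np.random.rand(20, 8), np.arange(20) % 2)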
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)
    ss = cval.ShuffleSplit(4, indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss]:
        for train, test in cv:
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
def sample(self, X, y, folds):
    n = np.shape(X)[0]
    self.lpl = cross_validation.LeavePLabelOut(folds, p=1)
    if self.fold == 'all':
        X_train = X
        y_train = y
        X_test = X
        y_test = y
    else:
        for train_index, test_index in self.lpl:
            if folds[test_index[0]] == self.fold:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.train_folds, self.test_folds = \
                    folds[train_index], folds[test_index]
    return X_train, X_test, y_train, y_test
def train_svm(train_vector, train_labels, test_vector, test_labels, scaler,
              cv_flag):
    """
    train_svm - expects a vector of features and an n x 1 set of
    corresponding labels.

    Returns a trained SVM classifier.
    """
    # Data scaling
    train_vector_scaled = scaler.transform(train_vector[0])
    test_vector_scaled = scaler.transform(test_vector[0])

    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(np.array(train_vector[1]), p=2,
                                          indices=True)

    if cv_flag:
        cv_mode = lpl
    else:
        cv_mode = 4

    # Grid search with nested cross-validation
    parameters = {
        'kernel': ['rbf'],
        'C': [1, 1e1, 1e2, 1e3, 1e4],
        'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4]
    }
    svm = GridSearchCV(SVC(probability=True), parameters,
                       score_func=f1_score, cv=cv_mode)
    svm.fit(train_vector_scaled, train_labels)
    svm_best = svm.best_estimator_

    score = f1_score(test_labels, svm.predict(test_vector_scaled))
    proba = svm.predict_proba(test_vector_scaled)
    report = classification_report(test_labels,
                                   svm.predict(test_vector_scaled))
    return (svm_best, proba, score, report)
def test_cross_indices_exception():
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)

    assert_raises(ValueError, cval.check_cv, loo, X, y)
    assert_raises(ValueError, cval.check_cv, lpo, X, y)
    assert_raises(ValueError, cval.check_cv, kf, X, y)
    assert_raises(ValueError, cval.check_cv, skf, X, y)
    assert_raises(ValueError, cval.check_cv, lolo, X, y)
    assert_raises(ValueError, cval.check_cv, lopo, X, y)
def train_svm_gridsearch(train_X, train_Y, object_ids=None, verbose=0,
                         n_jobs=6, score_fun=f1_score, scale=False):
    '''
    Performs cross validation using grid search
    '''
    # Setup cross validation
    if (object_ids is None) or (sum(train_Y) <= 10):
        print "Cannot perform leave one out cross validation"
        cv = 3  # default 3 fold cross validation
    else:
        # Leave one object out cross validation
        cv = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    parameters = {
        'C': np.linspace(1, 1e6, 1000),
        #'C': np.linspace(1, 1e6, 100),
        #'C': (1e-3, 1e-2, 1e-1, 1.0, 10, 100, 1000, 1e4, 1e5, 1e6),
        'penalty': ('l1', 'l2'),
    }

    # class weight normalizes the lack of positive examples
    clf = LinearSVC(dual=False, class_weight='auto')

    if scale is True:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
    else:
        scaler = False

    grid = GridSearchCV(clf, parameters, cv=cv, verbose=verbose,
                        n_jobs=n_jobs, score_func=score_fun)
    grid.fit(train_X, train_Y)
    svm_best = grid.best_estimator_
    return svm_best, scaler
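
# Note: np.linspace(1, 1e6, 1000) above searches 1000 densely spaced C
# values, i.e. 1000 fits per fold. A log-spaced grid is a common, far
# cheaper alternative (an editorial suggestion, not in the original code):
import numpy as np
coarse_C_grid = np.logspace(0, 6, 7)  # 1, 10, 100, ..., 1e6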
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
def show_cross_val(method):
    if method == "lolo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeaveOneLabelOut(labels)
    elif method == "lplo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeavePLabelOut(labels, p=2)
    elif method == "loo":
        cv = cross_validation.LeaveOneOut(n=len(y))
    elif method == "lpo":
        cv = cross_validation.LeavePOut(n=len(y), p=3)
    for train_index, test_index in cv:
        print "TRAIN:", train_index, "TEST:", test_index
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print "X_train:", X_train
        print "y_train:", y_train
        print "X_test:", X_test
        print "y_test:", y_test
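
# show_cross_val reads X and y from the enclosing scope; a minimal driver
# with synthetic data (the real X and y are not shown in the snippet):
import numpy as np
X = np.arange(10).reshape(5, 2)  # 5 samples to match the 5 season labels
y = np.array([0, 1, 0, 1, 0])
show_cross_val("lolo")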
def train_gradient_boost(train_X, train_Y, object_ids=None,
                         score_fun=f1_score, verbose=0, n_jobs=6,
                         scale=False):
    '''
    Performs cross validation using grid search and gradient tree boosting
    '''
    # Setup cross validation
    if (object_ids is None) or (sum(train_Y) <= 10):
        print "Cannot perform leave one out cross validation"
        cv = 5  # default 5 fold cross validation
    else:
        # Leave one object out cross validation
        cv = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    if scale is True:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
    else:
        scaler = False

    parameters = {
        'n_estimators': [1000],
        'learn_rate': [1e-1, 1e-2, 1, 1e-3]
        #'max_depth': [4]
    }

    print "Beginning Grid Search"
    grid = GridSearchCV(GradientBoostingClassifier(max_depth=4), parameters,
                        score_func=score_fun, cv=cv, verbose=verbose,
                        n_jobs=n_jobs)
    grid.fit(train_X, train_Y)
    svm_best = grid.best_estimator_
    return svm_best, scaler
dataset = mnist.load()
dim = dataset['train']['data'][0].size
N_train = len(dataset['train']['target'])
N_test = len(dataset['test']['target'])
test_data_dict = {
    'data': dataset['test']['data'].reshape(N_test, dim).astype(np.float32),
    'target': dataset['test']['target'].astype(np.int32)
}
#unlabeled_data_dict = {'data': dataset['train']['data'].reshape(N_train, dim).astype(np.float32),
#                       'target': -np.ones(N_train)}
unlabeled_data_dict = {
    'data': dataset['train']['data'].reshape(N_train, dim).astype(np.float32)
}

# making labeled data: each training sample gets its own label, so
# LeavePLabelOut puts args.slabeled whole samples on the held-out side of
# the split, which is used here as the labeled subset
lplo = cross_validation.LeavePLabelOut(labels=six.moves.range(N_train),
                                       p=args.slabeled)
fold = 1
for i in six.moves.range(fold):
    train_idx, test_idx = next(iter(lplo))
    labeled_data_dict = {
        'data': unlabeled_data_dict['data'][test_idx].astype(np.float32),
        'target': dataset['train']['target'][test_idx].astype(np.int32)
    }

labeled_data = datafeeders.SiameseFeeder(labeled_data_dict,
                                         batchsize=args.lbatch)
unlabeled_data = DataFeeder(unlabeled_data_dict, batchsize=args.ubatch)
test_data = datafeeders.SiameseFeeder(test_data_dict,
                                      batchsize=args.valbatch)

labeled_data.hook_preprocess(mnist_preprocess)
unlabeled_data.hook_preprocess(mnist_preprocess_u)
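
# With one distinct label per sample, as above, LeavePLabelOut reduces to
# LeavePOut: the first split's held-out side is simply the first p samples.
# A tiny check (assumed old scikit-learn cross_validation API):
import numpy as np
from sklearn import cross_validation

demo_lplo = cross_validation.LeavePLabelOut(np.arange(5), p=2)
demo_train, demo_labeled = next(iter(demo_lplo))
print(demo_labeled)  # -> [0 1], the first pair of samples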