Example #1
def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)

    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
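The equivalence asserted above comes down to np.where, which turns a boolean mask into the matching index array. A minimal standalone sketch, assuming an old scikit-learn (0.16 or earlier) where sklearn.cross_validation and the indices flag still exist:

import numpy as np
from sklearn import cross_validation as cval

labels = np.array([1, 1, 2, 2, 3, 3])
for train_mask, test_mask in cval.LeavePLabelOut(labels, p=1, indices=False):
    # np.where converts each boolean mask into the equivalent index array
    print(np.where(train_mask)[0], np.where(test_mask)[0])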
Example #2
def test_leave_label_out_changing_labels():
    # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
    # the labels variable is changed before calling __iter__
    labels = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    labels_changing = np.array(labels, copy=True)
    lolo = cval.LeaveOneLabelOut(labels)
    lolo_changing = cval.LeaveOneLabelOut(labels_changing)
    lplo = cval.LeavePLabelOut(labels, p=2)
    lplo_changing = cval.LeavePLabelOut(labels_changing, p=2)
    labels_changing[:] = 0
    for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
        for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)
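This test passes because the generators take their own copy of labels at construction time, so mutating the caller's array afterwards changes nothing. A quick sketch of that property under the same old API, using hypothetical toy labels:

import numpy as np
from sklearn import cross_validation as cval

labels = np.array([0, 0, 1, 1])
lolo = cval.LeaveOneLabelOut(labels)
labels[:] = 0                  # mutate after construction
assert len(list(lolo)) == 2    # still one split per original label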
Example #3
def train_svm(train_vector, train_labels, object_ids):
    """ 
    train_svm - expects a vector of features and a nx1 set of
                corresponding labels

    Returns a trained SVM classifier
    """

    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)
    # Grid search with nested cross-validation
    parameters = {
        'kernel': ['linear'],
        'C': [1, 1e1, 1e2, 1e3, 1e4],
        'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4],
        'degree': [2, 3, 4, 5]
    }
    #parameters = {'kernel': ['linear'], 'C': [1, 1e1], 'gamma': [1, 1e-1, 1e-2]}
    svm = GridSearchCV(SVC(probability=True),
                       parameters,
                       score_func=f1_score,
                       cv=lpl)

    # Train the SVM using the best parameters
    svm.fit(train_vector, train_labels)
    svm_best = svm.best_estimator_

    return svm_best
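Note that GridSearchCV's score_func argument was later deprecated in favor of scoring; on a newer scikit-learn of that era the same search would be written roughly as (a sketch, not the author's code):

svm = GridSearchCV(SVC(probability=True), parameters, scoring='f1', cv=lpl)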
Example #4
def train_gradient_boost(train_vector, train_labels, object_ids):
    """
    train_svm - expects a vector of features and a nx1 set of
                corresponding labels

    Returns a trained Gradient Boosting classifier
    """

    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    parameters = {'n_estimators': [1000], 'learn_rate': [1e-1, 1e-2, 1e-3]}

    # Grid search with nested cross-validation
    #parameters = {'kernel': ['rbf'], 'C': [1, 1e1, 1e2, 1e3, 1e4], 'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4]}
    clf = GridSearchCV(GradientBoostingClassifier(max_depth=4),
                       parameters,
                       score_func=f1_score,
                       cv=lpl)

    # Train the classifier using the best parameters
    clf.fit(train_vector, train_labels)
    clf_best = clf.best_estimator_

    return clf_best
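Likewise, learn_rate is the old spelling of the gradient boosting step-size parameter; later scikit-learn releases renamed it to learning_rate, so the equivalent grid would read:

parameters = {'n_estimators': [1000], 'learning_rate': [1e-1, 1e-2, 1e-3]}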
Example #5
def main():
    c = mdb.connect(host=HOST, user=USER, passwd=PASSWORD, db=DATABASE)

    ### CLASSIFYING USERS FROM KEYSTROKE DYNAMICS
    #
    ## get list of insights and labels
    #userlist = [3, 9]
    #users = []
    #for user in userlist:
    #    values = kt.get_insights_by_user(user, c).values[:,0].tolist()
    #    users.extend([user]*len(values))
    #users = np.array(users)
    #
    #rfc = ensemble.RandomForestClassifier()
    #train_and_test_model(userlist, users, rfc, c)

    ## PREDICTING HAPPINESS LEVEL FROM ALL DATA
    #users = [45]
    users = [33, 35, 37, 39, 41, 43, 45, 47, 49]
    if len(sys.argv) > 1:
        model_type = sys.argv[1]
    else:
        raise ValueError("must specify model type as argument")
    split_by = 'hour'

    # get all data
    data = get_and_impute_data(users, True, split_by, c)
    happiness = get_var_by_users("happiness_level", users, split_by, c)
    energy = get_var_by_users("energy_level", users, split_by, c)
    relax = get_var_by_users("relax_level", users, split_by, c)
    insights = get_var_by_users("id", users, split_by, c)
    response = np.vstack((happiness, energy, relax)).T

    n_insights = len(np.unique(insights))

    # split dataset into training and evaluation datasets
    for train_idx, eval_idx in cross_validation.LeavePLabelOut(
            insights, int(n_insights * 0.2)):
        train_data = data[train_idx]
        eval_data = data[eval_idx]
        train_response = response[train_idx]
        eval_response = response[eval_idx]
        train_insights = insights[train_idx]
        eval_insights = insights[eval_idx]
        break

    #model = train_and_test_model(train_data, train_response[:,1], train_insights, model_type, split_by, c, varname="energy level")
    #evaluate_model(eval_data, eval_response[:,1], model, "%s model evaluation" % model_type, "energy level")

    model = train_and_test_model(train_data,
                                 train_response[:, 2],
                                 train_insights,
                                 model_type,
                                 split_by,
                                 c,
                                 varname="relaxation level")
    #evaluate_model(eval_data, eval_response[:,2], model, "%s model evaluation" % models[model_type][2], "relaxation level")

    np.savetxt('response.txt', np.vstack((happiness, energy, relax)))
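In scikit-learn 0.18 and later, the label-based splitters became group-based ones under sklearn.model_selection. A rough modern equivalent of the single hold-out split above, reusing data and insights from the snippet (note GroupShuffleSplit samples the held-out groups at random instead of enumerating combinations, so it approximates rather than reproduces the first LeavePLabelOut split):

from sklearn.model_selection import GroupShuffleSplit

# hold out roughly 20% of the insight groups, mirroring p = int(n_insights * 0.2)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_idx, eval_idx = next(gss.split(data, groups=insights))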
Example #6
def leavePLabelOut(X, y_true, labels):
    y_pred = y_true * 0.0
    lpl = cross_validation.LeavePLabelOut(labels, p=1)
    i = 0
    for train_index, test_index in lpl:
        i += 1
        print("CrossVal %d of %d" % (i, len(lpl)))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_true[train_index], y_true[test_index]
        clf = RFClassifier(n_estimators=200)
        #clf = GBClassifier(n_estimators=5000, learning_rate=0.05, max_features=0.25)
        clf.fit(X_train, y_train)
        # keep the positive-class column so the result fits the 1-D y_pred
        y_pred[test_index] = clf.predict_proba(X_test)[:, 1]
    roc_auc(y_true, y_pred)
Example #7
def train_knn(train_vector, train_labels, test_vector, test_labels, scaler):
    """
    train_knn - expects a vector of features and a nx1 set of
                corresponding labels.  Finally the number of
                neighbors used for comparison

    Returns a trained knn classifier
    """

    # Data scaling
    train_vector_scaled = scaler.transform(train_vector[0])
    test_vector_scaled = scaler.transform(test_vector[0])

    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(np.array(train_vector[1]),
                                          p=10,
                                          indices=True)

    # Grid search with nested cross-validation
    parameters = {
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    }
    knn = GridSearchCV(KNeighborsClassifier(),
                       parameters,
                       score_func=f1_score,
                       cv=lpl)
    knn.fit(train_vector_scaled, train_labels)
    knn_best = knn.best_estimator_
    score = f1_score(test_labels, knn.predict(test_vector_scaled))
    proba = knn.predict_proba(test_vector_scaled)
    report = classification_report(test_labels,
                                   knn.predict(test_vector_scaled))

    return (knn_best, proba, score, report)
Example #8
def train_univariate_selection(train_X,
                               train_Y,
                               object_ids=None,
                               score_fun=f1_score,
                               verbose=0,
                               n_jobs=6,
                               scale=False):
    '''
    Cross validates on the best percentage of features to keep
    when doing univariate feature selection
    '''

    # Setup cross validation
    if (object_ids is None) or (sum(train_Y) <= 10):
        print "Cannot perform leave one out cross validation"
        cv = 3  # default 3 fold cross validation
    else:
        # Leave one object out cross validation
        cv = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)
    if scale is True:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
    else:
        scaler = False

    univ_select = SelectPercentile(f_classif, percentile=10)
    X_features = univ_select.fit(train_X, train_Y).transform(train_X)

    # class_weight='auto' compensates for the shortage of positive examples
    svm = LinearSVC(dual=False, class_weight='auto')
    svm.fit(X_features, train_Y)

    pipeline = Pipeline([("features", univ_select), ("svm", svm)])

    param_grid = dict(features__percentile=[20, 40, 60, 80, 100],
                      svm__C=np.linspace(1, 1e6, 1000))
    #svm__penalty = ['l1','l2'])

    grid = GridSearchCV(pipeline,
                        param_grid,
                        cv=cv,
                        verbose=verbose,
                        n_jobs=n_jobs,
                        score_func=score_fun)

    grid.fit(train_X, train_Y)
    svm_best = grid.best_estimator_

    return svm_best, scaler
Example #9
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)
    ss = cval.ShuffleSplit(4, indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss]:
        for train, test in cv:
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #10
File: pred.py  Project: xiahn/yelp_fake
def sample(self, X, y, folds):
    n = np.shape(X)[0]
    self.lpl = cross_validation.LeavePLabelOut(folds, p=1)
    if self.fold == 'all':
        X_train = X
        y_train = y
        X_test = X
        y_test = y
    else:
        for train_index, test_index in self.lpl:
            if folds[test_index[0]] == self.fold:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.train_folds = folds[train_index]
                self.test_folds = folds[test_index]
    return X_train, X_test, y_train, y_test
Example #11
def train_svm(train_vector, train_labels, test_vector, test_labels, scaler,
              cv_flag):
    """
    train_svm - expects a vector of features and a nx1 set of
                corresponding labels

    Returns a trained SVM classifier
    """

    # Data scaling
    train_vector_scaled = scaler.transform(train_vector[0])
    test_vector_scaled = scaler.transform(test_vector[0])

    # Create the obj_id_vector for cross validation
    lpl = cross_validation.LeavePLabelOut(np.array(train_vector[1]),
                                          p=2,
                                          indices=True)

    if cv_flag:
        cv_mode = lpl
    else:
        cv_mode = 4

    # Grid search with nested cross-validation
    parameters = {
        'kernel': ['rbf'],
        'C': [1, 1e1, 1e2, 1e3, 1e4],
        'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4]
    }
    svm = GridSearchCV(SVC(probability=True),
                       parameters,
                       score_func=f1_score,
                       cv=cv_mode)
    svm.fit(train_vector_scaled, train_labels)
    svm_best = svm.best_estimator_
    score = f1_score(test_labels, svm.predict(test_vector_scaled))
    proba = svm.predict_proba(test_vector_scaled)
    report = classification_report(test_labels,
                                   svm.predict(test_vector_scaled))

    return (svm_best, proba, score, report)
Example #12
def test_cross_indices_exception():
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)

    assert_raises(ValueError, cval.check_cv, loo, X, y)
    assert_raises(ValueError, cval.check_cv, lpo, X, y)
    assert_raises(ValueError, cval.check_cv, kf, X, y)
    assert_raises(ValueError, cval.check_cv, skf, X, y)
    assert_raises(ValueError, cval.check_cv, lolo, X, y)
    assert_raises(ValueError, cval.check_cv, lopo, X, y)
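The ValueError arises because boolean masks cannot be used to index sparse matrices, which is why the old check_cv rejected mask-mode generators when X was sparse. A small illustration of the underlying limitation (COO matrices support no subscripting at all):

import numpy as np
from scipy.sparse import coo_matrix

X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
mask = np.array([True, False, True, False])
try:
    X[mask]                    # coo_matrix has no __getitem__
except TypeError:
    print('boolean-mask indexing is not supported on COO matrices')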
Example #13
def train_svm_gridsearch(train_X,
                         train_Y,
                         object_ids=None,
                         verbose=0,
                         n_jobs=6,
                         score_fun=f1_score,
                         scale=False):
    '''
    Performs cross validation using grid search 
    '''

    # Setup cross validation
    if (object_ids is None) or (sum(train_Y) <= 10):
        print "Cannot perform leave one out cross validation"
        cv = 3  # 5 fold cross validation
    else:
        # Leave one object out cross validation
        cv = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)

    parameters = {
        'C': np.linspace(1, 1e6, 1000),
        #'C': np.linspace(1,1e6,100),
        #'C': (1e-3,1e-2,1e-1,1.0, 10, 100, 1000, 1e4, 1e5, 1e6),
        'penalty': ('l1', 'l2'),
    }

    # class weight normalizes the lack of positive examples
    clf = LinearSVC(dual=False, class_weight='auto')

    if scale is True:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
    else:
        scaler = False

    grid = GridSearchCV(clf,
                        parameters,
                        cv=cv,
                        verbose=verbose,
                        n_jobs=n_jobs,
                        score_func=score_fun)

    grid.fit(train_X, train_Y)
    svm_best = grid.best_estimator_

    return svm_best, scaler
Example #14
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
Example #15
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #16
def show_cross_val(method):
    if method == "lolo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeaveOneLabelOut(labels)
    elif method == "lplo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeavePLabelOut(labels, p=2)
    elif method == "loo":
        cv = cross_validation.LeaveOneOut(n=len(y))
    elif method == "lpo":
        cv = cross_validation.LeavePOut(n=len(y), p=3)
    else:
        raise ValueError("unknown cross-validation method: %r" % method)
    for train_index, test_index in cv:
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print "X_train:", X_train
        print "y_train:", y_train
        print "X_test:", X_test
        print "y_test:", y_test
Example #17
def train_gradient_boost(train_X,
                         train_Y,
                         object_ids=None,
                         score_fun=f1_score,
                         verbose=0,
                         n_jobs=6,
                         scale=False):
    '''
    Performs cross validation using grid search and
    gradient tree boosting
    '''

    # Setup cross validation
    if (object_ids is None) or (sum(train_Y) <= 10):
        print "Cannot perform leave one out cross validation"
        cv = 5  # 10 fold cross validation
    else:
        # Leave one object out cross validation
        cv = cross_validation.LeavePLabelOut(object_ids, p=1, indices=True)
    if scale is True:
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
    else:
        scaler = False

    parameters = {
        'n_estimators': [1000],
        'learn_rate': [1e-1, 1e-2, 1, 1e-3]
        #'max_depth':[4]
    }

    print "Beginning Grid Search"
    grid = GridSearchCV(GradientBoostingClassifier(max_depth=4),
                        parameters,
                        score_func=score_fun,
                        cv=cv,
                        verbose=verbose,
                        n_jobs=n_jobs)

    grid.fit(train_X, train_Y)
    clf_best = grid.best_estimator_

    return clf_best, scaler
Example #18
dataset = mnist.load()
dim = dataset['train']['data'][0].size
N_train = len(dataset['train']['target'])
N_test = len(dataset['test']['target'])
test_data_dict = {
    'data': dataset['test']['data'].reshape(N_test, dim).astype(np.float32),
    'target': dataset['test']['target'].astype(np.int32)
}
#unlabeled_data_dict = {'data':dataset['train']['data'].reshape(N_train, dim).astype(np.float32),
#                    'target':-np.ones(N_train)}
unlabeled_data_dict = {
    'data': dataset['train']['data'].reshape(N_train, dim).astype(np.float32)
}

# making labeled data
lplo = cross_validation.LeavePLabelOut(labels=six.moves.range(N_train),
                                       p=args.slabeled)
# take only the first LeavePLabelOut split
train_idx, test_idx = next(iter(lplo))
labeled_data_dict = {
    'data': unlabeled_data_dict['data'][test_idx].astype(np.float32),
    'target': dataset['train']['target'][test_idx].astype(np.int32)
}
#
labeled_data = datafeeders.SiameseFeeder(labeled_data_dict,
                                         batchsize=args.lbatch)
unlabeled_data = DataFeeder(unlabeled_data_dict, batchsize=args.ubatch)
test_data = datafeeders.SiameseFeeder(test_data_dict, batchsize=args.valbatch)

labeled_data.hook_preprocess(mnist_preprocess)
unlabeled_data.hook_preprocess(mnist_preprocess_u)
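Because every training sample is given a unique label here, LeavePLabelOut degenerates into LeavePOut: the first split simply holds out the first args.slabeled samples as the labeled subset. A minimal sketch of that behavior under the old API, in index mode:

import numpy as np
from sklearn import cross_validation

lplo = cross_validation.LeavePLabelOut(labels=np.arange(6), p=2)
train_idx, test_idx = next(iter(lplo))
print(test_idx)    # [0 1]: the first p unique labels are held out first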