Exemplo n.º 1
0
def run_grid_search(X, Y):
    """Grid-search C, gamma and epsilon for an RBF-kernel SVR.

    Every parameter combination is scored with R^2 over five shuffled
    80/20 train/test splits (fixed seed 777), using all available cores.

    Args:
        X: feature matrix; only ``X.shape[0]`` is read here to size the splits.
        Y: target values passed straight through to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print("# Tuning hyper-parameters")

    # Candidate values explored for each hyper-parameter.
    param_grid = {
        'C': [0.001, 0.01, 0.25, 0.5, 0.75, 1, 1.5, 2],
        'gamma': [0.1, 0.5, 1, 1.5, 2, 2.5, 3],
        'epsilon': [0.000001, 0.00001, 0.0001, 0.01, 0.1, 0.5, 1],
    }

    splitter = cross_validation.ShuffleSplit(
        X.shape[0], n_iter=5, test_size=0.2, random_state=777)

    # Very tight solver tolerance, as in the original configuration.
    estimator = svm.SVR(kernel='rbf', tol=0.0000000001)
    search = grid_search.GridSearchCV(
        estimator, param_grid, cv=splitter, scoring='r2', n_jobs=-1)
    search.fit(X, Y)

    return search
Exemplo n.º 2
0
def split_data(city_data):
    """Randomly shuffle the sample set and divide it into training and testing sets.

    A single shuffled split (25% test, fixed seed 1111) is drawn over the
    rows of ``city_data.data``.

    Args:
        city_data: object exposing ``data`` (2-D feature array) and
            ``target`` (one value per row).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as float arrays; the two
        ``y`` arrays are column vectors of shape ``(n, 1)``.
    """
    # Get the features and labels from the housing data
    X, y = city_data.data, city_data.target

    rows = X.shape[0]
    rs = cva.ShuffleSplit(rows, 1, test_size=.25, random_state=1111)
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("Testing size: %s" % (rs.n_test,))
    print("Training size: %s" % (rs.n_train,))

    # Only one split is generated; fancy indexing copies the selected rows.
    for tr_indxs, ts_indxs in rs:
        X_train = np.asarray(X[tr_indxs, :], dtype=float)
        y_train = np.asarray(y[tr_indxs], dtype=float).reshape(-1, 1)
        X_test = np.asarray(X[ts_indxs, :], dtype=float)
        y_test = np.asarray(y[ts_indxs], dtype=float).reshape(-1, 1)

    return X_train, y_train, X_test, y_test
Exemplo n.º 3
0
def train_comb_model(traindata, targets):
    """Cross-validate, then fit, a random-forest regressor.

    The RMSE over five shuffled splits (60% of the rows used for training
    in each) is printed before the model is fitted on all of the data.

    Args:
        traindata: feature matrix.
        targets: target values; ``len(targets)`` sizes the splits.

    Returns:
        The ``RandomForestRegressor`` fitted on the full data.
    """
    forest = ensemble.RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        max_features='sqrt',
        min_samples_leaf=100,
        n_jobs=-1,
    )
    splitter = cross_validation.ShuffleSplit(
        len(targets), n_iter=5, train_size=0.6)

    print("Cross-validating model")
    neg_mse = cross_validation.cross_val_score(
        forest, traindata, targets,
        cv=splitter, n_jobs=1, scoring='mean_squared_error')
    # The scorer reports MSE negated, so flip the sign before the root.
    rmse = np.sqrt(-neg_mse)
    print("RMSE on the training set:")
    print("%0.4f (+/-%0.04f)" % (rmse.mean(), rmse.std() / 2))

    print("Training model")
    forest.fit(traindata, targets)
    return forest
Exemplo n.º 4
0
def load_images(image_h5_file, n_images=-1, shuffle_seed=1):
    """Load images and auxiliary data from an h5 file.

    Args:
        image_h5_file: location of the h5 file; it must contain the
            datasets ``'images'`` and ``'auxvars'``.
        n_images: number of images to load. A negative value loads all;
            a value larger than the file holds prints a notice and loads all.
        shuffle_seed: seed for the random subset drawn when fewer than
            all images are requested.
    Returns:
        images: the selected entries of the ``'images'`` dataset.
        auxvars: the matching entries of the ``'auxvars'`` dataset.
    TODO: add support for multiple classes.
    """
    with h5py.File(image_h5_file, 'r') as h5file:
        image_ds = h5file['images']
        aux_ds = h5file['auxvars']
        available = len(image_ds)

        if n_images < 0:
            n_images = available
        elif n_images > available:
            print("Cannot load {0} images. Only {1} images in {2}".format(
                n_images, available, image_h5_file))
            n_images = available

        if n_images >= available:
            # Everything requested: read both datasets in full.
            images = image_ds[:]
            auxvars = aux_ds[:]
        else:
            # One shuffled split whose "test" side is the random subset kept.
            splitter = cross_validation.ShuffleSplit(available,
                                                     n_iter=1,
                                                     test_size=n_images,
                                                     random_state=shuffle_seed)
            for _, chosen in splitter:
                keep = chosen
            images = np.take(image_ds, keep, axis=0)
            auxvars = np.take(aux_ds, keep, axis=0)
    return images, auxvars
Exemplo n.º 5
0
def sample_random_n(table,
                    n,
                    stratified=False,
                    replace=False,
                    random_state=None):
    """Draw a random sample of ``n`` row indices from ``table``.

    Args:
        table: sized container of rows. The stratified path additionally
            reads ``table.domain`` and ``table.Y``.
        n: number of rows to sample.
        stratified: if true and the table has a discrete class, sample
            with ``StratifiedShuffleSplit`` (the sample is then at least
            as large as the number of class values).
        replace: if true, sample with replacement.
        random_state: seed for the generator; ``None`` uses NumPy's
            global generator on the ``replace`` path.

    Returns:
        ``(others, sample)``: indices not drawn, and indices drawn. With
        ``replace=True`` ``sample`` may contain duplicates.
    """
    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.RandomState(random_state)
        # randint's upper bound is exclusive, so this draws from
        # 0..len(table)-1 exactly like the deprecated
        # random_integers(0, len(table) - 1, n) did.
        sample = rgen.randint(0, len(table), n)
        not_drawn = np.ones(len(table))
        not_drawn[sample] = 0
        others = np.nonzero(not_drawn)[0]
        return others, sample
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(len(table),
                                                n_iter=1,
                                                test_size=n,
                                                random_state=random_state)
    # A single split is generated; its (train, test) pair is (others, sample).
    return next(iter(ind))
Exemplo n.º 6
0
    def radialbasisf(self):
        """Fit an RBF-kernel SVC, print its test accuracy and plot the test points.

        Reads ``self.gamma``, ``self.c``, ``self.x_train``, ``self.y_train``,
        ``self.x_test`` and ``self.y_test``.

        Returns:
            The predictions for ``self.x_test``.
        """
        classifier = svm.SVC(kernel='rbf', gamma=self.gamma,
                             C=self.c).fit(self.x_train, self.y_train)
        z = classifier.predict(self.x_test)
        # Fraction of test labels predicted correctly.
        print(np.mean(self.y_test == z))

        # Scatter the test points, one colour per true label (0, 1 or 2).
        palette = 'ryg'
        n_points = self.x_test.shape[0]
        for row in range(n_points):
            colour = palette[int(self.y_test[row])]
            plt.scatter(self.x_test[row, 0], self.x_test[row, 1], c=colour)

        plt.xlabel('Total de Palabras')
        plt.ylabel('Malas Palabras')
        plt.title('RBF kernel SVM')
        plt.show()

        # SVC is more expensive so we do a lower number of CV iterations.
        # The splitter is only consumed by the disabled plotting calls below.
        cv = cross_validation.ShuffleSplit(self.x_train.shape[0], n_iter=10,
                                           test_size=0.2, random_state=0)
        # plot_learning_curve(clf, "Learning Curves (SVM, RBF kernel)",
        #                     self.x_train, self.y_train, (0.5, 1.01), cv=cv, n_jobs=4)
        #
        # plot_validation_curves(clf, self.x_train, self.y_train)
        return z
Exemplo n.º 7
0
def fit(X_vec, y_vec):
    """Fit a random-forest regressor on three shuffled splits and print its scores.

    For each 80/20 split (seed 0) a fresh ``RandomForestRegressor`` is
    trained and its ``score`` on the training and test rows is printed.

    Args:
        X_vec: feature array, indexable by an integer index array.
        y_vec: target array aligned with ``X_vec``.
    """
    # Split the dataset.
    splits = cross_validation.ShuffleSplit(len(X_vec), n_iter=3, test_size=0.2, random_state=0)

    # Alternative models that were tried with the same loop (kept for reference):
    #   ridge regression:
    #     linear_model.Ridge().fit(X_vec[train], y_vec[train])
    #   support vector regression (C is the regularisation factor, gamma the
    #   kernel coefficient):
    #     svm.SVR(kernel="rbf", C=10, gamma=1e-3).fit(X_vec[train], y_vec[train])

    # Random forest regression.
    for train_idx, test_idx in splits:
        X_fit, y_fit = X_vec[train_idx], y_vec[train_idx]
        forest = RandomForestRegressor(n_estimators=100, max_depth=10).fit(X_fit, y_fit)
        train_score = forest.score(X_fit, y_fit)
        test_score = forest.score(X_vec[test_idx], y_vec[test_idx])
        print("train score: %.3f, test score: %.3f\n" % (train_score, test_score))
def plot_learning_curves(raw_data, limit_size=None):
    """Draw learning curves for four classifiers on a 2x2 grid.

    Args:
        raw_data: ``(features, weights, labels)`` triple; ``weights`` is
            truncated along with the others but not otherwise used here.
        limit_size: if given, only the first ``limit_size`` rows are used.
    """
    features, weights, labels = raw_data

    if limit_size is not None:
        features = features[:limit_size]
        weights = weights[:limit_size]
        labels = labels[:limit_size]

    plt.figure(figsize=(12, 12))

    # One shared splitter so every classifier sees the same five splits.
    cv = cross_validation.ShuffleSplit(features.shape[0],
                                       n_iter=5,
                                       test_size=TEST_DATA_SPLIT,
                                       random_state=0)

    def draw_panel(position, title, estimator):
        # Select cell `position` of the 2x2 grid and plot one learning curve.
        plt.subplot(2, 2, position)
        plot_learning_curve(estimator,
                            title,
                            features,
                            labels,
                            ylim=(0.7, 1.01),
                            cv=cv,
                            n_jobs=N_JOBS)

    draw_panel(1,
               "Learning Curves (Decision Trees)",
               tree.DecisionTreeClassifier(criterion='gini',
                                           min_samples_split=60))
    draw_panel(2,
               "Learning Curves (AdaBoost)",
               AdaBoostClassifier(n_estimators=100, learning_rate=1.0))
    draw_panel(3,
               "Learning Curves (K-Nearest Neighbour)",
               KNeighborsClassifier(n_neighbors=10, p=2))
    draw_panel(4,
               "Learning Curves (SVM)",
               svm.SVC(C=1.0, gamma=0.1))
Exemplo n.º 9
0
def run_cross_validation(X, Y):
    """Cross-validate a fixed SVR and print its MSE and R^2 scores.

    Ten shuffled 90/10 splits (seed 0) are used for both metrics.

    Args:
        X: feature matrix; ``X.shape[0]`` sizes the splits.
        Y: target values.
    """
    n_samples = X.shape[0]
    cv = cross_validation.ShuffleSplit(n_samples,
                                       n_iter=10,
                                       test_size=0.1,
                                       random_state=0)
    regressor = svm.SVR(C=8, gamma=32, epsilon=0.01, tol=0.000001)

    # Single-argument print() yields the same output on Python 2 and 3;
    # "%s %s" reproduces the separating space of the old print statement.
    # NOTE(review): the summary line is labelled "Accuracy" for both
    # metrics although neither is an accuracy; text kept as-is.
    for label, scoring in (("Mean Square Error : ", 'mean_squared_error'),
                           ("R2 Score : ", 'r2')):
        scores = cross_validation.cross_val_score(regressor,
                                                  X,
                                                  Y,
                                                  cv=cv,
                                                  scoring=scoring)
        print("%s %s" % (label, scores))
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
Exemplo n.º 10
0
def do_cv(clf,
          X,
          y,
          n_samples=1000,
          n_iter=3,
          test_size=0.1,
          quiet=False,
          scoring=None,
          stratified=False,
          fit_params=None,
          reseed_classifier=True,
          n_jobs=-1):
    """Cross-validate ``clf`` on shuffled splits and summarise the scores.

    Args:
        clf: estimator to evaluate.
        X, y: data and targets.
        n_samples: number of rows the splitter draws from; floats are
            truncated to int and the value is capped at ``X.shape[0]``
            when that is available.
        n_iter: number of splits.
        test_size: test share of each split.
        quiet: suppress the timing/debug line.
        scoring: scorer name; falls back to ``cfg['scoring']``.
        stratified: use ``StratifiedShuffleSplit`` on ``y`` instead of
            ``ShuffleSplit``.
        fit_params: extra parameters forwarded to ``fit``.
        reseed_classifier: call ``reseed(clf)`` first.
        n_jobs: parallelism for ``cross_val_score``.

    Returns:
        ``(np.mean(test_scores), sem(test_scores))``.
    """
    t0 = time.time()
    if reseed_classifier:
        reseed(clf)
    # isinstance also catches float subclasses such as numpy.float64.
    if isinstance(n_samples, float):
        n_samples = int(n_samples)
    # Best-effort cap: inputs without a usable ``shape`` (or an
    # incomparable n_samples) are left untouched, but unrelated errors
    # are no longer swallowed by a bare except.
    try:
        if n_samples > X.shape[0]:
            n_samples = X.shape[0]
    except (AttributeError, IndexError, TypeError):
        pass

    if stratified:
        cv = cross_validation.StratifiedShuffleSplit(
            y, n_iter, train_size=n_samples, test_size=test_size,
            random_state=cfg['sys_seed'])
    else:
        cv = cross_validation.ShuffleSplit(
            n_samples, n_iter=n_iter, test_size=test_size,
            random_state=cfg['sys_seed'])

    test_scores = cross_validation.cross_val_score(clf,
                                                   X,
                                                   y,
                                                   cv=cv,
                                                   scoring=scoring
                                                   or cfg['scoring'],
                                                   fit_params=fit_params,
                                                   n_jobs=n_jobs)
    if not quiet:
        dbg('%s took: %.2fm' % (mean_score(test_scores),
                                (time.time() - t0) / 60))
    return (np.mean(test_scores), sem(test_scores))
Exemplo n.º 11
0
def do_gs(clf,
          X,
          y,
          params,
          n_samples=1000,
          n_iter=3,
          n_jobs=-2,
          scoring=None,
          fit_params=None):
    """Grid-search ``params`` for ``clf`` on a shuffled subsample of the data.

    Args:
        clf: estimator to tune.
        X, y: data and targets; they are shuffled (seed ``cfg['sys_seed']``)
            and the first ``n_samples`` rows are used.
        params: parameter grid for ``GridSearchCV``.
        n_samples: number of rows to use; floats are truncated to int and
            the value is capped at ``X.shape[0]`` when that is available.
        n_iter: number of shuffled splits.
        n_jobs: parallelism for the search.
        scoring: scorer name; falls back to ``cfg['scoring']``.
        fit_params: extra parameters forwarded to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # isinstance also catches float subclasses such as numpy.float64.
    if isinstance(n_samples, float):
        n_samples = int(n_samples)
    # Cap at the available rows (same best-effort rule as do_cv): the
    # splitter emits indices in range(n_samples), which would run past
    # the end of the sliced data if n_samples exceeded it.
    try:
        if n_samples > X.shape[0]:
            n_samples = X.shape[0]
    except (AttributeError, IndexError, TypeError):
        pass
    reseed(clf)
    cv = cross_validation.ShuffleSplit(n_samples,
                                       n_iter=n_iter,
                                       random_state=cfg['sys_seed'])
    gs = grid_search.GridSearchCV(clf,
                                  params,
                                  cv=cv,
                                  n_jobs=n_jobs,
                                  verbose=2,
                                  scoring=scoring or cfg['scoring'],
                                  fit_params=fit_params)
    X2, y2 = utils.shuffle(X, y, random_state=cfg['sys_seed'])
    gs.fit(X2[:n_samples], y2[:n_samples])
    dbg(gs.best_params_, gs.best_score_)
    return gs
Exemplo n.º 12
0
def split_data(df, features):
    """ split df[features] into train and test set with ShuffleSplit
        it also generates a new feature 'cnt_season' by grouping counts of four seasons
        Parameters
        ----------
        df: pandas dataframe
        features: a list of columns of df, the set of features in train set

        Returns
        -------
        df: dataframe + 'cnt_season'
        X_train, X_test, y_train, y_test: train set and test set for 'cnt' column
            (targets are log1p-transformed)
        y_train_cas, y_test_cas, y_train_reg, y_test_reg: log1p-transformed train and
            test sets for 'casual' and 'registered' columns
        time_test: 'dteday', 'mnth' and 'hr' values of the test rows
    """
    ss = cross_validation.ShuffleSplit(len(df),
                                       n_iter=1,
                                       test_size=0.1,
                                       random_state=1234)
    # The splitter yields row *positions*, so rows are selected with
    # .iloc; this replaces the removed .ix indexer, and .values replaces
    # the removed .as_matrix().
    for ind_train, ind_test in ss:
        # add a cnt_season column: per-season sum of 'cnt' over the
        # training rows only, joined back onto every row
        if 'cnt_season' not in df:
            season_gb = df.iloc[ind_train].groupby('season')[['cnt']].sum()
            season_gb.columns = ['cnt_season']
            df = df.join(season_gb, on='season')
        train_rows = df.iloc[ind_train]
        test_rows = df.iloc[ind_test]
        X_train = train_rows[features].values
        X_test = test_rows[features].values
        y_train = np.log1p(train_rows['cnt'].values)
        y_test = np.log1p(test_rows['cnt'].values)
        y_train_cas = np.log1p(train_rows['casual'].values)
        y_train_reg = np.log1p(train_rows['registered'].values)
        y_test_cas = np.log1p(test_rows['casual'].values)
        y_test_reg = np.log1p(test_rows['registered'].values)
        time_test = test_rows[['dteday', 'mnth', 'hr']].values
    return df, X_train, X_test, y_train, y_test, y_train_cas, y_test_cas, y_train_reg, y_test_reg, time_test
Exemplo n.º 13
0
def cv_select(y, random_state, n_cv, cv, test_size=0.1):
    """Resolve a cross-validation spec into a splitter.

    Args:
        y: target values; used for stratification and for ``len(y)``.
        random_state: seed forwarded to the randomised splitters.
        n_cv: number of iterations/folds (passed to ``LeaveOneOut`` as
            its size for ``'loo'``).
        cv: either a splitter name or an already-built splitter.
            Classification: ``'shuffle'``, ``'loo'``, ``'kfold'``,
            ``'boot'``, ``'boot632'``. Regression (unstratified):
            ``'_shuffle'``, ``'_kfold'``.
        test_size: test share for the shuffle/bootstrap splitters.

    Returns:
        The splitter for a recognised name, or ``cv`` itself when it is
        not a string.

    Raises:
        ValueError: if ``cv`` is a string that names no known splitter.
    """
    # `basestring` only exists on Python 2; fall back to `str` on
    # Python 3 instead of failing with NameError.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    if not isinstance(cv, string_types):
        return cv

    if cv == 'shuffle':
        return cross_validation.StratifiedShuffleSplit(
            y, n_cv, test_size=test_size, random_state=random_state)
    if cv == 'loo':
        # NOTE(review): LeaveOneOut is sized with n_cv, not len(y) --
        # confirm callers pass the sample count here.
        return cross_validation.LeaveOneOut(n_cv)
    if cv == 'kfold':
        return cross_validation.StratifiedKFold(y, n_folds=n_cv)
    if cv == 'boot':
        return cross_validation.Bootstrap(len(y),
                                          n_iter=n_cv,
                                          train_size=(1 - test_size),
                                          random_state=random_state)
    if cv == 'boot632':
        return bootstrap_632(len(y),
                             n_iter=n_cv,
                             random_state=random_state)
    # for regression
    if cv == '_shuffle':
        return cross_validation.ShuffleSplit(len(y),
                                             n_iter=n_cv,
                                             test_size=test_size,
                                             random_state=random_state)
    if cv == '_kfold':
        return cross_validation.KFold(len(y), n_folds=n_cv)
    raise ValueError("bad cv:%s" % cv)
Exemplo n.º 14
0
def sample_random_n(table,
                    n,
                    stratified=False,
                    replace=False,
                    random_state=None):
    """Split the row indices of ``table`` into a sample of ``n`` and the rest.

    Args:
        table: data table; the stratified path reads
            ``table.domain.class_var`` and ``table.Y``.
        n: requested sample size; must be positive, truncated to int.
        stratified: stratify by class when the class variable is discrete
            (the sample is then at least as large as the number of class
            values).
        replace: sample with replacement via ``Bootstrap``.
        random_state: seed forwarded to the splitter.

    Returns:
        The first ``(train, test)`` index pair of the splitter; the
        ``train`` side is the sample.
    """
    assert n > 0
    n = int(n)
    if replace:
        ind = cross_validation.Bootstrap(len(table),
                                         train_size=n,
                                         random_state=random_state)
    elif stratified and is_discrete(table.domain.class_var):
        train_size = max(len(table.domain.class_var.values), n)
        test_size = max(len(table) - train_size, 0)
        ind = cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
    else:
        # Plain random sampling must not consult the class variable:
        # it may be continuous or absent here, and the number of class
        # values has no bearing on an unstratified sample size.
        train_size = n
        test_size = max(len(table) - train_size, 0)
        ind = cross_validation.ShuffleSplit(len(table),
                                            n_iter=1,
                                            test_size=test_size,
                                            train_size=train_size,
                                            random_state=random_state)
    return next(iter(ind))
Exemplo n.º 15
0
    def shuffleCV(self, clf):
        """Plot and save a shuffle-split learning curve for ``clf``, then print the elapsed time.

        Reads ``self.X_train`` and ``self.y_train``; the figure is saved
        under ``results/`` with the classifier's name as prefix.
        """
        started = time()
        # Text before the first '(' of str(clf), used as the classifier's name.
        clf_name = str(clf).split('(')[0]
        title = "Learning Curves (Naive Bayes) " + clf_name

        # 20 iterations give smoother mean train/test curves; each one
        # holds out a random 20% of the data for validation.
        cv = cross_validation.ShuffleSplit(len(self.y_train),
                                           n_iter=20,
                                           test_size=0.2,
                                           random_state=0)

        # The helper's return value is used as the plotting handle; the
        # work is spread over 4 jobs.
        curve_plot = self.plot_learning_curve(clf,
                                              title,
                                              self.X_train,
                                              self.y_train,
                                              ylim=(0.3, 1.01),
                                              cv=cv,
                                              n_jobs=4)
        curve_plot.draw()
        curve_plot.savefig("results/" + clf_name + '_shuffleCVlearningCurve')

        ss_time = time() - started
        # Report how long the whole procedure took.
        print("(", clf_name, ") Shuffle Split time:  %0.3fs" % ss_time)
Exemplo n.º 16
0
def main():
    """Evaluate CFS feature selection with a linear SVM on the lung data set.

    Loads ``../data/test_lung_s3.csv``, runs five shuffled 80/20 splits,
    selects features with ``CFS.cfs`` on each training part, and prints
    the per-split and mean accuracy.
    """
    data_path = '../data/test_lung_s3.csv'
    n_splits = 5

    # num_columns is the number of columns in the file (taken from the
    # header row). Default text mode keeps csv.reader working on both
    # Python 2 and Python 3 ('rb' yields bytes on Python 3).
    with open(data_path) as f:
        for row in csv.reader(f, delimiter=','):
            num_columns = len(row)
            break

    # load data
    mat = np.loadtxt(data_path, delimiter=',', skiprows=1, usecols=range(0, 101))
    X = mat[:, 1:num_columns]  # data
    X = X.astype(float)
    y = mat[:, 0]  # label
    n_samples, n_features = X.shape

    # evaluation
    num_fea = 20
    ss = cross_validation.ShuffleSplit(n_samples, n_iter=n_splits, test_size=0.2)
    clf = svm.LinearSVC()
    mean_acc = 0

    for train, test in ss:
        # rank features on the training rows only, keep the first num_fea
        idx = CFS.cfs(X[train], y[train])
        selected_features = X[:, idx[0:num_fea]]
        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])
        acc = accuracy_score(y[test], y_predict)
        # print(x) with one argument behaves the same on Python 2 and 3
        print(acc)
        mean_acc = mean_acc + acc
    # divide by the same constant that sized the splitter
    mean_acc /= n_splits
    print(mean_acc)
Exemplo n.º 17
0
def shuffle_split_binary(frame, split_col, test_col, test_fun, n_iter=100):
    """Repeatedly compare the smaller group against equal-sized samples of the larger one.

    ``frame`` is divided into two groups by the two distinct values of
    ``split_col``. For each of ``n_iter`` random subsamples of the larger
    group (as many rows as the smaller group), a 2x2 table of True/False
    counts of ``test_col`` is built and handed to ``test_fun``.

    Args:
        frame: pandas DataFrame.
        split_col: column with exactly two distinct values.
        test_col: boolean-like column whose truthy/falsy rows are counted.
        test_fun: callable receiving the 2x2 DataFrame (columns
            ``"False"``/``"True"``, indexed by the two group names).
        n_iter: number of subsamples.

    Returns:
        List with the result of ``test_fun`` for every subsample.
    """
    from sklearn import cross_validation
    split_values = frame[split_col].unique()
    assert len(split_values) == 2

    frame_1 = frame[frame[split_col] == split_values[0]]
    frame_2 = frame[frame[split_col] == split_values[1]]

    assert len(frame_1) != len(frame_2)
    if len(frame_1) < len(frame_2):
        smaller, larger = frame_1, frame_2
    else:
        smaller, larger = frame_2, frame_1
    smaller_name = smaller.iloc[0][split_col]
    # Both row labels are group names, i.e. values of split_col (the
    # larger group's label used to be read from test_col by mistake).
    larger_name = larger.iloc[0][split_col]

    def count_true_false(values):
        # `~` is the logical negation of a boolean Series; unary minus
        # on booleans is rejected by current NumPy/pandas.
        return values.sum(), (~values.astype(bool)).sum()

    ss = cross_validation.ShuffleSplit(len(larger),
                                       train_size=len(smaller),
                                       n_iter=n_iter)
    results = []
    sm_true, sm_false = count_true_false(smaller[test_col])

    for train_idx, test_idx in ss:
        lg_true, lg_false = count_true_false(larger.iloc[train_idx][test_col])

        df = pandas.DataFrame(
            {
                "False": [sm_false, lg_false],
                "True": [sm_true, lg_true]
            },
            index=[smaller_name, larger_name])
        results.append(test_fun(df))

    return results
Exemplo n.º 18
0
def train_test_split(result):
    """Reorder/augment ``result`` in place and split it 80/20 into train and test frames.

    Side effects on ``result``: 'target' becomes the first column, a
    binary 'proAbortionCaseDecision' column (1 where 'panelvote' >= 2)
    is added, and 'year_month' is dropped.

    Returns:
        ``(train_f, test_f)`` DataFrames selected by row position.
    """
    # Move the target variable 'target' to the first column.
    target_column = result['target']
    result.drop('target', axis=1, inplace=True)
    result.insert(0, 'target', target_column)

    decided_pro = result['panelvote'] >= 2
    result['proAbortionCaseDecision'] = np.where(decided_pro, 1, 0)
    result.drop(['year_month'], axis=1, inplace=True)

    n_rows = result.shape[0]
    # Single shuffled split giving training and testing row positions.
    splitter = cross_validation.ShuffleSplit(n_rows,
                                             n_iter=1,
                                             train_size=0.8,
                                             test_size=.20,
                                             random_state=1)

    # Defaults (all rows) are only kept if the splitter yields nothing.
    train_idx = np.arange(n_rows)
    test_idx = np.arange(n_rows)
    for train_part, test_part in splitter:
        train_idx, test_idx = set(train_part), set(test_part)

    # .iloc needs list-like positions, so the index sets become lists.
    train_positions = list(train_idx)
    test_positions = list(test_idx)
    train_f = result.iloc[train_positions, :]
    test_f = result.iloc[test_positions, :]
    return train_f, test_f
Exemplo n.º 19
0
def main():
    """Evaluate l2,1-norm feature selection with a linear SVM on COIL20.

    Loads ``../data/COIL20.mat``, runs five shuffled 80/20 splits, ranks
    features from the weights returned by the proximal gradient solver on
    each training part, and prints the per-split and mean accuracy.
    """
    n_splits = 5

    # load MATLAB data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['fea']  # data
    y = mat['gnd']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape
    X = X.astype(float)
    Y = construct_label_matrix_pan(y)

    # five shuffled 80/20 splits
    num_fea = 20
    ss = cross_validation.ShuffleSplit(n_samples, n_iter=n_splits, test_size=0.2)
    clf = svm.LinearSVC()
    mean_acc = 0

    for train, test in ss:
        W, obj, value_gamma = ll_l21_proximal.proximal_gradient_descent(
            X[train], Y[train], 0.1, verbose=False)
        # rank features from the learned weights, keep the first num_fea
        idx = feature_ranking(W)
        selected_features = X[:, idx[0:num_fea]]
        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])
        acc = accuracy_score(y[test], y_predict)
        # single-argument print() behaves the same on Python 2 and 3
        print(acc)
        mean_acc = mean_acc + acc
    # divide by the same constant that sized the splitter
    mean_acc /= n_splits
    print('mean_acc %s' % (mean_acc,))
def sklearn_random_forest(train_x, train_y, test_x, test_uid):
    """Cross-validate and train a random forest, then write test predictions to CSV.

    Prints the mean accuracy over three shuffled 70/30 splits of the
    training data, fits the forest on all training data, predicts
    ``test_x`` and writes ``uid``/``score`` rows to
    ``rf_<timestamp>.csv``.

    Args:
        train_x, train_y: training features and labels.
        test_x: features to predict.
        test_uid: identifiers written next to the predictions.
    """
    # Set the parameters.
    clf = RandomForestClassifier(
        n_estimators=5,
        bootstrap=True,  # sample with replacement
        oob_score=False,
        n_jobs=4,  # number of parallel jobs
        min_samples_split=5)

    # Estimate accuracy. ShuffleSplit test sets overlap and do not cover
    # every row, so per-split scores are averaged instead of using
    # cross_val_predict, which requires a partition of the data.
    n_samples = train_x.shape[0]
    cv = cross_validation.ShuffleSplit(n_samples,
                                       n_iter=3,
                                       test_size=0.3,
                                       random_state=0)
    scores = cross_validation.cross_val_score(clf,
                                              train_x,
                                              train_y,
                                              cv=cv,
                                              scoring='accuracy')
    print(scores.mean())

    # Train the model: cross-validation only fits clones, so the
    # estimator itself must be fitted before it can predict.
    clf.fit(train_x, train_y)

    test_y = clf.predict(test_x)
    result = pd.DataFrame({
        "uid": test_uid,
        "score": test_y
    },
                          columns=['uid', 'score'])
    result.to_csv('rf_' + str(time.time()) + '.csv', index=False)
Exemplo n.º 21
0
def test():
    """Manual smoke test: show the Venn diagram widget with five random subsets.

    Loads the "brown-selected" table, appends a string meta column, feeds
    five shuffled 30% samples (named "A".."E") to the widget and runs the
    Qt event loop.

    Returns:
        The ``QApplication`` instance.
    """
    import sklearn.cross_validation as skl_cross_validation
    app = QApplication([])
    w = OWVennDiagram()
    data = Orange.data.Table("brown-selected")
    data = append_column(data, "M", Orange.data.StringVariable("Test"),
                         numpy.arange(len(data)).reshape(-1, 1) % 30)

    splitter = skl_cross_validation.ShuffleSplit(
        len(data), n_iter=5, test_size=0.7
    )

    # Take the train side of each of the five splits as one subset.
    subsets = [data[train_idx] for train_idx, _ in splitter]

    for key, subset in enumerate(subsets):
        subset.name = chr(ord("A") + key)
        w.setData(subset, key=key)

    w.handleNewSignals()
    w.show()
    app.exec_()

    del w
    app.processEvents()
    return app
def main():
    """Compare ordinal logistic, logistic and ridge regression on the Boston data.

    Targets are rounded and shifted to start at zero. Over 50 shuffled
    90/10 splits (seed 0), splits whose training part lacks any class are
    skipped; for the rest the mean absolute error of each model is
    printed and collected.

    Returns:
        Mean absolute error of the ridge model over the evaluated splits.
    """
    DOC = """
================================================================================
    Compare the prediction accuracy of different models on the boston dataset
================================================================================
    """
    print(DOC)
    from sklearn import cross_validation, datasets, linear_model
    boston = datasets.load_boston()
    X, y = boston.data, np.round(boston.target)
    #X -= X.mean()
    y -= y.min()

    idx = np.argsort(y)
    X = X[idx]
    y = y[idx]
    all_classes = np.unique(y)
    cv = cross_validation.ShuffleSplit(y.size, n_iter=50, test_size=.1, random_state=0)
    score_logistic = []
    score_ordinal_logistic = []
    score_ridge = []
    for i, (train, test) in enumerate(cv):
        # We need the train set to contain every class. array_equal
        # copes with a differing number of classes, where an
        # element-wise `==` on mismatched shapes fails.
        if not np.array_equal(np.unique(y[train]), all_classes):
            continue
        train = np.sort(train)
        test = np.sort(test)
        w, theta = ordinal_logistic_fit(X[train], y[train], verbose=True,
                                        solver='TNC')
        pred = ordinal_logistic_predict(w, theta, X[test])
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (ORDINAL)  fold %s: %s' % (i+1, s))
        score_ordinal_logistic.append(s)

        clf = linear_model.LogisticRegression(C=1.)
        clf.fit(X[train], y[train])
        pred = clf.predict(X[test])
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (LOGISTIC) fold %s: %s' % (i+1, s))
        score_logistic.append(s)

        clf = linear_model.Ridge(alpha=1.)
        clf.fit(X[train], y[train])
        # ridge predicts real values; round them onto the class grid
        pred = np.round(clf.predict(X[test]))
        s = metrics.mean_absolute_error(y[test], pred)
        print('ERROR (RIDGE)    fold %s: %s' % (i+1, s))
        score_ridge.append(s)


    print()
    print('MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC):    %s' % np.mean(score_ordinal_logistic))
    print('MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): %s' % np.mean(score_logistic))
    print('MEAN ABSOLUTE ERROR (RIDGE REGRESSION):    %s' % np.mean(score_ridge))
    # print('Chance level is at %s' % (1. / np.unique(y).size))

    return np.mean(score_ridge)
def train_classifier(clf, X, y):
    """
    Train a classifier and log several cross-validation scores.

    The classifier is fitted on all data and scored with 5-fold
    cross-validation, then refitted and scored on each fold of a 10-fold
    split, and finally on shuffled splits that train on 10% of the data.

    Args:
        clf: classifier to train
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    Returns:
        clf: classifier, as left by the last fit performed here
    """
    from sklearn import grid_search, cross_validation
    import time

    # Grid-search reporting, currently disabled:
    #   clf.fit(X, y)
    #   logger.info('Classifier fit Done. Best params are %s with a best score of %0.2f' % (clf.best_params_, clf.best_score_))
    #   logger.info('And scores ars %s' % (clf.grid_scores_))

    def refit_and_score(train_index, test_index):
        # Refit clf on the training rows and score it on the held-out rows.
        clf.fit(X[train_index], y[train_index])
        return clf.score(X[test_index], y[test_index])

    # Simple cross-validation.
    clf.fit(X, y)
    scores = cross_validation.cross_val_score(clf, X, y, cv=5)
    logger.info(
        'Classifier fit Done. And simple cross-validated scores ars %s' %
        (scores))

    # 10-fold cross-validation.
    folds = cross_validation.KFold(len(X), n_folds=10)
    for train_index, test_index in folds:
        score = refit_and_score(train_index, test_index)
        logger.info('10 folds cross-validated scores is %s.' % (score))

    # Use 1/10 of the training set as the new training input and score it.
    test_size = 0.9
    shuffled = cross_validation.ShuffleSplit(len(X),
                                             test_size=test_size,
                                             random_state=int(time.time()))
    for train_index, test_index in shuffled:
        score = refit_and_score(train_index, test_index)
        logger.info('%s作为训练集输入, cross-validated scores is %s.' %
                    (1 - test_size, score))

    # The same experiment with 1/100 of the training set (test_size = 0.99)
    # is currently disabled.

    return clf
def test_shuffle_split_warnings():
    """Check the deprecation messages for ``test_fraction`` / ``train_fraction``.

    Both ``ShuffleSplit`` and ``train_test_split`` are called once with
    each deprecated keyword; exactly four warnings are expected, with the
    two messages alternating.
    """
    test_fraction_msg = ("test_fraction is deprecated in 0.11 and scheduled "
                         "for removal in 0.13, use test_size instead")
    train_fraction_msg = ("train_fraction is deprecated in 0.11 and scheduled "
                          "for removal in 0.13, use train_size instead")

    with warnings.catch_warnings(record=True) as warn_queue:
        cval.ShuffleSplit(10, 3, test_fraction=0.1)
        cval.ShuffleSplit(10, 3, train_fraction=0.1)
        cval.train_test_split(range(3), test_fraction=0.1)
        cval.train_test_split(range(3), train_fraction=0.1)

    assert_equal(len(warn_queue), 4)
    # Calls alternate test_/train_ keywords, and so must the messages.
    expected_in_order = (test_fraction_msg, train_fraction_msg) * 2
    for position, expected in enumerate(expected_in_order):
        assert_equal(str(warn_queue[position].message), expected)
Exemplo n.º 25
0
def enetCV():
    """Fit a cross-validated elastic net and write its test predictions.

    Uses the module-level ``base_X``, ``base_Y`` and ``X_test``; the
    predictions go to "elasticCV.csv" via ``write_to_file``.
    """
    print ("Doing elastic net")
    # Five shuffled 80/20 splits with a fixed seed.
    splitter = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)
    model = ElasticNetCV(cv=splitter)
    model.fit(base_X, base_Y)
    # Score on the same data the model was fitted on.
    training_score = model.score(base_X, base_Y)
    print ("Score = %f" % training_score)
    predictions = model.predict(X_test)
    write_to_file("elasticCV.csv", predictions)
Exemplo n.º 26
0
def split_train_test(authors):
    """Split every author's items into a train part and a test part.

    ``authors`` is indexed by author key and each value must support
    ``len`` and conversion with ``np.array``. Returns ``(train, test)``,
    two dicts keyed by author whose values are numpy arrays selected by the
    split's train and test indices.
    """
    train, test = {}, {}
    for author in authors:
        items = authors[author]
        as_array = np.array(items)
        # Positional arguments after the item count are 1 and 0.05;
        # presumably one iteration with a 5% test share -- confirm against
        # the ShuffleSplit signature of the installed scikit-learn version.
        splits = cross_validation.ShuffleSplit(len(items), 1, 0.05)
        for train_idx, test_idx in splits:
            train[author] = as_array[train_idx]
            test[author] = as_array[test_idx]
    return train, test
Exemplo n.º 27
0
def lassolarscv():
    """Fit a cross-validated LassoLars model and write out its predictions.

    Uses ``base_X``, ``base_Y`` and ``X_test``, which are not defined in
    this function. The printed score is computed on the same data the model
    was fitted on. Predictions for ``X_test`` are passed to
    ``write_to_file`` under the name "lassolars.csv".
    """
    print("Doing cross-validated LassoLars")
    splitter = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)
    model = LassoLarsCV(cv=splitter)
    model.fit(base_X, base_Y)
    fit_score = model.score(base_X, base_Y)
    print("Score = %f" % fit_score)
    predictions = model.predict(X_test)
    write_to_file("lassolars.csv", predictions)
Exemplo n.º 28
0
def create_and_test_model(X,
                          y,
                          n_iter=10,
                          test_size=0.1,
                          random_state=RANDOM_SEED,
                          verbose=False):
    """Fit and evaluate a logistic regression over repeated shuffle splits.

    For each of the ``n_iter`` train/test splits an L2-penalised
    ``LogisticRegression`` is fitted on the training rows and a confusion
    matrix is computed from its predictions on the test rows. The matrices
    of all runs are summed and returned as a DataFrame labelled with the
    ``LABEL_ACTUAL_*`` (rows) and ``LABEL_PREDICTED_*`` (columns) constants.

    Pass random_state=None to override the fixed random seed.
    With ``verbose=True`` per-run details and the combined matrix are
    printed.
    """

    def labelled(matrix):
        # Attach the actual/predicted labels to a raw confusion matrix.
        return pd.DataFrame(
            matrix,
            index=[LABEL_ACTUAL_POSITIVE, LABEL_ACTUAL_NEGATIVE],
            columns=[LABEL_PREDICTED_POSITIVE, LABEL_PREDICTED_NEGATIVE])

    # indices=False: per the original author's note the splitter then
    # yields boolean masks rather than integer positions.
    splits = cross_validation.ShuffleSplit(len(X),
                                           n_iter=n_iter,
                                           test_size=test_size,
                                           indices=False,
                                           random_state=random_state)
    total_cm = None
    for run_no, (train_mask, test_mask) in enumerate(splits, 1):
        # Original author's note: plain lists are much faster here than
        # leaving the data in a pandas DataFrame or Series.
        train_rows = X[train_mask].to_records(index=False).tolist()
        train_labels = y[train_mask].tolist()
        test_rows = X[test_mask].to_records(index=False).tolist()
        test_labels = y[test_mask].tolist()

        model = LogisticRegression(penalty='l2')
        model.fit(train_rows, train_labels)
        predicted = model.predict(test_rows)
        run_cm = confusion_matrix(test_labels, predicted)
        run_frame = labelled(run_cm)

        # The first matrix becomes the accumulator; later ones are added
        # to it in place.
        if total_cm is None:
            total_cm = run_cm
        else:
            total_cm += run_cm

        if not verbose:
            continue

        n_poisonous = sum(1 for val in test_labels if val == 'POISONOUS')
        n_edible = sum(1 for val in test_labels if val == 'EDIBLE')
        print("run {} of {}".format(run_no, n_iter))
        print("\tscore: {}".format(model.score(test_rows, test_labels)))
        print("\tPOISONOUS: {}".format(n_poisonous))
        print("\tEDIBLE:    {}".format(n_edible))
        print("\tconfusion matrix:\n{}\n".format(run_frame))

    combined = labelled(total_cm)
    if verbose:
        print("combined confusion matrix:")
        print(combined)
    return combined
Exemplo n.º 29
0
def test_vc():
    """Plot an SVC validation curve over ``gamma`` on the digits dataset.

    Loads the digits data, builds a 10-iteration shuffle split with a 20%
    test share and a fixed seed, and hands a default ``SVC`` plus five
    log-spaced ``gamma`` values (1e-6 .. 1e-1) to ``plot_validation_curve``
    before showing the figure.
    """
    digits = load_digits()
    X, y = digits.data, digits.target
    p_range = np.logspace(-6, -1, 5)
    cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10, test_size=0.2, random_state=0)
    model = SVC()
    # Raw string: "\g" is not a valid escape sequence, so the non-raw
    # literal triggers an invalid-escape warning on recent Python versions.
    # The resulting text (backslash followed by "gamma") is unchanged.
    plot_validation_curve(model, X, y, scorer='accuracy', param_name="gamma", param_range=p_range,
        cv=cv, n_jobs=2, ylim=(0.0, 0.5), title=r"SVC validation curve ($\gamma$)")
    plt.show()
Exemplo n.º 30
0
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a data set from input data table that are not in the sample. Also
    uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample. It must support ``len`` and
        indexing with an array of row indices.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.
        Float subclasses (e.g. ``numpy.float64``) are treated as floats.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets. Only used when sampling without
        replacement and the table's domain has a discrete class; the
        remainder then holds at least as many instances as there are
        class values, so the sample can be smaller than requested.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
        A ``RandomState`` instance is used as-is when sampling with
        replacement.

    Returns a tuple ``(sample, others)`` of two selections from ``table``:
    the sampled instances and the instances that were not sampled.
    """

    # isinstance (rather than an exact type comparison) also recognises
    # float subclasses such as numpy.float64 as a proportion.
    if isinstance(n, float):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        elif isinstance(random_state, np.random.RandomState):
            # Already a generator: use it directly instead of trying to
            # seed a new generator with it.
            rgen = random_state
        else:
            rgen = np.random.RandomState(random_state)
        chosen = rgen.randint(0, len(table), n)
        # Mark every drawn row; the rows still marked 1 were never drawn.
        untouched = np.ones(len(table))
        untouched[chosen] = 0
        others = np.nonzero(untouched)[0]
        return table[chosen], table[others]

    # From here on n is the size of the remainder (the "test" part).
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(len(table),
                                                n_iter=1,
                                                test_size=n,
                                                random_state=random_state)
    # Only one split is generated; take its (train, test) index pair.
    ind = next(iter(ind))
    return table[ind[0]], table[ind[1]]