def run_grid_search(X, Y):
    """Grid-search C, gamma and epsilon for an RBF-kernel SVR.

    Every parameter combination is scored with R^2 over five shuffled
    80/20 splits (fixed seed 777), using all available cores.

    Args:
        X: feature matrix; only ``X.shape[0]`` is read here directly.
        Y: regression targets passed to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print("# Tuning hyper-parameters")

    param_grid = {
        'C': [0.001, 0.01, 0.25, 0.5, 0.75, 1, 1.5, 2],
        'gamma': [0.1, 0.5, 1, 1.5, 2, 2.5, 3],
        'epsilon': [0.000001, 0.00001, 0.0001, 0.01, 0.1, 0.5, 1],
    }
    splitter = cross_validation.ShuffleSplit(
        X.shape[0], n_iter=5, test_size=0.2, random_state=777)
    estimator = svm.SVR(kernel='rbf', tol=0.0000000001)

    search = grid_search.GridSearchCV(
        estimator, param_grid, cv=splitter, scoring='r2', n_jobs=-1)
    search.fit(X, Y)
    return search
def split_data(city_data):
    """Randomly shuffle the sample set and divide it into training and testing sets.

    Args:
        city_data: object exposing ``data`` (2-D feature array) and
            ``target`` (label array with one entry per row of ``data``).

    Returns:
        ``X_train, y_train, X_test, y_test`` as float arrays. The label
        arrays are column vectors of shape ``(n, 1)``.
    """
    X, y = city_data.data, city_data.target
    n_rows, _ = X.shape

    # A single 75/25 shuffle split with a fixed seed.
    splitter = cva.ShuffleSplit(n_rows, 1, test_size=.25, random_state=1111)
    # Single-argument print() produces the same text on Python 2 and 3.
    print("Testing size: %s" % splitter.n_test)
    print("Training size: %s" % splitter.n_train)

    train_idx, test_idx = next(iter(splitter))

    # Fancy indexing copies the selected rows in one step instead of
    # filling pre-allocated arrays element by element.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)
    X_train = features[train_idx]
    y_train = labels[train_idx].reshape(-1, 1)
    X_test = features[test_idx]
    y_test = labels[test_idx].reshape(-1, 1)

    return X_train, y_train, X_test, y_test
def train_comb_model(traindata, targets):
    """Cross-validate a random-forest regressor, then fit it on all the data.

    The RMSE over five shuffled splits (60% training each) is printed
    before the final fit.

    Args:
        traindata: feature matrix.
        targets: regression targets; ``len(targets)`` sizes the splits.

    Returns:
        The regressor fitted on ``traindata`` / ``targets``.
    """
    forest = ensemble.RandomForestRegressor(
        n_estimators=50, max_depth=10, max_features='sqrt',
        min_samples_leaf=100, n_jobs=-1)
    splits = cross_validation.ShuffleSplit(
        len(targets), n_iter=5, train_size=0.6)

    print("Cross-validating model")
    neg_mse = cross_validation.cross_val_score(
        forest, traindata, targets, cv=splits, n_jobs=1,
        scoring='mean_squared_error')
    # The scorer reports MSE negated, so flip the sign before the root.
    rmse = np.sqrt(-neg_mse)
    print("RMSE on the training set:")
    print("%0.4f (+/-%0.04f)" % (rmse.mean(), rmse.std() / 2))

    print("Training model")
    forest.fit(traindata, targets)
    return forest
def load_images(image_h5_file, n_images=-1, shuffle_seed=1):
    """Load images and auxiliary data from an h5 file.

    Args:
        image_h5_file: location of the h5 file; its 'images' and
            'auxvars' entries are read.
        n_images: number of images to load; a negative value loads all.
            A request larger than the file is reported and capped.
        shuffle_seed: ``random_state`` of the ShuffleSplit that picks the
            random subset when fewer than all images are requested.

    Returns:
        images, auxvars: the selected entries of the two datasets, taken
        along axis 0 with the same indices.

    TODO: add support for multiple classes.
    """
    with h5py.File(image_h5_file, 'r') as h5file:
        image_ds = h5file['images']
        aux_ds = h5file['auxvars']
        available = len(image_ds)

        if n_images < 0:
            n_images = available
        elif n_images > available:
            print("Cannot load {0} images. Only {1} images in {2}".format(
                n_images, available, image_h5_file))
            n_images = available

        if n_images >= available:
            # Everything was requested: read both datasets in full.
            return image_ds[:], aux_ds[:]

        # The "test" half of a single shuffle split is the random subset.
        splitter = cross_validation.ShuffleSplit(
            available, n_iter=1, test_size=n_images,
            random_state=shuffle_seed)
        _, keep = next(iter(splitter))
        return (np.take(image_ds, keep, axis=0),
                np.take(aux_ds, keep, axis=0))
def sample_random_n(table, n, stratified=False, replace=False,
                    random_state=None):
    """Pick ``n`` random row indices from ``table``.

    Args:
        table: data table; only ``len(table)`` is needed when sampling
            with replacement, ``table.domain`` / ``table.Y`` otherwise.
        n: number of instances to sample.
        stratified: if true and the table has a discrete class, use a
            stratified split whose sample holds at least one instance per
            class value.
        replace: sample with replacement.
        random_state: seed for the random generator, or None to use the
            global NumPy generator.

    Returns:
        A pair ``(others, sample)`` of index arrays: the indices left out
        and the sampled indices.
    """
    if replace:
        rgen = (np.random if random_state is None
                else np.random.mtrand.RandomState(random_state))
        # randint's upper bound is exclusive, so this draws from
        # 0..len(table)-1 just like the deprecated
        # random_integers(0, len(table) - 1, n) call it replaces.
        sample = rgen.randint(0, len(table), n)
        unused = np.ones(len(table))
        unused[sample] = 0
        others = np.nonzero(unused)[0]
        return others, sample

    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(
            len(table), n_iter=1, test_size=n, random_state=random_state)
    return next(iter(ind))
def radialbasisf(self):
    """Fit an RBF-kernel SVC, print its test accuracy and plot the test points.

    Reads ``self.gamma``, ``self.c``, ``self.x_train``, ``self.y_train``,
    ``self.x_test`` and ``self.y_test``. The first two columns of
    ``self.x_test`` are plotted, coloured by the true label.

    Returns:
        The predictions for ``self.x_test``.
    """
    clf = svm.SVC(kernel='rbf', gamma=self.gamma, C=self.c).fit(
        self.x_train, self.y_train)
    z = clf.predict(self.x_test)
    # Fraction of test labels predicted correctly.
    print(np.mean(self.y_test == z))

    # One colour per class label; labels are used as indices into this
    # string, so only labels 0, 1 and 2 can be drawn.
    colours = 'ryg'
    for i in range(self.x_test.shape[0]):
        c_index = int(self.y_test[i])
        plt.scatter(self.x_test[i, 0], self.x_test[i, 1],
                    c=colours[c_index])
    plt.xlabel('Total de Palabras')
    plt.ylabel('Malas Palabras')
    plt.title('RBF kernel SVM')
    plt.show()

    return z
def fit(X_vec, y_vec):
    """Score a random-forest regressor on three shuffled 80/20 splits.

    For each split the train and test ``score`` of the fitted model are
    printed. Nothing is returned.

    Args:
        X_vec: feature array, indexable by an array of row indices.
        y_vec: target array, indexable the same way.
    """
    # Split the data set.
    splits = cross_validation.ShuffleSplit(
        len(X_vec), n_iter=3, test_size=0.2, random_state=0)

    # Ridge regression and an RBF-kernel SVR (C=10, gamma=1e-3) were
    # scored with the same loop in earlier experiments; only the random
    # forest regression is active now.
    for train_idx, test_idx in splits:
        X_fit, y_fit = X_vec[train_idx], y_vec[train_idx]
        forest = RandomForestRegressor(
            n_estimators=100, max_depth=10).fit(X_fit, y_fit)
        train_score = forest.score(X_fit, y_fit)
        test_score = forest.score(X_vec[test_idx], y_vec[test_idx])
        print("train score: %.3f, test score: %.3f\n" % (
            train_score, test_score))
def plot_learning_curves(raw_data, limit_size=None):
    """Draw learning curves for four classifiers on a 2x2 grid.

    Args:
        raw_data: a ``(features, weights, labels)`` triple; the weights
            are sliced along with the rest but not used for plotting.
        limit_size: if given, only the first ``limit_size`` rows are used.
    """
    features, weights, labels = raw_data
    if limit_size is not None:
        features = features[:limit_size]
        weights = weights[:limit_size]
        labels = labels[:limit_size]

    plt.figure(figsize=(12, 12))
    cv = cross_validation.ShuffleSplit(
        features.shape[0], n_iter=5, test_size=TEST_DATA_SPLIT,
        random_state=0)

    # (title, estimator) per panel, in subplot order.
    panels = [
        ("Learning Curves (Decision Trees)",
         tree.DecisionTreeClassifier(criterion='gini',
                                     min_samples_split=60)),
        ("Learning Curves (AdaBoost)",
         AdaBoostClassifier(n_estimators=100, learning_rate=1.0)),
        ("Learning Curves (K-Nearest Neighbour)",
         KNeighborsClassifier(n_neighbors=10, p=2)),
        ("Learning Curves (SVM)",
         svm.SVC(C=1.0, gamma=0.1)),
    ]
    for position, (title, estimator) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plot_learning_curve(estimator, title, features, labels,
                            ylim=(0.7, 1.01), cv=cv, n_jobs=N_JOBS)
def run_cross_validation(X, Y):
    """Cross-validate a fixed SVR and print its MSE and R^2 scores.

    Ten shuffled 90/10 splits (seed 0) are scored twice, once with the
    'mean_squared_error' scorer and once with 'r2'. For each, the raw
    scores and their mean with half the standard deviation are printed.

    Args:
        X: feature matrix; ``X.shape[0]`` sizes the splits.
        Y: regression targets.
    """
    n_samples = X.shape[0]
    cv = cross_validation.ShuffleSplit(
        n_samples, n_iter=10, test_size=0.1, random_state=0)
    regressor = svm.SVR(C=8, gamma=32, epsilon=0.01, tol=0.000001)

    # Each print() takes a single pre-formatted string so the output is
    # identical on Python 2 and Python 3.
    scores = cross_validation.cross_val_score(
        regressor, X, Y, cv=cv, scoring='mean_squared_error')
    print("Mean Square Error :  %s" % (scores,))
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))

    scores = cross_validation.cross_val_score(
        regressor, X, Y, cv=cv, scoring='r2')
    print("R2 Score :  %s" % (scores,))
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
def do_cv(clf, X, y, n_samples=1000, n_iter=3, test_size=0.1, quiet=False,
          scoring=None, stratified=False, fit_params=None,
          reseed_classifier=True, n_jobs=-1):
    """Cross-validate ``clf`` on shuffled splits and summarise the scores.

    Args:
        clf: estimator to evaluate.
        X, y: data and targets.
        n_samples: number of samples the splitter draws from; capped at
            ``X.shape[0]`` when that is available.
        n_iter: number of re-shuffled splits.
        test_size: test share handed to the splitter.
        quiet: suppress the timing/debug line.
        scoring: scorer name; falls back to ``cfg['scoring']``.
        stratified: use StratifiedShuffleSplit on ``y`` instead of
            ShuffleSplit.
        fit_params: extra parameters forwarded to ``fit``.
        reseed_classifier: call ``reseed(clf)`` first.
        n_jobs: parallelism of ``cross_val_score``.

    Returns:
        ``(mean, standard error of the mean)`` of the test scores.
    """
    t0 = time.time()
    if reseed_classifier:
        reseed(clf)
    # isinstance also catches float subclasses such as numpy.float64.
    if isinstance(n_samples, float):
        n_samples = int(n_samples)
    try:
        if n_samples > X.shape[0]:
            n_samples = X.shape[0]
    except (AttributeError, IndexError, TypeError):
        # X has no usable .shape (or the sizes are not comparable):
        # best effort, keep n_samples as given.
        pass

    if stratified:
        cv = cross_validation.StratifiedShuffleSplit(
            y, n_iter, train_size=n_samples, test_size=test_size,
            random_state=cfg['sys_seed'])
    else:
        cv = cross_validation.ShuffleSplit(
            n_samples, n_iter=n_iter, test_size=test_size,
            random_state=cfg['sys_seed'])

    test_scores = cross_validation.cross_val_score(
        clf, X, y, cv=cv, scoring=scoring or cfg['scoring'],
        fit_params=fit_params, n_jobs=n_jobs)
    if not quiet:
        dbg('%s took: %.2fm' % (mean_score(test_scores),
                                (time.time() - t0) / 60))
    return (np.mean(test_scores), sem(test_scores))
def do_gs(clf, X, y, params, n_samples=1000, n_iter=3, n_jobs=-2,
          scoring=None, fit_params=None):
    """Grid-search ``params`` for ``clf`` on a shuffled subsample.

    Args:
        clf: estimator to tune.
        X, y: data and targets; they are shuffled with the system seed and
            the first ``n_samples`` rows are used.
        params: parameter grid for ``GridSearchCV``.
        n_samples: subsample size; capped at the number of rows of ``X``.
        n_iter: number of ShuffleSplit iterations.
        n_jobs: parallelism of the search.
        scoring: scorer name; falls back to ``cfg['scoring']``.
        fit_params: extra parameters forwarded to ``fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    if isinstance(n_samples, float):
        n_samples = int(n_samples)
    # The splitter produces indices in range(n_samples) while the data is
    # sliced to at most its own length, so the two must agree.
    n_rows = X.shape[0] if hasattr(X, 'shape') else len(X)
    n_samples = min(n_samples, n_rows)

    reseed(clf)
    cv = cross_validation.ShuffleSplit(
        n_samples, n_iter=n_iter, random_state=cfg['sys_seed'])
    gs = grid_search.GridSearchCV(
        clf, params, cv=cv, n_jobs=n_jobs, verbose=2,
        scoring=scoring or cfg['scoring'], fit_params=fit_params)
    X2, y2 = utils.shuffle(X, y, random_state=cfg['sys_seed'])
    gs.fit(X2[:n_samples], y2[:n_samples])
    dbg(gs.best_params_, gs.best_score_)
    return gs
def split_data(df, features):
    """
    split df[features] into train and test set with ShuffleSplit
    it also generates a new feature 'cnt_season' by grouping counts of
    four seasons (computed from the training rows only)

    Parameters
    ----------
    df: pandas dataframe
    features: a list of columns of df, the set of features in train set

    Returns
    -------
    df: dataframe + 'cnt_season'
    X_train, X_test, y_train, y_test: train set and test set for 'cnt'
        column; targets are log1p-transformed
    y_train_cas, y_test_cas, y_train_reg, y_test_reg: train and test sets
        for 'casual' and 'registered' columns, log1p-transformed
        (not used in this study)
    time_test: 'dteday', 'mnth' and 'hr' of the test set, for writing
        prediction results
    """
    splitter = cross_validation.ShuffleSplit(
        len(df), n_iter=1, test_size=0.1, random_state=1234)
    ind_train, ind_test = next(iter(splitter))

    # add a cnt_season column using groupby and join
    if 'cnt_season' not in df:
        # ShuffleSplit yields row positions, hence .iloc rather than the
        # label-based (and since removed) .ix accessor.
        season_gb = df.iloc[ind_train].groupby('season')[['cnt']].sum()
        season_gb.columns = ['cnt_season']
        df = df.join(season_gb, on='season')

    def take(columns, rows):
        # Positional row selection on the underlying array; .values
        # replaces the removed .as_matrix().
        return df[columns].values[rows]

    X_train = take(features, ind_train)
    X_test = take(features, ind_test)
    y_train = np.log1p(take('cnt', ind_train))
    y_test = np.log1p(take('cnt', ind_test))
    y_train_cas = np.log1p(take('casual', ind_train))
    y_train_reg = np.log1p(take('registered', ind_train))
    y_test_cas = np.log1p(take('casual', ind_test))
    y_test_reg = np.log1p(take('registered', ind_test))
    time_test = take(['dteday', 'mnth', 'hr'], ind_test)

    return (df, X_train, X_test, y_train, y_test,
            y_train_cas, y_test_cas, y_train_reg, y_test_reg, time_test)
def cv_select(y, random_state, n_cv, cv, test_size=0.1):
    """Translate a cross-validation name into a splitter object.

    Args:
        y: targets; used for stratification or just for their length.
        random_state: seed for the randomised splitters.
        n_cv: number of iterations / folds.
        cv: one of 'shuffle', 'loo', 'kfold', 'boot', 'boot632' (for
            classification), '_shuffle', '_kfold' (for regression), or
            any non-string object, which is returned untouched.
        test_size: test share for the shuffle and bootstrap splitters.

    Returns:
        The splitter selected by ``cv``.

    Raises:
        ValueError: ``cv`` is a string that names no known scheme.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str``.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(cv, string_types):
        return cv

    if cv == 'shuffle':
        return cross_validation.StratifiedShuffleSplit(
            y, n_cv, test_size=test_size, random_state=random_state)
    if cv == 'loo':
        # NOTE(review): LeaveOneOut receives n_cv, not len(y) - confirm
        # that this is intended.
        return cross_validation.LeaveOneOut(n_cv)
    if cv == 'kfold':
        return cross_validation.StratifiedKFold(y, n_folds=n_cv)
    if cv == 'boot':
        return cross_validation.Bootstrap(
            len(y), n_iter=n_cv, train_size=(1 - test_size),
            random_state=random_state)
    if cv == 'boot632':
        return bootstrap_632(len(y), n_iter=n_cv,
                             random_state=random_state)
    # for regression
    if cv == '_shuffle':
        return cross_validation.ShuffleSplit(
            len(y), n_iter=n_cv, test_size=test_size,
            random_state=random_state)
    if cv == '_kfold':
        return cross_validation.KFold(len(y), n_folds=n_cv)
    raise ValueError("bad cv:%s" % cv)
def sample_random_n(table, n, stratified=False, replace=False,
                    random_state=None):
    """Split the row indices of ``table`` into a sample and the rest.

    Args:
        table: data table providing ``len``, ``domain.class_var`` and
            ``Y``.
        n: requested sample size; must be positive, truncated to int.
        stratified: stratify by class when the class is discrete.
        replace: draw the sample with a bootstrap instead of a split.
        random_state: seed handed to the splitter.

    Returns:
        The first ``(train, test)`` index pair produced by the chosen
        splitter; the sample is the train part.
    """
    assert n > 0
    n = int(n)
    class_var = table.domain.class_var

    if replace:
        ind = cross_validation.Bootstrap(
            len(table), train_size=n, random_state=random_state)
        return next(iter(ind))

    discrete = is_discrete(class_var)
    # At least one instance per class value, but only when there are
    # class values to speak of; a non-discrete class has no ``values``.
    train_size = max(len(class_var.values), n) if discrete else n
    test_size = max(len(table) - train_size, 0)

    if stratified and discrete:
        ind = cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=train_size,
            random_state=random_state)
    else:
        ind = cross_validation.ShuffleSplit(
            len(table), n_iter=1,
            test_size=test_size, train_size=train_size,
            random_state=random_state)
    return next(iter(ind))
def shuffleCV(self, clf):
    """Plot and save a shuffle-split learning curve for ``clf``.

    Uses 20 shuffled splits with 20% held out each time (seed 0) so the
    mean train/test curves come out smoother. The figure is written under
    "results/" and the elapsed time is printed.
    """
    started = time()
    # Classifier name: the text before the first '(' of its string form.
    clf_name = str(clf).split('(')[0]
    title = "Learning Curves (Naive Bayes) " + clf_name

    splitter = cross_validation.ShuffleSplit(
        len(self.y_train), n_iter=20, test_size=0.2, random_state=0)

    # The plotting helper returns the object that is then drawn and saved.
    plt = self.plot_learning_curve(
        clf, title, self.X_train, self.y_train,
        ylim=(0.3, 1.01), cv=splitter, n_jobs=4)
    plt.draw()
    plt.savefig("results/" + clf_name + '_shuffleCVlearningCurve')

    ss_time = time() - started
    print("(", clf_name, ") Shuffle Split time: %0.3fs" % ss_time)
def main():
    """Evaluate CFS feature selection with a linear SVM on the lung data.

    Loads '../data/test_lung_s3.csv' (label in column 0), runs five
    shuffled 80/20 splits, selects features on each training part with
    ``CFS.cfs``, and prints the accuracy per split and the mean accuracy.
    """
    data_path = '../data/test_lung_s3.csv'
    n_splits = 5

    # num_columns is number of columns in file (taken from the header)
    # Text mode lets csv.reader work on Python 3 as well.
    with open(data_path) as f:
        header = next(csv.reader(f, delimiter=','), None)
    if header is None:
        raise ValueError("%s is empty" % data_path)
    num_columns = len(header)

    # load data
    mat = np.loadtxt(data_path, delimiter=',', skiprows=1,
                     usecols=range(0, 101))
    X = mat[:, 1:num_columns].astype(float)  # data
    y = mat[:, 0]  # label
    n_samples, n_features = X.shape

    # evaluation
    num_fea = 20
    ss = cross_validation.ShuffleSplit(n_samples, n_iter=n_splits,
                                       test_size=0.2)
    clf = svm.LinearSVC()
    mean_acc = 0
    for train, test in ss:
        # Feature indices are chosen from the training rows only.
        idx = CFS.cfs(X[train], y[train])
        selected_features = X[:, idx[0:num_fea]]
        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])
        acc = accuracy_score(y[test], y_predict)
        print(acc)
        mean_acc = mean_acc + acc
    mean_acc /= n_splits
    print(mean_acc)
def shuffle_split_binary(frame, split_col, test_col, test_fun, n_iter=100):
    """Repeatedly compare the smaller group with equal-sized draws from the larger.

    ``frame`` is divided by the two values of ``split_col``. For each of
    ``n_iter`` shuffles, as many rows as the smaller group has are drawn
    from the larger group, a 2x2 table of True/False counts of
    ``test_col`` is built, and ``test_fun`` is applied to it.

    Args:
        frame: pandas DataFrame.
        split_col: column with exactly two distinct values.
        test_col: column whose True/False counts are tabulated
            (assumed boolean - TODO confirm against callers).
        test_fun: callable applied to each 2x2 DataFrame.
        n_iter: number of shuffles.

    Returns:
        List with the result of ``test_fun`` for every shuffle.
    """
    from sklearn import cross_validation

    split_values = frame[split_col].unique()
    assert len(split_values) == 2
    frame_1 = frame[frame[split_col] == split_values[0]]
    frame_2 = frame[frame[split_col] == split_values[1]]
    assert len(frame_1) != len(frame_2)

    if len(frame_1) < len(frame_2):
        smaller, larger = frame_1, frame_2
    else:
        smaller, larger = frame_2, frame_1
    # Both row labels come from the grouping column.
    smaller_name = smaller.iloc[0][split_col]
    larger_name = larger.iloc[0][split_col]

    def count_true_false(flags):
        # False count as the complement of the True count; avoids unary
        # minus on a boolean column, which NumPy no longer accepts.
        n_true = flags.sum()
        return n_true, len(flags) - n_true

    ss = cross_validation.ShuffleSplit(len(larger), train_size=len(smaller),
                                       n_iter=n_iter)
    sm_true, sm_false = count_true_false(smaller[test_col])

    results = []
    for train_idx, _ in ss:
        lg_true, lg_false = count_true_false(
            larger.iloc[train_idx][test_col])
        df = pandas.DataFrame(
            {
                "False": [sm_false, lg_false],
                "True": [sm_true, lg_true]
            },
            index=[smaller_name, larger_name])
        results.append(test_fun(df))
    return results
def train_test_split(result):
    """Prepare ``result`` in place and cut it into an 80/20 train/test pair.

    The frame is modified in place: 'target' becomes the first column,
    'proAbortionCaseDecision' is added (1 where 'panelvote' >= 2, else 0)
    and 'year_month' is dropped.

    Returns:
        ``(train_f, test_f)``: the rows selected by one seeded shuffle
        split.
    """
    # Move the target variable 'target' to the first column.
    target = result['target']
    result.drop('target', axis=1, inplace=True)
    result.insert(0, 'target', target)

    majority = result['panelvote'] >= 2
    result['proAbortionCaseDecision'] = np.where(majority, 1, 0)
    result.drop(['year_month'], axis=1, inplace=True)

    n_rows = result.shape[0]
    splitter = cross_validation.ShuffleSplit(
        n_rows, n_iter=1, train_size=0.8, test_size=.20, random_state=1)
    shuffled_train, shuffled_test = next(iter(splitter))

    # The indices go through a set before being used for row selection.
    train_rows = list(set(shuffled_train))
    test_rows = list(set(shuffled_test))
    return result.iloc[train_rows, :], result.iloc[test_rows, :]
def main():
    """Evaluate l2,1-norm feature selection with a linear SVM on COIL20.

    Loads '../data/COIL20.mat', runs five shuffled 80/20 splits, ranks
    features on each training part with the proximal gradient solver,
    trains a LinearSVC on the top 20 features and prints the accuracy per
    split followed by the mean accuracy.
    """
    n_splits = 5

    # load MATLAB data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['fea']  # data
    y = mat['gnd'][:, 0]  # label
    n_samples, n_features = X.shape
    X = X.astype(float)
    Y = construct_label_matrix_pan(y)

    # five random 80/20 shuffle splits (not k-fold: test sets may overlap)
    num_fea = 20
    ss = cross_validation.ShuffleSplit(n_samples, n_iter=n_splits,
                                       test_size=0.2)
    clf = svm.LinearSVC()
    mean_acc = 0
    for train, test in ss:
        W, obj, value_gamma = ll_l21_proximal.proximal_gradient_descent(
            X[train], Y[train], 0.1, verbose=False)
        idx = feature_ranking(W)
        selected_features = X[:, idx[0:num_fea]]
        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])
        acc = accuracy_score(y[test], y_predict)
        # Single-argument print() gives the same text on Python 2 and 3.
        print(acc)
        mean_acc = mean_acc + acc
    mean_acc /= n_splits
    print('mean_acc %s' % mean_acc)
def sklearn_random_forest(train_x, train_y, test_x, test_uid):
    """Validate a random forest, train it and write test predictions to CSV.

    The mean accuracy over three shuffled 70/30 splits is printed, the
    forest is then fitted on the whole training set and its predictions
    for ``test_x`` are written to 'rf_<timestamp>.csv' with the columns
    'uid' and 'score'.

    Args:
        train_x, train_y: training features and labels.
        test_x: features to predict.
        test_uid: identifiers written next to the predictions.
    """
    # Model parameters.
    clf = RandomForestClassifier(
        n_estimators=5,
        bootstrap=True,  # sample with replacement
        oob_score=False,
        n_jobs=4,  # number of parallel jobs
        min_samples_split=5)

    # Validation. ShuffleSplit test sets overlap, so per-split scores are
    # averaged instead of stitching predictions with cross_val_predict.
    n_samples = train_x.shape[0]
    cv = cross_validation.ShuffleSplit(n_samples, n_iter=3, test_size=0.3,
                                       random_state=0)
    scores = cross_validation.cross_val_score(clf, train_x, train_y, cv=cv,
                                              scoring='accuracy')
    print(scores.mean())

    # Cross-validation works on clones, so fit the model itself before
    # predicting.
    clf.fit(train_x, train_y)
    test_y = clf.predict(test_x)
    result = pd.DataFrame({
        "uid": test_uid,
        "score": test_y
    }, columns=['uid', 'score'])
    result.to_csv('rf_' + str(time.time()) + '.csv', index=False)
def test():
    """Manual smoke test: show the Venn diagram widget with five random subsets.

    Loads the "brown-selected" table, appends a string meta column "Test",
    feeds five shuffle-split samples named "A".."E" to the widget and
    runs the Qt event loop.

    Returns:
        The QApplication instance.
    """
    import sklearn.cross_validation as skl_cross_validation

    app = QApplication([])
    w = OWVennDiagram()

    data = Orange.data.Table("brown-selected")
    meta_values = numpy.arange(len(data)).reshape(-1, 1) % 30
    data = append_column(data, "M", Orange.data.StringVariable("Test"),
                         meta_values)

    splits = iter(skl_cross_validation.ShuffleSplit(
        len(data), n_iter=5, test_size=0.7))

    def select(table):
        # The train part of the next split is the subset.
        sample, _ = next(splits)
        return table[sample]

    subsets = [select(data) for _ in range(5)]
    for key, subset in enumerate(subsets):
        subset.name = chr(ord("A") + key)
        w.setData(subset, key=key)

    w.handleNewSignals()
    w.show()
    app.exec_()

    del w
    app.processEvents()
    return app
def main():
    """Compare ordinal logistic, logistic and ridge models on Boston data.

    Targets are rounded and shifted to start at zero. Fifty shuffled 90/10
    splits are drawn; splits whose training part misses a class are
    skipped. The mean absolute error of each model is printed per fold and
    on average.

    Returns:
        Mean absolute error of the ridge model over the folds used.
    """
    DOC = """
================================================================================
    Compare the prediction accuracy of different models on the boston dataset
================================================================================
"""
    print(DOC)

    from sklearn import cross_validation, datasets, linear_model

    boston = datasets.load_boston()
    X, y = boston.data, np.round(boston.target)
    y -= y.min()

    order = np.argsort(y)
    X, y = X[order], y[order]
    all_classes = np.unique(y)

    splits = cross_validation.ShuffleSplit(y.size, n_iter=50, test_size=.1,
                                           random_state=0)
    score_logistic = []
    score_ordinal_logistic = []
    score_ridge = []

    for fold, (train, test) in enumerate(splits, start=1):
        # we need the train set to have all different classes
        if not np.all(np.unique(y[train]) == all_classes):
            continue
        train, test = np.sort(train), np.sort(test)
        X_fit, y_fit = X[train], y[train]
        X_eval, y_eval = X[test], y[test]

        w, theta = ordinal_logistic_fit(X_fit, y_fit, verbose=True,
                                        solver='TNC')
        pred = ordinal_logistic_predict(w, theta, X_eval)
        s = metrics.mean_absolute_error(y_eval, pred)
        print('ERROR (ORDINAL)  fold %s: %s' % (fold, s))
        score_ordinal_logistic.append(s)

        clf = linear_model.LogisticRegression(C=1.)
        clf.fit(X_fit, y_fit)
        pred = clf.predict(X_eval)
        s = metrics.mean_absolute_error(y_eval, pred)
        print('ERROR (LOGISTIC) fold %s: %s' % (fold, s))
        score_logistic.append(s)

        clf = linear_model.Ridge(alpha=1.)
        clf.fit(X_fit, y_fit)
        # Ridge output is continuous; round it onto the label grid.
        pred = np.round(clf.predict(X_eval))
        s = metrics.mean_absolute_error(y_eval, pred)
        print('ERROR (RIDGE)    fold %s: %s' % (fold, s))
        score_ridge.append(s)

    print()
    print('MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC):    %s'
          % np.mean(score_ordinal_logistic))
    print('MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): %s'
          % np.mean(score_logistic))
    print('MEAN ABSOLUTE ERROR (RIDGE REGRESSION):    %s'
          % np.mean(score_ridge))
    return np.mean(score_ridge)
def train_classifier(clf, X, y):
    """Train a classifier and log several cross-validation scores.

    Args:
        clf: the classifier to train.
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        clf: the classifier, fitted on the complete ``X`` / ``y``.
    """
    from sklearn import cross_validation
    import time

    # Simple cross-validation.
    clf.fit(X, y)
    scores = cross_validation.cross_val_score(clf, X, y, cv=5)
    logger.info(
        'Classifier fit Done. And simple cross-validated scores ars %s' %
        (scores))

    # Ten-fold cross-validation.
    kf = cross_validation.KFold(len(X), n_folds=10)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        logger.info('10 folds cross-validated scores is %s.' % (score))

    # Train on one tenth of the data and score on the remainder.
    # The seed comes from the clock, so these splits differ between runs.
    test_size = 0.9
    rs = cross_validation.ShuffleSplit(len(X), test_size=test_size,
                                       random_state=int(time.time()))
    for train_index, test_index in rs:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        logger.info('%s作为训练集输入, cross-validated scores is %s.' %
                    (1 - test_size, score))

    # The evaluation loops refit clf on subsets, the last one only 10% of
    # the data; fit once more on everything so the returned model is the
    # fully trained one.
    clf.fit(X, y)
    return clf
def test_shuffle_split_warnings():
    """The deprecated *_fraction arguments must each raise their warning.

    ``ShuffleSplit`` and ``train_test_split`` are both called once with
    ``test_fraction`` and once with ``train_fraction``; the four recorded
    warnings are compared with the expected messages in call order.
    """
    test_msg = ("test_fraction is deprecated in 0.11 and scheduled "
                "for removal in 0.13, use test_size instead")
    train_msg = ("train_fraction is deprecated in 0.11 and scheduled "
                 "for removal in 0.13, use train_size instead")

    with warnings.catch_warnings(record=True) as caught:
        cval.ShuffleSplit(10, 3, test_fraction=0.1)
        cval.ShuffleSplit(10, 3, train_fraction=0.1)
        cval.train_test_split(range(3), test_fraction=0.1)
        cval.train_test_split(range(3), train_fraction=0.1)

    assert_equal(len(caught), 4)
    wanted_in_order = (test_msg, train_msg, test_msg, train_msg)
    for recorded, wanted in zip(caught, wanted_in_order):
        assert_equal(str(recorded.message), wanted)
def enetCV():
    """Fit a cross-validated elastic net and write its test predictions.

    Works on the module-level ``base_X`` / ``base_Y`` / ``X_test``; the
    model is tuned over five shuffled 80/20 splits (seed 0), its score on
    the training data is printed and the predictions go to
    "elasticCV.csv".
    """
    print ("Doing elastic net")
    splits = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)

    model = ElasticNetCV(cv=splits)
    model.fit(base_X, base_Y)
    print ("Score = %f" % model.score(base_X, base_Y))

    predictions = model.predict(X_test)
    write_to_file("elasticCV.csv", predictions)
def split_train_test(authors):
    """Split each author's items into a train and a test array.

    Args:
        authors: mapping from author to a sequence of items.

    Returns:
        ``(train, test)``: two dicts keyed by author, holding the items
        selected by one shuffle split with a 0.05 test share.
    """
    train, test = {}, {}
    for author in authors:
        items = np.array(authors[author])
        splitter = cross_validation.ShuffleSplit(len(authors[author]),
                                                 1, 0.05)
        for train_idx, test_idx in splitter:
            train[author] = items[train_idx]
            test[author] = items[test_idx]
    return train, test
def lassolarscv():
    """Fit a cross-validated LassoLars and write its test predictions.

    Works on the module-level ``base_X`` / ``base_Y`` / ``X_test``; the
    model is tuned over five shuffled 80/20 splits (seed 0), its score on
    the training data is printed and the predictions go to
    "lassolars.csv".
    """
    print ("Doing cross-validated LassoLars")
    splits = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)

    model = LassoLarsCV(cv=splits)
    model.fit(base_X, base_Y)
    print ("Score = %f" % model.score(base_X, base_Y))

    predictions = model.predict(X_test)
    write_to_file("lassolars.csv", predictions)
def create_and_test_model(X, y, n_iter=10, test_size=0.1,
                          random_state=RANDOM_SEED, verbose=False):
    """Fit and test a logistic regression on repeated shuffle splits.

    Pass random_state=None to override the fixed random seed.

    Args:
        X, y: pandas features and labels, selected with boolean masks.
        n_iter: number of train/test runs.
        test_size: share of rows held out per run.
        random_state: seed for the splitter.
        verbose: print per-run details and the combined matrix.

    Returns:
        DataFrame with the confusion matrices of all runs summed up,
        labelled with the actual/predicted label constants.
    """
    def labelled(matrix):
        return pd.DataFrame(
            matrix,
            index=[LABEL_ACTUAL_POSITIVE, LABEL_ACTUAL_NEGATIVE],
            columns=[LABEL_PREDICTED_POSITIVE, LABEL_PREDICTED_NEGATIVE])

    # indices=False makes the splitter yield boolean masks per run.
    splits = cross_validation.ShuffleSplit(
        len(X), n_iter=n_iter, test_size=test_size, indices=False,
        random_state=random_state)

    cm_combined = None
    for run_number, (train_mask, test_mask) in enumerate(splits, start=1):
        # converting these to lists is much faster than leaving them in
        # a pandas DataFrame or Series
        X_train = X[train_mask].to_records(index=False).tolist()
        y_train = y[train_mask].tolist()
        X_test = X[test_mask].to_records(index=False).tolist()
        y_test = y[test_mask].tolist()

        model = LogisticRegression(penalty='l2')
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        cm = confusion_matrix(y_test, predicted)
        run_frame = labelled(cm)
        if cm_combined is None:
            cm_combined = cm
        else:
            cm_combined += cm

        if verbose:
            print("run {} of {}".format(run_number, n_iter))
            print("\tscore: {}".format(model.score(X_test, y_test)))
            print("\tPOISONOUS: {}".format(y_test.count('POISONOUS')))
            print("\tEDIBLE: {}".format(y_test.count('EDIBLE')))
            print("\tconfusion matrix:\n{}\n".format(run_frame))

    combined_frame = labelled(cm_combined)
    if verbose:
        print("combined confusion matrix:")
        print(combined_frame)
    return combined_frame
def test_vc():
    """Plot the validation curve of an SVC over gamma on the digits data.

    Five gamma values, log-spaced between 1e-6 and 1e-1, are scored for
    accuracy over ten shuffled 80/20 splits (seed 0) and the figure is
    shown.
    """
    digits = load_digits()
    X, y = digits.data, digits.target
    p_range = np.logspace(-6, -1, 5)
    cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
                                       test_size=0.2, random_state=0)
    model = SVC()
    # Raw string: "\g" is not a valid escape sequence in a normal literal.
    plot_validation_curve(model, X, y, scorer='accuracy',
                          param_name="gamma", param_range=p_range,
                          cv=cv, n_jobs=2, ylim=(0.0, 0.5),
                          title=r"SVC validation curve ($\gamma$)")
    plt.show()
def sample(table, n=0.7, stratified=False, replace=False,
           random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a data set from input data table that are not in the sample. Also
    uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.
        NumPy floating scalars are treated as floats.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """
    # isinstance (rather than an exact type comparison) so that float
    # subclasses and NumPy floating scalars count as proportions too.
    if isinstance(n, (float, np.floating)):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.mtrand.RandomState(random_state)
        sample = rgen.randint(0, len(table), n)
        # Mark every drawn index; what stays marked 1 was never drawn.
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return table[sample], table[others]

    # The splitters are parameterised by the size of the left-out part.
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(
            len(table), n_iter=1,
            test_size=n, random_state=random_state)
    ind = next(iter(ind))
    return table[ind[0]], table[ind[1]]