def test_cross_validation_5_folds(self):
    """Check that ``cross_validation(5)`` matches scikit-learn's 5-fold scores.

    A fresh ``KNeighborsClassifier`` built with ``self.neighbors`` is scored
    with ``cross_val_score`` on ``self.x`` / ``self.y``; every fold score is
    then compared with what ``self.classifier.cross_validation(5)`` returns.
    """
    model = KNeighborsClassifier(n_neighbors=self.neighbors)
    expected = cross_val_score(model, self.x, self.y, cv=5)
    returned = self.classifier.cross_validation(5)

    # Check the fold count first: zip() below would silently stop at the
    # shorter sequence and hide a missing fold.
    self.assertEqual(len(expected), len(returned))
    for fold_expected, fold_returned in zip(expected, returned):
        # Fold scores are floats, so compare with a tolerance rather than
        # exact equality.
        self.assertAlmostEqual(fold_expected, fold_returned)
def eval_linear(data_set, test_size=0.4):
    """Fit a linear regression on a train split and report its test error.

    Args:
        data_set: Object whose ``load_training_data()`` returns the features
            and the targets as an ``(x, y)`` pair.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Prints the mean squared error of the fitted model on the held-out split
    and shows a scatter plot of true targets against predictions.
    """
    # Imported locally so this block does not rely on a file-level import.
    from sklearn.metrics import mean_squared_error

    # load training data from feature matrix
    x, y = data_set.load_training_data()

    # split data into train and test set (fixed seed -> reproducible split)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=0)

    # train model on train set
    # NOTE(review): the `normalize` argument was deprecated in scikit-learn
    # 1.0 and removed in 1.2; kept as-is here, confirm the pinned version.
    model = linear_model.LinearRegression(normalize=True)
    model.fit(x_train, y_train)

    # evaluate on test set: score the model that was fitted above.
    # cross_val_score would clone the estimator and refit it on folds of the
    # test split, so the trained model itself would never be evaluated.
    predict = model.predict(x_test)
    score = mean_squared_error(y_test, predict)
    print('Mean squared error: {}'.format(score))

    # plot true targets against predictions
    plt.scatter(y_test, predict)
    plt.show()
def runsvc(kernel, data, target, nfolds):
    """Cross-validate an SVC with the given kernel and time the run.

    Args:
        kernel: Kernel name forwarded to ``svm.SVC``.
        data: Feature matrix passed to ``cross_val_score``.
        target: Labels passed to ``cross_val_score``.
        nfolds: Number of stratified folds.

    Returns:
        A ``(scores, elapsed_seconds)`` tuple: the value returned by
        ``cross_val_score`` and the wall-clock duration of that call.
    """
    # No random_state: the splitter does not shuffle, so a seed has no
    # effect, and current scikit-learn raises ValueError when random_state
    # is set while shuffle is False.
    skf = StratifiedKFold(n_splits=nfolds)
    svc = svm.SVC(C=1, kernel=kernel)

    # perf_counter is monotonic, unlike time.time(), so the measured
    # interval cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    ret = cross_val_score(svc, data, target, cv=skf, n_jobs=-1)
    elapsed = time.perf_counter() - start
    return ret, elapsed
def evalModel(self):
    """Log the mean 10-fold cross-validation score of the engine's estimator.

    The estimator from ``self.mlEngine.getEstimator()`` is scored on
    ``self.totalFeatureMatrix`` / ``self.totalLabels`` with an unshuffled
    10-split ``KFold``, using all available workers (``n_jobs=-1``).
    """
    Log(LOG_INFO) << "Evaluate CV score ..."

    splitter = KFold(n_splits=10, shuffle=False)
    fold_scores = cross_val_score(
        self.mlEngine.getEstimator(),
        self.totalFeatureMatrix,
        self.totalLabels,
        cv=splitter,
        n_jobs=-1,
    )

    mean_score = fold_scores.mean()
    Log(LOG_INFO) << "CV accuracy: %f" % mean_score
def cross_validation(self, folds=None):
    """Score ``self.model`` by k-fold or by a two-way hold-out.

    Args:
        folds: Value forwarded to ``cross_val_score`` as ``cv``. When
            ``None``, the two stored halves (``x1``/``y1`` and ``x2``/``y2``)
            are used instead: the model is fitted on one half and its
            predictions on the other half are scored, in both directions.

    Returns:
        The result of ``cross_val_score`` when ``folds`` is given; otherwise
        a two-element list ``[accuracy on half 1, accuracy on half 2]``.
    """
    if folds is not None:
        return cross_val_score(self.model, self.x, self.y, cv=folds)

    # Fit on half 1 and predict half 2, then the reverse. The same model
    # object is refitted, so it ends up fitted on half 2.
    fitted_on_first = self.model.fit(self.x1, self.y1)
    predicted_second = fitted_on_first.predict(self.x2)
    fitted_on_second = self.model.fit(self.x2, self.y2)
    predicted_first = fitted_on_second.predict(self.x1)

    first_accuracy = accuracy_score(self.y1, predicted_first)
    second_accuracy = accuracy_score(self.y2, predicted_second)
    return [first_accuracy, second_accuracy]
def eval_cc_linear(train_data_set, test_data_set):
    """Train on one data set and report the error on another (cross corpus).

    Args:
        train_data_set: Data set handed to ``train_linear`` to obtain the
            fitted model.
        test_data_set: Object whose ``load_training_data()`` returns the
            ``(x, y)`` pair the model is evaluated on.

    Prints the mean squared error of the trained model on the test data set
    and shows a scatter plot of true targets against predictions.
    """
    # Imported locally so this block does not rely on a file-level import.
    from sklearn.metrics import mean_squared_error

    # train model on train data set
    model = train_linear(train_data_set)

    # evaluate model on test data set (cross corpus)
    x, y = test_data_set.load_training_data()
    predict = model.predict(x)

    # Score the predictions of the model trained above. cross_val_score
    # would clone the estimator and refit it on folds of the test corpus,
    # so the result would not be a cross-corpus measurement at all.
    score = mean_squared_error(y, predict)
    print('Mean squared error: {}'.format(score))

    # plot true targets against predictions
    plt.scatter(y, predict)
    plt.show()
def score_pri(slices, x0, y0):
    """Score the feature subset ``slices`` on ``x0`` / ``y0``.

    Subsets with at most one entry score ``-inf``. Otherwise the subset is
    expanded with ``self.feature_unfold``, the matching columns of ``x0``
    are used to fit ``self.estimator``, and the score is the mean of the
    estimator's ``best_score_`` when it exposes one, else the mean 5-fold
    ``cross_val_score``.
    """
    slices = list(slices)
    if len(slices) <= 1:
        return -np.inf

    columns = self.feature_unfold(slices)
    data_x0 = x0[:, columns]
    self.estimator.fit(data_x0, y0)

    if hasattr(self.estimator, 'best_score_'):
        return np.mean(self.estimator.best_score_)

    fold_scores = cross_val_score(self.estimator, data_x0, y0, cv=5)
    return np.mean(fold_scores)
def autoencoder_dim_tuning_graph():
    '''Plot cross-validation errors of the autoencoder per layer size.

    For each first-layer size, an Autoencoder is cross-validated (5 shuffled
    folds) for every latent size from 4 up to that first-layer size; one
    semilog curve per first-layer size is drawn and the collected errors
    are printed at the end.
    '''
    data = read_atoms_data()
    scaledData = data / 10 - 0.5
    kFold = KFold(n_splits=5, shuffle=True)

    errors = []
    # Currently restricted to a single first-layer size; a wider sweep
    # (range(6, 16)) was left disabled in the original.
    for first_dim in range(4, 5):
        print('LAYER 1 DIMENSIONALITY: ', first_dim)
        latent_dims = range(4, first_dim + 1)
        row = []
        errors.append(row)
        for latent_dim in latent_dims:
            auto = Autoencoder(hiddenDims=[first_dim, latent_dim])
            fold_scores = cross_val_score(auto, scaledData, cv=kFold)
            row.append(-10.0 * np.mean(fold_scores))
        plt.semilogy(latent_dims, row, label=first_dim)
    print(errors)
def eval_linear(data_set, test_size=0.4):
    """Cross-validate a linear regression, then plot a single train/test fit.

    Args:
        data_set: Object whose ``load_training_data()`` returns the features
            and the targets as an ``(x, y)`` pair.
        test_size: Forwarded to ``train_test_split`` as ``test_size``; only
            affects the visualisation part.

    Prints the negated ``neg_mean_squared_error`` cross-validation scores on
    the full data, the coefficients and repr of a model fitted on the train
    split, and shows one scatter figure each for the train and test split.
    """
    x, y = data_set.load_training_data()

    # Cross-validation on the full data set.
    cv_model = LinearRegression(normalize=True)
    cv_scores = cross_val_score(cv_model, x, y,
                                scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-cv_scores))

    # Separate fit on a fixed-seed split, used only for the plots below.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=0)
    model = LinearRegression(normalize=True).fit(x_train, y_train)
    print(model.coef_)
    pprint(model)

    # One figure per split: true targets against predictions.
    splits = (('train', x_train, y_train), ('test', x_test, y_test))
    for title, features, targets in splits:
        predicted = model.predict(features)
        plt.figure()
        plt.title(title)
        plt.scatter(targets, predicted)
    plt.show()
("Gradient Boosting", GradientBoostingClassifier()), ("Extra Trees", ExtraTreesClassifier()), # ("SVM", SVC(kernel="linear")), ("XGBOOST Classifer", XGBClassifier()), ] ## Model comparison ### start = timeit.default_timer() accuracies = [] for name, model in models: # kfold = model_selection.KFold(n_splits=10) cv_results = model_selection.cross_val_score(model, X, y, cv=5) precision = cross_val_score(model, X, y, cv=5, scoring="precision") recall = cross_val_score(model, X, y, cv=5, scoring="recall") f1 = cross_val_score(model, X, y, cv=5, scoring="f1") print( "\n ### Classifier :", name, " ###", "\nAccuracy :", cv_results.mean(), "\nprecision :", precision.mean(), "\nRecall :", recall.mean(), "\nF1 Score :", f1.mean(),
def cross_validation_leave_one(self):
    """Return the ``cross_val_score`` result for a ``LeaveOneOut`` splitter.

    The scores are computed for ``self.model`` on ``self.x`` / ``self.y``.
    """
    splitter = LeaveOneOut()
    scores = cross_val_score(self.model, self.x, self.y, cv=splitter)
    return scores
def test_cross_validation_leave_one_out(self):
    """Compare the mean leave-one-out score with scikit-learn's reference.

    The reference is ``cross_val_score`` over ``LeaveOneOut`` for a fresh
    ``KNeighborsClassifier`` built with ``self.neighbors``.
    """
    reference = cross_val_score(
        KNeighborsClassifier(n_neighbors=self.neighbors),
        self.x,
        self.y,
        cv=LeaveOneOut(),
    )
    returned = self.classifier.cross_validation_leave_one()

    reference_mean = reference.mean()
    returned_mean = returned.mean()
    self.assertAlmostEqual(reference_mean, returned_mean)
'Sex': train_df['Sex'].astype('int32'), 'Age': train_df['Age'].astype('int32'), 'Embarked': train_df['Embarked'].astype('int32') } result_df = pd.DataFrame(resultData) #Task1 Q3################################################################################################################## resultDecisionTree = DecisionTreeClassifier(criterion='gini') X = result_df.drop('Survived', axis=1) y = result_df['Survived'] resultDecisionTree.fit(X, y) fig = plt.figure(figsize=(35, 30)) plot_tree(resultDecisionTree, filled=True) #plt.show() #Task1 Q4################################################################################################################## clf = DecisionTreeClassifier() scoresforDTC = cross_val_score(clf, X=X, y=y) print("Average score of DTC:", scoresforDTC.mean()) #Task1 Q5################################################################################################################## rf = ensemble.RandomForestClassifier() scoresforRFC = cross_val_score(rf, X=X, y=y) print("Average score for RFC:", scoresforRFC.mean()) #HW1 As a reference######################################################################################################## # validPassenger=[0 for i in range(len(train_df.columns))] # #Q7########################################################### # for i in [5,6,7,9]:#Age, SibSp, Parch, Fare # validPassenger[i] = list(filter(lambda x: not pd.isnull(x), train_df[train_df.columns[i]])) # print(train_df.columns[i]) # print('count ', len(validPassenger[i])) # print('mean ', sum(validPassenger[i])/len(validPassenger[i])) # print('std ', np.std(validPassenger[i]))
# Score a decision tree on the iris data with 3-fold cross-validation and
# print the per-fold accuracies and their mean, rounded to 4 decimals.
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
# Import from the public package: `sklearn.model_selection._validation` is a
# private module whose location is not part of the supported API.
from sklearn.model_selection import cross_val_predict, cross_val_score

iris = load_iris()
# Fixed seed so the fitted trees are reproducible between runs.
dt_clf = DecisionTreeClassifier(random_state=156)

data = iris.data
label = iris.target

scores = cross_val_score(dt_clf, data, label, scoring='accuracy', cv=3)
print(np.round(scores, 4))
print(np.round(np.mean(scores), 4))
# Cross-validation evaluation of a decision tree with k-fold splitting.
import urllib.request

import numpy as np
from sklearn import preprocessing
# Public entry point; `sklearn.model_selection._validation` is private.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data"

# download the file and load the CSV as a numpy matrix; the `with` block
# closes the HTTP response once parsing has finished
with urllib.request.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=',')

# separate the data from the target attributes
# NOTE(review): the slice 1:-1 also leaves column 0 out of the features;
# confirm that is intentional.
X = dataset[:, 1:-1]
y = dataset[:, -1]

# normalize the data attributes
# NOTE(review): normalized_X is computed but the model below is scored on
# the raw X.
normalized_X = preprocessing.normalize(X)

model = DecisionTreeClassifier()
score = cross_val_score(estimator=model, X=X, y=y, cv=10)
print(score)
# CORRELATION MATRIX corr_matrix(X) # PCA X_pca = show_PCA(X) # LDA X_lda = show_LDA(X) # Support Vector Machines C = [0.001, 0.01, 1, 10, 100] for c in C: svm = SVC(kernel='linear', C=c) print("Values of C = ", c) print("X_stand avg_accuracy_score", np.mean(cross_val_score(svm, X, y, cv=5, scoring="accuracy"))) print("X_PCA avg_accuracy_score", np.mean(cross_val_score(svm, X_pca, y, cv=5, scoring="accuracy"))) print("X_LDA avg_accuracy_score", np.mean(cross_val_score(svm, X_lda, y, cv=5, scoring="accuracy")), "\n") X_lda_train, X_lda_test, y_lda_train, y_lda_test = train_test_split( X_lda, y, test_size=0.2) svm = SVC(kernel='linear', C=10) svm.fit(X_lda_train, y_lda_train) getContourImage(svm, X_lda_test) # KNN K = [1, 3, 5, 7] for k in K:
train_df.update(updatedEmbarkedData)

# Pearson correlation between the columns of train_df.
corr_matrix = train_df.corr(method='pearson')

# Assemble the modelling frame: the target and Pclass unchanged, the
# remaining columns cast to int32.
resultData = {
    'Survived': train_df['Survived'],
    'Pclass': train_df['Pclass'],
}
resultData.update({
    name: train_df[name].astype('int32')
    for name in ('Sex', 'Age', 'Fare', 'Embarked')
})
result_df = pd.DataFrame(resultData)

y = result_df['Survived']
X = result_df.drop(columns=['Survived'])

# The same SVC (C=1) with three different kernels; the mean of the default
# cross-validation scores is printed for each in turn.
SVCClf1 = SVC(C=1, kernel='linear')
SVCClf2 = SVC(C=1, kernel='poly')
SVCClf3 = SVC(C=1, kernel='rbf')

scores1 = cross_val_score(SVCClf1, X=X, y=y)
print(scores1.mean())

scores2 = cross_val_score(SVCClf2, X=X, y=y)
print(scores2.mean())

scores3 = cross_val_score(SVCClf3, X=X, y=y)
print(scores3.mean())
predict = model.predict(x) plt.scatter(y, predict) plt.show() if __name__ == '__main__': data_set = DataSet('cepp') x, y = data_set.load_training_data() x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, shuffle=True, random_state=0) regr = linear_model.LinearRegression(normalize=True) #regr = linear_model.Ridge(alpha=0.001, normalize=True) regr.fit(x_train, y_train) predict = regr.predict(x_test) scores = cross_val_score(regr, x_test, y_test, scoring='neg_mean_squared_error') print(scores.mean()) plt.scatter(y_test, predict) plt.show()
kernel='rbf', C=32, gamma=8, ) print("K-Folds scores:") originalclass = [] predictedclass = [] def classification_report_with_accuracy_score(y_true, y_pred): originalclass.extend(y_true) predictedclass.extend(y_pred) return accuracy_score(y_true, y_pred) # return accuracy score #inner_cv = StratifiedKFold(n_splits=10) outer_cv = StratifiedKFold(n_splits=10) # Nested CV with parameter optimization nested_score = cross_val_score( clf, X=X, y=y, cv=outer_cv, scoring=make_scorer(classification_report_with_accuracy_score)) # Average values in classification report for all folds in a K-fold Cross-validation print(classification_report(originalclass, predictedclass)) print("10 folds processing seconds: {}".format(time() - start))
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation. The first positional argument of KFold is
# the number of splits, so passing len(X_train) asked for one fold per
# sample instead of the intended five.
# NOTE(review): assumes KFold is sklearn.model_selection.KFold - confirm
# against the file's imports.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
scores = _validation.cross_val_score(cv=kf,
                                     estimator=clf,
                                     X=X_train,
                                     y=y_train,
                                     scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Gather predictions
predictions = _validation.cross_val_predict(cv=kf,
                                            estimator=clf,
                                            X=X_train,
                                            y=y_train)

# These module-level names mirror the `metrics` functions they are computed
# with; kept unchanged because later code may read them.
accuracy_score = metrics.accuracy_score(y_train, predictions)
print('accuracy score: ' + str(accuracy_score))

confusion_matrix = metrics.confusion_matrix(y_train, predictions)