def tune_train_test(X, y):
    """Estimate RBF-SVM performance with outer/inner (nested) cross-validation.

    For every outer stratified fold, a coarse grid search over ``C`` and
    ``gamma`` is run on the training partition, followed by a narrow grid
    search centred (in log2 space) on the coarse optimum.  An SVC with the
    narrow optimum is then trained on the whole training partition and
    evaluated on the held-out partition.

    Parameters:
        X: feature array, indexable by the index arrays the splitter yields.
        y: label array aligned with ``X``.

    Returns:
        A tuple ``(mean, std)`` of the per-fold rows produced by
        ``util.get_gmean`` (three columns; per the original comment these are
        gmean, SE and SP -- TODO confirm against ``util.get_gmean``).
    """
    # Utilize the so-called outer-inner-cv (Batuwitage, p55)
    outer_kfold = 5
    inner_kfold = 5
    gmeans = np.zeros(shape=(outer_kfold, 3))  # column: gmean, SE, SP

    # Exponent offsets explored around the coarse optimum: -0.75 .. 0.75.
    narrow_offsets = [float(j) / 100 for j in range(-75, 100, 25)]

    # Each outer fold holds out 1/outer_kfold of the data for testing.
    skf = cv.StratifiedKFold(y, n_folds=outer_kfold)
    for outer_iter_idx, (tr_idx, te_idx) in enumerate(skf):
        print('Outer loop iter-th= %d --------------------'
              % (outer_iter_idx + 1))
        X_tr, X_te = X[tr_idx], X[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]

        # Inner loop: a _coarse_ grid-search k-fold CV
        # (Batuwitage, phdthesis, p55).
        coarse_param_space = {'C': [2 ** i for i in range(-15, 0)],
                              'gamma': [2 ** i for i in range(1, 16)]}
        coarse_grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                          param_grid=coarse_param_space,
                                          cv=inner_kfold)
        coarse_grid_search.fit(X_tr, y_tr)
        # best_params_ holds the winning combination; get_params() would only
        # return the constructor arguments of the untouched base estimator.
        primary_C = coarse_grid_search.best_params_['C']
        primary_gamma = coarse_grid_search.best_params_['gamma']

        # Inner loop: a _narrow_ grid search around the coarse optimum.  The
        # offsets are exponents, so they are added to log2 of the optimum.
        log2_C = np.log2(primary_C)
        log2_gamma = np.log2(primary_gamma)
        narrow_param_space = {
            'C': [2 ** (log2_C + d) for d in narrow_offsets],
            'gamma': [2 ** (log2_gamma + d) for d in narrow_offsets],
        }
        narrow_grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                          param_grid=narrow_param_space,
                                          cv=inner_kfold)
        narrow_grid_search.fit(X_tr, y_tr)
        secondary_C = narrow_grid_search.best_params_['C']
        secondary_gamma = narrow_grid_search.best_params_['gamma']

        # Train a new SVM on the complete training partition with the
        # secondary parameters, then test it on the held-out partition.
        clf = svm.SVC(kernel='rbf', C=secondary_C, gamma=secondary_gamma)
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        gmeans[outer_iter_idx, :] = util.get_gmean(
            confusion_matrix(y_te, y_pred))

    return (np.mean(gmeans, axis=0), np.std(gmeans, axis=0))
def train_svm_classifier(debug=False, n_jobs=-1):
    """Tune, evaluate and persist a linear SVM classifier.

    Loads the data through ``generate_tr_te_data()``, grid-searches
    ``LinearSVC``'s ``C`` with 5-fold CV, optionally saves a learning curve
    (``debug=True``), refits a ``LinearSVC`` with the selected ``C`` on the
    training data, prints its test score and parameters, and dumps it to
    ``config.model_path``.

    Parameters:
        debug: when true, plot and save ``learning_curve.png``.
        n_jobs: forwarded to the grid search and the learning-curve helper.
    """
    print('Start to load data')
    tr_mat, tr_labels, te_mat, te_labels = generate_tr_te_data()
    print('Data loaded')

    print('Training a linear SVM model')
    folds = KFold(tr_mat.shape[0], n_folds=5)
    c_candidates = np.logspace(-10, 0, 10)
    search = GridSearchCV(estimator=LinearSVC(),
                          cv=folds,
                          n_jobs=n_jobs,
                          param_grid={'C': c_candidates},
                          verbose=10)
    search.fit(tr_mat, tr_labels)
    best_c = search.best_estimator_.C

    if debug:
        # Debug aid: learning curve for the selected C.
        title = 'Learning Curves(SVM, linear kernel, $C=%.6f$)' % best_c
        plot_learning_curve(LinearSVC(C=best_c), title, tr_mat, tr_labels,
                            cv=folds, n_jobs=n_jobs)
        plt.savefig('learning_curve.png')

    print('Testing linear SVM model')
    print('Evaluate on test set')
    final_model = LinearSVC(C=best_c)
    final_model.fit(tr_mat, tr_labels)
    score = final_model.score(te_mat, te_labels)
    print('score: {}'.format(score))
    print(final_model.get_params())

    joblib.dump(final_model, config.model_path)
    print('Classifier saved to {}'.format(config.model_path))
def optimize(svr):
    """Wrap *svr* in a grid search over kernel and C and return the search.

    The returned ``GridSearchCV`` is NOT fitted here; the caller must call
    ``fit`` on it.

    NOTE(review): the line printed under "Best Parameters:" is the search
    object's constructor configuration (``get_params()``), because no fit has
    happened yet -- the tuned values only exist after ``fit``.
    """
    search_space = {
        'kernel': ('rbf', 'linear', 'poly'),
        'C': tuple(range(1, 10)),
    }
    clf = GridSearchCV(svr, search_space)
    print('Best Parameters: %s' % (clf.get_params(),))
    return clf
def tune_train_test(X, y):
    """Estimate RBF-SVM performance with outer/inner (nested) cross-validation.

    For every outer stratified fold, a coarse grid search over ``C`` and
    ``gamma`` is run on the training partition, followed by a narrow grid
    search centred (in log2 space) on the coarse optimum.  An SVC with the
    narrow optimum is then trained on the whole training partition and
    evaluated on the held-out partition.

    Parameters:
        X: feature array, indexable by the index arrays the splitter yields.
        y: label array aligned with ``X``.

    Returns:
        A tuple ``(mean, std)`` of the per-fold rows produced by
        ``util.get_gmean`` (three columns; per the original comment these are
        gmean, SE and SP -- TODO confirm against ``util.get_gmean``).
    """
    # Utilize the so-called outer-inner-cv (Batuwitage, p55)
    outer_kfold = 5
    inner_kfold = 5
    gmeans = np.zeros(shape=(outer_kfold, 3))  # column: gmean, SE, SP

    # Exponent offsets explored around the coarse optimum: -0.75 .. 0.75.
    narrow_offsets = [float(j) / 100 for j in range(-75, 100, 25)]

    # Each outer fold holds out 1/outer_kfold of the data for testing.
    skf = cv.StratifiedKFold(y, n_folds=outer_kfold)
    for outer_iter_idx, (tr_idx, te_idx) in enumerate(skf):
        print('Outer loop iter-th= %d --------------------'
              % (outer_iter_idx + 1))
        X_tr, X_te = X[tr_idx], X[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]

        # Inner loop: a _coarse_ grid-search k-fold CV
        # (Batuwitage, phdthesis, p55).
        coarse_param_space = {'C': [2 ** i for i in range(-15, 0)],
                              'gamma': [2 ** i for i in range(1, 16)]}
        coarse_grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                          param_grid=coarse_param_space,
                                          cv=inner_kfold)
        coarse_grid_search.fit(X_tr, y_tr)
        # best_params_ holds the winning combination; get_params() would only
        # return the constructor arguments of the untouched base estimator.
        primary_C = coarse_grid_search.best_params_['C']
        primary_gamma = coarse_grid_search.best_params_['gamma']

        # Inner loop: a _narrow_ grid search around the coarse optimum.  The
        # offsets are exponents, so they are added to log2 of the optimum.
        log2_C = np.log2(primary_C)
        log2_gamma = np.log2(primary_gamma)
        narrow_param_space = {
            'C': [2 ** (log2_C + d) for d in narrow_offsets],
            'gamma': [2 ** (log2_gamma + d) for d in narrow_offsets],
        }
        narrow_grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                          param_grid=narrow_param_space,
                                          cv=inner_kfold)
        narrow_grid_search.fit(X_tr, y_tr)
        secondary_C = narrow_grid_search.best_params_['C']
        secondary_gamma = narrow_grid_search.best_params_['gamma']

        # Train a new SVM on the complete training partition with the
        # secondary parameters, then test it on the held-out partition.
        clf = svm.SVC(kernel='rbf', C=secondary_C, gamma=secondary_gamma)
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        gmeans[outer_iter_idx, :] = util.get_gmean(
            confusion_matrix(y_te, y_pred))

    return (np.mean(gmeans, axis=0), np.std(gmeans, axis=0))
def tune_nuSVC(X_tr, y_tr):
    """Grid-search NuSVC hyper-parameters and return the tuned classifier.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    clf = NuSVC()
    param_space = {
        'nu': [0.3, 0.5, 0.8],
        'kernel': ['rbf'],
        'degree': [3, 5],
        'gamma': [0.0, 0.5],
    }
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def eval(input_x, input_y, test_x, test, label, write_folder=None):
    """Fit a grid-searched SVR, score test sentences and evaluate the summary.

    Every sentence reachable through ``item.thread[*].sentences`` for each
    item in ``test`` gets ``predict_score`` set from the model's predictions,
    consumed in iteration order.  For each item, the indices of the
    highest-scoring ~30% of sentences are collected, and the selection is
    scored with ``weightRecall`` and ROUGE-L F.

    Parameters:
        input_x, input_y: training features and targets for the SVR.
        test_x: feature rows to predict; assumed to be ordered exactly like
            the sentence traversal of ``test`` -- TODO confirm with caller.
        test: iterable of items exposing ``thread`` -> ``sentences``.
        label: unused here; kept for interface compatibility.
        write_folder: forwarded to ``weightRecall``.

    Returns:
        ``(score, rouge_score)``.
    """
    tuned_parameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    ]
    grid_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters)
    grid_clf.fit(input_x, input_y)
    print("params : \t")
    # Report the selected hyper-parameters rather than the search's own
    # constructor configuration (which is what get_params() returns).
    print(grid_clf.best_params_)
    result = grid_clf.predict(test_x)
    # (Earlier experiments with python_weka and a plain SVR were left
    # commented out at this point in the original.)

    score_index = 0
    produce_set = []
    for item in test:
        selected = []
        produce_set.append(selected)
        score_list = []
        index_list = []
        for thread in item.thread:
            for sentence in thread.sentences:
                sentence.predict_score = result[score_index]
                score_index += 1
                score_list.append(sentence.predict_score)
                index_list.append(sentence.index)

        # presumably sorted_index returns positions ordered by ascending
        # score, so reading from the end yields the best first -- verify.
        order = sorted_index(score_list)
        total = len(index_list)
        limit = float(total) * 0.3
        for rank in range(total):
            if rank >= limit:
                break
            selected.append(index_list[order[total - rank - 1]])

    score = weightRecall(test, produce_set, write_folder)
    print(score)
    rouge_eval = rouge(test, produce_set)
    rouge_score = rouge_eval.eval()['rouge_l_f_score']
    print(rouge_score)
    return score, rouge_score
def set_model_parameters(self, model_name, verbose=3, file_path=""):
    """Tune a classifier by grid search and store the best one on ``self``.

    Parameters:
        model_name: one of "LinearSVC", "SVC" or "LogisticRegression".
            Ignored when ``self.model`` is already set; the class name of
            that model is used instead.
        verbose: forwarded to ``GridSearchCV``; values > 0 also print a
            summary of the search.
        file_path: when non-empty, prefix of the file the best estimator is
            pickled to (``file_path + model_name + "AI_Semi.bin"``).

    Returns:
        The best estimator, also stored in ``self.model``.

    Raises:
        ValueError: if the model name is not supported.
    """
    if self.model is not None:
        model_name = self.model.__class__.__name__

    if model_name == "LinearSVC":
        model_to_set = LinearSVC()
        parameters = {"C": [1, 2, 4, 8], "loss": ["l1", "l2"]}
    elif model_name == "SVC":
        model_to_set = OneVsRestClassifier(SVC(kernel="poly"))
        parameters = {
            "estimator__C": [1, 2, 4, 8],
            "estimator__kernel": ["poly", "rbf"],
            "estimator__degree": [1, 2, 3, 4],
        }
    elif model_name == "LogisticRegression":
        model_to_set = LogisticRegression()
        parameters = {"penalty": ["l1", "l2"], "C": [1, 2, 4, 8]}
    else:
        raise ValueError("Invalid model name.")

    def f1_scorer(estimator, features, labels):
        # GridSearchCV calls ``scoring`` as scorer(estimator, X, y); the raw
        # metric f1_score(y_true, y_pred) does not have that signature.
        return f1_score(labels, estimator.predict(features))

    # Perform Grid Search with 10-fold cross-validation to estimate the
    # parameters.
    cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
    model_tunning = GridSearchCV(model_to_set,
                                 param_grid=parameters,
                                 scoring=f1_scorer,
                                 n_jobs=1,
                                 cv=cv_generator,
                                 verbose=verbose)

    # Perform parameter setting
    model_tunning.fit(self.train_feats,
                      self.label_encoder.transform(self.train_labels))

    if verbose > 0:
        print("Best model:")
        print(model_tunning.best_estimator_)
        print("Best parameters:")
        print(model_tunning.best_params_)
        print("Best score {}:".format(f1_score.__name__))
        print(model_tunning.best_score_)

    if file_path != "":
        file_name = file_path + model_name + "AI_Semi.bin"
        if verbose > 0:
            print("Saving best model {}...".format(file_name))
        # The context manager closes the file even if pickling fails.
        with open(file_name, "wb") as tunned_model_file:
            cPickle.dump(model_tunning.best_estimator_, tunned_model_file)

    self.model = model_tunning.best_estimator_
    return self.model
def tune_NearestNeigbours(X_tr, y_tr):
    """Grid-search a ball-tree KNeighborsClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_NearestNeigbours(X_tr, y_tr):')
    clf = KNeighborsClassifier(algorithm='ball_tree')
    # Original author's note: 'euclidean' was considered unusable with the
    # ball_tree algorithm (only with brute) -- NOTE(review): unverified.
    param_space = {'n_neighbors': [3, 5, 10],
                   'leaf_size': [20, 30, 50],
                   'metric': ['minkowski']}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_GradientBoosting(X_tr, y_tr):
    """Grid-search a GradientBoostingClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_GradientBoosting(X_tr, y_tr):')
    clf = GradientBoostingClassifier()
    param_space = {'n_estimators': [50, 100, 200],
                   'learning_rate': [0.1, 0.3],
                   'max_features': [0.3, 0.5]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_XRandomizedTrees(X_tr, y_tr):
    """Grid-search an ExtraTreesClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_XRandomizedTrees(X_tr, y_tr):')
    clf = ExtraTreesClassifier()
    param_space = {'n_estimators': [10, 30, 70],
                   'criterion': ['gini', 'entropy'],
                   'max_features': [0.3, 0.5]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_GradientBoosting(X_tr, y_tr):
    """Grid-search a GradientBoostingClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_GradientBoosting(X_tr, y_tr):')
    clf = GradientBoostingClassifier()
    param_space = {'n_estimators': [50, 100, 200],
                   'learning_rate': [0.1, 0.3],
                   'max_features': [0.3, 0.5]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_NearestNeigbours(X_tr, y_tr):
    """Grid-search a ball-tree KNeighborsClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_NearestNeigbours(X_tr, y_tr):')
    clf = KNeighborsClassifier(algorithm='ball_tree')
    # Original author's note: 'euclidean' was considered unusable with the
    # ball_tree algorithm (only with brute) -- NOTE(review): unverified.
    param_space = {'n_neighbors': [3, 5, 10],
                   'leaf_size': [20, 30, 50],
                   'metric': ['minkowski']}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_XRandomizedTrees(X_tr, y_tr):
    """Grid-search an ExtraTreesClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_XRandomizedTrees(X_tr, y_tr):')
    clf = ExtraTreesClassifier()
    param_space = {'n_estimators': [10, 30, 70],
                   'criterion': ['gini', 'entropy'],
                   'max_features': [0.3, 0.5]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_SGD(X_tr, y_tr):
    """Grid-search an SGDClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_SGD(X_tr, y_tr):')
    clf = SGDClassifier()
    param_space = {'loss': ['hinge', 'log', 'modified_huber'],
                   'penalty': ['l2', 'l1'],
                   'alpha': [0.0001, 0.001]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_SGD(X_tr, y_tr):
    """Grid-search an SGDClassifier and return the tuned model.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_SGD(X_tr, y_tr):')
    clf = SGDClassifier()
    param_space = {'loss': ['hinge', 'log', 'modified_huber'],
                   'penalty': ['l2', 'l1'],
                   'alpha': [0.0001, 0.001]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def tune_nuSVC(X_tr, y_tr):
    """Grid-search NuSVC hyper-parameters and return the tuned classifier.

    Parameters:
        X_tr: training features.
        y_tr: training labels.

    Returns:
        The best estimator found by 10-fold grid search, refitted on all of
        ``X_tr``/``y_tr``.
    """
    print('tune_nuSVC(X_tr, y_tr):')
    clf = NuSVC()
    param_space = {'nu': [0.3, 0.5, 0.8],
                   'kernel': ['poly', 'rbf', 'sigmoid'],
                   'degree': [3, 5],
                   'gamma': [0.0, 0.5]}
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    # get_params()['estimator'] is the untouched, default-configured input
    # estimator; the tuned model lives in best_estimator_.
    return grid_search.best_estimator_
def runDecisionTreewithAdaboost(Y, Xtrain, Xtest, isBlind):
    """Tune AdaBoost over decision trees, log diagnostics, write a submission.

    Features are first reduced with ``reduceFeatureswithExtraTrees``; an
    ``AdaBoostClassifier`` with a ``DecisionTreeClassifier`` base estimator
    is then grid-searched with ``f1_weighted`` scoring.

    Parameters:
        Y: training labels.
        Xtrain: training features.
        Xtest: features the submission is produced for.
        isBlind: forwarded to ``createSubmission``.
    """
    # Note time to run this setup
    run_start = time.time()

    # Reduce feature based on importance
    (Xtrain, Xtest) = reduceFeatureswithExtraTrees(Y, Xtrain, Xtest)

    model_start = time.time()

    # Specify parameters for GridSearch
    param_grid = {
        'base_estimator__criterion': ["gini", "entropy"],
        'base_estimator__max_depth': [8, 10],
        'base_estimator__max_features': ['sqrt', 0.25],
        'n_estimators': [25, 30],
        'learning_rate': [0.8, 1.0],
    }

    dtc = DecisionTreeClassifier(random_state=11,
                                 max_features="auto",
                                 class_weight="balanced")
    abc = AdaBoostClassifier(base_estimator=dtc, algorithm="SAMME.R")

    # run grid search
    abc_tuned = GridSearchCV(abc, param_grid=param_grid,
                             scoring='f1_weighted')

    # Fit the model
    abc_tuned.fit(Xtrain, Y)
    model_end = time.time()

    logger.info('Time to run Gridsearch with AdaBoost(DecisionTree): %0.3fs'
                % (model_end - model_start))
    # Log the selected hyper-parameters; get_params() would log the search
    # object's constructor configuration instead of the chosen model.
    logger.info('Model params = %s' % abc_tuned.best_params_)
    # NOTE(review): both diagnostics below are computed on the training data.
    logger.info('AUC per class = %s'
                % getAUCByClass(abc_tuned, Xtrain, Y, classes=[1, 2, 3, 4]))
    logger.info('F1 Score per class = %s'
                % getF1ScoreByClass(abc_tuned, Xtrain, Y,
                                    classes=[1, 2, 3, 4]))

    # Create submission file
    createSubmission(abc_tuned, Xtest, isBlind)

    # Note the end time
    run_end = time.time()
    logger.info('Time to run analysis(AdaBoost): %0.3fs'
                % (run_end - run_start))
def find_params(model, data, labels, param_grid=None, test_frac=0.6,
                seed=500):
    '''
    Use a grid search to determine the optimum parameters for the given model.

    Parameters:
        model: estimator to tune.
        data: feature array.
        labels: targets aligned with ``data``.
        param_grid: grid passed to ``GridSearchCV``; ``None`` means an empty
            grid (the model is evaluated with its current settings).
        test_frac: fraction of the samples held out for scoring.
        seed: random state of the train/test split.

    Returns:
        ``(pars, score)``: the best parameter combination found on the
        training split and the tuned model's score on the held-out split.
    '''
    if param_grid is None:
        param_grid = {}

    # Splitting two arrays yields four results: train/test for each of them.
    train_data, test_data, train_labels, test_labels = \
        train_test_split(data, labels, test_size=test_frac,
                         random_state=seed)

    clf = GridSearchCV(model, param_grid)
    clf.fit(train_data, train_labels)

    score = clf.score(test_data, test_labels)
    pars = clf.best_params_

    return pars, score
def find_params(model, data, labels, param_grid=None, test_frac=0.6,
                seed=500):
    '''
    Use a grid search to determine the optimum parameters for the given model.

    Parameters:
        model: estimator to tune.
        data: feature array.
        labels: targets aligned with ``data``.
        param_grid: grid passed to ``GridSearchCV``; ``None`` means an empty
            grid (the model is evaluated with its current settings).
        test_frac: fraction of the samples held out for scoring.
        seed: random state of the train/test split.

    Returns:
        ``(pars, score)``: the best parameter combination found on the
        training split and the tuned model's score on the held-out split.
    '''
    if param_grid is None:
        param_grid = {}

    # Splitting two arrays yields four results: train/test for each of them.
    train_data, test_data, train_labels, test_labels = \
        train_test_split(data, labels, test_size=test_frac,
                         random_state=seed)

    clf = GridSearchCV(model, param_grid)
    clf.fit(train_data, train_labels)

    score = clf.score(test_data, test_labels)
    pars = clf.best_params_

    return pars, score
# Grid for the GaussianNB pipeline: optional min-max scaling and the number
# of features kept by the selector step.
param_grid = {
    'scaler': [None, preprocessing.MinMaxScaler()],
    'selector__k': [10, 15, 'all'],
}
clf_gnb_grd = GridSearchCV(pipe, param_grid, scoring='f1')

# ptest_classifier(clf_gnb_grd, my_dataset, features_list_all)
tester.dump_classifier_and_data(clf_gnb_grd, my_dataset,
                                features_list_eng_all)
tester.main()

## Features and the Feature score of the final model
# NOTE(review): clf_gnb_grd is not fitted in this fragment, so get_params()
# below shows the search configuration, not tuned values.
print('')
print("*********************************************************************************************************************************")
print("GaussianNB parameters")
print(clf_gnb_grd.get_params())
print("*********************************************************************************************************************************")

# Disabled in the original (it was wrapped in a triple-quoted string):
#   print
#   print "****...****"
#   print "Features and the Feature score of the final model"
#   gt_kbest_scores = sorted(zip(selector.scores_, features_list_eng_all[:10]), reverse=True)
#   print gt_kbest_scores
#   print "****...****"

# # Performance Tuning and Validation of DecisionTree model
# Scale both partitions.
# NOTE(review): the scaler is re-fitted on each partition separately, so the
# two sets are not scaled with the same statistics -- confirm this is wanted.
X_test = scaler.fit_transform(X_test)
X_train = scaler.fit_transform(X_train)

# Hyper-parameter search space for the RBF SVM (powers of two).
C_range = np.logspace(-5, 15, 11, base=2)
gamma_range = np.logspace(-15, 3, 10, base=2)
param_grid = {'gamma': gamma_range, 'C': C_range}

# Find the best parameters for SVM.
# NOTE(review): the search runs on X_test/y_test and the selected model is
# cross-validated on X_train/y_train below -- the roles look swapped relative
# to the variable names; confirm.
cv_1 = StratifiedKFold(y_test, n_folds=10, random_state=12)
grid = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid=param_grid,
    cv=cv_1,
    scoring='accuracy',
    n_jobs=320,
    verbose=2,
)
print("GridSearchCV line... done")
grid.fit(X_test, y_test)
print("grid.fit() line... done")

print(grid.get_params())
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
print(grid.best_estimator_)

# Cross-validate the selected estimator on the other partition.
clf = grid.best_estimator_
cv_2 = StratifiedKFold(y_train, n_folds=10, random_state=12)
scores = cross_val_score(clf, X_train, y_train, cv=cv_2,
                         scoring='accuracy', n_jobs=n_jobs_)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(scores)

# Number of samples predicted as class 1 across the CV folds.
predicted_test_cross = cross_val_predict(clf, X_train, y_train, cv=cv_2,
                                         n_jobs=n_jobs_)
print(np.count_nonzero(predicted_test_cross == 1, axis=0))
'epsilon': [0.0], 'gamma': [0.01], 'kernel': ['rbf'] }, ] # {'kernel': 'rbf', 'C': 1, 'verbose': False, 'degree': 3, 'epsilon': 0.0, 'shrinking': True, 'max_iter': -1, # 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 0.01} # () model = GridSearchCV(SVR(), param_grid, n_jobs=2, cv=5) model.fit(features_train, sval_train) model = model.best_estimator_ print('Model params:') print(model.get_params()) print() # cross validation results from sklearn.cross_validation import cross_val_score scores = cross_val_score(model, features_train, sval_train, cv=5) scores_mse = cross_val_score(model, features_train, sval_train, cv=5, scoring='mean_squared_error') print('Cross validation:')
def skSVM(X, y, scoring, tuned_parameters, data_parameters, cv_parameters):
    '''Grid-search an SVC on a training split and report its performance.

    DESCRIPTION: Takes the already-processed data set and the relevant
        parameters, splits it into training and testing partitions, grid
        searches an SVC on the training partition, cross-validates the best
        model for all score types, tests it on the testing partition, and
        prints the results to the terminal.

    PRECONDITIONS:
        X. The independent variable dataset: a numpy float array that is
            already preprocessed (per the function process in the guiTK
            module). No empty values allowed.
        y. The dependent variable dataset: a numpy float array, already
            preprocessed, with the same number of elements as rows in X.
            No empty values allowed. Binary categories (0 and 1) only, with
            at least one of each.
        scoring. One of the standard scikit-learn scoring values
            (ex. accuracy, precision ...).
        tuned_parameters. The standard scikit-learn tuned_parameters list of
            dictionaries (ex. kernel, ...).
        data_parameters. Dictionary of the data options; the keys read here
            are 'stratify', 'testSize' and 'random'. Generated by process.
        cv_parameters. Dictionary of the cross-validation options; the keys
            read here are 'folds', 'cvType' and, depending on the type,
            'nIter', 'testSize' and 'p'. Generated by process.

    POSTCONDITIONS:
        Prints the best estimator, cross-validation results on the training
        data, and prediction metrics from the testing data.

    RAISES: ValueError if cv_parameters['cvType'] is not a supported type.

    RETURN: None.
    '''
    # Users have the option of stratifying data, i.e. keeping the pass/fail
    # proportions in both testing and training sets. Uses cross-validation
    # splitters instead of the simple test-split function; only the first
    # fold is used as the train/test partition.
    n_splits = int(1. / data_parameters['testSize'])
    seed = data_parameters['random']
    if data_parameters['stratify']:
        split = StratifiedKFold(y=y, n_folds=n_splits, shuffle=True,
                                random_state=seed)
    else:
        split = KFold(n=len(y), n_folds=n_splits, shuffle=True,
                      random_state=seed)
    train, test = next(iter(split))
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    print("Starting the gridding process.")
    print('')

    # We must guarantee a valid number of folds: at least two, but no more
    # than the minimum count of either class or than the number of folds
    # that still leaves five samples per fold (hence integer division).
    numLabel1 = np.count_nonzero(y_train)
    numLabel0 = len(y_train) - numLabel1
    numFiveFolds = len(y_train) // 5
    n_folds = max(2, min(cv_parameters['folds'], numLabel0, numLabel1,
                         numFiveFolds))

    # Build the cross-validation splitter for the gridding process based on
    # cv_parameters (from the GUI input).
    cv_type = cv_parameters['cvType']
    if cv_type == 'skf':
        cv = StratifiedKFold(y=y_train, n_folds=n_folds, shuffle=True,
                             random_state=seed)
    elif cv_type == 'kf':
        cv = KFold(n=len(y_train), n_folds=n_folds, shuffle=True,
                   random_state=seed)
    elif cv_type == 'sss':
        cv = StratifiedShuffleSplit(y=y_train,
                                    n_iter=cv_parameters['nIter'],
                                    test_size=cv_parameters['testSize'],
                                    random_state=seed)
    elif cv_type == 'ss':
        cv = ShuffleSplit(n=len(y_train),
                          n_iter=cv_parameters['nIter'],
                          test_size=cv_parameters['testSize'],
                          random_state=seed)
    elif cv_type == 'lolo':
        cv = LeaveOneOut(n=len(y_train))
    elif cv_type == 'lplo':
        cv = LeavePOut(n=len(y_train), p=cv_parameters['p'])
    else:
        # Previously this fell through and failed later on an undefined name.
        raise ValueError("Unsupported cvType: {!r}".format(cv_type))

    # Now grid based on the scoring, tuned parameters and CV class above.
    grid = GridSearchCV(estimator=SVC(), param_grid=tuned_parameters,
                        scoring=scoring, cv=cv)
    grid.fit(X_train, y_train)

    # best_estimator_ is already refitted on the entire training set by the
    # grid search (refit defaults to True); cross-validate it against all
    # scoring types.
    clf = grid.best_estimator_
    accuracy_scores = cross_val_score(clf, X_train, y_train,
                                      scoring='accuracy', cv=cv)
    precision_scores = cross_val_score(clf, X_train, y_train,
                                       scoring='precision', cv=cv)
    recall_scores = cross_val_score(clf, X_train, y_train,
                                    scoring='recall', cv=cv)
    f1_scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=cv)
    y_test_pred = clf.predict(X_test)

    # Print the relevant results to the terminal for
    # 1) The best estimator parameters
    # 2) Cross validation results for the training data and best estimator
    # 3) Prediction metrics on the test data
    print('')
    print("ESTIMATOR SELECTED FOR OPTIMAL {}:".format(str(scoring).upper()))
    print("Parameters:")
    pprint.pprint(grid.best_params_, width=1)
    print('')
    print("Cross-validation scores on training data:")
    print(" Accuracy: {:.1f} +/- {:.1f}%".format(
        accuracy_scores.mean() * 100, accuracy_scores.std() * 100))
    print(" Precision: {:.1f} +/- {:.1f}%".format(
        precision_scores.mean() * 100, precision_scores.std() * 100))
    print(" Recall: {:.1f} +/- {:.1f}%".format(
        recall_scores.mean() * 100, recall_scores.std() * 100))
    print(" F1: {:.1f} +/- {:.1f}%".format(
        f1_scores.mean() * 100, f1_scores.std() * 100))
    print('')
    print("Trained estimator scores on testing data:")
    print(" Accuracy: {:.1f}%".format(
        accuracy_score(y_test, y_test_pred) * 100))
    print(" Precision: {:.1f}%".format(
        precision_score(y_test, y_test_pred, average='weighted') * 100))
    print(" Recall: {:.1f}%".format(
        recall_score(y_test, y_test_pred, average='weighted') * 100))
    print(" F1: {:.1f}%".format(
        f1_score(y_test, y_test_pred, average='weighted') * 100))
    print('')
    print("Trained estimator classification report on testing data:")
    print('')
    print(classification_report(y_test, y_test_pred,
                                target_names=['Fail', 'Pass']))
from self import preprocess
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

# Load the train/test split plus the single `mail` sample to classify.
features_train, features_test, labels_train, labels_test, mail = preprocess()

# Tune an RBF SVM over C and gamma.
grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(SVC(kernel="rbf"), grid)
clf.fit(features_train, labels_train)

x = clf.predict(features_test)
print(clf.predict(mail))
# Show the hyper-parameters the search selected; get_params() would print the
# search object's constructor configuration instead.
print(clf.best_params_)
print(f1_score(labels_test, x, average="micro"))
print(confusion_matrix(labels_test, x))
the size of the original data set. """ X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0) print "X_train shape =", X_train.shape, " y_train shape=", y_train.shape #print "X_test shape =", X==.shape, " y_test shape=", y_test.shape print """ The following lines train the SVM using our extracted training dataset and is parameterized based on the gridding results. Then the trained SVM is used to carry out predictions on the test data set. The percentage of accuracy predictions is printed """ clf = svm.SVC(kernel='rbf', C=10, gamma = 0.00001, degree = 3.0, coef0 = 0.0).fit(X_train, y_train) print "clf.get_params(deep=True) =", clf.get_params(deep=True) print "clf.score(X_test, y_test) = {0}%".format(int((clf.score(X_test, y_test) * 10000))/100.) print "clf.predict(X_test) = ", clf.predict(X_test) print "clf.decision_function(X_test) = ", clf.decision_function(X_test) print "=======================" print "clf.score(X_train, y_train) = {0}%".format(int((clf.score(X_train, y_train) * 10000))/100.) print "clf.predict(X_train) = ", clf.predict(X_train) print "clf.decision_function(X_train) = ", clf.decision_function(X_train) print "=======================" print print print "#####################################" """ http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC """ print "clf.support_ = ", clf.support_
class BaseMethod(object):
    """Base class for text classifiers built as a TF-IDF + classifier pipeline.

    The classifier itself is read from ``self.clf``, which this class never
    assigns -- presumably subclasses provide it (verify against subclasses).
    Training happens in the constructor; results are cached under a key
    derived from the pipeline/grid and the training documents.
    """

    # String types accepted as a "single document" by predict(), on both
    # Python 2 (basestring) and Python 3 (str).
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, docs_train, y_train, extra=None,
                 useCrossValidation=False, vect_options=None):
        """Set up the parameter grid and train immediately.

        Parameters:
            docs_train: training documents.
            y_train: training labels.
            extra: additional grid options; they override the defaults.
            useCrossValidation: use 10-fold stratified CV in the grid search.
            vect_options: extra keyword arguments for ``TfidfVectorizer``.
        """
        if sys.flags.debug:
            self.options = {}
        else:
            self.options = {
                'vect__ngram_range': [(1, 1)],  # (2, 2), (3,3)],
                # 'vect__stop_words': ('english', None),
                'vect__preprocessor': (None, pr.no_prep, pr.no_usernames,
                                       pr.remove_noise, pr.placeholders,
                                       pr.all, pr.remove_all,
                                       pr.reduced_attached,
                                       pr.no_url_usernames_reduced_attached),
                'vect__use_idf': (True, False),
                'vect__max_df': (0.5, ),
                'vect__smooth_idf': (True, False),
                'vect__sublinear_tf': (True, False)
            }
        self.train(docs_train, y_train, extra, useCrossValidation,
                   vect_options)

    def train(self, docs_train, y_train, extra=None,
              useCrossValidation=False, vect_options=None):
        """Fit (or load from cache) the pipeline and record the best model.

        A grid search is used only when the interpreter runs with an
        optimisation flag (``sys.flags.optimize``); otherwise the plain
        pipeline is fitted and ``best_score`` is set to 1.

        Returns:
            ``self.grid`` (the grid search or the pipeline).
        """
        # Merge without mutating self.options; entries in `extra` win.
        options = dict(self.options)
        options.update(extra or {})
        vect_options = vect_options or {}

        cv = StratifiedKFold(y_train, n_folds=10) if useCrossValidation \
            else None

        pipeline = Pipeline([
            ('vect', TfidfVectorizer(charset_error='ignore',
                                     tokenizer=t.tokenize, **vect_options)),
            ('clf', self.clf),
        ])

        useGrid = sys.flags.optimize
        if useGrid:
            self.grid = GridSearchCV(pipeline, options, cv=cv, refit=True,
                                     n_jobs=-1, verbose=1)
        else:
            self.grid = pipeline

        cache_key = str(self.grid) + str(docs_train)
        cached = cache.get(cache_key)

        if cached and not sys.flags.debug:
            logging.debug("# Fetched cached version of %s "
                          % self.clf.__class__.__name__)
            self.best_estimator = cached['est']
            self.best_score = cached['scr']
            self.best_params = cached['parm']
        else:
            logging.debug("# Training new instance of %s "
                          % self.clf.__class__.__name__)
            self.grid.fit(docs_train, y_train)

            if useGrid:
                self.best_estimator = self.grid.best_estimator_
                self.best_params = self.grid.best_params_
                self.best_score = self.grid.best_score_
            else:
                self.best_estimator = self.grid
                self.best_params = self.grid.get_params(False)
                self.best_score = 1

            logging.debug("Saving to cache for %s "
                          % self.clf.__class__.__name__)
            cache.save(
                cache_key, {
                    "est": self.best_estimator,
                    "scr": self.best_score,
                    "parm": self.best_params
                })

        self.steps = self.best_estimator.named_steps

        logging.debug("# Best params for %s :" % self.clf.__class__.__name__)
        logging.debug(self.best_params)
        logging.debug("# Best score for %s :" % self.clf.__class__.__name__)
        logging.debug(self.best_score)

        return self.grid

    def predict(self, arg_input):
        """Predict labels for one document (a string) or a sequence of them.

        A single string yields a single prediction; any other input yields
        whatever ``best_estimator.predict`` returns for it.
        """
        is_single = isinstance(arg_input, self._string_types)
        docs = [arg_input] if is_single else arg_input
        predictions = self.best_estimator.predict(docs)
        if is_single:
            return predictions[0]
        return predictions

    def __str__(self):
        return "%s" % self.__class__.__name__
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# k-nearest-neighbours: standardise the features, then classify.
pipe_neighbors = Pipeline([('scl', StandardScaler()),
                           ('clf', KNeighborsClassifier())])

metrics = ['minkowski', 'euclidean', 'manhattan']
weights = ['uniform', 'distance']
numNeighbors = np.arange(5, 10)

# Grid keys must address the pipeline step ("clf__<param>"), and the distance
# metric is set through `metric` (`metric_params` is a dict of extra keyword
# arguments for the metric, not the metric name).
param_grid = {
    'clf__metric': metrics,
    'clf__weights': weights,
    'clf__n_neighbors': numNeighbors,
}
neighbors_range = range(1, 21)

grid = GridSearchCV(estimator=pipe_neighbors,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=10,
                    n_jobs=1)

# Lists every tunable parameter name accepted by the search/pipeline.
print(grid.get_params().keys())

grid = grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
# SVM pipeline: scale, keep the k best features, classify.
# NOTE(review): this rebinds the name `svm` from the module to the pipeline.
svm = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest()),
    ('svm', svm.SVC()),
])

param_grid = ([{
    'svm__C': [1, 50, 100, 1000],
    'svm__gamma': [0.5, 0.1],
    'svm__degree': [1, 2],
    'svm__kernel': ['rbf', 'poly'],
    'selector__k': range(1, len(total_features)),
}])

# Keep only the refitted best pipeline from the recall-optimised search.
svm_clf = GridSearchCV(
    svm, param_grid, scoring='recall', cv=5
).fit(features, labels).best_estimator_

tester.test_classifier(svm_clf, my_dataset, total_features)

# Pair each feature name (total_features minus its first entry) with the
# score the fitted selector assigned to it.
best_features = [
    [total_features[1:][i], feature]
    for i, feature in enumerate(svm_clf.named_steps['selector'].scores_)
]

pd_feature = pd.DataFrame(
    best_features,
    index=np.arange(1, len(best_features) + 1),
    columns=['Feature', 'Score'],
)

# Final feature list: the label column followed by the 13 top-scoring names.
best_features = ['poi'] + pd_feature.nlargest(13, 'Score')['Feature'].tolist()

pd_feature.nlargest(13, 'Score')

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, best_features, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
# testing subsets
X = run1meta_features.values
y = cputime.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

RF = RandomForestRegressor(random_state=seed)

# Run a grid search with Random Forests over n_estimators
param_grid_RF = {'n_estimators': np.array([16, 32, 64, 128, 256, 512])}
np.set_printoptions(suppress=True)
print(param_grid_RF)

RF_tuned = GridSearchCV(RF, param_grid_RF, verbose=3)
RF_tuned.fit(X_train, y_train)
# Show the selected n_estimators; get_params() would print the search
# object's constructor configuration instead of the tuning result.
print(RF_tuned.best_params_)

y_RF_tuned_pred = RF_tuned.predict(X_test)
mse_RF_tuned = mean_squared_error(y_test, y_RF_tuned_pred)
r2_RF_tuned = RF_tuned.score(X_test, y_test)

# Print some metrics of the quality of the RF modeling
print('MSE (tuned RF) =', mse_RF_tuned)
print('R2 score (tuned RF) =', r2_RF_tuned)

RFbest = RF_tuned.best_estimator_
print(run1meta_features.keys())
print(RFbest.feature_importances_)
# 'clf__criterion': ['gini', 'entropy'], # 'clf__max_depth': [3, 6, 8, 11, 15, 20] # KNC 'clf__n_neighbors': [2, 4, 6, 10], 'clf__weights': ['distance', 'uniform'], 'clf__algorithm': ['kd_tree', 'ball_tree', 'auto', 'brute'], }, scoring='recall') grid_search.fit(X_train, y_train) clf = pipeline.set_params(**grid_search.best_params_) pipeline.fit(X_train, y_train) print(grid_search.best_params_) print(dir(grid_search)) report = classification_report(y_test, clf.predict(X_test)) print report # dump classifier and dta dump_classifier_and_data(clf, my_dataset, features_list) # Getting the feature Scores k = grid_search.get_params( True)['estimator__feature_selection__transformer_list'][0][1] features_scores = zip(features_list[1:], k.scores_) for f, s in sorted(features_scores, key=lambda x: x[1], reverse=True): print('%s: %s' % (f, s))
class BaseMethod(object):
    """Base class for text classifiers built as a TF-IDF + classifier pipeline.

    The classifier itself is read from ``self.clf``, which this class never
    assigns -- presumably subclasses provide it (verify against subclasses).
    Training happens in the constructor; results are cached under a key
    derived from the pipeline/grid and the training documents.
    """

    # String types accepted as a "single document" by predict(), on both
    # Python 2 (basestring) and Python 3 (str).
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, docs_train, y_train, extra=None,
                 useCrossValidation=False, vect_options=None):
        """Set up the parameter grid and train immediately.

        Parameters:
            docs_train: training documents.
            y_train: training labels.
            extra: additional grid options; they override the defaults.
            useCrossValidation: use 10-fold stratified CV in the grid search.
            vect_options: extra keyword arguments for ``TfidfVectorizer``.
        """
        if sys.flags.debug:
            self.options = {}
        else:
            self.options = {
                'vect__ngram_range': [(1, 1)],  # (2, 2), (3,3)],
                # 'vect__stop_words': ('english', None),
                'vect__preprocessor': (None, pr.no_prep, pr.no_usernames,
                                       pr.remove_noise, pr.placeholders,
                                       pr.all, pr.remove_all,
                                       pr.reduced_attached,
                                       pr.no_url_usernames_reduced_attached),
                'vect__use_idf': (True, False),
                'vect__max_df': (0.5,),
                'vect__smooth_idf': (True, False),
                'vect__sublinear_tf': (True, False)
            }
        self.train(docs_train, y_train, extra, useCrossValidation,
                   vect_options)

    def train(self, docs_train, y_train, extra=None,
              useCrossValidation=False, vect_options=None):
        """Fit (or load from cache) the pipeline and record the best model.

        A grid search is used only when the interpreter runs with an
        optimisation flag (``sys.flags.optimize``); otherwise the plain
        pipeline is fitted and ``best_score`` is set to 1.

        Returns:
            ``self.grid`` (the grid search or the pipeline).
        """
        # Merge without mutating self.options; entries in `extra` win.
        options = dict(self.options)
        options.update(extra or {})
        vect_options = vect_options or {}

        cv = StratifiedKFold(y_train, n_folds=10) if useCrossValidation \
            else None

        pipeline = Pipeline([
            ('vect', TfidfVectorizer(charset_error='ignore',
                                     tokenizer=t.tokenize, **vect_options)),
            ('clf', self.clf),
        ])

        useGrid = sys.flags.optimize
        if useGrid:
            self.grid = GridSearchCV(pipeline, options, cv=cv, refit=True,
                                     n_jobs=-1, verbose=1)
        else:
            self.grid = pipeline

        cache_key = str(self.grid) + str(docs_train)
        cached = cache.get(cache_key)

        if cached and not sys.flags.debug:
            logging.debug("# Fetched cached version of %s "
                          % self.clf.__class__.__name__)
            self.best_estimator = cached['est']
            self.best_score = cached['scr']
            self.best_params = cached['parm']
        else:
            logging.debug("# Training new instance of %s "
                          % self.clf.__class__.__name__)
            self.grid.fit(docs_train, y_train)

            if useGrid:
                self.best_estimator = self.grid.best_estimator_
                self.best_params = self.grid.best_params_
                self.best_score = self.grid.best_score_
            else:
                self.best_estimator = self.grid
                self.best_params = self.grid.get_params(False)
                self.best_score = 1

            logging.debug("Saving to cache for %s "
                          % self.clf.__class__.__name__)
            cache.save(cache_key, {
                "est": self.best_estimator,
                "scr": self.best_score,
                "parm": self.best_params
            })

        self.steps = self.best_estimator.named_steps

        logging.debug("# Best params for %s :" % self.clf.__class__.__name__)
        logging.debug(self.best_params)
        logging.debug("# Best score for %s :" % self.clf.__class__.__name__)
        logging.debug(self.best_score)

        return self.grid

    def predict(self, arg_input):
        """Predict labels for one document (a string) or a sequence of them.

        A single string yields a single prediction; any other input yields
        whatever ``best_estimator.predict`` returns for it.
        """
        is_single = isinstance(arg_input, self._string_types)
        docs = [arg_input] if is_single else arg_input
        predictions = self.best_estimator.predict(docs)
        if is_single:
            return predictions[0]
        return predictions

    def __str__(self):
        return "%s" % self.__class__.__name__
plt.plot(ln_x_test, y_predict, colors[t], lw=t + 3, label=u'%s算法估计值,$R^2$=%.3f' % (titles[t], model.best_score_)) # 图形显示 plt.legend(loc='upper left') plt.grid(True) plt.title(u"波士顿房屋价格预测") plt.show() # 模型训练 ====> 单个Lasso模型(一阶特征选择)<2参数给定1阶情况的最优参数> model = Pipeline([('ss', StandardScaler()), ('poly', PolynomialFeatures(degree=1, include_bias=True, interaction_only=True)), ('linear', LassoCV(alphas=np.logspace(-3, 1, 20), fit_intercept=False))]) # 模型训练 model.fit(x_train, y_train) # 模型评测 # 数据输出 print("参数:", zip(names, model.get_params('linear')['linear'].coef_)) print("截距:", model.get_params('linear')['linear'].intercept_) # 参数: [('CRIM', 21.135499741068376), ('ZN', -0.0), ('INDUS', -0.0), ('CHAS', -0.0), ('NOX', 0.19539929236955278), ('RM', -0.0), ('AGE', 1.5662356175920531), ('DIS', -0.38131114313786807), ('RAD', -0.69604251661926086), ('TAX', 0.0), ('PTRATIO', -0.0), ('B', -1.5063986238529539), ('LSTAT', 0.0)] # 截距: 0.0