def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    # The steps are named explicitly so that the 'tfidf__' and 'lr__' prefixes
    # in the parameter grid below actually resolve; make_pipeline would have
    # auto-generated the names 'tfidfvectorizer' and 'logisticregression'.
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(min_df=2, dtype=float, sublinear_tf=True,
                                  ngram_range=(1, 2), strip_accents='unicode')),
        ('lr', LogisticRegression(random_state=623, C=5000)),
    ])

    if param_search:
        params = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'lr__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)

        print("Cross-validation accuracy: %.3f" % gs.best_score_)
        return gs.best_estimator_
    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
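# A quick sanity check for grid keys like the ones above -- a minimal sketch,
# assuming the named Pipeline 'clf' and grid 'params' from train(): every key
# such as 'tfidf__ngram_range' must be a parameter the pipeline exposes.
def check_grid_keys(estimator, param_grid):
    valid = set(estimator.get_params().keys())
    for key in param_grid:
        assert key in valid, "unknown grid key: %s" % key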
def test_grid_search_precomputed_kernel(): """Test that grid search works when the input features are given in the form of a precomputed kernel matrix """ X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) # compute the training kernel matrix corresponding to the linear kernel K_train = np.dot(X_[:180], X_[:180].T) y_train = y_[:180] clf = SVC(kernel='precomputed') cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) cv.fit(K_train, y_train) assert_true(cv.best_score_ >= 0) # compute the test kernel matrix K_test = np.dot(X_[180:], X_[:180].T) y_test = y_[180:] y_pred = cv.predict(K_test) assert_true(np.mean(y_pred == y_test) >= 0) # test error is raised when the precomputed kernel is not array-like # or sparse assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
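# For reference, the precomputed linear kernel above should be interchangeable
# with fitting SVC(kernel='linear') on the raw features. A minimal sketch
# (reusing X_, y_train, cv and K_test from the test; close agreement is an
# assumption, up to numerical ties inside libsvm):
cv_lin = GridSearchCV(SVC(kernel='linear'), {'C': [0.1, 1.0]})
cv_lin.fit(X_[:180], y_train)
agreement = np.mean(cv_lin.predict(X_[180:]) == cv.predict(K_test))
print("agreement with raw-feature linear SVC: %.3f" % agreement)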
def do_cross_validation(self, param_grid, svmtype, score_func,
                        inputdata_train, outputdata_train,
                        inputdata_test, outputdata_test):
    """Fit the classifier used for cross-validation."""
    if svmtype == 'ln':
        svm_clf = LinearSVC()
    elif svmtype == 'rbf':
        svm_clf = SVC()

    clf_cv = GridSearchCV(svm_clf, param_grid, score_func=score_func, n_jobs=-1)
    clf_cv.fit(inputdata_train, outputdata_train)
    y_pred_cv = clf_cv.predict(inputdata_test)
    f1 = metrics.f1_score(outputdata_test, y_pred_cv, pos_label=0)

    dict_param = clf_cv.best_params_
    c = dict_param['C']
    gamma1 = dict_param['gamma'] if svmtype == 'rbf' else 0

    return f1, gamma1, c
def search_parameters(data_file):
    with open(data_file, 'r') as f:
        data = pickle.load(f)
    labels = data['labels']
    features = data['features']

    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        features, labels, test_size=0.5, random_state=0)

    scores = [('error_rate', zero_one_score)]

    classifier = MultinomialNB()
    tuned_parameters = {'alpha': (0.001, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10)}

    for score_name, score_func in scores:
        print "# Tuning hyper-parameters for %s" % score_name
        print
        # cv belongs on the grid-search object, not on fit()
        clf = GridSearchCV(classifier, tuned_parameters,
                           score_func=score_func, cv=5)
        clf.fit(X_train, y_train)
        print "Best parameters set found on development set:"
        best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
        for param_name in sorted(tuned_parameters.keys()):
            print "%s: %r" % (param_name, best_parameters[param_name])
def learn(tuned_parameters, model):
    # produceFeature(trainfile)
    dataset = genfromtxt(open('Data/' + trainfile, 'r'),
                         delimiter=',', dtype='f8')[0:]
    target = [x[0] for x in dataset]
    train = [x[1:] for x in dataset]

    # produceFeature(testfile)
    test = genfromtxt(open('Data/' + testfile, 'r'),
                      delimiter=',', dtype='f8')[0:]
    test_target = [x[1:] for x in test]

    # turn the data into a (samples, features) matrix:
    X, y = np.asarray(train), np.asarray(target)

    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s_weighted' % score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        # loop variable renamed so it does not shadow the outer 'scores' list
        for params, mean_score, cv_scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, cv_scores.std() * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
def train_classifier(data, labels):
    nIter = 50
    alphaVals = [10 ** i for i in range(3, 5)]
    params = {"loss": ["log"], "penalty": ['l1', 'l2'],
              "n_iter": [nIter], "alpha": alphaVals}
    params_log = {"penalty": ['l2'], "C": [10 ** i for i in range(-3, -1)]}

    # sgd = SGDClassifier()
    sgd = LogisticRegression()
    clf = GridSearchCV(sgd, params_log)

    # data = data.tocsr()[:, 0:13]
    train, val, t_labs, val_labs = train_test_split(data, labels,
                                                    train_size=.2,
                                                    random_state=44)
    s = time.time()
    clf.fit(train, t_labs)
    print "Elapsed Training Time for ", len(params_log['C']), \
        'regularization vals: ', time.time() - s
    print clf.best_params_
    print "The Validation Score: ", clf.score(val, val_labs)
    probs = clf.predict_proba(val)
    print "The log loss for the validation set is"
    # log_loss expects (y_true, y_pred), in that order
    print log_loss(val_labs, probs[:, 1])
    return clf
def test_ovo_gridsearch(): ovo = OneVsOneClassifier(LinearSVC(random_state=0)) Cs = [0.1, 0.5, 0.8] cv = GridSearchCV(ovo, {'estimator__C': Cs}) cv.fit(iris.data, iris.target) best_C = cv.best_estimator_.estimators_[0].C assert_true(best_C in Cs)
def Gridsearch_impl(X, Y, clf, param, cv):
    grid_search = GridSearchCV(clf, param, verbose=10, cv=cv, n_jobs=10)
    start = time()
    grid_search.fit(X, Y)
    # print(grid_search.grid_scores_)
    best = report(grid_search.grid_scores_)
def getOptCandGamma(cv_train, cv_label):
    print "Finding optimal C and gamma for SVM with RBF Kernel"
    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedKFold(y=cv_label, n_folds=40)

    # Use the svm.SVC() as the cost function to evaluate parameter choices.
    # GridSearchCV does not parallelize by default; pass n_jobs=-1 to fan the
    # candidate fits out over all cores if needed.
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
    grid.fit(cv_train, cv_label)

    score_dict = grid.grid_scores_
    scores = [x[1] for x in score_dict]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))

    pl.figure(figsize=(8, 6))
    pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.xlabel('gamma')
    pl.ylabel('C')
    pl.colorbar()
    pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    pl.yticks(np.arange(len(C_range)), C_range)
    pl.show()

    print "The best classifier is: ", grid.best_estimator_
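# On scikit-learn 0.20+ grid_scores_ is gone; the same C-by-gamma score matrix
# can be rebuilt from cv_results_. A minimal sketch, assuming the fitted
# 'grid' above (candidates are enumerated with 'C' varying slowest, since
# ParameterGrid iterates the sorted dict keys):
mean_scores = np.array(grid.cv_results_['mean_test_score'])
scores = mean_scores.reshape(len(C_range), len(gamma_range))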
def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a grid search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_grid -- [dict] parameter settings to test
    cv -- number of cross-validation folds, default 5

    Returns
    -------
    top_params -- [dict] from report()
    """
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=cv,
                               scoring='recall')
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                                         len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return top_params
def model_search(estimator, tuned_params, scores, X_train, y_train,
                 X_test, y_test):
    cv = ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0)
    for score in scores:
        print "# Tuning hyper-parameters for %s" % score
        print

        clf = GridSearchCV(estimator, tuned_params, cv=cv, scoring=score)
        clf.fit(X_train, y_train)

        print "Best parameters set found on development set:"
        print
        print clf.best_params_
        print
        print "Grid scores on development set:"
        print
        # loop variable renamed so it does not shadow the 'scores' argument
        for params, mean_score, cv_scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (mean_score,
                                                cv_scores.std() * 2, params)
        print
        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print classification_report(y_true, y_pred)
        print
def grid_search(dataset_loader_train, model, grid_search): with timer(logger.info, "Loading data"): X, y = dataset_loader_train() grid_search_kwargs = { 'refit': False, } grid_search_kwargs.update(grid_search) cv = grid_search_kwargs.get('cv', None) if callable(cv): grid_search_kwargs['cv'] = apply_kwargs(cv, n=len(y), y=y) if not (hasattr(model, 'score') or 'scoring' in grid_search_kwargs): raise ValueError( "Your model doesn't seem to implement a 'score' method. You may " "want to pass a 'scoring' argument to 'grid_search' instead." ) with timer(logger.info, "Running grid search"): gs = GridSearchCV(model, **grid_search_kwargs) gs.fit(X, y) scores = sorted(gs.grid_scores_, key=lambda x: -x.mean_validation_score) logger.info("\n{}".format(pformat(scores))) return scores
def dogridsearch(X, Y, param_space, clf, cv):
    grid_search = GridSearchCV(clf, param_space, verbose=10, cv=cv, n_jobs=-1)
    start = time()
    grid_search.fit(X, Y)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.grid_scores_)))
    best = report(grid_search.grid_scores_)
def estimateParameters(X_train, X_test, y_train, y_test):
    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'],
                         'C': [1, 10, 100, 1000]}]

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s\n" % score)

        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:\n")
        print(clf.best_estimator_)
        print("\nGrid scores on development set:\n")
        # report the mean +/- two standard deviations across the CV folds
        for params, mean_score, cv_scores in clf.grid_scores_:
            print("%.3f (+/-%.03f) for %r"
                  % (mean_score, cv_scores.std() * 2, params))

        print("\nDetailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
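# grid_scores_ was removed in scikit-learn 0.20; on newer versions the same
# per-candidate report comes out of cv_results_. A minimal sketch, assuming
# the fitted 'clf' above:
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%.3f (+/-%.03f) for %r" % (mean, std * 2, params))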
def test_nntools_functional_grid_search(mnist, monkeypatch): # Make sure that we can satisfy the grid search interface. from nolearn.nntools import NeuralNet nn = NeuralNet( layers=[], X_tensor_type=T.matrix, ) param_grid = { 'more_params': [{'hidden_num_units': 100}, {'hidden_num_units': 200}], 'update_momentum': [0.9, 0.98], } X, y = mnist vars_hist = [] def fit(self, X, y): vars_hist.append(vars(self).copy()) return self with patch.object(NeuralNet, 'fit', autospec=True) as mock_fit: mock_fit.side_effect = fit with patch('nolearn.nntools.NeuralNet.score') as score: score.return_value = 0.3 gs = GridSearchCV(nn, param_grid, cv=2, refit=False, verbose=4) gs.fit(X, y) assert [entry['update_momentum'] for entry in vars_hist] == [ 0.9, 0.9, 0.98, 0.98] * 2 assert [entry['more_params'] for entry in vars_hist] == ( [{'hidden_num_units': 100}] * 4 + [{'hidden_num_units': 200}] * 4 )
def separable_demo(): """ Generate a linearly-separable dataset D, train a linear SVM on D, then output the resulting decision boundary on a figure. """ from sklearn.datasets import make_blobs X, y = make_blobs(n_samples=200, n_features=2, centers=((0,0), (4, 4)), cluster_std=1.0) plot_data(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) svc = svm.SVC(class_weight='auto') param_grid = {'kernel': ['linear'], 'C': [1e0, 1e1, 1e2, 1e3, 1e4]} strat_2fold = StratifiedKFold(y_train, k=2) print " Parameters to be chosen through cross validation:" for name, vals in param_grid.iteritems(): if name != 'kernel': print " {0}: {1}".format(name, vals) clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold) clf.fit(X_train, y_train) print "== Best Parameters:", clf.best_params_ y_pred = clf.predict(X_test) acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred)) print "== Accuracy:", acc print classification_report(y_test, y_pred) plot_svm(clf.best_estimator_, X, y, X_test, y_test, title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
def score_nestedCV(self, G1, model, param_grid, effect, nested): k_fold = cross_validation.KFold(n=self.Y.shape[0], n_folds=self.n_folds, indices=True) i_fold=0 scores = sp.zeros(self.n_folds) params = list() for train, test in k_fold: (trainData, trainY) = self._packData(G1, train, effect) (testData, testY) = self._packData(G1, test, effect) if nested: clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs = self.n_jobs_grid, cv=self.n_folds_params, scoring=self.scoring, verbose=self.verbose) clf.fit(trainData, trainY.flatten()) params.append(clf.best_params_) scores[i_fold] = clf.score(testData, testY.flatten(), method_scorer=False) else: model.fit(trainData, trainY.flatten()) scores[i_fold] = SCORERS[self.scoring](model, testData, testY.flatten()) i_fold+=1 return scores,params
def classification_level_SGDReg_pipeline(classifications_DF):
    X = classifications_DF.iloc[:, 3:89]
    # assign the target (session length) to y and convert to float
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)
    # scaling the data for feature selection
    X_scaled = preprocessing.scale(X)

    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = \
        train_test_split(X_scaled, y_actual, test_size=0.5, random_state=0)

    # PCA is fitted on the scaled feature matrix, not on the target column
    pca_selection = PCA(n_components=2)
    X_features = pca_selection.fit(X_scaled_train).transform(X_scaled_train)

    SGDReg = SGDRegressor(alpha=0.0001)

    # Do grid search over n_components and the SGDRegressor parameters:
    pipeline = Pipeline([('pca', pca_selection), ('SGDReg', SGDReg)])
    tuned_params = dict(pca__n_components=[5, 30, 40, 50],
                        SGDReg__alpha=[0.1, 0.01, 0.001, 0.0001, 0.00001],
                        SGDReg__l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
                        SGDReg__penalty=['l2', 'l1', 'elasticnet'])

    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3, verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true, y_pred = (y_actual_test['session_length'].values,
                      grid_search.best_estimator_.predict(X_scaled_test))
    print "Mean squared error:" + str(mean_squared_error(y_true, y_pred))
    pd.DataFrame(y_true, y_pred).to_csv("SGDReg_pred_true.csv")
def make_grid_search(pipeline, parameters, model_name, params):
    print model_name
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=3,
                               # loss_func=f1_score,
                               scoring="f1", iid=False, refit=True)

    print("Performing grid search...")
    print("pipeline:", pipeline)
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(features, salaries_enc)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    best_estimator = pipeline.set_params(**best_parameters)

    # the per-candidate scores live in grid_scores_, not cv_scores_
    params = (params + " ", grid_search.grid_scores_)
    dio.save_model(best_estimator, model_name,
                   mae_cv=grid_search.best_score_, parameters=params)
    print grid_search.grid_scores_
    prediction = grid_search.predict(validation_features)
    dio.save_prediction(model_name, prediction, "valid_classes")
def MyGridSearch(X, y):
    kfold = cross_validation.KFold(len(X), 5)
    for train, test in kfold:
        # Alternative grids that were tried:
        # parameters = {'kernel': ('linear', 'rbf'), 'C': [1.5, 10]}
        # parameters = {'kernel': ['rbf'],
        #               'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
        #               'epsilon': [0.1],
        #               'C': [1, 5, 10, 50, 100, 500, 1000, 5000, 10000]}
        # parameters = {'kernel': ['poly'], 'gamma': [1e-2, 1e-3, 1e-4],
        #               'epsilon': [0.1], 'degree': [3],
        #               'C': [50, 100, 500, 1000]}
        parameters = {'kernel': ['rbf'], 'gamma': [1e-5], 'epsilon': [0.2],
                      'C': [100000]}
        model = svm.SVR()
        grid = GridSearchCV(model, parameters)
        # grid = RandomizedSearchCV(model, parameters)
        grid.fit(X[train], y[train])
        predictions = grid.predict(X[test])
        print grid.best_score_
        # stop early once a fold yields a sufficiently good model
        if grid.best_score_ > 0.98:
            return grid
    return grid
def classification_level_RandForest_pipeline(classifications_DF):
    X = classifications_DF.iloc[:, 3:89]
    # assign the target (session length) to y and convert to float
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)
    # scaling the data for feature selection
    X_scaled = preprocessing.scale(X)

    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = \
        train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0)

    # Maybe some original features were good, too?
    selectKbest = SelectKBest(k=1, score_func=f_regression)

    # Build estimator from univariate selection:
    X_features = selectKbest.fit(X_scaled_train,
                                 y_actual_train).transform(X_scaled_train)

    randomForestReg = RandomForestRegressor(n_estimators=1, criterion='mse')

    # Do grid search over k and the random-forest parameters:
    pipeline = Pipeline([('selectKbest', selectKbest),
                         ('randomForestReg', randomForestReg)])
    tuned_params = dict(selectKbest__k=[5, 10, 20, 30, 40, 50, 80],
                        randomForestReg__n_estimators=[1, 2, 4, 8, 16, 32, 64],
                        randomForestReg__min_samples_split=[2, 3, 5, 10, 20])

    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3, verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true, y_pred = (y_actual_test['session_length'].values,
                      grid_search.best_estimator_.predict(X_scaled_test))
    print "Mean squared error:" + str(mean_squared_error(y_true, y_pred))
    pd.DataFrame(y_true, y_pred).to_csv("randomForestReg_pred_true.csv")
def run_support_vector_regressor(training_features, training_labels,
                                 test_features, test_labels,
                                 passed_parameters=None):
    estimator = svm.SVR()

    # set up parameters for the regressor
    if passed_parameters is None:
        parameters = {"kernel": ["linear"]}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2,
                      random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    # fit the regressor
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    return test_prediction, test_accuracy
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    ###################################
    ### Step 4. YOUR CODE GOES HERE ###
    ###################################

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    dtr_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # 2. Use grid search to fine-tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = GridSearchCV(regressor, parameters, scoring=dtr_scorer, cv=6)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    print "Best estimator chosen by GridSearchCV: ", reg.best_estimator_

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0,
         20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
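# Newer scikit-learn releases reject a bare 1-D list in predict(); run against
# a modern version, the single sample above has to be wrapped into a 2-D array
# first. A sketch of that adjustment, reusing 'x' and 'reg' from the function:
import numpy as np
y = reg.predict(np.array(x).reshape(1, -1))  # shape (1, n_features)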
def run_linear_experiment(self, rocs_filename, iterations=10):
    """
    Run a classification experiment over several iterations. In each
    iteration the data is randomized, a linear SVM classifier is trained,
    and the cost parameter is cross-validated over the range
    np.logspace(-3, 3, 7). The best classifier is used for testing, and a
    ROC curve is computed, stored as a property, and saved locally.

    :param rocs_filename: the file to save all computed ROCs
    :param iterations: number of runs (training/testing)
    """
    for i in xrange(iterations):
        print "[*] Iteration {0}".format(i)
        print "[*] Randomizing dataset..."
        self.randomize_dataset()
        clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
        print "[*] Training..."
        clf.fit(self.X_train, self.Y_train)
        out = clf.best_estimator_.decision_function(self.X_test)
        print "[*] Testing..."
        roc = eval.compute_roc(np.float32(out.flatten()),
                               np.float32(self.Y_test))
        self.rocs.append(roc)
        print "[*] ROC saved."
        pz.save(self.rocs, rocs_filename)
def run_linear_open_experiment(self, iterations=10, save=False):
    """
    Train a classifier on training data, obtain the best combination of
    parameters through grid-search cross-validation, and test the
    classifier using an open-world split of the dataset. The results from
    the iterations are saved as pz files.

    :param iterations: number of runs (training/testing)
    :param save: save predictions and labels if True
    """
    self.true_labels = np.array([])
    self.predictions = np.array([])
    for i in xrange(iterations):
        self.randomize_dataset_open_world()
        clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
        clf.fit(self.X_train, self.Y_train)
        out = clf.best_estimator_.decision_function(self.X_test)
        classes = clf.best_estimator_.classes_
        for scores in out:
            m = np.max(scores)
            if (abs(m / scores) < 0.5).any():
                self.predictions = np.append(self.predictions, 99)
            else:
                p = classes[np.where(scores == m)]
                self.predictions = np.append(self.predictions, p)
        self.true_labels = np.append(self.true_labels, self.Y_test)
    if save:
        pz.save(self.predictions, "mca_predictions_open.pz")
        pz.save(self.true_labels, "mca_true_labels_open.pz")
def test_krr_regP(): dim = 5 n = 1000 ntest = 1001 pref = np.random.random(size=dim) - 0.5 #pref /= np.sqrt(pref.dot(pref)) Xtrain = np.random.random((n, dim)) + 1.0 ytrain = Xtrain.dot(pref) + np.random.normal(scale=0.05, size=n) + 10.0 Xtest = np.random.random((ntest, dim)) + 1.0 yref = Xtest.dot(pref) + 10.0 krr = kRidgeRegression(kernel=Linear(), eta=1.0) gs = GridSearchCV(krr, {'eta' : [0, 1E-16, 1E-14, 1E-12, 1E-10, 1E-8, 1E-6, 1E-4, 1E-2, 1]}) gs.fit(Xtrain, ytrain) krr = gs.best_estimator_ ytest = krr.transform(Xtest).flatten() print krr.beta.shape print krr.Ku.shape print krr.score(Xtest, yref)
def grid_search_model(clf_factory, X, Y, save_file="read/best_param.txt"):
    u"""Search for the best parameters.

    Args:
        clf_factory: machine-learning model factory
        X: features
        Y: labels

    Returns:
        clf: the best-performing model
    """
    stopwords = load_stopwords_old()
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, stopwords],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      )

    grid_search = GridSearchCV(clf_factory(), param_grid=param_grid,
                               cv=cv, verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    write_to_text(grid_search.best_params_, save_file)

    return clf
def run_random_forest(training_features, training_labels, test_features,
                      test_labels, passed_parameters=None):
    estimator = ensemble.RandomForestRegressor(random_state=0, n_estimators=25)

    # set up parameters for the regressor; grid values must be iterable
    if passed_parameters is None:
        parameters = {"max_depth": [None]}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2,
                      random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    # fit the regressor
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    return test_prediction, test_accuracy
def test_krr_regbeta(): dim = 5 n = 1000 ntest = 1001 pref = np.random.random(size=dim) - 0.5 #pref /= np.sqrt(pref.dot(pref)) Xtrain = np.random.random((n, dim)) ytrain = Xtrain.dot(pref) + np.random.normal(scale=0.05, size=n) + 10.0 Xtest = np.random.random((ntest, dim)) yref = Xtest.dot(pref) + 10.0 krr = kRidgeRegression(kernel=Linear(), eta=1.0, regularize_beta=True) gs = GridSearchCV(krr, {'eta' : [1E-6, 1E-4, 1E-2, 1, 1E2, 1E4, 1E6]}) gs.fit(Xtrain, ytrain) krr = gs.best_estimator_ ytest = krr.transform(Xtest).flatten() print krr.score(Xtest, yref)
def grid_search(X, y):
    '''
    cross-validated grid search using Ridge Regressor and
    Random Forest Regressor
    '''
    nids = df_subset.index
    titles = df_subset['title']

    pars = {'alpha': [0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1, 0.08, 0.07, 0.06,
                      0.05, 0.04, 0.03, 0.02]}
    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)
    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))

    pars = {'max_depth': [5, 8, 10, 20, 50, 100],
            'min_samples_split': [2, 3, 5, 10, 20]}
    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars, cv=5)
    # the second search must also be fitted before best_estimator_ exists
    gs.fit(X, y)
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
def use_pipeline_with_fs(self): ##################### #Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent ##################### pipeline = Pipeline([ ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)), ("selector", SelectPercentile()), ('clf', RandomForestClassifier()), ]) # Build a grid search to find the best parameter # Fit the pipeline on the training set using grid search for the parameters parameters = { 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vect__use_idf': (True, False), 'clf__n_estimators': (10, 50, 100), 'clf__criterion': ("gini", "entropy"), 'clf__max_depth': (None, 2, 4), 'clf__min_samples_split': (2, 4, 6), 'selector__score_func': (chi2, f_classif), 'selector__percentile': (85, 95, 100), } ################# # Exhaustive search over specified parameter values for an estimator, use cv to generate data to be used # implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained. ################# cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42) grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1) clf_gs = grid_search.fit(docs_train, y_train) ############### # print the cross-validated scores for the each parameters set explored by the grid search ############### best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1]) for param_name in sorted(parameters.keys()): print("%s: %r" % (param_name, best_parameters[param_name])) print("Score for gridsearch is %0.2f" % score) #y_predicted = clf_gs.predict(docs_test) ############### # run the classifier again with the best parameters # in order to get 'clf' for get_important_feature function! 
############### ngram_range = best_parameters['vect__ngram_range'] use_idf = best_parameters['vect__use_idf'] score_func = best_parameters['selector__score_func'] percentile = best_parameters['selector__percentile'] # vectorisation count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=ngram_range) X_CV = count_vect.fit_transform(docs_train) # print number of unique words (n_features) print("Shape of train data is " + str(X_CV.shape)) # tfidf transformation tfidf_transformer = TfidfTransformer(use_idf=use_idf) X_tfidf = tfidf_transformer.fit_transform(X_CV) ################# # feature selection ################# selector = SelectPercentile(score_func=score_func, percentile=percentile) combined_features = Pipeline([("vect", count_vect), ("tfidf", tfidf_transformer), ("feat_select", selector)]) X_features = combined_features.fit_transform(docs_train, y_train) X_test_features = combined_features.transform(docs_test) print("Shape of train data after feature selection is " + str(X_features.shape)) print("Shape of test data after feature selection is " + str(X_test_features.shape)) # run classifier on selected features clf = RandomForestClassifier().fit(X_features, y_train) # get the features which are selected and write to file feature_boolean = selector.get_support(indices=False) f = open(path_to_store_feature_selection_boolean_file, 'w') for fb in feature_boolean: f.write(str(fb) + '\n') f.close() ################## # get cross validation score ################## scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) ################# # run classifier on test data ################# y_predicted = clf.predict(X_test_features) # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(X_test_features, y_test)) # Print and plot the confusion matrix print(metrics.classification_report(y_test, y_predicted)) cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) # import matplotlib.pyplot as plt # plt.matshow(cm) # plt.show() return clf, count_vect
'''
9. Model tuning
3. Tune max_depth, min_child_samples and min_split_gain to settle the
   overall boosting architecture
'''
param_test1 = {
    'max_depth': list(range(3, 7)),
    'min_child_samples': [1, 3, 5, 10],
    'min_split_gain': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1, 2],
}
gsearch1 = GridSearchCV(lgbm_model, param_grid=param_test1,
                        scoring='roc_auc', cv=5)
starttime = datetime.datetime.now()
gsearch1.fit(X.loc[:, chosen_final_feature], Y)
endtime = datetime.datetime.now()
print('First grid search took {0} seconds'.format((endtime - starttime).seconds))
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

'''You can read off the parameter setting with the highest AUC directly, or
pick a combination that looks good and stable based on inspection and
experience'''
# lgbm_model.set_params(max_depth=gsearch1.best_params_['max_depth'])
# lgbm_model.set_params(min_child_samples=gsearch1.best_params_['min_child_samples'])
# lgbm_model.set_params(min_split_gain=gsearch1.best_params_['min_split_gain'])
lgbm_model.set_params(max_depth=5)
lgbm_model.set_params(min_child_samples=1)
lgbm_model.set_params(min_split_gain=0.3)

'''After choosing these parameters, recalibrate best_n_estimators'''
lgbm_model.set_params(n_estimators=500)
lgbm_param_temp = lgbm_model.get_params()
### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html # Example starting point. Try investigating other evaluation techniques! feature_train, feature_test, labels_train, labels_test = \ train_test_split( features, labels, test_size=0.3, random_state=42) # from sklearn.naive_bayes import GaussianNB #Naive bayes gnb_clf = GaussianNB() parameters = {} algo = GridSearchCV(gnb_clf, parameters) print '\nGaussianNB:' algo.fit(feature_train, labels_train) test_classifier(algo.best_estimator_, my_dataset, features_list) # Testing the response of classifier without using 'person_to_poi_rate' feature in our features_list print "\n Testing of classifer by removing 'person_to_poi_rate' feature from our features_list" test_classifier(algo.best_estimator_, my_dataset, ['poi', \ 'exercised_stock_options', 'total_stock_value', \ 'bonus', 'salary', 'total'] ) #Decision Tree print '\nDecision Tree:' dt_clf = tree.DecisionTreeClassifier() parameters = {'criterion': ['gini', 'entropy'], \ 'min_samples_split': [2, 5, 10, 20], \ 'max_depth': [None, 2, 5, 10], \ 'splitter': ['random', 'best'], \
X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print "done in %0.3fs" % (time() - t0) ############################################################################### # Train a SVM classification model print "Fitting the classifier to the training set" t0 = time() param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto' clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) clf = clf.fit(X_train_pca, y_train) print "done in %0.3fs" % (time() - t0) print "Best estimator found by grid search:" print clf.best_estimator_ ############################################################################### # Quantitative evaluation of the model quality on the test set print "Predicting the people names on the testing set" t0 = time() y_pred = clf.predict(X_test_pca) print "done in %0.3fs" % (time() - t0) print classification_report(y_test, y_pred, target_names=target_names) print confusion_matrix(y_test, y_pred, labels=range(n_classes))
def grid_search(self, X_train, y_train, param_grid, eval_func, seed=42):
    gsearch = GridSearchCV(self.model, param_grid, verbose=10, cv=10)
    gsearch.fit(X_train, y_train)
    print(gsearch.best_params_)
print "样本数据量:%d, 特征个数:%d" % x.shape print "target样本数据量:%d" % y.shape[0] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=28) parameters = { 'kernel': ['rbf', 'linear'], 'C': [0.1, 0.5], 'gamma': [0.0001, 0.0005] } model = GridSearchCV(SVR(), param_grid=parameters, cv=3) model.fit(x_train, y_train) print "最优参数列表:", model.best_params_ print "最优模型:", model.best_estimator_ print "最优准确率:", model.best_score_ print "训练集准确率:%.2f%%" % (model.score(x_train, y_train) * 100) print "测试集准确率:%.2f%%" % (model.score(x_test, y_test) * 100) ## 画图 colors = ['g-', 'b-'] ln_x_test = range(len(x_test)) y_predict = model.predict(x_test) plt.figure(figsize=(16, 8), facecolor='w') plt.plot(ln_x_test, y_test, 'r-', lw=2, label=u'真实值')
"pca__n_components": [0.25, 0.5, 0.75, 1], "decision__max_depth": np.linspace(1, 20, 20).astype(np.int8) }, { "decision__max_depth": np.linspace(1, 20, 20).astype(np.int8) }, { "decision__max_depth": np.linspace(1, 20, 20).astype(np.int8) }] # 获取数据 x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1 for t in range(3): pipe = pipes[t] gscv = GridSearchCV(pipe, param_grid=parameters[t]) gscv.fit(x_train2, y_train2) print(t, "score值:", gscv.best_score_, "最优参数列表:", gscv.best_params_) # 使用最优参数看看正确率 mms_best = MinMaxScaler() decision3 = DecisionTreeRegressor(criterion='mse', max_depth=4) x_train3, x_test3, y_train3, y_test3 = x_train1, x_test1, y_train1, y_test1 x_train3 = mms_best.fit_transform(x_train3, y_train3) x_test3 = mms_best.transform(x_test3) decision3.fit(x_train3, y_train3) print("正确率:", decision3.score(x_test3, y_test3)) # 查看各个不同深度的错误率
train_predict(clf_C, X_train_100, y_train_100, X_test, y_test)
train_predict(clf_C, X_train_200, y_train_200, X_test, y_test)
train_predict(clf_C, X_train_300, y_train_300, X_test, y_test)

# AdaBoost Model tuning
# Create the parameters list you wish to tune
parameters = {'n_estimators': [20, 30, 40, 50, 60, 70]}

# initialize the classifier
clf = clf_A

# Make an f1 scoring function using 'make_scorer'
f1_scorer = make_scorer(f1_score, pos_label='yes')

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, param_grid=parameters, scoring=f1_scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj.fit(X_train, y_train)

# Get the best tuned estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print
print "Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train))
print "Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test))
print "Tuned model has an optimal parameter: ", grid_obj.best_params_
print "Features importances array is :", clf.feature_importances_
print "Key Features for identifying 'Pass/Fail' are:", X_all.columns[clf.feature_importances_ > 0.1]
X = d[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Spouse']].values # check CV score for max depth = 3 ctree = tree.DecisionTreeClassifier(max_depth=3) np.mean(cross_val_score(ctree, X, y, cv=5, scoring='roc_auc')) # check CV score for max depth = 10 ctree = tree.DecisionTreeClassifier(max_depth=10) np.mean(cross_val_score(ctree, X, y, cv=5, scoring='roc_auc')) # Conduct a grid search for the best tree depth ctree = tree.DecisionTreeClassifier(random_state=1, min_samples_leaf=20) depth_range = range(1, 20) param_grid = dict(max_depth=depth_range) grid = GridSearchCV(ctree, param_grid, cv=5, scoring='roc_auc') grid.fit(X, y) # Check out the scores of the grid search grid_mean_scores = [result[1] for result in grid.grid_scores_] print(grid_mean_scores) # Plot the results of the grid search plt.figure() plt.plot(depth_range, grid_mean_scores) plt.hold(True) plt.grid(True) plt.plot(grid.best_params_['max_depth'], grid.best_score_, 'ro', markersize=12, markeredgewidth=1.5, markerfacecolor='None', markeredgecolor='r') # Get the best estimator
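# On scikit-learn 0.20+ the grid_scores_ list used for grid_mean_scores above
# no longer exists; the equivalent, assuming the fitted 'grid', is:
grid_mean_scores = grid.cv_results_['mean_test_score']
print(grid_mean_scores)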
spm = SPMFeature(patch_file=patches, method=method, all_x=all_x, img_size=600)
svm = SVC(kernel='linear', probability=True, random_state=42)
clf = Pipeline([('spm', spm), ('svm', svm)])
params = {"svm__C": [0.01, 1, 100],
          "spm__clusters": [256, 512, 1024]}
print "SEARCHING SPM+SVM"

# perform a grid search over the parameters
# (without a score method, the parameters cannot be searched)
start = time.time()
gs = GridSearchCV(clf, params, cv=2, n_jobs=-1, verbose=1)
gs.fit(x_train, y_train)

# print diagnostic information to the user and grab the
# best model
print "\ndone in %0.3fs" % (time.time() - start)
print "best score: %0.3f" % (gs.best_score_)
print "SPM + SVM PARAMETERS"
bestParams = gs.best_estimator_.get_params()

# loop over the parameters and print each of them out
# so they can be manually set
for p in sorted(params.keys()):
    print "\t %s: %f" % (p, bestParams[p])

best = gs.best_estimator_
if run_gs:
    parameter_grid = {
        'max_depth': [4, 6, 8],
        'n_estimators': [50, 10],
        'max_features': ['sqrt', 'auto', 'log2'],
        'min_samples_split': [1.0, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False]
    }
    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation)

    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best Score: {}'.format(grid_search.best_score_))
    print('Best Parameters: {}'.format(grid_search.best_params_))
else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3,
                  'n_estimators': 50, 'min_samples_split': 10,
                  'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

print compute_score(model, train, targets, scoring='accuracy')
un_train_data, un_test_data, un_train_labels, un_test_labels = train_test_split(UnProc_Data, All_labs, test_size=0.4, random_state=0) # Create feature vectors vectorizer = TfidfVectorizer() train_vectors = vectorizer.fit_transform(train_data) # Processed feature vectors test_vectors = vectorizer.transform(test_data) un_train_vectors = vectorizer.fit_transform(un_train_data) # Unprocessed feature vectors un_test_vectors = vectorizer.transform(un_test_data) # Perform classification with Optimized SVM classifier_rbf = GridSearchCV(svm.SVC(), tuned_parameters, cv=cv, scoring=make_scorer(f1_score, pos_label='pos', average='weighted'), n_jobs=7) t0 = time.time() classifier_rbf.fit(train_vectors, train_labels) t1 = time.time() prediction_rbf = classifier_rbf.predict(test_vectors) t2 = time.time() time_rbf_train = t1 - t0 time_rbf_predict = t2 - t1 # Print results in a nice table print("Results for Optimized SVM - processed data") print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict)) print(classification_report(test_labels, prediction_rbf)) print print sklearn.metrics.confusion_matrix(test_labels, prediction_rbf) t0 = time.time() classifier_rbf.fit(un_train_vectors, un_train_labels)
y_predprob = gbm0.predict_proba(X)[:, 1]
print("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))
# The fit is acceptable; next, improve the model's generalization by tuning.

####### (1) learning rate and number of iterations (n_estimators)
####### In general, start with a fairly small learning rate and search for the
####### best number of iterations. Here the learning rate is initialized to
####### 0.1 while the optimal n_estimators is tuned.
##################
param_test1 = {'n_estimators': list(range(20, 81, 10))}
# In Python 3, range() returns an iterator, so list() is needed to get a list
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.1,
                                         min_samples_split=300,
                                         min_samples_leaf=20, max_depth=8,
                                         max_features='sqrt', subsample=0.8,
                                         random_state=10),
    param_grid=param_test1, scoring='roc_auc', iid=False, cv=5)
gsearch1.fit(X, y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
# The output shows the best number of iterations is 60.

####### (2) Grid search over the maximum tree depth max_depth and the minimum
####### number of samples required to split an internal node
####### min_samples_split, with n_estimators=60 added to the parameter set
##################
param_test2 = {'max_depth': list(range(3, 14, 2)),
               'min_samples_split': list(range(100, 801, 200))}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,
                                         min_samples_leaf=20,
                                         max_features='sqrt', subsample=0.8,
                                         random_state=10),
    param_grid=param_test2, scoring='roc_auc', iid=False, cv=5)
gsearch2.fit(X, y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
# The best maximum tree depth is 7 and the best min_samples_split is 300.
# A depth of 7 is a reasonable value, so it can be fixed now; min_samples_split
# cannot be finalized yet, since it interacts with the other tree parameters.
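# The stage-by-stage tuning above repeats the same fit/read-off cycle; a small
# helper makes each round one call. A sketch only, assuming an estimator with
# scikit-learn's get_params/set_params interface:
def tune_stage(estimator, grid, X, y, scoring='roc_auc', cv=5):
    """Run one tuning round and fold the winners back into the estimator."""
    gs = GridSearchCV(estimator, param_grid=grid, scoring=scoring, cv=cv)
    gs.fit(X, y)
    estimator.set_params(**gs.best_params_)  # freeze the winning values
    return gs.best_params_, gs.best_score_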
params = { #'PCA__n_components': [2], 'SKB__k': [5, 6, 7, 8, 9, 10, 11, 12], 'SKB__score_func': [f_classif] } params.update(clf_step_params) sss = StratifiedShuffleSplit(labels_train, n_iter=20, test_size=0.5, random_state=0) gscv = GridSearchCV(pipe, params, verbose=0, scoring='f1_weighted', cv=sss) gscv.fit(features_train, labels_train) pred = gscv.predict(features_test) clf = gscv.best_estimator_ # Get the selected features # pipe.fit(features_train, labels_train) # selected_features = gscv.best_estimator_.named_steps['SKB'].get_support(indices=True) # feature_scores = gscv.best_estimator_.named_steps['SKB'].scores_ # sfs = [] # for sf in selected_features: # sfs.append((features_list[sf + 1], feature_scores[sf])) # print len(sfs), "best parameters with scores:" # for f, s in sfs: print f, "{0:.3f}".format(s) #
# Split dataset in train, test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Tuning hyperparameters for logistic regression
pipe_logistic = Pipeline([('scl', StandardScaler()),
                          ('clf', LogisticRegression(penalty='l2'))])

param_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

gs = GridSearchCV(estimator=pipe_logistic,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs.fit(x_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# refit=True is the default, so best_estimator_ is already trained; the
# explicit fit below simply keeps the original flow
clf = gs.best_estimator_
clf.fit(x_train, y_train)
print('Train accuracy %.3f' % clf.score(x_train, y_train))
print('Test accuracy %.3f' % clf.score(x_test, y_test))

# Tuning hyperparameters for svc via grid search
pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1, probability=True))])
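# Because the scaler sits inside the pipeline, each CV fold is standardized
# using only its own training portion (no leakage). A quick generalization
# check for the tuned pipeline -- a sketch assuming the variables above:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))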
gbc = GradientBoostingClassifier() xgbc = XGBClassifier() # # print cross_val_score(gbc,X_train,y_train,cv=5).mean() # print cross_val_score(xgbc,X_train,y_train,cv=5).mean() params = { 'max_depth': range(2, 7), 'n_estimators': range(100, 1200, 200), 'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1] } begin = datetime.datetime.now() gs_gbc = GridSearchCV(gbc, params, n_jobs=-1, cv=5, verbose=1) gs_gbc.fit(X_train, y_train) print gs_gbc.best_score_ print gs_gbc.best_params_ print datetime.datetime.now() - begin # 0.838383838384 # {'n_estimators': 900, 'learning_rate': 0.01, 'max_depth': 4} # 0:09:58.525028 gbc_y_pre = gs_gbc.predict(X_test) gbc_submission = pd.DataFrame({ 'PassengerID': test['PassengerId'], 'Survived': gbc_y_pre }) gbc_submission.to_csv('./gbc_submission.csv', index=False) # gs_xgbc=GridSearchCV(xgbc,params,n_jobs=-1,cv=5,verbose=1) # gs_xgbc.fit(X_train,y_train)
msg_train, label_train, cv=5) plt.show() params = { 'tfidf__use_idf': (True, False), 'bow__analyzer': (split_into_lemmas, split_into_tokens), } grid = GridSearchCV( pipeline, # pipeline from above params, # parameters to tune via cross validation refit= True, # fit using all available data at the end, on the best found param combination scoring='accuracy', # what score are we optimizing? cv=StratifiedKFold(label_train, n_folds=5), # what type of cross validation to use ) nb_detector = grid.fit(msg_train, label_train) print nb_detector.grid_scores_ # print nb_detector.predict(["#Wedding / Special Occasion Wear ANN BALON Designer Chain Link Maxi Evening Dress http://goo.gl/hZHDGq "])[0] predictions = nb_detector.predict(msg_test) confusion_matrix(label_test, predictions) print classification_report(label_test, predictions) conf = sklearn.metrics.confusion_matrix(label_test, predictions) plt.imshow(conf, cmap='Accent', interpolation='nearest') plt.colorbar() # plt.imshow(np.random.random((5,5)), interpolation='nearest') plt.xticks(np.arange(0, 2), ['no', 'yes']) plt.yticks(np.arange(0, 2), ['no', 'yes']) plt.show()
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx[0], idx[1]].scatter(X_train_std[y_train == 0, 0],
                                  X_train_std[y_train == 0, 1],
                                  c='blue', marker='^', s=50)
    axarr[idx[0], idx[1]].scatter(X_train_std[y_train == 1, 0],
                                  X_train_std[y_train == 1, 1],
                                  c='red', marker='o', s=50)
    axarr[idx[0], idx[1]].set_title(tt)

plt.text(-3.5, -4.5, s='Sepal width [standardized]',
         ha='center', va='center', fontsize=12)
plt.text(-10.5, 4.5, s='Petal length [standardized]',
         ha='center', va='center', fontsize=12, rotation=90)
plt.show()

from sklearn.grid_search import GridSearchCV

params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}

# mv_clf.get_params()

grid = GridSearchCV(estimator=mv_clf, param_grid=params,
                    cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

for params, mean_score, scores in grid.grid_scores_:
    print('%0.3f+/-%0.2f %r' % (mean_score, scores.std() / 2, params))

print('Best parameters: %s' % grid.best_params_)
# best_score_ is the mean cross-validated ROC AUC of the best candidate
print('ROC AUC: %.2f' % grid.best_score_)
                     cv=3)
elif conf["model"] == "KNN":
    param_grid = {'n_neighbors': range(5, 30, 2)}
    model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
elif conf["model"] == "SVM":
    param_grid = [{
        'C': [1, 10, 100, 1000],
        'kernel': ['linear']
    }, {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    }]
    model = GridSearchCV(SVC(probability=True), param_grid, cv=3)

model.fit(trainData, trainLabels)
print("[INFO] best hyperparameters: {}".format(model.best_params_))

# open the results file for writing and initialize the total number of accurate
# rank-1 and rank-5 predictions
print("[INFO] evaluating...")
f = open(conf["results_path"] + conf["model"] + ".txt", "w")
rank1 = 0
rank5 = 0

# loop over the testing data
for (label, features) in zip(testLabels, testData):
    # predict the probability of each class label and grab the top-5 labels
    # (based on probability)
    preds = model.predict_proba(np.atleast_2d(features))[0]
    preds = np.argsort(preds)[::-1][:5]
def main():
    model_name = 'Random Forest'

    parser = argparse.ArgumentParser(usage=model_name)
    parser.add_argument("train_feature",
                        help="Input file of training features and target")
    parser.add_argument("test_feature", help="Input file of test features")
    parser.add_argument("test_pred",
                        help="Output file of predicted test target")
    parser.add_argument("--prob", action='store_true',
                        help='Predict probability of class 1')
    parser.add_argument("--cores", type=int, default=-1,
                        help='Number of cores to use')
    args = parser.parse_args()
    print(model_name)

    # Read training data and test data
    print('Read training data and test data')
    df_train_feature_target = pd.read_csv(args.train_feature, dtype=np.float32)
    df_test_feature = pd.read_csv(args.test_feature, dtype=np.float32)
    train_X = df_train_feature_target.values[:, :-1]
    train_y = df_train_feature_target.values[:, -1]
    test_X = df_test_feature.values

    # Model specification and parameter range
    model = RandomForestClassifier(n_jobs=-1)
    parameters = [{'n_estimators': [200, 100, 50, 25, 10]}]

    # Cross validation search
    print('Cross validation search')
    clf = GridSearchCV(model, parameters, cv=5, scoring='roc_auc',
                       n_jobs=args.cores, pre_dispatch=args.cores, verbose=3)
    clf.fit(train_X, train_y)

    # Make predictions with the best model
    print('Make predictions with the best model')
    train_pred = clf.predict(train_X)
    train_pred_prob = clf.predict_proba(train_X)[:, 1]
    test_pred = clf.predict(test_X)
    test_pred_prob = clf.predict_proba(test_X)[:, 1]

    # Write out the prediction result
    print('Write out the prediction result')
    pd.Series(test_pred_prob if args.prob else test_pred,
              name='Prob' if args.prob else 'Pred') \
        .to_csv(args.test_pred, index=False, header=True)

    # Report the result
    print('Report the result')
    print('Best Score: ', clf.best_score_)
    print('Best Parameter: ', clf.best_params_)
    print('Parameter Scores: ', clf.grid_scores_)
    print('Model: ', clf)
    print('Accuracy: ', accuracy_score(train_y, train_pred))
    print('F1: ', f1_score(train_y, train_pred))
    print('ROC AUC: ', roc_auc_score(train_y, train_pred_prob))
    print(args.test_pred + '~~' + str(clf.best_score_))
def test_no_refit(): # Test that grid search can be used for model selection only clf = MockClassifier() grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False) grid_search.fit(X, y) assert_true(hasattr(grid_search, "best_params_"))
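# With refit=False the search records best_params_ but never trains a final
# model, so there is no best_estimator_. A minimal sketch of the manual refit
# step (using sklearn.base.clone; an assumption about how one would finish):
from sklearn.base import clone
best = clone(clf).set_params(**grid_search.best_params_)
best.fit(X, y)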
label = train_data['y']
# Assign training data without y as features
features = train_data.drop(['y'], axis=1)

# prepare training and testing sets for training
X_train, X_test, y_train, y_test = train_test_split(features, label,
                                                    test_size=0.4,
                                                    random_state=0)

# select best parameters; these were reached after quite a while of trial and error
parameters = {'max_depth': [3],
              'learning_rate': [0.1],
              'n_estimators': [50],
              'silent': [False],
              'objective': ['reg:linear'],
              'booster': ['dart'],
              'gamma': [0],
              'min_child_weight': [15],
              'max_delta_step': [0],
              'subsample': [1],
              'colsample_bytree': [0.7],
              'colsample_bylevel': [0.5],
              'reg_alpha': [0.0001],
              'reg_lambda': [1],
              'scale_pos_weight': [1],
              'base_score': [0.5],
              'random_state': [2018]}

# XGBRegressor constructor parameter needed for GPU usage
params = {'tree_method': 'gpu_hist'}

# The scoring method indicated by the evaluation metric on the competition page
scorer = make_scorer(r2_score)

# Initializing the grid object with an XGBoost regressor supplied with the
# parameters mentioned above and with the R^2 scoring method
grid_obj = GridSearchCV(XGBRegressor(**params), param_grid=parameters,
                        scoring=scorer)

# Begin training
grid_fit = grid_obj.fit(X_train, y_train)

# We use this line to extract the best estimator in case we used multiple
# hyperparameters in the grid
best_estimator = grid_fit.best_estimator_

# Then we predict on the test data split that we got from the dataset features
best_predictions = best_estimator.predict(X_test)

# Here we register our score
score = grid_fit.score(X_test, y_test)

# We print the best parameters that got us our best estimator
print("best_params_:", grid_fit.best_params_)
# We print the score that we got
print("score:", score)

# test output
test_output_predictions = best_estimator.predict(test_data)

# Here we store the IDs from the ID column
test_data = test_data[['ID']]
X_train, X_test, y_train, y_test = train_test_split(rf_data.iloc[:, 0:42],
                                                    rf_data.iloc[:, [42]],
                                                    test_size=0.33,
                                                    random_state=42)

# Model training: start
# First, grid-search n_estimators
param_test1 = {'n_estimators': list(range(450, 550, 10))}
gsearch1 = GridSearchCV(estimator=RandomForestRegressor(max_features="log2",
                                                        min_samples_leaf=2,
                                                        oob_score=True),
                        param_grid=param_test1, scoring=None, cv=5)
gsearch1.fit(X_train.iloc[:, 0:18], y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# Next, grid-search the maximum tree depth max_depth and the minimum number
# of samples required to split an internal node, min_samples_split.
param_test2 = {
    'max_depth': list(range(80, 100, 2)),
    'min_samples_split': list(range(2, 101, 2))
}
gsearch2 = GridSearchCV(estimator=RandomForestRegressor(n_estimators=50,
                                                        max_features="log2",
                                                        min_samples_leaf=2,
                                                        oob_score=True),
                        param_grid=param_test2, scoring=None, iid=False, cv=5)
parameter_candidates = [ { 'C': [1, 10, 100, 1000], 'kernel': ['linear'] }, { 'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf'] }, ] clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, n_jobs=-1) clf.fit(x_train, y_train) prediction = clf.predict(x_test) print('Best score for training data:', clf.best_score_) print('Best `C`:', clf.best_estimator_.C) print('Best kernel:', clf.best_estimator_.kernel) print('Best `gamma`:', clf.best_estimator_.gamma) #clf = cluster.KMeans(init='k-means++', n_clusters=5, random_state=42) #clf.fit(x_train) #prediction = clf.fit_predict(x_train) fig, ax = plt.subplots(2, 2, figsize=(8, 4)) print("Start Down Scale") from sklearn.decomposition import PCA
# In[49]:

param = {
    'max_depth': [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                  22, 23, 24],
    'min_samples_split': [4, 8, 12, 16, 20, 25],
    'criterion': ['gini', 'entropy']
}
grid_search_params = {
    'estimator': DecisionTreeClassifier(),
    'param_grid': param,  # the parameters we want to optimize, defined above
    'cv': ps,             # use the custom split strategy defined earlier
    'n_jobs': -1          # number of parallel jobs; -1 uses all CPUs
}
gridsearch = GridSearchCV(**grid_search_params)
gridsearch.fit(train_val_features, train_val_labels)

# In[86]:

import pandas as pd

cv_result = pd.DataFrame.from_dict(gridsearch.grid_scores_)
criterion, max_depth, min_samples_split, score = [], [], [], []
for i in range(len(cv_result)):
    criterion.append(cv_result['parameters'][i]['criterion'])
    max_depth.append(cv_result['parameters'][i]['max_depth'])
    min_samples_split.append(cv_result['parameters'][i]['min_samples_split'])
    score.append(cv_result['cv_validation_scores'][i])

df = pd.DataFrame({
    'criterion': criterion,
    'max_depth': max_depth,
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) for train_index, test_index in kf: xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) predictions = xgb_model.predict(X[test_index]) actuals = y[test_index] print(confusion_matrix(actuals, predictions)) print("Boston Housing: regression") boston = load_boston() y = boston['target'] X = boston['data'] kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) for train_index, test_index in kf: xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) predictions = xgb_model.predict(X[test_index]) actuals = y[test_index] print(mean_squared_error(actuals, predictions)) print("Parameter optimization") y = boston['target'] X = boston['data'] xgb_model = xgb.XGBRegressor() clf = GridSearchCV(xgb_model, { 'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200] }, verbose=1) clf.fit(X, y) print(clf.best_score_) print(clf.best_params_)
    features.append(FeatureExtractor.getFeatures(image))

features = np.array(features)
labels = np.array(image_labels)

selector = SelectKBest(chi2, k='all')

scaler = StandardScaler().fit(features)
features = scaler.transform(features)

if calculate_best_params:
    C_range = 10. ** np.arange(-10, 10)
    gamma_range = 10. ** np.arange(-10, 10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(), param_grid=param_grid,
                        cv=StratifiedKFold(labels, 5))
    grid.fit(features, labels)
    print "Best classifier is :", grid.best_estimator_

classifier = SVC(kernel="rbf", gamma=0.1, C=100.0, probability=True,
                 class_weight=None, coef0=0.0, degree=3, shrinking=True,
                 tol=0.001, verbose=False)
classifier.fit(features, labels)

joblib.dump((classifier, training_names, normalization_parameter, features,
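# As written, the hard-coded SVC above ignores whatever the grid search
# found. If the tuned values are wanted instead, one option (an assumption
# about intent, reusing the variables above) is to unpack best_params_:
classifier = SVC(kernel="rbf", probability=True, **grid.best_params_)
classifier.fit(features, labels)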
param_test1 = {'max_depth': [5], 'min_child_weight': [1]} gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.05, n_estimators=160, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='multi:softprob', scale_pos_weight=1, seed=123), param_grid=param_test1, scoring='neg_log_loss', iid=False, cv=5) gsearch1.fit(train, y) print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_ param_test2 = { 'max_depth': [4, 5, 6], } gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.05, n_estimators=150, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='multi:softprob', scale_pos_weight=1, seed=123),
# regressorAtual = names[min_index]
# if (regressorAtual == 'gbm'):
print "searching for the best parameters for GBM"
for train_index_skf, test_index_skf in skf.split(X_data_train):
    X_train_skf, X_test_skf = \
        X_data_train[train_index_skf], X_data_train[test_index_skf]
    Y_train_skf, Y_test_skf = \
        values[train_index_skf], values[test_index_skf]

    print "Running the grid search for the Gradient Boosting regressor..."
    clf3 = GridSearchCV(gbmObj, param_grid=parameters_gbm,
                        scoring='neg_mean_absolute_error', n_jobs=n_cores)
    clf3.fit(X_train_skf, Y_train_skf)
    print "Finished the grid search for the Gradient Boosting regressor."
    print "Gradient Boosting MAE on the training set: " + str(
        -clf3.best_score_) + " with parameters: " + str(clf3.best_params_)

    gbmPreditcTestKfold = clf3.predict(X_test_skf)
    MAE_GBM = mean_absolute_error(Y_test_skf, gbmPreditcTestKfold)
    print "MAE_GBM on the test set: " + str(MAE_GBM)
    if (MAE_GBM < bestScore_gbm):
        bestScore_gbm = MAE_GBM
        gbm_n_estimators_best = clf3.best_params_['n_estimators']
        gbm_max_features_best = clf3.best_params_['max_features']
        gbm_max_depth_best = clf3.best_params_['max_depth']
        gbm_learning_rate_best = clf3.best_params_['learning_rate']
    print "best GBM so far: " + str(bestScore_gbm) + " n_estimators: " + str(
        gbm_n_estimators_best) \
        + " max_features: " + str(gbm_max_features_best) + " max_depth: " + str(gbm_max_depth_best) + \
# The last column describes the targets explanatory_variable_columns.remove(len(df.columns.values) - 1) y = [1 if e == 'ad.' else 0 for e in response_variable_column] X = df[list(explanatory_variable_columns)] X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True) X_train, X_test, y_train, y_test = train_test_split(X, y) pipeline = Pipeline([('clf', DecisionTreeClassifier(criterion='entropy'))]) parameters = { 'clf__max_depth': (150, 155, 160), 'clf__min_samples_split': (1, 2, 3), 'clf__min_samples_leaf': (1, 2, 3) } grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1') grid_search.fit(X_train, y_train) print('Best score: %0.3f' % grid_search.best_score_) print('Best parameters set:') best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print('\t%s: %r' % (param_name, best_parameters[param_name])) predictions = grid_search.predict(X_test) print(classification_report(y_test, predictions)) print(grid_search.score(X_test, y_test))