def search_parameters(data_file):
    """Grid-search the ``alpha`` parameter of a multinomial naive Bayes model.

    Loads a pickled dict holding ``'features'`` and ``'labels'`` from
    ``data_file``, keeps half of the samples as the development set, runs a
    grid search on it for every entry of ``scores`` and prints the best
    parameter values found.

    Args:
        data_file: path of the pickle file to read.

    Returns:
        None. Results are printed.
    """
    # Pickles are binary data: text mode fails on Python 3 and can corrupt
    # the stream on platforms that translate line endings.
    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # with trusted data files.
    with open(data_file, 'rb') as f:
        data = pickle.load(f)
    labels = data['labels']
    features = data['features']

    # Split the dataset in two equal parts; only the training half is used.
    X_train, _, y_train, _ = cross_validation.train_test_split(
        features, labels, test_size=0.5, random_state=0)

    scores = [
        ('error_rate', zero_one_score),
    ]

    #classifier = svm.LinearSVC()
    classifier = MultinomialNB()

    tuned_parameters = {'alpha': (0.001, 0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10)}
    #tuned_parameters = {'C' :(0.00001, 0.001, 0.01, 0.1,0.5,1,1.5,2,5,10,20,50,100,500,1000)}

    for score_name, score_func in scores:
        print("# Tuning hyper-parameters for %s" % score_name)
        print("")

        clf = GridSearchCV(classifier, tuned_parameters, score_func=score_func)
        clf.fit(X_train, y_train, cv=5)

        print("Best parameters set found on development set:")
        # Each grid_scores_ entry unpacks to three items; the entry with the
        # largest second item (the score) is reported.
        best_parameters, _, _ = max(clf.grid_scores_, key=lambda x: x[1])
        for param_name in sorted(tuned_parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))
def test_grid_search_precomputed_kernel():
    """Check grid search on a precomputed (linear) kernel matrix.

    The search is fitted on the Gram matrix of the first 180 samples, used
    to predict the remaining 20, and must raise ``ValueError`` when the
    kernel is handed over as a nested list.
    """
    n_train = 180
    samples, targets = make_classification(n_samples=200, n_features=100,
                                           random_state=0)
    train_samples, test_samples = samples[:n_train], samples[n_train:]
    train_targets, test_targets = targets[:n_train], targets[n_train:]

    # Training kernel matrix corresponding to the linear kernel.
    gram_train = np.dot(train_samples, train_samples.T)

    search = GridSearchCV(SVC(kernel='precomputed'), {'C': [0.1, 1.0]})
    search.fit(gram_train, train_targets)
    assert_true(search.best_score_ >= 0)

    # Test kernel matrix: test samples against the training samples.
    gram_test = np.dot(test_samples, train_samples.T)
    predicted = search.predict(gram_test)
    assert_true(np.mean(predicted == test_targets) >= 0)

    # An error is raised when the precomputed kernel is not array-like
    # or sparse.
    assert_raises(ValueError, search.fit, gram_train.tolist(), train_targets)
def do_cross_validation(self, param_grid, svmtype, score_func,
                        inputdata_train, outputdata_train,
                        inputdata_test, outputdata_test):
    """Grid-search an SVM on the training data and score it on the test data.

    Args:
        param_grid: parameter grid handed to ``GridSearchCV``. The selected
            parameters are read back by the keys ``'C'`` and, for
            ``svmtype == 'rbf'``, ``'gamma'``.
        svmtype: ``'ln'`` to tune a ``LinearSVC`` or ``'rbf'`` to tune an
            ``SVC``.
        score_func: scoring function forwarded to ``GridSearchCV``.
        inputdata_train: training samples.
        outputdata_train: training targets.
        inputdata_test: test samples.
        outputdata_test: test targets.

    Returns:
        Tuple ``(f1, gamma, c)``: the F1 score on the test data computed with
        ``pos_label=0``, the selected ``gamma`` (``0`` for the linear SVM)
        and the selected ``C``.

    Raises:
        ValueError: if ``svmtype`` is neither ``'ln'`` nor ``'rbf'``.
    """
    if svmtype == 'ln':
        svm_clf = LinearSVC()
    elif svmtype == 'rbf':
        svm_clf = SVC()
    else:
        # Fail early with a clear message instead of an unbound local below.
        raise ValueError(
            "svmtype must be 'ln' or 'rbf', got %r" % (svmtype,))

    clf_cv = GridSearchCV(svm_clf, param_grid, score_func=score_func,
                          n_jobs=-1)
    clf_cv.fit(inputdata_train, outputdata_train)

    y_pred_cv = clf_cv.predict(inputdata_test)
    f1 = metrics.f1_score(outputdata_test, y_pred_cv, pos_label=0)

    dict_param = clf_cv.best_params_
    c = dict_param['C']
    # Only the rbf grid carries a gamma; the linear case reports 0.
    gamma1 = dict_param['gamma'] if svmtype == 'rbf' else 0
    return (f1, gamma1, c)
def test_krr_regbeta():
    """Fit kernel ridge regression with ``regularize_beta=True`` on noisy linear data.

    ``eta`` is selected by grid search and the score of the best estimator
    against the noise-free test targets is printed.
    """
    n_features, n_train, n_test = 5, 1000, 1001
    offset = 10.0

    weights = np.random.random(size=n_features) - 0.5
    #weights /= np.sqrt(weights.dot(weights))

    Xtrain = np.random.random((n_train, n_features))
    noise = np.random.normal(scale=0.05, size=n_train)
    ytrain = Xtrain.dot(weights) + noise + offset

    Xtest = np.random.random((n_test, n_features))
    yref = Xtest.dot(weights) + offset

    model = kRidgeRegression(kernel=Linear(), eta=1.0, regularize_beta=True)
    search = GridSearchCV(model,
                          {'eta': [1E-6, 1E-4, 1E-2, 1, 1E2, 1E4, 1E6]})
    search.fit(Xtrain, ytrain)

    best = search.best_estimator_
    # The transformed output is not inspected; the call is kept as in the
    # original test.
    best.transform(Xtest).flatten()
    print(best.score(Xtest, yref))
def learn(tuned_parameters, model):
    """Tune ``model`` for precision and recall and print a report for each.

    The training CSV ``'Data/' + trainfile`` (``trainfile`` is a module-level
    name) is read with the target in the first column and the features in
    the remaining columns. Half of the rows form the development set used by
    the grid search; the other half is the evaluation set used for the
    classification report.

    Args:
        tuned_parameters: parameter grid passed to ``GridSearchCV``.
        model: estimator to tune.

    Returns:
        None. Results are printed.
    """
    # Context manager so the file handle is closed once parsing is done.
    with open('Data/' + trainfile, 'r') as train_handle:
        dataset = genfromtxt(train_handle, delimiter=',', dtype='f8')

    # Column 0 is the target, the remaining columns are the features.
    X = np.asarray(dataset[:, 1:])
    y = np.asarray(dataset[:, 0])

    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s_weighted' % score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        # Distinct name for the per-fold scores so the list being iterated
        # by the outer loop is not rebound.
        for params, mean_score, fold_scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, fold_scores.std() * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
def train_classifier(data, labels):
    """Grid-search an L2 logistic regression and print validation metrics.

    20% of the samples (``train_size=.2``, ``random_state=44``) are used for
    the grid search over ``C``; the remaining samples form the validation
    set on which the score and the log loss are printed.

    Args:
        data: feature matrix.
        labels: target labels, one per row of ``data``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    params_log = {
        "penalty": ['l2'],
        "C": [10**i for i in range(-3, -1)],
    }
    clf = GridSearchCV(LogisticRegression(), params_log)

    train, val, t_labs, val_labs = train_test_split(
        data, labels, train_size=.2, random_state=44)

    s = time.time()
    clf.fit(train, t_labs)
    print("%s %s %s %s" % ("Elapsed Training Time for ",
                           len(params_log['C']),
                           'regularization vals: ',
                           time.time() - s))
    print(clf.best_params_)
    print("%s %s" % ("The Validation Score: ", clf.score(val, val_labs)))

    probs = clf.predict_proba(val)
    print("The log loss for the validation set is")
    # log_loss takes the true labels first and the predicted probabilities
    # second; the full probability matrix is passed.
    print(log_loss(val_labs, probs))
    return clf
def test_ovo_gridsearch():
    """Grid search over ``estimator__C`` of a one-vs-one LinearSVC picks a C from the grid."""
    candidate_cs = [0.1, 0.5, 0.8]
    search = GridSearchCV(OneVsOneClassifier(LinearSVC(random_state=0)),
                          {'estimator__C': candidate_cs})
    search.fit(iris.data, iris.target)

    first_fitted = search.best_estimator_.estimators_[0]
    assert_true(first_fitted.C in candidate_cs)
def Gridsearch_impl(X, Y, clf, param, cv, n_jobs=10):
    """Run a verbose grid search and return the report of its grid scores.

    Args:
        X: training samples.
        Y: training targets.
        clf: estimator to tune.
        param: parameter grid passed to ``GridSearchCV``.
        cv: cross-validation setting passed to ``GridSearchCV``.
        n_jobs: number of parallel jobs; defaults to 10.

    Returns:
        Whatever ``report`` (defined elsewhere in this module) returns for
        ``grid_search.grid_scores_``.
    """
    grid_search = GridSearchCV(clf, param, verbose=10, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X, Y)
    # print(grid_search.grid_scores_)
    return report(grid_search.grid_scores_)
def getOptCandGamma(cv_train, cv_label):
    """Grid-search ``C`` and ``gamma`` for an ``svm.SVC`` and plot the scores.

    The search uses a 40-fold ``StratifiedKFold`` built from ``cv_label``.
    The scores are shown as a heat map with one row per ``C`` value and one
    column per ``gamma`` value, and the best estimator is printed.

    Args:
        cv_train: training samples.
        cv_label: training labels.
    """
    print("Finding optimal C and gamma for SVM with RBF Kernel")

    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    n_C, n_gamma = len(C_range), len(gamma_range)

    folds = StratifiedKFold(y=cv_label, n_folds=40)

    # Use the svm.SVC() as the cost function to evaluate parameter choices
    # NOTE: Perhaps we should run computations in parallel if needed. Does it
    # do that already within the class?
    grid = GridSearchCV(svm.SVC(),
                        param_grid={'gamma': gamma_range, 'C': C_range},
                        cv=folds)
    grid.fit(cv_train, cv_label)

    # The second item of every grid score entry is laid out as a
    # (len(C_range), len(gamma_range)) matrix.
    score_matrix = np.array(
        [entry[1] for entry in grid.grid_scores_]).reshape(n_C, n_gamma)

    pl.figure(figsize=(8, 6))
    pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
    pl.imshow(score_matrix, interpolation='nearest', cmap=pl.cm.spectral)
    pl.xlabel('gamma')
    pl.ylabel('C')
    pl.colorbar()
    pl.xticks(np.arange(n_gamma), gamma_range, rotation=45)
    pl.yticks(np.arange(n_C), C_range)
    pl.show()

    print("%s %s" % ("The best classifier is: ", grid.best_estimator_))
def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a recall-scored grid search and report its best settings.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn estimator to tune
    param_grid -- [dict] parameter settings to test
    cv -- fold of cross-validation, default 5

    Returns
    -------
    top_params -- result of report() called with the grid scores and 3
    """
    search = GridSearchCV(clf, param_grid=param_grid, cv=cv,
                          scoring='recall')

    started = time()
    search.fit(X, y)
    elapsed = time() - started

    n_candidates = len(search.grid_scores_)
    message = ("\nGridSearchCV took {:.2f} seconds for {:d} candidate "
               "parameter settings.")
    print(message.format(elapsed, n_candidates))

    return report(search.grid_scores_, 3)
def model_search(estimator, tuned_params, scores, X_train, y_train,
                 X_test, y_test):
    """Tune ``estimator`` once per scoring name and print a report each time.

    Every search uses the same ``ShuffleSplit`` (3 iterations, 30% test
    size, ``random_state=0``) over the training samples. For each entry of
    ``scores`` the best parameters, the grid scores and a classification
    report on ``X_test``/``y_test`` are printed.
    """
    splitter = ShuffleSplit(len(X_train), n_iter=3, test_size=0.30,
                            random_state=0)

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print("")

        search = GridSearchCV(estimator, tuned_params, cv=splitter,
                              scoring='%s' % score)
        search.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print("")
        print(search.best_params_)
        print("")
        print("Grid scores on development set:")
        print("")
        for params, mean_score, fold_scores in search.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, fold_scores.std() * 2, params))
        print("")

        print("Detailed classification report:")
        print("")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print("")
        predicted = search.predict(X_test)
        print(classification_report(y_test, predicted))
        print("")
def grid_search(dataset_loader_train, model, grid_search):
    """Run a grid search for ``model`` and return its grid scores, best first.

    Args:
        dataset_loader_train: callable returning the pair ``(X, y)``.
        model: estimator to tune.
        grid_search: mapping of keyword arguments for ``GridSearchCV``. It
            is layered over the default ``refit=False``. A callable ``'cv'``
            entry is replaced by the result of calling it through
            ``apply_kwargs`` with ``n=len(y)`` and ``y=y``.

    Returns:
        The ``grid_scores_`` entries sorted by descending
        ``mean_validation_score``.

    Raises:
        ValueError: if ``model`` has no ``score`` attribute and no
            ``'scoring'`` option was given.
    """
    with timer(logger.info, "Loading data"):
        X, y = dataset_loader_train()

    # Caller-supplied options override the default of not refitting.
    search_options = {'refit': False}
    search_options.update(grid_search)

    cv_option = search_options.get('cv', None)
    if callable(cv_option):
        search_options['cv'] = apply_kwargs(cv_option, n=len(y), y=y)

    if not hasattr(model, 'score') and 'scoring' not in search_options:
        raise ValueError(
            "Your model doesn't seem to implement a 'score' method.  You may "
            "want to pass a 'scoring' argument to 'grid_search' instead."
        )

    with timer(logger.info, "Running grid search"):
        search = GridSearchCV(model, **search_options)
        search.fit(X, y)

    ranked = sorted(search.grid_scores_,
                    key=lambda entry: -entry.mean_validation_score)
    logger.info("\n{}".format(pformat(ranked)))
    return ranked
def dogridsearch(X, Y, param_space, clf, cv):
    """Run a verbose grid search on all cores, print its duration and report it.

    Args:
        X: training samples.
        Y: training targets.
        param_space: parameter grid passed to ``GridSearchCV``.
        clf: estimator to tune.
        cv: cross-validation setting passed to ``GridSearchCV``.

    Returns:
        Whatever ``report`` (defined elsewhere in this module) returns for
        ``grid_search.grid_scores_``.
    """
    # Plain int literal: the ``10l`` long suffix only parses on Python 2.
    grid_search = GridSearchCV(clf, param_space, verbose=10, cv=cv, n_jobs=-1)
    start = time()
    grid_search.fit(X, Y)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.grid_scores_)))
    return report(grid_search.grid_scores_)
def test_nntools_functional_grid_search(mnist, monkeypatch):
    """Make sure that ``NeuralNet`` satisfies the grid search interface.

    ``NeuralNet.fit`` is patched to record the instance attributes at each
    call and ``NeuralNet.score`` to return a constant, so the test only
    checks which parameter combinations the grid search sets.
    """
    from nolearn.nntools import NeuralNet

    net = NeuralNet(
        layers=[],
        X_tensor_type=T.matrix,
    )
    param_grid = {
        'more_params': [{'hidden_num_units': 100}, {'hidden_num_units': 200}],
        'update_momentum': [0.9, 0.98],
    }
    X, y = mnist

    recorded = []

    def record_and_return(self, X, y):
        # Snapshot the attributes the grid search has set on this instance.
        recorded.append(vars(self).copy())
        return self

    fit_patch = patch.object(NeuralNet, 'fit', autospec=True)
    score_patch = patch('nolearn.nntools.NeuralNet.score')
    with fit_patch as mock_fit, score_patch as mock_score:
        mock_fit.side_effect = record_and_return
        mock_score.return_value = 0.3
        search = GridSearchCV(net, param_grid, cv=2, refit=False, verbose=4)
        search.fit(X, y)

    seen_momentum = [entry['update_momentum'] for entry in recorded]
    assert seen_momentum == [0.9, 0.9, 0.98, 0.98] * 2

    seen_more_params = [entry['more_params'] for entry in recorded]
    assert seen_more_params == (
        [{'hidden_num_units': 100}] * 4 +
        [{'hidden_num_units': 200}] * 4
    )
def getGridSearch(self):
    """Grid-search ``self.pipeline`` and print the best parameters and score.

    The search is fitted on ``self.tweets`` and ``self.labels`` using all
    cores. Nothing is returned.
    """
    # Set the search parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],  # Try either words or bi grams
        'vect__max_df': (0.5, 0.1, 0.09),
        #'vect__max_features': (None, 5000, 10000, 50000),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__penalty': ('l2', 'elasticnet', 'l1'),  # Default l2
        'clf__alpha': (0.0001, 0.0009),  # Default 0.0001
        #'clf_fit_intercept': (True, False), # Default True
        'clf__n_iter': (5, 50, 25),  # Default 1 or 5 depending, optional
        #'clf__random_state':(0, 42), # Default None
        'clf__epsilon': (0.01, 0.005),  # Default 0.01, depends on classifier (loss)
    }

    # Use all cores to create a grid search, then fit it for use as a
    # classifier.
    search = GridSearchCV(self.pipeline, parameters, n_jobs=-1)
    search = search.fit(self.tweets, self.labels)

    def _score_of(entry):
        # The second item of a grid score entry is used as its score.
        return entry[1]

    bestParam, score, _ = max(search.grid_scores_, key=_score_of)

    # Print the parameter values
    for param_name in sorted(parameters):
        print("%s: %r" % (param_name, bestParam[param_name]))

    # Print the classifier score
    print("Classifier score: " + str(score) + "\n")
def score_nestedCV(self, G1, model, param_grid, effect, nested):
    """Score ``model`` over ``self.n_folds`` folds, optionally tuning per fold.

    Args:
        G1: data handed to ``self._packData`` together with the fold indices.
        model: estimator to fit (and, when ``nested``, to tune).
        param_grid: parameter grid for the inner grid search.
        effect: forwarded to ``self._packData``.
        nested: when true, every fold runs its own ``GridSearchCV`` on the
            training part; otherwise ``model`` is fitted directly and scored
            with ``SCORERS[self.scoring]``.

    Returns:
        Tuple ``(scores, params)``: the per-fold scores and the best
        parameters of each fold (left empty when ``nested`` is false).
    """
    folds = cross_validation.KFold(n=self.Y.shape[0], n_folds=self.n_folds,
                                   indices=True)
    scores = sp.zeros(self.n_folds)
    params = []

    for fold_index, (train, test) in enumerate(folds):
        trainData, trainY = self._packData(G1, train, effect)
        testData, testY = self._packData(G1, test, effect)

        if not nested:
            model.fit(trainData, trainY.flatten())
            scorer = SCORERS[self.scoring]
            scores[fold_index] = scorer(model, testData, testY.flatten())
            continue

        search = GridSearchCV(estimator=model, param_grid=param_grid,
                              n_jobs=self.n_jobs_grid,
                              cv=self.n_folds_params,
                              scoring=self.scoring, verbose=self.verbose)
        search.fit(trainData, trainY.flatten())
        params.append(search.best_params_)
        scores[fold_index] = search.score(testData, testY.flatten(),
                                          method_scorer=False)

    return scores, params
def estimateParameters(X_train, X_test, y_train, y_test):
    """Tune an ``SVC`` for precision and for recall and print a report for each.

    Args:
        X_train: development samples used by the grid search.
        X_test: evaluation samples used for the classification report.
        y_train: development targets.
        y_test: evaluation targets.

    Returns:
        None. Results are printed.
    """
    tuned_parameters = [
        {'kernel': ['rbf'],
         'gamma': [1e-3, 1e-4],
         'C': [1, 10, 100, 1000]},
        {'kernel': ['linear'],
         'C': [1, 10, 100, 1000]},
    ]

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s\n" % score)

        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:\n")
        print(clf.best_estimator_)

        print("\nGrid scores on development set:\n")
        # Distinct name for the per-fold scores so the list iterated by the
        # outer loop is not rebound; the spread is reported as two standard
        # deviations.
        for params, mean_score, fold_scores in clf.grid_scores_:
            print("%.3f (+/-%.03f) for %r"
                  % (mean_score, fold_scores.std() * 2, params))

        print("\nDetailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")

        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
def make_grid_search(pipeline, parameters, model_name, params):
    """Grid-search ``pipeline`` with F1 scoring, then save the model and predictions.

    Relies on the module-level names ``features``, ``salaries_enc``,
    ``validation_features`` and ``dio``.

    Args:
        pipeline: estimator pipeline to tune; its parameters are set to the
            best combination found before it is handed to ``dio.save_model``.
        parameters: parameter grid passed to ``GridSearchCV``.
        model_name: name printed and passed to ``dio``.
        params: value extended with the CV scores before being saved.
    """
    print(model_name)
    search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=3,
                          #loss_func=f1_score,
                          scoring="f1", iid=False, refit=True)

    #model_name = "ExtraTree_min_sample2_10trees_gridcv_desc_log"

    print("Performing grid search...")
    print("pipeline:", pipeline)  # [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    started = time()
    search.fit(features, salaries_enc)
    print("done in %0.3fs" % (time() - started))
    print()

    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    chosen = search.best_params_
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, chosen[param_name]))

    best_estimator = pipeline.set_params(**chosen)
    # NOTE(review): this builds a 2-tuple (params + " ", cv_scores_), not a
    # concatenated string; kept as in the original, confirm the intent.
    saved_params = (params + " ", search.cv_scores_)
    dio.save_model(best_estimator, model_name, mae_cv=search.best_score_,
                   parameters=saved_params)
    print(search.cv_scores_)

    prediction = search.predict(validation_features)
    dio.save_prediction(model_name, prediction, "valid_classes")
def separable_demo():
    """
    Build a two-blob dataset, tune a linear SVM on a 75% training split
    and draw the resulting decision boundary on a figure.
    """
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=200, n_features=2,
                      centers=((0, 0), (4, 4)), cluster_std=1.0)
    plot_data(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    svc = svm.SVC(class_weight='auto')
    param_grid = {'kernel': ['linear'], 'C': [1e0, 1e1, 1e2, 1e3, 1e4]}
    strat_2fold = StratifiedKFold(y_train, k=2)

    print("    Parameters to be chosen through cross validation:")
    for name, vals in param_grid.items():
        if name == 'kernel':
            continue
        print("        {0}: {1}".format(name, vals))

    clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold)
    clf.fit(X_train, y_train)
    print("%s %s" % ("== Best Parameters:", clf.best_params_))

    y_pred = clf.predict(X_test)
    # Fraction of test samples whose prediction matches the label.
    n_correct = np.count_nonzero(y_pred == y_test)
    acc = n_correct / float(len(y_pred))
    print("%s %s" % ("== Accuracy:", acc))
    print(classification_report(y_test, y_pred))

    figure_title = "SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})"
    plot_svm(clf.best_estimator_, X, y, X_test, y_test,
             title=figure_title.format(acc, clf.best_params_['C']))
def classification_level_RandForest_pipeline(classifications_DF):
    """Tune a SelectKBest + RandomForestRegressor pipeline and save its predictions.

    Columns 3..88 of ``classifications_DF`` are the features and column 2 is
    the target, later addressed by the name ``'session_length'``. The scaled
    features are split 70/30; the grid search runs on the training part and
    the best estimator is evaluated on the rest. The predictions and true
    values are written to ``randomForestReg_pred_true.csv``.

    Args:
        classifications_DF: pandas DataFrame laid out as described above.
    """
    X = classifications_DF.iloc[:, 3:89]
    # Target (session length) as float, kept as a one-column frame so it can
    # be addressed by name below.
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)

    # Scale the data for feature selection.
    X_scaled = preprocessing.scale(X)
    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = \
        train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0)

    # Univariate feature selection followed by the random forest. The k and
    # n_estimators given here are placeholders overridden by the grid.
    selectKbest = SelectKBest(k=1, score_func=f_regression)
    randomForestReg = RandomForestRegressor(n_estimators=1, criterion='mse')
    pipeline = Pipeline([('selectKbest', selectKbest),
                         ('randomForestReg', randomForestReg)])

    # Grid over the number of selected features and the forest parameters.
    tuned_params = dict(
        selectKbest__k=[5, 10, 20, 30, 40, 50, 80],
        randomForestReg__n_estimators=[1, 2, 4, 8, 16, 32, 64],
        randomForestReg__min_samples_split=[2, 3, 5, 10, 20])

    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3,
                               verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true = y_actual_test['session_length'].values
    y_pred = grid_search.best_estimator_.predict(X_scaled_test)
    print("Mean squared error:" + str(mean_squared_error(y_true, y_pred)))

    # Predictions become the index column, true values the data column.
    pd.DataFrame(y_true, index=y_pred).to_csv(
        "randomForestReg_pred_true.csv")
def classification_level_SGDReg_pipeline(classifications_DF):
    """Tune a PCA + SGDRegressor pipeline and save its predictions.

    Columns 3..88 of ``classifications_DF`` are the features and column 2 is
    the target, later addressed by the name ``'session_length'``. The scaled
    features are split 50/50; the grid search runs on the training part and
    the best estimator is evaluated on the rest. The predictions and true
    values are written to ``SGDReg_pred_true.csv``.

    Args:
        classifications_DF: pandas DataFrame laid out as described above.
    """
    X = classifications_DF.iloc[:, 3:89]
    # Target (session length) as float, kept as a one-column frame so it can
    # be addressed by name below.
    y_actual = classifications_DF.iloc[:, 2:3].astype(float)

    # Scale the data before the PCA.
    X_scaled = preprocessing.scale(X)
    X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = \
        train_test_split(X_scaled, y_actual, test_size=0.5, random_state=0)

    # The PCA is fitted inside the pipeline by the grid search. The earlier
    # stand-alone fit indexed the scaled feature matrix by a column name and
    # its result was never used, so it is gone.
    pca_selection = PCA(n_components=2)
    SGDReg = SGDRegressor(alpha=0.0001)
    pipeline = Pipeline([('pca', pca_selection), ('SGDReg', SGDReg)])

    # Grid over the number of components and the SGD regressor parameters.
    tuned_params = dict(
        pca__n_components=[5, 30, 40, 50],
        SGDReg__alpha=[0.1, 0.01, 0.001, 0.0001, 0.00001],
        SGDReg__l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
        SGDReg__penalty=['l2', 'l1', 'elasticnet'])

    grid_search = GridSearchCV(pipeline, param_grid=tuned_params,
                               scoring='mean_squared_error', cv=3,
                               verbose=10)
    grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
    print(grid_search.best_estimator_)

    y_true = y_actual_test['session_length'].values
    y_pred = grid_search.best_estimator_.predict(X_scaled_test)
    print("Mean squared error:" + str(mean_squared_error(y_true, y_pred)))

    # Predictions become the index column, true values the data column.
    pd.DataFrame(y_true, index=y_pred).to_csv("SGDReg_pred_true.csv")
def grid_search(X, y):
    '''
    Cross-validated grid search using Ridge Regressor and Random Forest
    Regressor.

    Both searches use 5-fold cross-validation on ``X`` and ``y``. The best
    Ridge model is dumped to ``ridge.pkl`` and the best random forest to
    ``rfr.pkl`` with ``dill``.

    Returns the pair ``(ridge, rfr)`` of best estimators.
    '''
    ridge_grid = {'alpha': [0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1,
                            0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02]}
    gs = GridSearchCV(Ridge(), ridge_grid, cv=5)
    gs.fit(X, y)
    ridge = gs.best_estimator_
    # Context manager so the pickle file is flushed and closed.
    with open('ridge.pkl', 'wb') as ridge_file:
        dill.dump(ridge, ridge_file)

    forest_grid = {'max_depth': [5, 8, 10, 20, 50, 100],
                   'min_samples_split': [2, 3, 5, 10, 20]}
    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      forest_grid, cv=5)
    # The search has to be fitted before best_estimator_ exists.
    gs.fit(X, y)
    rfr = gs.best_estimator_
    with open('rfr.pkl', 'wb') as rfr_file:
        dill.dump(rfr, rfr_file)

    return ridge, rfr
def run_random_forest(training_features, training_labels, test_features,
                      test_labels, passed_parameters=None):
    """Tune a random forest regressor and evaluate it on the test data.

    Args:
        training_features: training samples; ``shape[0]`` is used as the
            sample count for the ``ShuffleSplit``.
        training_labels: training targets.
        test_features: test samples.
        test_labels: test targets.
        passed_parameters: parameter grid for ``GridSearchCV``. When omitted,
            a single-candidate grid with ``max_depth=None`` is used.

    Returns:
        Tuple ``(test_prediction, test_accuracy)``: the predictions for
        ``test_features`` and the value of ``regressor.score`` on the test
        data.
    """
    estimator = ensemble.RandomForestRegressor(random_state=0,
                                               n_estimators=25)

    # Grid values are lists of candidates, so the default is wrapped in one.
    if passed_parameters is None:
        parameters = {"max_depth": [None]}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2,
                      random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv,
                             param_grid=parameters)

    # fit the regressor
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)
    return test_prediction, test_accuracy
def run_linear_open_experiment(self, iterations=10, save=False):
    """
    Train a linear SVM with a grid-searched cost parameter and test it on
    an open-world split of the dataset, repeated ``iterations`` times.

    Each run re-randomizes the dataset, fits the grid search on
    ``self.X_train``/``self.Y_train`` and labels every test sample from the
    decision values of the best estimator: 99 when any ``|max / score|`` is
    below 0.5, otherwise the class holding the maximum. Predictions and
    true labels accumulate in ``self.predictions`` and ``self.true_labels``.

    :param iterations: number of runs (training/testing)
    :save: save predictions and labels as pz files if True
    """
    self.true_labels = np.array([])
    self.predictions = np.array([])

    for _ in range(iterations):
        self.randomize_dataset_open_world()

        search = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
        search.fit(self.X_train, self.Y_train)

        best = search.best_estimator_
        decision_rows = best.decision_function(self.X_test)
        classes = best.classes_

        for row in decision_rows:
            top = np.max(row)
            # The slice is kept from the original expression.
            if (abs(top / row[:]) < 0.5).any():
                label = 99
            else:
                label = classes[np.where(row == top)]
            self.predictions = np.append(self.predictions, label)

        self.true_labels = np.append(self.true_labels, self.Y_test)

    if save:
        pz.save(self.predictions, "mca_predictions_open.pz")
        pz.save(self.true_labels, "mca_true_labels_open.pz")
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data.

    A ``DecisionTreeRegressor`` is tuned over ``max_depth`` 1..10 with
    6-fold cross-validation, scored by mean squared error (lower is better).
    The fitted search, the best estimator and the prediction for one
    hard-coded sample are printed.

    Args:
        city_data: dataset object exposing ``data`` and ``target``.
    """
    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    dtr_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # 2. Use gridsearch to fine tune the Decision Tree Regressor and find
    # the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = GridSearchCV(regressor, parameters, scoring=dtr_scorer, cv=6)

    # Fit the learner to the training data
    print("Final Model: ")
    print(reg.fit(X, y))
    print("%s %s" % ("Best estimator choosen by GridSearchCV: ",
                     reg.best_estimator_))

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0,
         20.20, 332.09, 12.13]
    # predict works on a 2-D (n_samples, n_features) input, so the single
    # sample is wrapped in a list.
    prediction = reg.predict([x])
    print("House: " + str(x))
    print("Prediction: " + str(prediction))
def grid_search_model(clf_factory, X, Y, save_file="read/best_param.txt"):
    """Search for the best parameters.

    Args:
        clf_factory: callable returning the machine-learning model to tune
        X: features
        Y: labels
        save_file: passed to ``write_to_text`` together with the best
            parameters

    Returns:
        clf: the best model found
    """
    stopwords = load_stopwords_old()
    splitter = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3,
                            random_state=0)

    param_grid = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__min_df': [1, 2],
        'vect__stop_words': [None, stopwords],
        'vect__smooth_idf': [False, True],
        'vect__use_idf': [False, True],
        'vect__sublinear_tf': [False, True],
        'vect__binary': [False, True],
    }

    search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=splitter,
                          verbose=10)
    search.fit(X, Y)

    write_to_text(search.best_params_, save_file)
    return search.best_estimator_
def run_linear_experiment(self, rocs_filename, iterations=10):
    """
    Run a classification experiment over several iterations.

    Every iteration randomizes the data, grid-searches the cost parameter
    of a linear SVM over ``np.logspace(-3, 3, 7)`` and computes a ROC curve
    from the decision values of the best classifier on ``self.X_test``.
    The curves accumulate in ``self.rocs`` and are saved to
    ``rocs_filename``.

    :param rocs_filename: the file to save all rocs computed
    :param iterations: number of runs (training/testing)
    """
    for run in range(iterations):
        print("[*] Iteration {0}".format(run))
        print("[*] Randomizing dataset...")
        self.randomize_dataset()

        search = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})

        print("[*] Training...")
        search.fit(self.X_train, self.Y_train)
        decision_values = search.best_estimator_.decision_function(
            self.X_test)

        print("[*] Testing...")
        flat_scores = np.float32(decision_values.flatten())
        expected = np.float32(self.Y_test)
        self.rocs.append(eval.compute_roc(flat_scores, expected))
        print("[*] ROC saved.")

    pz.save(self.rocs, rocs_filename)
def run_support_vector_regressor(training_features, training_labels,
                                 test_features, test_labels,
                                 passed_parameters=None):
    """Tune a support vector regressor and evaluate it on the test data.

    Args:
        training_features: training samples; ``shape[0]`` is used as the
            sample count for the ``ShuffleSplit``.
        training_labels: training targets.
        test_features: test samples.
        test_labels: test targets.
        passed_parameters: parameter grid for ``GridSearchCV``. When omitted,
            only the linear kernel is tried.

    Returns:
        Tuple ``(test_prediction, test_accuracy)``: the predictions for
        ``test_features`` and the value of ``regressor.score`` on the test
        data.
    """
    estimator = svm.SVR()

    # Identity check: None is a singleton and ``==`` may be overloaded.
    if passed_parameters is None:
        parameters = {"kernel": ["linear"]}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2,
                      random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv,
                             param_grid=parameters)

    # fit the regressor
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)
    return test_prediction, test_accuracy
def test_krr_regP():
    """Fit kernel ridge regression on noisy linear data shifted away from the origin.

    ``eta`` is selected by grid search; the shapes of ``beta`` and ``Ku``
    of the best estimator and its score against the noise-free test targets
    are printed.
    """
    n_features, n_train, n_test = 5, 1000, 1001
    offset = 10.0

    weights = np.random.random(size=n_features) - 0.5
    #weights /= np.sqrt(weights.dot(weights))

    Xtrain = np.random.random((n_train, n_features)) + 1.0
    noise = np.random.normal(scale=0.05, size=n_train)
    ytrain = Xtrain.dot(weights) + noise + offset

    Xtest = np.random.random((n_test, n_features)) + 1.0
    yref = Xtest.dot(weights) + offset

    model = kRidgeRegression(kernel=Linear(), eta=1.0)
    eta_grid = {'eta': [0, 1E-16, 1E-14, 1E-12, 1E-10, 1E-8, 1E-6, 1E-4,
                        1E-2, 1]}
    search = GridSearchCV(model, eta_grid)
    search.fit(Xtrain, ytrain)

    best = search.best_estimator_
    # The transformed output is not inspected; the call is kept as in the
    # original test.
    best.transform(Xtest).flatten()
    print(best.beta.shape)
    print(best.Ku.shape)
    print(best.score(Xtest, yref))
def MyGridSearch(X, y, score_threshold=0.98):
    """Fit an SVR grid search per fold and stop at the first good enough one.

    The data is split into 5 folds. For each fold a ``GridSearchCV`` over
    an ``svm.SVR`` is fitted on the training indices and its
    ``best_score_`` is printed. The first search whose ``best_score_``
    exceeds ``score_threshold`` is returned; if none does, the search of
    the last fold is returned.

    Args:
        X: samples, indexable by the fold index arrays.
        y: targets, indexable by the fold index arrays.
        score_threshold: ``best_score_`` above which the search stops early;
            defaults to 0.98.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    kfold = cross_validation.KFold(len(X), 5)

    # Single-candidate grid; widen these lists to search over several
    # values.
    parameters = {'kernel': ['rbf'], 'gamma': [1e-5], 'epsilon': [0.2],
                  'C': [100000]}

    for train, _ in kfold:
        grid = GridSearchCV(svm.SVR(), parameters)
        grid.fit(X[train], y[train])
        print(grid.best_score_)
        if grid.best_score_ > score_threshold:
            break

    return grid
# Use PCA to lower the dimensionality before the SVM.
from timeit import default_timer as _default_timer

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

pca = PCA(n_components=150, whiten=True, random_state=42,
          svd_solver='randomized')
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

# Split the data.
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=42)

# Grid search CV to explore parameters C (margin hardness) and gamma (size
# of the radial basis function kernel).
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

# Run the search and time it. Plain Python timing replaces the IPython
# ``%time`` magic, which is not valid outside IPython.
_fit_started = _default_timer()
grid.fit(Xtrain, ytrain)
print("Wall time: %.2f s" % (_default_timer() - _fit_started))
print(grid.best_params_)

# Predict with the best pipeline found.
model = grid.best_estimator_
yfit = model.predict(Xtest)

# Show the first 24 test images; the predicted surname is the label, black
# when the prediction matches ytest and red otherwise.
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    axi.imshow(Xtest[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    axi.set_ylabel(faces.target_names[yfit[i]].split()[-1],
                   color='black' if yfit[i] == ytest[i] else 'red')
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    """Grid-search ``clf`` over ``parameters`` and return the best estimator.

    The search uses ``n_folds`` as its ``cv`` setting and prints the best
    parameters before returning.
    """
    search = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    search.fit(Xtrain, ytrain)
    print("BEST PARAMS", search.best_params_)
    return search.best_estimator_
# gsearch4 = GridSearchCV(estimator = estimator, param_grid = param_test4,n_jobs=1,iid=False, cv=5)
# gsearch4.fit(df_train[predictors],df_train[targetname])
# print(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)

# subsample = 0.8
# Grid search over the subsample fraction with the other settings fixed.
# NOTE(review): the last candidate 0.1 breaks the 0.8..0.95 progression;
# possibly 1.0 was meant - confirm.
param_test5 = {'subsample': [0.8, 0.85, 0.9, 0.95, 0.1]}
estimator = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=5,
    min_samples_split=1,
    min_samples_leaf=50,
    max_features=4,
    random_state=10,
)
gsearch5 = GridSearchCV(
    estimator=estimator,
    param_grid=param_test5,
    n_jobs=1,
    iid=False,
    cv=5,
)
gsearch5.fit(df_train[predictors], df_train[targetname])
print(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)

# Proportionally lower the learning rate and raise the number of trees -
# it looks like the original baseline setting is the best.
##gbm_tuned_0 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,max_depth=5, min_samples_split=1,min_samples_leaf=50, subsample=0.8, random_state=10, max_features=4)
##modelfit(gbm_tuned_0, df_train, predictors, targetname, performCV=True, printFeatureImportance=True, cv_folds=5)
##gbm_tuned_1 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=100,max_depth=5, min_samples_split=1,min_samples_leaf=50, subsample=0.8, random_state=10, max_features=4)
##modelfit(gbm_tuned_1, df_train, predictors, targetname, performCV=True, printFeatureImportance=True, cv_folds=5)
##gbm_tuned_2 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=25,max_depth=5, min_samples_split=1,min_samples_leaf=50, subsample=0.8, random_state=10, max_features=4)
##modelfit(gbm_tuned_2, df_train, predictors, targetname, performCV=True, printFeatureImportance=True, cv_folds=5)
# Project the training and test data with ``pca`` and time the step.
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))


###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# Load the iris data set and look at it.
iris = pd.DataFrame(sb.load_dataset('iris'))

sb.pairplot(iris)
plt.show()

sb.jointplot(x='sepal_width', y='sepal_length', data=iris, kind='kde',
             color='red')
plt.show()

# Every column except the species is a feature; the species is the target.
y = iris['species']
X = iris.drop('species', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

# Baseline: SVC with default parameters.
plot1 = SVC()
plot1.fit(X_train, y_train)
pred1 = plot1.predict(X_test)
print(classification_report(y_test, pred1))

# Grid search over C and gamma, evaluated on the same test split.
dic = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1000, 100, 10, 1, 0.1],
}
plot2 = GridSearchCV(SVC(), dic, verbose=3)
plot2.fit(X_train, y_train)
pred2 = plot2.predict(X_test)
print(classification_report(y_test, pred2))
'kernel': ['poly'], 'degree': [3]
}, {
    'C': [1, 10, 100, 1000],
    'gamma': [0.001, 0.0001],
    'kernel': ['poly'],
    'degree': [4]
}]

scores = ['precision']  # you can alter this by adding, for example, `recall'

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # NOTE(review): ``score`` is only used in the message above; it is not
    # passed to GridSearchCV as ``scoring`` - confirm whether that is
    # intended.
    clf = GridSearchCV(SVC(C=1), tuned_parameters)
    clf.fit(x_train, t_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    # NOTE(review): this loop rebinds ``scores`` (the list iterated by the
    # outer loop) to the per-fold scores, and reports the spread as
    # ``std() / 2`` - confirm whether ``* 2`` was meant.
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() / 2, params))
    print()

# Elapsed time since ``start_time`` (set outside this fragment).
print(time.time() - start_time)
# Toggle: run the XGBoost grid search, or fall back to fixed parameters.
run_gs = False

if not run_gs:
    parameters = {
        'n_estimators': 100,
        'max_depth': 3,
        'learning_rate': 0.1,
        'subsample': 0.6,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 0.5,
    }
else:
    parameter_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': range(1, 9, 2),
        'learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'subsample': [0.5, 0.6, 0.7, 0.8, 1.0],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 1.0],
        'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 1.0],
    }

    xgb = XGBClassifier()

    # 5-fold stratified cross-validation over the targets.
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(xgb,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation)
    grid_search.fit(train, targets)

    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

# Hold out a third of the data for testing.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    train, targets, test_size=test_size, random_state=seed)
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# ## The Data
# [Iris flower data set](http://en.wikipedia.org/wiki/Iris_flower_data_set).
# The data set consists of 50 samples from each of three species of Iris
# (Iris setosa, Iris virginica and Iris versicolor), so 150 total samples.
# Four features were measured from each sample: the length and the width of
# the sepals and petals, in centimeters.
iris = sns.load_dataset('iris')

# ## Model Selection
y = iris['species']
X = iris.drop('species', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

# Baseline SVC with default hyper-parameters.
svc_model = SVC()
svc_model.fit(X_train, y_train)

# ## Model Evaluation
pred = svc_model.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Using grid search to look for better SVM parameters
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
}
grid = GridSearchCV(SVC(), param_grid, verbose=2)
grid.fit(X_train, y_train)

# Predictions come from the search object after fitting.
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))
test_classifier(LogisticRegression(), imputed_data, features_list, folds=1000) #Note that for decision trees and random forests, the results are always different #This is because different partitions of decision boundaries are used #answer question from Understanding the dataset and question #Try gridesearchcv and answer why parameter tuning is important #addresses why validation is important, talk about precision and recall from sklearn.svm import SVC #Tuning SVM print "\n Support vector machines" parameters = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]} sv = SVC(kernel='rbf') #cv = StratifiedShuffleSplit(test_size = 0.2, random_state = 42) clf_svc = GridSearchCV(sv, parameters, scoring="f1") fit_classify_scaled(clf_svc, list_all_features) #clf_svc.fit(features_train, labels_train) sv_params = clf_svc.best_params_ print sv_params #Going through the StratifiedShuffleSplit and then GridSearchCV did not give better results #clf2 = GridSearchCV(sv, parameters, scoring = "f1") #test_classifier(clf2, imputed_data, features_list, folds = 1000) #Tuning Decision trees from sklearn.tree import DecisionTreeClassifier print "\n Tuning Decision Trees" parameters = { 'max_depth': [None, 10, 5, 2], 'min_samples_split': [2, 10, 5],
clf = clf.fit(features_train, labels_train) from tester import test_classifier test_classifier(clf, data_dict, features_list) # ### Tuning by GridSearchCV 函数 # In[24]: from sklearn.grid_search import GridSearchCV t0 = time() param_grid = { 'min_samples_split': [2, 3, 4, 5, 6, 7, 8], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8], 'max_features': range(3, 7) } clf = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='f1') clf = clf.fit(features_train, labels_train) print "done in %0.3fs" % (time() - t0) print "Best estimator found by grid search:" clf = clf.best_estimator_ from tester import test_classifier test_classifier(clf, data_dict, features_list) # ### 使用GridSearchCV进行参数调整,通过多次运行之后,确定的参数为 class_weight=None, criterion='gini', max_depth=7,max_features=3, max_leaf_nodes=None, min_samples_leaf=1,min_samples_split=2, min_weight_fraction_leaf=0.0,presort=False, random_state=None, splitter='best' 将最后的DT算法中的参数调整为GridSearchCV所得到的结果 # # In[25]: clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
batch_size = [j for j in range(2,30,2)] epochs = [i for i in range(1,10,1)] param_grid = dict(batch_size=batch_size, nb_epoch=epochs)''' neurons = [i for i in range(1, 151, 10)] param_grid = dict(neurons=neurons) ''' optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam'] param_grid = dict(optimizer=optimizer)''' ''' init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform'] param_grid = dict(init_mode=init_mode)''' '''activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'] param_grid = dict(activation=activation)''' '''weight_constraint = [1, 2, 3, 4, 5] dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)''' grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, scoring='accuracy', cv=10) grid_result = grid.fit(data, mark) # summarize results print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) for params, mean_score, scores in grid_result.grid_scores_: print("%f (%f) with: %r" % (scores.mean(), scores.std(), params)) ''' model.fit(x_train, y_train, epochs=50, batch_size=10,verbose=0) loss_and_metrics = model.evaluate(x_test, y_test,verbose=0) print ('/n','----------',loss_and_metrics[1])'''
classesNameCol = 'ClassStr'
ratutils.populateClumpsWithClassTraining(outputClumps, classesDict, tmpPath,
                                         classesIntCol, classesNameCol)

# balance the training data; the balanced labels are written to a new column,
# which becomes the class column used from here on
rsgislib.classification.classratutils.balanceSampleTrainingRandom(
    outputClumps, classesIntCol, 'classesIntColBal', 50, 5000)
classesIntCol = 'classesIntColBal'

##############################################################################################
# use grid search to define the classifier
variables = [
    'VVMin', 'VHMin', 'VVdivVHMin',
    'VVMax', 'VHMax', 'VVdivVHMax',
    'VVAvg', 'VHAvg', 'VVdivVHAvg',
    'VVStd', 'VHStd', 'VVdivVHStd',
]
classParameters = {
    'n_estimators': [10, 100, 500],
    'max_features': [2, 3, 4],
}
gsearch = GridSearchCV(ExtraTreesClassifier(bootstrap=True), classParameters)
classifier = classratutils.findClassifierParameters(outputClumps,
                                                    classesIntCol,
                                                    variables,
                                                    preProcessor=None,
                                                    gridSearch=gsearch)

# define the output colours (three integers per class name)
classColours = {
    'Other': [212, 125, 83],
    'Water': [157, 212, 255],
    'VegWater': [191, 255, 0],
    'Unclassified': [0, 0, 0],
}

##############################################################################################
# run the classification
# NOTE(review): these two calls may be the tail of a plotting helper defined
# above this fragment -- confirm their indentation against the full file.
plt.ylabel('Feature Importance')
plt.show()

# Baseline gradient-boosting model evaluated by the project helper.
clf = GradientBoostingClassifier(random_state=15)
print(hackathon_GBC_model(clf, train_data, features))

# Stage 1: search n_estimators (10, 20, ..., 130) with the other settings pinned.
estimators = list(range(10, 131, 10))
first_tune = {'n_estimators': estimators}
first_search = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.05,
        min_samples_split=700,
        min_samples_leaf=70,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=15,
    ),
    param_grid=first_tune,
    scoring='roc_auc',
    n_jobs=6,
    iid=False,
    cv=5)
first_search.fit(train_data[features], train_data["Class"])
print(first_search.grid_scores_, first_search.best_params_,
      first_search.best_score_)

# Stage 2 grid: tree depth 5..14 against min_samples_split 300..1100.
min_split = list(range(300, 1101, 100))
depth = list(range(5, 15, 1))
second_tune = {'max_depth': depth, 'min_samples_split': min_split}
def svmModel(filenameL, filenameU, output):
    """Train a linear SVM on labelled tweets and save the predicted positives.

    Parameters
    ----------
    filenameL: path to the labelled tweets, one JSON array per line. Element
        0 is used as the class label and element 1 as the tweet text.
    filenameU: path to the unlabelled tweets, one per line. Each line is
        tokenised as raw text for prediction and parsed with ``json.loads``
        when it is written out.
    output: path of the file to which every tweet predicted as class 1 is
        appended, as one JSON array ``[label, tweet]`` per line.

    Returns
    -------
    None. Results are printed and appended to ``output``.
    """

    def featurize(text, vocab):
        # Bag-of-words counts over the vocabulary. Text is lower-cased here
        # because every vocabulary key is lower-case.
        counts = [0] * len(vocab)
        for term in text.lower().split():
            if len(term) > 2 and term in vocab:
                counts[vocab[term]] += 1
        return counts

    tweets = []
    with open(filenameL, "r") as labeled_file:
        for line in labeled_file:
            tweet = json.loads(line)
            tweets.append([tweet[0], tweet[1].lower().strip()])

    # Extract the vocabulary of keywords
    vocab = dict()
    for class_label, text in tweets:
        for term in text.split():
            term = term.lower()
            if len(term) > 2 and term not in stopwords:
                vocab[term] = vocab.get(term, 0) + 1

    # Keep only terms that occur more than 15 times
    vocab = {term: freq for term, freq in vocab.items() if freq > 15}
    # Generate an id starting from 0 for each term in vocab
    vocab = {term: idx for idx, (term, freq) in enumerate(vocab.items())}
    print(vocab)

    # Generate X and y
    X = [featurize(text, vocab) for class_label, text in tweets]
    y = [class_label for class_label, text in tweets]

    # 10 folder cross validation to estimate the best w and b
    svc = svm.SVC(kernel='linear')
    Cs = range(1, 20)
    clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs), cv=10)
    clf.fit(X, y)

    # Predictions on the training tweets themselves, printed for inspection
    print(clf.predict(X))

    # predict the class labels of new tweets
    with open(filenameU) as unlabeled_file:
        tweets = unlabeled_file.readlines()

    # Generate X for testing tweets
    X = [featurize(text, vocab) for text in tweets]
    y = clf.predict(X)

    # write all positive tweets to the output file
    with open(output, "a") as f:
        for idx in range(0, len(tweets)):
            if y[idx] == 1:
                print('Sentiment Class (1 means positive; 0 means negative):  %s'
                      % y[idx])
                print('TEXT:  %s %s' % (idx, tweets[idx]))
                # int() because a numpy integer label is not JSON-serialisable.
                labeledTweet = [int(y[idx]), json.loads(tweets[idx])]
                # file.write() needs a string, so serialise the record.
                f.write(json.dumps(labeledTweet) + "\n")
# NOTE(review): `l2` and `val_err` are set before this fragment; this test may
# belong inside a loop over penalties -- confirm its indentation.
if best_err is None or best_err > val_err:
    best_l2 = l2
    best_err = val_err
print('best ' + str(best_l2) + ' and ' + str(best_err))

# Validation error against penalty, entries 2..11 only, on log-log axes.
plt.plot(l2_penalty[2:12], total_val[2:12], 'k-')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('$\L2_penalty$')
plt.ylabel('K-fold cross validation error')

# Same penalty candidates handed to RidgeCV.
model_python = RidgeCV(l2_penalty)
model_python.fit(data15[my_features], data15['price'])

''' rss=[] for alpha in alphas: model=Ridge(alpha) model.fit(training[my_features],training['price']) pred=model.predict(testing) val_err=sum((pred-training['price'])**2) rss.append(val_err) plt.plot(alphas,rss,'k-') plt.xlabel('$\L2_penalty$') plt.ylabel('K-fold cross validation error') plt.xscale('log') plt.yscale('log')'''

# ... and to a 10-fold grid search over Ridge's alpha.
tuned_parameters = {'alpha': l2_penalty}
model2 = GridSearchCV(Ridge(), tuned_parameters, cv=10)
model2.fit(data15[my_features], data15['price'])
def my_svm():
    """Train a linear SVM on ``data.txt`` and split ``tJumanji.txt`` by prediction.

    The first 500 lines of ``data.txt`` are read as ``label,text`` records.
    A bag-of-words vocabulary is built from terms longer than two characters
    that are not stop words and occur more than 10 times. ``C`` is chosen
    from 1..19 by 10-fold grid search. Every line of ``tJumanji.txt`` is then
    classified and written to ``p_Jumanji.txt`` when predicted as 1, or to
    ``n_Jumanji.txt`` otherwise.

    Returns
    -------
    None. Diagnostics are printed and the two output files are overwritten.
    """

    def featurize(text, vocab):
        # Bag-of-words counts over the vocabulary. Text is lower-cased here
        # because every vocabulary key is lower-case.
        counts = [0] * len(vocab)
        for term in text.lower().split():
            if term in vocab:
                counts[vocab[term]] += 1
        return counts

    with open('data.txt') as data_file:
        lines = data_file.readlines()[:500]

    tweets = []
    for line in lines:
        # NOTE(review): only the field after the first comma is used as text;
        # confirm tweets in data.txt never contain commas themselves.
        items = line.split(',')
        tweets.append([int(items[0]), items[1].lower().strip()])

    # Extract the vocabulary of keywords
    vocab = dict()
    for class_label, text in tweets:
        for term in text.split():
            term = term.lower()
            if len(term) > 2 and term not in stopwords:
                vocab[term] = vocab.get(term, 0) + 1

    # Keep only terms that occur more than 10 times
    vocab = {term: freq for term, freq in vocab.items() if freq > 10}
    # Generate an id (starting from 0) for each term in vocab
    vocab = {term: idx for idx, (term, freq) in enumerate(vocab.items())}
    print("******Features*******")
    print(vocab)

    # Generate X and y
    X = [featurize(text, vocab) for class_label, text in tweets]
    y = [class_label for class_label, text in tweets]

    print("The total number of training tweets: {} ({} positives, {}: negatives)".format(
        len(y), sum(y), len(y) - sum(y)))

    # 10 folder cross validation to estimate the best w and b
    svc = svm.SVC(kernel='linear')
    Cs = range(1, 20)
    clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs), cv=10)
    clf.fit(X, y)

    print("The estimated w: ")
    print(clf.best_estimator_.coef_)

    print("The estimated b: ")
    print(clf.best_estimator_.intercept_)

    print("The estimated C after the grid search for 10 fold cross validation: ")
    print(clf.best_params_)

    print("Accuracy ")
    print(str(clf.best_score_))

    # predict the class labels of new tweets
    with open('tJumanji.txt') as test_file:
        tweets = test_file.readlines()

    # Generate X for testing tweets
    test_X = [featurize(text, vocab) for text in tweets]
    test_y = clf.predict(test_X)

    # `with` guarantees both result files are flushed and closed.
    with open("p_Jumanji.txt", "w") as fl, open("n_Jumanji.txt", "w") as fl_n:
        for text, label in zip(tweets, test_y):
            if label == 1:
                fl.write(text)
            else:
                fl_n.write(text)

    print("The total number of testing tweets: {} ({} are predicted as positives, {} are predicted as negatives)".format(
        len(test_y), sum(test_y), len(test_y) - sum(test_y)))
y = y.flatten()
# x.shape=(150,4) y.shape=(150,)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=1,
                                                    train_size=0.6)

# Single-step pipeline; the `clf__` prefix below addresses this step.
pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100))])
parameters = {
    'clf__gamma': (0.01, 0.03, 0.1, 0.3, 1, 1.3),
    'clf__C': (0.1, 0.3, 1, 3, 10, 30, 40),
}
grid_search = GridSearchCV(pipeline,
                           parameters,
                           n_jobs=-1,
                           verbose=1,
                           scoring='accuracy',
                           refit=True)
grid_search.fit(x_train, y_train)

# Best cross-validated score ("最佳效果" = best result).
print('最佳效果:%0.3f' % grid_search.best_score_)
# Best parameter set ("最优参数集" = optimal parameter set), one per line.
print('最优参数集:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# Held-out evaluation with the search object.
predictions = grid_search.predict(x_test)
print("-------------------")
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
n_train = len(y_train)
n_test = len(y_sts_val)
param_grid = LARGE_PARAM_GRID


def score_stub(true, pred):
    # Correlation after post-processing the predictions.
    # NOTE(review): not referenced below -- the search scores with
    # make_scorer(correlation) directly.
    return correlation(true, postprocess(test_input_val, pred))


# grid cv on test held: one predefined split, where the first n_train rows
# train and the next n_test rows validate
train_rows = range(n_train)
val_rows = range(n_train, n_train + n_test)
cv_split = [(train_rows, val_rows)]
grid = GridSearchCV(SVR(),
                    param_grid,
                    cv=cv_split,
                    verbose=1,
                    n_jobs=N_JOBS,
                    scoring=make_scorer(correlation))
# Training and validation rows are stacked in the order assumed by cv_split.
grid.fit(vstack([X_train, X_sts13_val]), hstack([y_train, y_sts_val]))

held_cv_score = grid.best_score_
held_cv_params = grid.best_params_
print(held_cv_score)
print(held_cv_params)

# Refit on the training rows only with the selected parameters.
regressor = SVR(**held_cv_params)
regressor.fit(X_train, y_train)
y_test = regressor.predict(X_sts13_held)
# y_test = postprocess(test_input_held, y_test)
# NOTE(review): `predict` and `score` take `self` and use `self.treshold_` and
# `self._meaning`, so they are presumably methods of MeanClassifier, whose
# class header lies outside this snippet -- re-indent under it if so.
def predict(self, X, y=None):
    """Return ``self._meaning(x)`` for every element of X.

    Raises RuntimeError when the ``treshold_`` attribute is missing
    (presumably set during fitting -- not visible here).
    """
    try:
        getattr(self, "treshold_")
    except AttributeError:
        raise RuntimeError(
            "You must train classifer before predicting data!")

    return ([self._meaning(x) for x in X])


def score(self, X, y=None):
    """Return the sum of the predictions for X; ``y`` is ignored."""
    # counts number of values bigger than mean
    return (sum(self.predict(X)))


from sklearn.grid_search import GridSearchCV
from sklearn.utils.estimator_checks import check_estimator

check_estimator(MeanClassifier)  # passes

trainJZ = list(range(0, 150, 5))
testJZ = [i + 3 for i in range(-5, 5, 5)]

tuned_params = {"intValue": [-10, -1, 0, 1, 10]}

gs = GridSearchCV(MeanClassifier(), tuned_params)

# The search needs a y with one entry per training sample even though the
# estimator ignores it. It was previously hard-coded to 20 entries while
# trainJZ holds 30, so it is now sized from the data.
y = [1] * len(trainJZ)
gs.fit(trainJZ, y)

print(gs.best_params_)  # {'intValue': -10} # and that is what we expect :)
def run():
    """Tune and save the author-similarity classifier.

    Reads tab-separated ``idA, idB, label`` triples from
    ``learning/dataset/similarity_training_ids``, loads cached features from
    ``learning/dataset/author-features`` (or recomputes and rewrites them
    when ``recompute`` is switched on), grid-searches
    ``positiveSampleWeight`` with 5-fold CV, prints the pairs labelled 0 but
    predicted 1, prints the confusion matrix on the training data and saves
    the best classifier to ``models/similarity.pkl``.
    """
    # Read dataset
    author_ids = []
    labels = []
    with open('learning/dataset/similarity_training_ids', 'r') as ids_file:
        for line in ids_file:
            # A real list, not map(): the values are indexed below, which a
            # Python 3 map object does not support.
            vals = [int(x) for x in line.strip().split('\t')]
            author_ids.append((vals[0], vals[1]))
            labels.append(vals[2])

    all_papers_model = WordCount()
    all_papers_model.load('models/everything.pkl')
    contributors_model = WordCount()
    contributors_model.load('models/contributors.pkl')

    sc = SimilarityClassifier(languageModel=all_papers_model,
                              contributorsModel=contributors_model)

    # Memoise per-author data so each author is fetched at most once.
    author_data_cache = dict()

    def getAuthorData(sc, author_id):
        try:
            return author_data_cache[author_id]
        except KeyError:
            x = sc.getDataById(author_id)
            author_data_cache[author_id] = x
            return x

    # Toggle: recompute features from scratch instead of reading the cache.
    recompute = False
    if recompute:
        print("Computing features")
        features = []
        for (idA, idB) in author_ids:
            f = sc.computeFeatures(getAuthorData(sc, idA),
                                   getAuthorData(sc, idB))
            features.append(f)
        print("Writing features back")
        with open('learning/dataset/author-features', 'w') as outf:
            for i in range(len(labels)):
                print('\t'.join(str(x) for x in features[i]), file=outf)
    else:
        with open('learning/dataset/author-features', 'r') as inf:
            # Materialise every row as a list so rows can be indexed and
            # re-read; map objects are single-use on Python 3.
            features = [[float(x) for x in line.strip().split('\t')]
                        for line in inf]

    # Toggle kept from the original: the grid search is always run.
    if True:

        def custom_scorer(estimator, X, y):
            # Score is the negated [1][0] cell of the estimator's confusion
            # matrix, so the search minimises that count.
            conf = estimator.confusion(X, y)
            print(conf)
            return -conf[1][0]

        param_grid = [{
            'positiveSampleWeight': [0.001, 0.005, 0.01, 0.02, 0.05]
        }]
        cv = GridSearchCV(sc,
                          param_grid,
                          cv=5,
                          scoring=custom_scorer,
                          iid=False)
        cv.fit(features, labels)
        print("Grid scores:")
        print(cv.grid_scores_)
        sc = cv.best_estimator_

    def paper_url(pk):
        print('http://beta.ens.dissem.in/paper/' +
              str(Author.objects.get(pk=pk).paper_id))

    print("Curious papers")
    pubSc = sc.simFeatures[1]
    for ids, label, feature_row in zip(author_ids, labels, features):
        prediction = sc.classifier.predict(feature_row)[0]
        # Pairs labelled 0 that the tuned classifier predicts as 1.
        if label == 0 and prediction == 1:
            print("#####")
            paper_url(ids[0])
            paper_url(ids[1])
            #print("Explanation")
            #pubSc.compute(Author.objects.get(pk=author_ids[i][0]),
            #        Author.objects.get(pk=author_ids[i][1]), explain=True)

    print(sc.confusion(features, labels))
    #sc.plotClassification(features, labels)
    sc.save('models/similarity.pkl')
def run_k_nearest_neighbors(training_features,
                            training_labels,
                            test_features,
                            test_labels,
                            passed_parameters=None):
    """
    Tune, fit and evaluate sklearn's k-nearest-neighbours classifier.

    Validation curves are saved for every numeric hyper-parameter, the grid
    is searched with 5 shuffle splits, and a learning curve is saved for an
    estimator rebuilt from the best settings. Timing, a classification
    report and a confusion matrix are printed.

    Parameters
    ----------
    training_features: feature matrix used for tuning and fitting
    training_labels: labels for ``training_features``
    test_features: feature matrix used for the final evaluation
    test_labels: labels for ``test_features``
    passed_parameters: optional parameter grid; when None a default grid
                       over ``n_neighbors``, ``weights`` and ``p`` is used

    Returns
    -------
    prediction: predicted labels of the test data
    accuracy: score of the tuned classifier on the test data
    """
    started = time.time()
    base_knn = neighbors.KNeighborsClassifier()

    #search space: the caller's grid if given, otherwise the built-in one
    parameters = passed_parameters
    if parameters is None:
        parameters = {
            'n_neighbors': range(1, 11),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }

    #cross validation iterator shared by every curve and by the search
    splitter = ShuffleSplit(training_features.shape[0],
                            n_iter=5,
                            test_size=0.2,
                            random_state=0)

    #one validation curve per hyper-parameter whose first candidate is numeric
    for name in parameters:
        candidates = parameters[name]
        if not is_number(candidates[0]):
            continue
        plot_validation_curve(base_knn, training_features, training_labels,
                              'Validation Curves \n(kNN)', name, candidates,
                              splitter)
        pylab.savefig(
            os.path.join(results_location,
                         "Validation Curves - kNN - %s.png" % name))

    #tune, then evaluate on the held-out data
    search = GridSearchCV(estimator=base_knn,
                          cv=splitter,
                          param_grid=parameters)
    search.fit(training_features, training_labels)
    test_prediction = search.predict(test_features)
    test_accuracy = search.score(test_features, test_labels)

    tuned = time.time()

    #rebuild an unfitted estimator carrying the winning settings
    best = search.best_estimator_
    tuned_knn = neighbors.KNeighborsClassifier(n_neighbors=best.n_neighbors,
                                               weights=best.weights,
                                               algorithm=best.algorithm,
                                               leaf_size=best.leaf_size,
                                               p=best.p,
                                               metric=best.metric)

    #plot the learning curve
    curve_title = 'Learning Curves \n(k-NN, k-neighbors=%i weights=%s algorithm=%s leaf size=%i p=%i )' % (
        best.n_neighbors, best.weights, best.algorithm, best.leaf_size,
        best.p)
    plot_learning_curve(tuned_knn,
                        curve_title,
                        training_features,
                        training_labels,
                        cv=splitter)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - kNN.png'))
    #plt.show()

    finished = time.time()

    #started -> tuned is optimization time, tuned -> finished is one case
    print("kNN Time Stats")
    print("Optimization Time -> %f" % (tuned - started))
    print("Single Run Time -> %f" % (finished - tuned))

    #classification report, then confusion matrix, under matching banners
    rule = '----------------------------'
    for heading, report in (('Classification Report', classification_report),
                            ('Confusion Matrix', confusion_matrix)):
        print('\n\n' + rule)
        print(heading)
        print(rule + '\n')
        print(report(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
def initialize(filename,
               labels_train,
               typetoread,
               toexclude=None,
               n_estimators=None,
               estimators_to_test=None,
               class_weight=None):
    """
    Takes in features and labels pertaining to a tag and fits and returns a
    TfidfVectorizer, a SelectKBest selector, and a RandomForestClassifier
    (wrapped in GridSearchCV when ``n_estimators`` is not given).

    :param filename: The base file location where information about the
                     dataset can be found. The pickled features are read from
                     ``../DataFiles/<filename>features<typetoread>``.
    :param labels_train: The labels to use when classifying.
    :param typetoread: The features list to use ("Use" or "Description")
    :param toexclude: A list of indices of the features list to exclude from
                      classification. Useful to exclude values known to be
                      positive or negative without classifier use. If not
                      given, assumes all features are valid.
    :param n_estimators: The number of trees to use in the Random Forest
                         Classifier as per the sklearn documentation. If not
                         given, GridSearchCV selects the value.
    :param estimators_to_test: A list of different numbers of estimators to
                               test using GridSearchCV as per the sklearn
                               documentation. If not given, GridSearchCV will
                               select between 50, 150, and 250.
    :param class_weight: The weightings to use for the various classes as per
                         the sklearn documentation. If not given, all classes
                         have equal weight.
    :return forest: A fitted RandomForestClassifier, or a fitted GridSearchCV
                    over one when ``n_estimators`` is not given.
    :return vectorizer: A fitted TfidfVectorizer.
    :return selector: A fitted SelectKBest keeping at most 18000 features.
    """
    features_path = os.path.abspath("../DataFiles/" + filename + "features" +
                                    typetoread)
    # NOTE: pickle executes code from the file; only load trusted data files.
    with open(features_path, "rb") as features_file:
        features_train = pickle.load(features_file)
    labels_train = pd.Series(labels_train)
    if toexclude:
        # NOTE(review): labels_train is not filtered here -- presumably the
        # caller already passes labels without the excluded rows; verify.
        features_train = pd.Series(
            np.delete(np.array(features_train), toexclude, axis=0))

    print("Creating Vectorizer")
    vectorizer = TfidfVectorizer(stop_words="english",
                                 max_df=.5,
                                 ngram_range=(1, 3))
    print("Fitting Vectorizer")
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_train = None  # drop the reference to the raw documents

    print("Creating Selector")
    # SelectKBest rejects k larger than the number of features, so clamp the
    # 18000 target for small vocabularies.
    n_features = features_train_transformed.shape[1]
    selector = SelectKBest(k=min(18000, n_features))
    print("Fitting Selector")
    selector.fit(features_train_transformed, labels_train)
    print("Transforming data")
    features_train_transformed_selected = selector.transform(
        features_train_transformed)
    features_train_transformed = None  # drop the unselected matrix
    features_train_transformed_selected = (
        features_train_transformed_selected.toarray())

    print("Creating Forest")
    if not n_estimators:
        forest = RandomForestClassifier(min_samples_leaf=2,
                                        class_weight=class_weight)
        if not estimators_to_test:
            parameters = {
                "n_estimators": [50, 150, 250],
            }
        else:
            parameters = {
                "n_estimators": estimators_to_test,
            }
        forest = GridSearchCV(forest, parameters)
    else:
        forest = RandomForestClassifier(n_estimators=n_estimators,
                                        min_samples_leaf=2,
                                        class_weight=class_weight)
    print("Fitting Forest")
    forest.fit(features_train_transformed_selected, labels_train)
    return forest, vectorizer, selector
def run_support_vector_machines(training_features,
                                training_labels,
                                test_features,
                                test_labels,
                                passed_parameters=None):
    """
    Classifies the data using sklearn's support vector machine classifier

    Validation curves are saved for every numeric hyper-parameter, the grid
    is searched with 5 shuffle splits, and a learning curve is saved for an
    SVC rebuilt from the best settings.

    Parameters
    ----------
    training_features: feature matrix used for tuning and fitting
    training_labels: labels for ``training_features``
    test_features: feature matrix used for the final evaluation
    test_labels: labels for ``test_features``
    passed_parameters: (optional) parameter grid for the search; when None
                       only ``C`` is searched

    Returns
    -------
    prediction: predicted labels of the test data
    accuracy: score of the tuned classifier on the test data
    """
    time_1 = time.time()

    estimator = svm.SVC()

    #set up parameters that will be used by all kernels
    if passed_parameters is None:
        parameters = {'C': [1e0, 5e0, 1e1, 5e1]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if is_number(parameters[param][0]):
            title = 'Validation Curves'
            save_name = "Validation Curves - SVC - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    best = classifier.best_estimator_
    estimator = svm.SVC(kernel=best.kernel,
                        C=best.C,
                        gamma=best.gamma,
                        degree=best.degree)

    #plot the learning curve
    #gamma can be a string default and C can be fractional, so they are
    #formatted with %s / %g rather than %f / %i
    title = 'Learning Curves (SVM, kernel=%s degree=%i gamma=%s C=%g )' % (
        best.kernel, best.degree, best.gamma, best.C)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    save_file_name = 'Learning Curves - SVM.png'
    pylab.savefig(os.path.join(results_location, save_file_name))
    #plt.show()

    time_3 = time.time()

    if best.kernel == 'linear':
        #coef_ exists only on the fitted best estimator, not on the unfitted
        #template held in classifier.estimator
        coefficients = best.coef_
        print('\n\n-----------------------')
        print(' Coefficients')
        print(coefficients)

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("SVM Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
def run_boosting(training_features,
                 training_labels,
                 test_features,
                 test_labels,
                 passed_parameters=None):
    """
    Classifies the data using sklearn's AdaBoost over a decision tree.
    The tree does not natively support pruning, so max_depth is tuned instead.

    Validation curves are always drawn over a fixed, wider grid; the grid
    search itself uses ``passed_parameters`` or a smaller default grid.

    Parameters
    ----------
    training_features: feature matrix used for tuning and fitting
    training_labels: labels for ``training_features``
    test_features: feature matrix used for the final evaluation
    test_labels: labels for ``test_features``
    passed_parameters: (optional) parameter grid for the search

    Returns
    -------
    prediction: predicted labels of the test data
    accuracy: score of the tuned classifier on the test data
    """
    time_1 = time.time()

    #set up underlying decision tree classifier
    base_classifier = tree.DecisionTreeClassifier()

    #set up the boosting method
    estimator = ensemble.AdaBoostClassifier(base_estimator=base_classifier)

    #parameter ranges used only for the validation curves
    parameters = {
        'base_estimator__max_depth': range(1, 5),
        'n_estimators': range(10, 500, 50),
        'learning_rate': [.25, .5, .75, 1.0]
    }

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if is_number(parameters[param][0]):
            title = 'Validation Curves \n(AdaBoost)'
            save_name = "Validation Curves - AdaBoost - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #parameter grid used for the actual search
    if passed_parameters is None:
        parameters = {
            'base_estimator__max_depth': range(1, 3),
            'n_estimators': range(5, 51, 5),
            'learning_rate': [1.0]
        }
    else:
        parameters = passed_parameters

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    #get the prediction and accuracy of the test set
    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #graph the best result
    best = classifier.best_estimator_
    best_depth = best.base_estimator_.max_depth
    base_classifier = tree.DecisionTreeClassifier(max_depth=best_depth)
    estimator = ensemble.AdaBoostClassifier(base_estimator=base_classifier,
                                            n_estimators=best.n_estimators,
                                            learning_rate=best.learning_rate)

    #plot the learning curve
    #max_depth uses %s because a caller-supplied grid may leave it at None;
    #the stray trailing '$' from the old title is dropped
    title = 'Learning Curves (AdaBoost - Decision Tree)\n max_depth=%s estimators=%i learning_rate=%f' % (
        best_depth, best.n_estimators, best.learning_rate)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location,
                     'Learning Curves - AdaBoost - Decision Tree.png'))

    time_3 = time.time()

    #fit the best estimator
    estimator.fit(training_features, training_labels)

    #plot the learning curve by number of estimators
    plot_adaclassifier(estimator, best.n_estimators, training_features,
                       test_features, training_labels, test_labels)
    pylab.savefig(
        os.path.join(results_location,
                     'Estimator Curves - AdaBoost - Decision Tree.png'))

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("AdaBoost Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)  # split the dataset

# Standardise the features, then classify with an SVC.
pipe_scv = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Pipeline step parameters are addressed as '<step>__<param>' with a double
# underscore; the single-underscore keys used before are rejected as invalid
# parameters.
param_grid = [{
    'clf__C': param_range,
    'clf__kernel': ['linear']
}, {
    'clf__C': param_range,
    'clf__gamma': param_range,
    'clf__kernel': ['rbf']
}]
gs = GridSearchCV(estimator=pipe_scv,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

# Take the best model and evaluate it on the held-out data.
clf = gs.best_estimator_
clf.fit(X_train, y_train)
y_pre = clf.score(X_test, y_test)  # a score on the test data, not predictions
print(y_pre)
def run_decision_tree(training_features,
                      training_labels,
                      test_features,
                      test_labels,
                      passed_parameters=None,
                      headings=None):
    """
    Classifies the data using sklearn's decision tree.
    The tree does not natively support pruning, so max_depth is tuned instead.

    Validation curves are saved for every numeric hyper-parameter, the grid
    is searched with 5 shuffle splits, a learning curve is saved, and the top
    5 levels of the refitted tree are exported to a PDF.

    Parameters
    ----------
    training_features: feature matrix used for tuning and fitting
    training_labels: labels for ``training_features``
    test_features: feature matrix used for the final evaluation
    test_labels: labels for ``test_features``
    passed_parameters: (optional) parameter grid for the search; when None
                       only an unlimited-depth tree is evaluated
    headings: (optional) feature names used in the exported tree diagram

    Returns
    -------
    prediction: predicted labels of the test data
    accuracy: score of the tuned classifier on the test data
    """
    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()

    #set up parameters for the classifier
    if passed_parameters is None:
        #grid values must be sequences; a bare None broke both the indexing
        #below and the grid search itself
        parameters = {'max_depth': [None]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        first_value = parameters[param][0]
        #None (e.g. unlimited depth) is not plottable, so skip it before
        #asking is_number
        if first_value is not None and is_number(first_value):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    best = classifier.best_estimator_
    estimator = tree.DecisionTreeClassifier(max_depth=best.max_depth,
                                            criterion=best.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve
    #%s rather than %i because the best max_depth may be None
    title = 'Learning Curves \n(Decision Tree, max depth=%s)' % best.max_depth
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location, 'Learning Curves - Decision Tree.png'))
    #plt.show()

    #save the visualization of the decision tree only use the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator,
                         out_file=tree_data,
                         max_depth=5,
                         feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
# Split the '/'-separated column into two float columns.
# NOTE(review): `i`, `t_test`, `d_train` and `d_test` come from code above this
# fragment; the two columns are presumably month and year -- confirm.
t_train = df[i].str.split('/', expand=True, n=1).astype(float)

# New leading feature: column0 + (column1 - 1988) * 12 of the `t_*` frame
# minus the same quantity of the `d_*` frame, prepended to the matrix.
t_train_vals = t_train.values
d_train_vals = d_train.values
train_gap = (t_train_vals[:, 0] + (t_train_vals[:, 1] - 1988) * 12) - (
    d_train_vals[:, 0] + (d_train_vals[:, 1] - 1988) * 12)
X_train = np.c_[train_gap, X_train]

t_test_vals = t_test.values
d_test_vals = d_test.values
test_gap = (t_test_vals[:, 0] + (t_test_vals[:, 1] - 1988) * 12) - (
    d_test_vals[:, 0] + (d_test_vals[:, 1] - 1988) * 12)
X_test = np.c_[test_gap, X_test]

# Fill missing values; the imputer is fitted on the training matrix only.
imputer = Imputer()
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

clf = GradientBoostingClassifier()
# Single-point grid: the search only cross-validates this one configuration.
param_grid = {
    'n_estimators': [800],
    'max_depth': [8],
    'max_features': [0.3],
    'learning_rate': [0.1],
    'min_samples_split': [600],
    'min_samples_leaf': [40],
    'subsample': [1.],
    'random_state': [1],
}
grid = GridSearchCV(clf, param_grid=param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

print("Best score %f" % grid.best_score_)
#score = cross_validation.cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')

# Class probabilities for the test matrix, written out as text.
Y_predict = grid.predict_proba(X_test)
np.savetxt('y_predDaniel.txt', Y_predict, fmt='%s')
def run_neural_net(training_features, training_labels, test_features,
                   test_labels):
    """
    Classifies the data using an ``mlp_nn`` multi-layer perceptron whose
    number of training iterations is tuned by a cross-validated grid search.

    Besides returning its results, the function saves a curve plot to
    ``results_location`` and prints timing statistics, a classification
    report and a confusion matrix.

    Parameters
    ----------
    training_features: feature matrix used to fit the classifier; its
        ``shape[1]`` is taken as the number of network inputs
    training_labels: labels for ``training_features``; must support
        ``astype(int)``, and the largest label + 1 is taken as the number
        of output units
    test_features: feature matrix the tuned classifier is evaluated on
    test_labels: true labels for ``test_features``

    Returns
    -------
    prediction: predicted labels of the test data
    accuracy: value of the tuned classifier's ``score`` on the test data
    """

    time_1 = time.time()

    #set the number of classes in the data
    number_of_outputs = training_labels.astype(int).max() + 1
    number_of_inputs = training_features.shape[1]

    #determine optimal hidden nodes based on Huang et al. (2003)
    #the ratio is forced to float so that Python 2 integer division cannot
    #floor it before the square root is taken
    #NOTE(review): Huang (2003) appears to state this bound in terms of the
    #number of training samples; the feature count is used here -- confirm
    inputs_per_output = float(number_of_inputs) / (number_of_outputs + 2)
    first_layer_nodes = int(
        math.sqrt((number_of_outputs + 2) * number_of_inputs) +
        2 * math.sqrt(inputs_per_output))
    second_layer_nodes = int(number_of_outputs *
                             math.sqrt(inputs_per_output))

    #set up the layers
    input_layer = mlp_nn.Layer("Linear", units=number_of_inputs)
    hidden_layer1 = mlp_nn.Layer("Sigmoid", units=first_layer_nodes)
    hidden_layer2 = mlp_nn.Layer("Sigmoid", units=second_layer_nodes)
    output_layer = mlp_nn.Layer("Softmax", units=number_of_outputs)
    layers = [input_layer, hidden_layer1, hidden_layer2, output_layer]

    #set up the classifier
    neural_net = mlp_nn.Classifier(layers=layers,
                                   learning_rate=0.02,
                                   n_iter=5)

    #set up tuning parameters (only n_iter actually varies)
    parameters = {"learning_rate": [0.02], "n_iter": [1, 5, 10, 25, 50]}

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=neural_net,
                              cv=cv,
                              param_grid=parameters)

    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    graph_title = ("Learning Curves \n(Neural Net, learning rate=%f)" %
                   classifier.best_estimator_.learning_rate)
    plot_learning_curve_iter(classifier, graph_title)
    pylab.savefig(
        os.path.join(results_location, 'Validator Curves - Neural Net.png'))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 covers the grid search fit plus test prediction/scoring
    #time 2 -> time 3 covers drawing and saving the curve plot
    print("Neural Net Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
# Two ways forward from here:
#   1. collect more training data, to make up for the low model complexity
#   2. begin with a more complex (lower bias) model, to squeeze more out of
#      the data already available
params = dict(
    tfidf__use_idf=(True, False),
    bow__analyzer=(split_into_lemmas, split_into_tokens),
)

grid = GridSearchCV(
    estimator=pipeline,  # the pipeline built earlier
    param_grid=params,  # settings to tune via cross validation
    scoring='accuracy',  # the score being optimized
    cv=StratifiedKFold(label_train, n_folds=5),  # cross validation scheme
    n_jobs=-1,  # -1 means "use all cores" for parallelization
    refit=True,  # finally refit on all data with the best combination found
)

#% time
nb_detector = grid.fit(msg_train, label_train)
print(nb_detector.grid_scores_)

# Show class probabilities first, then the predicted label, for two sample
# messages.
for query in (nb_detector.predict_proba, nb_detector.predict):
    for message in ("Hi mom, how are you?", "WINNER! Credit for free!"):
        print(query([message])[0])
# Build one design matrix from the two sample sets; rows coming from
# data_cls are labelled 0 and rows coming from data_pln are labelled 1.
data_cls = np.asarray(data_cls)
data_pln = np.asarray(data_pln)
X = np.vstack((data_cls, data_pln))
y = np.hstack((np.zeros(len(data_cls)), np.ones(len(data_pln))))

# Shuffled 6-fold stratified splitter, shared by the grid search and the
# scoring below (no random_state is passed).
cv = StratifiedKFold(y, n_folds=6, shuffle=True)

# Candidate AdaBoost settings: learning rates 0.1 .. 1.0 in steps of 0.1 and
# odd estimator counts 1 .. 79.
cv_params = dict(
    learning_rate=np.arange(0.1, 1.1, 0.1),
    n_estimators=np.arange(1, 80, 2),
)

grid = GridSearchCV(estimator=AdaBoostClassifier(),
                    param_grid=cv_params,
                    scoring='accuracy',
                    cv=cv,
                    n_jobs=1,
                    verbose=1)
grid.fit(X, y)
ada_cv = grid.best_estimator_

# Cross-validate the selected model and store its per-fold scores in row k.
scores = cross_val_score(ada_cv, X, y, cv=cv)
scores_all[k, :] = scores

# save the classifier
model_path = source_folder + "graph_data/sk_models/eigen_ada_pln_%s.plk" % band
joblib.dump(ada_cv, model_path)

# save the accumulated score table
scores_path = source_folder + "graph_data/eigen_scores_all_ada_pln.npy"
np.save(scores_path, scores_all)