def nearest_neighbors(self):
    neighbors_array = [11, 31, 201, 401, 601]
    tuned_parameters = {"n_neighbors": neighbors_array}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, tuned_parameters, cv=5, n_jobs=5, scoring="f1")
    clf.fit(self.train_data_x, self.train_labels_y)
    self.models.append(clf)

def tuner(clf, parameters, data):
    from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

    labels, features = targetFeatureSplit(data)
    scaler = MinMaxScaler()
    select = SelectKBest()
    steps = [("scale", scaler), ("select", select), ("classifier", clf)]
    pipeline = Pipeline(steps)
    shuffle = StratifiedShuffleSplit(n_splits=1000, test_size=0.3,
                                     random_state=42)
    my_scorer = make_scorer(my_score_func)
    grid_searcher = GridSearchCV(pipeline, param_grid=parameters, cv=shuffle,
                                 scoring=my_scorer)
    # The pipeline already scales and selects features inside each CV fold;
    # running SelectKBest on the full data beforehand would leak test-fold
    # information into training, so the raw features go straight to fit.
    grid_searcher.fit(features, labels)
    print("Cross-validated score: {0}".format(grid_searcher.best_score_))
    print("Params: ", grid_searcher.best_params_)

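# The tuner above assumes two project-specific helpers, targetFeatureSplit
# and my_score_func, that are defined elsewhere. A minimal sketch of what
# they might look like (both hypothetical stand-ins, not the project's real
# implementations):
from sklearn.metrics import f1_score


def targetFeatureSplit(data):
    # assumes each row is [label, feature_1, ..., feature_n]
    labels = [row[0] for row in data]
    features = [list(row[1:]) for row in data]
    return labels, features


def my_score_func(y_true, y_pred):
    # any (y_true, y_pred) -> float metric works with make_scorer;
    # weighted F1 is one plausible choice here
    return f1_score(y_true, y_pred, average="weighted")
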
def test_grid_search_precomputed_kernel():
    # Test that grid search works when the input features are given in the
    # form of a precomputed kernel matrix
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)

def build_and_train():
    data = pd.read_csv('../data/training.csv')
    data = data.dropna(subset=['Gender', 'Married', 'Credit_History',
                               'LoanAmount'])

    pred_var = ['Gender', 'Married', 'Dependents', 'Education',
                'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
                'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
                'Property_Area']

    X_train, X_test, y_train, y_test = train_test_split(data[pred_var],
                                                        data['Loan_Status'],
                                                        test_size=0.25,
                                                        random_state=42)
    # .as_matrix() was removed from pandas; .to_numpy() is the replacement
    y_train = y_train.replace({'Y': 1, 'N': 0}).to_numpy()
    y_test = y_test.replace({'Y': 1, 'N': 0}).to_numpy()

    pipe = make_pipeline(PreProcessing(), RandomForestClassifier())

    # min_impurity_split was deprecated and later removed from scikit-learn;
    # min_impurity_decrease is the supported equivalent
    param_grid = {"randomforestclassifier__n_estimators": [10, 20, 30],
                  "randomforestclassifier__max_depth": [None, 6, 8, 10],
                  "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20],
                  "randomforestclassifier__min_impurity_decrease": [0.1, 0.2, 0.3]}

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)
    grid.fit(X_train, y_train)
    return grid

def tune_parameters(features, labels):
    """
    Use GridSearchCV to identify and return the best parameters to use
    for the Decision Tree algorithm.

    features = features list as returned by the targetFeatureSplit script
    labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn import tree
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer

    # Make scorer for the GridSearchCV function
    scorer = make_scorer(custom_scorer, greater_is_better=True)

    # Parameter names and settings to be used by GridSearchCV.
    # Note: "min_impurity_split" and "presort" were removed in recent
    # scikit-learn releases; this grid targets the older API.
    parameters = [{"criterion": ["gini", "entropy"],
                   "splitter": ["best", "random"],
                   "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "min_impurity_split": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
                   "presort": [True, False],
                   "random_state": [42]}]

    # Use GridSearchCV to identify the best parameters.
    # K-fold cross-validation is used (100 folds).
    # The F1 score from the custom_scorer function is used as the evaluator.
    clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv=100,
                       scoring=scorer)
    clf.fit(features, labels)
    best_parameters = clf.best_params_

    return best_parameters

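# custom_scorer is not shown in the snippet above; per its comments it
# evaluates F1 with greater_is_better=True. A hedged sketch of a compatible
# implementation (hypothetical; the project's real scorer may differ):
from sklearn.metrics import f1_score


def custom_scorer(y_true, y_pred):
    # wrapped by make_scorer above, so it only needs the plain
    # (y_true, y_pred) -> float metric signature
    return f1_score(y_true, y_pred)
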
def test_ovo_gridsearch():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovo, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert_true(best_C in Cs)

def svm_hitters_params(to_predict_hitters, x_hitters, hitter_predictions):
    # create lists of parameters to search through
    c = [10**i for i in np.arange(-3, 3)]
    gamma = c
    poly_coeff0 = [10**i for i in np.arange(0, 3)]

    # finding optimal parameters for svm
    best_params = []

    # preprocess the x values
    x_hitters = preprocessing.scale(x_hitters)

    for col in to_predict_hitters:
        y = hitter_predictions[col].tolist()
        x_train, x_test, y_train, y_test = train_test_split(x_hitters, y)
        svr = svm.SVC()
        # note: coef0 only affects the 'poly' and 'sigmoid' kernels, so it
        # is ignored while the kernel is fixed to 'rbf'
        parameters = {'kernel': ['rbf'], 'gamma': gamma, 'coef0': poly_coeff0}
        clf = GridSearchCV(svr, parameters)
        clf.fit(x_train, y_train)
        best_params.append({col: clf.best_params_})

    return best_params

def test_pipeline():
    param_grid = [{'logisticregression__C': [1, 0.1, 10]}]
    pipe = make_pipeline(StandardScaler(), CopyTransformer(),
                         LogisticRegression())
    grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1)
    grid.fit(X, y)

def test_stochastic_gradient_loss_param():
    # Make sure the predict_proba works when loss is specified
    # as one of the parameters in the param_grid.
    param_grid = {
        'loss': ['log'],
    }
    X = np.arange(24).reshape(6, -1)
    y = [0, 0, 0, 1, 1, 1]
    clf = GridSearchCV(estimator=SGDClassifier(loss='hinge'),
                       param_grid=param_grid)

    # When the estimator is not fitted, `predict_proba` is not available as
    # the loss is 'hinge'.
    assert_false(hasattr(clf, "predict_proba"))
    clf.fit(X, y)
    clf.predict_proba(X)
    clf.predict_log_proba(X)

    # Make sure `predict_proba` is not available when setting loss=['hinge']
    # in param_grid
    param_grid = {
        'loss': ['hinge'],
    }
    clf = GridSearchCV(estimator=SGDClassifier(loss='hinge'),
                       param_grid=param_grid)
    assert_false(hasattr(clf, "predict_proba"))
    clf.fit(X, y)
    assert_false(hasattr(clf, "predict_proba"))

def plot_cross_val_selection():
    iris = load_iris()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        iris.data, iris.target, random_state=0)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_trainval, y_trainval)
    # note: grid_scores_ was removed in scikit-learn 0.20; a cv_results_-based
    # version of this function appears further below
    scores = grid_search.grid_scores_[15:]
    best = np.argmax([x.mean_validation_score for x in scores])

    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(scores))
    plt.ylim(0, 1.1)
    for i, score in enumerate(scores):
        marker_cv, = plt.plot([i] * 5, score.cv_validation_scores, '^',
                              c='gray', markersize=5, alpha=.5)
        marker_mean, = plt.plot(i, score.mean_validation_score, 'v',
                                c='none', alpha=1, markersize=10)
        if i == best:
            marker_best, = plt.plot(i, score.mean_validation_score, 'o',
                                    c='red', fillstyle="none", alpha=1,
                                    markersize=20, markeredgewidth=3)
    plt.xticks(range(len(scores)),
               [str(score.parameters).strip("{}").replace("'", "")
                for score in scores], rotation=90)
    plt.ylabel("validation accuracy")
    plt.xlabel("parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
               ["cv accuracy", "mean accuracy", "best parameter setting"],
               loc=(1.05, .4))

def build(X, y=None):
    """
    Inner build function that builds a single model.
    :param X:
    :param y:
    :return:
    """
    model = Pipeline([
        ('vectorizer', TfidfVectorizer(tokenizer=self.spacy_tokenizer,
                                       preprocessor=None,
                                       lowercase=False)),
        ('clf', SVC(C=1, kernel="linear", probability=True,
                    class_weight='balanced'))])

    from sklearn.model_selection import GridSearchCV

    items, counts = np.unique(y, return_counts=True)
    cv_splits = max(2, min(5, np.min(counts) // 5))
    Cs = [0.01, 0.25, 1, 2, 5, 10, 20, 100]
    param_grid = {'clf__C': Cs, 'clf__kernel': ["linear"]}
    grid_search = GridSearchCV(model,
                               param_grid=param_grid,
                               scoring='f1_weighted',
                               cv=cv_splits,
                               verbose=2,
                               n_jobs=-1)
    grid_search.fit(X, y)
    return grid_search

def train_sgd(data, result, scoring=None):
    print("train SGDClassifier {}".format(len(data)))

    # scaler must be bound before the type check below; the original left
    # both assignments commented out, which raised a NameError
    scaler = None
    # scaler = preprocessing.MinMaxScaler()
    print("Scale: {}".format(type(scaler)))
    if scaler is not None:
        data = scaler.fit_transform(data)

    # classifier = SGDClassifier(loss="hinge", penalty="l2")
    # classifier.fit(data, result)
    # return scaler, classifier

    parameters = {
        'loss': ('hinge', 'log', 'modified_huber', 'squared_hinge',
                 'perceptron', 'squared_loss', 'huber',
                 'epsilon_insensitive', 'squared_epsilon_insensitive'),
        'penalty': ('none', 'l2', 'l1', 'elasticnet')
    }
    print(parameters)
    search = GridSearchCV(SGDClassifier(), parameters, scoring=scoring,
                          n_jobs=1)
    search.fit(data, result)
    print("best params: {}".format(search.best_params_))
    print("best score: {}".format(search.best_score_))
    print()
    return scaler, search.best_estimator_.fit(data, result)

def fit(self, X, y, sample_weight=None, check_input=True):
    """Fit Ridge regression model after searching for the best mu and tau.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data
    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values
    sample_weight : float or array-like of shape [n_samples]
        Sample weight

    Returns
    -------
    self : Returns self.
    """
    self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    y = self._label_binarizer.fit_transform(y)
    if self._label_binarizer.y_type_.startswith('multilabel'):
        raise ValueError(
            "%s doesn't support multi-label classification" % (
                self.__class__.__name__))
    else:
        y = column_or_1d(y, warn=False)

    param_grid = {'tau': self.taus, 'lamda': self.lamdas}
    fit_params = {'sample_weight': sample_weight,
                  'check_input': check_input}
    estimator = L1L2TwoStepClassifier(
        mu=self.mu, fit_intercept=self.fit_intercept, use_gpu=self.use_gpu,
        threshold=self.threshold, normalize=self.normalize,
        precompute=self.precompute, max_iter=self.max_iter,
        copy_X=self.copy_X, tol=self.tol, warm_start=self.warm_start,
        positive=self.positive, random_state=self.random_state,
        selection=self.selection)
    gs = GridSearchCV(
        estimator=estimator, param_grid=param_grid, fit_params=fit_params,
        cv=self.cv, scoring=self.scoring, n_jobs=self.n_jobs, iid=self.iid,
        refit=self.refit, verbose=self.verbose,
        pre_dispatch=self.pre_dispatch, error_score=self.error_score,
        return_train_score=self.return_train_score)
    gs.fit(X, y)

    estimator = gs.best_estimator_
    self.tau_ = estimator.tau
    self.lamda_ = estimator.lamda
    self.coef_ = estimator.coef_
    self.intercept_ = estimator.intercept_
    self.best_estimator_ = estimator  # XXX DEBUG

    if self.classes_.shape[0] > 2:
        ndim = self.classes_.shape[0]
    else:
        ndim = 1
    self.coef_ = self.coef_.reshape(ndim, -1)
    return self

def score_nestedCV(self, G1, model, param_grid, effect, nested):
    k_fold = model_selection.KFold(n_splits=self.n_folds).split(
        range(self.Y.shape[0]))
    i_fold = 0
    scores = sp.zeros(self.n_folds)
    params = list()
    for train, test in k_fold:
        (trainData, trainY) = self._packData(G1, train, effect)
        (testData, testY) = self._packData(G1, test, effect)
        if nested:
            clf = GridSearchCV(estimator=model, param_grid=param_grid,
                               n_jobs=self.n_jobs_grid,
                               cv=self.n_folds_params,
                               scoring=self.scoring, verbose=self.verbose)
            clf.fit(trainData, trainY.flatten())
            params.append(clf.best_params_)
            scores[i_fold] = clf.score(testData, testY.flatten(),
                                       method_scorer=False)
        else:
            model.fit(trainData, trainY.flatten())
            scores[i_fold] = SCORERS[self.scoring](model, testData,
                                                   testY.flatten())
        i_fold += 1
    return scores, params

def fit_branchmodel(self, xwl2, hm_y):
    clf = LogisticRegression(C=1., class_weight='balanced', penalty='l1',
                             solver='liblinear', max_iter=300)
    param_grid = dict(C=(np.logspace(-.2, 1, 15)))
    # gridclf = GridSearchCV(clf, param_grid=param_grid,
    #                        cv=StratifiedKFold(hm_y, n_folds=4), n_jobs=-1,
    #                        scoring='precision_weighted')
    gridclf = GridSearchCV(clf, param_grid=param_grid,
                           cv=StratifiedShuffleSplit(n_splits=50,
                                                     test_size=.2,
                                                     random_state=1),
                           n_jobs=-1, scoring='accuracy')
    gridclf.fit(xwl2, hm_y)
    return gridclf.best_estimator_

def optimize_model_regress(data, tc):
    train_data = data.sample(frac=.8)
    test_data = data.drop(train_data.index)
    train_y = train_data['temperature'] / tc
    train_X = train_data.drop(['T/Tc', 'temperature'], axis=1)
    test_y = test_data['temperature'] / tc
    test_X = test_data.drop(['T/Tc', 'temperature'], axis=1)

    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': [1, .5, .1, 1e-2, 1e-3, 1e-4],
                         'C': [.1, .5, 1, 5, 10, 50, 100, 500, 1000]},
                        {'kernel': ['linear'],
                         'C': [.1, .5, 1, 5, 10, 50, 100, 500, 1000]}]

    model = GridSearchCV(svm.SVR(), tuned_parameters, cv=5)
    model.fit(train_X, train_y)

    print()
    print("Best parameters:")
    print()
    print(model.best_params_)
    print()
    print("Grid scores:")
    print()
    means = model.cv_results_['mean_test_score']
    stds = model.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, model.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    y_true, y_pred = test_y, model.predict(test_X)
    print("Mean Absolute Error : " + str(mean_absolute_error(y_pred, y_true)))
    print()

def model_selection(x_matrix, y_vector, param_grid, cv=None, scoring=None):
    pipeline = Pipeline([('resampler', None),
                         ('classifier', DummyClassifier())])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=cv,
                                  scoring=scoring)
    grid_search_cv.fit(x_matrix, y_vector)
    return grid_search_cv

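# Because model_selection builds its pipeline from placeholder steps
# (None for 'resampler', DummyClassifier for 'classifier'), the concrete
# estimators are supplied through param_grid. A hedged usage sketch, with
# illustrative estimators and data names (x_matrix, y_vector) assumed:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

param_grid = [
    # each dict pins the 'classifier' step to one estimator and searches
    # that estimator's own hyperparameters alongside it
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [0.1, 1.0, 10.0]},
    {'classifier': [RandomForestClassifier(random_state=0)],
     'classifier__n_estimators': [100, 300]},
]
grid_search_cv = model_selection(x_matrix, y_vector, param_grid,
                                 cv=5, scoring='f1')
print(grid_search_cv.best_params_)
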
def inner_cv_loop(Xtrain, Ytrain, clf, parameters,
                  oversample=None, fa_dims=20, verbose=False):
    """
    use GridSearchCV to find best classifier for training set
    """
    rocscore = {}
    best_est = {}
    facanal = {}
    for fa_d in [0, fa_dims]:
        clfname = 'fa' if fa_d > 0 else "nofa"
        if fa_d > 0:
            facanal[clfname] = FactorAnalysis(fa_d)
            Xtrain = facanal[clfname].fit_transform(Xtrain)
        else:
            facanal[clfname] = None

        if verbose:
            print(clfname)

        gs = GridSearchCV(clf, parameters, scoring='roc_auc')
        gs.fit(Xtrain, Ytrain)
        rocscore[clfname] = gs.best_score_
        best_est[clfname] = gs.best_estimator_

    bestscore = numpy.max([rocscore[i] for i in rocscore.keys()])
    bestclf = [i for i in rocscore.keys() if rocscore[i] == bestscore][0]
    if verbose:
        print('best:', bestclf, bestscore, best_est[bestclf],
              facanal[bestclf])
    return best_est[bestclf], bestscore, facanal[bestclf]

def _search_param(self, metric, X, y):
    '''Find best potential parameters set using few n_estimators'''
    # Make sure user-specified params are in the grid.
    max_depth_grid = list(np.unique([self.model_instance.max_depth, 5, 7]))
    colsample_bytree_grid = list(
        np.unique([self.model_instance.colsample_bytree, 0.66, 0.9]))
    reg_lambda_grid = list(np.unique([self.model_instance.reg_lambda, 1, 5]))
    param_grid = {
        'max_depth': max_depth_grid,
        'learning_rate': [max(self.model_instance.learning_rate, 0.3)],
        'n_estimators': [min(self.model_instance.n_estimators, 60)],
        'gamma': [self.model_instance.gamma],
        'min_child_weight': [self.model_instance.min_child_weight],
        'max_delta_step': [self.model_instance.max_delta_step],
        'subsample': [self.model_instance.subsample],
        'colsample_bytree': colsample_bytree_grid,
        'colsample_bylevel': [self.model_instance.colsample_bylevel],
        'reg_alpha': [self.model_instance.reg_alpha],
        'reg_lambda': reg_lambda_grid,
        'scale_pos_weight': [self.model_instance.scale_pos_weight],
        'base_score': [self.model_instance.base_score],
        'seed': [self.model_instance.seed]
    }
    grid_search = GridSearchCV(self.model_instance, param_grid, cv=2,
                               refit=False, scoring=metric)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    # Change params back to the original values
    best_params['learning_rate'] = self.model_instance.learning_rate
    best_params['n_estimators'] = self.model_instance.n_estimators
    return best_params

def test_gridsearch():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn = KNeighborsClassifier(n_neighbors=2)
    sfs1 = SFS(estimator=knn,
               k_features=3,
               forward=True,
               floating=False,
               cv=5)
    pipe = Pipeline([('sfs', sfs1), ('knn', knn)])
    param_grid = [
        {'sfs__k_features': [1, 2, 3, 4],
         'sfs__estimator__n_neighbors': [1, 2, 3, 4]}
    ]
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      n_jobs=1,
                      iid=False,
                      cv=5,
                      refit=False)
    gs = gs.fit(X, y)
    assert gs.best_params_['sfs__k_features'] == 3

def __param_search(self, clasif, train_data):
    """
    :param clasif: classifier to cross-validate. String
    :param train_data: data for training
    :return: GridSearchCV object

    Reassembles the parameters to use and runs a cross-validation.
    """
    param_grid = dict()
    if self.preprocess_data is False:
        pipeline = self.pipelines[clasif]
        self.parameters[clasif].update(parameters.vect_params)
        param_grid = self.parameters[clasif]
    else:
        pipeline = self.pipelines_train[clasif]
        for k in self.parameters[clasif].keys():
            param_grid[k[4:]] = self.parameters[clasif][k]
        print(param_grid)
        print(pipeline)

    print("\n#Searching parameters for ", clasif) if self.v >= 1 else None
    print("Parameters: ", param_grid) if self.v >= 2 else None
    print("train_data: ", train_data) if self.v >= 3 else None
    print("Lengths: %d %d" % (len(train_data[0]),
                              len(train_data[1]))) if self.v >= 2 else None

    if self.bin_flag:
        try:
            grid_search = GridSearchCV(pipeline, param_grid, verbose=self.v,
                                       scoring='roc_auc', refit=True, cv=3)
        except Exception as e:
            os.system('cat %s >> archivo.txt' % e)
    else:
        grid_search = GridSearchCV(pipeline, param_grid, verbose=self.v,
                                   scoring='accuracy', refit=True, cv=3)
    grid_search.fit(train_data[0], train_data[1])
    return grid_search

def svr_linear(X, Y, x, y):
    # note: 'degree' only affects the 'poly' kernel, so it is inert with
    # kernel='linear'
    reg = GridSearchCV(SVR(kernel='linear'), cv=10,
                       param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                                   "degree": [1, 2, 3, 4]})
    reg.fit(X, Y)
    y_predict = reg.predict(x)
    rmse = RMSE(y=y, y_predict=y_predict)
    print("rmse: ", str(rmse))
    return rmse, y_predict

def PipeFeauture(Xtrain, Ytrain):
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(Xtrain, Ytrain)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=.2, random_state=0)

    pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'svc__loss': ('hinge', 'squared_hinge')
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset bigram representation which is used in the last of
    # the grid_search is considered the best estimator since they all
    # converge to 100% accuracy models
    assert_equal(grid_search.best_score_, 1.0)
    best_vectorizer = grid_search.best_estimator_.named_steps['vect']
    assert_equal(best_vectorizer.ngram_range, (1, 1))

def logistic_regression(self):
    C_array = [2**i for i in range(-10, 10)]
    tuned_parameters = {'C': C_array}
    logi_reg = LogisticRegression()
    clf = GridSearchCV(logi_reg, tuned_parameters, cv=5,
                       scoring="recall")  # make_scorer(my_scorer))
    clf.fit(self.train_data_x, self.train_labels_y)
    self.models.append(clf)

def kernel_ridge_linear(X, Y, x, y):
    # note: 'degree' only affects the 'polynomial' kernel, so it is inert
    # with kernel='linear'
    reg = GridSearchCV(KernelRidge(kernel='linear'), cv=10,
                       param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                   "degree": [1, 2, 3, 4]})
    reg.fit(X, Y)
    y_predict = reg.predict(x)
    rmse = RMSE(y=y, y_predict=y_predict)
    print("rmse: ", str(rmse))
    return y_predict

def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score'
                                             % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])

accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'C': [1, 2, 3, 4], 'kernel': ['linear']},
              {'C': [1, 2, 3, 4], 'kernel': ['rbf'],
               'gamma': [0.67, 0.68, 0.69, 0.675, 0.685]}]
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1,
def get_grid_result(self, param_grid):
    model = KerasClassifier(build_fn=self.build_fn, epochs=self.epochs,
                            batch_size=self.batch_size, verbose=0)
    clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1,
                       cv=self.split)
    grid_result = clf.fit(self.train_x, self.train_y)
    return grid_result

def A2_SVM_ParameterTuning(x_train, y_train):
    param_grid = {'kernel': ('linear', 'poly', 'rbf'), 'C': [1, 10]}
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=4)  # GridSearchCV
    grid.fit(x_train, y_train)
    return grid.best_params_

kfold = StratifiedKFold(n_splits=10)

rf_param_grid = {
    "max_depth": [None],
    "max_features": [1, 3, 6],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [50, 100, 300, 500],
    "criterion": ["gini", "entropy"]
}

from sklearn.model_selection import GridSearchCV
gsRFC = GridSearchCV(classifier, param_grid=rf_param_grid, cv=kfold,
                     scoring="accuracy", n_jobs=-1, verbose=1)
gsRFC.fit(X_train, y_train)
RFC_best = gsRFC.best_estimator_
gsRFC.best_score_

# Generate competition submission
X_comp, y_none = pre_process(comp_test_dataset)
comp_test_pred = gsRFC.predict(X_comp)
comp_output = pd.DataFrame(data=np.append(comp_test_dataset[["PassengerId"
# Model
estimator = RandomForestRegressor(n_estimators=250, criterion='mse',
                                  n_jobs=15, verbose=1, random_state=0)
pipeline = Pipeline([
    ('imputation', make_union(SimpleImputer(strategy="median"),
                              MissingIndicator(error_on_new=False))),
    ('estimator', estimator)])

cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)
param_grid = {'estimator__max_depth': [5, 10, 20, 40, None],
              'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5,
                           verbose=2, n_jobs=15)
metrics = []


def predict_collect_save(data_pred, data_collect, y_true, test_index, split,
                         save_type):
    scores = {}
    pred_ = grid_search.predict(data_pred)
    y_true_ = y_true.iloc[test_index]
    predictions = pd.DataFrame(pred_, columns=['predicted'],
                               index=y_true_.index)
    predictions['true'] = y_true_
    predictions['test_indices'] = pd.DataFrame(test_index,
                                               columns=['test indices'],
                                               index=y_true_.index)

# We create an instance of the model.
Estimator_DTree = DecisionTreeClassifier()

# Now, we are going to use a grid search cross-validation to explore
# combinations of parameters.
param_grid = {
    'criterion': ['gini'],
    'max_features': ['auto'],
    'splitter': ['random', 'best'],
    'min_samples_split': [25, 30, 35, 40, 45],
    'max_depth': range(4, 6),
    'random_state': [0]
}

Grid_DTree = GridSearchCV(Estimator_DTree, param_grid, cv=10, verbose=2,
                          scoring='f1')
Grid_DTree.fit(X_train, Y_train)

# Once it has been fitted, we get several parameters.
# print("ParameterGrid: ", '\n', list(ParameterGrid(param_grid)), '\n')
print("Best estimator: ", Grid_DTree.best_estimator_, '\n')
print("Best Score: ", round(Grid_DTree.best_score_, 2))
print("Best Parameters ", Grid_DTree.best_params_)

# Now we keep the best estimator found by the grid search and refit with it.
Best_Grid_estimator_DTree = Grid_DTree.best_estimator_

    return normalized_gini


gini_scorer = make_scorer(normalized_gini, greater_is_better=True)

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

gsc = GridSearchCV(
    estimator=rf,
    param_grid={
        # 'class_weight': [{0: 1, 1: x} for x in range(300, 701, 100)] + ['balanced'],
        # 'min_samples_leaf': range(5, 51, 5),
        # 'min_samples_split': range(5, 56, 10),
        # 'n_estimators': range(200, 601, 200),
        # 'criterion': ('gini', 'entropy')
        # 'max_features': ('auto', 'sqrt'),
        # 'max_features': range(3, 9),
        # 'max_depth': range(3, 7),
    },
    # scoring='neg_log_loss',
    scoring='roc_auc',
    # scoring='f1',
    # scoring=gini_scorer,
    cv=cv,
    verbose=2
)

rsc = RandomizedSearchCV(
    estimator=rf,
    param_distributions={
        'n_estimators': randint(250, 2500),
        # 'class_weight': [{0: 1, 1: x} for x in range(15, 51, 5)],
environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses (n_jobs > 1)

# Linear SVM
print("\rLinear SVM ", end='')
parameters = {'C': [0.01, 0.1, 1, 10, 100]}
# svc = svm.LinearSVC(class_weight=args.class_weight, random_state=seed)
svc = svm.SVC(kernel='linear', class_weight=args.class_weight,
              random_state=args.seed, probability=True, max_iter=max_iters)
clf = GridSearchCV(svc, parameters, cv=sss, n_jobs=-1, scoring=scoring,
                   refit='roc_auc', return_train_score=True)
try:
    clf.fit(data, labels)
except Exception as e:
    if hasattr(e, 'message'):
        print(e.message)
    else:
        print(e)
save_results(args.savefile, 'a', 'Linear SVM', clf)

# RBF SVM
print("\rRBF SVM ", end='')

from sklearn.preprocessing import MinMaxScaler

Scaler = MinMaxScaler()
X_train_scaler = Scaler.fit_transform(X_train)
X_test_scaler = Scaler.transform(X_test)


# In[ ]:


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score

LoR = LogisticRegression(random_state=0)
c_values = {'C': [1, 15, 10, 100, 150, 250]}
grdClf = GridSearchCV(LoR, param_grid=c_values, scoring='precision')
grdClf.fit(X_train_scaler, y_train)
y_decs = grdClf.decision_function(X_test_scaler)
(grdClf.best_params_, grdClf.best_score_, roc_auc_score(y_test, y_decs))


# Performing Cross validation to evaluate the model

# In[ ]:


from sklearn.model_selection import cross_val_score
cross_val_score(DTClf, X_train, y_train, cv=5, scoring='precision')


# In[ ]:

print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test)) confusion_matrix(y_test,y_pred) # From our confusion matrix we can see that accuracy is pretty low. # In[35]: from sklearn.model_selection import GridSearchCV tol = [0.01, 0.001, 0.0001] max_iter = [100, 150, 200] param_grid = {'tol':tol, 'max_iter': max_iter} # In[38]: grid_model = GridSearchCV(logreg,param_grid,cv = 5) rescaledX = scaler.fit_transform(X) grid_model_result = grid_model.fit(rescaledX,y) best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_ print(best_score, best_params) # Best Score : 0.186
from sklearn.pipeline import Pipeline

# TODO: 1. Generate the dataset and split it
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5, random_state=16)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16)

# TODO: 2. Build a pipeline model with a scaler preprocessing step and an
# MLP step
params = {
    'mlp__hidden_layer_sizes': [[50], [100], [100, 100]],
    'mlp__alpha': [0.0001, 0.001, 0.01, 0.1]
}
pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                           ('mlp', MLPClassifier(max_iter=1600,
                                                 random_state=16))],
                    verbose=0)

# TODO: 3. Build the grid search model and report the results
grid = GridSearchCV(pipeline, param_grid=params, cv=5, iid=False, n_jobs=8,
                    verbose=1)
grid.fit(X_train, y_train)

print('Cross-validation score: {:.2f}'.format(grid.best_score_))
print('Best model parameters: {}'.format(grid.best_params_))
print('Test set score: {}'.format(grid.score(X_test, y_test)))
print(pipeline.steps)

def model_selection(X, y, rep=10, random_state=42):
    """Uses grid search and cross validation to choose the best clf
    for the task (X, y)"""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # models = [
    #     ## ("XGB", XGBClassifier(seed=random_state, objective="multi:softmax")),
    #     ("NaiveBayes", GaussianNB()),
    #     ("RF", RandomForestClassifier(random_state=random_state)),
    #     ("AdaBoost", AdaBoostClassifier(random_state=random_state)),
    #     ("LR", LogisticRegression(random_state=random_state)),
    #     ## ("linSVC", LinearSVC(multi_class="ovr")),
    #     ("SVC", SVC(random_state=random_state, probability=True)),
    #     ("MLP", MLPClassifier(random_state=random_state))
    #     # ("KNN", KNeighborsClassifier())
    # ]
    # hyperparameters = [
    #     ## [("max_depth", [15, 20]), ("n_estimators", [100, 200])],
    #     [],
    #     [("max_depth", [5, 6, 10])],
    #     [("n_estimators", [10, 50, 100])],
    #     [("C", [1.0, 0.95, 0.9])],
    #     ## [],
    #     [],
    #     [("hidden_layer_sizes", [(200,), (150,), (100, 100), (300,)])]
    #     # [("n_neighbors", [5, 6, 7, 10])]
    # ]

    best_est = None
    best_score = 0.0
    results_summary = []
    all_models = []
    for model, hyperp_setting in zip(models_clfs, hyperparameters_clfs):
        print("Fitting " + model[0])
        pipeline = Pipeline([model])
        # pipeline = Pipeline([("scaling", StandardScaler()), model])
        param_grid = {}
        for param in hyperp_setting:
            param_grid[model[0] + "__" + param[0]] = param[1]
        grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                                   verbose=True, scoring="f1_weighted",
                                   cv=3, n_jobs=5)
        grid_search.fit(X_train, y_train)
        clf = grid_search.best_estimator_
        scores = []
        np.random.seed(random_state)
        for i in range(0, rep):
            rows = np.random.randint(2, size=len(X_train)).astype('bool')
            clf.fit(X_train[rows], y_train[rows])
            preds = clf.predict(X_test)
            scores.append(f1_score(y_test, preds, average='weighted'))
        results_summary.append([model, scores])
        print(results_summary[-1])
        avg_score = pd.DataFrame(scores).mean()[0]
        if avg_score > best_score:
            best_score = avg_score
            best_est = clf
        clf.fit(X, y)
        all_models.append([model[0], clf])

    y_pred = best_est.predict(X_test)
    rocs = []
    preds_score = best_est.predict_proba(X_test)
    for i in range(0, len(best_est.classes_)):
        correct_class = best_est.classes_[i]
        # .as_matrix() was removed from pandas; .to_numpy() is the replacement
        fpr, tpr, _ = roc_curve(y_test.to_numpy(),
                                [p[i] for p in preds_score],
                                pos_label=correct_class)
        roc_df = pd.DataFrame(tpr, columns=["tpr"]).join(
            pd.DataFrame(fpr, columns=["fpr"])).join(
            pd.DataFrame([correct_class] * len(tpr), columns=["class"]))
        rocs.append(roc_df)
    rocs_df = pd.concat(rocs)

    # using whole data after cv
    best_est.fit(X, y)
    return (best_est, best_score, confusion_matrix(y_test, y_pred),
            rocs_df, results_summary, all_models)

# model details
model_name = f'{args.model}_t{args.topic_num:02d}_l{args.max_seq_len}_d{args.drop_out}'
model_group = models_info[args.model]['group']
params = models_info[args.model]['params']

# model training
print(f'{model_name} training...')
if model_group == 'sklearn':
    # build models
    model = build_SKM(model_type=args.model, max_features=20000,
                      selectK=10000)
    # grid search
    clf = GridSearchCV(model, params, cv=3, n_jobs=-1)
    clf.fit(X_train, y_train)
    # train with best params
    model.set_params(**clf.best_params_)
    model.fit(X_train, y_train)
    # test and save
    save_file(model, 'models', model_name)
    results = model_evaluation(X_test, y_test, model=model)
if model_group == 'keras':
    # preprocessing
    X_train, word_index, doc2seq = tokenizer_transform(
        X_train,
#     # 'model__beta_2': [1e-7, 1e-8, 1e-9]
#     # }

# Two options:
# 1) Grid search to find best model
# 2) Train best model and save to disk

# Option 1) Grid search to find best model
if args.type == "gridsearch":
    print("Start Grid search...")
    X_train, X_test, y_train, y_test = train_test_split(
        V, labels.values.ravel(), test_size=0.3, random_state=0)
    grid = GridSearchCV(pipeline, parameters, cv=10, n_jobs=-1, verbose=1)
    # grid = grid.fit(V, labels.values.ravel())
    grid = grid.fit(X_train, y_train)
    y_pred = grid.best_estimator_.predict(X_test)
    print("\nBest: %f using %s" % (grid.best_score_, grid.best_params_))
    print("F1-Score:", f1_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred))
    print("Recall: ", recall_score(y_test, y_pred))
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("roc auc: ", roc_auc_score(y_test, y_pred))
    print("Performance overall: ")
    print(classification_report(y_test, y_pred))

random_state=42)
rnd_search_cv.fit(train_prepared, price_labels)

#%%
rnd_search_cv.best_estimator_

#%%
# Ridge Regression
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(train_prepared, price_labels)
print(ridge_reg.predict(test_prepared))

#%%
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': (1000, 10, 100), 'max_features': [15, 5, 10]}
forest = RandomForestRegressor(random_state=0)
gsearch = GridSearchCV(forest, parameters, cv=5)
gsearch.fit(X_train, y_train)

#%%
gsearch.best_estimator_
gsearch.best_score_

#%%

forest_rmse_scores = np.sqrt(-forest_scores)
display_scores("Random Forest Regression", forest_rmse_scores)

# SupportVectorRegressor Validation
sv_scores = cross_val_score(sv_reg, X_train, Y_train,
                            scoring="neg_mean_squared_error", cv=10)
sv_rmse_scores = np.sqrt(-sv_scores)
display_scores("Support Vector Regression", sv_rmse_scores)

# 8. Fine-Tune RandomForestRegressor Model using Grid Search
param_grid = [{'n_estimators': [10, 20, 30, 40], 'max_features': [4, 8]}]
grid_search = GridSearchCV(forest_reg, param_grid, cv=10,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train, Y_train)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
print("Best Parameter: ")
print(grid_search.best_params_)

final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("Final Prediction: ")
print(final_rmse)

class GradientBoost:
    def __init__(self):
        """ Don't pass in anything """
        self.m = GradientBoostingClassifier()

    def log_loss_score(self, y_true, y_pred):
        """
        input:
            y_true, y_pred: 1d arrays of size n
        output:
            float: log loss score. It's a negative number, closer to zero
            is better.
        """
        return -log_loss(y_true, y_pred)

    def _change_data(self, old_df):
        """ take dataframe, output dataframe """
        # return jconvert(a_convert(old_df))
        return max_data_pipeline(jconvert(a_convert(do_it(old_df))))

    def fit(self, data):
        """ data is a dataframe, no output """
        self.data = self._change_data(data)
        self.X = self.data.drop(['fraud'], axis=1).values
        self.y = self.data['fraud'].values
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y)
        self.m.fit(self.X_train, self.y_train)

    def optimize(self, param_grid, n_jobs=-1, cv=5):
        """ run grid search to change model """
        self.grid = GridSearchCV(self.m, param_grid=param_grid, cv=cv,
                                 n_jobs=n_jobs, scoring='neg_log_loss')
        self.grid.fit(self.X_train, self.y_train)
        self.m = self.grid.best_estimator_

    def score(self):
        return self.log_loss_score(self.y_test,
                                   self.m.predict_proba(self.X_test))

    def predict(self, df):
        X = self._change_data(df).values
        return self.m.predict(X)

    def predict_proba(self, df):
        converted = self._change_data(df)
        default_col = pd.Series([0] * len(converted))
        for col in self.data.columns:
            if col not in converted.columns and col != 'fraud':
                converted[col] = default_col
        return self.m.predict_proba(converted.values)

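# A hedged usage sketch for the GradientBoost wrapper above; it assumes a
# DataFrame df with a 'fraud' label column, as fit() expects, and the
# parameter grid values here are illustrative:
gb = GradientBoost()
gb.fit(df)  # raw DataFrame; _change_data handles the conversion
gb.optimize(param_grid={'n_estimators': [100, 300],
                        'max_depth': [2, 3],
                        'learning_rate': [0.05, 0.1]})
print(gb.score())  # negative log loss on the held-out split
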
def cross_validate(featnames, tasknames, cvs, classifiers, gps, logger,
                   n_jobs, npy_suffix='', mid_layer=4):
    '''featnames: list of string, ['mine', 'mfcc']
    - tasknames: list of string, ['ballroom_extended', 'gtzan_genre',
      'gtzan_speechmusic', 'emoMusic', 'jamendo_vc', 'urbansound']
    - cvs: list of cv, 10 for rest, split arrays for urbansound and jamendo_vc
    - classifiers: list of classifier class, e.g. [KNeighborsClassifier, SVC]
    - gps: list of gp, e.g. [{"n_neighbors": [1, 2, 8, 12, 16]},
      {"C": [0.1, 8.0], "kernel": ['linear', 'rbf']}]
    - mid_layer: scalar, or list of scalar.
    '''
    np.random.seed(1209)
    if not isinstance(mid_layer, list):
        mid_layer = [mid_layer]
    logger.info('')
    logger.info('--- Cross-validation started for {} ---'.format(''.join(
        [str(i) for i in mid_layer])))
    for featname in featnames:
        logger.info(' * feat_name: {} ---'.format(featname))
        for classifier, gp in zip(classifiers, gps):
            clname = classifier.__name__
            logger.info(' - classifier: {} ---'.format(clname))
            for taskname, cv in zip(tasknames, cvs):
                logger.info(' . task: {} ---'.format(taskname))
                model_filename = 'clf_{}_{}_{}.cP'.format(
                    featname, taskname, clname)
                x, y = load_xy_many(taskname, featname, npy_suffix, logger,
                                    mid_layer=mid_layer)
                estimators = [('stdd', OptionalStandardScaler()),
                              ('clf', classifier())]
                pipe = Pipeline(estimators)
                if isinstance(gp, dict):  # k-nn or svm with single kernel
                    params = {'stdd__on': [True, False]}
                    params.update({'clf__' + key: value
                                   for (key, value) in gp.items()})
                elif isinstance(gp, list):
                    # svm: grid param can be a list of dictionaries
                    params = []
                    for dct in gp:  # should be dict of list, e.g. for svm
                        sub_params = {'stdd__on': [True, False]}
                        sub_params.update({'clf__' + key: value
                                           for (key, value) in dct.items()})
                        params.append(sub_params)
                clf = GridSearchCV(pipe, params, cv=cv, n_jobs=n_jobs,
                                   pre_dispatch='8*n_jobs').fit(x, y)
                logger.info(' . best score {}'.format(clf.best_score_))
                logger.info(clf.best_params_)
                print('best score of {}, {}, {}: {}'.format(
                    featname, taskname, clname, clf.best_score_))
                print(clf.best_params_)
                # pickle needs a binary file handle in Python 3
                cP.dump(clf, open(os.path.join(PATH_CLS, model_filename),
                                  'wb'))
                featname_midlayer = '{}_{}'.format(
                    featname, ''.join([str(i) for i in mid_layer]))
                save_result(featname_midlayer, taskname, clname,
                            clf.best_score_)

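# A hedged call sketch matching the cross_validate docstring above:
# classifier classes are passed uninstantiated, with one grid-parameter
# dict per classifier (logger and the task/cv values are assumptions):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

cross_validate(featnames=['mine', 'mfcc'],
               tasknames=['gtzan_genre'],
               cvs=[10],
               classifiers=[KNeighborsClassifier, SVC],
               gps=[{"n_neighbors": [1, 8, 16]},
                    {"C": [0.1, 8.0], "kernel": ['linear', 'rbf']}],
               logger=logger, n_jobs=4)
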
import pandas as pd

(X, y), _ = load_preproccessed_dataset(test_split=0.0, include_grades=True)
hyperparams = {
    'n_estimators': [100, 500],
    'max_depth': [3, None],
    'min_samples_leaf': [1, 0.05],
}
clf = GridSearchCV(estimator=RandomForestClassifier(),
                   param_grid=hyperparams, cv=10)
clf.fit(X, y)

cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[[
    *(f'param_{p}' for p in hyperparams.keys()),
    'mean_fit_time', 'mean_test_score', 'std_test_score'
]])

# In[60]:

y_pred_c


# # GridSearchCV

# In[63]:

parameters = [{'gamma': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1],
               'C': [0.1, 0.2, 0.25, 0.5, 1, 1.5, 2]}]
               # 'nu': [0.75, 0.8, 0.85, 0.9, 0.95, 0.97]}]

reg1 = GridSearchCV(SVR(kernel='rbf', tol=0.01), parameters, cv=5,
                    scoring='neg_mean_absolute_error')
reg1.fit(x_train, y_train.flatten())
y_pred1 = reg1.predict(x_train)
print("Best CV score: {:.4f}".format(reg1.best_score_))
print(reg1.best_params_)
# print(y_pred1)


# In[ ]:

# open a file where you want to store the data
file = open('regression_model.pkl', 'wb')

# dump information to that file
pickle.dump(reg, file)

########################## Ridge Regression #######################
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40]
}
ridge_regressor = GridSearchCV(ridge, parameters,
                               scoring='neg_mean_squared_error', cv=5)
ridge_regressor.fit(X, y)

print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

########################## Lasso Regression #######################
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

lasso = Lasso()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40]
}
lasso_regressor = GridSearchCV(lasso,
# random_state=100, topic_word_prior=None,
# total_samples=1000000.0, verbose=0)

# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30],
                 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0)

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

# The fitted GridSearchCV echoes its repr as notebook output, along the
# lines of:
# GridSearchCV(cv=None, error_score='raise',
#              estimator=LatentDirichletAllocation(...),
#              fit_params=None, iid=True, n_jobs=1,
#              param_grid={'n_topics': [10, 15, 20, 25, 30],
#                          'learning_decay': [0.5, 0.7, 0.9]},
#              pre_dispatch='2*n_jobs', refit=True,
#              return_train_score='warn', scoring=None, verbose=0)

def findClassifierParameters(clumpsImg, classesIntCol, variables,
                             preProcessor=None,
                             gridSearch=GridSearchCV(RandomForestClassifier(), {})):
    """
    Find the optimal parameters for a classifier using a grid search and
    return a classifier instance with those optimal parameters.

    :param clumpsImg: is the clumps image on which the classification is to
                      be performed
    :param classesIntCol: is the column with the training data as int values
    :param variables: is an array of column names which are to be used for
                      the classification
    :param preProcessor: is a scikit-learn processor such as
                         sklearn.preprocessing.MaxAbsScaler() which can
                         rescale the input variables independently as read
                         in (Default: None; i.e., not in use).
    :param gridSearch: is an instance of GridSearchCV parameterised with a
                       classifier and parameters to be searched.

    :return: Instance of the classifier with optimal parameters defined.

    Example::

        from rsgislib.classification import classratutils
        from sklearn.svm import SVC
        from sklearn.model_selection import GridSearchCV
        from sklearn.preprocessing import MaxAbsScaler

        clumpsImg = "./LS8_20150621_lat10lon652_r67p233_clumps.kea"
        classesIntCol = 'ClassInt'

        classParameters = {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                           'C': [1, 2, 3, 4, 5, 10, 100, 400, 500, 1e3, 5e3, 1e4, 5e4, 1e5],
                           'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 'auto'],
                           'degree': [2, 3, 4, 5, 6, 7, 8],
                           'class_weight': ['', 'balanced'],
                           'decision_function_shape': ['ovo', 'ovr', None]}
        variables = ['BlueRefl', 'GreenRefl', 'RedRefl', 'NIRRefl', 'SWIR1Refl', 'SWIR2Refl']

        gSearch = GridSearchCV(SVC(), classParameters)
        classifier = classratutils.findClassifierParameters(clumpsImg, classesIntCol, variables, preProcessor=MaxAbsScaler(), gridSearch=gSearch)

    """
    # Check gdal is available
    if not haveGDALPy:
        raise Exception("The GDAL python bindings required for this function "
                        "could not be imported\n\t" + gdalErr)
    # Check numpy is available
    if not haveNumpy:
        raise Exception("The numpy module required for this function "
                        "could not be imported\n\t" + numErr)
    # Check rios rat is available
    if not haveRIOSRat:
        raise Exception("The RIOS rat tools required for this function "
                        "could not be imported\n\t" + riosRatErr)
    # Check scikit-learn pre-processing is available
    if not haveSKLearnPreProcess:
        raise Exception("The scikit-learn pre-processing tools required for "
                        "this function could not be imported\n\t"
                        + sklearnPreProcessErr)
    # Check scikit-learn Grid Search is available
    if not haveSKLearnGS:
        raise Exception("The scikit-learn grid search tools required for "
                        "this function could not be imported\n\t"
                        + sklearnGSErr)

    ratDataset = gdal.Open(clumpsImg, gdal.GA_Update)

    numpyVars = []
    for var in variables:
        print("Reading " + var)
        tmpArr = rat.readColumn(ratDataset, var)
        if preProcessor is not None:
            tmpArr = tmpArr.reshape(-1, 1)
            tmpArr = preProcessor.fit_transform(tmpArr)
            tmpArr = tmpArr.reshape(-1)
        numpyVars.append(tmpArr)

    # Read in training classes
    classesInt = rat.readColumn(ratDataset, classesIntCol)

    xData = numpy.array(numpyVars)
    xData = xData.transpose()
    xData = numpy.where(numpy.isfinite(xData), xData, 0)

    print("Input data size: {} x {}".format(xData.shape[0], xData.shape[1]))

    trainingData = xData[numpy.isfinite(xData).all(axis=1)]
    classesInt = classesInt[numpy.isfinite(xData).all(axis=1)]

    trainingData = trainingData[classesInt > 0]
    classesInt = classesInt[classesInt > 0]

    print("Training data size: {} x {}".format(trainingData.shape[0],
                                               trainingData.shape[1]))
    print("Training data IDs size: {}".format(classesInt.shape[0]))

    classIDs = numpy.unique(classesInt)
    print(classIDs)
    for id in classIDs:
        print("Class {} has {} samples.".format(
            id, classesInt[classesInt == id].shape[0]))

    gridSearch.fit(trainingData, classesInt)
    if not gridSearch.refit:
        raise Exception("Grid Search did not find a fit and therefore "
                        "failed...")

    print("Best score was {} and has parameters {}.".format(
        gridSearch.best_score_, gridSearch.best_params_))

    return gridSearch.best_estimator_

# X = sc.fit_transform(X)
# X_test = sc.transform(X_test)
# =============================================================================

################################################################################
# logistic regression
################################################################################
logistic = linear_model.LogisticRegression()

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross validation
# (note: recent scikit-learn defaults to the 'lbfgs' solver, which does not
# support the 'l1' penalty; use solver='liblinear' in that case)
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X, y)

# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

# prediction on test data using the best model
# =============================================================================
# y_pred = best_model.predict(X_test)
# =============================================================================

################################################################################
# random forest regression
################################################################################
# =============================================================================
# from sklearn.ensemble import RandomForestRegressor
    'knn__weights': ['uniform', 'distance']
}

# --------------------------------

pipe_ext = Pipeline([
    ('ext', ExtraTreesClassifier())
])

pipe_ext_params = {
    'ext__n_estimators': [100, 150, 200],
    'ext__max_depth': [None, 1, 2, 3, 4],
    'ext__min_samples_split': [2, 3, 4],
    'ext__min_samples_leaf': [1, 2, 3, 4]
}

gs_lr = GridSearchCV(pipe_lr, pipe_lr_params, cv=3, verbose=0, n_jobs=-1)
gs_rf = GridSearchCV(pipe_rf, pipe_rf_params, cv=3, verbose=0, n_jobs=-1)
gs_gbc = GridSearchCV(pipe_gbc, pipe_gbc_params, cv=3, verbose=0, n_jobs=-1)
gs_knn = GridSearchCV(pipe_knn, pipe_knn_params, cv=3, verbose=0, n_jobs=-1)
gs_ext = GridSearchCV(pipe_ext, pipe_ext_params, cv=3, verbose=0, n_jobs=-1)

gs_lr.fit(X_t_train_sc, y_t_train)
gs_rf.fit(X_t_train_sc, y_t_train)
gs_gbc.fit(X_t_train_sc, y_t_train)
gs_knn.fit(X_t_train_sc, y_t_train)
gs_ext.fit(X_t_train_sc, y_t_train)

df = pd.DataFrame({'default.payment.next.month': next_month.index,
                   'values': next_month.values})
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
plt.figure(figsize=(6, 6))
plt.title('Credit card customer defaults\n(default: 1, non-default: 0)')
sns.set_color_codes("pastel")
sns.barplot(x='default.payment.next.month', y="values", data=df)
locs, labels = plt.xticks()
plt.show()

# Feature selection: just drop the ID field and the target field
data.drop(['ID'], inplace=True, axis=1)  # the ID field carries no signal
target = data['default.payment.next.month'].values
columns = data.columns.tolist()
columns.remove('default.payment.next.month')
features = data[columns].values

# 30% as the test set, the rest as the training set
train_x, test_x, train_y, test_y = train_test_split(
    features, target, test_size=0.30, stratify=target, random_state=1)

# classifier
ada = AdaBoostClassifier(random_state=1)
# parameters to tune
parameters = {'n_estimators': [10, 50, 100]}

# tune the parameters with GridSearchCV
clf = GridSearchCV(estimator=ada, param_grid=parameters, scoring='accuracy')
clf.fit(train_x, train_y)
print("GridSearch best parameters:", clf.best_params_)
print("GridSearch best score: %0.4lf" % clf.best_score_)
predict_y = clf.predict(test_x)
print("Accuracy %0.4lf" % accuracy_score(test_y, predict_y))

    ('bow', CountVectorizer(strip_accents='ascii', stop_words='english',
                            lowercase=True)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

# this is where we define the values for GridSearchCV to iterate over
parameters = {
    'bow__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'classifier__alpha': (1e-2, 1e-3),
}

# do 10-fold cross validation for each of the 8 possible combinations of
# the above params (2 x 2 x 2)
grid = GridSearchCV(pipeline, cv=10, param_grid=parameters, verbose=1)
grid.fit(X_train, y_train)

# summarize results
print("\nBest Model: %f using %s" % (grid.best_score_, grid.best_params_))
print('\n')
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("Mean: %f Stdev:(%f) with: %r" % (mean, stdev, param))

X_test = combi['tidy_tweet'][31962:]

# save best model to current working directory
joblib.dump(grid, "twitter_sentiment.pkl")

# load from file and predict using the best configs found in the CV step
model_NB = joblib.load("twitter_sentiment.pkl")

def get_predict_model():
    parent = root + '/type_err_feature/'
    types = os.listdir(parent)
    for t in types:
        print(t)
        if not os.path.exists(parent + t + '/gdbt.model'):  # if True:
            data = pd.read_csv(parent + t + '/err_feature.csv',
                               encoding='utf-8',
                               usecols=['result', 'YEAR_USE', 'dow', 'doy',
                                        'month', 'hour', 'result_before_1',
                                        'result_before_7'])
            n_estimators = range(40, 81, 10)
            min_sample_split = range(20, 81, 5)
            if data.shape[0] > 1000000:
                n_estimators = range(250, 501, 50)
                min_sample_split = range(120, 241, 30)
            elif data.shape[0] > 500000:
                n_estimators = range(100, 351, 50)
                min_sample_split = range(60, 141, 20)
            col = [c for c in data.columns.tolist()
                   if c not in ['TIME', 'PAR_ROOM', 'NE_OBJ_ID', 'result']]
            trainX, testX, train_y, test_y = train_test_split(
                data[col], data['result'], test_size=0.3, random_state=80,
                stratify=data['result'])
            param_grid = {'n_estimators': n_estimators,
                          'min_samples_split': min_sample_split}
            estimator = GradientBoostingClassifier(random_state=80,
                                                   max_depth=5,
                                                   learning_rate=0.005)
            cv = StratifiedShuffleSplit(n_splits=3, test_size=0.3,
                                        random_state=80)
            gbm = GridSearchCV(estimator=estimator, param_grid=param_grid,
                               refit=True, n_jobs=-1,
                               return_train_score=True, scoring='roc_auc',
                               cv=cv)
            # train and report
            if train_y.values.tolist().count(1) >= 3:
                gbm.fit(trainX, train_y)
                # predict
                y_pred = gbm.predict(testX)
                y_predprob = gbm.predict_proba(testX)[:, 1]
                print("roc_auc score for type " + t + ": " + str(
                    metrics.roc_auc_score(test_y, y_predprob,
                                          average='weighted')))
                # write the results to csv
                clf = gbm.best_estimator_
                result = pd.DataFrame()
                result["BEST_PARAMS"] = [str(gbm.best_params_)]
                result["FEATURE_RANK"] = [str(clf.feature_importances_)]
                result["TRAIN_ROC_AUC"] = [str(gbm.best_score_)]
                result["ACCURACY"] = [metrics.accuracy_score(test_y.values,
                                                             y_pred)]
                result["ROC_AUC"] = [metrics.roc_auc_score(
                    test_y, y_predprob, average='weighted')]
                result['y_pred'] = [y_pred.tolist()]
                result['y_true'] = [test_y.tolist()]
                result.to_csv(parent + t + '/gdbt_result.csv', header=True,
                              index=False, encoding='utf-8')
                joblib.dump(clf, parent + t + '/gdbt.model')
            else:
                print(t + ": too few failure samples to train a model")

X, y = load_data("Dataset") F = extract_lbp_features(X) # Apply grid and search for SCV classifier # Create hyperparameter options hyperparams = { 'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf', 'poly'], 'degree': [1, 2, 3, 4] } grid_svc = GridSearchCV(SVC(), hyperparams, cv=5) grid_svc.fit(F, y) print('the best parameters for SVC classifier using GridSearchCV are ' + str(grid_svc.best_params_)) searchcv_svc = RandomizedSearchCV(SVC(), hyperparams, n_iter=20, cv=5) searchcv_svc.fit(F, y) print('the best parameters for SVC classifier using RandomizedSearchCV are ' + str(searchcv_svc.best_params_)) # Apply grid and search for logreg classifier
# Confusion Matrix for model 2
xgb_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n", xgb_cm)
print('-' * 50)

# Classification Report for model 2
xgb_cr = classification_report(y_test, y_pred)
print("Classification Report :\n", xgb_cr)
print('-' * 50)

print("*************Model3 (XGBoost Classifier using grid search)*************")

# Initialize Grid search model
xgb_clf = XGBClassifier(random_state=0)
clf_model = GridSearchCV(estimator=xgb_clf, param_grid=parameters)

# Fit the grid model
clf_model.fit(X_train, y_train)

# Prediction for the grid model
y_pred = clf_model.predict(X_test)

# Accuracy of the grid model
clf_score = accuracy_score(y_test, y_pred)
print("Accuracy score of model 3 :", round(clf_score, 2))
print('-' * 50)

# Confusion Matrix for the grid model
clf_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n", clf_cm)

def best_estimator(classifier, features_train, labels_train, features_test):
    param_mapping = {
        'lor': {
            'reduce_dim__n_components': list(range(1, 10)),
            'lor__C': [0.00000001, 0.00001, 1.0],
            'lor__tol': [1e-3, 1e-1],
            'lor__penalty': ['l1', 'l2'],
            'lor__random_state': [42]
        },
        'svc': {
            'svc__kernel': ['rbf'],
            'svc__C': [10000, 100000, 1000],
            'svc__gamma': [0.001, 0.0001, 'auto'],
            'svc__random_state': [68]},
        'dtc': {
            'dtc__criterion': ['entropy', 'gini'],
            'dtc__min_samples_split': [5, 10, 8, 10, 12],
            'dtc__random_state': [68],
            'dtc__min_samples_leaf': [4, 6, 8, 10, 12],
        },
        'knn': {
            'reduce_dim__n_components': list(range(1, 10)),
            'knn__n_neighbors': [5, 7, 11],
            'knn__algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],
            'knn__leaf_size': [2, 3, 5, 10, 12]}
    }

    steps = {
        'svc': [('scale', StandardScaler()), ('svc', SVC())],
        'dtc': [('scale', StandardScaler()),
                ('dtc', DecisionTreeClassifier())],
        'knn': [('scale', StandardScaler()), ('reduce_dim', PCA()),
                ('knn', KNeighborsClassifier())],
        'lor': [('scale', StandardScaler()), ('reduce_dim', PCA()),
                ('lor', LogisticRegression())]
    }

    pipe = Pipeline(steps[classifier])
    tune_params = param_mapping[classifier]
    sss = StratifiedShuffleSplit(n_splits=50, test_size=0.1, random_state=42)

    grid_search = GridSearchCV(estimator=pipe,
                               param_grid=tune_params,
                               scoring='f1',
                               error_score=0,
                               cv=sss)
    grid_search.fit(features_train, labels_train)
    predictions = grid_search.predict(features_test)
    clf1 = grid_search.best_estimator_
    clf1_parm = grid_search.best_params_
    return clf1, clf1_parm

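# A hedged usage sketch for best_estimator above; the classifier key picks
# one of the four pipelines, and the feature/label arrays are assumptions:
clf, clf_params = best_estimator('svc', features_train, labels_train,
                                 features_test)
print(clf_params)
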
def plot_cross_val_selection():
    iris = load_iris()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        iris.data, iris.target, random_state=0)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_trainval, y_trainval)
    results = pd.DataFrame(grid_search.cv_results_)[15:]
    best = np.argmax(results.mean_test_score.values)

    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(results))
    plt.ylim(0, 1.1)
    for i, (_, row) in enumerate(results.iterrows()):
        # cv_results_ names the per-fold columns 'split<k>_test_score'
        scores = row[['split%d_test_score' % s for s in range(5)]]
        marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5,
                              alpha=.5)
        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none',
                                alpha=1, markersize=10, markeredgecolor='k')
        if i == best:
            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                    fillstyle="none", alpha=1, markersize=20,
                                    markeredgewidth=3)
    plt.xticks(range(len(results)),
               [str(x).strip("{}").replace("'", "")
                for x in grid_search.cv_results_['params']],
               rotation=90)
    plt.ylabel("Validation accuracy")
    plt.xlabel("Parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
               ["cv accuracy", "mean accuracy", "best parameter setting"],
               loc=(1.05, .4))

def main():
    # 1. Inspect the training and test data
    train_data = pandas.read_csv('data/train.csv')
    test_data = pandas.read_csv('data/test.csv')
    print(train_data.info())
    print(test_data.info())

    # 2. Hand-pick features that are likely to be predictive
    selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp',
                         'Parch', 'Fare']
    x_train = train_data[selected_features]
    x_test = test_data[selected_features]
    y_train = train_data['Survived']

    # 3. Fill in missing values
    # The Embarked feature has missing values that need to be completed
    print(x_train['Embarked'].value_counts())
    print(x_test['Embarked'].value_counts())
    # For categorical features, filling with the most frequent value is one
    # way to limit the error introduced
    x_train['Embarked'].fillna('S', inplace=True)
    x_test['Embarked'].fillna('S', inplace=True)
    x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
    x_test['Age'].fillna(x_test['Age'].mean(), inplace=True)
    x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)
    print(x_train.info())
    print(x_test.info())

    # 4. Vectorize the features with DictVectorizer
    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    print(dict_vectorizer.feature_names_)
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    # 5. Train the models
    forest_classifier = RandomForestClassifier()
    xgb_classifier = XGBClassifier()

    # Evaluate performance with 5-fold cross-validation
    forest_mean_score = cross_val_score(forest_classifier, x_train, y_train,
                                        cv=5).mean()
    print(forest_mean_score)
    xgb_mean_score = cross_val_score(xgb_classifier, x_train, y_train,
                                     cv=5).mean()
    print(xgb_mean_score)

    # 6. Use a parallel grid search to find a better hyperparameter
    # combination
    params = {
        'max_depth': range(2, 8),
        'n_estimators': range(100, 1200, 200),
        'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
    }
    xgbc_best = XGBClassifier()
    grid_search_cv = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5)
    grid_search_cv.fit(x_train, y_train)
    print(grid_search_cv.best_score_)
    print(grid_search_cv.best_params_)

    # 7. Predict and write the results to a submission file
    predict_result = grid_search_cv.predict(x_test)
    submission_data = pandas.DataFrame({
        'PassengerId': test_data['PassengerId'],
        'Survived': predict_result
    })
    submission_data.to_csv('data/submission/titanic_submission.csv',
                           index=False)