def fine_tune_gradient_boosting_hyper_params(data_train_x, data_test_x,
                                             data_train_y, data_test_y):
    from time import time
    from scipy.stats import randint as sp_randint
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.grid_search import RandomizedSearchCV

    print "-- {} --".format("Fine-tuning Gradient Boosting Regression")
    rf = GradientBoostingRegressor(
        n_estimators=1000
    )
    param_dist = {
        "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
        "max_depth": sp_randint(1, 15),
        "min_samples_split": sp_randint(1, 15),
        "min_samples_leaf": sp_randint(1, 15),
        "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "max_features": sp_randint(1, 15)
    }
    n_iter_search = 300
    random_search = RandomizedSearchCV(
        rf,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        n_jobs=-1,
        cv=5,
        verbose=1
    )
    start = time()
    random_search.fit(data_train_x, data_train_y)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
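# The snippet above also depends on a `report` helper that isn't defined
# here. A minimal sketch, assuming the legacy grid_scores_ named-tuple API
# (parameters, mean_validation_score, cv_validation_scores) from
# sklearn.grid_search (scikit-learn < 0.18); a fuller version of the same
# idea appears later in this collection.
from operator import itemgetter
import numpy as np

def report(grid_scores, n_top=3):
    # Sort candidates by mean validation score, best first.
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for rank, score in enumerate(top_scores, start=1):
        print("Rank {0}: mean {1:.3f} (std {2:.3f}), params {3}".format(
            rank, score.mean_validation_score,
            np.std(score.cv_validation_scores), score.parameters))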
def Gradient(self):
    X_train, y_train = self.X_train, self.y_train
    parameters_boost = {'max_depth': randint(3, self.max_depth_max + 1),
                        'n_estimators': randint(80, 100 + self.n_estimators_max)}
    boost_reg = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                   param_distributions=parameters_boost,
                                   cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
    boost_reg.fit(X_train, y_train)
    self.boost_reg = boost_reg.best_estimator_
def test_sklearn_cv():
    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters
    randint = stats.randint(low=1, high=65)
    randint.random_state = 42
    gamma = stats.gamma(a=1.2, loc=0, scale=0.13)
    gamma.random_state = 42
    distr = {'no_components': randint, 'learning_rate': gamma}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model, param_distributions=distr,
                                n_iter=10, scoring=scorer,
                                random_state=42, cv=cv)
    search.fit(train)
    assert search.best_params_['no_components'] == 52
def run(src_dir, mod, random_state=1234):
    if isinstance(src_dir, str):
        mat, labels_arr = load_mat_and_labels(src_dir, mod)
    else:
        mat, labels_arr = (src_dir, mod)

    masker = SimpleMaskerPipeline(threshold=.2)
    svc = SVC(kernel='linear')
    pipeline = Pipeline([('masker', masker),
                         ('anova', SelectKBest(k=500)),
                         ('svc', svc)])

    c_range = gamma.rvs(size=100, a=1.99, random_state=random_state)
    param_dist = {"svc__C": c_range}

    n_iter = 100
    cv = StratifiedShuffleSplit(labels_arr, n_iter=n_iter, test_size=1/6.0,
                                random_state=random_state)
    total_runs = n_iter
    scorer = verbose_scorer(total_runs)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                cv=cv, scoring=scorer,
                                random_state=random_state)
    search.fit(mat, labels_arr)

    return search
def main():
    data = pd.read_csv(args.dataset)
    X = data.drop(['Id', 'Class'], axis=1)
    Y = data.loc[:, 'Class']
    X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                        test_size=0.33,
                                                        random_state=42)
    estimator = [('reduce_dim', SelectFromModel(RandomForestClassifier())),
                 ('classifier', XGBClassifier())]

    # express the threshold as a multiple of the median feature importance
    tmp = map(str, np.arange(args.threshold[0], args.threshold[1],
                             args.threshold[2]))
    threshold = map(lambda x: x + '*median', tmp)

    clf = Pipeline(estimator)
    params = {}
    params['reduce_dim__estimator__n_estimators'] = list(
        np.arange(args.components[0], args.components[1], args.components[2]))
    params['reduce_dim__threshold'] = threshold
    params['classifier__n_estimators'] = list(
        np.arange(args.num_tree[0], args.num_tree[1], args.num_tree[2]))
    params['classifier__max_depth'] = list(
        np.arange(args.depths[0], args.depths[1], args.depths[2]))
    params['classifier__learning_rate'] = list(
        np.arange(args.lr[0], args.lr[1], args.lr[2]))
    params['classifier__subsample'] = list(
        np.arange(args.subsample[0], args.subsample[1], args.subsample[2]))
    params['classifier__colsample_bytree'] = list(
        np.arange(args.colsample[0], args.colsample[1], args.colsample[2]))

    # Cross-validated parameter search; fall back to an exhaustive grid
    # search if the randomized search fails.
    try:
        grid_search = RandomizedSearchCV(clf, param_distributions=params,
                                         n_iter=args.iter, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    except Exception:
        grid_search = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

    best_parameters, score, _ = max(grid_search.grid_scores_,
                                    key=lambda x: x[1])
    result = accuracy_score(y_test, grid_search.predict(X_test))
    print("Predict Accuracy: " + str(result))
    print("XGboost using raw pixel features:\n%s\n" %
          (metrics.classification_report(y_test,
                                         grid_search.predict(X_test))))
    print best_parameters
def fitAlgo(clf, Xtrain, Ytrain, opt=False, param_dict=None,
            opt_metric='roc_auc', n_iter=5):
    '''Return the fitted classifier

    Keyword arguments:
    clf - - base classifier
    Xtrain - - training feature matrix
    Ytrain - - training target array
    param_dict - - the parameter distribution / grid space;
                   if opt == False, every element should have length 1
    opt_metric - - optimization metric
    opt - - whether to do optimization or not
    '''
    if opt and (param_dict is not None):
        assert all(isinstance(param_dict[x], list) for x in param_dict)
        rs = RandomizedSearchCV(estimator=clf, n_iter=n_iter,
                                param_distributions=param_dict,
                                scoring=opt_metric,
                                refit=True,
                                n_jobs=-1, cv=3, verbose=3)
        rs.fit(Xtrain, Ytrain)
        imp = []
        if clf.__class__.__name__ == "RandomForestClassifier":
            imp = rs.best_estimator_.feature_importances_
        return rs.best_estimator_, rs.grid_scores_, imp
    else:
        if param_dict is not None:
            assert all(not isinstance(param_dict[x], list)
                       for x in param_dict)
            for k in param_dict.keys():
                # set_params(k=...) would set a parameter literally named
                # 'k'; unpack the parameter name instead.
                clf.set_params(**{k: param_dict[k]})
        clf.fit(Xtrain, Ytrain)
        return clf, [], []
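# A hypothetical usage sketch for fitAlgo; the names Xtrain/Ytrain and the
# lists below are illustrative, not from the original code (opt=True
# requires every param_dict value to be a list).
from sklearn.ensemble import RandomForestClassifier

param_dict = {"max_depth": [3, 5, None],
              "min_samples_leaf": [1, 3, 5]}
best_clf, grid_scores, importances = fitAlgo(
    RandomForestClassifier(), Xtrain, Ytrain, opt=True,
    param_dict=param_dict, opt_metric='roc_auc', n_iter=5)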
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kinds of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100,
                               n_informative=3, random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=distributions.expon(scale=10),
                  gamma=distributions.expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation_score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                                     key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(
        search.best_params_ in tied_best_params,
        "best_params_={0} is not part of the"
        " tied best models: {1}".format(search.best_params_,
                                        tied_best_params),
    )
def makeRandomCV(dataset, dbtype='CATH', level=1, k_iters=10, minsamples=500,
                 clf=ExtraTreesClassifier(n_estimators=5,
                                          class_weight='auto')):
    from scipy.stats import randint as sp_randint

    dataDict = dbParser(dataset, level=level, dbtype=dbtype,
                        minsamples=minsamples)
    print dataDict
    labels = dataDict['target_names']

    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    n_iter_search = k_iters
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    random_search.fit(dataDict['vectors'], dataDict['target_names'])
    report(random_search.grid_scores_)
def find_best_parameters_and_get_fitted_model(self, **kwargs):
    """
    Finds the best set of hyperparameters for a Random Forest for the
    provided data. The best hyperparameters are found by repeatedly drawing
    random samples from a distribution of parameters and evaluating them by
    using cross validation.
    """
    # load data
    data = kwargs['data']
    X = data['features']
    y = data['targets']
    out_args = {}

    # we choose Random Forest Classifier as the Machine Learning algorithm
    # for this DPModel.
    rc = RandomForestClassifier()

    # here we define the space of parameters over which we want to perform
    # the random search
    param_distributions = {}
    param_distributions["n_estimators"] = [50, 100, 150]

    # do random search
    random_search_outer = RandomizedSearchCV(
        rc, param_distributions=param_distributions, cv=5, n_iter=3)
    random_search_outer.fit(X, y)

    predictor = random_search_outer.best_estimator_
    return predictor, out_args
def _compute_thresh(this_data, ch_type, cv=10):
    """Compute the rejection threshold for one channel.

    Parameters
    ----------
    this_data: array (n_epochs, n_times)
        Data for one channel.
    ch_type: str
        'mag', 'grad' or 'eeg'.
    cv : iterator
        Iterator for cross-validation.
    """
    est = ChannelAutoReject()

    Limits = namedtuple('Limits', 'low high')
    limits = dict(eeg=Limits(low=20e-7, high=400e-6),
                  grad=Limits(low=400e-13, high=20000e-13),
                  mag=Limits(low=400e-15, high=20000e-15))

    param_dist = dict(thresh=uniform(limits[ch_type].low,
                                     limits[ch_type].high))
    rs = RandomizedSearchCV(est,  # XXX : is random really better than grid?
                            param_distributions=param_dist,
                            n_iter=20, cv=cv)
    rs.fit(this_data)
    best_thresh = rs.best_estimator_.thresh

    return best_thresh
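# Side note (my observation, not from the original code): scipy.stats.uniform
# takes (loc, scale) rather than (low, high), so uniform(low, high) above
# draws thresholds from [low, low + high]. A quick check of that behaviour:
from scipy.stats import uniform
samples = uniform(20e-7, 400e-6).rvs(size=1000, random_state=0)
assert samples.min() >= 20e-7 and samples.max() <= 20e-7 + 400e-6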
def train_cv():
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ---------------------- train
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5, 1)
    n_estimate_dist = sp_randint(1000, 4800)
    max_depth_dist = sp_randint(1, 10)
    param_dist = dict(loss=loss,
                      learning_rate=learning_rate,
                      n_estimators=n_estimate_dist,
                      max_depth=max_depth_dist)

    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt,
                                  param_distributions=param_dist,
                                  n_iter=210, verbose=1, n_jobs=-1)

    print "--------------------- RandomizedSearchCV begins"
    searchcv.fit(Xtrain, ytrain)
    print "--------------------- RandomizedSearchCV ends"
    print "best score: ", searchcv.best_score_
    print "best parameters: ", searchcv.best_params_

    common.dump_predictor('gbdt-cv.pkl', searchcv.best_estimator_)
    print "--------------------- GBDT saved into file"
def searchBestModelParameters(algorithm, trainingData):
    if algorithm == 'rf':
        # model the data using a random forest
        numTrees = range(10, 100, 10)
        numMinLeafSamples = range(2, 20, 2)
        numMinSamplesSplit = range(1, 20, 3)
        paramDistribution = dict(n_estimators=numTrees,
                                 min_samples_leaf=numMinLeafSamples,
                                 min_samples_split=numMinSamplesSplit)
        model = RandomForestClassifier()
    elif algorithm == 'knn':
        # model the data using knn
        # define the parameter values that should be searched
        k_range = range(1, 50)
        weight_options = ['uniform', 'distance']
        # specify "parameter distributions" rather than a "parameter grid"
        paramDistribution = dict(n_neighbors=k_range, weights=weight_options)
        model = KNeighborsClassifier()
    elif algorithm == 'logr':
        # model the data using logistic regression
        model = LogisticRegression()
        get_ipython().magic("time print(np.sqrt(-cross_val_score(model, trainingData, trainingData['isSpam'], cv=10, scoring='mean_squared_error')).mean())")
        return

    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10,
                                  scoring='accuracy', n_iter=10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({'score': round(rand.best_score_, 3),
                        'params': rand.best_params_})
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])
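# Hypothetical call, assuming trainingData is a DataFrame with an 'isSpam'
# column, as the function body expects.
best = searchBestModelParameters('rf', trainingData)
print('best accuracy: {}, params: {}'.format(best['score'], best['params']))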
def run(full, target_col, random_state=1234, c_range_alpha=.05,
        c_range_size=100, normalize=False, score_fn=r2_score):
    svr = linearSVRPermuteCoefFactory()
    pipeline_steps = [('svr', svr)]
    pipeline = Pipeline(pipeline_steps)

    c_range = gamma.rvs(size=c_range_size, a=c_range_alpha,
                        random_state=random_state)
    param_dist = {"svr__C": c_range}

    data, target = separate(full, target_col)
    if normalize:
        data = scale(data)

    n_iter = 100
    cv = ShuffleSplit(len(target), n_iter=n_iter, test_size=1/6.0,
                      random_state=random_state)
    total_runs = n_iter
    scorer = verbose_scorer(total_runs, score_fn)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                cv=cv, scoring=scorer,
                                random_state=random_state)
    search.fit(data, target)

    return search
def run_randomsearch(X, y, clf, param_dist, cv=5, n_iter_search=20):
    """Run a random search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_dist -- [dict] distributions of parameters to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try, default 20.

    Returns
    -------
    top_params -- [dict] from report()
    """
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       cv=cv,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidate parameter "
           "settings.").format((time() - start), n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return top_params
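# Hypothetical usage sketch, assuming X, y and the report() helper are in
# scope; the distributions are illustrative.
from scipy.stats import randint as sp_randint
from sklearn.tree import DecisionTreeClassifier

param_dist = {"max_depth": [3, None],
              "min_samples_leaf": sp_randint(1, 11),
              "criterion": ["gini", "entropy"]}
top_params = run_randomsearch(X, y, DecisionTreeClassifier(), param_dist,
                              cv=5, n_iter_search=20)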
def tuneSGD(data, labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.linear_model import SGDClassifier

    sss = StratifiedShuffleSplit(labels, n_iter=10, test_size=.2,
                                 random_state=42)
    clf = Pipeline([#('num_features', SelectPercentile(f_classif, percentile=5)),
                    ('sgd', SGDClassifier(random_state=11,
                                          penalty='elasticnet',
                                          n_jobs=1, alpha=10**-4))])
    param_grid = {
        #'num_features__percentile': list(range(1, 101)),
        'sgd__loss': ['modified_huber', 'squared_hinge'],  # ,'hinge','log'
        'sgd__class_weight': ['balanced', None],
        'sgd__l1_ratio': list(np.arange(0, 1.0, .01)),
        'sgd__alpha': list(10.**np.arange(-6, -3, .1))
    }
    grid_search = RandomizedSearchCV(clf, param_grid, n_iter=250,
                                     random_state=42, cv=sss,
                                     scoring='roc_auc',  # roc_score
                                     n_jobs=-2, pre_dispatch='2*n_jobs')
    grid_search.fit(data, labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print(p, best_parameters[p])
    return grid_search
    # plot_cs(grid_search)  # unreachable after return
def RandomFo(self):
    parameters_forest = {'n_estimators': randint(10, self.n_estimators_max),
                         "bootstrap": [True, False]}
    X_train, y_train = self.X_train, self.y_train
    forest_reg = RandomizedSearchCV(RandomForestRegressor(),
                                    param_distributions=parameters_forest,
                                    cv=self.cv, n_iter=self.n_iter,
                                    n_jobs=-1)
    forest_reg.fit(X_train, y_train)
    self.forest_reg = forest_reg.best_estimator_
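# The Gradient and RandomFo methods above appear to belong to the same tuner
# class. A minimal sketch of that class: attribute names are inferred from
# the method bodies, and the defaults are assumptions.
class RegressorTuner(object):
    def __init__(self, X_train, y_train, loss='ls', cv=5, n_iter=10,
                 max_depth_max=10, n_estimators_max=200):
        self.X_train, self.y_train = X_train, y_train
        self.loss = loss                          # loss for GradientBoostingRegressor
        self.cv = cv                              # CV folds for each search
        self.n_iter = n_iter                      # sampled candidates per search
        self.max_depth_max = max_depth_max        # upper bound for max_depth
        self.n_estimators_max = n_estimators_max  # upper bound for n_estimators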
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
def pick_best_features(df):
    """
    Grid search to find best features. TODO refactor
    :param train: train data
    :param test: test data
    :return:
    """
    # X = sample_data_random(df, .25)
    X = df[0:int(df.shape[0] * .25)]
    overfit_models = dict()
    for out in outputs:
        print out
        pipe_clf = CustomPipeline.get_transforms()
        clf = SGDClassifier(loss='log')
        tuned_parameters = {'alpha': sp_rand()}
        score = 'log_loss'
        tran_x = pipe_clf.fit_transform(X)
        grid = RandomizedSearchCV(clf, tuned_parameters, cv=5, scoring=score)
        grid.fit(tran_x, X[out])
        print grid.best_estimator_
        overfit_models[out] = grid.best_estimator_
    return overfit_models
def optimized_classifier(X, y, classifier, distributions,
                         scorer='f1_weighted', n_iter=30, cv=3):
    """
    Return best classifier and scores for X, y from a randomized search
    over parameters

    X -- Features for each sample
    y -- Class label for each sample
    classifier -- An estimator class or pipeline from sklearn
    distributions -- The parameter distributions to search for that estimator
    scorer -- Scoring function (e.g. accuracy or f1)
    n_iter -- The number of random iterations to try
    """
    # Make a pipeline out of the classifier, to allow for feature scaling
    # in the first step. Prefix the parameter names so they address the
    # right step in the pipeline.
    class_name = classifier.__class__.__name__.lower()
    distributions = dict((class_name + "__" + key, val)
                         for key, val in distributions.iteritems())

    # It is important to handle scaling here so we don't accidentally
    # overfit to the test data by scaling using that information as well.
    classifier = make_pipeline(preprocessing.RobustScaler(), classifier)

    randomized_search = RandomizedSearchCV(
        classifier, param_distributions=distributions, n_iter=n_iter,
        scoring=scorer, cv=cv, n_jobs=1)
    randomized_search.fit(X, y)

    print randomized_search.best_estimator_
    print "Validation Score ({}): {:.2f}".format(
        scorer, randomized_search.best_score_)
    print ""

    return randomized_search.best_estimator_, randomized_search.best_score_
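# Hypothetical usage: search C and gamma for an RBF SVC. The pipeline step
# name is derived from the class name ('svc' here), so the keys need no
# prefix; X and y are assumed to be in scope.
from scipy.stats import expon
from sklearn.svm import SVC

distributions = {"C": expon(scale=10), "gamma": expon(scale=0.1)}
best_clf, best_score = optimized_classifier(X, y, SVC(), distributions,
                                            scorer='f1_weighted', n_iter=30)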
def main():
    NUM_TRAIN = bw_componentrecognition.NUM_TRAIN
    N_BINS = 23
    N_HU_MOMENTS = 7
    N_FEATURES = N_BINS + N_HU_MOMENTS

    X, y = bw_componentrecognition.Data.loadTrain(NUM_TRAIN, N_BINS)

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    clfs = [
        RandomForestClassifier(n_estimators=20),
    ]
    param_dists = [
        {"max_depth": [10, 5, 3, None],
         "max_features": sp_randint(1, 11),
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False],
         "criterion": ["gini", "entropy"]},
    ]

    for clf, param_dist in zip(clfs, param_dists):
        # run randomized search
        n_iter_search = 25
        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=n_iter_search)
        random_search.fit(X, y)
        report(random_search.grid_scores_)
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(
        model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(
        model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(
        model.getTunedParamterOptions()))

    features, labels, cv = model.getFeaturesLabel()
    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(
            model.clf, model.getTunedParamterOptions(), cv=cv,
            n_jobs=self.n_jobs,
            scoring=mean_absolute_percentage_error_scoring, verbose=500,
            n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(
            model.clf, model.getTunedParamterOptions(), cv=cv,
            n_jobs=self.n_jobs,
            fit_params=model.get_fit_params(),
            scoring=mean_absolute_percentage_error_scoring, verbose=500)

    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()
    # model.dispFeatureImportance()

    logging.debug('estimator parameters: {}'.format(estimator.get_params))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(
            np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(
            np.absolute(i.cv_validation_scores)))
    return
def tune(data, labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit

    sss = StratifiedShuffleSplit(labels, n_iter=10, test_size=.1,
                                 random_state=42)
    clf = Pipeline([('num_features', SelectKBest(f_classif, k=100)),
                    ('svm', svm.SVC(C=.01, kernel='linear',
                                    probability=True, random_state=11))])
    param_grid = {
        'num_features__k': range(250, 2500, 250),
        'svm__C': 10.**np.arange(-3, 4),
        #'svm__loss': ['hinge', 'squared_hinge'],
        'svm__class_weight': ['balanced', None]
    }
    grid_search = RandomizedSearchCV(clf, param_grid, n_iter=100, cv=sss,
                                     scoring='f1', n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     random_state=42)
    grid_search.fit(data, labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print(p, best_parameters[p])
    # plot_cs(grid_search)
    return grid_search
def doRandomSearch(self, clfName, clf, param_dist, X, Y):
    if self._custRandomSearchFlag == True:
        return self.doCustRandomSearch(clfName, clf, param_dist, X, Y)
    else:
        start = time.time()
        multiCores = -1
        if clfName == "Logistic_Regression":
            multiCores = 1
        if self._setXgboostTheradToOne == True and clfName == "Xgboost":
            multiCores = 1

        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=self._n_iter_search,
                                           n_jobs=multiCores,
                                           scoring='log_loss', verbose=10)
        random_search.fit(X, Y)
        log(clfName + " randomized search cost: ",
            time.time() - start, " sec")

        self._bestClf[clfName] = random_search.best_estimator_
        #self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName], X, Y)
        self._bestLoglossDict[clfName] = self.validation(
            self._bestClf[clfName], X, Y, test_size=0.3)
        log("customize logloss: ", self._bestLoglossDict[clfName])
        self.report(random_search.grid_scores_, clfName)

        dumpModel(random_search.best_estimator_, clfName, self._expInfo,
                  self._subFolderName)
        self._lastRandomSearchBestParam = random_search.best_params_

        return random_search.best_estimator_
def buildRandomForest(self, X_train, X_test, y_train, cv=3, n_iter=5,
                      save=False):
    rf = RandomForestClassifier(random_state=9)

    # Tune the model
    param_distributions = {
        'n_estimators': range(1, 50, 1),
        'max_depth': range(1, 70, 1),
        'max_features': range(6, 15, 1),
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3, 4],
        'n_jobs': [-1]
    }
    rf_optimized = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='f1',
        cv=cv,
        random_state=1
    )
    rf_optimized.fit(X_train, y_train)

    if save == True:
        joblib.dump(value=rf_optimized, filename="rf_optimized.pkl",
                    compress=1)

    print "Best parameter: %s" % rf_optimized.best_params_
    print "Best average cross validated F1 score: %0.4f" % rf_optimized.best_score_
    print "--------------------------------------------"

    # predictions
    predicted_y_train = rf_optimized.predict(X_train)
    predicted_y_test = rf_optimized.predict(X_test)

    return predicted_y_train, predicted_y_test
def fit_estimator(estimator, positive_data_matrix=None,
                  negative_data_matrix=None, target=None, cv=10, n_jobs=-1,
                  n_iter_search=40, random_state=1):
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=cv,
                                       scoring=scoring, n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross
    # validation
    for scoring in ['accuracy', 'precision', 'recall', 'f1',
                    'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(
            random_search.best_estimator_, X, y, cv=cv, scoring=scoring,
            n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores),
                                             np.std(scores)))

    return random_search.best_estimator_
def search_classifier(n_iter):
    assignments = load_structure()['ASS_ASSIGNMENT']
    features = load_featurized_training_set("files/train_featurized.pkl")
    # print(len(features.columns))

    X = features.drop(['DATE', 'n_calls'], axis=1).as_matrix().astype(float)
    y = (features.n_calls > 0).astype(int).as_matrix()
    calls = features.n_calls.as_matrix()

    X = StandardScaler().fit_transform(X)

    pipe = Pipeline([
        # ('scaler', StandardScaler()),
        # ('pca', RandomizedPCA()),
        ('clf', SGDClassifier())
    ])

    params = {
        # 'pca__n_components': [30, 50, 70, 86],
        'clf__class_weight': ['balanced'],
        'clf__loss': ['hinge'],
        'clf__penalty': ['l1'],
        'clf__alpha': st.uniform(0, 0.0003),
        'clf__fit_intercept': [False]
        # 'clf__alpha': [0.0001]
    }

    kf = KFold(len(X), n_folds=3, shuffle=True)
    grid_search = RandomizedSearchCV(pipe, params, scoring='accuracy',
                                     cv=kf, verbose=1000, n_iter=n_iter)
    grid_search.fit(X, y)

    print("\n")
    print(grid_search.best_params_)
    print(grid_search.best_score_)

    joblib.dump(grid_search.best_estimator_, "files/best_classifier.pkl")
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator
    X, y = make_multilabel_classification(random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters, cv=cv,
                                           n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])
def rf_cv(fv_train, target_train, fv_test, target_test):
    # ---- cross validation on the train dataset: randomized search for the
    # ---- best random forest parameters
    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto", "log2", None],
                        "class_weight": [None, 'balanced']}
    scores = ['recall_macro']
    n_iter_search = 20

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        mycv = StratifiedKFold(target_train, n_folds=5)
        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1),
                                 tuned_parameters, cv=mycv,
                                 n_iter=n_iter_search,
                                 scoring='%s' % score)
        clf.fit(fv_train, target_train)
        report_cv(clf, fv_test, target_test)
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)

    rf = RandomForestClassifier(n_jobs=8)
    param_dist = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        #"max_depth": sp_randint(3, 10000),
        #"min_samples_split": sp_randint(1, 300),
        #"min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                             n_iter=50, cv=10, scoring='roc_auc')
    clf.fit(train_x, train_y)

    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions = clf.predict_proba(test_x)[:, 1]

    loss = roc_auc_score(valid_y, valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)

    data.saveData(valid_id, valid_predictions,
                  "./valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_id, test_predictions,
                  "./results/results_" + str(model_id) + ".csv")
def randomized_search_rfc(txt_lst, y):
    # build a classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english', analyzer=analyzer,
                                 ngram_range=(1, 3))),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=100))
    ])

    # specify parameters and distributions to sample from
    param_dist = {
        'vect__ngram_range': [None, (1, 2), (1, 3), (1, 4)],
        "clf__max_depth": map(lambda x: int(x),
                              np.logspace(1, 4, 10)),  # sp.stats.randint(10, 1000)
        "clf__max_features": map(lambda x: int(x), np.logspace(0, 3, 10)),
        "clf__min_samples_split": sp.stats.randint(1, 11),
        "clf__min_samples_leaf": sp.stats.randint(2, 11),
        "clf__criterion": ["gini", "entropy"]
    }

    n_iter_search = 50
    grid_search = RandomizedSearchCV(pipeline,
                                     param_distributions=param_dist,
                                     verbose=1, n_iter=n_iter_search,
                                     cv=5, n_jobs=1, scoring='accuracy')

    start = time.time()
    grid_search.fit(txt_lst, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time.time() - start), n_iter_search))
    report(grid_search.grid_scores_, n_top=5)

    return grid_search
    'median_width': expon(scale=1, loc=median_w),
    'kernel_size': [2, 3, 4, 5, 6, 7, 8]
}

param_grid = []
for i in xrange(N):
    param_grid.append(params)

i = 0
for params in param_grid:
    mkl = mkl_regressor()
    rs = RS(mkl, param_distributions=params, n_iter=20, n_jobs=24, cv=k,
            scoring="mean_squared_error")  # "r2"
    rs.fit(data, labels)
    rs.best_estimator_.save('/almac/ignacio/data/mkl_models/mkl_%d.model' % i)

    if args.estimate:  # If user wants to save estimates
        test_predict(data=data, machine=rs.best_estimator_, labels=labels,
                     out_file=out_file)
    if args.predict:  # If user wants to predict and save just after training.
        assert not args.X is None  # If test data is provided
        # preds = rs.best_estimator_.predict(data_t)
        if args.Y:  # Get performance if test labels are provided
            test_predict(data=data_t, machine=rs.best_estimator_,
                         labels=labels_t,
param_distribs = {
    'n_estimators': randint(low=1, high=400),
    'learning_rate': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
    'max_features': randint(10, 115),
    'max_depth': randint(low=1, high=4)
}

boost_reg = GradientBoostingRegressor(random_state=42)
rnd_search = RandomizedSearchCV(boost_reg,
                                param_distributions=param_distribs,
                                n_iter=10, cv=10,
                                scoring='mean_squared_error',
                                random_state=42)
rnd_search.fit(train_features, train_labels)

print('Best Params' + str(rnd_search.best_params_))
print('Best Estimator' + str(rnd_search.best_estimator_))

feature_importances = rnd_search.best_estimator_.feature_importances_
#print('Feature importance')
#sorted(zip(feature_importances, attributes), reverse=True)

# In[21]:

final_model = rnd_search.best_estimator_
print('Best Score ' + str(np.sqrt(-rnd_search.best_score_)))

final_predictions = final_model.predict(test_features)
final_mse = mean_squared_error(test_labels, final_predictions)
final_rmse = np.sqrt(final_mse)
    'algorithm__min_samples_split': [3, 4, 5, 7, 9, 10, 15],
    'algorithm__min_samples_leaf': [2, 3, 5, 7, 10],
    'algorithm__max_leaf_nodes': [2, 4, 6, 8, 10],
    'algorithm__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'algorithm__criterion': ["gini", "entropy"],
}

scaler = MinMaxScaler()
algo = DecisionTreeClassifier()
#skb = SelectKBest(k='all')
pipeline = Pipeline(steps=[('scaler', scaler),
                           ("features", combined_features),
                           ('algorithm', algo)])

cv = StratifiedShuffleSplit(labels, 5, test_size=0.3, random_state=42)
gs = RandomizedSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
gs.fit(features, labels)
print "Best estimator", gs.best_estimator_

clf = gs.best_estimator_

# fit the optimal model
clf.fit(features_train, labels_train)

# predict based on the optimal model
pred = clf.predict(features_test)
#print "predicting time:", round(time()-t1, 3), "s"

accuracy = accuracy_score(pred, labels_test)
print accuracy
print classification_report(labels_test, pred)

dump_classifier_and_data(clf, my_dataset, features_list)
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


param_dist = {
    "n_neighbors": randint(2, 400),
    "weights": ["uniform", "distance"]
}

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(neigh, param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   scoring='roc_auc')  # or neigh

start = time()
random_search.fit(x_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)
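# Side note (a version assumption, not from the original code): in
# scikit-learn >= 0.18, grid_scores_ was replaced by cv_results_, a dict of
# arrays. A sketch of the equivalent report over the newer structure:
import numpy as np

def report_cv_results(cv_results, n_top=3):
    for rank in range(1, n_top + 1):
        # rank_test_score may contain ties, so iterate over all matches.
        for idx in np.flatnonzero(cv_results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                cv_results['mean_test_score'][idx],
                cv_results['std_test_score'][idx]))
            print("Parameters: {0}".format(cv_results['params'][idx]))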
paramDist = {'n_estimators': scipy.stats.randint(20, 50),
             'learning_rate': [0.1],
             'max_features': ['auto'],
             'max_depth': scipy.stats.expon(scale=7),
             # 'min_samples_split': scipy.stats.expon(scale=2),
             'min_samples_leaf': [1]}

Rforest = RandomForestRegressor()
GBM = GradientBoostingRegressor()

# Note: the second assignment overwrites the first, so only the GBM search
# is actually run.
grid_search = RandomizedSearchCV(Rforest, cv=3,
                                 param_distributions=paramDist,
                                 n_iter=40, n_jobs=4,
                                 scoring='mean_squared_error')
grid_search = RandomizedSearchCV(GBM, cv=5,
                                 param_distributions=paramDist,
                                 n_iter=12, n_jobs=4,
                                 scoring='mean_squared_error')
grid_search.fit(X, Y)
scoresGrid = grid_search.grid_scores_

print grid_search.best_score_
print grid_search.best_estimator_
report(grid_search.grid_scores_)

cols = np.array(mat.drop(colRemoved, axis=1).columns)
importance = grid_search.best_estimator_.feature_importances_
featImport = pd.concat((pd.DataFrame(cols), pd.DataFrame(importance)),
                       axis=1)
featImport.columns = ['f', 'v']
featImport.sort('v', ascending=False, inplace=True)
featImport.set_index('f', inplace=True)
featImport.plot(kind='bar')
plt.subplots_adjust(bottom=0.3)
            35, 40, 50, 75, 100, 125, 150, 200,
        ]
    },
    verbose=1, n_jobs=2, cv=4, scoring='roc_auc', n_iter=1000)

clf.fit(train_data, outcome_train)
print('best clf score', clf.best_score_)
print('best params:', clf.best_params_)

bst = xgb.train(plst, dtrain, num_round, watchlist)

# this is prediction
preds = bst.predict(dcv)
pred_test = bst.predict(dtest)
labels = dcv.get_label()
print('error=%f' % (sum(1 for i in range(len(preds))
                        if int(preds[i] > 0.5) != labels[i]) /
                    float(len(preds))))

print('{0:<25} {1:>5}'.format('Feature', 'Importance'))
print("--------------------------------------")
for i in range(len(df_train_df.columns.values)):
    key = 'f' + str(i)
X_train, X_test, y_train, y_test = train_test_split(transformed_data, y,
                                                    test_size=0.1)

pipeline = Pipeline([('rf', RandomForestClassifier())])

param_grid = {
    'rf__max_depth': list(range(9, 20)),
    'rf__n_estimators': list(range(45, 70, 5)),
    'rf__criterion': ["gini", "entropy"],
    "rf__max_features": ["auto", None]
}

# searcher = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy')
n_iter_search = 20
searcher = RandomizedSearchCV(estimator=pipeline,
                              param_distributions=param_grid,
                              n_iter=n_iter_search)
searcher.fit(X_train, y_train)

print("Best hyper parameters")
print(searcher.best_params_)
# {'rf__max_depth': 19, 'rf__n_estimators': 55, 'rf__criterion': 'entropy', 'rf__max_features': None}

clf = searcher.best_estimator_
clf.fit(X_train, y_train)

print("Train accuracy: %.3f" % clf.score(X_train, y_train))
print("Test accuracy: %.3f" % clf.score(X_test, y_test))
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3))

from collections import Counter
"max_depth": [3, None], "max_features": sp_randint(1, 11), "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"] } # run randomized search n_iter_search = 20 random_search = RandomizedSearchCV(clf, param_distributions=param_dister, n_iter=n_iter_search, n_jobs=2) start = time() random_search.fit(X, y) print("RandomizedSearchCV took %.2f s for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) # Load the testing data test_mat = genfromtxt(TRAINING_INPUT_DIRECTORY + '/testing_matrix.csv', delimiter=',') test_y = test_mat[:, 0] test_x = test_mat[:, 1:] y_true, y_pred = test_y, random_search.predict(test_x)
def load_dataset_and_analyse():
    iris = load_iris()
    X = iris.data
    y = iris.target

    knn1 = k_nearest(X, y, 1)
    X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
    knn1.predict(X_new)  # Returns 2, 1

    knn5 = k_nearest(X, y, 5)
    knn5.predict(X_new)  # Returns 1, 1

    # logreg = logreg_prediciting(X, y)
    # logreg.predict(X_new)  # Returns 2, 0
    # print(metrics.accuracy_score(y, knn5.predict(X)))  # Training accuracy
    # print(metrics.accuracy_score(y, knn1.predict(X)))  # Training accuracy

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=4)
    # test_size=0.4 means the test set is 40% of the original data;
    # the standard is around 20-40%.
    # random_state makes the split reproducible: without it, the data is
    # split differently every time.
    knn5 = k_nearest(X_train, y_train, 5)
    print(metrics.accuracy_score(y_test, knn5.predict(X_test)))  # Testing accuracy

    # Can we locate an even better value for K?
    scores = []
    for k in range(1, 26):  # Testing K = 1 to 25
        knn = k_nearest(X_train, y_train, k)
        scores.append(metrics.accuracy_score(y_test, knn.predict(X_test)))

    # Cross validation example
    # Simulate splitting a dataset of 25 observations into 5 folds
    kf = KFold(25, n_folds=5, shuffle=False)
    # 1. Dataset contains 25 observations (numbered 0 through 24)
    # 2. 5-fold cross validation, thus it runs for 5 iterations
    # 3. For each iteration, every observation is either in the
    #    training set or the testing set, but not both
    # 4. Every observation is in the testing set exactly once

    # Print the contents of each training and testing set
    print('{} {:^61} {}'.format('Iteration', 'Training set observations',
                                'Testing set observations'))
    for iteration, data in enumerate(kf, start=1):
        print('{} {} {}'.format(iteration, data[0], data[1]))

    # 10-fold cross validation with k=5 for knn
    knn = KNeighborsClassifier(n_neighbors=5)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    # cv=10 means 10-fold cross validation
    # scoring='accuracy' uses classification accuracy as the evaluation metric
    print(scores)

    # use average accuracy as an estimate of out-of-sample accuracy
    print(scores.mean())

    # Search for an optimal value of k for knn
    k_scores = []
    for k in range(1, 31):  # Testing K = 1 to 30
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
        k_scores.append(scores.mean())
    print(k_scores)

    plt.plot(range(1, 31), k_scores)
    plt.xlabel('Value of K for KNN')
    plt.ylabel('Cross-Validated Accuracy')
    plt.show()
    # K = 20 should be picked from this graph even though K = 13, 18 and 20
    # have the same highest accuracy of 0.98. This is because we want our
    # models to be simple, and higher K values mean less complexity.

    # 10-fold cross validation with logistic regression
    logreg = LogisticRegression()
    print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
    # 0.95333. It means knn with k=20 is a better fit than logreg.

    # The above strategy of using a for loop to find the optimal value of K
    # can be done through GridSearchCV.
    # It replaces the for loop and provides additional functionality.

    # Define the values that should be searched
    k_range = range(1, 31)

    # Create a param grid: map the parameter names to the values that
    # should be searched
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()

    # instantiate the grid
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    # Set n_jobs = -1 to run computations in parallel
    # (if your computer and OS allow it)
    grid.fit(X, y)  # This step can take a while depending on the model and data

    # view the complete results (list of named tuples)
    grid.grid_scores_
    # [mean: 0.96, std: 0.0533, params: {'n_neighbors': 1},
    #  mean: 0.9533, std: 0.05207, params: {'n_neighbors': 2}, ...]
    grid.grid_scores_[0].parameters
    grid.grid_scores_[0].cv_validation_scores
    grid.grid_scores_[0].mean_validation_score

    grid_mean_scores = [result.mean_validation_score
                        for result in grid.grid_scores_]
    plt.plot(k_range, grid_mean_scores)
    plt.xlabel('Value of K for KNN')
    plt.ylabel('Cross-Validated Accuracy')
    plt.show()
    # plotting a graph isn't the most efficient way of finding the
    # optimal k value

    # examine the best model
    print(grid.best_score_)  # best accuracy
    print(grid.best_params_)  # best params used for that accuracy
    print(grid.best_estimator_)  # best model used for the params

    weight_options = ['uniform', 'distance']
    # Another param of knn that can be tuned is weights. The default value
    # is 'uniform', which puts uniform weight on all k neighbours;
    # 'distance' is another option, where closer neighbours are weighted
    # more heavily than further neighbours.
    param_grid = dict(n_neighbors=k_range, weights=weight_options)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    grid.fit(X, y)

    # examine the best model
    print(grid.best_score_)  # 0.98
    print(grid.best_params_)  # {'n_neighbors': 13, 'weights': 'uniform'}
    # Distance weighting didn't improve over uniform for knn

    # train your model using all the data and the best known parameters
    knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
    knn.fit(X, y)
    knn.predict([[3, 5, 4, 2]])  # predict out-of-sample data

    # Shortcut: grid can do the prediction
    grid.predict([[3, 5, 4, 2]])

    # Reducing computational expense using RandomizedSearchCV
    # RandomizedSearchCV is a close cousin of GridSearchCV: it searches a
    # subset of the parameters and you control the computational "budget".
    # Specify a "parameter distribution" rather than a "parameter grid"
    param_dist = dict(n_neighbors=k_range, weights=weight_options)
    # Important: if one of your tuning parameters is continuous, specify a
    # continuous distribution rather than a list of values (see the sketch
    # just below).
    # n_iter controls the number of searches;
    # random_state is there for reproducibility
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy',
                              n_iter=10, random_state=5)
    rand.fit(X, y)
    rand.grid_scores_
    print(rand.best_score_)
    print(rand.best_params_)
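    # Illustrative follow-up to the note above (not from the original
    # notebook): with a continuous hyperparameter, pass a scipy distribution
    # so RandomizedSearchCV can sample it, e.g. logistic regression's C.
    from scipy.stats import uniform

    rand_lr = RandomizedSearchCV(LogisticRegression(),
                                 dict(C=uniform(loc=0.01, scale=10)),
                                 cv=10, scoring='accuracy',
                                 n_iter=10, random_state=5)
    rand_lr.fit(X, y)
    print(rand_lr.best_params_)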
"max_features": sp_randint(1, 11), "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"], "n_estimators": sp_randint(100, 600) } # In[4]: search_GB = RandomizedSearchCV(model, param_grid, scoring='log_loss', n_jobs=-1, n_iter=n_iter, cv=cv, verbose=True) search_GB.fit(X_train, y_train.flatten()) # In[5]: log_model = search_GB.score(X_val, y_val.flatten()) print "Log loss = %s" % log_model X_test = get_test() y_pred = search_GB.predict_proba(X_test) save_submission(model_name, log_model, y_pred) # In[7]: model_name
                       n_iter=25)
predictor1.fit(X, Y)

# Hyperparameter search space for a 1-hidden-layer MLP
params = {
    'dropout_rate': sp.stats.uniform(0, 0.5),
    'hidden0__units': sp.stats.randint(10, 1000)
}

random_search1 = RandomizedSearchCV(predictor1, param_distributions=params,
                                    n_iter=n_iter_search_1, cv=CViterator,
                                    n_jobs=1)
random_search1.fit(X, Y)

## 2-layers
predictor2 = Classifier(layers=[Layer("Sigmoid", units=100, dropout=0),
                                Layer("Sigmoid", units=100, dropout=0),
                                Layer("Softmax", units=2)],
                        learning_rate=0.001,
                        n_iter=25)
predictor2.fit(X, Y)

# Hyperparameter search space for a 2-hidden-layer MLP
params = {
def train_classifier(x_train, y_train, clf_type='lr', lr_regularization='l1',
                     svc_kernel='rbf', optimize_params=True, use_pca=False,
                     param_optimization_iter=100, verbose=0):
    # Define classifiers
    if clf_type == 'lr':
        clf = LogisticRegression(penalty=lr_regularization)
        param_dist = {"clf__C": scipy.stats.expon(scale=100)}
        has_prob = True
    elif clf_type == 'svc':
        clf = SVC(kernel=svc_kernel)
        param_dist = {
            'clf__C': scipy.stats.expon(scale=100),
            'clf__gamma': scipy.stats.expon(scale=.1)
        }
        has_prob = False
    elif clf_type == 'rf':
        clf = RandomForestClassifier(n_estimators=20)
        param_dist = {
            "clf__max_depth": [3, None],
            "clf__max_features": scipy.stats.randint(1, 11),
            "clf__min_samples_split": scipy.stats.randint(1, 11),
            "clf__min_samples_leaf": scipy.stats.randint(1, 11),
            "clf__bootstrap": [True, False],
            "clf__criterion": ["gini", "entropy"]
        }
        has_prob = True
    else:
        print('Classifier type {} not found'.format(clf_type))
        return -1

    if use_pca:
        clf = Pipeline([('scale', sklearn.preprocessing.StandardScaler()),
                        ('pca', sklearn.decomposition.PCA(0.95)),
                        ('clf', clf)])
    else:
        clf = Pipeline([('scale', sklearn.preprocessing.StandardScaler()),
                        ('clf', clf)])

    # Run parameter optimization over the training set
    if optimize_params:
        random_search = RandomizedSearchCV(
            clf, param_distributions=param_dist,
            n_iter=param_optimization_iter, scoring='roc_auc',
            verbose=verbose)
        random_search.fit(x_train, y_train)
        if verbose > 0:
            report(random_search.grid_scores_)
        params = random_search.best_params_
        clf.set_params(**params)

    # Train final model
    clf.fit(x_train, y_train)
    return clf, has_prob
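# Hypothetical usage sketch, assuming x_train and y_train arrays are in
# scope; the argument values are illustrative.
clf, has_prob = train_classifier(x_train, y_train, clf_type='rf',
                                 optimize_params=True,
                                 param_optimization_iter=20, verbose=1)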
                                param_distributions=param_dist,
                                n_iter=n_iter_search,
                                scoring='mean_absolute_error')
    search = GridSearchCV(clf, param_grid=param_dist,
                          scoring='mean_absolute_error')
    lle = manifold.LocallyLinearEmbedding(n_components=nfeats)
    for oidx, (train, test) in enumerate(cv):
        # print '=========\ncv %d/%d\n=========' % (oidx + 1, nfolds)
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        # X_train = lle.fit_transform(X_train)
        # X_test = lle.transform(X_test)

        search.fit(X_train, y_train)
        clf = search.best_estimator_
        clf.fit(X_train, y_train)
        test_scores.append(mean_absolute_error(clf.predict(X_test), y_test))
        train_scores.append(mean_absolute_error(clf.predict(X_train),
                                                y_train))

        clf = DummyRegressor(strategy='median')
        clf.fit(X_train, y_train)
        dummy_scores.append(mean_absolute_error(clf.predict(X_test), y_test))

    print '\n', seed, b
    print 'dummy: %.3f' % np.median(dummy_scores)
    print 'test: %.3f' % np.median(test_scores)
    print 'train: %.3f' % np.median(train_scores)
# k-NN
print("\n")
print("[INFO] evaluating raw pixel accuracy...")
knn1 = KNeighborsClassifier(n_neighbors=15)
knn1.fit(trainRI, trainRL)
acc = knn1.score(testRI, testRL)
#print("[INFO] k-NN classifier: k=%d" % args["neighbors"])
print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100))

k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)

# n_iter controls the number of searches
rand = RandomizedSearchCV(knn1, param_dist, cv=3, scoring='accuracy',
                          n_iter=10, n_jobs=-1, random_state=5)
rand.fit(rawImages, class_names)
rand.grid_scores_

# examine the best model
print(rand.best_score_)
print(rand.best_params_)

# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn1, param_dist, cv=3, scoring='accuracy',
                              n_iter=10, n_jobs=-1)
    rand.fit(rawImages, class_names)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)
    X, y, test_size=0.1, random_state=42)

# Create a Random Forest Classifier
## Run Randomized Search for Hyperparameter Optimization
cv_call = StratifiedKFold(y_train, n_folds=10)  # Specify cross-validation settings
param_dist = {"n_estimators": randint(5, 500),
              "class_weight": ["balanced", "balanced_subsample"]}
n_iter_search = 30
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=cv_call,
                                   scoring='f1')
random_search = random_search.fit(X_train[feature_set], y_train)

## Retrieve Optimal Hyperparameter Values from Random Search
best_parameters, score, _ = max(random_search.grid_scores_,
                                key=lambda x: x[1])
clf = RandomForestClassifier(random_state=42, n_jobs=-1,
                             n_estimators=187,  # best_parameters["n_estimators"]
                             class_weight="balanced_subsample")  # best_parameters["class_weight"]
# best_parameters["n_estimators"] = 187,
# best_parameters["class_weight"] = "balanced_subsample"

## Run Model with Optimized Parameters on Entire Training Dataset
clf = clf.fit(X[feature_set], y)

# Join Test Datasets
X_test = prepare_datasets(amazon, rot, test)
preds_test = clf.predict(X_test[feature_set])