def train(array, embedDim, interval):
    """Iterative SVR forecaster: tune one model per output column once,
    then roll forward 11 rounds with a growing prediction horizon,
    feeding each round's predictions back in as known data.

    array    -- 2-D series data consumed by pp.makeTrainset / pp.makeXPredict
                (exact layout defined in pp -- TODO confirm rows=series)
    embedDim -- embedding dimension passed through to the pp helpers
    interval -- sampling interval passed through to the pp helpers

    Returns the predicted slice array[0, -65:-5] (66 days are predictable
    in total; the relevant 60 are kept).
    """
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {
        'C': uniform(1, 99),
        'gamma': uniform(0.01, 0.29),
        'kernel': ['rbf', 'poly']
    }
    bestModels = []
    # One randomized hyper-parameter search per output column; keep the winner.
    for i in range(len(yTrain[0])):
        svr = svm.SVR()
        clf = grid_search.RandomizedSearchCV(svr, param_distributions=params,
                                             n_iter=30, cv=kfold,
                                             scoring='mean_squared_error',
                                             n_jobs=1, verbose=0)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)
    for i in range(1, 12):
        # The model's prediction horizon grows each round.
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, i)
        # The prediction input grows accordingly.
        XPredict = pp.makeXPredict(array, embedDim, interval, i)
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        # Append this round's predictions as known data for the next round.
        array = np.hstack(
            (array, np.array(copy(subyPredict))))
    yPredict = array[0, -65:-5]  # 66 days predictable in total; keep the wanted slice
    return yPredict
def lr_model(all_file, num=200, debug=True):
    """Randomized-search a LogisticRegression on the top-`num` important
    features of the training rows (tag == 1) of `all_file`.

    all_file -- CSV path (gb18030 encoded) holding features + 'tag'/'target'
    num      -- number of top features to keep, by importance ranking
    debug    -- when True, read only the first 500 rows for a quick run

    Relies on module-level names: features_importance_file, AUC (scorer)
    and report() -- presumably defined elsewhere in this file; confirm.
    """
    if debug:
        all_data = pd.read_csv(all_file, nrows=500, encoding='gb18030')
    else:
        all_data = pd.read_csv(all_file, encoding='gb18030')
    train_data = all_data[all_data['tag'] == 1]
    feature_data = train_data.drop(['Idx', 'ListingInfo', 'target', 'tag'],
                                   axis=1)
    # NOTE(review): fillna on a frame derived from a row-filtered slice may
    # trigger pandas' SettingWithCopy warning -- behavior kept as-is.
    feature_data.fillna(-1, inplace=True)
    # labels = train_data['target']
    # Keep only the `num` most important features, by the precomputed ranking.
    feature_importance = pd.read_csv(features_importance_file)
    feature_importance_columns = feature_importance['feature'].tolist()
    feature_importance_columns = feature_importance_columns[:num]
    final_train_data = feature_data[feature_importance_columns]
    # final_train_data = feature_data
    print final_train_data.shape
    labels = train_data['target']
    clf = LogisticRegression()
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    }
    model = grid_search.RandomizedSearchCV(estimator=clf,
                                           param_distributions=param_grid,
                                           n_jobs=1, cv=2, verbose=0,
                                           n_iter=5, scoring=AUC)
    model.fit(final_train_data, labels)
    report(model.grid_scores_)
def train(array, embedDim, interval):
    """Iterative extra-trees forecaster: tune one model per output column
    once, then roll forward 60 one-day rounds, growing the embedding
    dimension and feeding predictions back in as known data.

    Returns the predicted slice array[0, -60:] (60 predicted days).
    """
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    params = {
        "n_estimators": randint(5, 100),
        "max_depth": [1, 2, 3, 5, 8, 10, None],
        "max_features": randint(1, len(XTrain[0])),
        "min_samples_split": randint(1, 3),
        "min_samples_leaf": randint(1, 3)
    }
    bestModels = []
    # One randomized search per output column; keep the best estimator.
    for i in range(len(yTrain[0])):
        erf = ExtraTreesRegressor()
        clf = grid_search.RandomizedSearchCV(erf, param_distributions=params,
                                             n_iter=10,
                                             scoring='mean_squared_error',
                                             cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)
    for i in range(60):
        # The model's embedding dimension grows each round.
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1)
        # The prediction input's embedding dimension grows accordingly.
        XPredict = pp.makeXPredict(array, embedDim, interval, 1)
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        # Feed predictions back in as known data for the next round.
        array = np.hstack(
            (array, np.array(copy(subyPredict))))
        embedDim += 1
    yPredict = array[0, -60:]  # 60 days predictable in total
    return yPredict
def train(array, embedDim, interval):
    """Rolling SVR forecaster: 9 rounds of 7 days each, re-tuning
    hyper-parameters by randomized search every round and feeding each
    round's predictions back in as known data.

    Returns the predicted slice array[0, -62:-2] (63 days are predictable
    in total; the relevant 60 are kept).
    """
    distance = 7
    for i in range(9):
        # The model's prediction horizon for this round.
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, distance)
        XPredict = pp.makeXPredict(array, embedDim, interval, distance)
        params = {
            'C': uniform(1, 99),
            'gamma': uniform(0.01, 0.29),
            'kernel': ['rbf', 'poly']
        }
        kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
        subyPredict = []
        # A fresh randomized search per output column, every round.
        for j in range(len(yTrain[0])):
            svr = svm.SVR()
            clf = grid_search.RandomizedSearchCV(svr, param_distributions=params,
                                                 n_iter=10, cv=kfold,
                                                 scoring='mean_squared_error',
                                                 n_jobs=1, verbose=0)
            clf.fit(XTrain, yTrain[:, j])
            subyPredict.append(clf.predict(XPredict))
        # Feed this round's predictions back in as known data for the next.
        array = np.hstack(
            (array, np.array(copy(subyPredict))))
        embedDim += distance
    yPredict = array[0, -62:-2]  # 63 days predictable in total; keep the wanted slice
    return yPredict
def test_grid_search(self):
    """Check GRNN RMSLE on a fixed split, then a randomized search over std."""
    def rmsle_scorer(network, X, y):
        predicted = network.predict(X)
        return rmsle(predicted[:, 0], y)

    diabetes = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, train_size=0.7)

    network = algorithms.GRNN(std=0.5, verbose=False)
    network.train(x_train, y_train)
    self.assertAlmostEqual(0.513, rmsle_scorer(network, x_test, y_test),
                           places=3)

    searcher = grid_search.RandomizedSearchCV(
        network,
        param_distributions={'std': np.arange(1e-2, 0.1, 1e-4)},
        n_iter=10,
        scoring=rmsle_scorer)
    searcher.fit(diabetes.data, diabetes.target)
    # grid_scores_ entries are (params, score, all_scores); lower is better.
    lowest = min(searcher.grid_scores_, key=itemgetter(1))
    self.assertAlmostEqual(0.4303, lowest[1], places=3)
def train(XTrain, yTrain, XPredict):
    """Tune an XGBoost regressor by randomized search and predict each
    target column independently; returns all predictions concatenated."""
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    search_space = {
        'n_estimators': randint(50, 150),
        'max_depth': randint(1, 4),
        'learning_rate': uniform(0.01, 0.19),
        'min_child_weight': [1],
        'max_delta_step': randint(0, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'colsample_bylevel': uniform(0.5, 0.5),
        'scale_pos_weight': [0],
        'gamma': uniform(1, 6)
    }
    folds = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        xgb.XGBRegressor(), param_distributions=search_space, n_iter=5,
        n_jobs=1, scoring='mean_squared_error', cv=folds, verbose=0)
    predictions = []
    for col in range(yTrain.shape[1]):
        searcher.fit(XTrain, yTrain[:, col])  # one tuned model per column
        predictions.extend(searcher.predict(XPredict))
    return np.array(predictions)
def train(XTrain, yTrain, XPredict):
    """Randomized-search an XGBoost regressor (MAE scoring) once over the
    whole training set and return its predictions for XPredict."""
    search_space = {
        'n_estimators': randint(20, 200),
        'max_depth': randint(1, 4),
        'learning_rate': uniform(0.01, 0.19),
        'min_child_weight': [1],
        'max_delta_step': randint(0, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'colsample_bylevel': uniform(0.5, 0.5),
        'scale_pos_weight': [0],
        'gamma': uniform(1, 10)
    }
    folds = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        xgb.XGBRegressor(), param_distributions=search_space, n_iter=50,
        n_jobs=1, scoring='mean_absolute_error', cv=folds, verbose=0)
    searcher.fit(XTrain, yTrain)  # single fit over all targets
    # print searcher.best_score_, searcher.best_estimator_
    return searcher.predict(XPredict)
def train(array, embedDim, interval):
    """Rolling gradient-boosting forecaster: tune one GBRT per output
    column once, then roll forward 9 rounds of 7 days, growing the
    embedding dimension and feeding predictions back in as known data.

    Returns the predicted slice array[0, -62:-2] (63 days are predictable
    in total; the relevant 60 are kept).
    """
    distance = 7
    XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, distance)
    kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    params = {'n_estimators': randint(20, 200),
              'loss': ['ls', 'lad', 'huber'],
              'learning_rate': uniform(0.01, 0.19),
              'subsample': uniform(0.5, 0.5),
              'max_depth': randint(1, 5),
              'min_samples_split': randint(1, 3),
              'min_samples_leaf': randint(1, 3),
              'max_features': randint(1, len(XTrain[0]))}
    bestModels = []
    # One randomized search per output column; keep the best estimator.
    for i in range(len(yTrain[0])):
        gbrt = GradientBoostingRegressor()
        clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params,
                                             n_iter=30,
                                             scoring='mean_squared_error',
                                             cv=kfold, n_jobs=-1)
        clf.fit(XTrain, yTrain[:, i])
        bestModels.append(clf.best_estimator_)
    for i in range(9):
        # The model's embedding dimension grows each round.
        XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, distance)
        # The prediction input's embedding dimension grows accordingly.
        XPredict = pp.makeXPredict(array, embedDim, interval, distance)
        subyPredict = []
        for j in range(len(yTrain[0])):
            bestModels[j].fit(XTrain, yTrain[:, j])
            subyPredict.append(bestModels[j].predict(XPredict))
        # Feed predictions back in as known data for the next round.
        array = np.hstack((array, np.array(copy(subyPredict))))
        embedDim += distance
    yPredict = array[0, -62:-2]  # 63 days predictable in total; keep the wanted slice
    return yPredict
def train(XTrain, yTrain, XPredict):
    """Per-column gradient-boosting forecaster tuned by randomized search;
    returns the concatenated per-column predictions."""
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    search_space = {
        'n_estimators': randint(50, 150),
        'loss': ['ls', 'lad', 'huber'],
        'learning_rate': uniform(0.01, 0.19),
        'subsample': uniform(0.5, 0.5),
        'max_depth': randint(1, 4),
        'min_samples_split': randint(1, 4),
        'min_samples_leaf': randint(1, 4),
        'max_features': randint(1, len(XTrain[0]))
    }
    folds = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        GradientBoostingRegressor(), param_distributions=search_space,
        n_iter=5, scoring='mean_squared_error', cv=folds, n_jobs=-1)
    predictions = []
    for col in range(yTrain.shape[1]):
        searcher.fit(XTrain, yTrain[:, col])  # one model per prediction distance
        predictions.extend(searcher.predict(XPredict))
    return np.array(predictions)
def train(XTrain, yTrain, testsize):
    """Recursive multi-step SVR forecaster over a sliding window.

    One SVR per output column is tuned by randomized search; then for
    `testsize` steps the window is advanced by one observation built from
    the models' own predictions.  Helpers transArray / transRow are
    defined elsewhere in this module (presumably flattening the 2-D
    window into a feature row -- TODO confirm).

    Returns a 1-D np.array with the first column's prediction per step.
    """
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {'C': uniform(1, 99),
              'gamma': uniform(0.01, 0.29),
              'kernel': ['rbf', 'poly']}
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    models = []
    # One randomized search per output column; keep the best estimator.
    for i in range(len(yTrain[0])):
        svr = svm.SVR()
        clf = grid_search.RandomizedSearchCV(svr, param_distributions=params,
                                             n_iter=30, cv=kfold,
                                             scoring='mean_squared_error',
                                             n_jobs=-1, verbose=1)
        clf.fit(transArray(XTrain), yTrain[:, i])
        models.append(clf.best_estimator_)
    yPredict = []
    XPredict = copy(XTrain[-1])
    for i in range(testsize):
        # Slide the prediction window: drop the oldest row, append the
        # latest known target row.
        XPredict = np.delete(XPredict, 0, axis=0)
        XPredict = np.insert(XPredict, len(XPredict), yTrain[-1], axis=0)
        subyPredict = np.array([])
        for j in range(len(models)):
            # Re-fit every model each step on the updated window.
            models[j].fit(transArray(XTrain), yTrain[:, j])
            newPredict = models[j].predict([transRow(XPredict)])
            subyPredict = np.hstack((subyPredict, newPredict))
        # Advance the training window by one step using the predictions.
        XTrain = np.delete(XTrain, 0, axis=0)
        XTrain = np.insert(XTrain, len(XTrain), copy(XPredict), axis=0)
        yTrain = np.delete(yTrain, 0, axis=0)
        yTrain = np.insert(yTrain, len(yTrain), copy(subyPredict), axis=0)
        yPredict.append(copy(subyPredict[0]))
    return np.array(yPredict)
def randomized_grid_search_linear(datamat, classvect, C=4.6, n=20, n_jobs=1):
    """Run a randomized hyper-parameter search over a linear SVC and
    return the fitted search object."""
    search = grid_search.RandomizedSearchCV(
        svm.LinearSVC(class_weight='auto'),
        param_distributions={'C': scipy.stats.expon(scale=C)},
        n_iter=n,
        n_jobs=n_jobs)
    search.fit(datamat, classvect)
    return search
def fit(train, target):
    """Randomized-search an XGBoost pipeline with a normalized-Gini scorer,
    print the winning configuration, and return the best estimator."""
    model = pipeline.Pipeline([
        ('xgb', xgb.XGBRegressor(silent=True)),
    ])
    # Candidate settings for the pipeline's xgb step.
    search_space = {
        'xgb__learning_rate': [0.05, 0.1, 0.3],
        'xgb__min_child_weight': [1, 2],
        'xgb__subsample': [1],
        'xgb__colsample_bytree': [1],
        'xgb__max_depth': [15, 20],
        'xgb__n_estimators': [1000],
    }
    gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True)
    searcher = grid_search.RandomizedSearchCV(
        estimator=model,
        param_distributions=search_space,
        scoring=gini_scorer,
        verbose=10,
        n_jobs=-1,
        cv=5,
        n_iter=3,
    )
    searcher.fit(train, target)
    print('Best score: %.3f' % searcher.best_score_)
    print('Best params:')
    for key, value in sorted(searcher.best_params_.items()):
        print("\t%s: %r" % (key, value))
    return searcher.best_estimator_
def train(XTrain, yTrain, XPredict):
    """Tune the ensemble size of a bagged RBF-SVR and predict.

    Returns (predictions, best_params)."""
    folds = cross_validation.KFold(len(XTrain), n_folds=3)
    base = svm.SVR(kernel='rbf', C=50, gamma=0.1)
    searcher = grid_search.RandomizedSearchCV(
        ensemble.BaggingRegressor(base),
        param_distributions={'n_estimators': randint(1, 100)},
        n_iter=10,
        scoring='mean_squared_error',
        cv=folds,
        n_jobs=-1)
    searcher.fit(XTrain, yTrain)  # single fit over the whole training set
    # print searcher.best_score_, searcher.best_estimator_
    return searcher.predict(XPredict), searcher.best_params_
def fitAlgo(clf, Xtrain, Ytrain, opt=False, param_dict=None,
            opt_metric='roc_auc', n_iter=5, n_optFolds=3):
    '''Return the fitted classifier (plus search scores and importances).

    Keyword arguments:
    clf -- base classifier
    Xtrain -- training feature matrix
    Ytrain -- training target array
    opt -- whether to run hyper-parameter optimization
    param_dict -- parameter grid; when opt is True every value is a list
                  of candidates, otherwise each value is a single setting
                  applied directly to clf
    opt_metric -- optimization metric
    n_iter -- 0 selects exhaustive GridSearchCV, otherwise randomized search
    n_optFolds -- number of CV folds for the search

    Returns (fitted_estimator, grid_scores, feature_importances).
    '''
    # Fix: was `opt & (param_dict != None)` -- bitwise & on a bool and a
    # `!= None` comparison; use logical `and` / identity check instead.
    if opt and param_dict is not None:
        # Fix: the original `assert(map(...))` never failed (a map object /
        # non-empty list is always truthy); enforce the documented contract.
        assert all(isinstance(param_dict[x], list) for x in param_dict)
        # Scale the iteration budget with sqrt of the grid size, capped at
        # the full grid.
        prod_feature_05 = np.prod([math.pow(len(v), 0.5)
                                   for x, v in param_dict.items()])
        prod_feature = np.prod([len(v) for x, v in param_dict.items()])
        N_iter = int(np.ceil(prod_feature_05 * n_iter / 5 * 1.5))
        N_iter = N_iter if N_iter < prod_feature else prod_feature
        print("Using N_iter = " + str(N_iter))
        if n_iter != 0:
            rs = gd.RandomizedSearchCV(estimator=clf, n_iter=N_iter,
                                       param_distributions=param_dict,
                                       scoring=opt_metric, refit=True,
                                       n_jobs=-1, cv=n_optFolds, verbose=1)
        else:
            rs = gd.GridSearchCV(estimator=clf, param_grid=param_dict,
                                 scoring=opt_metric, refit=True,
                                 n_jobs=-1, cv=n_optFolds, verbose=1)
        # NOTE(review): dropped `print("Simulation with num_features=",
        # num_features)` -- `num_features` is not defined in this scope and
        # raised NameError unless a module-level global happened to exist;
        # confirm before restoring.
        print("max_features=")
        print(param_dict)
        rs.fit(Xtrain, Ytrain)
        print("\n### Optimal parameters: ###")
        pprint(rs.best_params_)
        print("####################### \n")
        imp = []
        if clf.__class__.__name__ == "RandomForestClassifier":
            imp = rs.best_estimator_.feature_importances_
        return rs.best_estimator_, rs.grid_scores_, imp
    else:
        if param_dict is not None:
            # Without optimization every value must be a single setting.
            assert all(not isinstance(param_dict[x], list) for x in param_dict)
            for k in param_dict.keys():
                # Fix: was `clf.set_params(k=param_dict[k])`, which set a
                # parameter literally named "k" instead of the key held by
                # the loop variable.
                clf.set_params(**{k: param_dict[k]})
        clf.fit(Xtrain, Ytrain)
        return clf, [], []
def test_example_randomized_search(self):
    """The distributed RandomizedSearchCV should agree with sklearn's."""
    # The classic example from the sklearn documentation
    iris = datasets.load_iris()
    search_space = {'kernel': ('linear', 'rbf'), 'C': range(1, 10)}
    base = svm.SVC()

    local_search = grid_search.RandomizedSearchCV(base, search_space,
                                                  random_state=4)
    local_search.fit(iris.data, iris.target)

    spark_search = RandomizedSearchCV(self.sc, base, search_space,
                                      random_state=4)
    spark_search.fit(iris.data, iris.target)

    self.assertEqual(local_search.estimator.get_params(),
                     spark_search.estimator.get_params())
def randomized_grid_search_rbf(datamat, classvect, C=4.6, gamma=0.01, n=20,
                               n_jobs=1):
    """Run a randomized hyper-parameter search over an RBF-kernel SVC and
    return the fitted search object."""
    search = grid_search.RandomizedSearchCV(
        svm.SVC(kernel='rbf', cache_size=4000, class_weight='auto'),
        param_distributions={
            'C': scipy.stats.expon(scale=C),
            'gamma': scipy.stats.expon(scale=gamma),
        },
        n_iter=n,
        n_jobs=n_jobs)
    search.fit(datamat, classvect)
    return search
def train(XTrain, yTrain, testsize):
    """Recursive multi-step XGBoost forecaster over a sliding window.

    One XGBRegressor per output column is tuned by randomized search;
    then for `testsize` steps the window is advanced by one observation
    built from the models' own predictions.  Helpers transArray /
    transRow are defined elsewhere in this module (presumably flattening
    the 2-D window into a feature row -- TODO confirm).

    Returns a 1-D np.array with the first column's prediction per step.
    """
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    params = {
        'n_estimators': randint(50, 150),
        'max_depth': randint(1, 4),
        'learning_rate': uniform(0.01, 0.19),
        'min_child_weight': [1],
        'max_delta_step': randint(0, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'colsample_bylevel': uniform(0.5, 0.5),
        'scale_pos_weight': [0],
        'gamma': uniform(1, 6)
    }
    kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    models = []
    # One randomized search per output column; keep the best estimator.
    for i in range(len(yTrain[0])):
        xgbr = xgb.XGBRegressor()
        clf = grid_search.RandomizedSearchCV(xgbr, param_distributions=params,
                                             n_iter=10, n_jobs=1,
                                             scoring='mean_squared_error',
                                             cv=kfold, verbose=0)
        clf.fit(transArray(XTrain), yTrain[:, i])
        models.append(clf.best_estimator_)
    yPredict = []
    XPredict = copy(XTrain[-1])
    for i in range(testsize):
        # Slide the prediction window: drop the oldest row, append the
        # latest known target row.
        XPredict = np.delete(XPredict, 0, axis=0)
        XPredict = np.insert(XPredict, len(XPredict), yTrain[-1], axis=0)
        subyPredict = np.array([])
        # Transform once per step, not once per model.
        XTrainTrans = transArray(XTrain)
        XPredictTrans = transRow(XPredict)
        for j in range(len(models)):
            # Re-fit every model each step on the updated window.
            models[j].fit(XTrainTrans, yTrain[:, j])
            newPredict = models[j].predict([XPredictTrans])
            subyPredict = np.hstack((subyPredict, newPredict))
        # Advance the training window by one step using the predictions.
        XTrain = np.delete(XTrain, 0, axis=0)
        XTrain = np.insert(XTrain, len(XTrain), copy(XPredict), axis=0)
        yTrain = np.delete(yTrain, 0, axis=0)
        yTrain = np.insert(yTrain, len(yTrain), copy(subyPredict), axis=0)
        yPredict.append(copy(subyPredict[0]))
    return np.array(yPredict)
def classifier_comparison(X, y):
    """Train and compare a battery of classifiers on the same data.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    import scipy

    # SVC tuned by exhaustive grid search.
    svc_grid = grid_search.GridSearchCV(
        SVC(),
        {'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1e-3, 1e-4]})
    # SVC tuned by randomized parameter sampling.
    svc_random = grid_search.RandomizedSearchCV(
        SVC(),
        {'kernel': ['rbf'],
         'C': scipy.stats.expon(scale=100),
         'gamma': scipy.stats.expon(scale=.1)})

    candidates = [
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("RBF SVM with Grid Search", svc_grid),
        ("RBF SVM with Random Grid Search", svc_random),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest",
         RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("LDA", LDA()),
        ("QDA", QDA()),
    ]
    for name, clf in candidates:
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)
def train(XTrain, yTrain, XPredict):
    """Randomized-search an SVR (MSE scoring) once over the whole
    training set and predict.

    Returns (predictions, best_params)."""
    folds = cross_validation.KFold(len(XTrain), n_folds=3, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        svm.SVR(),
        param_distributions={
            'C': uniform(1, 999),
            'gamma': uniform(0.01, 0.29),
            'kernel': ['rbf', 'poly']
        },
        n_iter=20,
        cv=folds,
        scoring='mean_squared_error',
        n_jobs=-1)
    searcher.fit(XTrain, yTrain)  # single fit over all targets
    # print searcher.best_score_, searcher.best_estimator_
    return searcher.predict(XPredict), searcher.best_params_
def find_best_estimator(base_estimator, X, y, cfg, section,
                        grid_search_params_key, random_search=True,
                        scoring="accuracy", verbosity=3):
    """Tune `base_estimator` via randomized or exhaustive grid search,
    driven by the `cfg` dict, and return the best estimator found."""
    # grid_search_params_key : key under the indicated section of the
    # configuration YML file containing the grid search parameters
    cv_nfold = cfg[section]["cv_nfold"]
    name = type(base_estimator).__name__
    n_iter = cfg[section]["n_iters"]
    n_jobs = cfg[section]["n_jobs"]
    param_dist = cfg[section][grid_search_params_key]
    random_state = cfg["common"]["seed"]
    logger.info("Finding the best %s based on %s score" % (name, scoring))
    # NOTE(review): this compares the `random_search` argument against the
    # config flag, so randomized search runs only when the two agree; a
    # plain `if cfg[section]["use_random_search"]:` may have been intended
    # -- confirm with callers before changing.
    if random_search == cfg[section]["use_random_search"]:
        logger.info("Using random search to find the best %s" % name)
        search = grid_search.RandomizedSearchCV(estimator=base_estimator,
                                                param_distributions=param_dist,
                                                n_iter=n_iter,
                                                n_jobs=n_jobs,
                                                cv=cv_nfold,
                                                random_state=random_state,
                                                scoring=scoring,
                                                verbose=verbosity)
    else:
        logger.info("Using grid search to find the best %s" % name)
        search = grid_search.GridSearchCV(estimator=base_estimator,
                                          param_grid=param_dist,
                                          n_jobs=n_jobs,
                                          cv=cv_nfold,
                                          scoring=scoring,
                                          verbose=verbosity)
    logger.info(search)
    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s." %
                ((time.time() - start), name))
    report_grid_search_scores(search.grid_scores_, n_top=3)
    return search.best_estimator_
def grid_search(self, col2fit, **kwargs):
    """Randomized-search GradientBoostingRegressor hyper-parameters on
    the cleaned full data frame and print a report of the best settings.

    col2fit -- feature column names to fit on
    kwargs  -- n_jobs (default 1)
    """
    n_jobs = kwargs.get('n_jobs', 1)
    # use a full grid over all parameters
    parameters = {"max_depth": sp_randint(1, 30),
                  "max_features": [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
                  "min_samples_leaf": sp_randint(1, 25),
                  "learning_rate": [0.01, 0.02, 0.05, 0.1]}
    # Clean lazily: prepare_data is only run the first time through.
    if not self.iscleaned:
        print 'Preparing the data...'
        self.prepare_data(self.df_full, True, col2fit)
    else:
        print 'data frame is already cleaned...'
    train_values = self.df_full[col2fit].values
    target_values = self.df_full['Expected'].values
    pre_dispatch = '2*n_jobs'
    # Fit the grid
    print 'fitting the grid with njobs = {}...'.format(n_jobs)
    start = time()
    estimator = GradientBoostingRegressor(n_estimators=200)
    rf_grid = grid_search.RandomizedSearchCV(estimator, parameters,
                                             n_jobs=n_jobs, verbose=2,
                                             pre_dispatch=pre_dispatch,
                                             scoring=kaggle_score,
                                             error_score=0, n_iter=25)
    rf_grid.fit(train_values, target_values)
    print('Grid search finished')
    print(
        "\n\nGridSearchCV took %.2f seconds for %d candidate parameter settings."
        % (time() - start, len(rf_grid.grid_scores_)))
    self.grid_report(rf_grid.grid_scores_, 15)
    print('\n\nBest score = {}'.format(rf_grid.best_score_))
    print('Best params = {}\n\n'.format(rf_grid.best_params_))
def do_gs(clf, X, y, params, n_samples=1000, n_iter=3, n_jobs=-2,
          scoring=None, fit_params=None, random_iterations=None):
    """Grid-search (or randomized-search) `clf` on a shuffled subsample.

    params -- candidate parameter grid / distributions
    n_samples -- size of the searched subsample (a float is truncated)
    n_iter -- number of ShuffleSplit iterations
    random_iterations -- when set, use RandomizedSearchCV with that many
                         sampled candidates instead of the full grid
    Returns the fitted search object.  Relies on module-level helpers
    start/stop/dbg/reseed and the cfg dict defined elsewhere in this file.
    """
    start('starting grid search')
    if type(n_samples) is float:
        n_samples = int(n_samples)
    reseed(clf)
    cv = cross_validation.ShuffleSplit(n_samples, n_iter=n_iter,
                                       random_state=cfg['sys_seed'])
    if random_iterations is None:
        gs = grid_search.GridSearchCV(clf, params, cv=cv, n_jobs=n_jobs,
                                      verbose=2,
                                      scoring=scoring or cfg['scoring'],
                                      fit_params=fit_params)
    else:
        gs = grid_search.RandomizedSearchCV(clf, params, random_iterations,
                                            cv=cv, n_jobs=n_jobs, verbose=2,
                                            scoring=scoring or cfg['scoring'],
                                            fit_params=fit_params,
                                            refit=False)
    # Search on a deterministic shuffle so runs are reproducible.
    X2, y2 = utils.shuffle(X, y, random_state=cfg['sys_seed'])
    gs.fit(X2[:n_samples], y2[:n_samples])
    stop('done grid search')
    dbg(gs.best_params_, gs.best_score_)
    return gs
def generate_model(data, classes, args):
    """Tune an SVM (RBF SVC or LinearSVC, per args.kernel) over stratified
    K-fold CV using grid or randomized search (per args.search), print the
    scores, and return the fitted search object.

    Relies on module-level constants C_RANGE, CLASS_WEIGHTS, GAMMA_RANGE,
    CACHE_SIZE, K_FOLD and helpers print_verbose / get_verbose_level.
    """
    # Define the parameters
    tuned_parameters = {'C': C_RANGE, 'class_weight': CLASS_WEIGHTS}
    # Define the classifier
    if args.kernel == 'rbf':
        clf = svm.SVC(cache_size=CACHE_SIZE)
        tuned_parameters['gamma'] = GAMMA_RANGE  # gamma only applies to RBF
    else:
        clf = svm.LinearSVC(dual=False)
    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)
    # Generate the K-fold development
    skf = cross_validation.StratifiedKFold(classes, n_folds=K_FOLD,
                                           shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)
    # Generate the grid search
    if args.search == 'grid':
        gscv = grid_search.GridSearchCV(clf, tuned_parameters, cv=skf,
                                        scoring='f1', n_jobs=1,
                                        verbose=get_verbose_level())
    else:
        gscv = grid_search.RandomizedSearchCV(clf, tuned_parameters, cv=skf,
                                              scoring='f1', n_jobs=1,
                                              verbose=get_verbose_level(),
                                              n_iter=args.iter)
    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)
    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose("%0.6f (+/-%0.06f) for %r"
                      % (mean_score, scores.std() / 2, params), 5)
    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)
    return gscv
def train(XTrain, yTrain, XPredict):
    """Randomized-search an extra-trees regressor (MAE scoring) once over
    the whole training set and return its predictions for XPredict."""
    search_space = {
        "n_estimators": randint(5, 100),
        "max_depth": [1, 2, 3, 5, 8, 10, None],
        "max_features": randint(1, len(XTrain[0])),
        "min_samples_split": randint(1, 3),
        "min_samples_leaf": randint(1, 3)
    }
    folds = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        ExtraTreesRegressor(), param_distributions=search_space, n_iter=50,
        scoring='mean_absolute_error', cv=folds, n_jobs=-1)
    searcher.fit(XTrain, yTrain)
    # print searcher.best_score_, searcher.best_estimator_
    return searcher.predict(XPredict)
def train(XTrain, yTrain, XPredict):
    """Per-column SVR forecaster sharing one randomized-search setup;
    returns the concatenated per-column predictions."""
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    folds = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        svm.SVR(),
        param_distributions={
            'C': uniform(1, 999),
            'gamma': uniform(0.01, 0.29),
            'kernel': ['rbf', 'poly']
        },
        n_iter=20,
        cv=folds,
        scoring='mean_squared_error',
        n_jobs=-1)
    predictions = []
    for col in range(yTrain.shape[1]):
        searcher.fit(XTrain, yTrain[:, col])  # one model per prediction distance
        predictions.extend(searcher.predict(XPredict))
    return np.array(predictions)
def GRNN(data_model):
    """Train a GRNN on the 2017 slice of `data_model`, report RMSLE and a
    sample plot for 2018, then run a randomized search over the kernel
    std and report its scores.

    data_model -- DataFrame indexed by date (sliceable with '2017'/'2018');
                  pr.PowerPV names the target column -- TODO confirm.
    """
    x_train, x_test, y_train, y_test = create_dataset(data_model['2017'])
    grnnet = algorithms.GRNN(std=0.5, verbose=True)
    grnnet.train(x_train, y_train)
    error = scorer(grnnet, x_test, y_test)
    print("GRNN RMSLE = {:.3f}\n".format(error))
    # Build the 2018 hold-out frame, keeping its date index for plotting.
    part_to_predict = data_model['2018'].copy()
    df_test = part_to_predict.copy()
    index_predict = df_test.index
    df_test.reset_index(inplace=True)
    df_test.drop(["Date"], axis=1, inplace=True)
    # fix random seed for reproducibility
    pd.np.random.seed(7)
    # X: every column except the target; y: the target column only.
    X = df_test.drop([pr.PowerPV], axis=1)
    y = df_test.drop([x for x in df_test.columns if x not in [pr.PowerPV]],
                     axis=1)
    pred = grnnet.predict(X)
    prediction_to_plot = pd.DataFrame(index=index_predict,
                                      data={
                                          'observed': pd.np.array(y[pr.PowerPV]),
                                          'predicted': pred.reshape(pred.shape[0], )
                                      })
    pr.plot_data(prediction_to_plot['2018-04-01':'2018-04-05'],
                 prediction_to_plot.columns, 1)
    print("Run Random Search CV")
    grnnet.verbose = False  # silence per-fit logging during the search
    random_search = grid_search.RandomizedSearchCV(
        grnnet,
        param_distributions={'std': np.arange(1e-2, 1, 1e-4)},
        n_iter=400,
        scoring=scorer,
    )
    # Search over the full data set, not just the 2017 training slice.
    random_search.fit(
        data_model[[x for x in df_test.columns if x not in [pr.PowerPV]]],
        data_model[pr.PowerPV])
    report(random_search.grid_scores_)
def train(XTrain, yTrain, XPredict):
    """Per-column extra-trees forecaster tuned by randomized search;
    returns the concatenated per-column predictions."""
    XTrain = np.array(XTrain, dtype=float)
    yTrain = np.array(yTrain, dtype=float)
    search_space = {
        "n_estimators": randint(50, 150),
        "max_depth": [1, 3, 5, None],
        "max_features": randint(1, len(XTrain[0])),
        "min_samples_split": randint(1, 4),
        "min_samples_leaf": randint(1, 4)
    }
    folds = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        ExtraTreesRegressor(), param_distributions=search_space, n_iter=5,
        scoring='mean_squared_error', cv=folds, n_jobs=-1)
    predictions = []
    for col in range(yTrain.shape[1]):
        searcher.fit(XTrain, yTrain[:, col])  # one model per prediction distance
        predictions.extend(searcher.predict(XPredict))
    return np.array(predictions)
def train(XTrain, yTrain, XPredict):
    """Randomized-search a gradient-boosting regressor (MAE scoring) once
    over the whole training set and return its predictions."""
    search_space = {
        'n_estimators': randint(20, 200),
        'loss': ['ls', 'lad', 'huber'],
        'learning_rate': uniform(0.01, 0.19),
        'subsample': uniform(0.5, 0.5),
        'max_depth': randint(1, 5),
        'min_samples_split': randint(1, 3),
        'min_samples_leaf': randint(1, 3),
        'max_features': randint(1, len(XTrain[0]))
    }
    folds = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False)
    searcher = grid_search.RandomizedSearchCV(
        GradientBoostingRegressor(), param_distributions=search_space,
        n_iter=50, scoring='mean_absolute_error', cv=folds, n_jobs=-1)
    searcher.fit(XTrain, yTrain)
    # print searcher.best_score_, searcher.best_estimator_
    return searcher.predict(XPredict)
def random_parameter_search(variants, n_iter=10,
                            classifier_name='RandomForest',
                            folds_name='kfold', folds_params=None,
                            label_name=None):
    """Randomized hyper-parameter search scored by Matthews correlation;
    returns the fitted search object."""
    X, y = _get_X_y(variants)
    estimator = _get_estimator(classifier_name)
    labels = variants[label_name] if label_name else None
    logging.info('Generating %s folds, params: %s', folds_name,
                 str(folds_params))
    folds = _get_folds(folds_name, y, folds_params, labels)
    # Score with the Matthews Correlation Coefficient.
    mcc_scorer = metrics.make_scorer(metrics.matthews_corrcoef)
    # Prefix each parameter with the pipeline step it targets.
    distributions = {}
    for key, dist in (CLASSIFIERS[classifier_name]
                      ['param_distributions'].items()):
        distributions['classifier__' + key] = dist
    search = grid_search.RandomizedSearchCV(estimator, distributions,
                                            cv=folds, n_iter=n_iter,
                                            scoring=mcc_scorer, verbose=1)
    search.fit(X, y)
    return search
def benchmark():
    """Ad-hoc benchmark of a GaussianProcess interpolator (and, below, a
    Ridge / GP grid search) on one day of solar-grid data.

    NOTE(review): the function calls sys.exit(0) after printing the GP
    MAE, so everything past that line is currently unreachable dead code.
    """
    from solaris.run import load_data
    from sklearn import grid_search
    from sklearn import metrics

    def rmse(y_true, pred):
        return np.sqrt(metrics.mean_squared_error(y_true, pred))

    data = load_data()
    X = data['X_train']
    y = data['y_train']
    x = Interpolate._grid_data()
    fx = 0
    day = 180
    y = X.nm[day, fx].mean(axis=0)[3]
    #nugget = X.nm[day, fx].std(axis=0)[3]
    mask = np.ones_like(y, dtype=np.bool)
    rs = np.random.RandomState(5)
    # Hold out 20 randomly chosen grid cells as the test set.
    test_idx = np.c_[rs.randint(2, 7, 20), rs.randint(3, 13, 20)]
    print test_idx.shape
    mask[test_idx[:, 0], test_idx[:, 1]] = False
    mask = mask.ravel()
    y = y.ravel()
    print '_' * 80
    est = GaussianProcess(corr='squared_exponential', theta0=(10, 10, 10))
    est.fit(x[mask], y[mask])
    pred = est.predict(x[~mask])
    print 'MAE: %.2f' % metrics.mean_absolute_error(y[~mask], pred)
    print '_' * 80
    sys.exit(0)  # everything below this line is unreachable
    #import IPython
    #IPython.embed()

    class KFold(object):
        # Single-split "CV" reusing the fixed train/test mask above.
        n_folds = 1

        def __iter__(self):
            yield mask, ~mask

        def __len__(self):
            return 1

    est = Ridge()
    params = {'normalize': [True, False],
              'alpha': 10.0 ** np.arange(-7, 1, 1)}
    gs = grid_search.GridSearchCV(est, params, cv=KFold(),
                                  scoring='mean_squared_error').fit(x, y)
    print gs.grid_scores_
    print gs.best_score_
    est = GaussianProcess()
    params = {'corr': ['squared_exponential'],
              'theta0': MultivariateNormal(),
              }
    ## params = {'corr': ['squared_exponential'],
    ##           #'regr': ['constant', 'linear', 'quadratic'],
    ##           'theta0': np.arange(4, 11),
    ##           }
    # gs = grid_search.GridSearchCV(est, params, cv=KFold(),
    #                               loss_func=rmse).fit(x, y)
    gs = grid_search.RandomizedSearchCV(est, params, cv=KFold(),
                                        scoring='mean_squared_error',
                                        n_iter=100).fit(x, y)
    print gs.grid_scores_
    print gs.best_params_
    print gs.best_score_