def fine_tune_gradient_boosting_hyper_params(data_train_x, data_test_x, data_train_y, data_test_y):
    """Randomized hyper-parameter search for a GradientBoostingRegressor.

    Runs a 300-candidate RandomizedSearchCV with 5-fold CV on the training
    split, prints a timing summary and the search report.

    :param data_train_x: training features
    :param data_test_x: held-out features (unused here)
    :param data_train_y: training target
    :param data_test_y: held-out target (unused here)
    """
    from sklearn.ensemble import GradientBoostingRegressor
    # FIX: sklearn.grid_search was removed in scikit-learn 0.20;
    # model_selection is the maintained home of RandomizedSearchCV.
    from sklearn.model_selection import RandomizedSearchCV

    # FIX: py2 print statement -> print() function (works on py2 and py3).
    print("-- {} --".format("Fine-tuning Gradient Boosting Regression"))
    rf = GradientBoostingRegressor(n_estimators=1000)
    param_dist = {
        "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
        "max_depth": sp_randint(1, 15),
        # FIX: min_samples_split must be >= 2 (scikit-learn >= 0.18 raises
        # ValueError for 1), so the old lower bound could crash mid-search.
        "min_samples_split": sp_randint(2, 15),
        "min_samples_leaf": sp_randint(1, 15),
        "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "max_features": sp_randint(1, 15),
    }
    n_iter_search = 300
    random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=-1,
                                       cv=5, verbose=1)
    start = time()
    random_search.fit(data_train_x, data_train_y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    # NOTE(review): grid_scores_ was removed in sklearn 0.20; the module's
    # `report` helper appears to expect the legacy format — confirm.
    report(random_search.grid_scores_)
def sk_generate_params(method, columns=None):
    """Build a RandomizedSearchCV parameter-distribution dict for *method*.

    Recognises the substrings 'rf', 'svm' and 'lr'. When 'bagged' also
    appears in *method*, every key is prefixed with each column name
    ('col__key') and an ensemble 'weights' distribution is added.

    :param method: method identifier string
    :param columns: column names, required only for bagged methods
    :return: dict of parameter distributions (empty for unknown methods)
    """
    if 'rf' in method:
        param_dist = {
            "max_features": sp_unif(0.01, 1.0),
            "n_estimators": sp_randint(32, 256),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"],
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(2, 10),
            "min_weight_fraction_leaf": sp_unif(0., 0.5),
            "class_weight": ['balanced', 'balanced_subsample'],
        }
    elif 'svm' in method:
        param_dist = {"C": sp_unif(0.01, 20.), "kernel": ["linear"]}
    elif 'lr' in method:
        param_dist = {"C": sp_unif(0.01, 20.), "penalty": ["l1", "l2"]}
    else:
        param_dist = {}

    if 'bagged' not in method:
        return param_dist

    # Prefix every parameter with its column name for bagged estimators.
    bagged_dist = {
        '{}__{}'.format(col, key): dist
        for col in columns
        for key, dist in param_dist.items()
    }
    bagged_dist['weights'] = uniform_gen_5(0., 1.)
    return bagged_dist
def getRandomForestClf(self, X, Y, param_list):
    """Build a Random Forest classifier, optionally tuned by random search.

    When ``self._gridSearchFlag`` is set, hyper-parameters are found via
    ``self.doRandomSearch``; otherwise *param_list* (a params dict or None)
    is applied to a fresh classifier which is then fitted on X, Y.

    :param X: training features
    :param Y: training labels
    :param param_list: dict of RandomForestClassifier params, or None
    :return: the (search-tuned or fitted) classifier
    """
    clfName = "Random_Forest"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    # FIX: min_samples_split must be >= 2 in scikit-learn >= 0.18
    # (a value of 1 raises ValueError at fit time).
    clf = rf(n_estimators=300, max_depth=None, min_samples_split=2,
             random_state=0, bootstrap=True, oob_score=True)
    if self._gridSearchFlag:  # idiom: no explicit `== True`
        log(clfName + " start searching param...")
        tmpLowDepth = 8
        tmpHighDepth = 30
        param_dist = {
            "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
            "max_features": sp_randf(0, 1),
            # FIX: same >= 2 constraint as above.
            "min_samples_split": sp_randint(2, 11),
            "min_samples_leaf": sp_randint(1, 11),
            "criterion": ["gini", "entropy"],
            "n_estimators": sp_randint(5, 12),
        }
        clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
    elif param_list is not None:  # idiom: `is not None` over `not ... is None`
        clf = rf()
        clf.set_params(**param_list)
        clf.fit(X, Y)
    return clf
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    """Tune a random forest with randomized search (ROC AUC, 10-fold CV),
    print the validation AUC and best estimator, then save validation and
    test class-1 probabilities to CSV files keyed by *model_id*."""
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)

    forest = RandomForestClassifier(n_jobs=8)
    search_space = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        #"max_depth": sp_randint(3, 10000),
        #"min_samples_split": sp_randint(1, 300),
        #"min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    clf = RandomizedSearchCV(forest, param_distributions=search_space,
                             n_iter=50, cv=10, scoring='roc_auc')
    clf.fit(train_x, train_y)

    # Class-1 probabilities for both held-out sets.
    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions = clf.predict_proba(test_x)[:, 1]

    loss = roc_auc_score(valid_y, valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)

    valid_path = "./valid_results/valid_" + str(model_id) + ".csv"
    test_path = "./results/results_" + str(model_id) + ".csv"
    data.saveData(valid_id, valid_predictions, valid_path)
    data.saveData(test_id, test_predictions, test_path)
def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
    """Bagging-classifier wrapper.

    :param X: sparse feature matrix
    :param Y: label array
    :param tune_parameters: when True, define the randomized-search space
    """
    super().__init__(X, Y, tune_parameters)
    # Search space for RandomizedSearchCV; max_features is bounded by the
    # number of columns in X.
    if tune_parameters:
        self.param_dist_random = {'max_features': sp_randint(1, self.X.shape[1]),
                                  'n_estimators': sp_randint(1, 100)}
    # self.classifier / self.estimators / self.max_samples / self.max_features
    # are presumably set by the superclass __init__ — TODO confirm.
    self.clf = BaggingClassifier(self.classifier,
                                 n_estimators=self.estimators,
                                 n_jobs=8,
                                 max_samples=self.max_samples,
                                 max_features=self.max_features)
def train_cv():
    """Randomized-search a GradientBoostingClassifier on the processed
    Titanic training set and persist the best estimator to gbdt-cv.pkl."""
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]
    # ---------------------- train
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5, 1)
    n_estimate_dist = sp_randint(1000, 4800)
    max_depth_dist = sp_randint(1, 10)
    param_dist = dict(loss=loss,
                      learning_rate=learning_rate,
                      n_estimators=n_estimate_dist,
                      max_depth=max_depth_dist)
    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt,
                                  param_distributions=param_dist,
                                  n_iter=210, verbose=1, n_jobs=-1)
    # FIX: the originals were Python-2 print statements, a syntax error
    # under Python 3; converted to print() calls.
    print("--------------------- RandomizedSearchCV begins")
    searchcv.fit(Xtrain, ytrain)
    print("--------------------- RandomizedSearchCV ends")
    print("best score: ", searchcv.best_score_)
    print("best parameters: ", searchcv.best_params_)
    common.dump_predictor('gbdt-cv.pkl', searchcv.best_estimator_)
    print("--------------------- GBDT saved into file")
def get_random_grid_CV_params():
    """Define the Random Grid Search parameters for each model.

    :return: dict mapping model name ('Logistic', 'RandomForest',
             'AdaBoost_DT', 'GBC', 'SVC') to its RandomizedSearchCV
             parameter distributions.
    """
    logit_params = {"C": sp_expon(loc=0.001, scale=1),
                    "fit_intercept": [True, False],
                    "intercept_scaling": sp_randint(1, 5),
                    "warm_start": [False, True]}
    # FIX: min_samples_split must be >= 2 (scikit-learn >= 0.18 raises a
    # ValueError for 1); the previous lower bound of 1 could crash any
    # search iteration that sampled it.
    rf_params = {"min_samples_split": sp_randint(2, 50),
                 "min_samples_leaf": sp_randint(1, 50),
                 "criterion": ["gini", "entropy"],
                 "class_weight": ['balanced', 'balanced_subsample']}
    ada_dt_params = {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                     "algorithm": ['SAMME.R', 'SAMME']}
    gbc_params = {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                  "subsample": sp_uniform(loc=0.2, scale=0.8),
                  "max_features": [None, 'auto'],
                  "max_depth": sp_randint(2, 6)}
    svc_params = {"C": sp_expon(loc=0.001, scale=2),
                  "kernel": ['rbf', 'poly'],
                  "degree": sp_randint(2, 10),
                  "coef0": [0, 1, 2],
                  "shrinking": [True, False]}
    rnd_CV_param_distributions = {'Logistic': logit_params,
                                  'RandomForest': rf_params,
                                  'AdaBoost_DT': ada_dt_params,
                                  'GBC': gbc_params,
                                  'SVC': svc_params}
    return rnd_CV_param_distributions
def makeRandomCV(dataset, dbtype='CATH', level=1, k_iters=10, minsamples=500,
                 clf=ExtraTreesClassifier(n_estimators=5, class_weight='auto')):
    """Randomized hyper-parameter search for *clf* on a parsed database.

    :param dataset: raw dataset handed to dbParser
    :param dbtype: database type, e.g. 'CATH'
    :param level: hierarchy level passed to dbParser
    :param k_iters: number of random-search iterations
    :param minsamples: minimum samples per class for dbParser
    :param clf: estimator to tune (default ExtraTrees; note the default is
        evaluated once at import time, as in the original)
    """
    from scipy.stats import randint as sp_randint

    dataDict = dbParser(dataset, level=level, dbtype=dbtype, minsamples=minsamples)
    # FIX: py2 print statement -> print() function.
    print(dataDict)
    labels = dataDict['target_names']
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  # FIX: min_samples_split must be >= 2 in sklearn >= 0.18.
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    n_iter_search = k_iters
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    # NOTE(review): fits with y = target_names, mirroring the original;
    # the local `labels` is unused — confirm the intended target vector.
    random_search.fit(dataDict['vectors'], dataDict['target_names'])
    report(random_search.grid_scores_)
def best_ExtraTree(X, y):
    """Randomized search over an ExtraTreesClassifier.

    Splits X/y, tunes the forest by 10-iteration random search on accuracy,
    prints the best parameters and held-out accuracy, plots the confusion
    matrix, and returns the best parameter dict.
    """
    # FIX: sklearn.grid_search was removed in scikit-learn 0.20;
    # RandomizedSearchCV lives in sklearn.model_selection.
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), random_state=42)
    clf = ExtraTreesClassifier(max_depth=None, bootstrap=False)
    grid = {'n_estimators': sp_randint(250, 400),
            'min_samples_leaf': sp_randint(1, 12),
            'max_features': sp_randint(5, 50)}
    clf_rfc = RandomizedSearchCV(clf, n_jobs=4, n_iter=10,
                                 param_distributions=grid, scoring='accuracy')
    y_hat = clf_rfc.fit(X_train, y_train.ravel()).predict(X_test)
    print('Best Params: \n', clf_rfc.best_params_)
    print("Accuracy with Extra Forest = %4.4f" % accuracy_score(y_test.ravel(), y_hat))
    binarize_y_confustion_matrix(y_test.ravel(), y_hat)
    return(clf_rfc.best_params_)
def main():
    """Load component-recognition training data, standardise the features
    and run a randomized hyper-parameter search for each configured
    classifier, reporting the search scores."""
    NUM_TRAIN = bw_componentrecognition.NUM_TRAIN
    N_BINS = 23
    N_HU_MOMENTS = 7
    N_FEATURES = N_BINS + N_HU_MOMENTS

    X, y = bw_componentrecognition.Data.loadTrain(NUM_TRAIN, N_BINS)

    # Standardise features before fitting.
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    clfs = [RandomForestClassifier(n_estimators=20)]
    param_dists = [{
        "max_depth": [10, 5, 3, None],
        "max_features": sp_randint(1, 11),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }]

    n_iter_search = 25
    for estimator, distributions in zip(clfs, param_dists):
        # Randomized search for this estimator.
        searcher = RandomizedSearchCV(estimator,
                                      param_distributions=distributions,
                                      n_iter=n_iter_search)
        searcher.fit(X, y)
        report(searcher.grid_scores_)
def getKnnClf(self, X, Y):
    """Build a K-nearest-neighbours classifier, optionally tuned by
    random search when ``self._gridSearchFlag`` is set.

    :param X: training features
    :param Y: training labels
    :return: the (possibly search-tuned) classifier
    """
    clfName = "K_NN"
    # FIX: the doc link previously pointed at ExtraTreesClassifier.
    ## http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    clf = KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform',
        algorithm='auto',
        leaf_size=30,
        p=2,
        metric='minkowski',
        metric_params=None,
    )
    if self._gridSearchFlag:  # idiom: no explicit `== True`
        log(clfName + " start searching param...")
        # FIX: single-valued choices were listed twice (e.g.
        # ['uniform', 'uniform']), which only duplicated the same sample.
        param_dist = {
            "n_neighbors": sp_randint(4, 8),
            "weights": ['uniform'],
            "leaf_size": sp_randint(30, 60),
            "algorithm": ['auto'],
        }
        clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
    return clf
def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
    """Random-forest wrapper.

    :param X: sparse feature matrix
    :param Y: label array
    :param tune_parameters: when True, define the randomized-search space
    """
    super().__init__(X, Y, tune_parameters)
    # Search space for RandomizedSearchCV; max_features is bounded by the
    # number of feature columns minus one.
    if tune_parameters:
        self.param_dist_random = {'max_depth': sp_randint(1, 100),
                                  'min_samples_leaf': sp_randint(1, 100),
                                  'max_features': sp_randint(1, self.X.shape[1] - 1),
                                  'criterion': ['entropy', 'gini']}
    self.clf = RandomForestClassifier(n_estimators=100, n_jobs=8)
def deploy_07(train, test):
    """ Deploy 07: Ensemble modeling with two types of cross validation

    Builds a soft VotingClassifier over a RandomizedSearchCV and a
    GridSearchCV, both wrapping the same random forest, and hands it to
    create_submission for fitting and CSV output.
    """
    from operator import itemgetter
    from scipy.stats import randint as sp_randint
    from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier

    predictors = ['PassengerId','Gender', 'AgeRange', 'Family', 'AdjFare']

    # Data Munging
    train, test = deploy_03_features(train, test)
    train = train.fillna(0)
    test = test.fillna(0)
    X = train[predictors]
    y = train["Survived"]

    # Algorithm specs
    clf = RandomForestClassifier( n_estimators=100 )

    # specify parameters and distributions to sample from
    # NOTE(review): min_samples_split=1 is rejected by scikit-learn >= 0.18,
    # and sklearn.grid_search was removed in 0.20 — this function assumes a
    # legacy scikit-learn pin; confirm before reuse.
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 5),
                  "min_samples_split": sp_randint(1, 5),
                  "min_samples_leaf": sp_randint(1, 5),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    # run randomized search
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)

    # use a full grid over all parameters
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 5],
                  "min_samples_split": [1, 3, 5],
                  "min_samples_leaf": [1, 3, 5],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    # run grid search
    grid_search = GridSearchCV(clf, param_grid=param_grid)

    # Using both with a Voting Classifier; soft voting averages predicted
    # probabilities from the two searches.
    alg = VotingClassifier(estimators=[('gr', grid_search), ('rs', random_search)],
                           voting='soft')

    # Make submission
    create_submission(alg, train, test, predictors, "results/deploy-07.csv")
def learn_and_predict_xgb(self, dataset='train'):
    '''
    Use xgboost to do work

    When dataset == 'train': trains on the training DMatrix with fixed
    params, thresholds predicted probabilities at 0.5 and writes
    ./xgb_train.csv. Otherwise: trains the same model, predicts on the
    test DMatrix and writes both hard (xgboost_845.csv) and soft
    (xgboost_845_soft.csv) submissions.
    '''
    #predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Titles", "FamilyId"]
    predictors = self.PREDICTORS
    if dataset == 'train':
        # Search space kept for reference only — the randomized search
        # below is commented out and hand-picked params are used instead.
        param_dist = {'max_depth': sp_randint(3, 10),
                      'learning_rate': [0.01, 0.03, 0.1, 0.3, 1.0],
                      'gamma': [0, 0.1, 0.2, 0.3],
                      'subsample': [.1, .2, .3, .4, 0.5],
                      'colsample_bytree': [.4, .5],
                      'objective': ['binary:logistic'],
                      'n_estimators': sp_randint(20, 150),
                      }
        clf = xgb.XGBClassifier()
        #random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=500, cv=3)
        #random_search.fit(self.train_df[predictors], self.train_df['Survived'])
        #report(random_search.grid_scores_)
        params = {'max_depth': 6, 'learning_rate': 0.1,
                  'colsample_bytree': 0.5, 'n_estimators': 54,
                  'subsample': .3, 'gamma': 0,
                  'objective':'binary:logistic', 'eval_metric': 'auc'} #0.845, cv=3
        bst = xgb.train(params, self.DMatrix_train)
        # Threshold probabilities at 0.5 to get hard 0/1 labels.
        predictions = pd.Series(bst.predict(self.DMatrix_train))
        predictions[predictions >= .5] = 1
        predictions[predictions < .5] = 0
        predictions = [int(x) for x in predictions.tolist()]
        train_model = pd.DataFrame({
            'PassengerId': self.train_df['PassengerId'],
            'Survived': predictions,
        })
        train_model.to_csv('./xgb_train.csv', index=False)
    else:
        # Same fixed params as the training branch.
        params = {'max_depth': 6, 'learning_rate': 0.1,
                  'colsample_bytree': 0.5, 'n_estimators': 54,
                  'subsample': .3, 'gamma': 0,
                  'objective':'binary:logistic', 'eval_metric': 'auc'} #0.845, cv=3
        bst = xgb.train(params, self.DMatrix_train)
        #clf = xgb.XGBClassifier(params)
        #clf.fit(self.train_df[predictors], self.train_df['Survived'], verbose=True)
        #print(self.test_df[predictors])
        predictions = pd.Series(bst.predict(self.DMatrix_test))
        # Keep raw probabilities for the "soft" submission before
        # thresholding in place.
        predictions_proba = predictions.copy()
        predictions[predictions >= .5] = 1
        predictions[predictions < .5] = 0
        predictions = [int(x) for x in predictions.tolist()]
        print(predictions)
        submission = pd.DataFrame({
            'PassengerId': self.test_df['PassengerId'],
            'Survived': predictions
        })
        submission.to_csv("xgboost_845.csv", index=False)
        submission_proba = pd.DataFrame({
            'PassengerId': self.test_df['PassengerId'],
            'Survived': predictions_proba,
        })
        submission_proba.to_csv("xgboost_845_soft.csv", index=False)
def build_sample(regressor, name):
    """Randomized-search *regressor* and evaluate MAPE over 10 random
    90/10 splits of the module-level sample data, then predict on the test
    sample and save the predictions to '<name>.csv'.

    :param regressor: sklearn-style estimator to tune
    :param name: basename used for progress messages and the output CSV
    :return: predictions on the test sample
    """
    # print estimator.get_params().keys() : specify parameters and distributions to sample from
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  # FIX: min_samples_split must be >= 2 in sklearn >= 0.18.
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11)}#,
                  #"bootstrap": [True, False],
                  #"criterion": ["mse", "entropy"]}
    # run randomized search
    n_iter_search = 20
    random_search = RandomizedSearchCV(regressor, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    start = time()
    # repeat the CV procedure 10 times to get more precise results
    n = 10
    # for each iteration, randomly hold out 10% of the data as CV set
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            sample_X, sample_y, test_size=.10, random_state=i*SEED)
        random_search.fit(X_train, y_train)
        # predict on the held-out fold
        preds = random_search.predict(X_cv)
        mape_r = mape(y_cv, preds)
        # FIX: py2 print statements -> print() function.
        print("MAPE of (fold %d/%d) of %s is : %f" % (i+1, n, name, mape_r))
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    # predict on test
    predict_res = random_search.predict(sample_t)
    preds_on_test = DataFrame(list(zip(sample_id, predict_res)),
                              columns=["ID", "CODIS"])
    preds_on_test['ID'].astype(int)
    # save predictions
    store_csv(preds_on_test, name + ".csv")
    return predict_res
def build_nn(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a regression neural network model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: feature count; used only to name the pickle file
    :return: None
    """
    net = NeuralNet(layers=[('input', InputLayer),
                            ('hidden0', DenseLayer),
                            ('hidden1', DenseLayer),
                            ('output', DenseLayer)],
                    input_shape=(None, x_train.shape[1]),  # Number of i/p nodes = number of columns in x
                    hidden0_num_units=15,
                    hidden0_nonlinearity=lasagne.nonlinearities.softmax,
                    hidden1_num_units=17,
                    hidden1_nonlinearity=lasagne.nonlinearities.softmax,
                    output_num_units=1,  # Number of o/p nodes = number of columns in y
                    # NOTE(review): softmax over a single output unit always
                    # yields 1.0 — linear is the usual choice for
                    # regression; confirm this is intended.
                    output_nonlinearity=lasagne.nonlinearities.softmax,
                    max_epochs=100,
                    update_learning_rate=0.01,
                    regression=True,
                    verbose=0)

    # Finding the optimal set of params for each variable in the training of the neural network
    param_dist = {'hidden0_num_units': sp_randint(3, 30),
                  'hidden1_num_units': sp_randint(3, 30)}
    clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    # Persist the fitted search object, the raw net, all metrics and the
    # predictions in one pickle stream (read back in the same order).
    with open('../trained_networks/nn_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
def best_RandomForest(self, df=pd.DataFrame(), flag_interactions=False,
                      flag_clean_features=False, impute_func=None,
                      fill_test_func=None):
    """Randomized search for the best RandomForestClassifier.

    Optionally imputes the stored data, fits a 25-iteration
    RandomizedSearchCV (accuracy, 6-fold CV) on the training set, prints
    the best parameters and the test-set accuracy, and returns the best
    parameter dict.
    """
    df = self.df
    if impute_func:
        print('imputing data...')
        df, self.df_X_realtest = self.impute_data(
            df, self.df_X_realtest, impute_func, fill_test_func)

    print('get X, y from training set')
    (self.X, self.y) = self.ready_for_model_train(
        df, flag_interactions=flag_interactions,
        flag_clean_features=flag_clean_features)

    search_space = {'n_estimators': sp_randint(170, 350),
                    'min_samples_leaf': sp_randint(1, 12),
                    'max_features': sp_randint(2, 50),
                    'max_depth': sp_randint(5, 30),
                    'criterion': ['entropy', 'gini']}
    searcher = RandomizedSearchCV(RandomForestClassifier(bootstrap=False),
                                  n_jobs=4, n_iter=25, cv=6,
                                  param_distributions=search_space,
                                  scoring='accuracy')
    print('Finding the best parameters...')
    searcher.fit(self.X, self.y.ravel())

    print('preparing X, y from test set...')
    X_test, y_test = self.ready_for_model_test(self.df_X_realtest,
                                               flag_interactions)
    y_hat = searcher.predict(X_test)

    print('Best Params: \n')
    for param_name, param_value in searcher.best_params_.items():
        print(param_name, param_value)
    print("Accuracy with Random Forest = %4.4f" % accuracy_score(y_test, y_hat))
    #binarize_y_confustion_matrix(y_test, y_hat)
    return searcher.best_params_
def randomized_search_and_grid_search_for_hyperparameter_estimation(train_data, labels):
    """Compare RandomizedSearchCV and GridSearchCV on a small random forest,
    printing a timing summary and the top-3 parameter settings for each."""
    # build a classifier
    clf = RandomForestClassifier(n_estimators = 20)

    # Utility function to report best scores
    def report(grid_scores, n_top = 3):
        # grid_scores entries are legacy (parameters, mean, cv_scores)
        # records; sort by mean validation score, descending.
        top_scores = sorted(grid_scores, key = itemgetter(1), reverse = True)[:n_top]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  score.mean_validation_score,
                  np.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print("")

    # specify parameters and distributions to sample from
    # NOTE(review): min_samples_split=1 is rejected by scikit-learn >= 0.18
    # and grid_scores_ was removed in 0.20 — this code targets a legacy
    # scikit-learn version; confirm the pin before reuse.
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run randomized search
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    start = time()
    random_search.fit(train_data, labels)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    # use a full grid over all parameters
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [1, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run grid search
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    start = time()
    grid_search.fit(train_data, labels)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.grid_scores_)))
    report(grid_search.grid_scores_)
def __init__(self, training_iters=1000, cv_folds=5, scoring_method="f1", n_jobs=4):
    """Configure the predictor: base classifier, CV settings and the
    RandomizedSearchCV parameter space.

    :param training_iters: number of random-search iterations
    :param cv_folds: cross-validation fold count
    :param scoring_method: sklearn scorer name (default "f1")
    :param n_jobs: parallel jobs for the search
    """
    super(AdoptionPredictor, self).__init__()
    self.clf = RandomForestClassifier(class_weight="balanced_subsample")
    self.cv_params = {"folds": cv_folds, "scorer": scoring_method}
    self.n_iter = training_iters
    self.n_jobs = n_jobs
    # define param search space for RandomizedSearchCV
    self.param_grid = {"max_depth": [3, 5, 7, 9, None],
                       "n_estimators": sp_randint(10, 100),
                       "max_features": ["sqrt", "log2"],
                       "min_samples_split": sp_randint(3, 10),
                       "min_samples_leaf": sp_randint(1, 10),
                       "criterion": ["gini", "entropy"]}
def __init__(self, X: np.array, Y: np.array, tune_parameters=False):
    """TensorFlow DNN wrapper.

    :param X: feature matrix (densified below)
    :param Y: label array; the number of distinct labels sizes the output layer
    :param tune_parameters: when True, define the randomized-search space
    """
    super().__init__(X, Y, tune_parameters=False)
    self.X = X.todense()  # TensorFlow/Skflow doesn't support sparse matrices
    output_layer = len(np.unique(Y))
    # NOTE(review): `random.random(100)` is only valid if `random` here is
    # numpy.random (stdlib random.random takes no argument) — confirm the
    # module-level import.
    if tune_parameters:
        self.param_dist_random = {'learning_rate': random.random(100),
                                  'optimizer': ['Adam'],
                                  'hidden_units': [sp_randint(50, 500), sp_randint(50, 500)]}
    # self.hidden_units / self.steps / self.learning_rate / self.optimizer
    # are presumably set by the superclass __init__ — TODO confirm.
    self.clf = skflow.TensorFlowDNNClassifier(hidden_units=self.hidden_units,
                                              n_classes=output_layer,
                                              steps=self.steps,
                                              learning_rate=self.learning_rate,
                                              verbose=0,
                                              optimizer=self.optimizer)
def RFC_Classifier(Train_DS, y, Actual_DS, grid=True):
    """Random-forest classification with optional randomized search.

    When *grid* is True, a 20-candidate RandomizedSearchCV (log-loss) is
    run and its diagnostics printed; otherwise a small forest is K-fold
    validated and fitted directly. Returns class-probability predictions
    for the training and actual datasets.
    """
    print("***************Starting Random Forest Classifier***************")
    t0 = time()
    if grid:
        #used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")
        # specify parameters and distributions to sample from
        # NOTE(review): min_samples_split=1 and scoring='log_loss' are only
        # accepted by older scikit-learn releases (>=0.18 requires 2 and
        # 'neg_log_loss') — confirm the pinned version.
        param_dist = {
            "max_depth": [1, 2, 3, 4, 5, None],
            "max_features": sp_randint(1, 11),
            "min_samples_split": sp_randint(1, 11),
            "min_samples_leaf": sp_randint(1, 11),
            "bootstrap": [True, False]
        }
        clf = RandomForestClassifier(n_estimators=100, n_jobs=1)
        # run randomized search
        n_iter_search = 20
        clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                                 n_iter=n_iter_search, scoring = 'log_loss')
        start = time()
        clf.fit(Train_DS, y)
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time() - start), n_iter_search))
        report(clf.grid_scores_)
        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        print(clf.grid_scores_)
        print(clf.best_score_)
        print(clf.best_params_)
        print(clf.scorer_)
    else:
        clf = RandomForestClassifier(n_estimators=10, n_jobs=1)
        Kfold_score = Kfold_Cross_Valid(Train_DS, y, clf)
        clf.fit(Train_DS, y)
    #incase if it is required for stacking
    pred_Train = clf.predict_proba(Train_DS)
    #Predict actual model
    pred_Actual = clf.predict_proba(Actual_DS)
    print("Actual Model predicted")
    print("***************Ending Random Forest Classifier***************")
    return pred_Train, pred_Actual
def best_XGboost(self, df=pd.DataFrame(), flag_interactions=False,
                 flag_clean_features=False, impute_func=None,
                 fill_test_func=None):
    """Randomized search for the best XGBClassifier hyper-parameters.

    Optionally imputes the stored data, fits a 15-iteration
    RandomizedSearchCV (accuracy, 4-fold CV) on the training set, prints
    the best parameters and test-set accuracy, and returns the best
    parameter dict.
    """
    df = self.df
    if impute_func:
        print('imputing data...')
        df, self.df_X_realtest = self.impute_data(df, self.df_X_realtest,
                                                  impute_func, fill_test_func)
    print('get X, y from training set')
    (self.X, self.y) = self.ready_for_model_train(
        df, flag_interactions=flag_interactions,
        flag_clean_features=flag_clean_features)
    clf = XGBClassifier()
    grid = {'n_estimators': sp_randint(100, 600),
            'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
            'max_depth': sp_randint(5, 30),
            'min_child_weight': sp_randint(1, 5)}
    clf_rfc = RandomizedSearchCV(clf, n_jobs=3, n_iter=15, cv=4,
                                 param_distributions=grid, scoring='accuracy')
    print('Finding the best parameters...')
    clf_rfc.fit(self.X, self.y.ravel())
    print('preparing X, y from test set...')
    X_test, y_test = self.ready_for_model_test(self.df_X_realtest,
                                               flag_interactions)
    y_hat = clf_rfc.predict(X_test)
    print('Best Params: \n')
    for k, v in clf_rfc.best_params_.items():
        print(k, v)
    # FIX: the message previously said "Random Forest" although this method
    # evaluates XGBoost (copy-paste from best_RandomForest).
    print("Accuracy with XGBoost = %4.4f" % accuracy_score(y_test, y_hat))
    #binarize_y_confustion_matrix(y_test, y_hat)
    return(clf_rfc.best_params_)
def train(feature_names, estimators):
    """Train each estimator by randomized search over a shared GBM-style
    parameter space, print its best CV score and persist it.

    :param feature_names: feature columns (currently unused here)
    :param estimators: objects exposing train(df, dist, n_iter, n_jobs),
        target_column and dump()
    """
    traindf = pd.read_csv("train_extend.csv", index_col="datetime")
    parm_dist = dict(learning_rate=[0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.3],
                     n_estimators=sp_randint(100, 2000),
                     max_depth=sp_randint(3, 6),
                     min_samples_leaf=range(1, 10))
    n_iter = 300
    n_jobs = 6
    for estimator in estimators:
        best_cv_score = estimator.train(traindf, parm_dist, n_iter, n_jobs)
        # FIX: py2 print statement -> print() function.
        print("************* '%s' got best CV score: %f"
              % (estimator.target_column, best_cv_score))
        estimator.dump()
def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
    """SVC wrapper.

    :param X: sparse feature matrix
    :param Y: label array
    :param tune_parameters: when True, define the randomized-search space
    """
    super().__init__(X, Y, tune_parameters)
    # Search space for RandomizedSearchCV ('degree' only affects the
    # 'poly' kernel).
    if tune_parameters:
        self.param_dist_random = {'shrinking': [True, False],
                                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                  'degree': sp_randint(2, 5)}
    self.clf = SVC(kernel='rbf', shrinking=True)
def return_best_rf_regressor(df, target, num_trees_hyperparameter, num_trees_final_clf, num_iterations):
    """Hyper-parameter search on a sample of *df*, then fit a final
    RandomForestRegressor with the best params on the full data.

    :param df: source dataframe
    :param target: name of the target column
    :param num_trees_hyperparameter: forest size during the search
    :param num_trees_final_clf: forest size for the final fit
    :param num_iterations: random-search iterations
    :return: (fitted regressor, list of feature columns used)
    """
    # FIX: py2 print statements -> print() function throughout.
    print("entering return best rf regressor function")
    # Sample at most ~70% of large frames (>= 10k rows) to speed up search.
    if df.shape[0] < 10000:
        num_samples = df.shape[0]
    else:
        num_samples = int(df.shape[0] * 0.7)
    print("Sample dataframe")
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
        df, True, target, num_samples)

    # FIX: min_samples_split must be >= 2 in scikit-learn >= 0.18.
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(2, 15),
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True]}

    clf = RandomForestRegressor(n_estimators=num_trees_hyperparameter)
    print("starting hyperparameter search")
    clf_best, best_params = hyperparameter_search_random(X, y, clf, param_dist,
                                                         num_iterations)

    print("sample data for fitting model")
    # Refit on the entire dataset using the best parameters found.
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
        df, True, target, num_samples=df.shape[0])
    clf_final = RandomForestRegressor(n_estimators=num_trees_final_clf,
                                      max_depth=best_params["max_depth"],
                                      min_samples_leaf=best_params["min_samples_leaf"],
                                      min_samples_split=best_params["min_samples_split"],
                                      bootstrap=best_params["bootstrap"],
                                      max_features=best_params["max_features"])
    print("Fitting Random Forest Regressor")
    clf_final.fit(X, y)
    return clf_final, column_list_for_sampled
def grid_search(self, **kwargs):
    """Using grid search to find the best parameters.

    Keyword args: n_jobs, n_iter, features (column list), bids_path,
    score (sklearn scorer name). Cleans the bids data if needed, runs a
    RandomizedSearchCV over the forest parameters and prints a report.
    """
    n_jobs = kwargs.get('n_jobs', 1)
    n_iter = kwargs.get('n_iter', 5)
    col2fit = kwargs.get('features')
    bids_path = kwargs.get('bids_path', 'data/bids.csv')
    score = kwargs.get('score')
    # use a full grid over all parameters
    # NOTE(review): min_samples_split/leaf lower bound of 1 and
    # class_weight values "auto"/"subsample" are only valid on older
    # scikit-learn releases — confirm the pinned version.
    parameters = {"max_depth": sp_randint(1, 30),
                  "criterion": ["gini", "entropy"],
                  "max_features": [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
                  "min_samples_leaf": sp_randint(1, 25),
                  "min_samples_split": sp_randint(1, 25),
                  "bootstrap": [True, False],
                  "class_weight": [None, "auto", "subsample"]}
    # Clean/prepare the bids data only once per instance.
    if not self.iscleaned:
        print 'Preparing the data...'
        self.prepare_data(bids_path, **kwargs)
    else:
        print 'data frame is already cleaned...'
    train_values = self.df_train[col2fit].values
    target_values = self.df_train['outcome'].values
    pre_dispatch = '2*n_jobs'
    # Fit the grid
    print 'fitting the grid with n_jobs = {}...'.format(n_jobs)
    start = time()
    self.set_model(**kwargs)
    # `grid_search` here is the legacy sklearn module, not this method.
    rf_grid = grid_search.RandomizedSearchCV(self.learner, parameters,
                                             n_jobs=n_jobs, verbose=2,
                                             pre_dispatch=pre_dispatch,
                                             scoring=score,
                                             error_score=0,
                                             n_iter=n_iter)
    rf_grid.fit(train_values, target_values)
    print('Grid search finished')
    print("\n\nGridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(rf_grid.grid_scores_)))
    self.grid_report(rf_grid.grid_scores_, 15)
    print('\n\nBest score = {}'.format(rf_grid.best_score_))
    print('Best params = {}\n\n'.format(rf_grid.best_params_))
def getExtraTressClf(self, X, Y, param_list=-1):
    """Build an Extra-Trees classifier, optionally tuned by random search.

    When ``self._gridSearchFlag`` is set, hyper-parameters are found via
    ``self.doRandomSearch``; otherwise *param_list* (a params dict, or -1
    to skip) is applied to a fresh classifier which is fitted on X, Y.

    :param X: training features (DataFrame; its column count bounds max_depth)
    :param Y: training labels
    :param param_list: dict of ExtraTreesClassifier params, or -1
    :return: the (search-tuned or fitted) classifier
    """
    clfName = "Extra_Trees"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    clf = ExtraTreesClassifier(
        n_estimators=10, criterion='gini', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features='auto',
        max_leaf_nodes=None, bootstrap=False, oob_score=False,
        n_jobs=1, random_state=None, verbose=0, warm_start=False,
        class_weight=None)
    if self._gridSearchFlag:  # idiom: no explicit `== True`
        log(clfName + " start searching param...")
        tmpLowDepth = int(len(X.columns) * 0.7)
        tmpHighDepth = int(len(X.columns))
        # FIX: min_samples_split must be >= 2 in sklearn >= 0.18, and the
        # duplicated single-choice lists ([True, True]) are collapsed.
        param_dist = {
            "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
            "max_features": sp_randf(0, 1),
            "min_samples_split": sp_randint(2, 11),
            "min_samples_leaf": sp_randint(1, 11),
            "bootstrap": [True],
            "criterion": ["gini", "entropy"],
            "oob_score": [True],
            "n_estimators": sp_randint(800, 1200),
        }
        clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
    elif param_list != -1:
        # FIX: the original called ExtraTreesClassifier(param_list), which
        # passes the whole dict positionally as n_estimators; construct with
        # defaults and apply params via set_params (mirrors getRandomForestClf).
        clf = ExtraTreesClassifier()
        clf.set_params(**param_list)
        clf.fit(X, Y)
    return clf
def evalModel(train_data, eval_data, train_labels, eval_labels, seed):
    """Tune a RandomForestRegressor with RandomizedSearchCV using a
    PredefinedSplit so that only the eval rows ever form the validation
    fold (train rows are marked -1 and excluded from testing)."""
    joined_data = np.concatenate((train_data, eval_data), axis=0)
    joined_labels = np.concatenate((train_labels, eval_labels), axis=0)
    # test_fold == -1 marks rows that never appear in a validation fold.
    train_mask = np.zeros(train_data.shape[0]) - 1.0
    eval_mask = np.zeros(eval_data.shape[0])
    joined_mask = np.concatenate((train_mask, eval_mask), axis=0)
    ps = PredefinedSplit(test_fold=joined_mask)
    # RMSLE used as a loss => greater_is_better=False (scores negated).
    loss = make_scorer(get_rmsle, greater_is_better=False)
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    clf = RandomForestRegressor(random_state=seed, verbose=1)
    # clf.fit(train_data, train_labels)
    # preds = clf.predict(eval_data)
    # print(get_rmsle(eval_labels, preds))
    ## achieves 0.263
    # specify parameters and distributions to sample from
    # NOTE(review): min_samples_split=1 is rejected by scikit-learn >= 0.18
    # and grid_scores_ was removed in 0.20 — legacy scikit-learn assumed.
    param_dist = {
        "n_estimators": sp_randint(300, 800),
        "max_depth": sp_randint(10, 50),
        "max_features": ["auto", "sqrt", "log2"],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
    }
    # run randomized search
    n_iter_search = 60
    random_search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        cv=ps,
        scoring=loss,
        n_iter=n_iter_search,
        n_jobs=-1,
        pre_dispatch="n_jobs",
        verbose=2,
    )
    start = time()
    random_search.fit(joined_data, joined_labels)
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search)
    )
    report(random_search.grid_scores_)
def evalModel(data, labels):
    """Randomized-search an XGBRegressor with 5-fold CV on an RMSLE loss
    and report the top-5 parameter settings.

    :param data: training features
    :param labels: training target
    """
    loss = make_scorer(get_rmsle, greater_is_better=False)
    seed1 = 42
    clf = xgb.XGBRegressor(seed=seed1, silent=True)
    param_dist = {
        "learning_rate": sp_uniform(0.01, 0.1),
        "n_estimators": sp_randint(50, 500),
        "max_depth": sp_randint(2, 6),
        "subsample": sp_uniform(0.5, 0.4),
        "max_delta_step": sp_uniform(1, 2),
        "min_child_weight": sp_uniform(1, 6),
        "colsample_bytree": sp_uniform(0.8, 0.2),
    }
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       cv=5, scoring=loss,
                                       n_iter=n_iter_search, n_jobs=-1,
                                       pre_dispatch='n_jobs', verbose=2)
    # FIX: the search was never fitted, so `grid_scores_` could not exist
    # and the `data`/`labels` arguments were unused; fit before reporting.
    random_search.fit(data, labels)
    report(random_search.grid_scores_, n_top=5)
def createRandomSearch(clf, X, y):
    """Run a small randomized hyper-parameter search for a tree-ensemble
    classifier over X/y, printing timing and the best scores via report().

    clf : estimator accepting the tree parameters below.
    X, y : training features and labels.
    """
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  # FIX: lower bound raised from 1 to 2 — sklearn rejects
                  # min_samples_split < 2, so a draw of 1 crashed the search.
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    start = time()
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
          % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
# Split data into train/test by year pickups_14_15['day_of_week'] = pickups_14_15['day_of_week'].apply(str) pickups_14_15['hour'] = pickups_14_15['hour'].apply(str) pickups_14_15['month'] = pickups_14_15['month'].apply(str) # Create training data (2014) X_train = pickups_14_15[pickups_14_15['year'] == 2014] Y_train = X_train['passenger_count'] X_train.drop(['date', 'passenger_count', 'year'], axis=1, inplace=True) X_train = pd.get_dummies(X_train) # Try random forest model, randomized search for optimal parameters rf = RandomForestRegressor() # Specify parameters and distributions to sample from param_dist = {'n_estimators': sp_randint(1, 101), 'max_depth': [1, 2, 3, None], 'max_features': sp_randint(1, X_train.shape[1]), 'min_samples_split': sp_randint(1, 11), 'min_samples_leaf': sp_randint(1, 11), 'bootstrap': [True, False]} # Run randomized search # Try it on subset of training data to speed up search sample_index = X_train.sample(50000).index n_iter_search = 50 random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=n_iter_search) random_search.fit(X_train.ix[sample_index], Y_train.ix[sample_index]) random_search.best_params_ # Create test data (2015)
# Import data tweets = pd.read_csv('labeled_data.csv') grouped_tweets = tweets.groupby(['author', 'class'])['text'].apply(' '.join).reset_index() # Train_test_split X = grouped_tweets['text'] Y = grouped_tweets['class'] X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .25, random_state = 0) # Define the pipeline tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, max_features = 5000) rf_grid_pipe = Pipeline([('vect', tfidf),('fit', RandomForestClassifier())]) # Grid search param_grid = {"fit__bootstrap" : [True, False], "fit__n_estimators" : sp_randint(50, 150), "fit__max_depth" : [10, 50, None], "fit__max_leaf_nodes" : sp_randint(10, 50) } grid = RandomizedSearchCV(rf_grid_pipe, param_grid, cv=3, n_iter=20, n_jobs=14, random_state=0) grid.fit(X_train, y_train) preds = grid.predict(X_test) accuracy = accuracy_score(y_test, preds) precision = precision_score(y_test, preds) recall = recall_score(y_test, preds) f1 = f1_score(y_test, preds) # logs for comet_ml
# Hyper-parameter search spaces: *_gscv_param_grid dicts feed GridSearchCV
# (finite candidate lists), *_param_distro dicts feed RandomizedSearchCV
# (scipy distributions where a continuous/integer range is preferable).

# ElasticNet
EN_gscv_param_grid = dict(ElasticNetReg__alpha=[.1, 0.5, 1., 5., 10.],
                          ElasticNetReg__l1_ratio=[.05, .1, .5, .7, .9, .95])
EN_param_distro = dict(
    ElasticNetReg__alpha=EN_gscv_param_grid["ElasticNetReg__alpha"],
    ElasticNetReg__l1_ratio=sp_exp(scale=1))

# KNeighborsReg
KNR_gscv_param_grid = dict(KNeighborsReg__weights=['uniform', 'distance'],
                           KNeighborsReg__n_neighbors=[5, 10, 30, 50])
KNR_param_distro = dict(KNeighborsReg__weights=['uniform', 'distance'],
                        KNeighborsReg__n_neighbors=sp_randint(5, 50))

# RandomForestReg
RFR_gscv_param_grid = dict(
    RandomForestReg__n_estimators=[10, 30, 50, 100, 200, 500, 1000],
    RandomForestReg__criterion=['mse', 'mae'],
    RandomForestReg__max_features=[None, .75, .5, 'sqrt'],
    # RandomForestReg__max_features=[0.25, 'auto', 'sqrt', 'log2'],
    RandomForestReg__min_samples_leaf=[1, 3, 5],
    RandomForestReg__max_depth=[3, 5, 7, 10])
# FIX: copy before overriding.  The original aliased the grid dict, so the
# two sp_randint overrides below silently replaced the GridSearchCV lists
# with frozen distributions, corrupting RFR_gscv_param_grid.
RFR_param_distro = dict(RFR_gscv_param_grid)
RFR_param_distro['RandomForestReg__n_estimators'] = sp_randint(
    10, 500)  # 100, 1000
RFR_param_distro['RandomForestReg__max_depth'] = sp_randint(3, 10)
# gskf = list(StatifiedGroupK_Fold.StratifiedGroupKfold(n_splits=5).split(X_resampled, Y_resampled, groups)) sgkf = StatifiedGroupK_Fold.StratifiedGroupKfold(n_splits=5) params = { 'colsample_bytree': 0.9009933084016689, 'min_child_samples': 123, 'min_child_weight': 0.001, 'num_leaves': 40, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8426999443200605 } # GRID param_test = { 'num_leaves': sp_randint(6, 50), 'min_child_samples': sp_randint(100, 500), 'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4], 'subsample': sp_uniform(loc=0.2, scale=0.8), 'colsample_bytree': sp_uniform(loc=0.4, scale=0.7), 'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100], 'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100], #'feval': 'auc', } # This parameter defines the number of HP points to be tested n_HP_points_to_test = 100 # n_estimators is set to a "large value". The actual number of trees build will depend on early # stopping and 5000 define only the absolute maximum # clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metrics='none', n_jobs=-1, n_estimators=5000, class_weight='balanced')
# TF-IDF weighting over the raw counts, then LSA (truncated SVD) to 20 dims.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
pca = TruncatedSVD(n_components=20)
pca.fit(X_train_tfidf)
X_train_pca = pca.transform(X_train_tfidf)

# Test data transformations (transform only — transformers fitted on train).
X_test_count = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_count)
X_test_pca = pca.transform(X_test_tfidf)

clf = RandomForestClassifier()
# NOTE(review): max_features is sampled only up to 10 although the SVD keeps
# 20 components — confirm the intended upper bound.
parameters_rand = {
    "n_estimators": sp_randint(300, 2000),
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# run randomized search
# Accuracy should be comparable to grid search, but runs much much faster
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=parameters_rand,
                                   n_iter=n_iter_search, n_jobs=-1)
# Print every candidate that achieved the current rank (ties share a rank).
# NOTE(review): 'i' and 'candidates' come from an enclosing loop that is not
# visible in this excerpt.
for candidate in candidates:
    print("Model with rank: {0}".format(i))
    print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
        results['mean_test_score'][candidate],
        results['std_test_score'][candidate]))
    print("Parameters: {0}".format(results['params'][candidate]))
    print("")

# specify parameters and distributions to sample from
# param_dist = { "n_estimators": [50,100,150,200],
#                "learning_rate": [0.1,0.5,1,1.5,2],
#              }
param_dist = {
    "max_depth": sp_randint(1, 15),
    "n_estimators": [100, 150, 200, 400, 500],
    "min_samples_split": sp_randint(2, 10),
    "min_samples_leaf": sp_randint(1, 10)
}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# In[2]: n_iter = 100 k_fold = 10 # cv = kfold # initialize the classifier X_train, X_val, y_train, y_val, cv = load_train_and_kfold(n_folds=k_fold) # In[4]: model = KNeighborsClassifier() model_name = model.__class__.__name__ param_grid = { "n_neighbors": sp_randint(4,400), "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"], } # In[ ]: search_GB = RandomizedSearchCV(model,param_grid,scoring='log_loss',n_jobs=-1, n_iter=n_iter,cv=cv,verbose=True) search_GB.fit(X_train,y_train.flatten()) # In[ ]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print the n_top best entries of a pre-0.20 sklearn grid_scores_
    list, sorted by mean validation score (item index 1), best first."""
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

# specify parameters and distributions to sample from
param_dist = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 30),
    # FIX: lower bound raised from 1 to 2 — sklearn rejects
    # min_samples_split < 2, so a draw of 1 crashed the search.
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "n_estimators": sp_randint(1, X_train.shape[1]),
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(X_train, y_train)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates"
    " parameter settings." % ((time() - start), n_iter_search))
print("Parameters: {0}".format(score.parameters)) print("") print("Starting RandomizedSearchCV") n_features = X_train.shape[1] N_FOLDS = 10 model = xgb.XGBRegressor() # specify parameters and distributions to sample from param_dist = { "objective": ["reg:linear"], # "booster" : ["gbtree"], # "eta": [0.1, 0.3, 0.5, 0.7], "max_depth": sp_randint(10, 30), "subsample": sp_uniform(0.1, 0.9), "colsample_bytree": sp_uniform(0.1, 1.0), "silent": [1], "seed": [42] } # run randomized search n_iter_search = 30 folds = cv.KFold(n=len(y_train), n_folds=N_FOLDS, shuffle=True, random_state=42) random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search,
aid_mimic.load_candidates(mimic_files['candidates']) # aid_mimic.compute_features(features='filtered') # aid_mimic.save_features(mimic_files['feature_values']) aid_mimic.load_features(mimic_files['feature_values']) aid_mimic.load_y_true(mimic_files['y_true']) # ____________________________________________________________________________________________________________________ sample_weights = compute_sample_weight('balanced', aid_mimic.y_true) tuning_params = [ { 'name': "Nearest Neighbors", 'predictor': make_sklearn_pipeline(KNeighborsClassifier()), 'parameters': { 'clf__n_neighbors': sp_randint(2, 20), 'clf__weights': ['uniform', 'distance'], }, 'n_iter': 1000, 'fit_params': None, }, { 'name': "Linear SVM", 'predictor': make_sklearn_pipeline( SVC(kernel="linear", class_weight='balanced', random_state=1)), 'parameters': { 'clf__C': sp_expon(), }, 'n_iter':
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

# Candidate classifier whose hyper-parameters will be sampled.
model = RandomForestClassifier()

# Search space for RandomizedSearchCV: plain lists are sampled uniformly,
# sp_randint(lo, hi) draws an integer from [lo, hi) on every candidate.
param_dist = dict(
    n_estimators=[10, 20, 30, 40, 50],
    max_depth=[3, 5, 7, 9, 10, None],
    max_features=sp_randint(1, 20),
    min_samples_split=sp_randint(2, 12),
    min_samples_leaf=sp_randint(1, 12),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)
# Pretty-print the top-ranked models (pre-0.20 sklearn grid_scores_ objects).
# NOTE(review): 'top_scores' comes from an enclosing report() helper that is
# not visible in this excerpt.
for i, score in enumerate(top_scores):
    print("Model with rank: {0}".format(i + 1))
    print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
        score.mean_validation_score,
        np.std(score.cv_validation_scores)))
    print("Parameters: {0}".format(score.parameters))
    print("")

#In the following randomSearchCV/GridSearchCV accept parameters values as dictionaries
#In example given below we have constructed dictionary for different parameter values
#that we want to try for randomForest model (We are setting up the grid search parameters here)
param_dist = {
    "n_estimators": [10, 100, 500, 700],
    "max_depth": [3, 5, None],
    "max_features": sp_randint(5, 11),
    "min_samples_split": sp_randint(5, 11),
    "min_samples_leaf": sp_randint(5, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

#Total number of combinations we would get from above is 4 X 3 X 7 X 7 X 7 X 2 X 2 which would take a lot
#of time to run and hence we would need to randomly choose certain combinations from the above

#Running the randomized Search (We are randomly picking 100 combinations out of the above mentioned
#combinations)
n_iter_search = 100
#n_iter parameter of RandomizedSearchCV controls, how many parameter combination will be tried; out
#of all possible given values, we are also setting the param_distributions parameter as param_dist
#Which contains all the combinations of parameters
# Instantiate the regressor class that matches the winning algorithm name.
if best_algo == 'SVR':
    algo = getattr(sklearn.svm, best_algo)(gamma='auto')

if best_algo == 'MLPRegressor':
    algo = getattr(sklearn.neural_network, best_algo)()

if best_algo == 'XGBRegressor':
    algo = getattr(xgboost, best_algo)()

if best_algo == 'KNeighborsRegressor':
    algo = getattr(sklearn.neighbors, best_algo)()

## Begin the tune by setting hyper parameter limits per algorithm
# NOTE(review): 'penalty'/'C' are LogisticRegression parameters, not
# LinearRegression ones — this grid looks copy-pasted; confirm intent.
if best_algo == 'LinearRegression':
    hyperparameters = {"penalty": ["l1", "l2"], "C": sp_randint(1, 10)}
    scoring_metric = make_scorer(explained_variance_score)

# NOTE(review): roc_auc_score is a classification metric; scoring a
# regressor with it will fail at search time — confirm intent.
if best_algo == 'SGDRegressor':
    hyperparameters = {
        'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
        "learning_rate": ["constant", "optimal", "invscaling", "adaptive"]
    }
    scoring_metric = make_scorer(roc_auc_score)

# All bagging/boosting-style regressors share a single n_estimators range.
if (best_algo == 'RandomForestRegressor') or (best_algo == 'AdaBoostRegressor') or (
        best_algo == 'GradientBoostingRegressor') or (best_algo == 'BaggingRegressor'):
    hyperparameters = {"n_estimators": sp_randint(1, 1000)}
def report(grid_scores, n_top=3):
    """Print the n_top best entries of a pre-0.20 sklearn grid_scores_
    list, sorted by mean validation score (item index 1), best first."""
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

xgbnew = XGBRegressor()

# specify parameters and distributions to sample from
one_to_left = st.beta(10, 1)          # NOTE(review): defined but never used
from_zero_positive = st.expon(0, 50)  # exponential, shared by two params

param_dist = {
    "n_estimators": sp_randint(80, 120),
    "max_depth": sp_randint(2, 15),
    "learning_rate": st.uniform(0.05, 0.1),
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(xgbnew, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(x_train2num, y_train2num)
def test(self):
    """Benchmark four imbalanced-ensemble classifiers across a grid of
    imbalance ratios and class-separability settings, appending the J1
    estimates and CV scores of every run to Ensembler.txt."""
    im_ratio = [10, 100, 200]   # imbalance ratios to test
    sep_condition = [1, 4, 8]   # class-separability settings
    # Search space for BalancedRandomForestClassifier ('test__' = pipeline step).
    params_dict1 = {
        'test__max_depth': [None],
        'test__max_features': sp_randint(1, 3),  #Number of features per bag
        'test__min_samples_split': sp_randint(2, 100),  #Min number of samples in a leaf node split
        'test__min_samples_leaf': sp_randint(1, 100),  #Min number of samples in a leaf node
        'test__bootstrap': [True, False],  #Sample \mathbf{x}x with/without replacement
        'test__n_estimators': [1, 2, 5, 10, 50, 75, 100, 250, 500, 1000],  #Number of trees in the forest
        "test__n_jobs": [-1],
        "test__class_weight": ["balanced", "balanced_subsample"]
    }
    # Search space for EasyEnsembleClassifier.
    params_dict2 = {
        'test__n_estimators': [1, 2, 5, 10, 50, 75, 100, 250, 500],
        "test__n_jobs": [-1],
        "test__replacement": [True, False]
    }
    # Search space for RUSBoostClassifier.
    params_dict3 = {
        'test__n_estimators': [1, 2, 5, 10, 50, 75, 100],
        "test__algorithm": ["SAMME", "SAMME.R"],
        "test__replacement": [True, False]
    }
    # Search space for BalancedBaggingClassifier (bags an RBF-SVC base estimator).
    params_dict4 = {
        'test__n_estimators': [1, 2, 5, 10, 50, 75, 100, 250, 500, 1000],
        "test__n_jobs": [-1],
        "test__replacement": [True, False],
        "test__base_estimator": [
            SVC(C=1.0, cache_size=200, class_weight="balanced", coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma='auto',
                kernel='rbf', max_iter=-1, probability=False,
                random_state=None, shrinking=True, tol=0.001, verbose=False)
        ],
        "test__max_samples": sp_unif(0, 1),
        "test__max_features": sp_unif(0, 1)
    }
    # (display name, classifier class, search space) triples.
    ensembler = [("BalancedRandomForestClassifier",
                  BalancedRandomForestClassifier, params_dict1),
                 ("EasyEnsembleClassifier", EasyEnsembleClassifier,
                  params_dict2),
                 ("RUSBoostClassifier", RUSBoostClassifier, params_dict3),
                 ("BalancedBaggingClassifier", BalancedBaggingClassifier,
                  params_dict4)]
    # with open("Ensembler.txt", "a") as f:
    #     f.write("Strategy\timbalanced_ratio\tclass_separability\tk_J1\tu_J1\tcv_score\tcv_gms\treplace_score\treplace_gms\n")
    for classifier in ensembler:
        for sep in sep_condition:
            for ratio in im_ratio:
                X_train, Y_train, X_test, Y_test = self.load_data(
                    ratio=ratio, sep=sep)
                batch = len(X_train)  # NOTE(review): computed but unused
                for i in range(1):
                    self.classifier = classifier[1]
                    self.params_dict = classifier[2]
                    self.X_train, self.Y_train, self.X_test, self.Y_test = X_train[
                        i], Y_train[i], X_test[i], Y_test[i]
                    # J1 class-separability estimates (known / unknown split).
                    k_J1, u_J1 = self.J1_estimate(self.X_train, self.Y_train,
                                                  self.X_test, self.Y_test)
                    f1_cv, g_cv, f1_rp, g_rp = self.phase2(
                        self.X_train, self.Y_train)
                    # Append one tab-separated result row per run.
                    with open("Ensembler.txt", "a") as f:
                        f.write(
                            "{}\t{}\t{}\t{:.2f}\t{:.2f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n"
                            .format(classifier[0], ratio, sep, k_J1, u_J1,
                                    f1_cv, g_cv, f1_rp, g_rp))
print("TRAIN:", train_index, "TEST:", test_index) doc_train = doc_data[train_index] doc_test = doc_data[test_index] X_train, y_train = utils.convert_docs_to_lines(doc_train) X_test, y_test = utils.convert_docs_to_lines(doc_test) argument_sets += [(X_train, X_test, y_train, y_test)] text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('pca', TruncatedSVD(n_components=20)), ('clf', RandomForestClassifier(n_jobs=-1))]) param_distributions = { "vect__ngram_range": [(1, 3)], "pca__n_components": sp_randint(20, 400), "clf__n_estimators": sp_randint(100, 2000), "clf__max_features": sp_randint(1, 8), "clf__min_samples_leaf": sp_randint(1, 6), # "clf__class_weight": [ # {0: 1, 1: 1.5, 2: 1.75}, # {0: 1, 1: 2, 2: 3}, # {0: 1, 1: 3, 2: 5}, # ], "clf__criterion": ["entropy", "gini"] } n_iter_search = 10 random_search = RandomizedSearchCV(text_clf, param_distributions=param_distributions, n_iter=n_iter_search,
# Features are all columns but the last; the last column is the label.
trainData = df.ix[:, :-1].values
trainLabels = df.ix[:, -1].values

dataset = "output/fungi/googlenetwithcontrol-test.csv"
# Loading dataset
df = pd.read_csv(dataset)
testData = df.ix[:, :-1].values
testLabels = df.ix[:, -1].values

#================================================================================================================
print("MultiLayer Perceptron")
#================================================================================================================
from sklearn.neural_network import MLPClassifier
from scipy.stats import uniform as sp_uniform

clf = MLPClassifier(random_state=84)
n_iter_search = 20
param_dist = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    # FIX: 'lbgfs' was a typo for 'lbfgs' and made sklearn raise on every
    # candidate that drew it.
    'solver': ['lbfgs', 'sgd', 'adam'],
    # FIX: sp_randint(0.0001, 1) samples integers and therefore always
    # returned 0; alpha (L2 penalty) is continuous, so draw it uniformly
    # from [0.0001, 1.0001) instead.
    'alpha': sp_uniform(0.0001, 1),
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'momentum': [0.9, 0.95, 0.99]
}
model = RandomizedSearchCV(clf, param_distributions=param_dist,
                           n_iter=n_iter_search)
model.fit(trainData, trainLabels)
predictionsMLP = model.predict(testData)
print(classification_report(testLabels, predictionsMLP))
print(accuracy_score(testLabels, predictionsMLP))
# Hold out 30% of the data for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42,
                                                    test_size=0.3)
print(x_train.shape, x_test.shape)

# RandomSearch
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint  # stats package -> random values from an interval for RandomSearch

# List of the possible parameters, with
# sp_randint(1,15) -> a random integer between 1 and 15.  The interval should
#                     nevertheless be chosen carefully
# "weights": ["uniform", "distance"] -> random choice between uniform and distance
# "p": [1, 2] -> random choice between p = 1 or 2
param_dist = {
    "n_neighbors": sp_randint(1, 15),
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}
n_iter_search = 20  # 20 candidate models will be built
neigh = KNeighborsClassifier()
clf = RandomizedSearchCV(
    neigh, param_distributions=param_dist, n_iter=n_iter_search, cv=3
)  # CV (cross-validation): for small datasets CV ~ 3; for large datasets (several 1000 rows) CV = 5...10
clf.fit(x_train, y_train)
for key in clf.cv_results_.keys(
):  # lists the result attributes that can be queried
    print(key)
bestParams = [] cv = StratifiedShuffleSplit(n_splits=10, test_size=0.6, random_state=0) for grp in all_train_data: print 'Working on group : %s' % (grp) # get some data X = all_train_data[grp]['features'].values.astype(np.float32) y = all_train_data[grp]['labels'].astype(np.int16) # build a classifier clf = RandomForestClassifier() # specify parameters and distributions to sample from param_dist = { "n_estimators": sp_randint(1, 1000), "max_depth": sp_randint(3, 303), "max_features": sp_randint(1, 350), "min_samples_split": sp_randint(2, 350), "min_samples_leaf": sp_randint(1, 350), "bootstrap": [True, False], "criterion": ["gini", "entropy"] } # run randomized search n_iter_search = 5000 random_search = RandomizedSearchCV(clf, scoring='neg_log_loss', param_distributions=param_dist, n_iter=n_iter_search, cv=cv,
# Hence, the default parameters are: # * criterion (default=‘mse’). The function to measure the quality of a split. # * splitter (default=‘best’). The strategy used to choose the split at each node. # * min_samples_leaf (default=1). The minimum number of samples required to be at a leaf node. # * min_weight_fraction_leaf (default=0.). The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. # * random_state (default=None). The random number generator is the RandomState instance used by np.random. # * max_leaf_nodes (default=None). Unlimited number of nodes. # * min_impurity_decrease (default=0.). A node will be split if this split induces a decrease of the impurity greater than or equal to 0. # * min_impurity_split (default=1e-7). Threshold for early stopping in tree growth. # In[10]: from scipy.stats import randint as sp_randint param_grid = { 'max_depth': sp_randint(2, 20), 'min_samples_split': sp_randint(2, 20) } from sklearn.model_selection import RandomizedSearchCV rgt_grid = RandomizedSearchCV(rgt, param_grid, scoring='neg_mean_squared_error', cv=tr_val_partition, n_jobs=1, verbose=1) rgt_grid.fit(X_train_minmax, y_train) y_test_pred_rgt_G = rgt_grid.predict(X_test_minmax)
class initialConfig:
    """Central configuration for the ML experiments: CV fold counts,
    parallelism, dataset path, classification method and the
    hyper-parameter search strategy."""

    ## The following parameters correspond to the machine learning
    ## part of the framework.

    # This parameter refers to the number of outer folds that
    # are being used in order for the k-fold cross-validation
    # to take place.
    kfold_parameter = 5

    # Inner folds used for the nested (hyper-parameter selection) CV.
    kfold_inner_parameter = 4

    # Number of parallel jobs to be initiated:
    # -1: use all processors
    # int: no of processors to use
    n_jobs = -1

    test_dataset = './datasets/dataset-string-similarity_original_1k.csv'
    # test_dataset = './datasets/dataset-string-similarity_latin_EU_NA_1k.txt'
    # test_dataset = './datasets/dataset-string-similarity-100.csv'

    # the classification method used: basic, basic_sorted, lgm
    classification_method = 'lgm'

    # This parameter contains a list of the various classifiers
    # the results of which will be compared in the experiments.
    # classifiers = ['SVM', 'Decision Tree', 'Random Forest', 'AdaBoost',
    #                'Naive Bayes', 'MLP', 'Gaussian Process', 'Extra Trees']

    # Search Method to use for best hyperparameters: randomized, grid, hyperband - not yet implemented!!!
    hyperparams_search_method = 'randomized'

    # These are the parameters that constitute the search space for GridSearchCV
    # in our experiments.
    # GridSearchCV search spaces, one class attribute per classifier family.
    SVM_hyperparameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
         'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [300]},
        {'kernel': ['poly'], 'degree': [1, 2, 3, 4], 'gamma': ['scale'],
         'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [300]},
        {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
         'gamma': ['scale'], 'max_iter': [300]}
    ]
    DecisionTree_hyperparameters = {
        'max_depth': [i for i in range(1, 33)],
        'min_samples_split': list(np.linspace(0.1, 1, 10)),
        'min_samples_leaf': list(np.linspace(0.1, 0.5, 5)),
        'max_features': [i for i in range(1, 10)]
    }
    RandomForest_hyperparameters = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 100, None],
        'criterion': ['gini', 'entropy'],
        'max_features': ['log2', 'sqrt'],  # auto is equal to sqrt
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        "n_estimators": [250, 500, 1000]
    }
    XGBoost_hyperparameters = {
        "n_estimators": [500, 1000, 3000],
        # 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # hyperparameters to avoid overfitting
        'eta': list(np.linspace(0.01, 0.2, 10)),  # 'learning_rate'
        'gamma': [0, 1, 5],
        'subsample': [0.8, 0.9, 1],
        'colsample_bytree': list(np.linspace(0.3, 1, 8)),
        'min_child_weight': [1, 5, 10],
    }
    MLP_hyperparameters = {
        'learning_rate_init': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
        'max_iter': [300, 500, 1000],
        'solver': ['sgd', 'adam']
    }

    # These are the parameters that constitute the search space for RandomizedSearchCV
    # in our experiments.
    # RandomizedSearchCV search spaces: scipy distributions where a
    # continuous / integer range is preferable to a fixed candidate list.
    SVM_hyperparameters_dist = {
        'C': expon(scale=100),
        'gamma': expon(scale=.1),
        'kernel': ['rbf'],
        'class_weight': ['balanced', None]
    }
    DecisionTree_hyperparameters_dist = {
        'max_depth': sp_randint(10, 100),
        'min_samples_split': list(np.linspace(0.1, 1, 50)),
        'min_samples_leaf': list(np.linspace(0.1, 0.5, 25)),
        'max_features': sp_randint(1, 11),
    }
    RandomForest_hyperparameters_dist = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 100, None],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2'],  # sp_randint(1, 11)
        'min_samples_leaf': sp_randint(1, 5),
        'min_samples_split': sp_randint(2, 11),
        "n_estimators": sp_randint(250, 1000),
    }
    XGBoost_hyperparameters_dist = {
        "n_estimators": sp_randint(500, 4000),
        # 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # hyperparameters to avoid overfitting
        'eta': expon(loc=0.01, scale=0.1),  # 'learning_rate'
        'gamma': [0, 1, 5],
        'subsample': truncnorm(0.7, 1),
        'colsample_bytree': truncnorm(0, 1),
        'min_child_weight': [1, 5, 10],
    }
    MLP_hyperparameters_dist = {
        'learning_rate_init': expon(loc=0.0001, scale=0.1),
        'max_iter': [300, 500, 1000],
        'solver': ['sgd', 'adam']
    }

    # Cap on solver iterations; presumably consumed elsewhere — confirm usage.
    max_iter = 250
import psycopg2 as pg
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sqlalchemy import create_engine

# Score thresholds; NOTE(review): semantics inferred from the names
# (alert an agent / lock the account) — confirm with the consuming code.
ALERT_AGENT_THRESHOLD = .15
LOCK_ACCOUNT_THRESHOLD = .3

# Local Postgres connection (credentials redacted in this snapshot).
connection = pg.connect(
    host='localhost',
    port=54320,
    dbname='ht_db',
    user='******'
)
engine = create_engine('postgresql://*****:*****@127.0.0.1:54320/ht_db')

# Filesystem layout used by the pipeline.
data_path = '../data'
model_path = 'artifacts'
schema_path = '../misc/schemas.yaml'

# LGBM RandomSearch parameters
param_test = {'num_leaves': sp_randint(6, 50),
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
              'min_data_in_leaf': sp_randint(100, 3000),
              'max_bin': sp_randint(150, 400),
              'scale_pos_weight': sp_randint(2, 90)}
def ModelFit():
    """Tune, evaluate and persist a random-forest classifier: randomized
    search (ROC-AUC, 10-fold CV) on X_train/y_train, metrics and an ROC
    plot on the test split, then pickle the search and plot the top
    feature importances.  Sets the module-level `best_model`."""
    global best_model

    #contruct hyperparameter grid
    param_dist = {"max_depth": [3, 10, 20, 70, None],
                  "max_features": [2, 10, 41, 80, 'sqrt'],
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  #"bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "n_estimators": [100, 300, 500, 800, 1000]}
    pprint(param_dist)

    #define random forest classifier function
    rf = RandomForestClassifier(random_state = 120)

    #search across 1000 randomized combinations in the above grid
    estimator = RandomizedSearchCV(estimator = rf,
                                   param_distributions = param_dist,
                                   n_iter = 1000, cv = 10, verbose = 10,
                                   random_state = 12, scoring = 'roc_auc',
                                   n_jobs = -1)

    #fit the model
    grid_result = estimator.fit(X_train, y_train)

    #find and define best estimator based on grid search
    best_model = grid_result.best_estimator_
    print('\nbest_model:\n', best_model)

    #predict y based on test data
    y_pred = grid_result.predict(X_test)

    #accuracy score
    print('accuracy score:', accuracy_score(y_test, y_pred))

    #confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(tn,fp,fn,tp)

    #classification report
    print('\nclassification report:\n',classification_report(y_test, y_pred))

    #AUC and ROC curve (probability of the positive class, column 1)
    y_pred_prob = grid_result.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_pred_prob)
    print('auc:', auc)
    false_positive, true_positive, _ = roc_curve(y_test, y_pred_prob)
    font = {'fontname':'Helvetica'}
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.plot(false_positive, true_positive, color='black')
    plt.xlabel('False positive rate', **font)
    plt.ylabel('True positive rate', **font)
    plt.savefig('feces_roc.png', dpi=300)
    plt.show()

    # Save the model as a pickle in a file
    joblib.dump(grid_result, 'campy_rf_feces.pkl')

    #determine best features
    feature_importances = grid_result.best_estimator_.feature_importances_
    column_names=list(feces)
    # NOTE(review): -0 == 0, so this deletes the FIRST column name; if the
    # intent was to drop the label (last) column it should be [-1] — confirm.
    del column_names[-0]
    importance = pd.DataFrame(feature_importances, index=column_names,
                              columns=["Importance"])
    sort_importance = importance.sort_values(by=['Importance'], ascending = False)
    sort_column_names = sort_importance.index.values.tolist()
    # Rescale importances so the top feature reads 100.
    mult = 100/(sort_importance['Importance'].iloc[0])
    sort_imp_mult = sort_importance * mult
    top_imp = sort_imp_mult['Importance'].iloc[0:15].tolist()
    top_column_names = sort_column_names[0:15]
    # NOTE(review): the computed names are immediately overwritten by this
    # hard-coded list — confirm which labels should appear on the plot.
    top_column_names = ['AvgMaxGustSpeed1.6', 'AvgAverageHumidity1.7',
                        'AverageHumidityTwoDayBefore', 'AvgMaxGustSpeed1.3',
                        'AvgMaxGustSpeed1.5', 'AvgMinTemperature1.7',
                        'AvgMaxWindSpeed1.7', 'AvgMinHumidity1.4',
                        'AvgMaxHumidity1.3', 'AvgPrecipitation1.4',
                        'MaxGustSpeedOneDayBefore', 'AvgMaxGustSpeedS1.2',
                        'AvgMaxWindSpeed1.4', 'AvgAverageHumidity1.3',
                        'MaxGustSpeedTwoDayBefore']
    plt.rcParams.update(plt.rcParamsDefault)
    y_ticks = np.arange(0, len(top_column_names))
    fig, ax = plt.subplots()
    ax.barh(y_ticks, top_imp, color = "dimgray")
    # NOTE(review): labels are set before set_yticks; newer matplotlib warns
    # about this ordering — kept as-is to preserve behavior.
    ax.set_yticklabels(top_column_names, **font)
    ax.set_yticks(y_ticks)
    plt.xlabel('Relative Importance', **font)
    fig.tight_layout()
    plt.gca().invert_yaxis()
    plt.savefig('feces_var.png', dpi=300)
    plt.show()
    return
y, scoring=self.scoring, cv=self.cv).mean() print("Model with rank: {0}".format(i)) print("Pred score: {0}".format(pred[idx])) print("Mean validation score: {0:.3f}".format(score_mean)) print("Parameters: {0}".format(param)) print("") if __name__ == '__main__': # get some data digits = load_digits() X, y = digits.data, digits.target clf = RandomForestClassifier(n_estimators=20) param_dist = { "max_depth": sp_randint(1, 11), "max_features": sp_randint(1, 11), "min_samples_split": sp_randint(2, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"] } rpt = RPTune(clf, param_distributions=param_dist, n_iter='auto', n_jobs=-1, random_state=42, scoring='accuracy') rpt.fit(X, y)
file.close() CLASSIFIER_MAPPING = { # clf - classifier # prep - according prepare data method # dist - specify parameters and distributions to sample from # grid - use a full grid over all parameters 'perceptron': { 'clf': linear_model.Perceptron, 'prep': prepare_two_class_data, 'dist': { 'penalty': [None, 'l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.001, 0.01, 0.1, 1], 'fit_intercept': [True, False], 'n_iter': sp_randint(1, 20), 'shuffle': [True, False], 'verbose': sp_randint(1, 5), 'eta0': [1.0, 1.5, 2.0], 'random_state': [0, None], 'class_weight': ['balanced', None], 'warm_start': [True, False] }, 'grid': { 'penalty': [None, 'l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.0003, 0.0005, 0.0007, 0.0008, 0.0009], 'fit_intercept': [True, False], 'n_iter': [5, 10, 20, 30], 'shuffle': [True, False], 'verbose': [0, 2, 4], 'eta0': [1.0, 1.5, 2.0],
def report(results, n_top=3):
    """Print the top-n_top candidates from a cv_results_ dict; every
    candidate sharing a rank (ties) is printed."""
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# specify parameters and distributions to sample from
# Prefixes (svm__ / xgb__ / rf__) address sub-estimators of a composite
# pipeline/ensemble defined elsewhere.
param_dist = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "xgb__max_depth": sp_randint(3, 25),
    "xgb__min_child_weight": sp_randint(1, 7),
    "xgb__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "xgb__reg_lambda": [0.01, 0.1, 1.0],
    "xgb__reg_alpha": [0, 0.1, 0.5, 1.0],
    "rf__n_estimators": [10, 50, 100, 150, 200, 300, 500],
    "rf__max_depth": [5, 8, 15, 25, 30, None],
    "rf__max_features": sp_randint(1, 11),
    "rf__min_samples_split": sp_randint(2, 100),
    "rf__min_samples_leaf": sp_randint(1, 11),
    "rf__bootstrap": [True, False],
    "rf__criterion": ["gini", "entropy"]
}

# run randomized search
n_iter_search = 10000
#switch between motions and particle sets for RandomizedSearchCV test_motions = False if test_motions: model = model_m X_train = X_train_m y_train = y_train_m print("RandomizedSearchCV testing the Motions dataset") else: model = model_p X_train = X_train_p y_train = y_train_p print("RandomizedSearchCV testing the Particles dataset") # specify parameters and distributions to sample from param_dist = {"min_samples_leaf": sp_randint(5, 3000), "max_leaf_nodes": sp_randint(5, 300), "max_depth": sp_randint(3, 100), "criterion": ['gini', 'entropy'] } #https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py n_iter_search = 1000 random_search = RandomizedSearchCV(model, param_distributions = param_dist, n_iter=n_iter_search, cv = 5, scoring='accuracy', verbose=10) random_search.fit(X_train, y_train) report(random_search.cv_results_) #scores = random_search.cv_results_['mean_test_score'] grid_cv = False if grid_cv:
def finding_parameter():
    """
    Find hyper-parameters for a traditional machine learning model.

    Reads the raw CSV named by ``args.raw_data``, splits/scales it via
    ``lib.clear_data``, builds the classifier selected by ``args.algo``
    ('decisiontree', 'randomforest' or 'logisticregression'), and runs a
    randomized search over that model's parameter distribution, printing
    the best-ranked settings.

    :return: parameter for traditional machine learning algorithm
    :raises ValueError: if ``args.algo`` names no supported algorithm.
    """

    def report(results, n_top=3):
        # Utility function to report best scores of a finished CV search.
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    df = pd.read_csv(args.raw_data)
    df_train_input_sc, df_train_target, df_test_input_sc, df_test_target = \
        lib.clear_data(df, args)

    # The algo choices are mutually exclusive, so use an elif chain; the
    # original independent ifs fell through to a NameError on `clf` when
    # args.algo matched none of them.
    if args.algo == 'decisiontree':
        clf = tree.DecisionTreeClassifier()
        param_dist = {"max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
                      "max_features": sp_randint(1, 11),
                      "min_samples_split": sp_randint(2, 11),
                      "min_samples_leaf": [0.05, 0.1, 1],
                      "criterion": ["gini", "entropy"],
                      "splitter": ["best", "random"],
                      "class_weight": ["balanced", None]
                      }
    elif args.algo == 'randomforest':
        print("Finding parameter for random forest")
        # build a classifier
        clf = RandomForestClassifier(n_estimators=1000)
        param_dist = {"max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
                      "max_features": sp_randint(1, 11),
                      "min_samples_split": sp_randint(2, 11),
                      "bootstrap": [True, False],
                      "criterion": ["gini", "entropy"]}
    elif args.algo == "logisticregression":
        print("Finding parameter for logisticregression")
        clf = LogisticRegression()
        param_dist = {
            "penalty": ["l1", "l2"],
            "tol": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
            "C": [0.05, 0.1],
            "fit_intercept": [True, False],
            "intercept_scaling": [0.01, 0.1, 1],
            "max_iter": [10, 100, 1000]
        }
    else:
        raise ValueError("unsupported algo: {}".format(args.algo))

    # run randomized search
    # NOTE(review): `iid` was removed in scikit-learn >= 0.24; kept here
    # because the file appears to target an older release — confirm.
    n_iter_search = 10000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5, iid=False)
    start = time()
    random_search.fit(df_train_input_sc, df_train_target)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)
# Base estimators built from externally supplied parameter dicts.
xgbm = XGBClassifier(**xparams)
lgbm = LGBMClassifier(**lparams)
cgbm = CatBoostClassifier(**cparams)
rdf = RandomForestClassifier()

# `classifiers` is rebound several times; only the final binding
# ([xgbm, lgbm]) is effective — the earlier lines look like kept
# experiment variants.
classifiers = [rdf, xgbm, lgbm]
classifiers = [xgbm, lgbm, cgbm]
classifiers = [xgbm, lgbm]

# Logistic regression as the stacking meta-learner over base probabilities.
lr = LogisticRegression(C=0.1)
grid = StackingClassifier(classifiers=classifiers,
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

# Hyper-parameter search spaces. Where a scalar list is immediately
# rebound to a distribution, only the distribution binding survives.
n_estimators = [100, 300]
n_estimators = sp_randint(250, 500)
max_depth = [2, 3]
subsample = [0.5, 0.7]
subsample = sp_rec(0.3, 0.8)
C = [0.01, 0.2]
C = sp_rec(0.01, 0.2)
learning_rate = [0.1, 0.4]
learning_rate = sp_rec(0.1, 0.4)
reg_lambda = [2, 6]
reg_lambda = sp_randint(2, 10)
reg_alpha = [0.1, 0.3]
reg_alpha = sp_rec(0.1, 0.8)
gamma = sp_rec(0.1, 0.8)
feature_fraction = sp_rec(0.3, 0.8)
bagging_fraction = sp_rec(0.3, 0.8)
bagging_freq = sp_randint(3, 8)
def tune_xgb_params_randomized(estimator_cls,
                               folds: Union[KFold, StratifiedKFold],
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls: The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param folds: A KFold or StratifiedKFold object to cross validate the parameters.
    :param label: An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn: The evaluation metric to be passed to scikit-learn's GridSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html for the options this can take - e.g.
        'neg_mean_squared_error' for RMSE.
    :param n_jobs: The number of jobs to run simultaneously.
    :param params: A dictionary of XGB parameters.
    :param train: An array-like containing the training input samples.
    :param n_iter: An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level: An optional parameter to control the verbosity of the grid searching - defaults to the
        most verbose option.
    :param kwargs: Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly
        between 0.5 and 0.7 for colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return: A dictionary of tuned parameters and a list of the parameters found at each step with their respective
        scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2),
                                    kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1),
                                       kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        # BUG FIX: this distribution previously read the reg_alpha_* kwargs,
        # so reg_lambda could never be controlled independently. It now
        # honours reg_lambda_loc/reg_lambda_scale, falling back to the old
        # reg_alpha_* values for backward compatibility with existing callers.
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', kwargs.get('reg_alpha_loc', 0)),
                               kwargs.get('reg_lambda_scale', kwargs.get('reg_alpha_scale', 5))),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }
    rand_search = RandomizedSearchCV(
        cv=folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level)
    rand_search.fit(train, label)
    # Return shape mirrors the step-wise tuners: best params plus a
    # single-entry history of (params, score).
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]