# Storage for the forecasts, one column per hyper-parameter combination.
# mvalid receives the out-of-fold predictions for the training rows; mfull is
# sized for the test rows and is not filled in this fragment.
mvalid = np.zeros((xtrain.shape[0], len(param_grid)))
mfull = np.zeros((xtest.shape[0], len(param_grid)))

# The array versions of the inputs never change inside the loops, so convert
# them once instead of once per fold.
xtrain_arr = np.array(xtrain)
y_arr = np.array(y)

## build 2nd level forecasts
for i, x in enumerate(param_grid):
    # Single-argument print calls produce the same text on Python 2 and 3.
    print("processing parameter combo: %s" % i)

    # configure model with the i-th combo of parameters
    model.max_depth = int(x[0])
    # NOTE(review): max_features is assigned twice, so x[1] is effectively
    # ignored and only x[2] takes effect. One of these two lines was
    # presumably meant for a different hyper-parameter -- confirm against the
    # layout of param_grid before changing it.
    model.max_features = int(x[1])
    model.max_features = int(x[2])
    model.min_samples_leaf = int(x[3])
    model.min_weight_fraction_leaf = x[4]
    model.n_estimators = int(x[5])

    # loop over folds
    for j in range(n_folds):
        # np.where returns a 1-tuple holding the matching row positions.
        idx0 = np.where(fold_index != j)  # rows used for fitting
        idx1 = np.where(fold_index == j)  # rows held out in fold j
        x0 = xtrain_arr[idx0]
        x1 = xtrain_arr[idx1]
        y0 = y_arr[idx0]
        y1 = y_arr[idx1]

        # Fit on everything outside fold j, then keep the second
        # predict_proba column for the held-out rows.
        model.fit(x0, y0)
        y_pre = model.predict_proba(x1)[:, 1]
        mvalid[idx1[0], i] = y_pre

        print("log loss:  %s" % log_loss(y1, y_pre))
        print("finished fold: %s" % j)
# Single-valued grids for the remaining hyper-parameters (n_vals is defined
# earlier in the file).
n_minleaf = [1]
# An integer min_samples_split below 2 is rejected by current scikit-learn
# tree estimators. A node needs at least two samples to be split at all, so 2
# expresses the same "no extra constraint" intent as the former value 1.
n_minsplit = [2]
n_maxfeat = [0.1]

# Every combination of (n_estimators, min_samples_leaf, min_samples_split,
# max_features), as a list of tuples.
param_grid = list(product(n_vals, n_minleaf, n_minsplit, n_maxfeat))

# Storage for the forecasts, one column per hyper-parameter combination.
# Neither array is filled in this fragment.
mvalid = np.zeros((xtrain.shape[0], len(param_grid)))
mfull = np.zeros((xtest.shape[0], len(param_grid)))

# The array versions of the inputs never change inside the loops, so convert
# them once instead of once per fold.
xtrain_arr = np.array(xtrain)
y_train_arr = np.array(y_train)

## build 2nd level forecasts
for i, x in enumerate(param_grid):
    # Single-argument print call: same text on Python 2 and Python 3.
    print("processing parameter combo: %s" % i)

    # configure model with the i-th combo of parameters
    model.n_estimators = x[0]
    model.min_samples_leaf = x[1]
    model.min_samples_split = x[2]
    model.max_features = x[3]

    # loop over folds
    for j in range(n_folds):
        # np.where returns a 1-tuple holding the matching row positions.
        idx0 = np.where(fold_index != j)  # rows used for fitting
        idx1 = np.where(fold_index == j)  # rows held out in fold j
        x0 = xtrain_arr[idx0]
        x1 = xtrain_arr[idx1]
        y0 = y_train_arr[idx0]
        y1 = y_train_arr[idx1]

        # fit the model on every row that is not part of fold j
        model.fit(x0, y0)
def init_hyperparameters(self, trial, X, y):
    """Draw the SelectKBest settings for this step from ``trial``.

    Sets ``self.name``, ``self.k_fraction``, ``self.sparse`` and
    ``self.score_func``. The score function is either one of the univariate
    functions in ``sklearn.feature_selection`` or ``model_score`` bound to a
    freshly configured ExtraTreesClassifier / LinearSVC estimator.

    ``X`` and ``y`` are accepted but not used by this method.
    """
    self.name = id_name('SelectKBest')
    self.k_fraction = trial.suggest_uniform(self.name + 'k_fraction', 0.0, 1.0)
    self.sparse = False

    choice = trial.suggest_categorical(
        self.name + 'score_func',
        ['chi2', 'f_classif', 'mutual_info', 'ExtraTreesClassifier', 'LinearSVC'],
    )

    # Univariate options map directly onto an attribute of
    # sklearn.feature_selection; the attribute is looked up only when chosen.
    univariate = {
        'chi2': 'chi2',
        'f_classif': 'f_classif',
        'mutual_info': 'mutual_info_classif',
    }
    if choice in univariate:
        self.score_func = getattr(sklearn.feature_selection, univariate[choice])
        return

    # Model-based options get their own parameter-name prefix.
    prefix = self.name + '_' + choice + '_'

    if choice == 'ExtraTreesClassifier':
        model = ExtraTreesClassifier()

        # Fixed settings.
        model.n_estimators = 100
        model.max_depth = None
        model.max_leaf_nodes = None
        model.min_weight_fraction_leaf = 0.
        model.min_impurity_decrease = 0.

        # Sampled settings; the order of the trial calls is kept as-is.
        model.criterion = trial.suggest_categorical(
            prefix + "criterion", ["gini", "entropy"])
        model.max_features = trial.suggest_uniform(
            prefix + "max_features", 0, 1)
        model.min_samples_split = trial.suggest_int(
            prefix + "min_samples_split", 2, 20, log=False)
        model.min_samples_leaf = trial.suggest_int(
            prefix + "min_samples_leaf", 1, 20, log=False)
        model.bootstrap = trial.suggest_categorical(
            prefix + "bootstrap", [True, False])
    elif choice == 'LinearSVC':
        model = sklearn.svm.LinearSVC()

        # Fixed settings.
        model.penalty = "l1"
        model.loss = "squared_hinge"
        model.dual = False
        model.multi_class = "ovr"
        model.fit_intercept = True
        model.intercept_scaling = 1

        # Sampled settings; the order of the trial calls is kept as-is.
        model.tol = trial.suggest_loguniform(prefix + "tol", 1e-5, 1e-1)
        model.C = trial.suggest_loguniform(prefix + "C", 0.03125, 32768)
    else:
        # No other value is offered to the trial; leave score_func untouched.
        return

    # Both model-based options score features through model_score with the
    # configured estimator bound as a keyword argument.
    self.score_func = functools.partial(model_score, estimator=model)
# Two-way hold-out evaluation of an ExtraTreesClassifier: et0 is fitted on
# split 1 and scored on split 0, et1 the other way round. Both forests use
# warm_start, and n_estimators is raised in steps of 10 with a refit and an
# AUC report at every step.
n_estimators = 100

et_params = {
    'criterion': 'entropy',
    'max_depth': 40,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_features': 6,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 1,
}

et0 = ExtraTreesClassifier(warm_start=True, **et_params)
et1 = ExtraTreesClassifier(warm_start=True, **et_params)

for n in range(10, n_estimators + 1, 10):
    # Fit on split 1, score on split 0.
    et0.n_estimators = n
    et0.fit(X_1, y_1)
    pred_0 = et0.predict_proba(X_0)[:, 1]
    s0 = roc_auc_score(y_0, pred_0)

    # Fit on split 0, score on split 1.
    et1.n_estimators = n
    et1.fit(X_0, y_0)
    pred_1 = et1.predict_proba(X_1)[:, 1]
    s1 = roc_auc_score(y_1, pred_1)

    # Report: forest size, mean AUC, then the individual AUCs.
    scores = (s0, s1)
    scores_text = ', '.join(['%0.5f' % s for s in scores])
    print('%3d, %0.4f, [%s]' % (n, np.mean(scores), scores_text))

# Scores from the last (largest) forest size.
print('final scores:', scores)
# Fit the final ExtraTreesClassifier on the full training data, release the
# training arrays, then predict the test set and save the second
# predict_proba column as float32.
print('+ Training on full dataset...')

et_params = {
    'criterion': 'entropy',
    'max_depth': 40,
    'min_samples_split': 6,
    'min_samples_leaf': 6,
    'max_features': 6,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 1,
}

et_full = ExtraTreesClassifier(warm_start=True, **et_params)
et_full.n_estimators = 100
et_full.fit(X, y)

# The training data is no longer needed; drop it before loading the test set.
del X, y
gc.collect()

print('+ Making predictions for test...')

df_test = feather.read_dataframe('tmp/mtv_df_test.feather', columns=features)
X_test = df_test[features].values
del df_test

test_proba = et_full.predict_proba(X_test)
pred_test = test_proba[:, 1].astype('float32')
del test_proba

np.save('predictions/et_pred_test.npy', pred_test)
# Storage for the forecasts, one column per hyper-parameter combination.
# mvalid receives the out-of-fold predictions for the training rows; mfull is
# sized for the test rows and is not filled in this fragment.
mvalid = np.zeros((xtrain.shape[0], len(param_grid)))
mfull = np.zeros((xtest.shape[0], len(param_grid)))

# The array versions of the inputs never change inside the loops, so convert
# them once instead of once per fold.
xtrain_arr = np.array(xtrain)
y_arr = np.array(y)

## build 2nd level forecasts
for i, x in enumerate(param_grid):
    # Single-argument print calls produce the same text on Python 2 and 3.
    print("processing parameter combo: %s" % i)

    # configure model with the i-th combo of parameters
    model.max_depth = int(x[0])
    # NOTE(review): max_features is assigned twice, so x[1] is effectively
    # ignored and only x[2] takes effect. One of these two lines was
    # presumably meant for a different hyper-parameter -- confirm against the
    # layout of param_grid before changing it.
    model.max_features = int(x[1])
    model.max_features = int(x[2])
    model.min_samples_leaf = int(x[3])
    model.min_weight_fraction_leaf = x[4]
    model.n_estimators = int(x[5])

    # loop over folds
    for j in range(n_folds):
        # np.where returns a 1-tuple holding the matching row positions.
        idx0 = np.where(fold_index != j)  # rows used for fitting
        idx1 = np.where(fold_index == j)  # rows held out in fold j
        x0 = xtrain_arr[idx0]
        x1 = xtrain_arr[idx1]
        y0 = y_arr[idx0]
        y1 = y_arr[idx1]

        # Fit on everything outside fold j, then keep the second
        # predict_proba column for the held-out rows.
        model.fit(x0, y0)
        y_pre = model.predict_proba(x1)[:, 1]
        mvalid[idx1[0], i] = y_pre

        print("log loss:  %s" % log_loss(y1, y_pre))
        print("finished fold: %s" % j)
# Single-valued grids for the remaining hyper-parameters (n_vals is defined
# earlier in the file).
n_minleaf = [1]
# An integer min_samples_split below 2 is rejected by current scikit-learn
# tree estimators. A node needs at least two samples to be split at all, so 2
# expresses the same "no extra constraint" intent as the former value 1.
n_minsplit = [2]
n_maxfeat = [0.1]

# Every combination of (n_estimators, min_samples_leaf, min_samples_split,
# max_features), as a list of tuples.
param_grid = list(product(n_vals, n_minleaf, n_minsplit, n_maxfeat))

# Storage for the forecasts, one column per hyper-parameter combination.
# Neither array is filled in this fragment.
mvalid = np.zeros((xtrain.shape[0], len(param_grid)))
mfull = np.zeros((xtest.shape[0], len(param_grid)))

# The array versions of the inputs never change inside the loops, so convert
# them once instead of once per fold.
xtrain_arr = np.array(xtrain)
y_train_arr = np.array(y_train)

## build 2nd level forecasts
for i, x in enumerate(param_grid):
    # Single-argument print call: same text on Python 2 and Python 3.
    print("processing parameter combo: %s" % i)

    # configure model with the i-th combo of parameters
    model.n_estimators = x[0]
    model.min_samples_leaf = x[1]
    model.min_samples_split = x[2]
    model.max_features = x[3]

    # loop over folds
    for j in range(n_folds):
        # np.where returns a 1-tuple holding the matching row positions.
        idx0 = np.where(fold_index != j)  # rows used for fitting
        idx1 = np.where(fold_index == j)  # rows held out in fold j
        x0 = xtrain_arr[idx0]
        x1 = xtrain_arr[idx1]
        y0 = y_train_arr[idx0]
        y1 = y_train_arr[idx1]

        # fit the model on every row that is not part of fold j
        model.fit(x0, y0)