def rolling_model_PLS(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # mark the first half of the training window -1 (never tested) and the
    # second half 0 (the single validation fold)
    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {'n_components': sp_randint(1, 100),
                  'max_iter': sp_randint(50, len(X_traindata)),
                  'tol': [0.0001, 0.00001, 0.000001, 0.0000001]}
    PLS_model = PLSRegression(scale=False)

    # run randomized search
    n_iter_search = 50
    estim = RandomizedSearchCV(PLS_model, param_distributions=param_dist,
                               scoring='r2', cv=ps.split(), iid=False,
                               n_jobs=1, n_iter=n_iter_search)
    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_

    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)
    return v_performance_score, test_performance_score
def test_jdata_fscorer_class():
    monkey_patch.run()
    user_sku_pair, data, target, pred_proba, test_fold, expected_scores = (
        get_jdata_test_cases())
    pred_map = {}
    clf = MockEstimatorWithPredefinedPrediction(pred_map)
    ps = PredefinedSplit(test_fold)
    for train_index, test_index in ps.split():
        print("TRAIN:", train_index, "TEST:", test_index)
        clf.set(data[train_index, :], pred_proba[train_index])
        clf.set(data[test_index, :], pred_proba[test_index])
    scoring = {
        "custom_score_index": JDataScore(),
        "custom_score_index_with_user_sku_pair": JDataScore(user_sku_pair),
    }
    scores = cross_validate(clf, data, target, scoring=scoring, cv=ps,
                            return_estimator=True)
    for name in scoring.keys():
        assert_almost_equal(scores[f"test_{name}"], expected_scores)
from copy import deepcopy

import numpy as np
from sklearn.model_selection import PredefinedSplit


def decode(X, y, cv_ids, model):
    """
    Parameters
    --------------
    X: np.array, n_stimuli x n_voxels
    y: np.array, n_stimuli,
    cv_ids: np.array - n_stimuli,

    Returns
    --------------
    models, scores
    """
    scores = []
    models = []
    ps = PredefinedSplit(cv_ids)
    for train_index, test_index in ps.split():
        # split the data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit the model on the training set
        model.fit(X_train, y_train)
        # calculate the accuracy for the held-out run
        score = model.score(X_test, y_test)
        # save stuff
        models.append(deepcopy(model))
        scores.append(score)
    return models, scores
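# A minimal usage sketch for decode(), assuming run labels double as cv_ids;
# the data, run layout, and estimator below are hypothetical placeholders.
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_demo = rng.randn(60, 20)              # 60 stimuli x 20 voxels
y_demo = np.tile([0, 1], 30)            # two stimulus classes
cv_ids_demo = np.repeat([0, 1, 2], 20)  # three runs; each run is one held-out fold

models, scores = decode(X_demo, y_demo, cv_ids_demo, LinearSVC())
print(np.mean(scores))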
def validate(self, cv_splits, num_runs):
    x = pd.concat([self.x_train, self.x_val], axis=0)
    y = pd.concat([self.y_train, self.y_val], axis=0)
    if cv_splits == 1:
        # single split: everything but the last 12 observations trains,
        # the final 12 form the one validation fold
        splitter = PredefinedSplit([-1 for _ in range(len(x) - 12)] +
                                   [0 for _ in range(12)])
        split = list(splitter.split(X=x, y=y)) * num_runs
    else:
        splitter = TimeSeriesSplit(cv_splits, max_train_size=len(x) - 12)
        split = list(splitter.split(X=x, y=y)) * num_runs
    res = map(self._validate, split)
    res = np.mean(list(res), axis=0)
    # K.clear_session()
    return res[0][0], res[1][0]
def main(argv):
    start_time = datetime.now()
    logger.info("START")
    args = argparser.parse_args()
    inFile = args.inFile
    testFile = args.testFile
    nameModel = args.nameModel
    conf_file = args.mod
    mod = __import__(conf_file, fromlist=['*'])
    model_conf = mod.gridSearch_Model_types[nameModel]
    conf = getattr(__import__(conf_file, fromlist=[model_conf]), model_conf)
    prefix_dict = conf['prefix_dict']
    out_dict = h.outfileName(fo=args.outFile, fi=inFile,
                             prefix_dict=prefix_dict, add_date=True)
    logger.info("RUNNING WITH MOD: %s, INFILE: %s" % (conf_file, inFile))
    logger.info("LOADING THE DATA SET")
    param_grid = PARAM_DICT[nameModel]
    # scoring = {'Accuracy': make_scorer(accuracy_score), 'RMS': make_scorer(mean_squared_error)}
    scoring = {'RMS': make_scorer(r2_score)}
    X, Y, len_train, numFeatures = readFile(inFile)
    cv = None
    if testFile:
        logger.info("USING TEST FILE %s AS TEST SET FOR THE CROSS VALIDATION" % testFile)
        X_test, Y_test, len_train_test, numFeatures_test = readFile(testFile)
        X = pd.concat([X, X_test], ignore_index=True)
        Y = pd.concat([Y, Y_test], ignore_index=True)
        # -1 keeps the original rows in the training set; 0 marks the
        # appended test-file rows as the single predefined test fold
        cv_arr = [-1] * len_train
        cv_arr.extend([0] * len_train_test)
        cv = PredefinedSplit(test_fold=cv_arr)
        print("cv:", cv)
        print("number of folds:", cv.get_n_splits())
        for train_index, test_index in cv.split():
            print("TRAIN:", train_index, "TEST:", test_index)
        logger.info("SHAPE OF X:%s AND Y:%s AFTER APPEND", X.shape, Y.shape)
    logger.info("CREATION OF THE MODEL")
    t = TestClass(conf=conf, nm=nameModel, nf=numFeatures)
    if nameModel == 'NN':
        model = KerasClassifier(build_fn=t.createModelNN)
        X = X.values
        Y = Y.values
    else:
        model = t.selectModel()
    logger.info("START GRID SEARCH")
    grid_result = gridSearch(model, param_grid, cv, X, Y, scoring)
    logger.info("END OF GRID SEARCH")
    logger.info("PRINTING RESULTS")
    gridResults(grid_result, X, nameModel)
    SaveModel(nameModel, grid_result)
    logger.info("EXECUTED IN %f SEC" % (datetime.now() - start_time).total_seconds())
    logger.info("END")
def split_dataset(dataset):
    X = dataset.drop(y_col, axis=1)
    y = dataset[y_col]
    # tile fold_pattern to cover every row, then truncate to the exact length
    test_fold = (fold_pattern * (
        (dataset.shape[0] - 1) // len(fold_pattern) + 1))[:dataset.shape[0]]
    splitter = PredefinedSplit(test_fold)
    for train_index, test_index in splitter.split():
        X_train, X_test = safe_indexing(X, train_index), safe_indexing(
            X, test_index)
        y_train, y_test = safe_indexing(y, train_index), safe_indexing(
            y, test_index)
    return X_train, y_train, X_test, y_test
def split(self, data):
    """Perform a data split with a fixed size for the test set"""
    data_size = 0
    if data.is_row_split_validation():
        # Time series split data by columns
        data_size = data.get_features().shape[1]
    else:
        data_size = data.get_features().shape[0]
    test_fold = [-1 for i in range(0, data_size - self.test_size_)]
    test_fold += [0 for i in range(data_size - self.test_size_, data_size)]
    splitter = PredefinedSplit(test_fold=test_fold)
    return splitter.split()
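# The wrapper above reduces to the following standalone logic: every index
# except the last test_size_ ones gets -1 (never tested), and the tail gets
# fold id 0, yielding exactly one train/test split. Sizes here are made up.
import numpy as np
from sklearn.model_selection import PredefinedSplit

data_size, test_size = 10, 3
test_fold = [-1] * (data_size - test_size) + [0] * test_size
for train_idx, test_idx in PredefinedSplit(test_fold=test_fold).split():
    print(train_idx, test_idx)  # [0 1 2 3 4 5 6] [7 8 9]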
def rolling_model_ENetH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # mark the first half -1 (training only) and the second half 0
    # (the single validation fold)
    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        'alpha': uniform(0.00001, 0.1),
        'power_t': uniform(0.1, 0.9),
        'l1_ratio': uniform(0.1, 0.9),
        'eta0': uniform(0.00001, 0.1),
        'epsilon': uniform(0.01, 0.9),
        'max_iter': sp_randint(5, 10000),
        'tol': [0.01, 0.001, 0.0001, 0.00001],
        'fit_intercept': [True, False]
    }
    clf = SGDRegressor(shuffle=False, loss='huber', penalty='elasticnet',
                       random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf, param_distributions=param_dist,
                               n_iter=n_iter_search, scoring='r2',
                               cv=ps.split(), iid=False, random_state=100,
                               n_jobs=1)
    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_

    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)
    return v_performance_score, test_performance_score
def aggregate_fold_stats(db_paths, cv_pkl_file):
    preprocessed_db = imglmdb.multidbwrapper(sorted(db_paths))
    with open(cv_pkl_file, "rb") as pkl:
        test_fold, nested_test_folds = pickle.load(pkl)
    splitter = PredefinedSplit(test_fold)
    # NB: use a comprehension, not [{}] * n, which would alias one shared dict
    data = [{} for _ in range(splitter.get_n_splits())]
    for i, (nested_test_fold, (_, test_idx)) in enumerate(
            zip(nested_test_folds, splitter.split())):
        per_pixel_stats = preprocessing.compute_per_pixel_stats(
            preprocessed_db, None, idx=test_idx)
        std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                    per_pixel_stats[1])
        data[i]["outer"] = (per_pixel_stats[0], std_per_pixel)
        nested_splitter = PredefinedSplit(nested_test_fold)
        data[i]["nested"] = [{} for _ in range(nested_splitter.get_n_splits())]
        for j, (train_idx, val_idx) in enumerate(nested_splitter.split()):
            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=train_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["train"] = (per_pixel_stats[0], std_per_pixel)
            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=val_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["val"] = (per_pixel_stats[0], std_per_pixel)
    with open(os.path.splitext(cv_pkl_file)[0] + "_stats.pkl", "wb") as pkl:
        pickle.dump(data, pkl)
    return data
def train(self, data, clf='rf', param_search='single', tune_size=0.15,
          scoring='roc_auc', n_jobs=1, verbose=1):
    """Trains a classifier with the specified training data.
    data: tuple including training data.
    clf: string of {'rf', 'lr', 'xgb', 'lgb'}.
    Returns trained classifier."""
    x_train, y_train, _, features = data

    if param_search == 'single' or tune_size == 0:
        model, params = self.classifier(clf, param_search='single')
        model.set_params(**params)
    elif tune_size > 0:
        t1 = self.out('tuning...')
        model, params = self.classifier(clf, param_search=param_search)
        train_len = x_train.shape[0]
        split_ndx = train_len - int(train_len * tune_size)
        sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
        sm_train_fold = np.full(sm_x_train.shape[0], -1)
        val_fold = np.full(x_val.shape[0], 0)
        predefined_fold = np.append(sm_train_fold, val_fold)
        ps = PredefinedSplit(predefined_fold)
        cv = ps.split(x_train, y_train)
        m = GridSearchCV(model, params, scoring=scoring, cv=cv,
                         verbose=verbose, n_jobs=n_jobs)
        m.fit(x_train, y_train)
        model = m.best_estimator_
        self.time(t1)

    t1 = self.out('training...')
    if clf == 'lgb':
        cat_feat = ['app', 'device', 'os', 'channel', 'hour']
        cat_feat_ndx = [features.index(x) for x in cat_feat]
        train_len = x_train.shape[0]
        split_ndx = train_len - int(train_len * tune_size)
        sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
        sm_y_train, y_val = y_train[:split_ndx], y_train[split_ndx:]
        eval_set = (x_val, y_val)
        model = model.fit(sm_x_train, sm_y_train, eval_set=eval_set,
                          early_stopping_rounds=50, eval_metric='auc',
                          categorical_feature=cat_feat_ndx)
    else:
        model = model.fit(x_train, y_train)
    self.time(t1)
    self.out(str(model))
    return model
def train_self(scoring='accuracy'):
    csv_dir = Path("Features/CSV")
    for i in range(10):
        train_test_dir = csv_dir / f"train_test{i}"
        results_dir = Path("Results") / f"self{i}"
        results_dir.mkdir(exist_ok=True)
        for dataset_file in train_test_dir.glob("*test*"):
            dataset = str(dataset_file.stem).split("_test")[0]
            suffixes = ["train", "train_train", "train_val"]
            keys = [f"{s}" for s in suffixes]
            df_dict = {
                key: pd.read_csv(train_test_dir / f"{dataset}_{key}.csv")
                for key in keys
            }

            # xgboost with eval set
            data = pd.concat([df_dict["train_train"], df_dict["train_val"]],
                             axis=0)
            data.reset_index(inplace=True, drop=True)
            val_idx = np.concatenate(
                ((-1) * np.ones(df_dict["train_train"].shape[0]),
                 np.zeros(df_dict["train_val"].shape[0])))
            ps = PredefinedSplit(val_idx)
            X = data.drop(columns=["Label", "microRNA_name"])
            y = data.Label.ravel()
            train_index, val_index = next(ps.split())
            X_val = X.iloc[val_index]
            y_val = y[val_index]

            output_file = results_dir / f"{dataset}_xgbs_val_results.csv"
            print(output_file)
            if not output_file.exists():
                clf = XGBClassifier(silent=True)
                grid_obj = GridSearchCV(clf, XGBS_PARAMS, scoring=scoring,
                                        cv=ps, verbose=3)
                fit_params = {
                    "eval_set": [(X_val, y_val)],
                    "early_stopping_rounds": 50
                }
                grid_obj.fit(X, y, **fit_params)
                print('\n Best estimator:')
                print(grid_obj.best_estimator_)
                print(grid_obj.best_score_ * 2 - 1)
                results = pd.DataFrame(grid_obj.cv_results_)
                results.to_csv(output_file, index=False)
class LeavePOutByGroup():
    def __init__(self, X, p=5, n_splits=2):
        self.X = X
        self.p = p
        self.n_splits = n_splits
        # the first n_splits * p interactions of each user are carved into
        # n_splits folds of p rows; the rest (-1) always stay in training
        test_fold = self.X.groupby("user_id").cumcount().apply(
            lambda x: int(x / p) if x < (n_splits * p) else -1)
        self.s = PredefinedSplit(test_fold)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        return self.s.split()
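# A hedged usage sketch for LeavePOutByGroup: because it exposes split() and
# get_n_splits(), it can be passed as cv to scikit-learn utilities. The frame
# and estimator below are hypothetical.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
interactions = pd.DataFrame({
    "user_id": np.repeat([1, 2, 3], 12),  # 3 users, 12 rows each
    "feature": rng.randn(36),
})
labels = rng.randint(0, 2, 36)

cv = LeavePOutByGroup(interactions, p=5, n_splits=2)
print(cross_val_score(LogisticRegression(), interactions[["feature"]], labels, cv=cv))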
def rolling_model_PLS(
        X_traindata=X_traindata,
        Y_traindata_demean=np.ravel(Y_traindata_demean),
        X_traindata1=X_traindata1,
        Y_traindata1=np.ravel(Y_traindata1),
        X_testdata=X_testdata,
        Y_testdata=np.ravel(Y_testdata),
        mean_Ytrain=mean_Ytrain):
    # specify parameters and distributions to sample from
    split_num = 200 * 60
    num_valid_size = split_num
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)
    param_dist = {
        'n_components': sp_randint(1, 31),
        'max_iter': sp_randint(50, len(X_traindata)),
        'tol': [0.0001, 0.00001, 0.000001, 0.0000001]
    }
    PLS_model = PLSRegression(scale=False)

    # run randomized search
    n_iter_search = 50
    estim = RandomizedSearchCV(PLS_model, param_distributions=param_dist,
                               scoring='r2', cv=ps.split(), iid=False,
                               n_jobs=-1, n_iter=n_iter_search)
    estim.fit(X_traindata, Y_traindata_demean)
    best_estimator = estim.best_estimator_

    # add the training mean back when predicting on the de-meaned target
    train_predict = best_estimator.predict(X_traindata1) + mean_Ytrain
    IS_score = r2_score(Y_traindata1, train_predict)
    test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
    test_predict = test_predict[:, 0]
    OOS_score = 1 - np.sum((Y_testdata - test_predict) ** 2) / np.sum(
        (Y_testdata - mean_Ytrain) ** 2)
    return IS_score, OOS_score
def predefined_train_test_split(data, labels, folds, workflow, label_encoder):
    folds = np.asarray(folds)
    fold_encoder = LabelEncoder()
    split_encoded = fold_encoder.fit_transform(folds)
    num_classes = len(label_encoder.classes_)
    performance = {
        'classes': label_encoder.classes_.tolist(),
        'intervals': {key: np.sum(folds == key)
                      for key in sorted(list(set(folds)))}
    }
    split = PredefinedSplit(split_encoded)
    for fold_index, (train_inds, test_inds) in enumerate(split.split()):
        train_x, train_y = ([data[ii] for ii in train_inds],
                            [labels[ii] for ii in train_inds])
        test_x, test_y = ([data[ii] for ii in test_inds],
                          [labels[ii] for ii in test_inds])
        prior_train = [0] * num_classes
        for yy in train_y:
            prior_train[yy] += 1
        prior_test = [0] * num_classes
        for yy in test_y:
            prior_test[yy] += 1
        clf = deepcopy(workflow)
        clf.fit(train_x, train_y)
        param_dict = {kk: vv.__dict__ for kk, vv in clf.named_steps.items()}
        test_pred = clf.predict(test_x)
        test_ind = folds[test_inds[0]]
        performance[test_ind] = {
            'accuracy': metrics.accuracy_score(test_y, test_pred),
            'precision_micro': metrics.precision_score(test_y, test_pred, average='micro'),
            'precision_macro': metrics.precision_score(test_y, test_pred, average='macro'),
            'recall_micro': metrics.recall_score(test_y, test_pred, average='micro'),
            'recall_macro': metrics.recall_score(test_y, test_pred, average='macro'),
            'f1_score_micro': metrics.f1_score(test_y, test_pred, average='micro'),
            'f1_score_macro': metrics.f1_score(test_y, test_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(test_y, test_pred).tolist(),
            'prior_train': prior_train,
            'prior_test': prior_test,
            'model': serialise_dict(param_dict)
        }
    return serialise_dict(performance)
def neighbors(train, test, target, cv: PredefinedSplit, k=5, n_trees=10):
    res_train = np.zeros((train.shape[0], 2))
    res_test = np.zeros((test.shape[0], 2))
    for i, (trn_idx, val_idx) in tqdm(enumerate(cv.split(train)),
                                      total=cv.get_n_splits()):
        target_trn = target.iloc[trn_idx]
        X_trn = train.iloc[trn_idx]
        X_val = train.iloc[val_idx]
        n = X_trn[target_trn == 0]
        p = X_trn[target_trn == 1]
        for j, X in enumerate([n, p]):
            u = build(X, n_trees)
            res_train[val_idx, j] = get_feat(X_val, u, k=k)
            res_test[:, j] += get_feat(test, u, k)
    res_test /= cv.get_n_splits()
    return res_train, res_test
def k_fold_cv(X, y, feature_desc):
    # since fold 4 will be used as a blind set and not be part of training,
    # it is removed from the fold_ids list
    fold_ids = pd.read_csv(
        "data/raw_data/CV_fold_ids_trval.csv")['FoldID'][0:132]
    ps = PredefinedSplit(fold_ids)
    fold_id = 0
    y = y[valence_classifier.label_type]
    for train_index, test_index in ps.split():
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = tune_on_devset(X_train, y_train, X_test, y_test)
        joblib.dump(
            clf,
            "data/models/" + feature_desc + "_fold" + str(fold_id) + '.pkl')
        fold_id += 1
    return
def rolling_model_RF(X_traindata=X_traindata,
                     Y_traindata_demean=np.ravel(Y_traindata_demean),
                     X_traindata1=X_traindata1,
                     Y_traindata1=np.ravel(Y_traindata1),
                     X_testdata=X_testdata,
                     Y_testdata=np.ravel(Y_testdata),
                     mean_Ytrain=mean_Ytrain):
    # specify parameters and distributions to sample from
    split_num = 200 * 60
    num_valid_size = split_num
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)
    param_dist = {"max_features": sp_randint(5, 100),
                  "max_depth": sp_randint(3, 10),
                  "min_samples_split": sp_randint(10, 1000),
                  "min_samples_leaf": sp_randint(10, 1000),
                  "n_estimators": sp_randint(3, 100),
                  "oob_score": [True, False]}
    clf_RF = RandomForestRegressor(random_state=100)

    # run randomized search
    n_iter_search = 50
    estim = RandomizedSearchCV(clf_RF, param_distributions=param_dist,
                               n_iter=n_iter_search, scoring='r2', n_jobs=-1,
                               cv=ps.split(), iid=False, random_state=100)
    estim.fit(X_traindata, Y_traindata_demean)
    best_estimator = estim.best_estimator_
    best_VIP = best_estimator.feature_importances_

    train_predict = best_estimator.predict(X_traindata1) + mean_Ytrain
    IS_score = r2_score(Y_traindata1, train_predict)
    test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
    OOS_score = 1 - np.sum((Y_testdata - test_predict) ** 2) / np.sum(
        (Y_testdata - mean_Ytrain) ** 2)
    return IS_score, OOS_score, best_VIP
def rolling_model_GBRTH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # mark the first half -1 (training only) and the second half 0
    # (the single validation fold)
    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        "max_features": sp_randint(5, 100),
        "max_depth": sp_randint(3, 12),
        "min_samples_split": sp_randint(100, 1000),
        "min_samples_leaf": sp_randint(100, 1000),
        "n_estimators": sp_randint(5, 100),
        "learning_rate": uniform(0.001, 0.1),
        "subsample": uniform(0.6, 0.4)
    }
    clf_GBRT = GradientBoostingRegressor(loss='huber', random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf_GBRT, param_distributions=param_dist,
                               n_iter=n_iter_search, scoring='r2',
                               cv=ps.split(), iid=False, random_state=100)
    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_

    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)
    return v_performance_score, test_performance_score
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = PredefinedSplit(folds)
    # n_splits is simply the no of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    for train_ind, test_ind in ps.split():
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
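# Related behaviour worth noting (and relied on throughout this file):
# samples marked -1 in test_fold are excluded from every test set, so they
# do not contribute a split of their own.
import numpy as np
from sklearn.model_selection import PredefinedSplit

ps = PredefinedSplit(test_fold=np.array([-1, -1, 0, 0, 1, 1]))
print(ps.get_n_splits())  # 2 -- only folds 0 and 1; the -1 samples always train
for train_ind, test_ind in ps.split():
    print(train_ind, test_ind)  # [0 1 4 5] [2 3], then [0 1 2 3] [4 5]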
def target_encoding(X_train, y_train, X_test, cols, cv_id):
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
def extract_data(self, df_dict: dict):
    for key in df_dict.keys():
        df_dict[key] = drop_unnamed(df_dict[key])
    data = pd.concat([df_dict["train_train"], df_dict["train_val"]], axis=0)
    data.reset_index(inplace=True, drop=True)
    val_idx = np.concatenate(
        ((-1) * np.ones(df_dict["train_train"].shape[0]),
         np.zeros(df_dict["train_val"].shape[0])))
    ps = PredefinedSplit(val_idx)
    X, y = self.extract_Xy(data)
    train_index, val_index = next(ps.split())
    X_val = X.iloc[val_index]
    y_val = y[val_index]
    X_test, y_test = self.extract_Xy(df_dict["test"])
    assert set(X.columns) == set(X_test.columns), (
        f"""X and X_test must have the same columns.
        {np.setdiff1d(X.columns, X_test.columns)}""")
    return X, y, X_val, y_val, X_test, y_test, ps
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape (n_samples,)
        The target variable for supervised learning problems.
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset
        into train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = X.shape[0]
    if self.n_splits > n_samples:
        raise ValueError(
            ("Cannot have number of splits n_splits={0} greater"
             " than the number of samples: n_samples={1}."
             ).format(self.n_splits, n_samples))
    # generate test fold: assign sample i to fold i % n_splits
    test_fold = np.arange(n_samples, dtype=int) % self.n_splits
    cv = PredefinedSplit(test_fold)
    return cv.split()
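# A quick check of the modulo assignment above: sample i lands in test fold
# i % n_splits, giving interleaved (round-robin) folds rather than contiguous
# blocks. Sizes here are arbitrary.
import numpy as np
from sklearn.model_selection import PredefinedSplit

n_samples, n_splits = 10, 3
test_fold = np.arange(n_samples, dtype=int) % n_splits
for train_idx, test_idx in PredefinedSplit(test_fold).split():
    print(test_idx)  # [0 3 6 9], then [1 4 7], then [2 5 8]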
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
    'min_child_weight': 2,
    'random_state': 77
}

print('5-fold CV')
score = cross_validate(lgb.LGBMClassifier(**lgb_params), X_train, y_train,
                       cv=cv.split(X_train, y_train), scoring='roc_auc',
                       n_jobs=4, verbose=4)
valid_score = score['test_score'].mean()
print('val:', valid_score)

print('train')
model = lgb.LGBMClassifier(**lgb_params)
model.fit(X_train, y_train)
print(f'val = {valid_score};\nfeats = {feats};\nlgb_params = {lgb_params}')
generate_submit(model.predict_proba(X_test)[:, 1],
                f'{NAME}_{valid_score:.4f}')

print('output feature importances')
feat_df = pd.DataFrame({'importance': model.feature_importances_},
                       index=X_train.columns).sort_values('importance')
feat_df[-50:].plot.barh(figsize=(20, 15))
result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)

# ## Example 3: Different cross-validators

# Some people have asked about the CV strategy; as seen, I've just used basic
# stratified K-fold, mostly due to time constraints rather than much thought.
# There are a lot of potentially better options, especially considering the
# temporal nature of this problem. Adding these is really easy with
# scikit-learn cross-validators: you just plug a new cross-validator into the
# `cv=` option of BayesSearchCV. One example is a single train-test split,
# where we e.g. use one day for training and one for testing (adjust
# accordingly):

# In[6]:

from sklearn.model_selection import PredefinedSplit

# Training [index == -1], testing [index == 0]
test_fold = np.zeros(len(X))
test_fold[:(TRAINING_SIZE - TEST_SIZE)] = -1
cv = PredefinedSplit(test_fold)

# Check that we only have a single train-test split, and the sizes
train_idx, test_idx = next(cv.split())
print(f"Splits: {cv.get_n_splits()}, Train size: {len(train_idx)}, "
      f"Test size: {len(test_idx)}")

# Alternatively, we could use the
# [TimeSeriesSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html#sklearn.model_selection.TimeSeriesSplit)
# cross-validator, which allows us to do several "into the future" folds for
# predictions.

# In[14]:

from sklearn.model_selection import TimeSeriesSplit

# Here we just do 3-fold time-series CV
cv = TimeSeriesSplit(max_train_size=None, n_splits=3)

# Let us check the sizes of the folds. Note that you can keep the train size
# constant with max_train_size if needed.
for i, (train_index, test_index) in enumerate(cv.split(X)):
    print(f"Fold {i}: train size {len(train_index)}, test size {len(test_index)}")
ps_clf = PredefinedSplit(test_fold=[
    0 if i in val_idx else -1 for i in sorted(train_or_val_idx)
])
ps_reg = [
    PredefinedSplit(test_fold=[
        0 if k in val_idx else -1 for k in sorted(
            set(idx_by_class[c]).intersection(set(train_or_val_idx)))
    ]) for c in classes
]

# Construct grid search cross validation to select the best classifier
# given the validation set
clf = GridSearchCV(LinearSVC(max_iter=int(1e9)), parameters_clf,
                   scoring='accuracy', refit=True, cv=list(ps_clf.split()))

# Construct grid search cross validation to select the best regressors
# given the validation set
reg = [
    GridSearchCV(LinearSVR(loss='squared_epsilon_insensitive',
                           max_iter=int(1e9)),
                 parameters_reg,
                 cv=list(ps_reg[i].split()),
                 scoring=make_scorer(EVAL_SCORE, greater_is_better=False),
                 n_jobs=4,
                 refit=True) for i, _ in enumerate(classes)
]

# Train the classifier model
clf.fit(X_tv, y_tv)
# clf.score(X_tv, y_tv)
def ada(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]

    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    learning_rate = [x for x in np.linspace(0.1, 1, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=20, stop=1000, num=100)]
    loss = ['square']
    random_grid = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss
    }

    # Use the random grid to search for the best hyperparameters.
    # First create the base model to tune.
    ada = AdaBoostRegressor(random_state=42, loss='square')

    # Look at parameters used by the baseline model
    print('Parameters for baseline:\n')
    pprint(ada.get_params())

    # Random search of parameters, using the predefined split,
    # searching across 200 combinations on all available cores
    ada_random = RandomizedSearchCV(estimator=ada, n_iter=200,
                                    param_distributions=random_grid,
                                    scoring='neg_mean_squared_error',
                                    cv=ps2.split(), verbose=2,
                                    random_state=42, n_jobs=-1)
    # Fit the random search model
    ada_random.fit(train_X, train_y)
    pprint(ada_random.best_params_)
    cv_result_rd = ada_random.cv_results_
    BestPara_random = ada_random.best_params_

    # Grid search of parameters around the random-search optimum
    lr = [BestPara_random['learning_rate']]
    # n_estimators = [BestPara_random["n_estimators"]]
    n_estimators = [
        int(x) for x in range(BestPara_random["n_estimators"] - 10,
                              BestPara_random["n_estimators"] + 10, 20)
    ]
    n_estimators = [item for item in n_estimators if item > 0]
    grid_grid = {
        'n_estimators': n_estimators,
        'learning_rate': lr,
        'loss': loss
    }
    ada_grid = GridSearchCV(estimator=ada, param_grid=grid_grid,
                            scoring='neg_mean_squared_error',
                            cv=ps2.split(), verbose=2, n_jobs=-1)
    # Fit the grid search model
    ada_grid.fit(train_X, train_y)
    BestPara_grid = ada_grid.best_params_
    pprint(ada_grid.best_params_)
    cv_results_grid = ada_grid.cv_results_

    # Fit the baseline model
    ada.fit(train_X, train_y)

    # prediction
    predict_y = ada_random.predict(test_X)
    predict_y_grid = ada_grid.predict(test_X)
    predict_y_base = ada.predict(test_X)

    # Performance metrics
    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = mean_squared_error(predict_y_grid, test_y)
    errors_Random_CV = mean_squared_error(predict_y, test_y)
    errors_baseline = mean_squared_error(predict_y_base, test_y)

    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    print('Adaboost results:', results)

    if True:
        fig = plt.figure(figsize=(15, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        # plt.show()
        plt.savefig('ada_compare_error.png')

        # feature importance
        num_feature = len(ada_grid.best_estimator_.feature_importances_)
        plt.figure(figsize=(24, 6))
        plt.bar(range(0, num_feature * 4, 4),
                ada_grid.best_estimator_.feature_importances_)
        label_name = X.keys()
        plt.xticks(range(0, num_feature * 4, 4), label_name)
        plt.title("Feature Importances" + ",kfold=" + str(kfold))
        # plt.show()
        plt.savefig('ada_feature_importance.png')

        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(predict_y_grid))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, predict_y_grid, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        # plt.show()
        plt.savefig('ada_prediction.png')

    # return the grid-search predictor and the best estimator
    return ada_grid.predict, ada_grid.best_estimator_
######## search for parameters using our own scoring function
def my_own_scorer(clf, X, y_true):
    class_labels = clf.classes_
    # pass the labels by keyword: the third positional argument of
    # log_loss is `eps`, not `labels`
    loss = log_loss(y_true, clf.predict_proba(X), labels=class_labels)
    # GridSearchCV maximises the score, so return the negated loss
    return -loss

Cs = [0.1, 0.3, 0.5, 1, 3, 5]
tols = [0.01, 0.03]
gs = GridSearchCV(
    estimator=LogisticRegression(random_state=0),  # machine learning algorithm
    param_grid={'C': Cs, 'tol': tols},  # list of parameters to search over
    cv=ps.split(),  # evaluate performance on the predefined training and validation set
    verbose=True,
    scoring=my_own_scorer  # evaluate the performance using this scoring function
)
model = gs.fit(X, y)
print(pd.DataFrame(model.cv_results_))  # results are available in here

######## search for parameters using an sklearn predefined scorer
Cs = [0.1, 0.3, 0.5, 1, 3, 5]
tols = [0.01, 0.03]
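# The snippet above breaks off before the predefined-scorer search; a hedged
# completion might look like the following, using the built-in 'neg_log_loss'
# scorer (which already returns the sign GridSearchCV expects to maximise).
gs = GridSearchCV(
    estimator=LogisticRegression(random_state=0),
    param_grid={'C': Cs, 'tol': tols},
    cv=ps.split(),
    verbose=True,
    scoring='neg_log_loss'
)
model = gs.fit(X, y)
print(pd.DataFrame(model.cv_results_))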
def get_dataloaders(dataset, batch, dataroot, split=0.15, split_idx=0,
                    multinode=False, target_lb=-1, gr_assign=None,
                    gr_id=None, gr_ids=None, rand_val=False):
    if 'cifar' in dataset or 'svhn' in dataset:
        if "cifar" in dataset:
            _mean, _std = _CIFAR_MEAN, _CIFAR_STD
        else:
            _mean, _std = _SVHN_MEAN, _SVHN_STD
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(_mean, _std),
        ])
        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(_mean, _std),
        ])
    elif 'imagenet' in dataset:
        input_size = 224
        sized_size = 256

        if 'efficientnet' in C.get()['model']['type']:
            input_size = EfficientNet.get_image_size(C.get()['model']['type'])
            sized_size = input_size + 32  # TODO
            # sized_size = int(round(input_size / 224. * 256))
            # sized_size = input_size
            logger.info('size changed to %d/%d.' % (input_size, sized_size))

        transform_train = transforms.Compose([
            EfficientNetRandomCrop(input_size),
            transforms.Resize((input_size, input_size), interpolation=Image.BICUBIC),
            # transforms.RandomResizedCrop(input_size, scale=(0.1, 1.0), interpolation=Image.BICUBIC),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(
                brightness=0.4,
                contrast=0.4,
                saturation=0.4,
            ),
            transforms.ToTensor(),
            Lighting(0.1, _IMAGENET_PCA['eigval'], _IMAGENET_PCA['eigvec']),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        transform_test = transforms.Compose([
            EfficientNetCenterCrop(input_size),
            transforms.Resize((input_size, input_size), interpolation=Image.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    else:
        raise ValueError('dataset=%s' % dataset)

    if isinstance(C.get()['aug'], list):
        logger.debug('augmentation provided.')
        transform_train.transforms.insert(0, Augmentation(C.get()['aug']))
    elif isinstance(C.get()['aug'], dict):
        # group version
        logger.debug('group augmentation provided.')
    else:
        logger.debug('augmentation: %s' % C.get()['aug'])
        if C.get()['aug'] == 'fa_reduced_cifar10':
            transform_train.transforms.insert(0, Augmentation(fa_reduced_cifar10()))
        elif C.get()['aug'] == 'fa_reduced_imagenet':
            transform_train.transforms.insert(0, Augmentation(fa_resnet50_rimagenet()))
        elif C.get()['aug'] == 'fa_reduced_svhn':
            transform_train.transforms.insert(0, Augmentation(fa_reduced_svhn()))
        elif C.get()['aug'] == 'arsaug':
            transform_train.transforms.insert(0, Augmentation(arsaug_policy()))
        elif C.get()['aug'] == 'autoaug_cifar10':
            transform_train.transforms.insert(0, Augmentation(autoaug_paper_cifar10()))
        elif C.get()['aug'] == 'autoaug_extend':
            transform_train.transforms.insert(0, Augmentation(autoaug_policy()))
        elif C.get()['aug'] in ['default', "clean", "nonorm", "nocut"]:
            pass
        else:
            raise ValueError('not found augmentations. %s' % C.get()['aug'])

    if C.get()['cutout'] > 0 and C.get()['aug'] != "nocut":
        transform_train.transforms.append(CutoutDefault(C.get()['cutout']))

    if C.get()['aug'] == "clean":
        transform_train = transform_test
    elif C.get()['aug'] == "nonorm":
        transform_train = transforms.Compose([
            transforms.ToTensor()
        ])

    train_idx = valid_idx = None
    if dataset == 'cifar10':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugCIFAR10(root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], train=True, download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.CIFAR10(root=dataroot, train=True, download=False, transform=transform_train)
        testset = torchvision.datasets.CIFAR10(root=dataroot, train=False, download=False, transform=transform_test)
    elif dataset == 'reduced_cifar10':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugCIFAR10(root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], train=True, download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.CIFAR10(root=dataroot, train=True, download=False, transform=transform_train)
        sss = StratifiedShuffleSplit(n_splits=5, train_size=4000, random_state=0)  # 4000 trainset
        sss = sss.split(list(range(len(total_trainset))), total_trainset.targets)
        for _ in range(split_idx + 1):
            train_idx, valid_idx = next(sss)
        testset = torchvision.datasets.CIFAR10(root=dataroot, train=False, download=False, transform=transform_test)
    elif dataset == 'cifar100':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugData("CIFAR100", root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], train=True, download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.CIFAR100(root=dataroot, train=True, download=False, transform=transform_train)
        testset = torchvision.datasets.CIFAR100(root=dataroot, train=False, download=False, transform=transform_test)
    elif dataset == 'svhn':  # TODO
        trainset = torchvision.datasets.SVHN(root=dataroot, split='train', download=False, transform=transform_train)
        extraset = torchvision.datasets.SVHN(root=dataroot, split='extra', download=False, transform=transform_train)
        total_trainset = ConcatDataset([trainset, extraset])
        testset = torchvision.datasets.SVHN(root=dataroot, split='test', download=False, transform=transform_test)
    elif dataset == 'reduced_svhn':
        if isinstance(C.get()['aug'], dict):
            total_trainset = GrAugData("SVHN", root=dataroot, gr_assign=gr_assign, gr_policies=C.get()['aug'], split='train', download=False, transform=transform_train)
        else:
            total_trainset = torchvision.datasets.SVHN(root=dataroot, split='train', download=False, transform=transform_train)
        sss = StratifiedShuffleSplit(n_splits=5, train_size=1000, test_size=7325, random_state=0)
        sss = sss.split(list(range(len(total_trainset))), total_trainset.labels)
        for _ in range(split_idx + 1):
            train_idx, valid_idx = next(sss)
        # targets = [total_trainset.labels[idx] for idx in train_idx]
        # total_trainset = Subset(total_trainset, train_idx)
        # total_trainset.targets = targets
        testset = torchvision.datasets.SVHN(root=dataroot, split='test', download=False, transform=transform_test)
    elif dataset == 'imagenet':
        total_trainset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), transform=transform_train)
        testset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), split='val', transform=transform_test)

        # compatibility
        total_trainset.targets = [lb for _, lb in total_trainset.samples]
    elif dataset == 'reduced_imagenet':
        # randomly chosen indices
        # idx120 = sorted(random.sample(list(range(1000)), k=120))
        idx120 = [16, 23, 52, 57, 76, 93, 95, 96, 99, 121, 122, 128, 148, 172, 181, 189, 202, 210, 232, 238, 257, 258, 259, 277, 283, 289, 295, 304, 307, 318, 322, 331, 337, 338, 345, 350, 361, 375, 376, 381, 388, 399, 401, 408, 424, 431, 432, 440, 447, 462, 464, 472, 483, 497, 506, 512, 530, 541, 553, 554, 557, 564, 570, 584, 612, 614, 619, 626, 631, 632, 650, 657, 658, 660, 674, 675, 680, 682, 691, 695, 699, 711, 734, 736, 741, 754, 757, 764, 769, 770, 780, 781, 787, 797, 799, 811, 822, 829, 830, 835, 837, 842, 843, 845, 873, 883, 897, 900, 902, 905, 913, 920, 925, 937, 938, 940, 941, 944, 949, 959]
        total_trainset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), transform=transform_train)
        testset = ImageNet(root=os.path.join(dataroot, 'imagenet-pytorch'), split='val', transform=transform_test)

        # compatibility
        total_trainset.targets = [lb for _, lb in total_trainset.samples]

        sss = StratifiedShuffleSplit(n_splits=1, test_size=len(total_trainset) - 50000, random_state=0)
        sss = sss.split(list(range(len(total_trainset))), total_trainset.targets)
        train_idx, valid_idx = next(sss)

        # filter out classes not in idx120
        train_idx = list(filter(lambda x: total_trainset.labels[x] in idx120, train_idx))
        valid_idx = list(filter(lambda x: total_trainset.labels[x] in idx120, valid_idx))
        test_idx = list(filter(lambda x: testset.samples[x][1] in idx120, range(len(testset))))

        targets = [idx120.index(total_trainset.targets[idx]) for idx in train_idx]
        for idx in range(len(total_trainset.samples)):
            if total_trainset.samples[idx][1] not in idx120:
                continue
            total_trainset.samples[idx] = (total_trainset.samples[idx][0], idx120.index(total_trainset.samples[idx][1]))
        total_trainset = Subset(total_trainset, train_idx)
        total_trainset.targets = targets

        for idx in range(len(testset.samples)):
            if testset.samples[idx][1] not in idx120:
                continue
            testset.samples[idx] = (testset.samples[idx][0], idx120.index(testset.samples[idx][1]))
        testset = Subset(testset, test_idx)
        print('reduced_imagenet train=', len(total_trainset))
    elif dataset == "cifar10_svhn":
        if isinstance(C.get()['aug'], dict):
            # last stage: benchmark test
            total_trainset = GrAugMix(dataset.split("_"), gr_assign=gr_assign, gr_policies=C.get()['aug'], root=dataroot, train=True, download=False, transform=transform_train, gr_ids=gr_ids)
        else:
            # eval_tta & childnet training
            total_trainset = GrAugMix(dataset.split("_"), root=dataroot, train=True, download=False, transform=transform_train)
        testset = GrAugMix(dataset.split("_"), root=dataroot, train=False, download=False, transform=transform_test)
    else:
        raise ValueError('invalid dataset name=%s' % dataset)

    if not hasattr(total_trainset, "gr_ids"):
        total_trainset.gr_ids = None
    if gr_ids is not None:
        total_trainset.gr_ids = gr_ids
    if gr_assign is not None and total_trainset.gr_ids is None:
        # eval_tta3
        temp_trainset = copy.deepcopy(total_trainset)
        # temp_trainset.transform = transform_test  # just normalize
        temp_loader = torch.utils.data.DataLoader(
            temp_trainset, batch_size=batch, shuffle=False,
            num_workers=4, drop_last=False)
        gr_dist = gr_assign(temp_loader)
        # argmax over the group dimension; torch.max needs a dim argument
        # to return (values, indices)
        gr_ids = torch.max(gr_dist, 1)[1].numpy()

    if split > 0.0:
        if train_idx is None or valid_idx is None:
            # filter by split ratio
            sss = StratifiedShuffleSplit(n_splits=5, test_size=split, random_state=0)
            sss = sss.split(list(range(len(total_trainset))), total_trainset.targets)
            for _ in range(split_idx + 1):
                train_idx, valid_idx = next(sss)

        if gr_id is not None:
            # filter by group
            idx2gr = total_trainset.gr_ids
            ps = PredefinedSplit(idx2gr)
            ps = ps.split()
            for _ in range(gr_id + 1):
                _, gr_split_idx = next(ps)
            train_idx = [idx for idx in train_idx if idx in gr_split_idx]
            valid_idx = [idx for idx in valid_idx if idx in gr_split_idx]

        if target_lb >= 0:
            train_idx = [i for i in train_idx if total_trainset.targets[i] == target_lb]
            valid_idx = [i for i in valid_idx if total_trainset.targets[i] == target_lb]

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetSampler(valid_idx) if not rand_val else SubsetRandomSampler(valid_idx)

        if multinode:
            train_sampler = torch.utils.data.distributed.DistributedSampler(Subset(total_trainset, train_idx), num_replicas=dist.get_world_size(), rank=dist.get_rank())
    else:
        train_sampler = None
        valid_sampler = SubsetSampler([])

        if gr_id is not None:
            # filter by group
            idx2gr = total_trainset.gr_ids
            ps = PredefinedSplit(idx2gr)
            ps = ps.split()
            for _ in range(gr_id + 1):
                _, gr_split_idx = next(ps)
            targets = [total_trainset.targets[idx] for idx in gr_split_idx]
            total_trainset = Subset(total_trainset, gr_split_idx)
            total_trainset.targets = targets

        if train_idx is not None and valid_idx is not None:
            if dataset in ["svhn", "reduced_svhn"]:
                targets = [total_trainset.labels[idx] for idx in train_idx]
            else:
                targets = [total_trainset.targets[idx] for idx in train_idx]
            total_trainset = Subset(total_trainset, train_idx)
            total_trainset.targets = targets

        if multinode:
            train_sampler = torch.utils.data.distributed.DistributedSampler(total_trainset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
            logger.info(f'----- dataset with DistributedSampler {dist.get_rank()}/{dist.get_world_size()}')

    trainloader = torch.utils.data.DataLoader(
        total_trainset, batch_size=batch,
        shuffle=True if train_sampler is None else False,
        num_workers=8 if torch.cuda.device_count() == 8 else 4,
        pin_memory=True, sampler=train_sampler, drop_last=True)
    validloader = torch.utils.data.DataLoader(
        total_trainset, batch_size=batch, shuffle=False,
        num_workers=4, pin_memory=True,
        sampler=valid_sampler, drop_last=False if not rand_val else True)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch, shuffle=False,
        num_workers=8 if torch.cuda.device_count() == 8 else 4,
        pin_memory=True, drop_last=False
    )
    return train_sampler, trainloader, validloader, testloader
print('y')
print(y)
print()
print()
print('------------------------------')

######## manually splitting into training and validation set
test_fold = [-1] * 6 + [0] * 4
# -1 indicates that the index will belong to the training set;
# in this case, the first 6 records belong to the training set
# and the remaining 4 to the validation set
ps = PredefinedSplit(test_fold)

######## the resulting split
print('split training and validation set')
for train_index, test_index in ps.split():
    print('train_index', train_index)
    print('X_train (first 6 records of X)')
    print(X[train_index])
    print('y_train (first 6 records of y)')
    print(y[train_index])
    print()
    print()
    print('test_index', test_index)
    print('X_test (last 4 records of X)')
    print(X[test_index])
    print('y_test (last 4 records of y)')
    print(y[test_index])
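# The same ps object can be handed to anything accepting a cv argument; a
# minimal sketch (the estimator choice here is arbitrary):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# one score per split -- with a single predefined fold, that is exactly one
# validation score, measured on the last 4 records
print(cross_val_score(LogisticRegression(), X, y, cv=ps))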
                 errors='ignore')
df = df.set_index(pd.DatetimeIndex(df['time']))
X = df.drop(columns=[
    df.keys()[0], 'sbi', 'bemp', 'time', 'sbi_1h', 'sbi_2h', 'sbi_3h',
    'sbi_4h', 'sbi_5h', 'sbi_6h', 'sbi_7h', 'sbi_8h', 'sbi_9h', 'sbi_10h',
    'sbi_11h', 'sbi_12h', 'sbi_1d', 'sbi_2d', 'sbi_3d', 'sbi_4d', 'sbi_5d',
    'sbi_6d', 'sbi_7d', 'y_sbi'
])
Y = df['bemp']
print(X.columns)

# Data splitter
arr = index_splitter(N=len(X), fold=3)
ps = PredefinedSplit(arr)
for train, test in ps.split():
    train_index = train
    test_index = test

train_X, train_y = X.iloc[train_index, :], Y.iloc[train_index]
test_X, test_y = X.iloc[test_index, :], Y.iloc[test_index]

if True:
    regressor = LinearRegression()
    regressor.fit(train_X, train_y)
    pickle.dump(regressor, open('bemp_model.pkl', 'wb'))
    model = pickle.load(open('bemp_model.pkl', 'rb'))
    print(model.predict(test_X))
def lasso(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]

    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(0, 1, 10)
    random_grid = {'alpha': alpha}
    lasso = Lasso(random_state=42)

    # Look at parameters used by the baseline model
    print('Parameters currently in use:\n')
    pprint(lasso.get_params())

    # Random search of parameters over the predefined split
    lasso_random = RandomizedSearchCV(estimator=lasso,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2.split(), verbose=2,
                                      random_state=42, n_jobs=-1)
    # Fit the random search model
    lasso_random.fit(train_X, train_y)
    pprint(lasso_random.best_params_)
    cv_result_rd = lasso_random.cv_results_
    BestPara_random = lasso_random.best_params_

    # Grid search of parameters around the random-search optimum
    from sklearn.model_selection import GridSearchCV
    alpha = np.linspace(BestPara_random["alpha"] - 0.2,
                        BestPara_random["alpha"] + 0.2, 10)
    grid_grid = {'alpha': alpha}
    lasso_grid = GridSearchCV(estimator=lasso, param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2.split(), verbose=2, n_jobs=-1)
    # Fit the grid search model
    lasso_grid.fit(train_X, train_y)
    BestPara_grid = lasso_grid.best_params_
    pprint(lasso_grid.best_params_)
    cv_results_grid = lasso_grid.cv_results_

    # Fit the baseline model
    lasso.fit(train_X, train_y)

    # prediction
    predict_y = lasso_random.predict(test_X)
    predict_y_grid = lasso_grid.predict(test_X)
    predict_y_base = lasso.predict(test_X)

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = mean_squared_error(predict_y_grid, test_y)
    errors_Random_CV = mean_squared_error(predict_y, test_y)
    errors_baseline = mean_squared_error(predict_y_base, test_y)

    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    print('lasso results:', results)

    if True:
        fig = plt.figure(figsize=(20, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        # plt.show()
        plt.savefig('lasso_error_compare.png')

        # Lasso has no feature_importances_, so the importance plot used for
        # the tree models is omitted here

        fig = plt.figure(figsize=(20, 8))
        ax = fig.gca()
        x_label = range(0, len(predict_y_grid))
        plt.title("kfold=" + str(kfold))
        ax.plot(x_label, predict_y_grid, 'r--', label="predict")
        ax.plot(x_label, test_y, label="ground_truth")
        ax.set_ylim(0, 200)
        ax.legend()
        # plt.show()
        plt.savefig('lasso_prediction.png')

    return lasso_grid.predict, lasso_grid.best_estimator_