def feature_selection(self, test_function='rmse'):
    """Exhaustively search feature subsets to minimise validation error.

    Enumerates every non-empty subset of ``self.input_features`` (via
    ``powerset``), fits an ``xgboost.XGBRegressor`` on the training data for
    each subset, scores the forecast on the validation set with
    ``_test_metric``, and stores the best-scoring subset in
    ``self.best_features``.  That attribute can then be used to run the full
    model on the training and test data.

    Only really appropriate for a relatively small number of features: the
    search performs O(2**n) model fits.  To avoid computational intensity
    there is no per-subset hyperparameter optimisation; a single parameter
    set is computed from the data (``optimise_parameters``) and reused for
    every subset.  Once the best feature set has been identified, the
    library's additional functionality can be used to tune hyperparameters.

    Args:
        test_function: metric identifier forwarded to ``_test_metric``
            (default ``'rmse'``; see ``_test_metric?`` for alternatives).

    Note:
        The feature input data must be supplied as ``pd.DataFrame`` objects.
    """
    try:
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        feature_set = list(powerset(self.input_features))
        print("Feature Selection")
        print("=================")
        print(" ")
        print("Running feature selection on a feature set of size: ",
              len(feature_set) - 1)
        print(" ")
        feature_dict = {}
        list_results = []
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        # Fit hyperparameters once up front if none are cached yet.  Announce
        # *before* the potentially slow optimisation, not after it finishes.
        if self._latest_params is None:
            print("First optimising parameters over training set.")
            self.optimise_parameters()
        # Progress-report interval, scaled to the size of the search space.
        if len(feature_set) < 100:
            counter_check = 10
        elif len(feature_set) < 1000:
            counter_check = 100
        elif len(feature_set) < 2500:
            counter_check = 250
        elif len(feature_set) < 5000:
            counter_check = 500
        else:
            counter_check = 1000
        # feature_set[0] is the empty subset -- skip it.
        for counter, _features in enumerate(feature_set[1:]):
            if counter % counter_check == counter_check - 1:
                print('-------------------Completed ', counter + 1,
                      ' feature sets out of ', len(feature_set) - 1,
                      '-------------------\n')
            X_train_data_temp = X_train_data[list(_features)]
            X_val_data_temp = X_val_data[list(_features)]
            feature_dict[counter] = list(_features)
            temp_model = xgboost.XGBRegressor(**self._latest_params)
            temp_model.fit(X_train_data_temp, Y_train_data)
            val_forecast = temp_model.predict(X_val_data_temp)
            val_rmse = _test_metric(Y_val_data, val_forecast, test_function)[0]
            list_results.append(val_rmse)
        print(
            '-------------------Finished iterating through possible feature sets.-------------------\n'
        )
        # Rank subsets by validation error; index[0] identifies the winner.
        test_mse_df = pd.DataFrame({'test_mse': list_results})
        lowest_test_mse = test_mse_df.sort_values(['test_mse'])
        index = lowest_test_mse.index
        self.best_features = feature_dict[index[0]]
        # Refit once on the winning subset and report its validation metrics
        # (a single predict/_test_metric pass -- no redundant recomputation).
        X_train_data_temp = X_train_data[feature_dict[index[0]]]
        X_val_data_temp = X_val_data[feature_dict[index[0]]]
        temp_model = xgboost.XGBRegressor(**self._latest_params)
        temp_model.fit(X_train_data_temp, Y_train_data)
        val_forecast = temp_model.predict(X_val_data_temp)
        final_rmse = _test_metric(Y_val_data, val_forecast, test_function)
        print('Lowest Error on validation set with feature set: ',
              feature_dict[index[0]], '\n\n')
        print(
            'Set best_features attribute to this set. With this choice, the following regression results were obtained on the training data:\n\n'
        )
        print('The RMSE on the validation set was: ', final_rmse[0])
        print('The mean percentage error is: ', final_rmse[1], '%.')
        print(
            '\nFinished feature selection. To see list of best_features, call get_best_features() on your classifier. To access the regression parameters, call get_latest_params()'
        )
    except TypeError as te:
        print(te.args[0])
def feature_selection(self, test_function='rmse'):
    """Exhaustively search feature subsets to minimise validation error (OLS).

    Enumerates every non-empty subset of ``self.input_features`` (via
    ``powerset``), fits an ordinary-least-squares model (``sm.OLS``) on the
    training data for each subset, scores the forecast on the validation set
    with ``_test_metric``, and stores the best-scoring subset in
    ``self.best_features``.  The winning fit is also cached on the instance
    (``self.__result``, ``self._params``, ``self._was_regularised``).

    Only really appropriate for a relatively small number of features: the
    search performs O(2**n) model fits.

    Args:
        test_function: metric identifier forwarded to ``_test_metric``
            (default ``'rmse'``; see ``_test_metric?`` for alternatives).

    Note:
        The feature input data must be supplied as ``pd.DataFrame`` objects.
    """
    try:
        if not isinstance(self.get_data('X_train'), pd.DataFrame):
            raise TypeError(
                "ERROR: The input training data was not in the form of a pd.DataFrame."
            )
        feature_set = list(powerset(self.input_features))
        print("Feature Selection")
        print("=================")
        print(" ")
        print("Running feature selection on a feature set of size: ",
              len(feature_set) - 1)
        print(" ")
        feature_dict = {}
        list_results = []
        X_train_data = self.get_data('X_train')
        X_val_data = self.get_data('X_val')
        Y_train_data = self.get_data('Y_train')
        Y_val_data = self.get_data('Y_val')
        # Progress-report interval, scaled to the size of the search space.
        if len(feature_set) < 100:
            counter_check = 10
        elif len(feature_set) < 1000:
            counter_check = 100
        elif len(feature_set) < 2500:
            counter_check = 250
        elif len(feature_set) < 5000:
            counter_check = 500
        else:
            counter_check = 1000
        # feature_set[0] is the empty subset -- skip it.
        for counter, _features in enumerate(feature_set[1:]):
            if counter % counter_check == counter_check - 1:
                print('-------------------Completed ', counter + 1,
                      ' feature sets out of ', len(feature_set) - 1,
                      '-------------------\n')
            X_train_data_temp = X_train_data[list(_features)]
            X_val_data_temp = X_val_data[list(_features)]
            feature_dict[counter] = list(_features)
            lin_model = sm.OLS(Y_train_data, X_train_data_temp)
            result = lin_model.fit()
            # For a no-constant OLS fit, result.predict(exog) is the dot
            # product exog @ params -- same values as the manual
            # (exog * params).sum(axis=1) it replaces.
            val_forecast = result.predict(X_val_data_temp)
            val_rmse = _test_metric(Y_val_data, val_forecast, test_function)[0]
            list_results.append(val_rmse)
        print(
            '-------------------Finished iterating through possible feature sets.-------------------\n'
        )
        # Rank subsets by validation error; index[0] identifies the winner.
        test_mse_df = pd.DataFrame({'test_mse': list_results})
        lowest_test_mse = test_mse_df.sort_values(['test_mse'])
        index = lowest_test_mse.index
        # Refit once on the winning subset and report its validation metrics.
        X_train_data_temp = X_train_data[feature_dict[index[0]]]
        X_val_data_temp = X_val_data[feature_dict[index[0]]]
        lin_model = sm.OLS(Y_train_data, X_train_data_temp)
        result = lin_model.fit()
        val_forecast = result.predict(X_val_data_temp)
        final_rmse = _test_metric(Y_val_data, val_forecast, test_function)
        print('Lowest Error on validation set with feature set: ',
              feature_dict[index[0]], '\n\n')
        print(
            'Set best_features attribute to this set. With this choice, the following regression results were obtained on the training data:\n\n'
        )
        self.best_features = feature_dict[index[0]]
        self.__result = result
        self._params = result.params
        self._was_regularised = False
        print(result.summary(), '\n\n')
        print('The RMSE on the validation set was: ', final_rmse[0])
        print('The mean percentage error is: ', final_rmse[1], '%.')
        print(
            '\nFinished feature selection. To see list of best_features, call get_best_features() on your classifier. To access the regression parameters, call get_latest_params()'
        )
    except TypeError as te:
        print(te.args[0])