def main():
    args = parse_args()
    data_path = Path(args.data_folder)
    output_path = Path(args.output)
    output_path.mkdir(parents=True, exist_ok=True)

    interval_size_scorer = IntervalScorer(
        mean_interval_size, {"confidence": args.confidence})
    error_rate_scorer = IntervalScorer(
        mean_error_rate, {"confidence": args.confidence})
    scorers = {
        "mean_interval_size": interval_size_scorer,
        "mean_error_rate": error_rate_scorer
    }

    for filepath in data_path.glob("*.arff"):
        X, y = load_arff_data(filepath)
        print(X.shape)
        for i in range(args.repeats):
            mfr = MondrianForestRegressor(n_estimators=args.n_estimators)
            results = prequential_evaluation(mfr, X, y, scorers,
                                             args.window_size)
            out_file = output_path / (filepath.stem + "_{}.json".format(i))
            # argparse.Namespace is not JSON-serializable; store its dict.
            results["arguments"] = vars(args)
            results["learner_params"] = mfr.get_params()
            out_file.write_text(json.dumps(results))
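# prequential_evaluation is not defined in this file. Below is a minimal
# sketch of prequential ("test-then-train") evaluation consistent with the
# call in main(); the function name, return structure and the
# scorer(model, X, y) signature are assumptions, not the real implementation.
def prequential_evaluation_sketch(model, X, y, scorers, window_size):
    scores = {name: [] for name in scorers}
    for start in range(0, len(X), window_size):
        X_w = X[start:start + window_size]
        y_w = y[start:start + window_size]
        if start > 0:
            # Score each incoming window before training on it.
            for name, scorer in scorers.items():
                scores[name].append(scorer(model, X_w, y_w))
        model.partial_fit(X_w, y_w)
    return {"scores": scores}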
def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mfr = MondrianForestRegressor(random_state=0)
    mfr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mfr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianForestClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)
def test_fit_after_partial_fit():
    rng = np.random.RandomState(0)
    X = rng.randn(10, 5)
    y = np.floor(rng.randn(10))
    mfr = MondrianForestRegressor(random_state=0)
    check_fit_after_partial_fit(mfr, X, y)

    mfc = MondrianForestClassifier(random_state=0)
    check_fit_after_partial_fit(mfc, X, y)
def test_min_samples_split():
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)

    for mss in [2, 4, 10, 20]:
        mfr = MondrianForestRegressor(random_state=0, min_samples_split=mss)
        mfr.partial_fit(X_r[:X_r.shape[0] // 2], y_r[:X_r.shape[0] // 2])
        mfr.partial_fit(X_r[X_r.shape[0] // 2:], y_r[X_r.shape[0] // 2:])
        for est in mfr.estimators_:
            # Internal (non-leaf) nodes must hold at least mss samples.
            n_node_samples = est.tree_.n_node_samples[
                est.tree_.children_left != -1]
            assert_greater(np.min(n_node_samples) + 1, mss)

        mfc = MondrianForestClassifier(random_state=0, min_samples_split=mss)
        mfc.partial_fit(X_c[:X_c.shape[0] // 2], y_c[:X_c.shape[0] // 2])
        mfc.partial_fit(X_c[X_c.shape[0] // 2:], y_c[X_c.shape[0] // 2:])
        for est in mfc.estimators_:
            n_node_samples = est.tree_.n_node_samples[
                est.tree_.children_left != -1]
            assert_greater(np.min(n_node_samples) + 1, mss)
def test_forest_attributes():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_false(hasattr(mr, "classes_"))
    assert_false(hasattr(mr, "n_classes_"))

    mr = MondrianForestClassifier(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_true(hasattr(mr, "classes_"))
    assert_true(hasattr(mr, "n_classes_"))
def test_quantile_toy_data():
    rng = np.random.RandomState(1)
    x1 = rng.randn(1, 10)
    X1 = np.tile(x1, (10000, 1))
    x2 = 20.0 * rng.randn(1, 10)
    X2 = np.tile(x2, (10000, 1))
    X = np.concatenate((X1, X2))

    y1 = rng.randn(10000)
    y2 = 5.0 + rng.randn(10000)
    y = np.concatenate((y1, y2))

    est = MondrianForestRegressor(random_state=1)
    # est.set_params(max_depth=1)
    est.fit(X, y)

    for quantile in range(10, 90, 10):
        tree_quantile = 0.01 * quantile
        assert_array_almost_equal(
            est.predict_quantile(x1, quantile=tree_quantile),
            [np.percentile(y1, quantile)], 2)
        assert_array_almost_equal(
            est.predict_quantile(x2, quantile=tree_quantile),
            [np.percentile(y2, quantile)], 2)
def check_partial_fit_equivalence(size_batch, f, random_state, X, y,
                                  is_clf=False):
    start_ptr = list(range(0, 100, size_batch))
    end_ptr = start_ptr[1:] + [100]
    if not is_clf:
        p_f = MondrianForestRegressor(random_state=random_state)
    else:
        p_f = MondrianForestClassifier(random_state=random_state)
    for start, end in zip(start_ptr, end_ptr):
        p_f.partial_fit(X[start:end], y[start:end])
    for est, p_est in zip(f.estimators_, p_f.estimators_):
        assert_array_equal(p_est.tree_.n_node_samples,
                           est.tree_.n_node_samples)
        assert_array_equal(p_est.tree_.threshold, est.tree_.threshold)
        assert_array_equal(p_est.tree_.feature, est.tree_.feature)
        assert_equal(p_est.tree_.root, est.tree_.root)
        assert_array_equal(p_est.tree_.value, est.tree_.value)
        assert_equal(est.tree_.n_node_samples[est.tree_.root], 100)
        assert_equal(p_est.tree_.n_node_samples[est.tree_.root], 100)
def test_interval_scorer():
    # Fit a forest to a simple linear function without noise.
    n_samples = 200
    n_features = 10
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.dot(X, w)
    mfr = MondrianForestRegressor()
    mfr.fit(X, y)

    # Create a scorer that measures the mean interval size.
    interval_size_scorer = IntervalScorer(
        mean_interval_size, sign=-1, kwargs={'confidence': 0.9})

    # Get prediction intervals.
    intervals = mfr.predict_interval(X, 0.9)
    interval_size = intervals[:, 1] - intervals[:, 0]
    calc_mean = np.mean(interval_size)

    # Ensure the scorer performs the correct calculation.
    assert_almost_equal(interval_size_scorer(mfr, X, y), -1 * calc_mean)
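# IntervalScorer and mean_interval_size are exercised above but not defined
# in this section. This is a hypothetical sketch consistent with both call
# sites (a positional kwargs dict in main(), keyword arguments here); the
# real implementation may differ.
class IntervalScorerSketch:
    def __init__(self, score_func, kwargs=None, sign=1):
        self.score_func = score_func
        self.kwargs = kwargs or {}
        self.sign = sign

    def __call__(self, estimator, X, y):
        # Delegate to the wrapped metric, flipping the sign if requested.
        return self.sign * self.score_func(estimator, X, y, **self.kwargs)


def mean_interval_size_sketch(estimator, X, y, confidence=0.9):
    # Mean width of the estimator's prediction intervals.
    intervals = estimator.predict_interval(X, confidence)
    return np.mean(intervals[:, 1] - intervals[:, 0])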
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)

    # For points contained in the training data, with max_depth set to None,
    # the mean should converge to the actual target value and the variance
    # should converge to 0.0.
    mean, std = mfr.predict(X, return_std=True)
    assert_array_almost_equal(mean, y, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points far away from the training data, the prediction should
    # converge to the empirical mean and variance. X is scaled between
    # -1.0 and 1.0.
    X_inf = np.vstack(
        (30.0 * np.ones(X.shape[1]), -30.0 * np.ones(X.shape[1])))
    inf_mean, inf_std = mfr.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y.mean(), 1)
    assert_array_almost_equal(inf_std, y.std(), 2)
from skgarden import MondrianForestClassifier
from skgarden import MondrianForestRegressor

train_test_split.__test__ = False

boston = load_boston()
# The time of split and feature chosen for splitting are highly
# scale-sensitive.
scaler = MinMaxScaler()
X, y = boston.data, boston.target
y = np.round(y)
X = scaler.fit_transform(X)

ensembles = [
    MondrianForestRegressor(random_state=0),
    MondrianForestClassifier(random_state=0)]


def check_boston(est):
    score = est.score(X, y)
    assert_greater(score, 0.94, "Failed with score = %f" % score)


def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    check_boston(mr)
    mr.partial_fit(X, y)
    check_boston(mr)
def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    score = mr.score(X, y)
    assert_greater(score, 0.94, "Failed with score = %f" % score)
class Client:
    """gRPC Client class for streaming competition platform."""
    channel = None
    stub = None

    def __init__(self, batch_size):
        """
        :param batch_size: Integer value, defined by the competition and
            available at the competition page
        :param server_port: Connection string ('IP:port')
        :param user_email: String, e-mail used for registering to competition
        :param token: String, received after subscription to a competition
        :param competition_code: String, received after subscription to a
            competition
        :param first_prediction: Prediction, class generated from .proto
            file. Used to initiate communication with the server. Does not
            influence the results. Should contain appropriate fields from
            the .proto file.
        """
        # mondrian
        self.mfr = MondrianForestRegressor(random_state=1, n_estimators=100,
                                           bootstrap=True)
        self.previous_target_3 = pd.Series()
        self.features_for_rowID = Queue()
        self.previous_train_batch = np.array([-1, -1, -1, -1, -1])

        # rrcf
        self.num_trees = 40
        self.tree_size = 256
        self.forest = []
        self.avg_codisp = {}
        self.curr_sum = 0
        self.curr_num = 0
        self.idx = 0

        self._init_modeling()

        # Busy-wait until 21:00 before connecting to the server.
        while True:
            print("wait")
            now = datetime.datetime.now()
            starttime = now.replace(hour=21, minute=0, second=0,
                                    microsecond=0)
            if now >= starttime:
                print(now)
                print("Start!")
                break

        self.batch_size = batch_size
        self.stop_thread = False
        self.predictions_to_send = Queue()
        self.channel = grpc.insecure_channel(
            'app.streaming-challenge.com:50051')
        self.stub = file_pb2_grpc.DataStreamerStub(self.channel)
        self.user_email = '*****@*****.**'
        self.competition_code = 'jR'  # oj
        self.token = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoieXU5OTA1MjRAZ21haWwuY29tIiwiY29tcGV0aXRpb25faWQiOiIxIn0.B7CAjAsEbTjp4l1K4GR1Y0IJZj6_mKEbKBXsXXJmGBg'
        self.predictions_to_send.put(
            file_pb2.Prediction(rowID=1000, target=333))
        self.metadata = self.create_metadata(user_id=self.user_email,
                                             code=self.competition_code,
                                             token=self.token)

    @staticmethod
    def create_metadata(user_id, code, token):
        """Build the gRPC metadata list used to authenticate requests."""
        metadata = [(b'authorization', bytes(token, 'utf-8')),
                    (b'user_id', bytes(user_id, 'utf-8')),
                    (b'competition_id', bytes(code, 'utf-8'))]
        return metadata

    @staticmethod
    def create_forest(num_trees):
        forest = []
        for _ in range(num_trees):
            tree = rrcf.RCTree()
            forest.append(tree)
        return forest

    def partial_train(self, X_test, y_test):
        y_pred, y_std = self.mfr.predict(X_test, return_std=True)
        self.mfr.partial_fit(X_test, y_test)
        # print('pred : %f, std: %f, y: %f' % (y_pred, y_std, y_test))
        return y_pred, y_std

    def _init_modeling(self):
        network = pd.read_csv('initial_training_data.csv', index_col='date',
                              parse_dates=['date'])
        self.forest = []
        for _ in range(self.num_trees):
            tree = rrcf.RCTree()
            self.forest.append(tree)

        train_len = len(network)
        # train_len = 1000
        train_start = 80000
        self.idx = 0
        print("start!")
        for index in range(train_start, train_len):
            point = float(network[index:index + 1].values)  # get one by one
            for tree in self.forest:
                if len(tree.leaves) > self.tree_size:
                    tree.forget_point(self.idx - self.tree_size)
                tree.insert_point(point, index=self.idx)
                if self.idx not in self.avg_codisp:
                    self.avg_codisp[self.idx] = 0
                # avg_codisp is the average, over all trees, of how strongly
                # each tree considers this point an anomaly.
                self.avg_codisp[self.idx] += tree.codisp(
                    self.idx) / self.num_trees
            mean = np.array(list(self.avg_codisp.values())).mean()
            std = np.array(list(self.avg_codisp.values())).std()
            z = (self.avg_codisp[self.idx] - mean) / std
            self.idx += 1
            if z > 3.0 or z < -3.0:
                # If abs(z-score) is over 3.0, replace the value with the
                # mean of the previous 5 days.
                network.iloc[index] = network[index - 5:index].mean()
        # print("anomaly detection in _init_modeling done")

        print("starting training in _init_modeling")
        for i in range(7 + train_start, train_len):
            X_train = pd.Series()
            X_train['prev1'] = float(network[i - 7:i - 6]['target'].values)
            X_train['prev2'] = float(network[i - 6:i - 5]['target'].values)
            X_train['prev3'] = float(network[i - 5:i - 4]['target'].values)
            y_train = (network[i:i + 1]['target'].values)
            self.mfr.partial_fit(X_train.values.reshape(1, -1), y_train)
        print("training done")

        self.previous_target_3['prev3'] = float(
            network[train_len - 8:train_len - 7]['target'].values)
        self.previous_target_3['prev2'] = float(
            network[train_len - 7:train_len - 6]['target'].values)
        self.previous_target_3['prev1'] = float(
            network[train_len - 6:train_len - 5]['target'].values)
        self.previous_train_batch = network[train_len - 5:
                                            train_len]['target'].values
        print('ended')

    def generate_predictions(self):
        """Send predictions from the queue.

        :return: Prediction
        """
        while True:
            try:
                prediction = self.predictions_to_send.get(block=True,
                                                          timeout=60)
                print("Prediction: ", prediction)
                yield prediction
            except queue.Empty:
                self.stop_thread = True
                break

    # check anomaly with RRCF
    def anomaly_detection(self, data):
        for tree in self.forest:
            if len(tree.leaves) > self.tree_size:
                tree.forget_point(self.idx - self.tree_size)
            tree.insert_point(data, index=self.idx)
            if self.idx not in self.avg_codisp:
                self.avg_codisp[self.idx] = 0
            # avg_codisp is the average, over all trees, of how strongly
            # each tree considers this point an anomaly.
            self.avg_codisp[self.idx] += tree.codisp(self.idx) / self.num_trees
        mean = np.array(list(self.avg_codisp.values())).mean()
        std = np.array(list(self.avg_codisp.values())).std()
        z = (self.avg_codisp[self.idx] - mean) / std
        self.idx += 1
        if z > 3.0 or z < -3.0:
            # If abs(z-score) is over 3.0, replace the value with the mean
            # of the data seen so far.
            return self.previous_train_batch.mean()
        else:
            # Not over 3.0, so no need to replace the value.
            return data

    def loop_messages(self):
        """Get messages (data instances) from the stream."""
        # generate_predictions() pulls predictions from predictions_to_send
        # one by one and sends them to the server.
        messages = self.stub.sendData(self.generate_predictions(),
                                      metadata=self.metadata)
        test_idx = 0
        test_feature = self.previous_target_3
        try:
            for message in messages:
                message = json.loads(json_format.MessageToJson(message))
                print("message:", message)
                if message['tag'] == 'TEST':
                    print('test')
                    test_feature['prev3'] = test_feature['prev2']
                    test_feature['prev2'] = test_feature['prev1']
                    test_feature['prev1'] = float(
                        self.previous_train_batch[test_idx])
                    pred = self.mfr.predict(test_feature.values.reshape(1, -1))
                    prediction = file_pb2.Prediction(rowID=message['rowID'],
                                                     target=float(pred[0]))
                    self.predictions_to_send.put(prediction)
                    # test_idx = (test_idx + 1) % 5
                    print(test_idx)
                    print('test end')
                if message['tag'] == 'TRAIN':
                    print('train')
                    # training data to train my model
                    target = message['target']
                    target = self.anomaly_detection(target)
                    print(self.previous_target_3)
                    # train with the values at i-5, i-6 and i-7
                    if self.previous_target_3['prev3'] < 0:
                        self.previous_target_3['prev3'] = target
                    elif self.previous_target_3['prev2'] < 0:
                        self.previous_target_3['prev2'] = target
                    elif self.previous_target_3['prev1'] < 0:
                        self.previous_target_3['prev1'] = target
                    else:
                        print('else')
                        # replace the oldest value
                        self.previous_target_3[
                            'prev3'] = self.previous_target_3['prev2']  # -7
                        self.previous_target_3[
                            'prev2'] = self.previous_target_3['prev1']  # -6
                        self.previous_target_3['prev1'] = float(
                            self.previous_train_batch[0])  # -5
                        # partial fit with 3 previous values as features
                        self.mfr.partial_fit(
                            self.previous_target_3.values.reshape(1, -1),
                            [target])
                        # store the target value of the current train batch
                        self.previous_train_batch = np.roll(
                            self.previous_train_batch, -1)
                        self.previous_train_batch[4] = target
                        print('else end')
                    print('train end')
                if self.stop_thread:
                    break
        except Exception as e:
            print(str(e))

    def run(self):
        """Start thread."""
        print("Start")
        t1 = Thread(target=self.loop_messages)
        t1.start()
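# Hypothetical entry point for the client above; the batch size is defined
# by the competition, so the value here is purely illustrative.
if __name__ == "__main__":
    client = Client(batch_size=5)
    client.run()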
import numpy as np
from sklearn.datasets import load_boston

X = load_boston(return_X_y=True)
X_train = X[0]
y_train = X[1]
# print(X_train)
print(X_train.shape)
print(np.amax(X_train))
print(np.amin(X_train))

### Use MondrianForests for variance estimation
from skgarden import MondrianForestRegressor

mfr = MondrianForestRegressor()
mfr.fit(X_train, y_train)
y_mean, y_std = mfr.predict(X_train, return_std=True)
print(y_mean)
# print(y_std)

### Use QuantileForests for quantile estimation
# from skgarden import RandomForestQuantileRegressor
# rfqr = RandomForestQuantileRegressor(random_state=0)
# rfqr.fit(X, y)
# y_mean = rfqr.predict(X)
# y_median = rfqr.predict(X, 50)
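# Sketch: if the per-point predictive distribution is treated as roughly
# Gaussian, the returned standard deviation yields an approximate 95%
# prediction interval around the mean.
lower = y_mean - 1.96 * y_std
upper = y_mean + 1.96 * y_std
print(lower[:5], upper[:5])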
scaler_X = preprocessing.MinMaxScaler()
features = scaler_X.fit_transform(features)
features = pd.DataFrame(features)

scaler_y = preprocessing.MinMaxScaler()
labels = scaler_y.fit_transform(labels)
labels = pd.DataFrame(labels)

# REGRESSORS
PAR = PassiveAggressiveRegressor()
SGDR = SGDRegressor()
MLPR = MLPRegressor()
RHT = RegressionHoeffdingTree()
RHAT = RegressionHAT()
MFR = MondrianForestRegressor()
MTR = MondrianTreeRegressor()

regressors = [PAR, SGDR, MLPR, RHT, RHAT, MFR, MTR]  # 7
regressors_names = []
for r in range(len(regressors)):
    reg_name = regressors[r].__class__.__name__
    if reg_name == 'PassiveAggressiveRegressor':
        regressors_names.append('PAR')
    elif reg_name == 'SGDRegressor':
        regressors_names.append('SGDR')
    elif reg_name == 'MLPRegressor':
        regressors_names.append('MLPR')
    elif reg_name == 'RegressionHoeffdingTree':
        regressors_names.append('RHT')
    elif reg_name == 'RegressionHAT':
        regressors_names.append('RHAT')
    elif reg_name == 'MondrianForestRegressor':
        regressors_names.append('MFR')
    elif reg_name == 'MondrianTreeRegressor':
        regressors_names.append('MTR')
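# The if/elif chain above can be written as a lookup table; a sketch (the
# dict name is illustrative, not part of the original code):
ABBREVIATIONS = {
    'PassiveAggressiveRegressor': 'PAR',
    'SGDRegressor': 'SGDR',
    'MLPRegressor': 'MLPR',
    'RegressionHoeffdingTree': 'RHT',
    'RegressionHAT': 'RHAT',
    'MondrianForestRegressor': 'MFR',
    'MondrianTreeRegressor': 'MTR',
}
regressors_names = [ABBREVIATIONS[r.__class__.__name__] for r in regressors]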
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)
    check_mean_std_forest_regressor(mfr)
    mfr.partial_fit(X, y)
    check_mean_std_forest_regressor(mfr)
def RF_regressor(X_data, Y_data, options=None):
    from sklearn.ensemble import RandomForestRegressor

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch = False
    GS_settings = None
    randomsearch = False
    RS_settings = None
    feature_selection = False
    accuracy = False
    cv_type = 'logo'
    scoring = 'neg_mean_absolute_error'
    mondrian = False
    search_std = False

    if options is not None:
        if "RF_parameters" in options:
            params = options['RF_parameters']

        if "grid_search" in options:
            from sklearn.model_selection import GridSearchCV
            import time
            gridsearch = True
            GS_params = options['grid_search']['parameter_grid']
            if "settings" in options['grid_search']:
                GS_settings = options['grid_search']['settings']
            if "search std" in options['grid_search']:
                search_std = options['grid_search']['search std']

        if "random_search" in options:
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            import time
            randomsearch = True
            RS_params, RS_Nmax = convert_param_dist(
                options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if "settings" in options['random_search']:
                RS_settings = options['random_search']['settings']

        if randomsearch and gridsearch:
            quit('********** Stopping! grid_search and random_search both set *********')

        if "feature_selection" in options:
            from cfd2ml.utilities import RFE_perm
            feature_selection = True
            feats = options['feature_selection']['feats']
#            if "step" in options['feature_selection']:
#                step = options['feature_selection']['step']
#            if "min_features" in options['feature_selection']:
#                min_features = options['feature_selection']['min_features']
            if randomsearch or gridsearch:
                quit('******** Stopping! grid/random_search and feature selection both set ********')

        if "accuracy" in options:
            accuracy = options['accuracy']
            if accuracy:
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        if "scoring" in options:
            scoring = options['scoring']

        if "cv_type" in options:
            cv_type = options['cv_type']

        if "mondrian" in options:
            mondrian = options['mondrian']
            if mondrian:
                from skgarden import MondrianForestRegressor

    ##############
    # Prepare data
    ##############
    if cv_type == 'logo':
        groups = X_data['group']
        X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header = Y_data.name
    nX = X_headers.size
    print('\nFeatures:')
    for i in range(0, nX):
        print('%d/%d: %s' % (i + 1, nX, X_headers[i]))
    print('\nTarget: ', Y_header)

    ########################
    # Prepare other settings
    ########################
    # Setting cross-validation type (either leave-one-group-out or 10-fold)
    if cv_type == 'logo':
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif cv_type == 'kfold':
        from sklearn.model_selection import StratifiedKFold
        print('\nUsing 10-fold cross validation')
        k_fold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
        cv = k_fold.split(X_data, Y_data)

    #########################
    # Training the regressor
    #########################
    if gridsearch:
        # Finding optimal hyperparameters with GridSearchCV
        if mondrian:
            print('\n Performing GridSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params, random_state=42, bootstrap=False)
            if search_std:
                # MESSY HACK! Ignore "best model" etc. if using this.
                def my_scorer(model, X, y_true):
                    y_pred, y_sd = model.predict(X, return_std=True)
                    return np.mean(y_sd)
                scoring = my_scorer
        else:
            print('\n Performing GridSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params, random_state=42)
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        GS_regr = GridSearchCV(estimator=regr, param_grid=GS_params, cv=cv,
                               scoring=scoring, iid=False, verbose=2,
                               **(GS_settings or {}))
        GS_regr.fit(X_data, Y_data)
        # Write out results to file
        scores_df = pd.DataFrame(GS_regr.cv_results_)  # .sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')
        # Pick out best results
        best_params = GS_regr.best_params_
        best_score = GS_regr.best_score_
        regr = GS_regr.best_estimator_  # (this regr has been fit to all of the X_data, Y_data)
        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    elif randomsearch:
        # Finding optimal hyperparameters with RandomizedSearchCV
        if mondrian:
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params, random_state=42, bootstrap=False)
        else:
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params, random_state=42)
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        RS_regr = RandomizedSearchCV(estimator=regr, param_distributions=RS_params,
                                     cv=cv, scoring=scoring, iid=False, verbose=2,
                                     error_score=np.nan, **(RS_settings or {}))
        RS_regr.fit(X_data, Y_data)
        # Write out results to file
        scores_df = pd.DataFrame(RS_regr.cv_results_)  # .sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')
        # Pick out best results
        best_params = RS_regr.best_params_
        best_score = RS_regr.best_score_
        regr = RS_regr.best_estimator_  # (this regr has been fit to all of the X_data, Y_data)
        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    else:
        # Train RF regressor with hyperparameters given by user
        if mondrian:
            print('\nTraining mondrian forest regressor with given hyperparameters')
            regr = MondrianForestRegressor(**params, bootstrap=False)
        else:
            print('\nTraining random forest regressor with given hyperparameters')
            regr = RandomForestRegressor(**params)

    # Feature selection before final fit
    if feature_selection:
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
#        [nfeats, scores, traintimes, predtimes], bestscore, bestfeat, featsets = RFE_perm(regr, X_data, Y_data, cv=cv, scoring=scoring, step=step, min_features=min_features, timing=True)
        [nfeats, scores, traintimes, predtimes], bestscore, bestfeat, featsets = RFE_perm(
            regr, X_data, Y_data, feats, cv=cv, scoring=scoring, timing=True)
        if scoring == 'neg_mean_absolute_error':
            scores = -scores
            bestscore = -bestscore
        elif scoring == 'neg_mean_squared_error':
            scores = np.sqrt(-scores)
            bestscore = np.sqrt(-bestscore)

        import matplotlib.pyplot as plt
        plt.figure()
        plt.plot(nfeats, 100 * scores, lw=2)
        plt.xlabel('$N_{features}$')
        plt.ylabel('Score (%)')
        plt.figure()
        plt.plot(nfeats, traintimes, label='Training', lw=2)
        plt.plot(nfeats, predtimes, label='Prediction', lw=2)
        plt.xlabel('$N_{features}$')
        plt.ylabel('Time (s)')
        plt.legend()
        plt.show()

        print('Best score: %.2f' % (100 * bestscore))
        print('Feature set:')
        print(X_headers[bestfeat])

        # Save results in CSV file
        featselect_df = pd.DataFrame(featsets, columns=X_headers)
        featselect_df['score'] = scores
        featselect_df['traintimes'] = traintimes
        featselect_df['predtimes'] = predtimes
        featselect_df['nfeats'] = nfeats
        featselect_df.to_csv('FeatSelect_results.csv')

        # Cut down to the optimal feature set
        X_data = X_data.iloc[:, bestfeat]

    # Fit model to data
    regr.fit(X_data, Y_data)

    # Cross validation accuracy metrics
    if accuracy:
        print('\nPerforming cross validation to determine train and test accuracy/error')

        # Get generator object depending on cv strategy
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        elif cv_type == 'kfold':
            cv = k_fold.split(X_data, Y_data)  # Need to regen "generator" object

        from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        # Init lists
        train_r2 = []
        test_r2 = []
        train_MAE = []
        test_MAE = []
        train_MSE = []
        test_MSE = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train regressor
            regr_cv = regr
            regr_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = regr_cv.predict(X_train)
            Y_pred_test = regr_cv.predict(X_test)

            # r2 scores
            r2score = r2_score(Y_test, Y_pred_test)
            train_r2.append(r2_score(Y_train, Y_pred_train))
            test_r2.append(r2score)

            # Mean absolute error scores
            MAEscore = mean_absolute_error(Y_test, Y_pred_test)
            train_MAE.append(mean_absolute_error(Y_train, Y_pred_train))
            test_MAE.append(MAEscore)

            # Mean squared error scores
            MSEscore = mean_squared_error(Y_test, Y_pred_test)
            train_MSE.append(mean_squared_error(Y_train, Y_pred_train))
            test_MSE.append(MSEscore)

            # Print validation scores (training scores are stored to print
            # the mean later, but not printed for each fold)
            if cv_type == 'logo':
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif cv_type == 'kfold':
                print('\nFold = ', i)
            print('-------------------')
            print('r2 score = %.2f %%' % (r2score * 100))
            print('Mean absolute error = %.2f %%' % (MAEscore * 100))
            print('Mean squared error = %.2f %%' % (MSEscore * 100))
            i += 1

        # Print performance scores
        print('\nMean training scores:')
        print('r2 score = %.2f %%' % (np.mean(train_r2) * 100))
        print('Mean absolute error = %.2f %%' % (np.mean(train_MAE) * 100))
        print('Mean squared error = %.2f %%' % (np.mean(train_MSE) * 100))

        print('\nMean validation scores:')
        print('r2 score = %.2f %%' % (np.mean(test_r2) * 100))
        print('Mean absolute error = %.2f %%' % (np.mean(test_MAE) * 100))
        print('Mean squared error = %.2f %%' % (np.mean(test_MSE) * 100))

    return regr
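# Hypothetical usage sketch for RF_regressor: the option keys are the ones
# parsed above; X_data is assumed to be a DataFrame carrying a 'group'
# column when cv_type='logo', and Y_data a named Series.
options = {
    'RF_parameters': {'n_estimators': 100},
    'mondrian': True,   # use MondrianForestRegressor instead of sklearn's RF
    'accuracy': True,   # report per-fold and mean CV metrics
    'cv_type': 'logo',  # leave-one-group-out cross validation
}
regr = RF_regressor(X_data, Y_data, options=options)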