def samples_per_leaf_node_ensemble(meta_m, m):
    path_out_tr = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/samples_leaf_node_ensemble_train.csv"
    path_out_te = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/samples_leaf_node_ensemble_test.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    n_estimators = [50]
    samples_per_leaf = [1000]
    header = ["ntree", "nnode", "tb_per_px"]
    with open(path_out_tr, "w", newline="") as wtr:
        with open(path_out_te, "w", newline="") as wte:
            writer_tr = csv.writer(wtr, delimiter=";")
            writer_tr.writerow(header)
            writer_te = csv.writer(wte, delimiter=";")
            writer_te.writerow(header)
            for spl in samples_per_leaf:
                for n_esti in n_estimators:
                    print()
                    print("Analysis: RF with Skewed Leaves")
                    print("Samples per leaf node: ", spl)
                    print("Number of estimators: ", n_esti)
                    print("-" * 50)
                    ensemble = RandomForestRegressor(n_estimators=n_esti,
                                                     min_samples_leaf=spl,
                                                     bootstrap=True)
                    ensemble.fit(xtrain, ytrain)
                    leaves_train = ensemble.apply(xtrain)
                    dicori_train = samples_per_leaf_node(leaves_train, xtrain, ytrain)
                    l = []
                    for key in sorted(dicori_train.keys()):
                        for sam in dicori_train[key]:
                            l.append(sam[0])
                        newrow = [key[0], key[1]] + l
                        writer_tr.writerow(newrow)
                        l = []
                    leaves_test = ensemble.apply(xtest)
                    dicori_test = samples_per_leaf_node(leaves_test, xtest, ytest)
                    l = []
                    for key in sorted(dicori_test.keys()):
                        for sam in dicori_test[key]:
                            l.append(sam[0])
                        newrow = [key[0], key[1]] + l
                        writer_te.writerow(newrow)
                        l = []
class stack_model(BaseEstimator):

    def __init__(self, cols1=None, cols2=None):
        self.rf_stack = RandomForestRegressor(n_estimators=100,
                                              max_features=None,
                                              max_depth=5,
                                              min_impurity_decrease=0.0,
                                              min_samples_split=10,
                                              min_samples_leaf=10,
                                              bootstrap=True,
                                              random_state=42)
        self.lm_stack = Lasso(alpha=0.001, normalize=True, max_iter=1000,
                              random_state=42)
        self.cols1 = cols1
        self.cols2 = cols2
        self.lm_models = {}

    def get_params(self, deep=True):
        return {'cols1': self.cols1, 'cols2': self.cols2}

    def set_params(self, **parameters):
        self.cols1 = parameters['cols1']
        self.cols2 = parameters['cols2']
        return self

    def fit(self, df, y):
        cols1 = list(df.columns) if self.cols1 is None else self.cols1
        cols2 = list(df.columns) if self.cols2 is None else self.cols2
        self.rf_stack.fit(df[cols1], y)
        leaf = self.rf_stack.apply(df[cols1])
        # one lm model for every rf estimator and leaf
        for f_idx in range(leaf.shape[1]):
            for leaf_num, idxs in pd.DataFrame(leaf[:, f_idx]).reset_index().groupby(0):
                idxs = idxs['index'].values
                df_leaf = df[cols2].iloc[idxs].copy()
                y_leaf = y.iloc[idxs].copy()
                lm_model = clone(self.lm_stack)
                lm_model.fit(df_leaf, y_leaf)
                self.lm_models[(f_idx, leaf_num)] = lm_model
        return self

    def predict(self, df):
        cols1 = list(df.columns) if self.cols1 is None else self.cols1
        cols2 = list(df.columns) if self.cols2 is None else self.cols2
        leaf = self.rf_stack.apply(df[cols1])
        stack_preds = np.zeros_like(leaf, dtype=float)
        # predict using lm models for every rf estimator and leaf
        for f_idx in range(leaf.shape[1]):
            for leaf_num, idxs in pd.DataFrame(leaf[:, f_idx]).reset_index().groupby(0):
                idxs = idxs['index'].values
                df_leaf = df[cols2].iloc[idxs].copy()
                lm_model = self.lm_models[(f_idx, leaf_num)]
                leaf_pred = lm_model.predict(df_leaf)
                stack_preds[idxs, f_idx] = leaf_pred
        y_pred = stack_preds.mean(axis=1)
        return y_pred

# check_estimator(stack_model)
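# A minimal usage sketch for stack_model (hypothetical data; assumes the same
# scikit-learn imports the class relies on: BaseEstimator, clone, Lasso,
# RandomForestRegressor). Note that Lasso's `normalize` keyword was removed
# in recent scikit-learn releases, so a current install may need a
# StandardScaler pipeline in place of normalize=True.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_demo = pd.DataFrame(rng.randn(500, 4), columns=list("abcd"))
y_demo = pd.Series(2 * df_demo["a"] + df_demo["b"] + 0.1 * rng.randn(500))

model = stack_model(cols1=["a", "b"], cols2=["c", "d"])
model.fit(df_demo, y_demo)      # fits one Lasso per (tree, leaf) pair
y_hat = model.predict(df_demo)  # averages the per-tree leaf-model predictions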
def _data_clusterings(self, data_z, data_p, data_y):
    '''Returns the centers and precisions of an epsilon cover of gaussians.

    Currently the centers are just an epsilon grid and the precisions are
    1/(3*epsilon), i.e. the standard deviation is 3 times the distance
    between two grid points. Later, this is exactly the function that will
    implement the tree-style splitting and return an epsilon cover that is
    more tailored to the data.'''
    if self._cluster_type == 'forest':
        from sklearn.ensemble import RandomForestRegressor
        dtree = RandomForestRegressor(n_estimators=self._num_trees,
                                      max_leaf_nodes=self._n_critics,
                                      min_samples_leaf=self._min_cluster_size)
        dtree.fit(data_z, data_p)
        cluster_labels = dtree.apply(data_z)
        #dtree.fit(data_z, data_y)
        #cluster_labels = np.concatenate((cluster_labels, dtree.apply(data_z)), axis=1)
        cluster_ids = [np.unique(cluster_labels[:, c])
                       for c in range(cluster_labels.shape[1])]
    elif self._cluster_type == 'kmeans':
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=self._n_critics).fit(data_z)
        cluster_labels = kmeans.labels_.reshape(-1, 1)
        cluster_ids = [np.unique(cluster_labels)]
    elif self._cluster_type == 'random_points':
        center_ids = np.random.choice(np.arange(data_z.shape[0]),
                                      size=self._n_critics, replace=False)
        cluster_labels = np.zeros((data_z.shape[0], self._n_critics))
        cluster_ids = np.ones((self._n_critics, 1))
        for it, center in enumerate(center_ids):
            distances = np.linalg.norm(data_z - data_z[center], axis=1)
            cluster_members = np.argsort(distances)[:self._min_cluster_size]
            cluster_labels[cluster_members, it] = 1
    else:
        raise Exception("Unknown option {}".format(self._cluster_type))
    #z_min = np.percentile(data_z, 0) - self._epsilon
    #z_max = np.percentile(data_z, 100) + self._epsilon
    #center_grid = np.arange(z_min, z_max, self._epsilon)
    #precision_grid = np.ones(center_grid.shape[0]) / (3 * self._epsilon)
    return cluster_labels, cluster_ids
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_ytest.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    # Ynz, Xnz = trim_value(Y, X, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=42)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    n_estimators = [1, 5, 10, 50, 100]
    samples_per_leaf = range(100, 1600, 100)
    start_all = time.time()
    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:
            print()
            print("Analysis: RF with Poisson Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)
            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=False)
            ensemble.fit(xtrain, ytrain)
            leaves = ensemble.apply(xtrain)
            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
            pack = fitting_four_models_leaf_nodes(dicori)
            pred_rf = ensemble.predict(xtest)
            pred = predicting_four_models_leaf_nodes(spl, n_esti, ytest, xtest,
                                                     pack, pred_rf).T
            stack = np.hstack((meta_m_test, pred))
            dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
def test_drf_regressor_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestRegressor

    # Run h2o4gpu version of RandomForest Regression
    drf = Solver(backend=backend, random_state=1234, oob_score=True)
    print("h2o4gpu fit()")
    drf.fit(X, y)

    # Run Sklearn version of RandomForest Regression
    from sklearn.ensemble import RandomForestRegressor
    drf_sk = RandomForestRegressor(random_state=1234, oob_score=True, max_depth=3)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        assert (drf.predict(X) == drf_sk.predict(X)).all()
        assert drf.score(X, y) == drf_sk.score(X, y)
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]).all()
        assert (drf.apply(X) == drf_sk.apply(X)).all()

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_).all()

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_

        print("oob_prediction")
        print(drf.oob_prediction_)
        print(drf_sk.oob_prediction_)
        assert (drf.oob_prediction_ == drf_sk.oob_prediction_).all()
def prox_matrix(df, y, features, cluster_dimension, trees=10):
    # https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#prox

    # initialize dataframe for independent variables
    independant = pd.DataFrame()

    # Handle categoricals: this should really be added to RandomForestRegressor
    for column, data_type in df[features].dtypes.iteritems():
        try:
            independant[column] = pd.to_numeric(df[column], downcast='integer')
        except ValueError:
            contains_nulls = df[column].isnull().values.any()
            dummies = pd.get_dummies(df[column], prefix=column,
                                     dummy_na=contains_nulls, drop_first=True)
            independant[dummies.columns] = dummies
    if len(independant.index) != len(df.index):
        raise Exception('independent variables not stored properly')

    # train model
    clf = RandomForestRegressor(n_estimators=trees, n_jobs=-1)
    clf.fit(independant, y)

    # final leaf for each tree
    leaves = clf.apply(independant)
    # value in cluster dimension
    labels = df[cluster_dimension].values

    numerator_matrix = {}
    for i, value_i in enumerate(labels):
        for j, value_j in enumerate(labels):
            if i >= j:
                numerator_matrix[(value_i, value_j)] = numerator_matrix.get(
                    (value_i, value_j), 0) + np.count_nonzero(leaves[i] == leaves[j])
                numerator_matrix[(value_j, value_i)] = numerator_matrix[(value_i, value_j)]

    # normalize by the total number of possible matching leaves
    prox_matrix = {
        key: 1.0 - float(x) / (trees * np.count_nonzero(labels == key[0]) *
                               np.count_nonzero(labels == key[1]))
        for key, x in numerator_matrix.items()
    }

    # make sorted dataframe
    levels = np.unique(labels)
    D = pd.DataFrame(data=[[prox_matrix[(i, j)] for i in levels] for j in levels],
                     index=levels, columns=levels)
    return D
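# A minimal usage sketch for prox_matrix (hypothetical data). Each entry
# D[a, b] is a Breiman-style dissimilarity: 1 minus the fraction of
# (sample-pair, tree) combinations in which members of groups a and b share
# a leaf. Note that `.iteritems()` above was removed in pandas >= 2.0 and
# would need to become `.items()` on a current install.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_demo = pd.DataFrame({
    "x1": rng.randn(60),
    "x2": rng.choice(["red", "green", "blue"], size=60),  # categorical, gets dummy-coded
    "group": rng.choice(["A", "B", "C"], size=60),        # cluster dimension
})
y_demo = rng.randn(60)
D_demo = prox_matrix(df_demo, y_demo, features=["x1", "x2"],
                     cluster_dimension="group", trees=5)
print(D_demo)  # symmetric dissimilarity matrix indexed by group labels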
def GetBinFeatures(stage, train_imgs, train_shapes, train_bboxes, mean_rshape, targets):
    bin_features = []
    forests = []
    random_poses = []
    for ilandmark in range(param_landmark_num):
        t1 = time.time()

        #### get locations
        feature_pair_pos = np.zeros((param_local_feature_num[stage] * 2, 2))
        for i in range(param_local_feature_num[stage]):
            while True:
                pair = np.random.rand(4) * 2 - 1
                x1, y1, x2, y2 = pair
                if x1 * x1 + y1 * y1 < 1 and x2 * x2 + y2 * y2 < 1 and (x1, y1) != (x2, y2):
                    break
            feature_pair_pos[2 * i:2 * i + 2] = (pair * param_local_radius[stage]).reshape((2, 2))
        random_poses.append(feature_pair_pos)

        #### get pixel difference
        features = np.zeros((len(train_shapes), param_local_feature_num[stage]))
        for i in range(len(train_shapes)):
            #origin_img = cv2.imread(train_imgs[i], 0).astype(np.float)
            origin_img = train_imgs[i]
            # transform from mean space to current training space
            sim_trans = transform.estimate_transform(
                'similarity', CenterShape(mean_rshape),
                CenterShape(Shape2Relative(train_shapes[i], train_bboxes[i])))
            #trans_feature_pair_pos = Shape2Absolute(sim_trans(feature_pair_pos), train_bboxes[i]) + train_shapes[i][ilandmark]
            trans_feature_pair_pos = GetLocalFeatureAbsolutePos(
                sim_trans(feature_pair_pos), train_bboxes[i],
                train_shapes[i][ilandmark]).astype(int)
            for j in range(param_local_feature_num[stage]):
                x1, y1 = trans_feature_pair_pos[2 * j]
                x2, y2 = trans_feature_pair_pos[2 * j + 1]
                # in case out of boundary
                x1 = max(0, min(origin_img.shape[1] - 1, x1))
                x2 = max(0, min(origin_img.shape[1] - 1, x2))
                y1 = max(0, min(origin_img.shape[0] - 1, y1))
                y2 = max(0, min(origin_img.shape[0] - 1, y2))
                features[i, j] = origin_img[y1, x1] - origin_img[y2, x2]
            #del origin_img
            #gc.collect()

        #### train random forest
        forest = RandomForestRegressor(max_depth=param_tree_depth,
                                       n_estimators=param_tree_num, n_jobs=8)
        forest.fit(features, targets[:, ilandmark])
        forests.append(forest)

        #### extract binary features for every training sample
        leaves, leaves_num = GetLeaves(forest)
        reach_nodes = forest.apply(features)
        landmark_bin_features = np.zeros((len(train_shapes), leaves_num))
        for i in range(len(train_shapes)):
            begin_leaf_ind = 0
            for j in range(len(leaves)):
                node = reach_nodes[i, j]
                landmark_bin_features[i][begin_leaf_ind + leaves[j][node]] = 1
                begin_leaf_ind += len(leaves[j])
        bin_features.append(landmark_bin_features)
        print('landmark:', ilandmark + 1, 'use:', time.time() - t1, 's')
    return np.hstack(bin_features), forests, random_poses
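# The binary leaf encoding above (one indicator per leaf, concatenated over
# trees) is equivalent to one-hot encoding the output of forest.apply(). A
# minimal self-contained sketch of that idiom, assuming only numpy and
# scikit-learn (all names here are hypothetical):
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 5)
y_demo = X_demo[:, 0] + 0.1 * rng.randn(200)

forest_demo = RandomForestRegressor(n_estimators=10, max_depth=4,
                                    random_state=0).fit(X_demo, y_demo)
leaf_ids = forest_demo.apply(X_demo)     # (n_samples, n_trees) leaf indices
encoder = OneHotEncoder().fit(leaf_ids)  # one column per (tree, leaf) pair
bin_feats = encoder.transform(leaf_ids)  # sparse (n_samples, total_leaves)
print(bin_feats.shape)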
def predict_models(meta_m, m, meta_p, p):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_nl_four_models.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    n_esti = 5
    spl = 800
    # Ynz, Xnz = trim_value(Y, X, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    ensemble = RandomForestRegressor(n_estimators=n_esti,
                                     min_samples_leaf=spl,
                                     bootstrap=False)
    ensemble.fit(xtrain, ytrain)
    leaves = ensemble.apply(xtrain)
    dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
    pack = fitting_four_models_leaf_nodes(dicori)
    print("Weirdos?")
    print(np.isnan(p).any(), np.isinf(p).any(), np.isneginf(p).any())
    # pred = predicting_four_models_leaf_nodes(spl, n_esti, ytest, xtest, pack, pred_rf).T
    print("Prediction with the four models")
    pred = predicting_four_models_leaf_nodes_nl(p, pack)
    print("Predicting with random forest")
    pred_rf = ensemble.predict(p).reshape(-1, 1)
    print(meta_p.shape, pred.T.shape, pred_rf.shape)
    stack = np.hstack((meta_p, pred.T, pred_rf))
    print(stack.shape, meta_p.shape, pred.shape)
    # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
    with open(path_out, "w", newline="") as w:
        writer = csv.writer(w, delimiter=";")
        for item in stack:
            writer.writerow(item)
    placed_list = place(stack)
    write_tif(placed_list, spl, n_esti)
def predict_models(meta_m, m, meta_p, p):
    print("Type m: ", m.dtype)
    path_out = r"D:/UTwente/PycharmProjects/04_Risk_Model/data/skewed_leaves/prediction_nl_four_models_v3_20T_200S.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    n_esti = 20
    spl = 200
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    ensemble = RandomForestRegressor(n_estimators=n_esti,
                                     min_samples_leaf=spl,
                                     bootstrap=False)
    ensemble.fit(xtrain, ytrain)
    leaves = ensemble.apply(xtrain)
    dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
    pack = fitting_four_models_leaf_nodes(dicori)
    print("Predicting with random forest")
    pred_rf = ensemble.predict(p).reshape(-1, 1)
    print("Prediction with the four models")
    pred_sk = predicting_four_models_leaf_nodes_NL(ensemble, meta_p, p, pack).T
    print(meta_p.shape, pred_sk.shape, pred_rf.shape)
    stack = np.hstack((meta_p, pred_sk, pred_rf))
    print("Stacked predictions: ", stack.shape, meta_p.shape)
    # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
    # print(meta_m_test.shape, ytest.shape)
    # stack = np.hstack((meta_m, Y.reshape(-1, 1)))
    with open(path_out, "w", newline="") as w:
        writer = csv.writer(w, delimiter=";")
        for item in stack:
            writer.writerow(item)
def _get_fitted_model(self, X, y):
    model = RandomForestRegressor(
        criterion=self.criterion,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        bootstrap=self.bootstrap,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        random_state=self.random_state,
        verbose=self.verbose,
        warm_start=self.warm_start,
        ccp_alpha=self.ccp_alpha,
        max_samples=self.max_samples)
    self.model_ = model.fit(X, y)
    self.train_leaf_indices_ = model.apply(X)
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    Y = m[:, 0]
    X = m[:, 1:]
    # Ynz, Xnz = trim_value(Y, X, 0)
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60,
                                                    random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    for samples_per_leaf in range(500, 600, 100):
        print("Samples per leaf node: ", samples_per_leaf)
        ensemble = RandomForestRegressor(n_estimators=1,
                                         min_samples_leaf=samples_per_leaf,
                                         bootstrap=False)
        ensemble.fit(xtrain, ytrain)
        leaves = ensemble.apply(xtrain)
        print(leaves)
        print(leaves.shape)
        dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
        pack = fitting_four_models_leaf_nodes(dicori)
        pred_ = predicting_four_models_leaf_nodes(xtest, pack)
        # dicens = ensemble_predictions_leaf_nodes(dicori)
        break
def predict_one(self, X0):
    '''X0 must be an array of shape 1 x n_features.'''
    assert X0.shape == (1, self.n_features), \
        "The shape of X0 should be 1 x n_features"
    predict_one_leaf_indices = RandomForestRegressor.apply(self, X0)  # 1 x n_estimators
    leaf_equal_bool = np.equal(self.leaf_indices,
                               predict_one_leaf_indices)  # n_samples x n_estimators
    leaf_count = np.sum(leaf_equal_bool, axis=0).reshape(1, -1)  # 1 x n_estimators
    alpha_weights = 1 / self.B * np.sum(
        leaf_equal_bool.astype(float) / leaf_count, axis=1)  # n_samples
    assert abs(np.sum(alpha_weights) - 1) < 0.01, \
        "alpha weights calculation is wrong"
    # A diagonal matrix of the forest weights
    A = np.diag(alpha_weights)  # n_samples x n_samples
    # J zeroes out the intercept entry so only theta is penalized; J_1d must
    # be 1-D here, otherwise np.diag would extract a diagonal instead of
    # building the (n_features + 1) x (n_features + 1) matrix
    J_1d = np.ones(self.n_features + 1)
    J_1d[0] = 0
    J = np.diag(J_1d)  # (n_features + 1) x (n_features + 1)
    delta_m = np.ones((self.n_samples, self.n_features + 1))  # n_samples x (n_features + 1)
    delta_m[:, 1:] = self.train_x - X0
    local_mu_theta = np.linalg.inv(delta_m.T @ A @ delta_m + self.lam * J) \
        @ delta_m.T @ A @ self.train_y
    mu = local_mu_theta[0]
    theta = local_mu_theta[1:]
    return mu, theta
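# The closed form above is a weighted ridge regression centered at X0:
# (mu, theta) = argmin sum_i alpha_i * (y_i - mu - theta @ (x_i - X0))^2
#               + lam * ||theta||^2, with the intercept mu unpenalized
# (hence the zeroed first diagonal entry of J). A minimal numpy check of
# that closed form on toy data (all names hypothetical):
import numpy as np

rng = np.random.RandomState(0)
n, d, lam = 50, 3, 0.5
Xc = rng.randn(n, d)                    # features already centered at the query point
y = Xc @ np.array([1.0, -2.0, 0.5]) + 0.3 + 0.01 * rng.randn(n)
alpha = rng.dirichlet(np.ones(n))       # nonnegative weights summing to 1

D = np.hstack([np.ones((n, 1)), Xc])    # design matrix [1, x - x0]
A = np.diag(alpha)
J = np.diag(np.r_[0.0, np.ones(d)])     # do not penalize the intercept
beta = np.linalg.solve(D.T @ A @ D + lam * J, D.T @ A @ y)
mu, theta = beta[0], beta[1:]
print(mu, theta)                        # local intercept and slope at the query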
X1, X2 = np.meshgrid(x1, x2)
R1 = X1 - X2
R2 = X1 + X2
Z = 20 * np.maximum.reduce([np.exp(-2 * R1 ** 2),
                            np.exp(-1 * R2 ** 2),
                            2 * np.exp(-0.5 * (X1 ** 2 + X2 ** 2))])

fig, axes = plt.subplots(ncols=4, figsize=(18, 6))
for ax in axes.flat:
    ax.set_aspect('equal', 'box')
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)

rf_kernel = 1 - pairwise_distances(forest.apply([[-1.5, 1.5]]),
                                   forest.apply(X), metric='hamming')
rf_kernel = rf_kernel.ravel() / rf_kernel.ravel().sum()
axes[0].imshow(Z, extent=[-3, 3, -3, 3], origin='lower',
               cmap='YlGnBu_r', alpha=0.5)
axes[0].contour(X1, X2, Z, levels=n_contours, linewidths=0.5,
                colors='k', linestyles='--')
axes[0].scatter(X[:, 0], X[:, 1], edgecolor='k', color='white',
                sizes=50 * np.sqrt(rf_kernel))
axes[0].scatter(-1.5, 1.5, color='tomato', edgecolor='black', marker='P', s=50)
axes[0].set_title("Random Forest", fontsize=fontsize)

rf_kernel = 1 - pairwise_distances(forest.apply([[0.5, -0.5]]),
                                   forest.apply(X), metric='hamming')
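# The quantity above is the random-forest kernel: with Hamming distance over
# leaf indices, 1 - d_hamming(apply(a), apply(b)) is the fraction of trees in
# which a and b fall into the same leaf. A minimal self-contained sketch
# (scikit-learn only; names are hypothetical):
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X_demo = rng.uniform(-3, 3, size=(300, 2))
y_demo = np.sin(X_demo[:, 0]) + np.cos(X_demo[:, 1])
forest_demo = RandomForestRegressor(n_estimators=100, min_samples_leaf=5,
                                    random_state=0).fit(X_demo, y_demo)

query = np.array([[-1.5, 1.5]])
similarity = 1 - pairwise_distances(forest_demo.apply(query),
                                    forest_demo.apply(X_demo),
                                    metric='hamming')  # (1, 300), values in [0, 1]
weights = similarity.ravel() / similarity.sum()        # normalized kernel weights
print(weights @ y_demo)  # kernel-weighted prediction at the query point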
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    Y = m[:, 0]
    X = m[:, 1:]
    Ynz, Xnz, meta_m_nz = trim_value(Y, X, meta_m, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    # n_estimators = [1, 5, 10, 50, 100]
    # samples_per_leaf = range(100, 1600, 100)
    n_estimators = [10]
    samples_per_leaf = [200, 400, 600, 1000, 1200]
    start_all = time.time()
    fig, ax = plt.subplots(nrows=5, ncols=5)
    nrow = 0
    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:
            print()
            print("Analysis: RF with Skewed Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)
            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=True)
            ensemble.fit(xtrain, ytrain)
            leaves = ensemble.apply(xtrain)
            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
            pack = fitting_four_models_leaf_nodes(dicori)
            pred_rf = ensemble.predict(xtest)
            pred_sk = testing_four_models_leaf_nodes_v2(
                ensemble, spl, n_esti, ytest, xtest, pack, pred_rf,
                meta_m_test).T
            print("This is pred sk: ", pred_sk.shape)
            # stack = np.hstack((meta_m_test, pred))
            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
            write_proportion_of_zeros(spl, n_esti)
            # plt.subplot(2, 2, 1)
            # plt.hist(pred_sk[:, 0], bins=20)
            # plt.subplot(2, 2, 2)
            # plt.hist(pred_sk[:, 1], bins=20)
            # plt.subplot(2, 2, 3)
            # plt.hist(pred_sk[:, 2], bins=20)
            # plt.subplot(2, 2, 4)
            # plt.hist(pred_sk[:, 3], bins=20)
            # plt.show()
            plot_compare_histograms(ax, nrow, spl, ytest, pred_sk, pred_rf)
            nrow += 1
        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))
    plt.show()
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
estimators = regr.estimators_            # description of each tree
importance = regr.feature_importances_   # array of the fractional importance of each feature
num_features = regr.n_features_          # the number of features
num_outputs = regr.n_outputs_            # the number of outputs when the model is built
#oob_score = regr.oob_score_             # score the training dataset using an out-of-bag
#                                        # estimator; this computes the average of correct
#                                        # classifications, basically the coefficient of
#                                        # determination R**2 using 'unseen' data not used
#                                        # to build the model
#oob_predict = regr.oob_prediction_      # the prediction for the values of the training
#                                        # dataset using the oob method

# now having a look at the methods
leaf_indices = regr.apply(x_test)        # get the ids of all the leaves the test dataset ends up in
decision_path = regr.decision_path(x_test)
parameters = regr.get_params()           # the parameters of the model
predicted_age_array = regr.predict(x_test)  # running the test dataset through the model,
                                            # giving an array of predicted values
r_2_train = regr.score(x_train, y_train)    # the R squared of the train dataset
r_2_test = regr.score(x_test, y_test)       # the R squared of the test dataset
class PDFRandomForestRegressor(BaseEstimator, RegressorMixin):
    """A normal random forest, except that it stores the final leaf positions
    and delay times for each row of the training set. It also has a
    specialized scoring method."""

    def __init__(self, delaymin, delaymax, **kwargs):
        self.rforest = RandomForestRegressor(**kwargs)
        self.delay_min = delaymin
        self.delay_max = delaymax
        self.delay_bin_indices = np.arange(self.delay_max - self.delay_min + 1)
        self.delay_bin_values = np.arange(self.delay_min, self.delay_max + 1)
        # For each tree in the forest, a dictionary mapping node id numbers
        # to numpy arrays is also stored. These numpy arrays contain a
        # histogram of the training samples which fell into that node and
        # their delay times.
        self.node_delay_pdfs = [{}] * self.rforest.n_estimators

    def fit(self, X, y, compute_pdf=False):
        y_fit = self.restrict_range(y)
        self.rforest.fit(X, y_fit)
        if compute_pdf:
            # Get the node ids for the training set:
            self.set_node_pdfs(X, y_fit)
        return self

    def set_node_pdfs(self, X, y):
        y_fit = self.restrict_range(y)
        # Map the y values onto indices for the arrays:
        y_indices = self.map_y_vals(y_fit)
        nodes = self.apply(X)
        # For each tree, make a 2D array containing the full range of integer
        # target values along the first axis and the unique nodes along the
        # other. When the regression predicts a set of nodes for a given set
        # of inputs, the full delay time distribution can then be extracted
        # by taking a slice along the unique node axis.
        for i in range(nodes.shape[1]):
            unique_nodes, idxes = np.unique(nodes[:, i], return_inverse=True)
            unique_node_indices = np.arange(len(unique_nodes) + 1)
            node_dict = {unique_nodes[i]: unique_node_indices[i]
                         for i in range(len(unique_node_indices) - 1)}
            node_indices = unique_node_indices[idxes]
            pdf_arr, xedges, yedges = np.histogram2d(
                y_fit, node_indices,
                bins=[self.delay_bin_values, unique_node_indices])
            self.node_delay_pdfs[i] = {'node_dict': node_dict,
                                       'pdf_arr': pdf_arr}

    def restrict_range(self, y):
        y_restrict = y.copy()
        y_restrict[y < self.delay_min] = self.delay_min
        y_restrict[y > self.delay_max - 1] = self.delay_max - 1
        return y_restrict

    def map_y_vals(self, y):
        y_map = self.restrict_range(y)
        y_indices = y_map - self.delay_min
        return y_indices

    def predict(self, X):
        return self.rforest.predict(X)

    # Instead of the normal prediction, which gives the average value of
    # everything in the leaf node, predict a set of quantiles:
    def predict_percentiles(self, X, percentiles):
        p_nodes = self.apply(X)
        pdf_arr = self.get_node_pdfs(p_nodes)
        cdf_arr = np.cumsum(pdf_arr, axis=1)
        cdf_arr_frac = (cdf_arr.T / cdf_arr[:, -1].astype(float)).T
        percentile_yvals = np.zeros((cdf_arr_frac.shape[0], len(percentiles)),
                                    dtype=int)
        for i, ptile in enumerate(percentiles):
            # These steps ensure that the y value is taken as the first index
            # where the cdf goes above the percentile.
            temp_cdf_arr_frac = cdf_arr_frac.copy()
            temp_cdf_arr_frac[temp_cdf_arr_frac < ptile / 100.] = 1000
            indices = np.argmin(temp_cdf_arr_frac - ptile / 100., axis=1)
            percentile_yvals[:, i] = self.delay_bin_values[indices]
        return percentile_yvals

    def compute_percentiles(self, X, y):
        y_fit = self.restrict_range(y)
        y_indices = self.map_y_vals(y_fit).astype(int)
        p_nodes = self.apply(X)
        pdf_arr = self.get_node_pdfs(p_nodes)
        cdf_arr = np.cumsum(pdf_arr, axis=1)
        cdf_arr_frac = (cdf_arr.T / cdf_arr[:, -1].astype(float)).T
        # Now just compute the percentiles for all the y_indices:
        cdf_at_y = cdf_arr_frac[np.arange(len(y_indices)), y_indices]
        return cdf_at_y

    def get_node_pdfs(self, nodes):
        pdf_arr = np.zeros((nodes.shape[0], len(self.delay_bin_values) - 1),
                           dtype=int)
        for i, node_info in enumerate(self.node_delay_pdfs):
            node_ids = [node_info['node_dict'][node] for node in nodes[:, i]]
            temp_arr = np.array(
                [node_info['pdf_arr'][:, node_id] for node_id in node_ids],
                dtype=pdf_arr.dtype)
            pdf_arr += temp_arr
        return pdf_arr

    def apply(self, X):
        return self.rforest.apply(X)

    def score(self, X, y):
        return self.rforest.score(X, y)

    # Compute how good each predicted value is based on how far away it is
    # from the median value in percentiles:
    def score_percentiles(self, X, y):
        # First, compute the medians:
        y_med = self.predict_percentiles(X, [50]).ravel()
        percentiles = self.compute_percentiles(X, y)
        # This step takes into account the discrete nature of the y values.
        med_percentiles = self.compute_percentiles(X, y_med)
        return 1. - np.sum((med_percentiles - percentiles) ** 2) / float(len(y))
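# A minimal usage sketch for PDFRandomForestRegressor (hypothetical data;
# assumes the scikit-learn imports the class relies on are available):
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(1000, 3)
y_demo = np.round(10 + 3 * X_demo[:, 0] + rng.randn(1000)).astype(float)

pdf_rf = PDFRandomForestRegressor(delaymin=0, delaymax=30,
                                  n_estimators=50, random_state=0)
pdf_rf.fit(X_demo, y_demo, compute_pdf=True)  # also builds per-leaf histograms
quartiles = pdf_rf.predict_percentiles(X_demo[:5], [25, 50, 75])
print(quartiles)  # per-sample delay quantiles pooled over all trees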
def predict_models(meta_m, m, meta_p, p):
    path_out_tmp = r"D:/PycharmProjects/IGM_PhD_Materials/data/P04/out/pred_csv/prediction_nl_four_models_{0}T_{1}S.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    n_estimators = [10, 20, 50]
    samples_per_leaf = range(100, 900, 100)
    # The complete experiment for this paper corresponds to:
    #   n_estimators = [10, 20, 50]
    #   samples_per_leaf = range(100, 900, 100)
    #
    # NOTE THAT:
    # + The complete execution takes ~40h on an Intel i7-8700 CPU @ 3.20GHz
    #   (6 cores, 12 logical processors) w/ 16GB of RAM
    # + The paper only shows results for SPL = [100, 200, 400, 600, 800]
    #   due to space constraints
    start_all = time.time()
    for n_esti in n_estimators:
        start_it = time.time()
        for spl in samples_per_leaf:
            path_out = path_out_tmp.format(n_esti, spl)
            print("\nTraining ensemble: ({0} T, {1} SPL)".format(n_esti, spl))
            xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
                X, Y, meta_m, train_size=0.60, random_state=0)
            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=False)
            ensemble.fit(xtrain, ytrain)
            leaves = ensemble.apply(xtrain)
            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
            pack = fitting_four_models_leaf_nodes(dicori)
            print("\tPredicting with random forest")
            pred_rf = ensemble.predict(p).reshape(-1, 1)
            print("\tPrediction with the four models")
            pred_sk = predicting_four_models_leaf_nodes_NL(
                ensemble, meta_p, p, pack, n_esti).T
            stack = np.hstack((meta_p, pred_sk, pred_rf))
            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
            print("\tWriting results in CSV file")
            with open(path_out, "w", newline="") as w:
                writer = csv.writer(w, delimiter=";")
                for item in stack:
                    writer.writerow(item)
        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))
        print()
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
def get_data_clustering(data_z, data_p, n_instruments, n_critics=50,
                        cluster_type="kmeans", num_trees=5,
                        min_cluster_size=50, critic_type="Gaussian"):
    """Return the centers, precisions, and normalizers of a data cover."""
    if cluster_type == "forest":
        from sklearn.ensemble import RandomForestRegressor
        dtree = RandomForestRegressor(n_estimators=num_trees,
                                      max_leaf_nodes=n_critics,
                                      min_samples_leaf=min_cluster_size)
        dtree.fit(data_z, data_p)
        cluster_labels = dtree.apply(data_z)
        cluster_ids = [np.unique(cluster_labels[:, c])
                       for c in range(cluster_labels.shape[1])]
    elif cluster_type == "kmeans":
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=n_critics).fit(data_z)
        cluster_labels = kmeans.labels_.reshape(-1, 1)
        cluster_ids = [np.unique(cluster_labels)]
    elif cluster_type == "random_points":
        center_ids = np.random.choice(np.arange(data_z.shape[0]),
                                      size=n_critics, replace=False)
        cluster_labels = np.zeros((data_z.shape[0], n_critics))
        cluster_ids = np.ones((n_critics, 1))
        for it, center in enumerate(center_ids):
            distances = np.linalg.norm(data_z - data_z[center], axis=1)
            cluster_members = np.argsort(distances)[:min_cluster_size]
            cluster_labels[cluster_members, it] = 1
    else:
        raise Exception("Unknown option {}".format(cluster_type))

    if critic_type == "Gaussian":
        # We put a symmetric gaussian encompassing all the data points of
        # each cluster of each clustering
        center_grid = []
        precision_grid = []
        normalizers = []
        data_z = np.array(data_z)
        for tree in range(cluster_labels.shape[1]):
            for leaf in cluster_ids[tree]:
                center = np.mean(
                    data_z[cluster_labels[:, tree].flatten() == leaf, :],
                    axis=0)
                distance = np.linalg.norm(data_z - center, axis=1) / data_z.shape[1]
                precision = 1. / (np.sqrt(2) * (np.sort(distance)[min_cluster_size]))
                normalizer = (precision ** n_instruments) * np.sum(
                    np.exp(-(precision * distance) ** 2)) / (np.power(
                        2. * np.pi, n_instruments / 2.))
                normalizers.append(normalizer)
                center_grid.append(center)
                precision_grid.append(precision)
        # The proposed normalizing constant results in too small function
        # values, which result in too small losses and a respective lack of
        # scaling when using the exp function for the weights update.
        # The code is kept for future fixes but overwritten with the
        # following command.
        # TODO: Explore normalizers and normalization of f(x)
        normalizers = np.ones(len(center_grid), dtype="float32")
        normalizers = np.array(normalizers, dtype="float32")
        center_grid = np.array(center_grid, dtype="float32")
        precision_grid = np.array(precision_grid, dtype="float32")
        normalizers = tf.constant(normalizers, name="normalizers")
        center_grid = tf.constant(center_grid, name="centers")
        precision_grid = tf.constant(precision_grid, name="precisions")
    else:
        raise NotImplementedError("Uniform functions not supported.")
    return normalizers, precision_grid, center_grid
class QuantileRandomForestRegressor:
    """A quantile random forest regressor based on the scikit-learn
    RandomForestRegressor.

    A wrapper around the RandomForestRegressor which summarizes based on
    quantiles rather than the mean. Note that quantile predictions take much
    longer than mean predictions.

    Parameters
    ----------
    nthreads : int, default=1
        number of threads to use
    rf_kwargs : array or array like
        kwargs to be passed to the RandomForestRegressor

    See Also
    --------
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html?highlight=randomforestregressor#sklearn.ensemble.RandomForestRegressor.apply
    """

    def __init__(self, nthreads=1, **rf_kwargs):
        rf_kwargs['n_jobs'] = nthreads
        self.forest = RandomForestRegressor(**rf_kwargs)
        set_num_threads(nthreads)

    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, its dtype will be
            converted to ``dtype=np.float32``. If a sparse matrix is
            provided, it will be converted into a sparse ``csc_matrix``.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers
            in regression).
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Splits that would create child nodes with net zero or negative
            weight are ignored while searching for a split in each node. In
            the case of classification, splits are also ignored if they
            would result in any single class carrying a negative weight in
            either child node.

        Returns
        -------
        self : object
        """
        self.forest.fit(X, y, sample_weight)
        self.trainy = y.copy()
        self.trainX = X.copy()

    def predict(self, X, qntl):
        """Predict regression target for X.

        The predicted regression target of an input sample is computed as a
        quantile of the predicted regression targets of the trees in the
        forest. Note: not possible for multioutput regression.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        qntl : {array-like} of shape (n_quantiles)
            Quantile or sequence of quantiles to compute, which must be
            between 0 and 1 inclusive. Passed to numpy.quantile.

        Returns
        -------
        y : ndarray of shape (n_samples, n_quantiles)
            The predicted values.
        """
        if len(self.trainy.shape) > 1:
            raise RuntimeError("Quantile prediction is not possible with "
                               "multioutput regression.")
        qntl = np.asanyarray(qntl)
        ntrees = self.forest.n_estimators
        ntrain = self.trainy.shape[0]
        train_tree_node_ID = np.zeros([ntrain, ntrees])
        npred = X.shape[0]
        pred_tree_node_ID = np.zeros([npred, ntrees])
        for i in range(ntrees):
            train_tree_node_ID[:, i] = self.forest.estimators_[i].apply(self.trainX)
            pred_tree_node_ID[:, i] = self.forest.estimators_[i].apply(X)
        ypred_pcts = find_quant(self.trainy, train_tree_node_ID,
                                pred_tree_node_ID, qntl)
        return ypred_pcts

    def predict_sample(self, X, n_draws):
        """Predict regression target for X.

        The predicted regression target of an input sample is computed as a
        random sample of the predicted regression targets of the trees in
        the forest.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        n_draws : int
            number of samples to draw from the predicted regression targets

        Returns
        -------
        y : ndarray of shape (n_samples, n_draws) or
            (n_samples, n_outputs, n_draws)
            The predicted values.
        """
        ntrees = self.forest.n_estimators
        ntrain = self.trainy.shape[0]
        train_tree_node_ID = np.zeros([ntrain, ntrees])
        npred = X.shape[0]
        pred_tree_node_ID = np.zeros([npred, ntrees])
        for i in range(ntrees):
            train_tree_node_ID[:, i] = self.forest.estimators_[i].apply(self.trainX)
            pred_tree_node_ID[:, i] = self.forest.estimators_[i].apply(X)
        ypred_draws = find_sample(self.trainy, train_tree_node_ID,
                                  pred_tree_node_ID, n_draws)
        return ypred_draws

    def apply(self, X):
        """Wrapper for sklearn.ensemble.RandomForestRegressor.apply.

        Apply trees in the forest to X, return leaf indices.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        X_leaves : ndarray of shape (n_samples, n_estimators)
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in.
        """
        return self.forest.apply(X)

    def decision_path(self, X):
        """Wrapper for sklearn.ensemble.RandomForestRegressor.decision_path.

        Return the decision path in the forest.

        .. versionadded:: 0.18

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            Return a node indicator matrix where non zero elements indicate
            that the samples go through the nodes. The matrix is of CSR
            format.
        n_nodes_ptr : ndarray of shape (n_estimators + 1,)
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator value for the i-th estimator.
        """
        return self.forest.decision_path(X)

    def set_params(self, **params):
        """Wrapper for sklearn.ensemble.RandomForestRegressor.set_params.

        Set the parameters of this estimator. The method works on simple
        estimators as well as on nested objects (such as pipelines). The
        latter have parameters of the form ``<component>__<parameter>`` so
        that it's possible to update each component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
            Estimator instance.
        """
        return self.forest.set_params(**params)
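# A minimal usage sketch (hypothetical data; assumes the numba helpers this
# class relies on -- set_num_threads, find_quant, find_sample -- are
# importable in this module, along with numpy as np):
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(500, 4)
y_demo = X_demo[:, 0] + 0.5 * rng.randn(500)

qrf = QuantileRandomForestRegressor(nthreads=2, n_estimators=100, random_state=0)
qrf.fit(X_demo, y_demo)
q_pred = qrf.predict(X_demo[:10], qntl=[0.1, 0.5, 0.9])  # (10, 3) quantiles
draws = qrf.predict_sample(X_demo[:10], n_draws=20)      # (10, 20) sampled targets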
print "Instance", i print "Bias (trainset mean)", bias[i] print "Feature contributions:" for c, feature in sorted(zip(contributions[i], boston.feature_names), key=lambda x: -abs(x[0])): print feature, round(c, 2) print "-" * 20 # In[42]: print prediction print bias + np.sum(contributions, axis=1) # In[43]: # the basic feature importance feature provided by sklearn fit1.feature_importances_ # In[44]: # treeinterpreter uses the apply function to retrieve the leave indicies with the help of which, # the tree path is retrieved rf.apply # In[47]: rf.apply(instances) # In[ ]:
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_ytest.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    # Ynz, Xnz = trim_value(Y, X, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    nrow = 0
    nrow2 = 0
    fig, ax = plt.subplots(nrows=5, ncols=5)
    fig2, ax2 = plt.subplots(nrows=1, ncols=5, figsize=(20, 8))
    tit1 = "Effect of the number of samples per leaf node on the predicted distributions (n_esti=5)"
    tit2 = "Predicted distributions from above in function of the number of samples per leaf node"
    plt.suptitle(tit2, size=20)
    mean_ytest = np.mean(ytest)
    for samples_per_leaf in range(500, 1400, 200):
        print("Samples per leaf node: ", samples_per_leaf)
        ensemble = RandomForestRegressor(n_estimators=1,
                                         min_samples_leaf=samples_per_leaf,
                                         bootstrap=False)
        ensemble.fit(xtrain, ytrain)
        leaves = ensemble.apply(xtrain)
        dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
        pack = fitting_four_models_leaf_nodes(dicori)
        pred = predicting_four_models_leaf_nodes(ytest, xtest, pack).T
        print(pred.shape, meta_m_test.shape)
        stack = np.hstack((meta_m_test, pred))
        print("Shape of stack: ", stack.shape)
        dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
        header = "rowid;longitude;latitude;predpoi;prednb;predzip;predzinb"
        fmts = ["%d", "%d", "%d", "%.4f", "%.4f", "%.4f", "%.4f"]
        # np.savetxt(path_out, stack, delimiter=";", fmt=fmts, header=header)
        pred_rf = ensemble.predict(xtest)
        rmse_rf = np.sqrt(mean_squared_error(ytest, pred_rf))
        print()
        print("RMSE RF: ", rmse_rf)
        # ax = plot_compare_histograms(ax, nrow, samples_per_leaf, mean_ytest, ytest, pred, pred_rf)
        print(pred.shape, pred_rf.reshape(-1, 1).shape)
        allpreds = np.hstack((pred, pred_rf.reshape(-1, 1)))
        labels = ['Poisson', 'NB', 'ZIP', 'ZINB', 'RF-Classic']
        labelsize = 16
        rcParams['xtick.labelsize'] = labelsize
        rcParams['ytick.labelsize'] = labelsize
        ax2[nrow2].set_title("SPL: {0}".format(samples_per_leaf))
        ax2[nrow2].set_facecolor('#F5F5F5')
        box = ax2[nrow2].boxplot(allpreds, patch_artist=True)
        # ax2[nrow2].xaxis.set_ticks(labels)
        ax2[nrow2].set_xticklabels(labels, fontsize=16, fontdict={'fontsize': 16})
        colors = ['#A87128', '#004561', '#3C5B43', '#85243C', '#615048']
        for patch, color in zip(box['boxes'], colors):
            patch.set_facecolor(color)
        ax2[nrow2].yaxis.grid(True, linestyle='-', which='major',
                              color='lightgrey', alpha=0.5)
        nrow += 1
        nrow2 += 1
    plt.show()
        test.append(test_data[i])
        test_y.append(float(y[i]))
    else:
        data.append(test_data[i])
    i += 1

test1_y = np.asarray(test_y, dtype=np.float32)
#test = np.asarray(test, dtype=np.float32)
#test1_y = test1_y.transpose
#print(test_y)
#print(data)

for i1 in range(0, 10):
    forest = RandomForestRegressor(n_estimators=100, max_depth=3)
    #print("--- %s seconds ---" % (time.clock() - start_time))
    forest = forest.fit(test, test1_y)
    out1 = forest.apply(test)
    out = forest.score(test, test1_y)
    print(out)
    print(out1)
    #print("--- %s seconds ---" % (time.clock() - start_time))
    output = forest.predict(data)
    i = 0
    error = 0
    error1 = 0
    while i < len(output):
        if abs(output[i] - y[test_len + i]) > 0.01:
            #print(i)
            #print(y[test_len+i])
            #print(output[i])
            error += abs(output[i] - y[test_len + i])
            error1 += 1
    random_state=0)
print('Type xtrain: ', xtrain.dtype)
print("Raw data: ", Y.shape, X.shape)
print("Trim data: ", Ynz.shape, Xnz.shape)
print("Train data", ytrain.shape, xtrain.shape)
print("Test data: ", ytest.shape, xtest.shape)

ensemble = RandomForestRegressor(n_estimators=100, min_samples_leaf=500,
                                 bootstrap=False)
ensemble.fit(xtrain, ytrain)
leaves = ensemble.apply(xtrain)
dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
dicpoi = poisson_predictions_leaf_nodes(dicori)
dicens = ensemble_predictions_leaf_nodes(dicori)
ypred_poi = np.mean(poisson_predictions_testing(dicori, dicpoi, xtest), axis=1)
ypred_ens = ensemble_predictions_testing(ensemble, xtest)
plot_poisson_ensemble_raw(ypred_poi, ypred_ens, Yavg, ytest)
# rmse = np.sqrt(mean_squared_error(ytest, ypred_poi))
#
          feature[i],
          threshold[i],
          children_right[i],
          ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element
# of the indicator matrix at position (i, j) indicates that sample i goes
# through node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaf ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's do it for a single sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue
    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:
        threshold_sign = ">"
    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id, sample_id, feature[node_id],
             X_test[sample_id, feature[node_id]], threshold_sign,
             threshold[node_id]))
def __init__(self, X_train, MR_train, X_val, MR_val, fe_type="rf", fe=None,
             n_estimators=200, max_features=0.5, min_samples_leaf=10,
             regularization=0.001):
    # Features and the target model response
    self.X_train = X_train
    self.MR_train = MR_train
    self.X_val = X_val
    self.MR_val = MR_val

    # Forest ensemble parameters
    self.n_estimators = n_estimators
    self.max_features = max_features
    self.min_samples_leaf = min_samples_leaf

    # Local linear model parameters
    self.regularization = regularization

    # Data parameters
    num_features = X_train.shape[1]
    self.num_features = num_features
    num_train = X_train.shape[0]
    self.num_train = num_train
    num_val = X_val.shape[0]

    # Fit a forest ensemble to the model response
    if fe is None:
        if fe_type == "rf":
            fe = RandomForestRegressor(n_estimators=n_estimators,
                                       min_samples_leaf=min_samples_leaf,
                                       max_features=max_features)
        elif fe_type == "gbrt":
            fe = GradientBoostingRegressor(n_estimators=n_estimators,
                                           min_samples_leaf=min_samples_leaf,
                                           max_features=max_features,
                                           max_depth=None)
        else:
            print("Unknown FE type ", fe)
            import sys
            sys.exit(0)
        fe.fit(X_train, MR_train)
    else:
        self.n_estimators = n_estimators = len(fe.estimators_)
    self.fe = fe

    train_leaf_ids = fe.apply(X_train)
    self.train_leaf_ids = train_leaf_ids
    val_leaf_ids_list = fe.apply(X_val)

    # Compute the feature importances: non-normalized @ root
    scores = np.zeros(num_features)
    if fe_type == "rf":
        for i in range(n_estimators):
            splits = fe[i].tree_.feature  # -2 indicates leaf, index 0 is root
            if splits[0] != -2:
                # impurity reduction not normalized per tree
                scores[splits[0]] += fe[i].tree_.impurity[0]
    elif fe_type == "gbrt":
        for i in range(n_estimators):
            splits = fe[i, 0].tree_.feature  # -2 indicates leaf, index 0 is root
            if splits[0] != -2:
                # impurity reduction not normalized per tree
                scores[splits[0]] += fe[i, 0].tree_.impurity[0]
    self.feature_scores = scores
    mostImpFeats = np.argsort(-scores)

    # Find the number of features to use for MAPLE
    retain_best = 0
    rmse_best = np.inf
    for retain in range(1, num_features + 1):
        # Drop less important features for local regression
        X_train_p = np.delete(X_train, mostImpFeats[retain:], axis=1)
        X_val_p = np.delete(X_val, mostImpFeats[retain:], axis=1)
        lr_predictions = np.empty([num_val], dtype=float)
        for i in range(num_val):
            weights = self.training_point_weights(val_leaf_ids_list[i])
            # Local linear model
            lr_model = Ridge(alpha=regularization)
            lr_model.fit(X_train_p, MR_train, weights)
            lr_predictions[i] = lr_model.predict(X_val_p[i].reshape(1, -1))
        rmse_curr = np.sqrt(mean_squared_error(lr_predictions, MR_val))
        if rmse_curr < rmse_best:
            rmse_best = rmse_curr
            retain_best = retain
    self.retain = retain_best
    self.X = np.delete(X_train, mostImpFeats[retain_best:], axis=1)
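# training_point_weights is referenced above but not part of this fragment.
# A hedged, standalone sketch of what such a helper plausibly computes (a
# SILO-style weighting: a training point gets weight 1/leaf_size for every
# tree in which it shares a leaf with the query, averaged over trees; the
# function name and normalization here are assumptions, not the fragment's
# actual code):
import numpy as np

def training_point_weights_sketch(train_leaf_ids, instance_leaf_ids):
    """train_leaf_ids: (num_train, n_estimators) from fe.apply(X_train);
    instance_leaf_ids: (n_estimators,) leaf ids of a single query point."""
    num_train, n_estimators = train_leaf_ids.shape
    weights = np.zeros(num_train)
    for i in range(n_estimators):
        same_leaf = train_leaf_ids[:, i] == instance_leaf_ids[i]  # co-leaf mask
        weights[same_leaf] += 1.0 / np.sum(same_leaf)
    return weights / n_estimators  # normalized so the weights sum to 1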
def test_varying_samples_per_node(meta_m, m):
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/to_keep_for_taylor/remove_testing_SPL{0}_NESTI{1}.csv"
    print("Type m: ", m.dtype)
    Y = m[:, 0]
    X = m[:, 1:]
    Ynz, Xnz, meta_m_nz = trim_value(Y, X, meta_m, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    n_estimators = [5]
    samples_per_leaf = [1000]
    start_all = time.time()
    fig, ax = plt.subplots(nrows=5, ncols=5)
    nrow = 0
    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:
            print()
            print("Analysis: RF with Skewed Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)
            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=True)
            ensemble.fit(xtrain, ytrain)
            leaves = ensemble.apply(xtrain)
            # NOTE: f is a file handle that is presumably opened elsewhere
            # (e.g., at module level); it records the number of distinct
            # leaf nodes per tree
            for feature in leaves.T:
                nnodes = str(len(np.unique(feature))) + "\n"
                f.write(nnodes)
            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
            pack = fitting_four_models_leaf_nodes(dicori)
            pred_rf = ensemble.predict(xtest)
            pred_sk = testing_four_models_leaf_nodes_v2(
                ensemble, spl, n_esti, ytest, xtest, pack, pred_rf,
                meta_m_test).T
            print("Now saving")
            print(meta_m_test.shape, ytest.shape, pred_sk.shape, pred_rf.shape)
            stack = np.hstack((meta_m_test, ytest.reshape(-1, 1), pred_sk,
                               pred_rf.reshape(-1, 1)))
            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
            write_proportion_of_zeros(spl, n_esti)
            # with open(path_out.format(spl, n_esti), "w", newline="") as w:
            #     writer = csv.writer(w, delimiter=";")
            #     for item in stack:
            #         writer.writerow(item)
            plot_compare_histograms(ax, nrow, spl, ytest, pred_sk, pred_rf)
            nrow += 1
        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))
    plt.show()
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
class _LinearForest(BaseEstimator):
    """Base class for Linear Forest meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    def __init__(self, base_estimator, *, n_estimators, max_depth,
                 min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, max_features, max_leaf_nodes,
                 min_impurity_decrease, bootstrap, oob_score, n_jobs,
                 random_state, ccp_alpha, max_samples):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.ccp_alpha = ccp_alpha
        self.max_samples = max_samples

    def _sigmoid(self, y):
        """Expit function (a.k.a. logistic sigmoid).

        Parameters
        ----------
        y : array-like of shape (n_samples, )
            The array to apply expit to element-wise.

        Returns
        -------
        y : array-like of shape (n_samples, )
            Expits.
        """
        return np.exp(y) / (1 + np.exp(y))

    def _inv_sigmoid(self, y):
        """Logit function.

        Parameters
        ----------
        y : array-like of shape (n_samples, )
            The array to apply logit to element-wise.

        Returns
        -------
        y : array-like of shape (n_samples, )
            Logits.
        """
        y = y.clip(1e-3, 1 - 1e-3)
        return np.log(y / (1 - y))

    def _fit(self, X, y, sample_weight=None):
        """Build a Linear Forest from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples, ) or (n_samples, n_targets)
            for multi-target regression.
            The target values (class labels in classification, real numbers
            in regression).

        sample_weight : array-like of shape (n_samples, ), default=None
            Sample weights.

        Returns
        -------
        self : object
        """
        if not hasattr(self.base_estimator, "fit_intercept"):
            raise ValueError(
                "Only linear models are accepted as base_estimator. "
                "Select one from the linear_model module of scikit-learn.")
        if not is_regressor(self.base_estimator):
            raise ValueError(
                "Select a regressor linear model as base_estimator.")

        n_sample, self.n_features_in_ = X.shape

        # In the classification case, map class labels to integers and move
        # them onto the real line with the logit before the linear fit.
        if hasattr(self, "classes_"):
            class_to_int = dict(map(reversed, enumerate(self.classes_)))
            y = np.array([class_to_int[i] for i in y])
            y = self._inv_sigmoid(y)

        self.base_estimator_ = deepcopy(self.base_estimator)
        self.base_estimator_.fit(X, y, sample_weight)
        resid = y - self.base_estimator_.predict(X)

        criterion = "squared_error" if _sklearn_v1 else "mse"

        # Fit the forest on the residuals of the linear model.
        self.forest_estimator_ = RandomForestRegressor(
            n_estimators=self.n_estimators,
            criterion=criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            ccp_alpha=self.ccp_alpha,
            max_samples=self.max_samples,
        )
        self.forest_estimator_.fit(X, resid, sample_weight)

        if hasattr(self.base_estimator_, "coef_"):
            self.coef_ = self.base_estimator_.coef_
        if hasattr(self.base_estimator_, "intercept_"):
            self.intercept_ = self.base_estimator_.intercept_
        self.feature_importances_ = self.forest_estimator_.feature_importances_

        return self

    def apply(self, X):
        """Apply trees in the forest to X, return leaf indices.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        X_leaves : ndarray of shape (n_samples, n_estimators)
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in.
        """
        check_is_fitted(self, attributes="base_estimator_")
        return self.forest_estimator_.apply(X)

    def decision_path(self, X):
        """Return the decision path in the forest.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            A node indicator matrix where non-zero elements indicate that
            the sample goes through the corresponding nodes. The matrix is
            in CSR format.

        n_nodes_ptr : ndarray of shape (n_estimators + 1, )
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator values for the i-th estimator.
        """
        check_is_fitted(self, attributes="base_estimator_")
        return self.forest_estimator_.decision_path(X)
def test_RandomForest():
    X1 = np.arange(0, 10, 0.1)
    X2 = np.arange(10, 20, 0.1)
    y = np.sin(X1).ravel() + np.cos(X2).ravel()
    X_df = pd.DataFrame(np.array([X1, X2]).T, columns=['x1', 'x2'])

    rf_regr = RandomForestRegressor(n_estimators=1000, max_depth=5,
                                    bootstrap=False)
    rf_regr.fit(X_df, y)

    with StopWatch("LucidEnsemble Random Forest construction"):
        lucid_rf = make_LucidEnsemble(
            rf_regr, feature_names=X_df.columns, print_precision=5)

    # If this is not float32 there are precision errors: the
    # DecisionTreeRegressor instances inside RandomForestRegressor work on
    # float32 internally, so convert the input from float64 to float32.
    X_df = X_df.astype(np.float32)

    with StopWatch("Scikit-learn Random Forest prediction"):
        rf_pred = rf_regr.predict(X_df)
    with StopWatch("Lucid Random Forest (non-compressed) prediction"):
        lucid_rf_pred = lucid_rf.predict(X_df)

    ######################################################
    # test prediction outputted from LucidEnsemble
    np.testing.assert_almost_equal(lucid_rf_pred, rf_pred)
    assert np.all(rf_regr.apply(X_df) == lucid_rf.apply(X_df))

    with StopWatch("Compression of Lucid Random Forest"):
        compressed_lucid_rf = lucid_rf.compress()
    print("{} unique nodes and {} # of estimators".format(
        compressed_lucid_rf.n_leaves, len(lucid_rf)))

    with StopWatch("Lucid Random Forest (compressed) prediction"):
        crf_pred = compressed_lucid_rf.predict(X_df)
    np.testing.assert_almost_equal(crf_pred, rf_pred)

    ######################################################
    # test comparison: compare the leaves of two LucidEnsembles
    # made from the same arguments
    lucid_rf2 = make_LucidEnsemble(
        rf_regr, feature_names=X_df.columns, print_precision=3)
    compressed_lucid_rf2 = lucid_rf2.compress()
    assert set(compressed_lucid_rf.leaves) == set(compressed_lucid_rf2.leaves)

    script_dir = os.path.dirname(__file__)

    ######################################################
    # test pickling functionality
    pickle_path = os.path.join(script_dir, 'lucid_rf.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(lucid_rf, fh)
    with open(pickle_path, 'rb') as fh:
        lucid_rf_pickle = pickle.load(fh)
    np.testing.assert_almost_equal(lucid_rf_pickle.predict(X_df),
                                   lucid_rf_pred)
    os.remove(pickle_path)

    pickle_path = os.path.join(script_dir, 'compressed_lucid_rf.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(compressed_lucid_rf, fh)
    with open(pickle_path, 'rb') as fh:
        compressed_lucid_rf_pickle = pickle.load(fh)
    np.testing.assert_almost_equal(
        compressed_lucid_rf_pickle.predict(X_df), crf_pred)
    os.remove(pickle_path)
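# NOTE: StopWatch is used above but not defined in this document. A plausible
# minimal sketch as a wall-clock timing context manager; the class name is
# taken from the usage above, but the message format is an assumption.
import time


class StopWatch:
    """Print the elapsed wall-clock time of a labeled code block."""

    def __init__(self, label):
        self.label = label

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        elapsed = time.perf_counter() - self.start
        print("{}: {:.4f} seconds".format(self.label, elapsed))
        # Returning None lets any exception propagate normally.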
def test_varying_samples_per_node(meta_m, m):
    hs = []
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/explore_trees/testing_SPL{0}_NESTI{1}.csv"
    print("Type m: ", m.dtype)
    Y = m[:, 0]
    X = m[:, 1:]
    Ynz, Xnz, meta_m_nz = trim_value(Y, X, meta_m, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data: ", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    nt = 10
    n_estimators = [nt]
    samples_per_leaf = [100]
    start_all = time.time()
    fig, ax = plt.subplots(nrows=6, ncols=5, sharex=False, sharey=False)
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    nrow = 0
    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:
            print()
            print("Analysis: RF with Skewed Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)
            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=True)
            ensemble.fit(xtrain, ytrain)
            leaves = ensemble.apply(xtrain)
            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
            pack = fitting_four_models_leaf_nodes(dicori)
            pred_rf = ensemble.predict(xtest)
            pred_sk = testing_four_models_leaf_nodes_v2(
                ensemble, spl, n_esti, ytest, xtest, pack, pred_rf,
                meta_m_test).T
            stack = np.hstack((meta_m_test, ytest.reshape(-1, 1), pred_sk,
                               pred_rf.reshape(-1, 1)))
            save_tree_graph(ensemble, nt)
            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
            # write_proportion_of_zeros(spl, n_esti)
            # with open(path_out.format(spl, n_esti), "w", newline="") as w:
            #     writer = csv.writer(w, delimiter=";")
            #     for item in stack:
            #         writer.writerow(item)
            # h = plot_compare_histograms(ax, nrow, spl, ytest, pred_sk, pred_rf)
            # hs.append(h)
            # plot_pred_vs_true(ax, nrow, spl, ytest, pred_sk, pred_rf)
            nrow += 1
        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))
    # fig.legend(hs, loc="center right", borderaxespad=0.1,
    #            title="Legend Title")
    # path_fig_out = r"/home/irene/Pictures/0403_Compare_Histograms_full05.png"
    # manager = plt.get_current_fig_manager()
    # manager.window.showMaximized()
    # plt.pause(10)
    # plt.gcf().savefig(path_fig_out, format='png', dpi=300)
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
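# NOTE: save_tree_graph is called above but not defined in this document.
# A hedged sketch using scikit-learn's export_graphviz, writing one DOT file
# per estimator; the signature and output paths are assumptions, not the
# original implementation.
import os
from sklearn.tree import export_graphviz


def save_tree_graph(ensemble, n_trees, out_dir="."):
    """Export the first n_trees estimators of a fitted forest as .dot files."""
    for idx, tree in enumerate(ensemble.estimators_[:n_trees]):
        dot_path = os.path.join(out_dir, "tree_{0}.dot".format(idx))
        export_graphviz(tree, out_file=dot_path, filled=True, rounded=True)
        # Render with Graphviz, e.g.: dot -Tpng tree_0.dot -o tree_0.png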
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_ytest.csv"
    Y = m[:, 0]
    X = m[:, 1:]
    # Ynz, Xnz = trim_value(Y, X, 0)
    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)
    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data: ", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)
    for samples_per_leaf in range(500, 600, 100):
        print("Samples per leaf node: ", samples_per_leaf)
        ensemble = RandomForestRegressor(n_estimators=1,
                                         min_samples_leaf=samples_per_leaf,
                                         bootstrap=False)
        ensemble.fit(xtrain, ytrain)
        leaves = ensemble.apply(xtrain)
        dicori = samples_per_leaf_node(leaves, xtrain, ytrain)
        pack = fitting_four_models_leaf_nodes(dicori)
        pred = predicting_four_models_leaf_nodes(ytest, xtest, pack).T
        print(pred.shape, meta_m_test.shape)
        stack = np.hstack((meta_m_test, pred))
        print("Shape of stack: ", stack.shape)
        dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)
        header = "rowid;longitude;latitude;predpoi;prednb;predzip;predzinb"
        fmts = ["%d", "%d", "%d", "%.4f", "%.4f", "%.4f", "%.4f"]
        # np.savetxt(path_out, stack, delimiter=";", fmt=fmts, header=header)
        pred_rf = ensemble.predict(xtest)
        rmse_rf = np.sqrt(mean_squared_error(ytest, pred_rf))
        print()
        print("RMSE RF: ", rmse_rf)
        # Compare each per-leaf count model (Poisson, NB, ZIP, ZINB) against
        # the observed test targets, plus the plain RF prediction in panel 5.
        for i in range(4):
            plt.subplot(2, 3, i + 1)
            plt.hist(pred[:, i], bins=50)
            plt.hist(ytest, bins=50)
        plt.subplot(2, 3, 5)
        plt.hist(ytest, bins=50)
        plt.hist(pred_rf, bins=50)
        plt.show()
        break
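# The header "predpoi;prednb;predzip;predzinb" indicates the four per-leaf
# count models are Poisson, negative binomial, zero-inflated Poisson, and
# zero-inflated negative binomial. A hedged sketch of how one leaf's models
# might be fitted with statsmodels; the function name, keys, and packing
# format are assumptions, since fitting_four_models_leaf_nodes is not shown
# in this document.
import statsmodels.api as sm
from statsmodels.discrete.count_model import (
    ZeroInflatedPoisson, ZeroInflatedNegativeBinomialP)


def fit_four_count_models(y_leaf, x_leaf):
    """Fit Poisson, NB, ZIP and ZINB to the samples of one leaf node."""
    x_leaf = sm.add_constant(x_leaf)  # add an intercept column
    return {
        "poi": sm.Poisson(y_leaf, x_leaf).fit(disp=0),
        "nb": sm.NegativeBinomial(y_leaf, x_leaf).fit(disp=0),
        "zip": ZeroInflatedPoisson(y_leaf, x_leaf).fit(disp=0),
        "zinb": ZeroInflatedNegativeBinomialP(y_leaf, x_leaf).fit(disp=0),
    }
# Fitted models could then be keyed by (tree_index, leaf_id), mirroring the
# dicori grouping used throughout these functions.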