def _choose_feature(self, train_data, train_target, x_range,
                    remaining_features):
    py = np.mean(train_target, axis=0)
    H = safe_entropy(py[:, None])

    max_ig = -1
    split_feature = -1
    best_split_val = -1

    t = 1
    for x_i in remaining_features:
        prog_bar(t, len(remaining_features))
        t += 1

        # Here, we take num_splits over the range of the feature, each of
        # which we will evaluate. We do not include the last value,
        # because all features will be less than or equal to this value.
        split_vals = np.linspace(np.min(x_range[x_i]), np.max(x_range[x_i]),
                                 self.num_splits, endpoint=False)

        # If there are no informative splits, then skip this feature
        if split_vals[0] == split_vals[-1]:
            continue

        for split_val in split_vals:
            split = (train_data[:, x_i] <= split_val).astype(float)
            sum_x = np.sum(split, axis=0).astype(float)
            sum_notx = train_data.shape[0] - sum_x

            # Empirical class distributions on each side of the split
            py_given_x = np.zeros((train_target.shape[1], 1))
            py_given_notx = np.zeros((train_target.shape[1], 1))
            for y in range(train_target.shape[1]):
                y_given_x = ((split == 1) & (train_target[:, y] == 1))
                y_given_notx = ((split == 0) & (train_target[:, y] == 1))
                py_given_x[y] = np.sum(y_given_x) / sum_x
                py_given_notx[y] = np.sum(y_given_notx) / sum_notx

            # Compute the conditional entropy and information gain
            px = np.mean(split)
            cond_H = px * safe_entropy(py_given_x) + \
                (1 - px) * safe_entropy(py_given_notx)
            ig = H - cond_H

            if ig > max_ig:
                max_ig = ig
                split_feature = x_i
                best_split_val = split_val

    return split_feature, best_split_val, max_ig
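# safe_entropy and prog_bar are helpers defined elsewhere in this codebase,
# and `import numpy as np` is assumed at module level. Below is a minimal
# sketch of what safe_entropy is assumed to do here: take a matrix of
# per-class probabilities (one candidate split per column) and return the
# entropy of each column, treating 0 * log2(0) as 0. The real helper may
# differ in detail.
def safe_entropy(p):
    p = np.asarray(p, dtype=float)
    logp = np.zeros_like(p)
    nonzero = p > 0
    logp[nonzero] = np.log2(p[nonzero])
    # Column-wise entropy; zero-probability entries contribute nothing.
    return -np.sum(p * logp, axis=0)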
def test(self, test_data, test_target):
    t = 0
    # TODO: refactor the RF test function to depend not on an external
    # root but on itself
    dt = FastDecisionTree(1, 1)
    yhat_forest = np.zeros((test_data.shape[0], self.n_trees))
    for i in range(len(self.roots)):
        r = self.roots[i]
        prog_bar(t, self.n_trees)
        t += 1
        # Each tree contributes exactly one column of predictions
        yhat_forest[:, i] = dt.test_preds(r, test_data).ravel()
    prog_bar(self.n_trees, self.n_trees)

    # Majority vote across the trees
    yhat = stats.mode(yhat_forest, axis=1)[0]
    return yhat
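# The majority vote above uses scipy.stats.mode (assuming `from scipy import
# stats` at module level, with an older SciPy where mode keeps dimensions and
# returns a (modes, counts) pair). A tiny worked example of the reduction:
#
#     votes = np.array([[0., 1., 1.],
#                       [2., 2., 0.]])      # rows = samples, columns = trees
#     stats.mode(votes, axis=1)[0]          # -> array([[1.], [2.]])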
def grad_desent_lasso(self, train_data, train_target, regularization, lam,
                      step_size=1E-5, num_epochs=1000):
    self.w = np.zeros((train_data.shape[1], 1))
    t = 0
    for i in range(0, num_epochs):
        t += 1
        if t % 100 == 0:
            prog_bar(t, num_epochs)
        yhat = np.dot(train_data, self.w)
        # Keep the data term as a (d, 1) column so it lines up with self.w
        # and with the regularization term.
        grad = np.sum((yhat - train_target) * train_data, axis=0)[:, None] - \
            self._calc_reg(self.w, regularization, lam)
        self.w -= step_size * grad
    prog_bar(num_epochs, num_epochs)
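# _calc_reg is defined elsewhere in this codebase. Since the update above
# subtracts it from the data term, it is assumed here to return the *negated*
# (d, 1) penalty (sub)gradient. A hypothetical sketch (the name
# _calc_reg_sketch, the string flags, and the sign convention are assumptions,
# not the repo's actual helper):
def _calc_reg_sketch(w, regularization, lam):
    if regularization == "lasso":
        return -lam * np.sign(w)       # negative L1 subgradient
    elif regularization == "ridge":
        return -2.0 * lam * w          # negative L2 gradient
    return np.zeros_like(w)            # no regularization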
def train(self, train_data, train_target):
    t = 0
    for i in range(self.n_trees):
        prog_bar(t, self.n_trees)
        t += 1
        # Keep each row independently with probability boot_percent, then
        # fit one tree on the resulting subsample.
        keep_idx = np.random.rand(train_data.shape[0]) <= self.boot_percent
        boot_train_data = train_data[keep_idx, :]
        boot_train_target = train_target[keep_idx]
        dt = FastDecisionTree(self.max_depth, self.num_splits,
                              feat_subset=self.feat_percent,
                              debug=self.debug)
        r = dt.train(boot_train_data, boot_train_target)
        self.roots.append(r)
    prog_bar(self.n_trees, self.n_trees)
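# Note on the resampling above: each row is kept independently with
# probability boot_percent (a Bernoulli subsample, so no row appears twice).
# A classical bootstrap, drawing n rows with replacement, would instead look
# like the following (shown only for contrast, not how this class behaves):
#
#     n = train_data.shape[0]
#     idx = np.random.randint(0, n, size=n)
#     boot_train_data = train_data[idx, :]
#     boot_train_target = train_target[idx]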
def _choose_feature(self, train_data_original, train_target,
                    x_range_original):
    # Subsample the features, if applicable
    train_data = None
    x_range = dict()
    if self.feat_subset != 1.0:
        r = np.random.rand(train_data_original.shape[1])
        keep_idx = r < self.feat_subset
        train_data = train_data_original[:, keep_idx]
        sorted_keys = sorted(x_range_original.keys())
        for i in range(len(keep_idx)):
            if keep_idx[i]:
                x_range[sorted_keys[i]] = x_range_original[sorted_keys[i]]
    else:
        train_data = train_data_original
        x_range = x_range_original
    sorted_x_range = sorted(x_range.items(), key=operator.itemgetter(0))

    py = np.mean(train_target, axis=0)
    H = safe_entropy(py[:, None])

    max_ig = -1
    split_feature = -1
    best_split_val = -1

    t = 1
    for s in range(self.num_splits):
        # Evaluate the s-th candidate threshold of every kept feature at once
        splits = [l[1][s] for l in sorted_x_range]
        split = (train_data <= splits)
        sum_x = np.sum(split, axis=0).astype(float)
        sum_notx = train_data.shape[0] - sum_x

        py_given_x = np.zeros((train_target.shape[1], len(x_range)))
        py_given_notx = np.zeros((train_target.shape[1], len(x_range)))
        for y in range(train_target.shape[1]):
            if self.debug:
                prog_bar(t, self.num_splits * train_target.shape[1])
            t += 1
            y_given_x = (split & (train_target[:, y] == 1)[:, None])
            y_given_notx = (~split & (train_target[:, y] == 1)[:, None])
            y_given_x_sum = np.sum(y_given_x, axis=0).astype(float)
            y_given_notx_sum = np.sum(y_given_notx, axis=0).astype(float)
            py_given_x[y, :] = y_given_x_sum / sum_x
            py_given_notx[y, :] = y_given_notx_sum / sum_notx

        # Conditional entropy and information gain for every kept feature
        px = np.mean(split, axis=0)
        cond_H = px * safe_entropy(py_given_x) + \
            (1 - px) * safe_entropy(py_given_notx)
        ig = H - cond_H

        ig_max = np.max(ig)
        ig_argmax = np.argmax(ig)
        if ig_max > max_ig:
            max_ig = ig_max
            split_feature = sorted_x_range[ig_argmax][0]
            best_split_val = splits[ig_argmax]

    return split_feature, best_split_val, max_ig
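# The vectorised search above assumes x_range maps each feature index to an
# array of self.num_splits candidate thresholds (indexed by s). A minimal
# sketch of how such a dictionary could be precomputed, mirroring the
# per-feature linspace used in the non-vectorised _choose_feature (the name
# build_x_range is an assumption, not necessarily the repo's helper):
def build_x_range(train_data, num_splits):
    x_range = dict()
    for x_i in range(train_data.shape[1]):
        col = train_data[:, x_i]
        # endpoint=False: every sample satisfies x <= max(col), so the top
        # value is not an informative threshold.
        x_range[x_i] = np.linspace(np.min(col), np.max(col),
                                   num_splits, endpoint=False)
    return x_range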