Example #1
    def _choose_feature(self, train_data, train_target, x_range,
                        remaining_features):
        py = np.mean(train_target, axis=0)
        H = safe_entropy(py[:, None])

        max_ig = -1
        split_feature = -1
        best_split_val = -1

        t = 1
        for x_i in remaining_features:
            prog_bar(t, len(remaining_features))
            t += 1

            # Take num_splits candidate thresholds over the range of this
            # feature. The maximum is excluded because every value is <= it,
            # so splitting there would be uninformative.
            split_vals = np.linspace(np.min(x_range[x_i]),
                                     np.max(x_range[x_i]),
                                     self.num_splits,
                                     endpoint=False)

            # If there are no informative splits, then skip this feature
            if split_vals[0] == split_vals[-1]:
                continue

            for split_val in split_vals:
                split = (train_data[:, x_i] <= split_val).astype(float)

                sum_x = np.sum(split, axis=0).astype(float)
                sum_notx = train_data.shape[0] - sum_x

                py_given_x = np.zeros((train_target.shape[1], 1))
                py_given_notx = np.zeros((train_target.shape[1], 1))

                for y in range(train_target.shape[1]):
                    y_given_x = ((split == 1) & (train_target[:, y] == 1))
                    y_given_notx = ((split == 0) & (train_target[:, y] == 1))
                    py_given_x[y] = np.sum(y_given_x) / sum_x
                    py_given_notx[y] = np.sum(y_given_notx) / sum_notx

                # Compute the conditional entropy and information gain
                px = np.mean(split)
                cond_H = px * safe_entropy(py_given_x) + (
                    1 - px) * safe_entropy(py_given_notx)
                ig = H - cond_H

                if ig > max_ig:
                    max_ig = ig
                    split_feature = x_i
                    best_split_val = split_val

        return split_feature, best_split_val, max_ig
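
Both versions of this function lean on a safe_entropy helper that is not shown here. A minimal sketch of what it likely does, assuming probabilities arrive as a (classes x columns) array and that 0 * log2(0) is treated as 0 so empty classes do not poison the sum:

    import numpy as np

    def safe_entropy(p):
        # Column-wise entropy; p has shape (n_classes, n_cols) and the result
        # has shape (n_cols,). Zero probabilities contribute zero bits.
        logp = np.where(p > 0, np.log2(np.maximum(p, 1e-12)), 0.0)
        return -np.sum(p * logp, axis=0)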
Example #2
    def _choose_feature(self, train_data, train_target, x_range,
                        remaining_features):
        py = np.mean(train_target, axis=0)
        H = safe_entropy(py[:, None])

        max_ig = -1
        split_feature = -1
        best_split_val = -1

        t = 1
        for x_i in remaining_features:
            prog_bar(t, len(remaining_features))
            t += 1

            # Take num_splits candidate thresholds over the range of this
            # feature. The maximum is excluded because every value is <= it,
            # so splitting there would be uninformative.
            split_vals = np.linspace(np.min(x_range[x_i]),
                                     np.max(x_range[x_i]),
                                     self.num_splits,
                                     endpoint=False)

            # If there are no informative splits, then skip this feature
            if split_vals[0] == split_vals[-1]:
                continue

            for split_val in split_vals:
                split = (train_data[:, x_i] <= split_val).astype(float)

                sum_x = np.sum(split, axis=0).astype(float)
                sum_notx = train_data.shape[0] - sum_x

                py_given_x = np.zeros((train_target.shape[1], 1))
                py_given_notx = np.zeros((train_target.shape[1], 1))

                for y in range(train_target.shape[1]):
                    y_given_x = ((split == 1) & (train_target[:, y] == 1))
                    y_given_notx = ((split == 0) & (train_target[:, y] == 1))
                    py_given_x[y] = np.sum(y_given_x) / sum_x
                    py_given_notx[y] = np.sum(y_given_notx) / sum_notx

                # Compute the conditional entropy and information gain
                px = np.mean(split)
                cond_H = px * safe_entropy(py_given_x) + (1 - px) * safe_entropy(py_given_notx)
                ig = H - cond_H

                if ig > max_ig:
                    max_ig = ig
                    split_feature = x_i
                    best_split_val = split_val

        return split_feature, best_split_val, max_ig
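
prog_bar is another external helper used throughout these snippets. A plausible stand-in (the real signature is prog_bar(step, total); everything else here is an assumption) that prints a single overwriting console line:

    import sys

    def prog_bar(step, total, width=40):
        # Minimal console progress bar; rewrites the same line on each call.
        filled = int(width * step / float(total))
        bar = '#' * filled + '-' * (width - filled)
        sys.stdout.write('\r[%s] %d/%d' % (bar, step, total))
        sys.stdout.flush()
        if step == total:
            sys.stdout.write('\n')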
Example #3
    def test(self, test_data, test_target):
        t = 0
        # TODO: refactor the RF test function to depend not on an external
        # root but on itself
        dt = FastDecisionTree(1, 1)
        yhat_forest = np.zeros((test_data.shape[0], self.n_trees))
        for i in range(len(self.roots)):
            r = self.roots[i]
            prog_bar(t, self.n_trees)
            t += 1

            # Column i holds the predictions of tree i; ravel keeps the
            # assignment shape-safe whether test_preds returns (n,) or (n, 1).
            yhat_forest[:, i] = np.ravel(dt.test_preds(r, test_data))

        prog_bar(self.n_trees, self.n_trees)

        yhat = stats.mode(yhat_forest, axis=1)[0]
        return yhat
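
The forest's final prediction is a per-row majority vote over the tree columns. A quick check of what stats.mode(..., axis=1)[0] returns (this assumes `from scipy import stats`; newer SciPy versions change the default keepdims behaviour, so the result may be (n,) rather than (n, 1)):

    import numpy as np
    from scipy import stats

    yhat_forest = np.array([[1., 1., 0.],    # two of three trees vote class 1
                            [0., 2., 2.]])   # two of three trees vote class 2
    yhat = stats.mode(yhat_forest, axis=1)[0]
    print(yhat)  # majority label per test row: 1. and 2.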
Example #4
    def test(self, test_data, test_target):
        t = 0
        # TODO: refactor the RF test function to depend not on an external
        # root but on itself
        dt = FastDecisionTree(1, 1)
        yhat_forest = np.zeros((test_data.shape[0], self.n_trees))
        for i in range(len(self.roots)):
            r = self.roots[i]
            prog_bar(t, self.n_trees)
            t += 1

            # Column i holds the predictions of tree i; ravel keeps the
            # assignment shape-safe whether test_preds returns (n,) or (n, 1).
            yhat_forest[:, i] = np.ravel(dt.test_preds(r, test_data))

        prog_bar(self.n_trees, self.n_trees)

        yhat = stats.mode(yhat_forest, axis=1)[0]
        return yhat
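
dt.test_preds(root, data) is not shown in any of these examples. A rough sketch of what a per-row tree walk could look like; the node attributes (left, right, feature, split_val, label) are hypothetical names, not the actual FastDecisionTree API:

    def test_preds(self, root, data):
        # Walk each row down the tree until a leaf is reached, collecting one
        # label per row as an (n, 1) column to match yhat_forest's columns.
        preds = np.zeros((data.shape[0], 1))
        for n in range(data.shape[0]):
            node = root
            while node.left is not None:                 # internal node
                if data[n, node.feature] <= node.split_val:
                    node = node.left
                else:
                    node = node.right
            preds[n, 0] = node.label                     # leaf class label
        return preds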
Example #5
    def grad_desent_lasso(self, train_data, train_target, regularization, lam,
            step_size=1E-5, num_epochs=1000):

        self.w = np.zeros((train_data.shape[1], 1))

        t = 0
        for i in range(0, num_epochs):
            t += 1
            if t % 100 == 0:
                prog_bar(t, num_epochs)

            yhat = np.dot(train_data, self.w)
            # Reshape the data term to a (d, 1) column so it lines up with the
            # regularization term and with self.w.
            grad = np.sum((yhat - train_target) * train_data, axis=0)[:, None] - \
                   self._calc_reg(self.w, regularization, lam)
            self.w -= step_size * grad

        prog_bar(num_epochs, num_epochs)
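
_calc_reg is called but not shown. Since the caller forms `grad = data_term - self._calc_reg(...)`, one consistent reading is that it returns the negated penalty gradient, e.g. -lam * sign(w) for lasso, so the overall update descends the regularized objective. A hedged sketch under that assumption:

    def _calc_reg(self, w, regularization, lam):
        # Negated penalty (sub)gradient, shaped like w: (d, 1). The sign
        # convention here is an assumption made to match the subtraction above.
        if regularization == 'lasso':
            return -lam * np.sign(w)     # L1 subgradient
        elif regularization == 'ridge':
            return -2.0 * lam * w        # L2 gradient
        return np.zeros_like(w)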
Example #6
    def train(self, train_data, train_target):
        t = 0
        for i in range(self.n_trees):
            prog_bar(t, self.n_trees)
            t += 1

            keep_idx = np.random.rand(train_data.shape[0]) <= \
                       self.boot_percent

            boot_train_data = train_data[keep_idx, :]
            boot_train_target = train_target[keep_idx]

            dt = FastDecisionTree(self.max_depth, self.num_splits,
                                  feat_subset=self.feat_percent,
                                  debug=self.debug)

            r = dt.train(boot_train_data, boot_train_target)
            self.roots.append(r)

        prog_bar(self.n_trees, self.n_trees)
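
A usage sketch for the forest as a whole, assuming a RandomForest-style wrapper class that owns n_trees, max_depth, num_splits, boot_percent, feat_percent, and an empty self.roots list (the class name and constructor signature are assumptions):

    rf = RandomForest(n_trees=50, max_depth=8, num_splits=10,
                      boot_percent=0.66, feat_percent=0.5)
    rf.train(train_data, train_target)       # grows 50 subsampled trees
    yhat = rf.test(test_data, test_target)   # majority vote across the trees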
Example #7
    def train(self, train_data, train_target):
        t = 0
        for i in range(self.n_trees):
            prog_bar(t, self.n_trees)
            t += 1

            keep_idx = np.random.rand(train_data.shape[0]) <= \
                       self.boot_percent

            boot_train_data = train_data[keep_idx, :]
            boot_train_target = train_target[keep_idx]

            dt = FastDecisionTree(self.max_depth,
                                  self.num_splits,
                                  feat_subset=self.feat_percent,
                                  debug=self.debug)

            r = dt.train(boot_train_data, boot_train_target)
            self.roots.append(r)

        prog_bar(self.n_trees, self.n_trees)
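
For reference, keep_idx keeps each row independently with probability boot_percent, so every tree sees a different random subsample (sampling without replacement rather than a classical bootstrap). A tiny demonstration of the mask:

    import numpy as np

    np.random.seed(0)
    boot_percent = 0.66
    keep_idx = np.random.rand(10) <= boot_percent   # boolean mask, ~66% True
    print(keep_idx.sum(), "of 10 rows kept for this tree")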
Example #8
    def _choose_feature(self, train_data_original, train_target,
                        x_range_original):

        # Subsample the features, if applicable
        train_data = None
        x_range = dict()
        if self.feat_subset != 1.0:
            r = np.random.rand(train_data_original.shape[1])
            keep_idx = r < self.feat_subset

            train_data = train_data_original[:, keep_idx]
            sorted_keys = sorted(x_range_original.keys())
            for i in range(len(keep_idx)):
                if keep_idx[i]:
                    x_range[sorted_keys[i]] = x_range_original[sorted_keys[i]]

        else:
            train_data = train_data_original
            x_range = x_range_original

        sorted_x_range = sorted(x_range.items(), key=operator.itemgetter(0))

        py = np.mean(train_target, axis=0)
        H = safe_entropy(py[:, None])

        max_ig = -1
        split_feature = -1
        best_split_val = -1

        t = 1
        for s in range(self.num_splits):

            splits = [l[1][s] for l in sorted_x_range]
            split = (train_data <= splits)

            sum_x = (np.sum(split, axis=0)).astype(float)
            sum_notx = train_data.shape[0] - sum_x

            py_given_x = np.zeros((train_target.shape[1], len(x_range)))
            py_given_notx = np.zeros((train_target.shape[1], len(x_range)))

            for y in range(train_target.shape[1]):
                if self.debug:
                    prog_bar(t, self.num_splits*train_target.shape[1])
                    t += 1

                y_given_x = (split & (train_target[:, y] == 1)[:, None])
                y_given_notx = (~split & (train_target[:, y] == 1)[:, None])

                y_given_x_sum = (np.sum(y_given_x, axis=0)).astype(float)
                y_given_notx_sum = (np.sum(y_given_notx, axis=0)).astype(float)

                py_given_x[y, :] = y_given_x_sum / sum_x
                py_given_notx[y, :] = y_given_notx_sum / sum_notx

            px = np.mean(split, axis=0)
            cond_H = px * safe_entropy(py_given_x) + (1 - px) * safe_entropy(py_given_notx)
            ig = H - cond_H
            ig_max = np.max(ig)
            ig_argmax = np.argmax(ig)

            if ig_max > max_ig:
                max_ig = ig_max
                split_feature = sorted_x_range[ig_argmax][0]
                best_split_val = splits[ig_argmax]

        return split_feature, best_split_val, max_ig
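
The vectorized _choose_feature scores one candidate threshold per remaining feature in a single pass: `train_data <= splits` broadcasts a length-n_features list of thresholds across the whole matrix. A small check of that broadcasting:

    import numpy as np

    X = np.array([[1.0, 10.0],
                  [2.0, 20.0],
                  [3.0, 30.0]])
    thresholds = [2.0, 15.0]    # one candidate split value per column
    print(X <= thresholds)
    # [[ True  True]
    #  [ True False]
    #  [False False]]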