Example #1
File: err.py Project: Ulden/news
    def calc_swap_deltas(self, qid, targets):
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        satisfied_probs = np.zeros(n_targets)
        prefix_sums = np.zeros(n_targets + 1)
        point_residuals = np.ones(n_targets + 1)

        for i, t in enumerate(targets):
            assert t <= self.highest_score
            sprob = self._get_satisfied_prob(t)
            satisfied_probs[i] = sprob
            prefix_sums[i + 1] = (
                prefix_sums[i] +
                ((point_residuals[i] * sprob / (1.0 + i))
                 if i < self.k else 0.0))
            point_residuals[i + 1] = point_residuals[i] * (1.0 - sprob)

        for i in range(min(n_targets, self.k)):
            for j in range(i + 1, n_targets):
                if satisfied_probs[i] == satisfied_probs[j]:
                    continue

                ratio = (1.0 - satisfied_probs[j]) / (1.0 - satisfied_probs[i])
                deltas[i, j] = (
                    # delta on i-th position
                    ((satisfied_probs[j] - satisfied_probs[i]) *
                     point_residuals[i] / (i + 1.0)) +
                    # delta on i+1 to j-1 positions
                    (prefix_sums[j] - prefix_sums[i + 1]) * (ratio - 1.0) +
                    # delta on j-th position
                    (((point_residuals[j] / (j + 1.0)) *
                      (satisfied_probs[i] * ratio - satisfied_probs[j]))
                     if j < self.k else 0.0))

        return deltas
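The two helper arrays are what keep this O(n^2) overall: point_residuals[i] is the probability that none of the first i documents satisfied the user, and prefix_sums accumulates each position's ERR contribution, so every swap delta is assembled in constant time. A minimal brute-force cross-check, assuming the standard ERR satisfaction probability (2^t - 1) / 2^T; satisfied_prob and err are hypothetical stand-ins for the class internals:

def satisfied_prob(t, highest_score=4):
    # assumed standard ERR mapping, a stand-in for _get_satisfied_prob
    return (2.0 ** t - 1.0) / (2.0 ** highest_score)

def err(targets, k):
    # direct ERR: sum_i p_i / (i + 1) * prod_{j < i} (1 - p_j)
    score, residual = 0.0, 1.0
    for i, t in enumerate(targets[:k]):
        p = satisfied_prob(t)
        score += residual * p / (i + 1.0)
        residual *= 1.0 - p
    return score

targets = [3, 0, 2, 1]
base = err(targets, k=4)
for i in range(len(targets)):
    for j in range(i + 1, len(targets)):
        swapped = list(targets)
        swapped[i], swapped[j] = swapped[j], swapped[i]
        # each difference should equal deltas[i, j] from calc_swap_deltas
        print(i, j, err(swapped, k=4) - base)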
Example #3
File: ap.py Project: Ulden/news
    def calc_swap_deltas(self, qid, targets):
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        total_num_rel = 0
        total_metric = 0.0
        for i in range(min(n_targets, self.k)):
            if targets[i] >= self.cutoff:
                total_num_rel += 1
                total_metric += total_num_rel / (i + 1.0)
        metric = (total_metric / total_num_rel) if total_num_rel > 0 else 0.0

        num_rel_i = 0
        for i in range(min(n_targets, self.k)):
            if targets[i] >= self.cutoff:
                num_rel_i += 1
                num_rel_j = num_rel_i
                sub = num_rel_i / (i + 1.0)

                for j in range(i + 1, n_targets):
                    if targets[j] >= self.cutoff:
                        if j < self.k:
                            num_rel_j += 1
                            sub += 1 / (j + 1.0)
                    else:
                        add = (num_rel_j / (j + 1.0)) if j < self.k else 0.0
                        new_total_metric = total_metric + add - sub
                        new_num_rel = (total_num_rel
                                       if j < self.k
                                       else (total_num_rel - 1))
                        new_metric = ((new_total_metric / new_num_rel)
                                      if new_num_rel > 0
                                      else 0.0)
                        deltas[i, j] = new_metric - metric

            else:
                num_rel_j = num_rel_i
                add = (num_rel_i + 1) / (i + 1.0)

                for j in range(i + 1, n_targets):
                    if targets[j] >= self.cutoff:
                        sub = (((num_rel_j + 1) / (j + 1.0))
                               if j < self.k
                               else 0.0)
                        new_total_metric = total_metric + add - sub
                        new_num_rel = (total_num_rel
                                       if j < self.k
                                       else (total_num_rel + 1))
                        new_metric = ((new_total_metric / new_num_rel)
                                      if new_num_rel > 0
                                      else 0.0)
                        deltas[i, j] = new_metric - metric

                        if j < self.k:
                            num_rel_j += 1
                            add += 1 / (j + 1.0)

        return deltas
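A small worked check against the direct MAP evaluate (the same form appears in Example #11 below); cutoff and k here are assumed parameters:

def average_precision(targets, cutoff=1, k=10):
    # direct MAP@k, mirroring the evaluate method in Example #11
    num_rel, total_prec = 0, 0.0
    for i in range(min(len(targets), k)):
        if targets[i] >= cutoff:
            num_rel += 1
            total_prec += num_rel / (i + 1.0)
    return (total_prec / num_rel) if num_rel > 0 else 0.0

targets = [1, 0, 1]
base = average_precision(targets)            # (1/1 + 2/3) / 2 = 0.8333...
swapped = [1, 1, 0]                          # swap positions 1 and 2
print(average_precision(swapped) - base)     # 0.1667, i.e. deltas[1, 2]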
Example #4
File: ap.py Project: pombredanne/pyltr
    def calc_swap_deltas(self, qid, targets):
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        total_num_rel = 0
        total_metric = 0.0
        for i in range(min(n_targets, self.k)):
            if targets[i] >= self.cutoff:
                total_num_rel += 1
                total_metric += total_num_rel / (i + 1.0)
        metric = (total_metric / total_num_rel) if total_num_rel > 0 else 0.0

        num_rel_i = 0
        for i in range(min(n_targets, self.k)):
            if targets[i] >= self.cutoff:
                num_rel_i += 1
                num_rel_j = num_rel_i
                sub = num_rel_i / (i + 1.0)

                for j in range(i + 1, n_targets):
                    if targets[j] >= self.cutoff:
                        if j < self.k:
                            num_rel_j += 1
                            sub += 1 / (j + 1.0)
                    else:
                        add = (num_rel_j / (j + 1.0)) if j < self.k else 0.0
                        new_total_metric = total_metric + add - sub
                        new_num_rel = (total_num_rel
                                       if j < self.k
                                       else (total_num_rel - 1))
                        new_metric = ((new_total_metric / new_num_rel)
                                      if new_num_rel > 0
                                      else 0.0)
                        deltas[i, j] = new_metric - metric

            else:
                num_rel_j = num_rel_i
                add = (num_rel_i + 1) / (i + 1.0)

                for j in range(i + 1, n_targets):
                    if targets[j] >= self.cutoff:
                        sub = (((num_rel_j + 1) / (j + 1.0))
                               if j < self.k
                               else 0.0)
                        new_total_metric = total_metric + add - sub
                        new_num_rel = (total_num_rel
                                       if j < self.k
                                       else (total_num_rel + 1))
                        new_metric = ((new_total_metric / new_num_rel)
                                      if new_num_rel > 0
                                      else 0.0)
                        deltas[i, j] = new_metric - metric

                        if j < self.k:
                            num_rel_j += 1
                            add += 1 / (j + 1.0)

        return deltas
Example #6
File: dcg.py Project: cpjha13/pyltr
    def calc_swap_deltas(self, qid, targets, coeff=1.0):
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))

        for i in range(min(n_targets, self.k)):
            for j in range(i + 1, n_targets):
                deltas[i, j] = coeff * \
                    (self._gain_fn(targets[i]) - self._gain_fn(targets[j])) * \
                    (self._get_discount(j) - self._get_discount(i))

        return deltas
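Because DCG is a sum of gain(target) * discount(position) terms, a swap moves exactly two terms, which gives the closed form above. A quick numeric check, assuming the common exponential gain and logarithmic discount as stand-ins for _gain_fn and _get_discount:

import numpy as np

gain = lambda t: 2.0 ** t - 1.0                  # assumed gain function
discount = lambda i: 1.0 / np.log2(i + 2.0)      # assumed discount function

def dcg(targets):
    return sum(gain(t) * discount(i) for i, t in enumerate(targets))

targets = [3, 1, 2]
i, j = 0, 2
closed_form = (gain(targets[i]) - gain(targets[j])) * (discount(j) - discount(i))
swapped = [2, 1, 3]
print(closed_form, dcg(swapped) - dcg(targets))  # both -2.0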
Example #7
    def calc_random_ev(self, qid, targets):
        """Calculates the expectied value of the metric on randomized targets.

        This implementation just averages the metric over 100 shuffles.
        Not implemented for non-LTR metrics.

        Parameters
        ----------
        qid : object
            See `evaluate`.
        targets : array_like of shape = [n_targets]
            See `evaluate`.

        Returns
        -------
        float
            Expected value of the metric from random ordering of targets.

        """
        targets = np.copy(targets)
        scores = []
        for _ in range(100):
            np.random.shuffle(targets)
            scores.append(self.evaluate(qid, targets))
        return np.mean(scores)
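For simple metrics the shuffle average converges on a value you can also derive analytically: under a uniformly random ordering, the expected precision@k is just the overall fraction of relevant documents. A standalone sketch of that convergence (precision_at_k is a hypothetical helper, not the class method):

import numpy as np

def precision_at_k(targets, k=5, cutoff=1):
    return sum(1 for t in targets[:k] if t >= cutoff) / float(k)

rng = np.random.RandomState(0)
targets = np.array([1, 1, 1] + [0] * 7)    # 3 relevant out of 10
scores = []
for _ in range(10000):
    rng.shuffle(targets)
    scores.append(precision_at_k(targets))
print(np.mean(scores))                      # ~0.3, the analytic EV of 3/10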
Example #8
File: svmIO.py Project: junjiek/cmu-exp
def _dump_svmlight(X, y, f, one_based, comment, query_id):
    is_sp = int(hasattr(X, "tocsr"))
    if X.dtype.kind == 'i':
        value_pattern = u("%d:%d")
    else:
        value_pattern = u("%d:%.16g")

    line_pattern = u("%s")

    line_pattern += u(" %s\n")

    for i in range(X.shape[0]):
        if is_sp:
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])

        s = " ".join(value_pattern % (j + one_based, x) for j, x in row)
        label = ""
        first = True
        for l in y[i]:
            if not first:
                label += ","
            label += str(int(l))
            first = False
        feat = (label, s)
        f.write((line_pattern % feat).encode('ascii'))
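A minimal usage sketch, assuming u is the text helper this snippet imports elsewhere (an identity on Python 3) and that y is a sequence of label lists, as the inner loop expects; output goes to an in-memory buffer:

import io
import numpy as np

u = str   # assumption: the u() helper reduces to str on Python 3

X = np.array([[0.5, 0.0, 1.25],
              [0.0, 2.0, 0.0]])
y = [[1], [0, 2]]                 # one label list per row
buf = io.BytesIO()
_dump_svmlight(X, y, buf, one_based=1, comment=None, query_id=None)
print(buf.getvalue().decode('ascii'))
# 1 1:0.5 3:1.25
# 0,2 2:2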
Example #11
File: ap.py Project: Ulden/news
 def evaluate(self, qid, targets):
     num_rel = 0
     total_prec = 0.0
     for i in range(min(len(targets), self.k)):
         if targets[i] >= self.cutoff:
             num_rel += 1
             total_prec += num_rel / (i + 1.0)
     return (total_prec / num_rel) if num_rel > 0 else 0.0
Example #12
def _partial_dependence_recursion(est, grid, target_variables):

    # grid needs to be DTYPE
    grid = np.asarray(grid, dtype=DTYPE, order='C')

    n_trees_per_stage = est.estimators_.shape[1]
    n_estimators = est.estimators_.shape[0]
    learning_rate = est.learning_rate
    averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]),
                                    dtype=np.float64, order='C')
    for stage in range(n_estimators):
        for k in range(n_trees_per_stage):
            tree = est.estimators_[stage, k].tree_
            _partial_dependence_tree(tree, grid, target_variables,
                                     learning_rate, averaged_predictions[k])

    return averaged_predictions
Example #13
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles`` of ``X``.
    The grid is a cartesian product between the columns of Z. The ith column of
    Z consists of ``grid_resolution`` equally-spaced points between the
    percentiles of the ith column of X.
    If ``grid_resolution`` is bigger than the number of unique values in the
    ith column of X, then those unique values will be used instead.
    Parameters
    ----------
    X : ndarray
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme values of
        the grid.
    grid_resolution : int
        The number of equally spaced points to be placed on the grid for a
        given column.
    Returns
    -------
    grid : ndarray, shape=(n_points, X.shape[1])
        All data points on the grid. n_points is always ``<= grid_resolution **
        X.shape[1]``.
    Z: list of ndarray
        The values with which the grid has been created. The ndarrays may be of
        different shape: either (grid_resolution,) or (n_unique_values,).
    """
    try:
        assert len(percentiles) == 2
    except (AssertionError, TypeError):
        raise ValueError('percentiles must be a sequence of 2 elements.')
    if not all(0. <= x <= 1. for x in percentiles):
        raise ValueError('percentiles values must be in [0, 1].')
    if percentiles[0] >= percentiles[1]:
        raise ValueError('percentiles[0] must be strictly less '
                         'than percentiles[1].')

    if grid_resolution <= 1:
        raise ValueError('grid_resolution must be strictly greater than 1.')

    values = []
    for feature in range(X.shape[1]):
        uniques = np.unique(X[:, feature])
        if uniques.shape[0] < grid_resolution:
            # feature has low resolution use unique vals
            axis = uniques
        else:
            # create axis based on percentiles and grid resolution
            emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
            if np.allclose(emp_percentiles[0, feature],
                           emp_percentiles[1, feature]):
                raise ValueError('percentiles are too close to each other, '
                                 'unable to build the grid.')
            axis = np.linspace(emp_percentiles[0, feature],
                               emp_percentiles[1, feature],
                               num=grid_resolution, endpoint=True)
        values.append(axis)

    return cartesian(values), values
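A short usage sketch, assuming the two imports the helper relies on (mquantiles from scipy, cartesian from sklearn's utilities):

import numpy as np
from scipy.stats.mstats import mquantiles
from sklearn.utils.extmath import cartesian

X = np.random.RandomState(0).rand(50, 2)
grid, values = _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=5)
print(grid.shape)                   # (25, 2): cartesian product of two 5-point axes
print([v.shape for v in values])    # [(5,), (5,)]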
Example #14
def test_numeric_stability():
    X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
    Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    for i in range(1, 9):
        X = X_init / 10**i
        Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
        assert_array_equal(Xt_expected, Xt)
Example #15
 def evaluate(self, qid, targets):
     n_targets = len(targets)
     num_rel = 0.
     for i in range(n_targets):
         if i >= self.k:
             break
         if targets[i] >= self.cutoff:
             num_rel += 1
     return (num_rel / self.k)
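A worked call, wrapping the method in a minimal stand-in class (the class name and constructor are hypothetical; only the k and cutoff attributes matter):

class PrecisionAtK(object):
    # minimal stand-in exposing the attributes the method reads
    def __init__(self, k, cutoff):
        self.k = k
        self.cutoff = cutoff

    def evaluate(self, qid, targets):
        num_rel = 0.
        for i, t in enumerate(targets):
            if i >= self.k:
                break
            if t >= self.cutoff:
                num_rel += 1
        return num_rel / self.k

print(PrecisionAtK(k=2, cutoff=1).evaluate(None, [1, 0, 1, 1]))   # 0.5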
Example #17
File: ap.py Project: asds25810/Tuning
 def evaluate(self, qid, targets):
     n_targets = len(targets)
     num_rel = 0
     total_prec = 0.0
     for i in range(n_targets):
         if targets[i] >= self.cutoff:
             num_rel += 1
             if i < self.k:
                 total_prec += num_rel / (i + 1.0)
     return (total_prec / num_rel) if num_rel > 0 else 0.0
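Note the subtle difference from Example #11: here num_rel counts every relevant document, even past rank k, so the truncated precision sum is normalized by the full relevant count. A small sketch contrasting the two variants as standalone functions:

def ap_topk_denominator(targets, k, cutoff=1):
    # Example #11: both the sum and the denominator stop at rank k
    num_rel, total = 0, 0.0
    for i in range(min(len(targets), k)):
        if targets[i] >= cutoff:
            num_rel += 1
            total += num_rel / (i + 1.0)
    return (total / num_rel) if num_rel else 0.0

def ap_full_denominator(targets, k, cutoff=1):
    # Example #17: the denominator counts relevant docs beyond k too
    num_rel, total = 0, 0.0
    for i, t in enumerate(targets):
        if t >= cutoff:
            num_rel += 1
            if i < k:
                total += num_rel / (i + 1.0)
    return (total / num_rel) if num_rel else 0.0

targets = [1, 0, 0, 1]
print(ap_topk_denominator(targets, k=2))   # 1.0 (only the top-2 window counts)
print(ap_full_denominator(targets, k=2))   # 0.5 (the relevant doc at rank 3 penalizes)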
Example #19
File: lambdamart.py Project: Ulden/news
    def _calc_lambdas_deltas(self, qid, y, y_pred):
        ns = y.shape[0]
        positions = get_sorted_y_positions(y, y_pred, check=False)
        actual = y[positions]

        swap_deltas = self.metric.calc_swap_deltas(qid, actual)
        max_k = self.metric.max_k()
        if max_k is None or ns < max_k:
            max_k = ns

        lambdas = np.zeros(ns)
        deltas = np.zeros(ns)

        for i in range(max_k):
            for j in range(i + 1, ns):
                if actual[i] == actual[j]:
                    continue

                delta_metric = swap_deltas[i, j]
                if delta_metric == 0.0:
                    continue

                a, b = positions[i], positions[j]
                # invariant: y_pred[a] >= y_pred[b]

                if actual[i] < actual[j]:
                    assert delta_metric > 0.0
                    logistic = scipy.special.expit(y_pred[a] - y_pred[b])
                    l = logistic * delta_metric
                    lambdas[a] -= l
                    lambdas[b] += l
                else:
                    assert delta_metric < 0.0
                    logistic = scipy.special.expit(y_pred[b] - y_pred[a])
                    l = logistic * -delta_metric
                    lambdas[a] += l
                    lambdas[b] -= l

                gradient = (1 - logistic) * l
                deltas[a] += gradient
                deltas[b] += gradient

        return lambdas, deltas
Example #20
File: roc.py Project: asds25810/Tuning
    def calc_swap_deltas(self, qid, targets):
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        rel = np.array(targets) >= self.cutoff
        total_num_rel = sum(rel)

        if total_num_rel == 0 or total_num_rel == n_targets:
            return deltas

        denom = total_num_rel * float(n_targets - total_num_rel)
        for i in range(n_targets):
            irel = rel[i]
            for j in range(i + 1, n_targets):
                jrel = rel[j]
                if not irel and jrel:
                    deltas[i, j] = (j - i) / denom
                elif irel and not jrel:
                    deltas[i, j] = (i - j) / denom

        return deltas
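The (j - i) / denom form has a direct counting argument: when a non-relevant document at i trades places with a relevant one at j, every document strictly between them gains one correctly ordered pair (against either the promoted or the demoted document), and the swapped pair itself adds one more, for j - i in total. A brute-force sketch of that identity (auc here is a hypothetical standalone restatement of the metric):

def auc(rel):
    # fraction of (relevant, non-relevant) pairs ranked in the right order
    num_rel = sum(rel)
    denom = num_rel * float(len(rel) - num_rel)
    correct = sum(1 for i in range(len(rel)) for j in range(i + 1, len(rel))
                  if rel[i] and not rel[j])
    return correct / denom

rel = [False, True, False, True, True]    # cutoff already applied
base = auc(rel)
i, j = 0, 3                               # non-relevant at i, relevant at j
swapped = list(rel)
swapped[i], swapped[j] = swapped[j], swapped[i]
print(auc(swapped) - base, (j - i) / (3 * 2.0))   # both 0.5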
Example #21
File: roc.py Project: Ulden/news
    def calc_swap_deltas(self, qid, targets):
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        rel = np.array(targets) >= self.cutoff
        total_num_rel = sum(rel)

        if total_num_rel == 0 or total_num_rel == n_targets:
            return deltas

        denom = total_num_rel * float(n_targets - total_num_rel)
        for i in range(n_targets):
            irel = rel[i]
            for j in range(i + 1, n_targets):
                jrel = rel[j]
                if not irel and jrel:
                    deltas[i, j] = (j - i) / denom
                elif irel and not jrel:
                    deltas[i, j] = (i - j) / denom

        return deltas
Example #23
def test_mmk():
    bags = [np.random.normal(size=(np.random.randint(10, 100), 10))
            for _ in range(20)]

    res = MeanMapKernel(gamma=2.38).fit_transform(bags)
    for i in range(20):
        for j in range(20):
            exp = pairwise_kernels(bags[j], bags[i], metric='rbf', gamma=2.38)
            assert_almost_equal(res[i, j], exp.mean(),
                                err_msg="({} to {})".format(i, j))

    res = MeanMapKernel(kernel='linear').fit(bags[:5]).transform(bags[-2:])
    for i in range(5):
        for j in range(18, 20):
            exp = pairwise_kernels(bags[j], bags[i], metric='linear')
            assert_almost_equal(res[j - 18, i], exp.mean(),
                                err_msg="({} to {})".format(i, j))

    # fails on wrong dimension
    assert_raises(
        ValueError,
        lambda: MeanMapKernel().fit(bags).transform([np.random.randn(20, 8)]))
Example #24
    def calc_swap_deltas(self, qid, targets):
        """Returns an upper triangular matrix.

        Each (i, j) contains the change in the metric from swapping
        ``targets[i]`` with ``targets[j]``.

        Parameters
        ----------
        qid : object
            See `evaluate`.
        targets : array_like of shape = [n_targets]
            See `evaluate`.

        Returns
        -------
        deltas : array_like of shape = [n_targets, n_targets]
            Upper triangular matrix, where ``deltas[i, j]`` is the change in
            the metric from swapping ``targets[i]`` with ``targets[j]``.

        """
        n_targets = len(targets)
        deltas = np.zeros((n_targets, n_targets))
        original = self.evaluate(qid, targets)
        max_k = self.max_k()
        if max_k is None or n_targets < max_k:
            max_k = n_targets

        for i in range(max_k):
            for j in range(i + 1, n_targets):
                tmp = targets[i]
                targets[i] = targets[j]
                targets[j] = tmp
                deltas[i, j] = self.evaluate(qid, targets) - original
                tmp = targets[i]
                targets[i] = targets[j]
                targets[j] = tmp

        return deltas
Example #26
File: mmk.py Project: cimor/skl-groups
    def transform(self, X):
        '''
        Compute kernels from X to :attr:`features_`.

        Parameters
        ----------
        X : list of arrays or :class:`skl_groups.features.Features`
            The bags to compute "from". Must have same dimension as
            :attr:`features_`.

        Returns
        -------
        K : array of shape ``[len(X), len(features_)]``
            The kernel evaluations from X to :attr:`features_`.
        '''

        X = as_features(X, stack=True, bare=True)
        Y = self.features_

        if X.dim != Y.dim:
            raise ValueError("MMK transform got dimension {} but had {} at fit"
                             .format(X.dim, Y.dim))

        pointwise = pairwise_kernels(X.stacked_features, Y.stacked_features,
                                     metric=self.kernel,
                                     filter_params=True,
                                     **self._get_kernel_params())

        # TODO: is there a way to do this without a Python loop?
        K = np.empty((len(X), len(Y)))
        for i in range(len(X)):
            for j in range(len(Y)):
                K[i, j] = pointwise[X._boundaries[i]:X._boundaries[i+1],
                                    Y._boundaries[j]:Y._boundaries[j+1]].mean()

        return K
Example #27
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles of ``X``.

    The grid is generated by placing ``grid_resolution`` equally
    spaced points between the ``percentiles`` of each column
    of ``X``.

    Parameters
    ----------
    X : ndarray
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme
        values of the grid axes.
    grid_resolution : int
        The number of equally spaced points that are placed
        on the grid.

    Returns
    -------
    grid : ndarray
        All data points on the grid; ``grid.shape[1] == X.shape[1]``
        and ``grid.shape[0] <= grid_resolution ** X.shape[1]``.
    axes : seq of ndarray
        The axes with which the grid has been created.
    """
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    if not all(0. <= x <= 1. for x in percentiles):
        raise ValueError('percentile values must be in [0, 1]')

    axes = []
    emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
    for col in range(X.shape[1]):
        uniques = np.unique(X[:, col])
        if uniques.shape[0] < grid_resolution:
            # feature has low resolution use unique vals
            axis = uniques
        else:
            # create axis based on percentiles and grid resolution
            axis = np.linspace(emp_percentiles[0, col],
                               emp_percentiles[1, col],
                               num=grid_resolution,
                               endpoint=True)
        axes.append(axis)

    return cartesian(axes), axes
Example #29
File: kendall.py Project: Ulden/news
    def evaluate(self, qid, targets):
        n_targets = len(targets)
        if n_targets < 2:
            return 0.0

        concordant, discordant = 0, 0
        for i, t1 in enumerate(targets):
            for j in range(i + 1, n_targets):
                t2 = targets[j]
                if abs(t1 - t2) < _EPS:
                    continue
                rank_higher = i < j
                score_higher = t1 > t2
                if rank_higher == score_higher:
                    concordant += 1
                else:
                    discordant += 1
        return (concordant - discordant) / (n_targets * (n_targets - 1) / 2.0)
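A worked micro-example: for targets [3, 1, 2] the pairs (3, 1) and (3, 2) agree with the ranking while (1, 2) does not, so the score is (2 - 1) / 3. For tie-free targets this matches scipy's Kendall tau when the targets are negated so that higher scores sort first:

from scipy.stats import kendalltau

targets = [3, 1, 2]
# by hand: 2 concordant pairs, 1 discordant -> (2 - 1) / 3 = 0.333...
print(kendalltau(range(len(targets)), [-t for t in targets]).correlation)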
Example #30
 def _pretty_print_score(self, score):
     if score.size == 1:
         return '%12.4f' % score
     return ''.join('%8.4f' % score[i] for i in range(score.size))
Example #31
def plot_partial_dependence(gbrt,
                            X,
                            features,
                            feature_names=None,
                            label=None,
                            n_cols=3,
                            grid_resolution=100,
                            percentiles=(0.05, 0.95),
                            n_jobs=1,
                            verbose=0,
                            ax=None,
                            line_kw=None,
                            contour_kw=None,
                            **fig_kw):
    """Partial dependence plots for ``features``.
    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour
    plots.
    Read more in the :ref:`User Guide <partial_dependence>`.
    Parameters
    ----------
    gbrt : BaseGradientBoosting
        A fitted gradient boosting model.
    X : array-like, shape=(n_samples, n_features)
        The data on which ``gbrt`` was trained.
    features : seq of tuples or ints
        If seq[i] is an int or a tuple with one int value, a one-way
        PDP is created; if seq[i] is a tuple of two ints, a two-way
        PDP is created.
    feature_names : seq of str
        Name of each feature; feature_names[i] holds
        the name of the feature with index i.
    label : object
        The class label for which the PDPs should be computed.
        Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``.
    n_cols : int
        The number of columns in the grid plot (default: 3).
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the PDP axes.
    grid_resolution : int, default=100
        The number of equally spaced points on the axes.
    n_jobs : int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to 1.
    verbose : int
        Verbose output during PD computations. Defaults to 0.
    ax : Matplotlib axis object, default None
        An axis object onto which the plots will be drawn.
    line_kw : dict
        Dict with keywords passed to the ``pylab.plot`` call.
        For one-way partial dependence plots.
    contour_kw : dict
        Dict with keywords passed to the ``pylab.contourf`` call.
        For two-way partial dependence plots.
    fig_kw : dict
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.
    Returns
    -------
    fig : figure
        The Matplotlib Figure object.
    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.
    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # if not isinstance(gbrt, BaseGradientBoosting):
    #     raise ValueError('gbrt has to be an instance of BaseGradientBoosting')
    if gbrt.estimators_.shape[0] == 0:
        raise ValueError('Call %s.fit before partial_dependence' %
                         gbrt.__class__.__name__)

    # set label_idx for multi-class GBRT
    if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2:
        if label is None:
            raise ValueError('label is not given for multi-class PDP')
        label_idx = np.searchsorted(gbrt.classes_, label)
        if gbrt.classes_[label_idx] != label:
            raise ValueError('label %s not in ``gbrt.classes_``' % str(label))
    else:
        # regression and binary classification
        label_idx = 0

    X = check_array(X, dtype=DTYPE, order='C')
    if gbrt.n_features != X.shape[1]:
        raise ValueError('X.shape[1] does not match gbrt.n_features')

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    # convert feature_names to list
    if feature_names is None:
        # if not feature_names use fx indices as name
        feature_names = [str(i) for i in range(gbrt.n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return fx

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        if isinstance(fxs, (numbers.Integral, ) + six.string_types):
            fxs = (fxs, )
        try:
            fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32)
        except TypeError:
            raise ValueError('features must be either int, str, or tuple '
                             'of int/str')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('target features must be either one or two')

        tmp_features.append(fxs)

    features = tmp_features

    names = []
    try:
        for fxs in features:
            l = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                l.append(feature_names[i])
            names.append(l)
    except IndexError:
        raise ValueError('features[i] must be in [0, n_features) '
                         'but was %d' % i)

    # compute PD functions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(partial_dependence)(gbrt, fxs, X=X,
                                    grid_resolution=grid_resolution,
                                    percentiles=percentiles)
        for fxs in features)

    # get global min and max values of PD grouped by plot type
    pdp_lim = {}
    for pdp, axes in pd_result:
        min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max()
        n_fx = len(axes)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        fig = ax.get_figure()
        fig.clear()

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, fx, name, (pdp, axes) in zip(count(), features, names, pd_result):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(axes) == 1:
            ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(axes) == 2
            XX, YY = np.meshgrid(axes[0], axes[1])
            Z = pdp[label_idx].reshape(list(map(np.size, axes))).T
            CS = ax.contour(XX,
                            YY,
                            Z,
                            levels=Z_level,
                            linewidths=0.5,
                            colors='k')
            ax.contourf(XX,
                        YY,
                        Z,
                        levels=Z_level,
                        vmax=Z_level[-1],
                        vmin=Z_level[0],
                        alpha=0.75,
                        **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(axes) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(
                ax.transAxes, ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(axes) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15,
                        top=0.7,
                        left=0.1,
                        right=0.95,
                        wspace=0.4,
                        hspace=0.3)
    return fig, axs
Example #32
    def calc_lambdas_deltas(self, qid, targets, preds):
        """Returns the first and second (psuedo-)derivatives.

        Lambdas is the negative gradient of the loss with respect
        to the prediction.  Deltas is the derivative of that.

        Parameters
        ----------
        qid : object
            See `evaluate`.
        targets : array_like of shape = [n_targets]
            See `evaluate`.
        preds : array_like of shape = [n_targets]
            List of predicted scores corresponding to the targets.

        Returns
        -------
        lambdas : array_like of shape = [n_targets]
        deltas : array_like of shape = [n_targets]

        """
        ns = targets.shape[0]
        positions = get_sorted_y_positions(targets, preds, check=False)
        actual = targets[positions]

        swap_deltas = self.calc_swap_deltas(qid, actual)
        max_k = self.max_k()
        if max_k is None or ns < max_k:
            max_k = ns

        lambdas = np.zeros(ns)
        deltas = np.zeros(ns)

        for i in range(max_k):
            for j in range(i + 1, ns):
                if actual[i] == actual[j]:
                    continue

                delta_metric = swap_deltas[i, j]
                if delta_metric == 0.0:
                    continue

                a, b = positions[i], positions[j]
                # invariant: preds[a] >= preds[b]

                if actual[i] < actual[j]:
                    assert delta_metric > 0.0
                    logistic = scipy.special.expit(preds[a] - preds[b])
                    l = logistic * delta_metric
                    lambdas[a] -= l
                    lambdas[b] += l
                else:
                    assert delta_metric < 0.0
                    logistic = scipy.special.expit(preds[b] - preds[a])
                    l = logistic * -delta_metric
                    lambdas[a] += l
                    lambdas[b] -= l

                hess = (1 - logistic) * l
                deltas[a] += hess
                deltas[b] += hess

        return lambdas, deltas
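The pairwise update is the LambdaRank recipe: for each mis-ordered pair, the lambda magnitude is sigmoid(score gap) times |delta metric|, and the accumulated second-order term works out to sigma * (1 - sigma) * |delta metric|. A two-document sketch of one pair's contribution, with the swap delta taken as an assumed input:

import scipy.special

# doc a sits above doc b (pred_a >= pred_b) but b has the better label,
# so swapping them would improve the metric by delta_metric > 0
pred_a, pred_b = 0.9, 0.2
delta_metric = 0.25                                 # assumed swap gain

logistic = scipy.special.expit(pred_a - pred_b)     # ~0.668
l = logistic * delta_metric                         # lambda magnitude, ~0.167
hess = (1 - logistic) * l                           # second-order term, ~0.055
print(-l, l, hess)    # a is pushed down, b is pushed up, both share the hessian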
Example #35
    def _fit_stages(self, X, y, qids, y_pred, random_state,
                    begin_at_stage=0, monitor=None):
        n_samples = X.shape[0]
        do_subsample = self.subsample < 1.0
        sample_weight = np.ones(n_samples, dtype=np.float64)

        n_queries = check_qids(qids)
        query_groups = np.array([(qid, a, b, np.arange(a, b))
                                 for qid, a, b in get_groups(qids)],
                                dtype=object)
        assert n_queries == len(query_groups)
        do_query_oob = self.query_subsample < 1.0
        query_mask = np.ones(n_queries, dtype=bool)
        query_idx = np.arange(n_queries)
        q_inbag = max(1, int(self.query_subsample * n_queries))

        if self.verbose:
            verbose_reporter = _VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage, self.n_metrics,
                                  monitor is not None)

        for i in range(begin_at_stage, self.n_estimators):
            if do_query_oob:
                random_state.shuffle(query_idx)
                query_mask = np.zeros(n_queries, dtype=bool)
                query_mask[query_idx[:q_inbag]] = 1

            query_groups_to_use = query_groups[query_mask]
            sample_mask = np.zeros(n_samples, dtype=bool)
            for qid, a, b, sidx in query_groups_to_use:
                sidx_to_use = sidx
                if do_subsample:
                    query_samples_inbag = max(
                        1, int(self.subsample * (b - a)))
                    random_state.shuffle(sidx)
                    sidx_to_use = sidx[:query_samples_inbag]
                sample_mask[sidx_to_use] = 1

            if do_query_oob:
                old_oob_total_score = np.zeros(self.n_metrics)
                for midx, metric in enumerate(self.metrics):
                    if metric.is_ltr_metric:
                        for qid, a, b, _ in query_groups[~query_mask]:
                            old_oob_total_score[midx] += metric.evaluate_preds(
                                qid, y[a:b], y_pred[a:b])
                    else:
                        old_oob_total_score[midx] = metric.evaluate_preds(
                                None, y[~sample_mask], y_pred[~sample_mask])

            y_pred = self._fit_stage(i, X, y, qids, y_pred, sample_weight,
                                     sample_mask, query_groups_to_use,
                                     random_state)

            for midx, metric in enumerate(self.metrics):
                train_total_score, oob_total_score = 0.0, 0.0
                if metric.is_ltr_metric:
                    for qidx, (qid, a, b, _) in enumerate(query_groups):
                        score = metric.evaluate_preds(
                            qid, y[a:b], y_pred[a:b])
                        if query_mask[qidx]:
                            train_total_score += score
                        else:
                            oob_total_score += score
                else:
                    train_total_score = metric.evaluate_preds(
                        None, y[sample_mask], y_pred[sample_mask])
                    oob_total_score = metric.evaluate_preds(
                        None, y[~sample_mask], y_pred[~sample_mask])

                train_normalizer = q_inbag if metric.is_ltr_metric else 1.0
                oob_normalizer = n_queries - q_inbag if metric.is_ltr_metric else 1.0
                self.train_score_[i, midx] = train_total_score / train_normalizer
                if do_query_oob:
                    if q_inbag < n_queries:
                        self.oob_improvement_[i, midx] = \
                            (oob_total_score - old_oob_total_score[midx]) / oob_normalizer

            early_stop = False
            monitor_output = None
            if monitor is not None:
                monitor_output = monitor(i, self, locals())
                if monitor_output is True:
                    early_stop = True

            if self.verbose > 0:
                verbose_reporter.update(i, self, monitor_output)

            if early_stop:
                break

        return i + 1
Example #36
def _exact_partial_dependence(est, target_variables, grid, X, output=None):
    """Calculate the partial dependence of ``target_variables``.
    The partial dependence is computed by calling the ``predict_proba`` method
    of ``est`` for classification, or ``predict`` for regression, on ``X`` for
    every point in the grid.
    Parameters
    ----------
    est : BaseEstimator
        A fitted classification or regression model.
    target_variables : array-like, dtype=int
        The target features for which the partial dependency should be
        computed (size should be smaller than 3 for visual renderings).
    grid : array-like, shape=(n_points, len(target_variables))
        The grid of ``target_variables`` values for which the
        partial dependency should be evaluated (either ``grid`` or ``X``
        must be specified).
    X : array-like, shape=(n_samples, n_features)
        The data on which ``est`` was trained.
    output : int, optional (default=None)
        The output index to use for multi-output estimators.
    Returns
    -------
    pdp : array, shape=(n_classes, n_points)
        The partial dependence function evaluated on the ``grid``.
        For regression and binary classification ``n_classes==1``.
    """
    n_samples = X.shape[0]
    pdp = []
    for row in range(grid.shape[0]):
        X_eval = X.copy()
        for i, variable in enumerate(target_variables):
            X_eval[:, variable] = np.repeat(grid[row, i], n_samples)
        if est._estimator_type == 'regressor':
            try:
                pdp_row = est.predict(X_eval)
            except Exception:
                raise ValueError('Call %s.fit before partial_dependence' %
                                 est.__class__.__name__)
            if pdp_row.ndim != 1 and pdp_row.shape[1] != 1:
                # Multi-output
                if output is None or not 0 <= output < pdp_row.shape[1]:
                    raise ValueError('Valid output must be specified for '
                                     'multi-output models.')
                pdp_row = pdp_row[:, output]
            pdp.append(np.mean(pdp_row))
        elif est._estimator_type == 'classifier':
            try:
                pdp_row = est.predict_proba(X_eval)
            except Exception:
                raise ValueError('Call %s.fit before partial_dependence' %
                                 est.__class__.__name__)
            if isinstance(pdp_row, list):
                # Multi-output
                if output is None or not 0 <= output < len(pdp_row):
                    raise ValueError('Valid output must be specified for '
                                     'multi-output models.')
                pdp_row = pdp_row[output]
            pdp_row = np.log(np.clip(pdp_row, 1e-16, 1))
            pdp_row = np.subtract(pdp_row, np.mean(pdp_row, 1)[:, np.newaxis])
            pdp.append(np.mean(pdp_row, 0))
        else:
            raise ValueError('est must be a fitted regressor or classifier '
                             'model.')
    pdp = np.array(pdp).transpose()
    if pdp.shape[0] == 2:
        # Binary classification
        pdp = pdp[1, :][np.newaxis]
    elif len(pdp.shape) == 1:
        # Regression
        pdp = pdp[np.newaxis]
    return pdp
Example #37
def partial_dependence(model,
                       target_variables,
                       grid=None,
                       X=None,
                       percentiles=(0.05, 0.95),
                       grid_resolution=100):
    """Partial dependence of ``target_variables``.

    Partial dependence plots show the dependence between the joint values
    of the ``target_variables`` and the function represented
    by the ``model``.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    model : BaseBoosting
        A fitted boosting model.
    target_variables : array-like, dtype=int
        The target features for which the partial dependency should be
        computed (size should be smaller than 3 for visual renderings).
    grid : array-like, shape=(n_points, len(target_variables))
        The grid of ``target_variables`` values for which the
        partial dependency should be evaluated (either ``grid`` or ``X``
        must be specified).
    X : array-like, shape=(n_samples, n_features)
        The data on which ``model`` was trained. It is used to generate
        a ``grid`` for the ``target_variables``. The ``grid`` comprises
        ``grid_resolution`` equally spaced points between the two
        ``percentiles``.
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the ``grid``. Only if ``X`` is not None.
    grid_resolution : int, default=100
        The number of equally spaced points on the ``grid``.

    Returns
    -------
    pdp : array, shape=(n_classes, n_points)
        The partial dependence function evaluated on the ``grid``.
        For regression and binary classification ``n_classes==1``.
    axes : seq of ndarray or None
        The axes with which the grid has been created or None if
        the grid has been given.

    Examples
    --------
    >>> from KTBoost.partial_dependence import partial_dependence
    >>> import matplotlib.pyplot as plt
    >>> 
    >>> Xtrain=np.random.rand(1000,10)
    >>> ytrain=2*Xtrain[:,0]+2*Xtrain[:,1]+np.random.rand(1000)
    >>> model = KTBoost.BoostingRegressor()
    >>> model.fit(Xtrain,ytrain)
    >>> 
    >>> kwargs = dict(X=Xtrain, percentiles=(0, 1))
    >>> partial_dependence(model,[0],**kwargs)
    """
    if not isinstance(model, BaseBoosting):
        raise ValueError('model has to be an instance of BaseBoosting')
    if not model.base_learner == "tree":
        raise ValueError("Partial dependencies are only "
                         "defined for trees as base "
                         "learners. Use option 'base_learner=\"tree\"'.")
    check_is_fitted(model, 'estimators_')
    if (grid is None and X is None) or (grid is not None and X is not None):
        raise ValueError('Either grid or X must be specified')

    target_variables = np.asarray(target_variables, dtype=np.int32,
                                  order='C').ravel()

    if any([not (0 <= fx < model.n_features_) for fx in target_variables]):
        raise ValueError('target_variables must be in [0, %d]' %
                         (model.n_features_ - 1))

    if X is not None:
        X = check_array(X, dtype=DTYPE, order='C')
        grid, axes = _grid_from_X(X[:, target_variables], percentiles,
                                  grid_resolution)
    else:
        assert grid is not None
        # don't return axes if grid is given
        axes = None
        # grid must be 2d
        if grid.ndim == 1:
            grid = grid[:, np.newaxis]
        if grid.ndim != 2:
            raise ValueError('grid must be 2d but is %dd' % grid.ndim)

    grid = np.asarray(grid, dtype=DTYPE, order='C')
    assert grid.shape[1] == target_variables.shape[0]

    n_trees_per_stage = model.estimators_.shape[1]
    n_estimators = model.estimators_.shape[0]
    pdp = np.zeros((n_trees_per_stage, grid.shape[0]),
                   dtype=np.float64, order='C')
    for stage in range(n_estimators):
        for k in range(n_trees_per_stage):
            tree = model.estimators_[stage, k].tree_
            _partial_dependence_tree(tree, grid, target_variables,
                                     model.learning_rate, pdp[k])

    return pdp, axes
示例#38
0
    def _fit_stages(self,
                    X,
                    y,
                    qids,
                    y_pred,
                    random_state,
                    begin_at_stage=0,
                    monitor=None):
        n_samples = X.shape[0]
        do_subsample = self.subsample < 1.0
        sample_weight = np.ones(n_samples, dtype=np.float64)

        n_queries = check_qids(qids)
        query_groups = np.array([(qid, a, b, np.arange(a, b))
                                 for qid, a, b in get_groups(qids)],
                                dtype=np.object)
        assert n_queries == len(query_groups)
        do_query_oob = self.query_subsample < 1.0
        query_mask = np.ones(n_queries, dtype=np.bool)
        query_idx = np.arange(n_queries)
        q_inbag = max(1, int(self.query_subsample * n_queries))

        if self.verbose:
            verbose_reporter = _VerboseReporter(self.verbose)
            verbose_reporter.init(self, begin_at_stage, self.n_metrics, monitor
                                  is not None)

        for i in range(begin_at_stage, self.n_estimators):
            if do_query_oob:
                random_state.shuffle(query_idx)
                query_mask = np.zeros(n_queries, dtype=np.bool)
                query_mask[query_idx[:q_inbag]] = 1

            query_groups_to_use = query_groups[query_mask]
            sample_mask = np.zeros(n_samples, dtype=np.bool)
            for qid, a, b, sidx in query_groups_to_use:
                sidx_to_use = sidx
                if do_subsample:
                    query_samples_inbag = max(1, int(self.subsample * (b - 1)))
                    random_state.shuffle(sidx)
                    sidx_to_use = sidx[:query_samples_inbag]
                sample_mask[sidx_to_use] = 1

            if do_query_oob:
                old_oob_total_score = np.zeros(self.n_metrics)
                for midx, metric in enumerate(self.metrics):
                    if metric.is_ltr_metric:
                        for qid, a, b, _ in query_groups[~query_mask]:
                            old_oob_total_score[midx] += metric.evaluate_preds(
                                qid, y[a:b], y_pred[a:b])
                    else:
                        old_oob_total_score[midx] = metric.evaluate_preds(
                            None, y[~sample_mask], y_pred[~sample_mask])

            y_pred = self._fit_stage(i, X, y, qids, y_pred, sample_weight,
                                     sample_mask, query_groups_to_use,
                                     random_state)

            for midx, metric in enumerate(self.metrics):
                train_total_score, oob_total_score = 0.0, 0.0
                if metric.is_ltr_metric:
                    for qidx, (qid, a, b, _) in enumerate(query_groups):
                        score = metric.evaluate_preds(qid, y[a:b], y_pred[a:b])
                        if query_mask[qidx]:
                            train_total_score += score
                        else:
                            oob_total_score += score
                else:
                    train_total_score = metric.evaluate_preds(
                        None, y[sample_mask], y_pred[sample_mask])
                    oob_total_score = metric.evaluate_preds(
                        None, y[~sample_mask], y_pred[~sample_mask])

                train_normalizer = q_inbag if metric.is_ltr_metric else 1.0
                oob_normalizer = ((n_queries - q_inbag)
                                  if metric.is_ltr_metric else 1.0)
                self.train_score_[i, midx] = (
                    train_total_score / train_normalizer)
                if do_query_oob:
                    if q_inbag < n_queries:
                        self.oob_improvement_[i, midx] = \
                            (oob_total_score - old_oob_total_score[midx]) / oob_normalizer

            early_stop = False
            monitor_output = None
            if monitor is not None:
                monitor_output = monitor(i, self, locals())
                if monitor_output is True:
                    early_stop = True

            if self.verbose > 0:
                verbose_reporter.update(i, self, monitor_output)

            if early_stop:
                break

        return i + 1
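The loop above relies on get_groups(qids) yielding one (qid, start, end) triple per contiguous run of equal query ids, with check_qids returning the number of such runs. A minimal sketch of that contract (the name get_groups_sketch is hypothetical; it assumes the samples of each query are stored contiguously):

import numpy as np

def get_groups_sketch(qids):
    # Yield (qid, start, end) for each contiguous run of identical qids.
    qids = np.asarray(qids)
    start = 0
    for i in range(1, len(qids) + 1):
        if i == len(qids) or qids[i] != qids[start]:
            yield qids[start], start, i
            start = i

# list(get_groups_sketch([1, 1, 2, 2, 2])) -> [(1, 0, 2), (2, 2, 5)]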
Example #39
0
File: dcg.py Project: cpjha13/pyltr
    def calc_random_ev(self, qid, targets):
        total_gains = sum(self._gain_fn(t) for t in targets)
        total_discounts = sum(self._get_discount(i)
                              for i in range(min(self.k, len(targets))))
        return total_gains * total_discounts / len(targets)
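Under a uniformly random permutation every document is equally likely to occupy every position, so the expected metric factors into the mean gain times the summed discounts of the top k positions, which is exactly the closed form above. A small numeric check, assuming the usual DCG gain 2**t - 1 and discount 1 / log2(i + 2):

import itertools
import math

targets = [2, 1, 0]
k = 3
gain = lambda t: 2 ** t - 1
discount = lambda i: 1.0 / math.log2(i + 2)

# closed form, as in calc_random_ev
ev = (sum(gain(t) for t in targets) *
      sum(discount(i) for i in range(k)) / len(targets))

# brute force: average DCG over all orderings
brute = sum(sum(discount(i) * gain(t) for i, t in enumerate(perm))
            for perm in itertools.permutations(targets))
brute /= math.factorial(len(targets))

assert abs(ev - brute) < 1e-12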
Example #40
0
    def calc_lambdas_deltas(self, qid, targets, preds):
        """Returns the first and second (psuedo-)derivatives.

        Lambdas is the negative gradient of the loss with respect
        to the prediction.  Deltas is the derivative of that.

        Parameters
        ----------
        qid : object
            See `evaluate`.
        targets : array_like of shape = [n_targets]
            See `evaluate`.
        preds : array_like of shape = [n_targets]
            List of predicted scores corresponding to the targets.

        Returns
        -------
        lambdas : array_like of shape = [n_targets]
        deltas : array_like of shape = [n_targets]

        """
        ns = targets.shape[0]
        positions = get_sorted_y_positions(targets, preds, check=False)
        actual = targets[positions]

        swap_deltas = self.calc_swap_deltas(qid, actual)
        max_k = self.max_k()
        if max_k is None or ns < max_k:
            max_k = ns

        lambdas = np.zeros(ns)
        deltas = np.zeros(ns)

        for i in range(max_k):
            for j in range(i + 1, ns):
                if actual[i] == actual[j]:
                    continue

                delta_metric = swap_deltas[i, j]
                if delta_metric == 0.0:
                    continue

                a, b = positions[i], positions[j]
                # invariant: preds[a] >= preds[b]

                if actual[i] < actual[j]:
                    assert delta_metric > 0.0
                    logistic = scipy.special.expit(preds[a] - preds[b])
                    l = logistic * delta_metric
                    lambdas[a] -= l
                    lambdas[b] += l
                else:
                    assert delta_metric < 0.0
                    logistic = scipy.special.expit(preds[b] - preds[a])
                    l = logistic * -delta_metric
                    lambdas[a] += l
                    lambdas[b] -= l

                hess = (1 - logistic) * l
                deltas[a] += hess
                deltas[b] += hess

        return lambdas, deltas
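To see one pairwise update in isolation: when the higher-scored of two documents is the less relevant one, the lambda transferred between them is expit(score difference) times the metric change from swapping them, and both receive the same logistic * (1 - logistic)-style second-order term. A framework-free toy with hand-picked numbers (the delta_metric value is made up for illustration):

import numpy as np
from scipy.special import expit

preds = np.array([1.5, 0.2])  # doc 0 outscores doc 1 ...
# ... but doc 1 is the more relevant one, so positions = [0, 1],
# a = 0, b = 1, and we are in the actual[i] < actual[j] branch.
delta_metric = 0.35  # made-up metric gain from swapping the pair

logistic = expit(preds[0] - preds[1])
l = logistic * delta_metric
lambdas = np.array([-l, +l])             # push doc 0 down, doc 1 up
deltas = np.full(2, (1 - logistic) * l)  # shared hessian-like term

print(lambdas)  # approx [-0.275, +0.275]
print(deltas)   # approx [0.0589, 0.0589]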
Example #41
0
def plot_partial_dependence(est, X, features, feature_names=None,
                            target=None, n_cols=3, grid_resolution=100,
                            percentiles=(0.05, 0.95), method='auto',
                            n_jobs=1, verbose=0, ax=None, line_kw=None,
                            contour_kw=None, **fig_kw):
    """Partial dependence plots.
    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour plots.
    Read more in the :ref:`User Guide <partial_dependence>`.
    Parameters
    ----------
    est : BaseEstimator
        A fitted classification or regression model. Classifiers must have a
        ``predict_proba()`` method. Multioutput-multiclass estimators aren't
        supported.
    X : array-like, shape=(n_samples, n_features)
        The data to use to build the grid of values on which the dependence
        will be evaluated. This is usually the training data.
    features : list of ints or strings, or tuples of ints or strings
        The target features for which to create the PDPs.
        If features[i] is an int or a string, a one-way PDP is created; if
        features[i] is a tuple, a two-way PDP is created. Each tuple must be
        of size 2.
        If any entry is a string, it must be in ``feature_names``.
    feature_names : seq of str, shape=(n_features,)
        Name of each feature; feature_names[i] holds the name of the feature
        with index i.
    target : int, optional (default=None)
        - In a multiclass setting, specifies the class for which the PDPs
          should be computed. Note that for binary classification, the
          positive class (index 1) is always used.
        - In a multioutput setting, specifies the task for which the PDPs
          should be computed.
        Ignored in binary classification or classical regression settings.
    n_cols : int, optional (default=3)
        The number of columns in the grid plot.
    grid_resolution : int, optional (default=100)
        The number of equally spaced points on the axes of the plots, for each
        target feature.
    percentiles : tuple of float, optional (default=(0.05, 0.95))
        The lower and upper percentile used to create the extreme values
        for the PDP axes.
    method : str, optional (default='auto')
        The method to use to calculate the partial dependence predictions:
        - 'recursion' is only supported for objects inheriting from
          `BaseGradientBoosting`, but is more efficient in terms of speed.
        - 'brute' is supported for any estimator, but is more
          computationally intensive.
        - If 'auto', then 'recursion' will be used for
          ``BaseGradientBoosting`` estimators, and 'brute' used for other
          estimators.
        Unlike the 'brute' method, 'recursion' does not account for the
        ``init`` predictor of the boosting process. In practice this still
        produces the same plots, up to a constant offset in the target
        response.
    n_jobs : int, optional (default=1)
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        See :term:`Glossary <n_jobs>` for more details.
    verbose : int, optional (default=0)
        Verbose output during PD computations.
    ax : Matplotlib axis object, optional (default=None)
        An axis object onto which the plots will be drawn.
    line_kw : dict, optional
        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
        For one-way partial dependence plots.
    contour_kw : dict, optional
        Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.
        For two-way partial dependence plots.
    **fig_kw : dict, optional
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.
    Returns
    -------
    fig : figure
        The Matplotlib Figure object.
    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.
    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # set target_idx for multi-class estimators
    if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
        if target is None:
            raise ValueError('target must be specified for multi-class')
        target_idx = np.searchsorted(est.classes_, target)
        if (not (0 <= target_idx < len(est.classes_)) or
                est.classes_[target_idx] != target):
            raise ValueError('target not in est.classes_, got {}'.format(
                target))
    else:
        # regression and binary classification
        target_idx = 0

    X = check_array(X)
    n_features = X.shape[1]

    # convert feature_names to list
    if feature_names is None:
        # if feature_names is None, use feature indices as name
        feature_names = [str(i) for i in range(n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return int(fx)

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        if isinstance(fxs, (numbers.Integral, six.string_types)):
            fxs = (fxs,)
        try:
            fxs = [convert_feature(fx) for fx in fxs]
        except TypeError:
            raise ValueError('Each entry in features must be either an int, '
                             'a string, or an iterable of size at most 2.')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('Each entry in features must be either an int, '
                             'a string, or an iterable of size at most 2.')

        tmp_features.append(fxs)

    features = tmp_features

    names = []
    try:
        for fxs in features:
            names_ = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                names_.append(feature_names[i])
            names.append(names_)
    except IndexError:
        raise ValueError('All entries of features must be less than '
                         'len(feature_names) = {0}, got {1}.'
                         .format(len(feature_names), i))

    # compute averaged predictions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(partial_dependence)(est, fxs, X=X, method=method,
                                    grid_resolution=grid_resolution,
                                    percentiles=percentiles)
        for fxs in features)

    # For multioutput regression, we can only check the validity of target
    # now that we have the predictions.
    # Also note: as multiclass-multioutput classifiers are not supported,
    # multiclass and multioutput scenario are mutually exclusive. So there is
    # no risk of overwriting target_idx here.
    pd, _ = pd_result[0]  # checking the first result is enough
    if is_regressor(est) and pd.shape[0] > 1:
        if target is None:
            raise ValueError(
                'target must be specified for multi-output regressors')
        if not 0 <= target < pd.shape[0]:
            raise ValueError(
                'target must be in [0, n_tasks), got {}.'.format(target))
        target_idx = target
    else:
        target_idx = 0

    # get global min and max values of PD grouped by plot type
    pdp_lim = {}
    for pd, values in pd_result:
        min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max()
        n_fx = len(values)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        fig = ax.get_figure()
        fig.clear()

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, fx, name, (pd, values) in zip(count(), features, names, pd_result):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(values) == 1:
            ax.plot(values[0], pd[target_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(values) == 2
            XX, YY = np.meshgrid(values[0], values[1])
            Z = pd[target_idx].reshape(list(map(np.size, values))).T
            CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                            colors='k')
            ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
                        vmin=Z_level[0], alpha=0.75, **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(values) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(ax.transAxes,
                                                         ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(values) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4,
                        hspace=0.3)
    return fig, axs
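A hypothetical call that mixes one-way and two-way plots and customizes both; everything below is illustrative and assumes the module defining plot_partial_dependence above is importable:

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(random_state=0)
clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
fig, axs = plot_partial_dependence(
    clf, X, [0, 1, (0, 1)],           # two one-way PDPs, one two-way PDP
    grid_resolution=50,
    line_kw={'color': 'navy'},        # forwarded to plt.plot (one-way)
    contour_kw={'cmap': 'viridis'},   # forwarded to contourf (two-way)
    figsize=(9, 3))                   # swallowed by **fig_kw -> plt.figure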
Example #42
0
    def fit(self, X, y, sample_weight=None):
        """Build a boosted classifier/regressor from the training set (X, y).
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
            forced to DTYPE from tree._tree if the base classifier of this
            ensemble weighted boosting classifier is a tree or forest.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.
        Returns
        -------
        self : object
            Returns self.
        """
        # Check parameters
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        if (self.base_estimator is None or
                isinstance(self.base_estimator, (BaseDecisionTree,
                                                 BaseForest))):
            dtype = DTYPE
            accept_sparse = 'csc'
        else:
            dtype = None
            accept_sparse = ['csr', 'csc']

        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                         y_numeric=is_regressor(self))

        if sample_weight is None:
            # Initialize weights to 1 / n_samples
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            # Normalize existing weights
            sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

            # Check that the sample weights sum is positive
            if sample_weight.sum() <= 0:
                raise ValueError(
                    "Attempting to fit with a non-positive "
                    "weighted number of samples.")

        # Check parameters
        self._validate_estimator()

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        random_state = check_random_state(self.random_state)

        for iboost in range(self.n_estimators):
            # Boosting step
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost,
                X, y,
                sample_weight,
                random_state)

            # Early termination
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize
                sample_weight /= sample_weight_sum

        return self
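_boost itself is supplied by the concrete subclass. For orientation, one round of the discrete SAMME update, which matches the (sample_weight, estimator_weight, estimator_error) contract used above, might look like the following sketch (not this class's actual _boost):

import numpy as np

def samme_round_sketch(estimator, X, y, sample_weight,
                       learning_rate, n_classes):
    # One discrete AdaBoost (SAMME) round; illustrative only.
    estimator.fit(X, y, sample_weight=sample_weight)
    incorrect = estimator.predict(X) != y
    estimator_error = np.average(incorrect, weights=sample_weight)
    if estimator_error <= 0:
        return sample_weight, 1.0, 0.0   # perfect fit; caller stops early
    if estimator_error >= 1.0 - 1.0 / n_classes:
        return None, None, None          # no better than chance; abort
    estimator_weight = learning_rate * (
        np.log((1.0 - estimator_error) / estimator_error) +
        np.log(n_classes - 1.0))
    # Up-weight the misclassified samples for the next round.
    sample_weight = sample_weight * np.exp(estimator_weight * incorrect)
    return sample_weight, estimator_weight, estimator_error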
Example #43
0
def partial_dependence(est,
                       target_variables,
                       grid=None,
                       X=None,
                       output=None,
                       percentiles=(0.05, 0.95),
                       grid_resolution=100,
                       method=None):
    """Partial dependence of ``target_variables``.
    Partial dependence plots show the dependence between the joint values
    of the ``target_variables`` and the function represented
    by the ``est``.
    Read more in the :ref:`User Guide <partial_dependence>`.
    Parameters
    ----------
    est : BaseEstimator
        A fitted classification or regression model.
    target_variables : array-like, dtype=int
        The target features for which the partial dependency should be
        computed (size should be smaller than 3 for visual renderings).
    grid : array-like, shape=(n_points, len(target_variables))
        The grid of ``target_variables`` values for which the
        partial dependency should be evaluated (either ``grid`` or ``X``
        must be specified).
    X : array-like, shape=(n_samples, n_features)
        The data on which ``est`` was trained. It is used to generate
        a ``grid`` for the ``target_variables``. The ``grid`` comprises
        ``grid_resolution`` equally spaced points between the two
        ``percentiles``.
    output : int, optional (default=None)
        The output index to use for multi-output estimators.
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentiles used to create the extreme values
        for the ``grid``. Only used if ``X`` is not None.
    grid_resolution : int, default=100
        The number of equally spaced points on the ``grid``.
    method : {'recursion', 'exact', 'estimated', None}, optional (default=None)
        The method to use to calculate the partial dependence function:
        - If 'recursion', the underlying trees of ``est`` will be recursed to
          calculate the function. Only supported for BaseGradientBoosting and
          ForestRegressor.
        - If 'exact', the function will be calculated by calling the
          ``predict_proba`` method of ``est`` for classification or ``predict``
          for regression on ``X`` for every point in the grid. To speed up this
          method, you can use a subset of ``X`` or a coarser grid.
        - If 'estimated', the function will be calculated by calling the
          ``predict_proba`` method of ``est`` for classification or ``predict``
          for regression on the mean of ``X``.
        - If None, then 'recursion' will be used if ``est`` is
          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
          estimators.
    Returns
    -------
    pdp : array, shape=(n_classes, n_points)
        The partial dependence function evaluated on the ``grid``.
        For regression and binary classification ``n_classes==1``.
    axes : seq of ndarray or None
        The axes with which the grid has been created or None if
        the grid has been given.
    Examples
    --------
    >>> samples = [[0, 0, 2], [1, 0, 0]]
    >>> labels = [0, 1]
    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels)
    >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2)
    >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP
    (array([[-4.52...,  4.52...]]), [array([ 0.,  1.])])
    """
    if method is None:
        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
            method = 'recursion'
        else:
            method = 'exact'
    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor))
            and method == 'recursion'):
        raise ValueError('est has to be an instance of BaseGradientBoosting or'
                         ' ForestRegressor for the "recursion" method. Try '
                         'using method="exact" or "estimated".')
    if (not hasattr(est, '_estimator_type')
            or est._estimator_type not in ('classifier', 'regressor')):
        raise ValueError('est must be a fitted regressor or classifier model.')
    # if method != 'recursion' and est._estimator_type == 'classifier':
    #     raise ValueError('est requires a predict_proba method for '
    #                      'method="exact" or "estimated" for classification.')
    if method == 'recursion':
        if len(est.estimators_) == 0:
            raise ValueError('Call %s.fit before partial_dependence' %
                             est.__class__.__name__)
        if isinstance(est, BaseGradientBoosting):
            n_features = est.n_features
        else:
            n_features = est.n_features_
    elif X is None:
        raise ValueError('X is required for method="exact" or "estimated".')
    else:
        n_features = X.shape[1]
    if (grid is None and X is None) or (grid is not None and X is not None):
        raise ValueError('Either grid or X must be specified')

    target_variables = np.asarray(target_variables, dtype=np.int32,
                                  order='C').ravel()

    if any([not (0 <= fx < n_features) for fx in target_variables]):
        raise ValueError('target_variables must be in [0, %d]' %
                         (n_features - 1))

    if X is not None:
        X = check_array(X, dtype=DTYPE, order='C')
        grid, axes = _grid_from_X(X[:, target_variables], percentiles,
                                  grid_resolution)
    else:
        assert grid is not None
        # don't return axes if grid is given
        axes = None
        # grid must be 2d
        if grid.ndim == 1:
            grid = grid[:, np.newaxis]
        if grid.ndim != 2:
            raise ValueError('grid must be 2d but is %dd' % grid.ndim)

    grid = np.asarray(grid, dtype=DTYPE, order='C')
    assert grid.shape[1] == target_variables.shape[0]

    if method == 'recursion':
        if isinstance(est, BaseGradientBoosting):
            n_trees_per_stage = est.estimators_.shape[1]
            n_estimators = est.estimators_.shape[0]
            learning_rate = est.learning_rate
        else:
            n_trees_per_stage = 1
            n_estimators = len(est.estimators_)
            learning_rate = 1.
        pdp = np.zeros((n_trees_per_stage, grid.shape[0]),
                       dtype=np.float64, order='C')
        for stage in range(n_estimators):
            for k in range(n_trees_per_stage):
                if isinstance(est, BaseGradientBoosting):
                    tree = est.estimators_[stage, k].tree_
                else:
                    tree = est.estimators_[stage].tree_
                _partial_dependence_tree(tree, grid, target_variables,
                                         learning_rate, pdp[k])
        if isinstance(est, ForestRegressor):
            pdp /= n_estimators
    elif method == 'exact':
        pdp = _exact_partial_dependence(est, target_variables, grid, X, output)
    elif method == 'estimated':
        pdp = _estimated_partial_dependence(est, target_variables, grid, X,
                                            output)
    else:
        raise ValueError('method "%s" is invalid. Use "recursion", "exact", '
                         '"estimated", or None.' % method)

    return pdp, axes
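_exact_partial_dependence is not shown in this excerpt; conceptually, the 'exact' method overwrites the target columns of every row of X with each grid point in turn and averages the model's predictions. A regression-only sketch under that reading (the name exact_pd_sketch is hypothetical):

import numpy as np

def exact_pd_sketch(est, target_variables, grid, X):
    # Brute-force partial dependence for a fitted regressor; illustrative.
    pdp = np.zeros(grid.shape[0])
    X_eval = np.array(X, dtype=float, copy=True)
    for i, grid_row in enumerate(grid):
        # pin the target features to this grid point for every sample
        X_eval[:, target_variables] = grid_row
        pdp[i] = np.mean(est.predict(X_eval))
    return pdp.reshape(1, -1)  # shape (1, n_points), the regression case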