Example #1
    def __init__(self, wrt, f, fprime, min_grad=1e-6, args=None):
        """Create a NonlinearConjugateGradient object.

        Parameters
        ----------

        wrt : array_like
            Array of parameters of the problem.

        f : callable
            Objective function.

        fprime : callable
            First derivative of the objective function.

        min_grad : float
            If all components of the gradient fall below this value, stop
            minimization.

        args : iterable, optional
            Iterable of arguments passed on to the objective function and its
            derivatives.
        """
        super(NonlinearConjugateGradient, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime

        self.line_search = WolfeLineSearch(wrt, self.f, self.fprime, c2=0.2)
        self.min_grad = min_grad
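
A minimal usage sketch for this constructor. The quadratic objective and the explicit ``args`` iterator below are our own illustration, assuming the iterator protocol visible in the ``__iter__`` methods further down, which consume ``(args, kwargs)`` pairs and yield ``info`` dicts (the examples use the Python 2 ``.next()`` protocol):

import itertools
import numpy as np

# Hypothetical quadratic objective; the optimizer updates `wrt` in place.
A = np.diag([1.0, 10.0])

def f(x):
    return 0.5 * np.dot(x, np.dot(A, x))

def fprime(x):
    return np.dot(A, x)

wrt = np.ones(2)
opt = NonlinearConjugateGradient(wrt, f, fprime, min_grad=1e-6,
                                 args=itertools.repeat(((), {})))

for info in opt:              # each step yields an info dict (see __iter__ below)
    if info['n_iter'] >= 50:
        break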
Example #2
File: bfgs.py  Project: datahacking/climin
    def __init__(self,
                 wrt,
                 f,
                 fprime,
                 initial_hessian_diag=1,
                 n_factors=10,
                 line_search=None,
                 args=None):
        """
        Create an Lbfgs object.

        Parameters
        ----------
        wrt : array_like
            Current solution to the problem. Can be given as a first argument to \
            ``.f`` and ``.fprime``.

        f : Callable
            The objective function.

        fprime : Callable
            First derivative of the objective function. Returns an array of the \
            same shape as ``.wrt``.

        initial_hessian_diag : array_like
            The initial estimate of the diagonal of the Hessian.

        n_factors : int
            The number of factors that should be used to implicitly represent the inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.

        """

        super(Lbfgs, self).__init__(wrt, args=args)

        self.f = f
        self.fprime = fprime
        self.initial_hessian_diag = initial_hessian_diag
        self.n_factors = n_factors
        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
Example #3
File: cg.py  Project: SFPD/rlreloaded
    def __init__(self, wrt, f, fprime, min_grad=1e-6, args=None):
        """Create a NonlinearConjugateGradient object.

        Parameters
        ----------

        wrt : array_like
            Array of parameters of the problem.

        f : callable
            Objective function.

        fprime : callable
            First derivative of the objective function.

        min_grad : float
            If all components of the gradient fall below this value, stop
            minimization.

        args : iterable, optional
            Iterable of arguments passed on to the objective function and its
            derivatives.
        """
        super(NonlinearConjugateGradient, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime

        self.line_search = WolfeLineSearch(wrt, self.f, self.fprime, c2=0.2)
        self.min_grad = min_grad
Example #4
File: ncg.py  Project: osdf/climin
    def __init__(self, wrt, f, fprime, epsilon=1e-6,
                 args=None, logfunc=None):
        super(NonlinearConjugateGradient, self).__init__(
            wrt, args=args, logfunc=logfunc)
        self.f = f
        self.fprime = fprime

        self.line_search = WolfeLineSearch(wrt, self.f, self.fprime, c2=0.2)
        self.epsilon = epsilon
Example #5
File: bfgs.py  Project: datahacking/climin
    def __init__(self,
                 wrt,
                 f,
                 fprime,
                 initial_inv_hessian=None,
                 line_search=None,
                 args=None):
        """Create a BFGS object.

        Parameters
        ----------

        wrt : array_like
            Array that represents the solution. Will be operated upon in
            place.  ``f`` and ``fprime`` should accept this array as a first argument.

        f : callable
            The objective function.

        fprime : callable
            Callable that, given a solution vector as the first parameter and
            *args and **kwargs drawn from the iterator ``args``, returns a
            search direction, such as a gradient.

        initial_inv_hessian : array_like
            The initial estimate of the approximate inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.
        """
        super(Bfgs, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime
        self.inv_hessian = initial_inv_hessian

        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
Example #6
File: bfgs.py  Project: osdf/climin
    def __init__(self, wrt, f, fprime, initial_inv_hessian=None,
                 line_search=None, args=None, logfunc=None):
        super(Bfgs, self).__init__(wrt, args=args, logfunc=logfunc)
        self.f = f
        self.fprime = fprime
        self.inv_hessian = initial_inv_hessian

        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
Example #7
File: lbfgs.py  Project: osdf/climin
    def __init__(self, wrt, f, fprime, initial_hessian_diag=1,
                 n_factors=10, line_search=None,
                 args=None, logfunc=None):
        super(Lbfgs, self).__init__(wrt, args=args, logfunc=logfunc)

        self.f = f
        self.fprime = fprime
        self.initial_hessian_diag = initial_hessian_diag
        self.n_factors = n_factors
        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
Example #8
File: bfgs.py  Project: SFPD/rlreloaded
    def __init__(self, wrt, f, fprime, initial_hessian_diag=1,
                 n_factors=10, line_search=None,
                 args=None):
        """
        Create an Lbfgs object.

        Parameters
        ----------
        wrt : array_like
            Current solution to the problem. Can be given as a first argument to \
            ``.f`` and ``.fprime``.

        f : Callable
            The objective function.

        fprime : Callable
            First derivative of the objective function. Returns an array of the \
            same shape as ``.wrt``.

        initial_hessian_diag : array_like
            The initial estimate of the diagonal of the Hessian.

        n_factors : int
            The number of factors that should be used to implicitly represent the inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.

        """

        super(Lbfgs, self).__init__(wrt, args=args)

        self.f = f
        self.fprime = fprime
        self.initial_hessian_diag = initial_hessian_diag
        self.n_factors = n_factors
        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
Example #9
File: bfgs.py  Project: SFPD/rlreloaded
    def __init__(self, wrt, f, fprime, initial_inv_hessian=None,
                 line_search=None, args=None):
        """Create a BFGS object.

        Parameters
        ----------

        wrt : array_like
            Array that represents the solution. Will be operated upon in
            place.  ``f`` and ``fprime`` should accept this array as a first argument.

        f : callable
            The objective function.

        fprime : callable
            Callable that, given a solution vector as the first parameter and
            *args and **kwargs drawn from the iterator ``args``, returns a
            search direction, such as a gradient.

        initial_inv_hessian : array_like
            The initial estimate of the approximate inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.
        """
        super(Bfgs, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime
        self.inv_hessian = initial_inv_hessian

        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)
Example #10
File: lbfgs.py  Project: osdf/climin
class Lbfgs(Minimizer):

    def __init__(self, wrt, f, fprime, initial_hessian_diag=1,
                 n_factors=10, line_search=None,
                 args=None, logfunc=None):
        super(Lbfgs, self).__init__(wrt, args=args, logfunc=logfunc)

        self.f = f
        self.fprime = fprime
        self.initial_hessian_diag = initial_hessian_diag
        self.n_factors = n_factors
        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)

    def find_direction(self, grad_diffs, steps, grad, hessian_diag, idxs):
        grad = grad.copy()  # We will change this.
        n_current_factors = len(idxs)

        # TODO: find a good name for this variable.
        rho = scipy.empty(n_current_factors)

        # TODO: vectorize this function
        for i in idxs:
            rho[i] = 1 / scipy.inner(grad_diffs[i], steps[i])

        # TODO: find a good name for this variable as well.
        alpha = scipy.empty(n_current_factors)

        for i in idxs[::-1]:
            alpha[i] = rho[i] * scipy.inner(steps[i], grad)
            grad -= alpha[i] * grad_diffs[i]
        z = hessian_diag * grad

        # TODO: find a good name for this variable (surprise!)
        beta = scipy.empty(n_current_factors)

        for i in idxs:
            beta[i] = rho[i] * scipy.inner(grad_diffs[i], z)
            z += steps[i] * (alpha[i] - beta[i])

        return z, {}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)
        factor_shape = self.n_factors, self.wrt.shape[0]
        grad_diffs = scipy.zeros(factor_shape)
        steps = scipy.zeros(factor_shape)
        hessian_diag = self.initial_hessian_diag
        step_length = None
        step = scipy.empty(grad.shape)
        grad_diff = scipy.empty(grad.shape)

        # We need to keep track in which order the different statistics
        # from different runs are saved.
        #
        # Why?
        #
        # Each iteration, we save statistics such as the difference between
        # gradients and the actual steps taken. These are then later combined
        # into an approximation of the Hessian. We call them factors. Since we
        # don't want to create a new matrix of factors each iteration, we
        # instead keep track externally which row of the matrix corresponds
        # to which iteration. `idxs` is a list which maps its i'th element
        # to the corresponding row index of the array. Thus, idxs[i] contains
        # the row index for the (n_factors - i)'th iteration prior to the
        # current one.
        idxs = []

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction = -grad
                info = {}
            else:
                sTgd = scipy.inner(step, grad_diff)
                if sTgd > 1E-10:
                    # Don't do an update if this value is too small.
                    # Determine index for the current update.
                    if not idxs:
                        # First iteration.
                        this_idx = 0
                    elif len(idxs) < self.n_factors:
                        # We are not "full" yet. Thus, append the next idxs.
                        this_idx = idxs[-1] + 1
                    else:
                        # we are full and discard the first index.
                        this_idx = idxs.pop(0)

                    idxs.append(this_idx)
                    grad_diffs[this_idx] = grad_diff
                    steps[this_idx] = step
                    hessian_diag = sTgd / scipy.inner(grad_diff, grad_diff)

                direction, info = self.find_direction(
                    grad_diffs, steps, -grad, hessian_diag, idxs)

            if not is_nonzerofinite(direction):
                self.logfunc(
                    {'message': 'direction is invalid--need to bail out.'})
                break

            step_length = self.line_search.search(
                direction, None, args, kwargs)

            step[:] = step_length * direction
            if step_length != 0:
                self.wrt += step
            else:
                self.logfunc({'message': 'step length is 0.'})

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            # TODO: not all line searches have .grad!
            grad_m1[:], grad[:] = grad, self.line_search.grad
            grad_diff = grad - grad_m1

            info.update({
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
                'loss': self.line_search.val,
                'gradient': grad,
                'gradient_m1': grad_m1,
            })
            yield info
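
The ``find_direction`` method above is the standard L-BFGS two-loop recursion. As a compact, self-contained illustration of the same computation (the function name and arguments below are ours, not climin API), assuming the stored pairs satisfy the curvature condition s_i^T y_i > 0:

import numpy as np

def lbfgs_direction(grad, steps, grad_diffs, hessian_diag):
    # Two-loop recursion: implicitly apply the inverse Hessian estimate
    # built from the stored (s_i, y_i) pairs to -grad.
    q = -np.asarray(grad, dtype=float)
    rho = [1.0 / np.inner(y, s) for s, y in zip(steps, grad_diffs)]
    alpha = np.empty(len(steps))
    for i in reversed(range(len(steps))):
        alpha[i] = rho[i] * np.inner(steps[i], q)
        q -= alpha[i] * grad_diffs[i]
    z = hessian_diag * q
    for i in range(len(steps)):
        beta = rho[i] * np.inner(grad_diffs[i], z)
        z += steps[i] * (alpha[i] - beta)
    return z    # approximately -H * grad, a descent direction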
Example #11
class NonlinearConjugateGradient(Minimizer):
    """
    NonlinearConjugateGradient optimizer.

    NCG minimizes functions by following directions which are conjugate to each
    other with respect to the Hessian. Since the curvature changes if the
    objective is nonquadratic, the Hessian will not be accurate and thus
    neither will the conjugacy of successive search directions. Furthermore,
    the optimal step length cannot be found in closed form and has to be
    obtained with a line search.

    During optimization, we sometimes perform a restart. That means we give up
    on trying to find conjugate directions and use the gradient as a new search
    direction. This is done whenever two successive gradients are far from
    orthogonal, an indicator that the quadratic assumption is either inaccurate
    or the Hessian has changed too much lately.

    Attributes
    ----------

    wrt : array_like
        Array of parameters of the problem.

    f : callable
        Objective function.

    fprime : callable
        First derivative of the objective function.

    min_grad : float
        If all components of the gradient fall below this value, stop
        minimization.

    line_search : LineSearch object.
        Line search object to perform line searches with.

    args : iterable
        Iterable of arguments passed on to the objective function and its
        derivatives.
    """
    def __init__(self, wrt, f, fprime, min_grad=1e-6, args=None):
        """Create a NonlinearConjugateGradient object.

        Parameters
        ----------

        wrt : array_like
            Array of parameters of the problem.

        f : callable
            Objective function.

        fprime : callable
            First derivative of the objective function.

        min_grad : float
            If all components of the gradient fall below this value, stop
            minimization.

        args : iterable, optional
            Iterable of arguments passed on to the objective function and its
            derivatives.
        """
        super(NonlinearConjugateGradient, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime

        self.line_search = WolfeLineSearch(wrt, self.f, self.fprime, c2=0.2)
        self.min_grad = min_grad

    def set_from_info(self, info):
        raise NotImplementedError('nobody has found the time to implement this yet')

    def extended_info(self, **kwargs):
        raise NotImplementedError('nobody has found the time to implement this yet')

    def find_direction(self, grad_m1, grad, direction_m1):
        # Computation of beta as a compromise between Fletcher-Reeves
        # and Polak-Ribiere.
        grad_norm_m1 = np.dot(grad_m1, grad_m1)
        grad_diff = grad - grad_m1
        betaFR = np.dot(grad, grad) / grad_norm_m1
        betaPR = np.dot(grad, grad_diff) / grad_norm_m1
        betaHS = np.dot(grad, grad_diff) / np.dot(direction_m1, grad_diff)
        beta = max(-betaFR, min(betaPR, betaFR))

        # Restart if not a direction of sufficient descent, ie if two
        # consecutive gradients are far from orthogonal.
        if np.dot(grad, grad_m1) / grad_norm_m1 > 0.1:
            beta = 0

        direction = -grad + beta * direction_m1
        return direction, {}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = np.zeros(grad.shape)
        loss = self.f(self.wrt, *args, **kwargs)
        loss_m1 = 0

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction, info = -grad, {}
            else:
                direction, info = self.find_direction(grad_m1, grad, direction)

            if not is_nonzerofinite(direction):
                warnings.warn('gradient is either zero, nan or inf')
                break

            # Line search minimization.
            initialization = 2 * (loss - loss_m1) / np.dot(grad, direction)
            initialization = min(1, initialization)
            step_length = self.line_search.search(direction, initialization,
                                                  args, kwargs)
            self.wrt += step_length * direction

            # If we don't bail out here, we will enter regions of numerical
            # instability.
            if (abs(grad) < self.min_grad).all():
                warnings.warn('gradient is too small')
                break

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            grad_m1[:], grad[:] = grad, self.line_search.grad
            loss_m1, loss = loss, self.line_search.val

            info.update({
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
                'loss': loss,
                'gradient': grad,
                'gradient_m1': grad_m1,
                'step_length': step_length,
            })
            yield info
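
The beta computed in ``find_direction`` is a compromise between the Fletcher-Reeves and Polak-Ribiere formulas, clipped into [-beta_FR, beta_FR], with a restart whenever successive gradients are far from orthogonal. A standalone sketch of just that rule (the helper name and threshold parameter are ours, not climin API):

import numpy as np

def ncg_beta(grad, grad_m1, restart_threshold=0.1):
    # Fletcher-Reeves / Polak-Ribiere compromise used in find_direction above.
    grad_norm_m1 = np.dot(grad_m1, grad_m1)
    beta_fr = np.dot(grad, grad) / grad_norm_m1
    beta_pr = np.dot(grad, grad - grad_m1) / grad_norm_m1
    beta = max(-beta_fr, min(beta_pr, beta_fr))
    # Restart with steepest descent if consecutive gradients are far from
    # orthogonal, i.e. the quadratic model is no longer trustworthy.
    if np.dot(grad, grad_m1) / grad_norm_m1 > restart_threshold:
        beta = 0.0
    return beta

The new search direction is then ``-grad + beta * direction_m1``, exactly as returned by ``find_direction``.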
Example #12
File: ncg.py  Project: osdf/climin
class NonlinearConjugateGradient(Minimizer):
    """
    Nonlinear Conjugate Gradient Method
    """

    def __init__(self, wrt, f, fprime, epsilon=1e-6,
                 args=None, logfunc=None):
        super(NonlinearConjugateGradient, self).__init__(
            wrt, args=args, logfunc=logfunc)
        self.f = f
        self.fprime = fprime

        self.line_search = WolfeLineSearch(wrt, self.f, self.fprime, c2=0.2)
        self.epsilon = epsilon

    def find_direction(self, grad_m1, grad, direction_m1):
        # Computation of beta as a compromise between Fletcher-Reeves
        # and Polak-Ribiere.
        grad_norm_m1 = scipy.dot(grad_m1, grad_m1)
        grad_diff = grad - grad_m1
        betaFR = scipy.dot(grad, grad) / grad_norm_m1
        betaPR = scipy.dot(grad, grad_diff) / grad_norm_m1
        betaHS = scipy.dot(grad, grad_diff) / scipy.dot(direction_m1, grad_diff)
        beta = max(-betaFR, min(betaPR, betaFR))

        # Restart if not a direction of sufficient descent, ie if two
        # consecutive gradients are far from orthogonal.
        if scipy.dot(grad, grad_m1) / grad_norm_m1 > 0.1:
            beta = 0

        direction = -grad + beta * direction_m1
        return direction, {}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)
        loss = self.f(self.wrt, *args, **kwargs)
        loss_m1 = 0

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction, info = -grad, {}
            else:
                direction, info = self.find_direction(grad_m1, grad, direction)

            if not is_nonzerofinite(direction):
                self.logfunc(
                {'message': 'direction is invalid -- need to bail out.'})
                break

            # Line search minimization.
            initialization = 2 * (loss - loss_m1) / scipy.dot(grad, direction)
            initialization = min(1, initialization)
            step_length = self.line_search.search(
                direction, initialization,  args, kwargs)
            self.wrt += step_length * direction

            # If we don't bail out here, we will enter regions of numerical
            # instability.
            if (abs(grad) < self.epsilon).all():
                self.logfunc(
                    {'message': 'converged - gradient smaller than epsilon'})
                break

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            grad_m1[:], grad[:] = grad, self.line_search.grad
            loss_m1, loss = loss, self.line_search.val

            info.update({
                'loss': loss,
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'gradient': grad,
                'gradient_m1': grad_m1,
                'kwargs': kwargs,
            })
            yield info
Example #13
File: cg.py  Project: SFPD/rlreloaded
class NonlinearConjugateGradient(Minimizer):
    """
    NonlinearConjugateGradient optimizer.

    NCG minimizes functions by following directions which are conjugate to each
    other with respect to the Hessian. Since the curvature changes if the
    objective is nonquadratic, the Hessian will not be accurate and thus
    neither will the conjugacy of successive search directions. Furthermore,
    the optimal step length cannot be found in closed form and has to be
    obtained with a line search.

    During optimization, we sometimes perform a restart. That means we give up
    on trying to find conjugate directions and use the gradient as a new search
    direction. This is done whenever two successive gradients are far from
    orthogonal, an indicator that the quadratic assumption is either inaccurate
    or the Hessian has changed too much lately.

    Attributes
    ----------

    wrt : array_like
        Array of parameters of the problem.

    f : callable
        Objective function.

    fprime : callable
        First derivative of the objective function.

    min_grad : float
        If all components of the gradient fall below this value, stop
        minimization.

    line_search : LineSearch object.
        Line search object to perform line searches with.

    args : iterable
        Iterable of arguments passed on to the objective function and its
        derivatives.
    """

    def __init__(self, wrt, f, fprime, min_grad=1e-6, args=None):
        """Create a NonlinearConjugateGradient object.

        Parameters
        ----------

        wrt : array_like
            Array of parameters of the problem.

        f : callable
            Objective function.

        fprime : callable
            First derivative of the objective function.

        min_grad : float
            If all components of the gradient fall below this value, stop
            minimization.

        args : iterable, optional
            Iterable of arguments passed on to the objective function and its
            derivatives.
        """
        super(NonlinearConjugateGradient, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime

        self.line_search = WolfeLineSearch(wrt, self.f, self.fprime, c2=0.2)
        self.min_grad = min_grad

    def find_direction(self, grad_m1, grad, direction_m1):
        # Computation of beta as a compromise between Fletcher-Reeves
        # and Polak-Ribiere.
        grad_norm_m1 = np.dot(grad_m1, grad_m1)
        grad_diff = grad - grad_m1
        betaFR = np.dot(grad, grad) / grad_norm_m1
        betaPR = np.dot(grad, grad_diff) / grad_norm_m1
        betaHS = np.dot(grad, grad_diff) / np.dot(direction_m1, grad_diff)
        beta = max(-betaFR, min(betaPR, betaFR))

        # Restart if not a direction of sufficient descent, ie if two
        # consecutive gradients are far from orthogonal.
        if np.dot(grad, grad_m1) / grad_norm_m1 > 0.1:
            beta = 0

        direction = -grad + beta * direction_m1
        return direction, {}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = np.zeros(grad.shape)
        loss = self.f(self.wrt, *args, **kwargs)
        loss_m1 = 0

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction, info = -grad, {}
            else:
                direction, info = self.find_direction(grad_m1, grad, direction)

            if not is_nonzerofinite(direction):
                warnings.warn('gradient is either zero, nan or inf')
                break

            # Line search minimization.
            initialization = 2 * (loss - loss_m1) / np.dot(grad, direction)
            initialization = min(1, initialization)
            step_length = self.line_search.search(
                direction, initialization,  args, kwargs)
            self.wrt += step_length * direction

            # If we don't bail out here, we will enter regions of numerical
            # instability.
            if (abs(grad) < self.min_grad).all():
                warnings.warn('gradient is too small')
                break

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            grad_m1[:], grad[:] = grad, self.line_search.grad
            loss_m1, loss = loss, self.line_search.val

            info.update({
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
                'loss': loss,
                'gradient': grad,
                'gradient_m1': grad_m1,
                'step_length': step_length,
            })
            yield info
Example #14
File: bfgs.py  Project: SFPD/rlreloaded
class Bfgs(Minimizer):
    """BFGS (Broyden-Fletcher-Goldfarb-Shanno) is one of the most well-knwon
    quasi-Newton methods. The main idea is to iteratively construct an approximate inverse
    Hessian :math:`B^{-1}_t` by a rank-2 update:

        .. math::
            B^{-1}_{t+1} = B^{-1}_t + (1 + \\frac{y_t^TB^{-1}_ty_t}{y_t^Ts_t})\\frac{s_ts_t^T}{s_t^Ty_t} - \\frac{s_ty_t^TB^{-1}_t + B^{-1}_ty_ts_t^T}{s_t^Ty_t},

    where :math:`y_t = \\nabla f(\\theta_{t+1}) - \\nabla f(\\theta_{t})` and :math:`s_t = \\theta_{t+1} - \\theta_t`.

    The storage requirements for BFGS scale quadratically with the number of
    variables. For detailed derivations, see [nocedal2006a]_, chapter 6.

    .. [nocedal2006a]  Nocedal, J. and Wright, S. (2006),
        Numerical Optimization, 2nd edition, Springer.

    Attributes
    ----------
    wrt : array_like
        Current solution to the problem. Can be given as a first argument to \
        ``.f`` and ``.fprime``.

    f : Callable
        The objective function.

    fprime : Callable
        First derivative of the objective function. Returns an array of the \
        same shape as ``.wrt``.

    initial_inv_hessian : array_like
        The initial estimate of the approximate inverse Hessian.

    line_search : LineSearch object.
        Line search object to perform line searches with.

    args : iterable
        Iterator over arguments which ``fprime`` will be called with.

    """

    def __init__(self, wrt, f, fprime, initial_inv_hessian=None,
                 line_search=None, args=None):
        """Create a BFGS object.

        Parameters
        ----------

        wrt : array_like
            Array that represents the solution. Will be operated upon in
            place.  ``f`` and ``fprime`` should accept this array as a first argument.

        f : callable
            The objective function.

        fprime : callable
            Callable that, given a solution vector as the first parameter and
            *args and **kwargs drawn from the iterator ``args``, returns a
            search direction, such as a gradient.

        initial_inv_hessian : array_like
            The initial estimate of the approximate inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.
        """
        super(Bfgs, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime
        self.inv_hessian = initial_inv_hessian

        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)

    def find_direction(self, grad_m1, grad, step, inv_hessian):
        H = self.inv_hessian
        grad_diff = grad - grad_m1
        ys = np.inner(grad_diff, step)
        Hy = np.dot(H, grad_diff)
        yHy = np.inner(grad_diff, Hy)
        H += (ys + yHy) * np.outer(step, step) / ys ** 2
        H -= (np.outer(Hy, step) + np.outer(step, Hy)) / ys
        direction = -np.dot(H, grad)
        return direction, {'gradient_diff': grad_diff}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)

        if self.inv_hessian is None:
            self.inv_hessian = scipy.eye(grad.shape[0])

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction, info = -grad, {}
            else:
                direction, info = self.find_direction(
                    grad_m1, grad, step, self.inv_hessian)

            if not is_nonzerofinite(direction):
                # TODO: inform the user here.
                break

            step_length = self.line_search.search(
                direction, None, args, kwargs)

            if step_length != 0:
                step = step_length * direction
                self.wrt += step
            else:
                self.logfunc(
                    {'message': 'step length is 0--need to bail out.'})
                break

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            # TODO: not all line searches have .grad!
            grad_m1[:], grad[:] = grad, self.line_search.grad

            info.update({
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
            })
            yield info
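
To see the rank-2 update from the class docstring in isolation, here is a self-contained NumPy sketch of the same computation done in ``find_direction`` (the helper name and test vectors are ours, not climin API); it also checks the secant condition B^{-1}_{t+1} y_t = s_t that the update enforces:

import numpy as np

def bfgs_inverse_update(H, step, grad_diff):
    # Rank-2 update of the inverse Hessian estimate, as in find_direction.
    ys = np.inner(grad_diff, step)
    Hy = np.dot(H, grad_diff)
    yHy = np.inner(grad_diff, Hy)
    H_new = H + (ys + yHy) * np.outer(step, step) / ys ** 2
    H_new -= (np.outer(Hy, step) + np.outer(step, Hy)) / ys
    return H_new

# Quick check of the secant condition on vectors with positive curvature
# (s^T y > 0), which the Wolfe line search is meant to ensure.
s = np.array([1.0, 0.5, -0.2])
y = np.array([0.8, 0.3, 0.1])
H_new = bfgs_inverse_update(np.eye(3), s, y)
assert np.allclose(np.dot(H_new, y), s)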
Example #15
File: bfgs.py  Project: SFPD/rlreloaded
class Lbfgs(Minimizer):
    """l-BFGS (limited-memory BFGS) is a limited memory variation of the well-known
    BFGS algorithm. The storage requirements for BFGS scale quadratically with the number of variables,
    and thus it tends to be used only for smaller problems. Limited-memory BFGS reduces the
    storage by only using the :math:`l` latest updates (factors) in computing the approximate Hessian inverse
    and representing this approximation only implicitly. More specifically, it stores the last
    :math:`l` BFGS update vectors :math:`y_t` and :math:`s_t` and uses these to implicitly perform
    the matrix operations of BFGS (see [nocedal2006a]_).

    .. note::
       In order to handle simple box constraints, consider ``scipy.optimize.fmin_l_bfgs_b``.

    Attributes
    ----------
    wrt : array_like
        Current solution to the problem. Can be given as a first argument to \
        ``.f`` and ``.fprime``.

    f : Callable
        The objective function.

    fprime : Callable
        First derivative of the objective function. Returns an array of the \
        same shape as ``.wrt``.

    initial_hessian_diag : array_like
        The initial estimate of the diagonal of the Hessian.

    n_factors : int
        The number of factors that should be used to implicitly represent the inverse Hessian.

    line_search : LineSearch object.
        Line search object to perform line searches with.

    args : iterable
        Iterator over arguments which ``fprime`` will be called with.

    """

    def __init__(self, wrt, f, fprime, initial_hessian_diag=1,
                 n_factors=10, line_search=None,
                 args=None):
        """
        Create an Lbfgs object.

        Parameters
        ----------
        wrt : array_like
            Current solution to the problem. Can be given as a first argument to \
            ``.f`` and ``.fprime``.

        f : Callable
            The objective function.

        fprime : Callable
            First derivative of the objective function. Returns an array of the \
            same shape as ``.wrt``.

        initial_hessian_diag : array_like
            The initial estimate of the diagonal of the Hessian.

        n_factors : int
            The number of factors that should be used to implicitly represent the inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.

        """

        super(Lbfgs, self).__init__(wrt, args=args)

        self.f = f
        self.fprime = fprime
        self.initial_hessian_diag = initial_hessian_diag
        self.n_factors = n_factors
        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)

    def find_direction(self, grad_diffs, steps, grad, hessian_diag, idxs):
        grad = grad.copy()  # We will change this.
        n_current_factors = len(idxs)

        # TODO: find a good name for this variable.
        rho = scipy.empty(n_current_factors)

        # TODO: vectorize this function
        for i in idxs:
            rho[i] = 1 / scipy.inner(grad_diffs[i], steps[i])

        # TODO: find a good name for this variable as well.
        alpha = scipy.empty(n_current_factors)

        for i in idxs[::-1]:
            alpha[i] = rho[i] * scipy.inner(steps[i], grad)
            grad -= alpha[i] * grad_diffs[i]
        z = hessian_diag * grad

        # TODO: find a good name for this variable (surprise!)
        beta = scipy.empty(n_current_factors)

        for i in idxs:
            beta[i] = rho[i] * scipy.inner(grad_diffs[i], z)
            z += steps[i] * (alpha[i] - beta[i])

        return z, {}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)
        factor_shape = self.n_factors, self.wrt.shape[0]
        grad_diffs = scipy.zeros(factor_shape)
        steps = scipy.zeros(factor_shape)
        hessian_diag = self.initial_hessian_diag
        step_length = None
        step = scipy.empty(grad.shape)
        grad_diff = scipy.empty(grad.shape)

        # We need to keep track in which order the different statistics
        # from different runs are saved.
        #
        # Why?
        #
        # Each iteration, we save statistics such as the difference between
        # gradients and the actual steps taken. These are then later combined
        # into an approximation of the Hessian. We call them factors. Since we
        # don't want to create a new matrix of factors each iteration, we
        # instead keep track externally which row of the matrix corresponds
        # to which iteration. `idxs` is a list which maps its i'th element
        # to the corresponding row index of the array. Thus, idxs[i] contains
        # the row index for the (n_factors - i)'th iteration prior to the
        # current one.
        idxs = []

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction = -grad
                info = {}
            else:
                sTgd = scipy.inner(step, grad_diff)
                if sTgd > 1E-10:
                    # Don't do an update if this value is too small.
                    # Determine index for the current update.
                    if not idxs:
                        # First iteration.
                        this_idx = 0
                    elif len(idxs) < self.n_factors:
                        # We are not "full" yet. Thus, append the next idxs.
                        this_idx = idxs[-1] + 1
                    else:
                        # we are full and discard the first index.
                        this_idx = idxs.pop(0)

                    idxs.append(this_idx)
                    grad_diffs[this_idx] = grad_diff
                    steps[this_idx] = step
                    hessian_diag = sTgd / scipy.inner(grad_diff, grad_diff)

                direction, info = self.find_direction(
                    grad_diffs, steps, -grad, hessian_diag, idxs)

            if not is_nonzerofinite(direction):
                warnings.warn('search direction is either 0, nan or inf')
                break

            step_length = self.line_search.search(
                direction, None, args, kwargs)

            step[:] = step_length * direction
            if step_length != 0:
                self.wrt += step
            else:
                warnings.warn('step length is 0')

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            # TODO: not all line searches have .grad!
            grad_m1[:], grad[:] = grad, self.line_search.grad
            grad_diff = grad - grad_m1

            info.update({
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
                'loss': self.line_search.val,
                'gradient': grad,
                'gradient_m1': grad_m1,
            })
            yield info
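
The bookkeeping around ``idxs`` in ``__iter__`` above is a simple ring buffer over the rows of the factor matrices. A tiny standalone sketch of just that index logic (the variable names are ours):

n_factors = 3
idxs = []
for update in range(6):
    if not idxs:
        this_idx = 0                  # very first stored factor
    elif len(idxs) < n_factors:
        this_idx = idxs[-1] + 1       # still filling up the buffer
    else:
        this_idx = idxs.pop(0)        # full: reuse the oldest row
    idxs.append(this_idx)
    print(update, this_idx, idxs)
# Once full, idxs cycles: [1, 2, 0], [2, 0, 1], [0, 1, 2], ...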
Example #16
File: bfgs.py  Project: datahacking/climin
class Bfgs(Minimizer):
    """BFGS (Broyden-Fletcher-Goldfarb-Shanno) is one of the most well-knwon
    quasi-Newton methods. The main idea is to iteratively construct an approximate inverse
    Hessian :math:`B^{-1}_t` by a rank-2 update:

        .. math::
            B^{-1}_{t+1} = B^{-1}_t + (1 + \\frac{y_t^TB^{-1}_ty_t}{y_t^Ts_t})\\frac{s_ts_t^T}{s_t^Ty_t} - \\frac{s_ty_t^TB^{-1}_t + B^{-1}_ty_ts_t^T}{s_t^Ty_t},

    where :math:`y_t = \\nabla f(\\theta_{t+1}) - \\nabla f(\\theta_{t})` and :math:`s_t = \\theta_{t+1} - \\theta_t`.

    The storage requirements for BFGS scale quadratically with the number of
    variables. For detailed derivations, see [nocedal2006a]_, chapter 6.

    .. [nocedal2006a]  Nocedal, J. and Wright, S. (2006),
        Numerical Optimization, 2nd edition, Springer.

    Attributes
    ----------
    wrt : array_like
        Current solution to the problem. Can be given as a first argument to \
        ``.f`` and ``.fprime``.

    f : Callable
        The objective function.

    fprime : Callable
        First derivative of the objective function. Returns an array of the \
        same shape as ``.wrt``.

    initial_inv_hessian : array_like
        The initial estimate of the approximate inverse Hessian.

    line_search : LineSearch object.
        Line search object to perform line searches with.

    args : iterable
        Iterator over arguments which ``fprime`` will be called with.

    """
    def __init__(self,
                 wrt,
                 f,
                 fprime,
                 initial_inv_hessian=None,
                 line_search=None,
                 args=None):
        """Create a BFGS object.

        Parameters
        ----------

        wrt : array_like
            Array that represents the solution. Will be operated upon in
            place.  ``f`` and ``fprime`` should accept this array as a first argument.

        f : callable
            The objective function.

        fprime : callable
            Callable that, given a solution vector as the first parameter and
            *args and **kwargs drawn from the iterator ``args``, returns a
            search direction, such as a gradient.

        initial_inv_hessian : array_like
            The initial estimate of the approximate inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.
        """
        super(Bfgs, self).__init__(wrt, args=args)
        self.f = f
        self.fprime = fprime
        self.inv_hessian = initial_inv_hessian

        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)

    def find_direction(self, grad_m1, grad, step, inv_hessian):
        H = self.inv_hessian
        grad_diff = grad - grad_m1
        ys = np.inner(grad_diff, step)
        Hy = np.dot(H, grad_diff)
        yHy = np.inner(grad_diff, Hy)
        H += (ys + yHy) * np.outer(step, step) / ys**2
        H -= (np.outer(Hy, step) + np.outer(step, Hy)) / ys
        direction = -np.dot(H, grad)
        return direction, {'gradient_diff': grad_diff}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)

        if self.inv_hessian is None:
            self.inv_hessian = scipy.eye(grad.shape[0])

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction, info = -grad, {}
            else:
                direction, info = self.find_direction(grad_m1, grad, step,
                                                      self.inv_hessian)

            if not is_nonzerofinite(direction):
                # TODO: inform the user here.
                break

            step_length = self.line_search.search(direction, None, args,
                                                  kwargs)

            if step_length != 0:
                step = step_length * direction
                self.wrt += step
            else:
                self.logfunc(
                    {'message': 'step length is 0--need to bail out.'})
                break

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            # TODO: not all line searches have .grad!
            grad_m1[:], grad[:] = grad, self.line_search.grad

            info.update({
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
            })
            yield info
Example #17
File: bfgs.py  Project: osdf/climin
class Bfgs(Minimizer):

    def __init__(self, wrt, f, fprime, initial_inv_hessian=None,
                 line_search=None, args=None, logfunc=None):
        super(Bfgs, self).__init__(wrt, args=args, logfunc=logfunc)
        self.f = f
        self.fprime = fprime
        self.inv_hessian = initial_inv_hessian

        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)

    def find_direction(self, grad_m1, grad, step, inv_hessian):
        H = self.inv_hessian
        grad_diff = grad - grad_m1
        ys = np.inner(grad_diff, step)
        ss = np.inner(step, step)
        yy = np.inner(grad_diff, grad_diff)
        Hy = np.dot(H, grad_diff)
        yHy = np.inner(grad_diff, Hy)
        H += (ys + yHy) * np.outer(step, step) / ys**2 
        H -= (np.outer(Hy, step) + np.outer(step, Hy)) / ys
        direction = -np.dot(H, grad)
        return direction, {'gradient_diff': grad_diff}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)

        if self.inv_hessian is None:
            self.inv_hessian = scipy.eye(grad.shape[0])

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction, info = -grad, {}
            else:
                direction, info = self.find_direction(
                    grad_m1, grad, step, self.inv_hessian)

            if not is_nonzerofinite(direction):
                self.logfunc(
                    {'message': 'direction is invalid -- need to bail out.'})
                break

            step_length = self.line_search.search(
                direction, None, args, kwargs)

            if step_length != 0:
                step = step_length * direction
                self.wrt += step
            else:
                self.logfunc(
                    {'message': 'step length is 0--need to bail out.'})
                break

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            # TODO: not all line searches have .grad!
            grad_m1[:], grad[:] = grad, self.line_search.grad

            info.update({
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
            })
            yield info
Example #18
File: bfgs.py  Project: datahacking/climin
class Lbfgs(Minimizer):
    """l-BFGS (limited-memory BFGS) is a limited memory variation of the well-known
    BFGS algorithm. The storage requirements for BFGS scale quadratically with the number of variables,
    and thus it tends to be used only for smaller problems. Limited-memory BFGS reduces the
    storage by only using the :math:`l` latest updates (factors) in computing the approximate Hessian inverse
    and representing this approximation only implicitly. More specifically, it stores the last
    :math:`l` BFGS update vectors :math:`y_t` and :math:`s_t` and uses these to implicitly perform
    the matrix operations of BFGS (see [nocedal2006a]_).

    .. note::
       In order to handle simple box constraints, consider ``scipy.optimize.fmin_l_bfgs_b``.

    Attributes
    ----------
    wrt : array_like
        Current solution to the problem. Can be given as a first argument to \
        ``.f`` and ``.fprime``.

    f : Callable
        The objective function.

    fprime : Callable
        First derivative of the objective function. Returns an array of the \
        same shape as ``.wrt``.

    initial_hessian_diag : array_like
        The initial estimate of the diagonal of the Hessian.

    n_factors : int
        The number of factors that should be used to implicitly represent the inverse Hessian.

    line_search : LineSearch object.
        Line search object to perform line searches with.

    args : iterable
        Iterator over arguments which ``fprime`` will be called with.

    """
    def __init__(self,
                 wrt,
                 f,
                 fprime,
                 initial_hessian_diag=1,
                 n_factors=10,
                 line_search=None,
                 args=None):
        """
        Create an Lbfgs object.

        Parameters
        ----------
        wrt : array_like
            Current solution to the problem. Can be given as a first argument to \
            ``.f`` and ``.fprime``.

        f : Callable
            The objective function.

        fprime : Callable
            First derivative of the objective function. Returns an array of the \
            same shape as ``.wrt``.

        initial_hessian_diag : array_like
            The initial estimate of the diagonal of the Hessian.

        n_factors : int
            The number of factors that should be used to implicitly represent the inverse Hessian.

        line_search : LineSearch object.
            Line search object to perform line searches with.

        args : iterable
            Iterator over arguments which ``fprime`` will be called with.

        """

        super(Lbfgs, self).__init__(wrt, args=args)

        self.f = f
        self.fprime = fprime
        self.initial_hessian_diag = initial_hessian_diag
        self.n_factors = n_factors
        if line_search is not None:
            self.line_search = line_search
        else:
            self.line_search = WolfeLineSearch(wrt, self.f, self.fprime)

    def find_direction(self, grad_diffs, steps, grad, hessian_diag, idxs):
        grad = grad.copy()  # We will change this.
        n_current_factors = len(idxs)

        # TODO: find a good name for this variable.
        rho = scipy.empty(n_current_factors)

        # TODO: vectorize this function
        for i in idxs:
            rho[i] = 1 / scipy.inner(grad_diffs[i], steps[i])

        # TODO: find a good name for this variable as well.
        alpha = scipy.empty(n_current_factors)

        for i in idxs[::-1]:
            alpha[i] = rho[i] * scipy.inner(steps[i], grad)
            grad -= alpha[i] * grad_diffs[i]
        z = hessian_diag * grad

        # TODO: find a good name for this variable (surprise!)
        beta = scipy.empty(n_current_factors)

        for i in idxs:
            beta[i] = rho[i] * scipy.inner(grad_diffs[i], z)
            z += steps[i] * (alpha[i] - beta[i])

        return z, {}

    def __iter__(self):
        args, kwargs = self.args.next()
        grad = self.fprime(self.wrt, *args, **kwargs)
        grad_m1 = scipy.zeros(grad.shape)
        factor_shape = self.n_factors, self.wrt.shape[0]
        grad_diffs = scipy.zeros(factor_shape)
        steps = scipy.zeros(factor_shape)
        hessian_diag = self.initial_hessian_diag
        step_length = None
        step = scipy.empty(grad.shape)
        grad_diff = scipy.empty(grad.shape)

        # We need to keep track in which order the different statistics
        # from different runs are saved.
        #
        # Why?
        #
        # Each iteration, we save statistics such as the difference between
        # gradients and the actual steps taken. These are then later combined
        # into an approximation of the Hessian. We call them factors. Since we
        # don't want to create a new matrix of factors each iteration, we
        # instead keep track externally which row of the matrix corresponds
        # to which iteration. `idxs` is a list which maps its i'th element
        # to the corresponding row index of the array. Thus, idxs[i] contains
        # the row index for the (n_factors - i)'th iteration prior to the
        # current one.
        idxs = []

        for i, (next_args, next_kwargs) in enumerate(self.args):
            if i == 0:
                direction = -grad
                info = {}
            else:
                sTgd = scipy.inner(step, grad_diff)
                if sTgd > 1E-10:
                    # Don't do an update if this value is too small.
                    # Determine index for the current update.
                    if not idxs:
                        # First iteration.
                        this_idx = 0
                    elif len(idxs) < self.n_factors:
                        # We are not "full" yet. Thus, append the next idxs.
                        this_idx = idxs[-1] + 1
                    else:
                        # we are full and discard the first index.
                        this_idx = idxs.pop(0)

                    idxs.append(this_idx)
                    grad_diffs[this_idx] = grad_diff
                    steps[this_idx] = step
                    hessian_diag = sTgd / scipy.inner(grad_diff, grad_diff)

                direction, info = self.find_direction(grad_diffs, steps, -grad,
                                                      hessian_diag, idxs)

            if not is_nonzerofinite(direction):
                warnings.warn('search direction is either 0, nan or inf')
                break

            step_length = self.line_search.search(direction, None, args,
                                                  kwargs)

            step[:] = step_length * direction
            if step_length != 0:
                self.wrt += step
            else:
                warnings.warn('step length is 0')

            # Prepare everything for the next loop.
            args, kwargs = next_args, next_kwargs
            # TODO: not all line searches have .grad!
            grad_m1[:], grad[:] = grad, self.line_search.grad
            grad_diff = grad - grad_m1

            info.update({
                'step_length': step_length,
                'n_iter': i,
                'args': args,
                'kwargs': kwargs,
                'loss': self.line_search.val,
                'gradient': grad,
                'gradient_m1': grad_m1,
            })
            yield info