Example #1
def ls_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval):
    """
    Same as line_search_wolfe1, but fall back to line_search_wolfe2 if
    suitable step length is not found, and raise an exception if a
    suitable step length is not found.
    Raises
    ------
    _LineSearchError
        If no suitable step size is found
    """

    ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval)
    alpha = ret[0]

    if alpha is None or alpha < 1e-12:
        #print('A')
        # line search failed: try different one.
        ret = line_search_wolfe2(f, fprime, xk, pk, gfk, old_fval,
                                 old_old_fval)
        alpha = ret[0]

    if alpha is None or alpha < 1e-12:
        #print('B')
        ret = line_search_armijo(f, xk, pk, gfk, old_fval)
        alpha = ret[0]

    if alpha is None or alpha < 1e-12:
        #print('C')
        alpha = backtracking_line_search(f, gfk, xk, pk)

    return alpha
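
The final fallback, backtracking_line_search, is not a SciPy routine and is not shown in this example. Below is a minimal sketch of the plain Armijo backtracking search it could correspond to; the signature (matching the call order f, gfk, xk, pk above), the shrink factor rho and the default c1 are assumptions, not the original implementation.

import numpy as np

def backtracking_line_search(f, gfk, xk, pk, alpha0=1.0, rho=0.5, c1=1e-4, max_iter=50):
    """Hypothetical stand-in for the undefined fallback above (not SciPy code).

    Shrinks alpha until the Armijo condition
    f(xk + alpha*pk) <= f(xk) + c1*alpha*<gfk, pk> holds.
    """
    alpha = alpha0
    fx = f(xk)
    slope = np.dot(gfk, pk)  # directional derivative of f at xk along pk
    for _ in range(max_iter):
        if f(xk + alpha * pk) <= fx + c1 * alpha * slope:
            return alpha
        alpha *= rho  # step rejected: shrink and retry
    return alpha  # give up and return the smallest trial step
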
Example #2
 def test_line_search_armijo(self):
     c = 0
     for name, f, fprime, x, p, old_f in self.line_iter():
         f0 = f(x)
         g0 = fprime(x)
         self.fcount = 0
         s, fc, fv = ls.line_search_armijo(f, x, p, g0, f0)
         c += 1
         assert_equal(self.fcount, fc)
         assert_equal(fv, f(x + s*p))
         assert_line_armijo(x, p, s, f, err_msg=name)
     assert_(c >= 9)
Example #3
 def test_line_search_armijo(self):
     c = 0
     for name, f, fprime, x, p, old_f in self.line_iter():
         f0 = f(x)
         g0 = fprime(x)
         self.fcount = 0
         s, fc, fv = ls.line_search_armijo(f, x, p, g0, f0)
         c += 1
         assert_equal(self.fcount, fc)
         assert_fp_equal(fv, f(x + s * p))
         assert_line_armijo(x, p, s, f, err_msg=name)
     assert_(c >= 9)
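
Both tests above exercise the return value of SciPy's line_search_armijo, which is a triple (alpha, fc, fval): the accepted step length, the number of function evaluations, and the function value at xk + alpha*pk. A minimal standalone call on a toy quadratic, assuming a SciPy version where the function is importable from scipy.optimize.linesearch (newer releases moved it to the private module scipy.optimize._linesearch):

import numpy as np
from scipy.optimize.linesearch import line_search_armijo

def f(x):
    return float(np.dot(x, x))   # simple convex quadratic

xk = np.array([3.0, -2.0])
gfk = 2.0 * xk                   # gradient of f at xk
pk = -gfk                        # steepest-descent direction

alpha, fc, fval = line_search_armijo(f, xk, pk, gfk, f(xk), c1=1e-4, alpha0=1.0)
print(alpha, fc, fval)           # step length, f-evaluations, f(xk + alpha*pk)
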
Example #4
def lineSearch(encoded_d_k, fval_x_k):
	"""
	returns:
		alpha_k: float or None
			alpha for which x_kp1 = x_k + alpha * d_k, or None if line search algorithm did not converge.
		new_fval : float or None
			New function value f(x_kp1), or None if the line search algorithm did not converge.
	"""
	d_k = np.frombuffer(base64.decodebytes(encoded_d_k), dtype=np.float64)
	alpha_k, fc, new_fval = \
			line_search_armijo(costFunction, params, d_k, accruedGradients, fval_x_k, args=(X,y), c1=1e-5)
	# cast to float because line_search_armijo returns NumPy scalar floats
	alpha_k = float(alpha_k) if alpha_k is not None else None 
	new_fval = float(new_fval) if new_fval is not None else None 
	
	return (alpha_k, new_fval)
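
The direction vector arrives base64-encoded as raw float64 bytes. A minimal sketch of the matching encoder on the sending side; the helper name encode_direction is hypothetical and not part of the original code:

import base64
import numpy as np

def encode_direction(d_k):
    # Hypothetical sender-side counterpart: float64 vector -> base64 bytes.
    return base64.encodebytes(np.asarray(d_k, dtype=np.float64).tobytes())

# Round trip: decoding recovers the original direction vector.
encoded = encode_direction([0.5, -1.25, 3.0])
decoded = np.frombuffer(base64.decodebytes(encoded), dtype=np.float64)
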
Example #5
def fista(grad,
          obj,
          prox,
          x0,
          momentum=True,
          max_iter=100,
          step_size=None,
          early_stopping=True,
          eps=np.finfo(np.float32).eps,
          times=False,
          debug=False,
          verbose=0,
          name="Optimization"):
    """ F/ISTA algorithm. """
    if verbose and not debug:
        warnings.warn("verbose output requires the cost function to be "
                      "computed; enable it by setting debug=True")

    adaptive_step_size = False
    if step_size is None:
        adaptive_step_size = True
        step_size = 1.0

    # prepare the iterate
    t = t_old = 1
    z_old = np.zeros_like(x0)
    x = np.copy(x0)

    if adaptive_step_size and x.ndim > 1:
        raise ValueError("Backtracking line search needs a 1-d gradient")

    # saving variables
    pobj_, times_ = [obj(x)], [0.0]

    # initial objective value for the adaptive step-size line search
    if adaptive_step_size:
        old_fval = obj(x)

    # main loop
    for ii in range(max_iter):

        if times:
            t0 = time.time()

        grad_ = grad(x)

        # step-size
        if adaptive_step_size:
            step_size, _, old_fval = line_search_armijo(obj,
                                                        x.ravel(),
                                                        -grad_.ravel(),
                                                        grad_.ravel(),
                                                        old_fval,
                                                        c1=1.0e-5,
                                                        alpha0=step_size)
            if step_size is None:
                step_size = 0.0

        # descent step
        z = prox(x - step_size * grad_, step_size)

        # fista acceleration
        if momentum:
            t = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t_old**2))
            x = z + (t_old - 1.0) / t * (z - z_old)
        else:
            x = z

        # savings
        if debug:
            if adaptive_step_size:
                pobj_.append(old_fval)
            else:
                pobj_.append(obj(x))

        # printing
        if debug and verbose > 0:
            print("[{0}] Iteration {1} / {2}, "
                  "loss = {3}".format(name, ii + 1, max_iter, pobj_[ii]))

        # early-stopping
        l1_diff = np.sum(np.abs(z - z_old))
        if l1_diff <= eps and early_stopping:
            if debug:
                print("---> [{0}] early-stopping "
                      "done at {1}/{2}".format(name, ii + 1, max_iter))
            break
        if l1_diff > np.finfo(np.float64).max:
            raise RuntimeError("[{}] {} has diverged.".format(
                name, ["ISTA", "FISTA"][momentum]))

        # update iterates
        t_old = t
        z_old = z

        # savings
        if times:
            times_.append(time.time() - t0)

    if not times and not debug:
        return x
    if times and not debug:
        return x, times_
    if not times and debug:
        return x, pobj_
    if times and debug:
        return x, pobj_, times_
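
A minimal usage sketch for the fista function above on a small Lasso-type problem; the random data and the soft-thresholding prox are illustrative assumptions, and step_size=None is passed so that the line_search_armijo branch is exercised:

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(20, 10)
b = rng.randn(20)
lbda = 0.1

def grad(x):
    return A.T.dot(A.dot(x) - b)                  # gradient of the smooth part

def obj(x):
    return 0.5 * np.sum((A.dot(x) - b) ** 2) + lbda * np.sum(np.abs(x))

def prox(x, step):
    return np.sign(x) * np.maximum(np.abs(x) - lbda * step, 0.0)  # soft-thresholding

x_hat = fista(grad, obj, prox, x0=np.zeros(10), step_size=None, max_iter=200)
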
Example #6
def hfn(func, x0, hess_vec, tol=1e-5, max_iter=500, c1=1e-4, c2=0.9, disp=False, trace=False):

    if (trace):
        hist = {}
        hist['f'] = []
        hist['norm_g'] = []
        hist['elaps_t'] = []
        start_time = time.perf_counter()

    f = lambda x: func(x)[0]
    df = lambda x: func(x)[1]

    x = x0
    [loss, grad, extra] = func(x)
    grad_norm = linalg.norm(grad, inf)
    eps = min(1 / 2, sqrt(grad_norm)) * grad_norm

    for i in range(0, max_iter):

        #Start cg
        z = zeros(shape(x))
        g =  grad
        d = -g
        u = hess_vec(x, d, extra)

        for j in range(0,1000):
            gamma = g.transpose().dot(g)/(d.transpose().dot(u))
            z = z + gamma*d
            g1 = g + gamma*u
            b = True
            if linalg.norm(g1,inf)<eps:
                b = False
                break
            else:
                betta = g1.transpose().dot(g1)/(g.transpose().dot(g))
                d = -g1+betta*d
                u = hess_vec(x,d,extra)
                g = g1
        if b:
            print('CG did not converge')

        # one-dimensional line search along the CG direction
        alpha = line_search_wolfe2(f=f, myfprime=df, xk=x, pk=z, gfk=grad, old_fval=loss, c1=c1, c2=c2)
        if alpha[0] is None:
            alpha = line_search_armijo(f=f, xk=x, pk=z, gfk=grad, old_fval=loss, c1=c1, alpha0=1)
        x = x + alpha[0]*z

        [loss, grad, extra] = func(x)
        grad_norm = linalg.norm(grad, inf)
        eps = min(1 / 2, sqrt(grad_norm)) * grad_norm

        if (disp):
            print(str(1 + i) + ')', loss, grad_norm)
        if (trace):
            hist['f'].append(loss)
            hist['norm_g'].append(grad_norm)
            current_time = time.perf_counter() - start_time
            hist['elaps_t'].append(current_time)

        if grad_norm<tol:
            return x, loss, 0

    return x, loss, 1
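
A minimal usage sketch for hfn on the toy quadratic 0.5*x'Ax - b'x, assuming the function above and its NumPy star-imports (bare zeros, shape, sqrt, inf, linalg) are available; the problem data are illustrative, and the extra slot is unused here because hfn only forwards it to hess_vec:

import numpy as np

A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, -1.0])

def func(x):
    loss = 0.5 * x.dot(A).dot(x) - b.dot(x)
    grad = A.dot(x) - b
    return [loss, grad, None]        # hfn() unpacks [loss, grad, extra]

def hess_vec(x, d, extra):
    return A.dot(d)                  # exact Hessian-vector product

x_opt, loss, status = hfn(func, np.zeros(2), hess_vec, tol=1e-8, disp=True)
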
Example #7
    def _update_sol(self, solver, objective, niter):

        if (niter % (self.k + 1)) == 0:  # Extrapolate at each k iterations

            self.buffer.append(solver.sol)

            # (Normalized) matrix of differences
            U = np.diff(self.buffer, axis=0)
            UU = np.dot(U, U.T)
            UU /= np.linalg.norm(UU)

            # If no parameter grid was provided, assemble one.
            if self.adaptive and (len(self.lambda_) <= 1):
                svals = np.sort(np.abs(np.linalg.eigvals(UU)))
                svals = np.log(svals)
                svals = 0.5 * (svals[:-1] + svals[1:])
                self.lambda_ = np.concatenate(([0.], np.exp(svals)))

            # Grid search for the best parameter for the extrapolation
            fvals = []
            c = np.zeros((self.k,))
            extrap = np.zeros(np.shape(solver.sol))

            for lambda_ in self.lambda_:
                # Coefficients of the extrapolation
                c[:] = np.linalg.solve(UU + lambda_ * np.eye(self.k),
                                       np.ones(self.k))
                c[:] /= np.sum(c)

                extrap[:] = np.dot(np.asarray(self.buffer[:-1]).T, c)

                fvals.append(np.sum([f.eval(extrap) for f in self.functions]))

            if self.forcedecrease and (min(fvals) > np.sum(objective[-1])):
                # If we have bad extrapolations, keep solution as is
                extrap[:] = solver.sol
            else:
                # Return the best extrapolation from the grid search
                lambda_ = self.lambda_[fvals.index(min(fvals))]

                # We can afford to solve the linear system here again because
                # self.k is normally very small. Alternatively, we could have
                # kept track of the best extrapolations during the grid search,
                # but that would require at least double the memory, as we'd
                # have to store both the current extrapolation and the best
                # extrapolation.
                c[:] = np.linalg.solve(UU + lambda_ * np.eye(self.k),
                                       np.ones(self.k))
                c[:] /= np.sum(c)
                extrap[:] = np.dot(np.asarray(self.buffer[:-1]).T, c)

            # Improve proposal with line search
            if self.dolinesearch:
                # Objective evaluation functional
                def f(x):
                    return np.sum([f.eval(x) for f in self.functions])
                # Solution at previous extrapolation
                xk = self.buffer[0]
                # Search direction
                pk = extrap - xk
                # Objective value during the previous extrapolation
                old_fval = np.sum(objective[-self.k])

                a, fc, fa = line_search_armijo(f=f,
                                               xk=xk,
                                               pk=pk,
                                               gfk=-pk,
                                               old_fval=old_fval,
                                               c1=1e-4,
                                               alpha0=1.)

                # New point proposal
                if a is None:
                    warnings.warn('Line search failed to find good step size')
                else:
                    extrap[:] = xk + a * pk

            # Clear buffer and parameter grid for next extrapolation process
            self.buffer = []
            self.lambda_ = [] if self.adaptive else self.lambda_

            return extrap

        else:  # Gather points for future extrapolation
            self.buffer.append(copy.copy(solver.sol))
            return solver.sol
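
The call above passes gfk=-pk because no gradient of the objective is available at xk; with that choice the Armijo condition reduces to f(xk + a*pk) <= f(xk) - c1*a*||pk||**2, i.e. the search only asks for a sufficient decrease along the extrapolation direction. A standalone check of that pattern on a toy function (the function itself and the import path, which depends on the SciPy version, are assumptions):

import numpy as np
from scipy.optimize.linesearch import line_search_armijo  # scipy.optimize._linesearch on newer SciPy

f = lambda x: np.sum((x - 1.0) ** 2)   # toy objective, minimum at x = 1
xk = np.zeros(3)
pk = np.ones(3)                        # proposed direction towards the minimizer

# gfk=-pk turns the sufficient-decrease test into a pure descent check along pk.
a, fc, fa = line_search_armijo(f=f, xk=xk, pk=pk, gfk=-pk, old_fval=f(xk), c1=1e-4, alpha0=1.)
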
Example #8
def opt_hyper_gaussian_natural_gradient(gpr, hyperparams, mean_key, variance_key,
                                        maxiter=500,inner_iter=10, 
                                        Ifilter=None,gradcheck=False,
                                        bounds = None,callback=None, 
                                        optimizer=OPT.fmin_tnc,gradient_tolerance=-1,
                                        messages=False, *args,**kw_args):
    
    def f(x, *args):
        x_ = X0
        x_[Ifilter_x] = x
        rv =  gpr.LML(param_list_to_dict(x_,param_struct,skeys),*args,**kw_args)
        #LG.debug("L("+str(x_)+")=="+str(rv))
        if numpy.isnan(rv):
            return 1E6
        return rv
    
    def df(x, *args):
        x_ = X0
        x_[Ifilter_x] = x
        rv =  gpr.LMLgrad(param_list_to_dict(x_,param_struct,skeys),*args,**kw_args)
        rv = param_dict_to_list(rv,skeys)
        #LG.debug("dL("+str(x_)+")=="+str(rv))
        if not numpy.isfinite(rv).all(): #numpy.isnan(rv).any():
            In = numpy.isnan(rv)
            rv[In] = 1E6
        return rv[Ifilter_x]

    #0. store parameter structure
    skeys = numpy.sort(list(hyperparams.keys()))
    param_struct = dict([(name,hyperparams[name].shape) for name in skeys])

    
    #1. convert the dictionaries to parameter lists
    X0 = param_dict_to_list(hyperparams,skeys)
    if Ifilter is not None:
        Ifilter_x = numpy.array(param_dict_to_list(Ifilter,skeys),dtype='bool')
    else:
        Ifilter_x = numpy.ones(len(X0),dtype='bool')

    #2. bounds
    if bounds is not None:
        #go through all hyperparams and build bound array (flattened)
        _b = []
        for key in skeys:
            if key in bounds.keys():
                _b.extend(bounds[key])
            else:
                _b.extend([(-numpy.inf,+numpy.inf)]*hyperparams[key].size)
        bounds = numpy.array(_b)
        bounds = bounds[Ifilter_x]
        pass
       
        
    #2. set stating point of optimization, truncate the non-used dimensions
    x  = X0.copy()[Ifilter_x]
        
    LG.debug("startparameters for opt:"+str(x))
    
    if gradcheck:
        checkgrad(f, df, x)
        LG.info("check_grad (pre) (Enter to continue):" + str(OPT.check_grad(f,df,x)))
        input()
##        
    LG.debug("start optimization")


    
    if gradient_tolerance < 0:
        gradient_tolerance = numpy.sqrt(numpy.finfo(float).eps)
    
    hyper_for_opt = hyperparams.copy()
    normal_keys = [mean_key, variance_key]
    hyperparam_keys = [v for v in hyperparams.keys() if not v in normal_keys]
    last_lml = gpr.LML(hyperparams)
    curr_lml = last_lml + 2*gradient_tolerance
    
    direction_dict = dict([(k, numpy.zeros_like(v)) for k, v in hyperparams.items()])
    Q = hyper_for_opt[variance_key].shape[1]
    
    while maxiter > 0 and numpy.abs(last_lml-curr_lml) > gradient_tolerance:
        print "Iteration Loops left %s" % maxiter
        last_lml = gpr.LML(hyper_for_opt)
        # optimize non gaussian parameters
        #general optimizer interface
        #note: x is a subset of X, indexing the parameters that are optimized over
        # Ifilter_x pickes the subest of X, yielding x
        hyper_for_opt = optimizer(f, x, fprime=df, args=[hyperparam_keys], maxfun=int(inner_iter),
                           pgtol=gradient_tolerance, messages=messages, bounds=bounds)
        #    optimizer = OPT.fmin_l_bfgs_b
        #    opt_RV=optimizer(f, x, fprime=df, maxfun=int(maxiter),iprint =1, bounds=bounds, factr=10.0, pgtol=1e-10)

        Xopt = X0.copy()
        Xopt[Ifilter_x] = hyper_for_opt[0]
        #convert into dictionary
        hyper_for_opt = param_list_to_dict(hyper_for_opt[0],param_struct,skeys)
        curr_lml = gpr.LML(hyper_for_opt)
    
        # optimize natural gradient parameters:
        print("  NIT   NF   F                       GTG")
        grad_mean_last = numpy.ones((1, Q))
        grad_variance_last = numpy.ones((0, Q))
        direction_last = 0
        for i in range(inner_iter):
            #mean = hyper_for_opt[mean_key]
            variance = hyper_for_opt[variance_key]

            grad_gaussian = gpr.LMLgrad(hyper_for_opt, hyperparam_keys=normal_keys)
            grad_mean = grad_gaussian[mean_key]
            grad_variance = grad_gaussian[variance_key]
            
            grad = numpy.append(grad_mean, grad_variance, 0)
            grad_ = numpy.append(grad_mean / variance, grad_variance / 2., 0)
            grad_last = numpy.append(grad_mean_last, grad_variance_last, 0)
            beta = ((grad_ * (grad - grad_last)) / (grad * grad_).sum(0)).sum(0)

            direction = -grad_ + beta * direction_last

            direction_dict[mean_key] = direction[:grad_mean.shape[0],:]
            direction_dict[variance_key] = direction[grad_mean.shape[0]:,:]
            
            alpha = line_search_armijo(f, param_dict_to_list(hyper_for_opt, skeys), 
                                param_dict_to_list(direction_dict, skeys), 
                                0,
                                [normal_keys])
            
            hyper_for_opt[mean_key] = alpha[0] * direction[:grad_mean.shape[0],:]
            hyper_for_opt[variance_key] = alpha[0] * direction[grad_mean.shape[0]:,:]
            
            grad_mean_last = grad_mean
            grad_variance_last = grad_variance
            direction_last = direction
            
        maxiter -= 1
    
    
    
    
    #relate back to X
    Xopt = X0.copy()
    Xopt[Ifilter_x] = hyper_for_opt
    #convert into dictionary
    opt_hyperparams = param_list_to_dict(Xopt,param_struct,skeys)
    #get the log marginal likelihood at the optimum:
    opt_lml = gpr.LML(opt_hyperparams,**kw_args)

    
    return hyper_for_opt, curr_lml
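
The inner loop above combines conjugate directions (a Polak-Ribiere style beta) with an Armijo step. A standalone sketch of that update pattern on a toy quadratic, assuming a SciPy version where line_search_armijo is importable from scipy.optimize.linesearch; the quadratic, the tolerance and the restart safeguard are illustrative and not part of the GP model used above:

import numpy as np
from scipy.optimize.linesearch import line_search_armijo

A = np.diag([1.0, 10.0])               # ill-conditioned toy quadratic
f = lambda x: 0.5 * x.dot(A).dot(x)
grad = lambda x: A.dot(x)

x = np.array([5.0, 1.0])
g_old = grad(x)
d = -g_old
for _ in range(20):
    g = grad(x)
    if np.linalg.norm(g) < 1e-10:
        break                          # converged
    beta = max(0.0, g.dot(g - g_old) / g_old.dot(g_old))  # Polak-Ribiere(+)
    d = -g + beta * d
    if d.dot(g) >= 0:
        d = -g                         # restart if d is not a descent direction
    alpha, _, _ = line_search_armijo(f, x, d, g, f(x))
    if alpha is None:
        break
    x = x + alpha * d
    g_old = g
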
Example #10
def fista(grad,
          obj,
          prox,
          x0,
          momentum='fista',
          restarting=None,
          max_iter=100,
          step_size=None,
          early_stopping=True,
          eps=np.finfo(np.float64).eps,
          times=False,
          debug=False,
          verbose=0,
          name="Optimization"):
    """ ISTA like algorithm. """
    # parameters checking
    if verbose and not debug:
        print(f"[{name}] verbose output requires the cost function to be "
              f"computed; enable it by setting debug=True")

    if momentum not in [None, 'fista', 'greedy']:
        raise ValueError(f"[{name}] momentum should be [None, 'fista', "
                         f"'greedy'], got {momentum}")

    if restarting not in [None, 'obj', 'descent']:
        raise ValueError(f"[{name}] restarting should be [None, 'obj', "
                         f"'descent'], got {restarting}")

    if momentum is None and restarting in ['obj', 'descent']:
        raise ValueError(f"[{name}] restarting can't be set to 'obj' or "
                         f"'descent' if momentum is None")

    # prepare the iterate
    x_old, x, y, y_old = np.copy(x0), np.copy(x0), np.copy(x0), np.copy(x0)
    pobj_, times_, diff_ = [obj(x)], [0.0], [0.0]
    t = t_old = 1

    # prepare the adaptative-step variables
    adaptive_step_size = False
    if step_size is None:
        adaptive_step_size = True
        step_size = 1.0
        old_fval = pobj_[0]

    # main loop
    for ii in range(max_iter):

        if times:
            t0 = time.time()

        grad_ = grad(y)

        # adaptative step-size
        if adaptive_step_size:
            step_size, _, old_fval = line_search_armijo(obj,
                                                        y.ravel(),
                                                        -grad_.ravel(),
                                                        grad_.ravel(),
                                                        old_fval,
                                                        c1=1.0e-5,
                                                        alpha0=step_size)
            if step_size is None:
                step_size = 0.0

        # main descent step
        x = prox(y - step_size * grad_, step_size)

        # fista acceleration
        if momentum is None:
            y = x

        elif momentum == 'fista':
            t = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t_old**2))
            y = x + (t_old - 1.0) / t * (x - x_old)

        elif momentum == 'greedy':
            y = x + (x - x_old)

        diff_.append(np.linalg.norm(x - x_old))

        # savings times
        if times:
            # skip cost-function computation for benchmark
            delta_t = time.time() - t0

        # savings cost-function values
        if debug:
            pobj_.append(obj(x))

        # savings times, restart after cost-function computation
        if times:
            t0 = time.time()

        if restarting == 'obj' and (pobj_[-1] > pobj_[-2]):
            # restart if cost function increase
            if momentum == 'fista':
                x = x_old
                t = 1.0
            elif momentum == 'greedy':
                y = x

        if restarting == 'descent' and np.sum((y_old - x) * (x - x_old)) > 0.0:
            # gradient-based adaptive restart: (y_k - x_k+1) . (x_k+1 - x_k) > 0
            if momentum == 'fista':
                x = x_old
                t = 1.0
            elif momentum == 'greedy':
                y = x

        # variables updates k+1, k, k-1
        t_old = t
        x_old = x
        y_old = y

        # verbose every 100 iterations
        if debug and verbose > 0 and ii % 100 == 0:
            print(
                f"\r[{name}] Iteration {100.0 * (ii + 1) / max_iter:.0f}%, "
                f"loss = {pobj_[ii]:.3e}, "
                f"grad-norm = {np.linalg.norm(grad_):.3e}",
                end='',
                flush=True)

        # early-stopping on || x_k - x_k-1 || < eps
        if early_stopping and diff_[-1] <= eps:
            if debug:
                print(f"\r[{name}] early-stopping "
                      f"done at {100.0 * (ii + 1) / max_iter:.0f}%, "
                      f"loss = {pobj_[ii]:.3e}, "
                      f"grad-norm = {np.linalg.norm(grad_):.3e}")
                print("\n")
            break

        # divergence safeguarding
        if diff_[-1] > np.finfo(np.float64).max:
            raise RuntimeError(f"\n[{name}] algorithm has diverged.")

        # savings times
        if times:
            times_.append(delta_t + time.time() - t0)

    if not times and not debug:
        return x
    if times and not debug:
        return x, np.array(times_)
    if not times and debug:
        return x, np.array(pobj_)
    if times and debug:
        return x, np.array(pobj_), np.array(times_)
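
A minimal usage sketch for this fista variant, assuming the function above is importable; the non-negative least-squares data and the projection prox are illustrative, and step_size=None exercises the line_search_armijo branch:

import numpy as np

rng = np.random.RandomState(42)
A = rng.randn(30, 15)
b = A.dot(np.abs(rng.randn(15)))

grad = lambda x: A.T.dot(A.dot(x) - b)       # gradient of 0.5*||Ax - b||^2
obj = lambda x: 0.5 * np.sum((A.dot(x) - b) ** 2)
prox = lambda x, step: np.maximum(x, 0.0)    # projection onto the non-negative orthant

# debug=True records the objective curve; 'descent' restarting needs momentum.
x_hat, pobj = fista(grad, obj, prox, x0=np.zeros(15), momentum='fista',
                    restarting='descent', step_size=None, debug=True,
                    max_iter=300)
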